GitList

Browse code

moving postprocess to ffmpeg/libavcodec

Originally committed as revision 1586 to svn://svn.ffmpeg.org/ffmpeg/trunk
Originally committed as revision 9427 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Originally committed as revision 9428 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

Michael Niedermayer authored on 2003/02/15 06:27:25
Showing 13 changed files

configure index 559467e..c403899 100755
libavcodec/Makefile index f8be2a8..6245462 100644
libavcodec/libpostproc/Makefile index 0000000..095dec9
libavcodec/libpostproc/mangle.h index 0000000..5f5dc48
libavcodec/libpostproc/postprocess.c index 0000000..d9261b4
libavcodec/libpostproc/postprocess.h index 0000000..c8af1b1
libavcodec/libpostproc/postprocess_internal.h index 0000000..f45a0cf
libavcodec/libpostproc/postprocess_template.c index 0000000..af48cdc
postproc/Makefile index 2ef7bbc..a817718 100644
postproc/postprocess.c index e470fd0..0000000
postproc/postprocess.h index c8af1b1..0000000
postproc/postprocess_internal.h index f45a0cf..0000000
postproc/postprocess_template.c index af48cdc..0000000

configure

History View file @ bba9b16

@@ -60,6 +60,8 @@ mp3lame="no"
                      vorbis="no"
                      a52="yes"
                      a52bin="no"
                     +pp="yes"
                     +shared_pp="no"
                      win32="no"
                      mingw32="no"
                      cygwin="no"
@@ -281,6 +283,10 @@ for opt do
                        ;;
                        --enable-a52bin) a52bin="yes" ; extralibs="$ldl $extralibs"
                        ;;
                     +  --disable-pp) pp="no"
                     +  ;;
                     +  --enable-shared-pp) shared_pp="yes"
                     +  ;;
                        --enable-mp3lame) mp3lame="yes"
                        ;;
                        --enable-vorbis) vorbis="yes"
@@ -578,6 +584,8 @@ echo "  --enable-win32           enable win32 cross compile"
                      echo "  --enable-mingw32         enable mingw32 native windows compile"
                      echo "  --disable-a52            disable GPL'ed A52 support [default=no]"
                      echo "  --enable-a52bin          open liba52.so.0 at runtime [default=no]"
                     +echo "  --disable-pp             disable GPL'ed post processing support [default=no]"
                     +echo "  --enable-shared-pp       use libpostproc.so [default=no]"
                      echo "  --enable-shared          build shared libraries [default=no]"
                      echo ""
                      echo "Advanced options (experts only):"
@@ -631,6 +639,8 @@ echo "mp3lame enabled  $mp3lame"
                      echo "vorbis enabled   $vorbis"
                      echo "a52 support      $a52"
                      echo "a52 dlopened     $a52bin"
                     +echo "pp support       $pp"
                     +echo "shared pp        $shared_pp"
                      echo "Video hooking    $vhook"
                      echo "risky / patent encumbered codecs $risky"
@@ -754,6 +764,17 @@ if test "$a52" = "yes" ; then
                        fi
                      fi
                     +# PP
                     +if test "$pp" = "yes" ; then
                     +  echo "#define CONFIG_PP 1" >> $TMPH
                     +  echo "CONFIG_PP=yes" >> config.mak
+                    +
                     +  if test "$shared_pp" = "yes" ; then
                     +    echo "#define SHARED_PP 1" >> $TMPH
                     +    echo "SHARED_PP=yes" >> config.mak
                     +  fi
                     +fi
+                    +
                      # mpeg audio high precision mode
                      if test "$mpegaudio_hp" = "yes" ; then
                        echo "#define CONFIG_MPEGAUDIO_HP 1" >> $TMPH

libavcodec/Makefile

History View file @ bba9b16

@@ -35,6 +35,15 @@ OBJS+= liba52/bit_allocate.o liba52/bitstream.o liba52/downmix.o \
                      endif
                      endif
                     +ifeq ($(CONFIG_PP),yes)
                     +ifeq ($(SHARED_PP),yes)
                     +EXTRALIBS += -lpostproc
                     +else
                     +# LIBS += libpostproc/libpostproc.a ... should be fixed
                     +OBJS += libpostproc/postprocess.o
                     +endif
                     +endif
+                    +
                      ifeq ($(CONFIG_MP3LAME),yes)
                      OBJS += mp3lameaudio.o
                      EXTRALIBS += -lmp3lame
@@ -125,6 +134,9 @@ $(SLIB): $(OBJS)
                      dsputil.o: dsputil.c dsputil.h
                     +libpostproc/libpostproc.a:
                     +	$(MAKE) -C libpostproc
+                    +
                      %.o: %.c
                      	$(CC) $(CFLAGS) -c -o $@ $<

libavcodec/libpostproc/Makefile

History View file @ bba9b16

                     new file mode 100644
@@ -0,0 +1,64 @@
+                    +
                     +include ../../config.mak
+                    +
                     +ifeq ($(SHARED_PP),yes)
                     +SPPLIB = libpostproc.so
                     +SPPVERSION = 0.0.1
                     +endif
                     +PPLIB = libpostproc.a
+                    +
                     +PPOBJS=postprocess.o
                     +SPPOBJS=postprocess_pic.o
+                    +
                     +CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
                     +# -I/usr/X11R6/include/
+                    +
                     +.SUFFIXES: .c .o
+                    +
                     +# Targets that are commands rather than files; without this declaration a
                     +# stray file named e.g. "clean" would stop the target from ever running.
                     +.PHONY: all clean distclean dep depend install
+                    +
                     +.c.o:
                     +	$(CC) -c $(CFLAGS) -I.. -I../.. -o $@ $<
+                    +
                     +# Default target: the static library, plus the shared one when SHARED_PP=yes
                     +# (SPPLIB is only assigned in that case and expands to nothing otherwise).
                     +all:    $(PPLIB) $(SPPLIB)
+                    +
                     +clean:
                     +	rm -f *.o *.a *~ *.so
+                    +
                     +distclean:
                     +	rm -f Makefile.bak *.o *.a *~ *.so .depend
+                    +
                     +dep:    depend
+                    +
                     +depend:
                     +	$(CC) -MM $(CFLAGS) postprocess.c 1>.depend
+                    +
                     +ifeq ($(SHARED_PP),yes)
                     +postprocess_pic.o: postprocess.c
                     +	$(CC) -c $(CFLAGS) -fomit-frame-pointer -fPIC -DPIC -I.. -I../.. -o $@ $<
+                    +
                     +$(SPPLIB): $(SPPOBJS)
                     +	$(CC) -shared -Wl,-soname,$(SPPLIB).0 \
                     +	-o $(SPPLIB) $(SPPOBJS)
                     +endif
+                    +
                     +$(PPLIB): $(PPOBJS)
                     +	$(AR) r $(PPLIB) $(PPOBJS)
+                    +
                     +install: all
                     +ifeq ($(SHARED_PP),yes)
                     +	install -d $(prefix)/lib
                     +	install -s -m 755 $(SPPLIB) $(prefix)/lib/$(SPPLIB).$(SPPVERSION)
                     +	ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPLIB)
                     +	ldconfig || true
                     +	mkdir -p $(prefix)/include/postproc
                     +	install -m 644 postprocess.h $(prefix)/include/postproc/postprocess.h
                     +endif
+                    +
+                    +
                     +#
                     +# include dependency files if they exist
                     +#
                     +ifneq ($(wildcard .depend),)
                     +include .depend
                     +endif

libavcodec/libpostproc/mangle.h

History View file @ bba9b16

                     new file mode 100644
@@ -0,0 +1,19 @@
                     +/* mangle.h - This file has some CPP macros to deal with different symbol
                     + * mangling across binary formats.
                     + * (c)2002 by Felix Buenemann <atmosfear at users.sourceforge.net>
                     + * File licensed under the GPL, see http://www.fsf.org/ for more info.
                     + */
+                    +
                     +#ifndef __MANGLE_H
                     +#define __MANGLE_H
+                    +
                     +/* Feel free to add more to the list, eg. a.out IMO */
                     +#if defined(__CYGWIN__) || defined(__OS2__) || \
                     +   (defined(__OpenBSD__) && !defined(__ELF__))
                     +#define MANGLE(a) "_" #a
                     +#else
                     +#define MANGLE(a) #a
                     +#endif
+                    +
                     +#endif /* !__MANGLE_H */
+                    +

libavcodec/libpostproc/postprocess.c

History View file @ bba9b16

                     new file mode 100644
@@ -0,0 +1,875 @@
                     +/*
                     +    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
+                    +
                     +    This program is free software; you can redistribute it and/or modify
                     +    it under the terms of the GNU General Public License as published by
                     +    the Free Software Foundation; either version 2 of the License, or
                     +    (at your option) any later version.
+                    +
                     +    This program is distributed in the hope that it will be useful,
                     +    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     +    GNU General Public License for more details.
+                    +
                     +    You should have received a copy of the GNU General Public License
                     +    along with this program; if not, write to the Free Software
                     +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     +*/
+                    +
                     +/*
                     +			C	MMX	MMX2	3DNow
                     +isVertDC		Ec	Ec
                     +isVertMinMaxOk		Ec	Ec
                     +doVertLowPass		E		e	e
                     +doVertDefFilter		Ec	Ec	e	e
                     +isHorizDC		Ec	Ec
                     +isHorizMinMaxOk		a	E
                     +doHorizLowPass		E		e	e
                     +doHorizDefFilter	Ec	Ec	e	e
                     +deRing			E		e	e*
                     +Vertical RKAlgo1	E		a	a
                     +Horizontal RKAlgo1			a	a
                     +Vertical X1#		a		E	E
                     +Horizontal X1#		a		E	E
                     +LinIpolDeinterlace	e		E	E*
                     +CubicIpolDeinterlace	a		e	e*
                     +LinBlendDeinterlace	e		E	E*
                     +MedianDeinterlace#	E	Ec	Ec
                     +TempDeNoiser#		E		e	e
+                    +
                     +* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
                     +# more or less self-invented filters, so the exactness isn't too meaningful
                     +E = Exact implementation
                     +e = almost exact implementation (slightly different rounding,...)
                     +a = alternative / approximate impl
                     +c = checked against the other implementations (-vo md5)
                     +*/
+                    +
                     +/*
                     +TODO:
                     +reduce the time wasted on the mem transfer
                     +unroll stuff if instructions depend too much on the prior one
                     +move YScale thing to the end instead of fixing QP
                     +write a faster and higher quality deblocking filter :)
                     +make the mainloop more flexible (variable number of blocks at once
                     +	(the if/else stuff per block is slowing things down)
                     +compare the quality & speed of all filters
                     +split this huge file
                     +optimize c versions
                     +try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
                     +...
                     +*/
+                    +
                     +//Changelog: use the CVS log
+                    +
                     +#include "config.h"
                     +#include <inttypes.h>
                     +#include <stdio.h>
                     +#include <stdlib.h>
                     +#include <string.h>
                     +#ifdef HAVE_MALLOC_H
                     +#include <malloc.h>
                     +#endif
                     +//#undef HAVE_MMX2
                     +//#define HAVE_3DNOW
                     +//#undef HAVE_MMX
                     +//#undef ARCH_X86
                     +//#define DEBUG_BRIGHTNESS
                     +#ifdef USE_FASTMEMCPY
                     +#include "libvo/fastmemcpy.h"
                     +#endif
                     +#include "postprocess.h"
                     +#include "postprocess_internal.h"
+                    +
                     +#include "mangle.h" //FIXME should be supressed
+                    +
                     +#define MIN(a,b) ((a) > (b) ? (b) : (a))
                     +#define MAX(a,b) ((a) < (b) ? (b) : (a))
                     +#define ABS(a) ((a) > 0 ? (a) : (-(a)))
                     +#define SIGN(a) ((a) > 0 ? 1 : -1)
+                    +
                     +#define GET_MODE_BUFFER_SIZE 500
                     +#define OPTIONS_ARRAY_SIZE 10
                     +#define BLOCK_SIZE 8
                     +#define TEMP_STRIDE 8
                     +//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
+                    +
                     +#ifdef ARCH_X86
                     +static uint64_t __attribute__((aligned(8))) w05=		0x0005000500050005LL;
                     +static uint64_t __attribute__((aligned(8))) w20=		0x0020002000200020LL;
                     +static uint64_t __attribute__((aligned(8))) b00= 		0x0000000000000000LL;
                     +static uint64_t __attribute__((aligned(8))) b01= 		0x0101010101010101LL;
                     +static uint64_t __attribute__((aligned(8))) b02= 		0x0202020202020202LL;
                     +static uint64_t __attribute__((aligned(8))) b08= 		0x0808080808080808LL;
                     +static uint64_t __attribute__((aligned(8))) b80= 		0x8080808080808080LL;
                     +#endif
+                    +
                     +static int verbose= 0;
+                    +
                     +static const int deringThreshold= 20;
+                    +
+                    +
                     +static struct PPFilter filters[]=
                     +{
                     +	{"hb", "hdeblock", 		1, 1, 3, H_DEBLOCK},
                     +	{"vb", "vdeblock", 		1, 2, 4, V_DEBLOCK},
                     +/*	{"hr", "rkhdeblock", 		1, 1, 3, H_RK1_FILTER},
                     +	{"vr", "rkvdeblock", 		1, 2, 4, V_RK1_FILTER},*/
                     +	{"h1", "x1hdeblock", 		1, 1, 3, H_X1_FILTER},
                     +	{"v1", "x1vdeblock", 		1, 2, 4, V_X1_FILTER},
                     +	{"dr", "dering", 		1, 5, 6, DERING},
                     +	{"al", "autolevels", 		0, 1, 2, LEVEL_FIX},
                     +	{"lb", "linblenddeint", 	1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
                     +	{"li", "linipoldeint", 		1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
                     +	{"ci", "cubicipoldeint",	1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
                     +	{"md", "mediandeint", 		1, 1, 4, MEDIAN_DEINT_FILTER},
                     +	{"fd", "ffmpegdeint", 		1, 1, 4, FFMPEG_DEINT_FILTER},
                     +	{"tn", "tmpnoise", 		1, 7, 8, TEMP_NOISE_FILTER},
                     +	{"fq", "forcequant", 		1, 0, 0, FORCE_QUANT},
                     +	{NULL, NULL,0,0,0,0} //End Marker
                     +};
+                    +
                     +static char *replaceTable[]=
                     +{
                     +	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
                     +	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
                     +	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
                     +	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
                     +	NULL //End Marker
                     +};
+                    +
                     +#ifdef ARCH_X86
                     +static inline void unusedVariableWarningFixer()
                     +{
                     +	if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0;
                     +}
                     +#endif
+                    +
+                    +
                     +#ifdef ARCH_X86
                     +static inline void prefetchnta(void *p)
                     +{
                     +	asm volatile(	"prefetchnta (%0)\n\t"
                     +		: : "r" (p)
                     +	);
                     +}
+                    +
                     +static inline void prefetcht0(void *p)
                     +{
                     +	asm volatile(	"prefetcht0 (%0)\n\t"
                     +		: : "r" (p)
                     +	);
                     +}
+                    +
                     +static inline void prefetcht1(void *p)
                     +{
                     +	asm volatile(	"prefetcht1 (%0)\n\t"
                     +		: : "r" (p)
                     +	);
                     +}
+                    +
                     +static inline void prefetcht2(void *p)
                     +{
                     +	asm volatile(	"prefetcht2 (%0)\n\t"
                     +		: : "r" (p)
                     +	);
                     +}
                     +#endif
+                    +
                     +// The horizontal functions exist only in C because the MMX code is faster when it transposes the block and uses the vertical filters
+                    +
                     +/**
                     + * Check if the given 8x8 Block is mostly "flat".
                     + * Over BLOCK_SIZE rows, counts the horizontally adjacent pixel pairs
                     + * (columns 0..7, i.e. 7 pairs per row) whose difference lies within +-dcOffset,
                     + * where dcOffset is derived from c->QP and c->ppMode.baseDcDiff.
                     + * Returns non-zero if that count exceeds c->ppMode.flatnessThreshold.
                     + * src points to the first pixel of the first row, stride is the distance
                     + * between the starts of two consecutive rows.
                     + */
                     +static inline int isHorizDC(uint8_t src[], int stride, PPContext *c)
                     +{
                     +	int numEq= 0;
                     +	int y;
                     +	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
                     +	const int dcThreshold= dcOffset*2 + 1;
                     +	for(y=0; y<BLOCK_SIZE; y++)
                     +	{
                     +		// one unsigned compare tests -dcOffset <= diff <= dcOffset:
                     +		// a sum below zero wraps around to a large unsigned value and fails the test
                     +		if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
                     +		src+= stride;
                     +	}
                     +	return numEq > c->ppMode.flatnessThreshold;
                     +}
+                    +
                     +/**
                     + * Check if the middle 8x8 Block in the given 8x16 block is flat.
                     + * Starting 4 rows below src, counts over BLOCK_SIZE-1 rows the vertically
                     + * adjacent pixel pairs (8 columns per row) whose difference lies within
                     + * +-dcOffset, where dcOffset is derived from c->QP and c->ppMode.baseDcDiff.
                     + * Returns non-zero if that count exceeds c->ppMode.flatnessThreshold.
                     + */
                     +static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
                     +	int numEq= 0;
                     +	int y;
                     +	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
                     +	const int dcThreshold= dcOffset*2 + 1;
                     +	src+= stride*4; // src points to begin of the 8x8 Block
                     +	// only BLOCK_SIZE-1 iterations: each row is compared with the row below it
                     +	for(y=0; y<BLOCK_SIZE-1; y++)
                     +	{
                     +		// one unsigned compare tests -dcOffset <= diff <= dcOffset:
                     +		// a sum below zero wraps around to a large unsigned value and fails the test
                     +		if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
                     +		src+= stride;
                     +	}
                     +	return numEq > c->ppMode.flatnessThreshold;
                     +}
+                    +
                     +/**
                     + * Returns 1 if the first and the last pixel of the 8 pixel row at src differ
                     + * by at most 2*QP, 0 otherwise.
                     + * Only this single row is examined; stride is not used.
                     + */
                     +static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
                     +{
                     +	if(abs(src[0] - src[7]) > 2*QP) return 0;
+                    +
                     +	return 1;
                     +}
+                    +
                     +/**
                     + * Default horizontal deblocking filter (C version).
                     + * For each of the BLOCK_SIZE rows starting at dst, the step between dst[3] and
                     + * dst[4] is reduced, but only if the "energy" measured across it is below 8*QP.
                     + * The correction is clamped between 0 and half of that step, so it can shrink
                     + * the step but never invert it. Only dst[3] and dst[4] of a row are written.
                     + * stride is the distance between the starts of two consecutive rows.
                     + */
                     +static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
                     +{
                     +	int y;
                     +	for(y=0; y<BLOCK_SIZE; y++)
                     +	{
                     +		// same 4 pixel measure as leftEnergy/rightEnergy below, shifted so that
                     +		// it is centered on the edge between dst[3] and dst[4]
                     +		const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
+                    +
                     +		if(ABS(middleEnergy) < 8*QP)
                     +		{
                     +			const int q=(dst[3] - dst[4])/2;
                     +			const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
                     +			const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
+                    +
                     +			// only the part of the middle energy that exceeds its neighbours is removed
                     +			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                     +			d= MAX(d, 0);
+                    +
                     +			d= (5*d + 32) >> 6;
                     +			d*= SIGN(-middleEnergy);
+                    +
                     +			// clamp d into the range between 0 and q, whatever the sign of q
                     +			if(q>0)
                     +			{
                     +				d= d<0 ? 0 : d;
                     +				d= d>q ? q : d;
                     +			}
                     +			else
                     +			{
                     +				d= d>0 ? 0 : d;
                     +				d= d<q ? q : d;
                     +			}
+                    +
                     +			dst[3]-= d;
                     +			dst[4]+= d;
                     +		}
                     +		dst+= stride;
                     +	}
                     +}
+                    +
                     +/**
                     + * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
                     + * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
                     + * The pixels dst[-1] and dst[8] of every row are read but never written.
                     + * stride is the distance between the starts of two consecutive rows.
                     + */
                     +static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
                     +{
+                    +
                     +	int y;
                     +	for(y=0; y<BLOCK_SIZE; y++)
                     +	{
                     +		// use the pixel outside the block as padding only if it is within QP of
                     +		// the border pixel, otherwise repeat the border pixel itself
                     +		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
                     +		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
+                    +
                     +		// sums of neighbouring pixels, all taken before any pixel of the row is overwritten
                     +		int sums[9];
                     +		sums[0] = first + dst[0];
                     +		sums[1] = dst[0] + dst[1];
                     +		sums[2] = dst[1] + dst[2];
                     +		sums[3] = dst[2] + dst[3];
                     +		sums[4] = dst[3] + dst[4];
                     +		sums[5] = dst[4] + dst[5];
                     +		sums[6] = dst[5] + dst[6];
                     +		sums[7] = dst[6] + dst[7];
                     +		sums[8] = dst[7] + last;
+                    +
                     +		// the +8 rounds the result of the final >>4 to nearest
                     +		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
                     +		dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
                     +		dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
                     +		dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
                     +		dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
                     +		dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
                     +		dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
                     +		dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
+                    +
                     +		dst+= stride;
                     +	}
                     +}
+                    +
                     +/**
                     + * Experimental Filter 1 (Horizontal)
                     + * will not damage linear gradients
                     + * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
                     + * can only smooth blocks at the expected locations (it can't smooth them if they did move)
                     + * MMX2 version does correct clipping, C version doesn't
                     + * not identical with the vertical one
                     + *
                     + * Works on BLOCK_SIZE rows starting at src and modifies src[1]..src[6] of a row;
                     + * stride is the distance between the starts of two consecutive rows.
                     + * The lookup table is built once on the first call; the C loop below does not
                     + * read it.
                     + */
                     +static inline void horizX1Filter(uint8_t *src, int stride, int QP)
                     +{
                     +	int y;
                     +	static uint64_t *lut= NULL;
                     +	if(lut==NULL)
                     +	{
                     +		int i;
                     +		lut= (uint64_t*)memalign(8, 256*8);
                     +		// if the allocation failed the fill is skipped; lut stays NULL, so the
                     +		// allocation is simply tried again on the next call
                     +		for(i=0; lut!=NULL && i<256; i++)
                     +		{
                     +			// interpret i as a signed byte and double it
                     +			int v= i < 128 ? 2*i : 2*(i-256);
                     +/*
                     +//Simulate 112242211 9-Tap filter
                     +			uint64_t a= (v/16) & 0xFF;
                     +			uint64_t b= (v/8) & 0xFF;
                     +			uint64_t c= (v/4) & 0xFF;
                     +			uint64_t d= (3*v/8) & 0xFF;
                     +*/
                     +//Simulate piecewise linear interpolation
                     +			uint64_t a= (v/16) & 0xFF;
                     +			uint64_t b= (v*3/16) & 0xFF;
                     +			uint64_t c= (v*5/16) & 0xFF;
                     +			uint64_t d= (7*v/16) & 0xFF;
                     +			// A..D are the byte-wise negations of a..d
                     +			uint64_t A= (0x100 - a)&0xFF;
                     +			uint64_t B= (0x100 - b)&0xFF;
                     +			uint64_t C= (0x100 - c)&0xFF;
                     +			uint64_t D= (0x100 - d)&0xFF;
+                    +
                     +			lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
                     +				(D<<24) | (C<<16) | (B<<8) | (A);
                     +			//lut[i] = (v<<32) | (v<<24);
                     +		}
                     +	}
+                    +
                     +	for(y=0; y<BLOCK_SIZE; y++)
                     +	{
                     +		int a= src[1] - src[2];
                     +		int b= src[3] - src[4];
                     +		int c= src[5] - src[6];
+                    +
                     +		// how much the step at the block edge exceeds the average step next to it
                     +		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
+                    +
                     +		if(d < QP)
                     +		{
                     +			int v = d * SIGN(-b);
+                    +
                     +			// spread the correction over three pixels on each side of the edge
                     +			src[1] +=v/8;
                     +			src[2] +=v/4;
                     +			src[3] +=3*v/8;
                     +			src[4] -=3*v/8;
                     +			src[5] -=v/4;
                     +			src[6] -=v/8;
                     +		}
                     +		src+=stride;
                     +	}
                     +}
+                    +
+                    +
                     +//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
                     +//Plain C versions
                     +#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
                     +#define COMPILE_C
                     +#endif
+                    +
                     +#ifdef ARCH_X86
+                    +
                     +#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
                     +#define COMPILE_MMX
                     +#endif
+                    +
                     +#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
                     +#define COMPILE_MMX2
                     +#endif
+                    +
                     +#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
                     +#define COMPILE_3DNOW
                     +#endif
                     +#endif //ARCH_X86
+                    +
                     +#undef HAVE_MMX
                     +#undef HAVE_MMX2
                     +#undef HAVE_3DNOW
                     +#undef ARCH_X86
+                    +
                     +#ifdef COMPILE_C
                     +#undef HAVE_MMX
                     +#undef HAVE_MMX2
                     +#undef HAVE_3DNOW
                     +#undef ARCH_X86
                     +#define RENAME(a) a ## _C
                     +#include "postprocess_template.c"
                     +#endif
+                    +
                     +//MMX versions
                     +#ifdef COMPILE_MMX
                     +#undef RENAME
                     +#define HAVE_MMX
                     +#undef HAVE_MMX2
                     +#undef HAVE_3DNOW
                     +#define ARCH_X86
                     +#define RENAME(a) a ## _MMX
                     +#include "postprocess_template.c"
                     +#endif
+                    +
                     +//MMX2 versions
                     +#ifdef COMPILE_MMX2
                     +#undef RENAME
                     +#define HAVE_MMX
                     +#define HAVE_MMX2
                     +#undef HAVE_3DNOW
                     +#define ARCH_X86
                     +#define RENAME(a) a ## _MMX2
                     +#include "postprocess_template.c"
                     +#endif
+                    +
                     +//3DNOW versions
                     +#ifdef COMPILE_3DNOW
                     +#undef RENAME
                     +#define HAVE_MMX
                     +#undef HAVE_MMX2
                     +#define HAVE_3DNOW
                     +#define ARCH_X86
                     +#define RENAME(a) a ## _3DNow
                     +#include "postprocess_template.c"
                     +#endif
+                    +
                     +// minor note: from here on the HAVE_xyz macros hold whatever the template sections above defined or undefined last, so don't rely on them
+                    +
                     +/**
                     + * Copies the mode *vm into the context and hands the picture to one of the
                     + * postProcess_C / _MMX / _MMX2 / _3DNow variants.
                     + * With RUNTIME_CPUDETECT the variant is chosen on every call from c->cpuCaps,
                     + * otherwise it is fixed at compile time by the HAVE_* macros.
                     + * All remaining arguments are passed through to the chosen variant unchanged.
                     + */
                     +static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     +	QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
                     +{
                     +	PPContext *c= (PPContext *)vc;
                     +	PPMode *ppMode= (PPMode *)vm;
                     +	c->ppMode= *ppMode; //FIXME
+                    +
                     +	// using ifs here as they are faster than function pointers, although the
                     +	// difference wouldn't be measurable here; but it's much better because
                     +	// someone might exchange the CPU without restarting mplayer ;)
                     +#ifdef RUNTIME_CPUDETECT
                     +#ifdef ARCH_X86
                     +	// ordered by speed, fastest first
                     +	if(c->cpuCaps & PP_CPU_CAPS_MMX2)
                     +		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +	else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
                     +		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +	else if(c->cpuCaps & PP_CPU_CAPS_MMX)
                     +		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +	else
                     +		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +#else
                     +		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +#endif
                     +#else //RUNTIME_CPUDETECT
                     +#ifdef HAVE_MMX2
                     +		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +#elif defined (HAVE_3DNOW)
                     +		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +#elif defined (HAVE_MMX)
                     +		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +#else
                     +		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     +#endif
                     +#endif //!RUNTIME_CPUDETECT
                     +}
+                    +
                     +//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     +//	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
+                    +
                     +/* -pp Command line Help
                     +   Text describing the filter string syntax, a few examples and the table of
                     +   filter / option names (short and long forms).
                     +*/
                     +char *pp_help=
                     +"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
                     +"long form example:\n"
                     +"vdeblock:autoq/hdeblock:autoq/linblenddeint	default,-vdeblock\n"
                     +"short form example:\n"
                     +"vb:a/hb:a/lb					de,-vb\n"
                     +"more examples:\n"
                     +"tn:64:128:256\n"
                     +"Filters			Options\n"
                     +"short	long name	short	long option	Description\n"
                     +"*	*		a	autoq		cpu power dependant enabler\n"
                     +"			c	chrom		chrominance filtring enabled\n"
                     +"			y	nochrom		chrominance filtring disabled\n"
                     +"hb	hdeblock	(2 Threshold)		horizontal deblocking filter\n"
                     +"	1. difference factor: default=64, higher -> more deblocking\n"
                     +"	2. flatness threshold: default=40, lower -> more deblocking\n"
                     +"			the h & v deblocking filters share these\n"
                     +"			so u cant set different thresholds for h / v\n"
                     +"vb	vdeblock	(2 Threshold)		vertical deblocking filter\n"
                     +"h1	x1hdeblock				Experimental h deblock filter 1\n"
                     +"v1	x1vdeblock				Experimental v deblock filter 1\n"
                     +"dr	dering					Deringing filter\n"
                     +"al	autolevels				automatic brightness / contrast\n"
                     +"			f	fullyrange	stretch luminance to (0..255)\n"
                     +"lb	linblenddeint				linear blend deinterlacer\n"
                     +"li	linipoldeint				linear interpolating deinterlace\n"
                     +"ci	cubicipoldeint				cubic interpolating deinterlacer\n"
                     +"md	mediandeint				median deinterlacer\n"
                     +"fd	ffmpegdeint				ffmpeg deinterlacer\n"
                     +"de	default					hb:a,vb:a,dr:a,al\n"
                     +"fa	fast					h1:a,v1:a,dr:a,al\n"
                     +"tn	tmpnoise	(3 Thresholds)		Temporal Noise Reducer\n"
                     +"			1. <= 2. <= 3.		larger -> stronger filtering\n"
                     +"fq	forceQuant	<quantizer>		Force quantizer\n"
                     +;
+                    +
                     +/**
                     + * Parses a -pp filter string into a newly allocated mode.
                     + *
                     + * The string is a list of filter tokens separated by ',' or '/'; each token is
                     + * a filter name (a leading '-' disables the filter) followed by ':'-separated
                     + * options. Names found in replaceTable are expanded in place before lookup in
                     + * the filters table.
                     + *
                     + * @param name    filter string; it is copied, the caller's buffer is not modified
                     + * @param quality value substituted for the "autoq" / "a" option
                     + * @return the mode (release with pp_free_mode()), or NULL if the allocation
                     + *         failed or the string contained unknown filters / options
                     + */
                     +pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
                     +{
                     +	/* one guard byte beyond GET_MODE_BUFFER_SIZE: after the last token p is
                     +	   advanced one byte past its terminator and that byte must be a readable 0 */
                     +	char temp[GET_MODE_BUFFER_SIZE+1];
                     +	char *p= temp;
                     +	char *filterDelimiters= ",/";
                     +	char *optionDelimiters= ":";
                     +	struct PPMode *ppMode;
                     +	char *filterToken;
+                    +
                     +	ppMode= memalign(8, sizeof(PPMode));
                     +	if(ppMode == NULL) return NULL; // out of memory
+                    +
                     +	ppMode->lumMode= 0;
                     +	ppMode->chromMode= 0;
                     +	ppMode->maxTmpNoise[0]= 700;
                     +	ppMode->maxTmpNoise[1]= 1500;
                     +	ppMode->maxTmpNoise[2]= 3000;
                     +	ppMode->maxAllowedY= 234;
                     +	ppMode->minAllowedY= 16;
                     +	ppMode->baseDcDiff= 256/4;
                     +	ppMode->flatnessThreshold= 56-16;
                     +	ppMode->maxClippedThreshold= 0.01;
                     +	ppMode->error=0;
+                    +
                     +	strncpy(temp, name, GET_MODE_BUFFER_SIZE);
                     +	// strncpy() leaves the buffer unterminated when name does not fit
                     +	temp[GET_MODE_BUFFER_SIZE-1]= 0;
                     +	temp[GET_MODE_BUFFER_SIZE]= 0; // guard byte, see the declaration of temp
+                    +
                     +	if(verbose>1) printf("pp: %s\n", name);
+                    +
                     +	for(;;){
                     +		char *filterName;
                     +		int q= 1000000; //PP_QUALITY_MAX;
                     +		int chrom=-1;
                     +		char *option;
                     +		char *options[OPTIONS_ARRAY_SIZE];
                     +		int i;
                     +		int filterNameOk=0;
                     +		int numOfUnknownOptions=0;
                     +		int enable=1; //does the user want us to enable or disable the filter
+                    +
                     +		filterToken= strtok(p, filterDelimiters);
                     +		if(filterToken == NULL) break;
                     +		p+= strlen(filterToken) + 1; // p points to next filterToken
                     +		filterName= strtok(filterToken, optionDelimiters);
                     +		if(filterName == NULL) // the token held option delimiters only, e.g. ":"
                     +		{
                     +			ppMode->error++;
                     +			break;
                     +		}
                     +		if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
+                    +
                     +		if(*filterName == '-')
                     +		{
                     +			enable=0;
                     +			filterName++;
                     +		}
+                    +
                     +		for(;;){ //for all options
                     +			option= strtok(NULL, optionDelimiters);
                     +			if(option == NULL) break;
+                    +
                     +			if(verbose>1) printf("pp: option: %s\n", option);
                     +			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
                     +			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
                     +			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
                     +			else
                     +			{
                     +				// not a generic option: keep it for the filter specific parsing below
                     +				options[numOfUnknownOptions] = option;
                     +				numOfUnknownOptions++;
                     +			}
                     +			// leave room for the NULL terminator of options[]
                     +			if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
                     +		}
                     +		options[numOfUnknownOptions] = NULL;
+                    +
                     +		/* replace stuff from the replace Table */
                     +		for(i=0; replaceTable[2*i]!=NULL; i++)
                     +		{
                     +			if(!strcmp(replaceTable[2*i], filterName))
                     +			{
                     +				int newlen= strlen(replaceTable[2*i + 1]);
                     +				int plen;
                     +				int spaceLeft;
+                    +
                     +				if(p==NULL) p= temp, *p=0; 	//last filter
                     +				else p--, *p=',';		//not last filter
+                    +
                     +				plen= strlen(p);
                     +				spaceLeft= p - temp + plen;
                     +				if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
                     +				{
                     +					ppMode->error++;
                     +					break;
                     +				}
                     +				// insert the replacement in front of the not yet parsed rest
                     +				memmove(p + newlen, p, plen+1);
                     +				memcpy(p, replaceTable[2*i + 1], newlen);
                     +				filterNameOk=1;
                     +			}
                     +		}
+                    +
                     +		for(i=0; filters[i].shortName!=NULL; i++)
                     +		{
                     +//			printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
                     +			if(   !strcmp(filters[i].longName, filterName)
                     +			   || !strcmp(filters[i].shortName, filterName))
                     +			{
                     +				// clear first so that a later token overrides an earlier one
                     +				ppMode->lumMode &= ~filters[i].mask;
                     +				ppMode->chromMode &= ~filters[i].mask;
+                    +
                     +				filterNameOk=1;
                     +				if(!enable) break; // user wants to disable it
+                    +
                     +				if(q >= filters[i].minLumQuality)
                     +					ppMode->lumMode|= filters[i].mask;
                     +				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
                     +					if(q >= filters[i].minChromQuality)
                     +						ppMode->chromMode|= filters[i].mask;
+                    +
                     +				if(filters[i].mask == LEVEL_FIX)
                     +				{
                     +					int o;
                     +					ppMode->minAllowedY= 16;
                     +					ppMode->maxAllowedY= 234;
                     +					for(o=0; options[o]!=NULL; o++)
                     +					{
                     +						if(  !strcmp(options[o],"fullyrange")
                     +						   ||!strcmp(options[o],"f"))
                     +						{
                     +							ppMode->minAllowedY= 0;
                     +							ppMode->maxAllowedY= 255;
                     +							numOfUnknownOptions--;
                     +						}
                     +					}
                     +				}
                     +				else if(filters[i].mask == TEMP_NOISE_FILTER)
                     +				{
                     +					int o;
                     +					int numOfNoises=0;
+                    +
                     +					for(o=0; options[o]!=NULL; o++)
                     +					{
                     +						char *tail;
                     +						ppMode->maxTmpNoise[numOfNoises]=
                     +							strtol(options[o], &tail, 0);
                     +						if(tail!=options[o])
                     +						{
                     +							numOfNoises++;
                     +							numOfUnknownOptions--;
                     +							if(numOfNoises >= 3) break;
                     +						}
                     +					}
                     +				}
                     +				else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
                     +				{
                     +					int o;
+                    +
                     +					for(o=0; options[o]!=NULL && o<2; o++)
                     +					{
                     +						char *tail;
                     +						int val= strtol(options[o], &tail, 0);
                     +						if(tail==options[o]) break;
+                    +
                     +						numOfUnknownOptions--;
                     +						if(o==0) ppMode->baseDcDiff= val;
                     +						else ppMode->flatnessThreshold= val;
                     +					}
                     +				}
                     +				else if(filters[i].mask == FORCE_QUANT)
                     +				{
                     +					int o;
                     +					ppMode->forcedQuant= 15;
+                    +
                     +					for(o=0; options[o]!=NULL && o<1; o++)
                     +					{
                     +						char *tail;
                     +						int val= strtol(options[o], &tail, 0);
                     +						if(tail==options[o]) break;
+                    +
                     +						numOfUnknownOptions--;
                     +						ppMode->forcedQuant= val;
                     +					}
                     +				}
                     +			}
                     +		}
                     +		if(!filterNameOk) ppMode->error++;
                     +		// options nobody consumed count as errors
                     +		ppMode->error += numOfUnknownOptions;
                     +	}
+                    +
                     +	if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
                     +	if(ppMode->error)
                     +	{
                     +		fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
                     +		free(ppMode);
                     +		return NULL;
                     +	}
                     +	return ppMode;
                     +}
+                    +
                     +/**
                     + * Releases a mode obtained from pp_get_mode_by_name_and_quality().
                     + * Passing NULL is allowed and does nothing.
                     + */
                     +void pp_free_mode(pp_mode_t *mode){
                     +    free(mode); // free(NULL) is a no-op, so no explicit test is needed
                     +}
+                    +
                     +/**
                     + * Frees *p and replaces it with a new zero-filled block of size bytes
                     + * obtained from memalign(alignment, size). The old contents are not kept.
                     + * If the allocation fails *p is left NULL.
                     + */
                     +static void reallocAlign(void **p, int alignment, int size){
                     +	free(*p); // free(NULL) is a no-op
                     +	*p= memalign(alignment, size);
                     +	if(*p) memset(*p, 0, size); // never memset() a failed allocation
                     +}
+                    +
                     +/**
                     + * (Re)allocates every work buffer of the context for the given picture size
                     + * and stride and records the stride in c->stride. All buffers come back
                     + * zero-filled except the luma histogram, which is preset to a flat value
                     + * derived from the picture area.
                     + */
                     +static void reallocBuffers(PPContext *c, int width, int height, int stride){
                     +	int mbWidth = (width+15)>>4;
                     +	int mbHeight= (height+15)>>4;
                     +	int i;
+                    +
                     +	c->stride= stride;
+                    +
                     +	reallocAlign((void **)&c->tempDst, 8, stride*24);
                     +	reallocAlign((void **)&c->tempSrc, 8, stride*24);
                     +	reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
                     +	reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
                     +	if(c->yHistogram) // do not write through the pointer if the allocation failed
                     +		for(i=0; i<256; i++)
                     +			c->yHistogram[i]= width*height/64*15/256;
+                    +
                     +	for(i=0; i<3; i++)
                     +	{
                     +		//Note: the +17*1024 is just there so I don't have to worry about r/w over the end
                     +		reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
                     +		reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
                     +	}
+                    +
                     +	reallocAlign((void **)&c->deintTemp, 8, width+16);
                     +	reallocAlign((void **)&c->nonBQPTable, 8, mbWidth*mbHeight*sizeof(QP_STORE_T));
                     +	// a single row of macroblocks: it is used with a QP stride of 0
                     +	reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
                     +}
+                    +
                     +/**
                     + * Allocates and initialises a post-processing context.
                     + *
                     + * @param width   picture width, also used to guess the initial buffer stride
                     + * @param height  picture height
                     + * @param cpuCaps PP_CPU_CAPS_* flags, optionally combined with a PP_FORMAT_*
                     + *                value; when PP_FORMAT is set bits 0-1 give the horizontal and
                     + *                bits 4-5 the vertical chroma shift, otherwise both are 1
                     + * @return the context (release with pp_free_context()), or NULL if the
                     + *         context itself could not be allocated
                     + */
                     +pp_context_t *pp_get_context(int width, int height, int cpuCaps){
                     +	PPContext *c= memalign(32, sizeof(PPContext));
                     +	int stride= (width+15)&(~15); //assumed / will realloc if needed
+                    +
                     +	if(c == NULL) return NULL; // out of memory
+                    +
                     +	memset(c, 0, sizeof(PPContext));
                     +	c->cpuCaps= cpuCaps;
                     +	if(cpuCaps&PP_FORMAT){
                     +		c->hChromaSubSample= cpuCaps&0x3;
                     +		c->vChromaSubSample= (cpuCaps>>4)&0x3;
                     +	}else{
                     +		c->hChromaSubSample= 1;
                     +		c->vChromaSubSample= 1;
                     +	}
+                    +
                     +	reallocBuffers(c, width, height, stride);
+                    +
                     +	c->frameNum=-1;
+                    +
                     +	return c;
                     +}
+                    +
                     +/**
                     + * Releases a context created by pp_get_context() together with all of its
                     + * work buffers. Passing NULL is allowed and does nothing, matching
                     + * pp_free_mode().
                     + */
                     +void pp_free_context(void *vc){
                     +	PPContext *c = (PPContext*)vc;
                     +	int i;
+                    +
                     +	if(c == NULL) return; // nothing to release
+                    +
                     +	for(i=0; i<3; i++) free(c->tempBlured[i]);
                     +	for(i=0; i<3; i++) free(c->tempBluredPast[i]);
+                    +
                     +	free(c->tempBlocks);
                     +	free(c->yHistogram);
                     +	free(c->tempDst);
                     +	free(c->tempSrc);
                     +	free(c->deintTemp);
                     +	free(c->nonBQPTable);
                     +	free(c->forcedQPTable);
+                    +
                     +	// wipe the now dangling buffer pointers before the struct itself goes away
                     +	memset(c, 0, sizeof(PPContext));
+                    +
                     +	free(c);
                     +}
+                    +
                     +/**
                     + * Post-processes one picture: the luma plane always goes through
                     + * postProcess(); the two chroma planes are filtered when mode->chromMode is
                     + * non-zero and plainly copied from src to dst otherwise.
                     + *
                     + * src / dst and srcStride / dstStride hold the three planes and their line
                     + * sizes, width / height are the luma dimensions. QP_store may be NULL.
                     + * NOTE(review): pict_type 3 is presumably a B picture (see nonBQPTable) -
                     + * confirm against the caller.
                     + */
                     +void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                     +                 uint8_t * dst[3], int dstStride[3],
                     +                 int width, int height,
                     +                 QP_STORE_T *QP_store,  int QPStride,
                     +		 pp_mode_t *vm,  void *vc, int pict_type)
                     +{
                     +	int mbWidth = (width+15)>>4;
                     +	int mbHeight= (height+15)>>4;
                     +	PPMode *mode = (PPMode*)vm;
                     +	PPContext *c = (PPContext*)vc;
                     +        int minStride= MAX(srcStride[0], dstStride[0]);
+                    +
                     +	// the work buffers were sized for c->stride; grow them for a larger stride
                     +	if(c->stride < minStride)
                     +		reallocBuffers(c, width, height, minStride);
+                    +
                     +	// without a QP table, or with a forced quantizer, use the context's one-row
                     +	// table; QPStride 0 makes every macroblock row read that same row
                     +	if(QP_store==NULL || (mode->lumMode & FORCE_QUANT))
                     +	{
                     +		int i;
                     +		QP_store= c->forcedQPTable;
                     +		QPStride= 0;
                     +		if(mode->lumMode & FORCE_QUANT)
                     +			for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
                     +		else
                     +			for(i=0; i<mbWidth; i++) QP_store[i]= 1;
                     +	}
                     +// disabled debug code: dumps the QP table
                     +if(0){
                     +int x,y;
                     +for(y=0; y<mbHeight; y++){
                     +	for(x=0; x<mbWidth; x++){
                     +		printf("%2d ", QP_store[x + y*QPStride]);
                     +	}
                     +	printf("\n");
                     +}
                     +	printf("\n");
                     +}
                     +//printf("pict_type:%d\n", pict_type);
+                    +
                     +	// remember the QP values (limited to 31) of every picture whose type is not 3
                     +	if(pict_type!=3)
                     +	{
                     +		int x,y;
                     +		for(y=0; y<mbHeight; y++){
                     +			for(x=0; x<mbWidth; x++){
                     +				int qscale= QP_store[x + y*QPStride];
                     +				if(qscale&~31)
                     +				    qscale=31;
                     +				c->nonBQPTable[y*mbWidth + x]= qscale;
                     +			}
                     +		}
                     +	}
+                    +
                     +	if(verbose>2)
                     +	{
                     +		printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
                     +	}
+                    +
                     +	// luma plane
                     +	postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                     +		width, height, QP_store, QPStride, 0, mode, c);
+                    +
                     +	// chroma plane dimensions
                     +	width  = (width )>>c->hChromaSubSample;
                     +	height = (height)>>c->vChromaSubSample;
+                    +
                     +	if(mode->chromMode)
                     +	{
                     +		postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                     +			width, height, QP_store, QPStride, 1, mode, c);
                     +		postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                     +			width, height, QP_store, QPStride, 2, mode, c);
                     +	}
                     +	else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
                     +	{
                     +		// equal strides: each chroma plane is copied in one go
                     +		memcpy(dst[1], src[1], srcStride[1]*height);
                     +		memcpy(dst[2], src[2], srcStride[2]*height);
                     +	}
                     +	else
                     +	{
                     +		// different strides: copy line by line
                     +		int y;
                     +		for(y=0; y<height; y++)
                     +		{
                     +			memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
                     +			memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
                     +		}
                     +	}
                     +}
+                    +

libavcodec/libpostproc/postprocess.h

History View file @ bba9b16

                     new file mode 100644
@@ -0,0 +1,73 @@
                     +/*
                     +    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
+                    +
                     +    This program is free software; you can redistribute it and/or modify
                     +    it under the terms of the GNU General Public License as published by
                     +    the Free Software Foundation; either version 2 of the License, or
                     +    (at your option) any later version.
+                    +
                     +    This program is distributed in the hope that it will be useful,
                     +    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     +    GNU General Public License for more details.
+                    +
                     +    You should have received a copy of the GNU General Public License
                     +    along with this program; if not, write to the Free Software
                     +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     +*/
+                    +
                     +#ifndef NEWPOSTPROCESS_H
                     +#define NEWPOSTPROCESS_H
+                    +
                     +/**
                     + * @file postprocess.h
                     + * @brief
                     + *     external api for the pp stuff
                     + */
+                    +
                     +#ifdef __cplusplus
                     +extern "C" {
                     +#endif
+                    +
                     +// upper bound of the quality argument of pp_get_mode_by_name_and_quality()
                     +#define PP_QUALITY_MAX 6
+                    +
                     +// element type of the quantizer table passed to pp_postprocess()
                     +#define QP_STORE_T int8_t
+                    +
                     +// opaque handles, only created and interpreted by the pp_* functions
                     +typedef void pp_context_t;
                     +typedef void pp_mode_t;
+                    +
                     +extern char *pp_help; //a simple help text
+                    +
                     +/**
                     + * post-processes one picture given as three planes (src -> dst)
                     + * horizontalSize / verticalSize are the dimensions of the first plane
                     + * QP_store may be NULL
                     + */
                     +void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                     +                 uint8_t * dst[3], int dstStride[3],
                     +                 int horizontalSize, int verticalSize,
                     +                 QP_STORE_T *QP_store,  int QP_stride,
                     +		 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);
+                    +
+                    +
                     +/**
                     + * returns a pp_mode_t or NULL if an error occurred
                     + * name is the string after "-pp" on the command line
                     + * quality is a number from 0 to PP_QUALITY_MAX
                     + */
                     +pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality);
                     +void pp_free_mode(pp_mode_t *mode);
+                    +
                     +/**
                     + * flags is a combination of PP_CPU_CAPS_* and, optionally, one PP_FORMAT_* value
                     + */
                     +pp_context_t *pp_get_context(int width, int height, int flags);
                     +void pp_free_context(pp_context_t *ppContext);
+                    +
                     +// cpu capabilities, used to choose the implementation when it is selected at run time
                     +#define PP_CPU_CAPS_MMX   0x80000000
                     +#define PP_CPU_CAPS_MMX2  0x20000000
                     +#define PP_CPU_CAPS_3DNOW 0x40000000
+                    +
                     +// picture format: when PP_FORMAT is set, bits 0-1 are the horizontal and
                     +// bits 4-5 the vertical chroma shift; without it both shifts default to 1
                     +#define PP_FORMAT         0x00000008
                     +#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
                     +#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
                     +#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
                     +#define PP_FORMAT_444    (0x00000000|PP_FORMAT)
+                    +
                     +#ifdef __cplusplus
                     +}
                     +#endif
+                    +
                     +#endif

libavcodec/libpostproc/postprocess_internal.h

History View file @ bba9b16

                     new file mode 100644
@@ -0,0 +1,128 @@
                     +/*
                     +    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
+                    +
                     +    This program is free software; you can redistribute it and/or modify
                     +    it under the terms of the GNU General Public License as published by
                     +    the Free Software Foundation; either version 2 of the License, or
                     +    (at your option) any later version.
+                    +
                     +    This program is distributed in the hope that it will be useful,
                     +    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     +    GNU General Public License for more details.
+                    +
                     +    You should have received a copy of the GNU General Public License
                     +    along with this program; if not, write to the Free Software
                     +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     +*/
+                    +
                     +// Filter bits; the parser stores them in PPMode.lumMode / PPMode.chromMode
                     +#define V_DEBLOCK	0x01
                     +#define H_DEBLOCK	0x02
                     +#define DERING		0x04
                     +#define LEVEL_FIX	0x08 /* Brightness & Contrast */
+                    +
                     +#define LUM_V_DEBLOCK	V_DEBLOCK		//   1
                     +#define LUM_H_DEBLOCK	H_DEBLOCK		//   2
                     +#define CHROM_V_DEBLOCK	(V_DEBLOCK<<4)		//  16
                     +#define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)		//  32
                     +#define LUM_DERING	DERING			//   4
                     +#define CHROM_DERING	(DERING<<4)		//  64
                     +#define LUM_LEVEL_FIX	LEVEL_FIX		//   8
                     +#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)		// 128 (not implemented yet)
+                    +
                     +// Experimental vertical filters
                     +#define V_X1_FILTER	0x0200			// 512
+                    +
                     +// Experimental horizontal filters
                     +#define H_X1_FILTER	0x2000			// 8192
+                    +
                     +// select between full y range (255-0) or standard one (234-16)
                     +#define FULL_Y_RANGE	0x8000			// 32768
+                    +
                     +//Deinterlacing Filters
                     +#define	LINEAR_IPOL_DEINT_FILTER	0x10000	// 65536
                     +#define	LINEAR_BLEND_DEINT_FILTER	0x20000	// 131072
                     +// NOTE(review): same value as FULL_Y_RANGE; give it a free bit before implementing it
                     +#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet)
                     +#define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144
                     +#define	MEDIAN_DEINT_FILTER		0x80000	// 524288
                     +#define	FFMPEG_DEINT_FILTER		0x400000
+                    +
                     +#define TEMP_NOISE_FILTER		0x100000
                     +#define FORCE_QUANT			0x200000
+                    +
                     +//use this if you want faster postprocessing code
                     +//it cannot differentiate between chroma & luma filters (both on or both off)
                     +//obviously the -pp option on the command line then has no effect except turning
                     +//the filters selected here on
                     +//#define COMPILE_TIME_MODE 0x77
+                    +
                     +// One entry of the table of filter names accepted in a -pp string
                     +struct PPFilter{
                     +	char *shortName;	// short form of the filter name
                     +	char *longName;		// long form of the filter name
                     +	int chromDefault; 	// is chrominance filtering on by default if this filter is manually activated
                     +	int minLumQuality; 	// minimum quality to turn luminance filtering on
                     +	int minChromQuality;	// minimum quality to turn chrominance filtering on
                     +	int mask; 		// Bitmask to turn this filter on
                     +};
+                    +
                     +// Result of parsing a -pp string: the enabled filters and their parameters
                     +typedef struct PPMode{
                     +	int lumMode; 			// activates filters for luminance
                     +	int chromMode; 			// activates filters for chrominance
                     +	int error; 			// non zero on error
+                    +
                     +	int minAllowedY; 		// for brightness correction
                     +	int maxAllowedY; 		// for brightness correction
                     +	float maxClippedThreshold;	// amount of "black" you are willing to lose to get a brightness corrected picture
+                    +
                     +	int maxTmpNoise[3]; 		// for Temporal Noise Reducing filter (Maximal sum of abs differences)
+                    +
                     +	int baseDcDiff;			// first deblocking option ("difference factor" in the help text)
                     +	int flatnessThreshold;		// second deblocking option
+                    +
                     +	int forcedQuant; 		// quantizer if FORCE_QUANT is used
                     +} PPMode;
+                    +
                     +// State kept between calls: work buffers, per-picture tables and the settings
                     +// given to pp_get_context()
                     +typedef struct PPContext{
                     +	uint8_t *tempBlocks; //used for the horizontal code
+                    +
                     +	/* we need 64 bits here, otherwise we are going to have a problem
                     +	   after watching a black picture for 5 hours*/
                     +	uint64_t *yHistogram;
+                    +
                     +	uint64_t __attribute__((aligned(8))) packedYOffset;
                     +	uint64_t __attribute__((aligned(8))) packedYScale;
+                    +
                     +	/* Temporal noise reducing buffers */
                     +	uint8_t *tempBlured[3];
                     +	int32_t *tempBluredPast[3];
+                    +
                     +	/* Temporary buffers for handling the last row(s) */
                     +	uint8_t *tempDst;
                     +	uint8_t *tempSrc;
+                    +
                     +	uint8_t *deintTemp;
+                    +
                     +	uint64_t __attribute__((aligned(8))) pQPb;
                     +	uint64_t __attribute__((aligned(8))) pQPb2;
+                    +
                     +	uint64_t __attribute__((aligned(8))) mmxDcOffset[32];
                     +	uint64_t __attribute__((aligned(8))) mmxDcThreshold[32];
+                    +
                     +	QP_STORE_T *nonBQPTable;	// QP values (limited to 31) saved from pictures whose pict_type is not 3
                     +	QP_STORE_T *forcedQPTable;	// one row of QP values, used when no table is given or a quantizer is forced
+                    +
                     +	int QP;
                     +	int nonBQP;
+                    +
                     +	int frameNum;
+                    +
                     +	int cpuCaps;
+                    +
                     +	int stride; //size of some buffers (needed to realloc them if needed)
+                    +
                     +	int hChromaSubSample;	// right shift applied to the width for the chroma planes
                     +	int vChromaSubSample;	// right shift applied to the height for the chroma planes
+                    +
                     +	PPMode ppMode;	// copy of the caller's mode, refreshed on every postProcess() call
                     +} PPContext;
+                    +

libavcodec/libpostproc/postprocess_template.c

History View file @ bba9b16

                     new file mode 100644
@@ -0,0 +1,3127 @@
                     +/*
                     +    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
+                    +
                     +    This program is free software; you can redistribute it and/or modify
                     +    it under the terms of the GNU General Public License as published by
                     +    the Free Software Foundation; either version 2 of the License, or
                     +    (at your option) any later version.
+                    +
                     +    This program is distributed in the hope that it will be useful,
                     +    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     +    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     +    GNU General Public License for more details.
+                    +
                     +    You should have received a copy of the GNU General Public License
                     +    along with this program; if not, write to the Free Software
                     +    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     +*/
+                    +
                     +#undef PAVGB // drop any earlier definitions; the macros are redefined below from the HAVE_* defines
                     +#undef PMINUB
                     +#undef PMAXUB
+                    +
                     +#ifdef HAVE_MMX2 // PAVGB(a,b): asm text for a packed byte average of a and b, result in b
                     +#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
                     +#elif defined (HAVE_3DNOW) // 3DNow! spelling of the same operation
                     +#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
                     +#endif // PAVGB stays undefined when neither HAVE_MMX2 nor HAVE_3DNOW is set
+                    +
                     +#ifdef HAVE_MMX2 // PMINUB(a,b,t): b = min(a,b) on unsigned bytes; t is unused here
                     +#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
                     +#elif defined (HAVE_MMX) // emulation: parameters are named (b,a,t) so the result still lands in the 2nd argument; t = a -sat b, a -= t; t is clobbered
                     +#define PMINUB(b,a,t) \
                     +	"movq " #a ", " #t " \n\t"\
                     +	"psubusb " #b ", " #t " \n\t"\
                     +	"psubb " #t ", " #a " \n\t"
                     +#endif
+                    +
                     +#ifdef HAVE_MMX2 // PMAXUB(a,b): b = max(a,b) on unsigned bytes
                     +#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
                     +#elif defined (HAVE_MMX) // emulation: b = (b -sat a) + a
                     +#define PMAXUB(a,b) \
                     +	"psubusb " #a ", " #b " \n\t"\
                     +	"paddb " #a ", " #b " \n\t"
                     +#endif
+                    +
+                    +
                     +//FIXME? |255-0| = 1 (shouldnt be a problem ...)
                     +#ifdef HAVE_MMX
                     +/**
                     + * Check if the middle 8x8 Block in the given 8x16 block is flat
                     + *
                     + * src is first advanced by 4 lines. For each of the 7 pairs of vertically
                     + * adjacent lines (0/1 ... 6/7) every byte of
                     + * (upper - lower + c->mmxDcOffset[c->nonBQP]) is compared (signed, pcmpgtb)
                     + * against c->mmxDcThreshold[c->nonBQP]; each byte that compares greater adds -1
                     + * to a per-column byte counter in mm0. The 8 counters are then summed and the
                     + * low byte of the sum is negated, which gives the number of matching bytes
                     + * (at most 7*8 = 56).
                     + *
                     + * Returns nonzero if that number is larger than c->ppMode.flatnessThreshold.
                     + */
                     +static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
                     +	int numEq= 0;
                     +	src+= stride*4; // src points to begin of the 8x8 Block
                     +asm volatile(
                     +		"leal (%1, %2), %%eax				\n\t" // eax = line 1
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%1	eax	eax+%2	eax+2%2	%1+4%2	ecx	ecx+%2	ecx+2%2	%1+8%2	ecx+4%2
                     +		"movq %3, %%mm7					\n\t" // mm7 = offset
                     +		"movq %4, %%mm6					\n\t" // mm6 = threshold
+                    +
                     +		"movq (%1), %%mm0				\n\t"
                     +		"movq (%%eax), %%mm1				\n\t"
                     +		"psubb %%mm1, %%mm0				\n\t" // mm0 = difference
                     +		"paddb %%mm7, %%mm0				\n\t"
                     +		"pcmpgtb %%mm6, %%mm0				\n\t" // mm0 = (line0 - line1 + offset > threshold) ? -1 : 0
+                    +
                     +		"movq (%%eax,%2), %%mm2				\n\t"
                     +		"psubb %%mm2, %%mm1				\n\t"
                     +		"paddb %%mm7, %%mm1				\n\t"
                     +		"pcmpgtb %%mm6, %%mm1				\n\t"
                     +		"paddb %%mm1, %%mm0				\n\t" // accumulate the -1s per column
+                    +
                     +		"movq (%%eax, %2, 2), %%mm1			\n\t"
                     +		"psubb %%mm1, %%mm2				\n\t"
                     +		"paddb %%mm7, %%mm2				\n\t"
                     +		"pcmpgtb %%mm6, %%mm2				\n\t"
                     +		"paddb %%mm2, %%mm0				\n\t"
+                    +
                     +		"leal (%%eax, %2, 4), %%eax			\n\t" // eax now points to line 5 (the "ecx" column of the table above)
+                    +
                     +		"movq (%1, %2, 4), %%mm2			\n\t"
                     +		"psubb %%mm2, %%mm1				\n\t"
                     +		"paddb %%mm7, %%mm1				\n\t"
                     +		"pcmpgtb %%mm6, %%mm1				\n\t"
                     +		"paddb %%mm1, %%mm0				\n\t"
+                    +
                     +		"movq (%%eax), %%mm1				\n\t"
                     +		"psubb %%mm1, %%mm2				\n\t"
                     +		"paddb %%mm7, %%mm2				\n\t"
                     +		"pcmpgtb %%mm6, %%mm2				\n\t"
                     +		"paddb %%mm2, %%mm0				\n\t"
+                    +
                     +		"movq (%%eax, %2), %%mm2			\n\t"
                     +		"psubb %%mm2, %%mm1				\n\t"
                     +		"paddb %%mm7, %%mm1				\n\t"
                     +		"pcmpgtb %%mm6, %%mm1				\n\t"
                     +		"paddb %%mm1, %%mm0				\n\t"
+                    +
                     +		"movq (%%eax, %2, 2), %%mm1			\n\t"
                     +		"psubb %%mm1, %%mm2				\n\t"
                     +		"paddb %%mm7, %%mm2				\n\t"
                     +		"pcmpgtb %%mm6, %%mm2				\n\t"
                     +		"paddb %%mm2, %%mm0				\n\t"
+                    +
                     +		"						\n\t"
                     +#ifdef HAVE_MMX2
                     +		"pxor %%mm7, %%mm7				\n\t"
                     +		"psadbw %%mm7, %%mm0				\n\t" // sum of the 8 counter bytes
                     +#else // no psadbw: fold the 8 counter bytes into the lowest byte with shifts and byte adds
                     +		"movq %%mm0, %%mm1				\n\t"
                     +		"psrlw $8, %%mm0				\n\t"
                     +		"paddb %%mm1, %%mm0				\n\t"
                     +		"movq %%mm0, %%mm1				\n\t"
                     +		"psrlq $16, %%mm0				\n\t"
                     +		"paddb %%mm1, %%mm0				\n\t"
                     +		"movq %%mm0, %%mm1				\n\t"
                     +		"psrlq $32, %%mm0				\n\t"
                     +		"paddb %%mm1, %%mm0				\n\t"
                     +#endif
                     +		"movd %%mm0, %0					\n\t"
                     +		: "=r" (numEq)
                     +		: "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
                     +		: "%eax"
                     +		);
                     +	numEq= (-numEq) &0xFF; // counters hold -count; only the low byte of the sum is meaningful
                     +	return numEq > c->ppMode.flatnessThreshold;
                     +}
                     +#endif
+                    +
                     +static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
                     +{	/*
                     +	 * After advancing src by 3 lines, compare the line at src+stride with the
                     +	 * line at src+8*stride over 8 columns (BLOCK_SIZE in the C version).
                     +	 * Returns 1 if in every column the absolute difference of the two pixels
                     +	 * is at most 2*QP, 0 otherwise.
                     +	 * The MMX version takes QP from c->pQPb (loaded as "QP,..., QP"), the C
                     +	 * version from c->QP.
                     +	 * The compiled-out second C variant instead checks max-min over 8 lines
                     +	 * starting at src+4*stride.
                     +	 */
                     +#ifdef HAVE_MMX
                     +	int isOk;
                     +	src+= stride*3;
                     +	asm volatile(
                     +		"movq (%1, %2), %%mm0				\n\t"
                     +		"movq (%1, %2, 8), %%mm1			\n\t"
                     +		"movq %%mm0, %%mm2				\n\t"
                     +		"psubusb %%mm1, %%mm0				\n\t"
                     +		"psubusb %%mm2, %%mm1				\n\t"
                     +		"por %%mm1, %%mm0				\n\t" // ABS Diff
+                    +
                     +		"movq %3, %%mm7					\n\t" // QP,..., QP
                     +		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
                     +		"psubusb %%mm7, %%mm0				\n\t" // Diff <= 2QP -> 0
                     +		"packssdw %%mm0, %%mm0				\n\t" // fold the 64 bits into 32 so that movd sees all columns
                     +		"movd %%mm0, %0					\n\t"
                     +		: "=r" (isOk)
                     +		: "r" (src), "r" (stride), "m" (c->pQPb)
                     +		);
                     +	return isOk==0;
                     +#else
                     +#if 1
                     +	int x;
                     +	const int QP= c->QP;
                     +	src+= stride*3;
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; // single unsigned compare for -2QP <= diff <= 2QP
                     +	}
+                    +
                     +	return 1;
                     +#else
                     +	int x;
                     +	const int QP= c->QP;
                     +	src+= stride*4;
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		int min=255;
                     +		int max=0;
                     +		int y;
                     +		for(y=0; y<8; y++){
                     +			int v= src[x + y*stride];
                     +			if(v>max) max=v;
                     +			if(v<min) min=v;
                     +		}
                     +		if(max-min > 2*QP) return 0;
                     +	}
                     +	return 1;
                     +#endif
                     +#endif
                     +}
+                    +
                     +/**
                     + * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
                     + * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
                     + *
                     + * src is advanced by 3 lines; lines 0..9 relative to that position are read and
                     + * lines 1..8 are rewritten. Beyond the block the filter is fed with a "first" and
                     + * a "last" value: the outer line (0 resp. 9) if it is close to its inner
                     + * neighbour (1 resp. 8), else the inner neighbour itself.
                     + * NOTE(review): "close" is diff <= QP in the MMX2/3DNow version (QP taken from
                     + * c->pQPb) but diff < QP in the C version (QP taken from c->QP).
                     + */
                     +static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= stride*3;
                     +	asm volatile(	//"movv %0 %1 %2\n\t"
                     +		"movq %2, %%mm0			\n\t"  // QP,..., QP
                     +		"pxor %%mm4, %%mm4				\n\t"
+                    +
                     +		"movq (%0), %%mm6				\n\t"
                     +		"movq (%0, %1), %%mm5				\n\t"
                     +		"movq %%mm5, %%mm1				\n\t"
                     +		"movq %%mm6, %%mm2				\n\t"
                     +		"psubusb %%mm6, %%mm5				\n\t"
                     +		"psubusb %%mm1, %%mm2				\n\t"
                     +		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
                     +		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
                     +		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
+                    +
                     +		"pand %%mm2, %%mm6				\n\t"
                     +		"pandn %%mm1, %%mm2				\n\t"
                     +		"por %%mm2, %%mm6				\n\t"// First Line to Filter
+                    +
                     +		"movq (%0, %1, 8), %%mm5			\n\t"
                     +		"leal (%0, %1, 4), %%eax			\n\t"
                     +		"leal (%0, %1, 8), %%ecx			\n\t"
                     +		"subl %1, %%ecx					\n\t"
                     +		"addl %1, %0					\n\t" // %0 points to line 1 not 0
                     +		"movq (%0, %1, 8), %%mm7			\n\t"
                     +		"movq %%mm5, %%mm1				\n\t"
                     +		"movq %%mm7, %%mm2				\n\t"
                     +		"psubusb %%mm7, %%mm5				\n\t"
                     +		"psubusb %%mm1, %%mm2				\n\t"
                     +		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
                     +		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
                     +		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
+                    +
                     +		"pand %%mm2, %%mm7				\n\t"
                     +		"pandn %%mm1, %%mm2				\n\t"
                     +		"por %%mm2, %%mm7				\n\t" // Last Line to Filter
+                    +
+                    +
                     +		// 	1	2	3	4	5	6	7	8
                     +		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ecx	eax+4%1
                     +		// 6 4 2 2 1 1
                     +		// 6 4 4 2
                     +		// 6 8 2
+                    +
                     +		"movq (%0, %1), %%mm0				\n\t" //  1
                     +		"movq %%mm0, %%mm1				\n\t" //  1
                     +		PAVGB(%%mm6, %%mm0)				      //1 1	/2
                     +		PAVGB(%%mm6, %%mm0)				      //3 1	/4
+                    +
                     +		"movq (%0, %1, 4), %%mm2			\n\t" //     1
                     +		"movq %%mm2, %%mm5				\n\t" //     1
                     +		PAVGB((%%eax), %%mm2)				      //    11	/2
                     +		PAVGB((%0, %1, 2), %%mm2)			      //   211	/4
                     +		"movq %%mm2, %%mm3				\n\t" //   211	/4
                     +		"movq (%0), %%mm4				\n\t" // 1
                     +		PAVGB(%%mm4, %%mm3)				      // 4 211	/8
                     +		PAVGB(%%mm0, %%mm3)				      //642211	/16
                     +		"movq %%mm3, (%0)				\n\t" // X
                     +		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
                     +		"movq %%mm1, %%mm0				\n\t" //  1
                     +		PAVGB(%%mm6, %%mm0)				      //1 1	/2
                     +		"movq %%mm4, %%mm3				\n\t" // 1
                     +		PAVGB((%0,%1,2), %%mm3)				      // 1 1	/2
                     +		PAVGB((%%eax,%1,2), %%mm5)			      //     11	/2
                     +		PAVGB((%%eax), %%mm5)				      //    211 /4
                     +		PAVGB(%%mm5, %%mm3)				      // 2 2211 /8
                     +		PAVGB(%%mm0, %%mm3)				      //4242211 /16
                     +		"movq %%mm3, (%0,%1)				\n\t" //  X
                     +		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
                     +		PAVGB(%%mm4, %%mm6)				      //11	/2
                     +		"movq (%%ecx), %%mm0				\n\t" //       1
                     +		PAVGB((%%eax, %1, 2), %%mm0)			      //      11/2
                     +		"movq %%mm0, %%mm3				\n\t" //      11/2
                     +		PAVGB(%%mm1, %%mm0)				      //  2   11/4
                     +		PAVGB(%%mm6, %%mm0)				      //222   11/8
                     +		PAVGB(%%mm2, %%mm0)				      //22242211/16
                     +		"movq (%0, %1, 2), %%mm2			\n\t" //   1
                     +		"movq %%mm0, (%0, %1, 2)			\n\t" //   X
                     +		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
                     +		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
                     +		PAVGB((%%ecx), %%mm0)				      //       11	/2
                     +		PAVGB(%%mm0, %%mm6)				      //11     11	/4
                     +		PAVGB(%%mm1, %%mm4)				      // 11		/2
                     +		PAVGB(%%mm2, %%mm1)				      //  11		/2
                     +		PAVGB(%%mm1, %%mm6)				      //1122   11	/8
                     +		PAVGB(%%mm5, %%mm6)				      //112242211	/16
                     +		"movq (%%eax), %%mm5				\n\t" //    1
                     +		"movq %%mm6, (%%eax)				\n\t" //    X
                     +		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
                     +		"movq (%%eax, %1, 4), %%mm6			\n\t" //        1
                     +		PAVGB(%%mm7, %%mm6)				      //        11	/2
                     +		PAVGB(%%mm4, %%mm6)				      // 11     11	/4
                     +		PAVGB(%%mm3, %%mm6)				      // 11   2211	/8
                     +		PAVGB(%%mm5, %%mm2)				      //   11		/2
                     +		"movq (%0, %1, 4), %%mm4			\n\t" //     1
                     +		PAVGB(%%mm4, %%mm2)				      //   112		/4
                     +		PAVGB(%%mm2, %%mm6)				      // 112242211	/16
                     +		"movq %%mm6, (%0, %1, 4)			\n\t" //     X
                     +		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
                     +		PAVGB(%%mm7, %%mm1)				      //  11     2	/4
                     +		PAVGB(%%mm4, %%mm5)				      //    11		/2
                     +		PAVGB(%%mm5, %%mm0)				      //    11 11	/4
                     +		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
                     +		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
                     +		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
                     +		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
                     +		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
                     +		PAVGB((%%ecx), %%mm2)				      //   112 4	/8
                     +		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
                     +		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
                     +		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
                     +		PAVGB(%%mm2, %%mm6)				      //   1122424	/16
                     +		"movq %%mm6, (%%ecx)				\n\t" //       X
                     +		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
                     +		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
                     +		PAVGB(%%mm7, %%mm5)				      //    11   6	/8
+                    +
                     +		PAVGB(%%mm3, %%mm0)				      //      112	/4
                     +		PAVGB(%%mm0, %%mm5)				      //    112246	/16
                     +		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
                     +		"subl %1, %0					\n\t" // restore %0
+                    +
                     +		// %0 is changed (addl/subl) inside the asm, so it must be a read-write
                     +		// operand and not an input; the operand numbers stay the same.
                     +		// "memory" is clobbered because the filtered lines are stored through %0/eax/ecx.
                     +		: "+r" (src)
                     +		: "r" (stride), "m" (c->pQPb)
                     +		: "%eax", "%ecx", "memory"
                     +	);
                     +#else
                     +	const int l1= stride;
                     +	const int l2= stride + l1;
                     +	const int l3= stride + l2;
                     +	const int l4= stride + l3;
                     +	const int l5= stride + l4;
                     +	const int l6= stride + l5;
                     +	const int l7= stride + l6;
                     +	const int l8= stride + l7;
                     +	const int l9= stride + l8;
                     +	int x;
                     +	src+= stride*3;
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		// values used in place of the lines above line 1 / below line 8
                     +		const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
                     +		const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
+                    +
                     +		// sums of adjacent lines, computed before any line is overwritten
                     +		int sums[9];
                     +		sums[0] = first + src[l1];
                     +		sums[1] = src[l1] + src[l2];
                     +		sums[2] = src[l2] + src[l3];
                     +		sums[3] = src[l3] + src[l4];
                     +		sums[4] = src[l4] + src[l5];
                     +		sums[5] = src[l5] + src[l6];
                     +		sums[6] = src[l6] + src[l7];
                     +		sums[7] = src[l7] + src[l8];
                     +		sums[8] = src[l8] + last;
+                    +
                     +		// +8 rounds, >>4 divides by the filter weight sum of 16
                     +		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
                     +		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
                     +		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
                     +		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
                     +		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
                     +		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
                     +		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
                     +		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
+                    +
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +#if 0
                     +/**
                     + * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
                     + * values are correctly clipped (MMX2)
                     + * values are wraparound (C)
                     + * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
                     +	0 8 16 24
                     +	x = 8
                     +	x/2 = 4
                     +	x/8 = 1
                     +	1 12 12 23
                     + *
                     + * This function is compiled out by the surrounding #if 0.
                     + * With v = l5 - l4 (lines relative to src + 3*stride) and |v| below 1.25*QP,
                     + * the C version adds v/8 to l3 and v/2 to l4 and subtracts v/2 from l5 and v/8 from l6.
                     + * NOTE(review): the asm version reads QP from a global symbol MANGLE(pQPb)
                     + * and ignores the QP parameter.
                     + */
                     +static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= stride*3;
                     +// FIXME rounding
                     +	asm volatile(
                     +		"pxor %%mm7, %%mm7				\n\t" // 0
                     +		"movq "MANGLE(b80)", %%mm6			\n\t" // MIN_SIGNED_BYTE
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
                     +		"movq "MANGLE(pQPb)", %%mm0			\n\t" // QP,..., QP
                     +		"movq %%mm0, %%mm1				\n\t" // QP,..., QP
                     +		"paddusb "MANGLE(b02)", %%mm0			\n\t"
                     +		"psrlw $2, %%mm0				\n\t"
                     +		"pand "MANGLE(b3F)", %%mm0			\n\t" // QP/4,..., QP/4
                     +		"paddusb %%mm1, %%mm0				\n\t" // QP*1.25 ...
                     +		"movq (%0, %1, 4), %%mm2			\n\t" // line 4
                     +		"movq (%%ecx), %%mm3				\n\t" // line 5
                     +		"movq %%mm2, %%mm4				\n\t" // line 4
                     +		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
                     +		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
                     +		PAVGB(%%mm3, %%mm5)
                     +		"paddb %%mm6, %%mm5				\n\t" // (l5-l4)/2
                     +		"psubusb %%mm3, %%mm4				\n\t"
                     +		"psubusb %%mm2, %%mm3				\n\t"
                     +		"por %%mm3, %%mm4				\n\t" // |l4 - l5|
                     +		"psubusb %%mm0, %%mm4				\n\t"
                     +		"pcmpeqb %%mm7, %%mm4				\n\t" // |l4 - l5| <= QP*1.25 ? -1 : 0
                     +		"pand %%mm4, %%mm5				\n\t" // d/2
+                    +
                     +//		"paddb %%mm6, %%mm2				\n\t" // line 4 + 0x80
                     +		"paddb %%mm5, %%mm2				\n\t"
                     +//		"psubb %%mm6, %%mm2				\n\t"
                     +		"movq %%mm2, (%0,%1, 4)				\n\t"
+                    +
                     +		"movq (%%ecx), %%mm2				\n\t"
                     +//		"paddb %%mm6, %%mm2				\n\t" // line 5 + 0x80
                     +		"psubb %%mm5, %%mm2				\n\t"
                     +//		"psubb %%mm6, %%mm2				\n\t"
                     +		"movq %%mm2, (%%ecx)				\n\t"
+                    +
                     +		"paddb %%mm6, %%mm5				\n\t"
                     +		"psrlw $2, %%mm5				\n\t"
                     +		"pand "MANGLE(b3F)", %%mm5			\n\t"
                     +		"psubb "MANGLE(b20)", %%mm5			\n\t" // (l5-l4)/8
+                    +
                     +		"movq (%%eax, %1, 2), %%mm2			\n\t"
                     +		"paddb %%mm6, %%mm2				\n\t" // line 3 + 0x80
                     +		"paddsb %%mm5, %%mm2				\n\t"
                     +		"psubb %%mm6, %%mm2				\n\t"
                     +		"movq %%mm2, (%%eax, %1, 2)			\n\t"
+                    +
                     +		"movq (%%ecx, %1), %%mm2			\n\t"
                     +		"paddb %%mm6, %%mm2				\n\t" // line 6 + 0x80
                     +		"psubsb %%mm5, %%mm2				\n\t"
                     +		"psubb %%mm6, %%mm2				\n\t"
                     +		"movq %%mm2, (%%ecx, %1)			\n\t"
+                    +
                     +		:
                     +		: "r" (src), "r" (stride)
                     +		: "%eax", "%ecx"
                     +	);
                     +#else
                     + 	const int l1= stride;
                     +	const int l2= stride + l1;
                     +	const int l3= stride + l2;
                     +	const int l4= stride + l3;
                     +	const int l5= stride + l4;
                     +	const int l6= stride + l5;
                     +//	const int l7= stride + l6;
                     +//	const int l8= stride + l7;
                     +//	const int l9= stride + l8;
                     +	int x;
                     +	const int QP15= QP + (QP>>2); // QP*1.25
                     +	src+= stride*3;
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		const int v = (src[x+l5] - src[x+l4]);
                     +		if(ABS(v) < QP15)
                     +		{
                     +			src[x+l3] +=v>>3;
                     +			src[x+l4] +=v>>1;
                     +			src[x+l5] -=v>>1;
                     +			src[x+l6] -=v>>3;
+                    +
                     +		}
                     +	}
+                    +
                     +#endif
                     +}
                     +#endif
+                    +
                     +/**
                     + * Experimental Filter 1
                     + * will not damage linear gradients
                     + * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
                     + * can only smooth blocks at the expected locations (it can't smooth them if they did move)
                     + * MMX2 version does correct clipping, C version doesn't
                     + *
                     + * Lines are numbered relative to src + 3*stride.
                     + * d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2); if d passes the 2*QP limit,
                     + * l4/l5 are moved towards each other by 3*d/8, l3/l6 by d/4 and l2/l7 by d/8.
                     + * NOTE(review): the limit test is "d <= 2*QP" in the asm version (QP from co->pQPb,
                     + * and d is reduced by b01 before use) but "d < 2*QP" in the C version (QP from co->QP).
                     + * NOTE(review): the asm stores to memory without a "memory" clobber.
                     + */
                     +static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= stride*3;
+                    +
                     +	asm volatile(
                     +		"pxor %%mm7, %%mm7				\n\t" // 0
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
                     +		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
                     +		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
                     +		"movq %%mm1, %%mm2				\n\t" // line 4
                     +		"psubusb %%mm0, %%mm1				\n\t"
                     +		"psubusb %%mm2, %%mm0				\n\t"
                     +		"por %%mm1, %%mm0				\n\t" // |l3 - l4|
                     +		"movq (%%ecx), %%mm3				\n\t" // line 5
                     +		"movq (%%ecx, %1), %%mm4			\n\t" // line 6
                     +		"movq %%mm3, %%mm5				\n\t" // line 5
                     +		"psubusb %%mm4, %%mm3				\n\t"
                     +		"psubusb %%mm5, %%mm4				\n\t"
                     +		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
                     +		PAVGB(%%mm3, %%mm0)				      // (|l3 - l4| + |l5 - l6|)/2
                     +		"movq %%mm2, %%mm1				\n\t" // line 4
                     +		"psubusb %%mm5, %%mm2				\n\t"
                     +		"movq %%mm2, %%mm4				\n\t"
                     +		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
                     +		"psubusb %%mm1, %%mm5				\n\t"
                     +		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
                     +		"psubusb %%mm0, %%mm4		\n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
                     +		"movq %%mm4, %%mm3				\n\t" // d
                     +		"movq %2, %%mm0			\n\t" // QP,..., QP
                     +                "paddusb %%mm0, %%mm0				\n\t" // 2QP
                     +		"psubusb %%mm0, %%mm4				\n\t"
                     +		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= 2QP ? -1 : 0
                     +		"psubusb "MANGLE(b01)", %%mm3			\n\t" // d reduced by b01 (presumably 1 in every byte), saturating at 0
                     +		"pand %%mm4, %%mm3				\n\t" // d <= 2QP ? d : 0
+                    +
                     +		PAVGB(%%mm7, %%mm3)				      // d/2
                     +		"movq %%mm3, %%mm1				\n\t" // d/2
                     +		PAVGB(%%mm7, %%mm3)				      // d/4
                     +		PAVGB(%%mm1, %%mm3)				      // 3*d/8
+                    +
                     +		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
                     +		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
                     +		"psubusb %%mm3, %%mm0				\n\t"
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
+                    +
                     +		"movq (%%ecx), %%mm0				\n\t" // line 5
                     +		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
                     +		"paddusb %%mm3, %%mm0				\n\t"
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"movq %%mm0, (%%ecx)				\n\t" // line 5
+                    +
                     +		PAVGB(%%mm7, %%mm1)				      // d/4
+                    +
                     +		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
                     +		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
                     +		"psubusb %%mm1, %%mm0				\n\t"
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
+                    +
                     +		"movq (%%ecx, %1), %%mm0			\n\t" // line 6
                     +		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
                     +		"paddusb %%mm1, %%mm0				\n\t"
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"movq %%mm0, (%%ecx, %1)			\n\t" // line 6
+                    +
                     +		PAVGB(%%mm7, %%mm1)				      // d/8
+                    +
                     +		"movq (%%eax, %1), %%mm0			\n\t" // line 2
                     +		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
                     +		"psubusb %%mm1, %%mm0				\n\t"
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
+                    +
                     +		"movq (%%ecx, %1, 2), %%mm0			\n\t" // line 7
                     +		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
                     +		"paddusb %%mm1, %%mm0				\n\t"
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"movq %%mm0, (%%ecx, %1, 2)			\n\t" // line 7
+                    +
                     +		:
                     +		: "r" (src), "r" (stride), "m" (co->pQPb)
                     +		: "%eax", "%ecx"
                     +	);
                     +#else
+                    +
                     + 	const int l1= stride;
                     +	const int l2= stride + l1;
                     +	const int l3= stride + l2;
                     +	const int l4= stride + l3;
                     +	const int l5= stride + l4;
                     +	const int l6= stride + l5;
                     +	const int l7= stride + l6;
                     +//	const int l8= stride + l7;
                     +//	const int l9= stride + l8;
                     +	int x;
+                    +
                     +	src+= stride*3;
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		int a= src[l3] - src[l4];
                     +		int b= src[l4] - src[l5]; // step across the l4/l5 edge
                     +		int c= src[l5] - src[l6];
+                    +
                     +		int d= ABS(b) - ((ABS(a) + ABS(c))>>1); // edge step minus the mean of the neighbouring steps
                     +		d= MAX(d, 0);
+                    +
                     +		if(d < co->QP*2)
                     +		{
                     +			int v = d * SIGN(-b); // signed correction, applied with opposite sign on both sides of the edge
+                    +
                     +			src[l2] +=v>>3;
                     +			src[l3] +=v>>2;
                     +			src[l4] +=(3*v)>>3;
                     +			src[l5] -=(3*v)>>3;
                     +			src[l6] -=v>>2;
                     +			src[l7] -=v>>3;
+                    +
                     +		}
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
                     +{	/*
                     +	 * Smooths the horizontal edge between pixel rows 7 and 8 (counted from
                     +	 * the incoming src). Rows 4..11 are read, only rows 7 and 8 are written.
                     +	 *
                     +	 * src    - pixel data; a pixel is addressed as src[row*stride + x]
                     +	 * stride - offset between vertically adjacent pixels, in uint8_t units
                     +	 * c      - context; the C path reads c->QP, the asm paths read c->pQPb
                     +	 *
                     +	 * Three compile-time variants compute the same filter:
                     +	 *  - HAVE_MMX2 / HAVE_3DNOW: byte-wise arithmetic built from PAVGB. The
                     +	 *    intermediate values are rounded approximations (marked "~" in the
                     +	 *    inline comments), so this is presumably not bit-exact with the C code.
                     +	 *  - HAVE_MMX: 16-bit word arithmetic on the low and the high 4 pixels.
                     +	 *  - otherwise: plain C, which is the readable reference (r7/r8 are the
                     +	 *    two rows being modified, r6/r9 their outer neighbours):
                     +	 *      middleEnergy = 5*(r8 - r7) + 2*(r6 - r9)
                     +	 *      if |middleEnergy| < 8*QP:
                     +	 *          d = max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0)
                     +	 *          d = (5*d + 32) >> 6, given the sign of -middleEnergy
                     +	 *          d is clamped to the range between 0 and q = (r7 - r8)/2
                     +	 *          r7 -= d;  r8 += d
                     +	 *
                     +	 * Each asm variant handles 8 horizontally adjacent pixels (one movq per
                     +	 * row); the C variant loops over BLOCK_SIZE columns.
                     +	 */
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +/* Disabled debugging aid: keeps a copy of the two rows that get modified.
                     +	uint8_t tmp[16];
                     +	const int l1= stride;
                     +	const int l2= stride + l1;
                     +	const int l3= stride + l2;
                     +	const int l4= (int)tmp - (int)src - stride*3;
                     +	const int l5= (int)tmp - (int)src - stride*3 + 8;
                     +	const int l6= stride*3 + l3;
                     +	const int l7= stride + l6;
                     +	const int l8= stride + l7;
+                    +
                     +	memcpy(tmp, src+stride*7, 8);
                     +	memcpy(tmp+8, src+stride*8, 8);
                     +*/
                     +	src+= stride*4; // %0 below is row 4 of the incoming src; "l0".."l7" in the asm comments count from here
                     +	asm volatile(
+                    +
                     +#if 0 //slightly more accurate and slightly slower (disabled alternative implementation)
                     +		"pxor %%mm7, %%mm7				\n\t" // 0
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     +//	0	1	2	3	4	5	6	7
                     +//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
+                    +
+                    +
                     +		"movq (%0, %1, 2), %%mm0			\n\t" // l2
                     +		"movq (%0), %%mm1				\n\t" // l0
                     +		"movq %%mm0, %%mm2				\n\t" // l2
                     +		PAVGB(%%mm7, %%mm0)				      // ~l2/2
                     +		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
                     +		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8
+                    +
                     +		"movq (%%eax), %%mm1				\n\t" // l1
                     +		"movq (%%eax, %1, 2), %%mm3			\n\t" // l3
                     +		"movq %%mm1, %%mm4				\n\t" // l1
                     +		PAVGB(%%mm7, %%mm1)				      // ~l1/2
                     +		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
                     +		PAVGB(%%mm4, %%mm1)				      // ~(5l1 + 2l3)/8
+                    +
                     +		"movq %%mm0, %%mm4				\n\t" // ~(5l2 + 2l0)/8
                     +		"psubusb %%mm1, %%mm0				\n\t"
                     +		"psubusb %%mm4, %%mm1				\n\t"
                     +		"por %%mm0, %%mm1				\n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
                     +// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
+                    +
                     +		"movq (%0, %1, 4), %%mm0			\n\t" // l4
                     +		"movq %%mm0, %%mm4				\n\t" // l4
                     +		PAVGB(%%mm7, %%mm0)				      // ~l4/2
                     +		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
                     +		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8
+                    +
                     +		"movq (%%ecx), %%mm2				\n\t" // l5
                     +		"movq %%mm3, %%mm5				\n\t" // l3
                     +		PAVGB(%%mm7, %%mm3)				      // ~l3/2
                     +		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
                     +		PAVGB(%%mm5, %%mm3)				      // ~(5l3 + 2l5)/8
+                    +
                     +		"movq %%mm0, %%mm6				\n\t" // ~(5l4 + 2l2)/8
                     +		"psubusb %%mm3, %%mm0				\n\t"
                     +		"psubusb %%mm6, %%mm3				\n\t"
                     +		"por %%mm0, %%mm3				\n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
                     +		"pcmpeqb %%mm7, %%mm0				\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
                     +// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
+                    +
                     +		"movq (%%ecx, %1), %%mm6			\n\t" // l6
                     +		"movq %%mm6, %%mm5				\n\t" // l6
                     +		PAVGB(%%mm7, %%mm6)				      // ~l6/2
                     +		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
                     +		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8
+                    +
                     +		"movq (%%ecx, %1, 2), %%mm5			\n\t" // l7
                     +		"movq %%mm2, %%mm4				\n\t" // l5
                     +		PAVGB(%%mm7, %%mm2)				      // ~l5/2
                     +		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
                     +		PAVGB(%%mm4, %%mm2)				      // ~(5l5 + 2l7)/8
+                    +
                     +		"movq %%mm6, %%mm4				\n\t" // ~(5l6 + 2l4)/8
                     +		"psubusb %%mm2, %%mm6				\n\t"
                     +		"psubusb %%mm4, %%mm2				\n\t"
                     +		"por %%mm6, %%mm2				\n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
                     +// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
+                    +
+                    +
                     +		PMINUB(%%mm2, %%mm1, %%mm4)			      // MIN(|lenergy|,|renergy|)/8
                     +		"movq %2, %%mm4					\n\t" // QP //FIXME QP+1 ?
                     +		"paddusb "MANGLE(b01)", %%mm4			\n\t"
                     +		"pcmpgtb %%mm3, %%mm4				\n\t" // |menergy|/8 < QP
                     +		"psubusb %%mm1, %%mm3				\n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
                     +		"pand %%mm4, %%mm3				\n\t"
+                    +
                     +		"movq %%mm3, %%mm1				\n\t"
                     +//		"psubusb "MANGLE(b01)", %%mm3			\n\t"
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm7, %%mm3)
                     +		"paddusb %%mm1, %%mm3				\n\t"
                     +//		"paddusb "MANGLE(b01)", %%mm3			\n\t"
+                    +
                     +		"movq (%%eax, %1, 2), %%mm6			\n\t" //l3
                     +		"movq (%0, %1, 4), %%mm5			\n\t" //l4
                     +		"movq (%0, %1, 4), %%mm4			\n\t" //l4
                     +		"psubusb %%mm6, %%mm5				\n\t"
                     +		"psubusb %%mm4, %%mm6				\n\t"
                     +		"por %%mm6, %%mm5				\n\t" // |l3-l4|
                     +		"pcmpeqb %%mm7, %%mm6				\n\t" // SIGN(l3-l4)
                     +		"pxor %%mm6, %%mm0				\n\t"
                     +		"pand %%mm0, %%mm3				\n\t"
                     +		PMINUB(%%mm5, %%mm3, %%mm0)
+                    +
                     +		"psubusb "MANGLE(b01)", %%mm3			\n\t"
                     +		PAVGB(%%mm7, %%mm3)
+                    +
                     +		"movq (%%eax, %1, 2), %%mm0			\n\t"
                     +		"movq (%0, %1, 4), %%mm2			\n\t"
                     +		"pxor %%mm6, %%mm0				\n\t"
                     +		"pxor %%mm6, %%mm2				\n\t"
                     +		"psubb %%mm3, %%mm0				\n\t"
                     +		"paddb %%mm3, %%mm2				\n\t"
                     +		"pxor %%mm6, %%mm0				\n\t"
                     +		"pxor %%mm6, %%mm2				\n\t"
                     +		"movq %%mm0, (%%eax, %1, 2)			\n\t"
                     +		"movq %%mm2, (%0, %1, 4)			\n\t"
                     +#endif
+                    +
                     +		"leal (%0, %1), %%eax				\n\t" // eax = row 1
                     +		"pcmpeqb %%mm6, %%mm6				\n\t" // -1
                     +//	0	1	2	3	4	5	6	7
                     +//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
+                    +
+                    +
                     +		"movq (%%eax, %1, 2), %%mm1			\n\t" // l3
                     +		"movq (%0, %1, 4), %%mm0			\n\t" // l4
                     +		"pxor %%mm6, %%mm1				\n\t" // -l3-1
                     +		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
                     +// mm1=-l3-1, mm0=128-q
+                    +
                     +		"movq (%%eax, %1, 4), %%mm2			\n\t" // l5
                     +		"movq (%%eax, %1), %%mm3			\n\t" // l2
                     +		"pxor %%mm6, %%mm2				\n\t" // -l5-1
                     +		"movq %%mm2, %%mm5				\n\t" // -l5-1
                     +		"movq "MANGLE(b80)", %%mm4			\n\t" // 128
                     +		"leal (%%eax, %1, 4), %%ecx			\n\t" // ecx = row 5
                     +		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
                     +		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
                     +		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
                     +		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
                     +// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
+                    +
                     +		"movq (%%eax), %%mm2				\n\t" // l1
                     +		"pxor %%mm6, %%mm2				\n\t" // -l1-1
                     +		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
                     +		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
                     +		"movq "MANGLE(b80)", %%mm3			\n\t" // 128
                     +		PAVGB(%%mm2, %%mm3)				      // ~(l2-l1)/4 + 128
                     +		PAVGB(%%mm1, %%mm3)				      // ~(l0-l3)/4 +(l2-l1)/8 + 128
                     +		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
                     +// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
+                    +
                     +		PAVGB((%%ecx, %1), %%mm5)			      // (l6-l5+256)/2
                     +		"movq (%%ecx, %1, 2), %%mm1			\n\t" // l7
                     +		"pxor %%mm6, %%mm1				\n\t" // -l7-1
                     +		PAVGB((%0, %1, 4), %%mm1)			      // (l4-l7+256)/2
                     +		"movq "MANGLE(b80)", %%mm2			\n\t" // 128
                     +		PAVGB(%%mm5, %%mm2)				      // ~(l6-l5)/4 + 128
                     +		PAVGB(%%mm1, %%mm2)				      // ~(l4-l7)/4 +(l6-l5)/8 + 128
                     +		PAVGB(%%mm5, %%mm2)				      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
                     +// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
+                    +
                     +		"movq "MANGLE(b00)", %%mm1			\n\t" // 0
                     +		"movq "MANGLE(b00)", %%mm5			\n\t" // 0
                     +		"psubb %%mm2, %%mm1				\n\t" // 128 - renergy/16
                     +		"psubb %%mm3, %%mm5				\n\t" // 128 - lenergy/16
                     +		PMAXUB(%%mm1, %%mm2)				      // 128 + |renergy/16|
                     + 		PMAXUB(%%mm5, %%mm3)				      // 128 + |lenergy/16|
                     +		PMINUB(%%mm2, %%mm3, %%mm1)			      // 128 + MIN(|lenergy|,|renergy|)/16
+                    +
                     +// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
+                    +
                     +		"movq "MANGLE(b00)", %%mm7			\n\t" // 0
                     +		"movq %2, %%mm2					\n\t" // QP
                     +		PAVGB(%%mm6, %%mm2)				      // 128 + QP/2
                     +		"psubb %%mm6, %%mm2				\n\t"
+                    +
                     +		"movq %%mm4, %%mm1				\n\t"
                     +		"pcmpgtb %%mm7, %%mm1				\n\t" // SIGN(menergy)
                     +		"pxor %%mm1, %%mm4				\n\t"
                     +		"psubb %%mm1, %%mm4				\n\t" // 128 + |menergy|/16
                     +		"pcmpgtb %%mm4, %%mm2				\n\t" // |menergy|/16 < QP/2
                     +		"psubusb %%mm3, %%mm4				\n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
                     +// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
+                    +
                     +		"movq %%mm4, %%mm3				\n\t" // d
                     +		"psubusb "MANGLE(b01)", %%mm4			\n\t"
                     +		PAVGB(%%mm7, %%mm4)				      // d/32
                     +		PAVGB(%%mm7, %%mm4)				      // (d + 32)/64
                     +		"paddb %%mm3, %%mm4				\n\t" // 5d/64
                     +		"pand %%mm2, %%mm4				\n\t" // zero the correction where the QP test failed
+                    +
                     +		"movq "MANGLE(b80)", %%mm5			\n\t" // 128
                     +		"psubb %%mm0, %%mm5				\n\t" // q
                     +		"paddsb %%mm6, %%mm5				\n\t" // fix bad rounding
                     +		"pcmpgtb %%mm5, %%mm7				\n\t" // SIGN(q)
                     +		"pxor %%mm7, %%mm5				\n\t"
+                    +
                     +		PMINUB(%%mm5, %%mm4, %%mm3)			      // MIN(|q|, 5d/64)
                     +		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)
+                    +
                     +		"pand %%mm7, %%mm4				\n\t"
                     +		"movq (%%eax, %1, 2), %%mm0			\n\t" // row 3 (row 7 of the incoming src)
                     +		"movq (%0, %1, 4), %%mm2			\n\t" // row 4 (row 8 of the incoming src)
                     +		"pxor %%mm1, %%mm0				\n\t"
                     +		"pxor %%mm1, %%mm2				\n\t"
                     +		"paddb %%mm4, %%mm0				\n\t"
                     +		"psubb %%mm4, %%mm2				\n\t"
                     +		"pxor %%mm1, %%mm0				\n\t"
                     +		"pxor %%mm1, %%mm2				\n\t"
                     +		"movq %%mm0, (%%eax, %1, 2)			\n\t" // write back the two modified rows
                     +		"movq %%mm2, (%0, %1, 4)			\n\t"
+                    +
                     +		:
                     +		: "r" (src), "r" (stride), "m" (c->pQPb)
                     +		: "%eax", "%ecx"	/*
                     +		 * NOTE(review): the asm stores to the src rows and uses mm0-mm7, but
                     +		 * neither "memory" nor the mm registers appear in the clobber list --
                     +		 * confirm that the callers / compiler settings tolerate this.
                     +		 */
                     +	);
+                    +
                     +/* Disabled debugging aid: C reference filter plus statistics on how far the result differs from the saved rows.
                     +	{
                     +	int x;
                     +	src-= stride;
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
                     +		if(ABS(middleEnergy)< 8*QP)
                     +		{
                     +			const int q=(src[l4] - src[l5])/2;
                     +			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                     +			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
+                    +
                     +			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                     +			d= MAX(d, 0);
+                    +
                     +			d= (5*d + 32) >> 6;
                     +			d*= SIGN(-middleEnergy);
+                    +
                     +			if(q>0)
                     +			{
                     +				d= d<0 ? 0 : d;
                     +				d= d>q ? q : d;
                     +			}
                     +			else
                     +			{
                     +				d= d>0 ? 0 : d;
                     +				d= d<q ? q : d;
                     +			}
+                    +
                     +        		src[l4]-= d;
                     +	        	src[l5]+= d;
                     +		}
                     +		src++;
                     +	}
                     +src-=8;
                     +	for(x=0; x<8; x++)
                     +	{
                     +		int y;
                     +		for(y=4; y<6; y++)
                     +		{
                     +			int d= src[x+y*stride] - tmp[x+(y-4)*8];
                     +			int ad= ABS(d);
                     +			static int max=0;
                     +			static int sum=0;
                     +			static int num=0;
                     +			static int bias=0;
+                    +
                     +			if(max<ad) max=ad;
                     +			sum+= ad>3 ? 1 : 0;
                     +			if(ad>3)
                     +			{
                     +				src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
                     +			}
                     +			if(y==4) bias+=d;
                     +			num++;
                     +			if(num%1000000 == 0)
                     +			{
                     +				printf(" %d %d %d %d\n", num, sum, max, bias);
                     +			}
                     +		}
                     +	}
                     +}
                     +*/
                     +#elif defined (HAVE_MMX)
                     +	src+= stride*4; // %0 below starts at row 4 of the incoming src; "L"/"H" in the asm comments are the low/high 4 pixels of a row
                     +	asm volatile(
                     +		"pxor %%mm7, %%mm7				\n\t"
                     +		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars (below %esp; see the NOTE at the clobber list)
                     +		"andl $0xFFFFFFF8, %%ecx			\n\t" // align
                     +//	ecx = 8-byte aligned scratch; eax = %0+2%1 (set below); %0 is later advanced to eax+%1 (row 3)
                     +//	row:	0	1	2	3	4	5		6	7
                     +//	addr:	%0	%0+%1	eax	eax+%1	eax+2%1	new %0+2%1	eax+4%1	new %0+4%1
+                    +
                     +		"movq (%0), %%mm0				\n\t"
                     +		"movq %%mm0, %%mm1				\n\t"
                     +		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
                     +		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
+                    +
                     +		"movq (%0, %1), %%mm2				\n\t"
                     +		"leal (%0, %1, 2), %%eax			\n\t" // eax = row 2
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
                     +		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
+                    +
                     +		"movq (%%eax), %%mm4				\n\t"
                     +		"movq %%mm4, %%mm5				\n\t"
                     +		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
                     +		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
+                    +
                     +		"paddw %%mm0, %%mm0				\n\t" // 2L0
                     +		"paddw %%mm1, %%mm1				\n\t" // 2H0
                     +		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
                     +		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2
+                    +
                     +		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
                     +		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
+                    +
                     +		"movq (%%eax, %1), %%mm2			\n\t"
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"punpcklbw %%mm7, %%mm2				\n\t" // L3
                     +		"punpckhbw %%mm7, %%mm3				\n\t" // H3
+                    +
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
                     +		"movq %%mm0, (%%ecx)				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
                     +		"movq %%mm1, 8(%%ecx)				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+                    +
                     +		"movq (%%eax, %1, 2), %%mm0			\n\t"
                     +		"movq %%mm0, %%mm1				\n\t"
                     +		"punpcklbw %%mm7, %%mm0				\n\t" // L4
                     +		"punpckhbw %%mm7, %%mm1				\n\t" // H4
+                    +
                     +		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
                     +		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
                     +		"movq %%mm2, 16(%%ecx)				\n\t" // L3 - L4
                     +		"movq %%mm3, 24(%%ecx)				\n\t" // H3 - H4
                     +		"paddw %%mm4, %%mm4				\n\t" // 2L2
                     +		"paddw %%mm5, %%mm5				\n\t" // 2H2
                     +		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
                     +		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
+                    +
                     +		"leal (%%eax, %1), %0				\n\t" // %0 = row 3 from here on (src is an in/out operand)
                     +		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
                     +		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
                     +		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
                     +		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
                     +//50 opcodes so far
                     +		"movq (%0, %1, 2), %%mm2			\n\t"
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"punpcklbw %%mm7, %%mm2				\n\t" // L5
                     +		"punpckhbw %%mm7, %%mm3				\n\t" // H5
                     +		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
                     +		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
                     +		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
                     +		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
+                    +
                     +		"movq (%%eax, %1, 4), %%mm6			\n\t"
                     +		"punpcklbw %%mm7, %%mm6				\n\t" // L6
                     +		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
                     +		"movq (%%eax, %1, 4), %%mm6			\n\t"
                     +		"punpckhbw %%mm7, %%mm6				\n\t" // H6
                     +		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
+                    +
                     +		"paddw %%mm0, %%mm0				\n\t" // 2L4
                     +		"paddw %%mm1, %%mm1				\n\t" // 2H4
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6
+                    +
                     +		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
                     +		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6
+                    +
                     +		"movq (%0, %1, 4), %%mm2			\n\t"
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"punpcklbw %%mm7, %%mm2				\n\t" // L7
                     +		"punpckhbw %%mm7, %%mm3				\n\t" // H7
+                    +
                     +		"paddw %%mm2, %%mm2				\n\t" // 2L7
                     +		"paddw %%mm3, %%mm3				\n\t" // 2H7
                     +		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
                     +		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
+                    +
                     +		"movq (%%ecx), %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
                     +		"movq 8(%%ecx), %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+                    +
                     +#ifdef HAVE_MMX2 // NOTE(review): cannot be true here, HAVE_MMX2 already selects the first branch of this function
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"psubw %%mm0, %%mm6				\n\t"
                     +		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"psubw %%mm1, %%mm6				\n\t"
                     +		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"psubw %%mm2, %%mm6				\n\t"
                     +		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"psubw %%mm3, %%mm6				\n\t"
                     +		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
                     +#else
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"pcmpgtw %%mm0, %%mm6				\n\t"
                     +		"pxor %%mm6, %%mm0				\n\t"
                     +		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"pcmpgtw %%mm1, %%mm6				\n\t"
                     +		"pxor %%mm6, %%mm1				\n\t"
                     +		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"pcmpgtw %%mm2, %%mm6				\n\t"
                     +		"pxor %%mm6, %%mm2				\n\t"
                     +		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"pcmpgtw %%mm3, %%mm6				\n\t"
                     +		"pxor %%mm6, %%mm3				\n\t"
                     +		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
                     +#endif
+                    +
                     +#ifdef HAVE_MMX2 // NOTE(review): unreachable in this branch, see above
                     +		"pminsw %%mm2, %%mm0				\n\t"
                     +		"pminsw %%mm3, %%mm1				\n\t"
                     +#else
                     +		"movq %%mm0, %%mm6				\n\t"
                     +		"psubusw %%mm2, %%mm6				\n\t"
                     +		"psubw %%mm6, %%mm0				\n\t" // MIN(|left energy|, |right energy|), low half
                     +		"movq %%mm1, %%mm6				\n\t"
                     +		"psubusw %%mm3, %%mm6				\n\t"
                     +		"psubw %%mm6, %%mm1				\n\t" // MIN(|left energy|, |right energy|), high half
                     +#endif
+                    +
                     +		"movq %%mm7, %%mm6				\n\t" // 0
                     +		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
                     +		"pxor %%mm6, %%mm4				\n\t"
                     +		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
                     +		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
                     +		"pxor %%mm7, %%mm5				\n\t"
                     +		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
                     +// 100 opcodes
                     +		"movd %2, %%mm2					\n\t" /* QP
                     +		 * NOTE(review): movd loads only the low 32 bits of %2 and clears the
                     +		 * upper half of mm2, so word lanes 2 and 3 of the "8QP" threshold are 0
                     +		 * and the pcmpgtw masks below are always 0 for those lanes. The MMX2
                     +		 * branch treats the same operand as 8 packed bytes -- confirm that
                     +		 * the layout of c->pQPb matches what this word-wise code expects.
                     +		 */
                     +		"psllw $3, %%mm2				\n\t" // 8QP
                     +		"movq %%mm2, %%mm3				\n\t" // 8QP
                     +		"pcmpgtw %%mm4, %%mm2				\n\t"
                     +		"pcmpgtw %%mm5, %%mm3				\n\t"
                     +		"pand %%mm2, %%mm4				\n\t"
                     +		"pand %%mm3, %%mm5				\n\t"
+                    +
+                    +
                     +		"psubusw %%mm0, %%mm4				\n\t" // d for the low half (mm4 holds the L words; was labelled "hd")
                     +		"psubusw %%mm1, %%mm5				\n\t" // d for the high half (mm5 holds the H words; was labelled "ld")
+                    +
+                    +
                     +		"movq "MANGLE(w05)", %%mm2			\n\t" // 5
                     +		"pmullw %%mm2, %%mm4				\n\t"
                     +		"pmullw %%mm2, %%mm5				\n\t"
                     +		"movq "MANGLE(w20)", %%mm2			\n\t" // 32
                     +		"paddw %%mm2, %%mm4				\n\t"
                     +		"paddw %%mm2, %%mm5				\n\t"
                     +		"psrlw $6, %%mm4				\n\t" // (5d + 32) >> 6, low half
                     +		"psrlw $6, %%mm5				\n\t" // (5d + 32) >> 6, high half
+                    +
                     +		"movq 16(%%ecx), %%mm0				\n\t" // L3 - L4
                     +		"movq 24(%%ecx), %%mm1				\n\t" // H3 - H4
+                    +
                     +		"pxor %%mm2, %%mm2				\n\t"
                     +		"pxor %%mm3, %%mm3				\n\t"
+                    +
                     +		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
                     +		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
                     +		"pxor %%mm2, %%mm0				\n\t"
                     +		"pxor %%mm3, %%mm1				\n\t"
                     +		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
                     +		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
                     +		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
                     +		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2
+                    +
                     +		"pxor %%mm6, %%mm2				\n\t" // sign(L3-L4) xor sign(middle energy, low half)
                     +		"pxor %%mm7, %%mm3				\n\t" // sign(H3-H4) xor sign(middle energy, high half)
                     +		"pand %%mm2, %%mm4				\n\t" // keep d only where the two signs differ
                     +		"pand %%mm3, %%mm5				\n\t"
+                    +
                     +#ifdef HAVE_MMX2 // NOTE(review): unreachable in this branch, see above
                     +		"pminsw %%mm0, %%mm4				\n\t"
                     +		"pminsw %%mm1, %%mm5				\n\t"
                     +#else
                     +		"movq %%mm4, %%mm2				\n\t"
                     +		"psubusw %%mm0, %%mm2				\n\t"
                     +		"psubw %%mm2, %%mm4				\n\t" // MIN(d, |L3 - L4|/2)
                     +		"movq %%mm5, %%mm2				\n\t"
                     +		"psubusw %%mm1, %%mm2				\n\t"
                     +		"psubw %%mm2, %%mm5				\n\t" // MIN(d, |H3 - H4|/2)
                     +#endif
                     +		"pxor %%mm6, %%mm4				\n\t" // negate d where the middle energy is negative (xor mask, subtract mask)
                     +		"pxor %%mm7, %%mm5				\n\t"
                     +		"psubw %%mm6, %%mm4				\n\t"
                     +		"psubw %%mm7, %%mm5				\n\t"
                     +		"packsswb %%mm5, %%mm4				\n\t" // back to 8 signed bytes
                     +		"movq (%0), %%mm0				\n\t" // row 3 (row 7 of the incoming src)
                     +		"paddb   %%mm4, %%mm0				\n\t"
                     +		"movq %%mm0, (%0)				\n\t"
                     +		"movq (%0, %1), %%mm0				\n\t" // row 4 (row 8 of the incoming src)
                     +		"psubb %%mm4, %%mm0				\n\t"
                     +		"movq %%mm0, (%0, %1)				\n\t"
+                    +
                     +		: "+r" (src)
                     +		: "r" (stride), "m" (c->pQPb)
                     +		: "%eax", "%ecx"	/*
                     +		 * NOTE(review): the 32 scratch bytes at (%ecx) lie below %esp, and %esp
                     +		 * is never adjusted, so anything that pushes onto this stack while the
                     +		 * asm runs (e.g. a signal handler) could overwrite them -- confirm.
                     +		 * As in the MMX2 branch, neither "memory" nor the mm registers are
                     +		 * listed as clobbers although the asm stores to the src rows.
                     +		 */
                     +	);
                     +#else
                     +	const int l1= stride;
                     +	const int l2= stride + l1;
                     +	const int l3= stride + l2;
                     +	const int l4= stride + l3;
                     +	const int l5= stride + l4;
                     +	const int l6= stride + l5;
                     +	const int l7= stride + l6;
                     +	const int l8= stride + l7;
                     +//	const int l9= stride + l8;
                     +	int x;
                     +	src+= stride*3; // lN now addresses row N+3 of the incoming src; l4/l5 are the rows that get modified
                     +	for(x=0; x<BLOCK_SIZE; x++)
                     +	{
                     +		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
                     +		if(ABS(middleEnergy) < 8*c->QP) // larger steps are left untouched
                     +		{
                     +			const int q=(src[l4] - src[l5])/2; // half the step across the edge; bounds the correction below
                     +			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                     +			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
+                    +
                     +			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                     +			d= MAX(d, 0);
+                    +
                     +			d= (5*d + 32) >> 6; // scale by 5/64 with rounding
                     +			d*= SIGN(-middleEnergy);
+                    +
                     +			// clamp d to the range between 0 and q, so it never has the opposite sign of q
                     +			if(q>0)
                     +			{
                     +				d= d<0 ? 0 : d;
                     +				d= d>q ? q : d;
                     +			}
                     +			else
                     +			{
                     +				d= d>0 ? 0 : d;
                     +				d= d<q ? q : d;
                     +			}
+                    +
                     +        		src[l4]-= d;
                     +	        	src[l5]+= d;
                     +		}
                     +		src++; // next column
                     +	}
                     +#endif
                     +}
+                    +
                     +static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	asm volatile(
                     +		"pxor %%mm6, %%mm6				\n\t"
                     +		"pcmpeqb %%mm7, %%mm7				\n\t"
                     +		"movq %2, %%mm0					\n\t"
                     +		"punpcklbw %%mm6, %%mm0				\n\t"
                     +		"psrlw $1, %%mm0				\n\t"
                     +		"psubw %%mm7, %%mm0				\n\t"
                     +		"packuswb %%mm0, %%mm0				\n\t"
                     +		"movq %%mm0, %3					\n\t"
+                    +
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
+                    +
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
+                    +
                     +#undef FIND_MIN_MAX
                     +#ifdef HAVE_MMX2
                     +#define FIND_MIN_MAX(addr)\
                     +		"movq " #addr ", %%mm0				\n\t"\
                     +		"pminub %%mm0, %%mm7				\n\t"\
                     +		"pmaxub %%mm0, %%mm6				\n\t"
                     +#else
                     +#define FIND_MIN_MAX(addr)\
                     +		"movq " #addr ", %%mm0				\n\t"\
                     +		"movq %%mm7, %%mm1				\n\t"\
                     +		"psubusb %%mm0, %%mm6				\n\t"\
                     +		"paddb %%mm0, %%mm6				\n\t"\
                     +		"psubusb %%mm0, %%mm1				\n\t"\
                     +		"psubb %%mm1, %%mm7				\n\t"
                     +#endif
+                    +
                     +FIND_MIN_MAX((%%eax))
                     +FIND_MIN_MAX((%%eax, %1))
                     +FIND_MIN_MAX((%%eax, %1, 2))
                     +FIND_MIN_MAX((%0, %1, 4))
                     +FIND_MIN_MAX((%%edx))
                     +FIND_MIN_MAX((%%edx, %1))
                     +FIND_MIN_MAX((%%edx, %1, 2))
                     +FIND_MIN_MAX((%0, %1, 8))
+                    +
                     +		"movq %%mm7, %%mm4				\n\t"
                     +		"psrlq $8, %%mm7				\n\t"
                     +#ifdef HAVE_MMX2
                     +		"pminub %%mm4, %%mm7				\n\t" // min of pixels
                     +		"pshufw $0xF9, %%mm7, %%mm4			\n\t"
                     +		"pminub %%mm4, %%mm7				\n\t" // min of pixels
                     +		"pshufw $0xFE, %%mm7, %%mm4			\n\t"
                     +		"pminub %%mm4, %%mm7				\n\t"
                     +#else
                     +		"movq %%mm7, %%mm1				\n\t"
                     +		"psubusb %%mm4, %%mm1				\n\t"
                     +		"psubb %%mm1, %%mm7				\n\t"
                     +		"movq %%mm7, %%mm4				\n\t"
                     +		"psrlq $16, %%mm7				\n\t"
                     +		"movq %%mm7, %%mm1				\n\t"
                     +		"psubusb %%mm4, %%mm1				\n\t"
                     +		"psubb %%mm1, %%mm7				\n\t"
                     +		"movq %%mm7, %%mm4				\n\t"
                     +		"psrlq $32, %%mm7				\n\t"
                     +		"movq %%mm7, %%mm1				\n\t"
                     +		"psubusb %%mm4, %%mm1				\n\t"
                     +		"psubb %%mm1, %%mm7				\n\t"
                     +#endif
+                    +
+                    +
                     +		"movq %%mm6, %%mm4				\n\t"
                     +		"psrlq $8, %%mm6				\n\t"
                     +#ifdef HAVE_MMX2
                     +		"pmaxub %%mm4, %%mm6				\n\t" // max of pixels
                     +		"pshufw $0xF9, %%mm6, %%mm4			\n\t"
                     +		"pmaxub %%mm4, %%mm6				\n\t"
                     +		"pshufw $0xFE, %%mm6, %%mm4			\n\t"
                     +		"pmaxub %%mm4, %%mm6				\n\t"
                     +#else
                     +		"psubusb %%mm4, %%mm6				\n\t"
                     +		"paddb %%mm4, %%mm6				\n\t"
                     +		"movq %%mm6, %%mm4				\n\t"
                     +		"psrlq $16, %%mm6				\n\t"
                     +		"psubusb %%mm4, %%mm6				\n\t"
                     +		"paddb %%mm4, %%mm6				\n\t"
                     +		"movq %%mm6, %%mm4				\n\t"
                     +		"psrlq $32, %%mm6				\n\t"
                     +		"psubusb %%mm4, %%mm6				\n\t"
                     +		"paddb %%mm4, %%mm6				\n\t"
                     +#endif
                     +		"movq %%mm6, %%mm0				\n\t" // max
                     +		"psubb %%mm7, %%mm6				\n\t" // max - min
                     +		"movd %%mm6, %%ecx				\n\t"
                     +		"cmpb "MANGLE(deringThreshold)", %%cl		\n\t"
                     +		" jb 1f						\n\t"
                     +		"leal -24(%%esp), %%ecx				\n\t"
                     +		"andl $0xFFFFFFF8, %%ecx			\n\t"
                     +		PAVGB(%%mm0, %%mm7)				      // a=(max + min)/2
                     +		"punpcklbw %%mm7, %%mm7				\n\t"
                     +		"punpcklbw %%mm7, %%mm7				\n\t"
                     +		"punpcklbw %%mm7, %%mm7				\n\t"
                     +		"movq %%mm7, (%%ecx)				\n\t"
+                    +
                     +		"movq (%0), %%mm0				\n\t" // L10
                     +		"movq %%mm0, %%mm1				\n\t" // L10
                     +		"movq %%mm0, %%mm2				\n\t" // L10
                     +		"psllq $8, %%mm1				\n\t"
                     +		"psrlq $8, %%mm2				\n\t"
                     +		"movd -4(%0), %%mm3				\n\t"
                     +		"movd 8(%0), %%mm4				\n\t"
                     +		"psrlq $24, %%mm3				\n\t"
                     +		"psllq $56, %%mm4				\n\t"
                     +		"por %%mm3, %%mm1				\n\t" // L00
                     +		"por %%mm4, %%mm2				\n\t" // L20
                     +		"movq %%mm1, %%mm3				\n\t" // L00
                     +		PAVGB(%%mm2, %%mm1)				      // (L20 + L00)/2
                     +		PAVGB(%%mm0, %%mm1)				      // (L20 + L00 + 2L10)/4
                     +		"psubusb %%mm7, %%mm0				\n\t"
                     +		"psubusb %%mm7, %%mm2				\n\t"
                     +		"psubusb %%mm7, %%mm3				\n\t"
                     +		"pcmpeqb "MANGLE(b00)", %%mm0			\n\t" // L10 > a ? 0 : -1
                     +		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L20 > a ? 0 : -1
                     +		"pcmpeqb "MANGLE(b00)", %%mm3			\n\t" // L00 > a ? 0 : -1
                     +		"paddb %%mm2, %%mm0				\n\t"
                     +		"paddb %%mm3, %%mm0				\n\t"
+                    +
                     +		"movq (%%eax), %%mm2				\n\t" // L11
                     +		"movq %%mm2, %%mm3				\n\t" // L11
                     +		"movq %%mm2, %%mm4				\n\t" // L11
                     +		"psllq $8, %%mm3				\n\t"
                     +		"psrlq $8, %%mm4				\n\t"
                     +		"movd -4(%%eax), %%mm5				\n\t"
                     +		"movd 8(%%eax), %%mm6				\n\t"
                     +		"psrlq $24, %%mm5				\n\t"
                     +		"psllq $56, %%mm6				\n\t"
                     +		"por %%mm5, %%mm3				\n\t" // L01
                     +		"por %%mm6, %%mm4				\n\t" // L21
                     +		"movq %%mm3, %%mm5				\n\t" // L01
                     +		PAVGB(%%mm4, %%mm3)				      // (L21 + L01)/2
                     +		PAVGB(%%mm2, %%mm3)				      // (L21 + L01 + 2L11)/4
                     +		"psubusb %%mm7, %%mm2				\n\t"
                     +		"psubusb %%mm7, %%mm4				\n\t"
                     +		"psubusb %%mm7, %%mm5				\n\t"
                     +		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L11 > a ? 0 : -1
                     +		"pcmpeqb "MANGLE(b00)", %%mm4			\n\t" // L21 > a ? 0 : -1
                     +		"pcmpeqb "MANGLE(b00)", %%mm5			\n\t" // L01 > a ? 0 : -1
                     +		"paddb %%mm4, %%mm2				\n\t"
                     +		"paddb %%mm5, %%mm2				\n\t"
                     +// 0, 2, 3, 1
                     +#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
                     +		"movq " #src ", " #sx "				\n\t" /* src[0] */\
                     +		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
                     +		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
                     +		"psllq $8, " #lx "				\n\t"\
                     +		"psrlq $8, " #t0 "				\n\t"\
                     +		"movd -4" #src ", " #t1 "			\n\t"\
                     +		"psrlq $24, " #t1 "				\n\t"\
                     +		"por " #t1 ", " #lx "				\n\t" /* src[-1] */\
                     +		"movd 8" #src ", " #t1 "			\n\t"\
                     +		"psllq $56, " #t1 "				\n\t"\
                     +		"por " #t1 ", " #t0 "				\n\t" /* src[+1] */\
                     +		"movq " #lx ", " #t1 "				\n\t" /* src[-1] */\
                     +		PAVGB(t0, lx)				              /* (src[-1] + src[+1])/2 */\
                     +		PAVGB(sx, lx)				      /* (src[-1] + 2src[0] + src[+1])/4 */\
                     +		PAVGB(lx, pplx)					     \
                     +		"movq " #lx ", 8(%%ecx)				\n\t"\
                     +		"movq (%%ecx), " #lx "				\n\t"\
                     +		"psubusb " #lx ", " #t1 "			\n\t"\
                     +		"psubusb " #lx ", " #t0 "			\n\t"\
                     +		"psubusb " #lx ", " #sx "			\n\t"\
                     +		"movq "MANGLE(b00)", " #lx "			\n\t"\
                     +		"pcmpeqb " #lx ", " #t1 "			\n\t" /* src[-1] > a ? 0 : -1*/\
                     +		"pcmpeqb " #lx ", " #t0 "			\n\t" /* src[+1] > a ? 0 : -1*/\
                     +		"pcmpeqb " #lx ", " #sx "			\n\t" /* src[0]  > a ? 0 : -1*/\
                     +		"paddb " #t1 ", " #t0 "				\n\t"\
                     +		"paddb " #t0 ", " #sx "				\n\t"\
                     +\
                     +		PAVGB(plx, pplx)				      /* filtered */\
                     +		"movq " #dst ", " #t0 "				\n\t" /* dst */\
                     +		"movq " #t0 ", " #t1 "				\n\t" /* dst */\
                     +		"psubusb %3, " #t0 "				\n\t"\
                     +		"paddusb %3, " #t1 "				\n\t"\
                     +		PMAXUB(t0, pplx)\
                     +		PMINUB(t1, pplx, t0)\
                     +		"paddb " #sx ", " #ppsx "			\n\t"\
                     +		"paddb " #psx ", " #ppsx "			\n\t"\
                     +		"#paddb "MANGLE(b02)", " #ppsx "		\n\t"\
                     +		"pand "MANGLE(b08)", " #ppsx "			\n\t"\
                     +		"pcmpeqb " #lx ", " #ppsx "			\n\t"\
                     +		"pand " #ppsx ", " #pplx "			\n\t"\
                     +		"pandn " #dst ", " #ppsx "			\n\t"\
                     +		"por " #pplx ", " #ppsx "			\n\t"\
                     +		"movq " #ppsx ", " #dst "			\n\t"\
                     +		"movq 8(%%ecx), " #lx "				\n\t"
+                    +
                     +/*
                     +0000000
                     +1111111
+                    +
                     +1111110
                     +1111101
                     +1111100
                     +1111011
                     +1111010
                     +1111001
+                    +
                     +1111000
                     +1110111
+                    +
                     +*/
                     +//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
                     +DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
                     +DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
                     +DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
                     +DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
                     +DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
                     +DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
                     +DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
                     +DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
+                    +
                     +		"1:			\n\t"
                     +		: : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
                     +		: "%eax", "%edx", "%ecx"
                     +	);
                     +#else
                     +	int y;
                     +	int min=255;
                     +	int max=0;
                     +	int avg;
                     +	uint8_t *p;
                     +	int s[10];
                     +	const int QP2= c->QP/2 + 1;
+                    +
                     +	for(y=1; y<9; y++)
                     +	{
                     +		int x;
                     +		p= src + stride*y;
                     +		for(x=1; x<9; x++)
                     +		{
                     +			p++;
                     +			if(*p > max) max= *p;
                     +			if(*p < min) min= *p;
                     +		}
                     +	}
                     +	avg= (min + max + 1)>>1;
+                    +
                     +	if(max - min <deringThreshold) return;
+                    +
                     +	for(y=0; y<10; y++)
                     +	{
                     +		int t = 0;
+                    +
                     +		if(src[stride*y + 0] > avg) t+= 1;
                     +		if(src[stride*y + 1] > avg) t+= 2;
                     +		if(src[stride*y + 2] > avg) t+= 4;
                     +		if(src[stride*y + 3] > avg) t+= 8;
                     +		if(src[stride*y + 4] > avg) t+= 16;
                     +		if(src[stride*y + 5] > avg) t+= 32;
                     +		if(src[stride*y + 6] > avg) t+= 64;
                     +		if(src[stride*y + 7] > avg) t+= 128;
                     +		if(src[stride*y + 8] > avg) t+= 256;
                     +		if(src[stride*y + 9] > avg) t+= 512;
+                    +
                     +		t |= (~t)<<16;
                     +		t &= (t<<1) & (t>>1);
                     +		s[y] = t;
                     +	}
+                    +
                     +	for(y=1; y<9; y++)
                     +	{
                     +		int t = s[y-1] & s[y] & s[y+1];
                     +		t|= t>>16;
                     +		s[y-1]= t;
                     +	}
+                    +
                     +	for(y=1; y<9; y++)
                     +	{
                     +		int x;
                     +		int t = s[y-1];
+                    +
                     +		p= src + stride*y;
                     +		for(x=1; x<9; x++)
                     +		{
                     +			p++;
                     +			if(t & (1<<x))
                     +			{
                     +				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                     +				      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                     +				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                     +				f= (f + 8)>>4;
+                    +
                     +#ifdef DEBUG_DERING_THRESHOLD
                     +				asm volatile("emms\n\t":);
                     +				{
                     +				static long long numPixels=0;
                     +				if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
                     +//				if((max-min)<20 || (max-min)*QP<200)
                     +//				if((max-min)*QP < 500)
                     +//				if(max-min<QP/2)
                     +				if(max-min < 20)
                     +				{
                     +					static int numSkiped=0;
                     +					static int errorSum=0;
                     +					static int worstQP=0;
                     +					static int worstRange=0;
                     +					static int worstDiff=0;
                     +					int diff= (f - *p);
                     +					int absDiff= ABS(diff);
                     +					int error= diff*diff;
+                    +
                     +					if(x==1 || x==8 || y==1 || y==8) continue;
+                    +
                     +					numSkiped++;
                     +					if(absDiff > worstDiff)
                     +					{
                     +						worstDiff= absDiff;
                     +						worstQP= QP;
                     +						worstRange= max-min;
                     +					}
                     +					errorSum+= error;
+                    +
                     +					if(1024LL*1024LL*1024LL % numSkiped == 0)
                     +					{
                     +						printf( "sum:%1.3f, skip:%d, wQP:%d, "
                     +							"wRange:%d, wDiff:%d, relSkip:%1.3f\n",
                     +							(float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
                     +							worstDiff, (float)numSkiped/numPixels);
                     +					}
                     +				}
                     +				}
                     +#endif
                     +				if     (*p + QP2 < f) *p= *p + QP2;
                     +				else if(*p - QP2 > f) *p= *p - QP2;
                     +				else *p=f;
                     +			}
                     +		}
                     +	}
                     +#ifdef DEBUG_DERING_THRESHOLD
                     +	if(max-min < 20)
                     +	{
                     +		for(y=1; y<9; y++)
                     +		{
                     +			int x;
                     +			int t = 0;
                     +			p= src + stride*y;
                     +			for(x=1; x<9; x++)
                     +			{
                     +				p++;
                     +				*p = MIN(*p + 20, 255);
                     +			}
                     +		}
                     +//		src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
                     +	}
                     +#endif
                     +#endif
                     +}
+                    +
                     +/**
                     + * Deinterlaces the given block by linear interpolation.
                     + * Will be called for every 8x8 block and can read & write from line 4-15.
                     + * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
                     + * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
                     + * Counting from the src pointer passed in, this filter reads lines 4, 6, 8, 10 and 12
                     + * and overwrites lines 5, 7, 9 and 11 with the average of the line above and the line
                     + * below; 8 bytes of every line are processed.
                     + * NOTE(review): the C path truncates ((a+b)>>1); PAVGB is defined outside this chunk,
                     + * if it rounds to nearest the two paths can differ by one LSB - confirm.
                     + */
                     +static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= 4*stride;
                     +	asm volatile(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     +// row offsets (relative to the adjusted src) reachable through each addressing form:
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
+                    +
                     +// The stores below show that PAVGB(a, b) leaves its result in b.
                     +// mm0 and mm1 alternate as "upper" and "lower" source row, so every even row is loaded once.
                     +		"movq (%0), %%mm0				\n\t" // row 0
                     +		"movq (%%eax, %1), %%mm1			\n\t" // row 2
                     +		PAVGB(%%mm1, %%mm0)
                     +		"movq %%mm0, (%%eax)				\n\t" // row 1 = avg(row 0, row 2)
                     +		"movq (%0, %1, 4), %%mm0			\n\t" // row 4
                     +		PAVGB(%%mm0, %%mm1)
                     +		"movq %%mm1, (%%eax, %1, 2)			\n\t" // row 3 = avg(row 2, row 4)
                     +		"movq (%%ecx, %1), %%mm1			\n\t" // row 6
                     +		PAVGB(%%mm1, %%mm0)
                     +		"movq %%mm0, (%%ecx)				\n\t" // row 5 = avg(row 4, row 6)
                     +		"movq (%0, %1, 8), %%mm0			\n\t" // row 8
                     +		PAVGB(%%mm0, %%mm1)
                     +		"movq %%mm1, (%%ecx, %1, 2)			\n\t" // row 7 = avg(row 6, row 8)
+                    +
                     +		: : "r" (src), "r" (stride)
                     +		: "%eax", "%ecx"
                     +	);
                     +#else
                     +	int x;
                     +	src+= 4*stride;
                     +	// one column per iteration; only odd rows are written and only even rows are read
                     +	for(x=0; x<8; x++)
                     +	{
                     +		src[stride]   = (src[0]        + src[stride*2])>>1;
                     +		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
                     +		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
                     +		src[stride*7] = (src[stride*6] + src[stride*8])>>1;
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +/**
                     + * Deinterlaces the given block by cubic interpolation.
                     + * Will be called for every 8x8 block and can read & write from line 4-15.
                     + * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
                     + * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
                     + * Counting from the src pointer passed in, this filter reads lines 3, 5, 7, ... 15 and
                     + * overwrites lines 6, 8, 10 and 12 with (-a + 9b + 9d - e)/16, where b and d are the
                     + * lines directly above and below the written line and a and e the lines 3 above and
                     + * 3 below it; 8 bytes of every line are processed.
                     + * Both paths saturate the result to 0..255 (packuswb in the SIMD path, an explicit
                     + * clamp in the C path).
                     + */
                     +static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= stride*3;
                     +	asm volatile(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
                     +		"leal (%%edx, %1, 4), %%ecx			\n\t"
                     +		"addl %1, %%ecx					\n\t"
                     +		"pxor %%mm7, %%mm7				\n\t" // mm7 = 0, used to widen bytes to words
                     +// row offsets (relative to the adjusted src) reachable through each addressing form:
                     +//	0	1	2	3	4	5	6	7	8	9	10
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
+                    +
                     +// DEINT_CUBIC(a,b,c,d,e): c = (9b + 9d - a - e)/16, computed on 16 bit words and
                     +// packed back to bytes with unsigned saturation.
                     +#define DEINT_CUBIC(a,b,c,d,e)\
                     +		"movq " #a ", %%mm0				\n\t"\
                     +		"movq " #b ", %%mm1				\n\t"\
                     +		"movq " #d ", %%mm2				\n\t"\
                     +		"movq " #e ", %%mm3				\n\t"\
                     +		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
                     +		PAVGB(%%mm3, %%mm0)					/* (a+e) /2 */\
                     +		"movq %%mm0, %%mm2				\n\t"\
                     +		"punpcklbw %%mm7, %%mm0				\n\t"\
                     +		"punpckhbw %%mm7, %%mm2				\n\t"\
                     +		"movq %%mm1, %%mm3				\n\t"\
                     +		"punpcklbw %%mm7, %%mm1				\n\t"\
                     +		"punpckhbw %%mm7, %%mm3				\n\t"\
                     +		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
                     +		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
                     +		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
                     +		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
                     +		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
                     +		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
                     +		"packuswb %%mm3, %%mm1				\n\t"\
                     +		"movq %%mm1, " #c "				\n\t"
+                    +
                     +DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
                     +DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
                     +DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
                     +DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
+                    +
                     +		: : "r" (src), "r" (stride)
                     +		: "%eax", "%edx", "%ecx"
                     +	);
                     +#else
                     +	int x;
                     +	src+= stride*3;
                     +	for(x=0; x<8; x++)
                     +	{
                     +		int i;
                     +		// Only odd rows (3,5,7,9) are written and only even rows (0..12) are read,
                     +		// so the four outputs are independent of each other.
                     +		for(i=3; i<=9; i+=2)
                     +		{
                     +			int v= (-src[stride*(i-3)] + 9*src[stride*(i-1)]
                     +			        + 9*src[stride*(i+1)] - src[stride*(i+3)])>>4;
                     +			// the cubic kernel over/undershoots on sharp edges; clamp instead of
                     +			// letting the uint8_t store wrap around (matches packuswb above)
                     +			src[stride*i]= v<0 ? 0 : (v>255 ? 255 : v);
                     +		}
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +/**
                     + * Deinterlaces the given block with a vertical (-1 4 2 4 -1)/8 filter.
                     + * Will be called for every 8x8 block and can read & write from line 4-15.
                     + * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
                     + * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
                     + * Counting from the src pointer passed in, this filter reads lines 4-13 and overwrites
                     + * lines 5, 7, 9 and 11; 8 bytes of every line are processed.
                     + * tmp holds 8 bytes: on entry the line used as the topmost (-1) tap for line 5, on exit
                     + * the unfiltered content of line 11, so it can be handed to the call for the block below.
                     + * Both paths saturate the result to 0..255 (packuswb in the SIMD path, an explicit
                     + * clamp in the C path).
                     + */
                     +static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= stride*4;
                     +	asm volatile(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
                     +		"pxor %%mm7, %%mm7				\n\t" // mm7 = 0, used to widen bytes to words
                     +		"movq (%2), %%mm0				\n\t" // mm0 = line saved by the previous call
                     +// row offsets (relative to the adjusted src) reachable through each addressing form:
                     +//	0	1	2	3	4	5	6	7	8	9	10
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
+                    +
                     +// DEINT_FF(a,b,c,d): b = (-prev + 4a + 2b + 4c - d)/8 where prev is the value in mm0.
                     +// The unfiltered b is left in mm0 afterwards, so it becomes prev of the next invocation.
                     +#define DEINT_FF(a,b,c,d)\
                     +		"movq " #a ", %%mm1				\n\t"\
                     +		"movq " #b ", %%mm2				\n\t"\
                     +		"movq " #c ", %%mm3				\n\t"\
                     +		"movq " #d ", %%mm4				\n\t"\
                     +		PAVGB(%%mm3, %%mm1)					\
                     +		PAVGB(%%mm4, %%mm0)					\
                     +		"movq %%mm0, %%mm3				\n\t"\
                     +		"punpcklbw %%mm7, %%mm0				\n\t"\
                     +		"punpckhbw %%mm7, %%mm3				\n\t"\
                     +		"movq %%mm1, %%mm4				\n\t"\
                     +		"punpcklbw %%mm7, %%mm1				\n\t"\
                     +		"punpckhbw %%mm7, %%mm4				\n\t"\
                     +		"psllw $2, %%mm1				\n\t"\
                     +		"psllw $2, %%mm4				\n\t"\
                     +		"psubw %%mm0, %%mm1				\n\t"\
                     +		"psubw %%mm3, %%mm4				\n\t"\
                     +		"movq %%mm2, %%mm5				\n\t"\
                     +		"movq %%mm2, %%mm0				\n\t"\
                     +		"punpcklbw %%mm7, %%mm2				\n\t"\
                     +		"punpckhbw %%mm7, %%mm5				\n\t"\
                     +		"paddw %%mm2, %%mm1				\n\t"\
                     +		"paddw %%mm5, %%mm4				\n\t"\
                     +		"psraw $2, %%mm1				\n\t"\
                     +		"psraw $2, %%mm4				\n\t"\
                     +		"packuswb %%mm4, %%mm1				\n\t"\
                     +		"movq %%mm1, " #b "				\n\t"\
+                    +
                     +DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
                     +DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
                     +DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
                     +DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
+                    +
                     +		"movq %%mm0, (%2)				\n\t" // save unfiltered row 7 for the next call
                     +		: : "r" (src), "r" (stride), "r"(tmp)
                     +		: "%eax", "%edx"
                     +	);
                     +#else
                     +	int x;
                     +	src+= stride*4;
                     +	for(x=0; x<8; x++)
                     +	{
                     +		int prev= tmp[x];	// unfiltered odd row above the one being written
                     +		int i;
                     +		// Mirrors the SIMD path: rows 1,3,5,7 are both the centre tap and the output,
                     +		// and the unfiltered centre becomes the top tap of the next output row.
                     +		// Row i+2 is read before it is overwritten in the following iteration.
                     +		for(i=1; i<8; i+=2)
                     +		{
                     +			int cur= src[stride*i];
                     +			int v= (-prev + 4*src[stride*(i-1)] + 2*cur
                     +			        + 4*src[stride*(i+1)] - src[stride*(i+2)] + 4)>>3;
                     +			// clamp instead of letting the uint8_t store wrap around (matches packuswb above)
                     +			src[stride*i]= v<0 ? 0 : (v>255 ? 255 : v);
                     +			prev= cur;
                     +		}
                     +		tmp[x]= prev;		// unfiltered row 7, same value the SIMD path stores
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +/**
                     + * Deinterlaces the given block by blending every line with its two successors.
                     + * Will be called for every 8x8 block and can read & write from line 4-15.
                     + * Lines 0-3 have already been passed through the deblock / dering filters, but can be read too.
                     + * Lines 4-12 will be read into the deblocking filter and should be deinterlaced.
                     + * Every written line k becomes (L[k] + 2*L[k+1] + L[k+2])/4, i.e. the kernel is centred
                     + * one line below the line it is stored to, which shifts the image up by 1 line
                     + * (FIXME if this is a problem).
                     + * Counting from the src pointer passed in, this filter reads lines 4-13 and writes
                     + * lines 4-11; 8 bytes of every line are processed.
                     + * NOTE(review): the C path truncates (>>2 without rounding term); PAVGB is defined
                     + * outside this chunk, if it rounds to nearest the two paths can differ slightly - confirm.
                     + */
                     +static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
                     +{
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	src+= 4*stride;
                     +	asm volatile(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
                     +// row offsets (relative to the adjusted src) reachable through each addressing form:
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
+                    +
                     +// The stores below show that PAVGB(a, b) leaves its result in b.
                     +// mm0, mm1 and mm2 rotate through three consecutive source rows, so every row is loaded once.
                     +		"movq (%0), %%mm0				\n\t" // L0
                     +		"movq (%%eax, %1), %%mm1			\n\t" // L2
                     +		PAVGB(%%mm1, %%mm0)				      // L0+L2
                     +		"movq (%%eax), %%mm2				\n\t" // L1
                     +		PAVGB(%%mm2, %%mm0)				      // 2L1 + L0 + L2
                     +		"movq %%mm0, (%0)				\n\t"
                     +		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
                     +		PAVGB(%%mm0, %%mm2)				      // L1+L3
                     +		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
                     +		"movq %%mm2, (%%eax)				\n\t"
                     +		"movq (%0, %1, 4), %%mm2			\n\t" // L4
                     +		PAVGB(%%mm2, %%mm1)				      // L2+L4
                     +		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
                     +		"movq %%mm1, (%%eax, %1)			\n\t"
                     +		"movq (%%edx), %%mm1				\n\t" // L5
                     +		PAVGB(%%mm1, %%mm0)				      // L3+L5
                     +		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
                     +		"movq %%mm0, (%%eax, %1, 2)			\n\t"
                     +		"movq (%%edx, %1), %%mm0			\n\t" // L6
                     +		PAVGB(%%mm0, %%mm2)				      // L4+L6
                     +		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
                     +		"movq %%mm2, (%0, %1, 4)			\n\t"
                     +		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
                     +		PAVGB(%%mm2, %%mm1)				      // L5+L7
                     +		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
                     +		"movq %%mm1, (%%edx)				\n\t"
                     +		"movq (%0, %1, 8), %%mm1			\n\t" // L8
                     +		PAVGB(%%mm1, %%mm0)				      // L6+L8
                     +		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
                     +		"movq %%mm0, (%%edx, %1)			\n\t"
                     +		"movq (%%edx, %1, 4), %%mm0			\n\t" // L9
                     +		PAVGB(%%mm0, %%mm2)				      // L7+L9
                     +		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
                     +		"movq %%mm2, (%%edx, %1, 2)			\n\t"
+                    +
+                    +
                     +		: : "r" (src), "r" (stride)
                     +		: "%eax", "%edx"
                     +	);
                     +#else
                     +	int x;
                     +	src+= 4*stride;
                     +	for(x=0; x<8; x++)
                     +	{
                     +		// Rows are updated top to bottom; each statement only reads its own row and
                     +		// rows below it, which have not been overwritten yet.
                     +		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
                     +		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
                     +		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
                     +		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
                     +		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
                     +		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
                     +		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
                     +		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +/**
                     + * Deinterlaces the given block
                     + * will be called for every 8x8 block and can read & write from line 4-15,
                     + * lines 0-3 have already been passed through the deblock / dering filters, but can be read too
                     + * lines 4-12 will be read into the deblocking filter and should be deinterlaced
                     + */
                     +static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
                     +{
                     +#ifdef HAVE_MMX
                     +	src+= 4*stride;
                     +#ifdef HAVE_MMX2
                     +	asm volatile(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
+                    +
                     +		"movq (%0), %%mm0				\n\t" //
                     +		"movq (%%eax, %1), %%mm2			\n\t" //
                     +		"movq (%%eax), %%mm1				\n\t" //
                     +		"movq %%mm0, %%mm3				\n\t"
                     +		"pmaxub %%mm1, %%mm0				\n\t" //
                     +		"pminub %%mm3, %%mm1				\n\t" //
                     +		"pmaxub %%mm2, %%mm1				\n\t" //
                     +		"pminub %%mm1, %%mm0				\n\t"
                     +		"movq %%mm0, (%%eax)				\n\t"
+                    +
                     +		"movq (%0, %1, 4), %%mm0			\n\t" //
                     +		"movq (%%eax, %1, 2), %%mm1			\n\t" //
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"pmaxub %%mm1, %%mm2				\n\t" //
                     +		"pminub %%mm3, %%mm1				\n\t" //
                     +		"pmaxub %%mm0, %%mm1				\n\t" //
                     +		"pminub %%mm1, %%mm2				\n\t"
                     +		"movq %%mm2, (%%eax, %1, 2)			\n\t"
+                    +
                     +		"movq (%%edx), %%mm2				\n\t" //
                     +		"movq (%%edx, %1), %%mm1			\n\t" //
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"pmaxub %%mm0, %%mm2				\n\t" //
                     +		"pminub %%mm3, %%mm0				\n\t" //
                     +		"pmaxub %%mm1, %%mm0				\n\t" //
                     +		"pminub %%mm0, %%mm2				\n\t"
                     +		"movq %%mm2, (%%edx)				\n\t"
+                    +
                     +		"movq (%%edx, %1, 2), %%mm2			\n\t" //
                     +		"movq (%0, %1, 8), %%mm0			\n\t" //
                     +		"movq %%mm2, %%mm3				\n\t"
                     +		"pmaxub %%mm0, %%mm2				\n\t" //
                     +		"pminub %%mm3, %%mm0				\n\t" //
                     +		"pmaxub %%mm1, %%mm0				\n\t" //
                     +		"pminub %%mm0, %%mm2				\n\t"
                     +		"movq %%mm2, (%%edx, %1, 2)			\n\t"
+                    +
+                    +
                     +		: : "r" (src), "r" (stride)
                     +		: "%eax", "%edx"
                     +	);
+                    +
                     +#else // MMX without MMX2
                     +	asm volatile(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
                     +		"pxor %%mm7, %%mm7				\n\t"
+                    +
                     +#define MEDIAN(a,b,c)\
                     +		"movq " #a ", %%mm0				\n\t"\
                     +		"movq " #b ", %%mm2				\n\t"\
                     +		"movq " #c ", %%mm1				\n\t"\
                     +		"movq %%mm0, %%mm3				\n\t"\
                     +		"movq %%mm1, %%mm4				\n\t"\
                     +		"movq %%mm2, %%mm5				\n\t"\
                     +		"psubusb %%mm1, %%mm3				\n\t"\
                     +		"psubusb %%mm2, %%mm4				\n\t"\
                     +		"psubusb %%mm0, %%mm5				\n\t"\
                     +		"pcmpeqb %%mm7, %%mm3				\n\t"\
                     +		"pcmpeqb %%mm7, %%mm4				\n\t"\
                     +		"pcmpeqb %%mm7, %%mm5				\n\t"\
                     +		"movq %%mm3, %%mm6				\n\t"\
                     +		"pxor %%mm4, %%mm3				\n\t"\
                     +		"pxor %%mm5, %%mm4				\n\t"\
                     +		"pxor %%mm6, %%mm5				\n\t"\
                     +		"por %%mm3, %%mm1				\n\t"\
                     +		"por %%mm4, %%mm2				\n\t"\
                     +		"por %%mm5, %%mm0				\n\t"\
                     +		"pand %%mm2, %%mm0				\n\t"\
                     +		"pand %%mm1, %%mm0				\n\t"\
                     +		"movq %%mm0, " #b "				\n\t"
+                    +
                     +MEDIAN((%0), (%%eax), (%%eax, %1))
                     +MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
                     +MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
                     +MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
+                    +
                     +		: : "r" (src), "r" (stride)
                     +		: "%eax", "%edx"
                     +	);
                     +#endif // MMX
                     +#else
                     +	int x, y;
                     +	src+= 4*stride;
                     +	// FIXME - there should be a way to do a few columns in parallel like w/mmx
                     +	for(x=0; x<8; x++)
                     +	{
                     +		uint8_t *colsrc = src;
                     +		for (y=0; y<4; y++)
                     +		{
                     +			int a, b, c, d, e, f;
                     +			a = colsrc[0       ];
                     +			b = colsrc[stride  ];
                     +			c = colsrc[stride*2];
                     +			d = (a-b)>>31;
                     +			e = (b-c)>>31;
                     +			f = (c-a)>>31;
                     +			colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
                     +			colsrc += stride*2;
                     +		}
                     +		src++;
                     +	}
                     +#endif
                     +}
+                    +
                     +#ifdef HAVE_MMX
                     +/**
                     + * Transposes and shifts the given 8x8 block into dst1 and dst2.
                     + * The 8 source lines are read with srcStride. Source column n becomes one
                     + * 8 byte line in the destination buffers, which use a fixed line size of
                     + * 16 bytes: columns 0-4 are stored at dst1 + 128 + 16*n and columns 3-7 at
                     + * dst2 + 16*n (columns 3 and 4 are therefore stored to both buffers).
                     + *
                     + * @param dst1      first destination buffer, written at byte offsets 128-199
                     + * @param dst2      second destination buffer, written at byte offsets 48-119
                     + * @param src       top-left pixel of the 8x8 source block
                     + * @param srcStride offset in bytes from one source line to the next
                     + */
                     +static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
                     +{
                     +	asm(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
                     +		// source lines 0-3 -> bytes 0-3 of every destination line
                     +		"movq (%0), %%mm0		\n\t" // 12345678
                     +		"movq (%%eax), %%mm1		\n\t" // abcdefgh
                     +		"movq %%mm0, %%mm2		\n\t" // 12345678
                     +		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     +		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    +
                     +		"movq (%%eax, %1), %%mm1	\n\t"
                     +		"movq (%%eax, %1, 2), %%mm3	\n\t"
                     +		"movq %%mm1, %%mm4		\n\t"
                     +		"punpcklbw %%mm3, %%mm1		\n\t"
                     +		"punpckhbw %%mm3, %%mm4		\n\t"
+                    +
                     +		"movq %%mm0, %%mm3		\n\t"
                     +		"punpcklwd %%mm1, %%mm0		\n\t" // columns 0, 1
                     +		"punpckhwd %%mm1, %%mm3		\n\t" // columns 2, 3
                     +		"movq %%mm2, %%mm1		\n\t"
                     +		"punpcklwd %%mm4, %%mm2		\n\t" // columns 4, 5
                     +		"punpckhwd %%mm4, %%mm1		\n\t" // columns 6, 7
+                    +
                     +		"movd %%mm0, 128(%2)		\n\t"
                     +		"psrlq $32, %%mm0		\n\t"
                     +		"movd %%mm0, 144(%2)		\n\t"
                     +		"movd %%mm3, 160(%2)		\n\t"
                     +		"psrlq $32, %%mm3		\n\t"
                     +		"movd %%mm3, 176(%2)		\n\t"
                     +		"movd %%mm3, 48(%3)		\n\t"
                     +		"movd %%mm2, 192(%2)		\n\t"
                     +		"movd %%mm2, 64(%3)		\n\t"
                     +		"psrlq $32, %%mm2		\n\t"
                     +		"movd %%mm2, 80(%3)		\n\t"
                     +		"movd %%mm1, 96(%3)		\n\t"
                     +		"psrlq $32, %%mm1		\n\t"
                     +		"movd %%mm1, 112(%3)		\n\t"
+                    +
                     +		"leal (%%eax, %1, 4), %%eax	\n\t" // eax = src + 5*srcStride
+                    +
                     +		// source lines 4-7 -> bytes 4-7 of every destination line
                     +		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
                     +		"movq (%%eax), %%mm1		\n\t" // abcdefgh
                     +		"movq %%mm0, %%mm2		\n\t" // 12345678
                     +		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     +		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    +
                     +		"movq (%%eax, %1), %%mm1	\n\t"
                     +		"movq (%%eax, %1, 2), %%mm3	\n\t"
                     +		"movq %%mm1, %%mm4		\n\t"
                     +		"punpcklbw %%mm3, %%mm1		\n\t"
                     +		"punpckhbw %%mm3, %%mm4		\n\t"
+                    +
                     +		"movq %%mm0, %%mm3		\n\t"
                     +		"punpcklwd %%mm1, %%mm0		\n\t"
                     +		"punpckhwd %%mm1, %%mm3		\n\t"
                     +		"movq %%mm2, %%mm1		\n\t"
                     +		"punpcklwd %%mm4, %%mm2		\n\t"
                     +		"punpckhwd %%mm4, %%mm1		\n\t"
+                    +
                     +		"movd %%mm0, 132(%2)		\n\t"
                     +		"psrlq $32, %%mm0		\n\t"
                     +		"movd %%mm0, 148(%2)		\n\t"
                     +		"movd %%mm3, 164(%2)		\n\t"
                     +		"psrlq $32, %%mm3		\n\t"
                     +		"movd %%mm3, 180(%2)		\n\t"
                     +		"movd %%mm3, 52(%3)		\n\t"
                     +		"movd %%mm2, 196(%2)		\n\t"
                     +		"movd %%mm2, 68(%3)		\n\t"
                     +		"psrlq $32, %%mm2		\n\t"
                     +		"movd %%mm2, 84(%3)		\n\t"
                     +		"movd %%mm1, 100(%3)		\n\t"
                     +		"psrlq $32, %%mm1		\n\t"
                     +		"movd %%mm1, 116(%3)		\n\t"
+                    +
+                    +
                     +	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
                     +	: "%eax", "memory" // "memory": dst1 / dst2 are written behind the compiler's back
                     +	);
                     +}
+                    +
                     +/**
                     + * Transposes the given 8x8 block.
                     + * src is read as 8 lines of 8 bytes with a fixed line size of 16 bytes;
                     + * source line n becomes column n of the 8 destination lines, which are
                     + * written with dstStride.
                     + *
                     + * @param dst       top-left pixel of the 8x8 destination block
                     + * @param dstStride offset in bytes from one destination line to the next
                     + * @param src       source buffer, read at byte offsets 0-119
                     + */
                     +static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
                     +{
                     +	asm(
                     +		"leal (%0, %1), %%eax				\n\t"
                     +		"leal (%%eax, %1, 4), %%edx			\n\t"
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
                     +		// source lines 0-3 -> bytes 0-3 of every destination line
                     +		"movq (%2), %%mm0		\n\t" // 12345678
                     +		"movq 16(%2), %%mm1		\n\t" // abcdefgh
                     +		"movq %%mm0, %%mm2		\n\t" // 12345678
                     +		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     +		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    +
                     +		"movq 32(%2), %%mm1		\n\t"
                     +		"movq 48(%2), %%mm3		\n\t"
                     +		"movq %%mm1, %%mm4		\n\t"
                     +		"punpcklbw %%mm3, %%mm1		\n\t"
                     +		"punpckhbw %%mm3, %%mm4		\n\t"
+                    +
                     +		"movq %%mm0, %%mm3		\n\t"
                     +		"punpcklwd %%mm1, %%mm0		\n\t"
                     +		"punpckhwd %%mm1, %%mm3		\n\t"
                     +		"movq %%mm2, %%mm1		\n\t"
                     +		"punpcklwd %%mm4, %%mm2		\n\t"
                     +		"punpckhwd %%mm4, %%mm1		\n\t"
+                    +
                     +		"movd %%mm0, (%0)		\n\t"
                     +		"psrlq $32, %%mm0		\n\t"
                     +		"movd %%mm0, (%%eax)		\n\t"
                     +		"movd %%mm3, (%%eax, %1)	\n\t"
                     +		"psrlq $32, %%mm3		\n\t"
                     +		"movd %%mm3, (%%eax, %1, 2)	\n\t"
                     +		"movd %%mm2, (%0, %1, 4)	\n\t"
                     +		"psrlq $32, %%mm2		\n\t"
                     +		"movd %%mm2, (%%edx)		\n\t"
                     +		"movd %%mm1, (%%edx, %1)	\n\t"
                     +		"psrlq $32, %%mm1		\n\t"
                     +		"movd %%mm1, (%%edx, %1, 2)	\n\t"
+                    +
+                    +
                     +		// source lines 4-7 -> bytes 4-7 of every destination line
                     +		"movq 64(%2), %%mm0		\n\t" // 12345678
                     +		"movq 80(%2), %%mm1		\n\t" // abcdefgh
                     +		"movq %%mm0, %%mm2		\n\t" // 12345678
                     +		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     +		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    +
                     +		"movq 96(%2), %%mm1		\n\t"
                     +		"movq 112(%2), %%mm3		\n\t"
                     +		"movq %%mm1, %%mm4		\n\t"
                     +		"punpcklbw %%mm3, %%mm1		\n\t"
                     +		"punpckhbw %%mm3, %%mm4		\n\t"
+                    +
                     +		"movq %%mm0, %%mm3		\n\t"
                     +		"punpcklwd %%mm1, %%mm0		\n\t"
                     +		"punpckhwd %%mm1, %%mm3		\n\t"
                     +		"movq %%mm2, %%mm1		\n\t"
                     +		"punpcklwd %%mm4, %%mm2		\n\t"
                     +		"punpckhwd %%mm4, %%mm1		\n\t"
+                    +
                     +		"movd %%mm0, 4(%0)		\n\t"
                     +		"psrlq $32, %%mm0		\n\t"
                     +		"movd %%mm0, 4(%%eax)		\n\t"
                     +		"movd %%mm3, 4(%%eax, %1)	\n\t"
                     +		"psrlq $32, %%mm3		\n\t"
                     +		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
                     +		"movd %%mm2, 4(%0, %1, 4)	\n\t"
                     +		"psrlq $32, %%mm2		\n\t"
                     +		"movd %%mm2, 4(%%edx)		\n\t"
                     +		"movd %%mm1, 4(%%edx, %1)	\n\t"
                     +		"psrlq $32, %%mm1		\n\t"
                     +		"movd %%mm1, 4(%%edx, %1, 2)	\n\t"
+                    +
                     +	:: "r" (dst), "r" (dstStride), "r" (src)
                     +	: "%eax", "%edx", "memory" // "memory": dst is written behind the compiler's back
                     +	);
                     +}
                     +#endif
                     +//static int test=0;
+                    +
                     +static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                     +				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
                     +{
                     +	/* Temporal noise reducer for one 8x8 block.
                     +	 * Computes a difference measure d between src and the reference block in
                     +	 * tempBlured, smooths it with the 4 neighbouring entries around
                     +	 * tempBluredPast ([-256], [-1], [+1], [+256]) and then, depending on how d
                     +	 * compares with the 3 thresholds in maxNoise, either copies src into
                     +	 * tempBlured (large d) or writes a weighted average of both blocks into
                     +	 * src and tempBlured (smaller d, more weight on tempBlured).
                     +	 *
                     +	 * The thresholds are copied to tempBluredPast[127..129] so the asm can reach
                     +	 * them through the same pointer,
                     +	 * to save a register (FIXME do this outside of the loops) */
                     +	tempBluredPast[127]= maxNoise[0];
                     +	tempBluredPast[128]= maxNoise[1];
                     +	tempBluredPast[129]= maxNoise[2];
+                    +
                     +#define FAST_L2_DIFF
                     +//#define L1_DIFF //you should change the thresholds too if you try that one
                     +#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     +	asm volatile(
                     +		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
                     +		"leal (%2, %2, 4), %%edx			\n\t" // 5*stride
                     +		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     +//	0	1	2	3	4	5	6	7	8	9
                     +//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+edx	%x+2eax	%x+ecx	%x+8%2
                     +//FIXME reorder?
                     +#ifdef L1_DIFF //needs mmx2
                     +		"movq (%0), %%mm0				\n\t" // L0
                     +		"psadbw (%1), %%mm0				\n\t" // |L0-R0|
                     +		"movq (%0, %2), %%mm1				\n\t" // L1
                     +		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
                     +		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     +		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
                     +		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     +		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|
+                    +
                     +		"movq (%0, %2, 4), %%mm4			\n\t" // L4
                     +		"paddw %%mm1, %%mm0				\n\t"
                     +		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
                     +		"movq (%0, %%edx), %%mm5			\n\t" // L5
                     +		"paddw %%mm2, %%mm0				\n\t"
                     +		"psadbw (%1, %%edx), %%mm5			\n\t" // |L5-R5|
                     +		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
                     +		"paddw %%mm3, %%mm0				\n\t"
                     +		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
                     +		"movq (%0, %%ecx), %%mm7			\n\t" // L7
                     +		"paddw %%mm4, %%mm0				\n\t"
                     +		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
                     +		"paddw %%mm5, %%mm6				\n\t"
                     +		"paddw %%mm7, %%mm6				\n\t"
                     +		"paddw %%mm6, %%mm0				\n\t"
                     +#elif defined (FAST_L2_DIFF)
                     +		"pcmpeqb %%mm7, %%mm7				\n\t" // mm7 = all ones
                     +		"movq "MANGLE(b80)", %%mm6			\n\t"
                     +		"pxor %%mm0, %%mm0				\n\t" // mm0 = 0 (accumulator)
                     +#define L2_DIFF_CORE(a, b)\
                     +		"movq " #a ", %%mm5				\n\t"\
                     +		"movq " #b ", %%mm2				\n\t"\
                     +		"pxor %%mm7, %%mm2				\n\t"\
                     +		PAVGB(%%mm2, %%mm5)\
                     +		"paddb %%mm6, %%mm5				\n\t"\
                     +		"movq %%mm5, %%mm2				\n\t"\
                     +		"psllw $8, %%mm5				\n\t"\
                     +		"pmaddwd %%mm5, %%mm5				\n\t"\
                     +		"pmaddwd %%mm2, %%mm2				\n\t"\
                     +		"paddd %%mm2, %%mm5				\n\t"\
                     +		"psrld $14, %%mm5				\n\t"\
                     +		"paddd %%mm5, %%mm0				\n\t"
+                    +
                     +L2_DIFF_CORE((%0), (%1))
                     +L2_DIFF_CORE((%0, %2), (%1, %2))
                     +L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
                     +L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
                     +L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
                     +L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
                     +L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
                     +L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
+                    +
                     +#else
                     +		"pxor %%mm7, %%mm7				\n\t" // mm7 = 0, used to unpack bytes to words
                     +		"pxor %%mm0, %%mm0				\n\t" // mm0 = 0 (accumulator)
                     +#define L2_DIFF_CORE(a, b)\
                     +		"movq " #a ", %%mm5				\n\t"\
                     +		"movq " #b ", %%mm2				\n\t"\
                     +		"movq %%mm5, %%mm1				\n\t"\
                     +		"movq %%mm2, %%mm3				\n\t"\
                     +		"punpcklbw %%mm7, %%mm5				\n\t"\
                     +		"punpckhbw %%mm7, %%mm1				\n\t"\
                     +		"punpcklbw %%mm7, %%mm2				\n\t"\
                     +		"punpckhbw %%mm7, %%mm3				\n\t"\
                     +		"psubw %%mm2, %%mm5				\n\t"\
                     +		"psubw %%mm3, %%mm1				\n\t"\
                     +		"pmaddwd %%mm5, %%mm5				\n\t"\
                     +		"pmaddwd %%mm1, %%mm1				\n\t"\
                     +		"paddd %%mm1, %%mm5				\n\t"\
                     +		"paddd %%mm5, %%mm0				\n\t"
+                    +
                     +L2_DIFF_CORE((%0), (%1))
                     +L2_DIFF_CORE((%0, %2), (%1, %2))
                     +L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
                     +L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
                     +L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
                     +L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
                     +L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
                     +L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
+                    +
                     +#endif
+                    +
                     +		"movq %%mm0, %%mm4				\n\t"
                     +		"psrlq $32, %%mm0				\n\t"
                     +		"paddd %%mm0, %%mm4				\n\t" // add the high and the low dword of the accumulator
                     +		"movd %%mm4, %%ecx				\n\t" // ecx = d
                     +		"shll $2, %%ecx					\n\t" // 4*d
                     +		"movl %3, %%edx					\n\t" // edx = tempBluredPast
                     +		"addl -4(%%edx), %%ecx				\n\t" // + tempBluredPast[-1]
                     +		"addl 4(%%edx), %%ecx				\n\t" // + tempBluredPast[+1]
                     +		"addl -1024(%%edx), %%ecx			\n\t" // + tempBluredPast[-256]
                     +		"addl $4, %%ecx					\n\t" // rounding
                     +		"addl 1024(%%edx), %%ecx			\n\t" // + tempBluredPast[+256]
                     +		"shrl $3, %%ecx					\n\t" // /8
                     +		"movl %%ecx, (%%edx)				\n\t" // tempBluredPast[0] = smoothed d
+                    +
                     +//		"movl %3, %%ecx					\n\t"
                     +//		"movl %%ecx, test				\n\t"
                     +//		"jmp 4f \n\t"
                     +		"cmpl 512(%%edx), %%ecx				\n\t" // tempBluredPast[128], set to maxNoise[1] above
                     +		" jb 2f						\n\t"
                     +		"cmpl 516(%%edx), %%ecx				\n\t" // tempBluredPast[129], set to maxNoise[2] above
                     +		" jb 1f						\n\t"
+                    +
                     +		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     +		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     +		"movq (%0), %%mm0				\n\t" // L0
                     +		"movq (%0, %2), %%mm1				\n\t" // L1
                     +		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     +		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     +		"movq (%0, %2, 4), %%mm4			\n\t" // L4
                     +		"movq (%0, %%edx), %%mm5			\n\t" // L5
                     +		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
                     +		"movq (%0, %%ecx), %%mm7			\n\t" // L7
                     +		"movq %%mm0, (%1)				\n\t" // L0
                     +		"movq %%mm1, (%1, %2)				\n\t" // L1
                     +		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
                     +		"movq %%mm3, (%1, %%eax)			\n\t" // L3
                     +		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
                     +		"movq %%mm5, (%1, %%edx)			\n\t" // L5
                     +		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
                     +		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
                     +		"jmp 4f						\n\t"
+                    +
                     +		"1:						\n\t"
                     +		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     +		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     +		"movq (%0), %%mm0				\n\t" // L0
                     +		PAVGB((%1), %%mm0)				      // L0
                     +		"movq (%0, %2), %%mm1				\n\t" // L1
                     +		PAVGB((%1, %2), %%mm1)				      // L1
                     +		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     +		PAVGB((%1, %2, 2), %%mm2)			      // L2
                     +		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     +		PAVGB((%1, %%eax), %%mm3)			      // L3
                     +		"movq (%0, %2, 4), %%mm4			\n\t" // L4
                     +		PAVGB((%1, %2, 4), %%mm4)			      // L4
                     +		"movq (%0, %%edx), %%mm5			\n\t" // L5
                     +		PAVGB((%1, %%edx), %%mm5)			      // L5
                     +		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
                     +		PAVGB((%1, %%eax, 2), %%mm6)			      // L6
                     +		"movq (%0, %%ecx), %%mm7			\n\t" // L7
                     +		PAVGB((%1, %%ecx), %%mm7)			      // L7
                     +		"movq %%mm0, (%1)				\n\t" // R0
                     +		"movq %%mm1, (%1, %2)				\n\t" // R1
                     +		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
                     +		"movq %%mm3, (%1, %%eax)			\n\t" // R3
                     +		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
                     +		"movq %%mm5, (%1, %%edx)			\n\t" // R5
                     +		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
                     +		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
                     +		"movq %%mm0, (%0)				\n\t" // L0
                     +		"movq %%mm1, (%0, %2)				\n\t" // L1
                     +		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
                     +		"movq %%mm3, (%0, %%eax)			\n\t" // L3
                     +		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
                     +		"movq %%mm5, (%0, %%edx)			\n\t" // L5
                     +		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
                     +		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
                     +		"jmp 4f						\n\t"
+                    +
                     +		"2:						\n\t"
                     +		"cmpl 508(%%edx), %%ecx				\n\t" // tempBluredPast[127], set to maxNoise[0] above
                     +		" jb 3f						\n\t"
+                    +
                     +		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     +		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     +		"movq (%0), %%mm0				\n\t" // L0
                     +		"movq (%0, %2), %%mm1				\n\t" // L1
                     +		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     +		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     +		"movq (%1), %%mm4				\n\t" // R0
                     +		"movq (%1, %2), %%mm5				\n\t" // R1
                     +		"movq (%1, %2, 2), %%mm6			\n\t" // R2
                     +		"movq (%1, %%eax), %%mm7			\n\t" // R3
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		"movq %%mm0, (%1)				\n\t" // R0
                     +		"movq %%mm1, (%1, %2)				\n\t" // R1
                     +		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
                     +		"movq %%mm3, (%1, %%eax)			\n\t" // R3
                     +		"movq %%mm0, (%0)				\n\t" // L0
                     +		"movq %%mm1, (%0, %2)				\n\t" // L1
                     +		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
                     +		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+                    +
                     +		"movq (%0, %2, 4), %%mm0			\n\t" // L4
                     +		"movq (%0, %%edx), %%mm1			\n\t" // L5
                     +		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
                     +		"movq (%0, %%ecx), %%mm3			\n\t" // L7
                     +		"movq (%1, %2, 4), %%mm4			\n\t" // R4
                     +		"movq (%1, %%edx), %%mm5			\n\t" // R5
                     +		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
                     +		"movq (%1, %%ecx), %%mm7			\n\t" // R7
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
                     +		"movq %%mm1, (%1, %%edx)			\n\t" // R5
                     +		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
                     +		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
                     +		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
                     +		"movq %%mm1, (%0, %%edx)			\n\t" // L5
                     +		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
                     +		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
                     +		"jmp 4f						\n\t"
+                    +
                     +		"3:						\n\t"
                     +		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     +		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     +		"movq (%0), %%mm0				\n\t" // L0
                     +		"movq (%0, %2), %%mm1				\n\t" // L1
                     +		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     +		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     +		"movq (%1), %%mm4				\n\t" // R0
                     +		"movq (%1, %2), %%mm5				\n\t" // R1
                     +		"movq (%1, %2, 2), %%mm6			\n\t" // R2
                     +		"movq (%1, %%eax), %%mm7			\n\t" // R3
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		"movq %%mm0, (%1)				\n\t" // R0
                     +		"movq %%mm1, (%1, %2)				\n\t" // R1
                     +		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
                     +		"movq %%mm3, (%1, %%eax)			\n\t" // R3
                     +		"movq %%mm0, (%0)				\n\t" // L0
                     +		"movq %%mm1, (%0, %2)				\n\t" // L1
                     +		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
                     +		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+                    +
                     +		"movq (%0, %2, 4), %%mm0			\n\t" // L4
                     +		"movq (%0, %%edx), %%mm1			\n\t" // L5
                     +		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
                     +		"movq (%0, %%ecx), %%mm3			\n\t" // L7
                     +		"movq (%1, %2, 4), %%mm4			\n\t" // R4
                     +		"movq (%1, %%edx), %%mm5			\n\t" // R5
                     +		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
                     +		"movq (%1, %%ecx), %%mm7			\n\t" // R7
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		PAVGB(%%mm4, %%mm0)
                     +		PAVGB(%%mm5, %%mm1)
                     +		PAVGB(%%mm6, %%mm2)
                     +		PAVGB(%%mm7, %%mm3)
                     +		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
                     +		"movq %%mm1, (%1, %%edx)			\n\t" // R5
                     +		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
                     +		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
                     +		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
                     +		"movq %%mm1, (%0, %%edx)			\n\t" // L5
                     +		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
                     +		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
+                    +
                     +		"4:						\n\t"
+                    +
                     +		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
                     +		: "%eax", "%edx", "%ecx", "memory"
                     +		);
                     +//printf("%d\n", test);
                     +#else
                     +{
                     +	int y;
                     +	int d=0;
                     +	int sysd=0; // NOTE(review): accumulated below but never read
                     +	int i;
+                    +
                     +	for(y=0; y<8; y++)
                     +	{
                     +		int x;
                     +		for(x=0; x<8; x++)
                     +		{
                     +			int ref= tempBlured[ x + y*stride ];
                     +			int cur= src[ x + y*stride ];
                     +			int d1=ref - cur;
                     +//			if(x==0 || x==7) d1+= d1>>1;
                     +//			if(y==0 || y==7) d1+= d1>>1;
                     +//			d+= ABS(d1);
                     +			d+= d1*d1; // sum of squared differences over the block
                     +			sysd+= d1;
                     +		}
                     +	}
                     +	i=d; // keep the raw sum, d is replaced by the smoothed value below
                     +	d= 	(
                     +		4*d
                     +		+(*(tempBluredPast-256))
                     +		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
                     +		+(*(tempBluredPast+256))
                     +		+4)>>3;
                     +	*tempBluredPast=i; // NOTE(review): stores the raw sum, the asm path stores the smoothed value - confirm which is intended
                     +//	((*tempBluredPast)*3 + d + 2)>>2;
+                    +
                     +//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
                     +/*
                     +Switch between
                     + 1  0  0  0  0  0  0  (0)
                     +64 32 16  8  4  2  1  (1)
                     +64 48 36 27 20 15 11 (33) (approx)
                     +64 56 49 43 37 33 29 (200) (approx)
                     +Each row decays by a constant factor (0, 1/2, 3/4, 7/8), which matches the
                     +weight given to tempBlured in the four branches below (copy, 1/2, 3/4, 7/8).
                     +NOTE(review): the meaning of the values in parentheses is not visible here - confirm.
                     +*/
                     +	if(d > maxNoise[1]) // NOTE(review): the asm path takes this branch for d >= maxNoise[1] (jb) - confirm
                     +	{
                     +		if(d < maxNoise[2])
                     +		{
                     +			for(y=0; y<8; y++)
                     +			{
                     +				int x;
                     +				for(x=0; x<8; x++)
                     +				{
                     +					int ref= tempBlured[ x + y*stride ];
                     +					int cur= src[ x + y*stride ];
                     +					tempBlured[ x + y*stride ]=
                     +					src[ x + y*stride ]=
                     +						(ref + cur + 1)>>1; // rounded 1/2 ref + 1/2 cur
                     +				}
                     +			}
                     +		}
                     +		else
                     +		{
                     +			for(y=0; y<8; y++)
                     +			{
                     +				int x;
                     +				for(x=0; x<8; x++)
                     +				{
                     +					tempBlured[ x + y*stride ]= src[ x + y*stride ]; // src is left untouched, the reference is reset to it
                     +				}
                     +			}
                     +		}
                     +	}
                     +	else
                     +	{
                     +		if(d < maxNoise[0])
                     +		{
                     +			for(y=0; y<8; y++)
                     +			{
                     +				int x;
                     +				for(x=0; x<8; x++)
                     +				{
                     +					int ref= tempBlured[ x + y*stride ];
                     +					int cur= src[ x + y*stride ];
                     +					tempBlured[ x + y*stride ]=
                     +					src[ x + y*stride ]=
                     +						(ref*7 + cur + 4)>>3; // rounded 7/8 ref + 1/8 cur
                     +				}
                     +			}
                     +		}
                     +		else
                     +		{
                     +			for(y=0; y<8; y++)
                     +			{
                     +				int x;
                     +				for(x=0; x<8; x++)
                     +				{
                     +					int ref= tempBlured[ x + y*stride ];
                     +					int cur= src[ x + y*stride ];
                     +					tempBlured[ x + y*stride ]=
                     +					src[ x + y*stride ]=
                     +						(ref*3 + cur + 2)>>2; // rounded 3/4 ref + 1/4 cur
                     +				}
                     +			}
                     +		}
                     +	}
                     +}
                     +#endif
                     +}
+                    +
                     +static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     +	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); // forward declaration, the definition follows further down in this file
+                    +
                     +/**
                     + * Copies 8 lines of a block from src to dst and optionally fixes the blacklevel
                     + * levelFix == 0 -> dont touch the brightness & contrast (plain copy)
                     + * levelFix != 0 -> the MMX/MMX2 code rescales every pixel with the offset read from
                     + *                  packedOffsetAndScale[0] and the scale read from packedOffsetAndScale[1]
                     + * Note: the C fallback (no HAVE_MMX) ignores levelFix, both of its branches
                     + * are the same memcpy loop.
                     + * Both asm statements store to dst through a pointer that is only an input
                     + * operand, so "memory" is listed as clobbered to keep the compiler from
                     + * caching or reordering accesses to the destination block across them.
                     + */
                     +#undef SCALED_CPY
+                    +
                     +static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
                     +	int levelFix, int64_t *packedOffsetAndScale)
                     +{
                     +#ifndef HAVE_MMX
                     +	int i;
                     +#endif
                     +	if(levelFix)
                     +	{
                     +#ifdef HAVE_MMX
                     +					asm volatile(
                     +						"movq (%%eax), %%mm2	\n\t" // packedYOffset
                     +						"movq 8(%%eax), %%mm3	\n\t" // packedYScale
                     +						"leal (%2,%4), %%eax	\n\t"
                     +						"leal (%3,%5), %%edx	\n\t"
                     +						"pxor %%mm4, %%mm4	\n\t"
                     +#ifdef HAVE_MMX2
                     +#define SCALED_CPY(src1, src2, dst1, dst2)					\
                     +						"movq " #src1 ", %%mm0	\n\t"\
                     +						"movq " #src1 ", %%mm5	\n\t"\
                     +						"movq " #src2 ", %%mm1	\n\t"\
                     +						"movq " #src2 ", %%mm6	\n\t"\
                     +						"punpcklbw %%mm0, %%mm0 \n\t"\
                     +						"punpckhbw %%mm5, %%mm5 \n\t"\
                     +						"punpcklbw %%mm1, %%mm1 \n\t"\
                     +						"punpckhbw %%mm6, %%mm6 \n\t"\
                     +						"pmulhuw %%mm3, %%mm0	\n\t"\
                     +						"pmulhuw %%mm3, %%mm5	\n\t"\
                     +						"pmulhuw %%mm3, %%mm1	\n\t"\
                     +						"pmulhuw %%mm3, %%mm6	\n\t"\
                     +						"psubw %%mm2, %%mm0	\n\t"\
                     +						"psubw %%mm2, %%mm5	\n\t"\
                     +						"psubw %%mm2, %%mm1	\n\t"\
                     +						"psubw %%mm2, %%mm6	\n\t"\
                     +						"packuswb %%mm5, %%mm0	\n\t"\
                     +						"packuswb %%mm6, %%mm1	\n\t"\
                     +						"movq %%mm0, " #dst1 "	\n\t"\
                     +						"movq %%mm1, " #dst2 "	\n\t"\
+                    +
                     +#else //HAVE_MMX2
                     +#define SCALED_CPY(src1, src2, dst1, dst2)					\
                     +						"movq " #src1 ", %%mm0	\n\t"\
                     +						"movq " #src1 ", %%mm5	\n\t"\
                     +						"punpcklbw %%mm4, %%mm0 \n\t"\
                     +						"punpckhbw %%mm4, %%mm5 \n\t"\
                     +						"psubw %%mm2, %%mm0	\n\t"\
                     +						"psubw %%mm2, %%mm5	\n\t"\
                     +						"movq " #src2 ", %%mm1	\n\t"\
                     +						"psllw $6, %%mm0	\n\t"\
                     +						"psllw $6, %%mm5	\n\t"\
                     +						"pmulhw %%mm3, %%mm0	\n\t"\
                     +						"movq " #src2 ", %%mm6	\n\t"\
                     +						"pmulhw %%mm3, %%mm5	\n\t"\
                     +						"punpcklbw %%mm4, %%mm1 \n\t"\
                     +						"punpckhbw %%mm4, %%mm6 \n\t"\
                     +						"psubw %%mm2, %%mm1	\n\t"\
                     +						"psubw %%mm2, %%mm6	\n\t"\
                     +						"psllw $6, %%mm1	\n\t"\
                     +						"psllw $6, %%mm6	\n\t"\
                     +						"pmulhw %%mm3, %%mm1	\n\t"\
                     +						"pmulhw %%mm3, %%mm6	\n\t"\
                     +						"packuswb %%mm5, %%mm0	\n\t"\
                     +						"packuswb %%mm6, %%mm1	\n\t"\
                     +						"movq %%mm0, " #dst1 "	\n\t"\
                     +						"movq %%mm1, " #dst2 "	\n\t"\
+                    +
                     +#endif //!HAVE_MMX2
+                    +
                     +SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
                     +SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
                     +SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
                     +						"leal (%%eax,%4,4), %%eax	\n\t"
                     +						"leal (%%edx,%5,4), %%edx	\n\t"
                     +SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
+                    +
+                    +
                     +						: "=&a" (packedOffsetAndScale)
                     +						: "0" (packedOffsetAndScale),
                     +						"r"(src),
                     +						"r"(dst),
                     +						"r" (srcStride),
                     +						"r" (dstStride)
                     +						: "%edx", "memory" // dst is written through an input-only pointer
                     +					);
                     +#else
                     +				for(i=0; i<8; i++)
                     +					memcpy(	&(dst[dstStride*i]),
                     +						&(src[srcStride*i]), BLOCK_SIZE);
                     +#endif
                     +	}
                     +	else
                     +	{
                     +#ifdef HAVE_MMX
                     +					asm volatile(
                     +						"leal (%0,%2), %%eax	\n\t"
                     +						"leal (%1,%3), %%edx	\n\t"
+                    +
                     +#define SIMPLE_CPY(src1, src2, dst1, dst2)				\
                     +						"movq " #src1 ", %%mm0	\n\t"\
                     +						"movq " #src2 ", %%mm1	\n\t"\
                     +						"movq %%mm0, " #dst1 "	\n\t"\
                     +						"movq %%mm1, " #dst2 "	\n\t"\
+                    +
                     +SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
                     +SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
                     +SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
                     +						"leal (%%eax,%2,4), %%eax	\n\t"
                     +						"leal (%%edx,%3,4), %%edx	\n\t"
                     +SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
+                    +
                     +						: : "r" (src),
                     +						"r" (dst),
                     +						"r" (srcStride),
                     +						"r" (dstStride)
                     +						: "%eax", "%edx", "memory" // dst is written through an input-only pointer
                     +					);
                     +#else
                     +				for(i=0; i<8; i++)
                     +					memcpy(	&(dst[dstStride*i]),
                     +						&(src[srcStride*i]), BLOCK_SIZE);
                     +#endif
                     +	}
                     +}
+                    +
                     +/**
                     + * Duplicates the given 8 src pixels 3 times upward
                     + * (into the lines at src - stride, src - 2*stride and src - 3*stride)
                     + * src itself is only read, the 3 lines above it are overwritten.
                     + */
                     +static inline void RENAME(duplicate)(uint8_t src[], int stride)
                     +{
                     +#ifdef HAVE_MMX
                     +	asm volatile(
                     +		"movq (%0), %%mm0		\n\t"
                     +		"addl %1, %0			\n\t"
                     +		"movq %%mm0, (%0)		\n\t"
                     +		"movq %%mm0, (%0, %1)		\n\t"
                     +		"movq %%mm0, (%0, %1, 2)	\n\t"
                     +		: "+r" (src)
                     +		: "r" (-stride)
                     +		: "memory" // the movq stores write through src, which is only a register operand
                     +	);
                     +#else
                     +	int i;
                     +	uint8_t *p=src;
                     +	for(i=0; i<3; i++)
                     +	{
                     +		p-= stride; // step one line upward
                     +		memcpy(p, src, 8);
                     +	}
                     +#endif
                     +}
+                    +
                     +/**
                     + * Filters array of bytes (Y or U or V values)
                     + *
                     + * Works on a stack copy of *c2 and writes that copy back to *c2 before returning.
                     + * The filter mode is taken from ppMode.chromMode (isColor != 0) or ppMode.lumMode
                     + * (isColor == 0) unless COMPILE_TIME_MODE is defined.
                     + * For isColor == 0 the black / white levels are derived from yHistogram and
                     + * turned into packedYScale / packedYOffset, and the histogram is updated with
                     + * one sample per block while filtering.
                     + * The picture is then processed in rows of blocks: a first pass (y = -BLOCK_SIZE)
                     + * goes through tempDst and copies the first copyAhead lines to dst, the main loop
                     + * copies each block copyAhead lines ahead with blockCopy and applies the
                     + * deinterlacing, vertical / horizontal deblocking, deringing and temporal noise
                     + * filters selected in mode. Rows with y+15 >= height are processed in
                     + * tempSrc / tempDst and copied back to dst afterwards.
                     + */
                     +static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     +	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
                     +{
                     +	PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
                     +	int x,y;
                     +#ifdef COMPILE_TIME_MODE
                     +	const int mode= COMPILE_TIME_MODE;
                     +#else
                     +	const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
                     +#endif
                     +	int black=0, white=255; // blackest black and whitest white in the picture
                     +	int QPCorrecture= 256*256;
+                    +
                     +	int copyAhead;
                     +#ifdef HAVE_MMX
                     +	int i;
                     +#endif
+                    +
                     +	const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
                     +	const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
+                    +
                     +	//FIXME remove
                     +	uint64_t * const yHistogram= c.yHistogram;
                     +	uint8_t * const tempSrc= c.tempSrc;
                     +	uint8_t * const tempDst= c.tempDst;
                     +	const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
+                    +
                     +#ifdef HAVE_MMX
                     +	for(i=0; i<32; i++){
                     +		int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
                     +		int threshold= offset*2 + 1;
                     +		c.mmxDcOffset[i]= 0x7F - offset;
                     +		c.mmxDcThreshold[i]= 0x7F - threshold;
                     +		c.mmxDcOffset[i]*= 0x0101010101010101LL; // replicate the byte value into all 8 bytes
                     +		c.mmxDcThreshold[i]*= 0x0101010101010101LL;
                     +	}
                     +#endif
+                    +
                     +	if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
                     +	else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
                     +		|| (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
                     +	else if(   (mode & V_DEBLOCK)
                     +		|| (mode & LINEAR_IPOL_DEINT_FILTER)
                     +		|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
                     +	else if(mode & V_X1_FILTER) copyAhead=11;
                     +//	else if(mode & V_RK1_FILTER) copyAhead=10;
                     +	else if(mode & DERING) copyAhead=9;
                     +	else copyAhead=8;
+                    +
                     +	copyAhead-= 8; // from here on copyAhead is the number of lines beyond the 8 lines of the current block
+                    +
                     +	if(!isColor)
                     +	{
                     +		uint64_t sum= 0;
                     +		int i;
                     +		uint64_t maxClipped;
                     +		uint64_t clipped;
                     +		double scale;
+                    +
                     +		c.frameNum++;
                     +		// first frame is broken so we ignore it
                     +		if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
+                    +
                     +		for(i=0; i<256; i++)
                     +		{
                     +			sum+= yHistogram[i];
                     +//			printf("%d ", yHistogram[i]);
                     +		}
                     +//		printf("\n\n");
+                    +
                     +		/* we always get a completely black picture first */
                     +		maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
+                    +
                     +		clipped= sum;
                     +		for(black=255; black>0; black--)
                     +		{
                     +			if(clipped < maxClipped) break;
                     +			clipped-= yHistogram[black];
                     +		}
+                    +
                     +		clipped= sum;
                     +		for(white=0; white<256; white++)
                     +		{
                     +			if(clipped < maxClipped) break;
                     +			clipped-= yHistogram[white];
                     +		}
+                    +
                     +		scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); // NOTE(review): no guard against white == black here
+                    +
                     +#ifdef HAVE_MMX2
                     +		c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
                     +		c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
                     +#else
                     +		c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
                     +		c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
                     +#endif
+                    +
                     +		c.packedYOffset|= c.packedYOffset<<32; // replicate the low 16 bits into all four 16 bit words
                     +		c.packedYOffset|= c.packedYOffset<<16;
+                    +
                     +		c.packedYScale|= c.packedYScale<<32;
                     +		c.packedYScale|= c.packedYScale<<16;
+                    +
                     +		if(mode & LEVEL_FIX)	QPCorrecture= (int)(scale*256*256 + 0.5);
                     +		else			QPCorrecture= 256*256;
                     +	}
                     +	else
                     +	{
                     +		c.packedYScale= 0x0100010001000100LL;
                     +		c.packedYOffset= 0;
                     +		QPCorrecture= 256*256;
                     +	}
+                    +
                     +	/* copy & deinterlace first row of blocks */
                     +	y=-BLOCK_SIZE;
                     +	{
                     +		uint8_t *srcBlock= &(src[y*srcStride]);
                     +		uint8_t *dstBlock= tempDst + dstStride;
+                    +
                     +		// From this point on it is guaranteed that we can read and write 16 lines downward
                     +		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
                     +		for(x=0; x<width; x+=BLOCK_SIZE)
                     +		{
+                    +
                     +#ifdef HAVE_MMX2
                     +/*
                     +			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
                     +			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
                     +			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
                     +			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
                     +*/
+                    +
                     +			asm(
                     +				"movl %4, %%eax			\n\t"
                     +				"shrl $2, %%eax			\n\t"
                     +				"andl $6, %%eax			\n\t"
                     +				"addl %5, %%eax			\n\t"
                     +				"movl %%eax, %%edx		\n\t"
                     +				"imul %1, %%eax			\n\t"
                     +				"imul %3, %%edx			\n\t"
                     +				"prefetchnta 32(%%eax, %0)	\n\t"
                     +				"prefetcht0 32(%%edx, %2)	\n\t"
                     +				"addl %1, %%eax			\n\t"
                     +				"addl %3, %%edx			\n\t"
                     +				"prefetchnta 32(%%eax, %0)	\n\t"
                     +				"prefetcht0 32(%%edx, %2)	\n\t"
                     +			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
                     +			"m" (x), "m" (copyAhead)
                     +			: "%eax", "%edx"
                     +			);
+                    +
                     +#elif defined(HAVE_3DNOW)
                     +//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
                     +/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
                     +			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
                     +			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
                     +			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
                     +*/
                     +#endif
+                    +
                     +			RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
                     +				srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
+                    +
                     +			RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
+                    +
                     +			if(mode & LINEAR_IPOL_DEINT_FILTER)
                     +				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
                     +			else if(mode & LINEAR_BLEND_DEINT_FILTER)
                     +				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
                     +			else if(mode & MEDIAN_DEINT_FILTER)
                     +				RENAME(deInterlaceMedian)(dstBlock, dstStride);
                     +			else if(mode & CUBIC_IPOL_DEINT_FILTER)
                     +				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
                     +			else if(mode & FFMPEG_DEINT_FILTER)
                     +				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
                     +/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
                     +				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
                     +*/
                     +			dstBlock+=8;
                     +			srcBlock+=8;
                     +		}
                     +		if(width==dstStride)
                     +			memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride);
                     +		else
                     +		{
                     +			int i;
                     +			for(i=0; i<copyAhead; i++)
                     +			{
                     +				memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
                     +			}
                     +		}
                     +	}
+                    +
                     +//printf("\n");
                     +	for(y=0; y<height; y+=BLOCK_SIZE)
                     +	{
                     +		//1% speedup if these are here instead of the inner loop
                     +		uint8_t *srcBlock= &(src[y*srcStride]);
                     +		uint8_t *dstBlock= &(dst[y*dstStride]);
                     +#ifdef HAVE_MMX
                     +		uint8_t *tempBlock1= c.tempBlocks;
                     +		uint8_t *tempBlock2= c.tempBlocks + 8;
                     +#endif
                     +		int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
                     +		int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*mbWidth];
                     +		int QP=0;
                     +		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
                     +		   if not then use a temporary buffer */
                     +		if(y+15 >= height)
                     +		{
                     +			int i;
                     +			/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
                     +			   blockcopy to dst later */
                     +			memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
                     +				srcStride*MAX(height-y-copyAhead, 0) );
+                    +
                     +			/* duplicate last line of src to fill the void up to line (copyAhead+7) */
                     +			for(i=MAX(height-y, 8); i<copyAhead+8; i++)
                     +				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
+                    +
                     +			/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
                     +			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
+                    +
                     +			/* duplicate last line of dst to fill the void up to line (copyAhead) */
                     +			for(i=height-y+1; i<=copyAhead; i++)
                     +				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
+                    +
                     +			dstBlock= tempDst + dstStride;
                     +			srcBlock= tempSrc;
                     +		}
                     +//printf("\n");
+                    +
                     +		// From this point on it is guaranteed that we can read and write 16 lines downward
                     +		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
                     +		for(x=0; x<width; x+=BLOCK_SIZE)
                     +		{
                     +			const int stride= dstStride;
                     +#ifdef HAVE_MMX
                     +			uint8_t *tmpXchg;
                     +#endif
                     +			if(isColor)
                     +			{
                     +				QP= QPptr[x>>qpHShift];
                     +				c.nonBQP= nonBQPptr[x>>qpHShift];
                     +			}
                     +			else
                     +			{
                     +				QP= QPptr[x>>4];
                     +				QP= (QP* QPCorrecture + 256*128)>>16;
                     +				c.nonBQP= nonBQPptr[x>>4];
                     +				c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
                     +				yHistogram[ srcBlock[srcStride*12 + 4] ]++; // one histogram sample per block, read by the level detection at the top of this function
                     +			}
                     +			c.QP= QP;
                     +#ifdef HAVE_MMX
                     +			asm volatile(
                     +				"movd %1, %%mm7					\n\t"
                     +				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
                     +				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
                     +				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
                     +				"movq %%mm7, %0			\n\t"
                     +				: "=m" (c.pQPb)
                     +				: "r" (QP)
                     +			);
                     +#endif
+                    +
+                    +
                     +#ifdef HAVE_MMX2
                     +/*
                     +			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
                     +			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
                     +			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
                     +			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
                     +*/
+                    +
                     +			asm(
                     +				"movl %4, %%eax			\n\t"
                     +				"shrl $2, %%eax			\n\t"
                     +				"andl $6, %%eax			\n\t"
                     +				"addl %5, %%eax			\n\t"
                     +				"movl %%eax, %%edx		\n\t"
                     +				"imul %1, %%eax			\n\t"
                     +				"imul %3, %%edx			\n\t"
                     +				"prefetchnta 32(%%eax, %0)	\n\t"
                     +				"prefetcht0 32(%%edx, %2)	\n\t"
                     +				"addl %1, %%eax			\n\t"
                     +				"addl %3, %%edx			\n\t"
                     +				"prefetchnta 32(%%eax, %0)	\n\t"
                     +				"prefetcht0 32(%%edx, %2)	\n\t"
                     +			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
                     +			"m" (x), "m" (copyAhead)
                     +			: "%eax", "%edx"
                     +			);
+                    +
                     +#elif defined(HAVE_3DNOW)
                     +//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
                     +/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
                     +			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
                     +			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
                     +			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
                     +*/
                     +#endif
+                    +
                     +			RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
                     +				srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
+                    +
                     +			if(mode & LINEAR_IPOL_DEINT_FILTER)
                     +				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
                     +			else if(mode & LINEAR_BLEND_DEINT_FILTER)
                     +				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
                     +			else if(mode & MEDIAN_DEINT_FILTER)
                     +				RENAME(deInterlaceMedian)(dstBlock, dstStride);
                     +			else if(mode & CUBIC_IPOL_DEINT_FILTER)
                     +				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
                     +			else if(mode & FFMPEG_DEINT_FILTER)
                     +				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
                     +/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
                     +				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
                     +*/
+                    +
                     +			/* only deblock if we have 2 blocks */
                     +			if(y + 8 < height)
                     +			{
                     +				if(mode & V_X1_FILTER)
                     +					RENAME(vertX1Filter)(dstBlock, stride, &c);
                     +				else if(mode & V_DEBLOCK)
                     +				{
                     +					if( RENAME(isVertDC)(dstBlock, stride, &c))
                     +					{
                     +						if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
                     +							RENAME(doVertLowPass)(dstBlock, stride, &c);
                     +					}
                     +					else
                     +						RENAME(doVertDefFilter)(dstBlock, stride, &c);
                     +				}
                     +			}
+                    +
                     +#ifdef HAVE_MMX
                     +			RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
                     +#endif
                     +			/* check if we have a previous block to deblock it with dstBlock */
                     +			if(x - 8 >= 0)
                     +			{
                     +#ifdef HAVE_MMX
                     +				if(mode & H_X1_FILTER)
                     +					RENAME(vertX1Filter)(tempBlock1, 16, &c);
                     +				else if(mode & H_DEBLOCK)
                     +				{
                     +					if( RENAME(isVertDC)(tempBlock1, 16, &c))
                     +					{
                     +						if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
                     +							RENAME(doVertLowPass)(tempBlock1, 16, &c);
                     +					}
                     +					else
                     +						RENAME(doVertDefFilter)(tempBlock1, 16, &c);
                     +				}
+                    +
                     +				RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
+                    +
                     +#else
                     +				if(mode & H_X1_FILTER)
                     +					horizX1Filter(dstBlock-4, stride, QP);
                     +				else if(mode & H_DEBLOCK)
                     +				{
                     +					if( isHorizDC(dstBlock-4, stride, &c))
                     +					{
                     +						if(isHorizMinMaxOk(dstBlock-4, stride, QP))
                     +							doHorizLowPass(dstBlock-4, stride, QP);
                     +					}
                     +					else
                     +						doHorizDefFilter(dstBlock-4, stride, QP);
                     +				}
                     +#endif
                     +				if(mode & DERING)
                     +				{
                     +				//FIXME filter first line
                     +					if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
                     +				}
+                    +
                     +				if(mode & TEMP_NOISE_FILTER)
                     +				{
                     +					RENAME(tempNoiseReducer)(dstBlock-8, stride,
                     +						c.tempBlured[isColor] + y*dstStride + x,
                     +						c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                     +						c.ppMode.maxTmpNoise);
                     +				}
                     +			}
+                    +
                     +			dstBlock+=8;
                     +			srcBlock+=8;
+                    +
                     +#ifdef HAVE_MMX
                     +			tmpXchg= tempBlock1; // swap the two transposed temp blocks for the next column
                     +			tempBlock1= tempBlock2;
                     +			tempBlock2 = tmpXchg;
                     +#endif
                     +		}
+                    +
                     +		if(mode & DERING) // the loop above only filters the previous block, so handle the last block of the row here
                     +		{
                     +				if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
                     +		}
+                    +
                     +		if((mode & TEMP_NOISE_FILTER))
                     +		{
                     +			RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
                     +				c.tempBlured[isColor] + y*dstStride + x,
                     +				c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                     +				c.ppMode.maxTmpNoise);
                     +		}
+                    +
                     +		/* did we use a tmp buffer for the last lines*/
                     +		if(y+15 >= height)
                     +		{
                     +			uint8_t *dstBlock= &(dst[y*dstStride]);
                     +			if(width==dstStride)
                     +				memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y));
                     +			else
                     +			{
                     +				int i;
                     +				for(i=0; i<height-y; i++)
                     +				{
                     +					memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
                     +				}
                     +			}
                     +		}
                     +/*
                     +		for(x=0; x<width; x+=32)
                     +		{
                     +			volatile int i;
                     +			i+=	+ dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
                     +				+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
                     +				+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
                     +//				+ dstBlock[x +13*dstStride]
                     +//				+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
                     +		}*/
                     +	}
                     +#ifdef HAVE_3DNOW
                     +	asm volatile("femms");
                     +#elif defined (HAVE_MMX)
                     +	asm volatile("emms");
                     +#endif
+                    +
                     +#ifdef DEBUG_BRIGHTNESS
                     +	if(!isColor)
                     +	{
                     +		int max=1;
                     +		int i;
                     +		for(i=0; i<256; i++)
                     +			if(yHistogram[i] > max) max=yHistogram[i];
+                    +
                     +		for(i=1; i<256; i++)
                     +		{
                     +			int x;
                     +			int start=yHistogram[i-1]/(max/256+1);
                     +			int end=yHistogram[i]/(max/256+1);
                     +			int inc= end > start ? 1 : -1;
                     +			for(x=start; x!=end+inc; x+=inc)
                     +				dst[ i*dstStride + x]+=128;
                     +		}
+                    +
                     +		for(i=0; i<100; i+=2)
                     +		{
                     +			dst[ (white)*dstStride + i]+=128;
                     +			dst[ (black)*dstStride + i]+=128;
                     +		}
+                    +
                     +	}
                     +#endif
+                    +
                     +	*c2= c; //copy local context back
+                    +
                     +}

postproc/Makefile

History View file @ bba9b16

@@ -2,16 +2,9 @@
                      include ../config.mak
                      SWSLIB = libswscale.a
                     -ifeq ($(SHARED_PP),yes)
                     -SPPLIB = libpostproc.so
                     -SPPVERSION = 0.0.1
                     -endif
                     -PPLIB = libpostproc.a
                      SWSSRCS=swscale.c rgb2rgb.c yuv2rgb.c
                      SWSOBJS=$(SWSSRCS:.c=.o)
                     -PPOBJS=postprocess.o
                     -SPPOBJS=postprocess_pic.o
                      CS_TEST_OBJS=cs_test.o rgb2rgb.o ../cpudetect.o ../mp_msg.o ../libvo/aclib.o
                      CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
@@ -24,7 +17,7 @@ CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
                      .c.o:
                      	$(CC) -c $(CFLAGS) -I.. -o $@ $<
                     -all:    $(SWSLIB) $(PPLIB) $(SPPLIB)
                     +all:    $(SWSLIB)
                      $(SWSLIB):     $(SWSOBJS)
                      	$(AR) r $(SWSLIB) $(SWSOBJS)
@@ -43,29 +36,6 @@ depend:
                      cs_test: $(CS_TEST_OBJS)
                      	$(CC) $(CS_TEST_OBJS) -o cs_test
                     -ifeq ($(SHARED_PP),yes)
                     -postprocess_pic.o: postprocess.c
                     -	$(CC) -c $(CFLAGS) -fomit-frame-pointer -fPIC -DPIC -I.. -o $@ $<
+                    -
                     -$(SPPLIB): $(SPPOBJS)
                     -	$(CC) -shared -Wl,-soname,$(SPPLIB).0 \
                     -	-o $(SPPLIB) $(SPPOBJS)
                     -endif
+                    -
                     -$(PPLIB): $(PPOBJS)
                     -	$(AR) r $(PPLIB) $(PPOBJS)
+                    -
                     -install: all
                     -ifeq ($(SHARED_PP),yes)
                     -	install -d $(prefix)/lib
                     -	install -s -m 755 $(SPPLIB) $(prefix)/lib/$(SPPLIB).$(SPPVERSION)
                     -	ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPLIB)
                     -	ldconfig || true
                     -	mkdir -p $(prefix)/include/postproc
                     -	install -m 644 postprocess.h $(prefix)/include/postproc/postprocess.h
                     -endif
+                    -
+                    -
+                     #
                      # include dependency files if they exist
+                     #

postproc/postprocess.c

History View file @ bba9b16

                     deleted file mode 100644
@@ -1,874 +0,0 @@
                     -/*
                     -    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
+                    -
                     -    This program is free software; you can redistribute it and/or modify
                     -    it under the terms of the GNU General Public License as published by
                     -    the Free Software Foundation; either version 2 of the License, or
                     -    (at your option) any later version.
+                    -
                     -    This program is distributed in the hope that it will be useful,
                     -    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     -    GNU General Public License for more details.
+                    -
                     -    You should have received a copy of the GNU General Public License
                     -    along with this program; if not, write to the Free Software
                     -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     -*/
+                    -
                     -/*
                     -			C	MMX	MMX2	3DNow
                     -isVertDC		Ec	Ec
                     -isVertMinMaxOk		Ec	Ec
                     -doVertLowPass		E		e	e
                     -doVertDefFilter		Ec	Ec	e	e
                     -isHorizDC		Ec	Ec
                     -isHorizMinMaxOk		a	E
                     -doHorizLowPass		E		e	e
                     -doHorizDefFilter	Ec	Ec	e	e
                     -deRing			E		e	e*
                     -Vertical RKAlgo1	E		a	a
                     -Horizontal RKAlgo1			a	a
                     -Vertical X1#		a		E	E
                     -Horizontal X1#		a		E	E
                     -LinIpolDeinterlace	e		E	E*
                     -CubicIpolDeinterlace	a		e	e*
                     -LinBlendDeinterlace	e		E	E*
                     -MedianDeinterlace#	E	Ec	Ec
                     -TempDeNoiser#		E		e	e
+                    -
                     -* I don't have a 3DNow CPU -> it is untested, but no one has said it doesn't work, so it seems to work
                     -# more or less self-invented filters, so the exactness isn't too meaningful
                     -E = Exact implementation
                     -e = almost exact implementation (slightly different rounding,...)
                     -a = alternative / approximate impl
                     -c = checked against the other implementations (-vo md5)
                     -*/
+                    -
                     -/*
                     -TODO:
                     -reduce the time wasted on the mem transfer
                     -unroll stuff if instructions depend too much on the prior one
                     -move YScale thing to the end instead of fixing QP
                     -write a faster and higher quality deblocking filter :)
                     -make the mainloop more flexible (variable number of blocks at once
                     -	(the if/else stuff per block is slowing things down)
                     -compare the quality & speed of all filters
                     -split this huge file
                     -optimize c versions
                     -try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
                     -...
                     -*/
+                    -
                     -//Changelog: use the CVS log
+                    -
                     -#include "config.h"
                     -#include <inttypes.h>
                     -#include <stdio.h>
                     -#include <stdlib.h>
                     -#include <string.h>
                     -#ifdef HAVE_MALLOC_H
                     -#include <malloc.h>
                     -#endif
                     -//#undef HAVE_MMX2
                     -//#define HAVE_3DNOW
                     -//#undef HAVE_MMX
                     -//#undef ARCH_X86
                     -//#define DEBUG_BRIGHTNESS
                     -#ifndef PIC
                     -#include "../libvo/fastmemcpy.h"
                     -#endif
                     -#include "postprocess.h"
                     -#include "postprocess_internal.h"
                     -#include "mangle.h"
+                    -
                     -#define MIN(a,b) ((a) > (b) ? (b) : (a))
                     -#define MAX(a,b) ((a) < (b) ? (b) : (a))
                     -#define ABS(a) ((a) > 0 ? (a) : (-(a)))
                     -#define SIGN(a) ((a) > 0 ? 1 : -1)
+                    -
                     -#define GET_MODE_BUFFER_SIZE 500
                     -#define OPTIONS_ARRAY_SIZE 10
                     -#define BLOCK_SIZE 8
                     -#define TEMP_STRIDE 8
                     -//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
+                    -
                     -#ifdef ARCH_X86 // packed 64-bit constants, 8-byte aligned, only built on x86 (presumably operands for the MMX code in postprocess_template.c -- TODO confirm)
                     -static uint64_t __attribute__((aligned(8))) w05=		0x0005000500050005LL; // four 16-bit words, each 0x0005
                     -static uint64_t __attribute__((aligned(8))) w20=		0x0020002000200020LL; // four 16-bit words, each 0x0020
                     -static uint64_t __attribute__((aligned(8))) b00= 		0x0000000000000000LL; // eight bytes, each 0x00
                     -static uint64_t __attribute__((aligned(8))) b01= 		0x0101010101010101LL; // eight bytes, each 0x01
                     -static uint64_t __attribute__((aligned(8))) b02= 		0x0202020202020202LL; // eight bytes, each 0x02
                     -static uint64_t __attribute__((aligned(8))) b08= 		0x0808080808080808LL; // eight bytes, each 0x08
                     -static uint64_t __attribute__((aligned(8))) b80= 		0x8080808080808080LL; // eight bytes, each 0x80
                     -#endif
+                    -
                     -static int verbose= 0;
+                    -
                     -static const int deringThreshold= 20;
+                    -
+                    -
                     -static struct PPFilter filters[]= // table of the known filters, one row each; the short/long names are the ones listed in pp_help
                     -{ // row layout: short name, long name, three integers, filter mask constant -- NOTE(review): struct PPFilter is not declared in this part of the file, confirm the integer fields in postprocess_internal.h
                     -	{"hb", "hdeblock", 		1, 1, 3, H_DEBLOCK},
                     -	{"vb", "vdeblock", 		1, 2, 4, V_DEBLOCK},
                     -/*	{"hr", "rkhdeblock", 		1, 1, 3, H_RK1_FILTER},
                     -	{"vr", "rkvdeblock", 		1, 2, 4, V_RK1_FILTER},*/
                     -	{"h1", "x1hdeblock", 		1, 1, 3, H_X1_FILTER},
                     -	{"v1", "x1vdeblock", 		1, 2, 4, V_X1_FILTER},
                     -	{"dr", "dering", 		1, 5, 6, DERING},
                     -	{"al", "autolevels", 		0, 1, 2, LEVEL_FIX},
                     -	{"lb", "linblenddeint", 	1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
                     -	{"li", "linipoldeint", 		1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
                     -	{"ci", "cubicipoldeint",	1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
                     -	{"md", "mediandeint", 		1, 1, 4, MEDIAN_DEINT_FILTER},
                     -	{"fd", "ffmpegdeint", 		1, 1, 4, FFMPEG_DEINT_FILTER},
                     -	{"tn", "tmpnoise", 		1, 7, 8, TEMP_NOISE_FILTER},
                     -	{"fq", "forcequant", 		1, 0, 0, FORCE_QUANT},
                     -	{NULL, NULL,0,0,0,0} //End Marker (all-NULL/zero row terminates the table)
                     -};
+                    -
                     -static char *replaceTable[]= // flat list of (alias, expansion) string pairs: entry 2*i is the alias, entry 2*i+1 the filter string spliced in for it by pp_get_mode_by_name_and_quality()
                     -{
                     -	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
                     -	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", // short alias, same expansion as "default"
                     -	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
                     -	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", // short alias, same expansion as "fast"
                     -	NULL //End Marker (the lookup loop stops when the alias slot is NULL)
                     -};
+                    -
                     -#ifdef ARCH_X86
                     -static inline void unusedVariableWarningFixer() // exists only to reference each w*/b* constant from C code, as its name says to silence unused-variable warnings
                     -{
                     -	if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0; // with the initial values the sum is non-zero, so the store never executes
                     -}
                     -#endif
+                    -
+                    -
                     -#ifdef ARCH_X86 // thin wrappers, one per x86 prefetch instruction variant
                     -static inline void prefetchnta(void *p) // issues a single `prefetchnta` on the address held in p
                     -{
                     -	asm volatile(	"prefetchnta (%0)\n\t"
                     -		: : "r" (p) // p is passed in a general register; no outputs, no clobbers declared
                     -	);
                     -}
+                    -
                     -static inline void prefetcht0(void *p) // issues a single `prefetcht0` on the address held in p
                     -{
                     -	asm volatile(	"prefetcht0 (%0)\n\t"
                     -		: : "r" (p)
                     -	);
                     -}
+                    -
                     -static inline void prefetcht1(void *p) // issues a single `prefetcht1` on the address held in p
                     -{
                     -	asm volatile(	"prefetcht1 (%0)\n\t"
                     -		: : "r" (p)
                     -	);
                     -}
+                    -
                     -static inline void prefetcht2(void *p) // issues a single `prefetcht2` on the address held in p
                     -{
                     -	asm volatile(	"prefetcht2 (%0)\n\t"
                     -		: : "r" (p)
                     -	);
                     -}
                     -#endif
+                    -
                     -// The horizontal Functions exist only in C cuz the MMX code is faster with vertical filters and transposing
+                    -
                     -/**
                     - * Check if the given 8x8 Block is mostly "flat"
                     - *
                     - * For each of the BLOCK_SIZE rows, the 7 horizontally adjacent pixel pairs
                     - * src[k], src[k+1] (k = 0..6) are compared; a pair counts as "equal" when
                     - * their difference lies within [-dcOffset, dcOffset], where
                     - * dcOffset = ((c->QP * c->ppMode.baseDcDiff) >> 8) + 1.
                     - *
                     - * @param src    first pixel of the first row to examine
                     - * @param stride offset added to src to reach the next row
                     - * @param c      context supplying QP, ppMode.baseDcDiff and ppMode.flatnessThreshold
                     - * @return non-zero if the number of "equal" pairs exceeds c->ppMode.flatnessThreshold
                     - */
                     -static inline int isHorizDC(uint8_t src[], int stride, PPContext *c)
                     -{
                     -	int numEq= 0;
                     -	int y;
                     -	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
                     -	const int dcThreshold= dcOffset*2 + 1;
                     -	for(y=0; y<BLOCK_SIZE; y++)
                     -	{
                     -		if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; // the unsigned cast turns the two-sided range test into one compare
                     -		if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
                     -		src+= stride;
                     -	}
                     -	return numEq > c->ppMode.flatnessThreshold;
                     -}
+                    -
                     -/**
                     - * Check if the middle 8x8 Block in the given 8x16 block is flat
                     - *
                     - * src is first advanced by 4 rows; then, for BLOCK_SIZE-1 consecutive rows,
                     - * each of the 8 pixels src[0..7] is compared with the pixel one row below it.
                     - * A pair counts as "equal" when the difference lies within
                     - * [-dcOffset, dcOffset], dcOffset = ((c->QP * c->ppMode.baseDcDiff) >> 8) + 1.
                     - *
                     - * @param src    top-left pixel of the 8x16 area
                     - * @param stride offset added to src to reach the next row
                     - * @param c      context supplying QP, ppMode.baseDcDiff and ppMode.flatnessThreshold
                     - * @return non-zero if the number of "equal" pairs exceeds c->ppMode.flatnessThreshold
                     - */
                     -static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
                     -	int numEq= 0;
                     -	int y;
                     -	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
                     -	const int dcThreshold= dcOffset*2 + 1;
                     -	src+= stride*4; // src points to begin of the 8x8 Block
                     -	for(y=0; y<BLOCK_SIZE-1; y++)
                     -	{
                     -		if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; // the unsigned cast turns the two-sided range test into one compare
                     -		if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
                     -		src+= stride;
                     -	}
                     -	return numEq > c->ppMode.flatnessThreshold;
                     -}
+                    -
                     -static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) // returns 1 if the row's end pixels are close enough, else 0; stride is not used
                     -{
                     -	if(abs(src[0] - src[7]) > 2*QP) return 0; // only the first and last pixel of one 8-pixel row are compared
+                    -
                     -	return 1;
                     -}
+                    -
                     -/**
                     - * Horizontal default deblocking filter (C version).
                     - *
                     - * Works in place on BLOCK_SIZE rows; in every row the block edge is assumed to lie
                     - * between dst[3] and dst[4]. Three "energies" are computed with the same 4-tap
                     - * pattern 5*(p[k+2] - p[k+1]) + 2*(p[k] - p[k+3]):
                     - *   leftEnergy (k=0), middleEnergy (k=2, across the edge), rightEnergy (k=4).
                     - * When |middleEnergy| < 8*QP the two edge pixels are corrected by d, where
                     - *   d = (5*max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0) + 32) >> 6
                     - * carries the sign of -middleEnergy and is clamped to the range between 0 and
                     - * q = (dst[3] - dst[4])/2, so the edge step is reduced but never inverted.
                     - *
                     - * @param dst    first pixel of the first row (modified in place)
                     - * @param stride offset added to dst to reach the next row
                     - * @param QP     quantizer; scales the threshold below which the edge is filtered
                     - */
                     -static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
                     -{
                     -	int y;
                     -	for(y=0; y<BLOCK_SIZE; y++)
                     -	{
                     -		// fixed: the first term must be dst[4] - dst[3] (k=2 in the pattern above);
                     -		// the former dst[4] - dst[5] ignored the step across the edge itself
                     -		const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
                     -
                     -		if(ABS(middleEnergy) < 8*QP)
                     -		{
                     -			const int q=(dst[3] - dst[4])/2;
                     -			const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
                     -			const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
                     -
                     -			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                     -			d= MAX(d, 0);
                     -
                     -			d= (5*d + 32) >> 6;
                     -			d*= SIGN(-middleEnergy);
                     -
                     -			// clamp d into [0, q] (q > 0) or [q, 0] (q <= 0)
                     -			if(q>0)
                     -			{
                     -				d= d<0 ? 0 : d;
                     -				d= d>q ? q : d;
                     -			}
                     -			else
                     -			{
                     -				d= d>0 ? 0 : d;
                     -				d= d<q ? q : d;
                     -			}
                     -
                     -			dst[3]-= d;
                     -			dst[4]+= d;
                     -		}
                     -		dst+= stride;
                     -	}
                     -}
+                    -
                     -/**
                     - * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
                     - * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
                     - *
                     - * Each of the BLOCK_SIZE rows is filtered in place. Pixels outside dst[0..7] are
                     - * represented by "first" and "last": the real neighbour dst[-1] / dst[8] when it
                     - * differs from the edge pixel by less than QP, otherwise the edge pixel itself.
                     - * All pairwise sums are taken before the row is overwritten, and every output
                     - * only reads raw pixels that have not been written yet.
                     - *
                     - * @param dst    first pixel of the middle 8x8 block; dst[-1] and dst[8] are read
                     - * @param stride offset added to dst to reach the next row
                     - * @param QP     threshold deciding whether the outside neighbours are used
                     - */
                     -static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
                     -{
+                    -
                     -	int y;
                     -	for(y=0; y<BLOCK_SIZE; y++)
                     -	{
                     -		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
                     -		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
+                    -
                     -		int sums[9]; // sums[k] = pixel k-1 + pixel k, with "first"/"last" standing in beyond the row ends
                     -		sums[0] = first + dst[0];
                     -		sums[1] = dst[0] + dst[1];
                     -		sums[2] = dst[1] + dst[2];
                     -		sums[3] = dst[2] + dst[3];
                     -		sums[4] = dst[3] + dst[4];
                     -		sums[5] = dst[4] + dst[5];
                     -		sums[6] = dst[5] + dst[6];
                     -		sums[7] = dst[6] + dst[7];
                     -		sums[8] = dst[7] + last;
+                    -
                     -		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; // +8 rounds before the divide by 16
                     -		dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
                     -		dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
                     -		dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
                     -		dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
                     -		dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
                     -		dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; // reads dst[7] before it is overwritten on the next line
                     -		dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
+                    -
                     -		dst+= stride;
                     -	}
                     -}
+                    -
                     -/**
                     - * Experimental Filter 1 (Horizontal)
                     - * will not damage linear gradients
                     - * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
                     - * can only smooth blocks at the expected locations (it cannot smooth them if they moved)
                     - * MMX2 version does correct clipping, C version does not
                     - * not identical with the vertical one
                     - *
                     - * For each of the BLOCK_SIZE rows the differences a = src[1]-src[2],
                     - * b = src[3]-src[4] and c = src[5]-src[6] give d = max(|b| - (|a| + |c|)/2, 0).
                     - * When d < QP, src[1..6] are adjusted in place by v/8, v/4 and 3*v/8
                     - * (added on the left half, subtracted on the right half) with v = d * SIGN(-b).
                     - *
                     - * @param src    first pixel of the first row (modified in place)
                     - * @param stride offset added to src to reach the next row
                     - * @param QP     threshold; rows with d >= QP are left untouched
                     - */
                     -static inline void horizX1Filter(uint8_t *src, int stride, int QP)
                     -{
                     -	int y;
                     -	// Table of packed per-byte correction factors, built once on the first call.
                     -	// The C loop below does not read it; it is kept so the function's data stays as before.
                     -	static uint64_t *lut= NULL;
                     -	if(lut==NULL)
                     -	{
                     -		uint64_t *table= (uint64_t*)memalign(8, 256*8);
                     -		// memalign() may fail; the filter does not depend on the table, so just
                     -		// leave lut NULL (a later call retries) instead of writing through NULL
                     -		if(table!=NULL)
                     -		{
                     -			int i;
                     -			for(i=0; i<256; i++)
                     -			{
                     -				int v= i < 128 ? 2*i : 2*(i-256);
                     -/*
                     -//Simulate 112242211 9-Tap filter
                     -				uint64_t a= (v/16) & 0xFF;
                     -				uint64_t b= (v/8) & 0xFF;
                     -				uint64_t c= (v/4) & 0xFF;
                     -				uint64_t d= (3*v/8) & 0xFF;
                     -*/
                     -//Simulate piecewise linear interpolation
                     -				uint64_t a= (v/16) & 0xFF;
                     -				uint64_t b= (v*3/16) & 0xFF;
                     -				uint64_t c= (v*5/16) & 0xFF;
                     -				uint64_t d= (7*v/16) & 0xFF;
                     -				// A..D are the byte-wise negations of a..d
                     -				uint64_t A= (0x100 - a)&0xFF;
                     -				uint64_t B= (0x100 - b)&0xFF;
                     -				uint64_t C= (0x100 - c)&0xFF;
                     -				uint64_t D= (0x100 - d)&0xFF; // fixed: was derived from c (copy-paste slip)
                     -
                     -				table[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
                     -					(D<<24) | (C<<16) | (B<<8) | (A);
                     -				//table[i] = (v<<32) | (v<<24);
                     -			}
                     -			lut= table; // publish only once fully initialised
                     -		}
                     -	}
                     -
                     -	for(y=0; y<BLOCK_SIZE; y++)
                     -	{
                     -		int a= src[1] - src[2];
                     -		int b= src[3] - src[4];
                     -		int c= src[5] - src[6];
                     -
                     -		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
                     -
                     -		if(d < QP)
                     -		{
                     -			int v = d * SIGN(-b);
                     -
                     -			// no clipping here: the results are stored straight back into the uint8_t pixels
                     -			src[1] +=v/8;
                     -			src[2] +=v/4;
                     -			src[3] +=3*v/8;
                     -			src[4] -=3*v/8;
                     -			src[5] -=v/4;
                     -			src[6] -=v/8;
                     -
                     -		}
                     -		src+=stride;
                     -	}
                     -}
+                    -
+                    -
                     -//Note: there are C, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
                     -//Plain C versions
                     -#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) // C version: built when there is no MMX, or always with runtime detection
                     -#define COMPILE_C
                     -#endif
+                    -
                     -#ifdef ARCH_X86
+                    -
                     -#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) // plain MMX: only when neither 3DNow nor MMX2 is configured
                     -#define COMPILE_MMX
                     -#endif
+                    -
                     -#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
                     -#define COMPILE_MMX2
                     -#endif
+                    -
                     -#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) // MMX2 takes precedence over 3DNow when both are configured
                     -#define COMPILE_3DNOW
                     -#endif
                     -#endif //ARCH_X86
+                    -
                     -#undef HAVE_MMX
                     -#undef HAVE_MMX2
                     -#undef HAVE_3DNOW
                     -#undef ARCH_X86
+                    -
                     -#ifdef COMPILE_C // each section below sets the feature macros for one variant, then includes the template with RENAME() appending that variant's suffix
                     -#undef HAVE_MMX
                     -#undef HAVE_MMX2
                     -#undef HAVE_3DNOW
                     -#undef ARCH_X86
                     -#define RENAME(a) a ## _C
                     -#include "postprocess_template.c"
                     -#endif
+                    -
                     -//MMX versions
                     -#ifdef COMPILE_MMX
                     -#undef RENAME
                     -#define HAVE_MMX
                     -#undef HAVE_MMX2
                     -#undef HAVE_3DNOW
                     -#define ARCH_X86
                     -#define RENAME(a) a ## _MMX
                     -#include "postprocess_template.c"
                     -#endif
+                    -
                     -//MMX2 versions
                     -#ifdef COMPILE_MMX2
                     -#undef RENAME
                     -#define HAVE_MMX
                     -#define HAVE_MMX2
                     -#undef HAVE_3DNOW
                     -#define ARCH_X86
                     -#define RENAME(a) a ## _MMX2
                     -#include "postprocess_template.c"
                     -#endif
+                    -
                     -//3DNOW versions
                     -#ifdef COMPILE_3DNOW
                     -#undef RENAME
                     -#define HAVE_MMX
                     -#undef HAVE_MMX2
                     -#define HAVE_3DNOW
                     -#define ARCH_X86
                     -#define RENAME(a) a ## _3DNow
                     -#include "postprocess_template.c"
                     -#endif
+                    -
                     -// minor note: after the template includes, the HAVE_xyz macros hold whatever the last included variant set (not the config.h values), so do not rely on them
+                    -
                     -static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     -	QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc) // forwards all arguments to exactly one postProcess_<variant>() generated from postprocess_template.c
                     -{
                     -	PPContext *c= (PPContext *)vc;
                     -	PPMode *ppMode= (PPMode *)vm;
                     -	c->ppMode= *ppMode; //FIXME (the whole mode struct is copied into the context on every call)
+                    -
                     -	// using ifs here as they are faster than function pointers, although the
                     -	// difference would not be measurable here; but it is much better because
                     -	// someone might exchange the cpu without restarting mplayer ;)
                     -#ifdef RUNTIME_CPUDETECT
                     -#ifdef ARCH_X86
                     -	// ordered by speed, fastest first
                     -	if(c->cpuCaps & PP_CPU_CAPS_MMX2)
                     -		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -	else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
                     -		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -	else if(c->cpuCaps & PP_CPU_CAPS_MMX)
                     -		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -	else
                     -		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -#else
                     -		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -#endif
                     -#else //RUNTIME_CPUDETECT
                     -#ifdef HAVE_MMX2 // without runtime detection only one variant was compiled; these tests see the HAVE_* values that variant's section left defined
                     -		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -#elif defined (HAVE_3DNOW)
                     -		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -#elif defined (HAVE_MMX)
                     -		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -#else
                     -		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
                     -#endif
                     -#endif //!RUNTIME_CPUDETECT
                     -}
+                    -
                     -//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     -//	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
+                    -
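                     -/* Help text for the -pp command line option: filter-chain syntax, examples,
                     -   and the table of filter names with their options */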
                     -char *pp_help=
                     -"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
                     -"long form example:\n"
                     -"vdeblock:autoq/hdeblock:autoq/linblenddeint	default,-vdeblock\n"
                     -"short form example:\n"
                     -"vb:a/hb:a/lb					de,-vb\n"
                     -"more examples:\n"
                     -"tn:64:128:256\n"
                     -"Filters			Options\n"
                     -"short	long name	short	long option	Description\n"
                     -"*	*		a	autoq		cpu power dependant enabler\n"
                     -"			c	chrom		chrominance filtring enabled\n"
                     -"			y	nochrom		chrominance filtring disabled\n"
                     -"hb	hdeblock	(2 Threshold)		horizontal deblocking filter\n"
                     -"	1. difference factor: default=64, higher -> more deblocking\n"
                     -"	2. flatness threshold: default=40, lower -> more deblocking\n"
                     -"			the h & v deblocking filters share these\n"
                     -"			so u cant set different thresholds for h / v\n"
                     -"vb	vdeblock	(2 Threshold)		vertical deblocking filter\n"
                     -"h1	x1hdeblock				Experimental h deblock filter 1\n"
                     -"v1	x1vdeblock				Experimental v deblock filter 1\n"
                     -"dr	dering					Deringing filter\n"
                     -"al	autolevels				automatic brightness / contrast\n"
                     -"			f	fullyrange	stretch luminance to (0..255)\n"
                     -"lb	linblenddeint				linear blend deinterlacer\n"
                     -"li	linipoldeint				linear interpolating deinterlace\n"
                     -"ci	cubicipoldeint				cubic interpolating deinterlacer\n"
                     -"md	mediandeint				median deinterlacer\n"
                     -"fd	ffmpegdeint				ffmpeg deinterlacer\n"
                     -"de	default					hb:a,vb:a,dr:a,al\n"
                     -"fa	fast					h1:a,v1:a,dr:a,al\n"
                     -"tn	tmpnoise	(3 Thresholds)		Temporal Noise Reducer\n"
                     -"			1. <= 2. <= 3.		larger -> stronger filtering\n"
                     -"fq	forceQuant	<quantizer>		Force quantizer\n"
                     -;
+                    -
                     -pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
                     -{
                     -	char temp[GET_MODE_BUFFER_SIZE];
                     -	char *p= temp;
                     -	char *filterDelimiters= ",/";
                     -	char *optionDelimiters= ":";
                     -	struct PPMode *ppMode;
                     -	char *filterToken;
+                    -
                     -	ppMode= memalign(8, sizeof(PPMode));
+                    -
                     -	ppMode->lumMode= 0;
                     -	ppMode->chromMode= 0;
                     -	ppMode->maxTmpNoise[0]= 700;
                     -	ppMode->maxTmpNoise[1]= 1500;
                     -	ppMode->maxTmpNoise[2]= 3000;
                     -	ppMode->maxAllowedY= 234;
                     -	ppMode->minAllowedY= 16;
                     -	ppMode->baseDcDiff= 256/4;
                     -	ppMode->flatnessThreshold= 56-16;
                     -	ppMode->maxClippedThreshold= 0.01;
                     -	ppMode->error=0;
+                    -
                     -	strncpy(temp, name, GET_MODE_BUFFER_SIZE);
+                    -
                     -	if(verbose>1) printf("pp: %s\n", name);
+                    -
                     -	for(;;){
                     -		char *filterName;
                     -		int q= 1000000; //PP_QUALITY_MAX;
                     -		int chrom=-1;
                     -		char *option;
                     -		char *options[OPTIONS_ARRAY_SIZE];
                     -		int i;
                     -		int filterNameOk=0;
                     -		int numOfUnknownOptions=0;
                     -		int enable=1; //does the user want us to enabled or disabled the filter
+                    -
                     -		filterToken= strtok(p, filterDelimiters);
                     -		if(filterToken == NULL) break;
                     -		p+= strlen(filterToken) + 1; // p points to next filterToken
                     -		filterName= strtok(filterToken, optionDelimiters);
                     -		if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
+                    -
                     -		if(*filterName == '-')
                     -		{
                     -			enable=0;
                     -			filterName++;
                     -		}
+                    -
                     -		for(;;){ //for all options
                     -			option= strtok(NULL, optionDelimiters);
                     -			if(option == NULL) break;
+                    -
                     -			if(verbose>1) printf("pp: option: %s\n", option);
                     -			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
                     -			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
                     -			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
                     -			else
                     -			{
                     -				options[numOfUnknownOptions] = option;
                     -				numOfUnknownOptions++;
                     -			}
                     -			if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
                     -		}
                     -		options[numOfUnknownOptions] = NULL;
+                    -
                     -		/* replace stuff from the replace Table */
                     -		for(i=0; replaceTable[2*i]!=NULL; i++)
                     -		{
                     -			if(!strcmp(replaceTable[2*i], filterName))
                     -			{
                     -				int newlen= strlen(replaceTable[2*i + 1]);
                     -				int plen;
                     -				int spaceLeft;
+                    -
                     -				if(p==NULL) p= temp, *p=0; 	//last filter
                     -				else p--, *p=',';		//not last filter
+                    -
                     -				plen= strlen(p);
                     -				spaceLeft= p - temp + plen;
                     -				if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
                     -				{
                     -					ppMode->error++;
                     -					break;
                     -				}
                     -				memmove(p + newlen, p, plen+1);
                     -				memcpy(p, replaceTable[2*i + 1], newlen);
                     -				filterNameOk=1;
                     -			}
                     -		}
+                    -
                     -		for(i=0; filters[i].shortName!=NULL; i++)
                     -		{
                     -//			printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
                     -			if(   !strcmp(filters[i].longName, filterName)
                     -			   || !strcmp(filters[i].shortName, filterName))
                     -			{
                     -				ppMode->lumMode &= ~filters[i].mask;
                     -				ppMode->chromMode &= ~filters[i].mask;
+                    -
                     -				filterNameOk=1;
                     -				if(!enable) break; // user wants to disable it
+                    -
                     -				if(q >= filters[i].minLumQuality)
                     -					ppMode->lumMode|= filters[i].mask;
                     -				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
                     -					if(q >= filters[i].minChromQuality)
                     -						ppMode->chromMode|= filters[i].mask;
+                    -
                     -				if(filters[i].mask == LEVEL_FIX)
                     -				{
                     -					int o;
                     -					ppMode->minAllowedY= 16;
                     -					ppMode->maxAllowedY= 234;
                     -					for(o=0; options[o]!=NULL; o++)
                     -					{
                     -						if(  !strcmp(options[o],"fullyrange")
                     -						   ||!strcmp(options[o],"f"))
                     -						{
                     -							ppMode->minAllowedY= 0;
                     -							ppMode->maxAllowedY= 255;
                     -							numOfUnknownOptions--;
                     -						}
                     -					}
                     -				}
                     -				else if(filters[i].mask == TEMP_NOISE_FILTER)
                     -				{
                     -					int o;
                     -					int numOfNoises=0;
+                    -
                     -					for(o=0; options[o]!=NULL; o++)
                     -					{
                     -						char *tail;
                     -						ppMode->maxTmpNoise[numOfNoises]=
                     -							strtol(options[o], &tail, 0);
                     -						if(tail!=options[o])
                     -						{
                     -							numOfNoises++;
                     -							numOfUnknownOptions--;
                     -							if(numOfNoises >= 3) break;
                     -						}
                     -					}
                     -				}
                     -				else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
                     -				{
                     -					int o;
+                    -
                     -					for(o=0; options[o]!=NULL && o<2; o++)
                     -					{
                     -						char *tail;
                     -						int val= strtol(options[o], &tail, 0);
                     -						if(tail==options[o]) break;
+                    -
                     -						numOfUnknownOptions--;
                     -						if(o==0) ppMode->baseDcDiff= val;
                     -						else ppMode->flatnessThreshold= val;
                     -					}
                     -				}
                     -				else if(filters[i].mask == FORCE_QUANT)
                     -				{
                     -					int o;
                     -					ppMode->forcedQuant= 15;
+                    -
                     -					for(o=0; options[o]!=NULL && o<1; o++)
                     -					{
                     -						char *tail;
                     -						int val= strtol(options[o], &tail, 0);
                     -						if(tail==options[o]) break;
+                    -
                     -						numOfUnknownOptions--;
                     -						ppMode->forcedQuant= val;
                     -					}
                     -				}
                     -			}
                     -		}
                     -		if(!filterNameOk) ppMode->error++;
                     -		ppMode->error += numOfUnknownOptions;
                     -	}
+                    -
                     -	if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
                     -	if(ppMode->error)
                     -	{
                     -		fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
                     -		free(ppMode);
                     -		return NULL;
                     -	}
                     -	return ppMode;
                     -}
+                    -
                     -void pp_free_mode(pp_mode_t *mode){ // release a mode returned by pp_get_mode_by_name_and_quality()
                     -    free(mode); // free(NULL) is a no-op, so a NULL mode is still accepted
                     -}
+                    -
                     -static void reallocAlign(void **p, int alignment, int size){ // replace *p by a fresh zero-filled aligned buffer of size bytes; old contents are discarded, *p is NULL on failure
                     -	if(*p) free(*p);
                     -	*p= memalign(alignment, size);
                     -	if(*p) memset(*p, 0, size); // memalign() returns NULL when out of memory; never memset through NULL
                     -}
+                    -
                     -static void reallocBuffers(PPContext *c, int width, int height, int stride){ // (re)allocate all work buffers of c for the given picture size and stride
                     -	int mbWidth = (width+15)>>4; // width in 16-pixel units, rounded up
                     -	int mbHeight= (height+15)>>4; // height in 16-pixel units, rounded up
                     -	int i;
+                    -
                     -	c->stride= stride; // remembered so pp_postprocess() can tell when a larger stride needs a realloc
+                    -
                     -	reallocAlign((void **)&c->tempDst, 8, stride*24);
                     -	reallocAlign((void **)&c->tempSrc, 8, stride*24);
                     -	reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
                     -	reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
                     -	for(i=0; i<256; i++) // reallocAlign() zero-filled the histogram; seed every bin with the same start value
                     -		c->yHistogram[i]= width*height/64*15/256;
+                    -
                     -	for(i=0; i<3; i++) // one pair of temporal noise buffers per index 0..2
                     -	{
                     -		//Note: the +17*1024 is just there so I don't have to worry about reads/writes past the end
                     -		reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
                     -		reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
                     -	}
+                    -
                     -	reallocAlign((void **)&c->deintTemp, 8, width+16);
                     -	reallocAlign((void **)&c->nonBQPTable, 8, mbWidth*mbHeight*sizeof(QP_STORE_T)); // one entry per 16x16 unit of the picture
                     -	reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); // a single row; pp_postprocess() reads it with QPStride 0
                     -}
+                    -
                     -pp_context_t *pp_get_context(int width, int height, int cpuCaps){ // allocate and initialise a context; returns NULL if the context struct cannot be allocated
                     -	PPContext *c= memalign(32, sizeof(PPContext));
                     -	int stride= (width+15)&(~15); //assumed / will realloc if needed
                     -	if(!c) return NULL; // memalign() failed; the memset() below would otherwise write through NULL
+                    -
                     -	memset(c, 0, sizeof(PPContext)); // all buffer pointers start out NULL so reallocBuffers() has nothing to free
                     -	c->cpuCaps= cpuCaps;
                     -	if(cpuCaps&PP_FORMAT){ // caller encoded the chroma subsampling in the flags
                     -		c->hChromaSubSample= cpuCaps&0x3; // bits 0-1: horizontal chroma shift
                     -		c->vChromaSubSample= (cpuCaps>>4)&0x3; // bits 4-5: vertical chroma shift
                     -	}else{ // no format given: shift by 1 in both directions (same values as PP_FORMAT_420)
                     -		c->hChromaSubSample= 1;
                     -		c->vChromaSubSample= 1;
                     -	}
+                    -
                     -	reallocBuffers(c, width, height, stride);
+                    -
                     -	c->frameNum=-1;
+                    -
                     -	return c;
                     -}
+                    -
                     -void pp_free_context(void *vc){ // free every buffer owned by the context, then the context itself; a NULL context is ignored
                     -	PPContext *c = (PPContext*)vc;
                     -	int i;
+                    -
                     -	if(!c) return; // nothing to release; the member accesses below would dereference NULL
+                    -
                     -	for(i=0; i<3; i++) free(c->tempBlured[i]);
                     -	for(i=0; i<3; i++) free(c->tempBluredPast[i]);
+                    -
                     -	free(c->tempBlocks);
                     -	free(c->yHistogram);
                     -	free(c->tempDst);
                     -	free(c->tempSrc);
                     -	free(c->deintTemp);
                     -	free(c->nonBQPTable);
                     -	free(c->forcedQPTable);
+                    -
                     -	memset(c, 0, sizeof(PPContext)); // wipe the now-dangling pointers before the struct itself is released
+                    -
                     -	free(c);
                     -}
+                    -
                     -void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                     -                 uint8_t * dst[3], int dstStride[3],
                     -                 int width, int height,
                     -                 QP_STORE_T *QP_store,  int QPStride,
                     -		 pp_mode_t *vm,  void *vc, int pict_type)
                     -{ // filters plane 0 via postProcess(), then either filters or plainly copies planes 1 and 2
                     -	int mbWidth = (width+15)>>4; // picture size in 16-pixel units, rounded up
                     -	int mbHeight= (height+15)>>4;
                     -	PPMode *mode = (PPMode*)vm;
                     -	PPContext *c = (PPContext*)vc;
                     -        int minStride= MAX(srcStride[0], dstStride[0]);
+                    -
                     -	if(c->stride < minStride) // buffers were sized for a smaller stride
                     -		reallocBuffers(c, width, height, minStride);
+                    -
                     -	if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) // no table supplied or quantizer forced: use a constant table
                     -	{
                     -		int i;
                     -		QP_store= c->forcedQPTable;
                     -		QPStride= 0; // every row index maps onto the single forcedQPTable row
                     -		if(mode->lumMode & FORCE_QUANT)
                     -			for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
                     -		else
                     -			for(i=0; i<mbWidth; i++) QP_store[i]= 1;
                     -	}
                     -if(0){ // disabled debug dump of the QP table
                     -int x,y;
                     -for(y=0; y<mbHeight; y++){
                     -	for(x=0; x<mbWidth; x++){
                     -		printf("%2d ", QP_store[x + y*QPStride]);
                     -	}
                     -	printf("\n");
                     -}
                     -	printf("\n");
                     -}
                     -//printf("pict_type:%d\n", pict_type);
+                    -
                     -	if(pict_type!=3) // NOTE(review): 3 presumably marks a B-frame (the table is named nonBQPTable) - confirm
                     -	{
                     -		int x,y;
                     -		for(y=0; y<mbHeight; y++){
                     -			for(x=0; x<mbWidth; x++){
                     -				int qscale= QP_store[x + y*QPStride];
                     -				if(qscale&~31) // any value outside 0..31 (negatives included) is replaced by 31
                     -				    qscale=31;
                     -				c->nonBQPTable[y*mbWidth + x]= qscale;
                     -			}
                     -		}
                     -	}
+                    -
                     -	if(verbose>2)
                     -	{
                     -		printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
                     -	}
+                    -
                     -	postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                     -		width, height, QP_store, QPStride, 0, mode, c);
+                    -
                     -	width  = (width )>>c->hChromaSubSample; // from here on width/height describe planes 1 and 2
                     -	height = (height)>>c->vChromaSubSample;
+                    -
                     -	if(mode->chromMode) // at least one chroma filter bit is set
                     -	{
                     -		postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                     -			width, height, QP_store, QPStride, 1, mode, c);
                     -		postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                     -			width, height, QP_store, QPStride, 2, mode, c);
                     -	}
                     -	else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]) // no chroma filtering, equal strides: copy each plane in one go
                     -	{
                     -		memcpy(dst[1], src[1], srcStride[1]*height);
                     -		memcpy(dst[2], src[2], srcStride[2]*height);
                     -	}
                     -	else // no chroma filtering, differing strides: copy row by row
                     -	{
                     -		int y;
                     -		for(y=0; y<height; y++)
                     -		{
                     -			memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
                     -			memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
                     -		}
                     -	}
                     -}
+                    -

postproc/postprocess.h

History View file @ bba9b16

                     deleted file mode 100644
@@ -1,73 +0,0 @@
                     -/*
                     -    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
+                    -
                     -    This program is free software; you can redistribute it and/or modify
                     -    it under the terms of the GNU General Public License as published by
                     -    the Free Software Foundation; either version 2 of the License, or
                     -    (at your option) any later version.
+                    -
                     -    This program is distributed in the hope that it will be useful,
                     -    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     -    GNU General Public License for more details.
+                    -
                     -    You should have received a copy of the GNU General Public License
                     -    along with this program; if not, write to the Free Software
                     -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     -*/
+                    -
                     -#ifndef NEWPOSTPROCESS_H
                     -#define NEWPOSTPROCESS_H
+                    -
                     -/**
                     - * @file postprocess.h
                     - * @brief
                     - *     external api for the pp stuff
                     - */
+                    -
                     -#ifdef __cplusplus
                     -extern "C" {
                     -#endif
+                    -
                     -#define PP_QUALITY_MAX 6
+                    -
                     -#define QP_STORE_T int8_t
+                    -
                     -typedef void pp_context_t;
                     -typedef void pp_mode_t;
+                    -
                     -extern char *pp_help; //a simple help text
+                    -
                     -void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                     -                 uint8_t * dst[3], int dstStride[3],
                     -                 int horizontalSize, int verticalSize,
                     -                 QP_STORE_T *QP_store,  int QP_stride,
                     -		 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);
+                    -
+                    -
                     -/**
                     - * returns a pp_mode_t or NULL if an error occurred
                     - * name is the string after "-pp" on the command line
                     - * quality is a number from 0 to PP_QUALITY_MAX
                     - */
                     -pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality);
                     -void pp_free_mode(pp_mode_t *mode);
+                    -
                     -pp_context_t *pp_get_context(int width, int height, int flags);
                     -void pp_free_context(pp_context_t *ppContext);
+                    -
                     -#define PP_CPU_CAPS_MMX   0x80000000
                     -#define PP_CPU_CAPS_MMX2  0x20000000
                     -#define PP_CPU_CAPS_3DNOW 0x40000000
+                    -
                     -#define PP_FORMAT         0x00000008
                     -#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
                     -#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
                     -#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
                     -#define PP_FORMAT_444    (0x00000000|PP_FORMAT)
+                    -
                     -#ifdef __cplusplus
                     -}
                     -#endif
+                    -
                     -#endif

postproc/postprocess_internal.h

History View file @ bba9b16

                     deleted file mode 100644
@@ -1,128 +0,0 @@
                     -/*
                     -    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
+                    -
                     -    This program is free software; you can redistribute it and/or modify
                     -    it under the terms of the GNU General Public License as published by
                     -    the Free Software Foundation; either version 2 of the License, or
                     -    (at your option) any later version.
+                    -
                     -    This program is distributed in the hope that it will be useful,
                     -    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     -    GNU General Public License for more details.
+                    -
                     -    You should have received a copy of the GNU General Public License
                     -    along with this program; if not, write to the Free Software
                     -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     -*/
+                    -
                     -#define V_DEBLOCK	0x01
                     -#define H_DEBLOCK	0x02
                     -#define DERING		0x04
                     -#define LEVEL_FIX	0x08 /* Brightness & Contrast */
+                    -
                     -#define LUM_V_DEBLOCK	V_DEBLOCK		//   1
                     -#define LUM_H_DEBLOCK	H_DEBLOCK		//   2
                     -#define CHROM_V_DEBLOCK	(V_DEBLOCK<<4)		//  16
                     -#define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)		//  32
                     -#define LUM_DERING	DERING			//   4
                     -#define CHROM_DERING	(DERING<<4)		//  64
                     -#define LUM_LEVEL_FIX	LEVEL_FIX		//   8
                     -#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)		// 128 (not implemented yet)
+                    -
                     -// Experimental vertical filters
                     -#define V_X1_FILTER	0x0200			// 512
+                    -
                     -// Experimental horizontal filters
                     -#define H_X1_FILTER	0x2000			// 8192
+                    -
                     -// select between full y range (255-0) or the standard one (234-16)
                     -#define FULL_Y_RANGE	0x8000			// 32768
+                    -
                     -//Deinterlacing Filters
                     -#define	LINEAR_IPOL_DEINT_FILTER	0x10000	// 65536
                     -#define	LINEAR_BLEND_DEINT_FILTER	0x20000	// 131072
                     -#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet)
                     -#define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144
                     -#define	MEDIAN_DEINT_FILTER		0x80000	// 524288
                     -#define	FFMPEG_DEINT_FILTER		0x400000
+                    -
                     -#define TEMP_NOISE_FILTER		0x100000
                     -#define FORCE_QUANT			0x200000
+                    -
                     -//define this if you want faster postprocessing code
                     -//it cannot differentiate between chroma & luma filters (both on or both off)
                     -//obviously the -pp option on the command line then has no effect except turning on
                     -//the filters selected here
                     -//#define COMPILE_TIME_MODE 0x77
+                    -
                     -struct PPFilter{ // one entry of the table of selectable filters
                     -	char *shortName; // abbreviated name accepted in the mode string
                     -	char *longName; // full name accepted in the mode string
                     -	int chromDefault; 	// is chrominance filtering on by default if this filter is manually activated
                     -	int minLumQuality; 	// minimum quality to turn luminance filtering on
                     -	int minChromQuality;	// minimum quality to turn chrominance filtering on
                     -	int mask; 		// Bitmask to turn this filter on
                     -};
+                    -
                     -typedef struct PPMode{ // parsed form of a postprocess mode string
                     -	int lumMode; 			// activates filters for luminance
                     -	int chromMode; 			// activates filters for chrominance
                     -	int error; 			// non zero on error
+                    -
                     -	int minAllowedY; 		// for brightness correction
                     -	int maxAllowedY; 		// for brightness correction
                     -	float maxClippedThreshold;	// amount of "black" you are willing to lose to get a brightness corrected picture
+                    -
                     -	int maxTmpNoise[3]; 		// for Temporal Noise Reducing filter (Maximal sum of abs differences)
+                    -
                     -	int baseDcDiff; // first numeric option of the deblock filters
                     -	int flatnessThreshold; // second numeric option of the deblock filters
+                    -
                     -	int forcedQuant; 		// quantizer if FORCE_QUANT is used
                     -} PPMode;
+                    -
                     -typedef struct PPContext{ // per-instance working state: scratch buffers, QP tables and picture format
                     -	uint8_t *tempBlocks; //used for the horizontal code
+                    -
                     -	/* we need 64bit here, otherwise we will have a problem
                     -	   after watching a black picture for 5 hours*/
                     -	uint64_t *yHistogram;
+                    -
                     -	uint64_t __attribute__((aligned(8))) packedYOffset;
                     -	uint64_t __attribute__((aligned(8))) packedYScale;
+                    -
                     -	/* Temporal noise reducing buffers */
                     -	uint8_t *tempBlured[3];
                     -	int32_t *tempBluredPast[3];
+                    -
                     -	/* Temporary buffers for handling the last row(s) */
                     -	uint8_t *tempDst;
                     -	uint8_t *tempSrc;
+                    -
                     -	uint8_t *deintTemp;
+                    -
                     -	uint64_t __attribute__((aligned(8))) pQPb;
                     -	uint64_t __attribute__((aligned(8))) pQPb2;
+                    -
                     -	uint64_t __attribute__((aligned(8))) mmxDcOffset[32];
                     -	uint64_t __attribute__((aligned(8))) mmxDcThreshold[32];
+                    -
                     -	QP_STORE_T *nonBQPTable; // clamped copy of the QP table, refreshed by pp_postprocess() unless pict_type is 3
                     -	QP_STORE_T *forcedQPTable; // single-row constant table used when no QP table is given or FORCE_QUANT is set
+                    -
                     -	int QP;
                     -	int nonBQP;
+                    -
                     -	int frameNum;
+                    -
                     -	int cpuCaps;
+                    -
                     -	int stride; //size of some buffers (needed to realloc them if needed)
+                    -
                     -	int hChromaSubSample; // right-shift applied to the width for planes 1 and 2
                     -	int vChromaSubSample; // right-shift applied to the height for planes 1 and 2
+                    -
                     -	PPMode ppMode;
                     -} PPContext;
+                    -

postproc/postprocess_template.c

History View file @ bba9b16

                     deleted file mode 100644
@@ -1,3127 +0,0 @@
                     -/*
                     -    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
+                    -
                     -    This program is free software; you can redistribute it and/or modify
                     -    it under the terms of the GNU General Public License as published by
                     -    the Free Software Foundation; either version 2 of the License, or
                     -    (at your option) any later version.
+                    -
                     -    This program is distributed in the hope that it will be useful,
                     -    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     -    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     -    GNU General Public License for more details.
+                    -
                     -    You should have received a copy of the GNU General Public License
                     -    along with this program; if not, write to the Free Software
                     -    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     -*/
+                    -
                     -#undef PAVGB // NOTE(review): presumably this template is included more than once with different HAVE_* settings - confirm
                     -#undef PMINUB
                     -#undef PMAXUB
+                    -
                     -#ifdef HAVE_MMX2 // PAVGB is only defined when MMX2 or 3DNow is available
                     -#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
                     -#elif defined (HAVE_3DNOW)
                     -#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
                     -#endif
+                    -
                     -#ifdef HAVE_MMX2 // native instruction: second argument = min(first, second); t is unused
                     -#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
                     -#elif defined (HAVE_MMX) // emulation with scratch register t; parameters are named (b,a,t) so the result still lands in the second argument
                     -#define PMINUB(b,a,t) \
                     -	"movq " #a ", " #t " \n\t"\
                     -	"psubusb " #b ", " #t " \n\t"\
                     -	"psubb " #t ", " #a " \n\t"
                     -#endif
+                    -
                     -#ifdef HAVE_MMX2 // native instruction: b = max(a, b)
                     -#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
                     -#elif defined (HAVE_MMX) // emulation: b = (b - a, saturated at 0) + a = max(a, b)
                     -#define PMAXUB(a,b) \
                     -	"psubusb " #a ", " #b " \n\t"\
                     -	"paddb " #a ", " #b " \n\t"
                     -#endif
+                    -
+                    -
                     -//FIXME? |255-0| = 1 (shouldnt be a problem ...)
                     -#ifdef HAVE_MMX
                     -/**
                     - * Check if the middle 8x8 Block in the given 8x16 block is flat
                     - */
                     -static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
                     -	int numEq= 0; // receives the packed result of the asm block below
                     -	src+= stride*4; // src points to begin of the 8x8 Block
                     -asm volatile(
                     -		"leal (%1, %2), %%eax				\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%1	eax	eax+%2	eax+2%2	%1+4%2	ecx	ecx+%2	ecx+2%2	%1+8%2	ecx+4%2
                     -		"movq %3, %%mm7					\n\t" // mm7 = c->mmxDcOffset[c->nonBQP]
                     -		"movq %4, %%mm6					\n\t" // mm6 = c->mmxDcThreshold[c->nonBQP]
+                    -
                     -		"movq (%1), %%mm0				\n\t"
                     -		"movq (%%eax), %%mm1				\n\t"
                     -		"psubb %%mm1, %%mm0				\n\t" // mm0 = difference
                     -		"paddb %%mm7, %%mm0				\n\t"
                     -		"pcmpgtb %%mm6, %%mm0				\n\t" // each byte becomes -1 if (difference + offset) > threshold, else 0
+                    -
                     -		"movq (%%eax,%2), %%mm2				\n\t"
                     -		"psubb %%mm2, %%mm1				\n\t"
                     -		"paddb %%mm7, %%mm1				\n\t"
                     -		"pcmpgtb %%mm6, %%mm1				\n\t"
                     -		"paddb %%mm1, %%mm0				\n\t" // accumulate the per-byte -1/0 results in mm0
+                    -
                     -		"movq (%%eax, %2, 2), %%mm1			\n\t"
                     -		"psubb %%mm1, %%mm2				\n\t"
                     -		"paddb %%mm7, %%mm2				\n\t"
                     -		"pcmpgtb %%mm6, %%mm2				\n\t"
                     -		"paddb %%mm2, %%mm0				\n\t"
+                    -
                     -		"leal (%%eax, %2, 4), %%eax			\n\t"
+                    -
                     -		"movq (%1, %2, 4), %%mm2			\n\t"
                     -		"psubb %%mm2, %%mm1				\n\t"
                     -		"paddb %%mm7, %%mm1				\n\t"
                     -		"pcmpgtb %%mm6, %%mm1				\n\t"
                     -		"paddb %%mm1, %%mm0				\n\t"
+                    -
                     -		"movq (%%eax), %%mm1				\n\t"
                     -		"psubb %%mm1, %%mm2				\n\t"
                     -		"paddb %%mm7, %%mm2				\n\t"
                     -		"pcmpgtb %%mm6, %%mm2				\n\t"
                     -		"paddb %%mm2, %%mm0				\n\t"
+                    -
                     -		"movq (%%eax, %2), %%mm2			\n\t"
                     -		"psubb %%mm2, %%mm1				\n\t"
                     -		"paddb %%mm7, %%mm1				\n\t"
                     -		"pcmpgtb %%mm6, %%mm1				\n\t"
                     -		"paddb %%mm1, %%mm0				\n\t"
+                    -
                     -		"movq (%%eax, %2, 2), %%mm1			\n\t"
                     -		"psubb %%mm1, %%mm2				\n\t"
                     -		"paddb %%mm7, %%mm2				\n\t"
                     -		"pcmpgtb %%mm6, %%mm2				\n\t"
                     -		"paddb %%mm2, %%mm0				\n\t"
+                    -
                     -		"						\n\t"
                     -#ifdef HAVE_MMX2
                     -		"pxor %%mm7, %%mm7				\n\t"
                     -		"psadbw %%mm7, %%mm0				\n\t"
                     -#else // no psadbw: fold the eight byte counters together with shifts and byte adds
                     -		"movq %%mm0, %%mm1				\n\t"
                     -		"psrlw $8, %%mm0				\n\t"
                     -		"paddb %%mm1, %%mm0				\n\t"
                     -		"movq %%mm0, %%mm1				\n\t"
                     -		"psrlq $16, %%mm0				\n\t"
                     -		"paddb %%mm1, %%mm0				\n\t"
                     -		"movq %%mm0, %%mm1				\n\t"
                     -		"psrlq $32, %%mm0				\n\t"
                     -		"paddb %%mm1, %%mm0				\n\t"
                     -#endif
                     -		"movd %%mm0, %0					\n\t"
                     -		: "=r" (numEq)
                     -		: "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
                     -		: "%eax"
                     -		);
                     -	numEq= (-numEq) &0xFF; // negate the low byte (comparison hits were accumulated as -1 each) to get a positive count
                     -	return numEq > c->ppMode.flatnessThreshold; // non-zero when the count exceeds the configured threshold
                     -}
                     -#endif
+                    -
                     -static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
                     -{ // returns 1 if the two compared rows differ by at most 2*QP in every checked column, else 0
                     -#ifdef HAVE_MMX
                     -	int isOk;
                     -	src+= stride*3;
                     -	asm volatile(
                     -		"movq (%1, %2), %%mm0				\n\t"
                     -		"movq (%1, %2, 8), %%mm1			\n\t"
                     -		"movq %%mm0, %%mm2				\n\t"
                     -		"psubusb %%mm1, %%mm0				\n\t"
                     -		"psubusb %%mm2, %%mm1				\n\t"
                     -		"por %%mm1, %%mm0				\n\t" // ABS Diff
+                    -
                     -		"movq %3, %%mm7					\n\t" // QP,..., QP
                     -		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
                     -		"psubusb %%mm7, %%mm0				\n\t" // Diff <= 2QP -> 0
                     -		"packssdw %%mm0, %%mm0				\n\t"
                     -		"movd %%mm0, %0					\n\t"
                     -		: "=r" (isOk)
                     -		: "r" (src), "r" (stride), "m" (c->pQPb)
                     -		);
                     -	return isOk==0; // zero means no byte exceeded the 2*QP limit
                     -#else
                     -#if 1 // active C variant: compares the same two rows as the MMX code above
                     -	int x;
                     -	const int QP= c->QP;
                     -	src+= stride*3;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; // single unsigned compare rejects differences outside [-2*QP, 2*QP]
                     -	}
+                    -
                     -	return 1;
                     -#else // disabled alternative: per-column min/max over 8 rows
                     -	int x;
                     -	const int QP= c->QP;
                     -	src+= stride*4;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		int min=255;
                     -		int max=0;
                     -		int y;
                     -		for(y=0; y<8; y++){
                     -			int v= src[x + y*stride];
                     -			if(v>max) max=v;
                     -			if(v<min) min=v;
                     -		}
                     -		if(max-min > 2*QP) return 0;
                     -	}
                     -	return 1;
                     -#endif
                     -#endif
                     -}
+                    -
                     -/**
                     - * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
                     - * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
                     - */
                     -static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= stride*3;
                     -	asm volatile(	//"movv %0 %1 %2\n\t"
                     -		"movq %2, %%mm0			\n\t"  // QP,..., QP
                     -		"pxor %%mm4, %%mm4				\n\t"
+                    -
                     -		"movq (%0), %%mm6				\n\t"
                     -		"movq (%0, %1), %%mm5				\n\t"
                     -		"movq %%mm5, %%mm1				\n\t"
                     -		"movq %%mm6, %%mm2				\n\t"
                     -		"psubusb %%mm6, %%mm5				\n\t"
                     -		"psubusb %%mm1, %%mm2				\n\t"
                     -		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
                     -		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
                     -		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
+                    -
                     -		"pand %%mm2, %%mm6				\n\t"
                     -		"pandn %%mm1, %%mm2				\n\t"
                     -		"por %%mm2, %%mm6				\n\t"// First Line to Filter
+                    -
                     -		"movq (%0, %1, 8), %%mm5			\n\t"
                     -		"leal (%0, %1, 4), %%eax			\n\t"
                     -		"leal (%0, %1, 8), %%ecx			\n\t"
                     -		"subl %1, %%ecx					\n\t"
                     -		"addl %1, %0					\n\t" // %0 points to line 1 not 0
                     -		"movq (%0, %1, 8), %%mm7			\n\t"
                     -		"movq %%mm5, %%mm1				\n\t"
                     -		"movq %%mm7, %%mm2				\n\t"
                     -		"psubusb %%mm7, %%mm5				\n\t"
                     -		"psubusb %%mm1, %%mm2				\n\t"
                     -		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
                     -		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
                     -		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
+                    -
                     -		"pand %%mm2, %%mm7				\n\t"
                     -		"pandn %%mm1, %%mm2				\n\t"
                     -		"por %%mm2, %%mm7				\n\t" // Last Line to Filter
+                    -
+                    -
                     -		// 	1	2	3	4	5	6	7	8
                     -		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ecx	eax+4%1
                     -		// 6 4 2 2 1 1
                     -		// 6 4 4 2
                     -		// 6 8 2
+                    -
                     -		"movq (%0, %1), %%mm0				\n\t" //  1
                     -		"movq %%mm0, %%mm1				\n\t" //  1
                     -		PAVGB(%%mm6, %%mm0)				      //1 1	/2
                     -		PAVGB(%%mm6, %%mm0)				      //3 1	/4
+                    -
                     -		"movq (%0, %1, 4), %%mm2			\n\t" //     1
                     -		"movq %%mm2, %%mm5				\n\t" //     1
                     -		PAVGB((%%eax), %%mm2)				      //    11	/2
                     -		PAVGB((%0, %1, 2), %%mm2)			      //   211	/4
                     -		"movq %%mm2, %%mm3				\n\t" //   211	/4
                     -		"movq (%0), %%mm4				\n\t" // 1
                     -		PAVGB(%%mm4, %%mm3)				      // 4 211	/8
                     -		PAVGB(%%mm0, %%mm3)				      //642211	/16
                     -		"movq %%mm3, (%0)				\n\t" // X
                     -		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
                     -		"movq %%mm1, %%mm0				\n\t" //  1
                     -		PAVGB(%%mm6, %%mm0)				      //1 1	/2
                     -		"movq %%mm4, %%mm3				\n\t" // 1
                     -		PAVGB((%0,%1,2), %%mm3)				      // 1 1	/2
                     -		PAVGB((%%eax,%1,2), %%mm5)			      //     11	/2
                     -		PAVGB((%%eax), %%mm5)				      //    211 /4
                     -		PAVGB(%%mm5, %%mm3)				      // 2 2211 /8
                     -		PAVGB(%%mm0, %%mm3)				      //4242211 /16
                     -		"movq %%mm3, (%0,%1)				\n\t" //  X
                     -		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
                     -		PAVGB(%%mm4, %%mm6)				      //11	/2
                     -		"movq (%%ecx), %%mm0				\n\t" //       1
                     -		PAVGB((%%eax, %1, 2), %%mm0)			      //      11/2
                     -		"movq %%mm0, %%mm3				\n\t" //      11/2
                     -		PAVGB(%%mm1, %%mm0)				      //  2   11/4
                     -		PAVGB(%%mm6, %%mm0)				      //222   11/8
                     -		PAVGB(%%mm2, %%mm0)				      //22242211/16
                     -		"movq (%0, %1, 2), %%mm2			\n\t" //   1
                     -		"movq %%mm0, (%0, %1, 2)			\n\t" //   X
                     -		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
                     -		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
                     -		PAVGB((%%ecx), %%mm0)				      //       11	/2
                     -		PAVGB(%%mm0, %%mm6)				      //11     11	/4
                     -		PAVGB(%%mm1, %%mm4)				      // 11		/2
                     -		PAVGB(%%mm2, %%mm1)				      //  11		/2
                     -		PAVGB(%%mm1, %%mm6)				      //1122   11	/8
                     -		PAVGB(%%mm5, %%mm6)				      //112242211	/16
                     -		"movq (%%eax), %%mm5				\n\t" //    1
                     -		"movq %%mm6, (%%eax)				\n\t" //    X
                     -		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
                     -		"movq (%%eax, %1, 4), %%mm6			\n\t" //        1
                     -		PAVGB(%%mm7, %%mm6)				      //        11	/2
                     -		PAVGB(%%mm4, %%mm6)				      // 11     11	/4
                     -		PAVGB(%%mm3, %%mm6)				      // 11   2211	/8
                     -		PAVGB(%%mm5, %%mm2)				      //   11		/2
                     -		"movq (%0, %1, 4), %%mm4			\n\t" //     1
                     -		PAVGB(%%mm4, %%mm2)				      //   112		/4
                     -		PAVGB(%%mm2, %%mm6)				      // 112242211	/16
                     -		"movq %%mm6, (%0, %1, 4)			\n\t" //     X
                     -		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
                     -		PAVGB(%%mm7, %%mm1)				      //  11     2	/4
                     -		PAVGB(%%mm4, %%mm5)				      //    11		/2
                     -		PAVGB(%%mm5, %%mm0)				      //    11 11	/4
                     -		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
                     -		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
                     -		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
                     -		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
                     -		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
                     -		PAVGB((%%ecx), %%mm2)				      //   112 4	/8
                     -		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
                     -		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
                     -		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
                     -		PAVGB(%%mm2, %%mm6)				      //   1122424	/16
                     -		"movq %%mm6, (%%ecx)				\n\t" //       X
                     -		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
                     -		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
                     -		PAVGB(%%mm7, %%mm5)				      //    11   6	/8
+                    -
                     -		PAVGB(%%mm3, %%mm0)				      //      112	/4
                     -		PAVGB(%%mm0, %%mm5)				      //    112246	/16
                     -		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
                     -		"subl %1, %0					\n\t"
+                    -
                     -		:
                     -		: "r" (src), "r" (stride), "m" (c->pQPb)
                     -		: "%eax", "%ecx"
                     -	);
                     -#else
                     -	const int l1= stride;
                     -	const int l2= stride + l1;
                     -	const int l3= stride + l2;
                     -	const int l4= stride + l3;
                     -	const int l5= stride + l4;
                     -	const int l6= stride + l5;
                     -	const int l7= stride + l6;
                     -	const int l8= stride + l7;
                     -	const int l9= stride + l8;
                     -	int x;
                     -	src+= stride*3;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
                     -		const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
+                    -
                     -		int sums[9];
                     -		sums[0] = first + src[l1];
                     -		sums[1] = src[l1] + src[l2];
                     -		sums[2] = src[l2] + src[l3];
                     -		sums[3] = src[l3] + src[l4];
                     -		sums[4] = src[l4] + src[l5];
                     -		sums[5] = src[l5] + src[l6];
                     -		sums[6] = src[l6] + src[l7];
                     -		sums[7] = src[l7] + src[l8];
                     -		sums[8] = src[l8] + last;
+                    -
                     -		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
                     -		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
                     -		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
                     -		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
                     -		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
                     -		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
                     -		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
                     -		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
+                    -
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -#if 0
                     -/**
                     - * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
                     - * values are correctly clipped (MMX2)
                     - * values are wraparound (C)
                     - * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
                     -	0 8 16 24
                     -	x = 8
                     -	x/2 = 4
                     -	x/8 = 1
                     -	1 12 12 23
                     - */
                     -static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= stride*3;
                     -// FIXME rounding
                     -	asm volatile(
                     -		"pxor %%mm7, %%mm7				\n\t" // 0
                     -		"movq "MANGLE(b80)", %%mm6			\n\t" // MIN_SIGNED_BYTE
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
                     -		"movq "MANGLE(pQPb)", %%mm0			\n\t" // QP,..., QP
                     -		"movq %%mm0, %%mm1				\n\t" // QP,..., QP
                     -		"paddusb "MANGLE(b02)", %%mm0			\n\t"
                     -		"psrlw $2, %%mm0				\n\t"
                     -		"pand "MANGLE(b3F)", %%mm0			\n\t" // QP/4,..., QP/4
                     -		"paddusb %%mm1, %%mm0				\n\t" // QP*1.25 ...
                     -		"movq (%0, %1, 4), %%mm2			\n\t" // line 4
                     -		"movq (%%ecx), %%mm3				\n\t" // line 5
                     -		"movq %%mm2, %%mm4				\n\t" // line 4
                     -		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
                     -		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
                     -		PAVGB(%%mm3, %%mm5)
                     -		"paddb %%mm6, %%mm5				\n\t" // (l5-l4)/2
                     -		"psubusb %%mm3, %%mm4				\n\t"
                     -		"psubusb %%mm2, %%mm3				\n\t"
                     -		"por %%mm3, %%mm4				\n\t" // |l4 - l5|
                     -		"psubusb %%mm0, %%mm4				\n\t"
                     -		"pcmpeqb %%mm7, %%mm4				\n\t"
                     -		"pand %%mm4, %%mm5				\n\t" // d/2
+                    -
                     -//		"paddb %%mm6, %%mm2				\n\t" // line 4 + 0x80
                     -		"paddb %%mm5, %%mm2				\n\t"
                     -//		"psubb %%mm6, %%mm2				\n\t"
                     -		"movq %%mm2, (%0,%1, 4)				\n\t"
+                    -
                     -		"movq (%%ecx), %%mm2				\n\t"
                     -//		"paddb %%mm6, %%mm2				\n\t" // line 5 + 0x80
                     -		"psubb %%mm5, %%mm2				\n\t"
                     -//		"psubb %%mm6, %%mm2				\n\t"
                     -		"movq %%mm2, (%%ecx)				\n\t"
+                    -
                     -		"paddb %%mm6, %%mm5				\n\t"
                     -		"psrlw $2, %%mm5				\n\t"
                     -		"pand "MANGLE(b3F)", %%mm5			\n\t"
                     -		"psubb "MANGLE(b20)", %%mm5			\n\t" // (l5-l4)/8
+                    -
                     -		"movq (%%eax, %1, 2), %%mm2			\n\t"
                     -		"paddb %%mm6, %%mm2				\n\t" // line 3 + 0x80
                     -		"paddsb %%mm5, %%mm2				\n\t"
                     -		"psubb %%mm6, %%mm2				\n\t"
                     -		"movq %%mm2, (%%eax, %1, 2)			\n\t"
+                    -
                     -		"movq (%%ecx, %1), %%mm2			\n\t"
                     -		"paddb %%mm6, %%mm2				\n\t" // line 6 + 0x80
                     -		"psubsb %%mm5, %%mm2				\n\t"
                     -		"psubb %%mm6, %%mm2				\n\t"
                     -		"movq %%mm2, (%%ecx, %1)			\n\t"
+                    -
                     -		:
                     -		: "r" (src), "r" (stride)
                     -		: "%eax", "%ecx"
                     -	);
                     -#else
                     - 	const int l1= stride;
                     -	const int l2= stride + l1;
                     -	const int l3= stride + l2;
                     -	const int l4= stride + l3;
                     -	const int l5= stride + l4;
                     -	const int l6= stride + l5;
                     -//	const int l7= stride + l6;
                     -//	const int l8= stride + l7;
                     -//	const int l9= stride + l8;
                     -	int x;
                     -	const int QP15= QP + (QP>>2);
                     -	src+= stride*3;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		const int v = (src[x+l5] - src[x+l4]);
                     -		if(ABS(v) < QP15)
                     -		{
                     -			src[x+l3] +=v>>3;
                     -			src[x+l4] +=v>>1;
                     -			src[x+l5] -=v>>1;
                     -			src[x+l6] -=v>>3;
+                    -
                     -		}
                     -	}
+                    -
                     -#endif
                     -}
                     -#endif
+                    -
                     -/**
                     - * Experimental Filter 1
                     - * will not damage linear gradients
                     - * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
                     - * can only smooth blocks at the expected locations (it can't smooth them if they did move)
                     - * MMX2 version does correct clipping, C version doesn't
                     - */
                     -static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= stride*3;
+                    -
                     -	asm volatile(
                     -		"pxor %%mm7, %%mm7				\n\t" // 0
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
                     -		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
                     -		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
                     -		"movq %%mm1, %%mm2				\n\t" // line 4
                     -		"psubusb %%mm0, %%mm1				\n\t"
                     -		"psubusb %%mm2, %%mm0				\n\t"
                     -		"por %%mm1, %%mm0				\n\t" // |l3 - l4|
                     -		"movq (%%ecx), %%mm3				\n\t" // line 5
                     -		"movq (%%ecx, %1), %%mm4			\n\t" // line 6
                     -		"movq %%mm3, %%mm5				\n\t" // line 5
                     -		"psubusb %%mm4, %%mm3				\n\t"
                     -		"psubusb %%mm5, %%mm4				\n\t"
                     -		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
                     -		PAVGB(%%mm3, %%mm0)				      // (|l3 - l4| + |l5 - l6|)/2
                     -		"movq %%mm2, %%mm1				\n\t" // line 4
                     -		"psubusb %%mm5, %%mm2				\n\t"
                     -		"movq %%mm2, %%mm4				\n\t"
                     -		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
                     -		"psubusb %%mm1, %%mm5				\n\t"
                     -		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
                     -		"psubusb %%mm0, %%mm4		\n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
                     -		"movq %%mm4, %%mm3				\n\t" // d
                     -		"movq %2, %%mm0			\n\t"
                     -                "paddusb %%mm0, %%mm0				\n\t"
                     -		"psubusb %%mm0, %%mm4				\n\t"
                     -		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= 2QP ? -1 : 0
                     -		"psubusb "MANGLE(b01)", %%mm3			\n\t"
                     -		"pand %%mm4, %%mm3				\n\t" // d <= 2QP ? d : 0
+                    -
                     -		PAVGB(%%mm7, %%mm3)				      // d/2
                     -		"movq %%mm3, %%mm1				\n\t" // d/2
                     -		PAVGB(%%mm7, %%mm3)				      // d/4
                     -		PAVGB(%%mm1, %%mm3)				      // 3*d/8
+                    -
                     -		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
                     -		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
                     -		"psubusb %%mm3, %%mm0				\n\t"
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
+                    -
                     -		"movq (%%ecx), %%mm0				\n\t" // line 5
                     -		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
                     -		"paddusb %%mm3, %%mm0				\n\t"
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"movq %%mm0, (%%ecx)				\n\t" // line 5
+                    -
                     -		PAVGB(%%mm7, %%mm1)				      // d/4
+                    -
                     -		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
                     -		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
                     -		"psubusb %%mm1, %%mm0				\n\t"
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
+                    -
                     -		"movq (%%ecx, %1), %%mm0			\n\t" // line 6
                     -		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
                     -		"paddusb %%mm1, %%mm0				\n\t"
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"movq %%mm0, (%%ecx, %1)			\n\t" // line 6
+                    -
                     -		PAVGB(%%mm7, %%mm1)				      // d/8
+                    -
                     -		"movq (%%eax, %1), %%mm0			\n\t" // line 2
                     -		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
                     -		"psubusb %%mm1, %%mm0				\n\t"
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
+                    -
                     -		"movq (%%ecx, %1, 2), %%mm0			\n\t" // line 7
                     -		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
                     -		"paddusb %%mm1, %%mm0				\n\t"
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"movq %%mm0, (%%ecx, %1, 2)			\n\t" // line 7
+                    -
                     -		:
                     -		: "r" (src), "r" (stride), "m" (co->pQPb)
                     -		: "%eax", "%ecx"
                     -	);
                     -#else
+                    -
                     - 	const int l1= stride;
                     -	const int l2= stride + l1;
                     -	const int l3= stride + l2;
                     -	const int l4= stride + l3;
                     -	const int l5= stride + l4;
                     -	const int l6= stride + l5;
                     -	const int l7= stride + l6;
                     -//	const int l8= stride + l7;
                     -//	const int l9= stride + l8;
                     -	int x;
+                    -
                     -	src+= stride*3;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		int a= src[l3] - src[l4];
                     -		int b= src[l4] - src[l5];
                     -		int c= src[l5] - src[l6];
+                    -
                     -		int d= ABS(b) - ((ABS(a) + ABS(c))>>1);
                     -		d= MAX(d, 0);
+                    -
                     -		if(d < co->QP*2)
                     -		{
                     -			int v = d * SIGN(-b);
+                    -
                     -			src[l2] +=v>>3;
                     -			src[l3] +=v>>2;
                     -			src[l4] +=(3*v)>>3;
                     -			src[l5] -=(3*v)>>3;
                     -			src[l6] -=v>>2;
                     -			src[l7] -=v>>3;
+                    -
                     -		}
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -/*
                     -	uint8_t tmp[16];
                     -	const int l1= stride;
                     -	const int l2= stride + l1;
                     -	const int l3= stride + l2;
                     -	const int l4= (int)tmp - (int)src - stride*3;
                     -	const int l5= (int)tmp - (int)src - stride*3 + 8;
                     -	const int l6= stride*3 + l3;
                     -	const int l7= stride + l6;
                     -	const int l8= stride + l7;
+                    -
                     -	memcpy(tmp, src+stride*7, 8);
                     -	memcpy(tmp+8, src+stride*8, 8);
                     -*/
                     -	src+= stride*4;
                     -	asm volatile(
+                    -
                     -#if 0 //slightly more accurate and slightly slower
                     -		"pxor %%mm7, %%mm7				\n\t" // 0
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     -//	0	1	2	3	4	5	6	7
                     -//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
+                    -
+                    -
                     -		"movq (%0, %1, 2), %%mm0			\n\t" // l2
                     -		"movq (%0), %%mm1				\n\t" // l0
                     -		"movq %%mm0, %%mm2				\n\t" // l2
                     -		PAVGB(%%mm7, %%mm0)				      // ~l2/2
                     -		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
                     -		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8
+                    -
                     -		"movq (%%eax), %%mm1				\n\t" // l1
                     -		"movq (%%eax, %1, 2), %%mm3			\n\t" // l3
                     -		"movq %%mm1, %%mm4				\n\t" // l1
                     -		PAVGB(%%mm7, %%mm1)				      // ~l1/2
                     -		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
                     -		PAVGB(%%mm4, %%mm1)				      // ~(5l1 + 2l3)/8
+                    -
                     -		"movq %%mm0, %%mm4				\n\t" // ~(5l2 + 2l0)/8
                     -		"psubusb %%mm1, %%mm0				\n\t"
                     -		"psubusb %%mm4, %%mm1				\n\t"
                     -		"por %%mm0, %%mm1				\n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
                     -// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
+                    -
                     -		"movq (%0, %1, 4), %%mm0			\n\t" // l4
                     -		"movq %%mm0, %%mm4				\n\t" // l4
                     -		PAVGB(%%mm7, %%mm0)				      // ~l4/2
                     -		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
                     -		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8
+                    -
                     -		"movq (%%ecx), %%mm2				\n\t" // l5
                     -		"movq %%mm3, %%mm5				\n\t" // l3
                     -		PAVGB(%%mm7, %%mm3)				      // ~l3/2
                     -		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
                     -		PAVGB(%%mm5, %%mm3)				      // ~(5l3 + 2l5)/8
+                    -
                     -		"movq %%mm0, %%mm6				\n\t" // ~(5l4 + 2l2)/8
                     -		"psubusb %%mm3, %%mm0				\n\t"
                     -		"psubusb %%mm6, %%mm3				\n\t"
                     -		"por %%mm0, %%mm3				\n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
                     -		"pcmpeqb %%mm7, %%mm0				\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
                     -// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
+                    -
                     -		"movq (%%ecx, %1), %%mm6			\n\t" // l6
                     -		"movq %%mm6, %%mm5				\n\t" // l6
                     -		PAVGB(%%mm7, %%mm6)				      // ~l6/2
                     -		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
                     -		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8
+                    -
                     -		"movq (%%ecx, %1, 2), %%mm5			\n\t" // l7
                     -		"movq %%mm2, %%mm4				\n\t" // l5
                     -		PAVGB(%%mm7, %%mm2)				      // ~l5/2
                     -		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
                     -		PAVGB(%%mm4, %%mm2)				      // ~(5l5 + 2l7)/8
+                    -
                     -		"movq %%mm6, %%mm4				\n\t" // ~(5l6 + 2l4)/8
                     -		"psubusb %%mm2, %%mm6				\n\t"
                     -		"psubusb %%mm4, %%mm2				\n\t"
                     -		"por %%mm6, %%mm2				\n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
                     -// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
+                    -
+                    -
                     -		PMINUB(%%mm2, %%mm1, %%mm4)			      // MIN(|lenergy|,|renergy|)/8
                     -		"movq %2, %%mm4					\n\t" // QP //FIXME QP+1 ?
                     -		"paddusb "MANGLE(b01)", %%mm4			\n\t"
                     -		"pcmpgtb %%mm3, %%mm4				\n\t" // |menergy|/8 < QP
                     -		"psubusb %%mm1, %%mm3				\n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
                     -		"pand %%mm4, %%mm3				\n\t"
+                    -
                     -		"movq %%mm3, %%mm1				\n\t"
                     -//		"psubusb "MANGLE(b01)", %%mm3			\n\t"
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm7, %%mm3)
                     -		"paddusb %%mm1, %%mm3				\n\t"
                     -//		"paddusb "MANGLE(b01)", %%mm3			\n\t"
+                    -
                     -		"movq (%%eax, %1, 2), %%mm6			\n\t" //l3
                     -		"movq (%0, %1, 4), %%mm5			\n\t" //l4
                     -		"movq (%0, %1, 4), %%mm4			\n\t" //l4
                     -		"psubusb %%mm6, %%mm5				\n\t"
                     -		"psubusb %%mm4, %%mm6				\n\t"
                     -		"por %%mm6, %%mm5				\n\t" // |l3-l4|
                     -		"pcmpeqb %%mm7, %%mm6				\n\t" // SIGN(l3-l4)
                     -		"pxor %%mm6, %%mm0				\n\t"
                     -		"pand %%mm0, %%mm3				\n\t"
                     -		PMINUB(%%mm5, %%mm3, %%mm0)
+                    -
                     -		"psubusb "MANGLE(b01)", %%mm3			\n\t"
                     -		PAVGB(%%mm7, %%mm3)
+                    -
                     -		"movq (%%eax, %1, 2), %%mm0			\n\t"
                     -		"movq (%0, %1, 4), %%mm2			\n\t"
                     -		"pxor %%mm6, %%mm0				\n\t"
                     -		"pxor %%mm6, %%mm2				\n\t"
                     -		"psubb %%mm3, %%mm0				\n\t"
                     -		"paddb %%mm3, %%mm2				\n\t"
                     -		"pxor %%mm6, %%mm0				\n\t"
                     -		"pxor %%mm6, %%mm2				\n\t"
                     -		"movq %%mm0, (%%eax, %1, 2)			\n\t"
                     -		"movq %%mm2, (%0, %1, 4)			\n\t"
                     -#endif
+                    -
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"pcmpeqb %%mm6, %%mm6				\n\t" // -1
                     -//	0	1	2	3	4	5	6	7
                     -//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
+                    -
+                    -
                     -		"movq (%%eax, %1, 2), %%mm1			\n\t" // l3
                     -		"movq (%0, %1, 4), %%mm0			\n\t" // l4
                     -		"pxor %%mm6, %%mm1				\n\t" // -l3-1
                     -		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
                     -// mm1=-l3-1, mm0=128-q
+                    -
                     -		"movq (%%eax, %1, 4), %%mm2			\n\t" // l5
                     -		"movq (%%eax, %1), %%mm3			\n\t" // l2
                     -		"pxor %%mm6, %%mm2				\n\t" // -l5-1
                     -		"movq %%mm2, %%mm5				\n\t" // -l5-1
                     -		"movq "MANGLE(b80)", %%mm4			\n\t" // 128
                     -		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     -		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
                     -		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
                     -		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
                     -		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
                     -// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
+                    -
                     -		"movq (%%eax), %%mm2				\n\t" // l1
                     -		"pxor %%mm6, %%mm2				\n\t" // -l1-1
                     -		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
                     -		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
                     -		"movq "MANGLE(b80)", %%mm3			\n\t" // 128
                     -		PAVGB(%%mm2, %%mm3)				      // ~(l2-l1)/4 + 128
                     -		PAVGB(%%mm1, %%mm3)				      // ~(l0-l3)/4 +(l2-l1)/8 + 128
                     -		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
                     -// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
+                    -
                     -		PAVGB((%%ecx, %1), %%mm5)			      // (l6-l5+256)/2
                     -		"movq (%%ecx, %1, 2), %%mm1			\n\t" // l7
                     -		"pxor %%mm6, %%mm1				\n\t" // -l7-1
                     -		PAVGB((%0, %1, 4), %%mm1)			      // (l4-l7+256)/2
                     -		"movq "MANGLE(b80)", %%mm2			\n\t" // 128
                     -		PAVGB(%%mm5, %%mm2)				      // ~(l6-l5)/4 + 128
                     -		PAVGB(%%mm1, %%mm2)				      // ~(l4-l7)/4 +(l6-l5)/8 + 128
                     -		PAVGB(%%mm5, %%mm2)				      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
                     -// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
+                    -
                     -		"movq "MANGLE(b00)", %%mm1			\n\t" // 0
                     -		"movq "MANGLE(b00)", %%mm5			\n\t" // 0
                     -		"psubb %%mm2, %%mm1				\n\t" // 128 - renergy/16
                     -		"psubb %%mm3, %%mm5				\n\t" // 128 - lenergy/16
                     -		PMAXUB(%%mm1, %%mm2)				      // 128 + |renergy/16|
                     - 		PMAXUB(%%mm5, %%mm3)				      // 128 + |lenergy/16|
                     -		PMINUB(%%mm2, %%mm3, %%mm1)			      // 128 + MIN(|lenergy|,|renergy|)/16
+                    -
                     -// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
+                    -
                     -		"movq "MANGLE(b00)", %%mm7			\n\t" // 0
                     -		"movq %2, %%mm2					\n\t" // QP
                     -		PAVGB(%%mm6, %%mm2)				      // 128 + QP/2
                     -		"psubb %%mm6, %%mm2				\n\t"
+                    -
                     -		"movq %%mm4, %%mm1				\n\t"
                     -		"pcmpgtb %%mm7, %%mm1				\n\t" // SIGN(menergy)
                     -		"pxor %%mm1, %%mm4				\n\t"
                     -		"psubb %%mm1, %%mm4				\n\t" // 128 + |menergy|/16
                     -		"pcmpgtb %%mm4, %%mm2				\n\t" // |menergy|/16 < QP/2
                     -		"psubusb %%mm3, %%mm4				\n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
                     -// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
+                    -
                     -		"movq %%mm4, %%mm3				\n\t" // d
                     -		"psubusb "MANGLE(b01)", %%mm4			\n\t"
                     -		PAVGB(%%mm7, %%mm4)				      // d/32
                     -		PAVGB(%%mm7, %%mm4)				      // (d + 32)/64
                     -		"paddb %%mm3, %%mm4				\n\t" // 5d/64
                     -		"pand %%mm2, %%mm4				\n\t"
+                    -
                     -		"movq "MANGLE(b80)", %%mm5			\n\t" // 128
                     -		"psubb %%mm0, %%mm5				\n\t" // q
                     -		"paddsb %%mm6, %%mm5				\n\t" // fix bad rounding
                     -		"pcmpgtb %%mm5, %%mm7				\n\t" // SIGN(q)
                     -		"pxor %%mm7, %%mm5				\n\t"
+                    -
                     -		PMINUB(%%mm5, %%mm4, %%mm3)			      // MIN(|q|, 5d/64)
                     -		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)
+                    -
                     -		"pand %%mm7, %%mm4				\n\t"
                     -		"movq (%%eax, %1, 2), %%mm0			\n\t"
                     -		"movq (%0, %1, 4), %%mm2			\n\t"
                     -		"pxor %%mm1, %%mm0				\n\t"
                     -		"pxor %%mm1, %%mm2				\n\t"
                     -		"paddb %%mm4, %%mm0				\n\t"
                     -		"psubb %%mm4, %%mm2				\n\t"
                     -		"pxor %%mm1, %%mm0				\n\t"
                     -		"pxor %%mm1, %%mm2				\n\t"
                     -		"movq %%mm0, (%%eax, %1, 2)			\n\t"
                     -		"movq %%mm2, (%0, %1, 4)			\n\t"
+                    -
                     -		:
                     -		: "r" (src), "r" (stride), "m" (c->pQPb)
                     -		: "%eax", "%ecx"
                     -	);
+                    -
                     -/*
                     -	{
                     -	int x;
                     -	src-= stride;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
                     -		if(ABS(middleEnergy)< 8*QP)
                     -		{
                     -			const int q=(src[l4] - src[l5])/2;
                     -			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                     -			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
+                    -
                     -			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                     -			d= MAX(d, 0);
+                    -
                     -			d= (5*d + 32) >> 6;
                     -			d*= SIGN(-middleEnergy);
+                    -
                     -			if(q>0)
                     -			{
                     -				d= d<0 ? 0 : d;
                     -				d= d>q ? q : d;
                     -			}
                     -			else
                     -			{
                     -				d= d>0 ? 0 : d;
                     -				d= d<q ? q : d;
                     -			}
+                    -
                     -        		src[l4]-= d;
                     -	        	src[l5]+= d;
                     -		}
                     -		src++;
                     -	}
                     -src-=8;
                     -	for(x=0; x<8; x++)
                     -	{
                     -		int y;
                     -		for(y=4; y<6; y++)
                     -		{
                     -			int d= src[x+y*stride] - tmp[x+(y-4)*8];
                     -			int ad= ABS(d);
                     -			static int max=0;
                     -			static int sum=0;
                     -			static int num=0;
                     -			static int bias=0;
+                    -
                     -			if(max<ad) max=ad;
                     -			sum+= ad>3 ? 1 : 0;
                     -			if(ad>3)
                     -			{
                     -				src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
                     -			}
                     -			if(y==4) bias+=d;
                     -			num++;
                     -			if(num%1000000 == 0)
                     -			{
                     -				printf(" %d %d %d %d\n", num, sum, max, bias);
                     -			}
                     -		}
                     -	}
                     -}
                     -*/
                     -#elif defined (HAVE_MMX)
                     -	src+= stride*4;
                     -	asm volatile(
                     -		"pxor %%mm7, %%mm7				\n\t"
                     -		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars
                     -		"andl $0xFFFFFFF8, %%ecx			\n\t" // align
                     -//	0	1	2	3	4	5	6	7
                     -//	%0	%0+%1	eax	eax+%1	eax+2%1	%0+2%1	eax+4%1	%0+4%1
                     -//	(eax = %0+2*%1; for lines 5 and 7, %0 has already been re-pointed to eax+%1; ecx addresses stack scratch; edx is not used in this branch)
+                    -
                     -		"movq (%0), %%mm0				\n\t"
                     -		"movq %%mm0, %%mm1				\n\t"
                     -		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
                     -		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
+                    -
                     -		"movq (%0, %1), %%mm2				\n\t"
                     -		"leal (%0, %1, 2), %%eax			\n\t"
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
                     -		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
+                    -
                     -		"movq (%%eax), %%mm4				\n\t"
                     -		"movq %%mm4, %%mm5				\n\t"
                     -		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
                     -		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
+                    -
                     -		"paddw %%mm0, %%mm0				\n\t" // 2L0
                     -		"paddw %%mm1, %%mm1				\n\t" // 2H0
                     -		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
                     -		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2
+                    -
                     -		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
                     -		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
+                    -
                     -		"movq (%%eax, %1), %%mm2			\n\t"
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"punpcklbw %%mm7, %%mm2				\n\t" // L3
                     -		"punpckhbw %%mm7, %%mm3				\n\t" // H3
+                    -
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
                     -		"movq %%mm0, (%%ecx)				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
                     -		"movq %%mm1, 8(%%ecx)				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+                    -
                     -		"movq (%%eax, %1, 2), %%mm0			\n\t"
                     -		"movq %%mm0, %%mm1				\n\t"
                     -		"punpcklbw %%mm7, %%mm0				\n\t" // L4
                     -		"punpckhbw %%mm7, %%mm1				\n\t" // H4
+                    -
                     -		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
                     -		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
                     -		"movq %%mm2, 16(%%ecx)				\n\t" // L3 - L4
                     -		"movq %%mm3, 24(%%ecx)				\n\t" // H3 - H4
                     -		"paddw %%mm4, %%mm4				\n\t" // 2L2
                     -		"paddw %%mm5, %%mm5				\n\t" // 2H2
                     -		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
                     -		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
+                    -
                     -		"leal (%%eax, %1), %0				\n\t"
                     -		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
                     -		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
                     -		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
                     -		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
                     -//50 opcodes so far
                     -		"movq (%0, %1, 2), %%mm2			\n\t"
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"punpcklbw %%mm7, %%mm2				\n\t" // L5
                     -		"punpckhbw %%mm7, %%mm3				\n\t" // H5
                     -		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
                     -		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
                     -		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
                     -		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
+                    -
                     -		"movq (%%eax, %1, 4), %%mm6			\n\t"
                     -		"punpcklbw %%mm7, %%mm6				\n\t" // L6
                     -		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
                     -		"movq (%%eax, %1, 4), %%mm6			\n\t"
                     -		"punpckhbw %%mm7, %%mm6				\n\t" // H6
                     -		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
+                    -
                     -		"paddw %%mm0, %%mm0				\n\t" // 2L4
                     -		"paddw %%mm1, %%mm1				\n\t" // 2H4
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6
+                    -
                     -		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
                     -		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6
+                    -
                     -		"movq (%0, %1, 4), %%mm2			\n\t"
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"punpcklbw %%mm7, %%mm2				\n\t" // L7
                     -		"punpckhbw %%mm7, %%mm3				\n\t" // H7
+                    -
                     -		"paddw %%mm2, %%mm2				\n\t" // 2L7
                     -		"paddw %%mm3, %%mm3				\n\t" // 2H7
                     -		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
                     -		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
+                    -
                     -		"movq (%%ecx), %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
                     -		"movq 8(%%ecx), %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
+                    -
                     -#ifdef HAVE_MMX2
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"psubw %%mm0, %%mm6				\n\t"
                     -		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"psubw %%mm1, %%mm6				\n\t"
                     -		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"psubw %%mm2, %%mm6				\n\t"
                     -		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"psubw %%mm3, %%mm6				\n\t"
                     -		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
                     -#else
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"pcmpgtw %%mm0, %%mm6				\n\t"
                     -		"pxor %%mm6, %%mm0				\n\t"
                     -		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"pcmpgtw %%mm1, %%mm6				\n\t"
                     -		"pxor %%mm6, %%mm1				\n\t"
                     -		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"pcmpgtw %%mm2, %%mm6				\n\t"
                     -		"pxor %%mm6, %%mm2				\n\t"
                     -		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"pcmpgtw %%mm3, %%mm6				\n\t"
                     -		"pxor %%mm6, %%mm3				\n\t"
                     -		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
                     -#endif
+                    -
                     -#ifdef HAVE_MMX2
                     -		"pminsw %%mm2, %%mm0				\n\t"
                     -		"pminsw %%mm3, %%mm1				\n\t"
                     -#else
                     -		"movq %%mm0, %%mm6				\n\t"
                     -		"psubusw %%mm2, %%mm6				\n\t"
                     -		"psubw %%mm6, %%mm0				\n\t"
                     -		"movq %%mm1, %%mm6				\n\t"
                     -		"psubusw %%mm3, %%mm6				\n\t"
                     -		"psubw %%mm6, %%mm1				\n\t"
                     -#endif
+                    -
                     -		"movq %%mm7, %%mm6				\n\t" // 0
                     -		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
                     -		"pxor %%mm6, %%mm4				\n\t"
                     -		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
                     -		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
                     -		"pxor %%mm7, %%mm5				\n\t"
                     -		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
                     -// 100 opcodes
                     -		"movd %2, %%mm2					\n\t" // QP
                     -		"psllw $3, %%mm2				\n\t" // 8QP
                     -		"movq %%mm2, %%mm3				\n\t" // 8QP
                     -		"pcmpgtw %%mm4, %%mm2				\n\t"
                     -		"pcmpgtw %%mm5, %%mm3				\n\t"
                     -		"pand %%mm2, %%mm4				\n\t"
                     -		"pand %%mm3, %%mm5				\n\t"
+                    -
+                    -
                     -		"psubusw %%mm0, %%mm4				\n\t" // hd
                     -		"psubusw %%mm1, %%mm5				\n\t" // ld
+                    -
+                    -
                     -		"movq "MANGLE(w05)", %%mm2			\n\t" // 5
                     -		"pmullw %%mm2, %%mm4				\n\t"
                     -		"pmullw %%mm2, %%mm5				\n\t"
                     -		"movq "MANGLE(w20)", %%mm2			\n\t" // 32
                     -		"paddw %%mm2, %%mm4				\n\t"
                     -		"paddw %%mm2, %%mm5				\n\t"
                     -		"psrlw $6, %%mm4				\n\t"
                     -		"psrlw $6, %%mm5				\n\t"
+                    -
                     -		"movq 16(%%ecx), %%mm0				\n\t" // L3 - L4
                     -		"movq 24(%%ecx), %%mm1				\n\t" // H3 - H4
+                    -
                     -		"pxor %%mm2, %%mm2				\n\t"
                     -		"pxor %%mm3, %%mm3				\n\t"
+                    -
                     -		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
                     -		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
                     -		"pxor %%mm2, %%mm0				\n\t"
                     -		"pxor %%mm3, %%mm1				\n\t"
                     -		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
                     -		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
                     -		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
                     -		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2
+                    -
                     -		"pxor %%mm6, %%mm2				\n\t"
                     -		"pxor %%mm7, %%mm3				\n\t"
                     -		"pand %%mm2, %%mm4				\n\t"
                     -		"pand %%mm3, %%mm5				\n\t"
+                    -
                     -#ifdef HAVE_MMX2
                     -		"pminsw %%mm0, %%mm4				\n\t"
                     -		"pminsw %%mm1, %%mm5				\n\t"
                     -#else
                     -		"movq %%mm4, %%mm2				\n\t"
                     -		"psubusw %%mm0, %%mm2				\n\t"
                     -		"psubw %%mm2, %%mm4				\n\t"
                     -		"movq %%mm5, %%mm2				\n\t"
                     -		"psubusw %%mm1, %%mm2				\n\t"
                     -		"psubw %%mm2, %%mm5				\n\t"
                     -#endif
                     -		"pxor %%mm6, %%mm4				\n\t"
                     -		"pxor %%mm7, %%mm5				\n\t"
                     -		"psubw %%mm6, %%mm4				\n\t"
                     -		"psubw %%mm7, %%mm5				\n\t"
                     -		"packsswb %%mm5, %%mm4				\n\t"
                     -		"movq (%0), %%mm0				\n\t"
                     -		"paddb   %%mm4, %%mm0				\n\t"
                     -		"movq %%mm0, (%0)				\n\t"
                     -		"movq (%0, %1), %%mm0				\n\t"
                     -		"psubb %%mm4, %%mm0				\n\t"
                     -		"movq %%mm0, (%0, %1)				\n\t"
+                    -
                     -		: "+r" (src)
                     -		: "r" (stride), "m" (c->pQPb)
                     -		: "%eax", "%ecx"
                     -	);
                     -#else
                     -	const int l1= stride;
                     -	const int l2= stride + l1;
                     -	const int l3= stride + l2;
                     -	const int l4= stride + l3;
                     -	const int l5= stride + l4;
                     -	const int l6= stride + l5;
                     -	const int l7= stride + l6;
                     -	const int l8= stride + l7;
                     -//	const int l9= stride + l8;
                     -	int x;
                     -	src+= stride*3;
                     -	for(x=0; x<BLOCK_SIZE; x++)
                     -	{
                     -		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
                     -		if(ABS(middleEnergy) < 8*c->QP)
                     -		{
                     -			const int q=(src[l4] - src[l5])/2;
                     -			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
                     -			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
+                    -
                     -			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
                     -			d= MAX(d, 0);
+                    -
                     -			d= (5*d + 32) >> 6;
                     -			d*= SIGN(-middleEnergy);
+                    -
                     -			if(q>0)
                     -			{
                     -				d= d<0 ? 0 : d;
                     -				d= d>q ? q : d;
                     -			}
                     -			else
                     -			{
                     -				d= d>0 ? 0 : d;
                     -				d= d<q ? q : d;
                     -			}
+                    -
                     -        		src[l4]-= d;
                     -	        	src[l5]+= d;
                     -		}
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	asm volatile(
                     -		"pxor %%mm6, %%mm6				\n\t"
                     -		"pcmpeqb %%mm7, %%mm7				\n\t"
                     -		"movq %2, %%mm0					\n\t"
                     -		"punpcklbw %%mm6, %%mm0				\n\t"
                     -		"psrlw $1, %%mm0				\n\t"
                     -		"psubw %%mm7, %%mm0				\n\t"
                     -		"packuswb %%mm0, %%mm0				\n\t"
                     -		"movq %%mm0, %3					\n\t"
+                    -
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%edx			\n\t"
+                    -
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
+                    -
                     -#undef FIND_MIN_MAX // FIND_MIN_MAX(addr): fold the 8-byte row at addr into the running per-byte max (mm6) and min (mm7); mm0 is scratch
                     -#ifdef HAVE_MMX2 // MMX2: use the native unsigned byte min/max instructions
                     -#define FIND_MIN_MAX(addr) /* addr is pasted as an asm memory operand */\
                     -		"movq " #addr ", %%mm0				\n\t" /* mm0 = row */\
                     -		"pminub %%mm0, %%mm7				\n\t" /* mm7 = min(mm7, row) per byte */\
                     -		"pmaxub %%mm0, %%mm6				\n\t" /* mm6 = max(mm6, row) per byte */
                     -#else // no MMX2: emulate min/max with saturating subtract; additionally clobbers mm1
                     -#define FIND_MIN_MAX(addr) /* addr is pasted as an asm memory operand */\
                     -		"movq " #addr ", %%mm0				\n\t" /* mm0 = row */\
                     -		"movq %%mm7, %%mm1				\n\t" /* mm1 = copy of running min */\
                     -		"psubusb %%mm0, %%mm6				\n\t" /* mm6 = saturate(max - row) */\
                     -		"paddb %%mm0, %%mm6				\n\t" /* mm6 = max(old max, row) */\
                     -		"psubusb %%mm0, %%mm1				\n\t" /* mm1 = saturate(min - row) */\
                     -		"psubb %%mm1, %%mm7				\n\t" /* mm7 = min(old min, row) */
                     -#endif
+                    -
                     -FIND_MIN_MAX((%%eax))
                     -FIND_MIN_MAX((%%eax, %1))
                     -FIND_MIN_MAX((%%eax, %1, 2))
                     -FIND_MIN_MAX((%0, %1, 4))
                     -FIND_MIN_MAX((%%edx))
                     -FIND_MIN_MAX((%%edx, %1))
                     -FIND_MIN_MAX((%%edx, %1, 2))
                     -FIND_MIN_MAX((%0, %1, 8))
+                    -
                     -		"movq %%mm7, %%mm4				\n\t"
                     -		"psrlq $8, %%mm7				\n\t"
                     -#ifdef HAVE_MMX2
                     -		"pminub %%mm4, %%mm7				\n\t" // min of pixels
                     -		"pshufw $0xF9, %%mm7, %%mm4			\n\t"
                     -		"pminub %%mm4, %%mm7				\n\t" // min of pixels
                     -		"pshufw $0xFE, %%mm7, %%mm4			\n\t"
                     -		"pminub %%mm4, %%mm7				\n\t"
                     -#else
                     -		"movq %%mm7, %%mm1				\n\t"
                     -		"psubusb %%mm4, %%mm1				\n\t"
                     -		"psubb %%mm1, %%mm7				\n\t"
                     -		"movq %%mm7, %%mm4				\n\t"
                     -		"psrlq $16, %%mm7				\n\t"
                     -		"movq %%mm7, %%mm1				\n\t"
                     -		"psubusb %%mm4, %%mm1				\n\t"
                     -		"psubb %%mm1, %%mm7				\n\t"
                     -		"movq %%mm7, %%mm4				\n\t"
                     -		"psrlq $32, %%mm7				\n\t"
                     -		"movq %%mm7, %%mm1				\n\t"
                     -		"psubusb %%mm4, %%mm1				\n\t"
                     -		"psubb %%mm1, %%mm7				\n\t"
                     -#endif
+                    -
+                    -
                     -		"movq %%mm6, %%mm4				\n\t"
                     -		"psrlq $8, %%mm6				\n\t"
                     -#ifdef HAVE_MMX2
                     -		"pmaxub %%mm4, %%mm6				\n\t" // max of pixels
                     -		"pshufw $0xF9, %%mm6, %%mm4			\n\t"
                     -		"pmaxub %%mm4, %%mm6				\n\t"
                     -		"pshufw $0xFE, %%mm6, %%mm4			\n\t"
                     -		"pmaxub %%mm4, %%mm6				\n\t"
                     -#else
                     -		"psubusb %%mm4, %%mm6				\n\t"
                     -		"paddb %%mm4, %%mm6				\n\t"
                     -		"movq %%mm6, %%mm4				\n\t"
                     -		"psrlq $16, %%mm6				\n\t"
                     -		"psubusb %%mm4, %%mm6				\n\t"
                     -		"paddb %%mm4, %%mm6				\n\t"
                     -		"movq %%mm6, %%mm4				\n\t"
                     -		"psrlq $32, %%mm6				\n\t"
                     -		"psubusb %%mm4, %%mm6				\n\t"
                     -		"paddb %%mm4, %%mm6				\n\t"
                     -#endif
                     -		"movq %%mm6, %%mm0				\n\t" // max
                     -		"psubb %%mm7, %%mm6				\n\t" // max - min
                     -		"movd %%mm6, %%ecx				\n\t"
                     -		"cmpb "MANGLE(deringThreshold)", %%cl		\n\t"
                     -		" jb 1f						\n\t"
                     -		"leal -24(%%esp), %%ecx				\n\t"
                     -		"andl $0xFFFFFFF8, %%ecx			\n\t"
                     -		PAVGB(%%mm0, %%mm7)				      // a=(max + min)/2
                     -		"punpcklbw %%mm7, %%mm7				\n\t"
                     -		"punpcklbw %%mm7, %%mm7				\n\t"
                     -		"punpcklbw %%mm7, %%mm7				\n\t"
                     -		"movq %%mm7, (%%ecx)				\n\t"
+                    -
                     -		"movq (%0), %%mm0				\n\t" // L10
                     -		"movq %%mm0, %%mm1				\n\t" // L10
                     -		"movq %%mm0, %%mm2				\n\t" // L10
                     -		"psllq $8, %%mm1				\n\t"
                     -		"psrlq $8, %%mm2				\n\t"
                     -		"movd -4(%0), %%mm3				\n\t"
                     -		"movd 8(%0), %%mm4				\n\t"
                     -		"psrlq $24, %%mm3				\n\t"
                     -		"psllq $56, %%mm4				\n\t"
                     -		"por %%mm3, %%mm1				\n\t" // L00
                     -		"por %%mm4, %%mm2				\n\t" // L20
                     -		"movq %%mm1, %%mm3				\n\t" // L00
                     -		PAVGB(%%mm2, %%mm1)				      // (L20 + L00)/2
                     -		PAVGB(%%mm0, %%mm1)				      // (L20 + L00 + 2L10)/4
                     -		"psubusb %%mm7, %%mm0				\n\t"
                     -		"psubusb %%mm7, %%mm2				\n\t"
                     -		"psubusb %%mm7, %%mm3				\n\t"
                     -		"pcmpeqb "MANGLE(b00)", %%mm0			\n\t" // L10 > a ? 0 : -1
                     -		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L20 > a ? 0 : -1
                     -		"pcmpeqb "MANGLE(b00)", %%mm3			\n\t" // L00 > a ? 0 : -1
                     -		"paddb %%mm2, %%mm0				\n\t"
                     -		"paddb %%mm3, %%mm0				\n\t"
+                    -
                     -		"movq (%%eax), %%mm2				\n\t" // L11
                     -		"movq %%mm2, %%mm3				\n\t" // L11
                     -		"movq %%mm2, %%mm4				\n\t" // L11
                     -		"psllq $8, %%mm3				\n\t"
                     -		"psrlq $8, %%mm4				\n\t"
                     -		"movd -4(%%eax), %%mm5				\n\t"
                     -		"movd 8(%%eax), %%mm6				\n\t"
                     -		"psrlq $24, %%mm5				\n\t"
                     -		"psllq $56, %%mm6				\n\t"
                     -		"por %%mm5, %%mm3				\n\t" // L01
                     -		"por %%mm6, %%mm4				\n\t" // L21
                     -		"movq %%mm3, %%mm5				\n\t" // L01
                     -		PAVGB(%%mm4, %%mm3)				      // (L21 + L01)/2
                     -		PAVGB(%%mm2, %%mm3)				      // (L21 + L01 + 2L11)/4
                     -		"psubusb %%mm7, %%mm2				\n\t"
                     -		"psubusb %%mm7, %%mm4				\n\t"
                     -		"psubusb %%mm7, %%mm5				\n\t"
                     -		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L11 > a ? 0 : -1
                     -		"pcmpeqb "MANGLE(b00)", %%mm4			\n\t" // L21 > a ? 0 : -1
                     -		"pcmpeqb "MANGLE(b00)", %%mm5			\n\t" // L01 > a ? 0 : -1
                     -		"paddb %%mm4, %%mm2				\n\t"
                     -		"paddb %%mm5, %%mm2				\n\t"
                     -// 0, 2, 3, 1
                     -#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) /* one row step: smooth row src horizontally into lx, blend it with pplx/plx, clamp around dst by %3, then rewrite dst with a per-lane mix of the filtered value and the old dst; uses (%%ecx) and 8(%%ecx) as memory slots */ \
                     -		"movq " #src ", " #sx "				\n\t" /* src[0] */\
                     -		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
                     -		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
                     -		"psllq $8, " #lx "				\n\t" /* lane i now holds src[i-1], lane 0 is empty */\
                     -		"psrlq $8, " #t0 "				\n\t" /* lane i now holds src[i+1], lane 7 is empty */\
                     -		"movd -4" #src ", " #t1 "			\n\t" /* the 4 bytes just before the row */\
                     -		"psrlq $24, " #t1 "				\n\t" /* keep only the byte at offset -1, in lane 0 */\
                     -		"por " #t1 ", " #lx "				\n\t" /* src[-1] */\
                     -		"movd 8" #src ", " #t1 "			\n\t" /* the 4 bytes just after the row */\
                     -		"psllq $56, " #t1 "				\n\t" /* keep only the byte at offset 8, in lane 7 */\
                     -		"por " #t1 ", " #t0 "				\n\t" /* src[+1] */\
                     -		"movq " #lx ", " #t1 "				\n\t" /* src[-1] */\
                     -		PAVGB(t0, lx)				              /* (src[-1] + src[+1])/2 */\
                     -		PAVGB(sx, lx)				      /* (src[-1] + 2src[0] + src[+1])/4 */\
                     -		PAVGB(lx, pplx)					      /* average this row's smoothed value into pplx */\
                     -		"movq " #lx ", 8(%%ecx)				\n\t" /* spill the smoothed row; lx is reused below */\
                     -		"movq (%%ecx), " #lx "				\n\t" /* lx = value kept at (%%ecx), called a below */\
                     -		"psubusb " #lx ", " #t1 "			\n\t" /* saturate(src[-1] - a) */\
                     -		"psubusb " #lx ", " #t0 "			\n\t" /* saturate(src[+1] - a) */\
                     -		"psubusb " #lx ", " #sx "			\n\t" /* saturate(src[0] - a) */\
                     -		"movq "MANGLE(b00)", " #lx "			\n\t" /* lx = b00 constant (presumably all-zero bytes) */\
                     -		"pcmpeqb " #lx ", " #t1 "			\n\t" /* src[-1] > a ? 0 : -1*/\
                     -		"pcmpeqb " #lx ", " #t0 "			\n\t" /* src[+1] > a ? 0 : -1*/\
                     -		"pcmpeqb " #lx ", " #sx "			\n\t" /* src[0]  > a ? 0 : -1*/\
                     -		"paddb " #t1 ", " #t0 "				\n\t" /* sum the left and right masks */\
                     -		"paddb " #t0 ", " #sx "				\n\t" /* sx = minus the number of the 3 pixels that are not above a */\
                     -\
                     -		PAVGB(plx, pplx)				      /* filtered */\
                     -		"movq " #dst ", " #t0 "				\n\t" /* dst */\
                     -		"movq " #t0 ", " #t1 "				\n\t" /* dst */\
                     -		"psubusb %3, " #t0 "				\n\t" /* lower bound: dst - %3, saturated */\
                     -		"paddusb %3, " #t1 "				\n\t" /* upper bound: dst + %3, saturated */\
                     -		PMAXUB(t0, pplx) /* presumably pplx = max(pplx, lower bound) */\
                     -		PMINUB(t1, pplx, t0) /* presumably pplx = min(pplx, upper bound), t0 as scratch */\
                     -		"paddb " #sx ", " #ppsx "			\n\t" /* ppsx += this row's counter */\
                     -		"paddb " #psx ", " #ppsx "			\n\t" /* ppsx += psx counter */\
                     -		"#paddb "MANGLE(b02)", " #ppsx "		\n\t" /* NOTE(review): leading '#' in the asm text presumably comments this instruction out - confirm */\
                     -		"pand "MANGLE(b08)", " #ppsx "			\n\t" /* mask the summed counter with the b08 constant */\
                     -		"pcmpeqb " #lx ", " #ppsx "			\n\t" /* lanes whose masked counter equals lx (b00) become -1 */\
                     -		"pand " #ppsx ", " #pplx "			\n\t" /* keep the filtered value in the selected lanes */\
                     -		"pandn " #dst ", " #ppsx "			\n\t" /* keep the old dst in the other lanes */\
                     -		"por " #pplx ", " #ppsx "			\n\t" /* merge both halves */\
                     -		"movq " #ppsx ", " #dst "			\n\t" /* write the row back */\
                     -		"movq 8(%%ecx), " #lx "				\n\t" /* restore the smoothed row spilled above */
+                    -
                     -/*
                     -0000000
                     -1111111
+                    -
                     -1111110
                     -1111101
                     -1111100
                     -1111011
                     -1111010
                     -1111001
+                    -
                     -1111000
                     -1110111
+                    -
                     -*/
                     -//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
                     -DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
                     -DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
                     -DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
                     -DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
                     -DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
                     -DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
                     -DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
                     -DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
+                    -
                     -		"1:			\n\t"
                     -		: : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
                     -		: "%eax", "%edx", "%ecx"
                     -	);
                     -#else
                     -	int y;
                     -	int min=255;
                     -	int max=0;
                     -	int avg;
                     -	uint8_t *p;
                     -	int s[10];
                     -	const int QP2= c->QP/2 + 1;
+                    -
                     -	for(y=1; y<9; y++)
                     -	{
                     -		int x;
                     -		p= src + stride*y;
                     -		for(x=1; x<9; x++)
                     -		{
                     -			p++;
                     -			if(*p > max) max= *p;
                     -			if(*p < min) min= *p;
                     -		}
                     -	}
                     -	avg= (min + max + 1)>>1;
+                    -
                     -	if(max - min <deringThreshold) return;
+                    -
                     -	for(y=0; y<10; y++)
                     -	{
                     -		int t = 0;
+                    -
                     -		if(src[stride*y + 0] > avg) t+= 1;
                     -		if(src[stride*y + 1] > avg) t+= 2;
                     -		if(src[stride*y + 2] > avg) t+= 4;
                     -		if(src[stride*y + 3] > avg) t+= 8;
                     -		if(src[stride*y + 4] > avg) t+= 16;
                     -		if(src[stride*y + 5] > avg) t+= 32;
                     -		if(src[stride*y + 6] > avg) t+= 64;
                     -		if(src[stride*y + 7] > avg) t+= 128;
                     -		if(src[stride*y + 8] > avg) t+= 256;
                     -		if(src[stride*y + 9] > avg) t+= 512;
+                    -
                     -		t |= (~t)<<16;
                     -		t &= (t<<1) & (t>>1);
                     -		s[y] = t;
                     -	}
+                    -
                     -	for(y=1; y<9; y++)
                     -	{
                     -		int t = s[y-1] & s[y] & s[y+1];
                     -		t|= t>>16;
                     -		s[y-1]= t;
                     -	}
+                    -
                     -	for(y=1; y<9; y++)
                     -	{
                     -		int x;
                     -		int t = s[y-1];
+                    -
                     -		p= src + stride*y;
                     -		for(x=1; x<9; x++)
                     -		{
                     -			p++;
                     -			if(t & (1<<x))
                     -			{
                     -				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
                     -				      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
                     -				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
                     -				f= (f + 8)>>4;
+                    -
                     -#ifdef DEBUG_DERING_THRESHOLD
                     -				asm volatile("emms\n\t":);
                     -				{
                     -				static long long numPixels=0;
                     -				if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
                     -//				if((max-min)<20 || (max-min)*QP<200)
                     -//				if((max-min)*QP < 500)
                     -//				if(max-min<QP/2)
                     -				if(max-min < 20)
                     -				{
                     -					static int numSkiped=0;
                     -					static int errorSum=0;
                     -					static int worstQP=0;
                     -					static int worstRange=0;
                     -					static int worstDiff=0;
                     -					int diff= (f - *p);
                     -					int absDiff= ABS(diff);
                     -					int error= diff*diff;
+                    -
                     -					if(x==1 || x==8 || y==1 || y==8) continue;
+                    -
                     -					numSkiped++;
                     -					if(absDiff > worstDiff)
                     -					{
                     -						worstDiff= absDiff;
                     -						worstQP= QP;
                     -						worstRange= max-min;
                     -					}
                     -					errorSum+= error;
+                    -
                     -					if(1024LL*1024LL*1024LL % numSkiped == 0)
                     -					{
                     -						printf( "sum:%1.3f, skip:%d, wQP:%d, "
                     -							"wRange:%d, wDiff:%d, relSkip:%1.3f\n",
                     -							(float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
                     -							worstDiff, (float)numSkiped/numPixels);
                     -					}
                     -				}
                     -				}
                     -#endif
                     -				if     (*p + QP2 < f) *p= *p + QP2;
                     -				else if(*p - QP2 > f) *p= *p - QP2;
                     -				else *p=f;
                     -			}
                     -		}
                     -	}
                     -#ifdef DEBUG_DERING_THRESHOLD
                     -	if(max-min < 20)
                     -	{
                     -		for(y=1; y<9; y++)
                     -		{
                     -			int x;
                     -			int t = 0;
                     -			p= src + stride*y;
                     -			for(x=1; x<9; x++)
                     -			{
                     -				p++;
                     -				*p = MIN(*p + 20, 255);
                     -			}
                     -		}
                     -//		src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
                     -	}
                     -#endif
                     -#endif
                     -}
+                    -
                     -/**
                     - * Deinterlaces the given block by linear interpolation: each of the lines 5, 7, 9, 11
                     - * is replaced by the average of the lines directly above and below it
                     - * will be called for every 8x8 block and can read & write from line 4-15
                     - * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
                     - * lines 4-12 will be read into the deblocking filter and should be deinterlaced
                     - * this filter will read lines 4, 6, 8, 10, 12 and write lines 5, 7, 9, 11
                     - * @param src pointer to line 0 of the block, 8 pixels per line are processed
                     - * @param stride offset in bytes from one line to the next
                     - * NOTE(review): the C version truncates ((a+b)>>1); PAVGB is defined outside this
                     - * function and presumably rounds up like pavgb/pavgusb - confirm
                     - */
                     -static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= 4*stride;
                     -	asm volatile(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%ecx			\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
+                    -
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"movq (%%eax, %1), %%mm1			\n\t" // L2
                     -		PAVGB(%%mm1, %%mm0)				      // (L0 + L2)/2
                     -		"movq %%mm0, (%%eax)				\n\t" // -> L1
                     -		"movq (%0, %1, 4), %%mm0			\n\t" // L4
                     -		PAVGB(%%mm0, %%mm1)				      // (L2 + L4)/2
                     -		"movq %%mm1, (%%eax, %1, 2)			\n\t" // -> L3
                     -		"movq (%%ecx, %1), %%mm1			\n\t" // L6
                     -		PAVGB(%%mm1, %%mm0)				      // (L4 + L6)/2
                     -		"movq %%mm0, (%%ecx)				\n\t" // -> L5
                     -		"movq (%0, %1, 8), %%mm0			\n\t" // L8
                     -		PAVGB(%%mm0, %%mm1)				      // (L6 + L8)/2
                     -		"movq %%mm1, (%%ecx, %1, 2)			\n\t" // -> L7
+                    -
                     -		: : "r" (src), "r" (stride)
                     -		: "%eax", "%ecx"
                     -	);
                     -#else
                     -	int x;
                     -	src+= 4*stride;
                     -	for(x=0; x<8; x++)
                     -	{
                     -		src[stride]   = (src[0]        + src[stride*2])>>1;
                     -		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
                     -		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
                     -		src[stride*7] = (src[stride*6] + src[stride*8])>>1;
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -/**
                     - * Deinterlaces the given block by cubic interpolation: every rewritten line becomes
                     - * (-a + 9b + 9d - e)/16, where b and d are the lines directly above and below it
                     - * and a and e are the lines three above and three below it
                     - * will be called for every 8x8 block and can read & write from line 4-15
                     - * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
                     - * lines 4-12 will be read into the deblocking filter and should be deinterlaced
                     - * this filter will read lines 3-15 (odd lines only) and write lines 6, 8, 10, 12
                     - * the result is clipped to 0..255 in the asm version (packuswb) and in the C version
                     - * @param src pointer to line 0 of the block, 8 pixels per line are processed
                     - * @param stride offset in bytes from one line to the next
                     - */
                     -static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= stride*3;
                     -	asm volatile(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%edx			\n\t"
                     -		"leal (%%edx, %1, 4), %%ecx			\n\t"
                     -		"addl %1, %%ecx					\n\t"
                     -		"pxor %%mm7, %%mm7				\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9	10
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
+                    -
                     -#define DEINT_CUBIC(a,b,c,d,e)\
                     -		"movq " #a ", %%mm0				\n\t"\
                     -		"movq " #b ", %%mm1				\n\t"\
                     -		"movq " #d ", %%mm2				\n\t"\
                     -		"movq " #e ", %%mm3				\n\t"\
                     -		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
                     -		PAVGB(%%mm3, %%mm0)					/* (a+e) /2 */\
                     -		"movq %%mm0, %%mm2				\n\t"\
                     -		"punpcklbw %%mm7, %%mm0				\n\t"\
                     -		"punpckhbw %%mm7, %%mm2				\n\t"\
                     -		"movq %%mm1, %%mm3				\n\t"\
                     -		"punpcklbw %%mm7, %%mm1				\n\t"\
                     -		"punpckhbw %%mm7, %%mm3				\n\t"\
                     -		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
                     -		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
                     -		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
                     -		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
                     -		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
                     -		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
                     -		"packuswb %%mm3, %%mm1				\n\t"\
                     -		"movq %%mm1, " #c "				\n\t"
+                    -
                     -DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
                     -DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
                     -DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
                     -DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
+                    -
                     -		: : "r" (src), "r" (stride)
                     -		: "%eax", "%edx", "ecx"
                     -	);
                     -#else
                     -	int x;
                     -	src+= stride*3;
                     -	for(x=0; x<8; x++)
                     -	{
                     -		int i;
                     -		/* rewritten lines are 3, 5, 7, 9 (relative to src); all taps are lines that are never written */
                     -		for(i=3; i<=9; i+=2)
                     -		{
                     -			const int v= (-src[stride*(i-3)] + 9*src[stride*(i-1)]
                     -			             + 9*src[stride*(i+1)] - src[stride*(i+3)])>>4;
                     -			/* v can leave 0..255 (e.g. -32 for a=e=255, b=d=0); saturate like packuswb instead of wrapping */
                     -			src[stride*i]= v<0 ? 0 : (v>255 ? 255 : v);
                     -		}
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -/**
                     - * Deinterlaces the given block with the vertical filter (-1 4 2 4 -1)/8, centered on
                     - * the line that is rewritten; the -1 tap above always uses the unfiltered line
                     - * will be called for every 8x8 block and can read & write from line 4-15
                     - * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
                     - * lines 4-12 will be read into the deblocking filter and should be deinterlaced
                     - * this filter will read lines 4-13 and write 5-11
                     - * the result is clipped to 0..255 in the asm version (packuswb) and in the C version
                     - * @param src pointer to line 0 of the block, 8 pixels per line are processed
                     - * @param stride offset in bytes from one line to the next
                     - * @param tmp 8 bytes of state: read as the unfiltered line two above the first rewritten
                     - *            line, and overwritten with the unfiltered contents of the last rewritten line
                     - */
                     -static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= stride*4;
                     -	asm volatile(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%edx			\n\t"
                     -		"pxor %%mm7, %%mm7				\n\t"
                     -		"movq (%2), %%mm0				\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9	10
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
+                    -
                     -#define DEINT_FF(a,b,c,d)\
                     -		"movq " #a ", %%mm1				\n\t"\
                     -		"movq " #b ", %%mm2				\n\t"\
                     -		"movq " #c ", %%mm3				\n\t"\
                     -		"movq " #d ", %%mm4				\n\t"\
                     -		PAVGB(%%mm3, %%mm1)					\
                     -		PAVGB(%%mm4, %%mm0)					\
                     -		"movq %%mm0, %%mm3				\n\t"\
                     -		"punpcklbw %%mm7, %%mm0				\n\t"\
                     -		"punpckhbw %%mm7, %%mm3				\n\t"\
                     -		"movq %%mm1, %%mm4				\n\t"\
                     -		"punpcklbw %%mm7, %%mm1				\n\t"\
                     -		"punpckhbw %%mm7, %%mm4				\n\t"\
                     -		"psllw $2, %%mm1				\n\t"\
                     -		"psllw $2, %%mm4				\n\t"\
                     -		"psubw %%mm0, %%mm1				\n\t"\
                     -		"psubw %%mm3, %%mm4				\n\t"\
                     -		"movq %%mm2, %%mm5				\n\t"\
                     -		"movq %%mm2, %%mm0				\n\t"\
                     -		"punpcklbw %%mm7, %%mm2				\n\t"\
                     -		"punpckhbw %%mm7, %%mm5				\n\t"\
                     -		"paddw %%mm2, %%mm1				\n\t"\
                     -		"paddw %%mm5, %%mm4				\n\t"\
                     -		"psraw $2, %%mm1				\n\t"\
                     -		"psraw $2, %%mm4				\n\t"\
                     -		"packuswb %%mm4, %%mm1				\n\t"\
                     -		"movq %%mm1, " #b "				\n\t"\
+                    -
                     -DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
                     -DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
                     -DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
                     -DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
+                    -
                     -		"movq %%mm0, (%2)				\n\t"
                     -		: : "r" (src), "r" (stride), "r"(tmp)
                     -		: "%eax", "%edx"
                     -	);
                     -#else
                     -	int x;
                     -	src+= stride*4;
                     -	for(x=0; x<8; x++)
                     -	{
                     -		int above= tmp[x];	/* unfiltered line two above the line being rewritten */
                     -		int cur, v, y;
                     -		/* rewritten lines are 1, 3, 5, 7 (relative to src), same as the four DEINT_FF calls */
                     -		for(y=1; y<8; y+=2)
                     -		{
                     -			/* keep the unfiltered center line: it is the 2x tap now and the -1 tap of the next step */
                     -			/* (the previous C code reloaded it from lines 4/6/8, which did not match the asm path) */
                     -			cur= src[stride*y];
                     -			v= (-above + 4*src[stride*(y-1)] + 2*cur + 4*src[stride*(y+1)] - src[stride*(y+2)] + 4)>>3;
                     -			/* saturate like packuswb instead of wrapping modulo 256 */
                     -			src[stride*y]= v<0 ? 0 : (v>255 ? 255 : v);
                     -			above= cur;
                     -		}
                     -		tmp[x]= above;	/* unfiltered line 7, matches the final "movq %%mm0, (%2)" of the asm path */
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -/**
                     - * Deinterlaces the given block by blending: every written line becomes
                     - * (line + 2*(line below) + (line two below))/4
                     - * will be called for every 8x8 block and can read & write from line 4-15
                     - * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
                     - * lines 4-12 will be read into the deblocking filter and should be deinterlaced
                     - * will shift the image up by 1 line (FIXME if this is a problem)
                     - * this filter will read lines 4-13 and write 4-11
                     - * @param src pointer to line 0 of the block, 8 pixels per line are processed
                     - * @param stride offset in bytes from one line to the next
                     - */
                     -static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
                     -{
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	src+= 4*stride;
                     -	asm volatile(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%edx			\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
+                    -
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"movq (%%eax, %1), %%mm1			\n\t" // L2
                     -		PAVGB(%%mm1, %%mm0)				      // L0+L2
                     -		"movq (%%eax), %%mm2				\n\t" // L1
                     -		PAVGB(%%mm2, %%mm0)				      // 2L1 + L0 + L2
                     -		"movq %%mm0, (%0)				\n\t" // -> L0
                     -		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
                     -		PAVGB(%%mm0, %%mm2)				      // L1+L3
                     -		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
                     -		"movq %%mm2, (%%eax)				\n\t" // -> L1
                     -		"movq (%0, %1, 4), %%mm2			\n\t" // L4
                     -		PAVGB(%%mm2, %%mm1)				      // L2+L4
                     -		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
                     -		"movq %%mm1, (%%eax, %1)			\n\t" // -> L2
                     -		"movq (%%edx), %%mm1				\n\t" // L5
                     -		PAVGB(%%mm1, %%mm0)				      // L3+L5
                     -		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
                     -		"movq %%mm0, (%%eax, %1, 2)			\n\t" // -> L3
                     -		"movq (%%edx, %1), %%mm0			\n\t" // L6
                     -		PAVGB(%%mm0, %%mm2)				      // L4+L6
                     -		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
                     -		"movq %%mm2, (%0, %1, 4)			\n\t" // -> L4
                     -		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
                     -		PAVGB(%%mm2, %%mm1)				      // L5+L7
                     -		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
                     -		"movq %%mm1, (%%edx)				\n\t" // -> L5
                     -		"movq (%0, %1, 8), %%mm1			\n\t" // L8
                     -		PAVGB(%%mm1, %%mm0)				      // L6+L8
                     -		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
                     -		"movq %%mm0, (%%edx, %1)			\n\t" // -> L6
                     -		"movq (%%edx, %1, 4), %%mm0			\n\t" // L9
                     -		PAVGB(%%mm0, %%mm2)				      // L7+L9
                     -		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
                     -		"movq %%mm2, (%%edx, %1, 2)			\n\t" // -> L7
+                    -
+                    -
                     -		: : "r" (src), "r" (stride)
                     -		: "%eax", "%edx"
                     -	);
                     -#else
                     -	int x;
                     -	src+= 4*stride;
                     -	for(x=0; x<8; x++)
                     -	{
                     -		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
                     -		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
                     -		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
                     -		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
                     -		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
                     -		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
                     -		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
                     -		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -/**
                     - * Deinterlaces the given block with a vertical median filter: each of the lines
                     - * 5, 7, 9, 11 is replaced by the median of itself and the lines directly above and below it
                     - * will be called for every 8x8 block and can read & write from line 4-15,
                     - * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
                     - * lines 4-12 will be read into the deblocking filter and should be deinterlaced
                     - * three variants: MMX2 (pminub/pmaxub), plain MMX (compare masks) and C (sign masks)
                     - * @param src pointer to line 0 of the block, 8 pixels per line are processed
                     - * @param stride offset in bytes from one line to the next
                     - */
                     -static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
                     -{
                     -#ifdef HAVE_MMX
                     -	src+= 4*stride;
                     -#ifdef HAVE_MMX2
                     -	asm volatile(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%edx			\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
+                    -
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"movq (%%eax, %1), %%mm2			\n\t" // L2
                     -		"movq (%%eax), %%mm1				\n\t" // L1
                     -		"movq %%mm0, %%mm3				\n\t"
                     -		"pmaxub %%mm1, %%mm0				\n\t" // max(L0, L1)
                     -		"pminub %%mm3, %%mm1				\n\t" // min(L0, L1)
                     -		"pmaxub %%mm2, %%mm1				\n\t" // max(min(L0, L1), L2)
                     -		"pminub %%mm1, %%mm0				\n\t" // median(L0, L1, L2)
                     -		"movq %%mm0, (%%eax)				\n\t" // -> L1
+                    -
                     -		"movq (%0, %1, 4), %%mm0			\n\t" // L4
                     -		"movq (%%eax, %1, 2), %%mm1			\n\t" // L3
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"pmaxub %%mm1, %%mm2				\n\t" // max(L2, L3)
                     -		"pminub %%mm3, %%mm1				\n\t" // min(L2, L3)
                     -		"pmaxub %%mm0, %%mm1				\n\t" // max(min(L2, L3), L4)
                     -		"pminub %%mm1, %%mm2				\n\t" // median(L2, L3, L4)
                     -		"movq %%mm2, (%%eax, %1, 2)			\n\t" // -> L3
+                    -
                     -		"movq (%%edx), %%mm2				\n\t" // L5
                     -		"movq (%%edx, %1), %%mm1			\n\t" // L6
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"pmaxub %%mm0, %%mm2				\n\t" // max(L4, L5)
                     -		"pminub %%mm3, %%mm0				\n\t" // min(L4, L5)
                     -		"pmaxub %%mm1, %%mm0				\n\t" // max(min(L4, L5), L6)
                     -		"pminub %%mm0, %%mm2				\n\t" // median(L4, L5, L6)
                     -		"movq %%mm2, (%%edx)				\n\t" // -> L5
+                    -
                     -		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
                     -		"movq (%0, %1, 8), %%mm0			\n\t" // L8
                     -		"movq %%mm2, %%mm3				\n\t"
                     -		"pmaxub %%mm0, %%mm2				\n\t" // max(L7, L8)
                     -		"pminub %%mm3, %%mm0				\n\t" // min(L7, L8)
                     -		"pmaxub %%mm1, %%mm0				\n\t" // max(min(L7, L8), L6), mm1 still holds L6
                     -		"pminub %%mm0, %%mm2				\n\t" // median(L6, L7, L8)
                     -		"movq %%mm2, (%%edx, %1, 2)			\n\t" // -> L7
+                    -
+                    -
                     -		: : "r" (src), "r" (stride)
                     -		: "%eax", "%edx"
                     -	);
+                    -
                     -#else // MMX without MMX2
                     -	asm volatile(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -		"leal (%%eax, %1, 4), %%edx			\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
                     -		"pxor %%mm7, %%mm7				\n\t"
+                    -
                     -#define MEDIAN(a,b,c)\
                     -		"movq " #a ", %%mm0				\n\t"\
                     -		"movq " #b ", %%mm2				\n\t"\
                     -		"movq " #c ", %%mm1				\n\t"\
                     -		"movq %%mm0, %%mm3				\n\t"\
                     -		"movq %%mm1, %%mm4				\n\t"\
                     -		"movq %%mm2, %%mm5				\n\t"\
                     -		"psubusb %%mm1, %%mm3				\n\t"\
                     -		"psubusb %%mm2, %%mm4				\n\t"\
                     -		"psubusb %%mm0, %%mm5				\n\t"\
                     -		"pcmpeqb %%mm7, %%mm3				\n\t"\
                     -		"pcmpeqb %%mm7, %%mm4				\n\t"\
                     -		"pcmpeqb %%mm7, %%mm5				\n\t"\
                     -		"movq %%mm3, %%mm6				\n\t"\
                     -		"pxor %%mm4, %%mm3				\n\t"\
                     -		"pxor %%mm5, %%mm4				\n\t"\
                     -		"pxor %%mm6, %%mm5				\n\t"\
                     -		"por %%mm3, %%mm1				\n\t"\
                     -		"por %%mm4, %%mm2				\n\t"\
                     -		"por %%mm5, %%mm0				\n\t"\
                     -		"pand %%mm2, %%mm0				\n\t"\
                     -		"pand %%mm1, %%mm0				\n\t"\
                     -		"movq %%mm0, " #b "				\n\t"
+                    -
                     -MEDIAN((%0), (%%eax), (%%eax, %1))
                     -MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
                     -MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
                     -MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
+                    -
                     -		: : "r" (src), "r" (stride)
                     -		: "%eax", "%edx"
                     -	);
                     -#endif // HAVE_MMX2
                     -#else // no MMX, plain C
                     -	int x, y;
                     -	src+= 4*stride;
                     -	// FIXME - there should be a way to do a few columns in parallel like w/mmx
                     -	for(x=0; x<8; x++)
                     -	{
                     -		uint8_t *colsrc = src;
                     -		for (y=0; y<4; y++)
                     -		{
                     -			int a, b, c, d, e, f;
                     -			a = colsrc[0       ];
                     -			b = colsrc[stride  ];
                     -			c = colsrc[stride*2];
                     -			d = (a-b)>>31; // sign masks, assumes 32 bit int with arithmetic right shift
                     -			e = (b-c)>>31;
                     -			f = (c-a)>>31;
                     -			colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
                     -			colsrc += stride*2;
                     -		}
                     -		src++;
                     -	}
                     -#endif
                     -}
+                    -
                     -#ifdef HAVE_MMX
                     -/**
                     - * transposes and shift the given 8x8 Block into dst1 and dst2
                     - * reads 8 lines of 8 bytes from src; every source column is stored as 8 bytes
                     - * (two movd of 4 bytes: lines 0-3 first, lines 4-7 at offset +4), 16 bytes apart:
                     - * columns 0-4 go to dst1 + 128, 144, 160, 176, 192
                     - * columns 3-7 go to dst2 + 48, 64, 80, 96, 112
                     - * @param dst1 first destination buffer (written at byte offsets 128-199)
                     - * @param dst2 second destination buffer (written at byte offsets 48-119)
                     - * @param src top-left byte of the 8x8 source block
                     - * @param srcStride offset in bytes from one source line to the next
                     - */
                     -static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
                     -{
                     -	asm(
                     -		"leal (%0, %1), %%eax				\n\t"
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
                     -		"movq (%0), %%mm0		\n\t" // 12345678
                     -		"movq (%%eax), %%mm1		\n\t" // abcdefgh
                     -		"movq %%mm0, %%mm2		\n\t" // 12345678
                     -		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     -		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    -
                     -		"movq (%%eax, %1), %%mm1	\n\t" // line 2
                     -		"movq (%%eax, %1, 2), %%mm3	\n\t" // line 3
                     -		"movq %%mm1, %%mm4		\n\t"
                     -		"punpcklbw %%mm3, %%mm1		\n\t" // lines 2,3 columns 0-3 interleaved
                     -		"punpckhbw %%mm3, %%mm4		\n\t" // lines 2,3 columns 4-7 interleaved
+                    -
                     -		"movq %%mm0, %%mm3		\n\t"
                     -		"punpcklwd %%mm1, %%mm0		\n\t" // columns 0,1 of lines 0-3
                     -		"punpckhwd %%mm1, %%mm3		\n\t" // columns 2,3 of lines 0-3
                     -		"movq %%mm2, %%mm1		\n\t"
                     -		"punpcklwd %%mm4, %%mm2		\n\t" // columns 4,5 of lines 0-3
                     -		"punpckhwd %%mm4, %%mm1		\n\t" // columns 6,7 of lines 0-3
+                    -
                     -		"movd %%mm0, 128(%2)		\n\t"
                     -		"psrlq $32, %%mm0		\n\t"
                     -		"movd %%mm0, 144(%2)		\n\t"
                     -		"movd %%mm3, 160(%2)		\n\t"
                     -		"psrlq $32, %%mm3		\n\t"
                     -		"movd %%mm3, 176(%2)		\n\t"
                     -		"movd %%mm3, 48(%3)		\n\t"
                     -		"movd %%mm2, 192(%2)		\n\t"
                     -		"movd %%mm2, 64(%3)		\n\t"
                     -		"psrlq $32, %%mm2		\n\t"
                     -		"movd %%mm2, 80(%3)		\n\t"
                     -		"movd %%mm1, 96(%3)		\n\t"
                     -		"psrlq $32, %%mm1		\n\t"
                     -		"movd %%mm1, 112(%3)		\n\t"
+                    -
                     -		"leal (%%eax, %1, 4), %%eax	\n\t" // eax now points to line 5
+                    -
                     -		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
                     -		"movq (%%eax), %%mm1		\n\t" // abcdefgh
                     -		"movq %%mm0, %%mm2		\n\t" // 12345678
                     -		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     -		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    -
                     -		"movq (%%eax, %1), %%mm1	\n\t" // line 6
                     -		"movq (%%eax, %1, 2), %%mm3	\n\t" // line 7
                     -		"movq %%mm1, %%mm4		\n\t"
                     -		"punpcklbw %%mm3, %%mm1		\n\t"
                     -		"punpckhbw %%mm3, %%mm4		\n\t"
+                    -
                     -		"movq %%mm0, %%mm3		\n\t"
                     -		"punpcklwd %%mm1, %%mm0		\n\t" // columns 0,1 of lines 4-7
                     -		"punpckhwd %%mm1, %%mm3		\n\t" // columns 2,3 of lines 4-7
                     -		"movq %%mm2, %%mm1		\n\t"
                     -		"punpcklwd %%mm4, %%mm2		\n\t" // columns 4,5 of lines 4-7
                     -		"punpckhwd %%mm4, %%mm1		\n\t" // columns 6,7 of lines 4-7
+                    -
                     -		"movd %%mm0, 132(%2)		\n\t"
                     -		"psrlq $32, %%mm0		\n\t"
                     -		"movd %%mm0, 148(%2)		\n\t"
                     -		"movd %%mm3, 164(%2)		\n\t"
                     -		"psrlq $32, %%mm3		\n\t"
                     -		"movd %%mm3, 180(%2)		\n\t"
                     -		"movd %%mm3, 52(%3)		\n\t"
                     -		"movd %%mm2, 196(%2)		\n\t"
                     -		"movd %%mm2, 68(%3)		\n\t"
                     -		"psrlq $32, %%mm2		\n\t"
                     -		"movd %%mm2, 84(%3)		\n\t"
                     -		"movd %%mm1, 100(%3)		\n\t"
                     -		"psrlq $32, %%mm1		\n\t"
                     -		"movd %%mm1, 116(%3)		\n\t"
+                    -
+                    -
                     -	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
                     -	: "%eax"
                     -	);
                     -}
+                    -
                     -/**
                     - * transposes the given 8x8 block
                     - * Reads 8 rows of 8 bytes from src, where consecutive source rows are 16 bytes apart,
                     - * and writes the transposed block to dst using a row stride of dstStride bytes.
                     - */
                     -static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
                     -{
                     -	asm(
                     -		"leal (%0, %1), %%eax				\n\t" // eax = dst + dstStride
                     -		"leal (%%eax, %1, 4), %%edx			\n\t" // edx = dst + 5*dstStride
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
                     -		"movq (%2), %%mm0		\n\t" // 12345678
                     -		"movq 16(%2), %%mm1		\n\t" // abcdefgh
                     -		"movq %%mm0, %%mm2		\n\t" // 12345678
                     -		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     -		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    - // same byte interleave for source rows 2 and 3
                     -		"movq 32(%2), %%mm1		\n\t"
                     -		"movq 48(%2), %%mm3		\n\t"
                     -		"movq %%mm1, %%mm4		\n\t"
                     -		"punpcklbw %%mm3, %%mm1		\n\t"
                     -		"punpckhbw %%mm3, %%mm4		\n\t"
+                    - // interleave 16-bit pairs: each 32-bit half now holds rows 0..3 of one source column
                     -		"movq %%mm0, %%mm3		\n\t"
                     -		"punpcklwd %%mm1, %%mm0		\n\t"
                     -		"punpckhwd %%mm1, %%mm3		\n\t"
                     -		"movq %%mm2, %%mm1		\n\t"
                     -		"punpcklwd %%mm4, %%mm2		\n\t"
                     -		"punpckhwd %%mm4, %%mm1		\n\t"
+                    - // store bytes 0..3 of destination rows 0..7
                     -		"movd %%mm0, (%0)		\n\t"
                     -		"psrlq $32, %%mm0		\n\t"
                     -		"movd %%mm0, (%%eax)		\n\t"
                     -		"movd %%mm3, (%%eax, %1)	\n\t"
                     -		"psrlq $32, %%mm3		\n\t"
                     -		"movd %%mm3, (%%eax, %1, 2)	\n\t"
                     -		"movd %%mm2, (%0, %1, 4)	\n\t"
                     -		"psrlq $32, %%mm2		\n\t"
                     -		"movd %%mm2, (%%edx)		\n\t"
                     -		"movd %%mm1, (%%edx, %1)	\n\t"
                     -		"psrlq $32, %%mm1		\n\t"
                     -		"movd %%mm1, (%%edx, %1, 2)	\n\t"
+                    -
+                    - // second pass: source rows 4..7 (byte offsets 64..112)
                     -		"movq 64(%2), %%mm0		\n\t" // 12345678
                     -		"movq 80(%2), %%mm1		\n\t" // abcdefgh
                     -		"movq %%mm0, %%mm2		\n\t" // 12345678
                     -		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
                     -		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
+                    -
                     -		"movq 96(%2), %%mm1		\n\t"
                     -		"movq 112(%2), %%mm3		\n\t"
                     -		"movq %%mm1, %%mm4		\n\t"
                     -		"punpcklbw %%mm3, %%mm1		\n\t"
                     -		"punpckhbw %%mm3, %%mm4		\n\t"
+                    -
                     -		"movq %%mm0, %%mm3		\n\t"
                     -		"punpcklwd %%mm1, %%mm0		\n\t"
                     -		"punpckhwd %%mm1, %%mm3		\n\t"
                     -		"movq %%mm2, %%mm1		\n\t"
                     -		"punpcklwd %%mm4, %%mm2		\n\t"
                     -		"punpckhwd %%mm4, %%mm1		\n\t"
+                    - // store bytes 4..7 of destination rows 0..7
                     -		"movd %%mm0, 4(%0)		\n\t"
                     -		"psrlq $32, %%mm0		\n\t"
                     -		"movd %%mm0, 4(%%eax)		\n\t"
                     -		"movd %%mm3, 4(%%eax, %1)	\n\t"
                     -		"psrlq $32, %%mm3		\n\t"
                     -		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
                     -		"movd %%mm2, 4(%0, %1, 4)	\n\t"
                     -		"psrlq $32, %%mm2		\n\t"
                     -		"movd %%mm2, 4(%%edx)		\n\t"
                     -		"movd %%mm1, 4(%%edx, %1)	\n\t"
                     -		"psrlq $32, %%mm1		\n\t"
                     -		"movd %%mm1, 4(%%edx, %1, 2)	\n\t"
+                    -
                     -	:: "r" (dst), "r" (dstStride), "r" (src) // %0=dst, %1=dstStride, %2=src
                     -	: "%eax", "%edx" // NOTE(review): the stores through %0 and the MMX registers are not listed as clobbers
                     -	);
                     -}
                     -#endif
                     -//static int test=0;
+                    -
                     -static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                     -				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) // Temporal noise reducer for one 8x8 block: blends src with tempBlured depending on how much they differ.
                     -{ // maxNoise[0..2] are the three thresholds that select the blend strength; tempBluredPast is read at +-1 and +-256 entries around the current one.
                     -	// to save a register (FIXME do this outside of the loops)
                     -	tempBluredPast[127]= maxNoise[0];
                     -	tempBluredPast[128]= maxNoise[1];
                     -	tempBluredPast[129]= maxNoise[2];
+                    - // the asm below reads these back at byte offsets 508, 512 and 516 from tempBluredPast
                     -#define FAST_L2_DIFF
                     -//#define L1_DIFF //you should change the thresholds too if you try that one
                     -#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                     -	asm volatile(
                     -		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
                     -		"leal (%2, %2, 4), %%edx			\n\t" // 5*stride
                     -		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     -//	0	1	2	3	4	5	6	7	8	9
                     -//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+edx	%x+2eax	%x+ecx	%x+8%2
                     -//FIXME reorder?
                     -#ifdef L1_DIFF //needs mmx2
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"psadbw (%1), %%mm0				\n\t" // |L0-R0|
                     -		"movq (%0, %2), %%mm1				\n\t" // L1
                     -		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
                     -		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     -		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
                     -		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     -		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|
+                    -
                     -		"movq (%0, %2, 4), %%mm4			\n\t" // L4
                     -		"paddw %%mm1, %%mm0				\n\t"
                     -		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
                     -		"movq (%0, %%edx), %%mm5			\n\t" // L5
                     -		"paddw %%mm2, %%mm0				\n\t"
                     -		"psadbw (%1, %%edx), %%mm5			\n\t" // |L5-R5|
                     -		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
                     -		"paddw %%mm3, %%mm0				\n\t"
                     -		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
                     -		"movq (%0, %%ecx), %%mm7			\n\t" // L7
                     -		"paddw %%mm4, %%mm0				\n\t"
                     -		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
                     -		"paddw %%mm5, %%mm6				\n\t"
                     -		"paddw %%mm7, %%mm6				\n\t"
                     -		"paddw %%mm6, %%mm0				\n\t"
                     -#elif defined (FAST_L2_DIFF) // variant selected by the #define above (L1_DIFF is commented out)
                     -		"pcmpeqb %%mm7, %%mm7				\n\t"
                     -		"movq "MANGLE(b80)", %%mm6			\n\t"
                     -		"pxor %%mm0, %%mm0				\n\t"
                     -#define L2_DIFF_CORE(a, b)\
                     -		"movq " #a ", %%mm5				\n\t"\
                     -		"movq " #b ", %%mm2				\n\t"\
                     -		"pxor %%mm7, %%mm2				\n\t"\
                     -		PAVGB(%%mm2, %%mm5)\
                     -		"paddb %%mm6, %%mm5				\n\t"\
                     -		"movq %%mm5, %%mm2				\n\t"\
                     -		"psllw $8, %%mm5				\n\t"\
                     -		"pmaddwd %%mm5, %%mm5				\n\t"\
                     -		"pmaddwd %%mm2, %%mm2				\n\t"\
                     -		"paddd %%mm2, %%mm5				\n\t"\
                     -		"psrld $14, %%mm5				\n\t"\
                     -		"paddd %%mm5, %%mm0				\n\t"
+                    - // one invocation per row, rows 0..7 of src (%0) against tempBlured (%1)
                     -L2_DIFF_CORE((%0), (%1))
                     -L2_DIFF_CORE((%0, %2), (%1, %2))
                     -L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
                     -L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
                     -L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
                     -L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
                     -L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
                     -L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
+                    -
                     -#else // squared differences computed on bytes zero-extended to 16 bit (mm7 = 0)
                     -		"pxor %%mm7, %%mm7				\n\t"
                     -		"pxor %%mm0, %%mm0				\n\t"
                     -#define L2_DIFF_CORE(a, b)\
                     -		"movq " #a ", %%mm5				\n\t"\
                     -		"movq " #b ", %%mm2				\n\t"\
                     -		"movq %%mm5, %%mm1				\n\t"\
                     -		"movq %%mm2, %%mm3				\n\t"\
                     -		"punpcklbw %%mm7, %%mm5				\n\t"\
                     -		"punpckhbw %%mm7, %%mm1				\n\t"\
                     -		"punpcklbw %%mm7, %%mm2				\n\t"\
                     -		"punpckhbw %%mm7, %%mm3				\n\t"\
                     -		"psubw %%mm2, %%mm5				\n\t"\
                     -		"psubw %%mm3, %%mm1				\n\t"\
                     -		"pmaddwd %%mm5, %%mm5				\n\t"\
                     -		"pmaddwd %%mm1, %%mm1				\n\t"\
                     -		"paddd %%mm1, %%mm5				\n\t"\
                     -		"paddd %%mm5, %%mm0				\n\t"
+                    -
                     -L2_DIFF_CORE((%0), (%1))
                     -L2_DIFF_CORE((%0, %2), (%1, %2))
                     -L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
                     -L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
                     -L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
                     -L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
                     -L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
                     -L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
+                    -
                     -#endif
+                    - // add the two 32-bit halves of mm0 to get the block difference d in ecx
                     -		"movq %%mm0, %%mm4				\n\t"
                     -		"psrlq $32, %%mm0				\n\t"
                     -		"paddd %%mm0, %%mm4				\n\t"
                     -		"movd %%mm4, %%ecx				\n\t"
                     -		"shll $2, %%ecx					\n\t" // ecx = 4*d
                     -		"movl %3, %%edx					\n\t" // edx = tempBluredPast
                     -		"addl -4(%%edx), %%ecx				\n\t" // + tempBluredPast[-1]
                     -		"addl 4(%%edx), %%ecx				\n\t" // + tempBluredPast[1]
                     -		"addl -1024(%%edx), %%ecx			\n\t" // + tempBluredPast[-256]
                     -		"addl $4, %%ecx					\n\t" // rounding
                     -		"addl 1024(%%edx), %%ecx			\n\t" // + tempBluredPast[256]
                     -		"shrl $3, %%ecx					\n\t" // ecx = (4*d + 4 neighbours + 4) >> 3
                     -		"movl %%ecx, (%%edx)				\n\t" // NOTE(review): stores the smoothed value, whereas the C fallback below stores the unsmoothed d
+                    -
                     -//		"movl %3, %%ecx					\n\t"
                     -//		"movl %%ecx, test				\n\t"
                     -//		"jmp 4f \n\t"
                     -		"cmpl 512(%%edx), %%ecx				\n\t" // tempBluredPast[128], i.e. maxNoise[1]
                     -		" jb 2f						\n\t"
                     -		"cmpl 516(%%edx), %%ecx				\n\t" // tempBluredPast[129], i.e. maxNoise[2]
                     -		" jb 1f						\n\t"
+                    - // d >= maxNoise[2]: copy src into tempBlured unchanged
                     -		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     -		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"movq (%0, %2), %%mm1				\n\t" // L1
                     -		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     -		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     -		"movq (%0, %2, 4), %%mm4			\n\t" // L4
                     -		"movq (%0, %%edx), %%mm5			\n\t" // L5
                     -		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
                     -		"movq (%0, %%ecx), %%mm7			\n\t" // L7
                     -		"movq %%mm0, (%1)				\n\t" // L0
                     -		"movq %%mm1, (%1, %2)				\n\t" // L1
                     -		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
                     -		"movq %%mm3, (%1, %%eax)			\n\t" // L3
                     -		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
                     -		"movq %%mm5, (%1, %%edx)			\n\t" // L5
                     -		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
                     -		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
                     -		"jmp 4f						\n\t"
+                    - // maxNoise[1] <= d < maxNoise[2]: one PAVGB of src with tempBlured, result written to both
                     -		"1:						\n\t"
                     -		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     -		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		PAVGB((%1), %%mm0)				      // L0
                     -		"movq (%0, %2), %%mm1				\n\t" // L1
                     -		PAVGB((%1, %2), %%mm1)				      // L1
                     -		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     -		PAVGB((%1, %2, 2), %%mm2)			      // L2
                     -		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     -		PAVGB((%1, %%eax), %%mm3)			      // L3
                     -		"movq (%0, %2, 4), %%mm4			\n\t" // L4
                     -		PAVGB((%1, %2, 4), %%mm4)			      // L4
                     -		"movq (%0, %%edx), %%mm5			\n\t" // L5
                     -		PAVGB((%1, %%edx), %%mm5)			      // L5
                     -		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
                     -		PAVGB((%1, %%eax, 2), %%mm6)			      // L6
                     -		"movq (%0, %%ecx), %%mm7			\n\t" // L7
                     -		PAVGB((%1, %%ecx), %%mm7)			      // L7
                     -		"movq %%mm0, (%1)				\n\t" // R0
                     -		"movq %%mm1, (%1, %2)				\n\t" // R1
                     -		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
                     -		"movq %%mm3, (%1, %%eax)			\n\t" // R3
                     -		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
                     -		"movq %%mm5, (%1, %%edx)			\n\t" // R5
                     -		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
                     -		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
                     -		"movq %%mm0, (%0)				\n\t" // L0
                     -		"movq %%mm1, (%0, %2)				\n\t" // L1
                     -		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
                     -		"movq %%mm3, (%0, %%eax)			\n\t" // L3
                     -		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
                     -		"movq %%mm5, (%0, %%edx)			\n\t" // L5
                     -		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
                     -		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
                     -		"jmp 4f						\n\t"
+                    - // d < maxNoise[1]
                     -		"2:						\n\t"
                     -		"cmpl 508(%%edx), %%ecx				\n\t" // tempBluredPast[127], i.e. maxNoise[0]
                     -		" jb 3f						\n\t"
+                    - // maxNoise[0] <= d < maxNoise[1]: two PAVGB steps with tempBlured, presumably approximating the C fallback's (3*ref + cur)/4
                     -		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     -		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"movq (%0, %2), %%mm1				\n\t" // L1
                     -		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     -		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     -		"movq (%1), %%mm4				\n\t" // R0
                     -		"movq (%1, %2), %%mm5				\n\t" // R1
                     -		"movq (%1, %2, 2), %%mm6			\n\t" // R2
                     -		"movq (%1, %%eax), %%mm7			\n\t" // R3
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		"movq %%mm0, (%1)				\n\t" // R0
                     -		"movq %%mm1, (%1, %2)				\n\t" // R1
                     -		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
                     -		"movq %%mm3, (%1, %%eax)			\n\t" // R3
                     -		"movq %%mm0, (%0)				\n\t" // L0
                     -		"movq %%mm1, (%0, %2)				\n\t" // L1
                     -		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
                     -		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+                    -
                     -		"movq (%0, %2, 4), %%mm0			\n\t" // L4
                     -		"movq (%0, %%edx), %%mm1			\n\t" // L5
                     -		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
                     -		"movq (%0, %%ecx), %%mm3			\n\t" // L7
                     -		"movq (%1, %2, 4), %%mm4			\n\t" // R4
                     -		"movq (%1, %%edx), %%mm5			\n\t" // R5
                     -		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
                     -		"movq (%1, %%ecx), %%mm7			\n\t" // R7
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
                     -		"movq %%mm1, (%1, %%edx)			\n\t" // R5
                     -		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
                     -		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
                     -		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
                     -		"movq %%mm1, (%0, %%edx)			\n\t" // L5
                     -		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
                     -		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
                     -		"jmp 4f						\n\t"
+                    - // d < maxNoise[0]: three PAVGB steps with tempBlured, presumably approximating the C fallback's (7*ref + cur)/8
                     -		"3:						\n\t"
                     -		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
                     -		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
                     -		"movq (%0), %%mm0				\n\t" // L0
                     -		"movq (%0, %2), %%mm1				\n\t" // L1
                     -		"movq (%0, %2, 2), %%mm2			\n\t" // L2
                     -		"movq (%0, %%eax), %%mm3			\n\t" // L3
                     -		"movq (%1), %%mm4				\n\t" // R0
                     -		"movq (%1, %2), %%mm5				\n\t" // R1
                     -		"movq (%1, %2, 2), %%mm6			\n\t" // R2
                     -		"movq (%1, %%eax), %%mm7			\n\t" // R3
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		"movq %%mm0, (%1)				\n\t" // R0
                     -		"movq %%mm1, (%1, %2)				\n\t" // R1
                     -		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
                     -		"movq %%mm3, (%1, %%eax)			\n\t" // R3
                     -		"movq %%mm0, (%0)				\n\t" // L0
                     -		"movq %%mm1, (%0, %2)				\n\t" // L1
                     -		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
                     -		"movq %%mm3, (%0, %%eax)			\n\t" // L3
+                    -
                     -		"movq (%0, %2, 4), %%mm0			\n\t" // L4
                     -		"movq (%0, %%edx), %%mm1			\n\t" // L5
                     -		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
                     -		"movq (%0, %%ecx), %%mm3			\n\t" // L7
                     -		"movq (%1, %2, 4), %%mm4			\n\t" // R4
                     -		"movq (%1, %%edx), %%mm5			\n\t" // R5
                     -		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
                     -		"movq (%1, %%ecx), %%mm7			\n\t" // R7
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		PAVGB(%%mm4, %%mm0)
                     -		PAVGB(%%mm5, %%mm1)
                     -		PAVGB(%%mm6, %%mm2)
                     -		PAVGB(%%mm7, %%mm3)
                     -		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
                     -		"movq %%mm1, (%1, %%edx)			\n\t" // R5
                     -		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
                     -		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
                     -		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
                     -		"movq %%mm1, (%0, %%edx)			\n\t" // L5
                     -		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
                     -		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
+                    -
                     -		"4:						\n\t"
+                    -
                     -		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) // %0=src, %1=tempBlured, %2=stride, %3=tempBluredPast (memory operand)
                     -		: "%eax", "%edx", "%ecx", "memory"
                     -		);
                     -//printf("%d\n", test);
                     -#else // plain C fallback
                     -{
                     -	int y;
                     -	int d=0; // sum of squared differences over the 8x8 block
                     -	int sysd=0; // sum of signed differences; accumulated but not used below
                     -	int i;
+                    -
                     -	for(y=0; y<8; y++)
                     -	{
                     -		int x;
                     -		for(x=0; x<8; x++)
                     -		{
                     -			int ref= tempBlured[ x + y*stride ];
                     -			int cur= src[ x + y*stride ];
                     -			int d1=ref - cur;
                     -//			if(x==0 || x==7) d1+= d1>>1;
                     -//			if(y==0 || y==7) d1+= d1>>1;
                     -//			d+= ABS(d1);
                     -			d+= d1*d1;
                     -			sysd+= d1;
                     -		}
                     -	}
                     -	i=d; // keep the unsmoothed value; it is what gets stored in *tempBluredPast
                     -	d= 	( // smooth with the entries at -256, -1, +1 and +256
                     -		4*d
                     -		+(*(tempBluredPast-256))
                     -		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
                     -		+(*(tempBluredPast+256))
                     -		+4)>>3;
                     -	*tempBluredPast=i;
                     -//	((*tempBluredPast)*3 + d + 2)>>2;
+                    -
                     -//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
                     -/*
                     -Switch between
                     - 1  0  0  0  0  0  0  (0)
                     -64 32 16  8  4  2  1  (1)
                     -64 48 36 27 20 15 11 (33) (approx)
                     -64 56 49 43 37 33 29 (200) (approx)
                     -*/
                     -	if(d > maxNoise[1]) // NOTE(review): the asm path branches on d < maxNoise[1], so d == maxNoise[1] is treated differently there
                     -	{
                     -		if(d < maxNoise[2]) // 1:1 average of ref and cur, written to both buffers
                     -		{
                     -			for(y=0; y<8; y++)
                     -			{
                     -				int x;
                     -				for(x=0; x<8; x++)
                     -				{
                     -					int ref= tempBlured[ x + y*stride ];
                     -					int cur= src[ x + y*stride ];
                     -					tempBlured[ x + y*stride ]=
                     -					src[ x + y*stride ]=
                     -						(ref + cur + 1)>>1;
                     -				}
                     -			}
                     -		}
                     -		else // copy src into tempBlured; src is left untouched
                     -		{
                     -			for(y=0; y<8; y++)
                     -			{
                     -				int x;
                     -				for(x=0; x<8; x++)
                     -				{
                     -					tempBlured[ x + y*stride ]= src[ x + y*stride ];
                     -				}
                     -			}
                     -		}
                     -	}
                     -	else
                     -	{
                     -		if(d < maxNoise[0]) // 7:1 blend in favour of ref, written to both buffers
                     -		{
                     -			for(y=0; y<8; y++)
                     -			{
                     -				int x;
                     -				for(x=0; x<8; x++)
                     -				{
                     -					int ref= tempBlured[ x + y*stride ];
                     -					int cur= src[ x + y*stride ];
                     -					tempBlured[ x + y*stride ]=
                     -					src[ x + y*stride ]=
                     -						(ref*7 + cur + 4)>>3;
                     -				}
                     -			}
                     -		}
                     -		else // 3:1 blend in favour of ref, written to both buffers
                     -		{
                     -			for(y=0; y<8; y++)
                     -			{
                     -				int x;
                     -				for(x=0; x<8; x++)
                     -				{
                     -					int ref= tempBlured[ x + y*stride ];
                     -					int cur= src[ x + y*stride ];
                     -					tempBlured[ x + y*stride ]=
                     -					src[ x + y*stride ]=
                     -						(ref*3 + cur + 2)>>2;
                     -				}
                     -			}
                     -		}
                     -	}
                     -}
                     -#endif
                     -}
+                    -
                     -static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     -	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); // forward declaration
+                    -
                     -/**
                     - * Copies a block from src to dst and fixes the blacklevel
                     - * levelFix == 0 -> don't touch the brightness & contrast
                     - * The MMX paths copy 8 rows of 8 bytes; the C fallback copies 8 rows of BLOCK_SIZE bytes.
                     - * packedOffsetAndScale[0] is loaded as the packed offset and [1] as the packed scale.
                     - * NOTE: without HAVE_MMX the levelFix branch is a plain memcpy, i.e. no level fix is applied.
                     - */
                     -#undef SCALED_CPY
+                    -
                     -static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
                     -	int levelFix, int64_t *packedOffsetAndScale)
                     -{
                     -#ifndef HAVE_MMX
                     -	int i;
                     -#endif
                     -	if(levelFix)
                     -	{
                     -#ifdef HAVE_MMX
                     -					asm volatile(
                     -						"movq (%%eax), %%mm2	\n\t" // packedYOffset
                     -						"movq 8(%%eax), %%mm3	\n\t" // packedYScale
                     -						"leal (%2,%4), %%eax	\n\t" // eax = src + srcStride (the pointer that was in eax is no longer needed)
                     -						"leal (%3,%5), %%edx	\n\t" // edx = dst + dstStride
                     -						"pxor %%mm4, %%mm4	\n\t" // mm4 = 0, used for unpacking in the non-MMX2 variant
                     -#ifdef HAVE_MMX2 // duplicate bytes into words, unsigned multiply-high by the scale, then subtract the offset
                     -#define SCALED_CPY(src1, src2, dst1, dst2)					\
                     -						"movq " #src1 ", %%mm0	\n\t"\
                     -						"movq " #src1 ", %%mm5	\n\t"\
                     -						"movq " #src2 ", %%mm1	\n\t"\
                     -						"movq " #src2 ", %%mm6	\n\t"\
                     -						"punpcklbw %%mm0, %%mm0 \n\t"\
                     -						"punpckhbw %%mm5, %%mm5 \n\t"\
                     -						"punpcklbw %%mm1, %%mm1 \n\t"\
                     -						"punpckhbw %%mm6, %%mm6 \n\t"\
                     -						"pmulhuw %%mm3, %%mm0	\n\t"\
                     -						"pmulhuw %%mm3, %%mm5	\n\t"\
                     -						"pmulhuw %%mm3, %%mm1	\n\t"\
                     -						"pmulhuw %%mm3, %%mm6	\n\t"\
                     -						"psubw %%mm2, %%mm0	\n\t"\
                     -						"psubw %%mm2, %%mm5	\n\t"\
                     -						"psubw %%mm2, %%mm1	\n\t"\
                     -						"psubw %%mm2, %%mm6	\n\t"\
                     -						"packuswb %%mm5, %%mm0	\n\t"\
                     -						"packuswb %%mm6, %%mm1	\n\t"\
                     -						"movq %%mm0, " #dst1 "	\n\t"\
                     -						"movq %%mm1, " #dst2 "	\n\t"\
+                    -
                     -#else //HAVE_MMX2
                     -#define SCALED_CPY(src1, src2, dst1, dst2)					\
                     -						"movq " #src1 ", %%mm0	\n\t"\
                     -						"movq " #src1 ", %%mm5	\n\t"\
                     -						"punpcklbw %%mm4, %%mm0 \n\t"\
                     -						"punpckhbw %%mm4, %%mm5 \n\t"\
                     -						"psubw %%mm2, %%mm0	\n\t"\
                     -						"psubw %%mm2, %%mm5	\n\t"\
                     -						"movq " #src2 ", %%mm1	\n\t"\
                     -						"psllw $6, %%mm0	\n\t"\
                     -						"psllw $6, %%mm5	\n\t"\
                     -						"pmulhw %%mm3, %%mm0	\n\t"\
                     -						"movq " #src2 ", %%mm6	\n\t"\
                     -						"pmulhw %%mm3, %%mm5	\n\t"\
                     -						"punpcklbw %%mm4, %%mm1 \n\t"\
                     -						"punpckhbw %%mm4, %%mm6 \n\t"\
                     -						"psubw %%mm2, %%mm1	\n\t"\
                     -						"psubw %%mm2, %%mm6	\n\t"\
                     -						"psllw $6, %%mm1	\n\t"\
                     -						"psllw $6, %%mm6	\n\t"\
                     -						"pmulhw %%mm3, %%mm1	\n\t"\
                     -						"pmulhw %%mm3, %%mm6	\n\t"\
                     -						"packuswb %%mm5, %%mm0	\n\t"\
                     -						"packuswb %%mm6, %%mm1	\n\t"\
                     -						"movq %%mm0, " #dst1 "	\n\t"\
                     -						"movq %%mm1, " #dst2 "	\n\t"\
+                    -
                     -#endif //!HAVE_MMX2
+                    - // each SCALED_CPY handles two rows; the four invocations cover rows 0..7
                     -SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
                     -SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
                     -SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
                     -						"leal (%%eax,%4,4), %%eax	\n\t" // eax = src + 5*srcStride
                     -						"leal (%%edx,%5,4), %%edx	\n\t" // edx = dst + 5*dstStride
                     -SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
+                    -
+                    -
                     -						: "=&a" (packedOffsetAndScale) // %0/%1: pointer passed in eax, which the asm overwrites
                     -						: "0" (packedOffsetAndScale),
                     -						"r"(src), // %2
                     -						"r"(dst), // %3
                     -						"r" (srcStride), // %4
                     -						"r" (dstStride) // %5
                     -						: "%edx" // NOTE(review): the stores through %3 and the MMX registers are not listed as clobbers
                     -					);
                     -#else // no MMX: plain copy, levelFix is not applied
                     -				for(i=0; i<8; i++)
                     -					memcpy(	&(dst[dstStride*i]),
                     -						&(src[srcStride*i]), BLOCK_SIZE);
                     -#endif
                     -	}
                     -	else
                     -	{
                     -#ifdef HAVE_MMX
                     -					asm volatile(
                     -						"leal (%0,%2), %%eax	\n\t" // eax = src + srcStride
                     -						"leal (%1,%3), %%edx	\n\t" // edx = dst + dstStride
+                    -
                     -#define SIMPLE_CPY(src1, src2, dst1, dst2)				\
                     -						"movq " #src1 ", %%mm0	\n\t"\
                     -						"movq " #src2 ", %%mm1	\n\t"\
                     -						"movq %%mm0, " #dst1 "	\n\t"\
                     -						"movq %%mm1, " #dst2 "	\n\t"\
+                    -
                     -SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
                     -SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
                     -SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
                     -						"leal (%%eax,%2,4), %%eax	\n\t" // eax = src + 5*srcStride
                     -						"leal (%%edx,%3,4), %%edx	\n\t" // edx = dst + 5*dstStride
                     -SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
+                    -
                     -						: : "r" (src), // %0
                     -						"r" (dst), // %1
                     -						"r" (srcStride), // %2
                     -						"r" (dstStride) // %3
                     -						: "%eax", "%edx"
                     -					);
                     -#else
                     -				for(i=0; i<8; i++)
                     -					memcpy(	&(dst[dstStride*i]),
                     -						&(src[srcStride*i]), BLOCK_SIZE);
                     -#endif
                     -	}
                     -}
+                    -
                     -/**
                     - * Duplicates the 8 pixels at src into the 3 lines directly above it
                     - * (src - stride, src - 2*stride and src - 3*stride).
                     - * @param src    pointer to the 8 pixels to replicate; the 3 lines above it are overwritten
                     - * @param stride offset between two consecutive lines
                     - */
                     -static inline void RENAME(duplicate)(uint8_t src[], int stride)
                     -{
                     -#ifdef HAVE_MMX
                     -	asm volatile(
                     -		"movq (%0), %%mm0		\n\t" // load the 8 source pixels
                     -		"addl %1, %0			\n\t" // %1 holds -stride, so this steps one line up
                     -		"movq %%mm0, (%0)		\n\t" // line src - stride
                     -		"movq %%mm0, (%0, %1)		\n\t" // line src - 2*stride
                     -		"movq %%mm0, (%0, %1, 2)	\n\t" // line src - 3*stride
                     -		: "+r" (src)
                     -		: "r" (-stride)
                     -	);
                     -#else
                     -	int i;
                     -	uint8_t *p=src;
                     -	for(i=0; i<3; i++)
                     -	{
                     -		p-= stride; // move one line up
                     -		memcpy(p, src, 8); // always copy from the original line, not from the previous copy
                     -	}
                     -#endif
                     -}
+                    -
                     -/**
                     - * Filters array of bytes (Y or U or V values)
                     - *
                     - * The plane is walked in rows of BLOCK_SIZE lines and in steps of 8 pixels per row.
                     - * For every step the source is copied into dst with blockCopy (with level fixing if
                     - * LEVEL_FIX is set in the mode), and then the deinterlace, vertical/horizontal deblock,
                     - * dering and temporal noise filters selected by the mode bits are applied in place.
                     - * The context is copied onto the stack on entry and copied back into *c2 on exit.
                     - *
                     - * @param src       source plane
                     - * @param srcStride offset between two consecutive lines of src
                     - * @param dst       destination plane
                     - * @param dstStride offset between two consecutive lines of dst
                     - * @param width     number of pixels processed per line
                     - * @param height    number of lines processed
                     - * @param QPs       quantizer table, read as QPs[(y>>qpVShift)*QPStride + (x>>qpHShift)]
                     - * @param QPStride  offset between two consecutive rows of QPs
                     - * @param isColor   non-zero selects ppMode.chromMode and the chroma subsampling shifts,
                     - *                  zero selects ppMode.lumMode and enables the histogram based level fixing
                     - * @param c2        context; read on entry and overwritten with the updated copy on exit
                     - */
                     -static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                     -	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
                     -{
                     -	PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
                     -	int x,y;
                     -#ifdef COMPILE_TIME_MODE
                     -	const int mode= COMPILE_TIME_MODE;
                     -#else
                     -	const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
                     -#endif
                     -	int black=0, white=255; // blackest black and whitest white in the picture
                     -	int QPCorrecture= 256*256; // 16.16 fixed point factor applied to the QP values, 256*256 is 1.0
+                    -
                     -	int copyAhead; // line offset at which blockCopy runs ahead of the block being filtered
                     -#ifdef HAVE_MMX
                     -	int i;
                     -#endif
+                    -
                     -	const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; // x>>qpHShift indexes a row of the QP table
                     -	const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; // y>>qpVShift selects the row of the QP table
+                    -
                     -	//FIXME remove
                     -	uint64_t * const yHistogram= c.yHistogram;
                     -	uint8_t * const tempSrc= c.tempSrc;
                     -	uint8_t * const tempDst= c.tempDst;
                     -	const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; // row length used to index c.nonBQPTable
+                    -
                     -#ifdef HAVE_MMX
                     -	for(i=0; i<32; i++){ // fill the 32 entry mmxDcOffset / mmxDcThreshold tables from baseDcDiff
                     -		int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
                     -		int threshold= offset*2 + 1;
                     -		c.mmxDcOffset[i]= 0x7F - offset;
                     -		c.mmxDcThreshold[i]= 0x7F - threshold;
                     -		c.mmxDcOffset[i]*= 0x0101010101010101LL; // multiply by the 0x01 byte pattern to spread the value over the 64 bit word
                     -		c.mmxDcThreshold[i]*= 0x0101010101010101LL;
                     -	}
                     -#endif
+                    -
                     -	if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; // presumably the number of lines the enabled filters need - TODO confirm
                     -	else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
                     -		|| (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
                     -	else if(   (mode & V_DEBLOCK)
                     -		|| (mode & LINEAR_IPOL_DEINT_FILTER)
                     -		|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
                     -	else if(mode & V_X1_FILTER) copyAhead=11;
                     -//	else if(mode & V_RK1_FILTER) copyAhead=10;
                     -	else if(mode & DERING) copyAhead=9;
                     -	else copyAhead=8;
+                    -
                     -	copyAhead-= 8; // from here on copyAhead is in the range 0..8
+                    -
                     -	if(!isColor)
                     -	{
                     -		uint64_t sum= 0; // total number of samples in the histogram
                     -		int i;
                     -		uint64_t maxClipped; // number of samples that may be clipped at each end
                     -		uint64_t clipped;
                     -		double scale;
+                    -
                     -		c.frameNum++;
                     -		// the histogram of the first frame is not usable, so bin 0 is seeded instead
                     -		if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
+                    -
                     -		for(i=0; i<256; i++)
                     -		{
                     -			sum+= yHistogram[i];
                     -//			printf("%d ", yHistogram[i]);
                     -		}
                     -//		printf("\n\n");
+                    -
                     -		/* we always get a completely black picture first */
                     -		maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
+                    -
                     -		clipped= sum;
                     -		for(black=255; black>0; black--) // drop bins from 255 downwards until fewer than maxClipped samples remain
                     -		{
                     -			if(clipped < maxClipped) break;
                     -			clipped-= yHistogram[black];
                     -		}
+                    -
                     -		clipped= sum;
                     -		for(white=0; white<256; white++) // drop bins from 0 upwards until fewer than maxClipped samples remain
                     -		{
                     -			if(clipped < maxClipped) break;
                     -			clipped-= yHistogram[white];
                     -		}
+                    -
                     -		scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); // NOTE(review): white==black would divide by zero - confirm it cannot happen
+                    -
                     -#ifdef HAVE_MMX2
                     -		c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
                     -		c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
                     -#else
                     -		c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
                     -		c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
                     -#endif
+                    -
                     -		c.packedYOffset|= c.packedYOffset<<32; // these two steps copy the low 16 bits into all four 16 bit lanes
                     -		c.packedYOffset|= c.packedYOffset<<16;
+                    -
                     -		c.packedYScale|= c.packedYScale<<32; // same 16 bit replication for the scale
                     -		c.packedYScale|= c.packedYScale<<16;
+                    -
                     -		if(mode & LEVEL_FIX)	QPCorrecture= (int)(scale*256*256 + 0.5); // scale the QPs by the same factor as the levels
                     -		else			QPCorrecture= 256*256;
                     -	}
                     -	else
                     -	{
                     -		c.packedYScale= 0x0100010001000100LL; // 0x0100 in each 16 bit lane
                     -		c.packedYOffset= 0;
                     -		QPCorrecture= 256*256;
                     -	}
+                    -
                     -	/* copy & deinterlace first row of blocks */
                     -	y=-BLOCK_SIZE;
                     -	{
                     -		uint8_t *srcBlock= &(src[y*srcStride]); // BLOCK_SIZE lines above src, blockCopy below reads from srcBlock + srcStride*8
                     -		uint8_t *dstBlock= tempDst + dstStride; // this row is written into the temporary buffer, not into dst
+                    -
                     -		// From this point on it is guaranteed that we can read and write 16 lines downward
                     -		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
                     -		for(x=0; x<width; x+=BLOCK_SIZE)
                     -		{
+                    -
                     -#ifdef HAVE_MMX2
                     -/*
                     -			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
                     -			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
                     -			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
                     -			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
                     -*/
+                    -
                     -			asm(
                     -				"movl %4, %%eax			\n\t"
                     -				"shrl $2, %%eax			\n\t"
                     -				"andl $6, %%eax			\n\t"
                     -				"addl %5, %%eax			\n\t"
                     -				"movl %%eax, %%edx		\n\t"
                     -				"imul %1, %%eax			\n\t"
                     -				"imul %3, %%edx			\n\t"
                     -				"prefetchnta 32(%%eax, %0)	\n\t"
                     -				"prefetcht0 32(%%edx, %2)	\n\t"
                     -				"addl %1, %%eax			\n\t"
                     -				"addl %3, %%edx			\n\t"
                     -				"prefetchnta 32(%%eax, %0)	\n\t"
                     -				"prefetcht0 32(%%edx, %2)	\n\t"
                     -			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
                     -			"m" (x), "m" (copyAhead)
                     -			: "%eax", "%edx"
                     -			);
+                    -
                     -#elif defined(HAVE_3DNOW)
                     -//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
                     -/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
                     -			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
                     -			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
                     -			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
                     -*/
                     -#endif
+                    -
                     -			RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
                     -				srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
+                    -
                     -			RENAME(duplicate)(dstBlock + dstStride*8, dstStride); // replicate the first copied line into the 3 lines above it
+                    -
                     -			if(mode & LINEAR_IPOL_DEINT_FILTER)
                     -				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
                     -			else if(mode & LINEAR_BLEND_DEINT_FILTER)
                     -				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
                     -			else if(mode & MEDIAN_DEINT_FILTER)
                     -				RENAME(deInterlaceMedian)(dstBlock, dstStride);
                     -			else if(mode & CUBIC_IPOL_DEINT_FILTER)
                     -				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
                     -			else if(mode & FFMPEG_DEINT_FILTER)
                     -				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
                     -/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
                     -				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
                     -*/
                     -			dstBlock+=8;
                     -			srcBlock+=8;
                     -		}
                     -		if(width==dstStride) // lines are contiguous, so all copyAhead lines go out in one memcpy
                     -			memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride);
                     -		else
                     -		{
                     -			int i;
                     -			for(i=0; i<copyAhead; i++)
                     -			{
                     -				memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
                     -			}
                     -		}
                     -	}
+                    -
                     -//printf("\n");
                     -	for(y=0; y<height; y+=BLOCK_SIZE)
                     -	{
                     -		//1% speedup if these are here instead of the inner loop
                     -		uint8_t *srcBlock= &(src[y*srcStride]);
                     -		uint8_t *dstBlock= &(dst[y*dstStride]);
                     -#ifdef HAVE_MMX
                     -		uint8_t *tempBlock1= c.tempBlocks; // pair of transpose buffers, swapped after every block
                     -		uint8_t *tempBlock2= c.tempBlocks + 8;
                     -#endif
                     -		int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
                     -		int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*mbWidth];
                     -		int QP=0;
                     -		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
                     -		   if not then use a temporary buffer */
                     -		if(y+15 >= height)
                     -		{
                     -			int i;
                     -			/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
                     -			   blockcopy to dst later */
                     -			memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
                     -				srcStride*MAX(height-y-copyAhead, 0) );
+                    -
                     -			/* duplicate last line of src to fill the void up to line (copyAhead+7) */
                     -			for(i=MAX(height-y, 8); i<copyAhead+8; i++)
                     -				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
+                    -
                     -			/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
                     -			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
+                    -
                     -			/* duplicate last line of dst to fill the void up to line (copyAhead) */
                     -			for(i=height-y+1; i<=copyAhead; i++)
                     -				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
+                    -
                     -			dstBlock= tempDst + dstStride; // from here on this row works inside the temporary buffers
                     -			srcBlock= tempSrc;
                     -		}
                     -//printf("\n");
+                    -
                     -		// From this point on it is guaranteed that we can read and write 16 lines downward
                     -		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
                     -		for(x=0; x<width; x+=BLOCK_SIZE)
                     -		{
                     -			const int stride= dstStride;
                     -#ifdef HAVE_MMX
                     -			uint8_t *tmpXchg;
                     -#endif
                     -			if(isColor)
                     -			{
                     -				QP= QPptr[x>>qpHShift];
                     -				c.nonBQP= nonBQPptr[x>>qpHShift];
                     -			}
                     -			else
                     -			{
                     -				QP= QPptr[x>>4];
                     -				QP= (QP* QPCorrecture + 256*128)>>16; // apply the 16.16 correction factor with rounding
                     -				c.nonBQP= nonBQPptr[x>>4];
                     -				c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
                     -				yHistogram[ srcBlock[srcStride*12 + 4] ]++; // one sample per block feeds the histogram that is read at the top of this function
                     -			}
                     -			c.QP= QP;
                     -#ifdef HAVE_MMX
                     -			asm volatile(
                     -				"movd %1, %%mm7					\n\t"
                     -				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
                     -				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
                     -				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
                     -				"movq %%mm7, %0			\n\t"
                     -				: "=m" (c.pQPb)
                     -				: "r" (QP)
                     -			);
                     -#endif
+                    -
+                    -
                     -#ifdef HAVE_MMX2
                     -/*
                     -			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
                     -			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
                     -			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
                     -			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
                     -*/
+                    -
                     -			asm(
                     -				"movl %4, %%eax			\n\t"
                     -				"shrl $2, %%eax			\n\t"
                     -				"andl $6, %%eax			\n\t"
                     -				"addl %5, %%eax			\n\t"
                     -				"movl %%eax, %%edx		\n\t"
                     -				"imul %1, %%eax			\n\t"
                     -				"imul %3, %%edx			\n\t"
                     -				"prefetchnta 32(%%eax, %0)	\n\t"
                     -				"prefetcht0 32(%%edx, %2)	\n\t"
                     -				"addl %1, %%eax			\n\t"
                     -				"addl %3, %%edx			\n\t"
                     -				"prefetchnta 32(%%eax, %0)	\n\t"
                     -				"prefetcht0 32(%%edx, %2)	\n\t"
                     -			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
                     -			"m" (x), "m" (copyAhead)
                     -			: "%eax", "%edx"
                     -			);
+                    -
                     -#elif defined(HAVE_3DNOW)
                     -//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
                     -/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
                     -			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
                     -			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
                     -			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
                     -*/
                     -#endif
+                    -
                     -			RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
                     -				srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
+                    -
                     -			if(mode & LINEAR_IPOL_DEINT_FILTER)
                     -				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
                     -			else if(mode & LINEAR_BLEND_DEINT_FILTER)
                     -				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
                     -			else if(mode & MEDIAN_DEINT_FILTER)
                     -				RENAME(deInterlaceMedian)(dstBlock, dstStride);
                     -			else if(mode & CUBIC_IPOL_DEINT_FILTER)
                     -				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
                     -			else if(mode & FFMPEG_DEINT_FILTER)
                     -				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
                     -/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
                     -				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
                     -*/
+                    -
                     -			/* only deblock if we have 2 blocks */
                     -			if(y + 8 < height)
                     -			{
                     -				if(mode & V_X1_FILTER)
                     -					RENAME(vertX1Filter)(dstBlock, stride, &c);
                     -				else if(mode & V_DEBLOCK)
                     -				{
                     -					if( RENAME(isVertDC)(dstBlock, stride, &c))
                     -					{
                     -						if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
                     -							RENAME(doVertLowPass)(dstBlock, stride, &c);
                     -					}
                     -					else
                     -						RENAME(doVertDefFilter)(dstBlock, stride, &c);
                     -				}
                     -			}
+                    -
                     -#ifdef HAVE_MMX
                     -			RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); // the MMX path runs the vertical filters on transposed data for the horizontal pass
                     -#endif
                     -			/* check if we have a previous block to deblock it with dstBlock */
                     -			if(x - 8 >= 0)
                     -			{
                     -#ifdef HAVE_MMX
                     -				if(mode & H_X1_FILTER)
                     -					RENAME(vertX1Filter)(tempBlock1, 16, &c);
                     -				else if(mode & H_DEBLOCK)
                     -				{
                     -					if( RENAME(isVertDC)(tempBlock1, 16, &c))
                     -					{
                     -						if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
                     -							RENAME(doVertLowPass)(tempBlock1, 16, &c);
                     -					}
                     -					else
                     -						RENAME(doVertDefFilter)(tempBlock1, 16, &c);
                     -				}
+                    -
                     -				RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); // write the filtered transposed data back around the block edge
+                    -
                     -#else
                     -				if(mode & H_X1_FILTER)
                     -					horizX1Filter(dstBlock-4, stride, QP);
                     -				else if(mode & H_DEBLOCK)
                     -				{
                     -					if( isHorizDC(dstBlock-4, stride, &c))
                     -					{
                     -						if(isHorizMinMaxOk(dstBlock-4, stride, QP))
                     -							doHorizLowPass(dstBlock-4, stride, QP);
                     -					}
                     -					else
                     -						doHorizDefFilter(dstBlock-4, stride, QP);
                     -				}
                     -#endif
                     -				if(mode & DERING)
                     -				{
                     -				//FIXME filter first line
                     -					if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
                     -				}
+                    -
                     -				if(mode & TEMP_NOISE_FILTER)
                     -				{
                     -					RENAME(tempNoiseReducer)(dstBlock-8, stride,
                     -						c.tempBlured[isColor] + y*dstStride + x,
                     -						c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                     -						c.ppMode.maxTmpNoise);
                     -				}
                     -			}
+                    -
                     -			dstBlock+=8;
                     -			srcBlock+=8;
+                    -
                     -#ifdef HAVE_MMX
                     -			tmpXchg= tempBlock1; // swap the two transpose buffers for the next block
                     -			tempBlock1= tempBlock2;
                     -			tempBlock2 = tmpXchg;
                     -#endif
                     -		}
+                    -
                     -		if(mode & DERING) // the loop above only derings the block to the left, so the last block of the row is handled here
                     -		{
                     -				if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
                     -		}
+                    -
                     -		if((mode & TEMP_NOISE_FILTER)) // same for the temporal noise filter: handle the last block of the row
                     -		{
                     -			RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
                     -				c.tempBlured[isColor] + y*dstStride + x,
                     -				c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
                     -				c.ppMode.maxTmpNoise);
                     -		}
+                    -
                     -		/* did we use a tmp buffer for the last lines*/
                     -		if(y+15 >= height)
                     -		{
                     -			uint8_t *dstBlock= &(dst[y*dstStride]); // shadows the outer dstBlock, which points into tempDst here
                     -			if(width==dstStride)
                     -				memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y));
                     -			else
                     -			{
                     -				int i;
                     -				for(i=0; i<height-y; i++)
                     -				{
                     -					memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
                     -				}
                     -			}
                     -		}
                     -/*
                     -		for(x=0; x<width; x+=32)
                     -		{
                     -			volatile int i;
                     -			i+=	+ dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
                     -				+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
                     -				+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
                     -//				+ dstBlock[x +13*dstStride]
                     -//				+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
                     -		}*/
                     -	}
                     -#ifdef HAVE_3DNOW
                     -	asm volatile("femms");
                     -#elif defined (HAVE_MMX)
                     -	asm volatile("emms");
                     -#endif
+                    -
                     -#ifdef DEBUG_BRIGHTNESS
                     -	if(!isColor) // debug overlay: marks the histogram and the black / white levels in dst by adding 128
                     -	{
                     -		int max=1;
                     -		int i;
                     -		for(i=0; i<256; i++)
                     -			if(yHistogram[i] > max) max=yHistogram[i];
+                    -
                     -		for(i=1; i<256; i++)
                     -		{
                     -			int x;
                     -			int start=yHistogram[i-1]/(max/256+1);
                     -			int end=yHistogram[i]/(max/256+1);
                     -			int inc= end > start ? 1 : -1;
                     -			for(x=start; x!=end+inc; x+=inc)
                     -				dst[ i*dstStride + x]+=128;
                     -		}
+                    -
                     -		for(i=0; i<100; i+=2)
                     -		{
                     -			dst[ (white)*dstStride + i]+=128;
                     -			dst[ (black)*dstStride + i]+=128;
                     -		}
+                    -
                     -	}
                     -#endif
+                    -
                     -	*c2= c; //copy local context back
+                    -
                     -}