Originally committed as revision 2204 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Michael Niedermayer authored on 2001/10/15 12:01:08... | ... |
@@ -17,19 +17,22 @@ |
17 | 17 |
*/ |
18 | 18 |
|
19 | 19 |
/* |
20 |
- C MMX MMX2 3DNow* |
|
20 |
+ C MMX MMX2 3DNow |
|
21 | 21 |
isVertDC Ec Ec |
22 | 22 |
isVertMinMaxOk Ec Ec |
23 |
-doVertLowPass E e e* |
|
23 |
+doVertLowPass E e e |
|
24 | 24 |
doVertDefFilter Ec Ec Ec |
25 | 25 |
isHorizDC Ec Ec |
26 | 26 |
isHorizMinMaxOk a |
27 |
-doHorizLowPass E a a* |
|
27 |
+doHorizLowPass E a a |
|
28 | 28 |
doHorizDefFilter E ac ac |
29 | 29 |
deRing |
30 |
-Vertical RKAlgo1 E a a* |
|
31 |
-Vertical X1 a E E* |
|
32 |
-Horizontal X1 a E E* |
|
30 |
+Vertical RKAlgo1 E a a |
|
31 |
+Vertical X1 a E E |
|
32 |
+Horizontal X1 a E E |
|
33 |
+LinIpolDeinterlace a E E* |
|
34 |
+LinBlendDeinterlace a E E* |
|
35 |
+MedianDeinterlace a E |
|
33 | 36 |
|
34 | 37 |
|
35 | 38 |
* i dont have a 3dnow CPU -> its untested |
... | ... |
@@ -55,6 +58,7 @@ make the mainloop more flexible (variable number of blocks at once |
55 | 55 |
compare the quality & speed of all filters |
56 | 56 |
implement a few simple deinterlacing filters |
57 | 57 |
split this huge file |
58 |
+fix warnings (unused vars, ...) |
|
58 | 59 |
... |
59 | 60 |
|
60 | 61 |
Notes: |
... | ... |
@@ -63,6 +67,9 @@ Notes: |
63 | 63 |
|
64 | 64 |
/* |
65 | 65 |
Changelog: use the CVS log |
66 |
+rewrote the horizontal lowpass filter to fix a bug which caused a blocky look |
|
67 |
+added deinterlace filters (linear interpolate, linear blend, median) |
|
68 |
+minor cleanups (removed some outcommented stuff) |
|
66 | 69 |
0.1.3 |
67 | 70 |
bugfixes: last 3 lines not brightness/contrast corrected |
68 | 71 |
brightness statistics messed up with initial black pic |
... | ... |
@@ -194,13 +201,11 @@ static inline void prefetcht2(void *p) |
194 | 194 |
* Check if the middle 8x8 Block in the given 8x10 block is flat |
195 | 195 |
*/ |
196 | 196 |
static inline int isVertDC(uint8_t src[], int stride){ |
197 |
-// return true; |
|
198 | 197 |
int numEq= 0; |
199 | 198 |
int y; |
200 | 199 |
src+= stride; // src points to begin of the 8x8 Block |
201 | 200 |
#ifdef HAVE_MMX |
202 | 201 |
asm volatile( |
203 |
-// "int $3 \n\t" |
|
204 | 202 |
"pushl %1\n\t" |
205 | 203 |
"movq b7E, %%mm7 \n\t" // mm7 = 0x7F |
206 | 204 |
"movq b7C, %%mm6 \n\t" // mm6 = 0x7D |
... | ... |
@@ -1577,9 +1582,9 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP |
1577 | 1577 |
} |
1578 | 1578 |
|
1579 | 1579 |
/** |
1580 |
- * Do a horizontal low pass filter on the 8x8 block |
|
1580 |
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) |
|
1581 | 1581 |
* useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
1582 |
- * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version) |
|
1582 |
+ * useing the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) |
|
1583 | 1583 |
*/ |
1584 | 1584 |
static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
1585 | 1585 |
{ |
... | ... |
@@ -1635,14 +1640,6 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
1635 | 1635 |
*/ |
1636 | 1636 |
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 |
1637 | 1637 |
/* |
1638 |
- 31 |
|
1639 |
- 121 |
|
1640 |
- 121 |
|
1641 |
- 121 |
|
1642 |
- 121 |
|
1643 |
- 121 |
|
1644 |
- 121 |
|
1645 |
- 13 |
|
1646 | 1638 |
Implemented Exact 7-Tap |
1647 | 1639 |
9421 A321 |
1648 | 1640 |
36421 64321 |
... | ... |
@@ -1654,6 +1651,7 @@ Implemented Exact 7-Tap |
1654 | 1654 |
1249 123A |
1655 | 1655 |
|
1656 | 1656 |
*/ |
1657 |
+ |
|
1657 | 1658 |
#ifdef HAVE_MMX2 |
1658 | 1659 |
#define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ |
1659 | 1660 |
"movq %%mm0, %%mm1 \n\t"\ |
... | ... |
@@ -1680,12 +1678,12 @@ Implemented Exact 7-Tap |
1680 | 1680 |
#define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ |
1681 | 1681 |
"movq %%mm0, %%mm1 \n\t"\ |
1682 | 1682 |
"movq %%mm0, %%mm2 \n\t"\ |
1683 |
- "movq %%mm0, %%mm3 \n\t"\ |
|
1684 |
- "movq %%mm0, %%mm4 \n\t"\ |
|
1683 |
+ "movd -4(%0), %%mm3 \n\t" /*0001000*/\ |
|
1684 |
+ "movd 8(%0), %%mm4 \n\t" /*0001000*/\ |
|
1685 | 1685 |
"psllq $8, %%mm1 \n\t"\ |
1686 | 1686 |
"psrlq $8, %%mm2 \n\t"\ |
1687 |
- "pand bm00000001, %%mm3 \n\t"\ |
|
1688 |
- "pand bm10000000, %%mm4 \n\t"\ |
|
1687 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
1688 |
+ "psllq $56, %%mm4 \n\t"\ |
|
1689 | 1689 |
"por %%mm3, %%mm1 \n\t"\ |
1690 | 1690 |
"por %%mm4, %%mm2 \n\t"\ |
1691 | 1691 |
PAVGB(%%mm2, %%mm1)\ |
... | ... |
@@ -1708,7 +1706,80 @@ Implemented Exact 7-Tap |
1708 | 1708 |
"movd %%mm0, 4(%0) \n\t" |
1709 | 1709 |
#endif |
1710 | 1710 |
|
1711 |
-#define HLP(i) HLP3(i) |
|
1711 |
+/* uses the 7-Tap Filter: 1112111 */ |
|
1712 |
+#define NEW_HLP(i)\ |
|
1713 |
+ "movq " #i "(%%eax), %%mm0 \n\t"\ |
|
1714 |
+ "movq %%mm0, %%mm1 \n\t"\ |
|
1715 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
1716 |
+ "movd -4(%0), %%mm3 \n\t" /*0001000*/\ |
|
1717 |
+ "movd 8(%0), %%mm4 \n\t" /*0001000*/\ |
|
1718 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1719 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1720 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
1721 |
+ "psllq $56, %%mm4 \n\t"\ |
|
1722 |
+ "por %%mm3, %%mm1 \n\t"\ |
|
1723 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1724 |
+ "movq %%mm1, %%mm5 \n\t"\ |
|
1725 |
+ PAVGB(%%mm2, %%mm1)\ |
|
1726 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1727 |
+ "psllq $8, %%mm5 \n\t"\ |
|
1728 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1729 |
+ "por %%mm3, %%mm5 \n\t"\ |
|
1730 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1731 |
+ "movq %%mm5, %%mm1 \n\t"\ |
|
1732 |
+ PAVGB(%%mm2, %%mm5)\ |
|
1733 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1734 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1735 |
+ "por %%mm3, %%mm1 \n\t"\ |
|
1736 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1737 |
+ PAVGB(%%mm2, %%mm1)\ |
|
1738 |
+ PAVGB(%%mm1, %%mm5)\ |
|
1739 |
+ PAVGB(%%mm5, %%mm0)\ |
|
1740 |
+ "movd %%mm0, (%0) \n\t"\ |
|
1741 |
+ "psrlq $32, %%mm0 \n\t"\ |
|
1742 |
+ "movd %%mm0, 4(%0) \n\t" |
|
1743 |
+ |
|
1744 |
+/* uses the 9-Tap Filter: 112242211 */ |
|
1745 |
+#define NEW_HLP2(i)\ |
|
1746 |
+ "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ |
|
1747 |
+ "movq %%mm0, %%mm1 \n\t" /*0001000*/\ |
|
1748 |
+ "movq %%mm0, %%mm2 \n\t" /*0001000*/\ |
|
1749 |
+ "movd -4(%0), %%mm3 \n\t" /*0001000*/\ |
|
1750 |
+ "movd 8(%0), %%mm4 \n\t" /*0001000*/\ |
|
1751 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1752 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1753 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
1754 |
+ "psllq $56, %%mm4 \n\t"\ |
|
1755 |
+ "por %%mm3, %%mm1 \n\t" /*0010000*/\ |
|
1756 |
+ "por %%mm4, %%mm2 \n\t" /*0000100*/\ |
|
1757 |
+ "movq %%mm1, %%mm5 \n\t" /*0010000*/\ |
|
1758 |
+ PAVGB(%%mm2, %%mm1) /*0010100*/\ |
|
1759 |
+ PAVGB(%%mm1, %%mm0) /*0012100*/\ |
|
1760 |
+ "psllq $8, %%mm5 \n\t"\ |
|
1761 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1762 |
+ "por %%mm3, %%mm5 \n\t" /*0100000*/\ |
|
1763 |
+ "por %%mm4, %%mm2 \n\t" /*0000010*/\ |
|
1764 |
+ "movq %%mm5, %%mm1 \n\t" /*0100000*/\ |
|
1765 |
+ PAVGB(%%mm2, %%mm5) /*0100010*/\ |
|
1766 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1767 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1768 |
+ "por %%mm3, %%mm1 \n\t" /*1000000*/\ |
|
1769 |
+ "por %%mm4, %%mm2 \n\t" /*0000001*/\ |
|
1770 |
+ "movq %%mm1, %%mm6 \n\t" /*1000000*/\ |
|
1771 |
+ PAVGB(%%mm2, %%mm1) /*1000001*/\ |
|
1772 |
+ "psllq $8, %%mm6 \n\t"\ |
|
1773 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1774 |
+ "por %%mm3, %%mm6 \n\t"/*100000000*/\ |
|
1775 |
+ "por %%mm4, %%mm2 \n\t"/*000000001*/\ |
|
1776 |
+ PAVGB(%%mm2, %%mm6) /*100000001*/\ |
|
1777 |
+ PAVGB(%%mm6, %%mm1) /*110000011*/\ |
|
1778 |
+ PAVGB(%%mm1, %%mm5) /*112000211*/\ |
|
1779 |
+ PAVGB(%%mm5, %%mm0) /*112242211*/\ |
|
1780 |
+ "movd %%mm0, (%0) \n\t"\ |
|
1781 |
+ "psrlq $32, %%mm0 \n\t"\ |
|
1782 |
+ "movd %%mm0, 4(%0) \n\t" |
|
1783 |
+ |
|
1784 |
+#define HLP(i) NEW_HLP(i) |
|
1712 | 1785 |
|
1713 | 1786 |
HLP(0) |
1714 | 1787 |
"addl %1, %0 \n\t" |
... | ... |
@@ -1828,6 +1899,363 @@ FIND_MIN_MAX(%%ebx, %1, 2) |
1828 | 1828 |
#endif |
1829 | 1829 |
} |
1830 | 1830 |
|
1831 |
+/** |
|
1832 |
+ * Deinterlaces the given block |
|
1833 |
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
|
1834 |
+ */ |
|
1835 |
+static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) |
|
1836 |
+{ |
|
1837 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1838 |
+ asm volatile( |
|
1839 |
+ "leal (%0, %1), %%eax \n\t" |
|
1840 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1841 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1842 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1843 |
+ |
|
1844 |
+ "movq (%0), %%mm0 \n\t" |
|
1845 |
+ "movq (%%eax, %1), %%mm1 \n\t" |
|
1846 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1847 |
+ "movq %%mm0, (%%eax) \n\t" |
|
1848 |
+ "movq (%0, %1, 4), %%mm0 \n\t" |
|
1849 |
+ PAVGB(%%mm0, %%mm1)\ |
|
1850 |
+ "movq %%mm1, (%%eax, %1, 2) \n\t" |
|
1851 |
+ "movq (%%ebx, %1), %%mm1 \n\t" |
|
1852 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1853 |
+ "movq %%mm0, (%%ebx) \n\t" |
|
1854 |
+ "movq (%0, %1, 8), %%mm0 \n\t" |
|
1855 |
+ PAVGB(%%mm0, %%mm1)\ |
|
1856 |
+ "movq %%mm1, (%%ebx, %1, 2) \n\t" |
|
1857 |
+ |
|
1858 |
+ : : "r" (src), "r" (stride) |
|
1859 |
+ : "%eax", "%ebx" |
|
1860 |
+ ); |
|
1861 |
+#else |
|
1862 |
+ int x; |
|
1863 |
+ for(x=0; x<8; x++) |
|
1864 |
+ { |
|
1865 |
+ src[stride] = (src[0] + src[stride*2])>>1; |
|
1866 |
+ src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
|
1867 |
+ src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
|
1868 |
+ src[stride*7] = (src[stride*6] + src[stride*8])>>1; |
|
1869 |
+ src++; |
|
1870 |
+ } |
|
1871 |
+#endif |
|
1872 |
+} |
|
1873 |
+ |
|
1874 |
+/** |
|
1875 |
+ * Deinterlaces the given block |
|
1876 |
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block |
|
1877 |
+ */ |
|
1878 |
+static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) |
|
1879 |
+{ |
|
1880 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1881 |
+ asm volatile( |
|
1882 |
+ "leal (%0, %1), %%eax \n\t" |
|
1883 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1884 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1885 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1886 |
+ |
|
1887 |
+ "movq (%0), %%mm0 \n\t" |
|
1888 |
+ "movq (%%eax, %1), %%mm1 \n\t" |
|
1889 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1890 |
+ "movq %%mm0, (%%eax) \n\t" |
|
1891 |
+ "movq (%0, %1, 4), %%mm0 \n\t" |
|
1892 |
+ PAVGB(%%mm0, %%mm1)\ |
|
1893 |
+ "movq %%mm1, (%%eax, %1, 2) \n\t" |
|
1894 |
+ "movq (%%ebx, %1), %%mm1 \n\t" |
|
1895 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1896 |
+ "movq %%mm0, (%%ebx) \n\t" |
|
1897 |
+ "movq %%mm1, (%%ebx, %1, 2) \n\t" |
|
1898 |
+ |
|
1899 |
+ |
|
1900 |
+ : : "r" (src), "r" (stride) |
|
1901 |
+ : "%eax", "%ebx" |
|
1902 |
+ ); |
|
1903 |
+#else |
|
1904 |
+ int x; |
|
1905 |
+ for(x=0; x<8; x++) |
|
1906 |
+ { |
|
1907 |
+ src[stride] = (src[0] + src[stride*2])>>1; |
|
1908 |
+ src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
|
1909 |
+ src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
|
1910 |
+ src[stride*7] = src[stride*6]; |
|
1911 |
+ src++; |
|
1912 |
+ } |
|
1913 |
+#endif |
|
1914 |
+} |
|
1915 |
+ |
|
1916 |
+/** |
|
1917 |
+ * Deinterlaces the given block |
|
1918 |
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
|
1919 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
1920 |
+ */ |
|
1921 |
+static inline void deInterlaceBlendLinear(uint8_t src[], int stride) |
|
1922 |
+{ |
|
1923 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1924 |
+ asm volatile( |
|
1925 |
+ "leal (%0, %1), %%eax \n\t" |
|
1926 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1927 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1928 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1929 |
+ |
|
1930 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
1931 |
+ "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
1932 |
+ PAVGB(%%mm1, %%mm0) // L0+L2 |
|
1933 |
+ "movq (%%eax), %%mm2 \n\t" // L1 |
|
1934 |
+ PAVGB(%%mm2, %%mm0) |
|
1935 |
+ "movq %%mm0, (%0) \n\t" |
|
1936 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
1937 |
+ PAVGB(%%mm0, %%mm2) // L1+L3 |
|
1938 |
+ PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
1939 |
+ "movq %%mm2, (%%eax) \n\t" |
|
1940 |
+ "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
1941 |
+ PAVGB(%%mm2, %%mm1) // L2+L4 |
|
1942 |
+ PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
1943 |
+ "movq %%mm1, (%%eax, %1) \n\t" |
|
1944 |
+ "movq (%%ebx), %%mm1 \n\t" // L5 |
|
1945 |
+ PAVGB(%%mm1, %%mm0) // L3+L5 |
|
1946 |
+ PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
1947 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
1948 |
+ "movq (%%ebx, %1), %%mm0 \n\t" // L6 |
|
1949 |
+ PAVGB(%%mm0, %%mm2) // L4+L6 |
|
1950 |
+ PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
1951 |
+ "movq %%mm2, (%0, %1, 4) \n\t" |
|
1952 |
+ "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 |
|
1953 |
+ PAVGB(%%mm2, %%mm1) // L5+L7 |
|
1954 |
+ PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
|
1955 |
+ "movq %%mm1, (%%ebx) \n\t" |
|
1956 |
+ "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
|
1957 |
+ PAVGB(%%mm1, %%mm0) // L6+L8 |
|
1958 |
+ PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
|
1959 |
+ "movq %%mm0, (%%ebx, %1) \n\t" |
|
1960 |
+ "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9 |
|
1961 |
+ PAVGB(%%mm0, %%mm2) // L7+L9 |
|
1962 |
+ PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
|
1963 |
+ "movq %%mm2, (%%ebx, %1, 2) \n\t" |
|
1964 |
+ |
|
1965 |
+ |
|
1966 |
+ : : "r" (src), "r" (stride) |
|
1967 |
+ : "%eax", "%ebx" |
|
1968 |
+ ); |
|
1969 |
+#else |
|
1970 |
+ int x; |
|
1971 |
+ for(x=0; x<8; x++) |
|
1972 |
+ { |
|
1973 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
1974 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
1975 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
1976 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
1977 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
1978 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
1979 |
+ src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
1980 |
+ src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
1981 |
+ src++; |
|
1982 |
+ } |
|
1983 |
+#endif |
|
1984 |
+} |
|
1985 |
+ |
|
1986 |
+/** |
|
1987 |
+ * Deinterlaces the given block |
|
1988 |
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block |
|
1989 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
1990 |
+ */ |
|
1991 |
+static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride) |
|
1992 |
+{ |
|
1993 |
+#if defined (HAVE_MMSX2) || defined (HAVE_3DNOW) |
|
1994 |
+ asm volatile( |
|
1995 |
+ "leal (%0, %1), %%eax \n\t" |
|
1996 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1997 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1998 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1999 |
+ |
|
2000 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2001 |
+ "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
2002 |
+ PAVGB(%%mm1, %%mm0) // L0+L2 |
|
2003 |
+ "movq (%%eax), %%mm2 \n\t" // L1 |
|
2004 |
+ PAVGB(%%mm2, %%mm0) |
|
2005 |
+ "movq %%mm0, (%0) \n\t" |
|
2006 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
2007 |
+ PAVGB(%%mm0, %%mm2) // L1+L3 |
|
2008 |
+ PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
2009 |
+ "movq %%mm2, (%%eax) \n\t" |
|
2010 |
+ "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
2011 |
+ PAVGB(%%mm2, %%mm1) // L2+L4 |
|
2012 |
+ PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
2013 |
+ "movq %%mm1, (%%eax, %1) \n\t" |
|
2014 |
+ "movq (%%ebx), %%mm1 \n\t" // L5 |
|
2015 |
+ PAVGB(%%mm1, %%mm0) // L3+L5 |
|
2016 |
+ PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
2017 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
2018 |
+ "movq (%%ebx, %1), %%mm0 \n\t" // L6 |
|
2019 |
+ PAVGB(%%mm0, %%mm2) // L4+L6 |
|
2020 |
+ PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
2021 |
+ "movq %%mm2, (%0, %1, 4) \n\t" |
|
2022 |
+ "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 |
|
2023 |
+ PAVGB(%%mm2, %%mm1) // L5+L7 |
|
2024 |
+ PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
|
2025 |
+ "movq %%mm1, (%%ebx) \n\t" |
|
2026 |
+ PAVGB(%%mm2, %%mm0) // L7 + L8 |
|
2027 |
+ "movq %%mm0, (%%ebx, %1) \n\t" |
|
2028 |
+ "movq %%mm0, (%%ebx, %1, 2) \n\t" |
|
2029 |
+ |
|
2030 |
+ : : "r" (src), "r" (stride) |
|
2031 |
+ : "%eax", "%ebx" |
|
2032 |
+ ); |
|
2033 |
+#else |
|
2034 |
+ int x; |
|
2035 |
+ for(x=0; x<8; x++) |
|
2036 |
+ { |
|
2037 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
2038 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
2039 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
2040 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
2041 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
2042 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
2043 |
+ src[stride*6] = (src[stride*6] + src[stride*7])>>1; |
|
2044 |
+ src[stride*7] = src[stride*6]; |
|
2045 |
+ src++; |
|
2046 |
+ } |
|
2047 |
+#endif |
|
2048 |
+} |
|
2049 |
+ |
|
2050 |
+/** |
|
2051 |
+ * Deinterlaces the given block |
|
2052 |
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
|
2053 |
+ */ |
|
2054 |
+static inline void deInterlaceMedian(uint8_t src[], int stride) |
|
2055 |
+{ |
|
2056 |
+#if defined (HAVE_MMX2) |
|
2057 |
+ asm volatile( |
|
2058 |
+ "leal (%0, %1), %%eax \n\t" |
|
2059 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
2060 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
2061 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
2062 |
+ |
|
2063 |
+ "movq (%0), %%mm0 \n\t" // |
|
2064 |
+ "movq (%%eax, %1), %%mm2 \n\t" // |
|
2065 |
+ "movq (%%eax), %%mm1 \n\t" // |
|
2066 |
+ "movq %%mm0, %%mm3 \n\t" |
|
2067 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2068 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2069 |
+ "pmaxub %%mm2, %%mm1 \n\t" // |
|
2070 |
+ "pminub %%mm1, %%mm0 \n\t" |
|
2071 |
+ "movq %%mm0, (%%eax) \n\t" |
|
2072 |
+ |
|
2073 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // |
|
2074 |
+ "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
2075 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2076 |
+ "pmaxub %%mm1, %%mm2 \n\t" // |
|
2077 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2078 |
+ "pmaxub %%mm0, %%mm1 \n\t" // |
|
2079 |
+ "pminub %%mm1, %%mm2 \n\t" |
|
2080 |
+ "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
2081 |
+ |
|
2082 |
+ "movq (%%ebx), %%mm2 \n\t" // |
|
2083 |
+ "movq (%%ebx, %1), %%mm1 \n\t" // |
|
2084 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2085 |
+ "pmaxub %%mm0, %%mm2 \n\t" // |
|
2086 |
+ "pminub %%mm3, %%mm0 \n\t" // |
|
2087 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2088 |
+ "pminub %%mm0, %%mm2 \n\t" |
|
2089 |
+ "movq %%mm2, (%%ebx) \n\t" |
|
2090 |
+ |
|
2091 |
+ "movq (%%ebx, %1, 2), %%mm2 \n\t" // |
|
2092 |
+ "movq (%0, %1, 8), %%mm0 \n\t" // |
|
2093 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2094 |
+ "pmaxub %%mm0, %%mm2 \n\t" // |
|
2095 |
+ "pminub %%mm3, %%mm0 \n\t" // |
|
2096 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2097 |
+ "pminub %%mm0, %%mm2 \n\t" |
|
2098 |
+ "movq %%mm2, (%%ebx, %1, 2) \n\t" |
|
2099 |
+ |
|
2100 |
+ |
|
2101 |
+ : : "r" (src), "r" (stride) |
|
2102 |
+ : "%eax", "%ebx" |
|
2103 |
+ ); |
|
2104 |
+#else |
|
2105 |
+ //FIXME |
|
2106 |
+ int x; |
|
2107 |
+ for(x=0; x<8; x++) |
|
2108 |
+ { |
|
2109 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
2110 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
2111 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
2112 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
2113 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
2114 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
2115 |
+ src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
2116 |
+ src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
2117 |
+ src++; |
|
2118 |
+ } |
|
2119 |
+#endif |
|
2120 |
+} |
|
2121 |
+ |
|
2122 |
+/** |
|
2123 |
+ * Deinterlaces the given block |
|
2124 |
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block |
|
2125 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
2126 |
+ */ |
|
2127 |
+static inline void deInterlaceMedianLastRow(uint8_t src[], int stride) |
|
2128 |
+{ |
|
2129 |
+#if defined (HAVE_MMX2) |
|
2130 |
+ asm volatile( |
|
2131 |
+ "leal (%0, %1), %%eax \n\t" |
|
2132 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
2133 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
2134 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
2135 |
+ |
|
2136 |
+ "movq (%0), %%mm0 \n\t" // |
|
2137 |
+ "movq (%%eax, %1), %%mm2 \n\t" // |
|
2138 |
+ "movq (%%eax), %%mm1 \n\t" // |
|
2139 |
+ "movq %%mm0, %%mm3 \n\t" |
|
2140 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2141 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2142 |
+ "pmaxub %%mm2, %%mm1 \n\t" // |
|
2143 |
+ "pminub %%mm1, %%mm0 \n\t" |
|
2144 |
+ "movq %%mm0, (%%eax) \n\t" |
|
2145 |
+ |
|
2146 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // |
|
2147 |
+ "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
2148 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2149 |
+ "pmaxub %%mm1, %%mm2 \n\t" // |
|
2150 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2151 |
+ "pmaxub %%mm0, %%mm1 \n\t" // |
|
2152 |
+ "pminub %%mm1, %%mm2 \n\t" |
|
2153 |
+ "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
2154 |
+ |
|
2155 |
+ "movq (%%ebx), %%mm2 \n\t" // |
|
2156 |
+ "movq (%%ebx, %1), %%mm1 \n\t" // |
|
2157 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2158 |
+ "pmaxub %%mm0, %%mm2 \n\t" // |
|
2159 |
+ "pminub %%mm3, %%mm0 \n\t" // |
|
2160 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2161 |
+ "pminub %%mm0, %%mm2 \n\t" |
|
2162 |
+ "movq %%mm2, (%%ebx) \n\t" |
|
2163 |
+ |
|
2164 |
+ "movq %%mm1, (%%ebx, %1, 2) \n\t" |
|
2165 |
+ |
|
2166 |
+ : : "r" (src), "r" (stride) |
|
2167 |
+ : "%eax", "%ebx" |
|
2168 |
+ ); |
|
2169 |
+#else |
|
2170 |
+ //FIXME |
|
2171 |
+ int x; |
|
2172 |
+ for(x=0; x<8; x++) |
|
2173 |
+ { |
|
2174 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
2175 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
2176 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
2177 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
2178 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
2179 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
2180 |
+ src[stride*6] = (src[stride*6] + src[stride*7])>>1; |
|
2181 |
+ src[stride*7] = src[stride*6]; |
|
2182 |
+ src++; |
|
2183 |
+ } |
|
2184 |
+#endif |
|
2185 |
+} |
|
2186 |
+ |
|
2187 |
+ |
|
1831 | 2188 |
#ifdef HAVE_ODIVX_POSTPROCESS |
1832 | 2189 |
#include "../opendivx/postprocess.h" |
1833 | 2190 |
int use_old_pp=0; |
... | ... |
@@ -1841,7 +2269,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri |
1841 | 1841 |
* the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63) |
1842 | 1842 |
* -63 is best quality -1 is worst |
1843 | 1843 |
*/ |
1844 |
-//extern "C"{ |
|
1845 | 1844 |
void postprocess(unsigned char * src[], int src_stride, |
1846 | 1845 |
unsigned char * dst[], int dst_stride, |
1847 | 1846 |
int horizontal_size, int vertical_size, |
... | ... |
@@ -2196,6 +2623,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri |
2196 | 2196 |
blockCopy(vertBlock + dstStride*2, dstStride, |
2197 | 2197 |
vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); |
2198 | 2198 |
|
2199 |
+ if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2200 |
+ deInterlaceInterpolateLinear(dstBlock, dstStride); |
|
2201 |
+ else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2202 |
+ deInterlaceBlendLinear(dstBlock, dstStride); |
|
2203 |
+ else if(mode & MEDIAN_DEINT_FILTER) |
|
2204 |
+ deInterlaceMedian(dstBlock, dstStride); |
|
2205 |
+/* else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2206 |
+ deInterlaceInterpolateCubic(dstBlock, dstStride); |
|
2207 |
+ else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2208 |
+ deInterlaceBlendCubic(dstBlock, dstStride); |
|
2209 |
+*/ |
|
2199 | 2210 |
|
2200 | 2211 |
#ifdef MORE_TIMEING |
2201 | 2212 |
T1= rdtsc(); |
... | ... |
@@ -2226,9 +2664,22 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri |
2226 | 2226 |
#endif |
2227 | 2227 |
} |
2228 | 2228 |
else |
2229 |
+ { |
|
2229 | 2230 |
blockCopy(vertBlock + dstStride*1, dstStride, |
2230 | 2231 |
vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); |
2231 | 2232 |
|
2233 |
+ if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2234 |
+ deInterlaceInterpolateLinearLastRow(dstBlock, dstStride); |
|
2235 |
+ else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2236 |
+ deInterlaceBlendLinearLastRow(dstBlock, dstStride); |
|
2237 |
+ else if(mode & MEDIAN_DEINT_FILTER) |
|
2238 |
+ deInterlaceMedianLastRow(dstBlock, dstStride); |
|
2239 |
+/* else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2240 |
+ deInterlaceInterpolateCubicLastRow(dstBlock, dstStride); |
|
2241 |
+ else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2242 |
+ deInterlaceBlendCubicLastRow(dstBlock, dstStride); |
|
2243 |
+*/ |
|
2244 |
+ } |
|
2232 | 2245 |
|
2233 | 2246 |
if(x - 8 >= 0 && x<width) |
2234 | 2247 |
{ |
... | ... |
@@ -22,6 +22,7 @@ |
22 | 22 |
|
23 | 23 |
#define BLOCK_SIZE 8 |
24 | 24 |
#define TEMP_STRIDE 8 |
25 |
+//#define NUM_BLOCKS_AT_ONCE 16 //not used yet |
|
25 | 26 |
|
26 | 27 |
#define V_DEBLOCK 0x01 |
27 | 28 |
#define H_DEBLOCK 0x02 |
... | ... |
@@ -32,20 +33,28 @@ |
32 | 32 |
#define LUM_H_DEBLOCK H_DEBLOCK // 2 |
33 | 33 |
#define CHROM_V_DEBLOCK (V_DEBLOCK<<4) // 16 |
34 | 34 |
#define CHROM_H_DEBLOCK (H_DEBLOCK<<4) // 32 |
35 |
-#define LUM_DERING DERING // 4 |
|
36 |
-#define CHROM_DERING (DERING<<4) // 64 |
|
35 |
+#define LUM_DERING DERING // 4 (not implemented yet) |
|
36 |
+#define CHROM_DERING (DERING<<4) // 64 (not implemented yet) |
|
37 | 37 |
#define LUM_LEVEL_FIX LEVEL_FIX // 8 |
38 |
-//not supported currently |
|
39 |
-#define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128 |
|
38 |
+#define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128 (not implemented yet) |
|
40 | 39 |
|
41 | 40 |
// Experimental vertical filters |
42 | 41 |
#define V_RK1_FILTER 0x0100 // 256 |
43 | 42 |
#define V_X1_FILTER 0x0200 // 512 |
44 | 43 |
|
45 | 44 |
// Experimental horizontal filters |
46 |
-#define H_RK1_FILTER 0x1000 // 4096 |
|
45 |
+#define H_RK1_FILTER 0x1000 // 4096 (not implemented yet) |
|
47 | 46 |
#define H_X1_FILTER 0x2000 // 8192 |
48 | 47 |
|
48 |
+//Deinterlacing Filters |
|
49 |
+#define DEINTERLACE_FILTER_MASK 0xF0000 |
|
50 |
+#define LINEAR_IPOL_DEINT_FILTER 0x10000 // 65536 |
|
51 |
+#define LINEAR_BLEND_DEINT_FILTER 0x20000 // 131072 |
|
52 |
+#define CUBIC_BLEND_DEINT_FILTER 0x30000 // 196608 (not implemented yet) |
|
53 |
+#define CUBIC_IPOL_DEINT_FILTER 0x40000 // 262144 (not implemented yet) |
|
54 |
+#define MEDIAN_DEINT_FILTER 0x80000 // 524288 |
|
55 |
+ |
|
56 |
+ |
|
49 | 57 |
#define GET_PP_QUALITY_MAX 6 |
50 | 58 |
|
51 | 59 |
//#define TIMEING |
... | ... |
@@ -53,18 +62,6 @@ |
53 | 53 |
|
54 | 54 |
#define QP_STORE_T int |
55 | 55 |
|
56 |
-//#ifdef __cplusplus |
|
57 |
-//#include <inttypes.h> |
|
58 |
- |
|
59 |
-//void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
60 |
-// QP_STORE_T QPs[], int QPStride, int isColor, int mode); |
|
61 |
-//#endif |
|
62 |
- |
|
63 |
-//#ifdef __cplusplus |
|
64 |
-//extern "C" |
|
65 |
-//{ |
|
66 |
-//#endif |
|
67 |
- |
|
68 | 56 |
void postprocess(unsigned char * src[], int src_stride, |
69 | 57 |
unsigned char * dst[], int dst_stride, |
70 | 58 |
int horizontal_size, int vertical_size, |
... | ... |
@@ -72,8 +69,4 @@ void postprocess(unsigned char * src[], int src_stride, |
72 | 72 |
|
73 | 73 |
int getPpModeForQuality(int quality); |
74 | 74 |
|
75 |
-//#ifdef __cplusplus |
|
76 |
-//} |
|
77 |
-//#endif |
|
78 |
- |
|
79 | 75 |
#endif |
... | ... |
@@ -17,19 +17,22 @@ |
17 | 17 |
*/ |
18 | 18 |
|
19 | 19 |
/* |
20 |
- C MMX MMX2 3DNow* |
|
20 |
+ C MMX MMX2 3DNow |
|
21 | 21 |
isVertDC Ec Ec |
22 | 22 |
isVertMinMaxOk Ec Ec |
23 |
-doVertLowPass E e e* |
|
23 |
+doVertLowPass E e e |
|
24 | 24 |
doVertDefFilter Ec Ec Ec |
25 | 25 |
isHorizDC Ec Ec |
26 | 26 |
isHorizMinMaxOk a |
27 |
-doHorizLowPass E a a* |
|
27 |
+doHorizLowPass E a a |
|
28 | 28 |
doHorizDefFilter E ac ac |
29 | 29 |
deRing |
30 |
-Vertical RKAlgo1 E a a* |
|
31 |
-Vertical X1 a E E* |
|
32 |
-Horizontal X1 a E E* |
|
30 |
+Vertical RKAlgo1 E a a |
|
31 |
+Vertical X1 a E E |
|
32 |
+Horizontal X1 a E E |
|
33 |
+LinIpolDeinterlace a E E* |
|
34 |
+LinBlendDeinterlace a E E* |
|
35 |
+MedianDeinterlace a E |
|
33 | 36 |
|
34 | 37 |
|
35 | 38 |
* i dont have a 3dnow CPU -> its untested |
... | ... |
@@ -55,6 +58,7 @@ make the mainloop more flexible (variable number of blocks at once |
55 | 55 |
compare the quality & speed of all filters |
56 | 56 |
implement a few simple deinterlacing filters |
57 | 57 |
split this huge file |
58 |
+fix warnings (unused vars, ...) |
|
58 | 59 |
... |
59 | 60 |
|
60 | 61 |
Notes: |
... | ... |
@@ -63,6 +67,9 @@ Notes: |
63 | 63 |
|
64 | 64 |
/* |
65 | 65 |
Changelog: use the CVS log |
66 |
+rewrote the horizontal lowpass filter to fix a bug which caused a blocky look |
|
67 |
+added deinterlace filters (linear interpolate, linear blend, median) |
|
68 |
+minor cleanups (removed some outcommented stuff) |
|
66 | 69 |
0.1.3 |
67 | 70 |
bugfixes: last 3 lines not brightness/contrast corrected |
68 | 71 |
brightness statistics messed up with initial black pic |
... | ... |
@@ -194,13 +201,11 @@ static inline void prefetcht2(void *p) |
194 | 194 |
* Check if the middle 8x8 Block in the given 8x10 block is flat |
195 | 195 |
*/ |
196 | 196 |
static inline int isVertDC(uint8_t src[], int stride){ |
197 |
-// return true; |
|
198 | 197 |
int numEq= 0; |
199 | 198 |
int y; |
200 | 199 |
src+= stride; // src points to begin of the 8x8 Block |
201 | 200 |
#ifdef HAVE_MMX |
202 | 201 |
asm volatile( |
203 |
-// "int $3 \n\t" |
|
204 | 202 |
"pushl %1\n\t" |
205 | 203 |
"movq b7E, %%mm7 \n\t" // mm7 = 0x7F |
206 | 204 |
"movq b7C, %%mm6 \n\t" // mm6 = 0x7D |
... | ... |
@@ -1577,9 +1582,9 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP |
1577 | 1577 |
} |
1578 | 1578 |
|
1579 | 1579 |
/** |
1580 |
- * Do a horizontal low pass filter on the 8x8 block |
|
1580 |
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) |
|
1581 | 1581 |
* useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
1582 |
- * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version) |
|
1582 |
+ * useing the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) |
|
1583 | 1583 |
*/ |
1584 | 1584 |
static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
1585 | 1585 |
{ |
... | ... |
@@ -1635,14 +1640,6 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
1635 | 1635 |
*/ |
1636 | 1636 |
// approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 |
1637 | 1637 |
/* |
1638 |
- 31 |
|
1639 |
- 121 |
|
1640 |
- 121 |
|
1641 |
- 121 |
|
1642 |
- 121 |
|
1643 |
- 121 |
|
1644 |
- 121 |
|
1645 |
- 13 |
|
1646 | 1638 |
Implemented Exact 7-Tap |
1647 | 1639 |
9421 A321 |
1648 | 1640 |
36421 64321 |
... | ... |
@@ -1654,6 +1651,7 @@ Implemented Exact 7-Tap |
1654 | 1654 |
1249 123A |
1655 | 1655 |
|
1656 | 1656 |
*/ |
1657 |
+ |
|
1657 | 1658 |
#ifdef HAVE_MMX2 |
1658 | 1659 |
#define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ |
1659 | 1660 |
"movq %%mm0, %%mm1 \n\t"\ |
... | ... |
@@ -1680,12 +1678,12 @@ Implemented Exact 7-Tap |
1680 | 1680 |
#define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ |
1681 | 1681 |
"movq %%mm0, %%mm1 \n\t"\ |
1682 | 1682 |
"movq %%mm0, %%mm2 \n\t"\ |
1683 |
- "movq %%mm0, %%mm3 \n\t"\ |
|
1684 |
- "movq %%mm0, %%mm4 \n\t"\ |
|
1683 |
+ "movd -4(%0), %%mm3 \n\t" /*0001000*/\ |
|
1684 |
+ "movd 8(%0), %%mm4 \n\t" /*0001000*/\ |
|
1685 | 1685 |
"psllq $8, %%mm1 \n\t"\ |
1686 | 1686 |
"psrlq $8, %%mm2 \n\t"\ |
1687 |
- "pand bm00000001, %%mm3 \n\t"\ |
|
1688 |
- "pand bm10000000, %%mm4 \n\t"\ |
|
1687 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
1688 |
+ "psllq $56, %%mm4 \n\t"\ |
|
1689 | 1689 |
"por %%mm3, %%mm1 \n\t"\ |
1690 | 1690 |
"por %%mm4, %%mm2 \n\t"\ |
1691 | 1691 |
PAVGB(%%mm2, %%mm1)\ |
... | ... |
@@ -1708,7 +1706,80 @@ Implemented Exact 7-Tap |
1708 | 1708 |
"movd %%mm0, 4(%0) \n\t" |
1709 | 1709 |
#endif |
1710 | 1710 |
|
1711 |
-#define HLP(i) HLP3(i) |
|
1711 |
+/* uses the 7-Tap Filter: 1112111 */ |
|
1712 |
+#define NEW_HLP(i)\ |
|
1713 |
+ "movq " #i "(%%eax), %%mm0 \n\t"\ |
|
1714 |
+ "movq %%mm0, %%mm1 \n\t"\ |
|
1715 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
1716 |
+ "movd -4(%0), %%mm3 \n\t" /*0001000*/\ |
|
1717 |
+ "movd 8(%0), %%mm4 \n\t" /*0001000*/\ |
|
1718 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1719 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1720 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
1721 |
+ "psllq $56, %%mm4 \n\t"\ |
|
1722 |
+ "por %%mm3, %%mm1 \n\t"\ |
|
1723 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1724 |
+ "movq %%mm1, %%mm5 \n\t"\ |
|
1725 |
+ PAVGB(%%mm2, %%mm1)\ |
|
1726 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1727 |
+ "psllq $8, %%mm5 \n\t"\ |
|
1728 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1729 |
+ "por %%mm3, %%mm5 \n\t"\ |
|
1730 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1731 |
+ "movq %%mm5, %%mm1 \n\t"\ |
|
1732 |
+ PAVGB(%%mm2, %%mm5)\ |
|
1733 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1734 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1735 |
+ "por %%mm3, %%mm1 \n\t"\ |
|
1736 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1737 |
+ PAVGB(%%mm2, %%mm1)\ |
|
1738 |
+ PAVGB(%%mm1, %%mm5)\ |
|
1739 |
+ PAVGB(%%mm5, %%mm0)\ |
|
1740 |
+ "movd %%mm0, (%0) \n\t"\ |
|
1741 |
+ "psrlq $32, %%mm0 \n\t"\ |
|
1742 |
+ "movd %%mm0, 4(%0) \n\t" |
|
1743 |
+ |
|
1744 |
+/* uses the 9-Tap Filter: 112242211 */ |
|
1745 |
+#define NEW_HLP2(i)\ |
|
1746 |
+ "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ |
|
1747 |
+ "movq %%mm0, %%mm1 \n\t" /*0001000*/\ |
|
1748 |
+ "movq %%mm0, %%mm2 \n\t" /*0001000*/\ |
|
1749 |
+ "movd -4(%0), %%mm3 \n\t" /*0001000*/\ |
|
1750 |
+ "movd 8(%0), %%mm4 \n\t" /*0001000*/\ |
|
1751 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1752 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1753 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
1754 |
+ "psllq $56, %%mm4 \n\t"\ |
|
1755 |
+ "por %%mm3, %%mm1 \n\t" /*0010000*/\ |
|
1756 |
+ "por %%mm4, %%mm2 \n\t" /*0000100*/\ |
|
1757 |
+ "movq %%mm1, %%mm5 \n\t" /*0010000*/\ |
|
1758 |
+ PAVGB(%%mm2, %%mm1) /*0010100*/\ |
|
1759 |
+ PAVGB(%%mm1, %%mm0) /*0012100*/\ |
|
1760 |
+ "psllq $8, %%mm5 \n\t"\ |
|
1761 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1762 |
+ "por %%mm3, %%mm5 \n\t" /*0100000*/\ |
|
1763 |
+ "por %%mm4, %%mm2 \n\t" /*0000010*/\ |
|
1764 |
+ "movq %%mm5, %%mm1 \n\t" /*0100000*/\ |
|
1765 |
+ PAVGB(%%mm2, %%mm5) /*0100010*/\ |
|
1766 |
+ "psllq $8, %%mm1 \n\t"\ |
|
1767 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1768 |
+ "por %%mm3, %%mm1 \n\t" /*1000000*/\ |
|
1769 |
+ "por %%mm4, %%mm2 \n\t" /*0000001*/\ |
|
1770 |
+ "movq %%mm1, %%mm6 \n\t" /*1000000*/\ |
|
1771 |
+ PAVGB(%%mm2, %%mm1) /*1000001*/\ |
|
1772 |
+ "psllq $8, %%mm6 \n\t"\ |
|
1773 |
+ "psrlq $8, %%mm2 \n\t"\ |
|
1774 |
+ "por %%mm3, %%mm6 \n\t"/*100000000*/\ |
|
1775 |
+ "por %%mm4, %%mm2 \n\t"/*000000001*/\ |
|
1776 |
+ PAVGB(%%mm2, %%mm6) /*100000001*/\ |
|
1777 |
+ PAVGB(%%mm6, %%mm1) /*110000011*/\ |
|
1778 |
+ PAVGB(%%mm1, %%mm5) /*112000211*/\ |
|
1779 |
+ PAVGB(%%mm5, %%mm0) /*112242211*/\ |
|
1780 |
+ "movd %%mm0, (%0) \n\t"\ |
|
1781 |
+ "psrlq $32, %%mm0 \n\t"\ |
|
1782 |
+ "movd %%mm0, 4(%0) \n\t" |
|
1783 |
+ |
|
1784 |
+#define HLP(i) NEW_HLP(i) |
|
1712 | 1785 |
|
1713 | 1786 |
HLP(0) |
1714 | 1787 |
"addl %1, %0 \n\t" |
... | ... |
@@ -1828,6 +1899,363 @@ FIND_MIN_MAX(%%ebx, %1, 2) |
1828 | 1828 |
#endif |
1829 | 1829 |
} |
1830 | 1830 |
|
1831 |
+/** |
|
1832 |
+ * Deinterlaces the given block |
|
1833 |
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
|
1834 |
+ */ |
|
1835 |
+static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) |
|
1836 |
+{ |
|
1837 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1838 |
+ asm volatile( |
|
1839 |
+ "leal (%0, %1), %%eax \n\t" |
|
1840 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1841 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1842 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1843 |
+ |
|
1844 |
+ "movq (%0), %%mm0 \n\t" |
|
1845 |
+ "movq (%%eax, %1), %%mm1 \n\t" |
|
1846 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1847 |
+ "movq %%mm0, (%%eax) \n\t" |
|
1848 |
+ "movq (%0, %1, 4), %%mm0 \n\t" |
|
1849 |
+ PAVGB(%%mm0, %%mm1)\ |
|
1850 |
+ "movq %%mm1, (%%eax, %1, 2) \n\t" |
|
1851 |
+ "movq (%%ebx, %1), %%mm1 \n\t" |
|
1852 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1853 |
+ "movq %%mm0, (%%ebx) \n\t" |
|
1854 |
+ "movq (%0, %1, 8), %%mm0 \n\t" |
|
1855 |
+ PAVGB(%%mm0, %%mm1)\ |
|
1856 |
+ "movq %%mm1, (%%ebx, %1, 2) \n\t" |
|
1857 |
+ |
|
1858 |
+ : : "r" (src), "r" (stride) |
|
1859 |
+ : "%eax", "%ebx" |
|
1860 |
+ ); |
|
1861 |
+#else |
|
1862 |
+ int x; |
|
1863 |
+ for(x=0; x<8; x++) |
|
1864 |
+ { |
|
1865 |
+ src[stride] = (src[0] + src[stride*2])>>1; |
|
1866 |
+ src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
|
1867 |
+ src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
|
1868 |
+ src[stride*7] = (src[stride*6] + src[stride*8])>>1; |
|
1869 |
+ src++; |
|
1870 |
+ } |
|
1871 |
+#endif |
|
1872 |
+} |
|
1873 |
+ |
|
1874 |
+/** |
|
1875 |
+ * Deinterlaces the given block |
|
1876 |
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block |
|
1877 |
+ */ |
|
1878 |
+static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) |
|
1879 |
+{ |
|
1880 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1881 |
+ asm volatile( |
|
1882 |
+ "leal (%0, %1), %%eax \n\t" |
|
1883 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1884 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1885 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1886 |
+ |
|
1887 |
+ "movq (%0), %%mm0 \n\t" |
|
1888 |
+ "movq (%%eax, %1), %%mm1 \n\t" |
|
1889 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1890 |
+ "movq %%mm0, (%%eax) \n\t" |
|
1891 |
+ "movq (%0, %1, 4), %%mm0 \n\t" |
|
1892 |
+ PAVGB(%%mm0, %%mm1)\ |
|
1893 |
+ "movq %%mm1, (%%eax, %1, 2) \n\t" |
|
1894 |
+ "movq (%%ebx, %1), %%mm1 \n\t" |
|
1895 |
+ PAVGB(%%mm1, %%mm0)\ |
|
1896 |
+ "movq %%mm0, (%%ebx) \n\t" |
|
1897 |
+ "movq %%mm1, (%%ebx, %1, 2) \n\t" |
|
1898 |
+ |
|
1899 |
+ |
|
1900 |
+ : : "r" (src), "r" (stride) |
|
1901 |
+ : "%eax", "%ebx" |
|
1902 |
+ ); |
|
1903 |
+#else |
|
1904 |
+ int x; |
|
1905 |
+ for(x=0; x<8; x++) |
|
1906 |
+ { |
|
1907 |
+ src[stride] = (src[0] + src[stride*2])>>1; |
|
1908 |
+ src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
|
1909 |
+ src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
|
1910 |
+ src[stride*7] = src[stride*6]; |
|
1911 |
+ src++; |
|
1912 |
+ } |
|
1913 |
+#endif |
|
1914 |
+} |
|
1915 |
+ |
|
1916 |
+/** |
|
1917 |
+ * Deinterlaces the given block |
|
1918 |
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
|
1919 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
1920 |
+ */ |
|
1921 |
+static inline void deInterlaceBlendLinear(uint8_t src[], int stride) |
|
1922 |
+{ |
|
1923 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1924 |
+ asm volatile( |
|
1925 |
+ "leal (%0, %1), %%eax \n\t" |
|
1926 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1927 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1928 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1929 |
+ |
|
1930 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
1931 |
+ "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
1932 |
+ PAVGB(%%mm1, %%mm0) // L0+L2 |
|
1933 |
+ "movq (%%eax), %%mm2 \n\t" // L1 |
|
1934 |
+ PAVGB(%%mm2, %%mm0) |
|
1935 |
+ "movq %%mm0, (%0) \n\t" |
|
1936 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
1937 |
+ PAVGB(%%mm0, %%mm2) // L1+L3 |
|
1938 |
+ PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
1939 |
+ "movq %%mm2, (%%eax) \n\t" |
|
1940 |
+ "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
1941 |
+ PAVGB(%%mm2, %%mm1) // L2+L4 |
|
1942 |
+ PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
1943 |
+ "movq %%mm1, (%%eax, %1) \n\t" |
|
1944 |
+ "movq (%%ebx), %%mm1 \n\t" // L5 |
|
1945 |
+ PAVGB(%%mm1, %%mm0) // L3+L5 |
|
1946 |
+ PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
1947 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
1948 |
+ "movq (%%ebx, %1), %%mm0 \n\t" // L6 |
|
1949 |
+ PAVGB(%%mm0, %%mm2) // L4+L6 |
|
1950 |
+ PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
1951 |
+ "movq %%mm2, (%0, %1, 4) \n\t" |
|
1952 |
+ "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 |
|
1953 |
+ PAVGB(%%mm2, %%mm1) // L5+L7 |
|
1954 |
+ PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
|
1955 |
+ "movq %%mm1, (%%ebx) \n\t" |
|
1956 |
+ "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
|
1957 |
+ PAVGB(%%mm1, %%mm0) // L6+L8 |
|
1958 |
+ PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
|
1959 |
+ "movq %%mm0, (%%ebx, %1) \n\t" |
|
1960 |
+ "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9 |
|
1961 |
+ PAVGB(%%mm0, %%mm2) // L7+L9 |
|
1962 |
+ PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
|
1963 |
+ "movq %%mm2, (%%ebx, %1, 2) \n\t" |
|
1964 |
+ |
|
1965 |
+ |
|
1966 |
+ : : "r" (src), "r" (stride) |
|
1967 |
+ : "%eax", "%ebx" |
|
1968 |
+ ); |
|
1969 |
+#else |
|
1970 |
+ int x; |
|
1971 |
+ for(x=0; x<8; x++) |
|
1972 |
+ { |
|
1973 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
1974 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
1975 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
1976 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
1977 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
1978 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
1979 |
+ src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
1980 |
+ src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
1981 |
+ src++; |
|
1982 |
+ } |
|
1983 |
+#endif |
|
1984 |
+} |
|
1985 |
+ |
|
1986 |
+/** |
|
1987 |
+ * Deinterlaces the given block |
|
1988 |
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block |
|
1989 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
1990 |
+ */ |
|
1991 |
+static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride) |
|
1992 |
+{ |
|
1993 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1994 |
+ asm volatile( |
|
1995 |
+ "leal (%0, %1), %%eax \n\t" |
|
1996 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
1997 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1998 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
1999 |
+ |
|
2000 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2001 |
+ "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
2002 |
+ PAVGB(%%mm1, %%mm0) // L0+L2 |
|
2003 |
+ "movq (%%eax), %%mm2 \n\t" // L1 |
|
2004 |
+ PAVGB(%%mm2, %%mm0) |
|
2005 |
+ "movq %%mm0, (%0) \n\t" |
|
2006 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
2007 |
+ PAVGB(%%mm0, %%mm2) // L1+L3 |
|
2008 |
+ PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
2009 |
+ "movq %%mm2, (%%eax) \n\t" |
|
2010 |
+ "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
2011 |
+ PAVGB(%%mm2, %%mm1) // L2+L4 |
|
2012 |
+ PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
2013 |
+ "movq %%mm1, (%%eax, %1) \n\t" |
|
2014 |
+ "movq (%%ebx), %%mm1 \n\t" // L5 |
|
2015 |
+ PAVGB(%%mm1, %%mm0) // L3+L5 |
|
2016 |
+ PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
2017 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
2018 |
+ "movq (%%ebx, %1), %%mm0 \n\t" // L6 |
|
2019 |
+ PAVGB(%%mm0, %%mm2) // L4+L6 |
|
2020 |
+ PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
2021 |
+ "movq %%mm2, (%0, %1, 4) \n\t" |
|
2022 |
+ "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 |
|
2023 |
+ PAVGB(%%mm2, %%mm1) // L5+L7 |
|
2024 |
+ PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
|
2025 |
+ "movq %%mm1, (%%ebx) \n\t" |
|
2026 |
+ PAVGB(%%mm2, %%mm0) // L7 + L8 |
|
2027 |
+ "movq %%mm0, (%%ebx, %1) \n\t" |
|
2028 |
+ "movq %%mm0, (%%ebx, %1, 2) \n\t" |
|
2029 |
+ |
|
2030 |
+ : : "r" (src), "r" (stride) |
|
2031 |
+ : "%eax", "%ebx" |
|
2032 |
+ ); |
|
2033 |
+#else |
|
2034 |
+ int x; |
|
2035 |
+ for(x=0; x<8; x++) |
|
2036 |
+ { |
|
2037 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
2038 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
2039 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
2040 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
2041 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
2042 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
2043 |
+ src[stride*6] = (src[stride*6] + src[stride*7])>>1; |
|
2044 |
+ src[stride*7] = src[stride*6]; |
|
2045 |
+ src++; |
|
2046 |
+ } |
|
2047 |
+#endif |
|
2048 |
+} |
|
2049 |
+ |
|
2050 |
+/** |
|
2051 |
+ * Deinterlaces the given block |
|
2052 |
+ * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
|
2053 |
+ */ |
|
2054 |
+static inline void deInterlaceMedian(uint8_t src[], int stride) |
|
2055 |
+{ |
|
2056 |
+#if defined (HAVE_MMX2) |
|
2057 |
+ asm volatile( |
|
2058 |
+ "leal (%0, %1), %%eax \n\t" |
|
2059 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
2060 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
2061 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
2062 |
+ |
|
2063 |
+ "movq (%0), %%mm0 \n\t" // |
|
2064 |
+ "movq (%%eax, %1), %%mm2 \n\t" // |
|
2065 |
+ "movq (%%eax), %%mm1 \n\t" // |
|
2066 |
+ "movq %%mm0, %%mm3 \n\t" |
|
2067 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2068 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2069 |
+ "pmaxub %%mm2, %%mm1 \n\t" // |
|
2070 |
+ "pminub %%mm1, %%mm0 \n\t" |
|
2071 |
+ "movq %%mm0, (%%eax) \n\t" |
|
2072 |
+ |
|
2073 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // |
|
2074 |
+ "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
2075 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2076 |
+ "pmaxub %%mm1, %%mm2 \n\t" // |
|
2077 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2078 |
+ "pmaxub %%mm0, %%mm1 \n\t" // |
|
2079 |
+ "pminub %%mm1, %%mm2 \n\t" |
|
2080 |
+ "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
2081 |
+ |
|
2082 |
+ "movq (%%ebx), %%mm2 \n\t" // |
|
2083 |
+ "movq (%%ebx, %1), %%mm1 \n\t" // |
|
2084 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2085 |
+ "pmaxub %%mm0, %%mm2 \n\t" // |
|
2086 |
+ "pminub %%mm3, %%mm0 \n\t" // |
|
2087 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2088 |
+ "pminub %%mm0, %%mm2 \n\t" |
|
2089 |
+ "movq %%mm2, (%%ebx) \n\t" |
|
2090 |
+ |
|
2091 |
+ "movq (%%ebx, %1, 2), %%mm2 \n\t" // |
|
2092 |
+ "movq (%0, %1, 8), %%mm0 \n\t" // |
|
2093 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2094 |
+ "pmaxub %%mm0, %%mm2 \n\t" // |
|
2095 |
+ "pminub %%mm3, %%mm0 \n\t" // |
|
2096 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2097 |
+ "pminub %%mm0, %%mm2 \n\t" |
|
2098 |
+ "movq %%mm2, (%%ebx, %1, 2) \n\t" |
|
2099 |
+ |
|
2100 |
+ |
|
2101 |
+ : : "r" (src), "r" (stride) |
|
2102 |
+ : "%eax", "%ebx" |
|
2103 |
+ ); |
|
2104 |
+#else |
|
2105 |
+ //FIXME |
|
2106 |
+ int x; |
|
2107 |
+ for(x=0; x<8; x++) |
|
2108 |
+ { |
|
2109 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
2110 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
2111 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
2112 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
2113 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
2114 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
2115 |
+ src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
2116 |
+ src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
2117 |
+ src++; |
|
2118 |
+ } |
|
2119 |
+#endif |
|
2120 |
+} |
|
2121 |
+ |
|
2122 |
+/** |
|
2123 |
+ * Deinterlaces the given block |
|
2124 |
+ * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block |
|
2125 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
2126 |
+ */ |
|
2127 |
+static inline void deInterlaceMedianLastRow(uint8_t src[], int stride) |
|
2128 |
+{ |
|
2129 |
+#if defined (HAVE_MMX2) |
|
2130 |
+ asm volatile( |
|
2131 |
+ "leal (%0, %1), %%eax \n\t" |
|
2132 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
2133 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
2134 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
2135 |
+ |
|
2136 |
+ "movq (%0), %%mm0 \n\t" // |
|
2137 |
+ "movq (%%eax, %1), %%mm2 \n\t" // |
|
2138 |
+ "movq (%%eax), %%mm1 \n\t" // |
|
2139 |
+ "movq %%mm0, %%mm3 \n\t" |
|
2140 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2141 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2142 |
+ "pmaxub %%mm2, %%mm1 \n\t" // |
|
2143 |
+ "pminub %%mm1, %%mm0 \n\t" |
|
2144 |
+ "movq %%mm0, (%%eax) \n\t" |
|
2145 |
+ |
|
2146 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // |
|
2147 |
+ "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
2148 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2149 |
+ "pmaxub %%mm1, %%mm2 \n\t" // |
|
2150 |
+ "pminub %%mm3, %%mm1 \n\t" // |
|
2151 |
+ "pmaxub %%mm0, %%mm1 \n\t" // |
|
2152 |
+ "pminub %%mm1, %%mm2 \n\t" |
|
2153 |
+ "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
2154 |
+ |
|
2155 |
+ "movq (%%ebx), %%mm2 \n\t" // |
|
2156 |
+ "movq (%%ebx, %1), %%mm1 \n\t" // |
|
2157 |
+ "movq %%mm2, %%mm3 \n\t" |
|
2158 |
+ "pmaxub %%mm0, %%mm2 \n\t" // |
|
2159 |
+ "pminub %%mm3, %%mm0 \n\t" // |
|
2160 |
+ "pmaxub %%mm1, %%mm0 \n\t" // |
|
2161 |
+ "pminub %%mm0, %%mm2 \n\t" |
|
2162 |
+ "movq %%mm2, (%%ebx) \n\t" |
|
2163 |
+ |
|
2164 |
+ "movq %%mm1, (%%ebx, %1, 2) \n\t" |
|
2165 |
+ |
|
2166 |
+ : : "r" (src), "r" (stride) |
|
2167 |
+ : "%eax", "%ebx" |
|
2168 |
+ ); |
|
2169 |
+#else |
|
2170 |
+ //FIXME |
|
2171 |
+ int x; |
|
2172 |
+ for(x=0; x<8; x++) |
|
2173 |
+ { |
|
2174 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
2175 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
2176 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
2177 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
2178 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
2179 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
2180 |
+ src[stride*6] = (src[stride*6] + src[stride*7])>>1; |
|
2181 |
+ src[stride*7] = src[stride*6]; |
|
2182 |
+ src++; |
|
2183 |
+ } |
|
2184 |
+#endif |
|
2185 |
+} |
|
2186 |
+ |
|
2187 |
+ |
|
1831 | 2188 |
#ifdef HAVE_ODIVX_POSTPROCESS |
1832 | 2189 |
#include "../opendivx/postprocess.h" |
1833 | 2190 |
int use_old_pp=0; |
... | ... |
@@ -1841,7 +2269,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri |
1841 | 1841 |
* the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63) |
1842 | 1842 |
* -63 is best quality -1 is worst |
1843 | 1843 |
*/ |
1844 |
-//extern "C"{ |
|
1845 | 1844 |
void postprocess(unsigned char * src[], int src_stride, |
1846 | 1845 |
unsigned char * dst[], int dst_stride, |
1847 | 1846 |
int horizontal_size, int vertical_size, |
... | ... |
@@ -2196,6 +2623,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri |
2196 | 2196 |
blockCopy(vertBlock + dstStride*2, dstStride, |
2197 | 2197 |
vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); |
2198 | 2198 |
|
2199 |
+ if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2200 |
+ deInterlaceInterpolateLinear(dstBlock, dstStride); |
|
2201 |
+ else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2202 |
+ deInterlaceBlendLinear(dstBlock, dstStride); |
|
2203 |
+ else if(mode & MEDIAN_DEINT_FILTER) |
|
2204 |
+ deInterlaceMedian(dstBlock, dstStride); |
|
2205 |
+/* else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2206 |
+ deInterlaceInterpolateCubic(dstBlock, dstStride); |
|
2207 |
+ else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2208 |
+ deInterlaceBlendCubic(dstBlock, dstStride); |
|
2209 |
+*/ |
|
2199 | 2210 |
|
2200 | 2211 |
#ifdef MORE_TIMEING |
2201 | 2212 |
T1= rdtsc(); |
... | ... |
@@ -2226,9 +2664,22 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri |
2226 | 2226 |
#endif |
2227 | 2227 |
} |
2228 | 2228 |
else |
2229 |
+ { |
|
2229 | 2230 |
blockCopy(vertBlock + dstStride*1, dstStride, |
2230 | 2231 |
vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); |
2231 | 2232 |
|
2233 |
+ if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2234 |
+ deInterlaceInterpolateLinearLastRow(dstBlock, dstStride); |
|
2235 |
+ else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2236 |
+ deInterlaceBlendLinearLastRow(dstBlock, dstStride); |
|
2237 |
+ else if(mode & MEDIAN_DEINT_FILTER) |
|
2238 |
+ deInterlaceMedianLastRow(dstBlock, dstStride); |
|
2239 |
+/* else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2240 |
+ deInterlaceInterpolateCubicLastRow(dstBlock, dstStride); |
|
2241 |
+ else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2242 |
+ deInterlaceBlendCubicLastRow(dstBlock, dstStride); |
|
2243 |
+*/ |
|
2244 |
+ } |
|
2232 | 2245 |
|
2233 | 2246 |
if(x - 8 >= 0 && x<width) |
2234 | 2247 |
{ |