Browse code

moving postprocess to ffmpeg/libavcodec

Originally committed as revision 1586 to svn://svn.ffmpeg.org/ffmpeg/trunk
Originally committed as revision 9427 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Originally committed as revision 9428 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

Michael Niedermayer authored on 2003/02/15 06:27:25
Showing 13 changed files
... ...
@@ -60,6 +60,8 @@ mp3lame="no"
60 60
 vorbis="no"
61 61
 a52="yes"
62 62
 a52bin="no"
63
+pp="yes"
64
+shared_pp="no"
63 65
 win32="no"
64 66
 mingw32="no"
65 67
 cygwin="no"
... ...
@@ -281,6 +283,10 @@ for opt do
281 281
   ;;
282 282
   --enable-a52bin) a52bin="yes" ; extralibs="$ldl $extralibs"
283 283
   ;;
284
+  --disable-pp) pp="no"
285
+  ;;
286
+  --enable-shared-pp) shared_pp="yes"
287
+  ;;
284 288
   --enable-mp3lame) mp3lame="yes"
285 289
   ;;
286 290
   --enable-vorbis) vorbis="yes"
... ...
@@ -578,6 +584,8 @@ echo "  --enable-win32           enable win32 cross compile"
578 578
 echo "  --enable-mingw32         enable mingw32 native windows compile"
579 579
 echo "  --disable-a52            disable GPL'ed A52 support [default=no]"
580 580
 echo "  --enable-a52bin          open liba52.so.0 at runtime [default=no]"
581
+echo "  --disable-pp             disable GPL'ed post processing support [default=no]"
582
+echo "  --enable-shared-pp       use libpostproc.so [default=no]"
581 583
 echo "  --enable-shared          build shared libraries [default=no]"
582 584
 echo ""
583 585
 echo "Advanced options (experts only):"
... ...
@@ -631,6 +639,8 @@ echo "mp3lame enabled  $mp3lame"
631 631
 echo "vorbis enabled   $vorbis"
632 632
 echo "a52 support      $a52"
633 633
 echo "a52 dlopened     $a52bin"
634
+echo "pp support       $pp"
635
+echo "shared pp        $shared_pp"
634 636
 echo "Video hooking    $vhook"
635 637
 echo "risky / patent encumbered codecs $risky"
636 638
 
... ...
@@ -754,6 +764,17 @@ if test "$a52" = "yes" ; then
754 754
   fi
755 755
 fi
756 756
 
757
+# PP
758
+if test "$pp" = "yes" ; then
759
+  echo "#define CONFIG_PP 1" >> $TMPH
760
+  echo "CONFIG_PP=yes" >> config.mak
761
+
762
+  if test "$shared_pp" = "yes" ; then
763
+    echo "#define SHARED_PP 1" >> $TMPH
764
+    echo "SHARED_PP=yes" >> config.mak
765
+  fi
766
+fi
767
+
757 768
 # mpeg audio high precision mode
758 769
 if test "$mpegaudio_hp" = "yes" ; then
759 770
   echo "#define CONFIG_MPEGAUDIO_HP 1" >> $TMPH
... ...
@@ -35,6 +35,15 @@ OBJS+= liba52/bit_allocate.o liba52/bitstream.o liba52/downmix.o \
35 35
 endif
36 36
 endif
37 37
 
38
+ifeq ($(CONFIG_PP),yes)
39
+ifeq ($(SHARED_PP),yes)
40
+EXTRALIBS += -lpostproc
41
+else
42
+# LIBS += libpostproc/libpostproc.a ... should be fixed
43
+OBJS += libpostproc/postprocess.o
44
+endif
45
+endif
46
+
38 47
 ifeq ($(CONFIG_MP3LAME),yes)
39 48
 OBJS += mp3lameaudio.o
40 49
 EXTRALIBS += -lmp3lame
... ...
@@ -125,6 +134,9 @@ $(SLIB): $(OBJS)
125 125
 
126 126
 dsputil.o: dsputil.c dsputil.h
127 127
 
128
+libpostproc/libpostproc.a:
129
+	$(MAKE) -C libpostproc
130
+
128 131
 %.o: %.c
129 132
 	$(CC) $(CFLAGS) -c -o $@ $< 
130 133
 
131 134
new file mode 100644
... ...
@@ -0,0 +1,64 @@
0
+
1
# Builds the postprocessing library: always the static archive, plus the
# shared object when configure wrote SHARED_PP=yes to config.mak.
include ../../config.mak

ifeq ($(SHARED_PP),yes)
SPPLIB = libpostproc.so
SPPVERSION = 0.0.1
# Name recorded in the shared object and looked up by the runtime linker.
SPPSONAME = $(SPPLIB).0
endif
PPLIB = libpostproc.a

PPOBJS=postprocess.o
SPPOBJS=postprocess_pic.o

CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
# -I/usr/X11R6/include/

.SUFFIXES: .c .o

# None of these names is a file; without this a stray file called "clean"
# or "install" would silently disable the corresponding target.
.PHONY: all clean distclean dep depend install

.c.o:
	$(CC) -c $(CFLAGS) -I.. -I../.. -o $@ $<

# $(SPPLIB) is empty unless SHARED_PP=yes, so only the archive is built then.
all:    $(PPLIB) $(SPPLIB)

clean:
	rm -f *.o *.a *~ *.so

distclean:
	rm -f Makefile.bak *.o *.a *~ *.so .depend

dep:    depend

depend:
	$(CC) -MM $(CFLAGS) postprocess.c 1>.depend

ifeq ($(SHARED_PP),yes)
# Position-independent build of the same source for the shared object.
postprocess_pic.o: postprocess.c
	$(CC) -c $(CFLAGS) -fomit-frame-pointer -fPIC -DPIC -I.. -I../.. -o $@ $<

$(SPPLIB): $(SPPOBJS)
	$(CC) -shared -Wl,-soname,$(SPPSONAME) \
	-o $(SPPLIB) $(SPPOBJS)
endif

$(PPLIB): $(PPOBJS)
	$(AR) r $(PPLIB) $(PPOBJS)

install: all
ifeq ($(SHARED_PP),yes)
	install -d $(prefix)/lib
	install -s -m 755 $(SPPLIB) $(prefix)/lib/$(SPPLIB).$(SPPVERSION)
	ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPLIB)
	# Create the soname link explicitly: ldconfig below is best-effort and
	# does not scan prefixes outside its configured directories.
	ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPSONAME)
	ldconfig || true
	mkdir -p $(prefix)/include/postproc
	install -m 644 postprocess.h $(prefix)/include/postproc/postprocess.h
endif


#
# include dependency files if they exist
#
ifneq ($(wildcard .depend),)
include .depend
endif
0 64
new file mode 100644
... ...
@@ -0,0 +1,19 @@
0
+/* mangle.h - This file has some CPP macros to deal with different symbol
1
+ * mangling across binary formats.
2
+ * (c)2002 by Felix Buenemann <atmosfear at users.sourceforge.net>
3
+ * File licensed under the GPL, see http://www.fsf.org/ for more info.
4
+ */
5
+
6
#ifndef __MANGLE_H
#define __MANGLE_H

/*
 * MANGLE(a) expands to the name of symbol a as a string literal.
 * On Cygwin, OS/2 and non-ELF OpenBSD the name gets a leading underscore;
 * everywhere else it is used unchanged.
 * Feel free to add more platforms to the underscore list, e.g. a.out.
 */
#if !defined(__CYGWIN__) && !defined(__OS2__) && \
    !(defined(__OpenBSD__) && !defined(__ELF__))
#define MANGLE(a) #a
#else
#define MANGLE(a) "_" #a
#endif

#endif /* !__MANGLE_H */
18
+
0 19
new file mode 100644
... ...
@@ -0,0 +1,875 @@
0
+/*
1
+    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
2
+
3
+    This program is free software; you can redistribute it and/or modify
4
+    it under the terms of the GNU General Public License as published by
5
+    the Free Software Foundation; either version 2 of the License, or
6
+    (at your option) any later version.
7
+
8
+    This program is distributed in the hope that it will be useful,
9
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+    GNU General Public License for more details.
12
+
13
+    You should have received a copy of the GNU General Public License
14
+    along with this program; if not, write to the Free Software
15
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
+*/
17
+
18
+/*
19
+			C	MMX	MMX2	3DNow
20
+isVertDC		Ec	Ec
21
+isVertMinMaxOk		Ec	Ec
22
+doVertLowPass		E		e	e
23
+doVertDefFilter		Ec	Ec	e	e
24
+isHorizDC		Ec	Ec
25
+isHorizMinMaxOk		a	E
26
+doHorizLowPass		E		e	e
27
+doHorizDefFilter	Ec	Ec	e	e
28
+deRing			E		e	e*
29
+Vertical RKAlgo1	E		a	a
30
+Horizontal RKAlgo1			a	a
31
+Vertical X1#		a		E	E
32
+Horizontal X1#		a		E	E
33
+LinIpolDeinterlace	e		E	E*
34
+CubicIpolDeinterlace	a		e	e*
35
+LinBlendDeinterlace	e		E	E*
36
+MedianDeinterlace#	E	Ec	Ec
37
+TempDeNoiser#		E		e	e
38
+
39
+* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work
40
+# more or less self-invented filters, so the exactness isn't too meaningful
41
+E = Exact implementation
42
+e = almost exact implementation (slightly different rounding, ...)
43
+a = alternative / approximate impl
44
+c = checked against the other implementations (-vo md5)
45
+*/
46
+
47
+/*
48
+TODO:
49
+reduce the time wasted on the mem transfer
50
+unroll stuff if instructions depend too much on the prior one
51
+move YScale thing to the end instead of fixing QP
52
+write a faster and higher quality deblocking filter :)
53
+make the mainloop more flexible (variable number of blocks at once
54
+	(the if/else stuff per block is slowing things down)
55
+compare the quality & speed of all filters
56
+split this huge file
57
+optimize c versions
58
+try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
59
+...
60
+*/
61
+
62
+//Changelog: use the CVS log
63
+
64
+#include "config.h"
65
+#include <inttypes.h>
66
+#include <stdio.h>
67
+#include <stdlib.h>
68
+#include <string.h>
69
+#ifdef HAVE_MALLOC_H
70
+#include <malloc.h>
71
+#endif
72
+//#undef HAVE_MMX2
73
+//#define HAVE_3DNOW
74
+//#undef HAVE_MMX
75
+//#undef ARCH_X86
76
+//#define DEBUG_BRIGHTNESS
77
+#ifdef USE_FASTMEMCPY
78
+#include "libvo/fastmemcpy.h"
79
+#endif
80
+#include "postprocess.h"
81
+#include "postprocess_internal.h"
82
+
83
+#include "mangle.h" //FIXME should be supressed
84
+
85
+#define MIN(a,b) ((a) > (b) ? (b) : (a))
86
+#define MAX(a,b) ((a) < (b) ? (b) : (a))
87
+#define ABS(a) ((a) > 0 ? (a) : (-(a)))
88
+#define SIGN(a) ((a) > 0 ? 1 : -1)
89
+
90
+#define GET_MODE_BUFFER_SIZE 500
91
+#define OPTIONS_ARRAY_SIZE 10
92
+#define BLOCK_SIZE 8
93
+#define TEMP_STRIDE 8
94
+//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
95
+
96
+#ifdef ARCH_X86
97
+static uint64_t __attribute__((aligned(8))) w05=		0x0005000500050005LL;
98
+static uint64_t __attribute__((aligned(8))) w20=		0x0020002000200020LL;
99
+static uint64_t __attribute__((aligned(8))) b00= 		0x0000000000000000LL;
100
+static uint64_t __attribute__((aligned(8))) b01= 		0x0101010101010101LL;
101
+static uint64_t __attribute__((aligned(8))) b02= 		0x0202020202020202LL;
102
+static uint64_t __attribute__((aligned(8))) b08= 		0x0808080808080808LL;
103
+static uint64_t __attribute__((aligned(8))) b80= 		0x8080808080808080LL;
104
+#endif
105
+
106
+static int verbose= 0;
107
+
108
+static const int deringThreshold= 20;
109
+
110
+
111
+static struct PPFilter filters[]=
112
+{
113
+	{"hb", "hdeblock", 		1, 1, 3, H_DEBLOCK},
114
+	{"vb", "vdeblock", 		1, 2, 4, V_DEBLOCK},
115
+/*	{"hr", "rkhdeblock", 		1, 1, 3, H_RK1_FILTER},
116
+	{"vr", "rkvdeblock", 		1, 2, 4, V_RK1_FILTER},*/
117
+	{"h1", "x1hdeblock", 		1, 1, 3, H_X1_FILTER},
118
+	{"v1", "x1vdeblock", 		1, 2, 4, V_X1_FILTER},
119
+	{"dr", "dering", 		1, 5, 6, DERING},
120
+	{"al", "autolevels", 		0, 1, 2, LEVEL_FIX},
121
+	{"lb", "linblenddeint", 	1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
122
+	{"li", "linipoldeint", 		1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
123
+	{"ci", "cubicipoldeint",	1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
124
+	{"md", "mediandeint", 		1, 1, 4, MEDIAN_DEINT_FILTER},
125
+	{"fd", "ffmpegdeint", 		1, 1, 4, FFMPEG_DEINT_FILTER},
126
+	{"tn", "tmpnoise", 		1, 7, 8, TEMP_NOISE_FILTER},
127
+	{"fq", "forcequant", 		1, 0, 0, FORCE_QUANT},
128
+	{NULL, NULL,0,0,0,0} //End Marker
129
+};
130
+
131
+static char *replaceTable[]=
132
+{
133
+	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
134
+	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
135
+	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
136
+	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
137
+	NULL //End Marker
138
+};
139
+
140
+#ifdef ARCH_X86
141
+static inline void unusedVariableWarningFixer()
142
+{
143
+	if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0;
144
+}
145
+#endif
146
+
147
+
148
+#ifdef ARCH_X86
149
+static inline void prefetchnta(void *p)
150
+{
151
+	asm volatile(	"prefetchnta (%0)\n\t"
152
+		: : "r" (p)
153
+	);
154
+}
155
+
156
+static inline void prefetcht0(void *p)
157
+{
158
+	asm volatile(	"prefetcht0 (%0)\n\t"
159
+		: : "r" (p)
160
+	);
161
+}
162
+
163
+static inline void prefetcht1(void *p)
164
+{
165
+	asm volatile(	"prefetcht1 (%0)\n\t"
166
+		: : "r" (p)
167
+	);
168
+}
169
+
170
+static inline void prefetcht2(void *p)
171
+{
172
+	asm volatile(	"prefetcht2 (%0)\n\t"
173
+		: : "r" (p)
174
+	);
175
+}
176
+#endif
177
+
178
+// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing
179
+
180
+/**
181
+ * Check if the given 8x8 Block is mostly "flat"
182
+ */
183
+static inline int isHorizDC(uint8_t src[], int stride, PPContext *c)
184
+{
185
+	int numEq= 0;
186
+	int y;
187
+	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
188
+	const int dcThreshold= dcOffset*2 + 1;
189
+	for(y=0; y<BLOCK_SIZE; y++)
190
+	{
191
+		if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
192
+		if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
193
+		if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
194
+		if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
195
+		if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
196
+		if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
197
+		if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
198
+		src+= stride;
199
+	}
200
+	return numEq > c->ppMode.flatnessThreshold;
201
+}
202
+
203
+/**
204
+ * Check if the middle 8x8 Block in the given 8x16 block is flat
205
+ */
206
+static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
207
+	int numEq= 0;
208
+	int y;
209
+	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
210
+	const int dcThreshold= dcOffset*2 + 1;
211
+	src+= stride*4; // src points to begin of the 8x8 Block
212
+	for(y=0; y<BLOCK_SIZE-1; y++)
213
+	{
214
+		if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
215
+		if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
216
+		if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
217
+		if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
218
+		if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
219
+		if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
220
+		if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
221
+		if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
222
+		src+= stride;
223
+	}
224
+	return numEq > c->ppMode.flatnessThreshold;
225
+}
226
+
227
/**
 * Check that the first and last pixel of the 8-pixel row at src differ by
 * at most 2*QP. Only that single row is examined; stride is not used.
 * @return 1 if the difference is within the limit, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int edgeDiff= abs(src[0] - src[7]);

	return edgeDiff <= 2*QP;
}
233
+
234
+static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
235
+{
236
+	int y;
237
+	for(y=0; y<BLOCK_SIZE; y++)
238
+	{
239
+		const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
240
+
241
+		if(ABS(middleEnergy) < 8*QP)
242
+		{
243
+			const int q=(dst[3] - dst[4])/2;
244
+			const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
245
+			const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
246
+
247
+			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
248
+			d= MAX(d, 0);
249
+
250
+			d= (5*d + 32) >> 6;
251
+			d*= SIGN(-middleEnergy);
252
+
253
+			if(q>0)
254
+			{
255
+				d= d<0 ? 0 : d;
256
+				d= d>q ? q : d;
257
+			}
258
+			else
259
+			{
260
+				d= d>0 ? 0 : d;
261
+				d= d<q ? q : d;
262
+			}
263
+
264
+        		dst[3]-= d;
265
+	        	dst[4]+= d;
266
+		}
267
+		dst+= stride;
268
+	}
269
+}
270
+
271
+/**
272
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
273
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
274
+ */
275
+static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
276
+{
277
+
278
+	int y;
279
+	for(y=0; y<BLOCK_SIZE; y++)
280
+	{
281
+		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
282
+		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
283
+
284
+		int sums[9];
285
+		sums[0] = first + dst[0];
286
+		sums[1] = dst[0] + dst[1];
287
+		sums[2] = dst[1] + dst[2];
288
+		sums[3] = dst[2] + dst[3];
289
+		sums[4] = dst[3] + dst[4];
290
+		sums[5] = dst[4] + dst[5];
291
+		sums[6] = dst[5] + dst[6];
292
+		sums[7] = dst[6] + dst[7];
293
+		sums[8] = dst[7] + last;
294
+
295
+		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
296
+		dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
297
+		dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
298
+		dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
299
+		dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
300
+		dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
301
+		dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
302
+		dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
303
+
304
+		dst+= stride;
305
+	}
306
+}
307
+
308
+/**
309
+ * Experimental Filter 1 (Horizontal)
310
+ * will not damage linear gradients
311
+ * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
312
+ * can only smooth blocks at the expected locations (it cant smooth them if they did move)
313
+ * MMX2 version does correct clipping C version doesnt
314
+ * not identical with the vertical one
315
+ */
316
+static inline void horizX1Filter(uint8_t *src, int stride, int QP)
317
+{
318
+	int y;
319
+	static uint64_t *lut= NULL;
320
+	if(lut==NULL)
321
+	{
322
+		int i;
323
+		lut= (uint64_t*)memalign(8, 256*8);
324
+		for(i=0; i<256; i++)
325
+		{
326
+			int v= i < 128 ? 2*i : 2*(i-256);
327
+/*
328
+//Simulate 112242211 9-Tap filter
329
+			uint64_t a= (v/16) & 0xFF;
330
+			uint64_t b= (v/8) & 0xFF;
331
+			uint64_t c= (v/4) & 0xFF;
332
+			uint64_t d= (3*v/8) & 0xFF;
333
+*/
334
+//Simulate piecewise linear interpolation
335
+			uint64_t a= (v/16) & 0xFF;
336
+			uint64_t b= (v*3/16) & 0xFF;
337
+			uint64_t c= (v*5/16) & 0xFF;
338
+			uint64_t d= (7*v/16) & 0xFF;
339
+			uint64_t A= (0x100 - a)&0xFF;
340
+			uint64_t B= (0x100 - b)&0xFF;
341
+			uint64_t C= (0x100 - c)&0xFF;
342
+			uint64_t D= (0x100 - c)&0xFF;
343
+
344
+			lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
345
+				(D<<24) | (C<<16) | (B<<8) | (A);
346
+			//lut[i] = (v<<32) | (v<<24);
347
+		}
348
+	}
349
+
350
+	for(y=0; y<BLOCK_SIZE; y++)
351
+	{
352
+		int a= src[1] - src[2];
353
+		int b= src[3] - src[4];
354
+		int c= src[5] - src[6];
355
+
356
+		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
357
+
358
+		if(d < QP)
359
+		{
360
+			int v = d * SIGN(-b);
361
+
362
+			src[1] +=v/8;
363
+			src[2] +=v/4;
364
+			src[3] +=3*v/8;
365
+			src[4] -=3*v/8;
366
+			src[5] -=v/4;
367
+			src[6] -=v/8;
368
+
369
+		}
370
+		src+=stride;
371
+	}
372
+}
373
+
374
+
375
+//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
376
+//Plain C versions
377
+#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
378
+#define COMPILE_C
379
+#endif
380
+
381
+#ifdef ARCH_X86
382
+
383
+#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
384
+#define COMPILE_MMX
385
+#endif
386
+
387
+#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
388
+#define COMPILE_MMX2
389
+#endif
390
+
391
+#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
392
+#define COMPILE_3DNOW
393
+#endif
394
+#endif //ARCH_X86
395
+
396
+#undef HAVE_MMX
397
+#undef HAVE_MMX2
398
+#undef HAVE_3DNOW
399
+#undef ARCH_X86
400
+
401
+#ifdef COMPILE_C
402
+#undef HAVE_MMX
403
+#undef HAVE_MMX2
404
+#undef HAVE_3DNOW
405
+#undef ARCH_X86
406
+#define RENAME(a) a ## _C
407
+#include "postprocess_template.c"
408
+#endif
409
+
410
+//MMX versions
411
+#ifdef COMPILE_MMX
412
+#undef RENAME
413
+#define HAVE_MMX
414
+#undef HAVE_MMX2
415
+#undef HAVE_3DNOW
416
+#define ARCH_X86
417
+#define RENAME(a) a ## _MMX
418
+#include "postprocess_template.c"
419
+#endif
420
+
421
+//MMX2 versions
422
+#ifdef COMPILE_MMX2
423
+#undef RENAME
424
+#define HAVE_MMX
425
+#define HAVE_MMX2
426
+#undef HAVE_3DNOW
427
+#define ARCH_X86
428
+#define RENAME(a) a ## _MMX2
429
+#include "postprocess_template.c"
430
+#endif
431
+
432
+//3DNOW versions
433
+#ifdef COMPILE_3DNOW
434
+#undef RENAME
435
+#define HAVE_MMX
436
+#undef HAVE_MMX2
437
+#define HAVE_3DNOW
438
+#define ARCH_X86
439
+#define RENAME(a) a ## _3DNow
440
+#include "postprocess_template.c"
441
+#endif
442
+
443
+// minor note: the HAVE_xyz macros are messed up after this line, so don't rely on them
444
+
445
+static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
446
+	QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
447
+{
448
+	PPContext *c= (PPContext *)vc;
449
+	PPMode *ppMode= (PPMode *)vm;
450
+	c->ppMode= *ppMode; //FIXME
451
+
452
+	// useing ifs here as they are faster than function pointers allthough the
453
+	// difference wouldnt be messureable here but its much better because
454
+	// someone might exchange the cpu whithout restarting mplayer ;)
455
+#ifdef RUNTIME_CPUDETECT
456
+#ifdef ARCH_X86
457
+	// ordered per speed fasterst first
458
+	if(c->cpuCaps & PP_CPU_CAPS_MMX2)
459
+		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
460
+	else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
461
+		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
462
+	else if(c->cpuCaps & PP_CPU_CAPS_MMX)
463
+		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
464
+	else
465
+		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
466
+#else
467
+		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
468
+#endif
469
+#else //RUNTIME_CPUDETECT
470
+#ifdef HAVE_MMX2
471
+		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
472
+#elif defined (HAVE_3DNOW)
473
+		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
474
+#elif defined (HAVE_MMX)
475
+		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
476
+#else
477
+		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
478
+#endif
479
+#endif //!RUNTIME_CPUDETECT
480
+}
481
+
482
+//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
483
+//	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
484
+
485
+/* -pp Command line Help
486
+*/
487
+char *pp_help=
488
+"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
489
+"long form example:\n"
490
+"vdeblock:autoq/hdeblock:autoq/linblenddeint	default,-vdeblock\n"
491
+"short form example:\n"
492
+"vb:a/hb:a/lb					de,-vb\n"
493
+"more examples:\n"
494
+"tn:64:128:256\n"
495
+"Filters			Options\n"
496
+"short	long name	short	long option	Description\n"
497
+"*	*		a	autoq		cpu power dependant enabler\n"
498
+"			c	chrom		chrominance filtring enabled\n"
499
+"			y	nochrom		chrominance filtring disabled\n"
500
+"hb	hdeblock	(2 Threshold)		horizontal deblocking filter\n"
501
+"	1. difference factor: default=64, higher -> more deblocking\n"
502
+"	2. flatness threshold: default=40, lower -> more deblocking\n"
503
+"			the h & v deblocking filters share these\n"
504
+"			so u cant set different thresholds for h / v\n"
505
+"vb	vdeblock	(2 Threshold)		vertical deblocking filter\n"
506
+"h1	x1hdeblock				Experimental h deblock filter 1\n"
507
+"v1	x1vdeblock				Experimental v deblock filter 1\n"
508
+"dr	dering					Deringing filter\n"
509
+"al	autolevels				automatic brightness / contrast\n"
510
+"			f	fullyrange	stretch luminance to (0..255)\n"
511
+"lb	linblenddeint				linear blend deinterlacer\n"
512
+"li	linipoldeint				linear interpolating deinterlace\n"
513
+"ci	cubicipoldeint				cubic interpolating deinterlacer\n"
514
+"md	mediandeint				median deinterlacer\n"
515
+"fd	ffmpegdeint				ffmpeg deinterlacer\n"
516
+"de	default					hb:a,vb:a,dr:a,al\n"
517
+"fa	fast					h1:a,v1:a,dr:a,al\n"
518
+"tn	tmpnoise	(3 Thresholds)		Temporal Noise Reducer\n"
519
+"			1. <= 2. <= 3.		larger -> stronger filtering\n"
520
+"fq	forceQuant	<quantizer>		Force quantizer\n"
521
+;
522
+
523
+pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
524
+{
525
+	char temp[GET_MODE_BUFFER_SIZE];
526
+	char *p= temp;
527
+	char *filterDelimiters= ",/";
528
+	char *optionDelimiters= ":";
529
+	struct PPMode *ppMode;
530
+	char *filterToken;
531
+
532
+	ppMode= memalign(8, sizeof(PPMode));
533
+	
534
+	ppMode->lumMode= 0;
535
+	ppMode->chromMode= 0;
536
+	ppMode->maxTmpNoise[0]= 700;
537
+	ppMode->maxTmpNoise[1]= 1500;
538
+	ppMode->maxTmpNoise[2]= 3000;
539
+	ppMode->maxAllowedY= 234;
540
+	ppMode->minAllowedY= 16;
541
+	ppMode->baseDcDiff= 256/4;
542
+	ppMode->flatnessThreshold= 56-16;
543
+	ppMode->maxClippedThreshold= 0.01;
544
+	ppMode->error=0;
545
+
546
+	strncpy(temp, name, GET_MODE_BUFFER_SIZE);
547
+
548
+	if(verbose>1) printf("pp: %s\n", name);
549
+
550
+	for(;;){
551
+		char *filterName;
552
+		int q= 1000000; //PP_QUALITY_MAX;
553
+		int chrom=-1;
554
+		char *option;
555
+		char *options[OPTIONS_ARRAY_SIZE];
556
+		int i;
557
+		int filterNameOk=0;
558
+		int numOfUnknownOptions=0;
559
+		int enable=1; //does the user want us to enabled or disabled the filter
560
+
561
+		filterToken= strtok(p, filterDelimiters);
562
+		if(filterToken == NULL) break;
563
+		p+= strlen(filterToken) + 1; // p points to next filterToken
564
+		filterName= strtok(filterToken, optionDelimiters);
565
+		if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
566
+
567
+		if(*filterName == '-')
568
+		{
569
+			enable=0;
570
+			filterName++;
571
+		}
572
+
573
+		for(;;){ //for all options
574
+			option= strtok(NULL, optionDelimiters);
575
+			if(option == NULL) break;
576
+
577
+			if(verbose>1) printf("pp: option: %s\n", option);
578
+			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
579
+			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
580
+			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
581
+			else
582
+			{
583
+				options[numOfUnknownOptions] = option;
584
+				numOfUnknownOptions++;
585
+			}
586
+			if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
587
+		}
588
+		options[numOfUnknownOptions] = NULL;
589
+
590
+		/* replace stuff from the replace Table */
591
+		for(i=0; replaceTable[2*i]!=NULL; i++)
592
+		{
593
+			if(!strcmp(replaceTable[2*i], filterName))
594
+			{
595
+				int newlen= strlen(replaceTable[2*i + 1]);
596
+				int plen;
597
+				int spaceLeft;
598
+
599
+				if(p==NULL) p= temp, *p=0; 	//last filter
600
+				else p--, *p=',';		//not last filter
601
+
602
+				plen= strlen(p);
603
+				spaceLeft= p - temp + plen;
604
+				if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
605
+				{
606
+					ppMode->error++;
607
+					break;
608
+				}
609
+				memmove(p + newlen, p, plen+1);
610
+				memcpy(p, replaceTable[2*i + 1], newlen);
611
+				filterNameOk=1;
612
+			}
613
+		}
614
+
615
+		for(i=0; filters[i].shortName!=NULL; i++)
616
+		{
617
+//			printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
618
+			if(   !strcmp(filters[i].longName, filterName)
619
+			   || !strcmp(filters[i].shortName, filterName))
620
+			{
621
+				ppMode->lumMode &= ~filters[i].mask;
622
+				ppMode->chromMode &= ~filters[i].mask;
623
+
624
+				filterNameOk=1;
625
+				if(!enable) break; // user wants to disable it
626
+
627
+				if(q >= filters[i].minLumQuality)
628
+					ppMode->lumMode|= filters[i].mask;
629
+				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
630
+					if(q >= filters[i].minChromQuality)
631
+						ppMode->chromMode|= filters[i].mask;
632
+
633
+				if(filters[i].mask == LEVEL_FIX)
634
+				{
635
+					int o;
636
+					ppMode->minAllowedY= 16;
637
+					ppMode->maxAllowedY= 234;
638
+					for(o=0; options[o]!=NULL; o++)
639
+					{
640
+						if(  !strcmp(options[o],"fullyrange")
641
+						   ||!strcmp(options[o],"f"))
642
+						{
643
+							ppMode->minAllowedY= 0;
644
+							ppMode->maxAllowedY= 255;
645
+							numOfUnknownOptions--;
646
+						}
647
+					}
648
+				}
649
+				else if(filters[i].mask == TEMP_NOISE_FILTER)
650
+				{
651
+					int o;
652
+					int numOfNoises=0;
653
+
654
+					for(o=0; options[o]!=NULL; o++)
655
+					{
656
+						char *tail;
657
+						ppMode->maxTmpNoise[numOfNoises]=
658
+							strtol(options[o], &tail, 0);
659
+						if(tail!=options[o])
660
+						{
661
+							numOfNoises++;
662
+							numOfUnknownOptions--;
663
+							if(numOfNoises >= 3) break;
664
+						}
665
+					}
666
+				}
667
+				else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
668
+				{
669
+					int o;
670
+
671
+					for(o=0; options[o]!=NULL && o<2; o++)
672
+					{
673
+						char *tail;
674
+						int val= strtol(options[o], &tail, 0);
675
+						if(tail==options[o]) break;
676
+
677
+						numOfUnknownOptions--;
678
+						if(o==0) ppMode->baseDcDiff= val;
679
+						else ppMode->flatnessThreshold= val;
680
+					}
681
+				}
682
+				else if(filters[i].mask == FORCE_QUANT)
683
+				{
684
+					int o;
685
+					ppMode->forcedQuant= 15;
686
+
687
+					for(o=0; options[o]!=NULL && o<1; o++)
688
+					{
689
+						char *tail;
690
+						int val= strtol(options[o], &tail, 0);
691
+						if(tail==options[o]) break;
692
+
693
+						numOfUnknownOptions--;
694
+						ppMode->forcedQuant= val;
695
+					}
696
+				}
697
+			}
698
+		}
699
+		if(!filterNameOk) ppMode->error++;
700
+		ppMode->error += numOfUnknownOptions;
701
+	}
702
+
703
+	if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
704
+	if(ppMode->error)
705
+	{
706
+		fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
707
+		free(ppMode);
708
+		return NULL;
709
+	}
710
+	return ppMode;
711
+}
712
+
713
+void pp_free_mode(pp_mode_t *mode){
714
+    if(mode) free(mode);
715
+}
716
+
717
/**
 * Replace the buffer at *p with a fresh zero-filled one.
 * The old buffer (if any) is freed; its contents are not preserved.
 * @param p         address of the buffer pointer; *p may be NULL on entry
 * @param alignment alignment passed to memalign()
 * @param size      size of the new buffer in bytes
 * On allocation failure *p is left NULL.
 */
static void reallocAlign(void **p, int alignment, int size){
	free(*p); // free(NULL) is a no-op
	*p= memalign(alignment, size);
	if(*p) memset(*p, 0, size);
}
722
+
723
+static void reallocBuffers(PPContext *c, int width, int height, int stride){
724
+	int mbWidth = (width+15)>>4;
725
+	int mbHeight= (height+15)>>4;
726
+	int i;
727
+
728
+	c->stride= stride;
729
+
730
+	reallocAlign((void **)&c->tempDst, 8, stride*24);
731
+	reallocAlign((void **)&c->tempSrc, 8, stride*24);
732
+	reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
733
+	reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
734
+	for(i=0; i<256; i++)
735
+		c->yHistogram[i]= width*height/64*15/256;
736
+
737
+	for(i=0; i<3; i++)
738
+	{
739
+		//Note:the +17*1024 is just there so i dont have to worry about r/w over te end
740
+		reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
741
+		reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
742
+	}
743
+
744
+	reallocAlign((void **)&c->deintTemp, 8, width+16);
745
+	reallocAlign((void **)&c->nonBQPTable, 8, mbWidth*mbHeight*sizeof(QP_STORE_T));
746
+	reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
747
+}
748
+
749
+pp_context_t *pp_get_context(int width, int height, int cpuCaps){
750
+	PPContext *c= memalign(32, sizeof(PPContext));
751
+	int i;
752
+	int stride= (width+15)&(~15); //assumed / will realloc if needed
753
+        
754
+	memset(c, 0, sizeof(PPContext));
755
+	c->cpuCaps= cpuCaps;
756
+	if(cpuCaps&PP_FORMAT){
757
+		c->hChromaSubSample= cpuCaps&0x3;
758
+		c->vChromaSubSample= (cpuCaps>>4)&0x3;
759
+	}else{
760
+		c->hChromaSubSample= 1;
761
+		c->vChromaSubSample= 1;
762
+	}
763
+
764
+	reallocBuffers(c, width, height, stride);
765
+        
766
+	c->frameNum=-1;
767
+
768
+	return c;
769
+}
770
+
771
+void pp_free_context(void *vc){
772
+	PPContext *c = (PPContext*)vc;
773
+	int i;
774
+	
775
+	for(i=0; i<3; i++) free(c->tempBlured[i]);
776
+	for(i=0; i<3; i++) free(c->tempBluredPast[i]);
777
+	
778
+	free(c->tempBlocks);
779
+	free(c->yHistogram);
780
+	free(c->tempDst);
781
+	free(c->tempSrc);
782
+	free(c->deintTemp);
783
+	free(c->nonBQPTable);
784
+	free(c->forcedQPTable);
785
+        
786
+	memset(c, 0, sizeof(PPContext));
787
+
788
+	free(c);
789
+}
790
+
791
+void  pp_postprocess(uint8_t * src[3], int srcStride[3],
792
+                 uint8_t * dst[3], int dstStride[3],
793
+                 int width, int height,
794
+                 QP_STORE_T *QP_store,  int QPStride,
795
+		 pp_mode_t *vm,  void *vc, int pict_type)
796
+{
797
+	int mbWidth = (width+15)>>4;
798
+	int mbHeight= (height+15)>>4;
799
+	PPMode *mode = (PPMode*)vm;
800
+	PPContext *c = (PPContext*)vc;
801
+        int minStride= MAX(srcStride[0], dstStride[0]);
802
+	
803
+	if(c->stride < minStride)
804
+		reallocBuffers(c, width, height, minStride);
805
+
806
+	if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) 
807
+	{
808
+		int i;
809
+		QP_store= c->forcedQPTable;
810
+		QPStride= 0;
811
+		if(mode->lumMode & FORCE_QUANT)
812
+			for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
813
+		else
814
+			for(i=0; i<mbWidth; i++) QP_store[i]= 1;
815
+	}
816
+if(0){
817
+int x,y;
818
+for(y=0; y<mbHeight; y++){
819
+	for(x=0; x<mbWidth; x++){
820
+		printf("%2d ", QP_store[x + y*QPStride]);
821
+	}
822
+	printf("\n");
823
+}
824
+	printf("\n");
825
+}
826
+//printf("pict_type:%d\n", pict_type);
827
+
828
+	if(pict_type!=3)
829
+	{
830
+		int x,y;
831
+		for(y=0; y<mbHeight; y++){
832
+			for(x=0; x<mbWidth; x++){
833
+				int qscale= QP_store[x + y*QPStride];
834
+				if(qscale&~31)
835
+				    qscale=31;
836
+				c->nonBQPTable[y*mbWidth + x]= qscale;
837
+			}
838
+		}
839
+	}
840
+
841
+	if(verbose>2)
842
+	{
843
+		printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
844
+	}
845
+
846
+	postProcess(src[0], srcStride[0], dst[0], dstStride[0],
847
+		width, height, QP_store, QPStride, 0, mode, c);
848
+
849
+	width  = (width )>>c->hChromaSubSample;
850
+	height = (height)>>c->vChromaSubSample;
851
+
852
+	if(mode->chromMode)
853
+	{
854
+		postProcess(src[1], srcStride[1], dst[1], dstStride[1],
855
+			width, height, QP_store, QPStride, 1, mode, c);
856
+		postProcess(src[2], srcStride[2], dst[2], dstStride[2],
857
+			width, height, QP_store, QPStride, 2, mode, c);
858
+	}
859
+	else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
860
+	{
861
+		memcpy(dst[1], src[1], srcStride[1]*height);
862
+		memcpy(dst[2], src[2], srcStride[2]*height);
863
+	}
864
+	else
865
+	{
866
+		int y;
867
+		for(y=0; y<height; y++)
868
+		{
869
+			memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
870
+			memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
871
+		}
872
+	}
873
+}
874
+
0 875
new file mode 100644
... ...
@@ -0,0 +1,73 @@
0
+/*
1
+    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
2
+
3
+    This program is free software; you can redistribute it and/or modify
4
+    it under the terms of the GNU General Public License as published by
5
+    the Free Software Foundation; either version 2 of the License, or
6
+    (at your option) any later version.
7
+
8
+    This program is distributed in the hope that it will be useful,
9
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+    GNU General Public License for more details.
12
+
13
+    You should have received a copy of the GNU General Public License
14
+    along with this program; if not, write to the Free Software
15
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
+*/
17
+
18
+#ifndef NEWPOSTPROCESS_H
19
+#define NEWPOSTPROCESS_H
20
+
21
+/**
22
+ * @file postprocess.h
23
+ * @brief
24
+ *     External API of the postprocessing (pp) code. NOTE: this header includes nothing itself, so uint8_t / int8_t must already be declared by whoever includes it.
25
+ */
26
+
27
+#ifdef __cplusplus
28
+extern "C" {
29
+#endif
30
+
31
+#define PP_QUALITY_MAX 6 // upper bound of the "quality" argument of pp_get_mode_by_name_and_quality()
32
+
33
+#define QP_STORE_T int8_t // element type of the quantizer table passed to pp_postprocess() as QP_store
34
+
35
+typedef void pp_context_t; // opaque handle, returned by pp_get_context()
36
+typedef void pp_mode_t; // opaque handle, returned by pp_get_mode_by_name_and_quality()
37
+
38
+extern char *pp_help; // a simple help text
39
+/* Postprocess one picture. src/dst and srcStride/dstStride carry one entry per plane (3 planes); QP_store/QP_stride describe the quantizer table; mode and ppContext are the handles returned by the functions declared below. */
40
+void  pp_postprocess(uint8_t * src[3], int srcStride[3],
41
+                 uint8_t * dst[3], int dstStride[3],
42
+                 int horizontalSize, int verticalSize,
43
+                 QP_STORE_T *QP_store,  int QP_stride,
44
+		 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);
45
+
46
+
47
+/**
48
+ * returns a pp_mode_t or NULL if an error occurred
49
+ * name is the string after "-pp" on the command line
50
+ * quality is a number from 0 to PP_QUALITY_MAX
51
+ */
52
+pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality);
53
+void pp_free_mode(pp_mode_t *mode); // releases a mode obtained from pp_get_mode_by_name_and_quality()
54
+
55
+pp_context_t *pp_get_context(int width, int height, int flags);
56
+void pp_free_context(pp_context_t *ppContext); // releases a context obtained from pp_get_context()
57
+/* NOTE(review): the constants below are presumably OR-ed together to form the "flags" argument of pp_get_context() - confirm against the implementation. */
58
+#define PP_CPU_CAPS_MMX   0x80000000
59
+#define PP_CPU_CAPS_MMX2  0x20000000
60
+#define PP_CPU_CAPS_3DNOW 0x40000000
61
+
62
+#define PP_FORMAT         0x00000008 // bit that is set in every PP_FORMAT_* value below
63
+#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
64
+#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
65
+#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
66
+#define PP_FORMAT_444    (0x00000000|PP_FORMAT)
67
+
68
+#ifdef __cplusplus
69
+}
70
+#endif
71
+
72
+#endif
0 73
new file mode 100644
... ...
@@ -0,0 +1,128 @@
0
+/*
1
+    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
2
+
3
+    This program is free software; you can redistribute it and/or modify
4
+    it under the terms of the GNU General Public License as published by
5
+    the Free Software Foundation; either version 2 of the License, or
6
+    (at your option) any later version.
7
+
8
+    This program is distributed in the hope that it will be useful,
9
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+    GNU General Public License for more details.
12
+
13
+    You should have received a copy of the GNU General Public License
14
+    along with this program; if not, write to the Free Software
15
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
+*/
17
+
18
+#define V_DEBLOCK	0x01
19
+#define H_DEBLOCK	0x02
20
+#define DERING		0x04
21
+#define LEVEL_FIX	0x08 /* Brightness & Contrast */
22
+
23
+#define LUM_V_DEBLOCK	V_DEBLOCK		//   1
24
+#define LUM_H_DEBLOCK	H_DEBLOCK		//   2
25
+#define CHROM_V_DEBLOCK	(V_DEBLOCK<<4)		//  16
26
+#define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)		//  32
27
+#define LUM_DERING	DERING			//   4
28
+#define CHROM_DERING	(DERING<<4)		//  64
29
+#define LUM_LEVEL_FIX	LEVEL_FIX		//   8
30
+#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)		// 128 (not implemented yet)
31
+
32
+// Experimental vertical filters
33
+#define V_X1_FILTER	0x0200			// 512
34
+
35
+// Experimental horizontal filters
36
+#define H_X1_FILTER	0x2000			// 8192
37
+
38
+// select between the full y range (255-0) or the standard one (234-16)
39
+#define FULL_Y_RANGE	0x8000			// 32768
40
+
41
+// Deinterlacing filters
42
+#define	LINEAR_IPOL_DEINT_FILTER	0x10000	// 65536
43
+#define	LINEAR_BLEND_DEINT_FILTER	0x20000	// 131072
44
+#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet) NOTE: same bit value as FULL_Y_RANGE above
45
+#define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144
46
+#define	MEDIAN_DEINT_FILTER		0x80000	// 524288
47
+#define	FFMPEG_DEINT_FILTER		0x400000
48
+
49
+#define TEMP_NOISE_FILTER		0x100000
50
+#define FORCE_QUANT			0x200000 // when set in lumMode, PPMode.forcedQuant is used as the quantizer
51
+
52
+// define this if you want faster postprocessing code;
53
+// it cannot differentiate between chroma & luma filters (both on or both off);
54
+// obviously the -pp option on the command line then has no effect except turning the filters
55
+// selected here on
56
+//#define COMPILE_TIME_MODE 0x77
57
+
58
+struct PPFilter{ // description of one selectable filter
59
+	char *shortName; // NOTE(review): presumably the abbreviated name accepted in the -pp string - confirm
60
+	char *longName; // NOTE(review): presumably the full name accepted in the -pp string - confirm
61
+	int chromDefault; 	// is chrominance filtering on by default if this filter is manually activated
62
+	int minLumQuality; 	// minimum quality to turn luminance filtering on
63
+	int minChromQuality;	// minimum quality to turn chrominance filtering on
64
+	int mask; 		// bitmask that turns this filter on
65
+};
66
+
67
+typedef struct PPMode{ // filter selection plus the tuning parameters of the filters
67
+	int lumMode; 			// activates filters for luminance
68
+	int chromMode; 			// activates filters for chrominance
69
+	int error; 			// non zero on error
70
+
71
+	int minAllowedY; 		// for brightness correction
72
+	int maxAllowedY; 		// for brightness correction
73
+	float maxClippedThreshold;	// amount of "black" you are willing to lose to get a brightness corrected picture
74
+
75
+	int maxTmpNoise[3]; 		// for Temporal Noise Reducing filter (Maximal sum of abs differences)
76
+
77
+	int baseDcDiff;
78
+	int flatnessThreshold;		// a block counts as flat when more than this many neighbour comparisons match (see isVertDC)
79
+
80
+	int forcedQuant; 		// quantizer if FORCE_QUANT is used
81
+} PPMode;
83
+
84
+typedef struct PPContext{
85
+	uint8_t *tempBlocks; //used for the horizontal code
86
+
87
+	/* we need 64bit here otherwise we will have a problem
88
+	   after watching a black picture for 5 hours */
89
+	uint64_t *yHistogram;
90
+
91
+	uint64_t __attribute__((aligned(8))) packedYOffset;
92
+	uint64_t __attribute__((aligned(8))) packedYScale;
93
+
94
+	/* Temporal noise reducing buffers */
95
+	uint8_t *tempBlured[3];
96
+	int32_t *tempBluredPast[3];
97
+
98
+	/* Temporary buffers for handling the last row(s) */
99
+	uint8_t *tempDst;
100
+	uint8_t *tempSrc;
101
+
102
+	uint8_t *deintTemp;
103
+
104
+	uint64_t __attribute__((aligned(8))) pQPb;
105
+	uint64_t __attribute__((aligned(8))) pQPb2;
106
+
107
+	uint64_t __attribute__((aligned(8))) mmxDcOffset[32];
108
+	uint64_t __attribute__((aligned(8))) mmxDcThreshold[32];
109
+
110
+	QP_STORE_T *nonBQPTable;
111
+	QP_STORE_T *forcedQPTable;
112
+
113
+	int QP;
114
+	int nonBQP;
115
+
116
+	int frameNum;
117
+	
118
+	int cpuCaps;
119
+        
120
+	int stride; //size of some buffers (needed to realloc them if needed)
121
+        
122
+	int hChromaSubSample;
123
+	int vChromaSubSample;
124
+
125
+	PPMode ppMode;
126
+} PPContext;
127
+
0 128
new file mode 100644
... ...
@@ -0,0 +1,3127 @@
0
+/*
1
+    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
2
+
3
+    This program is free software; you can redistribute it and/or modify
4
+    it under the terms of the GNU General Public License as published by
5
+    the Free Software Foundation; either version 2 of the License, or
6
+    (at your option) any later version.
7
+
8
+    This program is distributed in the hope that it will be useful,
9
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
+    GNU General Public License for more details.
12
+
13
+    You should have received a copy of the GNU General Public License
14
+    along with this program; if not, write to the Free Software
15
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16
+*/
17
+
18
+#undef PAVGB // NOTE(review): presumably undefined first because this template is included several times with different HAVE_* settings - confirm
19
+#undef PMINUB
20
+#undef PMAXUB
21
+
22
+#ifdef HAVE_MMX2 // PAVGB(a,b): asm text for "b = byte-wise average of a and b"
23
+#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
24
+#elif defined (HAVE_3DNOW)
25
+#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
26
+#endif // note: PAVGB stays undefined when neither MMX2 nor 3DNow is available
27
+
28
+#ifdef HAVE_MMX2 // PMINUB(x,y,t): asm text for "y = unsigned byte-wise min(x, y)"
29
+#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
30
+#elif defined (HAVE_MMX) // plain MMX fallback, parameters are named (b,a,t) so that the 2nd argument is still the destination
31
+#define PMINUB(b,a,t) \
32
+	"movq " #a ", " #t " \n\t"\
33
+	"psubusb " #b ", " #t " \n\t"\
34
+	"psubb " #t ", " #a " \n\t" // t = saturate(a-b); a -= t  =>  a = min(a,b); t is clobbered
35
+#endif
36
+
37
+#ifdef HAVE_MMX2 // PMAXUB(a,b): asm text for "b = unsigned byte-wise max(a, b)"
38
+#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
39
+#elif defined (HAVE_MMX)
40
+#define PMAXUB(a,b) \
41
+	"psubusb " #a ", " #b " \n\t"\
42
+	"paddb " #a ", " #b " \n\t" // b = saturate(b-a) + a  =>  b = max(a,b)
43
+#endif
44
+
45
+
46
+//FIXME? |255-0| = 1 (shouldnt be a problem ...)
47
+#ifdef HAVE_MMX
48
+/**
49
+ * Check if the middle 8x8 Block in the given 8x16 block is flat.
50
+ * Compares each of the 8 rows with the next one (7 row pairs, 8 columns each), counts the pairs whose difference passes the mmxDcOffset/mmxDcThreshold test selected by c->nonBQP, and returns nonzero if that count exceeds c->ppMode.flatnessThreshold. */
51
+static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
52
+	int numEq= 0;
53
+	src+= stride*4; // src points to begin of the 8x8 Block
54
+asm volatile(
55
+		"leal (%1, %2), %%eax				\n\t"
56
+//	0	1	2	3	4	5	6	7	8	9
57
+//	%1	eax	eax+%2	eax+2%2	%1+4%2	ecx	ecx+%2	ecx+2%2	%1+8%2	ecx+4%2
58
+		"movq %3, %%mm7					\n\t" // mm7 = c->mmxDcOffset[c->nonBQP]
59
+		"movq %4, %%mm6					\n\t" // mm6 = c->mmxDcThreshold[c->nonBQP]
60
+
61
+		"movq (%1), %%mm0				\n\t"
62
+		"movq (%%eax), %%mm1				\n\t"
63
+		"psubb %%mm1, %%mm0				\n\t" // mm0 = difference
64
+		"paddb %%mm7, %%mm0				\n\t"
65
+		"pcmpgtb %%mm6, %%mm0				\n\t" // byte = -1 where (difference + offset) > threshold (signed compare), else 0
66
+
67
+		"movq (%%eax,%2), %%mm2				\n\t"
68
+		"psubb %%mm2, %%mm1				\n\t"
69
+		"paddb %%mm7, %%mm1				\n\t"
70
+		"pcmpgtb %%mm6, %%mm1				\n\t"
71
+		"paddb %%mm1, %%mm0				\n\t" // accumulate the -1 hits per column in mm0
72
+
73
+		"movq (%%eax, %2, 2), %%mm1			\n\t"
74
+		"psubb %%mm1, %%mm2				\n\t"
75
+		"paddb %%mm7, %%mm2				\n\t"
76
+		"pcmpgtb %%mm6, %%mm2				\n\t"
77
+		"paddb %%mm2, %%mm0				\n\t"
78
+		
79
+		"leal (%%eax, %2, 4), %%eax			\n\t" // eax now addresses row 5, i.e. it takes the role of "ecx" in the table above
80
+
81
+		"movq (%1, %2, 4), %%mm2			\n\t"
82
+		"psubb %%mm2, %%mm1				\n\t"
83
+		"paddb %%mm7, %%mm1				\n\t"
84
+		"pcmpgtb %%mm6, %%mm1				\n\t"
85
+		"paddb %%mm1, %%mm0				\n\t"
86
+
87
+		"movq (%%eax), %%mm1				\n\t"
88
+		"psubb %%mm1, %%mm2				\n\t"
89
+		"paddb %%mm7, %%mm2				\n\t"
90
+		"pcmpgtb %%mm6, %%mm2				\n\t"
91
+		"paddb %%mm2, %%mm0				\n\t"
92
+
93
+		"movq (%%eax, %2), %%mm2			\n\t"
94
+		"psubb %%mm2, %%mm1				\n\t"
95
+		"paddb %%mm7, %%mm1				\n\t"
96
+		"pcmpgtb %%mm6, %%mm1				\n\t"
97
+		"paddb %%mm1, %%mm0				\n\t"
98
+
99
+		"movq (%%eax, %2, 2), %%mm1			\n\t"
100
+		"psubb %%mm1, %%mm2				\n\t"
101
+		"paddb %%mm7, %%mm2				\n\t"
102
+		"pcmpgtb %%mm6, %%mm2				\n\t"
103
+		"paddb %%mm2, %%mm0				\n\t"
104
+
105
+		"						\n\t"
106
+#ifdef HAVE_MMX2 // horizontal sum of the 8 per-column counters; only the low 8 bits of the sum are used below
107
+		"pxor %%mm7, %%mm7				\n\t"
108
+		"psadbw %%mm7, %%mm0				\n\t"
109
+#else
110
+		"movq %%mm0, %%mm1				\n\t"
111
+		"psrlw $8, %%mm0				\n\t"
112
+		"paddb %%mm1, %%mm0				\n\t"
113
+		"movq %%mm0, %%mm1				\n\t"
114
+		"psrlq $16, %%mm0				\n\t"
115
+		"paddb %%mm1, %%mm0				\n\t"
116
+		"movq %%mm0, %%mm1				\n\t"
117
+		"psrlq $32, %%mm0				\n\t"
118
+		"paddb %%mm1, %%mm0				\n\t"
119
+#endif
120
+		"movd %%mm0, %0					\n\t"
121
+		: "=r" (numEq)
122
+		: "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
123
+		: "%eax"
124
+		);
125
+	numEq= (-numEq) &0xFF; // every hit was accumulated as -1, so negate (mod 256) to get the number of hits
126
+	return numEq > c->ppMode.flatnessThreshold;
127
+}
128
+#endif
129
+
130
+static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
131
+{ /* Returns 1 if, in each of the 8 columns, rows 4 and 11 of the block at src (rows 1 and 8 after the src+=stride*3 below) differ by at most 2*QP; returns 0 otherwise. */
132
+#ifdef HAVE_MMX
133
+	int isOk;
134
+	src+= stride*3;
135
+	asm volatile(
136
+		"movq (%1, %2), %%mm0				\n\t"
137
+		"movq (%1, %2, 8), %%mm1			\n\t"
138
+		"movq %%mm0, %%mm2				\n\t"
139
+		"psubusb %%mm1, %%mm0				\n\t"
140
+		"psubusb %%mm2, %%mm1				\n\t"
141
+		"por %%mm1, %%mm0				\n\t" // ABS Diff
142
+
143
+		"movq %3, %%mm7					\n\t" // QP,..., QP
144
+		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
145
+		"psubusb %%mm7, %%mm0				\n\t" // Diff <= 2QP -> 0
146
+		"packssdw %%mm0, %%mm0				\n\t" // squeeze all 64 bits into the low 32: zero stays zero, nonzero stays nonzero
147
+		"movd %%mm0, %0					\n\t"
148
+		: "=r" (isOk)
149
+		: "r" (src), "r" (stride), "m" (c->pQPb)
150
+		);
151
+	return isOk==0;
152
+#else
153
+#if 1
154
+	int x;
155
+	const int QP= c->QP;
156
+	src+= stride*3;
157
+	for(x=0; x<BLOCK_SIZE; x++)
158
+	{
159
+		if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; // single unsigned compare for "difference outside [-2QP, 2QP]"
160
+	}
161
+
162
+	return 1;
163
+#else // disabled alternative: per column, require max-min over 8 rows to be at most 2*QP
164
+	int x;
165
+	const int QP= c->QP;
166
+	src+= stride*4;
167
+	for(x=0; x<BLOCK_SIZE; x++)
168
+	{
169
+		int min=255;
170
+		int max=0;
171
+		int y;
172
+		for(y=0; y<8; y++){
173
+			int v= src[x + y*stride];
174
+			if(v>max) max=v;
175
+			if(v<min) min=v;
176
+		}
177
+		if(max-min > 2*QP) return 0;
178
+	}
179
+	return 1;
180
+#endif
181
+#endif
182
+}
183
+
184
+/**
185
+ * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
186
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
187
+ * The MMX2/3DNow path builds the taps from chained PAVGB averages and reads the threshold from c->pQPb; the C path sums the taps directly (+8, >>4) and reads c->QP. */
188
+static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
189
+{
190
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
191
+	src+= stride*3;
192
+	asm volatile(	//"movv %0 %1 %2\n\t"
193
+		"movq %2, %%mm0			\n\t"  // QP,..., QP
194
+		"pxor %%mm4, %%mm4				\n\t"
195
+
196
+		"movq (%0), %%mm6				\n\t"
197
+		"movq (%0, %1), %%mm5				\n\t"
198
+		"movq %%mm5, %%mm1				\n\t"
199
+		"movq %%mm6, %%mm2				\n\t"
200
+		"psubusb %%mm6, %%mm5				\n\t"
201
+		"psubusb %%mm1, %%mm2				\n\t"
202
+		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
203
+		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
204
+		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
205
+
206
+		"pand %%mm2, %%mm6				\n\t"
207
+		"pandn %%mm1, %%mm2				\n\t"
208
+		"por %%mm2, %%mm6				\n\t"// First Line to Filter
209
+
210
+		"movq (%0, %1, 8), %%mm5			\n\t"
211
+		"leal (%0, %1, 4), %%eax			\n\t"
212
+		"leal (%0, %1, 8), %%ecx			\n\t"
213
+		"subl %1, %%ecx					\n\t"
214
+		"addl %1, %0					\n\t" // %0 points to line 1 not 0
215
+		"movq (%0, %1, 8), %%mm7			\n\t"
216
+		"movq %%mm5, %%mm1				\n\t"
217
+		"movq %%mm7, %%mm2				\n\t"
218
+		"psubusb %%mm7, %%mm5				\n\t"
219
+		"psubusb %%mm1, %%mm2				\n\t"
220
+		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
221
+		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
222
+		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
223
+
224
+		"pand %%mm2, %%mm7				\n\t"
225
+		"pandn %%mm1, %%mm2				\n\t"
226
+		"por %%mm2, %%mm7				\n\t" // Last Line to Filter (line 9 if close to line 8, else line 8)
227
+
228
+
229
+		// 	1	2	3	4	5	6	7	8
230
+		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ecx	eax+4%1
231
+		// 6 4 2 2 1 1
232
+		// 6 4 4 2
233
+		// 6 8 2
234
+
235
+		"movq (%0, %1), %%mm0				\n\t" //  1
236
+		"movq %%mm0, %%mm1				\n\t" //  1
237
+		PAVGB(%%mm6, %%mm0)				      //1 1	/2
238
+		PAVGB(%%mm6, %%mm0)				      //3 1	/4
239
+
240
+		"movq (%0, %1, 4), %%mm2			\n\t" //     1
241
+		"movq %%mm2, %%mm5				\n\t" //     1
242
+		PAVGB((%%eax), %%mm2)				      //    11	/2
243
+		PAVGB((%0, %1, 2), %%mm2)			      //   211	/4
244
+		"movq %%mm2, %%mm3				\n\t" //   211	/4
245
+		"movq (%0), %%mm4				\n\t" // 1
246
+		PAVGB(%%mm4, %%mm3)				      // 4 211	/8
247
+		PAVGB(%%mm0, %%mm3)				      //642211	/16
248
+		"movq %%mm3, (%0)				\n\t" // X
249
+		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
250
+		"movq %%mm1, %%mm0				\n\t" //  1
251
+		PAVGB(%%mm6, %%mm0)				      //1 1	/2
252
+		"movq %%mm4, %%mm3				\n\t" // 1
253
+		PAVGB((%0,%1,2), %%mm3)				      // 1 1	/2
254
+		PAVGB((%%eax,%1,2), %%mm5)			      //     11	/2
255
+		PAVGB((%%eax), %%mm5)				      //    211 /4
256
+		PAVGB(%%mm5, %%mm3)				      // 2 2211 /8
257
+		PAVGB(%%mm0, %%mm3)				      //4242211 /16
258
+		"movq %%mm3, (%0,%1)				\n\t" //  X
259
+		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
260
+		PAVGB(%%mm4, %%mm6)				      //11	/2
261
+		"movq (%%ecx), %%mm0				\n\t" //       1
262
+		PAVGB((%%eax, %1, 2), %%mm0)			      //      11/2
263
+		"movq %%mm0, %%mm3				\n\t" //      11/2
264
+		PAVGB(%%mm1, %%mm0)				      //  2   11/4
265
+		PAVGB(%%mm6, %%mm0)				      //222   11/8
266
+		PAVGB(%%mm2, %%mm0)				      //22242211/16
267
+		"movq (%0, %1, 2), %%mm2			\n\t" //   1
268
+		"movq %%mm0, (%0, %1, 2)			\n\t" //   X
269
+		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
270
+		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
271
+		PAVGB((%%ecx), %%mm0)				      //       11	/2
272
+		PAVGB(%%mm0, %%mm6)				      //11     11	/4
273
+		PAVGB(%%mm1, %%mm4)				      // 11		/2
274
+		PAVGB(%%mm2, %%mm1)				      //  11		/2
275
+		PAVGB(%%mm1, %%mm6)				      //1122   11	/8
276
+		PAVGB(%%mm5, %%mm6)				      //112242211	/16
277
+		"movq (%%eax), %%mm5				\n\t" //    1
278
+		"movq %%mm6, (%%eax)				\n\t" //    X
279
+		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
280
+		"movq (%%eax, %1, 4), %%mm6			\n\t" //        1
281
+		PAVGB(%%mm7, %%mm6)				      //        11	/2
282
+		PAVGB(%%mm4, %%mm6)				      // 11     11	/4
283
+		PAVGB(%%mm3, %%mm6)				      // 11   2211	/8
284
+		PAVGB(%%mm5, %%mm2)				      //   11		/2
285
+		"movq (%0, %1, 4), %%mm4			\n\t" //     1
286
+		PAVGB(%%mm4, %%mm2)				      //   112		/4
287
+		PAVGB(%%mm2, %%mm6)				      // 112242211	/16
288
+		"movq %%mm6, (%0, %1, 4)			\n\t" //     X
289
+		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
290
+		PAVGB(%%mm7, %%mm1)				      //  11     2	/4
291
+		PAVGB(%%mm4, %%mm5)				      //    11		/2
292
+		PAVGB(%%mm5, %%mm0)				      //    11 11	/4
293
+		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
294
+		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
295
+		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
296
+		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
297
+		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
298
+		PAVGB((%%ecx), %%mm2)				      //   112 4	/8
299
+		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
300
+		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
301
+		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
302
+		PAVGB(%%mm2, %%mm6)				      //   1122424	/16
303
+		"movq %%mm6, (%%ecx)				\n\t" //       X
304
+		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
305
+		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
306
+		PAVGB(%%mm7, %%mm5)				      //    11   6	/8
307
+
308
+		PAVGB(%%mm3, %%mm0)				      //      112	/4
309
+		PAVGB(%%mm0, %%mm5)				      //    112246	/16
310
+		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
311
+		"subl %1, %0					\n\t" // undo the addl above: %0 is declared as an input operand and must leave unchanged
312
+
313
+		:
314
+		: "r" (src), "r" (stride), "m" (c->pQPb)
315
+		: "%eax", "%ecx"
316
+	);
317
+#else
318
+	const int l1= stride;
319
+	const int l2= stride + l1;
320
+	const int l3= stride + l2;
321
+	const int l4= stride + l3;
322
+	const int l5= stride + l4;
323
+	const int l6= stride + l5;
324
+	const int l7= stride + l6;
325
+	const int l8= stride + l7;
326
+	const int l9= stride + l8;
327
+	int x;
328
+	src+= stride*3;
329
+	for(x=0; x<BLOCK_SIZE; x++)
330
+	{
331
+		const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; // stands in for every tap above line 1
332
+		const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; // stands in for every tap below line 8
333
+
334
+		int sums[9]; // sums of vertically adjacent samples, shared by the eight outputs below
335
+		sums[0] = first + src[l1];
336
+		sums[1] = src[l1] + src[l2];
337
+		sums[2] = src[l2] + src[l3];
338
+		sums[3] = src[l3] + src[l4];
339
+		sums[4] = src[l4] + src[l5];
340
+		sums[5] = src[l5] + src[l6];
341
+		sums[6] = src[l6] + src[l7];
342
+		sums[7] = src[l7] + src[l8];
343
+		sums[8] = src[l8] + last;
344
+
345
+		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
346
+		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
347
+		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
348
+		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
349
+		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
350
+		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
351
+		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
352
+		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
353
+
354
+		src++; // next column
355
+	}
356
+#endif
357
+}
358
+
359
+#if 0 // the whole function below is compiled out
360
+/**
361
+ * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
362
+ * values are correctly clipped (MMX2)
363
+ * values are wraparound (C)
364
+ * conclusion: it is fast, but introduces ugly horizontal patterns if there is a continuous gradient
365
+	0 8 16 24
366
+	x = 8
367
+	x/2 = 4
368
+	x/8 = 1
369
+	1 12 12 23
370
+ */
371
+static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
372
+{
373
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
374
+	src+= stride*3;
375
+// FIXME rounding
376
+	asm volatile(
377
+		"pxor %%mm7, %%mm7				\n\t" // 0
378
+		"movq "MANGLE(b80)", %%mm6			\n\t" // MIN_SIGNED_BYTE
379
+		"leal (%0, %1), %%eax				\n\t"
380
+		"leal (%%eax, %1, 4), %%ecx			\n\t"
381
+//	0	1	2	3	4	5	6	7	8	9
382
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
383
+		"movq "MANGLE(pQPb)", %%mm0			\n\t" // QP,..., QP  NOTE(review): reads a global pQPb while the live filters read c->pQPb - looks stale, confirm before re-enabling
384
+		"movq %%mm0, %%mm1				\n\t" // QP,..., QP
385
+		"paddusb "MANGLE(b02)", %%mm0			\n\t"
386
+		"psrlw $2, %%mm0				\n\t"
387
+		"pand "MANGLE(b3F)", %%mm0			\n\t" // QP/4,..., QP/4
388
+		"paddusb %%mm1, %%mm0				\n\t" // QP*1.25 ...
389
+		"movq (%0, %1, 4), %%mm2			\n\t" // line 4
390
+		"movq (%%ecx), %%mm3				\n\t" // line 5
391
+		"movq %%mm2, %%mm4				\n\t" // line 4
392
+		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
393
+		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
394
+		PAVGB(%%mm3, %%mm5)
395
+		"paddb %%mm6, %%mm5				\n\t" // (l5-l4)/2
396
+		"psubusb %%mm3, %%mm4				\n\t"
397
+		"psubusb %%mm2, %%mm3				\n\t"
398
+		"por %%mm3, %%mm4				\n\t" // |l4 - l5|
399
+		"psubusb %%mm0, %%mm4				\n\t"
400
+		"pcmpeqb %%mm7, %%mm4				\n\t"
401
+		"pand %%mm4, %%mm5				\n\t" // d/2
402
+
403
+//		"paddb %%mm6, %%mm2				\n\t" // line 4 + 0x80
404
+		"paddb %%mm5, %%mm2				\n\t"
405
+//		"psubb %%mm6, %%mm2				\n\t"
406
+		"movq %%mm2, (%0,%1, 4)				\n\t"
407
+
408
+		"movq (%%ecx), %%mm2				\n\t"
409
+//		"paddb %%mm6, %%mm2				\n\t" // line 5 + 0x80
410
+		"psubb %%mm5, %%mm2				\n\t"
411
+//		"psubb %%mm6, %%mm2				\n\t"
412
+		"movq %%mm2, (%%ecx)				\n\t"
413
+
414
+		"paddb %%mm6, %%mm5				\n\t"
415
+		"psrlw $2, %%mm5				\n\t"
416
+		"pand "MANGLE(b3F)", %%mm5			\n\t"
417
+		"psubb "MANGLE(b20)", %%mm5			\n\t" // (l5-l4)/8
418
+
419
+		"movq (%%eax, %1, 2), %%mm2			\n\t"
420
+		"paddb %%mm6, %%mm2				\n\t" // line 3 + 0x80
421
+		"paddsb %%mm5, %%mm2				\n\t"
422
+		"psubb %%mm6, %%mm2				\n\t"
423
+		"movq %%mm2, (%%eax, %1, 2)			\n\t"
424
+
425
+		"movq (%%ecx, %1), %%mm2			\n\t"
426
+		"paddb %%mm6, %%mm2				\n\t" // line 6 + 0x80
427
+		"psubsb %%mm5, %%mm2				\n\t"
428
+		"psubb %%mm6, %%mm2				\n\t"
429
+		"movq %%mm2, (%%ecx, %1)			\n\t"
430
+
431
+		:
432
+		: "r" (src), "r" (stride)
433
+		: "%eax", "%ecx"
434
+	);
435
+#else
436
+ 	const int l1= stride;
437
+	const int l2= stride + l1;
438
+	const int l3= stride + l2;
439
+	const int l4= stride + l3;
440
+	const int l5= stride + l4;
441
+	const int l6= stride + l5;
442
+//	const int l7= stride + l6;
443
+//	const int l8= stride + l7;
444
+//	const int l9= stride + l8;
445
+	int x;
446
+	const int QP15= QP + (QP>>2); // QP*1.25, same threshold as the asm path builds
447
+	src+= stride*3;
448
+	for(x=0; x<BLOCK_SIZE; x++)
449
+	{
450
+		const int v = (src[x+l5] - src[x+l4]); // step across the lines 4/5 boundary
451
+		if(ABS(v) < QP15)
452
+		{
453
+			src[x+l3] +=v>>3;
454
+			src[x+l4] +=v>>1;
455
+			src[x+l5] -=v>>1;
456
+			src[x+l6] -=v>>3;
457
+
458
+		}
459
+	}
460
+
461
+#endif
462
+}
463
+#endif
464
+
465
+/**
466
+ * Experimental Filter 1
467
+ * will not damage linear gradients
468
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
469
+ * can only smooth blocks at the expected locations (it cannot smooth them if they have moved)
470
+ * the MMX2 version does correct clipping, the C version does not
471
+ */
472
+static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
473
+{
474
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
475
+	src+= stride*3;
476
+
477
+	asm volatile(
478
+		"pxor %%mm7, %%mm7				\n\t" // 0
479
+		"leal (%0, %1), %%eax				\n\t"
480
+		"leal (%%eax, %1, 4), %%ecx			\n\t"
481
+//	0	1	2	3	4	5	6	7	8	9
482
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
483
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
484
+		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
485
+		"movq %%mm1, %%mm2				\n\t" // line 4
486
+		"psubusb %%mm0, %%mm1				\n\t"
487
+		"psubusb %%mm2, %%mm0				\n\t"
488
+		"por %%mm1, %%mm0				\n\t" // |l3 - l4|
489
+		"movq (%%ecx), %%mm3				\n\t" // line 5
490
+		"movq (%%ecx, %1), %%mm4			\n\t" // line 6
491
+		"movq %%mm3, %%mm5				\n\t" // line 5
492
+		"psubusb %%mm4, %%mm3				\n\t"
493
+		"psubusb %%mm5, %%mm4				\n\t"
494
+		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
495
+		PAVGB(%%mm3, %%mm0)				      // (|l3 - l4| + |l5 - l6|)/2
496
+		"movq %%mm2, %%mm1				\n\t" // line 4
497
+		"psubusb %%mm5, %%mm2				\n\t"
498
+		"movq %%mm2, %%mm4				\n\t"
499
+		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
500
+		"psubusb %%mm1, %%mm5				\n\t"
501
+		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
502
+		"psubusb %%mm0, %%mm4		\n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
503
+		"movq %%mm4, %%mm3				\n\t" // d
504
+		"movq %2, %%mm0			\n\t" // QP,..., QP
505
+                "paddusb %%mm0, %%mm0				\n\t" // 2QP ... 2QP
506
+		"psubusb %%mm0, %%mm4				\n\t"
507
+		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= 2QP ? -1 : 0
508
+		"psubusb "MANGLE(b01)", %%mm3			\n\t" // NOTE(review): b01 is presumably 0x01 in every byte, making this MAX(d-1, 0) - confirm
509
+		"pand %%mm4, %%mm3				\n\t" // d <= 2QP ? (reduced) d : 0
510
+
511
+		PAVGB(%%mm7, %%mm3)				      // d/2
512
+		"movq %%mm3, %%mm1				\n\t" // d/2
513
+		PAVGB(%%mm7, %%mm3)				      // d/4
514
+		PAVGB(%%mm1, %%mm3)				      // 3*d/8
515
+
516
+		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
517
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
518
+		"psubusb %%mm3, %%mm0				\n\t"
519
+		"pxor %%mm2, %%mm0				\n\t"
520
+		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
521
+
522
+		"movq (%%ecx), %%mm0				\n\t" // line 5
523
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
524
+		"paddusb %%mm3, %%mm0				\n\t"
525
+		"pxor %%mm2, %%mm0				\n\t"
526
+		"movq %%mm0, (%%ecx)				\n\t" // line 5
527
+
528
+		PAVGB(%%mm7, %%mm1)				      // d/4
529
+
530
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
531
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
532
+		"psubusb %%mm1, %%mm0				\n\t"
533
+		"pxor %%mm2, %%mm0				\n\t"
534
+		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
535
+
536
+		"movq (%%ecx, %1), %%mm0			\n\t" // line 6
537
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
538
+		"paddusb %%mm1, %%mm0				\n\t"
539
+		"pxor %%mm2, %%mm0				\n\t"
540
+		"movq %%mm0, (%%ecx, %1)			\n\t" // line 6
541
+
542
+		PAVGB(%%mm7, %%mm1)				      // d/8
543
+
544
+		"movq (%%eax, %1), %%mm0			\n\t" // line 2
545
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
546
+		"psubusb %%mm1, %%mm0				\n\t"
547
+		"pxor %%mm2, %%mm0				\n\t"
548
+		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
549
+
550
+		"movq (%%ecx, %1, 2), %%mm0			\n\t" // line 7
551
+		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
552
+		"paddusb %%mm1, %%mm0				\n\t"
553
+		"pxor %%mm2, %%mm0				\n\t"
554
+		"movq %%mm0, (%%ecx, %1, 2)			\n\t" // line 7
555
+
556
+		:
557
+		: "r" (src), "r" (stride), "m" (co->pQPb)
558
+		: "%eax", "%ecx"
559
+	);
560
+#else
561
+
562
+ 	const int l1= stride;
563
+	const int l2= stride + l1;
564
+	const int l3= stride + l2;
565
+	const int l4= stride + l3;
566
+	const int l5= stride + l4;
567
+	const int l6= stride + l5;
568
+	const int l7= stride + l6;
569
+//	const int l8= stride + l7;
570
+//	const int l9= stride + l8;
571
+	int x;
572
+
573
+	src+= stride*3;
574
+	for(x=0; x<BLOCK_SIZE; x++)
575
+	{
576
+		int a= src[l3] - src[l4];
577
+		int b= src[l4] - src[l5]; // step across the lines 4/5 boundary
578
+		int c= src[l5] - src[l6];
579
+
580
+		int d= ABS(b) - ((ABS(a) + ABS(c))>>1); // how much the boundary step exceeds the average neighbouring step
581
+		d= MAX(d, 0);
582
+
583
+		if(d < co->QP*2) // NOTE: strict '<' here, while the asm path above also accepts d == 2*QP
584
+		{
585
+			int v = d * SIGN(-b);
586
+
587
+			src[l2] +=v>>3;
588
+			src[l3] +=v>>2;
589
+			src[l4] +=(3*v)>>3;
590
+			src[l5] -=(3*v)>>3;
591
+			src[l6] -=v>>2;
592
+			src[l7] -=v>>3;
593
+
594
+		}
595
+		src++; // next column
596
+	}
597
+#endif
598
+}
599
+
600
+static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
601
+{
602
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
603
+/*
604
+	uint8_t tmp[16];
605
+	const int l1= stride;
606
+	const int l2= stride + l1;
607
+	const int l3= stride + l2;
608
+	const int l4= (int)tmp - (int)src - stride*3;
609
+	const int l5= (int)tmp - (int)src - stride*3 + 8;
610
+	const int l6= stride*3 + l3;
611
+	const int l7= stride + l6;
612
+	const int l8= stride + l7;
613
+
614
+	memcpy(tmp, src+stride*7, 8);
615
+	memcpy(tmp+8, src+stride*8, 8);
616
+*/
617
+	src+= stride*4;
618
+	asm volatile(
619
+
620
+#if 0 //sligtly more accurate and slightly slower
621
+		"pxor %%mm7, %%mm7				\n\t" // 0
622
+		"leal (%0, %1), %%eax				\n\t"
623
+		"leal (%%eax, %1, 4), %%ecx			\n\t"
624
+//	0	1	2	3	4	5	6	7
625
+//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
626
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
627
+
628
+
629
+		"movq (%0, %1, 2), %%mm0			\n\t" // l2
630
+		"movq (%0), %%mm1				\n\t" // l0
631
+		"movq %%mm0, %%mm2				\n\t" // l2
632
+		PAVGB(%%mm7, %%mm0)				      // ~l2/2
633
+		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
634
+		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8
635
+
636
+		"movq (%%eax), %%mm1				\n\t" // l1
637
+		"movq (%%eax, %1, 2), %%mm3			\n\t" // l3
638
+		"movq %%mm1, %%mm4				\n\t" // l1
639
+		PAVGB(%%mm7, %%mm1)				      // ~l1/2
640
+		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
641
+		PAVGB(%%mm4, %%mm1)				      // ~(5l1 + 2l3)/8
642
+
643
+		"movq %%mm0, %%mm4				\n\t" // ~(5l2 + 2l0)/8
644
+		"psubusb %%mm1, %%mm0				\n\t"
645
+		"psubusb %%mm4, %%mm1				\n\t"
646
+		"por %%mm0, %%mm1				\n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
647
+// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
648
+
649
+		"movq (%0, %1, 4), %%mm0			\n\t" // l4
650
+		"movq %%mm0, %%mm4				\n\t" // l4
651
+		PAVGB(%%mm7, %%mm0)				      // ~l4/2
652
+		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
653
+		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8
654
+
655
+		"movq (%%ecx), %%mm2				\n\t" // l5
656
+		"movq %%mm3, %%mm5				\n\t" // l3
657
+		PAVGB(%%mm7, %%mm3)				      // ~l3/2
658
+		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
659
+		PAVGB(%%mm5, %%mm3)				      // ~(5l3 + 2l5)/8
660
+
661
+		"movq %%mm0, %%mm6				\n\t" // ~(5l4 + 2l2)/8
662
+		"psubusb %%mm3, %%mm0				\n\t"
663
+		"psubusb %%mm6, %%mm3				\n\t"
664
+		"por %%mm0, %%mm3				\n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
665
+		"pcmpeqb %%mm7, %%mm0				\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
666
+// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
667
+
668
+		"movq (%%ecx, %1), %%mm6			\n\t" // l6
669
+		"movq %%mm6, %%mm5				\n\t" // l6
670
+		PAVGB(%%mm7, %%mm6)				      // ~l6/2
671
+		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
672
+		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8
673
+
674
+		"movq (%%ecx, %1, 2), %%mm5			\n\t" // l7
675
+		"movq %%mm2, %%mm4				\n\t" // l5
676
+		PAVGB(%%mm7, %%mm2)				      // ~l5/2
677
+		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
678
+		PAVGB(%%mm4, %%mm2)				      // ~(5l5 + 2l7)/8
679
+
680
+		"movq %%mm6, %%mm4				\n\t" // ~(5l6 + 2l4)/8
681
+		"psubusb %%mm2, %%mm6				\n\t"
682
+		"psubusb %%mm4, %%mm2				\n\t"
683
+		"por %%mm6, %%mm2				\n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
684
+// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
685
+
686
+
687
+		PMINUB(%%mm2, %%mm1, %%mm4)			      // MIN(|lenergy|,|renergy|)/8
688
+		"movq %2, %%mm4					\n\t" // QP //FIXME QP+1 ?
689
+		"paddusb "MANGLE(b01)", %%mm4			\n\t"
690
+		"pcmpgtb %%mm3, %%mm4				\n\t" // |menergy|/8 < QP
691
+		"psubusb %%mm1, %%mm3				\n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
692
+		"pand %%mm4, %%mm3				\n\t"
693
+
694
+		"movq %%mm3, %%mm1				\n\t"
695
+//		"psubusb "MANGLE(b01)", %%mm3			\n\t"
696
+		PAVGB(%%mm7, %%mm3)
697
+		PAVGB(%%mm7, %%mm3)
698
+		"paddusb %%mm1, %%mm3				\n\t"
699
+//		"paddusb "MANGLE(b01)", %%mm3			\n\t"
700
+
701
+		"movq (%%eax, %1, 2), %%mm6			\n\t" //l3
702
+		"movq (%0, %1, 4), %%mm5			\n\t" //l4
703
+		"movq (%0, %1, 4), %%mm4			\n\t" //l4
704
+		"psubusb %%mm6, %%mm5				\n\t"
705
+		"psubusb %%mm4, %%mm6				\n\t"
706
+		"por %%mm6, %%mm5				\n\t" // |l3-l4|
707
+		"pcmpeqb %%mm7, %%mm6				\n\t" // SIGN(l3-l4)
708
+		"pxor %%mm6, %%mm0				\n\t"
709
+		"pand %%mm0, %%mm3				\n\t"
710
+		PMINUB(%%mm5, %%mm3, %%mm0)
711
+
712
+		"psubusb "MANGLE(b01)", %%mm3			\n\t"
713
+		PAVGB(%%mm7, %%mm3)
714
+
715
+		"movq (%%eax, %1, 2), %%mm0			\n\t"
716
+		"movq (%0, %1, 4), %%mm2			\n\t"
717
+		"pxor %%mm6, %%mm0				\n\t"
718
+		"pxor %%mm6, %%mm2				\n\t"
719
+		"psubb %%mm3, %%mm0				\n\t"
720
+		"paddb %%mm3, %%mm2				\n\t"
721
+		"pxor %%mm6, %%mm0				\n\t"
722
+		"pxor %%mm6, %%mm2				\n\t"
723
+		"movq %%mm0, (%%eax, %1, 2)			\n\t"
724
+		"movq %%mm2, (%0, %1, 4)			\n\t"
725
+#endif
726
+
727
+		"leal (%0, %1), %%eax				\n\t"
728
+		"pcmpeqb %%mm6, %%mm6				\n\t" // -1
729
+//	0	1	2	3	4	5	6	7
730
+//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
731
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
732
+
733
+
734
+		"movq (%%eax, %1, 2), %%mm1			\n\t" // l3
735
+		"movq (%0, %1, 4), %%mm0			\n\t" // l4
736
+		"pxor %%mm6, %%mm1				\n\t" // -l3-1
737
+		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
738
+// mm1=-l3-1, mm0=128-q
739
+
740
+		"movq (%%eax, %1, 4), %%mm2			\n\t" // l5
741
+		"movq (%%eax, %1), %%mm3			\n\t" // l2
742
+		"pxor %%mm6, %%mm2				\n\t" // -l5-1
743
+		"movq %%mm2, %%mm5				\n\t" // -l5-1
744
+		"movq "MANGLE(b80)", %%mm4			\n\t" // 128
745
+		"leal (%%eax, %1, 4), %%ecx			\n\t"
746
+		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
747
+		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
748
+		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
749
+		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
750
+// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
751
+
752
+		"movq (%%eax), %%mm2				\n\t" // l1
753
+		"pxor %%mm6, %%mm2				\n\t" // -l1-1
754
+		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
755
+		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
756
+		"movq "MANGLE(b80)", %%mm3			\n\t" // 128
757
+		PAVGB(%%mm2, %%mm3)				      // ~(l2-l1)/4 + 128
758
+		PAVGB(%%mm1, %%mm3)				      // ~(l0-l3)/4 +(l2-l1)/8 + 128
759
+		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
760
+// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
761
+
762
+		PAVGB((%%ecx, %1), %%mm5)			      // (l6-l5+256)/2
763
+		"movq (%%ecx, %1, 2), %%mm1			\n\t" // l7
764
+		"pxor %%mm6, %%mm1				\n\t" // -l7-1
765
+		PAVGB((%0, %1, 4), %%mm1)			      // (l4-l7+256)/2
766
+		"movq "MANGLE(b80)", %%mm2			\n\t" // 128
767
+		PAVGB(%%mm5, %%mm2)				      // ~(l6-l5)/4 + 128
768
+		PAVGB(%%mm1, %%mm2)				      // ~(l4-l7)/4 +(l6-l5)/8 + 128
769
+		PAVGB(%%mm5, %%mm2)				      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
770
+// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
771
+
772
+		"movq "MANGLE(b00)", %%mm1			\n\t" // 0
773
+		"movq "MANGLE(b00)", %%mm5			\n\t" // 0
774
+		"psubb %%mm2, %%mm1				\n\t" // 128 - renergy/16
775
+		"psubb %%mm3, %%mm5				\n\t" // 128 - lenergy/16
776
+		PMAXUB(%%mm1, %%mm2)				      // 128 + |renergy/16|
777
+ 		PMAXUB(%%mm5, %%mm3)				      // 128 + |lenergy/16|
778
+		PMINUB(%%mm2, %%mm3, %%mm1)			      // 128 + MIN(|lenergy|,|renergy|)/16
779
+
780
+// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
781
+
782
+		"movq "MANGLE(b00)", %%mm7			\n\t" // 0
783
+		"movq %2, %%mm2					\n\t" // QP
784
+		PAVGB(%%mm6, %%mm2)				      // 128 + QP/2
785
+		"psubb %%mm6, %%mm2				\n\t"
786
+
787
+		"movq %%mm4, %%mm1				\n\t"
788
+		"pcmpgtb %%mm7, %%mm1				\n\t" // SIGN(menergy)
789
+		"pxor %%mm1, %%mm4				\n\t"
790
+		"psubb %%mm1, %%mm4				\n\t" // 128 + |menergy|/16
791
+		"pcmpgtb %%mm4, %%mm2				\n\t" // |menergy|/16 < QP/2
792
+		"psubusb %%mm3, %%mm4				\n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
793
+// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
794
+
795
+		"movq %%mm4, %%mm3				\n\t" // d
796
+		"psubusb "MANGLE(b01)", %%mm4			\n\t"
797
+		PAVGB(%%mm7, %%mm4)				      // d/32
798
+		PAVGB(%%mm7, %%mm4)				      // (d + 32)/64
799
+		"paddb %%mm3, %%mm4				\n\t" // 5d/64
800
+		"pand %%mm2, %%mm4				\n\t"
801
+
802
+		"movq "MANGLE(b80)", %%mm5			\n\t" // 128
803
+		"psubb %%mm0, %%mm5				\n\t" // q
804
+		"paddsb %%mm6, %%mm5				\n\t" // fix bad rounding
805
+		"pcmpgtb %%mm5, %%mm7				\n\t" // SIGN(q)
806
+		"pxor %%mm7, %%mm5				\n\t"
807
+
808
+		PMINUB(%%mm5, %%mm4, %%mm3)			      // MIN(|q|, 5d/64)
809
+		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)
810
+
811
+		"pand %%mm7, %%mm4				\n\t"
812
+		"movq (%%eax, %1, 2), %%mm0			\n\t"
813
+		"movq (%0, %1, 4), %%mm2			\n\t"
814
+		"pxor %%mm1, %%mm0				\n\t"
815
+		"pxor %%mm1, %%mm2				\n\t"
816
+		"paddb %%mm4, %%mm0				\n\t"
817
+		"psubb %%mm4, %%mm2				\n\t"
818
+		"pxor %%mm1, %%mm0				\n\t"
819
+		"pxor %%mm1, %%mm2				\n\t"
820
+		"movq %%mm0, (%%eax, %1, 2)			\n\t"
821
+		"movq %%mm2, (%0, %1, 4)			\n\t"
822
+
823
+		:
824
+		: "r" (src), "r" (stride), "m" (c->pQPb)
825
+		: "%eax", "%ecx"
826
+	);
827
+
828
+/*
829
+	{
830
+	int x;
831
+	src-= stride;
832
+	for(x=0; x<BLOCK_SIZE; x++)
833
+	{
834
+		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
835
+		if(ABS(middleEnergy)< 8*QP)
836
+		{
837
+			const int q=(src[l4] - src[l5])/2;
838
+			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
839
+			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
840
+
841
+			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
842
+			d= MAX(d, 0);
843
+
844
+			d= (5*d + 32) >> 6;
845
+			d*= SIGN(-middleEnergy);
846
+
847
+			if(q>0)
848
+			{
849
+				d= d<0 ? 0 : d;
850
+				d= d>q ? q : d;
851
+			}
852
+			else
853
+			{
854
+				d= d>0 ? 0 : d;
855
+				d= d<q ? q : d;
856
+			}
857
+
858
+        		src[l4]-= d;
859
+	        	src[l5]+= d;
860
+		}
861
+		src++;
862
+	}
863
+src-=8;
864
+	for(x=0; x<8; x++)
865
+	{
866
+		int y;
867
+		for(y=4; y<6; y++)
868
+		{
869
+			int d= src[x+y*stride] - tmp[x+(y-4)*8];
870
+			int ad= ABS(d);
871
+			static int max=0;
872
+			static int sum=0;
873
+			static int num=0;
874
+			static int bias=0;
875
+
876
+			if(max<ad) max=ad;
877
+			sum+= ad>3 ? 1 : 0;
878
+			if(ad>3)
879
+			{
880
+				src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
881
+			}
882
+			if(y==4) bias+=d;
883
+			num++;
884
+			if(num%1000000 == 0)
885
+			{
886
+				printf(" %d %d %d %d\n", num, sum, max, bias);
887
+			}
888
+		}
889
+	}
890
+}
891
+*/
892
+#elif defined (HAVE_MMX)
893
+	src+= stride*4;
894
+	asm volatile(
895
+		"pxor %%mm7, %%mm7				\n\t"
896
+		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars
897
+		"andl $0xFFFFFFF8, %%ecx			\n\t" // align
898
+//	0	1	2	3	4	5	6	7
899
+//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	edx+%1	edx+2%1
900
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1
901
+
902
+		"movq (%0), %%mm0				\n\t"
903
+		"movq %%mm0, %%mm1				\n\t"
904
+		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
905
+		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
906
+
907
+		"movq (%0, %1), %%mm2				\n\t"
908
+		"leal (%0, %1, 2), %%eax			\n\t"
909
+		"movq %%mm2, %%mm3				\n\t"
910
+		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
911
+		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
912
+
913
+		"movq (%%eax), %%mm4				\n\t"
914
+		"movq %%mm4, %%mm5				\n\t"
915
+		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
916
+		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
917
+
918
+		"paddw %%mm0, %%mm0				\n\t" // 2L0
919
+		"paddw %%mm1, %%mm1				\n\t" // 2H0
920
+		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
921
+		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
922
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
923
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2
924
+
925
+		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
926
+		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
927
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
928
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
929
+
930
+		"movq (%%eax, %1), %%mm2			\n\t"
931
+		"movq %%mm2, %%mm3				\n\t"
932
+		"punpcklbw %%mm7, %%mm2				\n\t" // L3
933
+		"punpckhbw %%mm7, %%mm3				\n\t" // H3
934
+
935
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
936
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
937
+		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
938
+		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
939
+		"movq %%mm0, (%%ecx)				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
940
+		"movq %%mm1, 8(%%ecx)				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
941
+
942
+		"movq (%%eax, %1, 2), %%mm0			\n\t"
943
+		"movq %%mm0, %%mm1				\n\t"
944
+		"punpcklbw %%mm7, %%mm0				\n\t" // L4
945
+		"punpckhbw %%mm7, %%mm1				\n\t" // H4
946
+
947
+		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
948
+		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
949
+		"movq %%mm2, 16(%%ecx)				\n\t" // L3 - L4
950
+		"movq %%mm3, 24(%%ecx)				\n\t" // H3 - H4
951
+		"paddw %%mm4, %%mm4				\n\t" // 2L2
952
+		"paddw %%mm5, %%mm5				\n\t" // 2H2
953
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
954
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
955
+
956
+		"leal (%%eax, %1), %0				\n\t"
957
+		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
958
+		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
959
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
960
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
961
+//50 opcodes so far
962
+		"movq (%0, %1, 2), %%mm2			\n\t"
963
+		"movq %%mm2, %%mm3				\n\t"
964
+		"punpcklbw %%mm7, %%mm2				\n\t" // L5
965
+		"punpckhbw %%mm7, %%mm3				\n\t" // H5
966
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
967
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
968
+		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
969
+		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
970
+
971
+		"movq (%%eax, %1, 4), %%mm6			\n\t"
972
+		"punpcklbw %%mm7, %%mm6				\n\t" // L6
973
+		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
974
+		"movq (%%eax, %1, 4), %%mm6			\n\t"
975
+		"punpckhbw %%mm7, %%mm6				\n\t" // H6
976
+		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
977
+
978
+		"paddw %%mm0, %%mm0				\n\t" // 2L4
979
+		"paddw %%mm1, %%mm1				\n\t" // 2H4
980
+		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
981
+		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6
982
+
983
+		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
984
+		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
985
+		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
986
+		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6
987
+
988
+		"movq (%0, %1, 4), %%mm2			\n\t"
989
+		"movq %%mm2, %%mm3				\n\t"
990
+		"punpcklbw %%mm7, %%mm2				\n\t" // L7
991
+		"punpckhbw %%mm7, %%mm3				\n\t" // H7
992
+
993
+		"paddw %%mm2, %%mm2				\n\t" // 2L7
994
+		"paddw %%mm3, %%mm3				\n\t" // 2H7
995
+		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
996
+		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
997
+
998
+		"movq (%%ecx), %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
999
+		"movq 8(%%ecx), %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
1000
+
1001
+#ifdef HAVE_MMX2
1002
+		"movq %%mm7, %%mm6				\n\t" // 0
1003
+		"psubw %%mm0, %%mm6				\n\t"
1004
+		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1005
+		"movq %%mm7, %%mm6				\n\t" // 0
1006
+		"psubw %%mm1, %%mm6				\n\t"
1007
+		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1008
+		"movq %%mm7, %%mm6				\n\t" // 0
1009
+		"psubw %%mm2, %%mm6				\n\t"
1010
+		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1011
+		"movq %%mm7, %%mm6				\n\t" // 0
1012
+		"psubw %%mm3, %%mm6				\n\t"
1013
+		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1014
+#else
1015
+		"movq %%mm7, %%mm6				\n\t" // 0
1016
+		"pcmpgtw %%mm0, %%mm6				\n\t"
1017
+		"pxor %%mm6, %%mm0				\n\t"
1018
+		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1019
+		"movq %%mm7, %%mm6				\n\t" // 0
1020
+		"pcmpgtw %%mm1, %%mm6				\n\t"
1021
+		"pxor %%mm6, %%mm1				\n\t"
1022
+		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1023
+		"movq %%mm7, %%mm6				\n\t" // 0
1024
+		"pcmpgtw %%mm2, %%mm6				\n\t"
1025
+		"pxor %%mm6, %%mm2				\n\t"
1026
+		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1027
+		"movq %%mm7, %%mm6				\n\t" // 0
1028
+		"pcmpgtw %%mm3, %%mm6				\n\t"
1029
+		"pxor %%mm6, %%mm3				\n\t"
1030
+		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1031
+#endif
1032
+
1033
+#ifdef HAVE_MMX2
1034
+		"pminsw %%mm2, %%mm0				\n\t"
1035
+		"pminsw %%mm3, %%mm1				\n\t"
1036
+#else
1037
+		"movq %%mm0, %%mm6				\n\t"
1038
+		"psubusw %%mm2, %%mm6				\n\t"
1039
+		"psubw %%mm6, %%mm0				\n\t"
1040
+		"movq %%mm1, %%mm6				\n\t"
1041
+		"psubusw %%mm3, %%mm6				\n\t"
1042
+		"psubw %%mm6, %%mm1				\n\t"
1043
+#endif
1044
+
1045
+		"movq %%mm7, %%mm6				\n\t" // 0
1046
+		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1047
+		"pxor %%mm6, %%mm4				\n\t"
1048
+		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1049
+		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1050
+		"pxor %%mm7, %%mm5				\n\t"
1051
+		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1052
+// 100 opcodes
1053
+		"movd %2, %%mm2					\n\t" // QP
1054
+		"psllw $3, %%mm2				\n\t" // 8QP
1055
+		"movq %%mm2, %%mm3				\n\t" // 8QP
1056
+		"pcmpgtw %%mm4, %%mm2				\n\t"
1057
+		"pcmpgtw %%mm5, %%mm3				\n\t"
1058
+		"pand %%mm2, %%mm4				\n\t"
1059
+		"pand %%mm3, %%mm5				\n\t"
1060
+
1061
+
1062
+		"psubusw %%mm0, %%mm4				\n\t" // hd
1063
+		"psubusw %%mm1, %%mm5				\n\t" // ld
1064
+
1065
+
1066
+		"movq "MANGLE(w05)", %%mm2			\n\t" // 5
1067
+		"pmullw %%mm2, %%mm4				\n\t"
1068
+		"pmullw %%mm2, %%mm5				\n\t"
1069
+		"movq "MANGLE(w20)", %%mm2			\n\t" // 32
1070
+		"paddw %%mm2, %%mm4				\n\t"
1071
+		"paddw %%mm2, %%mm5				\n\t"
1072
+		"psrlw $6, %%mm4				\n\t"
1073
+		"psrlw $6, %%mm5				\n\t"
1074
+
1075
+		"movq 16(%%ecx), %%mm0				\n\t" // L3 - L4
1076
+		"movq 24(%%ecx), %%mm1				\n\t" // H3 - H4
1077
+
1078
+		"pxor %%mm2, %%mm2				\n\t"
1079
+		"pxor %%mm3, %%mm3				\n\t"
1080
+
1081
+		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
1082
+		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
1083
+		"pxor %%mm2, %%mm0				\n\t"
1084
+		"pxor %%mm3, %%mm1				\n\t"
1085
+		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
1086
+		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
1087
+		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
1088
+		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2
1089
+
1090
+		"pxor %%mm6, %%mm2				\n\t"
1091
+		"pxor %%mm7, %%mm3				\n\t"
1092
+		"pand %%mm2, %%mm4				\n\t"
1093
+		"pand %%mm3, %%mm5				\n\t"
1094
+
1095
+#ifdef HAVE_MMX2
1096
+		"pminsw %%mm0, %%mm4				\n\t"
1097
+		"pminsw %%mm1, %%mm5				\n\t"
1098
+#else
1099
+		"movq %%mm4, %%mm2				\n\t"
1100
+		"psubusw %%mm0, %%mm2				\n\t"
1101
+		"psubw %%mm2, %%mm4				\n\t"
1102
+		"movq %%mm5, %%mm2				\n\t"
1103
+		"psubusw %%mm1, %%mm2				\n\t"
1104
+		"psubw %%mm2, %%mm5				\n\t"
1105
+#endif
1106
+		"pxor %%mm6, %%mm4				\n\t"
1107
+		"pxor %%mm7, %%mm5				\n\t"
1108
+		"psubw %%mm6, %%mm4				\n\t"
1109
+		"psubw %%mm7, %%mm5				\n\t"
1110
+		"packsswb %%mm5, %%mm4				\n\t"
1111
+		"movq (%0), %%mm0				\n\t"
1112
+		"paddb   %%mm4, %%mm0				\n\t"
1113
+		"movq %%mm0, (%0)				\n\t"
1114
+		"movq (%0, %1), %%mm0				\n\t"
1115
+		"psubb %%mm4, %%mm0				\n\t"
1116
+		"movq %%mm0, (%0, %1)				\n\t"
1117
+
1118
+		: "+r" (src)
1119
+		: "r" (stride), "m" (c->pQPb)
1120
+		: "%eax", "%ecx"
1121
+	);
1122
+#else
1123
+	const int l1= stride;
1124
+	const int l2= stride + l1;
1125
+	const int l3= stride + l2;
1126
+	const int l4= stride + l3;
1127
+	const int l5= stride + l4;
1128
+	const int l6= stride + l5;
1129
+	const int l7= stride + l6;
1130
+	const int l8= stride + l7;
1131
+//	const int l9= stride + l8;
1132
+	int x;
1133
+	src+= stride*3;
1134
+	for(x=0; x<BLOCK_SIZE; x++)
1135
+	{
1136
+		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1137
+		if(ABS(middleEnergy) < 8*c->QP)
1138
+		{
1139
+			const int q=(src[l4] - src[l5])/2;
1140
+			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1141
+			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1142
+
1143
+			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1144
+			d= MAX(d, 0);
1145
+
1146
+			d= (5*d + 32) >> 6;
1147
+			d*= SIGN(-middleEnergy);
1148
+
1149
+			if(q>0)
1150
+			{
1151
+				d= d<0 ? 0 : d;
1152
+				d= d>q ? q : d;
1153
+			}
1154
+			else
1155
+			{
1156
+				d= d>0 ? 0 : d;
1157
+				d= d<q ? q : d;
1158
+			}
1159
+
1160
+        		src[l4]-= d;
1161
+	        	src[l5]+= d;
1162
+		}
1163
+		src++;
1164
+	}
1165
+#endif
1166
+}
1167
+
1168
+static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1169
+{
1170
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1171
+	asm volatile(
1172
+		"pxor %%mm6, %%mm6				\n\t"
1173
+		"pcmpeqb %%mm7, %%mm7				\n\t"
1174
+		"movq %2, %%mm0					\n\t"
1175
+		"punpcklbw %%mm6, %%mm0				\n\t"
1176
+		"psrlw $1, %%mm0				\n\t"
1177
+		"psubw %%mm7, %%mm0				\n\t"
1178
+		"packuswb %%mm0, %%mm0				\n\t"
1179
+		"movq %%mm0, %3					\n\t"
1180
+
1181
+		"leal (%0, %1), %%eax				\n\t"
1182
+		"leal (%%eax, %1, 4), %%edx			\n\t"
1183
+		
1184
+//	0	1	2	3	4	5	6	7	8	9
1185
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1186
+
1187
+#undef FIND_MIN_MAX
1188
+#ifdef HAVE_MMX2
1189
+#define FIND_MIN_MAX(addr)\
1190
+		"movq " #addr ", %%mm0				\n\t"\
1191
+		"pminub %%mm0, %%mm7				\n\t"\
1192
+		"pmaxub %%mm0, %%mm6				\n\t"
1193
+#else
1194
+#define FIND_MIN_MAX(addr)\
1195
+		"movq " #addr ", %%mm0				\n\t"\
1196
+		"movq %%mm7, %%mm1				\n\t"\
1197
+		"psubusb %%mm0, %%mm6				\n\t"\
1198
+		"paddb %%mm0, %%mm6				\n\t"\
1199
+		"psubusb %%mm0, %%mm1				\n\t"\
1200
+		"psubb %%mm1, %%mm7				\n\t"
1201
+#endif
1202
+
1203
+FIND_MIN_MAX((%%eax))
1204
+FIND_MIN_MAX((%%eax, %1))
1205
+FIND_MIN_MAX((%%eax, %1, 2))
1206
+FIND_MIN_MAX((%0, %1, 4))
1207
+FIND_MIN_MAX((%%edx))
1208
+FIND_MIN_MAX((%%edx, %1))
1209
+FIND_MIN_MAX((%%edx, %1, 2))
1210
+FIND_MIN_MAX((%0, %1, 8))
1211
+
1212
+		"movq %%mm7, %%mm4				\n\t"
1213
+		"psrlq $8, %%mm7				\n\t"
1214
+#ifdef HAVE_MMX2
1215
+		"pminub %%mm4, %%mm7				\n\t" // min of pixels
1216
+		"pshufw $0xF9, %%mm7, %%mm4			\n\t"
1217
+		"pminub %%mm4, %%mm7				\n\t" // min of pixels
1218
+		"pshufw $0xFE, %%mm7, %%mm4			\n\t"
1219
+		"pminub %%mm4, %%mm7				\n\t"
1220
+#else
1221
+		"movq %%mm7, %%mm1				\n\t"
1222
+		"psubusb %%mm4, %%mm1				\n\t"
1223
+		"psubb %%mm1, %%mm7				\n\t"
1224
+		"movq %%mm7, %%mm4				\n\t"
1225
+		"psrlq $16, %%mm7				\n\t"
1226
+		"movq %%mm7, %%mm1				\n\t"
1227
+		"psubusb %%mm4, %%mm1				\n\t"
1228
+		"psubb %%mm1, %%mm7				\n\t"
1229
+		"movq %%mm7, %%mm4				\n\t"
1230
+		"psrlq $32, %%mm7				\n\t"
1231
+		"movq %%mm7, %%mm1				\n\t"
1232
+		"psubusb %%mm4, %%mm1				\n\t"
1233
+		"psubb %%mm1, %%mm7				\n\t"
1234
+#endif
1235
+
1236
+
1237
+		"movq %%mm6, %%mm4				\n\t"
1238
+		"psrlq $8, %%mm6				\n\t"
1239
+#ifdef HAVE_MMX2
1240
+		"pmaxub %%mm4, %%mm6				\n\t" // max of pixels
1241
+		"pshufw $0xF9, %%mm6, %%mm4			\n\t"
1242
+		"pmaxub %%mm4, %%mm6				\n\t"
1243
+		"pshufw $0xFE, %%mm6, %%mm4			\n\t"
1244
+		"pmaxub %%mm4, %%mm6				\n\t"
1245
+#else
1246
+		"psubusb %%mm4, %%mm6				\n\t"
1247
+		"paddb %%mm4, %%mm6				\n\t"
1248
+		"movq %%mm6, %%mm4				\n\t"
1249
+		"psrlq $16, %%mm6				\n\t"
1250
+		"psubusb %%mm4, %%mm6				\n\t"
1251
+		"paddb %%mm4, %%mm6				\n\t"
1252
+		"movq %%mm6, %%mm4				\n\t"
1253
+		"psrlq $32, %%mm6				\n\t"
1254
+		"psubusb %%mm4, %%mm6				\n\t"
1255
+		"paddb %%mm4, %%mm6				\n\t"
1256
+#endif
1257
+		"movq %%mm6, %%mm0				\n\t" // max
1258
+		"psubb %%mm7, %%mm6				\n\t" // max - min
1259
+		"movd %%mm6, %%ecx				\n\t"
1260
+		"cmpb "MANGLE(deringThreshold)", %%cl		\n\t"
1261
+		" jb 1f						\n\t"
1262
+		"leal -24(%%esp), %%ecx				\n\t"
1263
+		"andl $0xFFFFFFF8, %%ecx			\n\t" 
1264
+		PAVGB(%%mm0, %%mm7)				      // a=(max + min)/2
1265
+		"punpcklbw %%mm7, %%mm7				\n\t"
1266
+		"punpcklbw %%mm7, %%mm7				\n\t"
1267
+		"punpcklbw %%mm7, %%mm7				\n\t"
1268
+		"movq %%mm7, (%%ecx)				\n\t"
1269
+
1270
+		"movq (%0), %%mm0				\n\t" // L10
1271
+		"movq %%mm0, %%mm1				\n\t" // L10
1272
+		"movq %%mm0, %%mm2				\n\t" // L10
1273
+		"psllq $8, %%mm1				\n\t"
1274
+		"psrlq $8, %%mm2				\n\t"
1275
+		"movd -4(%0), %%mm3				\n\t"
1276
+		"movd 8(%0), %%mm4				\n\t"
1277
+		"psrlq $24, %%mm3				\n\t"
1278
+		"psllq $56, %%mm4				\n\t"
1279
+		"por %%mm3, %%mm1				\n\t" // L00
1280
+		"por %%mm4, %%mm2				\n\t" // L20
1281
+		"movq %%mm1, %%mm3				\n\t" // L00
1282
+		PAVGB(%%mm2, %%mm1)				      // (L20 + L00)/2
1283
+		PAVGB(%%mm0, %%mm1)				      // (L20 + L00 + 2L10)/4
1284
+		"psubusb %%mm7, %%mm0				\n\t"
1285
+		"psubusb %%mm7, %%mm2				\n\t"
1286
+		"psubusb %%mm7, %%mm3				\n\t"
1287
+		"pcmpeqb "MANGLE(b00)", %%mm0			\n\t" // L10 > a ? 0 : -1
1288
+		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L20 > a ? 0 : -1
1289
+		"pcmpeqb "MANGLE(b00)", %%mm3			\n\t" // L00 > a ? 0 : -1
1290
+		"paddb %%mm2, %%mm0				\n\t"
1291
+		"paddb %%mm3, %%mm0				\n\t"
1292
+
1293
+		"movq (%%eax), %%mm2				\n\t" // L11
1294
+		"movq %%mm2, %%mm3				\n\t" // L11
1295
+		"movq %%mm2, %%mm4				\n\t" // L11
1296
+		"psllq $8, %%mm3				\n\t"
1297
+		"psrlq $8, %%mm4				\n\t"
1298
+		"movd -4(%%eax), %%mm5				\n\t"
1299
+		"movd 8(%%eax), %%mm6				\n\t"
1300
+		"psrlq $24, %%mm5				\n\t"
1301
+		"psllq $56, %%mm6				\n\t"
1302
+		"por %%mm5, %%mm3				\n\t" // L01
1303
+		"por %%mm6, %%mm4				\n\t" // L21
1304
+		"movq %%mm3, %%mm5				\n\t" // L01
1305
+		PAVGB(%%mm4, %%mm3)				      // (L21 + L01)/2
1306
+		PAVGB(%%mm2, %%mm3)				      // (L21 + L01 + 2L11)/4
1307
+		"psubusb %%mm7, %%mm2				\n\t"
1308
+		"psubusb %%mm7, %%mm4				\n\t"
1309
+		"psubusb %%mm7, %%mm5				\n\t"
1310
+		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L11 > a ? 0 : -1
1311
+		"pcmpeqb "MANGLE(b00)", %%mm4			\n\t" // L21 > a ? 0 : -1
1312
+		"pcmpeqb "MANGLE(b00)", %%mm5			\n\t" // L01 > a ? 0 : -1
1313
+		"paddb %%mm4, %%mm2				\n\t"
1314
+		"paddb %%mm5, %%mm2				\n\t"
1315
+// 0, 2, 3, 1
1316
+#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1317
+		"movq " #src ", " #sx "				\n\t" /* src[0] */\
1318
+		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
1319
+		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
1320
+		"psllq $8, " #lx "				\n\t"\
1321
+		"psrlq $8, " #t0 "				\n\t"\
1322
+		"movd -4" #src ", " #t1 "			\n\t"\
1323
+		"psrlq $24, " #t1 "				\n\t"\
1324
+		"por " #t1 ", " #lx "				\n\t" /* src[-1] */\
1325
+		"movd 8" #src ", " #t1 "			\n\t"\
1326
+		"psllq $56, " #t1 "				\n\t"\
1327
+		"por " #t1 ", " #t0 "				\n\t" /* src[+1] */\
1328
+		"movq " #lx ", " #t1 "				\n\t" /* src[-1] */\
1329
+		PAVGB(t0, lx)				              /* (src[-1] + src[+1])/2 */\
1330
+		PAVGB(sx, lx)				      /* (src[-1] + 2src[0] + src[+1])/4 */\
1331
+		PAVGB(lx, pplx)					     \
1332
+		"movq " #lx ", 8(%%ecx)				\n\t"\
1333
+		"movq (%%ecx), " #lx "				\n\t"\
1334
+		"psubusb " #lx ", " #t1 "			\n\t"\
1335
+		"psubusb " #lx ", " #t0 "			\n\t"\
1336
+		"psubusb " #lx ", " #sx "			\n\t"\
1337
+		"movq "MANGLE(b00)", " #lx "			\n\t"\
1338
+		"pcmpeqb " #lx ", " #t1 "			\n\t" /* src[-1] > a ? 0 : -1*/\
1339
+		"pcmpeqb " #lx ", " #t0 "			\n\t" /* src[+1] > a ? 0 : -1*/\
1340
+		"pcmpeqb " #lx ", " #sx "			\n\t" /* src[0]  > a ? 0 : -1*/\
1341
+		"paddb " #t1 ", " #t0 "				\n\t"\
1342
+		"paddb " #t0 ", " #sx "				\n\t"\
1343
+\
1344
+		PAVGB(plx, pplx)				      /* filtered */\
1345
+		"movq " #dst ", " #t0 "				\n\t" /* dst */\
1346
+		"movq " #t0 ", " #t1 "				\n\t" /* dst */\
1347
+		"psubusb %3, " #t0 "				\n\t"\
1348
+		"paddusb %3, " #t1 "				\n\t"\
1349
+		PMAXUB(t0, pplx)\
1350
+		PMINUB(t1, pplx, t0)\
1351
+		"paddb " #sx ", " #ppsx "			\n\t"\
1352
+		"paddb " #psx ", " #ppsx "			\n\t"\
1353
+		"#paddb "MANGLE(b02)", " #ppsx "		\n\t"\
1354
+		"pand "MANGLE(b08)", " #ppsx "			\n\t"\
1355
+		"pcmpeqb " #lx ", " #ppsx "			\n\t"\
1356
+		"pand " #ppsx ", " #pplx "			\n\t"\
1357
+		"pandn " #dst ", " #ppsx "			\n\t"\
1358
+		"por " #pplx ", " #ppsx "			\n\t"\
1359
+		"movq " #ppsx ", " #dst "			\n\t"\
1360
+		"movq 8(%%ecx), " #lx "				\n\t"
1361
+
1362
+/*
1363
+0000000
1364
+1111111
1365
+
1366
+1111110
1367
+1111101
1368
+1111100
1369
+1111011
1370
+1111010
1371
+1111001
1372
+
1373
+1111000
1374
+1110111
1375
+
1376
+*/
1377
+//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1378
+DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1379
+DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1380
+DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1381
+DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1382
+DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1383
+DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1384
+DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1385
+DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1386
+
1387
+		"1:			\n\t"
1388
+		: : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
1389
+		: "%eax", "%edx", "%ecx"
1390
+	);
1391
+#else
1392
+	int y;
1393
+	int min=255;
1394
+	int max=0;
1395
+	int avg;
1396
+	uint8_t *p;
1397
+	int s[10];
1398
+	const int QP2= c->QP/2 + 1;
1399
+
1400
+	for(y=1; y<9; y++)
1401
+	{
1402
+		int x;
1403
+		p= src + stride*y;
1404
+		for(x=1; x<9; x++)
1405
+		{
1406
+			p++;
1407
+			if(*p > max) max= *p;
1408
+			if(*p < min) min= *p;
1409
+		}
1410
+	}
1411
+	avg= (min + max + 1)>>1;
1412
+
1413
+	if(max - min <deringThreshold) return;
1414
+
1415
+	for(y=0; y<10; y++)
1416
+	{
1417
+		int t = 0;
1418
+
1419
+		if(src[stride*y + 0] > avg) t+= 1;
1420
+		if(src[stride*y + 1] > avg) t+= 2;
1421
+		if(src[stride*y + 2] > avg) t+= 4;
1422
+		if(src[stride*y + 3] > avg) t+= 8;
1423
+		if(src[stride*y + 4] > avg) t+= 16;
1424
+		if(src[stride*y + 5] > avg) t+= 32;
1425
+		if(src[stride*y + 6] > avg) t+= 64;
1426
+		if(src[stride*y + 7] > avg) t+= 128;
1427
+		if(src[stride*y + 8] > avg) t+= 256;
1428
+		if(src[stride*y + 9] > avg) t+= 512;
1429
+		
1430
+		t |= (~t)<<16;
1431
+		t &= (t<<1) & (t>>1);
1432
+		s[y] = t;
1433
+	}
1434
+	
1435
+	for(y=1; y<9; y++)
1436
+	{
1437
+		int t = s[y-1] & s[y] & s[y+1];
1438
+		t|= t>>16;
1439
+		s[y-1]= t;
1440
+	}
1441
+
1442
+	for(y=1; y<9; y++)
1443
+	{
1444
+		int x;
1445
+		int t = s[y-1];
1446
+
1447
+		p= src + stride*y;
1448
+		for(x=1; x<9; x++)
1449
+		{
1450
+			p++;
1451
+			if(t & (1<<x))
1452
+			{
1453
+				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1454
+				      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1455
+				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1456
+				f= (f + 8)>>4;
1457
+
1458
+#ifdef DEBUG_DERING_THRESHOLD
1459
+				asm volatile("emms\n\t":);
1460
+				{
1461
+				static long long numPixels=0;
1462
+				if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1463
+//				if((max-min)<20 || (max-min)*QP<200)
1464
+//				if((max-min)*QP < 500)
1465
+//				if(max-min<QP/2)
1466
+				if(max-min < 20)
1467
+				{
1468
+					static int numSkiped=0;
1469
+					static int errorSum=0;
1470
+					static int worstQP=0;
1471
+					static int worstRange=0;
1472
+					static int worstDiff=0;
1473
+					int diff= (f - *p);
1474
+					int absDiff= ABS(diff);
1475
+					int error= diff*diff;
1476
+
1477
+					if(x==1 || x==8 || y==1 || y==8) continue;
1478
+
1479
+					numSkiped++;
1480
+					if(absDiff > worstDiff)
1481
+					{
1482
+						worstDiff= absDiff;
1483
+						worstQP= QP;
1484
+						worstRange= max-min;
1485
+					}
1486
+					errorSum+= error;
1487
+
1488
+					if(1024LL*1024LL*1024LL % numSkiped == 0)
1489
+					{
1490
+						printf( "sum:%1.3f, skip:%d, wQP:%d, "
1491
+							"wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1492
+							(float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1493
+							worstDiff, (float)numSkiped/numPixels);
1494
+					}
1495
+				}
1496
+				}
1497
+#endif
1498
+				if     (*p + QP2 < f) *p= *p + QP2;
1499
+				else if(*p - QP2 > f) *p= *p - QP2;
1500
+				else *p=f;
1501
+			}
1502
+		}
1503
+	}
1504
+#ifdef DEBUG_DERING_THRESHOLD
1505
+	if(max-min < 20)
1506
+	{
1507
+		for(y=1; y<9; y++)
1508
+		{
1509
+			int x;
1510
+			int t = 0;
1511
+			p= src + stride*y;
1512
+			for(x=1; x<9; x++)
1513
+			{
1514
+				p++;
1515
+				*p = MIN(*p + 20, 255);
1516
+			}
1517
+		}
1518
+//		src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1519
+	}
1520
+#endif
1521
+#endif
1522
+}
1523
+
1524
+/**
1525
+ * Deinterlaces the given block
1526
+ * will be called for every 8x8 block and can read & write from line 4-15
1527
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1528
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1529
+ */
1530
+static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1530
+{
1531
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1532
+	src+= 4*stride; // the line numbers below are relative to this position
1533
+	asm volatile(
1534
+		"leal (%0, %1), %%eax				\n\t"
1535
+		"leal (%%eax, %1, 4), %%ecx			\n\t"
1536
+//	0	1	2	3	4	5	6	7	8	9
1537
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
1538
+
1539
+		"movq (%0), %%mm0				\n\t" // line 0
1540
+		"movq (%%eax, %1), %%mm1			\n\t" // line 2
1541
+		PAVGB(%%mm1, %%mm0)
1542
+		"movq %%mm0, (%%eax)				\n\t" // line 1 is rebuilt from lines 0 and 2
1543
+		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
1544
+		PAVGB(%%mm0, %%mm1)
1545
+		"movq %%mm1, (%%eax, %1, 2)			\n\t" // line 3 is rebuilt from lines 2 and 4
1546
+		"movq (%%ecx, %1), %%mm1			\n\t" // line 6
1547
+		PAVGB(%%mm1, %%mm0)
1548
+		"movq %%mm0, (%%ecx)				\n\t" // line 5 is rebuilt from lines 4 and 6
1549
+		"movq (%0, %1, 8), %%mm0			\n\t" // line 8
1550
+		PAVGB(%%mm0, %%mm1)
1551
+		"movq %%mm1, (%%ecx, %1, 2)			\n\t" // line 7 is rebuilt from lines 6 and 8
1552
+
1553
+		: : "r" (src), "r" (stride)
1554
+		: "%eax", "%ecx"
1555
+	);
1556
+#else
1557
+	uint8_t *col= src + 4*stride; // current column, starting 4 lines below src
1558
+	uint8_t * const end= col + 8; // one past the last of the 8 columns
1559
+	for(; col<end; col++)
1560
+	{
1561
+		int y;
1562
+		for(y=1; y<8; y+=2) // the odd lines 1, 3, 5 and 7 are replaced
1563
+		{
1564
+			col[stride*y]= (col[stride*(y-1)] + col[stride*(y+1)])>>1; // mean of the lines above and below, rounded down
1565
+		}
1566
+	}
1567
+#endif
1568
+}
1570
+
1571
+/**
1571
+ * Deinterlaces the given block
1572
+ * will be called for every 8x8 block and can read & write from line 4-15
1573
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1574
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1575
+ * this filter will read lines 3-15 and write 7-13
1576
+ * the C version clips its results to 0..255, like the saturating pack of the SIMD version
1577
+ */
1578
+static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1579
+{
1580
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1581
+	src+= stride*3; // the line numbers below are relative to this position
1582
+	asm volatile(
1583
+		"leal (%0, %1), %%eax				\n\t" // eax= line 1
1584
+		"leal (%%eax, %1, 4), %%edx			\n\t" // edx= line 5
1585
+		"leal (%%edx, %1, 4), %%ecx			\n\t" // ecx= line 9
1586
+		"addl %1, %%ecx					\n\t" // ecx= line 10
1587
+		"pxor %%mm7, %%mm7				\n\t" // 0, used to unpack bytes to words
1588
+//	0	1	2	3	4	5	6	7	8	9	10
1589
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
1590
+
1591
+#define DEINT_CUBIC(a,b,c,d,e)\
1592
+		"movq " #a ", %%mm0				\n\t"\
1593
+		"movq " #b ", %%mm1				\n\t"\
1594
+		"movq " #d ", %%mm2				\n\t"\
1595
+		"movq " #e ", %%mm3				\n\t"\
1596
+		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
1597
+		PAVGB(%%mm3, %%mm0)					/* (a+e) /2 */\
1598
+		"movq %%mm0, %%mm2				\n\t"\
1599
+		"punpcklbw %%mm7, %%mm0				\n\t"\
1600
+		"punpckhbw %%mm7, %%mm2				\n\t"\
1601
+		"movq %%mm1, %%mm3				\n\t"\
1602
+		"punpcklbw %%mm7, %%mm1				\n\t"\
1603
+		"punpckhbw %%mm7, %%mm3				\n\t"\
1604
+		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
1605
+		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
1606
+		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
1607
+		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
1608
+		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
1609
+		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
1610
+		"packuswb %%mm3, %%mm1				\n\t"\
1611
+		"movq %%mm1, " #c "				\n\t"
1612
+
1613
+DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) // line 3 from lines 0, 2, 4, 6
1614
+DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) // line 5 from lines 2, 4, 6, 8
1615
+DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) // line 7 from lines 4, 6, 8, 10
1616
+DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) // line 9 from lines 6, 8, 10, 12
1617
+
1618
+		: : "r" (src), "r" (stride)
1619
+		: "%eax", "%edx", "%ecx"
1620
+	);
1621
+#else
1622
+	int x;
1623
+	src+= stride*3;
1624
+	for(x=0; x<8; x++) // 8 columns; each odd line is (-a + 9b + 9d - e)/16 of its even neighbours, clipped so it cannot wrap around in the uint8_t store
1625
+	{
1626
+		src[stride*3] = MAX(0, MIN(255, (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4));
1627
+		src[stride*5] = MAX(0, MIN(255, (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4));
1628
+		src[stride*7] = MAX(0, MIN(255, (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4));
1629
+		src[stride*9] = MAX(0, MIN(255, (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4));
1630
+		src++;
1631
+	}
1632
+#endif
1633
+}
1635
+
1636
+/**
1637
+ * Deinterlaces the given block by filtering every second line vertically with the 5 tap kernel (-1 4 2 4 -1)/8
1638
+ * will be called for every 8x8 block and can read & write from line 4-15
1639
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1640
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1641
+ * this filter will read lines 4-13 and write 5, 7, 9 and 11; tmp holds 8 bytes of state: on entry it is used as the unfiltered line 2 rows above the first written line, on exit it holds the unfiltered line 11 (presumably consumed by the call for the block below - confirm against the caller)
1642
+ * no clipping in C version
1643
+ */
1644
+static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1645
+{
1646
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1647
+	src+= stride*4;
1648
+	asm volatile(
1649
+		"leal (%0, %1), %%eax				\n\t"
1650
+		"leal (%%eax, %1, 4), %%edx			\n\t"
1651
+		"pxor %%mm7, %%mm7				\n\t"
1652
+		"movq (%2), %%mm0				\n\t"
1653
+//	0	1	2	3	4	5	6	7	8	9	10
1654
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
1655
+
1656
+#define DEINT_FF(a,b,c,d)\
1657
+		"movq " #a ", %%mm1				\n\t"\
1658
+		"movq " #b ", %%mm2				\n\t"\
1659
+		"movq " #c ", %%mm3				\n\t"\
1660
+		"movq " #d ", %%mm4				\n\t"\
1661
+		PAVGB(%%mm3, %%mm1)					\
1662
+		PAVGB(%%mm4, %%mm0)					\
1663
+		"movq %%mm0, %%mm3				\n\t"\
1664
+		"punpcklbw %%mm7, %%mm0				\n\t"\
1665
+		"punpckhbw %%mm7, %%mm3				\n\t"\
1666
+		"movq %%mm1, %%mm4				\n\t"\
1667
+		"punpcklbw %%mm7, %%mm1				\n\t"\
1668
+		"punpckhbw %%mm7, %%mm4				\n\t"\
1669
+		"psllw $2, %%mm1				\n\t"\
1670
+		"psllw $2, %%mm4				\n\t"\
1671
+		"psubw %%mm0, %%mm1				\n\t"\
1672
+		"psubw %%mm3, %%mm4				\n\t"\
1673
+		"movq %%mm2, %%mm5				\n\t"\
1674
+		"movq %%mm2, %%mm0				\n\t"\
1675
+		"punpcklbw %%mm7, %%mm2				\n\t"\
1676
+		"punpckhbw %%mm7, %%mm5				\n\t"\
1677
+		"paddw %%mm2, %%mm1				\n\t"\
1678
+		"paddw %%mm5, %%mm4				\n\t"\
1679
+		"psraw $2, %%mm1				\n\t"\
1680
+		"psraw $2, %%mm4				\n\t"\
1681
+		"packuswb %%mm4, %%mm1				\n\t"\
1682
+		"movq %%mm1, " #b "				\n\t"\
1683
+
1684
+DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
1685
+DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
1686
+DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
1687
+DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1688
+
1689
+		"movq %%mm0, (%2)				\n\t"
1690
+		: : "r" (src), "r" (stride), "r"(tmp)
1691
+		: "%eax", "%edx", "memory" // "memory": the asm stores into src[] and tmp[] but declares no output operand
1692
+	);
1693
+#else
1694
+	int x;
1695
+	src+= stride*4;
1696
+	for(x=0; x<8; x++) // one column per iteration
1697
+	{
1698
+		int t1= tmp[x]; // unfiltered line above src[0] as left in tmp
1699
+		int t2= src[stride*1]; // unfiltered line 1, needed again after it has been overwritten
1700
+
1701
+		src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3;
1702
+		t1= src[stride*3]; // save the unfiltered centre line before overwriting it (was stride*4, which disagreed with the asm path)
1703
+		src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3;
1704
+		t2= src[stride*5]; // was stride*6
1705
+		src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3;
1706
+		t1= src[stride*7]; // was stride*8
1707
+		src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3;
1708
+		tmp[x]= t1; // unfiltered line 7, same value the asm path writes back to tmp
1709
+
1710
+		src++;
1711
+	}
1712
+#endif
1713
+}
1714
+
1715
+/**
1716
+ * Deinterlaces the given block by blending each line with its two successors using the weights (1 2 1)/4
1717
+ * will be called for every 8x8 block and can read & write from line 4-15
1718
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1719
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1720
+ * will shift the image up by 1 line (FIXME if this is a problem)
1721
+ * this filter will read lines 4-13 and write 4-11 (src points to line 0, stride is the byte distance between lines)
1722
+ */
1723
+static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
1724
+{
1725
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1726
+	src+= 4*stride;
1727
+	asm volatile(
1728
+		"leal (%0, %1), %%eax				\n\t"
1729
+		"leal (%%eax, %1, 4), %%edx			\n\t"
1730
+//	0	1	2	3	4	5	6	7	8	9
1731
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1732
+
1733
+		"movq (%0), %%mm0				\n\t" // L0
1734
+		"movq (%%eax, %1), %%mm1			\n\t" // L2
1735
+		PAVGB(%%mm1, %%mm0)				      // L0+L2
1736
+		"movq (%%eax), %%mm2				\n\t" // L1
1737
+		PAVGB(%%mm2, %%mm0)
1738
+		"movq %%mm0, (%0)				\n\t"
1739
+		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
1740
+		PAVGB(%%mm0, %%mm2)				      // L1+L3
1741
+		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
1742
+		"movq %%mm2, (%%eax)				\n\t"
1743
+		"movq (%0, %1, 4), %%mm2			\n\t" // L4
1744
+		PAVGB(%%mm2, %%mm1)				      // L2+L4
1745
+		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
1746
+		"movq %%mm1, (%%eax, %1)			\n\t"
1747
+		"movq (%%edx), %%mm1				\n\t" // L5
1748
+		PAVGB(%%mm1, %%mm0)				      // L3+L5
1749
+		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
1750
+		"movq %%mm0, (%%eax, %1, 2)			\n\t"
1751
+		"movq (%%edx, %1), %%mm0			\n\t" // L6
1752
+		PAVGB(%%mm0, %%mm2)				      // L4+L6
1753
+		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
1754
+		"movq %%mm2, (%0, %1, 4)			\n\t"
1755
+		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
1756
+		PAVGB(%%mm2, %%mm1)				      // L5+L7
1757
+		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
1758
+		"movq %%mm1, (%%edx)				\n\t"
1759
+		"movq (%0, %1, 8), %%mm1			\n\t" // L8
1760
+		PAVGB(%%mm1, %%mm0)				      // L6+L8
1761
+		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
1762
+		"movq %%mm0, (%%edx, %1)			\n\t"
1763
+		"movq (%%edx, %1, 4), %%mm0			\n\t" // L9
1764
+		PAVGB(%%mm0, %%mm2)				      // L7+L9
1765
+		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
1766
+		"movq %%mm2, (%%edx, %1, 2)			\n\t"
1767
+
1768
+
1769
+		: : "r" (src), "r" (stride)
1770
+		: "%eax", "%edx", "memory" // "memory": the asm stores into src[] but declares no output operand
1771
+	);
1772
+#else
1773
+	int x;
1774
+	src+= 4*stride;
1775
+	for(x=0; x<8; x++) // one column per iteration
1776
+	{
1777
+		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2; // every read is at or below the line being written, so only unfiltered values are used
1778
+		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1779
+		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1780
+		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1781
+		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1782
+		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1783
+		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1784
+		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1785
+		src++;
1786
+	}
1787
+#endif
1788
+}
1789
+
1790
+/**
1791
+ * Deinterlaces the given block by replacing every second line with the per pixel median of itself and the lines directly above and below it
1792
+ * will be called for every 8x8 block and can read & write from line 4-15,
1793
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1794
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced; this filter will read lines 4-12 and write 5, 7, 9 and 11
1795
+ */
1796
+static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1797
+{
1798
+#ifdef HAVE_MMX
1799
+	src+= 4*stride;
1800
+#ifdef HAVE_MMX2
1801
+	asm volatile(
1802
+		"leal (%0, %1), %%eax				\n\t"
1803
+		"leal (%%eax, %1, 4), %%edx			\n\t"
1804
+//	0	1	2	3	4	5	6	7	8	9
1805
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1806
+
1807
+		"movq (%0), %%mm0				\n\t" // L0
1808
+		"movq (%%eax, %1), %%mm2			\n\t" // L2
1809
+		"movq (%%eax), %%mm1				\n\t" // L1
1810
+		"movq %%mm0, %%mm3				\n\t"
1811
+		"pmaxub %%mm1, %%mm0				\n\t" // max(L0, L1)
1812
+		"pminub %%mm3, %%mm1				\n\t" // min(L0, L1)
1813
+		"pmaxub %%mm2, %%mm1				\n\t" // max(min(L0, L1), L2)
1814
+		"pminub %%mm1, %%mm0				\n\t"
1815
+		"movq %%mm0, (%%eax)				\n\t"
1816
+
1817
+		"movq (%0, %1, 4), %%mm0			\n\t" // L4
1818
+		"movq (%%eax, %1, 2), %%mm1			\n\t" // L3
1819
+		"movq %%mm2, %%mm3				\n\t"
1820
+		"pmaxub %%mm1, %%mm2				\n\t" //
1821
+		"pminub %%mm3, %%mm1				\n\t" //
1822
+		"pmaxub %%mm0, %%mm1				\n\t" //
1823
+		"pminub %%mm1, %%mm2				\n\t"
1824
+		"movq %%mm2, (%%eax, %1, 2)			\n\t"
1825
+
1826
+		"movq (%%edx), %%mm2				\n\t" // L5
1827
+		"movq (%%edx, %1), %%mm1			\n\t" // L6
1828
+		"movq %%mm2, %%mm3				\n\t"
1829
+		"pmaxub %%mm0, %%mm2				\n\t" //
1830
+		"pminub %%mm3, %%mm0				\n\t" //
1831
+		"pmaxub %%mm1, %%mm0				\n\t" //
1832
+		"pminub %%mm0, %%mm2				\n\t"
1833
+		"movq %%mm2, (%%edx)				\n\t"
1834
+
1835
+		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
1836
+		"movq (%0, %1, 8), %%mm0			\n\t" // L8
1837
+		"movq %%mm2, %%mm3				\n\t"
1838
+		"pmaxub %%mm0, %%mm2				\n\t" //
1839
+		"pminub %%mm3, %%mm0				\n\t" //
1840
+		"pmaxub %%mm1, %%mm0				\n\t" //
1841
+		"pminub %%mm0, %%mm2				\n\t"
1842
+		"movq %%mm2, (%%edx, %1, 2)			\n\t"
1843
+
1844
+
1845
+		: : "r" (src), "r" (stride)
1846
+		: "%eax", "%edx", "memory" // "memory": the asm stores into src[] but declares no output operand
1847
+	);
1848
+
1849
+#else // MMX without MMX2
1850
+	asm volatile(
1851
+		"leal (%0, %1), %%eax				\n\t"
1852
+		"leal (%%eax, %1, 4), %%edx			\n\t"
1853
+//	0	1	2	3	4	5	6	7	8	9
1854
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1855
+		"pxor %%mm7, %%mm7				\n\t"
1856
+
1857
+#define MEDIAN(a,b,c)\
1858
+		"movq " #a ", %%mm0				\n\t"\
1859
+		"movq " #b ", %%mm2				\n\t"\
1860
+		"movq " #c ", %%mm1				\n\t"\
1861
+		"movq %%mm0, %%mm3				\n\t"\
1862
+		"movq %%mm1, %%mm4				\n\t"\
1863
+		"movq %%mm2, %%mm5				\n\t"\
1864
+		"psubusb %%mm1, %%mm3				\n\t"\
1865
+		"psubusb %%mm2, %%mm4				\n\t"\
1866
+		"psubusb %%mm0, %%mm5				\n\t"\
1867
+		"pcmpeqb %%mm7, %%mm3				\n\t"\
1868
+		"pcmpeqb %%mm7, %%mm4				\n\t"\
1869
+		"pcmpeqb %%mm7, %%mm5				\n\t"\
1870
+		"movq %%mm3, %%mm6				\n\t"\
1871
+		"pxor %%mm4, %%mm3				\n\t"\
1872
+		"pxor %%mm5, %%mm4				\n\t"\
1873
+		"pxor %%mm6, %%mm5				\n\t"\
1874
+		"por %%mm3, %%mm1				\n\t"\
1875
+		"por %%mm4, %%mm2				\n\t"\
1876
+		"por %%mm5, %%mm0				\n\t"\
1877
+		"pand %%mm2, %%mm0				\n\t"\
1878
+		"pand %%mm1, %%mm0				\n\t"\
1879
+		"movq %%mm0, " #b "				\n\t"
1880
+
1881
+MEDIAN((%0), (%%eax), (%%eax, %1))
1882
+MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
1883
+MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
1884
+MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
1885
+
1886
+		: : "r" (src), "r" (stride)
1887
+		: "%eax", "%edx", "memory" // "memory": the asm stores into src[] but declares no output operand
1888
+	);
1889
+#endif // MMX
1890
+#else
1891
+	int x, y;
1892
+	src+= 4*stride;
1893
+	// FIXME - there should be a way to do a few columns in parallel like w/mmx
1894
+	for(x=0; x<8; x++)
1895
+	{
1896
+		uint8_t *colsrc = src;
1897
+		for (y=0; y<4; y++) // four (above, centre, below) triples per column, stepping 2 lines each time
1898
+		{
1899
+			int a, b, c, d, e, f;
1900
+			a = colsrc[0       ];
1901
+			b = colsrc[stride  ];
1902
+			c = colsrc[stride*2];
1903
+			d = (a-b)>>31; // sign masks: relies on a 32 bit int and an arithmetic right shift, giving -1 if a<b else 0
1904
+			e = (b-c)>>31;
1905
+			f = (c-a)>>31;
1906
+			colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); // a value whose two masks differ is forced to all ones, so the AND leaves the median
1907
+			colsrc += stride*2;
1908
+		}
1909
+		src++;
1910
+	}
1911
+#endif
1912
+}
1913
+
1914
+#ifdef HAVE_MMX
1915
+/**
1916
+ * transposes and shifts the given 8x8 block into dst1 and dst2: source column c becomes an 8 byte row, columns 0-4 are stored at dst1 + 128 + 16*c and columns 3-7 at dst2 + 16*c (src rows are srcStride bytes apart)
1917
+ */
1918
+static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1919
+{
1920
+	asm(
1921
+		"leal (%0, %1), %%eax				\n\t"
1922
+//	0	1	2	3	4	5	6	7	8	9
1923
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1924
+		"movq (%0), %%mm0		\n\t" // 12345678
1925
+		"movq (%%eax), %%mm1		\n\t" // abcdefgh
1926
+		"movq %%mm0, %%mm2		\n\t" // 12345678
1927
+		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
1928
+		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
1929
+
1930
+		"movq (%%eax, %1), %%mm1	\n\t"
1931
+		"movq (%%eax, %1, 2), %%mm3	\n\t"
1932
+		"movq %%mm1, %%mm4		\n\t"
1933
+		"punpcklbw %%mm3, %%mm1		\n\t"
1934
+		"punpckhbw %%mm3, %%mm4		\n\t"
1935
+
1936
+		"movq %%mm0, %%mm3		\n\t"
1937
+		"punpcklwd %%mm1, %%mm0		\n\t"
1938
+		"punpckhwd %%mm1, %%mm3		\n\t"
1939
+		"movq %%mm2, %%mm1		\n\t"
1940
+		"punpcklwd %%mm4, %%mm2		\n\t"
1941
+		"punpckhwd %%mm4, %%mm1		\n\t"
1942
+
1943
+		"movd %%mm0, 128(%2)		\n\t" // column 0, rows 0-3
1944
+		"psrlq $32, %%mm0		\n\t"
1945
+		"movd %%mm0, 144(%2)		\n\t"
1946
+		"movd %%mm3, 160(%2)		\n\t"
1947
+		"psrlq $32, %%mm3		\n\t"
1948
+		"movd %%mm3, 176(%2)		\n\t" // columns 3 and 4 are written to both destinations
1949
+		"movd %%mm3, 48(%3)		\n\t"
1950
+		"movd %%mm2, 192(%2)		\n\t"
1951
+		"movd %%mm2, 64(%3)		\n\t"
1952
+		"psrlq $32, %%mm2		\n\t"
1953
+		"movd %%mm2, 80(%3)		\n\t"
1954
+		"movd %%mm1, 96(%3)		\n\t"
1955
+		"psrlq $32, %%mm1		\n\t"
1956
+		"movd %%mm1, 112(%3)		\n\t"
1957
+
1958
+		"leal (%%eax, %1, 4), %%eax	\n\t" // eax now addresses source row 5; rows 4-7 follow, stored 4 bytes further right
1959
+		
1960
+		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
1961
+		"movq (%%eax), %%mm1		\n\t" // abcdefgh
1962
+		"movq %%mm0, %%mm2		\n\t" // 12345678
1963
+		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
1964
+		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
1965
+
1966
+		"movq (%%eax, %1), %%mm1	\n\t"
1967
+		"movq (%%eax, %1, 2), %%mm3	\n\t"
1968
+		"movq %%mm1, %%mm4		\n\t"
1969
+		"punpcklbw %%mm3, %%mm1		\n\t"
1970
+		"punpckhbw %%mm3, %%mm4		\n\t"
1971
+
1972
+		"movq %%mm0, %%mm3		\n\t"
1973
+		"punpcklwd %%mm1, %%mm0		\n\t"
1974
+		"punpckhwd %%mm1, %%mm3		\n\t"
1975
+		"movq %%mm2, %%mm1		\n\t"
1976
+		"punpcklwd %%mm4, %%mm2		\n\t"
1977
+		"punpckhwd %%mm4, %%mm1		\n\t"
1978
+
1979
+		"movd %%mm0, 132(%2)		\n\t"
1980
+		"psrlq $32, %%mm0		\n\t"
1981
+		"movd %%mm0, 148(%2)		\n\t"
1982
+		"movd %%mm3, 164(%2)		\n\t"
1983
+		"psrlq $32, %%mm3		\n\t"
1984
+		"movd %%mm3, 180(%2)		\n\t"
1985
+		"movd %%mm3, 52(%3)		\n\t"
1986
+		"movd %%mm2, 196(%2)		\n\t"
1987
+		"movd %%mm2, 68(%3)		\n\t"
1988
+		"psrlq $32, %%mm2		\n\t"
1989
+		"movd %%mm2, 84(%3)		\n\t"
1990
+		"movd %%mm1, 100(%3)		\n\t"
1991
+		"psrlq $32, %%mm1		\n\t"
1992
+		"movd %%mm1, 116(%3)		\n\t"
1993
+
1994
+
1995
+	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
1996
+	: "%eax", "memory" // "memory": the asm stores into dst1[] and dst2[] but declares no output operand
1997
+	);
1998
+}
1999
+
2000
+/**
2001
+ * transposes the given 8x8 block: reads 8 rows of 8 bytes from src at a fixed pitch of 16 bytes and writes the transposed block to dst with a pitch of dstStride bytes
2002
+ */
2003
+static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2004
+{
2005
+	asm(
2006
+		"leal (%0, %1), %%eax				\n\t"
2007
+		"leal (%%eax, %1, 4), %%edx			\n\t"
2008
+//	0	1	2	3	4	5	6	7	8	9
2009
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
2010
+		"movq (%2), %%mm0		\n\t" // 12345678
2011
+		"movq 16(%2), %%mm1		\n\t" // abcdefgh
2012
+		"movq %%mm0, %%mm2		\n\t" // 12345678
2013
+		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
2014
+		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
2015
+
2016
+		"movq 32(%2), %%mm1		\n\t"
2017
+		"movq 48(%2), %%mm3		\n\t"
2018
+		"movq %%mm1, %%mm4		\n\t"
2019
+		"punpcklbw %%mm3, %%mm1		\n\t"
2020
+		"punpckhbw %%mm3, %%mm4		\n\t"
2021
+
2022
+		"movq %%mm0, %%mm3		\n\t"
2023
+		"punpcklwd %%mm1, %%mm0		\n\t"
2024
+		"punpckhwd %%mm1, %%mm3		\n\t"
2025
+		"movq %%mm2, %%mm1		\n\t"
2026
+		"punpcklwd %%mm4, %%mm2		\n\t"
2027
+		"punpckhwd %%mm4, %%mm1		\n\t"
2028
+
2029
+		"movd %%mm0, (%0)		\n\t" // source rows 0-3 fill bytes 0-3 of the eight dst lines
2030
+		"psrlq $32, %%mm0		\n\t"
2031
+		"movd %%mm0, (%%eax)		\n\t"
2032
+		"movd %%mm3, (%%eax, %1)	\n\t"
2033
+		"psrlq $32, %%mm3		\n\t"
2034
+		"movd %%mm3, (%%eax, %1, 2)	\n\t"
2035
+		"movd %%mm2, (%0, %1, 4)	\n\t"
2036
+		"psrlq $32, %%mm2		\n\t"
2037
+		"movd %%mm2, (%%edx)		\n\t"
2038
+		"movd %%mm1, (%%edx, %1)	\n\t"
2039
+		"psrlq $32, %%mm1		\n\t"
2040
+		"movd %%mm1, (%%edx, %1, 2)	\n\t"
2041
+
2042
+
2043
+		"movq 64(%2), %%mm0		\n\t" // 12345678
2044
+		"movq 80(%2), %%mm1		\n\t" // abcdefgh
2045
+		"movq %%mm0, %%mm2		\n\t" // 12345678
2046
+		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
2047
+		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
2048
+
2049
+		"movq 96(%2), %%mm1		\n\t"
2050
+		"movq 112(%2), %%mm3		\n\t"
2051
+		"movq %%mm1, %%mm4		\n\t"
2052
+		"punpcklbw %%mm3, %%mm1		\n\t"
2053
+		"punpckhbw %%mm3, %%mm4		\n\t"
2054
+
2055
+		"movq %%mm0, %%mm3		\n\t"
2056
+		"punpcklwd %%mm1, %%mm0		\n\t"
2057
+		"punpckhwd %%mm1, %%mm3		\n\t"
2058
+		"movq %%mm2, %%mm1		\n\t"
2059
+		"punpcklwd %%mm4, %%mm2		\n\t"
2060
+		"punpckhwd %%mm4, %%mm1		\n\t"
2061
+
2062
+		"movd %%mm0, 4(%0)		\n\t" // source rows 4-7 fill bytes 4-7 of the eight dst lines
2063
+		"psrlq $32, %%mm0		\n\t"
2064
+		"movd %%mm0, 4(%%eax)		\n\t"
2065
+		"movd %%mm3, 4(%%eax, %1)	\n\t"
2066
+		"psrlq $32, %%mm3		\n\t"
2067
+		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
2068
+		"movd %%mm2, 4(%0, %1, 4)	\n\t"
2069
+		"psrlq $32, %%mm2		\n\t"
2070
+		"movd %%mm2, 4(%%edx)		\n\t"
2071
+		"movd %%mm1, 4(%%edx, %1)	\n\t"
2072
+		"psrlq $32, %%mm1		\n\t"
2073
+		"movd %%mm1, 4(%%edx, %1, 2)	\n\t"
2074
+
2075
+	:: "r" (dst), "r" (dstStride), "r" (src)
2076
+	: "%eax", "%edx", "memory" // "memory": the asm stores into dst[] but declares no output operand
2077
+	);
2078
+}
2079
+#endif
2080
+//static int test=0;
2081
+
2082
+static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, // temporal noise reducer for one 8x8 block: blends src with the running average kept in tempBlured
2083
+				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) // tempBluredPast: per block difference measure, maxNoise: 3 thresholds selecting the blend strength
2084
+{
2085
+	// to save a register (FIXME do this outside of the loops): the thresholds are parked at tempBluredPast[127..129] so the asm below reaches them at byte offsets 508 / 512 / 516
2086
+	tempBluredPast[127]= maxNoise[0];
2087
+	tempBluredPast[128]= maxNoise[1];
2088
+	tempBluredPast[129]= maxNoise[2];
2089
+        
2090
+#define FAST_L2_DIFF
2091
+//#define L1_DIFF //u should change the thresholds too if u try that one
2092
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2093
+	asm volatile(
2094
+		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
2095
+		"leal (%2, %2, 4), %%edx			\n\t" // 5*stride
2096
+		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2097
+//	0	1	2	3	4	5	6	7	8	9
2098
+//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+edx	%x+2eax	%x+ecx	%x+8%2
2099
+//FIXME reorder?
2100
+#ifdef L1_DIFF //needs mmx2
2101
+		"movq (%0), %%mm0				\n\t" // L0
2102
+		"psadbw (%1), %%mm0				\n\t" // |L0-R0|
2103
+		"movq (%0, %2), %%mm1				\n\t" // L1
2104
+		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
2105
+		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2106
+		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
2107
+		"movq (%0, %%eax), %%mm3			\n\t" // L3
2108
+		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|
2109
+
2110
+		"movq (%0, %2, 4), %%mm4			\n\t" // L4
2111
+		"paddw %%mm1, %%mm0				\n\t"
2112
+		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
2113
+		"movq (%0, %%edx), %%mm5			\n\t" // L5
2114
+		"paddw %%mm2, %%mm0				\n\t"
2115
+		"psadbw (%1, %%edx), %%mm5			\n\t" // |L5-R5|
2116
+		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
2117
+		"paddw %%mm3, %%mm0				\n\t"
2118
+		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
2119
+		"movq (%0, %%ecx), %%mm7			\n\t" // L7
2120
+		"paddw %%mm4, %%mm0				\n\t"
2121
+		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
2122
+		"paddw %%mm5, %%mm6				\n\t"
2123
+		"paddw %%mm7, %%mm6				\n\t"
2124
+		"paddw %%mm6, %%mm0				\n\t"
2125
+#elif defined (FAST_L2_DIFF)
2126
+		"pcmpeqb %%mm7, %%mm7				\n\t"
2127
+		"movq "MANGLE(b80)", %%mm6			\n\t"
2128
+		"pxor %%mm0, %%mm0				\n\t"
2129
+#define L2_DIFF_CORE(a, b)\
2130
+		"movq " #a ", %%mm5				\n\t"\
2131
+		"movq " #b ", %%mm2				\n\t"\
2132
+		"pxor %%mm7, %%mm2				\n\t"\
2133
+		PAVGB(%%mm2, %%mm5)\
2134
+		"paddb %%mm6, %%mm5				\n\t"\
2135
+		"movq %%mm5, %%mm2				\n\t"\
2136
+		"psllw $8, %%mm5				\n\t"\
2137
+		"pmaddwd %%mm5, %%mm5				\n\t"\
2138
+		"pmaddwd %%mm2, %%mm2				\n\t"\
2139
+		"paddd %%mm2, %%mm5				\n\t"\
2140
+		"psrld $14, %%mm5				\n\t"\
2141
+		"paddd %%mm5, %%mm0				\n\t"
2142
+
2143
+L2_DIFF_CORE((%0), (%1))
2144
+L2_DIFF_CORE((%0, %2), (%1, %2))
2145
+L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2146
+L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2147
+L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2148
+L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2149
+L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2150
+L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2151
+
2152
+#else
2153
+		"pxor %%mm7, %%mm7				\n\t"
2154
+		"pxor %%mm0, %%mm0				\n\t"
2155
+#define L2_DIFF_CORE(a, b)\
2156
+		"movq " #a ", %%mm5				\n\t"\
2157
+		"movq " #b ", %%mm2				\n\t"\
2158
+		"movq %%mm5, %%mm1				\n\t"\
2159
+		"movq %%mm2, %%mm3				\n\t"\
2160
+		"punpcklbw %%mm7, %%mm5				\n\t"\
2161
+		"punpckhbw %%mm7, %%mm1				\n\t"\
2162
+		"punpcklbw %%mm7, %%mm2				\n\t"\
2163
+		"punpckhbw %%mm7, %%mm3				\n\t"\
2164
+		"psubw %%mm2, %%mm5				\n\t"\
2165
+		"psubw %%mm3, %%mm1				\n\t"\
2166
+		"pmaddwd %%mm5, %%mm5				\n\t"\
2167
+		"pmaddwd %%mm1, %%mm1				\n\t"\
2168
+		"paddd %%mm1, %%mm5				\n\t"\
2169
+		"paddd %%mm5, %%mm0				\n\t"
2170
+
2171
+L2_DIFF_CORE((%0), (%1))
2172
+L2_DIFF_CORE((%0, %2), (%1, %2))
2173
+L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2174
+L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2175
+L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2176
+L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2177
+L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2178
+L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2179
+
2180
+#endif
2181
+
2182
+		"movq %%mm0, %%mm4				\n\t"
2183
+		"psrlq $32, %%mm0				\n\t"
2184
+		"paddd %%mm0, %%mm4				\n\t" // add the two 32 bit halves of the accumulator
2185
+		"movd %%mm4, %%ecx				\n\t" // ecx = block difference d
2186
+		"shll $2, %%ecx					\n\t" // 4*d
2187
+		"movl %3, %%edx					\n\t" // edx = tempBluredPast
2188
+		"addl -4(%%edx), %%ecx				\n\t" // + tempBluredPast[-1]
2189
+		"addl 4(%%edx), %%ecx				\n\t" // + tempBluredPast[1]
2190
+		"addl -1024(%%edx), %%ecx			\n\t" // + tempBluredPast[-256]
2191
+		"addl $4, %%ecx					\n\t" // rounding
2192
+		"addl 1024(%%edx), %%ecx			\n\t" // + tempBluredPast[256]
2193
+		"shrl $3, %%ecx					\n\t"
2194
+		"movl %%ecx, (%%edx)				\n\t" // the smoothed value is stored (the C path below stores the unsmoothed sum)
2195
+
2196
+//		"movl %3, %%ecx					\n\t"
2197
+//		"movl %%ecx, test				\n\t"
2198
+//		"jmp 4f \n\t"
2199
+		"cmpl 512(%%edx), %%ecx				\n\t" // tempBluredPast[128], i.e. maxNoise[1]
2200
+		" jb 2f						\n\t"
2201
+		"cmpl 516(%%edx), %%ecx				\n\t" // tempBluredPast[129], i.e. maxNoise[2]
2202
+		" jb 1f						\n\t"
2203
+
2204
+		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2205
+		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2206
+		"movq (%0), %%mm0				\n\t" // L0
2207
+		"movq (%0, %2), %%mm1				\n\t" // L1
2208
+		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2209
+		"movq (%0, %%eax), %%mm3			\n\t" // L3
2210
+		"movq (%0, %2, 4), %%mm4			\n\t" // L4
2211
+		"movq (%0, %%edx), %%mm5			\n\t" // L5
2212
+		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
2213
+		"movq (%0, %%ecx), %%mm7			\n\t" // L7
2214
+		"movq %%mm0, (%1)				\n\t" // L0
2215
+		"movq %%mm1, (%1, %2)				\n\t" // L1
2216
+		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
2217
+		"movq %%mm3, (%1, %%eax)			\n\t" // L3
2218
+		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
2219
+		"movq %%mm5, (%1, %%edx)			\n\t" // L5
2220
+		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
2221
+		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
2222
+		"jmp 4f						\n\t" // difference not below maxNoise[2]: src was only copied into tempBlured
2223
+
2224
+		"1:						\n\t" // difference in [maxNoise[1], maxNoise[2]): average src and tempBlured 1:1
2225
+		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2226
+		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2227
+		"movq (%0), %%mm0				\n\t" // L0
2228
+		PAVGB((%1), %%mm0)				      // L0
2229
+		"movq (%0, %2), %%mm1				\n\t" // L1
2230
+		PAVGB((%1, %2), %%mm1)				      // L1
2231
+		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2232
+		PAVGB((%1, %2, 2), %%mm2)			      // L2
2233
+		"movq (%0, %%eax), %%mm3			\n\t" // L3
2234
+		PAVGB((%1, %%eax), %%mm3)			      // L3
2235
+		"movq (%0, %2, 4), %%mm4			\n\t" // L4
2236
+		PAVGB((%1, %2, 4), %%mm4)			      // L4
2237
+		"movq (%0, %%edx), %%mm5			\n\t" // L5
2238
+		PAVGB((%1, %%edx), %%mm5)			      // L5
2239
+		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
2240
+		PAVGB((%1, %%eax, 2), %%mm6)			      // L6
2241
+		"movq (%0, %%ecx), %%mm7			\n\t" // L7
2242
+		PAVGB((%1, %%ecx), %%mm7)			      // L7
2243
+		"movq %%mm0, (%1)				\n\t" // R0
2244
+		"movq %%mm1, (%1, %2)				\n\t" // R1
2245
+		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
2246
+		"movq %%mm3, (%1, %%eax)			\n\t" // R3
2247
+		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
2248
+		"movq %%mm5, (%1, %%edx)			\n\t" // R5
2249
+		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
2250
+		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
2251
+		"movq %%mm0, (%0)				\n\t" // L0
2252
+		"movq %%mm1, (%0, %2)				\n\t" // L1
2253
+		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
2254
+		"movq %%mm3, (%0, %%eax)			\n\t" // L3
2255
+		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
2256
+		"movq %%mm5, (%0, %%edx)			\n\t" // L5
2257
+		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
2258
+		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
2259
+		"jmp 4f						\n\t"
2260
+
2261
+		"2:						\n\t" // difference below maxNoise[1]
2262
+		"cmpl 508(%%edx), %%ecx				\n\t" // tempBluredPast[127], i.e. maxNoise[0]
2263
+		" jb 3f						\n\t"
2264
+
2265
+		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2266
+		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2267
+		"movq (%0), %%mm0				\n\t" // L0
2268
+		"movq (%0, %2), %%mm1				\n\t" // L1
2269
+		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2270
+		"movq (%0, %%eax), %%mm3			\n\t" // L3
2271
+		"movq (%1), %%mm4				\n\t" // R0
2272
+		"movq (%1, %2), %%mm5				\n\t" // R1
2273
+		"movq (%1, %2, 2), %%mm6			\n\t" // R2
2274
+		"movq (%1, %%eax), %%mm7			\n\t" // R3
2275
+		PAVGB(%%mm4, %%mm0)
2276
+		PAVGB(%%mm5, %%mm1)
2277
+		PAVGB(%%mm6, %%mm2)
2278
+		PAVGB(%%mm7, %%mm3)
2279
+		PAVGB(%%mm4, %%mm0)
2280
+		PAVGB(%%mm5, %%mm1)
2281
+		PAVGB(%%mm6, %%mm2)
2282
+		PAVGB(%%mm7, %%mm3)
2283
+		"movq %%mm0, (%1)				\n\t" // R0
2284
+		"movq %%mm1, (%1, %2)				\n\t" // R1
2285
+		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
2286
+		"movq %%mm3, (%1, %%eax)			\n\t" // R3
2287
+		"movq %%mm0, (%0)				\n\t" // L0
2288
+		"movq %%mm1, (%0, %2)				\n\t" // L1
2289
+		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
2290
+		"movq %%mm3, (%0, %%eax)			\n\t" // L3
2291
+
2292
+		"movq (%0, %2, 4), %%mm0			\n\t" // L4
2293
+		"movq (%0, %%edx), %%mm1			\n\t" // L5
2294
+		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
2295
+		"movq (%0, %%ecx), %%mm3			\n\t" // L7
2296
+		"movq (%1, %2, 4), %%mm4			\n\t" // R4
2297
+		"movq (%1, %%edx), %%mm5			\n\t" // R5
2298
+		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
2299
+		"movq (%1, %%ecx), %%mm7			\n\t" // R7
2300
+		PAVGB(%%mm4, %%mm0)
2301
+		PAVGB(%%mm5, %%mm1)
2302
+		PAVGB(%%mm6, %%mm2)
2303
+		PAVGB(%%mm7, %%mm3)
2304
+		PAVGB(%%mm4, %%mm0)
2305
+		PAVGB(%%mm5, %%mm1)
2306
+		PAVGB(%%mm6, %%mm2)
2307
+		PAVGB(%%mm7, %%mm3)
2308
+		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
2309
+		"movq %%mm1, (%1, %%edx)			\n\t" // R5
2310
+		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
2311
+		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
2312
+		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
2313
+		"movq %%mm1, (%0, %%edx)			\n\t" // L5
2314
+		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
2315
+		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
2316
+		"jmp 4f						\n\t" // two averaging passes against R: roughly (L + 3R)/4
2317
+
2318
+		"3:						\n\t" // difference below maxNoise[0]: three averaging passes against R, roughly (L + 7R)/8
2319
+		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2320
+		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2321
+		"movq (%0), %%mm0				\n\t" // L0
2322
+		"movq (%0, %2), %%mm1				\n\t" // L1
2323
+		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2324
+		"movq (%0, %%eax), %%mm3			\n\t" // L3
2325
+		"movq (%1), %%mm4				\n\t" // R0
2326
+		"movq (%1, %2), %%mm5				\n\t" // R1
2327
+		"movq (%1, %2, 2), %%mm6			\n\t" // R2
2328
+		"movq (%1, %%eax), %%mm7			\n\t" // R3
2329
+		PAVGB(%%mm4, %%mm0)
2330
+		PAVGB(%%mm5, %%mm1)
2331
+		PAVGB(%%mm6, %%mm2)
2332
+		PAVGB(%%mm7, %%mm3)
2333
+		PAVGB(%%mm4, %%mm0)
2334
+		PAVGB(%%mm5, %%mm1)
2335
+		PAVGB(%%mm6, %%mm2)
2336
+		PAVGB(%%mm7, %%mm3)
2337
+		PAVGB(%%mm4, %%mm0)
2338
+		PAVGB(%%mm5, %%mm1)
2339
+		PAVGB(%%mm6, %%mm2)
2340
+		PAVGB(%%mm7, %%mm3)
2341
+		"movq %%mm0, (%1)				\n\t" // R0
2342
+		"movq %%mm1, (%1, %2)				\n\t" // R1
2343
+		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
2344
+		"movq %%mm3, (%1, %%eax)			\n\t" // R3
2345
+		"movq %%mm0, (%0)				\n\t" // L0
2346
+		"movq %%mm1, (%0, %2)				\n\t" // L1
2347
+		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
2348
+		"movq %%mm3, (%0, %%eax)			\n\t" // L3
2349
+
2350
+		"movq (%0, %2, 4), %%mm0			\n\t" // L4
2351
+		"movq (%0, %%edx), %%mm1			\n\t" // L5
2352
+		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
2353
+		"movq (%0, %%ecx), %%mm3			\n\t" // L7
2354
+		"movq (%1, %2, 4), %%mm4			\n\t" // R4
2355
+		"movq (%1, %%edx), %%mm5			\n\t" // R5
2356
+		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
2357
+		"movq (%1, %%ecx), %%mm7			\n\t" // R7
2358
+		PAVGB(%%mm4, %%mm0)
2359
+		PAVGB(%%mm5, %%mm1)
2360
+		PAVGB(%%mm6, %%mm2)
2361
+		PAVGB(%%mm7, %%mm3)
2362
+		PAVGB(%%mm4, %%mm0)
2363
+		PAVGB(%%mm5, %%mm1)
2364
+		PAVGB(%%mm6, %%mm2)
2365
+		PAVGB(%%mm7, %%mm3)
2366
+		PAVGB(%%mm4, %%mm0)
2367
+		PAVGB(%%mm5, %%mm1)
2368
+		PAVGB(%%mm6, %%mm2)
2369
+		PAVGB(%%mm7, %%mm3)
2370
+		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
2371
+		"movq %%mm1, (%1, %%edx)			\n\t" // R5
2372
+		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
2373
+		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
2374
+		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
2375
+		"movq %%mm1, (%0, %%edx)			\n\t" // L5
2376
+		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
2377
+		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
2378
+
2379
+		"4:						\n\t"
2380
+
2381
+		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2382
+		: "%eax", "%edx", "%ecx", "memory"
2383
+		);
2384
+//printf("%d\n", test);
2385
+#else
2386
+{
2387
+	int y;
2388
+	int d=0; // sum of squared differences between tempBlured and src over the 8x8 block
2389
+	int sysd=0; // NOTE(review): accumulated below but never read
2390
+	int i;
2391
+
2392
+	for(y=0; y<8; y++)
2393
+	{
2394
+		int x;
2395
+		for(x=0; x<8; x++)
2396
+		{
2397
+			int ref= tempBlured[ x + y*stride ];
2398
+			int cur= src[ x + y*stride ];
2399
+			int d1=ref - cur;
2400
+//			if(x==0 || x==7) d1+= d1>>1;
2401
+//			if(y==0 || y==7) d1+= d1>>1;
2402
+//			d+= ABS(d1);
2403
+			d+= d1*d1;
2404
+			sysd+= d1;
2405
+		}
2406
+	}
2407
+	i=d; // keep the unsmoothed sum for the store below
2408
+	d= 	(
2409
+		4*d
2410
+		+(*(tempBluredPast-256))
2411
+		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2412
+		+(*(tempBluredPast+256))
2413
+		+4)>>3; // smooth with the entries at -256, -1, +1 and +256
2414
+	*tempBluredPast=i; // stores the unsmoothed sum (the asm path stores the smoothed value instead)
2415
+//	((*tempBluredPast)*3 + d + 2)>>2;
2416
+
2417
+//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2418
+/*
2419
+Switch between
2420
+ 1  0  0  0  0  0  0  (0)
2421
+64 32 16  8  4  2  1  (1)
2422
+64 48 36 27 20 15 11 (33) (approx)
2423
+64 56 49 43 37 33 29 (200) (approx)
2424
+*/
2425
+	if(d > maxNoise[1])
2426
+	{
2427
+		if(d < maxNoise[2]) // medium difference: average src and tempBlured 1:1, written to both
2428
+		{
2429
+			for(y=0; y<8; y++)
2430
+			{
2431
+				int x;
2432
+				for(x=0; x<8; x++)
2433
+				{
2434
+					int ref= tempBlured[ x + y*stride ];
2435
+					int cur= src[ x + y*stride ];
2436
+					tempBlured[ x + y*stride ]=
2437
+					src[ x + y*stride ]=
2438
+						(ref + cur + 1)>>1;
2439
+				}
2440
+			}
2441
+		}
2442
+		else // large difference: src is left untouched and only copied into tempBlured
2443
+		{
2444
+			for(y=0; y<8; y++)
2445
+			{
2446
+				int x;
2447
+				for(x=0; x<8; x++)
2448
+				{
2449
+					tempBlured[ x + y*stride ]= src[ x + y*stride ];
2450
+				}
2451
+			}
2452
+		}
2453
+	}
2454
+	else
2455
+	{
2456
+		if(d < maxNoise[0]) // smallest difference: strongest smoothing, 7 parts tempBlured to 1 part src
2457
+		{
2458
+			for(y=0; y<8; y++)
2459
+			{
2460
+				int x;
2461
+				for(x=0; x<8; x++)
2462
+				{
2463
+					int ref= tempBlured[ x + y*stride ];
2464
+					int cur= src[ x + y*stride ];
2465
+					tempBlured[ x + y*stride ]=
2466
+					src[ x + y*stride ]=
2467
+						(ref*7 + cur + 4)>>3;
2468
+				}
2469
+			}
2470
+		}
2471
+		else // small difference: 3 parts tempBlured to 1 part src
2472
+		{
2473
+			for(y=0; y<8; y++)
2474
+			{
2475
+				int x;
2476
+				for(x=0; x<8; x++)
2477
+				{
2478
+					int ref= tempBlured[ x + y*stride ];
2479
+					int cur= src[ x + y*stride ];
2480
+					tempBlured[ x + y*stride ]=
2481
+					src[ x + y*stride ]=
2482
+						(ref*3 + cur + 2)>>2;
2483
+				}
2484
+			}
2485
+		}
2486
+	}
2487
+}
2488
+#endif
2489
+}
2490
+
2491
+static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, // forward declaration only; the definition is not part of this section
2492
+	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2493
+
2494
+/**
2495
+ * Copies an 8-line block from src to dst and, in the MMX builds, fixes the black level while copying
2496
+ * levelFix == 0 -> don't touch the brightness & contrast (the non-MMX path is a plain memcpy either way)
2497
+ */
2498
+#undef SCALED_CPY
2499
+
2500
+static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2501
+	int levelFix, int64_t *packedOffsetAndScale)
2502
+{
2503
+#ifndef HAVE_MMX
2504
+	int i;
2505
+#endif
2506
+	if(levelFix)
2507
+	{
2508
+#ifdef HAVE_MMX
2509
+					asm volatile(
2510
+						"movq (%%eax), %%mm2	\n\t" // packedYOffset
2511
+						"movq 8(%%eax), %%mm3	\n\t" // packedYScale
2512
+						"leal (%2,%4), %%eax	\n\t" // eax = src + srcStride (eax no longer holds the table pointer)
2513
+						"leal (%3,%5), %%edx	\n\t" // edx = dst + dstStride
2514
+						"pxor %%mm4, %%mm4	\n\t" // mm4 = 0, used as the zero operand when unpacking bytes to words
2515
+#ifdef HAVE_MMX2
2516
+#define SCALED_CPY(src1, src2, dst1, dst2)					\
2517
+						"movq " #src1 ", %%mm0	\n\t"\
2518
+						"movq " #src1 ", %%mm5	\n\t"\
2519
+						"movq " #src2 ", %%mm1	\n\t"\
2520
+						"movq " #src2 ", %%mm6	\n\t"\
2521
+						"punpcklbw %%mm0, %%mm0 \n\t"\
2522
+						"punpckhbw %%mm5, %%mm5 \n\t"\
2523
+						"punpcklbw %%mm1, %%mm1 \n\t"\
2524
+						"punpckhbw %%mm6, %%mm6 \n\t"\
2525
+						"pmulhuw %%mm3, %%mm0	\n\t"\
2526
+						"pmulhuw %%mm3, %%mm5	\n\t"\
2527
+						"pmulhuw %%mm3, %%mm1	\n\t"\
2528
+						"pmulhuw %%mm3, %%mm6	\n\t"\
2529
+						"psubw %%mm2, %%mm0	\n\t"\
2530
+						"psubw %%mm2, %%mm5	\n\t"\
2531
+						"psubw %%mm2, %%mm1	\n\t"\
2532
+						"psubw %%mm2, %%mm6	\n\t"\
2533
+						"packuswb %%mm5, %%mm0	\n\t"\
2534
+						"packuswb %%mm6, %%mm1	\n\t"\
2535
+						"movq %%mm0, " #dst1 "	\n\t"\
2536
+						"movq %%mm1, " #dst2 "	\n\t"\
2537
+
2538
+#else //HAVE_MMX2
2539
+#define SCALED_CPY(src1, src2, dst1, dst2)					\
2540
+						"movq " #src1 ", %%mm0	\n\t"\
2541
+						"movq " #src1 ", %%mm5	\n\t"\
2542
+						"punpcklbw %%mm4, %%mm0 \n\t"\
2543
+						"punpckhbw %%mm4, %%mm5 \n\t"\
2544
+						"psubw %%mm2, %%mm0	\n\t"\
2545
+						"psubw %%mm2, %%mm5	\n\t"\
2546
+						"movq " #src2 ", %%mm1	\n\t"\
2547
+						"psllw $6, %%mm0	\n\t"\
2548
+						"psllw $6, %%mm5	\n\t"\
2549
+						"pmulhw %%mm3, %%mm0	\n\t"\
2550
+						"movq " #src2 ", %%mm6	\n\t"\
2551
+						"pmulhw %%mm3, %%mm5	\n\t"\
2552
+						"punpcklbw %%mm4, %%mm1 \n\t"\
2553
+						"punpckhbw %%mm4, %%mm6 \n\t"\
2554
+						"psubw %%mm2, %%mm1	\n\t"\
2555
+						"psubw %%mm2, %%mm6	\n\t"\
2556
+						"psllw $6, %%mm1	\n\t"\
2557
+						"psllw $6, %%mm6	\n\t"\
2558
+						"pmulhw %%mm3, %%mm1	\n\t"\
2559
+						"pmulhw %%mm3, %%mm6	\n\t"\
2560
+						"packuswb %%mm5, %%mm0	\n\t"\
2561
+						"packuswb %%mm6, %%mm1	\n\t"\
2562
+						"movq %%mm0, " #dst1 "	\n\t"\
2563
+						"movq %%mm1, " #dst2 "	\n\t"\
2564
+
2565
+#endif //!HAVE_MMX2
2566
+
2567
+SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5)) // lines 0 and 1
2568
+SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) // lines 2 and 3
2569
+SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) // lines 4 and 5
2570
+						"leal (%%eax,%4,4), %%eax	\n\t" // eax = src + 5*srcStride
2571
+						"leal (%%edx,%5,4), %%edx	\n\t" // edx = dst + 5*dstStride
2572
+SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) // lines 6 and 7
2573
+
2574
+
2575
+						: "=&a" (packedOffsetAndScale) // eax is overwritten by the asm, so it is declared as an output
2576
+						: "0" (packedOffsetAndScale), // passed in eax; offset is read at +0, scale at +8
2577
+						"r"(src),
2578
+						"r"(dst),
2579
+						"r" (srcStride),
2580
+						"r" (dstStride)
2581
+						: "%edx"
2582
+					);
2583
+#else
2584
+				for(i=0; i<8; i++) // C fallback: plain copy, packedOffsetAndScale is not applied here
2585
+					memcpy(	&(dst[dstStride*i]),
2586
+						&(src[srcStride*i]), BLOCK_SIZE);
2587
+#endif
2588
+	}
2589
+	else
2590
+	{
2591
+#ifdef HAVE_MMX
2592
+					asm volatile(
2593
+						"leal (%0,%2), %%eax	\n\t" // eax = src + srcStride
2594
+						"leal (%1,%3), %%edx	\n\t" // edx = dst + dstStride
2595
+
2596
+#define SIMPLE_CPY(src1, src2, dst1, dst2)				\
2597
+						"movq " #src1 ", %%mm0	\n\t"\
2598
+						"movq " #src2 ", %%mm1	\n\t"\
2599
+						"movq %%mm0, " #dst1 "	\n\t"\
2600
+						"movq %%mm1, " #dst2 "	\n\t"\
2601
+
2602
+SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3)) // lines 0 and 1
2603
+SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) // lines 2 and 3
2604
+SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) // lines 4 and 5
2605
+						"leal (%%eax,%2,4), %%eax	\n\t" // eax = src + 5*srcStride
2606
+						"leal (%%edx,%3,4), %%edx	\n\t" // edx = dst + 5*dstStride
2607
+SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) // lines 6 and 7
2608
+
2609
+						: : "r" (src),
2610
+						"r" (dst),
2611
+						"r" (srcStride),
2612
+						"r" (dstStride)
2613
+						: "%eax", "%edx"
2614
+					);
2615
+#else
2616
+				for(i=0; i<8; i++) // copy 8 lines of BLOCK_SIZE bytes each
2617
+					memcpy(	&(dst[dstStride*i]),
2618
+						&(src[srcStride*i]), BLOCK_SIZE);
2619
+#endif
2620
+	}
2621
+}
2622
+
2623
+/**
2624
+ * Duplicates the given 8 src pixels 3 times upward (into the rows at src-stride, src-2*stride and src-3*stride)
2625
+ */
2626
+static inline void RENAME(duplicate)(uint8_t src[], int stride)
2627
+{
2628
+#ifdef HAVE_MMX
2629
+	asm volatile(
2630
+		"movq (%0), %%mm0		\n\t" // load the 8 source pixels
2631
+		"addl %1, %0			\n\t" // %1 is -stride, so this steps one row up
2632
+		"movq %%mm0, (%0)		\n\t" // row src - stride
2633
+		"movq %%mm0, (%0, %1)		\n\t" // row src - 2*stride
2634
+		"movq %%mm0, (%0, %1, 2)	\n\t" // row src - 3*stride
2635
+		: "+r" (src)
2636
+		: "r" (-stride)
2637
+	);
2638
+#else
2639
+	int i;
2640
+	uint8_t *p=src;
2641
+	for(i=0; i<3; i++) // writes the same 3 rows as the MMX path
2642
+	{
2643
+		p-= stride;
2644
+		memcpy(p, src, 8);
2645
+	}
2646
+#endif
2647
+}
2648
+
2649
+/**
2650
+ * Filters an array of bytes (one plane of Y or U or V values), working in 8x8 blocks from src into dst
2651
+ */
2652
+static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2653
+	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
2654
+{
2655
+	PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
2656
+	int x,y;
2657
+#ifdef COMPILE_TIME_MODE
2658
+	const int mode= COMPILE_TIME_MODE;
2659
+#else
2660
+	const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
2661
+#endif
2662
+	int black=0, white=255; // blackest black and whitest white in the picture
2663
+	int QPCorrecture= 256*256;
2664
+
2665
+	int copyAhead;
2666
+#ifdef HAVE_MMX
2667
+	int i;
2668
+#endif
2669
+
2670
+	const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; // shift that maps a pixel x to a QP table column
2671
+	const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; // shift that maps a pixel y to a QP table row
2672
+
2673
+	//FIXME remove
2674
+	uint64_t * const yHistogram= c.yHistogram;
2675
+	uint8_t * const tempSrc= c.tempSrc;
2676
+	uint8_t * const tempDst= c.tempDst;
2677
+	const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
2678
+
2679
+#ifdef HAVE_MMX
2680
+	for(i=0; i<32; i++){ // fill the 32-entry offset/threshold tables, each byte value replicated into all 8 bytes
2681
+		int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
2682
+		int threshold= offset*2 + 1;
2683
+		c.mmxDcOffset[i]= 0x7F - offset;
2684
+		c.mmxDcThreshold[i]= 0x7F - threshold;
2685
+		c.mmxDcOffset[i]*= 0x0101010101010101LL;
2686
+		c.mmxDcThreshold[i]*= 0x0101010101010101LL;
2687
+	}
2688
+#endif
2689
+
2690
+	if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; // line count depends on which filters are enabled
2691
+	else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
2692
+		|| (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
2693
+	else if(   (mode & V_DEBLOCK)
2694
+		|| (mode & LINEAR_IPOL_DEINT_FILTER)
2695
+		|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
2696
+	else if(mode & V_X1_FILTER) copyAhead=11;
2697
+//	else if(mode & V_RK1_FILTER) copyAhead=10;
2698
+	else if(mode & DERING) copyAhead=9;
2699
+	else copyAhead=8;
2700
+
2701
+	copyAhead-= 8; // from here on copyAhead is a line offset in the range 0..8
2702
+
2703
+	if(!isColor)
2704
+	{
2705
+		uint64_t sum= 0;
2706
+		int i;
2707
+		uint64_t maxClipped;
2708
+		uint64_t clipped;
2709
+		double scale;
2710
+
2711
+		c.frameNum++;
2712
+		// first frame is fscked so we ignore it
2713
+		if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
2714
+
2715
+		for(i=0; i<256; i++)
2716
+		{
2717
+			sum+= yHistogram[i];
2718
+//			printf("%d ", yHistogram[i]);
2719
+		}
2720
+//		printf("\n\n");
2721
+
2722
+		/* we always get a completely black picture first */
2723
+		maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
2724
+
2725
+		clipped= sum;
2726
+		for(black=255; black>0; black--)
2727
+		{
2728
+			if(clipped < maxClipped) break;
2729
+			clipped-= yHistogram[black];
2730
+		}
2731
+
2732
+		clipped= sum;
2733
+		for(white=0; white<256; white++)
2734
+		{
2735
+			if(clipped < maxClipped) break;
2736
+			clipped-= yHistogram[white];
2737
+		}
2738
+
2739
+		scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); // NOTE(review): divides by zero if white == black - confirm the loops above cannot produce that
2740
+
2741
+#ifdef HAVE_MMX2
2742
+		c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
2743
+		c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
2744
+#else
2745
+		c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
2746
+		c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
2747
+#endif
2748
+
2749
+		c.packedYOffset|= c.packedYOffset<<32; // replicate the 16 bit value into all four 16 bit words
2750
+		c.packedYOffset|= c.packedYOffset<<16;
2751
+
2752
+		c.packedYScale|= c.packedYScale<<32; // same replication for the scale
2753
+		c.packedYScale|= c.packedYScale<<16;
2754
+		
2755
+		if(mode & LEVEL_FIX)	QPCorrecture= (int)(scale*256*256 + 0.5);
2756
+		else			QPCorrecture= 256*256;
2757
+	}
2758
+	else
2759
+	{
2760
+		c.packedYScale= 0x0100010001000100LL;
2761
+		c.packedYOffset= 0;
2762
+		QPCorrecture= 256*256;
2763
+	}
2764
+
2765
+	/* copy & deinterlace first row of blocks */
2766
+	y=-BLOCK_SIZE;
2767
+	{
2768
+		uint8_t *srcBlock= &(src[y*srcStride]);
2769
+		uint8_t *dstBlock= tempDst + dstStride;
2770
+
2771
+		// From this point on it is guaranteed that we can read and write 16 lines downward
2772

                
2773
+		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
2774
+		for(x=0; x<width; x+=BLOCK_SIZE)
2775
+		{
2776
+
2777
+#ifdef HAVE_MMX2
2778
+/*
2779
+			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2780
+			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2781
+			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2782
+			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2783
+*/
2784
+
2785
+			asm(
2786
+				"movl %4, %%eax			\n\t"
2787
+				"shrl $2, %%eax			\n\t"
2788
+				"andl $6, %%eax			\n\t"
2789
+				"addl %5, %%eax			\n\t"
2790
+				"movl %%eax, %%edx		\n\t"
2791
+				"imul %1, %%eax			\n\t"
2792
+				"imul %3, %%edx			\n\t"
2793
+				"prefetchnta 32(%%eax, %0)	\n\t"
2794
+				"prefetcht0 32(%%edx, %2)	\n\t"
2795
+				"addl %1, %%eax			\n\t"
2796
+				"addl %3, %%edx			\n\t"
2797
+				"prefetchnta 32(%%eax, %0)	\n\t"
2798
+				"prefetcht0 32(%%edx, %2)	\n\t"
2799
+			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2800
+			"m" (x), "m" (copyAhead)
2801
+			: "%eax", "%edx"
2802
+			);
2803
+
2804
+#elif defined(HAVE_3DNOW)
2805
+//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2806
+/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2807
+			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2808
+			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2809
+			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2810
+*/
2811
+#endif
2812
+
2813
+			RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
2814
+				srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2815
+
2816
+			RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
2817
+
2818
+			if(mode & LINEAR_IPOL_DEINT_FILTER)
2819
+				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2820
+			else if(mode & LINEAR_BLEND_DEINT_FILTER)
2821
+				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2822
+			else if(mode & MEDIAN_DEINT_FILTER)
2823
+				RENAME(deInterlaceMedian)(dstBlock, dstStride);
2824
+			else if(mode & CUBIC_IPOL_DEINT_FILTER)
2825
+				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2826
+			else if(mode & FFMPEG_DEINT_FILTER)
2827
+				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2828
+/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
2829
+				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2830
+*/
2831
+			dstBlock+=8;
2832
+			srcBlock+=8;
2833
+		}
2834
+		if(width==dstStride)
2835
+			memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride);
2836
+		else
2837
+		{
2838
+			int i;
2839
+			for(i=0; i<copyAhead; i++)
2840
+			{
2841
+				memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
2842
+			}
2843
+		}
2844
+	}
2845
+
2846
+//printf("\n");
2847
+	for(y=0; y<height; y+=BLOCK_SIZE)
2848
+	{
2849
+		//1% speedup if these are here instead of the inner loop
2850
+		uint8_t *srcBlock= &(src[y*srcStride]);
2851
+		uint8_t *dstBlock= &(dst[y*dstStride]);
2852
+#ifdef HAVE_MMX
2853
+		uint8_t *tempBlock1= c.tempBlocks;
2854
+		uint8_t *tempBlock2= c.tempBlocks + 8;
2855
+#endif
2856
+		int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
2857
+		int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*mbWidth];
2858
+		int QP=0;
2859
+		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2860
+		   if not then use a temporary buffer */
2861
+		if(y+15 >= height)
2862
+		{
2863
+			int i;
2864
+			/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2865
+			   blockcopy to dst later */
2866
+			memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
2867
+				srcStride*MAX(height-y-copyAhead, 0) );
2868
+
2869
+			/* duplicate last line of src to fill the void up to line (copyAhead+7) */
2870
+			for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2871
+				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
2872
+
2873
+			/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
2874
+			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
2875
+
2876
+			/* duplicate last line of dst to fill the void up to line (copyAhead) */
2877
+			for(i=height-y+1; i<=copyAhead; i++)
2878
+				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
2879
+
2880
+			dstBlock= tempDst + dstStride;
2881
+			srcBlock= tempSrc;
2882
+		}
2883
+//printf("\n");
2884
+
2885
+		// From this point on it is guaranteed that we can read and write 16 lines downward
2886

                
2887
+		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
2888
+		for(x=0; x<width; x+=BLOCK_SIZE)
2889
+		{
2890
+			const int stride= dstStride;
2891
+#ifdef HAVE_MMX
2892
+			uint8_t *tmpXchg;
2893
+#endif
2894
+			if(isColor)
2895
+			{
2896
+				QP= QPptr[x>>qpHShift];
2897
+				c.nonBQP= nonBQPptr[x>>qpHShift];
2898
+			}
2899
+			else
2900
+			{
2901
+				QP= QPptr[x>>4];
2902
+				QP= (QP* QPCorrecture + 256*128)>>16; // rescale QP by QPCorrecture/65536, with rounding
2903
+				c.nonBQP= nonBQPptr[x>>4];
2904
+				c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
2905
+				yHistogram[ srcBlock[srcStride*12 + 4] ]++; // one sample per block; the histogram is summed at the top of this function for luma
2906
+			}
2907
+			c.QP= QP;
2908
+#ifdef HAVE_MMX
2909
+			asm volatile(
2910
+				"movd %1, %%mm7					\n\t"
2911
+				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2912
+				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2913
+				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
2914
+				"movq %%mm7, %0			\n\t"
2915
+				: "=m" (c.pQPb) 
2916
+				: "r" (QP)
2917
+			);
2918
+#endif
2919
+
2920
+
2921
+#ifdef HAVE_MMX2
2922
+/*
2923
+			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2924
+			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2925
+			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2926
+			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2927
+*/
2928
+
2929
+			asm(
2930
+				"movl %4, %%eax			\n\t"
2931
+				"shrl $2, %%eax			\n\t"
2932
+				"andl $6, %%eax			\n\t"
2933
+				"addl %5, %%eax			\n\t"
2934
+				"movl %%eax, %%edx		\n\t"
2935
+				"imul %1, %%eax			\n\t"
2936
+				"imul %3, %%edx			\n\t"
2937
+				"prefetchnta 32(%%eax, %0)	\n\t"
2938
+				"prefetcht0 32(%%edx, %2)	\n\t"
2939
+				"addl %1, %%eax			\n\t"
2940
+				"addl %3, %%edx			\n\t"
2941
+				"prefetchnta 32(%%eax, %0)	\n\t"
2942
+				"prefetcht0 32(%%edx, %2)	\n\t"
2943
+			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2944
+			"m" (x), "m" (copyAhead)
2945
+			: "%eax", "%edx"
2946
+			);
2947
+
2948
+#elif defined(HAVE_3DNOW)
2949
+//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2950
+/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2951
+			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2952
+			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2953
+			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2954
+*/
2955
+#endif
2956
+
2957
+			RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
2958
+				srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2959
+
2960
+			if(mode & LINEAR_IPOL_DEINT_FILTER)
2961
+				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2962
+			else if(mode & LINEAR_BLEND_DEINT_FILTER)
2963
+				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2964
+			else if(mode & MEDIAN_DEINT_FILTER)
2965
+				RENAME(deInterlaceMedian)(dstBlock, dstStride);
2966
+			else if(mode & CUBIC_IPOL_DEINT_FILTER)
2967
+				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2968
+			else if(mode & FFMPEG_DEINT_FILTER)
2969
+				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2970
+/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
2971
+				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2972
+*/
2973
+
2974
+			/* only deblock if we have 2 blocks */
2975
+			if(y + 8 < height)
2976
+			{
2977
+				if(mode & V_X1_FILTER)
2978
+					RENAME(vertX1Filter)(dstBlock, stride, &c);
2979
+				else if(mode & V_DEBLOCK)
2980
+				{
2981
+					if( RENAME(isVertDC)(dstBlock, stride, &c))
2982
+					{
2983
+						if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
2984
+							RENAME(doVertLowPass)(dstBlock, stride, &c);
2985
+					}
2986
+					else
2987
+						RENAME(doVertDefFilter)(dstBlock, stride, &c);
2988
+				}
2989
+			}
2990
+
2991
+#ifdef HAVE_MMX
2992
+			RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
2993
+#endif
2994
+			/* check if we have a previous block to deblock it with dstBlock */
2995
+			if(x - 8 >= 0)
2996
+			{
2997
+#ifdef HAVE_MMX
2998
+				if(mode & H_X1_FILTER) // MMX build: horizontal filtering reuses the vertical filters on the transposed temp block
2999
+					RENAME(vertX1Filter)(tempBlock1, 16, &c);
3000
+				else if(mode & H_DEBLOCK)
3001
+				{
3002
+					if( RENAME(isVertDC)(tempBlock1, 16, &c))
3003
+					{
3004
+						if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
3005
+							RENAME(doVertLowPass)(tempBlock1, 16, &c);
3006
+					}
3007
+					else
3008
+						RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3009
+				}
3010
+
3011
+				RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3012
+
3013
+#else
3014
+				if(mode & H_X1_FILTER)
3015
+					horizX1Filter(dstBlock-4, stride, QP);
3016
+				else if(mode & H_DEBLOCK)
3017
+				{
3018
+					if( isHorizDC(dstBlock-4, stride, &c))
3019
+					{
3020
+						if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3021
+							doHorizLowPass(dstBlock-4, stride, QP);
3022
+					}
3023
+					else
3024
+						doHorizDefFilter(dstBlock-4, stride, QP);
3025
+				}
3026
+#endif
3027
+				if(mode & DERING)
3028
+				{
3029
+				//FIXME filter first line
3030
+					if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3031
+				}
3032
+
3033
+				if(mode & TEMP_NOISE_FILTER)
3034
+				{
3035
+					RENAME(tempNoiseReducer)(dstBlock-8, stride,
3036
+						c.tempBlured[isColor] + y*dstStride + x,
3037
+						c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3038
+						c.ppMode.maxTmpNoise);
3039
+				}
3040
+			}
3041
+
3042
+			dstBlock+=8;
3043
+			srcBlock+=8;
3044
+
3045
+#ifdef HAVE_MMX
3046
+			tmpXchg= tempBlock1; // swap the two temp block pointers for the next column
3047
+			tempBlock1= tempBlock2;
3048
+			tempBlock2 = tmpXchg;
3049
+#endif
3050
+		}
3051
+
3052
+		if(mode & DERING) // after the loop dstBlock is one block past the row end, so dstBlock-8 is the last block of the row
3053
+		{
3054
+				if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3055
+		}
3056
+
3057
+		if((mode & TEMP_NOISE_FILTER))
3058
+		{
3059
+			RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3060
+				c.tempBlured[isColor] + y*dstStride + x,
3061
+				c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3062
+				c.ppMode.maxTmpNoise);
3063
+		}
3064
+
3065
+		/* did we use a tmp buffer for the last lines*/
3066
+		if(y+15 >= height)
3067
+		{
3068
+			uint8_t *dstBlock= &(dst[y*dstStride]);
3069
+			if(width==dstStride)
3070
+				memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y));
3071
+			else
3072
+			{
3073
+				int i;
3074
+				for(i=0; i<height-y; i++)
3075
+				{
3076
+					memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3077
+				}
3078
+			}
3079
+		}
3080
+/*
3081
+		for(x=0; x<width; x+=32)
3082
+		{
3083
+			volatile int i;
3084
+			i+=	+ dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3085
+				+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3086
+				+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3087
+//				+ dstBlock[x +13*dstStride]
3088
+//				+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3089
+		}*/
3090
+	}
3091
+#ifdef HAVE_3DNOW
3092
+	asm volatile("femms");
3093
+#elif defined (HAVE_MMX)
3094
+	asm volatile("emms");
3095
+#endif
3096
+
3097
+#ifdef DEBUG_BRIGHTNESS
3098
+	if(!isColor)
3099
+	{
3100
+		int max=1;
3101
+		int i;
3102
+		for(i=0; i<256; i++)
3103
+			if(yHistogram[i] > max) max=yHistogram[i];
3104
+
3105
+		for(i=1; i<256; i++)
3106
+		{
3107
+			int x;
3108
+			int start=yHistogram[i-1]/(max/256+1);
3109
+			int end=yHistogram[i]/(max/256+1);
3110
+			int inc= end > start ? 1 : -1;
3111
+			for(x=start; x!=end+inc; x+=inc)
3112
+				dst[ i*dstStride + x]+=128;
3113
+		}
3114
+
3115
+		for(i=0; i<100; i+=2)
3116
+		{
3117
+			dst[ (white)*dstStride + i]+=128;
3118
+			dst[ (black)*dstStride + i]+=128;
3119
+		}
3120
+
3121
+	}
3122
+#endif
3123
+
3124
+	*c2= c; //copy local context back
3125
+
3126
+}
... ...
@@ -2,16 +2,9 @@
2 2
 include ../config.mak
3 3
 
4 4
 SWSLIB = libswscale.a
5
-ifeq ($(SHARED_PP),yes)
6
-SPPLIB = libpostproc.so
7
-SPPVERSION = 0.0.1
8
-endif
9
-PPLIB = libpostproc.a
10 5
 
11 6
 SWSSRCS=swscale.c rgb2rgb.c yuv2rgb.c
12 7
 SWSOBJS=$(SWSSRCS:.c=.o)
13
-PPOBJS=postprocess.o
14
-SPPOBJS=postprocess_pic.o
15 8
 CS_TEST_OBJS=cs_test.o rgb2rgb.o ../cpudetect.o ../mp_msg.o ../libvo/aclib.o
16 9
 
17 10
 CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
... ...
@@ -24,7 +17,7 @@ CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
24 24
 .c.o:
25 25
 	$(CC) -c $(CFLAGS) -I.. -o $@ $<
26 26
 
27
-all:    $(SWSLIB) $(PPLIB) $(SPPLIB)
27
+all:    $(SWSLIB)
28 28
 
29 29
 $(SWSLIB):     $(SWSOBJS)
30 30
 	$(AR) r $(SWSLIB) $(SWSOBJS)
... ...
@@ -43,29 +36,6 @@ depend:
43 43
 cs_test: $(CS_TEST_OBJS)
44 44
 	$(CC) $(CS_TEST_OBJS) -o cs_test
45 45
 
46
-ifeq ($(SHARED_PP),yes)
47
-postprocess_pic.o: postprocess.c
48
-	$(CC) -c $(CFLAGS) -fomit-frame-pointer -fPIC -DPIC -I.. -o $@ $<
49
-
50
-$(SPPLIB): $(SPPOBJS)
51
-	$(CC) -shared -Wl,-soname,$(SPPLIB).0 \
52
-	-o $(SPPLIB) $(SPPOBJS)
53
-endif
54
-
55
-$(PPLIB): $(PPOBJS)
56
-	$(AR) r $(PPLIB) $(PPOBJS)
57
-
58
-install: all
59
-ifeq ($(SHARED_PP),yes)
60
-	install -d $(prefix)/lib
61
-	install -s -m 755 $(SPPLIB) $(prefix)/lib/$(SPPLIB).$(SPPVERSION)
62
-	ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPLIB)
63
-	ldconfig || true
64
-	mkdir -p $(prefix)/include/postproc
65
-	install -m 644 postprocess.h $(prefix)/include/postproc/postprocess.h
66
-endif
67
-
68
-
69 46
 #
70 47
 # include dependency files if they exist
71 48
 #
72 49
deleted file mode 100644
... ...
@@ -1,874 +0,0 @@
1
-/*
2
-    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3
-
4
-    This program is free software; you can redistribute it and/or modify
5
-    it under the terms of the GNU General Public License as published by
6
-    the Free Software Foundation; either version 2 of the License, or
7
-    (at your option) any later version.
8
-
9
-    This program is distributed in the hope that it will be useful,
10
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
-    GNU General Public License for more details.
13
-
14
-    You should have received a copy of the GNU General Public License
15
-    along with this program; if not, write to the Free Software
16
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
-*/
18
-
19
-/*
20
-			C	MMX	MMX2	3DNow
21
-isVertDC		Ec	Ec
22
-isVertMinMaxOk		Ec	Ec
23
-doVertLowPass		E		e	e
24
-doVertDefFilter		Ec	Ec	e	e
25
-isHorizDC		Ec	Ec
26
-isHorizMinMaxOk		a	E
27
-doHorizLowPass		E		e	e
28
-doHorizDefFilter	Ec	Ec	e	e
29
-deRing			E		e	e*
30
-Vertical RKAlgo1	E		a	a
31
-Horizontal RKAlgo1			a	a
32
-Vertical X1#		a		E	E
33
-Horizontal X1#		a		E	E
34
-LinIpolDeinterlace	e		E	E*
35
-CubicIpolDeinterlace	a		e	e*
36
-LinBlendDeinterlace	e		E	E*
37
-MedianDeinterlace#	E	Ec	Ec
38
-TempDeNoiser#		E		e	e
39
-
40
-* i dont have a 3dnow CPU -> its untested, but noone said it doesnt work so it seems to work
41
-# more or less selfinvented filters so the exactness isnt too meaningfull
42
-E = Exact implementation
43
-e = allmost exact implementation (slightly different rounding,...)
44
-a = alternative / approximate impl
45
-c = checked against the other implementations (-vo md5)
46
-*/
47
-
48
-/*
49
-TODO:
50
-reduce the time wasted on the mem transfer
51
-unroll stuff if instructions depend too much on the prior one
52
-move YScale thing to the end instead of fixing QP
53
-write a faster and higher quality deblocking filter :)
54
-make the mainloop more flexible (variable number of blocks at once
55
-	(the if/else stuff per block is slowing things down)
56
-compare the quality & speed of all filters
57
-split this huge file
58
-optimize c versions
59
-try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
60
-...
61
-*/
62
-
63
-//Changelog: use the CVS log
64
-
65
-#include "config.h"
66
-#include <inttypes.h>
67
-#include <stdio.h>
68
-#include <stdlib.h>
69
-#include <string.h>
70
-#ifdef HAVE_MALLOC_H
71
-#include <malloc.h>
72
-#endif
73
-//#undef HAVE_MMX2
74
-//#define HAVE_3DNOW
75
-//#undef HAVE_MMX
76
-//#undef ARCH_X86
77
-//#define DEBUG_BRIGHTNESS
78
-#ifndef PIC
79
-#include "../libvo/fastmemcpy.h"
80
-#endif
81
-#include "postprocess.h"
82
-#include "postprocess_internal.h"
83
-#include "mangle.h"
84
-
85
-#define MIN(a,b) ((a) > (b) ? (b) : (a))
86
-#define MAX(a,b) ((a) < (b) ? (b) : (a))
87
-#define ABS(a) ((a) > 0 ? (a) : (-(a)))
88
-#define SIGN(a) ((a) > 0 ? 1 : -1)
89
-
90
-#define GET_MODE_BUFFER_SIZE 500
91
-#define OPTIONS_ARRAY_SIZE 10
92
-#define BLOCK_SIZE 8
93
-#define TEMP_STRIDE 8
94
-//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
95
-
96
-#ifdef ARCH_X86
97
-static uint64_t __attribute__((aligned(8))) w05=		0x0005000500050005LL;
98
-static uint64_t __attribute__((aligned(8))) w20=		0x0020002000200020LL;
99
-static uint64_t __attribute__((aligned(8))) b00= 		0x0000000000000000LL;
100
-static uint64_t __attribute__((aligned(8))) b01= 		0x0101010101010101LL;
101
-static uint64_t __attribute__((aligned(8))) b02= 		0x0202020202020202LL;
102
-static uint64_t __attribute__((aligned(8))) b08= 		0x0808080808080808LL;
103
-static uint64_t __attribute__((aligned(8))) b80= 		0x8080808080808080LL;
104
-#endif
105
-
106
-static int verbose= 0;
107
-
108
-static const int deringThreshold= 20;
109
-
110
-
111
-static struct PPFilter filters[]=
112
-{
113
-	{"hb", "hdeblock", 		1, 1, 3, H_DEBLOCK},
114
-	{"vb", "vdeblock", 		1, 2, 4, V_DEBLOCK},
115
-/*	{"hr", "rkhdeblock", 		1, 1, 3, H_RK1_FILTER},
116
-	{"vr", "rkvdeblock", 		1, 2, 4, V_RK1_FILTER},*/
117
-	{"h1", "x1hdeblock", 		1, 1, 3, H_X1_FILTER},
118
-	{"v1", "x1vdeblock", 		1, 2, 4, V_X1_FILTER},
119
-	{"dr", "dering", 		1, 5, 6, DERING},
120
-	{"al", "autolevels", 		0, 1, 2, LEVEL_FIX},
121
-	{"lb", "linblenddeint", 	1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
122
-	{"li", "linipoldeint", 		1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
123
-	{"ci", "cubicipoldeint",	1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
124
-	{"md", "mediandeint", 		1, 1, 4, MEDIAN_DEINT_FILTER},
125
-	{"fd", "ffmpegdeint", 		1, 1, 4, FFMPEG_DEINT_FILTER},
126
-	{"tn", "tmpnoise", 		1, 7, 8, TEMP_NOISE_FILTER},
127
-	{"fq", "forcequant", 		1, 0, 0, FORCE_QUANT},
128
-	{NULL, NULL,0,0,0,0} //End Marker
129
-};
130
-
131
-static char *replaceTable[]=
132
-{
133
-	"default", 	"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
134
-	"de", 		"hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
135
-	"fast", 	"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
136
-	"fa", 		"x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400",
137
-	NULL //End Marker
138
-};
139
-
140
-#ifdef ARCH_X86
141
-static inline void unusedVariableWarningFixer()
142
-{
143
-	if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0;
144
-}
145
-#endif
146
-
147
-
148
-#ifdef ARCH_X86
149
-static inline void prefetchnta(void *p)
150
-{
151
-	asm volatile(	"prefetchnta (%0)\n\t"
152
-		: : "r" (p)
153
-	);
154
-}
155
-
156
-static inline void prefetcht0(void *p)
157
-{
158
-	asm volatile(	"prefetcht0 (%0)\n\t"
159
-		: : "r" (p)
160
-	);
161
-}
162
-
163
-static inline void prefetcht1(void *p)
164
-{
165
-	asm volatile(	"prefetcht1 (%0)\n\t"
166
-		: : "r" (p)
167
-	);
168
-}
169
-
170
-static inline void prefetcht2(void *p)
171
-{
172
-	asm volatile(	"prefetcht2 (%0)\n\t"
173
-		: : "r" (p)
174
-	);
175
-}
176
-#endif
177
-
178
-// The horizontal Functions exist only in C cuz the MMX code is faster with vertical filters and transposing
179
-
180
-/**
181
- * Check if the given 8x8 Block is mostly "flat"
182
- */
183
-static inline int isHorizDC(uint8_t src[], int stride, PPContext *c)
184
-{
185
-	int numEq= 0;
186
-	int y;
187
-	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
188
-	const int dcThreshold= dcOffset*2 + 1;
189
-	for(y=0; y<BLOCK_SIZE; y++)
190
-	{
191
-		if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
192
-		if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
193
-		if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
194
-		if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
195
-		if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
196
-		if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
197
-		if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
198
-		src+= stride;
199
-	}
200
-	return numEq > c->ppMode.flatnessThreshold;
201
-}
202
-
203
-/**
204
- * Check if the middle 8x8 Block in the given 8x16 block is flat
205
- */
206
-static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){
207
-	int numEq= 0;
208
-	int y;
209
-	const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1;
210
-	const int dcThreshold= dcOffset*2 + 1;
211
-	src+= stride*4; // src points to begin of the 8x8 Block
212
-	for(y=0; y<BLOCK_SIZE-1; y++)
213
-	{
214
-		if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
215
-		if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
216
-		if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
217
-		if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
218
-		if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
219
-		if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
220
-		if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
221
-		if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
222
-		src+= stride;
223
-	}
224
-	return numEq > c->ppMode.flatnessThreshold;
225
-}
226
-
227
-static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
228
-{
229
-	if(abs(src[0] - src[7]) > 2*QP) return 0;
230
-
231
-	return 1;
232
-}
233
-
234
-static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
235
-{
236
-	int y;
237
-	for(y=0; y<BLOCK_SIZE; y++)
238
-	{
239
-		const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
240
-
241
-		if(ABS(middleEnergy) < 8*QP)
242
-		{
243
-			const int q=(dst[3] - dst[4])/2;
244
-			const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
245
-			const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
246
-
247
-			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
248
-			d= MAX(d, 0);
249
-
250
-			d= (5*d + 32) >> 6;
251
-			d*= SIGN(-middleEnergy);
252
-
253
-			if(q>0)
254
-			{
255
-				d= d<0 ? 0 : d;
256
-				d= d>q ? q : d;
257
-			}
258
-			else
259
-			{
260
-				d= d>0 ? 0 : d;
261
-				d= d<q ? q : d;
262
-			}
263
-
264
-        		dst[3]-= d;
265
-	        	dst[4]+= d;
266
-		}
267
-		dst+= stride;
268
-	}
269
-}
270
-
271
-/**
272
- * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
273
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
274
- */
275
-static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
276
-{
277
-
278
-	int y;
279
-	for(y=0; y<BLOCK_SIZE; y++)
280
-	{
281
-		const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
282
-		const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
283
-
284
-		int sums[9];
285
-		sums[0] = first + dst[0];
286
-		sums[1] = dst[0] + dst[1];
287
-		sums[2] = dst[1] + dst[2];
288
-		sums[3] = dst[2] + dst[3];
289
-		sums[4] = dst[3] + dst[4];
290
-		sums[5] = dst[4] + dst[5];
291
-		sums[6] = dst[5] + dst[6];
292
-		sums[7] = dst[6] + dst[7];
293
-		sums[8] = dst[7] + last;
294
-
295
-		dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
296
-		dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
297
-		dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
298
-		dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
299
-		dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
300
-		dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
301
-		dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4;
302
-		dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
303
-
304
-		dst+= stride;
305
-	}
306
-}
307
-
308
-/**
309
- * Experimental Filter 1 (Horizontal)
310
- * will not damage linear gradients
311
- * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
312
- * can only smooth blocks at the expected locations (it cant smooth them if they did move)
313
- * MMX2 version does correct clipping C version doesnt
314
- * not identical with the vertical one
315
- */
316
-static inline void horizX1Filter(uint8_t *src, int stride, int QP)
317
-{
318
-	int y;
319
-	static uint64_t *lut= NULL;
320
-	if(lut==NULL)
321
-	{
322
-		int i;
323
-		lut= (uint64_t*)memalign(8, 256*8);
324
-		for(i=0; i<256; i++)
325
-		{
326
-			int v= i < 128 ? 2*i : 2*(i-256);
327
-/*
328
-//Simulate 112242211 9-Tap filter
329
-			uint64_t a= (v/16) & 0xFF;
330
-			uint64_t b= (v/8) & 0xFF;
331
-			uint64_t c= (v/4) & 0xFF;
332
-			uint64_t d= (3*v/8) & 0xFF;
333
-*/
334
-//Simulate piecewise linear interpolation
335
-			uint64_t a= (v/16) & 0xFF;
336
-			uint64_t b= (v*3/16) & 0xFF;
337
-			uint64_t c= (v*5/16) & 0xFF;
338
-			uint64_t d= (7*v/16) & 0xFF;
339
-			uint64_t A= (0x100 - a)&0xFF;
340
-			uint64_t B= (0x100 - b)&0xFF;
341
-			uint64_t C= (0x100 - c)&0xFF;
342
-			uint64_t D= (0x100 - c)&0xFF;
343
-
344
-			lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
345
-				(D<<24) | (C<<16) | (B<<8) | (A);
346
-			//lut[i] = (v<<32) | (v<<24);
347
-		}
348
-	}
349
-
350
-	for(y=0; y<BLOCK_SIZE; y++)
351
-	{
352
-		int a= src[1] - src[2];
353
-		int b= src[3] - src[4];
354
-		int c= src[5] - src[6];
355
-
356
-		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
357
-
358
-		if(d < QP)
359
-		{
360
-			int v = d * SIGN(-b);
361
-
362
-			src[1] +=v/8;
363
-			src[2] +=v/4;
364
-			src[3] +=3*v/8;
365
-			src[4] -=3*v/8;
366
-			src[5] -=v/4;
367
-			src[6] -=v/8;
368
-
369
-		}
370
-		src+=stride;
371
-	}
372
-}
373
-
374
-
375
-//Note: the template exists in C, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
376
-//Plain C versions: compiled if MMX is not available or the CPU is detected at runtime
377
-#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
378
-#define COMPILE_C
379
-#endif
380
-
381
-#ifdef ARCH_X86
382
-
383
-#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
384
-#define COMPILE_MMX
385
-#endif
386
-
387
-#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
388
-#define COMPILE_MMX2
389
-#endif
390
-
391
-#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
392
-#define COMPILE_3DNOW
393
-#endif
394
-#endif //ARCH_X86
395
-// the configured CPU macros are dropped here; every variant below sets its own combination before including the template
396
-#undef HAVE_MMX
397
-#undef HAVE_MMX2
398
-#undef HAVE_3DNOW
399
-#undef ARCH_X86
400
-
401
-#ifdef COMPILE_C
402
-#undef HAVE_MMX
403
-#undef HAVE_MMX2
404
-#undef HAVE_3DNOW
405
-#undef ARCH_X86
406
-#define RENAME(a) a ## _C
407
-#include "postprocess_template.c"
408
-#endif
409
-
410
-//MMX versions: template functions get the suffix _MMX
411
-#ifdef COMPILE_MMX
412
-#undef RENAME
413
-#define HAVE_MMX
414
-#undef HAVE_MMX2
415
-#undef HAVE_3DNOW
416
-#define ARCH_X86
417
-#define RENAME(a) a ## _MMX
418
-#include "postprocess_template.c"
419
-#endif
420
-
421
-//MMX2 versions: template functions get the suffix _MMX2
422
-#ifdef COMPILE_MMX2
423
-#undef RENAME
424
-#define HAVE_MMX
425
-#define HAVE_MMX2
426
-#undef HAVE_3DNOW
427
-#define ARCH_X86
428
-#define RENAME(a) a ## _MMX2
429
-#include "postprocess_template.c"
430
-#endif
431
-
432
-//3DNOW versions: template functions get the suffix _3DNow
433
-#ifdef COMPILE_3DNOW
434
-#undef RENAME
435
-#define HAVE_MMX
436
-#undef HAVE_MMX2
437
-#define HAVE_3DNOW
438
-#define ARCH_X86
439
-#define RENAME(a) a ## _3DNow
440
-#include "postprocess_template.c"
441
-#endif
442
-
443
-// minor note: HAVE_xyz and ARCH_X86 were redefined above for each variant, so after this line they are messed up; don't rely on them
444
-
445
-static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
446
-	QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc)
447
-{
448
-	PPContext *c= (PPContext *)vc;
449
-	PPMode *ppMode= (PPMode *)vm;
450
-	c->ppMode= *ppMode; //FIXME
451
-
452
-	// useing ifs here as they are faster than function pointers allthough the
453
-	// difference wouldnt be messureable here but its much better because
454
-	// someone might exchange the cpu whithout restarting mplayer ;)
455
-#ifdef RUNTIME_CPUDETECT
456
-#ifdef ARCH_X86
457
-	// ordered per speed fasterst first
458
-	if(c->cpuCaps & PP_CPU_CAPS_MMX2)
459
-		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
460
-	else if(c->cpuCaps & PP_CPU_CAPS_3DNOW)
461
-		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
462
-	else if(c->cpuCaps & PP_CPU_CAPS_MMX)
463
-		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
464
-	else
465
-		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
466
-#else
467
-		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
468
-#endif
469
-#else //RUNTIME_CPUDETECT
470
-#ifdef HAVE_MMX2
471
-		postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
472
-#elif defined (HAVE_3DNOW)
473
-		postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
474
-#elif defined (HAVE_MMX)
475
-		postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
476
-#else
477
-		postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
478
-#endif
479
-#endif //!RUNTIME_CPUDETECT
480
-}
481
-
482
-//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
483
-//	QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode);
484
-
485
-/* Help text for the -pp command line option: syntax of the mode string, examples and the table of filters with their options
486
-*/
487
-char *pp_help=
488
-"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
489
-"long form example:\n"
490
-"vdeblock:autoq/hdeblock:autoq/linblenddeint	default,-vdeblock\n"
491
-"short form example:\n"
492
-"vb:a/hb:a/lb					de,-vb\n"
493
-"more examples:\n"
494
-"tn:64:128:256\n"
495
-"Filters			Options\n"
496
-"short	long name	short	long option	Description\n"
497
-"*	*		a	autoq		cpu power dependant enabler\n"
498
-"			c	chrom		chrominance filtring enabled\n"
499
-"			y	nochrom		chrominance filtring disabled\n"
500
-"hb	hdeblock	(2 Threshold)		horizontal deblocking filter\n"
501
-"	1. difference factor: default=64, higher -> more deblocking\n"
502
-"	2. flatness threshold: default=40, lower -> more deblocking\n"
503
-"			the h & v deblocking filters share these\n"
504
-"			so u cant set different thresholds for h / v\n"
505
-"vb	vdeblock	(2 Threshold)		vertical deblocking filter\n"
506
-"h1	x1hdeblock				Experimental h deblock filter 1\n"
507
-"v1	x1vdeblock				Experimental v deblock filter 1\n"
508
-"dr	dering					Deringing filter\n"
509
-"al	autolevels				automatic brightness / contrast\n"
510
-"			f	fullyrange	stretch luminance to (0..255)\n"
511
-"lb	linblenddeint				linear blend deinterlacer\n"
512
-"li	linipoldeint				linear interpolating deinterlace\n"
513
-"ci	cubicipoldeint				cubic interpolating deinterlacer\n"
514
-"md	mediandeint				median deinterlacer\n"
515
-"fd	ffmpegdeint				ffmpeg deinterlacer\n"
516
-"de	default					hb:a,vb:a,dr:a,al\n"
517
-"fa	fast					h1:a,v1:a,dr:a,al\n"
518
-"tn	tmpnoise	(3 Thresholds)		Temporal Noise Reducer\n"
519
-"			1. <= 2. <= 3.		larger -> stronger filtering\n"
520
-"fq	forceQuant	<quantizer>		Force quantizer\n"
521
-;
522
-
523
-pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality)
524
-{
525
-	char temp[GET_MODE_BUFFER_SIZE];
526
-	char *p= temp;
527
-	char *filterDelimiters= ",/";
528
-	char *optionDelimiters= ":";
529
-	struct PPMode *ppMode;
530
-	char *filterToken;
531
-
532
-	ppMode= memalign(8, sizeof(PPMode));
533
-	
534
-	ppMode->lumMode= 0;
535
-	ppMode->chromMode= 0;
536
-	ppMode->maxTmpNoise[0]= 700;
537
-	ppMode->maxTmpNoise[1]= 1500;
538
-	ppMode->maxTmpNoise[2]= 3000;
539
-	ppMode->maxAllowedY= 234;
540
-	ppMode->minAllowedY= 16;
541
-	ppMode->baseDcDiff= 256/4;
542
-	ppMode->flatnessThreshold= 56-16;
543
-	ppMode->maxClippedThreshold= 0.01;
544
-	ppMode->error=0;
545
-
546
-	strncpy(temp, name, GET_MODE_BUFFER_SIZE);
547
-
548
-	if(verbose>1) printf("pp: %s\n", name);
549
-
550
-	for(;;){
551
-		char *filterName;
552
-		int q= 1000000; //PP_QUALITY_MAX;
553
-		int chrom=-1;
554
-		char *option;
555
-		char *options[OPTIONS_ARRAY_SIZE];
556
-		int i;
557
-		int filterNameOk=0;
558
-		int numOfUnknownOptions=0;
559
-		int enable=1; //does the user want us to enabled or disabled the filter
560
-
561
-		filterToken= strtok(p, filterDelimiters);
562
-		if(filterToken == NULL) break;
563
-		p+= strlen(filterToken) + 1; // p points to next filterToken
564
-		filterName= strtok(filterToken, optionDelimiters);
565
-		if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName);
566
-
567
-		if(*filterName == '-')
568
-		{
569
-			enable=0;
570
-			filterName++;
571
-		}
572
-
573
-		for(;;){ //for all options
574
-			option= strtok(NULL, optionDelimiters);
575
-			if(option == NULL) break;
576
-
577
-			if(verbose>1) printf("pp: option: %s\n", option);
578
-			if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
579
-			else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
580
-			else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
581
-			else
582
-			{
583
-				options[numOfUnknownOptions] = option;
584
-				numOfUnknownOptions++;
585
-			}
586
-			if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
587
-		}
588
-		options[numOfUnknownOptions] = NULL;
589
-
590
-		/* replace stuff from the replace Table */
591
-		for(i=0; replaceTable[2*i]!=NULL; i++)
592
-		{
593
-			if(!strcmp(replaceTable[2*i], filterName))
594
-			{
595
-				int newlen= strlen(replaceTable[2*i + 1]);
596
-				int plen;
597
-				int spaceLeft;
598
-
599
-				if(p==NULL) p= temp, *p=0; 	//last filter
600
-				else p--, *p=',';		//not last filter
601
-
602
-				plen= strlen(p);
603
-				spaceLeft= p - temp + plen;
604
-				if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE)
605
-				{
606
-					ppMode->error++;
607
-					break;
608
-				}
609
-				memmove(p + newlen, p, plen+1);
610
-				memcpy(p, replaceTable[2*i + 1], newlen);
611
-				filterNameOk=1;
612
-			}
613
-		}
614
-
615
-		for(i=0; filters[i].shortName!=NULL; i++)
616
-		{
617
-//			printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName);
618
-			if(   !strcmp(filters[i].longName, filterName)
619
-			   || !strcmp(filters[i].shortName, filterName))
620
-			{
621
-				ppMode->lumMode &= ~filters[i].mask;
622
-				ppMode->chromMode &= ~filters[i].mask;
623
-
624
-				filterNameOk=1;
625
-				if(!enable) break; // user wants to disable it
626
-
627
-				if(q >= filters[i].minLumQuality)
628
-					ppMode->lumMode|= filters[i].mask;
629
-				if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
630
-					if(q >= filters[i].minChromQuality)
631
-						ppMode->chromMode|= filters[i].mask;
632
-
633
-				if(filters[i].mask == LEVEL_FIX)
634
-				{
635
-					int o;
636
-					ppMode->minAllowedY= 16;
637
-					ppMode->maxAllowedY= 234;
638
-					for(o=0; options[o]!=NULL; o++)
639
-					{
640
-						if(  !strcmp(options[o],"fullyrange")
641
-						   ||!strcmp(options[o],"f"))
642
-						{
643
-							ppMode->minAllowedY= 0;
644
-							ppMode->maxAllowedY= 255;
645
-							numOfUnknownOptions--;
646
-						}
647
-					}
648
-				}
649
-				else if(filters[i].mask == TEMP_NOISE_FILTER)
650
-				{
651
-					int o;
652
-					int numOfNoises=0;
653
-
654
-					for(o=0; options[o]!=NULL; o++)
655
-					{
656
-						char *tail;
657
-						ppMode->maxTmpNoise[numOfNoises]=
658
-							strtol(options[o], &tail, 0);
659
-						if(tail!=options[o])
660
-						{
661
-							numOfNoises++;
662
-							numOfUnknownOptions--;
663
-							if(numOfNoises >= 3) break;
664
-						}
665
-					}
666
-				}
667
-				else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK)
668
-				{
669
-					int o;
670
-
671
-					for(o=0; options[o]!=NULL && o<2; o++)
672
-					{
673
-						char *tail;
674
-						int val= strtol(options[o], &tail, 0);
675
-						if(tail==options[o]) break;
676
-
677
-						numOfUnknownOptions--;
678
-						if(o==0) ppMode->baseDcDiff= val;
679
-						else ppMode->flatnessThreshold= val;
680
-					}
681
-				}
682
-				else if(filters[i].mask == FORCE_QUANT)
683
-				{
684
-					int o;
685
-					ppMode->forcedQuant= 15;
686
-
687
-					for(o=0; options[o]!=NULL && o<1; o++)
688
-					{
689
-						char *tail;
690
-						int val= strtol(options[o], &tail, 0);
691
-						if(tail==options[o]) break;
692
-
693
-						numOfUnknownOptions--;
694
-						ppMode->forcedQuant= val;
695
-					}
696
-				}
697
-			}
698
-		}
699
-		if(!filterNameOk) ppMode->error++;
700
-		ppMode->error += numOfUnknownOptions;
701
-	}
702
-
703
-	if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
704
-	if(ppMode->error)
705
-	{
706
-		fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
707
-		free(ppMode);
708
-		return NULL;
709
-	}
710
-	return ppMode;
711
-}
712
-
713
-void pp_free_mode(pp_mode_t *mode){
714
-    if(mode) free(mode);
715
-}
716
-
717
/**
 * Replace the buffer *p by a new, zero filled one.
 * The old buffer (if any) is freed; its contents are not preserved.
 * @param p         address of the buffer pointer; set to NULL if the
 *                  allocation fails
 * @param alignment alignment passed to memalign()
 * @param size      size of the new buffer in bytes
 */
static void reallocAlign(void **p, int alignment, int size){
	if(*p) free(*p);
	*p= memalign(alignment, size);
	// memalign() may fail; clearing a NULL pointer would crash
	if(*p) memset(*p, 0, size);
}
722
-
723
-static void reallocBuffers(PPContext *c, int width, int height, int stride){
724
-	int mbWidth = (width+15)>>4;
725
-	int mbHeight= (height+15)>>4;
726
-	int i;
727
-
728
-	c->stride= stride;
729
-
730
-	reallocAlign((void **)&c->tempDst, 8, stride*24);
731
-	reallocAlign((void **)&c->tempSrc, 8, stride*24);
732
-	reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
733
-	reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
734
-	for(i=0; i<256; i++)
735
-		c->yHistogram[i]= width*height/64*15/256;
736
-
737
-	for(i=0; i<3; i++)
738
-	{
739
-		//Note:the +17*1024 is just there so i dont have to worry about r/w over te end
740
-		reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024);
741
-		reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
742
-	}
743
-
744
-	reallocAlign((void **)&c->deintTemp, 8, width+16);
745
-	reallocAlign((void **)&c->nonBQPTable, 8, mbWidth*mbHeight*sizeof(QP_STORE_T));
746
-	reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
747
-}
748
-
749
-pp_context_t *pp_get_context(int width, int height, int cpuCaps){
750
-	PPContext *c= memalign(32, sizeof(PPContext));
751
-	int i;
752
-	int stride= (width+15)&(~15); //assumed / will realloc if needed
753
-        
754
-	memset(c, 0, sizeof(PPContext));
755
-	c->cpuCaps= cpuCaps;
756
-	if(cpuCaps&PP_FORMAT){
757
-		c->hChromaSubSample= cpuCaps&0x3;
758
-		c->vChromaSubSample= (cpuCaps>>4)&0x3;
759
-	}else{
760
-		c->hChromaSubSample= 1;
761
-		c->vChromaSubSample= 1;
762
-	}
763
-
764
-	reallocBuffers(c, width, height, stride);
765
-        
766
-	c->frameNum=-1;
767
-
768
-	return c;
769
-}
770
-
771
-void pp_free_context(void *vc){
772
-	PPContext *c = (PPContext*)vc;
773
-	int i;
774
-	
775
-	for(i=0; i<3; i++) free(c->tempBlured[i]);
776
-	for(i=0; i<3; i++) free(c->tempBluredPast[i]);
777
-	
778
-	free(c->tempBlocks);
779
-	free(c->yHistogram);
780
-	free(c->tempDst);
781
-	free(c->tempSrc);
782
-	free(c->deintTemp);
783
-	free(c->nonBQPTable);
784
-	free(c->forcedQPTable);
785
-        
786
-	memset(c, 0, sizeof(PPContext));
787
-
788
-	free(c);
789
-}
790
-
791
-void  pp_postprocess(uint8_t * src[3], int srcStride[3],
792
-                 uint8_t * dst[3], int dstStride[3],
793
-                 int width, int height,
794
-                 QP_STORE_T *QP_store,  int QPStride,
795
-		 pp_mode_t *vm,  void *vc, int pict_type)
796
-{
797
-	int mbWidth = (width+15)>>4;
798
-	int mbHeight= (height+15)>>4;
799
-	PPMode *mode = (PPMode*)vm;
800
-	PPContext *c = (PPContext*)vc;
801
-        int minStride= MAX(srcStride[0], dstStride[0]);
802
-	
803
-	if(c->stride < minStride)
804
-		reallocBuffers(c, width, height, minStride);
805
-
806
-	if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) 
807
-	{
808
-		int i;
809
-		QP_store= c->forcedQPTable;
810
-		QPStride= 0;
811
-		if(mode->lumMode & FORCE_QUANT)
812
-			for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant;
813
-		else
814
-			for(i=0; i<mbWidth; i++) QP_store[i]= 1;
815
-	}
816
-if(0){
817
-int x,y;
818
-for(y=0; y<mbHeight; y++){
819
-	for(x=0; x<mbWidth; x++){
820
-		printf("%2d ", QP_store[x + y*QPStride]);
821
-	}
822
-	printf("\n");
823
-}
824
-	printf("\n");
825
-}
826
-//printf("pict_type:%d\n", pict_type);
827
-
828
-	if(pict_type!=3)
829
-	{
830
-		int x,y;
831
-		for(y=0; y<mbHeight; y++){
832
-			for(x=0; x<mbWidth; x++){
833
-				int qscale= QP_store[x + y*QPStride];
834
-				if(qscale&~31)
835
-				    qscale=31;
836
-				c->nonBQPTable[y*mbWidth + x]= qscale;
837
-			}
838
-		}
839
-	}
840
-
841
-	if(verbose>2)
842
-	{
843
-		printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode);
844
-	}
845
-
846
-	postProcess(src[0], srcStride[0], dst[0], dstStride[0],
847
-		width, height, QP_store, QPStride, 0, mode, c);
848
-
849
-	width  = (width )>>c->hChromaSubSample;
850
-	height = (height)>>c->vChromaSubSample;
851
-
852
-	if(mode->chromMode)
853
-	{
854
-		postProcess(src[1], srcStride[1], dst[1], dstStride[1],
855
-			width, height, QP_store, QPStride, 1, mode, c);
856
-		postProcess(src[2], srcStride[2], dst[2], dstStride[2],
857
-			width, height, QP_store, QPStride, 2, mode, c);
858
-	}
859
-	else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2])
860
-	{
861
-		memcpy(dst[1], src[1], srcStride[1]*height);
862
-		memcpy(dst[2], src[2], srcStride[2]*height);
863
-	}
864
-	else
865
-	{
866
-		int y;
867
-		for(y=0; y<height; y++)
868
-		{
869
-			memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
870
-			memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
871
-		}
872
-	}
873
-}
874
-
875 1
deleted file mode 100644
... ...
@@ -1,73 +0,0 @@
1
-/*
2
-    Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3
-
4
-    This program is free software; you can redistribute it and/or modify
5
-    it under the terms of the GNU General Public License as published by
6
-    the Free Software Foundation; either version 2 of the License, or
7
-    (at your option) any later version.
8
-
9
-    This program is distributed in the hope that it will be useful,
10
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
-    GNU General Public License for more details.
13
-
14
-    You should have received a copy of the GNU General Public License
15
-    along with this program; if not, write to the Free Software
16
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
-*/
18
-
19
#ifndef NEWPOSTPROCESS_H
#define NEWPOSTPROCESS_H

/**
 * @file postprocess.h
 * @brief
 *     external api for the pp stuff
 */

#include <inttypes.h> /* uint8_t and int8_t are used by the declarations below */

#ifdef __cplusplus
extern "C" {
#endif

#define PP_QUALITY_MAX 6

#define QP_STORE_T int8_t

/* opaque handles; the library casts them to its internal types */
typedef void pp_context_t;
typedef void pp_mode_t;

extern char *pp_help; //a simple help text

/**
 * post-process the 3 planes of a picture from src to dst
 * QP_store may be NULL, a constant quantizer is used in that case
 */
void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                 uint8_t * dst[3], int dstStride[3],
                 int horizontalSize, int verticalSize,
                 QP_STORE_T *QP_store,  int QP_stride,
		 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);


/**
 * returns a pp_mode_t or NULL if an error occurred
 * name is the string after "-pp" on the command line
 * quality is a number from 0 to PP_QUALITY_MAX
 */
pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality);
void pp_free_mode(pp_mode_t *mode);

/**
 * flags is a combination of PP_CPU_CAPS_* and optionally one PP_FORMAT_* value
 */
pp_context_t *pp_get_context(int width, int height, int flags);
void pp_free_context(pp_context_t *ppContext);

#define PP_CPU_CAPS_MMX   0x80000000
#define PP_CPU_CAPS_MMX2  0x20000000
#define PP_CPU_CAPS_3DNOW 0x40000000

/* PP_FORMAT marks that a format is given; bits 0-1 hold the horizontal and
   bits 4-5 the vertical chroma shift */
#define PP_FORMAT         0x00000008
#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
#define PP_FORMAT_444    (0x00000000|PP_FORMAT)

#ifdef __cplusplus
}
#endif

#endif
74 1
deleted file mode 100644
... ...
@@ -1,128 +0,0 @@
1
-/*
2
-    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3
-
4
-    This program is free software; you can redistribute it and/or modify
5
-    it under the terms of the GNU General Public License as published by
6
-    the Free Software Foundation; either version 2 of the License, or
7
-    (at your option) any later version.
8
-
9
-    This program is distributed in the hope that it will be useful,
10
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
-    GNU General Public License for more details.
13
-
14
-    You should have received a copy of the GNU General Public License
15
-    along with this program; if not, write to the Free Software
16
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
-*/
18
-
19
/* Filter mask bits as stored in PPMode.lumMode and PPMode.chromMode */
#define V_DEBLOCK                 0x01
#define H_DEBLOCK                 0x02
#define DERING                    0x04
#define LEVEL_FIX                 0x08     /* Brightness & Contrast */

/* Combined luminance / chrominance values (chrominance = mask << 4) */
#define LUM_V_DEBLOCK             V_DEBLOCK         //   1
#define LUM_H_DEBLOCK             H_DEBLOCK         //   2
#define CHROM_V_DEBLOCK           (V_DEBLOCK<<4)    //  16
#define CHROM_H_DEBLOCK           (H_DEBLOCK<<4)    //  32
#define LUM_DERING                DERING            //   4
#define CHROM_DERING              (DERING<<4)       //  64
#define LUM_LEVEL_FIX             LEVEL_FIX         //   8
#define CHROM_LEVEL_FIX           (LEVEL_FIX<<4)    // 128 (not implemented yet)

// Experimental vertical filters
#define V_X1_FILTER               0x0200            // 512

// Experimental horizontal filters
#define H_X1_FILTER               0x2000            // 8192

// select between the full y range (255-0) or the standard one (234-16)
#define FULL_Y_RANGE              0x8000            // 32768

// Deinterlacing filters
#define LINEAR_IPOL_DEINT_FILTER  0x10000           // 65536
#define LINEAR_BLEND_DEINT_FILTER 0x20000           // 131072
#define CUBIC_BLEND_DEINT_FILTER  0x8000            // (not implemented yet) NOTE: same value as FULL_Y_RANGE
#define CUBIC_IPOL_DEINT_FILTER   0x40000           // 262144
#define MEDIAN_DEINT_FILTER       0x80000           // 524288
#define FFMPEG_DEINT_FILTER       0x400000

#define TEMP_NOISE_FILTER         0x100000
#define FORCE_QUANT               0x200000

// Define COMPILE_TIME_MODE if you want faster postprocessing code.
// It cannot differentiate between chroma & luma filters (both on or both off),
// and the -pp option on the command line then has no effect except turning
// the filters selected here on.
//#define COMPILE_TIME_MODE 0x77
58
-
59
/**
 * Description of one postprocessing filter: its names and the conditions
 * under which it is switched on.
 */
struct PPFilter{
	char *shortName;        // short name of the filter
	char *longName;         // long name of the filter
	int   chromDefault;     // is chrominance filtering on by default if this filter is manually activated
	int   minLumQuality;    // minimum quality to turn luminance filtering on
	int   minChromQuality;  // minimum quality to turn chrominance filtering on
	int   mask;             // bitmask to turn this filter on
};
67
-
68
/**
 * Postprocessing mode: which filters are active and their parameters.
 */
typedef struct PPMode{
	int lumMode;                // activates filters for luminance
	int chromMode;              // activates filters for chrominance
	int error;                  // non zero on error

	int minAllowedY;            // for brightness correction
	int maxAllowedY;            // for brightness correction
	float maxClippedThreshold;  // amount of "black" you are willing to lose to get a brightness corrected picture

	int maxTmpNoise[3];         // for the Temporal Noise Reducing filter (maximal sum of abs differences)

	int baseDcDiff;             // difference factor of the deblocking filters
	int flatnessThreshold;      // flatness threshold of the deblocking filters

	int forcedQuant;            // quantizer if FORCE_QUANT is used
} PPMode;
84
-
85
-typedef struct PPContext{
86
-	uint8_t *tempBlocks; //used for the horizontal code
87
-
88

                
89
-	   after watching a black picture for 5 hours*/
90
-	uint64_t *yHistogram;
91
-
92
-	uint64_t __attribute__((aligned(8))) packedYOffset;
93
-	uint64_t __attribute__((aligned(8))) packedYScale;
94
-
95
-	/* Temporal noise reducing buffers */
96
-	uint8_t *tempBlured[3];
97
-	int32_t *tempBluredPast[3];
98
-
99
-	/* Temporary buffers for handling the last row(s) */
100
-	uint8_t *tempDst;
101
-	uint8_t *tempSrc;
102
-
103
-	uint8_t *deintTemp;
104
-
105
-	uint64_t __attribute__((aligned(8))) pQPb;
106
-	uint64_t __attribute__((aligned(8))) pQPb2;
107
-
108
-	uint64_t __attribute__((aligned(8))) mmxDcOffset[32];
109
-	uint64_t __attribute__((aligned(8))) mmxDcThreshold[32];
110
-
111
-	QP_STORE_T *nonBQPTable;
112
-	QP_STORE_T *forcedQPTable;
113
-
114
-	int QP;
115
-	int nonBQP;
116
-
117
-	int frameNum;
118
-	
119
-	int cpuCaps;
120
-        
121
-	int stride; //size of some buffers (needed to realloc them if needed)
122
-        
123
-	int hChromaSubSample;
124
-	int vChromaSubSample;
125
-
126
-	PPMode ppMode;
127
-} PPContext;
128
-
129 1
deleted file mode 100644
... ...
@@ -1,3127 +0,0 @@
1
-/*
2
-    Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3
-
4
-    This program is free software; you can redistribute it and/or modify
5
-    it under the terms of the GNU General Public License as published by
6
-    the Free Software Foundation; either version 2 of the License, or
7
-    (at your option) any later version.
8
-
9
-    This program is distributed in the hope that it will be useful,
10
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
-    GNU General Public License for more details.
13
-
14
-    You should have received a copy of the GNU General Public License
15
-    along with this program; if not, write to the Free Software
16
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
-*/
18
-
19
-#undef PAVGB // clear any earlier definition before the conditional redefinitions below
20
-#undef PMINUB
21
-#undef PMAXUB
22
-
23
-#ifdef HAVE_MMX2
24
-#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" // asm text: b = byte-wise average of a and b
25
-#elif defined (HAVE_3DNOW)
26
-#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" // same operand order, 3DNow! mnemonic
27
-#endif // note: PAVGB stays undefined when neither HAVE_MMX2 nor HAVE_3DNOW is set
28
-
29
-#ifdef HAVE_MMX2
30
-#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" // 2nd argument = unsigned byte minimum; t is unused here
31
-#elif defined (HAVE_MMX)
32
-#define PMINUB(b,a,t) \
33
-	"movq " #a ", " #t " \n\t"\
34
-	"psubusb " #b ", " #t " \n\t"\
35
-	"psubb " #t ", " #a " \n\t" // plain MMX emulation: t = sat(a-b); a -= t, so the 2nd macro argument ends up as the minimum and t is clobbered (parameter names are swapped relative to the MMX2 variant)
36
-#endif
37
-
38
-#ifdef HAVE_MMX2
39
-#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" // 2nd argument = unsigned byte maximum
40
-#elif defined (HAVE_MMX)
41
-#define PMAXUB(a,b) \
42
-	"psubusb " #a ", " #b " \n\t"\
43
-	"paddb " #a ", " #b " \n\t" // plain MMX emulation: b = sat(b-a) + a, i.e. the maximum, left in the 2nd argument
44
-#endif
45
-
46
-
47
-//FIXME? |255-0| = 1 (shouldnt be a problem ...)
48
-#ifdef HAVE_MMX
49
-/**
50
- * Check if the middle 8x8 Block in the given 8x16 block is flat
- *
- * src is advanced by 4 lines, then the 7 pairs of vertically adjacent lines
- * (0-1, 1-2, ... 6-7) are compared: for every byte the difference plus
- * c->mmxDcOffset[c->nonBQP] is tested (signed byte compare) against
- * c->mmxDcThreshold[c->nonBQP]. The matches of all 8 columns are summed up.
- *
- * @param src    pointer to the 8x16 block
- * @param stride distance in bytes between two lines
- * @param c      context, supplies nonBQP, the mmxDc* tables and ppMode.flatnessThreshold
- * @return non zero if the number of matches is greater than c->ppMode.flatnessThreshold
51
- */
52
-static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
53
-	int numEq= 0;
54
-	src+= stride*4; // src points to begin of the 8x8 Block
55
-asm volatile(
56
-		"leal (%1, %2), %%eax				\n\t"
57
-//	0	1	2	3	4	5	6	7	8	9
58
-//	%1	eax	eax+%2	eax+2%2	%1+4%2	ecx	ecx+%2	ecx+2%2	%1+8%2	ecx+4%2
59
-		"movq %3, %%mm7					\n\t" // mm7 = c->mmxDcOffset[c->nonBQP]
60
-		"movq %4, %%mm6					\n\t" // mm6 = c->mmxDcThreshold[c->nonBQP]
61
-
62
-		"movq (%1), %%mm0				\n\t"
63
-		"movq (%%eax), %%mm1				\n\t"
64
-		"psubb %%mm1, %%mm0				\n\t" // mm0 = difference (line 0 - line 1)
65
-		"paddb %%mm7, %%mm0				\n\t"
66
-		"pcmpgtb %%mm6, %%mm0				\n\t" // 0xFF in every byte where difference + offset > threshold
67
-
68
-		"movq (%%eax,%2), %%mm2				\n\t"
69
-		"psubb %%mm2, %%mm1				\n\t"
70
-		"paddb %%mm7, %%mm1				\n\t"
71
-		"pcmpgtb %%mm6, %%mm1				\n\t"
72
-		"paddb %%mm1, %%mm0				\n\t" // accumulate: every match adds 0xFF (-1) to its byte
73
-
74
-		"movq (%%eax, %2, 2), %%mm1			\n\t"
75
-		"psubb %%mm1, %%mm2				\n\t"
76
-		"paddb %%mm7, %%mm2				\n\t"
77
-		"pcmpgtb %%mm6, %%mm2				\n\t"
78
-		"paddb %%mm2, %%mm0				\n\t"
79
-		
80
-		"leal (%%eax, %2, 4), %%eax			\n\t" // eax now points to line 5 (ecx of the table above is not used here)
81
-
82
-		"movq (%1, %2, 4), %%mm2			\n\t"
83
-		"psubb %%mm2, %%mm1				\n\t"
84
-		"paddb %%mm7, %%mm1				\n\t"
85
-		"pcmpgtb %%mm6, %%mm1				\n\t"
86
-		"paddb %%mm1, %%mm0				\n\t"
87
-
88
-		"movq (%%eax), %%mm1				\n\t"
89
-		"psubb %%mm1, %%mm2				\n\t"
90
-		"paddb %%mm7, %%mm2				\n\t"
91
-		"pcmpgtb %%mm6, %%mm2				\n\t"
92
-		"paddb %%mm2, %%mm0				\n\t"
93
-
94
-		"movq (%%eax, %2), %%mm2			\n\t"
95
-		"psubb %%mm2, %%mm1				\n\t"
96
-		"paddb %%mm7, %%mm1				\n\t"
97
-		"pcmpgtb %%mm6, %%mm1				\n\t"
98
-		"paddb %%mm1, %%mm0				\n\t"
99
-
100
-		"movq (%%eax, %2, 2), %%mm1			\n\t"
101
-		"psubb %%mm1, %%mm2				\n\t"
102
-		"paddb %%mm7, %%mm2				\n\t"
103
-		"pcmpgtb %%mm6, %%mm2				\n\t"
104
-		"paddb %%mm2, %%mm0				\n\t"
105
-
106
-		"						\n\t"
107
-#ifdef HAVE_MMX2
108
-		"pxor %%mm7, %%mm7				\n\t"
109
-		"psadbw %%mm7, %%mm0				\n\t" // horizontal sum of the 8 byte counters
110
-#else
111
-		"movq %%mm0, %%mm1				\n\t" // horizontal sum of the 8 byte counters by shifting and adding
112
-		"psrlw $8, %%mm0				\n\t"
113
-		"paddb %%mm1, %%mm0				\n\t"
114
-		"movq %%mm0, %%mm1				\n\t"
115
-		"psrlq $16, %%mm0				\n\t"
116
-		"paddb %%mm1, %%mm0				\n\t"
117
-		"movq %%mm0, %%mm1				\n\t"
118
-		"psrlq $32, %%mm0				\n\t"
119
-		"paddb %%mm1, %%mm0				\n\t"
120
-#endif
121
-		"movd %%mm0, %0					\n\t"
122
-		: "=r" (numEq)
123
-		: "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]),  "m" (c->mmxDcThreshold[c->nonBQP])
124
-		: "%eax"
125
-		);
126
-	numEq= (-numEq) &0xFF; // matches were accumulated as -1 each, negate and keep the low byte to get the count
127
-	return numEq > c->ppMode.flatnessThreshold;
128
-}
129
-#endif
130
-
131
-static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c) // returns 1 if, in all 8 columns, the two tested lines differ by at most 2*QP, otherwise 0
132
-{
133
-#ifdef HAVE_MMX
134
-	int isOk;
135
-	src+= stride*3;
136
-	asm volatile(
137
-		"movq (%1, %2), %%mm0				\n\t" // line 1 (relative to the advanced src)
138
-		"movq (%1, %2, 8), %%mm1			\n\t" // line 8
139
-		"movq %%mm0, %%mm2				\n\t"
140
-		"psubusb %%mm1, %%mm0				\n\t"
141
-		"psubusb %%mm2, %%mm1				\n\t"
142
-		"por %%mm1, %%mm0				\n\t" // ABS Diff
143
-
144
-		"movq %3, %%mm7					\n\t" // QP,..., QP
145
-		"paddusb %%mm7, %%mm7				\n\t" // 2QP ... 2QP
146
-		"psubusb %%mm7, %%mm0				\n\t" // Diff <= 2QP -> 0
147
-		"packssdw %%mm0, %%mm0				\n\t" // squeeze both dwords into the low 32 bits so one movd sees all 8 columns
148
-		"movd %%mm0, %0					\n\t"
149
-		: "=r" (isOk)
150
-		: "r" (src), "r" (stride), "m" (c->pQPb)
151
-		);
152
-	return isOk==0;
153
-#else
154
-#if 1 // active C version: tests line 1 against line 8 only
155
-	int x;
156
-	const int QP= c->QP;
157
-	src+= stride*3;
158
-	for(x=0; x<BLOCK_SIZE; x++)
159
-	{
160
-		if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0; // single unsigned compare for: difference outside -2QP..2QP
161
-	}
162
-
163
-	return 1;
164
-#else // disabled variant: per column min/max over 8 lines starting at line 4
165
-	int x;
166
-	const int QP= c->QP;
167
-	src+= stride*4;
168
-	for(x=0; x<BLOCK_SIZE; x++)
169
-	{
170
-		int min=255;
171
-		int max=0;
172
-		int y;
173
-		for(y=0; y<8; y++){
174
-			int v= src[x + y*stride];
175
-			if(v>max) max=v;
176
-			if(v<min) min=v;
177
-		}
178
-		if(max-min > 2*QP) return 0;
179
-	}
180
-	return 1;
181
-#endif
182
-#endif
183
-}
184
-
185
-/**
186
- * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
187
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
- *
- * src is advanced by 3 lines first; lines 1..8 relative to that position are rewritten.
- * The taps that would reach beyond line 1 / line 8 use "first" / "last" instead:
- * the outer line (line 0 / line 9) if it is close enough to its inner neighbour,
- * otherwise the inner neighbour (line 1 / line 8) itself.
- *
- * NOTE(review): the MMX2/3DNow path accepts the outer line when the difference is <= QP,
- * the C path only when it is < QP.
- * NOTE(review): the asm increments operand %0 (src), which is declared as an input,
- * and restores it with the final subl.
188
- */
189
-static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
190
-{
191
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
192
-	src+= stride*3;
193
-	asm volatile(	//"movv %0 %1 %2\n\t"
194
-		"movq %2, %%mm0			\n\t"  // QP,..., QP
195
-		"pxor %%mm4, %%mm4				\n\t"
196
-
197
-		"movq (%0), %%mm6				\n\t"
198
-		"movq (%0, %1), %%mm5				\n\t"
199
-		"movq %%mm5, %%mm1				\n\t"
200
-		"movq %%mm6, %%mm2				\n\t"
201
-		"psubusb %%mm6, %%mm5				\n\t"
202
-		"psubusb %%mm1, %%mm2				\n\t"
203
-		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
204
-		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
205
-		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
206
-
207
-		"pand %%mm2, %%mm6				\n\t"
208
-		"pandn %%mm1, %%mm2				\n\t"
209
-		"por %%mm2, %%mm6				\n\t"// First Line to Filter
210
-
211
-		"movq (%0, %1, 8), %%mm5			\n\t"
212
-		"leal (%0, %1, 4), %%eax			\n\t"
213
-		"leal (%0, %1, 8), %%ecx			\n\t"
214
-		"subl %1, %%ecx					\n\t"
215
-		"addl %1, %0					\n\t" // %0 points to line 1 not 0
216
-		"movq (%0, %1, 8), %%mm7			\n\t"
217
-		"movq %%mm5, %%mm1				\n\t"
218
-		"movq %%mm7, %%mm2				\n\t"
219
-		"psubusb %%mm7, %%mm5				\n\t"
220
-		"psubusb %%mm1, %%mm2				\n\t"
221
-		"por %%mm5, %%mm2				\n\t" // ABS Diff of lines
222
-		"psubusb %%mm0, %%mm2				\n\t" // diff <= QP -> 0
223
-		"pcmpeqb %%mm4, %%mm2			\n\t" // diff <= QP -> FF
224
-
225
-		"pand %%mm2, %%mm7				\n\t"
226
-		"pandn %%mm1, %%mm2				\n\t"
227
-		"por %%mm2, %%mm7				\n\t" // Last Line to Filter
228
-
229
-
230
-		// 	1	2	3	4	5	6	7	8
231
-		//	%0	%0+%1	%0+2%1	eax	%0+4%1	eax+2%1	ecx	eax+4%1
232
-		// 6 4 2 2 1 1
233
-		// 6 4 4 2
234
-		// 6 8 2
235
-
236
-		"movq (%0, %1), %%mm0				\n\t" //  1
237
-		"movq %%mm0, %%mm1				\n\t" //  1
238
-		PAVGB(%%mm6, %%mm0)				      //1 1	/2
239
-		PAVGB(%%mm6, %%mm0)				      //3 1	/4
240
-
241
-		"movq (%0, %1, 4), %%mm2			\n\t" //     1
242
-		"movq %%mm2, %%mm5				\n\t" //     1
243
-		PAVGB((%%eax), %%mm2)				      //    11	/2
244
-		PAVGB((%0, %1, 2), %%mm2)			      //   211	/4
245
-		"movq %%mm2, %%mm3				\n\t" //   211	/4
246
-		"movq (%0), %%mm4				\n\t" // 1
247
-		PAVGB(%%mm4, %%mm3)				      // 4 211	/8
248
-		PAVGB(%%mm0, %%mm3)				      //642211	/16
249
-		"movq %%mm3, (%0)				\n\t" // X
250
-		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
251
-		"movq %%mm1, %%mm0				\n\t" //  1
252
-		PAVGB(%%mm6, %%mm0)				      //1 1	/2
253
-		"movq %%mm4, %%mm3				\n\t" // 1
254
-		PAVGB((%0,%1,2), %%mm3)				      // 1 1	/2
255
-		PAVGB((%%eax,%1,2), %%mm5)			      //     11	/2
256
-		PAVGB((%%eax), %%mm5)				      //    211 /4
257
-		PAVGB(%%mm5, %%mm3)				      // 2 2211 /8
258
-		PAVGB(%%mm0, %%mm3)				      //4242211 /16
259
-		"movq %%mm3, (%0,%1)				\n\t" //  X
260
-		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
261
-		PAVGB(%%mm4, %%mm6)				      //11	/2
262
-		"movq (%%ecx), %%mm0				\n\t" //       1
263
-		PAVGB((%%eax, %1, 2), %%mm0)			      //      11/2
264
-		"movq %%mm0, %%mm3				\n\t" //      11/2
265
-		PAVGB(%%mm1, %%mm0)				      //  2   11/4
266
-		PAVGB(%%mm6, %%mm0)				      //222   11/8
267
-		PAVGB(%%mm2, %%mm0)				      //22242211/16
268
-		"movq (%0, %1, 2), %%mm2			\n\t" //   1
269
-		"movq %%mm0, (%0, %1, 2)			\n\t" //   X
270
-		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
271
-		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
272
-		PAVGB((%%ecx), %%mm0)				      //       11	/2
273
-		PAVGB(%%mm0, %%mm6)				      //11     11	/4
274
-		PAVGB(%%mm1, %%mm4)				      // 11		/2
275
-		PAVGB(%%mm2, %%mm1)				      //  11		/2
276
-		PAVGB(%%mm1, %%mm6)				      //1122   11	/8
277
-		PAVGB(%%mm5, %%mm6)				      //112242211	/16
278
-		"movq (%%eax), %%mm5				\n\t" //    1
279
-		"movq %%mm6, (%%eax)				\n\t" //    X
280
-		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
281
-		"movq (%%eax, %1, 4), %%mm6			\n\t" //        1
282
-		PAVGB(%%mm7, %%mm6)				      //        11	/2
283
-		PAVGB(%%mm4, %%mm6)				      // 11     11	/4
284
-		PAVGB(%%mm3, %%mm6)				      // 11   2211	/8
285
-		PAVGB(%%mm5, %%mm2)				      //   11		/2
286
-		"movq (%0, %1, 4), %%mm4			\n\t" //     1
287
-		PAVGB(%%mm4, %%mm2)				      //   112		/4
288
-		PAVGB(%%mm2, %%mm6)				      // 112242211	/16
289
-		"movq %%mm6, (%0, %1, 4)			\n\t" //     X
290
-		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
291
-		PAVGB(%%mm7, %%mm1)				      //  11     2	/4
292
-		PAVGB(%%mm4, %%mm5)				      //    11		/2
293
-		PAVGB(%%mm5, %%mm0)				      //    11 11	/4
294
-		"movq (%%eax, %1, 2), %%mm6			\n\t" //      1
295
-		PAVGB(%%mm6, %%mm1)				      //  11  4  2	/8
296
-		PAVGB(%%mm0, %%mm1)				      //  11224222	/16
297
-		"movq %%mm1, (%%eax, %1, 2)			\n\t" //      X
298
-		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
299
-		PAVGB((%%ecx), %%mm2)				      //   112 4	/8
300
-		"movq (%%eax, %1, 4), %%mm0			\n\t" //        1
301
-		PAVGB(%%mm0, %%mm6)				      //      1 1	/2
302
-		PAVGB(%%mm7, %%mm6)				      //      1 12	/4
303
-		PAVGB(%%mm2, %%mm6)				      //   1122424	/4
304
-		"movq %%mm6, (%%ecx)				\n\t" //       X
305
-		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
306
-		PAVGB(%%mm7, %%mm5)				      //    11   2	/4
307
-		PAVGB(%%mm7, %%mm5)				      //    11   6	/8
308
-
309
-		PAVGB(%%mm3, %%mm0)				      //      112	/4
310
-		PAVGB(%%mm0, %%mm5)				      //    112246	/16
311
-		"movq %%mm5, (%%eax, %1, 4)			\n\t" //        X
312
-		"subl %1, %0					\n\t" // undo the addl above, %0 points to line 0 again
313
-
314
-		:
315
-		: "r" (src), "r" (stride), "m" (c->pQPb)
316
-		: "%eax", "%ecx"
317
-	);
318
-#else
319
-	const int l1= stride;
320
-	const int l2= stride + l1;
321
-	const int l3= stride + l2;
322
-	const int l4= stride + l3;
323
-	const int l5= stride + l4;
324
-	const int l6= stride + l5;
325
-	const int l7= stride + l6;
326
-	const int l8= stride + l7;
327
-	const int l9= stride + l8;
328
-	int x;
329
-	src+= stride*3;
330
-	for(x=0; x<BLOCK_SIZE; x++)
331
-	{
332
-		const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; // replacement for the taps above line 1
333
-		const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; // replacement for the taps below line 8
334
-
335
-		int sums[9]; // sums of vertically adjacent samples, shared by the 8 output lines
336
-		sums[0] = first + src[l1];
337
-		sums[1] = src[l1] + src[l2];
338
-		sums[2] = src[l2] + src[l3];
339
-		sums[3] = src[l3] + src[l4];
340
-		sums[4] = src[l4] + src[l5];
341
-		sums[5] = src[l5] + src[l6];
342
-		sums[6] = src[l6] + src[l7];
343
-		sums[7] = src[l7] + src[l8];
344
-		sums[8] = src[l8] + last;
345
-
346
-		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; // +8 rounds before the division by 16
347
-		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
348
-		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
349
-		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
350
-		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
351
-		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
352
-		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
353
-		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;
354
-
355
-		src++;
356
-	}
357
-#endif
358
-}
359
-
360
-#if 0
361
-/**
362
- * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
363
- * values are correctly clipped (MMX2)
364
- * values are wraparound (C)
365
- * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient
366
-	0 8 16 24
367
-	x = 8
368
-	x/2 = 4
369
-	x/8 = 1
370
-	1 12 12 23
- *
- * NOTE(review): this function sits inside an "#if 0" block and is not compiled.
- * NOTE(review): the MMX2/3DNow path does not use the QP parameter, it reads MANGLE(pQPb) instead.
371
- */
372
-static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
373
-{
374
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
375
-	src+= stride*3;
376
-// FIXME rounding
377
-	asm volatile(
378
-		"pxor %%mm7, %%mm7				\n\t" // 0
379
-		"movq "MANGLE(b80)", %%mm6			\n\t" // MIN_SIGNED_BYTE
380
-		"leal (%0, %1), %%eax				\n\t"
381
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
382
-//	0	1	2	3	4	5	6	7	8	9
383
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
384
-		"movq "MANGLE(pQPb)", %%mm0			\n\t" // QP,..., QP
385
-		"movq %%mm0, %%mm1				\n\t" // QP,..., QP
386
-		"paddusb "MANGLE(b02)", %%mm0			\n\t"
387
-		"psrlw $2, %%mm0				\n\t"
388
-		"pand "MANGLE(b3F)", %%mm0			\n\t" // QP/4,..., QP/4
389
-		"paddusb %%mm1, %%mm0				\n\t" // QP*1.25 ...
390
-		"movq (%0, %1, 4), %%mm2			\n\t" // line 4
391
-		"movq (%%ecx), %%mm3				\n\t" // line 5
392
-		"movq %%mm2, %%mm4				\n\t" // line 4
393
-		"pcmpeqb %%mm5, %%mm5				\n\t" // -1
394
-		"pxor %%mm2, %%mm5				\n\t" // -line 4 - 1
395
-		PAVGB(%%mm3, %%mm5)
396
-		"paddb %%mm6, %%mm5				\n\t" // (l5-l4)/2
397
-		"psubusb %%mm3, %%mm4				\n\t"
398
-		"psubusb %%mm2, %%mm3				\n\t"
399
-		"por %%mm3, %%mm4				\n\t" // |l4 - l5|
400
-		"psubusb %%mm0, %%mm4				\n\t"
401
-		"pcmpeqb %%mm7, %%mm4				\n\t" // -1 where |l4 - l5| <= QP*1.25
402
-		"pand %%mm4, %%mm5				\n\t" // d/2
403
-
404
-//		"paddb %%mm6, %%mm2				\n\t" // line 4 + 0x80
405
-		"paddb %%mm5, %%mm2				\n\t"
406
-//		"psubb %%mm6, %%mm2				\n\t"
407
-		"movq %%mm2, (%0,%1, 4)				\n\t"
408
-
409
-		"movq (%%ecx), %%mm2				\n\t"
410
-//		"paddb %%mm6, %%mm2				\n\t" // line 5 + 0x80
411
-		"psubb %%mm5, %%mm2				\n\t"
412
-//		"psubb %%mm6, %%mm2				\n\t"
413
-		"movq %%mm2, (%%ecx)				\n\t"
414
-
415
-		"paddb %%mm6, %%mm5				\n\t"
416
-		"psrlw $2, %%mm5				\n\t"
417
-		"pand "MANGLE(b3F)", %%mm5			\n\t"
418
-		"psubb "MANGLE(b20)", %%mm5			\n\t" // (l5-l4)/8
419
-
420
-		"movq (%%eax, %1, 2), %%mm2			\n\t"
421
-		"paddb %%mm6, %%mm2				\n\t" // line 3 + 0x80
422
-		"paddsb %%mm5, %%mm2				\n\t"
423
-		"psubb %%mm6, %%mm2				\n\t"
424
-		"movq %%mm2, (%%eax, %1, 2)			\n\t"
425
-
426
-		"movq (%%ecx, %1), %%mm2			\n\t"
427
-		"paddb %%mm6, %%mm2				\n\t" // line 6 + 0x80
428
-		"psubsb %%mm5, %%mm2				\n\t"
429
-		"psubb %%mm6, %%mm2				\n\t"
430
-		"movq %%mm2, (%%ecx, %1)			\n\t"
431
-
432
-		:
433
-		: "r" (src), "r" (stride)
434
-		: "%eax", "%ecx"
435
-	);
436
-#else
437
- 	const int l1= stride;
438
-	const int l2= stride + l1;
439
-	const int l3= stride + l2;
440
-	const int l4= stride + l3;
441
-	const int l5= stride + l4;
442
-	const int l6= stride + l5;
443
-//	const int l7= stride + l6;
444
-//	const int l8= stride + l7;
445
-//	const int l9= stride + l8;
446
-	int x;
447
-	const int QP15= QP + (QP>>2); // QP*1.25, same threshold as built in the asm path
448
-	src+= stride*3;
449
-	for(x=0; x<BLOCK_SIZE; x++)
450
-	{
451
-		const int v = (src[x+l5] - src[x+l4]);
452
-		if(ABS(v) < QP15)
453
-		{
454
-			src[x+l3] +=v>>3; // no clipping here, see "values are wraparound (C)" in the header
455
-			src[x+l4] +=v>>1;
456
-			src[x+l5] -=v>>1;
457
-			src[x+l6] -=v>>3;
458
-
459
-		}
460
-	}
461
-
462
-#endif
463
-}
464
-#endif
465
-
466
-/**
467
- * Experimental Filter 1
468
- * will not damage linear gradients
469
- * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
470
- * can only smooth blocks at the expected locations (it can't smooth them if they did move)
471
- * MMX2 version does correct clipping, C version doesn't
- *
- * src is advanced by 3 lines; lines 2..7 relative to that position are modified.
- * NOTE(review): the MMX2/3DNow path filters when d <= 2*QP, the C path when d < 2*QP.
472
- */
473
-static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
474
-{
475
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
476
-	src+= stride*3;
477
-
478
-	asm volatile(
479
-		"pxor %%mm7, %%mm7				\n\t" // 0
480
-		"leal (%0, %1), %%eax				\n\t"
481
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
482
-//	0	1	2	3	4	5	6	7	8	9
483
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
484
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
485
-		"movq (%0, %1, 4), %%mm1			\n\t" // line 4
486
-		"movq %%mm1, %%mm2				\n\t" // line 4
487
-		"psubusb %%mm0, %%mm1				\n\t"
488
-		"psubusb %%mm2, %%mm0				\n\t"
489
-		"por %%mm1, %%mm0				\n\t" // |l3 - l4|
490
-		"movq (%%ecx), %%mm3				\n\t" // line 5
491
-		"movq (%%ecx, %1), %%mm4			\n\t" // line 6
492
-		"movq %%mm3, %%mm5				\n\t" // line 5
493
-		"psubusb %%mm4, %%mm3				\n\t"
494
-		"psubusb %%mm5, %%mm4				\n\t"
495
-		"por %%mm4, %%mm3				\n\t" // |l5 - l6|
496
-		PAVGB(%%mm3, %%mm0)				      // (|l3 - l4| + |l5 - l6|)/2
497
-		"movq %%mm2, %%mm1				\n\t" // line 4
498
-		"psubusb %%mm5, %%mm2				\n\t"
499
-		"movq %%mm2, %%mm4				\n\t"
500
-		"pcmpeqb %%mm7, %%mm2				\n\t" // (l4 - l5) <= 0 ? -1 : 0
501
-		"psubusb %%mm1, %%mm5				\n\t"
502
-		"por %%mm5, %%mm4				\n\t" // |l4 - l5|
503
-		"psubusb %%mm0, %%mm4		\n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
504
-		"movq %%mm4, %%mm3				\n\t" // d
505
-		"movq %2, %%mm0			\n\t" // QP,..., QP
506
-                "paddusb %%mm0, %%mm0				\n\t" // 2QP ... 2QP
507
-		"psubusb %%mm0, %%mm4				\n\t"
508
-		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= 2QP ? -1 : 0
509
-		"psubusb "MANGLE(b01)", %%mm3			\n\t" // MAX(d - 1, 0)
510
-		"pand %%mm4, %%mm3				\n\t" // d <= 2QP ? MAX(d - 1, 0) : 0
511
-
512
-		PAVGB(%%mm7, %%mm3)				      // d/2
513
-		"movq %%mm3, %%mm1				\n\t" // d/2
514
-		PAVGB(%%mm7, %%mm3)				      // d/4
515
-		PAVGB(%%mm1, %%mm3)				      // 3*d/8
516
-
517
-		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
518
-		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
519
-		"psubusb %%mm3, %%mm0				\n\t"
520
-		"pxor %%mm2, %%mm0				\n\t"
521
-		"movq %%mm0, (%0, %1, 4)			\n\t" // line 4
522
-
523
-		"movq (%%ecx), %%mm0				\n\t" // line 5
524
-		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
525
-		"paddusb %%mm3, %%mm0				\n\t"
526
-		"pxor %%mm2, %%mm0				\n\t"
527
-		"movq %%mm0, (%%ecx)				\n\t" // line 5
528
-
529
-		PAVGB(%%mm7, %%mm1)				      // d/4
530
-
531
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
532
-		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
533
-		"psubusb %%mm1, %%mm0				\n\t"
534
-		"pxor %%mm2, %%mm0				\n\t"
535
-		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
536
-
537
-		"movq (%%ecx, %1), %%mm0			\n\t" // line 6
538
-		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
539
-		"paddusb %%mm1, %%mm0				\n\t"
540
-		"pxor %%mm2, %%mm0				\n\t"
541
-		"movq %%mm0, (%%ecx, %1)			\n\t" // line 6
542
-
543
-		PAVGB(%%mm7, %%mm1)				      // d/8
544
-
545
-		"movq (%%eax, %1), %%mm0			\n\t" // line 2
546
-		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
547
-		"psubusb %%mm1, %%mm0				\n\t"
548
-		"pxor %%mm2, %%mm0				\n\t"
549
-		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
550
-
551
-		"movq (%%ecx, %1, 2), %%mm0			\n\t" // line 7
552
-		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
553
-		"paddusb %%mm1, %%mm0				\n\t"
554
-		"pxor %%mm2, %%mm0				\n\t"
555
-		"movq %%mm0, (%%ecx, %1, 2)			\n\t" // line 7
556
-
557
-		:
558
-		: "r" (src), "r" (stride), "m" (co->pQPb)
559
-		: "%eax", "%ecx"
560
-	);
561
-#else
562
-
563
- 	const int l1= stride;
564
-	const int l2= stride + l1;
565
-	const int l3= stride + l2;
566
-	const int l4= stride + l3;
567
-	const int l5= stride + l4;
568
-	const int l6= stride + l5;
569
-	const int l7= stride + l6;
570
-//	const int l8= stride + l7;
571
-//	const int l9= stride + l8;
572
-	int x;
573
-
574
-	src+= stride*3;
575
-	for(x=0; x<BLOCK_SIZE; x++)
576
-	{
577
-		int a= src[l3] - src[l4];
578
-		int b= src[l4] - src[l5]; // step between line 4 and line 5, the one being reduced below
579
-		int c= src[l5] - src[l6];
580
-
581
-		int d= ABS(b) - ((ABS(a) + ABS(c))>>1); // how much the middle step exceeds the average of its neighbours
582
-		d= MAX(d, 0);
583
-
584
-		if(d < co->QP*2)
585
-		{
586
-			int v = d * SIGN(-b);
587
-
588
-			src[l2] +=v>>3; // correction fades out with the distance from the l4/l5 step: 1/8, 1/4, 3/8
589
-			src[l3] +=v>>2;
590
-			src[l4] +=(3*v)>>3;
591
-			src[l5] -=(3*v)>>3;
592
-			src[l6] -=v>>2;
593
-			src[l7] -=v>>3;
594
-
595
-		}
596
-		src++;
597
-	}
598
-#endif
599
-}
600
-
601
-static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
602
-{
603
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
604
-/*
605
-	uint8_t tmp[16];
606
-	const int l1= stride;
607
-	const int l2= stride + l1;
608
-	const int l3= stride + l2;
609
-	const int l4= (int)tmp - (int)src - stride*3;
610
-	const int l5= (int)tmp - (int)src - stride*3 + 8;
611
-	const int l6= stride*3 + l3;
612
-	const int l7= stride + l6;
613
-	const int l8= stride + l7;
614
-
615
-	memcpy(tmp, src+stride*7, 8);
616
-	memcpy(tmp+8, src+stride*8, 8);
617
-*/
618
-	src+= stride*4;
619
-	asm volatile(
620
-
621
-#if 0 //sligtly more accurate and slightly slower
622
-		"pxor %%mm7, %%mm7				\n\t" // 0
623
-		"leal (%0, %1), %%eax				\n\t"
624
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
625
-//	0	1	2	3	4	5	6	7
626
-//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
627
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
628
-
629
-
630
-		"movq (%0, %1, 2), %%mm0			\n\t" // l2
631
-		"movq (%0), %%mm1				\n\t" // l0
632
-		"movq %%mm0, %%mm2				\n\t" // l2
633
-		PAVGB(%%mm7, %%mm0)				      // ~l2/2
634
-		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
635
-		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8
636
-
637
-		"movq (%%eax), %%mm1				\n\t" // l1
638
-		"movq (%%eax, %1, 2), %%mm3			\n\t" // l3
639
-		"movq %%mm1, %%mm4				\n\t" // l1
640
-		PAVGB(%%mm7, %%mm1)				      // ~l1/2
641
-		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
642
-		PAVGB(%%mm4, %%mm1)				      // ~(5l1 + 2l3)/8
643
-
644
-		"movq %%mm0, %%mm4				\n\t" // ~(5l2 + 2l0)/8
645
-		"psubusb %%mm1, %%mm0				\n\t"
646
-		"psubusb %%mm4, %%mm1				\n\t"
647
-		"por %%mm0, %%mm1				\n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
648
-// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
649
-
650
-		"movq (%0, %1, 4), %%mm0			\n\t" // l4
651
-		"movq %%mm0, %%mm4				\n\t" // l4
652
-		PAVGB(%%mm7, %%mm0)				      // ~l4/2
653
-		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
654
-		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8
655
-
656
-		"movq (%%ecx), %%mm2				\n\t" // l5
657
-		"movq %%mm3, %%mm5				\n\t" // l3
658
-		PAVGB(%%mm7, %%mm3)				      // ~l3/2
659
-		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
660
-		PAVGB(%%mm5, %%mm3)				      // ~(5l3 + 2l5)/8
661
-
662
-		"movq %%mm0, %%mm6				\n\t" // ~(5l4 + 2l2)/8
663
-		"psubusb %%mm3, %%mm0				\n\t"
664
-		"psubusb %%mm6, %%mm3				\n\t"
665
-		"por %%mm0, %%mm3				\n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
666
-		"pcmpeqb %%mm7, %%mm0				\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
667
-// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
668
-
669
-		"movq (%%ecx, %1), %%mm6			\n\t" // l6
670
-		"movq %%mm6, %%mm5				\n\t" // l6
671
-		PAVGB(%%mm7, %%mm6)				      // ~l6/2
672
-		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
673
-		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8
674
-
675
-		"movq (%%ecx, %1, 2), %%mm5			\n\t" // l7
676
-		"movq %%mm2, %%mm4				\n\t" // l5
677
-		PAVGB(%%mm7, %%mm2)				      // ~l5/2
678
-		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
679
-		PAVGB(%%mm4, %%mm2)				      // ~(5l5 + 2l7)/8
680
-
681
-		"movq %%mm6, %%mm4				\n\t" // ~(5l6 + 2l4)/8
682
-		"psubusb %%mm2, %%mm6				\n\t"
683
-		"psubusb %%mm4, %%mm2				\n\t"
684
-		"por %%mm6, %%mm2				\n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
685
-// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
686
-
687
-
688
-		PMINUB(%%mm2, %%mm1, %%mm4)			      // MIN(|lenergy|,|renergy|)/8
689
-		"movq %2, %%mm4					\n\t" // QP //FIXME QP+1 ?
690
-		"paddusb "MANGLE(b01)", %%mm4			\n\t"
691
-		"pcmpgtb %%mm3, %%mm4				\n\t" // |menergy|/8 < QP
692
-		"psubusb %%mm1, %%mm3				\n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
693
-		"pand %%mm4, %%mm3				\n\t"
694
-
695
-		"movq %%mm3, %%mm1				\n\t"
696
-//		"psubusb "MANGLE(b01)", %%mm3			\n\t"
697
-		PAVGB(%%mm7, %%mm3)
698
-		PAVGB(%%mm7, %%mm3)
699
-		"paddusb %%mm1, %%mm3				\n\t"
700
-//		"paddusb "MANGLE(b01)", %%mm3			\n\t"
701
-
702
-		"movq (%%eax, %1, 2), %%mm6			\n\t" //l3
703
-		"movq (%0, %1, 4), %%mm5			\n\t" //l4
704
-		"movq (%0, %1, 4), %%mm4			\n\t" //l4
705
-		"psubusb %%mm6, %%mm5				\n\t"
706
-		"psubusb %%mm4, %%mm6				\n\t"
707
-		"por %%mm6, %%mm5				\n\t" // |l3-l4|
708
-		"pcmpeqb %%mm7, %%mm6				\n\t" // SIGN(l3-l4)
709
-		"pxor %%mm6, %%mm0				\n\t"
710
-		"pand %%mm0, %%mm3				\n\t"
711
-		PMINUB(%%mm5, %%mm3, %%mm0)
712
-
713
-		"psubusb "MANGLE(b01)", %%mm3			\n\t"
714
-		PAVGB(%%mm7, %%mm3)
715
-
716
-		"movq (%%eax, %1, 2), %%mm0			\n\t"
717
-		"movq (%0, %1, 4), %%mm2			\n\t"
718
-		"pxor %%mm6, %%mm0				\n\t"
719
-		"pxor %%mm6, %%mm2				\n\t"
720
-		"psubb %%mm3, %%mm0				\n\t"
721
-		"paddb %%mm3, %%mm2				\n\t"
722
-		"pxor %%mm6, %%mm0				\n\t"
723
-		"pxor %%mm6, %%mm2				\n\t"
724
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
725
-		"movq %%mm2, (%0, %1, 4)			\n\t"
726
-#endif
727
-
728
-		"leal (%0, %1), %%eax				\n\t"
729
-		"pcmpeqb %%mm6, %%mm6				\n\t" // -1
730
-//	0	1	2	3	4	5	6	7
731
-//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ecx+%1	ecx+2%1
732
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1
733
-
734
-
735
-		"movq (%%eax, %1, 2), %%mm1			\n\t" // l3
736
-		"movq (%0, %1, 4), %%mm0			\n\t" // l4
737
-		"pxor %%mm6, %%mm1				\n\t" // -l3-1
738
-		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
739
-// mm1=-l3-1, mm0=128-q
740
-
741
-		"movq (%%eax, %1, 4), %%mm2			\n\t" // l5
742
-		"movq (%%eax, %1), %%mm3			\n\t" // l2
743
-		"pxor %%mm6, %%mm2				\n\t" // -l5-1
744
-		"movq %%mm2, %%mm5				\n\t" // -l5-1
745
-		"movq "MANGLE(b80)", %%mm4			\n\t" // 128
746
-		"leal (%%eax, %1, 4), %%ecx			\n\t"
747
-		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
748
-		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
749
-		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
750
-		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
751
-// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
752
-
753
-		"movq (%%eax), %%mm2				\n\t" // l1
754
-		"pxor %%mm6, %%mm2				\n\t" // -l1-1
755
-		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
756
-		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
757
-		"movq "MANGLE(b80)", %%mm3			\n\t" // 128
758
-		PAVGB(%%mm2, %%mm3)				      // ~(l2-l1)/4 + 128
759
-		PAVGB(%%mm1, %%mm3)				      // ~(l0-l3)/4 +(l2-l1)/8 + 128
760
-		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
761
-// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
762
-
763
-		PAVGB((%%ecx, %1), %%mm5)			      // (l6-l5+256)/2
764
-		"movq (%%ecx, %1, 2), %%mm1			\n\t" // l7
765
-		"pxor %%mm6, %%mm1				\n\t" // -l7-1
766
-		PAVGB((%0, %1, 4), %%mm1)			      // (l4-l7+256)/2
767
-		"movq "MANGLE(b80)", %%mm2			\n\t" // 128
768
-		PAVGB(%%mm5, %%mm2)				      // ~(l6-l5)/4 + 128
769
-		PAVGB(%%mm1, %%mm2)				      // ~(l4-l7)/4 +(l6-l5)/8 + 128
770
-		PAVGB(%%mm5, %%mm2)				      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
771
-// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
772
-
773
-		"movq "MANGLE(b00)", %%mm1			\n\t" // 0
774
-		"movq "MANGLE(b00)", %%mm5			\n\t" // 0
775
-		"psubb %%mm2, %%mm1				\n\t" // 128 - renergy/16
776
-		"psubb %%mm3, %%mm5				\n\t" // 128 - lenergy/16
777
-		PMAXUB(%%mm1, %%mm2)				      // 128 + |renergy/16|
778
- 		PMAXUB(%%mm5, %%mm3)				      // 128 + |lenergy/16|
779
-		PMINUB(%%mm2, %%mm3, %%mm1)			      // 128 + MIN(|lenergy|,|renergy|)/16
780
-
781
-// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
782
-
783
-		"movq "MANGLE(b00)", %%mm7			\n\t" // 0
784
-		"movq %2, %%mm2					\n\t" // QP
785
-		PAVGB(%%mm6, %%mm2)				      // 128 + QP/2
786
-		"psubb %%mm6, %%mm2				\n\t"
787
-
788
-		"movq %%mm4, %%mm1				\n\t"
789
-		"pcmpgtb %%mm7, %%mm1				\n\t" // SIGN(menergy)
790
-		"pxor %%mm1, %%mm4				\n\t"
791
-		"psubb %%mm1, %%mm4				\n\t" // 128 + |menergy|/16
792
-		"pcmpgtb %%mm4, %%mm2				\n\t" // |menergy|/16 < QP/2
793
-		"psubusb %%mm3, %%mm4				\n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
794
-// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
795
-
796
-		"movq %%mm4, %%mm3				\n\t" // d
797
-		"psubusb "MANGLE(b01)", %%mm4			\n\t"
798
-		PAVGB(%%mm7, %%mm4)				      // d/32
799
-		PAVGB(%%mm7, %%mm4)				      // (d + 32)/64
800
-		"paddb %%mm3, %%mm4				\n\t" // 5d/64
801
-		"pand %%mm2, %%mm4				\n\t"
802
-
803
-		"movq "MANGLE(b80)", %%mm5			\n\t" // 128
804
-		"psubb %%mm0, %%mm5				\n\t" // q
805
-		"paddsb %%mm6, %%mm5				\n\t" // fix bad rounding
806
-		"pcmpgtb %%mm5, %%mm7				\n\t" // SIGN(q)
807
-		"pxor %%mm7, %%mm5				\n\t"
808
-
809
-		PMINUB(%%mm5, %%mm4, %%mm3)			      // MIN(|q|, 5d/64)
810
-		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)
811
-
812
-		"pand %%mm7, %%mm4				\n\t"
813
-		"movq (%%eax, %1, 2), %%mm0			\n\t"
814
-		"movq (%0, %1, 4), %%mm2			\n\t"
815
-		"pxor %%mm1, %%mm0				\n\t"
816
-		"pxor %%mm1, %%mm2				\n\t"
817
-		"paddb %%mm4, %%mm0				\n\t"
818
-		"psubb %%mm4, %%mm2				\n\t"
819
-		"pxor %%mm1, %%mm0				\n\t"
820
-		"pxor %%mm1, %%mm2				\n\t"
821
-		"movq %%mm0, (%%eax, %1, 2)			\n\t"
822
-		"movq %%mm2, (%0, %1, 4)			\n\t"
823
-
824
-		:
825
-		: "r" (src), "r" (stride), "m" (c->pQPb)
826
-		: "%eax", "%ecx"
827
-	);
828
-
829
-/*
830
-	{
831
-	int x;
832
-	src-= stride;
833
-	for(x=0; x<BLOCK_SIZE; x++)
834
-	{
835
-		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
836
-		if(ABS(middleEnergy)< 8*QP)
837
-		{
838
-			const int q=(src[l4] - src[l5])/2;
839
-			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
840
-			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
841
-
842
-			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
843
-			d= MAX(d, 0);
844
-
845
-			d= (5*d + 32) >> 6;
846
-			d*= SIGN(-middleEnergy);
847
-
848
-			if(q>0)
849
-			{
850
-				d= d<0 ? 0 : d;
851
-				d= d>q ? q : d;
852
-			}
853
-			else
854
-			{
855
-				d= d>0 ? 0 : d;
856
-				d= d<q ? q : d;
857
-			}
858
-
859
-        		src[l4]-= d;
860
-	        	src[l5]+= d;
861
-		}
862
-		src++;
863
-	}
864
-src-=8;
865
-	for(x=0; x<8; x++)
866
-	{
867
-		int y;
868
-		for(y=4; y<6; y++)
869
-		{
870
-			int d= src[x+y*stride] - tmp[x+(y-4)*8];
871
-			int ad= ABS(d);
872
-			static int max=0;
873
-			static int sum=0;
874
-			static int num=0;
875
-			static int bias=0;
876
-
877
-			if(max<ad) max=ad;
878
-			sum+= ad>3 ? 1 : 0;
879
-			if(ad>3)
880
-			{
881
-				src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
882
-			}
883
-			if(y==4) bias+=d;
884
-			num++;
885
-			if(num%1000000 == 0)
886
-			{
887
-				printf(" %d %d %d %d\n", num, sum, max, bias);
888
-			}
889
-		}
890
-	}
891
-}
892
-*/
893
-#elif defined (HAVE_MMX)
894
-	src+= stride*4;
895
-	asm volatile(
896
-		"pxor %%mm7, %%mm7				\n\t"
897
-		"leal -40(%%esp), %%ecx				\n\t" // make space for 4 8-byte vars
898
-		"andl $0xFFFFFFF8, %%ecx			\n\t" // align
899
-//	0	1	2	3	4	5	6	7
900
-//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	edx+%1	edx+2%1
901
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1
902
-
903
-		"movq (%0), %%mm0				\n\t"
904
-		"movq %%mm0, %%mm1				\n\t"
905
-		"punpcklbw %%mm7, %%mm0				\n\t" // low part of line 0
906
-		"punpckhbw %%mm7, %%mm1				\n\t" // high part of line 0
907
-
908
-		"movq (%0, %1), %%mm2				\n\t"
909
-		"leal (%0, %1, 2), %%eax			\n\t"
910
-		"movq %%mm2, %%mm3				\n\t"
911
-		"punpcklbw %%mm7, %%mm2				\n\t" // low part of line 1
912
-		"punpckhbw %%mm7, %%mm3				\n\t" // high part of line 1
913
-
914
-		"movq (%%eax), %%mm4				\n\t"
915
-		"movq %%mm4, %%mm5				\n\t"
916
-		"punpcklbw %%mm7, %%mm4				\n\t" // low part of line 2
917
-		"punpckhbw %%mm7, %%mm5				\n\t" // high part of line 2
918
-
919
-		"paddw %%mm0, %%mm0				\n\t" // 2L0
920
-		"paddw %%mm1, %%mm1				\n\t" // 2H0
921
-		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
922
-		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
923
-		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
924
-		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2
925
-
926
-		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
927
-		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
928
-		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
929
-		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2
930
-
931
-		"movq (%%eax, %1), %%mm2			\n\t"
932
-		"movq %%mm2, %%mm3				\n\t"
933
-		"punpcklbw %%mm7, %%mm2				\n\t" // L3
934
-		"punpckhbw %%mm7, %%mm3				\n\t" // H3
935
-
936
-		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
937
-		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
938
-		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
939
-		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
940
-		"movq %%mm0, (%%ecx)				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
941
-		"movq %%mm1, 8(%%ecx)				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
942
-
943
-		"movq (%%eax, %1, 2), %%mm0			\n\t"
944
-		"movq %%mm0, %%mm1				\n\t"
945
-		"punpcklbw %%mm7, %%mm0				\n\t" // L4
946
-		"punpckhbw %%mm7, %%mm1				\n\t" // H4
947
-
948
-		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
949
-		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
950
-		"movq %%mm2, 16(%%ecx)				\n\t" // L3 - L4
951
-		"movq %%mm3, 24(%%ecx)				\n\t" // H3 - H4
952
-		"paddw %%mm4, %%mm4				\n\t" // 2L2
953
-		"paddw %%mm5, %%mm5				\n\t" // 2H2
954
-		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
955
-		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4
956
-
957
-		"leal (%%eax, %1), %0				\n\t"
958
-		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
959
-		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
960
-		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
961
-		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
962
-//50 opcodes so far
963
-		"movq (%0, %1, 2), %%mm2			\n\t"
964
-		"movq %%mm2, %%mm3				\n\t"
965
-		"punpcklbw %%mm7, %%mm2				\n\t" // L5
966
-		"punpckhbw %%mm7, %%mm3				\n\t" // H5
967
-		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
968
-		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
969
-		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
970
-		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5
971
-
972
-		"movq (%%eax, %1, 4), %%mm6			\n\t"
973
-		"punpcklbw %%mm7, %%mm6				\n\t" // L6
974
-		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
975
-		"movq (%%eax, %1, 4), %%mm6			\n\t"
976
-		"punpckhbw %%mm7, %%mm6				\n\t" // H6
977
-		"psubw %%mm6, %%mm3				\n\t" // H5 - H6
978
-
979
-		"paddw %%mm0, %%mm0				\n\t" // 2L4
980
-		"paddw %%mm1, %%mm1				\n\t" // 2H4
981
-		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
982
-		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6
983
-
984
-		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
985
-		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
986
-		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
987
-		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6
988
-
989
-		"movq (%0, %1, 4), %%mm2			\n\t"
990
-		"movq %%mm2, %%mm3				\n\t"
991
-		"punpcklbw %%mm7, %%mm2				\n\t" // L7
992
-		"punpckhbw %%mm7, %%mm3				\n\t" // H7
993
-
994
-		"paddw %%mm2, %%mm2				\n\t" // 2L7
995
-		"paddw %%mm3, %%mm3				\n\t" // 2H7
996
-		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
997
-		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7
998
-
999
-		"movq (%%ecx), %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
1000
-		"movq 8(%%ecx), %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
1001
-
1002
-#ifdef HAVE_MMX2
1003
-		"movq %%mm7, %%mm6				\n\t" // 0
1004
-		"psubw %%mm0, %%mm6				\n\t"
1005
-		"pmaxsw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1006
-		"movq %%mm7, %%mm6				\n\t" // 0
1007
-		"psubw %%mm1, %%mm6				\n\t"
1008
-		"pmaxsw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1009
-		"movq %%mm7, %%mm6				\n\t" // 0
1010
-		"psubw %%mm2, %%mm6				\n\t"
1011
-		"pmaxsw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1012
-		"movq %%mm7, %%mm6				\n\t" // 0
1013
-		"psubw %%mm3, %%mm6				\n\t"
1014
-		"pmaxsw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1015
-#else
1016
-		"movq %%mm7, %%mm6				\n\t" // 0
1017
-		"pcmpgtw %%mm0, %%mm6				\n\t"
1018
-		"pxor %%mm6, %%mm0				\n\t"
1019
-		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
1020
-		"movq %%mm7, %%mm6				\n\t" // 0
1021
-		"pcmpgtw %%mm1, %%mm6				\n\t"
1022
-		"pxor %%mm6, %%mm1				\n\t"
1023
-		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
1024
-		"movq %%mm7, %%mm6				\n\t" // 0
1025
-		"pcmpgtw %%mm2, %%mm6				\n\t"
1026
-		"pxor %%mm6, %%mm2				\n\t"
1027
-		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
1028
-		"movq %%mm7, %%mm6				\n\t" // 0
1029
-		"pcmpgtw %%mm3, %%mm6				\n\t"
1030
-		"pxor %%mm6, %%mm3				\n\t"
1031
-		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
1032
-#endif
1033
-
1034
-#ifdef HAVE_MMX2
1035
-		"pminsw %%mm2, %%mm0				\n\t"
1036
-		"pminsw %%mm3, %%mm1				\n\t"
1037
-#else
1038
-		"movq %%mm0, %%mm6				\n\t"
1039
-		"psubusw %%mm2, %%mm6				\n\t"
1040
-		"psubw %%mm6, %%mm0				\n\t"
1041
-		"movq %%mm1, %%mm6				\n\t"
1042
-		"psubusw %%mm3, %%mm6				\n\t"
1043
-		"psubw %%mm6, %%mm1				\n\t"
1044
-#endif
1045
-
1046
-		"movq %%mm7, %%mm6				\n\t" // 0
1047
-		"pcmpgtw %%mm4, %%mm6				\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
1048
-		"pxor %%mm6, %%mm4				\n\t"
1049
-		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
1050
-		"pcmpgtw %%mm5, %%mm7				\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
1051
-		"pxor %%mm7, %%mm5				\n\t"
1052
-		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
1053
-// 100 opcodes
1054
-		"movd %2, %%mm2					\n\t" // QP
1055
-		"psllw $3, %%mm2				\n\t" // 8QP
1056
-		"movq %%mm2, %%mm3				\n\t" // 8QP
1057
-		"pcmpgtw %%mm4, %%mm2				\n\t"
1058
-		"pcmpgtw %%mm5, %%mm3				\n\t"
1059
-		"pand %%mm2, %%mm4				\n\t"
1060
-		"pand %%mm3, %%mm5				\n\t"
1061
-
1062
-
1063
-		"psubusw %%mm0, %%mm4				\n\t" // hd
1064
-		"psubusw %%mm1, %%mm5				\n\t" // ld
1065
-
1066
-
1067
-		"movq "MANGLE(w05)", %%mm2			\n\t" // 5
1068
-		"pmullw %%mm2, %%mm4				\n\t"
1069
-		"pmullw %%mm2, %%mm5				\n\t"
1070
-		"movq "MANGLE(w20)", %%mm2			\n\t" // 32
1071
-		"paddw %%mm2, %%mm4				\n\t"
1072
-		"paddw %%mm2, %%mm5				\n\t"
1073
-		"psrlw $6, %%mm4				\n\t"
1074
-		"psrlw $6, %%mm5				\n\t"
1075
-
1076
-		"movq 16(%%ecx), %%mm0				\n\t" // L3 - L4
1077
-		"movq 24(%%ecx), %%mm1				\n\t" // H3 - H4
1078
-
1079
-		"pxor %%mm2, %%mm2				\n\t"
1080
-		"pxor %%mm3, %%mm3				\n\t"
1081
-
1082
-		"pcmpgtw %%mm0, %%mm2				\n\t" // sign (L3-L4)
1083
-		"pcmpgtw %%mm1, %%mm3				\n\t" // sign (H3-H4)
1084
-		"pxor %%mm2, %%mm0				\n\t"
1085
-		"pxor %%mm3, %%mm1				\n\t"
1086
-		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
1087
-		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
1088
-		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
1089
-		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2
1090
-
1091
-		"pxor %%mm6, %%mm2				\n\t"
1092
-		"pxor %%mm7, %%mm3				\n\t"
1093
-		"pand %%mm2, %%mm4				\n\t"
1094
-		"pand %%mm3, %%mm5				\n\t"
1095
-
1096
-#ifdef HAVE_MMX2
1097
-		"pminsw %%mm0, %%mm4				\n\t"
1098
-		"pminsw %%mm1, %%mm5				\n\t"
1099
-#else
1100
-		"movq %%mm4, %%mm2				\n\t"
1101
-		"psubusw %%mm0, %%mm2				\n\t"
1102
-		"psubw %%mm2, %%mm4				\n\t"
1103
-		"movq %%mm5, %%mm2				\n\t"
1104
-		"psubusw %%mm1, %%mm2				\n\t"
1105
-		"psubw %%mm2, %%mm5				\n\t"
1106
-#endif
1107
-		"pxor %%mm6, %%mm4				\n\t"
1108
-		"pxor %%mm7, %%mm5				\n\t"
1109
-		"psubw %%mm6, %%mm4				\n\t"
1110
-		"psubw %%mm7, %%mm5				\n\t"
1111
-		"packsswb %%mm5, %%mm4				\n\t"
1112
-		"movq (%0), %%mm0				\n\t"
1113
-		"paddb   %%mm4, %%mm0				\n\t"
1114
-		"movq %%mm0, (%0)				\n\t"
1115
-		"movq (%0, %1), %%mm0				\n\t"
1116
-		"psubb %%mm4, %%mm0				\n\t"
1117
-		"movq %%mm0, (%0, %1)				\n\t"
1118
-
1119
-		: "+r" (src)
1120
-		: "r" (stride), "m" (c->pQPb)
1121
-		: "%eax", "%ecx"
1122
-	);
1123
-#else
1124
-	const int l1= stride;
1125
-	const int l2= stride + l1;
1126
-	const int l3= stride + l2;
1127
-	const int l4= stride + l3;
1128
-	const int l5= stride + l4;
1129
-	const int l6= stride + l5;
1130
-	const int l7= stride + l6;
1131
-	const int l8= stride + l7;
1132
-//	const int l9= stride + l8;
1133
-	int x;
1134
-	src+= stride*3;
1135
-	for(x=0; x<BLOCK_SIZE; x++)
1136
-	{
1137
-		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1138
-		if(ABS(middleEnergy) < 8*c->QP)
1139
-		{
1140
-			const int q=(src[l4] - src[l5])/2;
1141
-			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1142
-			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1143
-
1144
-			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1145
-			d= MAX(d, 0);
1146
-
1147
-			d= (5*d + 32) >> 6;
1148
-			d*= SIGN(-middleEnergy);
1149
-
1150
-			if(q>0)
1151
-			{
1152
-				d= d<0 ? 0 : d;
1153
-				d= d>q ? q : d;
1154
-			}
1155
-			else
1156
-			{
1157
-				d= d>0 ? 0 : d;
1158
-				d= d<q ? q : d;
1159
-			}
1160
-
1161
-        		src[l4]-= d;
1162
-	        	src[l5]+= d;
1163
-		}
1164
-		src++;
1165
-	}
1166
-#endif
1167
-}
1168
-
1169
-static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) // dering filter for one block: smooths flagged pixels unless the block's max-min range is below deringThreshold
1170
-{
1171
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1172
-	asm volatile(
1173
-		"pxor %%mm6, %%mm6				\n\t" // mm6 = 0: running per-byte maximum
1174
-		"pcmpeqb %%mm7, %%mm7				\n\t" // mm7 = all ones: running per-byte minimum
1175
-		"movq %2, %%mm0					\n\t" // load c->pQPb
1176
-		"punpcklbw %%mm6, %%mm0				\n\t"
1177
-		"psrlw $1, %%mm0				\n\t" // /2
1178
-		"psubw %%mm7, %%mm0				\n\t" // mm7 is -1 in every word, so this adds 1
1179
-		"packuswb %%mm0, %%mm0				\n\t"
1180
-		"movq %%mm0, %3					\n\t" // store to c->pQPb2; NOTE(review): operand %3 is declared as an input ("m") but is written here
1181
-
1182
-		"leal (%0, %1), %%eax				\n\t" // eax = src + stride
1183
-		"leal (%%eax, %1, 4), %%edx			\n\t" // edx = src + 5*stride
1184
-		
1185
-//	0	1	2	3	4	5	6	7	8	9
1186
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1187
-
1188
-#undef FIND_MIN_MAX
1189
-#ifdef HAVE_MMX2
1190
-#define FIND_MIN_MAX(addr)\
1191
-		"movq " #addr ", %%mm0				\n\t"\
1192
-		"pminub %%mm0, %%mm7				\n\t"\
1193
-		"pmaxub %%mm0, %%mm6				\n\t"
1194
-#else
1195
-#define FIND_MIN_MAX(addr)\
1196
-		"movq " #addr ", %%mm0				\n\t"\
1197
-		"movq %%mm7, %%mm1				\n\t"\
1198
-		"psubusb %%mm0, %%mm6				\n\t"\
1199
-		"paddb %%mm0, %%mm6				\n\t"\
1200
-		"psubusb %%mm0, %%mm1				\n\t"\
1201
-		"psubb %%mm1, %%mm7				\n\t"
1202
-#endif
1203
-
1204
-FIND_MIN_MAX((%%eax))
1205
-FIND_MIN_MAX((%%eax, %1))
1206
-FIND_MIN_MAX((%%eax, %1, 2))
1207
-FIND_MIN_MAX((%0, %1, 4))
1208
-FIND_MIN_MAX((%%edx))
1209
-FIND_MIN_MAX((%%edx, %1))
1210
-FIND_MIN_MAX((%%edx, %1, 2))
1211
-FIND_MIN_MAX((%0, %1, 8))
1212
-
1213
-		"movq %%mm7, %%mm4				\n\t"
1214
-		"psrlq $8, %%mm7				\n\t"
1215
-#ifdef HAVE_MMX2
1216
-		"pminub %%mm4, %%mm7				\n\t" // min of pixels
1217
-		"pshufw $0xF9, %%mm7, %%mm4			\n\t"
1218
-		"pminub %%mm4, %%mm7				\n\t" // min of pixels
1219
-		"pshufw $0xFE, %%mm7, %%mm4			\n\t"
1220
-		"pminub %%mm4, %%mm7				\n\t"
1221
-#else
1222
-		"movq %%mm7, %%mm1				\n\t"
1223
-		"psubusb %%mm4, %%mm1				\n\t"
1224
-		"psubb %%mm1, %%mm7				\n\t"
1225
-		"movq %%mm7, %%mm4				\n\t"
1226
-		"psrlq $16, %%mm7				\n\t"
1227
-		"movq %%mm7, %%mm1				\n\t"
1228
-		"psubusb %%mm4, %%mm1				\n\t"
1229
-		"psubb %%mm1, %%mm7				\n\t"
1230
-		"movq %%mm7, %%mm4				\n\t"
1231
-		"psrlq $32, %%mm7				\n\t"
1232
-		"movq %%mm7, %%mm1				\n\t"
1233
-		"psubusb %%mm4, %%mm1				\n\t"
1234
-		"psubb %%mm1, %%mm7				\n\t"
1235
-#endif
1236
-
1237
-
1238
-		"movq %%mm6, %%mm4				\n\t"
1239
-		"psrlq $8, %%mm6				\n\t"
1240
-#ifdef HAVE_MMX2
1241
-		"pmaxub %%mm4, %%mm6				\n\t" // max of pixels
1242
-		"pshufw $0xF9, %%mm6, %%mm4			\n\t"
1243
-		"pmaxub %%mm4, %%mm6				\n\t"
1244
-		"pshufw $0xFE, %%mm6, %%mm4			\n\t"
1245
-		"pmaxub %%mm4, %%mm6				\n\t"
1246
-#else
1247
-		"psubusb %%mm4, %%mm6				\n\t"
1248
-		"paddb %%mm4, %%mm6				\n\t"
1249
-		"movq %%mm6, %%mm4				\n\t"
1250
-		"psrlq $16, %%mm6				\n\t"
1251
-		"psubusb %%mm4, %%mm6				\n\t"
1252
-		"paddb %%mm4, %%mm6				\n\t"
1253
-		"movq %%mm6, %%mm4				\n\t"
1254
-		"psrlq $32, %%mm6				\n\t"
1255
-		"psubusb %%mm4, %%mm6				\n\t"
1256
-		"paddb %%mm4, %%mm6				\n\t"
1257
-#endif
1258
-		"movq %%mm6, %%mm0				\n\t" // max
1259
-		"psubb %%mm7, %%mm6				\n\t" // max - min
1260
-		"movd %%mm6, %%ecx				\n\t" // cl = low byte of (max - min)
1261
-		"cmpb "MANGLE(deringThreshold)", %%cl		\n\t"
1262
-		" jb 1f						\n\t" // range below deringThreshold: skip the filter
1263
-		"leal -24(%%esp), %%ecx				\n\t" // ecx = scratch area below the stack pointer; NOTE(review): nothing visible here reserves that memory
1264
-		"andl $0xFFFFFFF8, %%ecx			\n\t" 
1265
-		PAVGB(%%mm0, %%mm7)				      // a=(max + min)/2
1266
-		"punpcklbw %%mm7, %%mm7				\n\t"
1267
-		"punpcklbw %%mm7, %%mm7				\n\t"
1268
-		"punpcklbw %%mm7, %%mm7				\n\t"
1269
-		"movq %%mm7, (%%ecx)				\n\t" // (ecx) = low byte of a replicated into all 8 bytes
1270
-
1271
-		"movq (%0), %%mm0				\n\t" // L10
1272
-		"movq %%mm0, %%mm1				\n\t" // L10
1273
-		"movq %%mm0, %%mm2				\n\t" // L10
1274
-		"psllq $8, %%mm1				\n\t"
1275
-		"psrlq $8, %%mm2				\n\t"
1276
-		"movd -4(%0), %%mm3				\n\t"
1277
-		"movd 8(%0), %%mm4				\n\t"
1278
-		"psrlq $24, %%mm3				\n\t"
1279
-		"psllq $56, %%mm4				\n\t"
1280
-		"por %%mm3, %%mm1				\n\t" // L00
1281
-		"por %%mm4, %%mm2				\n\t" // L20
1282
-		"movq %%mm1, %%mm3				\n\t" // L00
1283
-		PAVGB(%%mm2, %%mm1)				      // (L20 + L00)/2
1284
-		PAVGB(%%mm0, %%mm1)				      // (L20 + L00 + 2L10)/4
1285
-		"psubusb %%mm7, %%mm0				\n\t"
1286
-		"psubusb %%mm7, %%mm2				\n\t"
1287
-		"psubusb %%mm7, %%mm3				\n\t"
1288
-		"pcmpeqb "MANGLE(b00)", %%mm0			\n\t" // L10 > a ? 0 : -1
1289
-		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L20 > a ? 0 : -1
1290
-		"pcmpeqb "MANGLE(b00)", %%mm3			\n\t" // L00 > a ? 0 : -1
1291
-		"paddb %%mm2, %%mm0				\n\t"
1292
-		"paddb %%mm3, %%mm0				\n\t"
1293
-
1294
-		"movq (%%eax), %%mm2				\n\t" // L11
1295
-		"movq %%mm2, %%mm3				\n\t" // L11
1296
-		"movq %%mm2, %%mm4				\n\t" // L11
1297
-		"psllq $8, %%mm3				\n\t"
1298
-		"psrlq $8, %%mm4				\n\t"
1299
-		"movd -4(%%eax), %%mm5				\n\t"
1300
-		"movd 8(%%eax), %%mm6				\n\t"
1301
-		"psrlq $24, %%mm5				\n\t"
1302
-		"psllq $56, %%mm6				\n\t"
1303
-		"por %%mm5, %%mm3				\n\t" // L01
1304
-		"por %%mm6, %%mm4				\n\t" // L21
1305
-		"movq %%mm3, %%mm5				\n\t" // L01
1306
-		PAVGB(%%mm4, %%mm3)				      // (L21 + L01)/2
1307
-		PAVGB(%%mm2, %%mm3)				      // (L21 + L01 + 2L11)/4
1308
-		"psubusb %%mm7, %%mm2				\n\t"
1309
-		"psubusb %%mm7, %%mm4				\n\t"
1310
-		"psubusb %%mm7, %%mm5				\n\t"
1311
-		"pcmpeqb "MANGLE(b00)", %%mm2			\n\t" // L11 > a ? 0 : -1
1312
-		"pcmpeqb "MANGLE(b00)", %%mm4			\n\t" // L21 > a ? 0 : -1
1313
-		"pcmpeqb "MANGLE(b00)", %%mm5			\n\t" // L01 > a ? 0 : -1
1314
-		"paddb %%mm4, %%mm2				\n\t"
1315
-		"paddb %%mm5, %%mm2				\n\t"
1316
-// 0, 2, 3, 1
1317
-#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1318
-		"movq " #src ", " #sx "				\n\t" /* src[0] */\
1319
-		"movq " #sx ", " #lx "				\n\t" /* src[0] */\
1320
-		"movq " #sx ", " #t0 "				\n\t" /* src[0] */\
1321
-		"psllq $8, " #lx "				\n\t"\
1322
-		"psrlq $8, " #t0 "				\n\t"\
1323
-		"movd -4" #src ", " #t1 "			\n\t"\
1324
-		"psrlq $24, " #t1 "				\n\t"\
1325
-		"por " #t1 ", " #lx "				\n\t" /* src[-1] */\
1326
-		"movd 8" #src ", " #t1 "			\n\t"\
1327
-		"psllq $56, " #t1 "				\n\t"\
1328
-		"por " #t1 ", " #t0 "				\n\t" /* src[+1] */\
1329
-		"movq " #lx ", " #t1 "				\n\t" /* src[-1] */\
1330
-		PAVGB(t0, lx)				              /* (src[-1] + src[+1])/2 */\
1331
-		PAVGB(sx, lx)				      /* (src[-1] + 2src[0] + src[+1])/4 */\
1332
-		PAVGB(lx, pplx)					     \
1333
-		"movq " #lx ", 8(%%ecx)				\n\t"\
1334
-		"movq (%%ecx), " #lx "				\n\t"\
1335
-		"psubusb " #lx ", " #t1 "			\n\t"\
1336
-		"psubusb " #lx ", " #t0 "			\n\t"\
1337
-		"psubusb " #lx ", " #sx "			\n\t"\
1338
-		"movq "MANGLE(b00)", " #lx "			\n\t"\
1339
-		"pcmpeqb " #lx ", " #t1 "			\n\t" /* src[-1] > a ? 0 : -1*/\
1340
-		"pcmpeqb " #lx ", " #t0 "			\n\t" /* src[+1] > a ? 0 : -1*/\
1341
-		"pcmpeqb " #lx ", " #sx "			\n\t" /* src[0]  > a ? 0 : -1*/\
1342
-		"paddb " #t1 ", " #t0 "				\n\t"\
1343
-		"paddb " #t0 ", " #sx "				\n\t"\
1344
-\
1345
-		PAVGB(plx, pplx)				      /* filtered */\
1346
-		"movq " #dst ", " #t0 "				\n\t" /* dst */\
1347
-		"movq " #t0 ", " #t1 "				\n\t" /* dst */\
1348
-		"psubusb %3, " #t0 "				\n\t"\
1349
-		"paddusb %3, " #t1 "				\n\t"\
1350
-		PMAXUB(t0, pplx)\
1351
-		PMINUB(t1, pplx, t0)\
1352
-		"paddb " #sx ", " #ppsx "			\n\t"\
1353
-		"paddb " #psx ", " #ppsx "			\n\t"\
1354
-		"#paddb "MANGLE(b02)", " #ppsx "		\n\t"\
1355
-		"pand "MANGLE(b08)", " #ppsx "			\n\t"\
1356
-		"pcmpeqb " #lx ", " #ppsx "			\n\t"\
1357
-		"pand " #ppsx ", " #pplx "			\n\t"\
1358
-		"pandn " #dst ", " #ppsx "			\n\t"\
1359
-		"por " #pplx ", " #ppsx "			\n\t"\
1360
-		"movq " #ppsx ", " #dst "			\n\t"\
1361
-		"movq 8(%%ecx), " #lx "				\n\t"
1362
-
1363
-/*
1364
-0000000
1365
-1111111
1366
-
1367
-1111110
1368
-1111101
1369
-1111100
1370
-1111011
1371
-1111010
1372
-1111001
1373
-
1374
-1111000
1375
-1110111
1376
-
1377
-*/
1378
-//DERING_CORE(dst,src                  ,ppsx ,psx  ,sx   ,pplx ,plx  ,lx   ,t0   ,t1)
1379
-DERING_CORE((%%eax),(%%eax, %1)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1380
-DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1381
-DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1382
-DERING_CORE((%0, %1, 4),(%%edx)        ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1383
-DERING_CORE((%%edx),(%%edx, %1)        ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1384
-DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1385
-DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1386
-DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1387
-
1388
-		"1:			\n\t" // target of the early exit taken when the range is below deringThreshold
1389
-		: : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2)
1390
-		: "%eax", "%edx", "%ecx"
1391
-	);
1392
-#else // plain C implementation
1393
-	int y;
1394
-	int min=255;
1395
-	int max=0;
1396
-	int avg;
1397
-	uint8_t *p;
1398
-	int s[10];
1399
-	const int QP2= c->QP/2 + 1; // largest change applied to any pixel
1400
-
1401
-	for(y=1; y<9; y++) // pass 1: min/max over rows 1..8
1402
-	{
1403
-		int x;
1404
-		p= src + stride*y;
1405
-		for(x=1; x<9; x++) // columns 1..8 (p is advanced before it is read)
1406
-		{
1407
-			p++;
1408
-			if(*p > max) max= *p;
1409
-			if(*p < min) min= *p;
1410
-		}
1411
-	}
1412
-	avg= (min + max + 1)>>1; // midpoint of the range, rounded up
1413
-
1414
-	if(max - min <deringThreshold) return; // range too small: leave the block untouched
1415
-
1416
-	for(y=0; y<10; y++) // pass 2: for rows 0..9 build a mask, bit x set when pixel x > avg
1417
-	{
1418
-		int t = 0;
1419
-
1420
-		if(src[stride*y + 0] > avg) t+= 1;
1421
-		if(src[stride*y + 1] > avg) t+= 2;
1422
-		if(src[stride*y + 2] > avg) t+= 4;
1423
-		if(src[stride*y + 3] > avg) t+= 8;
1424
-		if(src[stride*y + 4] > avg) t+= 16;
1425
-		if(src[stride*y + 5] > avg) t+= 32;
1426
-		if(src[stride*y + 6] > avg) t+= 64;
1427
-		if(src[stride*y + 7] > avg) t+= 128;
1428
-		if(src[stride*y + 8] > avg) t+= 256;
1429
-		if(src[stride*y + 9] > avg) t+= 512;
1430
-		
1431
-		t |= (~t)<<16; // upper half holds the inverted mask (pixel <= avg)
1432
-		t &= (t<<1) & (t>>1); // keep bit x only if bits x-1 and x+1 are set too
1433
-		s[y] = t;
1434
-	}
1435
-	
1436
-	for(y=1; y<9; y++)
1437
-	{
1438
-		int t = s[y-1] & s[y] & s[y+1]; // the bit must survive in the rows above and below as well
1439
-		t|= t>>16; // fold the inverted half onto the low half
1440
-		s[y-1]= t; // stored at y-1 so s[y] and s[y+1] are still intact for the next row
1441
-	}
1442
-
1443
-	for(y=1; y<9; y++) // pass 3: filter the flagged pixels of rows 1..8
1444
-	{
1445
-		int x;
1446
-		int t = s[y-1];
1447
-
1448
-		p= src + stride*y;
1449
-		for(x=1; x<9; x++)
1450
-		{
1451
-			p++;
1452
-			if(t & (1<<x))
1453
-			{
1454
-				int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1455
-				      +2*(*(p     -1)) + 4*(*p         ) + 2*(*(p     +1))
1456
-				      +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); // 3x3 kernel 1 2 1 / 2 4 2 / 1 2 1
1457
-				f= (f + 8)>>4; // kernel weights sum to 16; +8 rounds
1458
-
1459
-#ifdef DEBUG_DERING_THRESHOLD
1460
-				asm volatile("emms\n\t":);
1461
-				{
1462
-				static long long numPixels=0;
1463
-				if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1464
-//				if((max-min)<20 || (max-min)*QP<200)
1465
-//				if((max-min)*QP < 500)
1466
-//				if(max-min<QP/2)
1467
-				if(max-min < 20)
1468
-				{
1469
-					static int numSkiped=0;
1470
-					static int errorSum=0;
1471
-					static int worstQP=0;
1472
-					static int worstRange=0;
1473
-					static int worstDiff=0;
1474
-					int diff= (f - *p);
1475
-					int absDiff= ABS(diff);
1476
-					int error= diff*diff;
1477
-
1478
-					if(x==1 || x==8 || y==1 || y==8) continue;
1479
-
1480
-					numSkiped++;
1481
-					if(absDiff > worstDiff)
1482
-					{
1483
-						worstDiff= absDiff;
1484
-						worstQP= QP; // NOTE(review): no QP local is declared in this function (only c->QP) — confirm this debug path builds
1485
-						worstRange= max-min;
1486
-					}
1487
-					errorSum+= error;
1488
-
1489
-					if(1024LL*1024LL*1024LL % numSkiped == 0)
1490
-					{
1491
-						printf( "sum:%1.3f, skip:%d, wQP:%d, "
1492
-							"wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1493
-							(float)errorSum/numSkiped, numSkiped, worstQP, worstRange,
1494
-							worstDiff, (float)numSkiped/numPixels);
1495
-					}
1496
-				}
1497
-				}
1498
-#endif
1499
-				if     (*p + QP2 < f) *p= *p + QP2; // move *p towards f by at most QP2
1500
-				else if(*p - QP2 > f) *p= *p - QP2;
1501
-				else *p=f;
1502
-			}
1503
-		}
1504
-	}
1505
-#ifdef DEBUG_DERING_THRESHOLD
1506
-	if(max-min < 20) // debug only: brighten blocks whose range is below 20
1507
-	{
1508
-		for(y=1; y<9; y++)
1509
-		{
1510
-			int x;
1511
-			int t = 0;
1512
-			p= src + stride*y;
1513
-			for(x=1; x<9; x++)
1514
-			{
1515
-				p++;
1516
-				*p = MIN(*p + 20, 255);
1517
-			}
1518
-		}
1519
-//		src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1520
-	}
1521
-#endif
1522
-#endif
1523
-}
1524
-
1525
-/**
1526
- * Deinterlaces the given block by linear interpolation: lines 5, 7, 9 and 11 are each replaced by the average of the lines directly above and below
1527
- * will be called for every 8x8 block and can read & write from line 4-15
1528
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1529
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1530
- */
1531
-static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1532
-{
1533
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1534
-	src+= 4*stride; // line numbers in the comments below are relative to line 4 of the block
1535
-	asm volatile(
1536
-		"leal (%0, %1), %%eax				\n\t" // eax = src + stride
1537
-		"leal (%%eax, %1, 4), %%ecx			\n\t" // ecx = src + 5*stride
1538
-//	0	1	2	3	4	5	6	7	8	9
1539
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
1540
-
1541
-		"movq (%0), %%mm0				\n\t" // line 0
1542
-		"movq (%%eax, %1), %%mm1			\n\t" // line 2
1543
-		PAVGB(%%mm1, %%mm0)
1544
-		"movq %%mm0, (%%eax)				\n\t" // -> line 1
1545
-		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
1546
-		PAVGB(%%mm0, %%mm1)
1547
-		"movq %%mm1, (%%eax, %1, 2)			\n\t" // -> line 3
1548
-		"movq (%%ecx, %1), %%mm1			\n\t" // line 6
1549
-		PAVGB(%%mm1, %%mm0)
1550
-		"movq %%mm0, (%%ecx)				\n\t" // -> line 5
1551
-		"movq (%0, %1, 8), %%mm0			\n\t" // line 8
1552
-		PAVGB(%%mm0, %%mm1)
1553
-		"movq %%mm1, (%%ecx, %1, 2)			\n\t" // -> line 7
1554
-
1555
-		: : "r" (src), "r" (stride)
1556
-		: "%eax", "%ecx"
1557
-	);
1558
-#else
1559
-	int x;
1560
-	src+= 4*stride;
1561
-	for(x=0; x<8; x++) // one column of the 8-pixel-wide block per iteration
1562
-	{
1563
-		src[stride]   = (src[0]        + src[stride*2])>>1; // odd line = truncated mean of its two even neighbours
1564
-		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
1565
-		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
1566
-		src[stride*7] = (src[stride*6] + src[stride*8])>>1;
1567
-		src++;
1568
-	}
1569
-#endif
1570
-}
1571
-
1572
-/**
1573
- * Deinterlaces the given block with a 4-tap (-1 9 9 -1)/16 interpolation over lines spaced two rows apart
1574
- * will be called for every 8x8 block and can read & write from line 4-15
1575
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1576
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1577
- * this filter will read lines 3-15 and write 7-13 (NOTE(review): counted from src, the stores below hit lines 6, 8, 10 and 12 — confirm the numbering)
1578
- * no clipping in C version
1579
- */
1580
-static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1581
-{
1582
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1583
-	src+= stride*3; // line numbers in the comments below are relative to line 3 of the block
1584
-	asm volatile(
1585
-		"leal (%0, %1), %%eax				\n\t" // eax = src + stride
1586
-		"leal (%%eax, %1, 4), %%edx			\n\t" // edx = src + 5*stride
1587
-		"leal (%%edx, %1, 4), %%ecx			\n\t" // ecx = src + 9*stride
1588
-		"addl %1, %%ecx					\n\t" // ecx = src + 10*stride
1589
-		"pxor %%mm7, %%mm7				\n\t" // mm7 = 0, used to widen bytes to words
1590
-//	0	1	2	3	4	5	6	7	8	9	10
1591
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
1592
-
1593
-#define DEINT_CUBIC(a,b,c,d,e)\
1594
-		"movq " #a ", %%mm0				\n\t"\
1595
-		"movq " #b ", %%mm1				\n\t"\
1596
-		"movq " #d ", %%mm2				\n\t"\
1597
-		"movq " #e ", %%mm3				\n\t"\
1598
-		PAVGB(%%mm2, %%mm1)					/* (b+d) /2 */\
1599
-		PAVGB(%%mm3, %%mm0)					/* (a+e) /2 */\
1600
-		"movq %%mm0, %%mm2				\n\t"\
1601
-		"punpcklbw %%mm7, %%mm0				\n\t"\
1602
-		"punpckhbw %%mm7, %%mm2				\n\t"\
1603
-		"movq %%mm1, %%mm3				\n\t"\
1604
-		"punpcklbw %%mm7, %%mm1				\n\t"\
1605
-		"punpckhbw %%mm7, %%mm3				\n\t"\
1606
-		"psubw %%mm1, %%mm0				\n\t"	/* L(a+e - (b+d))/2 */\
1607
-		"psubw %%mm3, %%mm2				\n\t"	/* H(a+e - (b+d))/2 */\
1608
-		"psraw $3, %%mm0				\n\t"	/* L(a+e - (b+d))/16 */\
1609
-		"psraw $3, %%mm2				\n\t"	/* H(a+e - (b+d))/16 */\
1610
-		"psubw %%mm0, %%mm1				\n\t"	/* L(9b + 9d - a - e)/16 */\
1611
-		"psubw %%mm2, %%mm3				\n\t"	/* H(9b + 9d - a - e)/16 */\
1612
-		"packuswb %%mm3, %%mm1				\n\t"\
1613
-		"movq %%mm1, " #c "				\n\t"
1614
-
1615
-DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1))
1616
-DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8))
1617
-DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx))
1618
-DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2))
1619
-
1620
-		: : "r" (src), "r" (stride)
1621
-		: "%eax", "%edx", "ecx"
1622
-	);
1623
-#else
1624
-	int x;
1625
-	src+= stride*3;
1626
-	for(x=0; x<8; x++) // one column of the 8-pixel-wide block per iteration
1627
-	{
1628
-		src[stride*3] = (-src[0]        + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; // result is stored to uint8_t without clamping
1629
-		src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4;
1630
-		src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4;
1631
-		src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
1632
-		src++;
1633
-	}
1634
-#endif
1635
-}
1636
-
1637
-/**
1638
- * Deinterlaces the given block with a (-1 4 2 4 -1)/8 vertical filter; tmp carries 8 bytes of line data from one call to the next
1639
- * will be called for every 8x8 block and can read & write from line 4-15
1640
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1641
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1642
- * this filter will read lines 4-13 and write 5-11
1643
- * no clipping in C version
1644
- */
1645
-static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1646
-{
1647
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1648
-	src+= stride*4; // line numbers in the comments below are relative to line 4 of the block
1649
-	asm volatile(
1650
-		"leal (%0, %1), %%eax				\n\t" // eax = src + stride
1651
-		"leal (%%eax, %1, 4), %%edx			\n\t" // edx = src + 5*stride
1652
-		"pxor %%mm7, %%mm7				\n\t" // mm7 = 0, used to widen bytes to words
1653
-		"movq (%2), %%mm0				\n\t" // mm0 = 8 bytes saved in tmp by the previous call
1654
-//	0	1	2	3	4	5	6	7	8	9	10
1655
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1 ecx
1656
-
1657
-#define DEINT_FF(a,b,c,d)\
1658
-		"movq " #a ", %%mm1				\n\t"\
1659
-		"movq " #b ", %%mm2				\n\t"\
1660
-		"movq " #c ", %%mm3				\n\t"\
1661
-		"movq " #d ", %%mm4				\n\t"\
1662
-		PAVGB(%%mm3, %%mm1)					\
1663
-		PAVGB(%%mm4, %%mm0)					\
1664
-		"movq %%mm0, %%mm3				\n\t"\
1665
-		"punpcklbw %%mm7, %%mm0				\n\t"\
1666
-		"punpckhbw %%mm7, %%mm3				\n\t"\
1667
-		"movq %%mm1, %%mm4				\n\t"\
1668
-		"punpcklbw %%mm7, %%mm1				\n\t"\
1669
-		"punpckhbw %%mm7, %%mm4				\n\t"\
1670
-		"psllw $2, %%mm1				\n\t"\
1671
-		"psllw $2, %%mm4				\n\t"\
1672
-		"psubw %%mm0, %%mm1				\n\t"\
1673
-		"psubw %%mm3, %%mm4				\n\t"\
1674
-		"movq %%mm2, %%mm5				\n\t"\
1675
-		"movq %%mm2, %%mm0				\n\t"\
1676
-		"punpcklbw %%mm7, %%mm2				\n\t"\
1677
-		"punpckhbw %%mm7, %%mm5				\n\t"\
1678
-		"paddw %%mm2, %%mm1				\n\t"\
1679
-		"paddw %%mm5, %%mm4				\n\t"\
1680
-		"psraw $2, %%mm1				\n\t"\
1681
-		"psraw $2, %%mm4				\n\t"\
1682
-		"packuswb %%mm4, %%mm1				\n\t"\
1683
-		"movq %%mm1, " #b "				\n\t"\
1684
-
1685
-DEINT_FF((%0)       , (%%eax)       , (%%eax, %1), (%%eax, %1, 2))
1686
-DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx)       )
1687
-DEINT_FF((%0, %1, 4), (%%edx)       , (%%edx, %1), (%%edx, %1, 2))
1688
-DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4))
1689
-
1690
-		"movq %%mm0, (%2)				\n\t" // save mm0 (the unfiltered line 7 loaded by the last DEINT_FF) into tmp for the next call
1691
-		: : "r" (src), "r" (stride), "r"(tmp)
1692
-		: "%eax", "%edx"
1693
-	);
1694
-#else
1695
-	int x;
1696
-	src+= stride*4;
1697
-	for(x=0; x<8; x++) // one column of the 8-pixel-wide block per iteration
1698
-	{
1699
-		int t1= tmp[x]; // value carried over from the previous call
1700
-		int t2= src[stride*1]; // unfiltered line 1, kept because it is overwritten next
1701
-
1702
-		src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3;
1703
-		t1= src[stride*4]; // NOTE(review): the asm path uses unfiltered lines 3/5/7 as the 2x-weighted centre term and carries line 7 in tmp, while t1/t2 here are loaded from lines 4/6/8 — confirm which is intended
1704
-		src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3;
1705
-		t2= src[stride*6];
1706
-		src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3;
1707
-		t1= src[stride*8];
1708
-		src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3;
1709
-		tmp[x]= t1; // carried into the next call
1710
-
1711
-		src++;
1712
-	}
1713
-#endif
1714
-}
1715
-
1716
-/**
1717
- * Deinterlaces the given block by blending each line with the two lines below it using (1 2 1)/4 weights
1718
- * will be called for every 8x8 block and can read & write from line 4-15
1719
- * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
1720
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1721
- * will shift the image up by 1 line (FIXME if this is a problem)
1722
- * this filter will read lines 4-13 and write 4-11
1723
- */
1724
-static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
1725
-{
1726
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1727
-	src+= 4*stride; // line numbers in the comments below are relative to line 4 of the block
1728
-	asm volatile(
1729
-		"leal (%0, %1), %%eax				\n\t" // eax = src + stride
1730
-		"leal (%%eax, %1, 4), %%edx			\n\t" // edx = src + 5*stride
1731
-//	0	1	2	3	4	5	6	7	8	9
1732
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1733
-
1734
-		"movq (%0), %%mm0				\n\t" // L0
1735
-		"movq (%%eax, %1), %%mm1			\n\t" // L2
1736
-		PAVGB(%%mm1, %%mm0)				      // L0+L2
1737
-		"movq (%%eax), %%mm2				\n\t" // L1
1738
-		PAVGB(%%mm2, %%mm0)
1739
-		"movq %%mm0, (%0)				\n\t" // -> line 0
1740
-		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
1741
-		PAVGB(%%mm0, %%mm2)				      // L1+L3
1742
-		PAVGB(%%mm1, %%mm2)				      // 2L2 + L1 + L3
1743
-		"movq %%mm2, (%%eax)				\n\t" // -> line 1
1744
-		"movq (%0, %1, 4), %%mm2			\n\t" // L4
1745
-		PAVGB(%%mm2, %%mm1)				      // L2+L4
1746
-		PAVGB(%%mm0, %%mm1)				      // 2L3 + L2 + L4
1747
-		"movq %%mm1, (%%eax, %1)			\n\t" // -> line 2
1748
-		"movq (%%edx), %%mm1				\n\t" // L5
1749
-		PAVGB(%%mm1, %%mm0)				      // L3+L5
1750
-		PAVGB(%%mm2, %%mm0)				      // 2L4 + L3 + L5
1751
-		"movq %%mm0, (%%eax, %1, 2)			\n\t" // -> line 3
1752
-		"movq (%%edx, %1), %%mm0			\n\t" // L6
1753
-		PAVGB(%%mm0, %%mm2)				      // L4+L6
1754
-		PAVGB(%%mm1, %%mm2)				      // 2L5 + L4 + L6
1755
-		"movq %%mm2, (%0, %1, 4)			\n\t" // -> line 4
1756
-		"movq (%%edx, %1, 2), %%mm2			\n\t" // L7
1757
-		PAVGB(%%mm2, %%mm1)				      // L5+L7
1758
-		PAVGB(%%mm0, %%mm1)				      // 2L6 + L5 + L7
1759
-		"movq %%mm1, (%%edx)				\n\t" // -> line 5
1760
-		"movq (%0, %1, 8), %%mm1			\n\t" // L8
1761
-		PAVGB(%%mm1, %%mm0)				      // L6+L8
1762
-		PAVGB(%%mm2, %%mm0)				      // 2L7 + L6 + L8
1763
-		"movq %%mm0, (%%edx, %1)			\n\t" // -> line 6
1764
-		"movq (%%edx, %1, 4), %%mm0			\n\t" // L9
1765
-		PAVGB(%%mm0, %%mm2)				      // L7+L9
1766
-		PAVGB(%%mm1, %%mm2)				      // 2L8 + L7 + L9
1767
-		"movq %%mm2, (%%edx, %1, 2)			\n\t" // -> line 7
1768
-
1769
-
1770
-		: : "r" (src), "r" (stride)
1771
-		: "%eax", "%edx"
1772
-	);
1773
-#else
1774
-	int x;
1775
-	src+= 4*stride;
1776
-	for(x=0; x<8; x++) // one column of the 8-pixel-wide block per iteration
1777
-	{
1778
-		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2; // each store only reads its own line and the two below, which are still unmodified
1779
-		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
1780
-		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
1781
-		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
1782
-		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
1783
-		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
1784
-		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
1785
-		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
1786
-		src++;
1787
-	}
1788
-#endif
1789
-}
1790
-
1791
-/**
1792
- * Deinterlaces the given block
1793
- * will be called for every 8x8 block and can read & write from line 4-15,
1794
- * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too
1795
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1796
- */
1797
-static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1798
-{
1799
-#ifdef HAVE_MMX
1800
-	src+= 4*stride;
1801
-#ifdef HAVE_MMX2
1802
-	asm volatile(
1803
-		"leal (%0, %1), %%eax				\n\t"
1804
-		"leal (%%eax, %1, 4), %%edx			\n\t"
1805
-//	0	1	2	3	4	5	6	7	8	9
1806
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1807
-
1808
-		"movq (%0), %%mm0				\n\t" //
1809
-		"movq (%%eax, %1), %%mm2			\n\t" //
1810
-		"movq (%%eax), %%mm1				\n\t" //
1811
-		"movq %%mm0, %%mm3				\n\t"
1812
-		"pmaxub %%mm1, %%mm0				\n\t" //
1813
-		"pminub %%mm3, %%mm1				\n\t" //
1814
-		"pmaxub %%mm2, %%mm1				\n\t" //
1815
-		"pminub %%mm1, %%mm0				\n\t"
1816
-		"movq %%mm0, (%%eax)				\n\t"
1817
-
1818
-		"movq (%0, %1, 4), %%mm0			\n\t" //
1819
-		"movq (%%eax, %1, 2), %%mm1			\n\t" //
1820
-		"movq %%mm2, %%mm3				\n\t"
1821
-		"pmaxub %%mm1, %%mm2				\n\t" //
1822
-		"pminub %%mm3, %%mm1				\n\t" //
1823
-		"pmaxub %%mm0, %%mm1				\n\t" //
1824
-		"pminub %%mm1, %%mm2				\n\t"
1825
-		"movq %%mm2, (%%eax, %1, 2)			\n\t"
1826
-
1827
-		"movq (%%edx), %%mm2				\n\t" //
1828
-		"movq (%%edx, %1), %%mm1			\n\t" //
1829
-		"movq %%mm2, %%mm3				\n\t"
1830
-		"pmaxub %%mm0, %%mm2				\n\t" //
1831
-		"pminub %%mm3, %%mm0				\n\t" //
1832
-		"pmaxub %%mm1, %%mm0				\n\t" //
1833
-		"pminub %%mm0, %%mm2				\n\t"
1834
-		"movq %%mm2, (%%edx)				\n\t"
1835
-
1836
-		"movq (%%edx, %1, 2), %%mm2			\n\t" //
1837
-		"movq (%0, %1, 8), %%mm0			\n\t" //
1838
-		"movq %%mm2, %%mm3				\n\t"
1839
-		"pmaxub %%mm0, %%mm2				\n\t" //
1840
-		"pminub %%mm3, %%mm0				\n\t" //
1841
-		"pmaxub %%mm1, %%mm0				\n\t" //
1842
-		"pminub %%mm0, %%mm2				\n\t"
1843
-		"movq %%mm2, (%%edx, %1, 2)			\n\t"
1844
-
1845
-
1846
-		: : "r" (src), "r" (stride)
1847
-		: "%eax", "%edx"
1848
-	);
1849
-
1850
-#else // MMX without MMX2
1851
-	asm volatile(
1852
-		"leal (%0, %1), %%eax				\n\t"
1853
-		"leal (%%eax, %1, 4), %%edx			\n\t"
1854
-//	0	1	2	3	4	5	6	7	8	9
1855
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1856
-		"pxor %%mm7, %%mm7				\n\t"
1857
-
1858
-#define MEDIAN(a,b,c)\
1859
-		"movq " #a ", %%mm0				\n\t"\
1860
-		"movq " #b ", %%mm2				\n\t"\
1861
-		"movq " #c ", %%mm1				\n\t"\
1862
-		"movq %%mm0, %%mm3				\n\t"\
1863
-		"movq %%mm1, %%mm4				\n\t"\
1864
-		"movq %%mm2, %%mm5				\n\t"\
1865
-		"psubusb %%mm1, %%mm3				\n\t"\
1866
-		"psubusb %%mm2, %%mm4				\n\t"\
1867
-		"psubusb %%mm0, %%mm5				\n\t"\
1868
-		"pcmpeqb %%mm7, %%mm3				\n\t"\
1869
-		"pcmpeqb %%mm7, %%mm4				\n\t"\
1870
-		"pcmpeqb %%mm7, %%mm5				\n\t"\
1871
-		"movq %%mm3, %%mm6				\n\t"\
1872
-		"pxor %%mm4, %%mm3				\n\t"\
1873
-		"pxor %%mm5, %%mm4				\n\t"\
1874
-		"pxor %%mm6, %%mm5				\n\t"\
1875
-		"por %%mm3, %%mm1				\n\t"\
1876
-		"por %%mm4, %%mm2				\n\t"\
1877
-		"por %%mm5, %%mm0				\n\t"\
1878
-		"pand %%mm2, %%mm0				\n\t"\
1879
-		"pand %%mm1, %%mm0				\n\t"\
1880
-		"movq %%mm0, " #b "				\n\t"
1881
-
1882
-MEDIAN((%0), (%%eax), (%%eax, %1))
1883
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
1884
-MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1))
1885
-MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8))
1886
-
1887
-		: : "r" (src), "r" (stride)
1888
-		: "%eax", "%edx"
1889
-	);
1890
-#endif // MMX
1891
-#else
1892
-	int x, y;
1893
-	src+= 4*stride;
1894
-	// FIXME - there should be a way to do a few columns in parallel like w/mmx
1895
-	for(x=0; x<8; x++)
1896
-	{
1897
-		uint8_t *colsrc = src;
1898
-		for (y=0; y<4; y++)
1899
-		{
1900
-			int a, b, c, d, e, f;
1901
-			a = colsrc[0       ];
1902
-			b = colsrc[stride  ];
1903
-			c = colsrc[stride*2];
1904
-			d = (a-b)>>31;
1905
-			e = (b-c)>>31;
1906
-			f = (c-a)>>31;
1907
-			colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1908
-			colsrc += stride*2;
1909
-		}
1910
-		src++;
1911
-	}
1912
-#endif
1913
-}
1914
-
1915
-#ifdef HAVE_MMX
1916
-/**
1917
- * transposes and shift the given 8x8 Block into dst1 and dst2
1918
- */
1919
-static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
1920
-{
1921
-	asm(
1922
-		"leal (%0, %1), %%eax				\n\t"
1923
-//	0	1	2	3	4	5	6	7	8	9
1924
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
1925
-		"movq (%0), %%mm0		\n\t" // 12345678
1926
-		"movq (%%eax), %%mm1		\n\t" // abcdefgh
1927
-		"movq %%mm0, %%mm2		\n\t" // 12345678
1928
-		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
1929
-		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
1930
-
1931
-		"movq (%%eax, %1), %%mm1	\n\t"
1932
-		"movq (%%eax, %1, 2), %%mm3	\n\t"
1933
-		"movq %%mm1, %%mm4		\n\t"
1934
-		"punpcklbw %%mm3, %%mm1		\n\t"
1935
-		"punpckhbw %%mm3, %%mm4		\n\t"
1936
-
1937
-		"movq %%mm0, %%mm3		\n\t"
1938
-		"punpcklwd %%mm1, %%mm0		\n\t"
1939
-		"punpckhwd %%mm1, %%mm3		\n\t"
1940
-		"movq %%mm2, %%mm1		\n\t"
1941
-		"punpcklwd %%mm4, %%mm2		\n\t"
1942
-		"punpckhwd %%mm4, %%mm1		\n\t"
1943
-
1944
-		"movd %%mm0, 128(%2)		\n\t"
1945
-		"psrlq $32, %%mm0		\n\t"
1946
-		"movd %%mm0, 144(%2)		\n\t"
1947
-		"movd %%mm3, 160(%2)		\n\t"
1948
-		"psrlq $32, %%mm3		\n\t"
1949
-		"movd %%mm3, 176(%2)		\n\t"
1950
-		"movd %%mm3, 48(%3)		\n\t"
1951
-		"movd %%mm2, 192(%2)		\n\t"
1952
-		"movd %%mm2, 64(%3)		\n\t"
1953
-		"psrlq $32, %%mm2		\n\t"
1954
-		"movd %%mm2, 80(%3)		\n\t"
1955
-		"movd %%mm1, 96(%3)		\n\t"
1956
-		"psrlq $32, %%mm1		\n\t"
1957
-		"movd %%mm1, 112(%3)		\n\t"
1958
-
1959
-		"leal (%%eax, %1, 4), %%eax	\n\t"
1960
-		
1961
-		"movq (%0, %1, 4), %%mm0	\n\t" // 12345678
1962
-		"movq (%%eax), %%mm1		\n\t" // abcdefgh
1963
-		"movq %%mm0, %%mm2		\n\t" // 12345678
1964
-		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
1965
-		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
1966
-
1967
-		"movq (%%eax, %1), %%mm1	\n\t"
1968
-		"movq (%%eax, %1, 2), %%mm3	\n\t"
1969
-		"movq %%mm1, %%mm4		\n\t"
1970
-		"punpcklbw %%mm3, %%mm1		\n\t"
1971
-		"punpckhbw %%mm3, %%mm4		\n\t"
1972
-
1973
-		"movq %%mm0, %%mm3		\n\t"
1974
-		"punpcklwd %%mm1, %%mm0		\n\t"
1975
-		"punpckhwd %%mm1, %%mm3		\n\t"
1976
-		"movq %%mm2, %%mm1		\n\t"
1977
-		"punpcklwd %%mm4, %%mm2		\n\t"
1978
-		"punpckhwd %%mm4, %%mm1		\n\t"
1979
-
1980
-		"movd %%mm0, 132(%2)		\n\t"
1981
-		"psrlq $32, %%mm0		\n\t"
1982
-		"movd %%mm0, 148(%2)		\n\t"
1983
-		"movd %%mm3, 164(%2)		\n\t"
1984
-		"psrlq $32, %%mm3		\n\t"
1985
-		"movd %%mm3, 180(%2)		\n\t"
1986
-		"movd %%mm3, 52(%3)		\n\t"
1987
-		"movd %%mm2, 196(%2)		\n\t"
1988
-		"movd %%mm2, 68(%3)		\n\t"
1989
-		"psrlq $32, %%mm2		\n\t"
1990
-		"movd %%mm2, 84(%3)		\n\t"
1991
-		"movd %%mm1, 100(%3)		\n\t"
1992
-		"psrlq $32, %%mm1		\n\t"
1993
-		"movd %%mm1, 116(%3)		\n\t"
1994
-
1995
-
1996
-	:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
1997
-	: "%eax"
1998
-	);
1999
-}
2000
-
2001
-/**
2002
- * transposes the given 8x8 block
2003
- */
2004
-static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
2005
-{
2006
-	asm(
2007
-		"leal (%0, %1), %%eax				\n\t"
2008
-		"leal (%%eax, %1, 4), %%edx			\n\t"
2009
-//	0	1	2	3	4	5	6	7	8	9
2010
-//	%0	eax	eax+%1	eax+2%1	%0+4%1	edx	edx+%1	edx+2%1	%0+8%1	edx+4%1
2011
-		"movq (%2), %%mm0		\n\t" // 12345678
2012
-		"movq 16(%2), %%mm1		\n\t" // abcdefgh
2013
-		"movq %%mm0, %%mm2		\n\t" // 12345678
2014
-		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
2015
-		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
2016
-
2017
-		"movq 32(%2), %%mm1		\n\t"
2018
-		"movq 48(%2), %%mm3		\n\t"
2019
-		"movq %%mm1, %%mm4		\n\t"
2020
-		"punpcklbw %%mm3, %%mm1		\n\t"
2021
-		"punpckhbw %%mm3, %%mm4		\n\t"
2022
-
2023
-		"movq %%mm0, %%mm3		\n\t"
2024
-		"punpcklwd %%mm1, %%mm0		\n\t"
2025
-		"punpckhwd %%mm1, %%mm3		\n\t"
2026
-		"movq %%mm2, %%mm1		\n\t"
2027
-		"punpcklwd %%mm4, %%mm2		\n\t"
2028
-		"punpckhwd %%mm4, %%mm1		\n\t"
2029
-
2030
-		"movd %%mm0, (%0)		\n\t"
2031
-		"psrlq $32, %%mm0		\n\t"
2032
-		"movd %%mm0, (%%eax)		\n\t"
2033
-		"movd %%mm3, (%%eax, %1)	\n\t"
2034
-		"psrlq $32, %%mm3		\n\t"
2035
-		"movd %%mm3, (%%eax, %1, 2)	\n\t"
2036
-		"movd %%mm2, (%0, %1, 4)	\n\t"
2037
-		"psrlq $32, %%mm2		\n\t"
2038
-		"movd %%mm2, (%%edx)		\n\t"
2039
-		"movd %%mm1, (%%edx, %1)	\n\t"
2040
-		"psrlq $32, %%mm1		\n\t"
2041
-		"movd %%mm1, (%%edx, %1, 2)	\n\t"
2042
-
2043
-
2044
-		"movq 64(%2), %%mm0		\n\t" // 12345678
2045
-		"movq 80(%2), %%mm1		\n\t" // abcdefgh
2046
-		"movq %%mm0, %%mm2		\n\t" // 12345678
2047
-		"punpcklbw %%mm1, %%mm0		\n\t" // 1a2b3c4d
2048
-		"punpckhbw %%mm1, %%mm2		\n\t" // 5e6f7g8h
2049
-
2050
-		"movq 96(%2), %%mm1		\n\t"
2051
-		"movq 112(%2), %%mm3		\n\t"
2052
-		"movq %%mm1, %%mm4		\n\t"
2053
-		"punpcklbw %%mm3, %%mm1		\n\t"
2054
-		"punpckhbw %%mm3, %%mm4		\n\t"
2055
-
2056
-		"movq %%mm0, %%mm3		\n\t"
2057
-		"punpcklwd %%mm1, %%mm0		\n\t"
2058
-		"punpckhwd %%mm1, %%mm3		\n\t"
2059
-		"movq %%mm2, %%mm1		\n\t"
2060
-		"punpcklwd %%mm4, %%mm2		\n\t"
2061
-		"punpckhwd %%mm4, %%mm1		\n\t"
2062
-
2063
-		"movd %%mm0, 4(%0)		\n\t"
2064
-		"psrlq $32, %%mm0		\n\t"
2065
-		"movd %%mm0, 4(%%eax)		\n\t"
2066
-		"movd %%mm3, 4(%%eax, %1)	\n\t"
2067
-		"psrlq $32, %%mm3		\n\t"
2068
-		"movd %%mm3, 4(%%eax, %1, 2)	\n\t"
2069
-		"movd %%mm2, 4(%0, %1, 4)	\n\t"
2070
-		"psrlq $32, %%mm2		\n\t"
2071
-		"movd %%mm2, 4(%%edx)		\n\t"
2072
-		"movd %%mm1, 4(%%edx, %1)	\n\t"
2073
-		"psrlq $32, %%mm1		\n\t"
2074
-		"movd %%mm1, 4(%%edx, %1, 2)	\n\t"
2075
-
2076
-	:: "r" (dst), "r" (dstStride), "r" (src)
2077
-	: "%eax", "%edx"
2078
-	);
2079
-}
2080
-#endif
2081
-//static int test=0;
2082
-
2083
-static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2084
-				    uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
2085
-{
2086
-	// to save a register (FIXME do this outside of the loops)
2087
-	tempBluredPast[127]= maxNoise[0];
2088
-	tempBluredPast[128]= maxNoise[1];
2089
-	tempBluredPast[129]= maxNoise[2];
2090
-        
2091
-#define FAST_L2_DIFF
2092
-//#define L1_DIFF //u should change the thresholds too if u try that one
2093
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2094
-	asm volatile(
2095
-		"leal (%2, %2, 2), %%eax			\n\t" // 3*stride
2096
-		"leal (%2, %2, 4), %%edx			\n\t" // 5*stride
2097
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2098
-//	0	1	2	3	4	5	6	7	8	9
2099
-//	%x	%x+%2	%x+2%2	%x+eax	%x+4%2	%x+edx	%x+2eax	%x+ecx	%x+8%2
2100
-//FIXME reorder?
2101
-#ifdef L1_DIFF //needs mmx2
2102
-		"movq (%0), %%mm0				\n\t" // L0
2103
-		"psadbw (%1), %%mm0				\n\t" // |L0-R0|
2104
-		"movq (%0, %2), %%mm1				\n\t" // L1
2105
-		"psadbw (%1, %2), %%mm1				\n\t" // |L1-R1|
2106
-		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2107
-		"psadbw (%1, %2, 2), %%mm2			\n\t" // |L2-R2|
2108
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
2109
-		"psadbw (%1, %%eax), %%mm3			\n\t" // |L3-R3|
2110
-
2111
-		"movq (%0, %2, 4), %%mm4			\n\t" // L4
2112
-		"paddw %%mm1, %%mm0				\n\t"
2113
-		"psadbw (%1, %2, 4), %%mm4			\n\t" // |L4-R4|
2114
-		"movq (%0, %%edx), %%mm5			\n\t" // L5
2115
-		"paddw %%mm2, %%mm0				\n\t"
2116
-		"psadbw (%1, %%edx), %%mm5			\n\t" // |L5-R5|
2117
-		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
2118
-		"paddw %%mm3, %%mm0				\n\t"
2119
-		"psadbw (%1, %%eax, 2), %%mm6			\n\t" // |L6-R6|
2120
-		"movq (%0, %%ecx), %%mm7			\n\t" // L7
2121
-		"paddw %%mm4, %%mm0				\n\t"
2122
-		"psadbw (%1, %%ecx), %%mm7			\n\t" // |L7-R7|
2123
-		"paddw %%mm5, %%mm6				\n\t"
2124
-		"paddw %%mm7, %%mm6				\n\t"
2125
-		"paddw %%mm6, %%mm0				\n\t"
2126
-#elif defined (FAST_L2_DIFF)
2127
-		"pcmpeqb %%mm7, %%mm7				\n\t"
2128
-		"movq "MANGLE(b80)", %%mm6			\n\t"
2129
-		"pxor %%mm0, %%mm0				\n\t"
2130
-#define L2_DIFF_CORE(a, b)\
2131
-		"movq " #a ", %%mm5				\n\t"\
2132
-		"movq " #b ", %%mm2				\n\t"\
2133
-		"pxor %%mm7, %%mm2				\n\t"\
2134
-		PAVGB(%%mm2, %%mm5)\
2135
-		"paddb %%mm6, %%mm5				\n\t"\
2136
-		"movq %%mm5, %%mm2				\n\t"\
2137
-		"psllw $8, %%mm5				\n\t"\
2138
-		"pmaddwd %%mm5, %%mm5				\n\t"\
2139
-		"pmaddwd %%mm2, %%mm2				\n\t"\
2140
-		"paddd %%mm2, %%mm5				\n\t"\
2141
-		"psrld $14, %%mm5				\n\t"\
2142
-		"paddd %%mm5, %%mm0				\n\t"
2143
-
2144
-L2_DIFF_CORE((%0), (%1))
2145
-L2_DIFF_CORE((%0, %2), (%1, %2))
2146
-L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2147
-L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2148
-L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2149
-L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2150
-L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2151
-L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2152
-
2153
-#else
2154
-		"pxor %%mm7, %%mm7				\n\t"
2155
-		"pxor %%mm0, %%mm0				\n\t"
2156
-#define L2_DIFF_CORE(a, b)\
2157
-		"movq " #a ", %%mm5				\n\t"\
2158
-		"movq " #b ", %%mm2				\n\t"\
2159
-		"movq %%mm5, %%mm1				\n\t"\
2160
-		"movq %%mm2, %%mm3				\n\t"\
2161
-		"punpcklbw %%mm7, %%mm5				\n\t"\
2162
-		"punpckhbw %%mm7, %%mm1				\n\t"\
2163
-		"punpcklbw %%mm7, %%mm2				\n\t"\
2164
-		"punpckhbw %%mm7, %%mm3				\n\t"\
2165
-		"psubw %%mm2, %%mm5				\n\t"\
2166
-		"psubw %%mm3, %%mm1				\n\t"\
2167
-		"pmaddwd %%mm5, %%mm5				\n\t"\
2168
-		"pmaddwd %%mm1, %%mm1				\n\t"\
2169
-		"paddd %%mm1, %%mm5				\n\t"\
2170
-		"paddd %%mm5, %%mm0				\n\t"
2171
-
2172
-L2_DIFF_CORE((%0), (%1))
2173
-L2_DIFF_CORE((%0, %2), (%1, %2))
2174
-L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2))
2175
-L2_DIFF_CORE((%0, %%eax), (%1, %%eax))
2176
-L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4))
2177
-L2_DIFF_CORE((%0, %%edx), (%1, %%edx))
2178
-L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2))
2179
-L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx))
2180
-
2181
-#endif
2182
-
2183
-		"movq %%mm0, %%mm4				\n\t"
2184
-		"psrlq $32, %%mm0				\n\t"
2185
-		"paddd %%mm0, %%mm4				\n\t"
2186
-		"movd %%mm4, %%ecx				\n\t"
2187
-		"shll $2, %%ecx					\n\t"
2188
-		"movl %3, %%edx					\n\t"
2189
-		"addl -4(%%edx), %%ecx				\n\t"
2190
-		"addl 4(%%edx), %%ecx				\n\t"
2191
-		"addl -1024(%%edx), %%ecx			\n\t"
2192
-		"addl $4, %%ecx					\n\t"
2193
-		"addl 1024(%%edx), %%ecx			\n\t"
2194
-		"shrl $3, %%ecx					\n\t"
2195
-		"movl %%ecx, (%%edx)				\n\t"
2196
-
2197
-//		"movl %3, %%ecx					\n\t"
2198
-//		"movl %%ecx, test				\n\t"
2199
-//		"jmp 4f \n\t"
2200
-		"cmpl 512(%%edx), %%ecx				\n\t"
2201
-		" jb 2f						\n\t"
2202
-		"cmpl 516(%%edx), %%ecx				\n\t"
2203
-		" jb 1f						\n\t"
2204
-
2205
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2206
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2207
-		"movq (%0), %%mm0				\n\t" // L0
2208
-		"movq (%0, %2), %%mm1				\n\t" // L1
2209
-		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2210
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
2211
-		"movq (%0, %2, 4), %%mm4			\n\t" // L4
2212
-		"movq (%0, %%edx), %%mm5			\n\t" // L5
2213
-		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
2214
-		"movq (%0, %%ecx), %%mm7			\n\t" // L7
2215
-		"movq %%mm0, (%1)				\n\t" // L0
2216
-		"movq %%mm1, (%1, %2)				\n\t" // L1
2217
-		"movq %%mm2, (%1, %2, 2)			\n\t" // L2
2218
-		"movq %%mm3, (%1, %%eax)			\n\t" // L3
2219
-		"movq %%mm4, (%1, %2, 4)			\n\t" // L4
2220
-		"movq %%mm5, (%1, %%edx)			\n\t" // L5
2221
-		"movq %%mm6, (%1, %%eax, 2)			\n\t" // L6
2222
-		"movq %%mm7, (%1, %%ecx)			\n\t" // L7
2223
-		"jmp 4f						\n\t"
2224
-
2225
-		"1:						\n\t"
2226
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2227
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2228
-		"movq (%0), %%mm0				\n\t" // L0
2229
-		PAVGB((%1), %%mm0)				      // L0
2230
-		"movq (%0, %2), %%mm1				\n\t" // L1
2231
-		PAVGB((%1, %2), %%mm1)				      // L1
2232
-		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2233
-		PAVGB((%1, %2, 2), %%mm2)			      // L2
2234
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
2235
-		PAVGB((%1, %%eax), %%mm3)			      // L3
2236
-		"movq (%0, %2, 4), %%mm4			\n\t" // L4
2237
-		PAVGB((%1, %2, 4), %%mm4)			      // L4
2238
-		"movq (%0, %%edx), %%mm5			\n\t" // L5
2239
-		PAVGB((%1, %%edx), %%mm5)			      // L5
2240
-		"movq (%0, %%eax, 2), %%mm6			\n\t" // L6
2241
-		PAVGB((%1, %%eax, 2), %%mm6)			      // L6
2242
-		"movq (%0, %%ecx), %%mm7			\n\t" // L7
2243
-		PAVGB((%1, %%ecx), %%mm7)			      // L7
2244
-		"movq %%mm0, (%1)				\n\t" // R0
2245
-		"movq %%mm1, (%1, %2)				\n\t" // R1
2246
-		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
2247
-		"movq %%mm3, (%1, %%eax)			\n\t" // R3
2248
-		"movq %%mm4, (%1, %2, 4)			\n\t" // R4
2249
-		"movq %%mm5, (%1, %%edx)			\n\t" // R5
2250
-		"movq %%mm6, (%1, %%eax, 2)			\n\t" // R6
2251
-		"movq %%mm7, (%1, %%ecx)			\n\t" // R7
2252
-		"movq %%mm0, (%0)				\n\t" // L0
2253
-		"movq %%mm1, (%0, %2)				\n\t" // L1
2254
-		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
2255
-		"movq %%mm3, (%0, %%eax)			\n\t" // L3
2256
-		"movq %%mm4, (%0, %2, 4)			\n\t" // L4
2257
-		"movq %%mm5, (%0, %%edx)			\n\t" // L5
2258
-		"movq %%mm6, (%0, %%eax, 2)			\n\t" // L6
2259
-		"movq %%mm7, (%0, %%ecx)			\n\t" // L7
2260
-		"jmp 4f						\n\t"
2261
-
2262
-		"2:						\n\t"
2263
-		"cmpl 508(%%edx), %%ecx				\n\t"
2264
-		" jb 3f						\n\t"
2265
-
2266
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2267
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2268
-		"movq (%0), %%mm0				\n\t" // L0
2269
-		"movq (%0, %2), %%mm1				\n\t" // L1
2270
-		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2271
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
2272
-		"movq (%1), %%mm4				\n\t" // R0
2273
-		"movq (%1, %2), %%mm5				\n\t" // R1
2274
-		"movq (%1, %2, 2), %%mm6			\n\t" // R2
2275
-		"movq (%1, %%eax), %%mm7			\n\t" // R3
2276
-		PAVGB(%%mm4, %%mm0)
2277
-		PAVGB(%%mm5, %%mm1)
2278
-		PAVGB(%%mm6, %%mm2)
2279
-		PAVGB(%%mm7, %%mm3)
2280
-		PAVGB(%%mm4, %%mm0)
2281
-		PAVGB(%%mm5, %%mm1)
2282
-		PAVGB(%%mm6, %%mm2)
2283
-		PAVGB(%%mm7, %%mm3)
2284
-		"movq %%mm0, (%1)				\n\t" // R0
2285
-		"movq %%mm1, (%1, %2)				\n\t" // R1
2286
-		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
2287
-		"movq %%mm3, (%1, %%eax)			\n\t" // R3
2288
-		"movq %%mm0, (%0)				\n\t" // L0
2289
-		"movq %%mm1, (%0, %2)				\n\t" // L1
2290
-		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
2291
-		"movq %%mm3, (%0, %%eax)			\n\t" // L3
2292
-
2293
-		"movq (%0, %2, 4), %%mm0			\n\t" // L4
2294
-		"movq (%0, %%edx), %%mm1			\n\t" // L5
2295
-		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
2296
-		"movq (%0, %%ecx), %%mm3			\n\t" // L7
2297
-		"movq (%1, %2, 4), %%mm4			\n\t" // R4
2298
-		"movq (%1, %%edx), %%mm5			\n\t" // R5
2299
-		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
2300
-		"movq (%1, %%ecx), %%mm7			\n\t" // R7
2301
-		PAVGB(%%mm4, %%mm0)
2302
-		PAVGB(%%mm5, %%mm1)
2303
-		PAVGB(%%mm6, %%mm2)
2304
-		PAVGB(%%mm7, %%mm3)
2305
-		PAVGB(%%mm4, %%mm0)
2306
-		PAVGB(%%mm5, %%mm1)
2307
-		PAVGB(%%mm6, %%mm2)
2308
-		PAVGB(%%mm7, %%mm3)
2309
-		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
2310
-		"movq %%mm1, (%1, %%edx)			\n\t" // R5
2311
-		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
2312
-		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
2313
-		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
2314
-		"movq %%mm1, (%0, %%edx)			\n\t" // L5
2315
-		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
2316
-		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
2317
-		"jmp 4f						\n\t"
2318
-
2319
-		"3:						\n\t"
2320
-		"leal (%%eax, %2, 2), %%edx			\n\t" // 5*stride
2321
-		"leal (%%edx, %2, 2), %%ecx			\n\t" // 7*stride
2322
-		"movq (%0), %%mm0				\n\t" // L0
2323
-		"movq (%0, %2), %%mm1				\n\t" // L1
2324
-		"movq (%0, %2, 2), %%mm2			\n\t" // L2
2325
-		"movq (%0, %%eax), %%mm3			\n\t" // L3
2326
-		"movq (%1), %%mm4				\n\t" // R0
2327
-		"movq (%1, %2), %%mm5				\n\t" // R1
2328
-		"movq (%1, %2, 2), %%mm6			\n\t" // R2
2329
-		"movq (%1, %%eax), %%mm7			\n\t" // R3
2330
-		PAVGB(%%mm4, %%mm0)
2331
-		PAVGB(%%mm5, %%mm1)
2332
-		PAVGB(%%mm6, %%mm2)
2333
-		PAVGB(%%mm7, %%mm3)
2334
-		PAVGB(%%mm4, %%mm0)
2335
-		PAVGB(%%mm5, %%mm1)
2336
-		PAVGB(%%mm6, %%mm2)
2337
-		PAVGB(%%mm7, %%mm3)
2338
-		PAVGB(%%mm4, %%mm0)
2339
-		PAVGB(%%mm5, %%mm1)
2340
-		PAVGB(%%mm6, %%mm2)
2341
-		PAVGB(%%mm7, %%mm3)
2342
-		"movq %%mm0, (%1)				\n\t" // R0
2343
-		"movq %%mm1, (%1, %2)				\n\t" // R1
2344
-		"movq %%mm2, (%1, %2, 2)			\n\t" // R2
2345
-		"movq %%mm3, (%1, %%eax)			\n\t" // R3
2346
-		"movq %%mm0, (%0)				\n\t" // L0
2347
-		"movq %%mm1, (%0, %2)				\n\t" // L1
2348
-		"movq %%mm2, (%0, %2, 2)			\n\t" // L2
2349
-		"movq %%mm3, (%0, %%eax)			\n\t" // L3
2350
-
2351
-		"movq (%0, %2, 4), %%mm0			\n\t" // L4
2352
-		"movq (%0, %%edx), %%mm1			\n\t" // L5
2353
-		"movq (%0, %%eax, 2), %%mm2			\n\t" // L6
2354
-		"movq (%0, %%ecx), %%mm3			\n\t" // L7
2355
-		"movq (%1, %2, 4), %%mm4			\n\t" // R4
2356
-		"movq (%1, %%edx), %%mm5			\n\t" // R5
2357
-		"movq (%1, %%eax, 2), %%mm6			\n\t" // R6
2358
-		"movq (%1, %%ecx), %%mm7			\n\t" // R7
2359
-		PAVGB(%%mm4, %%mm0)
2360
-		PAVGB(%%mm5, %%mm1)
2361
-		PAVGB(%%mm6, %%mm2)
2362
-		PAVGB(%%mm7, %%mm3)
2363
-		PAVGB(%%mm4, %%mm0)
2364
-		PAVGB(%%mm5, %%mm1)
2365
-		PAVGB(%%mm6, %%mm2)
2366
-		PAVGB(%%mm7, %%mm3)
2367
-		PAVGB(%%mm4, %%mm0)
2368
-		PAVGB(%%mm5, %%mm1)
2369
-		PAVGB(%%mm6, %%mm2)
2370
-		PAVGB(%%mm7, %%mm3)
2371
-		"movq %%mm0, (%1, %2, 4)			\n\t" // R4
2372
-		"movq %%mm1, (%1, %%edx)			\n\t" // R5
2373
-		"movq %%mm2, (%1, %%eax, 2)			\n\t" // R6
2374
-		"movq %%mm3, (%1, %%ecx)			\n\t" // R7
2375
-		"movq %%mm0, (%0, %2, 4)			\n\t" // L4
2376
-		"movq %%mm1, (%0, %%edx)			\n\t" // L5
2377
-		"movq %%mm2, (%0, %%eax, 2)			\n\t" // L6
2378
-		"movq %%mm3, (%0, %%ecx)			\n\t" // L7
2379
-
2380
-		"4:						\n\t"
2381
-
2382
-		:: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast)
2383
-		: "%eax", "%edx", "%ecx", "memory"
2384
-		);
2385
-//printf("%d\n", test);
2386
-#else
2387
-{
2388
-	int y;
2389
-	int d=0;
2390
-	int sysd=0;
2391
-	int i;
2392
-
2393
-	for(y=0; y<8; y++)
2394
-	{
2395
-		int x;
2396
-		for(x=0; x<8; x++)
2397
-		{
2398
-			int ref= tempBlured[ x + y*stride ];
2399
-			int cur= src[ x + y*stride ];
2400
-			int d1=ref - cur;
2401
-//			if(x==0 || x==7) d1+= d1>>1;
2402
-//			if(y==0 || y==7) d1+= d1>>1;
2403
-//			d+= ABS(d1);
2404
-			d+= d1*d1;
2405
-			sysd+= d1;
2406
-		}
2407
-	}
2408
-	i=d;
2409
-	d= 	(
2410
-		4*d
2411
-		+(*(tempBluredPast-256))
2412
-		+(*(tempBluredPast-1))+ (*(tempBluredPast+1))
2413
-		+(*(tempBluredPast+256))
2414
-		+4)>>3;
2415
-	*tempBluredPast=i;
2416
-//	((*tempBluredPast)*3 + d + 2)>>2;
2417
-
2418
-//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]);
2419
-/*
2420
-Switch between
2421
- 1  0  0  0  0  0  0  (0)
2422
-64 32 16  8  4  2  1  (1)
2423
-64 48 36 27 20 15 11 (33) (approx)
2424
-64 56 49 43 37 33 29 (200) (approx)
2425
-*/
2426
-	if(d > maxNoise[1])
2427
-	{
2428
-		if(d < maxNoise[2])
2429
-		{
2430
-			for(y=0; y<8; y++)
2431
-			{
2432
-				int x;
2433
-				for(x=0; x<8; x++)
2434
-				{
2435
-					int ref= tempBlured[ x + y*stride ];
2436
-					int cur= src[ x + y*stride ];
2437
-					tempBlured[ x + y*stride ]=
2438
-					src[ x + y*stride ]=
2439
-						(ref + cur + 1)>>1;
2440
-				}
2441
-			}
2442
-		}
2443
-		else
2444
-		{
2445
-			for(y=0; y<8; y++)
2446
-			{
2447
-				int x;
2448
-				for(x=0; x<8; x++)
2449
-				{
2450
-					tempBlured[ x + y*stride ]= src[ x + y*stride ];
2451
-				}
2452
-			}
2453
-		}
2454
-	}
2455
-	else
2456
-	{
2457
-		if(d < maxNoise[0])
2458
-		{
2459
-			for(y=0; y<8; y++)
2460
-			{
2461
-				int x;
2462
-				for(x=0; x<8; x++)
2463
-				{
2464
-					int ref= tempBlured[ x + y*stride ];
2465
-					int cur= src[ x + y*stride ];
2466
-					tempBlured[ x + y*stride ]=
2467
-					src[ x + y*stride ]=
2468
-						(ref*7 + cur + 4)>>3;
2469
-				}
2470
-			}
2471
-		}
2472
-		else
2473
-		{
2474
-			for(y=0; y<8; y++)
2475
-			{
2476
-				int x;
2477
-				for(x=0; x<8; x++)
2478
-				{
2479
-					int ref= tempBlured[ x + y*stride ];
2480
-					int cur= src[ x + y*stride ];
2481
-					tempBlured[ x + y*stride ]=
2482
-					src[ x + y*stride ]=
2483
-						(ref*3 + cur + 2)>>2;
2484
-				}
2485
-			}
2486
-		}
2487
-	}
2488
-}
2489
-#endif
2490
-}
2491
-
2492
-static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2493
-	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
2494
-
2495
-/**
2496
- * Copies a block from src to dst and fixes the blacklevel
2497
- * levelFix == 0 -> dont touch the brighness & contrast
2498
- */
2499
-#undef SCALED_CPY
2500
-
2501
-static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride,
2502
-	int levelFix, int64_t *packedOffsetAndScale)
2503
-{
2504
-#ifndef HAVE_MMX
2505
-	int i;
2506
-#endif
2507
-	if(levelFix)
2508
-	{
2509
-#ifdef HAVE_MMX
2510
-					asm volatile(
2511
-						"movq (%%eax), %%mm2	\n\t" // packedYOffset
2512
-						"movq 8(%%eax), %%mm3	\n\t" // packedYScale
2513
-						"leal (%2,%4), %%eax	\n\t"
2514
-						"leal (%3,%5), %%edx	\n\t"
2515
-						"pxor %%mm4, %%mm4	\n\t"
2516
-#ifdef HAVE_MMX2
2517
-#define SCALED_CPY(src1, src2, dst1, dst2)					\
2518
-						"movq " #src1 ", %%mm0	\n\t"\
2519
-						"movq " #src1 ", %%mm5	\n\t"\
2520
-						"movq " #src2 ", %%mm1	\n\t"\
2521
-						"movq " #src2 ", %%mm6	\n\t"\
2522
-						"punpcklbw %%mm0, %%mm0 \n\t"\
2523
-						"punpckhbw %%mm5, %%mm5 \n\t"\
2524
-						"punpcklbw %%mm1, %%mm1 \n\t"\
2525
-						"punpckhbw %%mm6, %%mm6 \n\t"\
2526
-						"pmulhuw %%mm3, %%mm0	\n\t"\
2527
-						"pmulhuw %%mm3, %%mm5	\n\t"\
2528
-						"pmulhuw %%mm3, %%mm1	\n\t"\
2529
-						"pmulhuw %%mm3, %%mm6	\n\t"\
2530
-						"psubw %%mm2, %%mm0	\n\t"\
2531
-						"psubw %%mm2, %%mm5	\n\t"\
2532
-						"psubw %%mm2, %%mm1	\n\t"\
2533
-						"psubw %%mm2, %%mm6	\n\t"\
2534
-						"packuswb %%mm5, %%mm0	\n\t"\
2535
-						"packuswb %%mm6, %%mm1	\n\t"\
2536
-						"movq %%mm0, " #dst1 "	\n\t"\
2537
-						"movq %%mm1, " #dst2 "	\n\t"\
2538
-
2539
-#else //HAVE_MMX2
2540
-#define SCALED_CPY(src1, src2, dst1, dst2)					\
2541
-						"movq " #src1 ", %%mm0	\n\t"\
2542
-						"movq " #src1 ", %%mm5	\n\t"\
2543
-						"punpcklbw %%mm4, %%mm0 \n\t"\
2544
-						"punpckhbw %%mm4, %%mm5 \n\t"\
2545
-						"psubw %%mm2, %%mm0	\n\t"\
2546
-						"psubw %%mm2, %%mm5	\n\t"\
2547
-						"movq " #src2 ", %%mm1	\n\t"\
2548
-						"psllw $6, %%mm0	\n\t"\
2549
-						"psllw $6, %%mm5	\n\t"\
2550
-						"pmulhw %%mm3, %%mm0	\n\t"\
2551
-						"movq " #src2 ", %%mm6	\n\t"\
2552
-						"pmulhw %%mm3, %%mm5	\n\t"\
2553
-						"punpcklbw %%mm4, %%mm1 \n\t"\
2554
-						"punpckhbw %%mm4, %%mm6 \n\t"\
2555
-						"psubw %%mm2, %%mm1	\n\t"\
2556
-						"psubw %%mm2, %%mm6	\n\t"\
2557
-						"psllw $6, %%mm1	\n\t"\
2558
-						"psllw $6, %%mm6	\n\t"\
2559
-						"pmulhw %%mm3, %%mm1	\n\t"\
2560
-						"pmulhw %%mm3, %%mm6	\n\t"\
2561
-						"packuswb %%mm5, %%mm0	\n\t"\
2562
-						"packuswb %%mm6, %%mm1	\n\t"\
2563
-						"movq %%mm0, " #dst1 "	\n\t"\
2564
-						"movq %%mm1, " #dst2 "	\n\t"\
2565
-
2566
-#endif //!HAVE_MMX2
2567
-
2568
-SCALED_CPY((%2)       , (%2, %4)      , (%3)       , (%3, %5))
2569
-SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2))
2570
-SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4))
2571
-						"leal (%%eax,%4,4), %%eax	\n\t"
2572
-						"leal (%%edx,%5,4), %%edx	\n\t"
2573
-SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2))
2574
-
2575
-
2576
-						: "=&a" (packedOffsetAndScale)
2577
-						: "0" (packedOffsetAndScale),
2578
-						"r"(src),
2579
-						"r"(dst),
2580
-						"r" (srcStride),
2581
-						"r" (dstStride)
2582
-						: "%edx"
2583
-					);
2584
-#else
2585
-				for(i=0; i<8; i++)
2586
-					memcpy(	&(dst[dstStride*i]),
2587
-						&(src[srcStride*i]), BLOCK_SIZE);
2588
-#endif
2589
-	}
2590
-	else
2591
-	{
2592
-#ifdef HAVE_MMX
2593
-					asm volatile(
2594
-						"leal (%0,%2), %%eax	\n\t"
2595
-						"leal (%1,%3), %%edx	\n\t"
2596
-
2597
-#define SIMPLE_CPY(src1, src2, dst1, dst2)				\
2598
-						"movq " #src1 ", %%mm0	\n\t"\
2599
-						"movq " #src2 ", %%mm1	\n\t"\
2600
-						"movq %%mm0, " #dst1 "	\n\t"\
2601
-						"movq %%mm1, " #dst2 "	\n\t"\
2602
-
2603
-SIMPLE_CPY((%0)       , (%0, %2)      , (%1)       , (%1, %3))
2604
-SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2))
2605
-SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4))
2606
-						"leal (%%eax,%2,4), %%eax	\n\t"
2607
-						"leal (%%edx,%3,4), %%edx	\n\t"
2608
-SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2))
2609
-
2610
-						: : "r" (src),
2611
-						"r" (dst),
2612
-						"r" (srcStride),
2613
-						"r" (dstStride)
2614
-						: "%eax", "%edx"
2615
-					);
2616
-#else
2617
-				for(i=0; i<8; i++)
2618
-					memcpy(	&(dst[dstStride*i]),
2619
-						&(src[srcStride*i]), BLOCK_SIZE);
2620
-#endif
2621
-	}
2622
-}
2623
-
2624
-/**
2625
- * Duplicates the given 8 src pixels ? times upward
2626
- */
2627
-static inline void RENAME(duplicate)(uint8_t src[], int stride)
2628
-{
2629
-#ifdef HAVE_MMX
2630
-	asm volatile(
2631
-		"movq (%0), %%mm0		\n\t"
2632
-		"addl %1, %0			\n\t"
2633
-		"movq %%mm0, (%0)		\n\t"
2634
-		"movq %%mm0, (%0, %1)		\n\t"
2635
-		"movq %%mm0, (%0, %1, 2)	\n\t"
2636
-		: "+r" (src)
2637
-		: "r" (-stride)
2638
-	);
2639
-#else
2640
-	int i;
2641
-	uint8_t *p=src;
2642
-	for(i=0; i<3; i++)
2643
-	{
2644
-		p-= stride;
2645
-		memcpy(p, src, 8);
2646
-	}
2647
-#endif
2648
-}
2649
-
2650
-/**
2651
- * Filters array of bytes (Y or U or V values)
- *
- * Walks the plane one block row per outer iteration and one 8-byte-wide block
- * per inner iteration: each block is first copied from src to dst with
- * blockCopy, then the deinterlace / deblock / dering / temporal-noise filters
- * selected by the bits of the mode word are applied in place on dst.
- * A pre-pass before the main loop prepares the first copyAhead lines, and the
- * last block rows are processed in the context's temporary buffers.
- *
- * src, srcStride   input plane and its line stride
- * dst, dstStride   output plane and its line stride
- * width, height    plane dimensions, used as the loop bounds
- * QPs, QPStride    quantizer table and its row stride; indexed with
- *                  x>>qpHShift and y>>qpVShift (both shifts are 4 for luma)
- * isColor          nonzero selects c.ppMode.chromMode, zero selects
- *                  c.ppMode.lumMode and enables the histogram based
- *                  black/white level setup
- * c2               context; copied to a local on entry and written back as a
- *                  whole on exit
2652
- */
2653
-static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
2654
-	QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
2655
-{
2656
-	PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access
2657
-	int x,y;
2658
-#ifdef COMPILE_TIME_MODE
2659
-	const int mode= COMPILE_TIME_MODE;
2660
-#else
2661
-	const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
2662
-#endif
2663
-	int black=0, white=255; // blackest black and whitest white in the picture
2664
-	int QPCorrecture= 256*256; // QP scale factor with 16 fractional bits (1.0 here); applied as (QP*QPCorrecture + 256*128)>>16
2665
-
2666
-	int copyAhead; // line offset at which blockCopy runs ahead of the current block row (derived from mode below)
2667
-#ifdef HAVE_MMX
2668
-	int i;
2669
-#endif
2670
-
2671
-	const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; // right shift turning an x position into a QP table column
2672
-	const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; // right shift turning a y position into a QP table row
2673
-
2674
-	//FIXME remove
2675
-	uint64_t * const yHistogram= c.yHistogram;
2676
-	uint8_t * const tempSrc= c.tempSrc;
2677
-	uint8_t * const tempDst= c.tempDst;
2678
-	const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; // row length used to index c.nonBQPTable
2679
-
2680
-#ifdef HAVE_MMX
2681
-	for(i=0; i<32; i++){ // fill 32 offset/threshold entries derived from ppMode.baseDcDiff
2682
-		int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
2683
-		int threshold= offset*2 + 1;
2684
-		c.mmxDcOffset[i]= 0x7F - offset;
2685
-		c.mmxDcThreshold[i]= 0x7F - threshold;
2686
-		c.mmxDcOffset[i]*= 0x0101010101010101LL; // repeats the value in every byte of the 64 bit word (as long as it fits in one byte)
2687
-		c.mmxDcThreshold[i]*= 0x0101010101010101LL;
2688
-	}
2689
-#endif
2690
-
2691
-	if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; // chain is ordered from the largest value to the smallest, so the first match is the largest
2692
-	else if(   (mode & LINEAR_BLEND_DEINT_FILTER)
2693
-		|| (mode & FFMPEG_DEINT_FILTER)) copyAhead=14;
2694
-	else if(   (mode & V_DEBLOCK)
2695
-		|| (mode & LINEAR_IPOL_DEINT_FILTER)
2696
-		|| (mode & MEDIAN_DEINT_FILTER)) copyAhead=13;
2697
-	else if(mode & V_X1_FILTER) copyAhead=11;
2698
-//	else if(mode & V_RK1_FILTER) copyAhead=10;
2699
-	else if(mode & DERING) copyAhead=9;
2700
-	else copyAhead=8;
2701
-
2702
-	copyAhead-= 8; // keep only the amount in excess of 8 lines
2703
-
2704
-	if(!isColor) // luma: derive black/white levels and the level-fix scale from the histogram accumulated so far
2705
-	{
2706
-		uint64_t sum= 0;
2707
-		int i;
2708
-		uint64_t maxClipped;
2709
-		uint64_t clipped;
2710
-		double scale;
2711
-
2712
-		c.frameNum++;
2713
-		// first frame is fscked so we ignore it
2714
-		if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
2715
-
2716
-		for(i=0; i<256; i++)
2717
-		{
2718
-			sum+= yHistogram[i];
2719
-//			printf("%d ", yHistogram[i]);
2720
-		}
2721
-//		printf("\n\n");
2722
-
2723
-		/* we always get a completely black picture first */
2724
-		maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); // histogram total scaled by the configured threshold
2725
-
2726
-		clipped= sum;
2727
-		for(black=255; black>0; black--) // walk down from 255, removing bins, until fewer than maxClipped counts remain
2728
-		{
2729
-			if(clipped < maxClipped) break;
2730
-			clipped-= yHistogram[black];
2731
-		}
2732
-
2733
-		clipped= sum;
2734
-		for(white=0; white<256; white++) // same walk upward from 0
2735
-		{
2736
-			if(clipped < maxClipped) break;
2737
-			clipped-= yHistogram[white];
2738
-		}
2739
-
2740
-		scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); // allowed output range divided by the measured range
2741
-
2742
-#ifdef HAVE_MMX2
2743
-		c.packedYScale= (uint16_t)(scale*256.0 + 0.5); // scale with 8 fractional bits, rounded
2744
-		c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
2745
-#else
2746
-		c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); // scale with 10 fractional bits, rounded
2747
-		c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
2748
-#endif
2749
-
2750
-		c.packedYOffset|= c.packedYOffset<<32; // these two steps repeat the low 16 bits in all four 16 bit lanes
2751
-		c.packedYOffset|= c.packedYOffset<<16;
2752
-
2753
-		c.packedYScale|= c.packedYScale<<32; // same lane replication for the scale
2754
-		c.packedYScale|= c.packedYScale<<16;
2755
-		
2756
-		if(mode & LEVEL_FIX)	QPCorrecture= (int)(scale*256*256 + 0.5); // QP gets rescaled by the same factor as the luma levels
2757
-		else			QPCorrecture= 256*256;
2758
-	}
2759
-	else
2760
-	{
2761
-		c.packedYScale= 0x0100010001000100LL; // 0x0100 in each 16 bit lane
2762
-		c.packedYOffset= 0;
2763
-		QPCorrecture= 256*256;
2764
-	}
2765
-
2766
-	/* copy & deinterlace first row of blocks */
2767
-	y=-BLOCK_SIZE; // start one block row above the picture; blockCopy below reads from srcBlock + srcStride*8
2768
-	{
2769
-		uint8_t *srcBlock= &(src[y*srcStride]);
2770
-		uint8_t *dstBlock= tempDst + dstStride; // this pre-pass writes into the temporary buffer, not into dst
2771
-
2772
-		// From this point on it is guaranteed that we can read and write 16 lines downward
2773

                
2774
-		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
2775
-		for(x=0; x<width; x+=BLOCK_SIZE)
2776
-		{
2777
-
2778
-#ifdef HAVE_MMX2
2779
-/*
2780
-			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2781
-			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2782
-			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2783
-			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2784
-*/
2785
-
2786
-			asm( // prefetch two consecutive lines each of src (nta) and dst (t0), starting at line ((x>>2)&6)+copyAhead, 32 bytes past the block pointer
2787
-				"movl %4, %%eax			\n\t"
2788
-				"shrl $2, %%eax			\n\t"
2789
-				"andl $6, %%eax			\n\t"
2790
-				"addl %5, %%eax			\n\t"
2791
-				"movl %%eax, %%edx		\n\t"
2792
-				"imul %1, %%eax			\n\t"
2793
-				"imul %3, %%edx			\n\t"
2794
-				"prefetchnta 32(%%eax, %0)	\n\t"
2795
-				"prefetcht0 32(%%edx, %2)	\n\t"
2796
-				"addl %1, %%eax			\n\t"
2797
-				"addl %3, %%edx			\n\t"
2798
-				"prefetchnta 32(%%eax, %0)	\n\t"
2799
-				"prefetcht0 32(%%edx, %2)	\n\t"
2800
-			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2801
-			"m" (x), "m" (copyAhead)
2802
-			: "%eax", "%edx"
2803
-			);
2804
-
2805
-#elif defined(HAVE_3DNOW)
2806
-//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2807
-/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2808
-			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2809
-			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2810
-			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2811
-*/
2812
-#endif
2813
-
2814
-			RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, // blockCopy at line offset 8 of both buffers; LEVEL_FIX bit and packed offset are passed through
2815
-				srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2816
-
2817
-			RENAME(duplicate)(dstBlock + dstStride*8, dstStride); // fills the lines above line 8 from that line
2818
-
2819
-			if(mode & LINEAR_IPOL_DEINT_FILTER) // at most one deinterlacer runs; the order of this chain is the priority
2820
-				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2821
-			else if(mode & LINEAR_BLEND_DEINT_FILTER)
2822
-				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2823
-			else if(mode & MEDIAN_DEINT_FILTER)
2824
-				RENAME(deInterlaceMedian)(dstBlock, dstStride);
2825
-			else if(mode & CUBIC_IPOL_DEINT_FILTER)
2826
-				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2827
-			else if(mode & FFMPEG_DEINT_FILTER)
2828
-				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2829
-/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
2830
-				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2831
-*/
2832
-			dstBlock+=8;
2833
-			srcBlock+=8;
2834
-		}
2835
-		if(width==dstStride) // lines are contiguous: move the copyAhead prepared lines to dst with one memcpy
2836
-			memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride); // tempDst line 9 is dstBlock line 8 (dstBlock started at tempDst + dstStride)
2837
-		else
2838
-		{
2839
-			int i;
2840
-			for(i=0; i<copyAhead; i++)
2841
-			{
2842
-				memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
2843
-			}
2844
-		}
2845
-	}
2846
-
2847
-//printf("\n");
2848
-	for(y=0; y<height; y+=BLOCK_SIZE) // main pass, one block row per iteration
2849
-	{
2850
-		//1% speedup if these are here instead of the inner loop
2851
-		uint8_t *srcBlock= &(src[y*srcStride]);
2852
-		uint8_t *dstBlock= &(dst[y*dstStride]);
2853
-#ifdef HAVE_MMX
2854
-		uint8_t *tempBlock1= c.tempBlocks; // pair of transpose buffers, swapped after every block
2855
-		uint8_t *tempBlock2= c.tempBlocks + 8;
2856
-#endif
2857
-		int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; // QP table row for this block row
2858
-		int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*mbWidth];
2859
-		int QP=0;
2860
-		/* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
2861
-		   if not then use a temporary buffer */
2862
-		if(y+15 >= height)
2863
-		{
2864
-			int i;
2865
-			/* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
2866
-			   blockcopy to dst later */
2867
-			memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
2868
-				srcStride*MAX(height-y-copyAhead, 0) );
2869
-
2870
-			/* duplicate last line of src to fill the void up to line (copyAhead+7) */
2871
-			for(i=MAX(height-y, 8); i<copyAhead+8; i++)
2872
-				memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride);
2873
-
2874
-			/* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
2875
-			memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) );
2876
-
2877
-			/* duplicate last line of dst to fill the void up to line (copyAhead) */
2878
-			for(i=height-y+1; i<=copyAhead; i++)
2879
-				memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride);
2880
-
2881
-			dstBlock= tempDst + dstStride; // from here on this block row is processed inside the temporary buffers
2882
-			srcBlock= tempSrc;
2883
-		}
2884
-//printf("\n");
2885
-
2886
-		// From this point on it is guaranteed that we can read and write 16 lines downward
2887

                
2888
-		// with the L1 Cache of the P4 ... or only a few blocks at a time or something
2889
-		for(x=0; x<width; x+=BLOCK_SIZE)
2890
-		{
2891
-			const int stride= dstStride;
2892
-#ifdef HAVE_MMX
2893
-			uint8_t *tmpXchg;
2894
-#endif
2895
-			if(isColor)
2896
-			{
2897
-				QP= QPptr[x>>qpHShift];
2898
-				c.nonBQP= nonBQPptr[x>>qpHShift];
2899
-			}
2900
-			else
2901
-			{
2902
-				QP= QPptr[x>>4];
2903
-				QP= (QP* QPCorrecture + 256*128)>>16; // rescale QP by QPCorrecture (16 fractional bits), rounding to nearest
2904
-				c.nonBQP= nonBQPptr[x>>4];
2905
-				c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
2906
-				yHistogram[ srcBlock[srcStride*12 + 4] ]++; // one sample per block (line 12, column 4) feeds the luma histogram
2907
-			}
2908
-			c.QP= QP;
2909
-#ifdef HAVE_MMX
2910
-			asm volatile( // stores QP, repeated as described by the comments below, into c.pQPb
2911
-				"movd %1, %%mm7					\n\t"
2912
-				"packuswb %%mm7, %%mm7				\n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2913
-				"packuswb %%mm7, %%mm7				\n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2914
-				"packuswb %%mm7, %%mm7				\n\t" // QP,..., QP
2915
-				"movq %%mm7, %0			\n\t"
2916
-				: "=m" (c.pQPb) 
2917
-				: "r" (QP)
2918
-			);
2919
-#endif
2920
-
2921
-
2922
-#ifdef HAVE_MMX2
2923
-/*
2924
-			prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32);
2925
-			prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32);
2926
-			prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32);
2927
-			prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32);
2928
-*/
2929
-
2930
-			asm( // same prefetch sequence as in the first-row pre-pass: two lines each of src (nta) and dst (t0)
2931
-				"movl %4, %%eax			\n\t"
2932
-				"shrl $2, %%eax			\n\t"
2933
-				"andl $6, %%eax			\n\t"
2934
-				"addl %5, %%eax			\n\t"
2935
-				"movl %%eax, %%edx		\n\t"
2936
-				"imul %1, %%eax			\n\t"
2937
-				"imul %3, %%edx			\n\t"
2938
-				"prefetchnta 32(%%eax, %0)	\n\t"
2939
-				"prefetcht0 32(%%edx, %2)	\n\t"
2940
-				"addl %1, %%eax			\n\t"
2941
-				"addl %3, %%edx			\n\t"
2942
-				"prefetchnta 32(%%eax, %0)	\n\t"
2943
-				"prefetcht0 32(%%edx, %2)	\n\t"
2944
-			:: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride),
2945
-			"m" (x), "m" (copyAhead)
2946
-			: "%eax", "%edx"
2947
-			);
2948
-
2949
-#elif defined(HAVE_3DNOW)
2950
-//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ...
2951
-/*			prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32);
2952
-			prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2953
-			prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2954
-			prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2955
-*/
2956
-#endif
2957
-
2958
-			RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, // blockCopy runs copyAhead lines below the block row being filtered
2959
-				srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
2960
-
2961
-			if(mode & LINEAR_IPOL_DEINT_FILTER) // at most one deinterlacer runs; the order of this chain is the priority
2962
-				RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
2963
-			else if(mode & LINEAR_BLEND_DEINT_FILTER)
2964
-				RENAME(deInterlaceBlendLinear)(dstBlock, dstStride);
2965
-			else if(mode & MEDIAN_DEINT_FILTER)
2966
-				RENAME(deInterlaceMedian)(dstBlock, dstStride);
2967
-			else if(mode & CUBIC_IPOL_DEINT_FILTER)
2968
-				RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
2969
-			else if(mode & FFMPEG_DEINT_FILTER)
2970
-				RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
2971
-/*			else if(mode & CUBIC_BLEND_DEINT_FILTER)
2972
-				RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
2973
-*/
2974
-
2975
-			/* only deblock if we have 2 blocks */
2976
-			if(y + 8 < height)
2977
-			{
2978
-				if(mode & V_X1_FILTER)
2979
-					RENAME(vertX1Filter)(dstBlock, stride, &c);
2980
-				else if(mode & V_DEBLOCK)
2981
-				{
2982
-					if( RENAME(isVertDC)(dstBlock, stride, &c)) // isVertDC true: low pass only if isVertMinMaxOk; false: default filter
2983
-					{
2984
-						if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c))
2985
-							RENAME(doVertLowPass)(dstBlock, stride, &c);
2986
-					}
2987
-					else
2988
-						RENAME(doVertDefFilter)(dstBlock, stride, &c);
2989
-				}
2990
-			}
2991
-
2992
-#ifdef HAVE_MMX
2993
-			RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); // MMX path: horizontal filtering reuses the vertical filters on the temp blocks (stride 16)
2994
-#endif
2995
-			/* check if we have a previous block to deblock it with dstBlock */
2996
-			if(x - 8 >= 0)
2997
-			{
2998
-#ifdef HAVE_MMX
2999
-				if(mode & H_X1_FILTER)
3000
-					RENAME(vertX1Filter)(tempBlock1, 16, &c);
3001
-				else if(mode & H_DEBLOCK)
3002
-				{
3003
-					if( RENAME(isVertDC)(tempBlock1, 16, &c))
3004
-					{
3005
-						if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c))
3006
-							RENAME(doVertLowPass)(tempBlock1, 16, &c);
3007
-					}
3008
-					else
3009
-						RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3010
-				}
3011
-
3012
-				RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); // write the filtered temp block back around the block edge (4 bytes left of dstBlock)
3013
-
3014
-#else
3015
-				if(mode & H_X1_FILTER)
3016
-					horizX1Filter(dstBlock-4, stride, QP);
3017
-				else if(mode & H_DEBLOCK)
3018
-				{
3019
-					if( isHorizDC(dstBlock-4, stride, &c))
3020
-					{
3021
-						if(isHorizMinMaxOk(dstBlock-4, stride, QP))
3022
-							doHorizLowPass(dstBlock-4, stride, QP);
3023
-					}
3024
-					else
3025
-						doHorizDefFilter(dstBlock-4, stride, QP);
3026
-				}
3027
-#endif
3028
-				if(mode & DERING)
3029
-				{
3030
-				//FIXME filter first line
3031
-					if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); // applied one block to the left and one line up
3032
-				}
3033
-
3034
-				if(mode & TEMP_NOISE_FILTER)
3035
-				{
3036
-					RENAME(tempNoiseReducer)(dstBlock-8, stride, // applied to the previous block (8 bytes to the left)
3037
-						c.tempBlured[isColor] + y*dstStride + x,
3038
-						c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3039
-						c.ppMode.maxTmpNoise);
3040
-				}
3041
-			}
3042
-
3043
-			dstBlock+=8;
3044
-			srcBlock+=8;
3045
-
3046
-#ifdef HAVE_MMX
3047
-			tmpXchg= tempBlock1; // swap the two temp block pointers for the next block
3048
-			tempBlock1= tempBlock2;
3049
-			tempBlock2 = tmpXchg;
3050
-#endif
3051
-		}
3052
-
3053
-		if(mode & DERING) // the in-loop calls work on the block left of dstBlock, so the last block of the row is handled here
3054
-		{
3055
-				if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3056
-		}
3057
-
3058
-		if((mode & TEMP_NOISE_FILTER))
3059
-		{
3060
-			RENAME(tempNoiseReducer)(dstBlock-8, dstStride, // x holds its value from the end of the block loop here
3061
-				c.tempBlured[isColor] + y*dstStride + x,
3062
-				c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3),
3063
-				c.ppMode.maxTmpNoise);
3064
-		}
3065
-
3066
-		/* did we use a tmp buffer for the last lines*/
3067
-		if(y+15 >= height)
3068
-		{
3069
-			uint8_t *dstBlock= &(dst[y*dstStride]); // shadows the outer dstBlock, which points into tempDst in this case
3070
-			if(width==dstStride)
3071
-				memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y));
3072
-			else
3073
-			{
3074
-				int i;
3075
-				for(i=0; i<height-y; i++)
3076
-				{
3077
-					memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3078
-				}
3079
-			}
3080
-		}
3081
-/*
3082
-		for(x=0; x<width; x+=32)
3083
-		{
3084
-			volatile int i;
3085
-			i+=	+ dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride]
3086
-				+ dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride]
3087
-				+ dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride];
3088
-//				+ dstBlock[x +13*dstStride]
3089
-//				+ dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride];
3090
-		}*/
3091
-	}
3092
-#ifdef HAVE_3DNOW
3093
-	asm volatile("femms"); // leave MMX state so that later floating point code works
3094
-#elif defined (HAVE_MMX)
3095
-	asm volatile("emms"); // leave MMX state so that later floating point code works
3096
-#endif
3097
-
3098
-#ifdef DEBUG_BRIGHTNESS
3099
-	if(!isColor) // debug aid: adds 128 to dst pixels to draw the luma histogram and the black/white levels
3100
-	{
3101
-		int max=1;
3102
-		int i;
3103
-		for(i=0; i<256; i++)
3104
-			if(yHistogram[i] > max) max=yHistogram[i];
3105
-
3106
-		for(i=1; i<256; i++)
3107
-		{
3108
-			int x;
3109
-			int start=yHistogram[i-1]/(max/256+1);
3110
-			int end=yHistogram[i]/(max/256+1);
3111
-			int inc= end > start ? 1 : -1;
3112
-			for(x=start; x!=end+inc; x+=inc)
3113
-				dst[ i*dstStride + x]+=128;
3114
-		}
3115
-
3116
-		for(i=0; i<100; i+=2)
3117
-		{
3118
-			dst[ (white)*dstStride + i]+=128;
3119
-			dst[ (black)*dstStride + i]+=128;
3120
-		}
3121
-
3122
-	}
3123
-#endif
3124
-
3125
-	*c2= c; //copy local context back
3126
-
3127
-}