Originally committed as revision 1586 to svn://svn.ffmpeg.org/ffmpeg/trunk
Originally committed as revision 9427 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Originally committed as revision 9428 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
... | ... |
@@ -60,6 +60,8 @@ mp3lame="no" |
60 | 60 |
vorbis="no" |
61 | 61 |
a52="yes" |
62 | 62 |
a52bin="no" |
63 |
+pp="yes" |
|
64 |
+shared_pp="no" |
|
63 | 65 |
win32="no" |
64 | 66 |
mingw32="no" |
65 | 67 |
cygwin="no" |
... | ... |
@@ -281,6 +283,10 @@ for opt do |
281 | 281 |
;; |
282 | 282 |
--enable-a52bin) a52bin="yes" ; extralibs="$ldl $extralibs" |
283 | 283 |
;; |
284 |
+ --disable-pp) pp="no" |
|
285 |
+ ;; |
|
286 |
+ --enable-shared-pp) shared_pp="yes" |
|
287 |
+ ;; |
|
284 | 288 |
--enable-mp3lame) mp3lame="yes" |
285 | 289 |
;; |
286 | 290 |
--enable-vorbis) vorbis="yes" |
... | ... |
@@ -578,6 +584,8 @@ echo " --enable-win32 enable win32 cross compile" |
578 | 578 |
echo " --enable-mingw32 enable mingw32 native windows compile" |
579 | 579 |
echo " --disable-a52 disable GPL'ed A52 support [default=no]" |
580 | 580 |
echo " --enable-a52bin open liba52.so.0 at runtime [default=no]" |
581 |
+echo " --disable-pp disable GPL'ed post processing support [default=no]" |
|
582 |
+echo " --enable-shared-pp use libpostproc.so [default=no]" |
|
581 | 583 |
echo " --enable-shared build shared libraries [default=no]" |
582 | 584 |
echo "" |
583 | 585 |
echo "Advanced options (experts only):" |
... | ... |
@@ -631,6 +639,8 @@ echo "mp3lame enabled $mp3lame" |
631 | 631 |
echo "vorbis enabled $vorbis" |
632 | 632 |
echo "a52 support $a52" |
633 | 633 |
echo "a52 dlopened $a52bin" |
634 |
+echo "pp support $pp" |
|
635 |
+echo "shared pp $shared_pp" |
|
634 | 636 |
echo "Video hooking $vhook" |
635 | 637 |
echo "risky / patent encumbered codecs $risky" |
636 | 638 |
|
... | ... |
@@ -754,6 +764,17 @@ if test "$a52" = "yes" ; then |
754 | 754 |
fi |
755 | 755 |
fi |
756 | 756 |
|
757 |
+# PP |
|
758 |
+if test "$pp" = "yes" ; then |
|
759 |
+ echo "#define CONFIG_PP 1" >> $TMPH |
|
760 |
+ echo "CONFIG_PP=yes" >> config.mak |
|
761 |
+ |
|
762 |
+ if test "$shared_pp" = "yes" ; then |
|
763 |
+ echo "#define SHARED_PP 1" >> $TMPH |
|
764 |
+ echo "SHARED_PP=yes" >> config.mak |
|
765 |
+ fi |
|
766 |
+fi |
|
767 |
+ |
|
757 | 768 |
# mpeg audio high precision mode |
758 | 769 |
if test "$mpegaudio_hp" = "yes" ; then |
759 | 770 |
echo "#define CONFIG_MPEGAUDIO_HP 1" >> $TMPH |
... | ... |
@@ -35,6 +35,15 @@ OBJS+= liba52/bit_allocate.o liba52/bitstream.o liba52/downmix.o \ |
35 | 35 |
endif |
36 | 36 |
endif |
37 | 37 |
|
38 |
+ifeq ($(CONFIG_PP),yes) |
|
39 |
+ifeq ($(SHARED_PP),yes) |
|
40 |
+EXTRALIBS += -lpostproc |
|
41 |
+else |
|
42 |
+# LIBS += libpostproc/libpostproc.a ... should be fixed |
|
43 |
+OBJS += libpostproc/postprocess.o |
|
44 |
+endif |
|
45 |
+endif |
|
46 |
+ |
|
38 | 47 |
ifeq ($(CONFIG_MP3LAME),yes) |
39 | 48 |
OBJS += mp3lameaudio.o |
40 | 49 |
EXTRALIBS += -lmp3lame |
... | ... |
@@ -125,6 +134,9 @@ $(SLIB): $(OBJS) |
125 | 125 |
|
126 | 126 |
dsputil.o: dsputil.c dsputil.h |
127 | 127 |
|
128 |
+libpostproc/libpostproc.a: |
|
129 |
+ $(MAKE) -C libpostproc |
|
130 |
+ |
|
128 | 131 |
%.o: %.c |
129 | 132 |
$(CC) $(CFLAGS) -c -o $@ $< |
130 | 133 |
|
131 | 134 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,64 @@ |
0 |
+ |
|
1 |
# Build rules for libpostproc: a static archive and, when configure set
# SHARED_PP=yes, an additional position-independent shared library.
include ../../config.mak

ifeq ($(SHARED_PP),yes)
SPPLIB = libpostproc.so
SPPVERSION = 0.0.1
endif
PPLIB = libpostproc.a

PPOBJS=postprocess.o
SPPOBJS=postprocess_pic.o

CFLAGS  = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC)
# -I/usr/X11R6/include/

.SUFFIXES: .c .o

# None of these targets produces a file of the same name; without .PHONY a
# stray file called "all", "clean" or "install" would silently disable them.
.PHONY: all clean distclean dep depend install

.c.o:
	$(CC) -c $(CFLAGS) -I.. -I../.. -o $@ $<

# $(SPPLIB) expands to nothing unless SHARED_PP=yes, so only the archive is
# built by default. (The former $(SWSLIB) prerequisite was never defined here.)
all:    $(PPLIB) $(SPPLIB)

clean:
	rm -f *.o *.a *~ *.so

distclean:
	rm -f Makefile.bak *.o *.a *~ *.so .depend

dep:    depend

depend:
	$(CC) -MM $(CFLAGS) postprocess.c 1>.depend

ifeq ($(SHARED_PP),yes)
# Same source as postprocess.o, compiled as position-independent code.
postprocess_pic.o: postprocess.c
	$(CC) -c $(CFLAGS) -fomit-frame-pointer -fPIC -DPIC -I.. -I../.. -o $@ $<

$(SPPLIB): $(SPPOBJS)
	$(CC) -shared -Wl,-soname,$(SPPLIB).0 \
	-o $(SPPLIB) $(SPPOBJS)
endif

$(PPLIB): $(PPOBJS)
	$(AR) r $(PPLIB) $(PPOBJS)

# Only the shared library and its public header are installed.
install: all
ifeq ($(SHARED_PP),yes)
	install -d $(prefix)/lib
	install -s -m 755 $(SPPLIB) $(prefix)/lib/$(SPPLIB).$(SPPVERSION)
	ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPLIB)
	ldconfig || true
	mkdir -p $(prefix)/include/postproc
	install -m 644 postprocess.h $(prefix)/include/postproc/postprocess.h
endif


#
# include dependency files if they exist
#
ifneq ($(wildcard .depend),)
include .depend
endif
0 | 64 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,19 @@ |
0 |
/* mangle.h - CPP helper that turns a C identifier into the symbol name the
 *            assembler sees, for use inside inline-asm strings.
 *  (c)2002 by Felix Buenemann <atmosfear at users.sourceforge.net>
 *      File licensed under the GPL, see http://www.fsf.org/ for more info.
 */

#ifndef __MANGLE_H
#define __MANGLE_H

/* Targets that prefix C symbols with an underscore are Cygwin, OS/2 and
 * non-ELF OpenBSD; everything else uses the identifier unchanged.
 * Extend the list below when another such target turns up (e.g. a.out). */
#if !defined(__CYGWIN__) && !defined(__OS2__) && \
    (!defined(__OpenBSD__) || defined(__ELF__))
#define MANGLE(a) #a
#else
#define MANGLE(a) "_" #a
#endif

#endif /* !__MANGLE_H */
|
18 |
+ |
0 | 19 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,875 @@ |
0 |
+/* |
|
1 |
+ Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) |
|
2 |
+ |
|
3 |
+ This program is free software; you can redistribute it and/or modify |
|
4 |
+ it under the terms of the GNU General Public License as published by |
|
5 |
+ the Free Software Foundation; either version 2 of the License, or |
|
6 |
+ (at your option) any later version. |
|
7 |
+ |
|
8 |
+ This program is distributed in the hope that it will be useful, |
|
9 |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 |
+ GNU General Public License for more details. |
|
12 |
+ |
|
13 |
+ You should have received a copy of the GNU General Public License |
|
14 |
+ along with this program; if not, write to the Free Software |
|
15 |
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
16 |
+*/ |
|
17 |
+ |
|
18 |
+/* |
|
19 |
+ C MMX MMX2 3DNow |
|
20 |
+isVertDC Ec Ec |
|
21 |
+isVertMinMaxOk Ec Ec |
|
22 |
+doVertLowPass E e e |
|
23 |
+doVertDefFilter Ec Ec e e |
|
24 |
+isHorizDC Ec Ec |
|
25 |
+isHorizMinMaxOk a E |
|
26 |
+doHorizLowPass E e e |
|
27 |
+doHorizDefFilter Ec Ec e e |
|
28 |
+deRing E e e* |
|
29 |
+Vertical RKAlgo1 E a a |
|
30 |
+Horizontal RKAlgo1 a a |
|
31 |
+Vertical X1# a E E |
|
32 |
+Horizontal X1# a E E |
|
33 |
+LinIpolDeinterlace e E E* |
|
34 |
+CubicIpolDeinterlace a e e* |
|
35 |
+LinBlendDeinterlace e E E* |
|
36 |
+MedianDeinterlace# E Ec Ec |
|
37 |
+TempDeNoiser# E e e |
|
38 |
+ |
|
39 |
+* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work |
|
40 |
+# more or less self-invented filters, so the exactness isn't too meaningful |
|
41 |
+E = Exact implementation |
|
42 |
+e = almost exact implementation (slightly different rounding, ...) |
|
43 |
+a = alternative / approximate impl |
|
44 |
+c = checked against the other implementations (-vo md5) |
|
45 |
+*/ |
|
46 |
+ |
|
47 |
+/* |
|
48 |
+TODO: |
|
49 |
+reduce the time wasted on the mem transfer |
|
50 |
+unroll stuff if instructions depend too much on the prior one |
|
51 |
+move YScale thing to the end instead of fixing QP |
|
52 |
+write a faster and higher quality deblocking filter :) |
|
53 |
+make the mainloop more flexible (variable number of blocks at once |
|
54 |
+ (the if/else stuff per block is slowing things down) |
|
55 |
+compare the quality & speed of all filters |
|
56 |
+split this huge file |
|
57 |
+optimize c versions |
|
58 |
+try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
|
59 |
+... |
|
60 |
+*/ |
|
61 |
+ |
|
62 |
+//Changelog: use the CVS log |
|
63 |
+ |
|
64 |
+#include "config.h" |
|
65 |
+#include <inttypes.h> |
|
66 |
+#include <stdio.h> |
|
67 |
+#include <stdlib.h> |
|
68 |
+#include <string.h> |
|
69 |
+#ifdef HAVE_MALLOC_H |
|
70 |
+#include <malloc.h> |
|
71 |
+#endif |
|
72 |
+//#undef HAVE_MMX2 |
|
73 |
+//#define HAVE_3DNOW |
|
74 |
+//#undef HAVE_MMX |
|
75 |
+//#undef ARCH_X86 |
|
76 |
+//#define DEBUG_BRIGHTNESS |
|
77 |
+#ifdef USE_FASTMEMCPY |
|
78 |
+#include "libvo/fastmemcpy.h" |
|
79 |
+#endif |
|
80 |
+#include "postprocess.h" |
|
81 |
+#include "postprocess_internal.h" |
|
82 |
+ |
|
83 |
+#include "mangle.h" //FIXME should be supressed |
|
84 |
+ |
|
85 |
+#define MIN(a,b) ((a) > (b) ? (b) : (a)) |
|
86 |
+#define MAX(a,b) ((a) < (b) ? (b) : (a)) |
|
87 |
+#define ABS(a) ((a) > 0 ? (a) : (-(a))) |
|
88 |
+#define SIGN(a) ((a) > 0 ? 1 : -1) |
|
89 |
+ |
|
90 |
+#define GET_MODE_BUFFER_SIZE 500 |
|
91 |
+#define OPTIONS_ARRAY_SIZE 10 |
|
92 |
+#define BLOCK_SIZE 8 |
|
93 |
+#define TEMP_STRIDE 8 |
|
94 |
+//#define NUM_BLOCKS_AT_ONCE 16 //not used yet |
|
95 |
+ |
|
96 |
+#ifdef ARCH_X86 |
|
97 |
+static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL; |
|
98 |
+static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL; |
|
99 |
+static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL; |
|
100 |
+static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL; |
|
101 |
+static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL; |
|
102 |
+static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL; |
|
103 |
+static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL; |
|
104 |
+#endif |
|
105 |
+ |
|
106 |
+static int verbose= 0; |
|
107 |
+ |
|
108 |
+static const int deringThreshold= 20; |
|
109 |
+ |
|
110 |
+ |
|
111 |
+static struct PPFilter filters[]= |
|
112 |
+{ |
|
113 |
+ {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, |
|
114 |
+ {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, |
|
115 |
+/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, |
|
116 |
+ {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ |
|
117 |
+ {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, |
|
118 |
+ {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, |
|
119 |
+ {"dr", "dering", 1, 5, 6, DERING}, |
|
120 |
+ {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, |
|
121 |
+ {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, |
|
122 |
+ {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, |
|
123 |
+ {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, |
|
124 |
+ {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, |
|
125 |
+ {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, |
|
126 |
+ {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, |
|
127 |
+ {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, |
|
128 |
+ {NULL, NULL,0,0,0,0} //End Marker |
|
129 |
+}; |
|
130 |
+ |
|
131 |
+static char *replaceTable[]= |
|
132 |
+{ |
|
133 |
+ "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
134 |
+ "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
135 |
+ "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
136 |
+ "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
137 |
+ NULL //End Marker |
|
138 |
+}; |
|
139 |
+ |
|
140 |
+#ifdef ARCH_X86 |
|
141 |
+static inline void unusedVariableWarningFixer() |
|
142 |
+{ |
|
143 |
+ if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0; |
|
144 |
+} |
|
145 |
+#endif |
|
146 |
+ |
|
147 |
+ |
|
148 |
+#ifdef ARCH_X86 |
|
149 |
/* Thin wrappers around the x86 prefetch instructions; each issues the
 * instruction of the same name on the address held in p. */

/** Issue "prefetchnta" for the address p. */
static inline void prefetchnta(void *p)
{
	__asm__ __volatile__("prefetchnta (%0)\n\t" : : "r" (p));
}

/** Issue "prefetcht0" for the address p. */
static inline void prefetcht0(void *p)
{
	__asm__ __volatile__("prefetcht0 (%0)\n\t" : : "r" (p));
}

/** Issue "prefetcht1" for the address p. */
static inline void prefetcht1(void *p)
{
	__asm__ __volatile__("prefetcht1 (%0)\n\t" : : "r" (p));
}

/** Issue "prefetcht2" for the address p. */
static inline void prefetcht2(void *p)
{
	__asm__ __volatile__("prefetcht2 (%0)\n\t" : : "r" (p));
}
|
176 |
+#endif |
|
177 |
+ |
|
178 |
+// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing |
|
179 |
+ |
|
180 |
+/** |
|
181 |
+ * Check if the given 8x8 Block is mostly "flat" |
|
182 |
+ */ |
|
183 |
+static inline int isHorizDC(uint8_t src[], int stride, PPContext *c) |
|
184 |
+{ |
|
185 |
+ int numEq= 0; |
|
186 |
+ int y; |
|
187 |
+ const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1; |
|
188 |
+ const int dcThreshold= dcOffset*2 + 1; |
|
189 |
+ for(y=0; y<BLOCK_SIZE; y++) |
|
190 |
+ { |
|
191 |
+ if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; |
|
192 |
+ if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; |
|
193 |
+ if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; |
|
194 |
+ if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; |
|
195 |
+ if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; |
|
196 |
+ if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; |
|
197 |
+ if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; |
|
198 |
+ src+= stride; |
|
199 |
+ } |
|
200 |
+ return numEq > c->ppMode.flatnessThreshold; |
|
201 |
+} |
|
202 |
+ |
|
203 |
+/** |
|
204 |
+ * Check if the middle 8x8 Block in the given 8x16 block is flat |
|
205 |
+ */ |
|
206 |
+static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){ |
|
207 |
+ int numEq= 0; |
|
208 |
+ int y; |
|
209 |
+ const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1; |
|
210 |
+ const int dcThreshold= dcOffset*2 + 1; |
|
211 |
+ src+= stride*4; // src points to begin of the 8x8 Block |
|
212 |
+ for(y=0; y<BLOCK_SIZE-1; y++) |
|
213 |
+ { |
|
214 |
+ if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; |
|
215 |
+ if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; |
|
216 |
+ if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; |
|
217 |
+ if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; |
|
218 |
+ if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; |
|
219 |
+ if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; |
|
220 |
+ if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; |
|
221 |
+ if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; |
|
222 |
+ src+= stride; |
|
223 |
+ } |
|
224 |
+ return numEq > c->ppMode.flatnessThreshold; |
|
225 |
+} |
|
226 |
+ |
|
227 |
/**
 * Range check for the horizontal low-pass filter: compares the first and the
 * last pixel of the 8-pixel row starting at src.
 *
 * @param src    first pixel of the row
 * @param stride unused
 * @param QP     quantizer; the allowed difference is 2*QP
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int span= abs(src[0] - src[7]);

	return span <= 2*QP;
}
|
233 |
+ |
|
234 |
+static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) |
|
235 |
+{ |
|
236 |
+ int y; |
|
237 |
+ for(y=0; y<BLOCK_SIZE; y++) |
|
238 |
+ { |
|
239 |
+ const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); |
|
240 |
+ |
|
241 |
+ if(ABS(middleEnergy) < 8*QP) |
|
242 |
+ { |
|
243 |
+ const int q=(dst[3] - dst[4])/2; |
|
244 |
+ const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); |
|
245 |
+ const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); |
|
246 |
+ |
|
247 |
+ int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
248 |
+ d= MAX(d, 0); |
|
249 |
+ |
|
250 |
+ d= (5*d + 32) >> 6; |
|
251 |
+ d*= SIGN(-middleEnergy); |
|
252 |
+ |
|
253 |
+ if(q>0) |
|
254 |
+ { |
|
255 |
+ d= d<0 ? 0 : d; |
|
256 |
+ d= d>q ? q : d; |
|
257 |
+ } |
|
258 |
+ else |
|
259 |
+ { |
|
260 |
+ d= d>0 ? 0 : d; |
|
261 |
+ d= d<q ? q : d; |
|
262 |
+ } |
|
263 |
+ |
|
264 |
+ dst[3]-= d; |
|
265 |
+ dst[4]+= d; |
|
266 |
+ } |
|
267 |
+ dst+= stride; |
|
268 |
+ } |
|
269 |
+} |
|
270 |
+ |
|
271 |
+/** |
|
272 |
+ * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) |
|
273 |
+ * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
|
274 |
+ */ |
|
275 |
+static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) |
|
276 |
+{ |
|
277 |
+ |
|
278 |
+ int y; |
|
279 |
+ for(y=0; y<BLOCK_SIZE; y++) |
|
280 |
+ { |
|
281 |
+ const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; |
|
282 |
+ const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; |
|
283 |
+ |
|
284 |
+ int sums[9]; |
|
285 |
+ sums[0] = first + dst[0]; |
|
286 |
+ sums[1] = dst[0] + dst[1]; |
|
287 |
+ sums[2] = dst[1] + dst[2]; |
|
288 |
+ sums[3] = dst[2] + dst[3]; |
|
289 |
+ sums[4] = dst[3] + dst[4]; |
|
290 |
+ sums[5] = dst[4] + dst[5]; |
|
291 |
+ sums[6] = dst[5] + dst[6]; |
|
292 |
+ sums[7] = dst[6] + dst[7]; |
|
293 |
+ sums[8] = dst[7] + last; |
|
294 |
+ |
|
295 |
+ dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
|
296 |
+ dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
|
297 |
+ dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
|
298 |
+ dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
|
299 |
+ dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
|
300 |
+ dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
|
301 |
+ dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; |
|
302 |
+ dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
|
303 |
+ |
|
304 |
+ dst+= stride; |
|
305 |
+ } |
|
306 |
+} |
|
307 |
+ |
|
308 |
+/** |
|
309 |
+ * Experimental Filter 1 (Horizontal) |
|
310 |
+ * will not damage linear gradients |
|
311 |
+ * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
312 |
+ * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
313 |
+ * MMX2 version does correct clipping C version doesnt |
|
314 |
+ * not identical with the vertical one |
|
315 |
+ */ |
|
316 |
+static inline void horizX1Filter(uint8_t *src, int stride, int QP) |
|
317 |
+{ |
|
318 |
+ int y; |
|
319 |
+ static uint64_t *lut= NULL; |
|
320 |
+ if(lut==NULL) |
|
321 |
+ { |
|
322 |
+ int i; |
|
323 |
+ lut= (uint64_t*)memalign(8, 256*8); |
|
324 |
+ for(i=0; i<256; i++) |
|
325 |
+ { |
|
326 |
+ int v= i < 128 ? 2*i : 2*(i-256); |
|
327 |
+/* |
|
328 |
+//Simulate 112242211 9-Tap filter |
|
329 |
+ uint64_t a= (v/16) & 0xFF; |
|
330 |
+ uint64_t b= (v/8) & 0xFF; |
|
331 |
+ uint64_t c= (v/4) & 0xFF; |
|
332 |
+ uint64_t d= (3*v/8) & 0xFF; |
|
333 |
+*/ |
|
334 |
+//Simulate piecewise linear interpolation |
|
335 |
+ uint64_t a= (v/16) & 0xFF; |
|
336 |
+ uint64_t b= (v*3/16) & 0xFF; |
|
337 |
+ uint64_t c= (v*5/16) & 0xFF; |
|
338 |
+ uint64_t d= (7*v/16) & 0xFF; |
|
339 |
+ uint64_t A= (0x100 - a)&0xFF; |
|
340 |
+ uint64_t B= (0x100 - b)&0xFF; |
|
341 |
+ uint64_t C= (0x100 - c)&0xFF; |
|
342 |
+ uint64_t D= (0x100 - c)&0xFF; |
|
343 |
+ |
|
344 |
+ lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | |
|
345 |
+ (D<<24) | (C<<16) | (B<<8) | (A); |
|
346 |
+ //lut[i] = (v<<32) | (v<<24); |
|
347 |
+ } |
|
348 |
+ } |
|
349 |
+ |
|
350 |
+ for(y=0; y<BLOCK_SIZE; y++) |
|
351 |
+ { |
|
352 |
+ int a= src[1] - src[2]; |
|
353 |
+ int b= src[3] - src[4]; |
|
354 |
+ int c= src[5] - src[6]; |
|
355 |
+ |
|
356 |
+ int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
|
357 |
+ |
|
358 |
+ if(d < QP) |
|
359 |
+ { |
|
360 |
+ int v = d * SIGN(-b); |
|
361 |
+ |
|
362 |
+ src[1] +=v/8; |
|
363 |
+ src[2] +=v/4; |
|
364 |
+ src[3] +=3*v/8; |
|
365 |
+ src[4] -=3*v/8; |
|
366 |
+ src[5] -=v/4; |
|
367 |
+ src[6] -=v/8; |
|
368 |
+ |
|
369 |
+ } |
|
370 |
+ src+=stride; |
|
371 |
+ } |
|
372 |
+} |
|
373 |
+ |
|
374 |
+ |
|
375 |
+//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one |
|
376 |
+//Plain C versions |
|
377 |
+#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) |
|
378 |
+#define COMPILE_C |
|
379 |
+#endif |
|
380 |
+ |
|
381 |
+#ifdef ARCH_X86 |
|
382 |
+ |
|
383 |
+#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) |
|
384 |
+#define COMPILE_MMX |
|
385 |
+#endif |
|
386 |
+ |
|
387 |
+#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT) |
|
388 |
+#define COMPILE_MMX2 |
|
389 |
+#endif |
|
390 |
+ |
|
391 |
+#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) |
|
392 |
+#define COMPILE_3DNOW |
|
393 |
+#endif |
|
394 |
+#endif //ARCH_X86 |
|
395 |
+ |
|
396 |
+#undef HAVE_MMX |
|
397 |
+#undef HAVE_MMX2 |
|
398 |
+#undef HAVE_3DNOW |
|
399 |
+#undef ARCH_X86 |
|
400 |
+ |
|
401 |
+#ifdef COMPILE_C |
|
402 |
+#undef HAVE_MMX |
|
403 |
+#undef HAVE_MMX2 |
|
404 |
+#undef HAVE_3DNOW |
|
405 |
+#undef ARCH_X86 |
|
406 |
+#define RENAME(a) a ## _C |
|
407 |
+#include "postprocess_template.c" |
|
408 |
+#endif |
|
409 |
+ |
|
410 |
+//MMX versions |
|
411 |
+#ifdef COMPILE_MMX |
|
412 |
+#undef RENAME |
|
413 |
+#define HAVE_MMX |
|
414 |
+#undef HAVE_MMX2 |
|
415 |
+#undef HAVE_3DNOW |
|
416 |
+#define ARCH_X86 |
|
417 |
+#define RENAME(a) a ## _MMX |
|
418 |
+#include "postprocess_template.c" |
|
419 |
+#endif |
|
420 |
+ |
|
421 |
+//MMX2 versions |
|
422 |
+#ifdef COMPILE_MMX2 |
|
423 |
+#undef RENAME |
|
424 |
+#define HAVE_MMX |
|
425 |
+#define HAVE_MMX2 |
|
426 |
+#undef HAVE_3DNOW |
|
427 |
+#define ARCH_X86 |
|
428 |
+#define RENAME(a) a ## _MMX2 |
|
429 |
+#include "postprocess_template.c" |
|
430 |
+#endif |
|
431 |
+ |
|
432 |
+//3DNOW versions |
|
433 |
+#ifdef COMPILE_3DNOW |
|
434 |
+#undef RENAME |
|
435 |
+#define HAVE_MMX |
|
436 |
+#undef HAVE_MMX2 |
|
437 |
+#define HAVE_3DNOW |
|
438 |
+#define ARCH_X86 |
|
439 |
+#define RENAME(a) a ## _3DNow |
|
440 |
+#include "postprocess_template.c" |
|
441 |
+#endif |
|
442 |
+ |
|
443 |
+// minor note: the HAVE_xyz macros are messed up after this line, so don't use them |
|
444 |
+ |
|
445 |
+static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
446 |
+ QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc) |
|
447 |
+{ |
|
448 |
+ PPContext *c= (PPContext *)vc; |
|
449 |
+ PPMode *ppMode= (PPMode *)vm; |
|
450 |
+ c->ppMode= *ppMode; //FIXME |
|
451 |
+ |
|
452 |
+ // useing ifs here as they are faster than function pointers allthough the |
|
453 |
+ // difference wouldnt be messureable here but its much better because |
|
454 |
+ // someone might exchange the cpu whithout restarting mplayer ;) |
|
455 |
+#ifdef RUNTIME_CPUDETECT |
|
456 |
+#ifdef ARCH_X86 |
|
457 |
+ // ordered per speed fasterst first |
|
458 |
+ if(c->cpuCaps & PP_CPU_CAPS_MMX2) |
|
459 |
+ postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
460 |
+ else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) |
|
461 |
+ postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
462 |
+ else if(c->cpuCaps & PP_CPU_CAPS_MMX) |
|
463 |
+ postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
464 |
+ else |
|
465 |
+ postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
466 |
+#else |
|
467 |
+ postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
468 |
+#endif |
|
469 |
+#else //RUNTIME_CPUDETECT |
|
470 |
+#ifdef HAVE_MMX2 |
|
471 |
+ postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
472 |
+#elif defined (HAVE_3DNOW) |
|
473 |
+ postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
474 |
+#elif defined (HAVE_MMX) |
|
475 |
+ postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
476 |
+#else |
|
477 |
+ postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
478 |
+#endif |
|
479 |
+#endif //!RUNTIME_CPUDETECT |
|
480 |
+} |
|
481 |
+ |
|
482 |
+//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
483 |
+// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); |
|
484 |
+ |
|
485 |
+/* -pp Command line Help |
|
486 |
+*/ |
|
487 |
+char *pp_help= |
|
488 |
+"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" |
|
489 |
+"long form example:\n" |
|
490 |
+"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" |
|
491 |
+"short form example:\n" |
|
492 |
+"vb:a/hb:a/lb de,-vb\n" |
|
493 |
+"more examples:\n" |
|
494 |
+"tn:64:128:256\n" |
|
495 |
+"Filters Options\n" |
|
496 |
+"short long name short long option Description\n" |
|
497 |
+"* * a autoq cpu power dependant enabler\n" |
|
498 |
+" c chrom chrominance filtring enabled\n" |
|
499 |
+" y nochrom chrominance filtring disabled\n" |
|
500 |
+"hb hdeblock (2 Threshold) horizontal deblocking filter\n" |
|
501 |
+" 1. difference factor: default=64, higher -> more deblocking\n" |
|
502 |
+" 2. flatness threshold: default=40, lower -> more deblocking\n" |
|
503 |
+" the h & v deblocking filters share these\n" |
|
504 |
+" so u cant set different thresholds for h / v\n" |
|
505 |
+"vb vdeblock (2 Threshold) vertical deblocking filter\n" |
|
506 |
+"h1 x1hdeblock Experimental h deblock filter 1\n" |
|
507 |
+"v1 x1vdeblock Experimental v deblock filter 1\n" |
|
508 |
+"dr dering Deringing filter\n" |
|
509 |
+"al autolevels automatic brightness / contrast\n" |
|
510 |
+" f fullyrange stretch luminance to (0..255)\n" |
|
511 |
+"lb linblenddeint linear blend deinterlacer\n" |
|
512 |
+"li linipoldeint linear interpolating deinterlace\n" |
|
513 |
+"ci cubicipoldeint cubic interpolating deinterlacer\n" |
|
514 |
+"md mediandeint median deinterlacer\n" |
|
515 |
+"fd ffmpegdeint ffmpeg deinterlacer\n" |
|
516 |
+"de default hb:a,vb:a,dr:a,al\n" |
|
517 |
+"fa fast h1:a,v1:a,dr:a,al\n" |
|
518 |
+"tn tmpnoise (3 Thresholds) Temporal Noise Reducer\n" |
|
519 |
+" 1. <= 2. <= 3. larger -> stronger filtering\n" |
|
520 |
+"fq forceQuant <quantizer> Force quantizer\n" |
|
521 |
+; |
|
522 |
+ |
|
523 |
+pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality) |
|
524 |
+{ |
|
525 |
+ char temp[GET_MODE_BUFFER_SIZE]; |
|
526 |
+ char *p= temp; |
|
527 |
+ char *filterDelimiters= ",/"; |
|
528 |
+ char *optionDelimiters= ":"; |
|
529 |
+ struct PPMode *ppMode; |
|
530 |
+ char *filterToken; |
|
531 |
+ |
|
532 |
+ ppMode= memalign(8, sizeof(PPMode)); |
|
533 |
+ |
|
534 |
+ ppMode->lumMode= 0; |
|
535 |
+ ppMode->chromMode= 0; |
|
536 |
+ ppMode->maxTmpNoise[0]= 700; |
|
537 |
+ ppMode->maxTmpNoise[1]= 1500; |
|
538 |
+ ppMode->maxTmpNoise[2]= 3000; |
|
539 |
+ ppMode->maxAllowedY= 234; |
|
540 |
+ ppMode->minAllowedY= 16; |
|
541 |
+ ppMode->baseDcDiff= 256/4; |
|
542 |
+ ppMode->flatnessThreshold= 56-16; |
|
543 |
+ ppMode->maxClippedThreshold= 0.01; |
|
544 |
+ ppMode->error=0; |
|
545 |
+ |
|
546 |
+ strncpy(temp, name, GET_MODE_BUFFER_SIZE); |
|
547 |
+ |
|
548 |
+ if(verbose>1) printf("pp: %s\n", name); |
|
549 |
+ |
|
550 |
+ for(;;){ |
|
551 |
+ char *filterName; |
|
552 |
+ int q= 1000000; //PP_QUALITY_MAX; |
|
553 |
+ int chrom=-1; |
|
554 |
+ char *option; |
|
555 |
+ char *options[OPTIONS_ARRAY_SIZE]; |
|
556 |
+ int i; |
|
557 |
+ int filterNameOk=0; |
|
558 |
+ int numOfUnknownOptions=0; |
|
559 |
+ int enable=1; //does the user want us to enabled or disabled the filter |
|
560 |
+ |
|
561 |
+ filterToken= strtok(p, filterDelimiters); |
|
562 |
+ if(filterToken == NULL) break; |
|
563 |
+ p+= strlen(filterToken) + 1; // p points to next filterToken |
|
564 |
+ filterName= strtok(filterToken, optionDelimiters); |
|
565 |
+ if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName); |
|
566 |
+ |
|
567 |
+ if(*filterName == '-') |
|
568 |
+ { |
|
569 |
+ enable=0; |
|
570 |
+ filterName++; |
|
571 |
+ } |
|
572 |
+ |
|
573 |
+ for(;;){ //for all options |
|
574 |
+ option= strtok(NULL, optionDelimiters); |
|
575 |
+ if(option == NULL) break; |
|
576 |
+ |
|
577 |
+ if(verbose>1) printf("pp: option: %s\n", option); |
|
578 |
+ if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; |
|
579 |
+ else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; |
|
580 |
+ else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; |
|
581 |
+ else |
|
582 |
+ { |
|
583 |
+ options[numOfUnknownOptions] = option; |
|
584 |
+ numOfUnknownOptions++; |
|
585 |
+ } |
|
586 |
+ if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; |
|
587 |
+ } |
|
588 |
+ options[numOfUnknownOptions] = NULL; |
|
589 |
+ |
|
590 |
+ /* replace stuff from the replace Table */ |
|
591 |
+ for(i=0; replaceTable[2*i]!=NULL; i++) |
|
592 |
+ { |
|
593 |
+ if(!strcmp(replaceTable[2*i], filterName)) |
|
594 |
+ { |
|
595 |
+ int newlen= strlen(replaceTable[2*i + 1]); |
|
596 |
+ int plen; |
|
597 |
+ int spaceLeft; |
|
598 |
+ |
|
599 |
+ if(p==NULL) p= temp, *p=0; //last filter |
|
600 |
+ else p--, *p=','; //not last filter |
|
601 |
+ |
|
602 |
+ plen= strlen(p); |
|
603 |
+ spaceLeft= p - temp + plen; |
|
604 |
+ if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE) |
|
605 |
+ { |
|
606 |
+ ppMode->error++; |
|
607 |
+ break; |
|
608 |
+ } |
|
609 |
+ memmove(p + newlen, p, plen+1); |
|
610 |
+ memcpy(p, replaceTable[2*i + 1], newlen); |
|
611 |
+ filterNameOk=1; |
|
612 |
+ } |
|
613 |
+ } |
|
614 |
+ |
|
615 |
+ for(i=0; filters[i].shortName!=NULL; i++) |
|
616 |
+ { |
|
617 |
+// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName); |
|
618 |
+ if( !strcmp(filters[i].longName, filterName) |
|
619 |
+ || !strcmp(filters[i].shortName, filterName)) |
|
620 |
+ { |
|
621 |
+ ppMode->lumMode &= ~filters[i].mask; |
|
622 |
+ ppMode->chromMode &= ~filters[i].mask; |
|
623 |
+ |
|
624 |
+ filterNameOk=1; |
|
625 |
+ if(!enable) break; // user wants to disable it |
|
626 |
+ |
|
627 |
+ if(q >= filters[i].minLumQuality) |
|
628 |
+ ppMode->lumMode|= filters[i].mask; |
|
629 |
+ if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) |
|
630 |
+ if(q >= filters[i].minChromQuality) |
|
631 |
+ ppMode->chromMode|= filters[i].mask; |
|
632 |
+ |
|
633 |
+ if(filters[i].mask == LEVEL_FIX) |
|
634 |
+ { |
|
635 |
+ int o; |
|
636 |
+ ppMode->minAllowedY= 16; |
|
637 |
+ ppMode->maxAllowedY= 234; |
|
638 |
+ for(o=0; options[o]!=NULL; o++) |
|
639 |
+ { |
|
640 |
+ if( !strcmp(options[o],"fullyrange") |
|
641 |
+ ||!strcmp(options[o],"f")) |
|
642 |
+ { |
|
643 |
+ ppMode->minAllowedY= 0; |
|
644 |
+ ppMode->maxAllowedY= 255; |
|
645 |
+ numOfUnknownOptions--; |
|
646 |
+ } |
|
647 |
+ } |
|
648 |
+ } |
|
649 |
+ else if(filters[i].mask == TEMP_NOISE_FILTER) |
|
650 |
+ { |
|
651 |
+ int o; |
|
652 |
+ int numOfNoises=0; |
|
653 |
+ |
|
654 |
+ for(o=0; options[o]!=NULL; o++) |
|
655 |
+ { |
|
656 |
+ char *tail; |
|
657 |
+ ppMode->maxTmpNoise[numOfNoises]= |
|
658 |
+ strtol(options[o], &tail, 0); |
|
659 |
+ if(tail!=options[o]) |
|
660 |
+ { |
|
661 |
+ numOfNoises++; |
|
662 |
+ numOfUnknownOptions--; |
|
663 |
+ if(numOfNoises >= 3) break; |
|
664 |
+ } |
|
665 |
+ } |
|
666 |
+ } |
|
667 |
+ else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK) |
|
668 |
+ { |
|
669 |
+ int o; |
|
670 |
+ |
|
671 |
+ for(o=0; options[o]!=NULL && o<2; o++) |
|
672 |
+ { |
|
673 |
+ char *tail; |
|
674 |
+ int val= strtol(options[o], &tail, 0); |
|
675 |
+ if(tail==options[o]) break; |
|
676 |
+ |
|
677 |
+ numOfUnknownOptions--; |
|
678 |
+ if(o==0) ppMode->baseDcDiff= val; |
|
679 |
+ else ppMode->flatnessThreshold= val; |
|
680 |
+ } |
|
681 |
+ } |
|
682 |
+ else if(filters[i].mask == FORCE_QUANT) |
|
683 |
+ { |
|
684 |
+ int o; |
|
685 |
+ ppMode->forcedQuant= 15; |
|
686 |
+ |
|
687 |
+ for(o=0; options[o]!=NULL && o<1; o++) |
|
688 |
+ { |
|
689 |
+ char *tail; |
|
690 |
+ int val= strtol(options[o], &tail, 0); |
|
691 |
+ if(tail==options[o]) break; |
|
692 |
+ |
|
693 |
+ numOfUnknownOptions--; |
|
694 |
+ ppMode->forcedQuant= val; |
|
695 |
+ } |
|
696 |
+ } |
|
697 |
+ } |
|
698 |
+ } |
|
699 |
+ if(!filterNameOk) ppMode->error++; |
|
700 |
+ ppMode->error += numOfUnknownOptions; |
|
701 |
+ } |
|
702 |
+ |
|
703 |
+ if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); |
|
704 |
+ if(ppMode->error) |
|
705 |
+ { |
|
706 |
+ fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); |
|
707 |
+ free(ppMode); |
|
708 |
+ return NULL; |
|
709 |
+ } |
|
710 |
+ return ppMode; |
|
711 |
+} |
|
712 |
+ |
|
713 |
+void pp_free_mode(pp_mode_t *mode){ |
|
714 |
+ if(mode) free(mode); |
|
715 |
+} |
|
716 |
+ |
|
717 |
/**
 * Replaces the buffer *p by a new, zero filled one.
 * The previous buffer (if any) is freed first, its content is NOT preserved.
 * @param p         address of the pointer to replace
 * @param alignment alignment of the new buffer in bytes
 * @param size      size of the new buffer in bytes
 * If the allocation fails *p is set to NULL and nothing is cleared.
 */
static void reallocAlign(void **p, int alignment, int size){
	free(*p);
	*p= memalign(alignment, size);
	if(*p) memset(*p, 0, size); // memset() on a failed allocation would dereference NULL
}
|
722 |
+ |
|
723 |
+static void reallocBuffers(PPContext *c, int width, int height, int stride){ |
|
724 |
+ int mbWidth = (width+15)>>4; |
|
725 |
+ int mbHeight= (height+15)>>4; |
|
726 |
+ int i; |
|
727 |
+ |
|
728 |
+ c->stride= stride; |
|
729 |
+ |
|
730 |
+ reallocAlign((void **)&c->tempDst, 8, stride*24); |
|
731 |
+ reallocAlign((void **)&c->tempSrc, 8, stride*24); |
|
732 |
+ reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); |
|
733 |
+ reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); |
|
734 |
+ for(i=0; i<256; i++) |
|
735 |
+ c->yHistogram[i]= width*height/64*15/256; |
|
736 |
+ |
|
737 |
+ for(i=0; i<3; i++) |
|
738 |
+ { |
|
739 |
+ //Note:the +17*1024 is just there so i dont have to worry about r/w over te end |
|
740 |
+ reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024); |
|
741 |
+ reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size |
|
742 |
+ } |
|
743 |
+ |
|
744 |
+ reallocAlign((void **)&c->deintTemp, 8, width+16); |
|
745 |
+ reallocAlign((void **)&c->nonBQPTable, 8, mbWidth*mbHeight*sizeof(QP_STORE_T)); |
|
746 |
+ reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); |
|
747 |
+} |
|
748 |
+ |
|
749 |
+pp_context_t *pp_get_context(int width, int height, int cpuCaps){ |
|
750 |
+ PPContext *c= memalign(32, sizeof(PPContext)); |
|
751 |
+ int i; |
|
752 |
+ int stride= (width+15)&(~15); //assumed / will realloc if needed |
|
753 |
+ |
|
754 |
+ memset(c, 0, sizeof(PPContext)); |
|
755 |
+ c->cpuCaps= cpuCaps; |
|
756 |
+ if(cpuCaps&PP_FORMAT){ |
|
757 |
+ c->hChromaSubSample= cpuCaps&0x3; |
|
758 |
+ c->vChromaSubSample= (cpuCaps>>4)&0x3; |
|
759 |
+ }else{ |
|
760 |
+ c->hChromaSubSample= 1; |
|
761 |
+ c->vChromaSubSample= 1; |
|
762 |
+ } |
|
763 |
+ |
|
764 |
+ reallocBuffers(c, width, height, stride); |
|
765 |
+ |
|
766 |
+ c->frameNum=-1; |
|
767 |
+ |
|
768 |
+ return c; |
|
769 |
+} |
|
770 |
+ |
|
771 |
+void pp_free_context(void *vc){ |
|
772 |
+ PPContext *c = (PPContext*)vc; |
|
773 |
+ int i; |
|
774 |
+ |
|
775 |
+ for(i=0; i<3; i++) free(c->tempBlured[i]); |
|
776 |
+ for(i=0; i<3; i++) free(c->tempBluredPast[i]); |
|
777 |
+ |
|
778 |
+ free(c->tempBlocks); |
|
779 |
+ free(c->yHistogram); |
|
780 |
+ free(c->tempDst); |
|
781 |
+ free(c->tempSrc); |
|
782 |
+ free(c->deintTemp); |
|
783 |
+ free(c->nonBQPTable); |
|
784 |
+ free(c->forcedQPTable); |
|
785 |
+ |
|
786 |
+ memset(c, 0, sizeof(PPContext)); |
|
787 |
+ |
|
788 |
+ free(c); |
|
789 |
+} |
|
790 |
+ |
|
791 |
+void pp_postprocess(uint8_t * src[3], int srcStride[3], |
|
792 |
+ uint8_t * dst[3], int dstStride[3], |
|
793 |
+ int width, int height, |
|
794 |
+ QP_STORE_T *QP_store, int QPStride, |
|
795 |
+ pp_mode_t *vm, void *vc, int pict_type) |
|
796 |
+{ |
|
797 |
+ int mbWidth = (width+15)>>4; |
|
798 |
+ int mbHeight= (height+15)>>4; |
|
799 |
+ PPMode *mode = (PPMode*)vm; |
|
800 |
+ PPContext *c = (PPContext*)vc; |
|
801 |
+ int minStride= MAX(srcStride[0], dstStride[0]); |
|
802 |
+ |
|
803 |
+ if(c->stride < minStride) |
|
804 |
+ reallocBuffers(c, width, height, minStride); |
|
805 |
+ |
|
806 |
+ if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) |
|
807 |
+ { |
|
808 |
+ int i; |
|
809 |
+ QP_store= c->forcedQPTable; |
|
810 |
+ QPStride= 0; |
|
811 |
+ if(mode->lumMode & FORCE_QUANT) |
|
812 |
+ for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant; |
|
813 |
+ else |
|
814 |
+ for(i=0; i<mbWidth; i++) QP_store[i]= 1; |
|
815 |
+ } |
|
816 |
+if(0){ |
|
817 |
+int x,y; |
|
818 |
+for(y=0; y<mbHeight; y++){ |
|
819 |
+ for(x=0; x<mbWidth; x++){ |
|
820 |
+ printf("%2d ", QP_store[x + y*QPStride]); |
|
821 |
+ } |
|
822 |
+ printf("\n"); |
|
823 |
+} |
|
824 |
+ printf("\n"); |
|
825 |
+} |
|
826 |
+//printf("pict_type:%d\n", pict_type); |
|
827 |
+ |
|
828 |
+ if(pict_type!=3) |
|
829 |
+ { |
|
830 |
+ int x,y; |
|
831 |
+ for(y=0; y<mbHeight; y++){ |
|
832 |
+ for(x=0; x<mbWidth; x++){ |
|
833 |
+ int qscale= QP_store[x + y*QPStride]; |
|
834 |
+ if(qscale&~31) |
|
835 |
+ qscale=31; |
|
836 |
+ c->nonBQPTable[y*mbWidth + x]= qscale; |
|
837 |
+ } |
|
838 |
+ } |
|
839 |
+ } |
|
840 |
+ |
|
841 |
+ if(verbose>2) |
|
842 |
+ { |
|
843 |
+ printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode); |
|
844 |
+ } |
|
845 |
+ |
|
846 |
+ postProcess(src[0], srcStride[0], dst[0], dstStride[0], |
|
847 |
+ width, height, QP_store, QPStride, 0, mode, c); |
|
848 |
+ |
|
849 |
+ width = (width )>>c->hChromaSubSample; |
|
850 |
+ height = (height)>>c->vChromaSubSample; |
|
851 |
+ |
|
852 |
+ if(mode->chromMode) |
|
853 |
+ { |
|
854 |
+ postProcess(src[1], srcStride[1], dst[1], dstStride[1], |
|
855 |
+ width, height, QP_store, QPStride, 1, mode, c); |
|
856 |
+ postProcess(src[2], srcStride[2], dst[2], dstStride[2], |
|
857 |
+ width, height, QP_store, QPStride, 2, mode, c); |
|
858 |
+ } |
|
859 |
+ else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]) |
|
860 |
+ { |
|
861 |
+ memcpy(dst[1], src[1], srcStride[1]*height); |
|
862 |
+ memcpy(dst[2], src[2], srcStride[2]*height); |
|
863 |
+ } |
|
864 |
+ else |
|
865 |
+ { |
|
866 |
+ int y; |
|
867 |
+ for(y=0; y<height; y++) |
|
868 |
+ { |
|
869 |
+ memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); |
|
870 |
+ memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); |
|
871 |
+ } |
|
872 |
+ } |
|
873 |
+} |
|
874 |
+ |
0 | 875 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,73 @@ |
0 |
+/* |
|
1 |
+ Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) |
|
2 |
+ |
|
3 |
+ This program is free software; you can redistribute it and/or modify |
|
4 |
+ it under the terms of the GNU General Public License as published by |
|
5 |
+ the Free Software Foundation; either version 2 of the License, or |
|
6 |
+ (at your option) any later version. |
|
7 |
+ |
|
8 |
+ This program is distributed in the hope that it will be useful, |
|
9 |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 |
+ GNU General Public License for more details. |
|
12 |
+ |
|
13 |
+ You should have received a copy of the GNU General Public License |
|
14 |
+ along with this program; if not, write to the Free Software |
|
15 |
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
16 |
+*/ |
|
17 |
+ |
|
18 |
#ifndef NEWPOSTPROCESS_H
#define NEWPOSTPROCESS_H

/**
 * @file postprocess.h
 * @brief
 *     external api for the pp stuff
 */

#include <inttypes.h> // uint8_t / int8_t are used by the declarations below

#ifdef __cplusplus
extern "C" {
#endif

#define PP_QUALITY_MAX 6

/** type of one quantizer entry in the QP_store table */
#define QP_STORE_T int8_t

/** opaque handles, the real structures are private to the library */
typedef void pp_context_t;
typedef void pp_mode_t;

extern char *pp_help; //a simple help text

/**
 * Postprocesses one picture.
 * @param src            the 3 source planes
 * @param srcStride      line size in bytes of each source plane
 * @param dst            the 3 destination planes
 * @param dstStride      line size in bytes of each destination plane
 * @param horizontalSize width of plane 0 in pixels
 * @param verticalSize   height of plane 0 in pixels
 * @param QP_store       one quantizer per 16x16 macroblock, may be NULL
 * @param QP_stride      number of QP_store entries per macroblock row
 * @param mode           mode from pp_get_mode_by_name_and_quality()
 * @param ppContext      context from pp_get_context()
 * @param pict_type      picture type of the frame
 */
void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                 uint8_t * dst[3], int dstStride[3],
                 int horizontalSize, int verticalSize,
                 QP_STORE_T *QP_store,  int QP_stride,
                 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);


/**
 * returns a pp_mode_t or NULL if an error occurred
 * name is the string after "-pp" on the command line
 * quality is a number from 0 to PP_QUALITY_MAX
 * the returned mode has to be released with pp_free_mode()
 */
pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality);

/** frees a mode, NULL is allowed */
void pp_free_mode(pp_mode_t *mode);

/**
 * allocates a context for pictures of the given size
 * flags is a combination of PP_CPU_CAPS_* and at most one PP_FORMAT_*,
 * without a PP_FORMAT_* flag the chroma planes are assumed to be subsampled
 * by 2 in both directions
 * the returned context has to be released with pp_free_context()
 */
pp_context_t *pp_get_context(int width, int height, int flags);
void pp_free_context(pp_context_t *ppContext);

#define PP_CPU_CAPS_MMX   0x80000000
#define PP_CPU_CAPS_MMX2  0x20000000
#define PP_CPU_CAPS_3DNOW 0x40000000

/*
 * picture formats: PP_FORMAT marks the format bits as valid,
 * bits 0-1 are the horizontal and bits 4-5 the vertical chroma shift
 */
#define PP_FORMAT         0x00000008
#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
#define PP_FORMAT_444    (0x00000000|PP_FORMAT)

#ifdef __cplusplus
}
#endif

#endif
0 | 73 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,128 @@ |
0 |
+/* |
|
1 |
+ Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
2 |
+ |
|
3 |
+ This program is free software; you can redistribute it and/or modify |
|
4 |
+ it under the terms of the GNU General Public License as published by |
|
5 |
+ the Free Software Foundation; either version 2 of the License, or |
|
6 |
+ (at your option) any later version. |
|
7 |
+ |
|
8 |
+ This program is distributed in the hope that it will be useful, |
|
9 |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 |
+ GNU General Public License for more details. |
|
12 |
+ |
|
13 |
+ You should have received a copy of the GNU General Public License |
|
14 |
+ along with this program; if not, write to the Free Software |
|
15 |
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
16 |
+*/ |
|
17 |
+ |
|
18 |
/*
 * Filter selection bits.
 * They are combined into the lumMode / chromMode bitmasks of PPMode and are
 * the values stored in PPFilter.mask.
 */
#define V_DEBLOCK	0x01
#define H_DEBLOCK	0x02
#define DERING		0x04
#define LEVEL_FIX	0x08 /* Brightness & Contrast */

// luma / chroma specific names: the chroma variant is the generic bit shifted left by 4
#define LUM_V_DEBLOCK	V_DEBLOCK		//   1
#define LUM_H_DEBLOCK	H_DEBLOCK		//   2
#define CHROM_V_DEBLOCK	(V_DEBLOCK<<4)		//  16
#define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)		//  32
#define LUM_DERING	DERING			//   4
#define CHROM_DERING	(DERING<<4)		//  64
#define LUM_LEVEL_FIX	LEVEL_FIX		//   8
#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)		// 128 (not implemented yet)

// Experimental vertical filters
#define V_X1_FILTER	0x0200			// 512

// Experimental horizontal filters
#define H_X1_FILTER	0x2000			// 8192

// select between full y range (255-0) or standard one (234-16)
#define FULL_Y_RANGE	0x8000			// 32768

//Deinterlacing Filters
#define	LINEAR_IPOL_DEINT_FILTER	0x10000	// 65536
#define	LINEAR_BLEND_DEINT_FILTER	0x20000	// 131072
// NOTE(review): same value as FULL_Y_RANGE, the two cannot be told apart in a mask
#define	CUBIC_BLEND_DEINT_FILTER	0x8000	// (not implemented yet)
#define	CUBIC_IPOL_DEINT_FILTER		0x40000	// 262144
#define	MEDIAN_DEINT_FILTER		0x80000	// 524288
#define	FFMPEG_DEINT_FILTER		0x400000

#define TEMP_NOISE_FILTER		0x100000
#define FORCE_QUANT			0x200000

//use if you want faster postprocessing code
//cant differentiate between chroma & luma filters (both on or both off)
//obviously the -pp option at the commandline has no effect except turning the here selected
//filters on
//#define COMPILE_TIME_MODE 0x77
|
57 |
+ |
|
58 |
/**
 * Description of one filter which can be selected by name.
 */
struct PPFilter{
	char *shortName;	// short name of the filter
	char *longName;		// long name of the filter
	int chromDefault; 	// is chrominance filtering on by default if this filter is manually activated
	int minLumQuality; 	// minimum quality to turn luminance filtering on
	int minChromQuality;	// minimum quality to turn chrominance filtering on
	int mask; 		// Bitmask to turn this filter on
};
|
66 |
+ |
|
67 |
/**
 * Postprocessing mode: which filters are active and their parameters.
 */
typedef struct PPMode{
	int lumMode; 			// activates filters for luminance
	int chromMode; 			// activates filters for chrominance
	int error; 			// non zero on error

	int minAllowedY; 		// for brightness correction
	int maxAllowedY; 		// for brightness correction
	float maxClippedThreshold;	// amount of "black" you are willing to lose to get a brightness corrected picture

	int maxTmpNoise[3]; 		// for Temporal Noise Reducing filter (Maximal sum of abs differences)

	int baseDcDiff;			// first numeric option of the deblocking filters
	int flatnessThreshold;		// second numeric option of the deblocking filters

	int forcedQuant; 		// quantizer if FORCE_QUANT is used
} PPMode;
|
83 |
+ |
|
84 |
+typedef struct PPContext{ |
|
85 |
+ uint8_t *tempBlocks; //used for the horizontal code |
|
86 |
+ |
|
87 | ||
88 |
+ after watching a black picture for 5 hours*/ |
|
89 |
+ uint64_t *yHistogram; |
|
90 |
+ |
|
91 |
+ uint64_t __attribute__((aligned(8))) packedYOffset; |
|
92 |
+ uint64_t __attribute__((aligned(8))) packedYScale; |
|
93 |
+ |
|
94 |
+ /* Temporal noise reducing buffers */ |
|
95 |
+ uint8_t *tempBlured[3]; |
|
96 |
+ int32_t *tempBluredPast[3]; |
|
97 |
+ |
|
98 |
+ /* Temporary buffers for handling the last row(s) */ |
|
99 |
+ uint8_t *tempDst; |
|
100 |
+ uint8_t *tempSrc; |
|
101 |
+ |
|
102 |
+ uint8_t *deintTemp; |
|
103 |
+ |
|
104 |
+ uint64_t __attribute__((aligned(8))) pQPb; |
|
105 |
+ uint64_t __attribute__((aligned(8))) pQPb2; |
|
106 |
+ |
|
107 |
+ uint64_t __attribute__((aligned(8))) mmxDcOffset[32]; |
|
108 |
+ uint64_t __attribute__((aligned(8))) mmxDcThreshold[32]; |
|
109 |
+ |
|
110 |
+ QP_STORE_T *nonBQPTable; |
|
111 |
+ QP_STORE_T *forcedQPTable; |
|
112 |
+ |
|
113 |
+ int QP; |
|
114 |
+ int nonBQP; |
|
115 |
+ |
|
116 |
+ int frameNum; |
|
117 |
+ |
|
118 |
+ int cpuCaps; |
|
119 |
+ |
|
120 |
+ int stride; //size of some buffers (needed to realloc them if needed) |
|
121 |
+ |
|
122 |
+ int hChromaSubSample; |
|
123 |
+ int vChromaSubSample; |
|
124 |
+ |
|
125 |
+ PPMode ppMode; |
|
126 |
+} PPContext; |
|
127 |
+ |
0 | 128 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,3127 @@ |
0 |
+/* |
|
1 |
+ Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
2 |
+ |
|
3 |
+ This program is free software; you can redistribute it and/or modify |
|
4 |
+ it under the terms of the GNU General Public License as published by |
|
5 |
+ the Free Software Foundation; either version 2 of the License, or |
|
6 |
+ (at your option) any later version. |
|
7 |
+ |
|
8 |
+ This program is distributed in the hope that it will be useful, |
|
9 |
+ but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 |
+ GNU General Public License for more details. |
|
12 |
+ |
|
13 |
+ You should have received a copy of the GNU General Public License |
|
14 |
+ along with this program; if not, write to the Free Software |
|
15 |
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
16 |
+*/ |
|
17 |
+ |
|
18 |
/*
 * Helper macros which expand to asm text for the CPU selected by the
 * HAVE_* defines. They are undefined first so that they can be defined
 * again with a different set of HAVE_* defines.
 * In all of them the result ends up in the second argument.
 */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

// packed byte average of a and b
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

// packed unsigned byte minimum; t is a scratch register (only used by the plain MMX variant)
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
// a - max(a-b, 0) = min(a, b)
#define PMINUB(b,a,t) \
	"movq " #a ", " #t " \n\t"\
	"psubusb " #b ", " #t " \n\t"\
	"psubb " #t ", " #a " \n\t"
#endif

// packed unsigned byte maximum
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
// max(b-a, 0) + a = max(a, b)
#define PMAXUB(a,b) \
	"psubusb " #a ", " #b " \n\t"\
	"paddb " #a ", " #b " \n\t"
#endif
|
44 |
+ |
|
45 |
+ |
|
46 |
//FIXME? |255-0| = 1 (shouldnt be a problem ...)
#ifdef HAVE_MMX
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat
 * For each of the 8 columns the 7 differences between vertically adjacent
 * pixels of the 8 lines are tested; the number of differences which pass the
 * test is compared against c->ppMode.flatnessThreshold.
 * @return non zero if more than flatnessThreshold differences passed the test
 */
static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
	int numEq= 0;
	src+= stride*4; // src points to begin of the 8x8 Block
asm volatile(
		"leal (%1, %2), %%eax \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%1	eax	eax+%2	eax+2%2	%1+4%2	ecx	ecx+%2	ecx+2%2	%1+8%2	ecx+4%2
// (ecx is not used here, eax is advanced by 4 lines instead before lines 5-7 are read)
		"movq %3, %%mm7 \n\t" // offset added to every difference
		"movq %4, %%mm6 \n\t" // threshold the result is compared with

		"movq (%1), %%mm0 \n\t"
		"movq (%%eax), %%mm1 \n\t"
		"psubb %%mm1, %%mm0 \n\t" // mm0 = difference
		"paddb %%mm7, %%mm0 \n\t"
		"pcmpgtb %%mm6, %%mm0 \n\t" // -1 for every byte which passes the test

		"movq (%%eax,%2), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t" // mm0 accumulates the (negative) counts per column

		"movq (%%eax, %2, 2), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"leal (%%eax, %2, 4), %%eax \n\t" // eax now points to line 5

		"movq (%1, %2, 4), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"movq (%%eax), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		"movq (%%eax, %2), %%mm2 \n\t"
		"psubb %%mm2, %%mm1 \n\t"
		"paddb %%mm7, %%mm1 \n\t"
		"pcmpgtb %%mm6, %%mm1 \n\t"
		"paddb %%mm1, %%mm0 \n\t"

		"movq (%%eax, %2, 2), %%mm1 \n\t"
		"psubb %%mm1, %%mm2 \n\t"
		"paddb %%mm7, %%mm2 \n\t"
		"pcmpgtb %%mm6, %%mm2 \n\t"
		"paddb %%mm2, %%mm0 \n\t"

		" \n\t"
		// horizontal sum of the 8 per column counts into the lowest byte
#ifdef HAVE_MMX2
		"pxor %%mm7, %%mm7 \n\t"
		"psadbw %%mm7, %%mm0 \n\t"
#else
		"movq %%mm0, %%mm1 \n\t"
		"psrlw $8, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $16, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
		"movq %%mm0, %%mm1 \n\t"
		"psrlq $32, %%mm0 \n\t"
		"paddb %%mm1, %%mm0 \n\t"
#endif
		"movd %%mm0, %0 \n\t"
		: "=r" (numEq)
		: "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
		: "%eax"
		);
	// every hit was counted as -1, only the lowest byte of the sum is meaningful
	numEq= (-numEq) &0xFF;
	return numEq > c->ppMode.flatnessThreshold;
}
#endif
|
129 |
+ |
|
130 |
/**
 * Check if the difference between 2 lines of the block is small enough.
 * After src is advanced by 3 lines, the lines at offset 1 and 8 (in lines)
 * are compared for each of the 8 columns.
 * @return 1 if no column differs by more than 2*QP, 0 otherwise
 */
static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
{
#ifdef HAVE_MMX
	int isOk;
	src+= stride*3;
	asm volatile(
		"movq (%1, %2), %%mm0 \n\t"
		"movq (%1, %2, 8), %%mm1 \n\t"
		"movq %%mm0, %%mm2 \n\t"
		"psubusb %%mm1, %%mm0 \n\t"
		"psubusb %%mm2, %%mm1 \n\t"
		"por %%mm1, %%mm0 \n\t" // ABS Diff

		"movq %3, %%mm7 \n\t" // QP,..., QP
		"paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
		"psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
		"packssdw %%mm0, %%mm0 \n\t" // squeeze all 8 bytes into the low 32 bits
		"movd %%mm0, %0 \n\t"
		: "=r" (isOk)
		: "r" (src), "r" (stride), "m" (c->pQPb)
		);
	return isOk==0;
#else
#if 1
	int x;
	const int QP= c->QP;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		// single unsigned compare for: difference outside of -2QP ... 2QP
		if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
	}

	return 1;
#else
	// disabled alternative: checks max-min over all 8 lines of each column
	int x;
	const int QP= c->QP;
	src+= stride*4;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		int min=255;
		int max=0;
		int y;
		for(y=0; y<8; y++){
			int v= src[x + y*stride];
			if(v>max) max=v;
			if(v<min) min=v;
		}
		if(max-min > 2*QP) return 0;
	}
	return 1;
#endif
#endif
}
|
183 |
+ |
|
184 |
/**
 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 * src is advanced by 3 lines first; relative to that, lines 1-8 are written
 * and lines 0 and 9 are only read. Line 0 / 9 is replaced by line 1 / 8 in
 * the filter input if it differs too much (compared to QP) from it.
 */
static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	// NOTE(review): %0 is changed inside the asm (and restored at its end)
	// although it is declared as an input operand only
	asm volatile(	//"movv %0 %1 %2\n\t"
		"movq %2, %%mm0 \n\t" // QP,..., QP
		"pxor %%mm4, %%mm4 \n\t"

		"movq (%0), %%mm6 \n\t"
		"movq (%0, %1), %%mm5 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm6, %%mm2 \n\t"
		"psubusb %%mm6, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm6 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm6 \n\t"// First Line to Filter

		"movq (%0, %1, 8), %%mm5 \n\t"
		"leal (%0, %1, 4), %%eax \n\t"
		"leal (%0, %1, 8), %%ecx \n\t"
		"subl %1, %%ecx \n\t"
		"addl %1, %0 \n\t" // %0 points to line 1 not 0
		"movq (%0, %1, 8), %%mm7 \n\t"
		"movq %%mm5, %%mm1 \n\t"
		"movq %%mm7, %%mm2 \n\t"
		"psubusb %%mm7, %%mm5 \n\t"
		"psubusb %%mm1, %%mm2 \n\t"
		"por %%mm5, %%mm2 \n\t" // ABS Diff of lines
		"psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
		"pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF

		"pand %%mm2, %%mm7 \n\t"
		"pandn %%mm1, %%mm2 \n\t"
		"por %%mm2, %%mm7 \n\t" // Last Line to Filter


		// 1 2 3 4 5 6 7 8
		// %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
		// 6 4 2 2 1 1
		// 6 4 4 2
		// 6 8 2
		// the comments behind the PAVGB lines give the weights accumulated so far
		// and their divisor, "X" marks the store of a finished output line

		"movq (%0, %1), %%mm0 \n\t" // 1
		"movq %%mm0, %%mm1 \n\t" // 1
		PAVGB(%%mm6, %%mm0) //1 1 /2
		PAVGB(%%mm6, %%mm0) //3 1 /4

		"movq (%0, %1, 4), %%mm2 \n\t" // 1
		"movq %%mm2, %%mm5 \n\t" // 1
		PAVGB((%%eax), %%mm2) // 11 /2
		PAVGB((%0, %1, 2), %%mm2) // 211 /4
		"movq %%mm2, %%mm3 \n\t" // 211 /4
		"movq (%0), %%mm4 \n\t" // 1
		PAVGB(%%mm4, %%mm3) // 4 211 /8
		PAVGB(%%mm0, %%mm3) //642211 /16
		"movq %%mm3, (%0) \n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
		"movq %%mm1, %%mm0 \n\t" // 1
		PAVGB(%%mm6, %%mm0) //1 1 /2
		"movq %%mm4, %%mm3 \n\t" // 1
		PAVGB((%0,%1,2), %%mm3) // 1 1 /2
		PAVGB((%%eax,%1,2), %%mm5) // 11 /2
		PAVGB((%%eax), %%mm5) // 211 /4
		PAVGB(%%mm5, %%mm3) // 2 2211 /8
		PAVGB(%%mm0, %%mm3) //4242211 /16
		"movq %%mm3, (%0,%1) \n\t" // X
		// mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
		PAVGB(%%mm4, %%mm6) //11 /2
		"movq (%%ecx), %%mm0 \n\t" // 1
		PAVGB((%%eax, %1, 2), %%mm0) // 11/2
		"movq %%mm0, %%mm3 \n\t" // 11/2
		PAVGB(%%mm1, %%mm0) // 2 11/4
		PAVGB(%%mm6, %%mm0) //222 11/8
		PAVGB(%%mm2, %%mm0) //22242211/16
		"movq (%0, %1, 2), %%mm2 \n\t" // 1
		"movq %%mm0, (%0, %1, 2) \n\t" // X
		// mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
		"movq (%%eax, %1, 4), %%mm0 \n\t" // 1
		PAVGB((%%ecx), %%mm0) // 11 /2
		PAVGB(%%mm0, %%mm6) //11 11 /4
		PAVGB(%%mm1, %%mm4) // 11 /2
		PAVGB(%%mm2, %%mm1) // 11 /2
		PAVGB(%%mm1, %%mm6) //1122 11 /8
		PAVGB(%%mm5, %%mm6) //112242211 /16
		"movq (%%eax), %%mm5 \n\t" // 1
		"movq %%mm6, (%%eax) \n\t" // X
		// mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
		"movq (%%eax, %1, 4), %%mm6 \n\t" // 1
		PAVGB(%%mm7, %%mm6) // 11 /2
		PAVGB(%%mm4, %%mm6) // 11 11 /4
		PAVGB(%%mm3, %%mm6) // 11 2211 /8
		PAVGB(%%mm5, %%mm2) // 11 /2
		"movq (%0, %1, 4), %%mm4 \n\t" // 1
		PAVGB(%%mm4, %%mm2) // 112 /4
		PAVGB(%%mm2, %%mm6) // 112242211 /16
		"movq %%mm6, (%0, %1, 4) \n\t" // X
		// mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
		PAVGB(%%mm7, %%mm1) // 11 2 /4
		PAVGB(%%mm4, %%mm5) // 11 /2
		PAVGB(%%mm5, %%mm0) // 11 11 /4
		"movq (%%eax, %1, 2), %%mm6 \n\t" // 1
		PAVGB(%%mm6, %%mm1) // 11 4 2 /8
		PAVGB(%%mm0, %%mm1) // 11224222 /16
		"movq %%mm1, (%%eax, %1, 2) \n\t" // X
		// mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
		PAVGB((%%ecx), %%mm2) // 112 4 /8
		"movq (%%eax, %1, 4), %%mm0 \n\t" // 1
		PAVGB(%%mm0, %%mm6) // 1 1 /2
		PAVGB(%%mm7, %%mm6) // 1 12 /4
		PAVGB(%%mm2, %%mm6) // 1122424 /4
		"movq %%mm6, (%%ecx) \n\t" // X
		// mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
		PAVGB(%%mm7, %%mm5) // 11 2 /4
		PAVGB(%%mm7, %%mm5) // 11 6 /8

		PAVGB(%%mm3, %%mm0) // 112 /4
		PAVGB(%%mm0, %%mm5) // 112246 /16
		"movq %%mm5, (%%eax, %1, 4) \n\t" // X
		"subl %1, %0 \n\t" // undo the "addl" from above

		:
		: "r" (src), "r" (stride), "m" (c->pQPb)
		: "%eax", "%ecx"
	);
#else
	// offsets of the lines 1-9 relative to src
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		// line 0 / 9 is only used if it is close enough to its neighbour
		// NOTE(review): "<" here while the asm above selects on "<= QP"
		const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
		const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];

		// sums of adjacent lines, all read before any line is overwritten
		int sums[9];
		sums[0] = first + src[l1];
		sums[1] = src[l1] + src[l2];
		sums[2] = src[l2] + src[l3];
		sums[3] = src[l3] + src[l4];
		sums[4] = src[l4] + src[l5];
		sums[5] = src[l5] + src[l6];
		sums[6] = src[l6] + src[l7];
		sums[7] = src[l7] + src[l8];
		sums[8] = src[l8] + last;

		// +8 rounds the division by 16
		src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
		src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4;
		src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4;
		src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4;
		src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4;
		src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4;
		src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4;
		src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4;

		src++;
	}
#endif
}
|
358 |
+ |
|
359 |
#if 0
// NOTE: this whole function is disabled by the "#if 0" above
/**
 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar
 * values are correctly clipped (MMX2)
 * values are wraparound (C)
 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continuous gradient
 0 8 16 24
 x = 8
 x/2 = 4
 x/8 = 1
 1 12 12 23
 */
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
// FIXME rounding
	// NOTE(review): the asm reads the quantizer from MANGLE(pQPb), the QP
	// parameter is only used by the C version below
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t" // 0
		"movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ecx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1
		"movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP
		"movq %%mm0, %%mm1 \n\t" // QP,..., QP
		"paddusb "MANGLE(b02)", %%mm0 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4
		"paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ...
		"movq (%0, %1, 4), %%mm2 \n\t" // line 4
		"movq (%%ecx), %%mm3 \n\t" // line 5
		"movq %%mm2, %%mm4 \n\t" // line 4
		"pcmpeqb %%mm5, %%mm5 \n\t" // -1
		"pxor %%mm2, %%mm5 \n\t" // -line 4 - 1
		PAVGB(%%mm3, %%mm5)
		"paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2
		"psubusb %%mm3, %%mm4 \n\t"
		"psubusb %%mm2, %%mm3 \n\t"
		"por %%mm3, %%mm4 \n\t" // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t"
		"pand %%mm4, %%mm5 \n\t" // d/2

//		"paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80
		"paddb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%0,%1, 4) \n\t"

		"movq (%%ecx), %%mm2 \n\t"
//		"paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80
		"psubb %%mm5, %%mm2 \n\t"
//		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ecx) \n\t"

		"paddb %%mm6, %%mm5 \n\t"
		"psrlw $2, %%mm5 \n\t"
		"pand "MANGLE(b3F)", %%mm5 \n\t"
		"psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8

		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80
		"paddsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"

		"movq (%%ecx, %1), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80
		"psubsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ecx, %1) \n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ecx"
	);
#else
	// offsets of the lines relative to src
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
//	const int l7= stride + l6;
//	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	const int QP15= QP + (QP>>2); // QP*1.25
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int v = (src[x+l5] - src[x+l4]);
		if(ABS(v) < QP15)
		{
			// move lines 4/5 half and lines 3/6 an eighth of the step towards each other
			src[x+l3] +=v>>3;
			src[x+l4] +=v>>1;
			src[x+l5] -=v>>1;
			src[x+l6] -=v>>3;

		}
	}

#endif
}
#endif
|
464 |
+ |
|
465 |
+/** |
|
466 |
+ * Experimental Filter 1 |
|
467 |
+ * will not damage linear gradients |
|
468 |
+ * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
469 |
+ * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
470 |
+ * MMX2 version does correct clipping C version doesnt |
|
471 |
+ */ |
|
472 |
+static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
|
473 |
+{ |
|
474 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
475 |
+ src+= stride*3; |
|
476 |
+ |
|
477 |
+ asm volatile( |
|
478 |
+ "pxor %%mm7, %%mm7 \n\t" // 0 |
|
479 |
+ "leal (%0, %1), %%eax \n\t" |
|
480 |
+ "leal (%%eax, %1, 4), %%ecx \n\t" |
|
481 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
482 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
483 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
484 |
+ "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
|
485 |
+ "movq %%mm1, %%mm2 \n\t" // line 4 |
|
486 |
+ "psubusb %%mm0, %%mm1 \n\t" |
|
487 |
+ "psubusb %%mm2, %%mm0 \n\t" |
|
488 |
+ "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
|
489 |
+ "movq (%%ecx), %%mm3 \n\t" // line 5 |
|
490 |
+ "movq (%%ecx, %1), %%mm4 \n\t" // line 6 |
|
491 |
+ "movq %%mm3, %%mm5 \n\t" // line 5 |
|
492 |
+ "psubusb %%mm4, %%mm3 \n\t" |
|
493 |
+ "psubusb %%mm5, %%mm4 \n\t" |
|
494 |
+ "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
|
495 |
+ PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
|
496 |
+ "movq %%mm2, %%mm1 \n\t" // line 4 |
|
497 |
+ "psubusb %%mm5, %%mm2 \n\t" |
|
498 |
+ "movq %%mm2, %%mm4 \n\t" |
|
499 |
+ "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
|
500 |
+ "psubusb %%mm1, %%mm5 \n\t" |
|
501 |
+ "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
|
502 |
+ "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
|
503 |
+ "movq %%mm4, %%mm3 \n\t" // d |
|
504 |
+ "movq %2, %%mm0 \n\t" |
|
505 |
+ "paddusb %%mm0, %%mm0 \n\t" |
|
506 |
+ "psubusb %%mm0, %%mm4 \n\t" |
|
507 |
+ "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
|
508 |
+ "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
509 |
+ "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
|
510 |
+ |
|
511 |
+ PAVGB(%%mm7, %%mm3) // d/2 |
|
512 |
+ "movq %%mm3, %%mm1 \n\t" // d/2 |
|
513 |
+ PAVGB(%%mm7, %%mm3) // d/4 |
|
514 |
+ PAVGB(%%mm1, %%mm3) // 3*d/8 |
|
515 |
+ |
|
516 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
|
517 |
+ "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
518 |
+ "psubusb %%mm3, %%mm0 \n\t" |
|
519 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
520 |
+ "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
|
521 |
+ |
|
522 |
+ "movq (%%ecx), %%mm0 \n\t" // line 5 |
|
523 |
+ "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
524 |
+ "paddusb %%mm3, %%mm0 \n\t" |
|
525 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
526 |
+ "movq %%mm0, (%%ecx) \n\t" // line 5 |
|
527 |
+ |
|
528 |
+ PAVGB(%%mm7, %%mm1) // d/4 |
|
529 |
+ |
|
530 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
531 |
+ "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
532 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
533 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
534 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
|
535 |
+ |
|
536 |
+ "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
|
537 |
+ "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
538 |
+ "paddusb %%mm1, %%mm0 \n\t" |
|
539 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
540 |
+ "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
|
541 |
+ |
|
542 |
+ PAVGB(%%mm7, %%mm1) // d/8 |
|
543 |
+ |
|
544 |
+ "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
|
545 |
+ "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
|
546 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
547 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
548 |
+ "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
|
549 |
+ |
|
550 |
+ "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
|
551 |
+ "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
|
552 |
+ "paddusb %%mm1, %%mm0 \n\t" |
|
553 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
554 |
+ "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
|
555 |
+ |
|
556 |
+ : |
|
557 |
+ : "r" (src), "r" (stride), "m" (co->pQPb) |
|
558 |
+ : "%eax", "%ecx" |
|
559 |
+ ); |
|
560 |
+#else |
|
561 |
+ |
|
562 |
+ const int l1= stride; |
|
563 |
+ const int l2= stride + l1; |
|
564 |
+ const int l3= stride + l2; |
|
565 |
+ const int l4= stride + l3; |
|
566 |
+ const int l5= stride + l4; |
|
567 |
+ const int l6= stride + l5; |
|
568 |
+ const int l7= stride + l6; |
|
569 |
+// const int l8= stride + l7; |
|
570 |
+// const int l9= stride + l8; |
|
571 |
+ int x; |
|
572 |
+ |
|
573 |
+ src+= stride*3; |
|
574 |
+ for(x=0; x<BLOCK_SIZE; x++) |
|
575 |
+ { |
|
576 |
+ int a= src[l3] - src[l4]; |
|
577 |
+ int b= src[l4] - src[l5]; |
|
578 |
+ int c= src[l5] - src[l6]; |
|
579 |
+ |
|
580 |
+ int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
|
581 |
+ d= MAX(d, 0); |
|
582 |
+ |
|
583 |
+ if(d < co->QP*2) |
|
584 |
+ { |
|
585 |
+ int v = d * SIGN(-b); |
|
586 |
+ |
|
587 |
+ src[l2] +=v>>3; |
|
588 |
+ src[l3] +=v>>2; |
|
589 |
+ src[l4] +=(3*v)>>3; |
|
590 |
+ src[l5] -=(3*v)>>3; |
|
591 |
+ src[l6] -=v>>2; |
|
592 |
+ src[l7] -=v>>3; |
|
593 |
+ |
|
594 |
+ } |
|
595 |
+ src++; |
|
596 |
+ } |
|
597 |
+#endif |
|
598 |
+} |
|
599 |
+ |
|
600 |
+static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
|
601 |
+{ |
|
602 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
603 |
+/* |
|
604 |
+ uint8_t tmp[16]; |
|
605 |
+ const int l1= stride; |
|
606 |
+ const int l2= stride + l1; |
|
607 |
+ const int l3= stride + l2; |
|
608 |
+ const int l4= (int)tmp - (int)src - stride*3; |
|
609 |
+ const int l5= (int)tmp - (int)src - stride*3 + 8; |
|
610 |
+ const int l6= stride*3 + l3; |
|
611 |
+ const int l7= stride + l6; |
|
612 |
+ const int l8= stride + l7; |
|
613 |
+ |
|
614 |
+ memcpy(tmp, src+stride*7, 8); |
|
615 |
+ memcpy(tmp+8, src+stride*8, 8); |
|
616 |
+*/ |
|
617 |
+ src+= stride*4; |
|
618 |
+ asm volatile( |
|
619 |
+ |
|
620 |
+#if 0 //sligtly more accurate and slightly slower |
|
621 |
+ "pxor %%mm7, %%mm7 \n\t" // 0 |
|
622 |
+ "leal (%0, %1), %%eax \n\t" |
|
623 |
+ "leal (%%eax, %1, 4), %%ecx \n\t" |
|
624 |
+// 0 1 2 3 4 5 6 7 |
|
625 |
+// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
|
626 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
|
627 |
+ |
|
628 |
+ |
|
629 |
+ "movq (%0, %1, 2), %%mm0 \n\t" // l2 |
|
630 |
+ "movq (%0), %%mm1 \n\t" // l0 |
|
631 |
+ "movq %%mm0, %%mm2 \n\t" // l2 |
|
632 |
+ PAVGB(%%mm7, %%mm0) // ~l2/2 |
|
633 |
+ PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 |
|
634 |
+ PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 |
|
635 |
+ |
|
636 |
+ "movq (%%eax), %%mm1 \n\t" // l1 |
|
637 |
+ "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 |
|
638 |
+ "movq %%mm1, %%mm4 \n\t" // l1 |
|
639 |
+ PAVGB(%%mm7, %%mm1) // ~l1/2 |
|
640 |
+ PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 |
|
641 |
+ PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 |
|
642 |
+ |
|
643 |
+ "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 |
|
644 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
645 |
+ "psubusb %%mm4, %%mm1 \n\t" |
|
646 |
+ "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 |
|
647 |
+// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 |
|
648 |
+ |
|
649 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
|
650 |
+ "movq %%mm0, %%mm4 \n\t" // l4 |
|
651 |
+ PAVGB(%%mm7, %%mm0) // ~l4/2 |
|
652 |
+ PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 |
|
653 |
+ PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 |
|
654 |
+ |
|
655 |
+ "movq (%%ecx), %%mm2 \n\t" // l5 |
|
656 |
+ "movq %%mm3, %%mm5 \n\t" // l3 |
|
657 |
+ PAVGB(%%mm7, %%mm3) // ~l3/2 |
|
658 |
+ PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 |
|
659 |
+ PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 |
|
660 |
+ |
|
661 |
+ "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 |
|
662 |
+ "psubusb %%mm3, %%mm0 \n\t" |
|
663 |
+ "psubusb %%mm6, %%mm3 \n\t" |
|
664 |
+ "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 |
|
665 |
+ "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) |
|
666 |
+// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 |
|
667 |
+ |
|
668 |
+ "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
|
669 |
+ "movq %%mm6, %%mm5 \n\t" // l6 |
|
670 |
+ PAVGB(%%mm7, %%mm6) // ~l6/2 |
|
671 |
+ PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 |
|
672 |
+ PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 |
|
673 |
+ |
|
674 |
+ "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
|
675 |
+ "movq %%mm2, %%mm4 \n\t" // l5 |
|
676 |
+ PAVGB(%%mm7, %%mm2) // ~l5/2 |
|
677 |
+ PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 |
|
678 |
+ PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 |
|
679 |
+ |
|
680 |
+ "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 |
|
681 |
+ "psubusb %%mm2, %%mm6 \n\t" |
|
682 |
+ "psubusb %%mm4, %%mm2 \n\t" |
|
683 |
+ "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 |
|
684 |
+// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 |
|
685 |
+ |
|
686 |
+ |
|
687 |
+ PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 |
|
688 |
+ "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
|
689 |
+ "paddusb "MANGLE(b01)", %%mm4 \n\t" |
|
690 |
+ "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
|
691 |
+ "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 |
|
692 |
+ "pand %%mm4, %%mm3 \n\t" |
|
693 |
+ |
|
694 |
+ "movq %%mm3, %%mm1 \n\t" |
|
695 |
+// "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
696 |
+ PAVGB(%%mm7, %%mm3) |
|
697 |
+ PAVGB(%%mm7, %%mm3) |
|
698 |
+ "paddusb %%mm1, %%mm3 \n\t" |
|
699 |
+// "paddusb "MANGLE(b01)", %%mm3 \n\t" |
|
700 |
+ |
|
701 |
+ "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 |
|
702 |
+ "movq (%0, %1, 4), %%mm5 \n\t" //l4 |
|
703 |
+ "movq (%0, %1, 4), %%mm4 \n\t" //l4 |
|
704 |
+ "psubusb %%mm6, %%mm5 \n\t" |
|
705 |
+ "psubusb %%mm4, %%mm6 \n\t" |
|
706 |
+ "por %%mm6, %%mm5 \n\t" // |l3-l4| |
|
707 |
+ "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) |
|
708 |
+ "pxor %%mm6, %%mm0 \n\t" |
|
709 |
+ "pand %%mm0, %%mm3 \n\t" |
|
710 |
+ PMINUB(%%mm5, %%mm3, %%mm0) |
|
711 |
+ |
|
712 |
+ "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
713 |
+ PAVGB(%%mm7, %%mm3) |
|
714 |
+ |
|
715 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
716 |
+ "movq (%0, %1, 4), %%mm2 \n\t" |
|
717 |
+ "pxor %%mm6, %%mm0 \n\t" |
|
718 |
+ "pxor %%mm6, %%mm2 \n\t" |
|
719 |
+ "psubb %%mm3, %%mm0 \n\t" |
|
720 |
+ "paddb %%mm3, %%mm2 \n\t" |
|
721 |
+ "pxor %%mm6, %%mm0 \n\t" |
|
722 |
+ "pxor %%mm6, %%mm2 \n\t" |
|
723 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
724 |
+ "movq %%mm2, (%0, %1, 4) \n\t" |
|
725 |
+#endif |
|
726 |
+ |
|
727 |
+ "leal (%0, %1), %%eax \n\t" |
|
728 |
+ "pcmpeqb %%mm6, %%mm6 \n\t" // -1 |
|
729 |
+// 0 1 2 3 4 5 6 7 |
|
730 |
+// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
|
731 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
|
732 |
+ |
|
733 |
+ |
|
734 |
+ "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 |
|
735 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
|
736 |
+ "pxor %%mm6, %%mm1 \n\t" // -l3-1 |
|
737 |
+ PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 |
|
738 |
+// mm1=-l3-1, mm0=128-q |
|
739 |
+ |
|
740 |
+ "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 |
|
741 |
+ "movq (%%eax, %1), %%mm3 \n\t" // l2 |
|
742 |
+ "pxor %%mm6, %%mm2 \n\t" // -l5-1 |
|
743 |
+ "movq %%mm2, %%mm5 \n\t" // -l5-1 |
|
744 |
+ "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
|
745 |
+ "leal (%%eax, %1, 4), %%ecx \n\t" |
|
746 |
+ PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
|
747 |
+ PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 |
|
748 |
+ PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 |
|
749 |
+ PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 |
|
750 |
+// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 |
|
751 |
+ |
|
752 |
+ "movq (%%eax), %%mm2 \n\t" // l1 |
|
753 |
+ "pxor %%mm6, %%mm2 \n\t" // -l1-1 |
|
754 |
+ PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 |
|
755 |
+ PAVGB((%0), %%mm1) // (l0-l3+256)/2 |
|
756 |
+ "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
|
757 |
+ PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
|
758 |
+ PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 |
|
759 |
+ PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 |
|
760 |
+// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 |
|
761 |
+ |
|
762 |
+ PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
|
763 |
+ "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 |
|
764 |
+ "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
|
765 |
+ PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 |
|
766 |
+ "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
|
767 |
+ PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
|
768 |
+ PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 |
|
769 |
+ PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 |
|
770 |
+// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 |
|
771 |
+ |
|
772 |
+ "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
|
773 |
+ "movq "MANGLE(b00)", %%mm5 \n\t" // 0 |
|
774 |
+ "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
|
775 |
+ "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 |
|
776 |
+ PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| |
|
777 |
+ PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| |
|
778 |
+ PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 |
|
779 |
+ |
|
780 |
+// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 |
|
781 |
+ |
|
782 |
+ "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
|
783 |
+ "movq %2, %%mm2 \n\t" // QP |
|
784 |
+ PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
|
785 |
+ "psubb %%mm6, %%mm2 \n\t" |
|
786 |
+ |
|
787 |
+ "movq %%mm4, %%mm1 \n\t" |
|
788 |
+ "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) |
|
789 |
+ "pxor %%mm1, %%mm4 \n\t" |
|
790 |
+ "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 |
|
791 |
+ "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 |
|
792 |
+ "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 |
|
793 |
+// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 |
|
794 |
+ |
|
795 |
+ "movq %%mm4, %%mm3 \n\t" // d |
|
796 |
+ "psubusb "MANGLE(b01)", %%mm4 \n\t" |
|
797 |
+ PAVGB(%%mm7, %%mm4) // d/32 |
|
798 |
+ PAVGB(%%mm7, %%mm4) // (d + 32)/64 |
|
799 |
+ "paddb %%mm3, %%mm4 \n\t" // 5d/64 |
|
800 |
+ "pand %%mm2, %%mm4 \n\t" |
|
801 |
+ |
|
802 |
+ "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
|
803 |
+ "psubb %%mm0, %%mm5 \n\t" // q |
|
804 |
+ "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding |
|
805 |
+ "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) |
|
806 |
+ "pxor %%mm7, %%mm5 \n\t" |
|
807 |
+ |
|
808 |
+ PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) |
|
809 |
+ "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) |
|
810 |
+ |
|
811 |
+ "pand %%mm7, %%mm4 \n\t" |
|
812 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
813 |
+ "movq (%0, %1, 4), %%mm2 \n\t" |
|
814 |
+ "pxor %%mm1, %%mm0 \n\t" |
|
815 |
+ "pxor %%mm1, %%mm2 \n\t" |
|
816 |
+ "paddb %%mm4, %%mm0 \n\t" |
|
817 |
+ "psubb %%mm4, %%mm2 \n\t" |
|
818 |
+ "pxor %%mm1, %%mm0 \n\t" |
|
819 |
+ "pxor %%mm1, %%mm2 \n\t" |
|
820 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
821 |
+ "movq %%mm2, (%0, %1, 4) \n\t" |
|
822 |
+ |
|
823 |
+ : |
|
824 |
+ : "r" (src), "r" (stride), "m" (c->pQPb) |
|
825 |
+ : "%eax", "%ecx" |
|
826 |
+ ); |
|
827 |
+ |
|
828 |
+/* |
|
829 |
+ { |
|
830 |
+ int x; |
|
831 |
+ src-= stride; |
|
832 |
+ for(x=0; x<BLOCK_SIZE; x++) |
|
833 |
+ { |
|
834 |
+ const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
|
835 |
+ if(ABS(middleEnergy)< 8*QP) |
|
836 |
+ { |
|
837 |
+ const int q=(src[l4] - src[l5])/2; |
|
838 |
+ const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
839 |
+ const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
840 |
+ |
|
841 |
+ int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
842 |
+ d= MAX(d, 0); |
|
843 |
+ |
|
844 |
+ d= (5*d + 32) >> 6; |
|
845 |
+ d*= SIGN(-middleEnergy); |
|
846 |
+ |
|
847 |
+ if(q>0) |
|
848 |
+ { |
|
849 |
+ d= d<0 ? 0 : d; |
|
850 |
+ d= d>q ? q : d; |
|
851 |
+ } |
|
852 |
+ else |
|
853 |
+ { |
|
854 |
+ d= d>0 ? 0 : d; |
|
855 |
+ d= d<q ? q : d; |
|
856 |
+ } |
|
857 |
+ |
|
858 |
+ src[l4]-= d; |
|
859 |
+ src[l5]+= d; |
|
860 |
+ } |
|
861 |
+ src++; |
|
862 |
+ } |
|
863 |
+src-=8; |
|
864 |
+ for(x=0; x<8; x++) |
|
865 |
+ { |
|
866 |
+ int y; |
|
867 |
+ for(y=4; y<6; y++) |
|
868 |
+ { |
|
869 |
+ int d= src[x+y*stride] - tmp[x+(y-4)*8]; |
|
870 |
+ int ad= ABS(d); |
|
871 |
+ static int max=0; |
|
872 |
+ static int sum=0; |
|
873 |
+ static int num=0; |
|
874 |
+ static int bias=0; |
|
875 |
+ |
|
876 |
+ if(max<ad) max=ad; |
|
877 |
+ sum+= ad>3 ? 1 : 0; |
|
878 |
+ if(ad>3) |
|
879 |
+ { |
|
880 |
+ src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; |
|
881 |
+ } |
|
882 |
+ if(y==4) bias+=d; |
|
883 |
+ num++; |
|
884 |
+ if(num%1000000 == 0) |
|
885 |
+ { |
|
886 |
+ printf(" %d %d %d %d\n", num, sum, max, bias); |
|
887 |
+ } |
|
888 |
+ } |
|
889 |
+ } |
|
890 |
+} |
|
891 |
+*/ |
|
892 |
+#elif defined (HAVE_MMX) |
|
893 |
+ src+= stride*4; |
|
894 |
+ asm volatile( |
|
895 |
+ "pxor %%mm7, %%mm7 \n\t" |
|
896 |
+ "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
|
897 |
+ "andl $0xFFFFFFF8, %%ecx \n\t" // align |
|
898 |
+// 0 1 2 3 4 5 6 7 |
|
899 |
+// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
|
900 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 |
|
901 |
+ |
|
902 |
+ "movq (%0), %%mm0 \n\t" |
|
903 |
+ "movq %%mm0, %%mm1 \n\t" |
|
904 |
+ "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
905 |
+ "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
906 |
+ |
|
907 |
+ "movq (%0, %1), %%mm2 \n\t" |
|
908 |
+ "leal (%0, %1, 2), %%eax \n\t" |
|
909 |
+ "movq %%mm2, %%mm3 \n\t" |
|
910 |
+ "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
911 |
+ "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
912 |
+ |
|
913 |
+ "movq (%%eax), %%mm4 \n\t" |
|
914 |
+ "movq %%mm4, %%mm5 \n\t" |
|
915 |
+ "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
916 |
+ "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
917 |
+ |
|
918 |
+ "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
919 |
+ "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
920 |
+ "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
921 |
+ "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
922 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
923 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
924 |
+ |
|
925 |
+ "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
926 |
+ "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
927 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
928 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
929 |
+ |
|
930 |
+ "movq (%%eax, %1), %%mm2 \n\t" |
|
931 |
+ "movq %%mm2, %%mm3 \n\t" |
|
932 |
+ "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
933 |
+ "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
934 |
+ |
|
935 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
936 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
937 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
938 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
939 |
+ "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
940 |
+ "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
941 |
+ |
|
942 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
943 |
+ "movq %%mm0, %%mm1 \n\t" |
|
944 |
+ "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
945 |
+ "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
946 |
+ |
|
947 |
+ "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
948 |
+ "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
|
949 |
+ "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
|
950 |
+ "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 |
|
951 |
+ "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
952 |
+ "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
953 |
+ "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
954 |
+ "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
955 |
+ |
|
956 |
+ "leal (%%eax, %1), %0 \n\t" |
|
957 |
+ "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
958 |
+ "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
959 |
+ "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
960 |
+ "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
961 |
+//50 opcodes so far |
|
962 |
+ "movq (%0, %1, 2), %%mm2 \n\t" |
|
963 |
+ "movq %%mm2, %%mm3 \n\t" |
|
964 |
+ "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
965 |
+ "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
966 |
+ "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
967 |
+ "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
968 |
+ "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
969 |
+ "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
970 |
+ |
|
971 |
+ "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
972 |
+ "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
973 |
+ "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
|
974 |
+ "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
975 |
+ "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
976 |
+ "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
977 |
+ |
|
978 |
+ "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
979 |
+ "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
980 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
981 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
982 |
+ |
|
983 |
+ "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
984 |
+ "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
985 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
986 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
987 |
+ |
|
988 |
+ "movq (%0, %1, 4), %%mm2 \n\t" |
|
989 |
+ "movq %%mm2, %%mm3 \n\t" |
|
990 |
+ "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
991 |
+ "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
992 |
+ |
|
993 |
+ "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
994 |
+ "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
995 |
+ "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
996 |
+ "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
997 |
+ |
|
998 |
+ "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
999 |
+ "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
1000 |
+ |
|
1001 |
+#ifdef HAVE_MMX2 |
|
1002 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1003 |
+ "psubw %%mm0, %%mm6 \n\t" |
|
1004 |
+ "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
1005 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1006 |
+ "psubw %%mm1, %%mm6 \n\t" |
|
1007 |
+ "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
1008 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1009 |
+ "psubw %%mm2, %%mm6 \n\t" |
|
1010 |
+ "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
1011 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1012 |
+ "psubw %%mm3, %%mm6 \n\t" |
|
1013 |
+ "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
|
1014 |
+#else |
|
1015 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1016 |
+ "pcmpgtw %%mm0, %%mm6 \n\t" |
|
1017 |
+ "pxor %%mm6, %%mm0 \n\t" |
|
1018 |
+ "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
1019 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1020 |
+ "pcmpgtw %%mm1, %%mm6 \n\t" |
|
1021 |
+ "pxor %%mm6, %%mm1 \n\t" |
|
1022 |
+ "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
1023 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1024 |
+ "pcmpgtw %%mm2, %%mm6 \n\t" |
|
1025 |
+ "pxor %%mm6, %%mm2 \n\t" |
|
1026 |
+ "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
1027 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1028 |
+ "pcmpgtw %%mm3, %%mm6 \n\t" |
|
1029 |
+ "pxor %%mm6, %%mm3 \n\t" |
|
1030 |
+ "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
|
1031 |
+#endif |
|
1032 |
+ |
|
1033 |
+#ifdef HAVE_MMX2 |
|
1034 |
+ "pminsw %%mm2, %%mm0 \n\t" |
|
1035 |
+ "pminsw %%mm3, %%mm1 \n\t" |
|
1036 |
+#else |
|
1037 |
+ "movq %%mm0, %%mm6 \n\t" |
|
1038 |
+ "psubusw %%mm2, %%mm6 \n\t" |
|
1039 |
+ "psubw %%mm6, %%mm0 \n\t" |
|
1040 |
+ "movq %%mm1, %%mm6 \n\t" |
|
1041 |
+ "psubusw %%mm3, %%mm6 \n\t" |
|
1042 |
+ "psubw %%mm6, %%mm1 \n\t" |
|
1043 |
+#endif |
|
1044 |
+ |
|
1045 |
+ "movq %%mm7, %%mm6 \n\t" // 0 |
|
1046 |
+ "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
1047 |
+ "pxor %%mm6, %%mm4 \n\t" |
|
1048 |
+ "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
1049 |
+ "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
1050 |
+ "pxor %%mm7, %%mm5 \n\t" |
|
1051 |
+ "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
1052 |
+// 100 opcodes |
|
1053 |
+ "movd %2, %%mm2 \n\t" // QP |
|
1054 |
+ "psllw $3, %%mm2 \n\t" // 8QP |
|
1055 |
+ "movq %%mm2, %%mm3 \n\t" // 8QP |
|
1056 |
+ "pcmpgtw %%mm4, %%mm2 \n\t" |
|
1057 |
+ "pcmpgtw %%mm5, %%mm3 \n\t" |
|
1058 |
+ "pand %%mm2, %%mm4 \n\t" |
|
1059 |
+ "pand %%mm3, %%mm5 \n\t" |
|
1060 |
+ |
|
1061 |
+ |
|
1062 |
+ "psubusw %%mm0, %%mm4 \n\t" // hd |
|
1063 |
+ "psubusw %%mm1, %%mm5 \n\t" // ld |
|
1064 |
+ |
|
1065 |
+ |
|
1066 |
+ "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
1067 |
+ "pmullw %%mm2, %%mm4 \n\t" |
|
1068 |
+ "pmullw %%mm2, %%mm5 \n\t" |
|
1069 |
+ "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
1070 |
+ "paddw %%mm2, %%mm4 \n\t" |
|
1071 |
+ "paddw %%mm2, %%mm5 \n\t" |
|
1072 |
+ "psrlw $6, %%mm4 \n\t" |
|
1073 |
+ "psrlw $6, %%mm5 \n\t" |
|
1074 |
+ |
|
1075 |
+ "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
|
1076 |
+ "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 |
|
1077 |
+ |
|
1078 |
+ "pxor %%mm2, %%mm2 \n\t" |
|
1079 |
+ "pxor %%mm3, %%mm3 \n\t" |
|
1080 |
+ |
|
1081 |
+ "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
1082 |
+ "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
1083 |
+ "pxor %%mm2, %%mm0 \n\t" |
|
1084 |
+ "pxor %%mm3, %%mm1 \n\t" |
|
1085 |
+ "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
1086 |
+ "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
1087 |
+ "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
1088 |
+ "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
1089 |
+ |
|
1090 |
+ "pxor %%mm6, %%mm2 \n\t" |
|
1091 |
+ "pxor %%mm7, %%mm3 \n\t" |
|
1092 |
+ "pand %%mm2, %%mm4 \n\t" |
|
1093 |
+ "pand %%mm3, %%mm5 \n\t" |
|
1094 |
+ |
|
1095 |
+#ifdef HAVE_MMX2 |
|
1096 |
+ "pminsw %%mm0, %%mm4 \n\t" |
|
1097 |
+ "pminsw %%mm1, %%mm5 \n\t" |
|
1098 |
+#else |
|
1099 |
+ "movq %%mm4, %%mm2 \n\t" |
|
1100 |
+ "psubusw %%mm0, %%mm2 \n\t" |
|
1101 |
+ "psubw %%mm2, %%mm4 \n\t" |
|
1102 |
+ "movq %%mm5, %%mm2 \n\t" |
|
1103 |
+ "psubusw %%mm1, %%mm2 \n\t" |
|
1104 |
+ "psubw %%mm2, %%mm5 \n\t" |
|
1105 |
+#endif |
|
1106 |
+ "pxor %%mm6, %%mm4 \n\t" |
|
1107 |
+ "pxor %%mm7, %%mm5 \n\t" |
|
1108 |
+ "psubw %%mm6, %%mm4 \n\t" |
|
1109 |
+ "psubw %%mm7, %%mm5 \n\t" |
|
1110 |
+ "packsswb %%mm5, %%mm4 \n\t" |
|
1111 |
+ "movq (%0), %%mm0 \n\t" |
|
1112 |
+ "paddb %%mm4, %%mm0 \n\t" |
|
1113 |
+ "movq %%mm0, (%0) \n\t" |
|
1114 |
+ "movq (%0, %1), %%mm0 \n\t" |
|
1115 |
+ "psubb %%mm4, %%mm0 \n\t" |
|
1116 |
+ "movq %%mm0, (%0, %1) \n\t" |
|
1117 |
+ |
|
1118 |
+ : "+r" (src) |
|
1119 |
+ : "r" (stride), "m" (c->pQPb) |
|
1120 |
+ : "%eax", "%ecx" |
|
1121 |
+ ); |
|
1122 |
+#else |
|
1123 |
+ const int l1= stride; |
|
1124 |
+ const int l2= stride + l1; |
|
1125 |
+ const int l3= stride + l2; |
|
1126 |
+ const int l4= stride + l3; |
|
1127 |
+ const int l5= stride + l4; |
|
1128 |
+ const int l6= stride + l5; |
|
1129 |
+ const int l7= stride + l6; |
|
1130 |
+ const int l8= stride + l7; |
|
1131 |
+// const int l9= stride + l8; |
|
1132 |
+ int x; |
|
1133 |
+ src+= stride*3; |
|
1134 |
+ for(x=0; x<BLOCK_SIZE; x++) |
|
1135 |
+ { |
|
1136 |
+ const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
|
1137 |
+ if(ABS(middleEnergy) < 8*c->QP) |
|
1138 |
+ { |
|
1139 |
+ const int q=(src[l4] - src[l5])/2; |
|
1140 |
+ const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
1141 |
+ const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
1142 |
+ |
|
1143 |
+ int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
1144 |
+ d= MAX(d, 0); |
|
1145 |
+ |
|
1146 |
+ d= (5*d + 32) >> 6; |
|
1147 |
+ d*= SIGN(-middleEnergy); |
|
1148 |
+ |
|
1149 |
+ if(q>0) |
|
1150 |
+ { |
|
1151 |
+ d= d<0 ? 0 : d; |
|
1152 |
+ d= d>q ? q : d; |
|
1153 |
+ } |
|
1154 |
+ else |
|
1155 |
+ { |
|
1156 |
+ d= d>0 ? 0 : d; |
|
1157 |
+ d= d<q ? q : d; |
|
1158 |
+ } |
|
1159 |
+ |
|
1160 |
+ src[l4]-= d; |
|
1161 |
+ src[l5]+= d; |
|
1162 |
+ } |
|
1163 |
+ src++; |
|
1164 |
+ } |
|
1165 |
+#endif |
|
1166 |
+} |
|
1167 |
+ |
|
1168 |
+static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
|
1169 |
+{ |
|
1170 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1171 |
+ asm volatile( |
|
1172 |
+ "pxor %%mm6, %%mm6 \n\t" |
|
1173 |
+ "pcmpeqb %%mm7, %%mm7 \n\t" |
|
1174 |
+ "movq %2, %%mm0 \n\t" |
|
1175 |
+ "punpcklbw %%mm6, %%mm0 \n\t" |
|
1176 |
+ "psrlw $1, %%mm0 \n\t" |
|
1177 |
+ "psubw %%mm7, %%mm0 \n\t" |
|
1178 |
+ "packuswb %%mm0, %%mm0 \n\t" |
|
1179 |
+ "movq %%mm0, %3 \n\t" |
|
1180 |
+ |
|
1181 |
+ "leal (%0, %1), %%eax \n\t" |
|
1182 |
+ "leal (%%eax, %1, 4), %%edx \n\t" |
|
1183 |
+ |
|
1184 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1185 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1186 |
+ |
|
1187 |
+#undef FIND_MIN_MAX |
|
1188 |
+#ifdef HAVE_MMX2 |
|
1189 |
+#define FIND_MIN_MAX(addr)\ |
|
1190 |
+ "movq " #addr ", %%mm0 \n\t"\ |
|
1191 |
+ "pminub %%mm0, %%mm7 \n\t"\ |
|
1192 |
+ "pmaxub %%mm0, %%mm6 \n\t" |
|
1193 |
+#else |
|
1194 |
+#define FIND_MIN_MAX(addr)\ |
|
1195 |
+ "movq " #addr ", %%mm0 \n\t"\ |
|
1196 |
+ "movq %%mm7, %%mm1 \n\t"\ |
|
1197 |
+ "psubusb %%mm0, %%mm6 \n\t"\ |
|
1198 |
+ "paddb %%mm0, %%mm6 \n\t"\ |
|
1199 |
+ "psubusb %%mm0, %%mm1 \n\t"\ |
|
1200 |
+ "psubb %%mm1, %%mm7 \n\t" |
|
1201 |
+#endif |
|
1202 |
+ |
|
1203 |
+FIND_MIN_MAX((%%eax)) |
|
1204 |
+FIND_MIN_MAX((%%eax, %1)) |
|
1205 |
+FIND_MIN_MAX((%%eax, %1, 2)) |
|
1206 |
+FIND_MIN_MAX((%0, %1, 4)) |
|
1207 |
+FIND_MIN_MAX((%%edx)) |
|
1208 |
+FIND_MIN_MAX((%%edx, %1)) |
|
1209 |
+FIND_MIN_MAX((%%edx, %1, 2)) |
|
1210 |
+FIND_MIN_MAX((%0, %1, 8)) |
|
1211 |
+ |
|
1212 |
+ "movq %%mm7, %%mm4 \n\t" |
|
1213 |
+ "psrlq $8, %%mm7 \n\t" |
|
1214 |
+#ifdef HAVE_MMX2 |
|
1215 |
+ "pminub %%mm4, %%mm7 \n\t" // min of pixels |
|
1216 |
+ "pshufw $0xF9, %%mm7, %%mm4 \n\t" |
|
1217 |
+ "pminub %%mm4, %%mm7 \n\t" // min of pixels |
|
1218 |
+ "pshufw $0xFE, %%mm7, %%mm4 \n\t" |
|
1219 |
+ "pminub %%mm4, %%mm7 \n\t" |
|
1220 |
+#else |
|
1221 |
+ "movq %%mm7, %%mm1 \n\t" |
|
1222 |
+ "psubusb %%mm4, %%mm1 \n\t" |
|
1223 |
+ "psubb %%mm1, %%mm7 \n\t" |
|
1224 |
+ "movq %%mm7, %%mm4 \n\t" |
|
1225 |
+ "psrlq $16, %%mm7 \n\t" |
|
1226 |
+ "movq %%mm7, %%mm1 \n\t" |
|
1227 |
+ "psubusb %%mm4, %%mm1 \n\t" |
|
1228 |
+ "psubb %%mm1, %%mm7 \n\t" |
|
1229 |
+ "movq %%mm7, %%mm4 \n\t" |
|
1230 |
+ "psrlq $32, %%mm7 \n\t" |
|
1231 |
+ "movq %%mm7, %%mm1 \n\t" |
|
1232 |
+ "psubusb %%mm4, %%mm1 \n\t" |
|
1233 |
+ "psubb %%mm1, %%mm7 \n\t" |
|
1234 |
+#endif |
|
1235 |
+ |
|
1236 |
+ |
|
1237 |
+ "movq %%mm6, %%mm4 \n\t" |
|
1238 |
+ "psrlq $8, %%mm6 \n\t" |
|
1239 |
+#ifdef HAVE_MMX2 |
|
1240 |
+ "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
|
1241 |
+ "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
|
1242 |
+ "pmaxub %%mm4, %%mm6 \n\t" |
|
1243 |
+ "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
|
1244 |
+ "pmaxub %%mm4, %%mm6 \n\t" |
|
1245 |
+#else |
|
1246 |
+ "psubusb %%mm4, %%mm6 \n\t" |
|
1247 |
+ "paddb %%mm4, %%mm6 \n\t" |
|
1248 |
+ "movq %%mm6, %%mm4 \n\t" |
|
1249 |
+ "psrlq $16, %%mm6 \n\t" |
|
1250 |
+ "psubusb %%mm4, %%mm6 \n\t" |
|
1251 |
+ "paddb %%mm4, %%mm6 \n\t" |
|
1252 |
+ "movq %%mm6, %%mm4 \n\t" |
|
1253 |
+ "psrlq $32, %%mm6 \n\t" |
|
1254 |
+ "psubusb %%mm4, %%mm6 \n\t" |
|
1255 |
+ "paddb %%mm4, %%mm6 \n\t" |
|
1256 |
+#endif |
|
1257 |
+ "movq %%mm6, %%mm0 \n\t" // max |
|
1258 |
+ "psubb %%mm7, %%mm6 \n\t" // max - min |
|
1259 |
+ "movd %%mm6, %%ecx \n\t" |
|
1260 |
+ "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
|
1261 |
+ " jb 1f \n\t" |
|
1262 |
+ "leal -24(%%esp), %%ecx \n\t" |
|
1263 |
+ "andl $0xFFFFFFF8, %%ecx \n\t" |
|
1264 |
+ PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
|
1265 |
+ "punpcklbw %%mm7, %%mm7 \n\t" |
|
1266 |
+ "punpcklbw %%mm7, %%mm7 \n\t" |
|
1267 |
+ "punpcklbw %%mm7, %%mm7 \n\t" |
|
1268 |
+ "movq %%mm7, (%%ecx) \n\t" |
|
1269 |
+ |
|
1270 |
+ "movq (%0), %%mm0 \n\t" // L10 |
|
1271 |
+ "movq %%mm0, %%mm1 \n\t" // L10 |
|
1272 |
+ "movq %%mm0, %%mm2 \n\t" // L10 |
|
1273 |
+ "psllq $8, %%mm1 \n\t" |
|
1274 |
+ "psrlq $8, %%mm2 \n\t" |
|
1275 |
+ "movd -4(%0), %%mm3 \n\t" |
|
1276 |
+ "movd 8(%0), %%mm4 \n\t" |
|
1277 |
+ "psrlq $24, %%mm3 \n\t" |
|
1278 |
+ "psllq $56, %%mm4 \n\t" |
|
1279 |
+ "por %%mm3, %%mm1 \n\t" // L00 |
|
1280 |
+ "por %%mm4, %%mm2 \n\t" // L20 |
|
1281 |
+ "movq %%mm1, %%mm3 \n\t" // L00 |
|
1282 |
+ PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 |
|
1283 |
+ PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 |
|
1284 |
+ "psubusb %%mm7, %%mm0 \n\t" |
|
1285 |
+ "psubusb %%mm7, %%mm2 \n\t" |
|
1286 |
+ "psubusb %%mm7, %%mm3 \n\t" |
|
1287 |
+ "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
|
1288 |
+ "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 |
|
1289 |
+ "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 |
|
1290 |
+ "paddb %%mm2, %%mm0 \n\t" |
|
1291 |
+ "paddb %%mm3, %%mm0 \n\t" |
|
1292 |
+ |
|
1293 |
+ "movq (%%eax), %%mm2 \n\t" // L11 |
|
1294 |
+ "movq %%mm2, %%mm3 \n\t" // L11 |
|
1295 |
+ "movq %%mm2, %%mm4 \n\t" // L11 |
|
1296 |
+ "psllq $8, %%mm3 \n\t" |
|
1297 |
+ "psrlq $8, %%mm4 \n\t" |
|
1298 |
+ "movd -4(%%eax), %%mm5 \n\t" |
|
1299 |
+ "movd 8(%%eax), %%mm6 \n\t" |
|
1300 |
+ "psrlq $24, %%mm5 \n\t" |
|
1301 |
+ "psllq $56, %%mm6 \n\t" |
|
1302 |
+ "por %%mm5, %%mm3 \n\t" // L01 |
|
1303 |
+ "por %%mm6, %%mm4 \n\t" // L21 |
|
1304 |
+ "movq %%mm3, %%mm5 \n\t" // L01 |
|
1305 |
+ PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 |
|
1306 |
+ PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 |
|
1307 |
+ "psubusb %%mm7, %%mm2 \n\t" |
|
1308 |
+ "psubusb %%mm7, %%mm4 \n\t" |
|
1309 |
+ "psubusb %%mm7, %%mm5 \n\t" |
|
1310 |
+ "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
|
1311 |
+ "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 |
|
1312 |
+ "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 |
|
1313 |
+ "paddb %%mm4, %%mm2 \n\t" |
|
1314 |
+ "paddb %%mm5, %%mm2 \n\t" |
|
1315 |
+// 0, 2, 3, 1 |
|
1316 |
+#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
|
1317 |
+ "movq " #src ", " #sx " \n\t" /* src[0] */\ |
|
1318 |
+ "movq " #sx ", " #lx " \n\t" /* src[0] */\ |
|
1319 |
+ "movq " #sx ", " #t0 " \n\t" /* src[0] */\ |
|
1320 |
+ "psllq $8, " #lx " \n\t"\ |
|
1321 |
+ "psrlq $8, " #t0 " \n\t"\ |
|
1322 |
+ "movd -4" #src ", " #t1 " \n\t"\ |
|
1323 |
+ "psrlq $24, " #t1 " \n\t"\ |
|
1324 |
+ "por " #t1 ", " #lx " \n\t" /* src[-1] */\ |
|
1325 |
+ "movd 8" #src ", " #t1 " \n\t"\ |
|
1326 |
+ "psllq $56, " #t1 " \n\t"\ |
|
1327 |
+ "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ |
|
1328 |
+ "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ |
|
1329 |
+ PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ |
|
1330 |
+ PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ |
|
1331 |
+ PAVGB(lx, pplx) \ |
|
1332 |
+ "movq " #lx ", 8(%%ecx) \n\t"\ |
|
1333 |
+ "movq (%%ecx), " #lx " \n\t"\ |
|
1334 |
+ "psubusb " #lx ", " #t1 " \n\t"\ |
|
1335 |
+ "psubusb " #lx ", " #t0 " \n\t"\ |
|
1336 |
+ "psubusb " #lx ", " #sx " \n\t"\ |
|
1337 |
+ "movq "MANGLE(b00)", " #lx " \n\t"\ |
|
1338 |
+ "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
|
1339 |
+ "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ |
|
1340 |
+ "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ |
|
1341 |
+ "paddb " #t1 ", " #t0 " \n\t"\ |
|
1342 |
+ "paddb " #t0 ", " #sx " \n\t"\ |
|
1343 |
+\ |
|
1344 |
+ PAVGB(plx, pplx) /* filtered */\ |
|
1345 |
+ "movq " #dst ", " #t0 " \n\t" /* dst */\ |
|
1346 |
+ "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
|
1347 |
+ "psubusb %3, " #t0 " \n\t"\ |
|
1348 |
+ "paddusb %3, " #t1 " \n\t"\ |
|
1349 |
+ PMAXUB(t0, pplx)\ |
|
1350 |
+ PMINUB(t1, pplx, t0)\ |
|
1351 |
+ "paddb " #sx ", " #ppsx " \n\t"\ |
|
1352 |
+ "paddb " #psx ", " #ppsx " \n\t"\ |
|
1353 |
+ "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
|
1354 |
+ "pand "MANGLE(b08)", " #ppsx " \n\t"\ |
|
1355 |
+ "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
|
1356 |
+ "pand " #ppsx ", " #pplx " \n\t"\ |
|
1357 |
+ "pandn " #dst ", " #ppsx " \n\t"\ |
|
1358 |
+ "por " #pplx ", " #ppsx " \n\t"\ |
|
1359 |
+ "movq " #ppsx ", " #dst " \n\t"\ |
|
1360 |
+ "movq 8(%%ecx), " #lx " \n\t" |
|
1361 |
+ |
|
1362 |
+/* |
|
1363 |
+0000000 |
|
1364 |
+1111111 |
|
1365 |
+ |
|
1366 |
+1111110 |
|
1367 |
+1111101 |
|
1368 |
+1111100 |
|
1369 |
+1111011 |
|
1370 |
+1111010 |
|
1371 |
+1111001 |
|
1372 |
+ |
|
1373 |
+1111000 |
|
1374 |
+1110111 |
|
1375 |
+ |
|
1376 |
+*/ |
|
1377 |
+//DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) |
|
1378 |
+DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
|
1379 |
+DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
|
1380 |
+DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
|
1381 |
+DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
|
1382 |
+DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
|
1383 |
+DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
|
1384 |
+DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
|
1385 |
+DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
|
1386 |
+ |
|
1387 |
+ "1: \n\t" |
|
1388 |
+ : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
|
1389 |
+ : "%eax", "%edx", "%ecx" |
|
1390 |
+ ); |
|
1391 |
+#else |
|
1392 |
+ int y; |
|
1393 |
+ int min=255; |
|
1394 |
+ int max=0; |
|
1395 |
+ int avg; |
|
1396 |
+ uint8_t *p; |
|
1397 |
+ int s[10]; |
|
1398 |
+ const int QP2= c->QP/2 + 1; |
|
1399 |
+ |
|
1400 |
+ for(y=1; y<9; y++) |
|
1401 |
+ { |
|
1402 |
+ int x; |
|
1403 |
+ p= src + stride*y; |
|
1404 |
+ for(x=1; x<9; x++) |
|
1405 |
+ { |
|
1406 |
+ p++; |
|
1407 |
+ if(*p > max) max= *p; |
|
1408 |
+ if(*p < min) min= *p; |
|
1409 |
+ } |
|
1410 |
+ } |
|
1411 |
+ avg= (min + max + 1)>>1; |
|
1412 |
+ |
|
1413 |
+ if(max - min <deringThreshold) return; |
|
1414 |
+ |
|
1415 |
+ for(y=0; y<10; y++) |
|
1416 |
+ { |
|
1417 |
+ int t = 0; |
|
1418 |
+ |
|
1419 |
+ if(src[stride*y + 0] > avg) t+= 1; |
|
1420 |
+ if(src[stride*y + 1] > avg) t+= 2; |
|
1421 |
+ if(src[stride*y + 2] > avg) t+= 4; |
|
1422 |
+ if(src[stride*y + 3] > avg) t+= 8; |
|
1423 |
+ if(src[stride*y + 4] > avg) t+= 16; |
|
1424 |
+ if(src[stride*y + 5] > avg) t+= 32; |
|
1425 |
+ if(src[stride*y + 6] > avg) t+= 64; |
|
1426 |
+ if(src[stride*y + 7] > avg) t+= 128; |
|
1427 |
+ if(src[stride*y + 8] > avg) t+= 256; |
|
1428 |
+ if(src[stride*y + 9] > avg) t+= 512; |
|
1429 |
+ |
|
1430 |
+ t |= (~t)<<16; |
|
1431 |
+ t &= (t<<1) & (t>>1); |
|
1432 |
+ s[y] = t; |
|
1433 |
+ } |
|
1434 |
+ |
|
1435 |
+ for(y=1; y<9; y++) |
|
1436 |
+ { |
|
1437 |
+ int t = s[y-1] & s[y] & s[y+1]; |
|
1438 |
+ t|= t>>16; |
|
1439 |
+ s[y-1]= t; |
|
1440 |
+ } |
|
1441 |
+ |
|
1442 |
+ for(y=1; y<9; y++) |
|
1443 |
+ { |
|
1444 |
+ int x; |
|
1445 |
+ int t = s[y-1]; |
|
1446 |
+ |
|
1447 |
+ p= src + stride*y; |
|
1448 |
+ for(x=1; x<9; x++) |
|
1449 |
+ { |
|
1450 |
+ p++; |
|
1451 |
+ if(t & (1<<x)) |
|
1452 |
+ { |
|
1453 |
+ int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) |
|
1454 |
+ +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) |
|
1455 |
+ +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); |
|
1456 |
+ f= (f + 8)>>4; |
|
1457 |
+ |
|
1458 |
+#ifdef DEBUG_DERING_THRESHOLD |
|
1459 |
+ asm volatile("emms\n\t":); |
|
1460 |
+ { |
|
1461 |
+ static long long numPixels=0; |
|
1462 |
+ if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; |
|
1463 |
+// if((max-min)<20 || (max-min)*QP<200) |
|
1464 |
+// if((max-min)*QP < 500) |
|
1465 |
+// if(max-min<QP/2) |
|
1466 |
+ if(max-min < 20) |
|
1467 |
+ { |
|
1468 |
+ static int numSkiped=0; |
|
1469 |
+ static int errorSum=0; |
|
1470 |
+ static int worstQP=0; |
|
1471 |
+ static int worstRange=0; |
|
1472 |
+ static int worstDiff=0; |
|
1473 |
+ int diff= (f - *p); |
|
1474 |
+ int absDiff= ABS(diff); |
|
1475 |
+ int error= diff*diff; |
|
1476 |
+ |
|
1477 |
+ if(x==1 || x==8 || y==1 || y==8) continue; |
|
1478 |
+ |
|
1479 |
+ numSkiped++; |
|
1480 |
+ if(absDiff > worstDiff) |
|
1481 |
+ { |
|
1482 |
+ worstDiff= absDiff; |
|
1483 |
+ worstQP= QP; |
|
1484 |
+ worstRange= max-min; |
|
1485 |
+ } |
|
1486 |
+ errorSum+= error; |
|
1487 |
+ |
|
1488 |
+ if(1024LL*1024LL*1024LL % numSkiped == 0) |
|
1489 |
+ { |
|
1490 |
+ printf( "sum:%1.3f, skip:%d, wQP:%d, " |
|
1491 |
+ "wRange:%d, wDiff:%d, relSkip:%1.3f\n", |
|
1492 |
+ (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, |
|
1493 |
+ worstDiff, (float)numSkiped/numPixels); |
|
1494 |
+ } |
|
1495 |
+ } |
|
1496 |
+ } |
|
1497 |
+#endif |
|
1498 |
+ if (*p + QP2 < f) *p= *p + QP2; |
|
1499 |
+ else if(*p - QP2 > f) *p= *p - QP2; |
|
1500 |
+ else *p=f; |
|
1501 |
+ } |
|
1502 |
+ } |
|
1503 |
+ } |
|
1504 |
+#ifdef DEBUG_DERING_THRESHOLD |
|
1505 |
+ if(max-min < 20) |
|
1506 |
+ { |
|
1507 |
+ for(y=1; y<9; y++) |
|
1508 |
+ { |
|
1509 |
+ int x; |
|
1510 |
+ int t = 0; |
|
1511 |
+ p= src + stride*y; |
|
1512 |
+ for(x=1; x<9; x++) |
|
1513 |
+ { |
|
1514 |
+ p++; |
|
1515 |
+ *p = MIN(*p + 20, 255); |
|
1516 |
+ } |
|
1517 |
+ } |
|
1518 |
+// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; |
|
1519 |
+ } |
|
1520 |
+#endif |
|
1521 |
+#endif |
|
1522 |
+} |
|
1523 |
+ |
|
1524 |
+/** |
|
1525 |
+ * Deinterlaces the given block by linear interpolation: counted from src + 4*stride, rows 1, 3, 5 and 7 are each overwritten with the average of the rows directly above and below them |
|
1526 |
+ * will be called for every 8x8 block and can read & write from line 4-15 |
|
1527 |
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too |
|
1528 |
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced; src is line 0 of the 8 pixel wide column to process and stride is the offset in bytes from one line to the next |
|
1529 |
+ * the asm path averages with PAVGB (macro defined outside this function, presumably a rounded byte average - TODO confirm), the C path truncates with >>1 */ |
|
1530 |
+static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
|
1531 |
+{ |
|
1532 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1533 |
+ src+= 4*stride; /* row 0 below is line 4 of the caller's block */ |
|
1534 |
+ asm volatile( |
|
1535 |
+ "leal (%0, %1), %%eax \n\t" /* eax = row 1 */ |
|
1536 |
+ "leal (%%eax, %1, 4), %%ecx \n\t" /* ecx = row 5 */ |
|
1537 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1538 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
1539 |
+ |
|
1540 |
+ "movq (%0), %%mm0 \n\t" /* row 0 */ |
|
1541 |
+ "movq (%%eax, %1), %%mm1 \n\t" /* row 2 */ |
|
1542 |
+ PAVGB(%%mm1, %%mm0) /* average of rows 0 and 2 */ |
|
1543 |
+ "movq %%mm0, (%%eax) \n\t" /* -> row 1 */ |
|
1544 |
+ "movq (%0, %1, 4), %%mm0 \n\t" /* row 4 */ |
|
1545 |
+ PAVGB(%%mm0, %%mm1) /* average of rows 2 and 4 */ |
|
1546 |
+ "movq %%mm1, (%%eax, %1, 2) \n\t" /* -> row 3 */ |
|
1547 |
+ "movq (%%ecx, %1), %%mm1 \n\t" /* row 6 */ |
|
1548 |
+ PAVGB(%%mm1, %%mm0) /* average of rows 4 and 6 */ |
|
1549 |
+ "movq %%mm0, (%%ecx) \n\t" /* -> row 5 */ |
|
1550 |
+ "movq (%0, %1, 8), %%mm0 \n\t" /* row 8 */ |
|
1551 |
+ PAVGB(%%mm0, %%mm1) /* average of rows 6 and 8 */ |
|
1552 |
+ "movq %%mm1, (%%ecx, %1, 2) \n\t" /* -> row 7 */ |
|
1553 |
+ |
|
1554 |
+ : : "r" (src), "r" (stride) |
|
1555 |
+ : "%eax", "%ecx" |
|
1556 |
+ ); |
|
1557 |
+#else |
|
1558 |
+ int x; /* column index within the 8 pixel wide block */ |
|
1559 |
+ src+= 4*stride; |
|
1560 |
+ for(x=0; x<8; x++) |
|
1561 |
+ { |
|
1562 |
+ src[stride] = (src[0] + src[stride*2])>>1; |
|
1563 |
+ src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
|
1564 |
+ src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
|
1565 |
+ src[stride*7] = (src[stride*6] + src[stride*8])>>1; |
|
1566 |
+ src++; /* next column */ |
|
1567 |
+ } |
|
1568 |
+#endif |
|
1569 |
+} |
|
1570 |
+ |
|
1571 |
+/** |
|
1572 |
+ * Deinterlaces the given block by cubic interpolation: a written row c becomes (9*b + 9*d - a - e)/16, where b and d are the rows directly above and below c and a and e are the rows three lines above and below c |
|
1573 |
+ * will be called for every 8x8 block and can read & write from line 4-15 |
|
1574 |
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too |
|
1575 |
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1576 |
+ * this filter will read lines 3-15 and write lines 6, 8, 10 and 12 (rows 3, 5, 7 and 9 counted from src + 3*stride) |
|
1577 |
+ * both versions clip the result to 0..255 (packuswb in the asm version, an explicit clamp in the C version); src is line 0 of the 8 pixel wide column to process and stride is the offset in bytes from one line to the next |
|
1578 |
+ */ |
|
1579 |
+static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
|
1580 |
+{ |
|
1581 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1582 |
+ src+= stride*3; |
|
1583 |
+ asm volatile( |
|
1584 |
+ "leal (%0, %1), %%eax \n\t" |
|
1585 |
+ "leal (%%eax, %1, 4), %%edx \n\t" |
|
1586 |
+ "leal (%%edx, %1, 4), %%ecx \n\t" |
|
1587 |
+ "addl %1, %%ecx \n\t" |
|
1588 |
+ "pxor %%mm7, %%mm7 \n\t" |
|
1589 |
+// 0 1 2 3 4 5 6 7 8 9 10 |
|
1590 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
1591 |
+ |
|
1592 |
+#define DEINT_CUBIC(a,b,c,d,e)\ |
|
1593 |
+ "movq " #a ", %%mm0 \n\t"\ |
|
1594 |
+ "movq " #b ", %%mm1 \n\t"\ |
|
1595 |
+ "movq " #d ", %%mm2 \n\t"\ |
|
1596 |
+ "movq " #e ", %%mm3 \n\t"\ |
|
1597 |
+ PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ |
|
1598 |
+ PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\ |
|
1599 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
1600 |
+ "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
1601 |
+ "punpckhbw %%mm7, %%mm2 \n\t"\ |
|
1602 |
+ "movq %%mm1, %%mm3 \n\t"\ |
|
1603 |
+ "punpcklbw %%mm7, %%mm1 \n\t"\ |
|
1604 |
+ "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
1605 |
+ "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ |
|
1606 |
+ "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ |
|
1607 |
+ "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ |
|
1608 |
+ "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ |
|
1609 |
+ "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ |
|
1610 |
+ "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ |
|
1611 |
+ "packuswb %%mm3, %%mm1 \n\t"\ |
|
1612 |
+ "movq %%mm1, " #c " \n\t" |
|
1613 |
+ |
|
1614 |
+DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
|
1615 |
+DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) |
|
1616 |
+DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) |
|
1617 |
+DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) |
|
1618 |
+ |
|
1619 |
+ : : "r" (src), "r" (stride) |
|
1620 |
+ : "%eax", "%edx", "%ecx" |
|
1621 |
+ ); |
|
1622 |
+#else |
|
1623 |
+ int x, v; /* x: column index, v: filter output before clamping */ |
|
1624 |
+ src+= stride*3; |
|
1625 |
+ for(x=0; x<8; x++) |
|
1626 |
+ { |
|
1627 |
+ v = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; src[stride*3] = v<0 ? 0 : (v>255 ? 255 : v); /* clamp so overshoot cannot wrap in the uint8_t store */ |
|
1628 |
+ v = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; src[stride*5] = v<0 ? 0 : (v>255 ? 255 : v); |
|
1629 |
+ v = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; src[stride*7] = v<0 ? 0 : (v>255 ? 255 : v); |
|
1630 |
+ v = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; src[stride*9] = v<0 ? 0 : (v>255 ? 255 : v); |
|
1631 |
+ src++; |
|
1632 |
+ } |
|
1633 |
+#endif |
|
1634 |
+} |
|
1635 |
+ |
|
1636 |
+/** |
|
1637 |
+ * Deinterlaces the given block by filtering every second line (rows 1, 3, 5 and 7 counted from src + 4*stride) with the vertical 5 tap filter (-1 4 2 4 -1)/8 |
|
1638 |
+ * will be called for every 8x8 block and can read & write from line 4-15 |
|
1639 |
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too |
|
1640 |
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1641 |
+ * this filter will read lines 4-13 and write 5-11 |
|
1642 |
+ * both versions clip the result to 0..255; the outer taps always use the unfiltered value of the row two lines above, so tmp (8 bytes) supplies that tap for row 1 on entry and receives the unfiltered row 7 on exit (presumably for the call on the block below - confirm against the caller) |
|
1643 |
+ */ |
|
1644 |
+static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) |
|
1645 |
+{ |
|
1646 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1647 |
+ src+= stride*4; |
|
1648 |
+ asm volatile( |
|
1649 |
+ "leal (%0, %1), %%eax \n\t" |
|
1650 |
+ "leal (%%eax, %1, 4), %%edx \n\t" |
|
1651 |
+ "pxor %%mm7, %%mm7 \n\t" |
|
1652 |
+ "movq (%2), %%mm0 \n\t" /* mm0 = carried tap from tmp */ |
|
1653 |
+// 0 1 2 3 4 5 6 7 8 9 10 |
|
1654 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
1655 |
+ |
|
1656 |
+#define DEINT_FF(a,b,c,d)\ |
|
1657 |
+ "movq " #a ", %%mm1 \n\t"\ |
|
1658 |
+ "movq " #b ", %%mm2 \n\t"\ |
|
1659 |
+ "movq " #c ", %%mm3 \n\t"\ |
|
1660 |
+ "movq " #d ", %%mm4 \n\t"\ |
|
1661 |
+ PAVGB(%%mm3, %%mm1) \ |
|
1662 |
+ PAVGB(%%mm4, %%mm0) \ |
|
1663 |
+ "movq %%mm0, %%mm3 \n\t"\ |
|
1664 |
+ "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
1665 |
+ "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
1666 |
+ "movq %%mm1, %%mm4 \n\t"\ |
|
1667 |
+ "punpcklbw %%mm7, %%mm1 \n\t"\ |
|
1668 |
+ "punpckhbw %%mm7, %%mm4 \n\t"\ |
|
1669 |
+ "psllw $2, %%mm1 \n\t"\ |
|
1670 |
+ "psllw $2, %%mm4 \n\t"\ |
|
1671 |
+ "psubw %%mm0, %%mm1 \n\t"\ |
|
1672 |
+ "psubw %%mm3, %%mm4 \n\t"\ |
|
1673 |
+ "movq %%mm2, %%mm5 \n\t"\ |
|
1674 |
+ "movq %%mm2, %%mm0 \n\t"\ |
|
1675 |
+ "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
1676 |
+ "punpckhbw %%mm7, %%mm5 \n\t"\ |
|
1677 |
+ "paddw %%mm2, %%mm1 \n\t"\ |
|
1678 |
+ "paddw %%mm5, %%mm4 \n\t"\ |
|
1679 |
+ "psraw $2, %%mm1 \n\t"\ |
|
1680 |
+ "psraw $2, %%mm4 \n\t"\ |
|
1681 |
+ "packuswb %%mm4, %%mm1 \n\t"\ |
|
1682 |
+ "movq %%mm1, " #b " \n\t"\ |
|
1683 |
+ |
|
1684 |
+DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) |
|
1685 |
+DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) |
|
1686 |
+DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) |
|
1687 |
+DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) |
|
1688 |
+ |
|
1689 |
+ "movq %%mm0, (%2) \n\t" /* save the unfiltered row 7 into tmp */ |
|
1690 |
+ : : "r" (src), "r" (stride), "r"(tmp) |
|
1691 |
+ : "%eax", "%edx" |
|
1692 |
+ ); |
|
1693 |
+#else |
|
1694 |
+ int x; |
|
1695 |
+ src+= stride*4; |
|
1696 |
+ for(x=0; x<8; x++) |
|
1697 |
+ { |
|
1698 |
+ int t1= tmp[x]; /* unfiltered value two lines above row 1 */ |
|
1699 |
+ int t2= src[stride*1]; /* unfiltered row 1 */ |
|
1700 |
+ int v; /* filter output before clamping */ |
|
1701 |
+ v= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3; src[stride*1]= v<0 ? 0 : (v>255 ? 255 : v); |
|
1702 |
+ t1= src[stride*3]; /* unfiltered row 3: centre tap here, top tap for row 5 (same rows the asm version uses) */ |
|
1703 |
+ v= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3; src[stride*3]= v<0 ? 0 : (v>255 ? 255 : v); |
|
1704 |
+ t2= src[stride*5]; /* unfiltered row 5 */ |
|
1705 |
+ v= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3; src[stride*5]= v<0 ? 0 : (v>255 ? 255 : v); |
|
1706 |
+ t1= src[stride*7]; /* unfiltered row 7 */ |
|
1707 |
+ v= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3; src[stride*7]= v<0 ? 0 : (v>255 ? 255 : v); |
|
1708 |
+ tmp[x]= t1; /* save the unfiltered row 7, as the asm version does */ |
|
1709 |
+ |
|
1710 |
+ src++; |
|
1711 |
+ } |
|
1712 |
+#endif |
|
1713 |
+} |
|
1714 |
+ |
|
1715 |
+/** |
|
1716 |
+ * Deinterlaces the given block by blending: counted from src + 4*stride, each of rows 0-7 is overwritten with (row[i] + 2*row[i+1] + row[i+2])/4 |
|
1717 |
+ * will be called for every 8x8 block and can read & write from line 4-15 |
|
1718 |
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too |
|
1719 |
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1720 |
+ * will shift the image up by 1 line (FIXME if this is a problem) |
|
1721 |
+ * this filter will read lines 4-13 and write 4-11; rows are processed top to bottom so every row is read before it is overwritten; src is line 0 of the 8 pixel wide column to process and stride is the offset in bytes from one line to the next |
|
1722 |
+ */ |
|
1723 |
+static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride) |
|
1724 |
+{ |
|
1725 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1726 |
+ src+= 4*stride; |
|
1727 |
+ asm volatile( |
|
1728 |
+ "leal (%0, %1), %%eax \n\t" /* eax = row 1 */ |
|
1729 |
+ "leal (%%eax, %1, 4), %%edx \n\t" /* edx = row 5 */ |
|
1730 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1731 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1732 |
+ |
|
1733 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
1734 |
+ "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
1735 |
+ PAVGB(%%mm1, %%mm0) // L0+L2 |
|
1736 |
+ "movq (%%eax), %%mm2 \n\t" // L1 |
|
1737 |
+ PAVGB(%%mm2, %%mm0) /* averaged with L1: L0 + 2L1 + L2 */ |
|
1738 |
+ "movq %%mm0, (%0) \n\t" /* -> row 0 */ |
|
1739 |
+ "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
1740 |
+ PAVGB(%%mm0, %%mm2) // L1+L3 |
|
1741 |
+ PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
1742 |
+ "movq %%mm2, (%%eax) \n\t" /* -> row 1 */ |
|
1743 |
+ "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
1744 |
+ PAVGB(%%mm2, %%mm1) // L2+L4 |
|
1745 |
+ PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
1746 |
+ "movq %%mm1, (%%eax, %1) \n\t" /* -> row 2 */ |
|
1747 |
+ "movq (%%edx), %%mm1 \n\t" // L5 |
|
1748 |
+ PAVGB(%%mm1, %%mm0) // L3+L5 |
|
1749 |
+ PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
1750 |
+ "movq %%mm0, (%%eax, %1, 2) \n\t" /* -> row 3 */ |
|
1751 |
+ "movq (%%edx, %1), %%mm0 \n\t" // L6 |
|
1752 |
+ PAVGB(%%mm0, %%mm2) // L4+L6 |
|
1753 |
+ PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
1754 |
+ "movq %%mm2, (%0, %1, 4) \n\t" /* -> row 4 */ |
|
1755 |
+ "movq (%%edx, %1, 2), %%mm2 \n\t" // L7 |
|
1756 |
+ PAVGB(%%mm2, %%mm1) // L5+L7 |
|
1757 |
+ PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
|
1758 |
+ "movq %%mm1, (%%edx) \n\t" /* -> row 5 */ |
|
1759 |
+ "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
|
1760 |
+ PAVGB(%%mm1, %%mm0) // L6+L8 |
|
1761 |
+ PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
|
1762 |
+ "movq %%mm0, (%%edx, %1) \n\t" /* -> row 6 */ |
|
1763 |
+ "movq (%%edx, %1, 4), %%mm0 \n\t" // L9 |
|
1764 |
+ PAVGB(%%mm0, %%mm2) // L7+L9 |
|
1765 |
+ PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
|
1766 |
+ "movq %%mm2, (%%edx, %1, 2) \n\t" /* -> row 7 */ |
|
1767 |
+ |
|
1768 |
+ |
|
1769 |
+ : : "r" (src), "r" (stride) |
|
1770 |
+ : "%eax", "%edx" |
|
1771 |
+ ); |
|
1772 |
+#else |
|
1773 |
+ int x; /* column index within the 8 pixel wide block */ |
|
1774 |
+ src+= 4*stride; |
|
1775 |
+ for(x=0; x<8; x++) |
|
1776 |
+ { |
|
1777 |
+ src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; /* only rows at or below the written one are read, so all inputs are still unfiltered */ |
|
1778 |
+ src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
1779 |
+ src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
1780 |
+ src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
1781 |
+ src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
1782 |
+ src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
1783 |
+ src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
1784 |
+ src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
1785 |
+ src++; /* next column */ |
|
1786 |
+ } |
|
1787 |
+#endif |
|
1788 |
+} |
|
1789 |
+ |
|
1790 |
+/** |
|
1791 |
+ * Deinterlaces the given block with a vertical median filter: counted from src + 4*stride, rows 1, 3, 5 and 7 are each overwritten with the per pixel median of the row itself and the rows directly above and below it |
|
1792 |
+ * will be called for every 8x8 block and can read & write from line 4-15, |
|
1793 |
+ * lines 0-3 have been passed through the deblock / dering filters already, but can be read too |
|
1794 |
+ * lines 4-12 will be read into the deblocking filter and should be deinterlaced; src is line 0 of the 8 pixel wide column to process and stride is the offset in bytes from one line to the next |
|
1795 |
+ */ |
|
1796 |
+static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
|
1797 |
+{ |
|
1798 |
+#ifdef HAVE_MMX |
|
1799 |
+ src+= 4*stride; |
|
1800 |
+#ifdef HAVE_MMX2 |
|
1801 |
+ asm volatile( |
|
1802 |
+ "leal (%0, %1), %%eax \n\t" /* eax = row 1 */ |
|
1803 |
+ "leal (%%eax, %1, 4), %%edx \n\t" /* edx = row 5 */ |
|
1804 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1805 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1806 |
+ |
|
1807 |
+ "movq (%0), %%mm0 \n\t" // row 0 |
|
1808 |
+ "movq (%%eax, %1), %%mm2 \n\t" // row 2 |
|
1809 |
+ "movq (%%eax), %%mm1 \n\t" // row 1 |
|
1810 |
+ "movq %%mm0, %%mm3 \n\t" |
|
1811 |
+ "pmaxub %%mm1, %%mm0 \n\t" // max(row 0, row 1) |
|
1812 |
+ "pminub %%mm3, %%mm1 \n\t" // min(row 0, row 1) |
|
1813 |
+ "pmaxub %%mm2, %%mm1 \n\t" // max(min(row 0, row 1), row 2) |
|
1814 |
+ "pminub %%mm1, %%mm0 \n\t" /* median of rows 0, 1, 2 */ |
|
1815 |
+ "movq %%mm0, (%%eax) \n\t" /* -> row 1 */ |
|
1816 |
+ |
|
1817 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // row 4 |
|
1818 |
+ "movq (%%eax, %1, 2), %%mm1 \n\t" // row 3 |
|
1819 |
+ "movq %%mm2, %%mm3 \n\t" |
|
1820 |
+ "pmaxub %%mm1, %%mm2 \n\t" // max(row 2, row 3) |
|
1821 |
+ "pminub %%mm3, %%mm1 \n\t" // min(row 2, row 3) |
|
1822 |
+ "pmaxub %%mm0, %%mm1 \n\t" // max(min(row 2, row 3), row 4) |
|
1823 |
+ "pminub %%mm1, %%mm2 \n\t" /* median of rows 2, 3, 4 */ |
|
1824 |
+ "movq %%mm2, (%%eax, %1, 2) \n\t" /* -> row 3 */ |
|
1825 |
+ |
|
1826 |
+ "movq (%%edx), %%mm2 \n\t" // row 5 |
|
1827 |
+ "movq (%%edx, %1), %%mm1 \n\t" // row 6 |
|
1828 |
+ "movq %%mm2, %%mm3 \n\t" |
|
1829 |
+ "pmaxub %%mm0, %%mm2 \n\t" // max(row 5, row 4) |
|
1830 |
+ "pminub %%mm3, %%mm0 \n\t" // min(row 5, row 4) |
|
1831 |
+ "pmaxub %%mm1, %%mm0 \n\t" // max(min(row 5, row 4), row 6) |
|
1832 |
+ "pminub %%mm0, %%mm2 \n\t" /* median of rows 4, 5, 6 */ |
|
1833 |
+ "movq %%mm2, (%%edx) \n\t" /* -> row 5 */ |
|
1834 |
+ |
|
1835 |
+ "movq (%%edx, %1, 2), %%mm2 \n\t" // row 7 |
|
1836 |
+ "movq (%0, %1, 8), %%mm0 \n\t" // row 8 |
|
1837 |
+ "movq %%mm2, %%mm3 \n\t" |
|
1838 |
+ "pmaxub %%mm0, %%mm2 \n\t" // max(row 7, row 8) |
|
1839 |
+ "pminub %%mm3, %%mm0 \n\t" // min(row 7, row 8) |
|
1840 |
+ "pmaxub %%mm1, %%mm0 \n\t" // max(min(row 7, row 8), row 6) |
|
1841 |
+ "pminub %%mm0, %%mm2 \n\t" /* median of rows 6, 7, 8 */ |
|
1842 |
+ "movq %%mm2, (%%edx, %1, 2) \n\t" /* -> row 7 */ |
|
1843 |
+ |
|
1844 |
+ |
|
1845 |
+ : : "r" (src), "r" (stride) |
|
1846 |
+ : "%eax", "%edx" |
|
1847 |
+ ); |
|
1848 |
+ |
|
1849 |
+#else // MMX without MMX2 |
|
1850 |
+ asm volatile( |
|
1851 |
+ "leal (%0, %1), %%eax \n\t" |
|
1852 |
+ "leal (%%eax, %1, 4), %%edx \n\t" |
|
1853 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1854 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1855 |
+ "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, compared against below to turn saturated differences into byte masks */ |
|
1856 |
+ /* MEDIAN(a,b,c) stores the per byte median of rows a, b, c into b: every row that is not the median is ORed with an all-ones mask, then the three are ANDed */ |
|
1857 |
+#define MEDIAN(a,b,c)\ |
|
1858 |
+ "movq " #a ", %%mm0 \n\t"\ |
|
1859 |
+ "movq " #b ", %%mm2 \n\t"\ |
|
1860 |
+ "movq " #c ", %%mm1 \n\t"\ |
|
1861 |
+ "movq %%mm0, %%mm3 \n\t"\ |
|
1862 |
+ "movq %%mm1, %%mm4 \n\t"\ |
|
1863 |
+ "movq %%mm2, %%mm5 \n\t"\ |
|
1864 |
+ "psubusb %%mm1, %%mm3 \n\t"\ |
|
1865 |
+ "psubusb %%mm2, %%mm4 \n\t"\ |
|
1866 |
+ "psubusb %%mm0, %%mm5 \n\t"\ |
|
1867 |
+ "pcmpeqb %%mm7, %%mm3 \n\t"\ |
|
1868 |
+ "pcmpeqb %%mm7, %%mm4 \n\t"\ |
|
1869 |
+ "pcmpeqb %%mm7, %%mm5 \n\t"\ |
|
1870 |
+ "movq %%mm3, %%mm6 \n\t"\ |
|
1871 |
+ "pxor %%mm4, %%mm3 \n\t"\ |
|
1872 |
+ "pxor %%mm5, %%mm4 \n\t"\ |
|
1873 |
+ "pxor %%mm6, %%mm5 \n\t"\ |
|
1874 |
+ "por %%mm3, %%mm1 \n\t"\ |
|
1875 |
+ "por %%mm4, %%mm2 \n\t"\ |
|
1876 |
+ "por %%mm5, %%mm0 \n\t"\ |
|
1877 |
+ "pand %%mm2, %%mm0 \n\t"\ |
|
1878 |
+ "pand %%mm1, %%mm0 \n\t"\ |
|
1879 |
+ "movq %%mm0, " #b " \n\t" |
|
1880 |
+ |
|
1881 |
+MEDIAN((%0), (%%eax), (%%eax, %1)) |
|
1882 |
+MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) |
|
1883 |
+MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
|
1884 |
+MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) |
|
1885 |
+ |
|
1886 |
+ : : "r" (src), "r" (stride) |
|
1887 |
+ : "%eax", "%edx" |
|
1888 |
+ ); |
|
1889 |
+#endif // HAVE_MMX2 |
|
1890 |
+#else |
|
1891 |
+ int x, y; |
|
1892 |
+ src+= 4*stride; |
|
1893 |
+ // FIXME - there should be a way to do a few columns in parallel like w/mmx |
|
1894 |
+ for(x=0; x<8; x++) |
|
1895 |
+ { |
|
1896 |
+ uint8_t *colsrc = src; /* walks down the current column two rows at a time */ |
|
1897 |
+ for (y=0; y<4; y++) |
|
1898 |
+ { |
|
1899 |
+ int a, b, c, d, e, f; |
|
1900 |
+ a = colsrc[0 ]; |
|
1901 |
+ b = colsrc[stride ]; |
|
1902 |
+ c = colsrc[stride*2]; |
|
1903 |
+ d = (a-b)>>31; /* all ones if a < b, else 0 (assumes 32 bit int and arithmetic right shift) */ |
|
1904 |
+ e = (b-c)>>31; /* all ones if b < c, else 0 */ |
|
1905 |
+ f = (c-a)>>31; /* all ones if c < a, else 0 */ |
|
1906 |
+ colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); /* each term is the value itself if it is the median and all ones otherwise, so the AND leaves the median */ |
|
1907 |
+ colsrc += stride*2; |
|
1908 |
+ } |
|
1909 |
+ src++; /* next column */ |
|
1910 |
+ } |
|
1911 |
+#endif |
|
1912 |
+} |
|
1913 |
+ |
|
1914 |
+#ifdef HAVE_MMX |
|
1915 |
+/** |
|
1916 |
+ * transposes and shift the given 8x8 Block into dst1 and dst2 |
|
1917 |
+ * Reads 8 rows of 8 bytes from src (srcStride bytes apart) and stores each source column as 8 bytes (two 4 byte halves), consecutive columns 16 bytes apart: columns 0-4 go to dst1 + 128, 144, ... 192 and columns 3-7 go to dst2 + 48, 64, ... 112. NOTE(review): the asm stores through dst1/dst2 but has no "memory" clobber - confirm this is safe with the compiler in use */ |
|
1918 |
+static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
|
1919 |
+{ |
|
1920 |
+ asm( |
|
1921 |
+ "leal (%0, %1), %%eax \n\t" /* eax = source row 1 */ |
|
1922 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
1923 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1924 |
+ "movq (%0), %%mm0 \n\t" // 12345678 |
|
1925 |
+ "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
1926 |
+ "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
1927 |
+ "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
1928 |
+ "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
1929 |
+ |
|
1930 |
+ "movq (%%eax, %1), %%mm1 \n\t" /* source row 2 */ |
|
1931 |
+ "movq (%%eax, %1, 2), %%mm3 \n\t" /* source row 3 */ |
|
1932 |
+ "movq %%mm1, %%mm4 \n\t" |
|
1933 |
+ "punpcklbw %%mm3, %%mm1 \n\t" |
|
1934 |
+ "punpckhbw %%mm3, %%mm4 \n\t" |
|
1935 |
+ |
|
1936 |
+ "movq %%mm0, %%mm3 \n\t" |
|
1937 |
+ "punpcklwd %%mm1, %%mm0 \n\t" /* columns 0 and 1 of rows 0-3 */ |
|
1938 |
+ "punpckhwd %%mm1, %%mm3 \n\t" /* columns 2 and 3 of rows 0-3 */ |
|
1939 |
+ "movq %%mm2, %%mm1 \n\t" |
|
1940 |
+ "punpcklwd %%mm4, %%mm2 \n\t" /* columns 4 and 5 of rows 0-3 */ |
|
1941 |
+ "punpckhwd %%mm4, %%mm1 \n\t" /* columns 6 and 7 of rows 0-3 */ |
|
1942 |
+ |
|
1943 |
+ "movd %%mm0, 128(%2) \n\t" /* column 0 */ |
|
1944 |
+ "psrlq $32, %%mm0 \n\t" |
|
1945 |
+ "movd %%mm0, 144(%2) \n\t" /* column 1 */ |
|
1946 |
+ "movd %%mm3, 160(%2) \n\t" /* column 2 */ |
|
1947 |
+ "psrlq $32, %%mm3 \n\t" |
|
1948 |
+ "movd %%mm3, 176(%2) \n\t" /* column 3, stored to both destinations */ |
|
1949 |
+ "movd %%mm3, 48(%3) \n\t" |
|
1950 |
+ "movd %%mm2, 192(%2) \n\t" /* column 4, stored to both destinations */ |
|
1951 |
+ "movd %%mm2, 64(%3) \n\t" |
|
1952 |
+ "psrlq $32, %%mm2 \n\t" |
|
1953 |
+ "movd %%mm2, 80(%3) \n\t" /* column 5 */ |
|
1954 |
+ "movd %%mm1, 96(%3) \n\t" /* column 6 */ |
|
1955 |
+ "psrlq $32, %%mm1 \n\t" |
|
1956 |
+ "movd %%mm1, 112(%3) \n\t" /* column 7 */ |
|
1957 |
+ |
|
1958 |
+ "leal (%%eax, %1, 4), %%eax \n\t" /* eax = source row 5; same steps again for rows 4-7, stored 4 bytes further */ |
|
1959 |
+ |
|
1960 |
+ "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
|
1961 |
+ "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
1962 |
+ "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
1963 |
+ "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
1964 |
+ "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
1965 |
+ |
|
1966 |
+ "movq (%%eax, %1), %%mm1 \n\t" |
|
1967 |
+ "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
1968 |
+ "movq %%mm1, %%mm4 \n\t" |
|
1969 |
+ "punpcklbw %%mm3, %%mm1 \n\t" |
|
1970 |
+ "punpckhbw %%mm3, %%mm4 \n\t" |
|
1971 |
+ |
|
1972 |
+ "movq %%mm0, %%mm3 \n\t" |
|
1973 |
+ "punpcklwd %%mm1, %%mm0 \n\t" |
|
1974 |
+ "punpckhwd %%mm1, %%mm3 \n\t" |
|
1975 |
+ "movq %%mm2, %%mm1 \n\t" |
|
1976 |
+ "punpcklwd %%mm4, %%mm2 \n\t" |
|
1977 |
+ "punpckhwd %%mm4, %%mm1 \n\t" |
|
1978 |
+ |
|
1979 |
+ "movd %%mm0, 132(%2) \n\t" |
|
1980 |
+ "psrlq $32, %%mm0 \n\t" |
|
1981 |
+ "movd %%mm0, 148(%2) \n\t" |
|
1982 |
+ "movd %%mm3, 164(%2) \n\t" |
|
1983 |
+ "psrlq $32, %%mm3 \n\t" |
|
1984 |
+ "movd %%mm3, 180(%2) \n\t" |
|
1985 |
+ "movd %%mm3, 52(%3) \n\t" |
|
1986 |
+ "movd %%mm2, 196(%2) \n\t" |
|
1987 |
+ "movd %%mm2, 68(%3) \n\t" |
|
1988 |
+ "psrlq $32, %%mm2 \n\t" |
|
1989 |
+ "movd %%mm2, 84(%3) \n\t" |
|
1990 |
+ "movd %%mm1, 100(%3) \n\t" |
|
1991 |
+ "psrlq $32, %%mm1 \n\t" |
|
1992 |
+ "movd %%mm1, 116(%3) \n\t" |
|
1993 |
+ |
|
1994 |
+ |
|
1995 |
+ :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
|
1996 |
+ : "%eax" |
|
1997 |
+ ); |
|
1998 |
+} |
|
1999 |
+ |
|
2000 |
+/** |
|
2001 |
+ * transposes the given 8x8 block: reads 8 rows of 8 bytes from src, 16 bytes apart (offsets 0, 16, ... 112), and writes source column k as the 8 bytes of dst row k (rows dstStride bytes apart). NOTE(review): the asm stores through dst but has no "memory" clobber - confirm this is safe with the compiler in use |
|
2002 |
+ */ |
|
2003 |
+static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
|
2004 |
+{ |
|
2005 |
+ asm( |
|
2006 |
+ "leal (%0, %1), %%eax \n\t" /* eax = dst row 1 */ |
|
2007 |
+ "leal (%%eax, %1, 4), %%edx \n\t" /* edx = dst row 5 */ |
|
2008 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
2009 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
2010 |
+ "movq (%2), %%mm0 \n\t" // 12345678 |
|
2011 |
+ "movq 16(%2), %%mm1 \n\t" // abcdefgh |
|
2012 |
+ "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
2013 |
+ "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
2014 |
+ "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
2015 |
+ |
|
2016 |
+ "movq 32(%2), %%mm1 \n\t" /* source row 2 */ |
|
2017 |
+ "movq 48(%2), %%mm3 \n\t" /* source row 3 */ |
|
2018 |
+ "movq %%mm1, %%mm4 \n\t" |
|
2019 |
+ "punpcklbw %%mm3, %%mm1 \n\t" |
|
2020 |
+ "punpckhbw %%mm3, %%mm4 \n\t" |
|
2021 |
+ |
|
2022 |
+ "movq %%mm0, %%mm3 \n\t" |
|
2023 |
+ "punpcklwd %%mm1, %%mm0 \n\t" /* columns 0 and 1 of source rows 0-3 */ |
|
2024 |
+ "punpckhwd %%mm1, %%mm3 \n\t" /* columns 2 and 3 of source rows 0-3 */ |
|
2025 |
+ "movq %%mm2, %%mm1 \n\t" |
|
2026 |
+ "punpcklwd %%mm4, %%mm2 \n\t" /* columns 4 and 5 of source rows 0-3 */ |
|
2027 |
+ "punpckhwd %%mm4, %%mm1 \n\t" /* columns 6 and 7 of source rows 0-3 */ |
|
2028 |
+ |
|
2029 |
+ "movd %%mm0, (%0) \n\t" /* bytes 0-3 of dst rows 0-7 */ |
|
2030 |
+ "psrlq $32, %%mm0 \n\t" |
|
2031 |
+ "movd %%mm0, (%%eax) \n\t" |
|
2032 |
+ "movd %%mm3, (%%eax, %1) \n\t" |
|
2033 |
+ "psrlq $32, %%mm3 \n\t" |
|
2034 |
+ "movd %%mm3, (%%eax, %1, 2) \n\t" |
|
2035 |
+ "movd %%mm2, (%0, %1, 4) \n\t" |
|
2036 |
+ "psrlq $32, %%mm2 \n\t" |
|
2037 |
+ "movd %%mm2, (%%edx) \n\t" |
|
2038 |
+ "movd %%mm1, (%%edx, %1) \n\t" |
|
2039 |
+ "psrlq $32, %%mm1 \n\t" |
|
2040 |
+ "movd %%mm1, (%%edx, %1, 2) \n\t" |
|
2041 |
+ |
|
2042 |
+ |
|
2043 |
+ "movq 64(%2), %%mm0 \n\t" // 12345678 |
|
2044 |
+ "movq 80(%2), %%mm1 \n\t" // abcdefgh |
|
2045 |
+ "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
2046 |
+ "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
2047 |
+ "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
2048 |
+ |
|
2049 |
+ "movq 96(%2), %%mm1 \n\t" /* source row 6 */ |
|
2050 |
+ "movq 112(%2), %%mm3 \n\t" /* source row 7 */ |
|
2051 |
+ "movq %%mm1, %%mm4 \n\t" |
|
2052 |
+ "punpcklbw %%mm3, %%mm1 \n\t" |
|
2053 |
+ "punpckhbw %%mm3, %%mm4 \n\t" |
|
2054 |
+ |
|
2055 |
+ "movq %%mm0, %%mm3 \n\t" |
|
2056 |
+ "punpcklwd %%mm1, %%mm0 \n\t" |
|
2057 |
+ "punpckhwd %%mm1, %%mm3 \n\t" |
|
2058 |
+ "movq %%mm2, %%mm1 \n\t" |
|
2059 |
+ "punpcklwd %%mm4, %%mm2 \n\t" |
|
2060 |
+ "punpckhwd %%mm4, %%mm1 \n\t" |
|
2061 |
+ |
|
2062 |
+ "movd %%mm0, 4(%0) \n\t" /* bytes 4-7 of dst rows 0-7 */ |
|
2063 |
+ "psrlq $32, %%mm0 \n\t" |
|
2064 |
+ "movd %%mm0, 4(%%eax) \n\t" |
|
2065 |
+ "movd %%mm3, 4(%%eax, %1) \n\t" |
|
2066 |
+ "psrlq $32, %%mm3 \n\t" |
|
2067 |
+ "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
|
2068 |
+ "movd %%mm2, 4(%0, %1, 4) \n\t" |
|
2069 |
+ "psrlq $32, %%mm2 \n\t" |
|
2070 |
+ "movd %%mm2, 4(%%edx) \n\t" |
|
2071 |
+ "movd %%mm1, 4(%%edx, %1) \n\t" |
|
2072 |
+ "psrlq $32, %%mm1 \n\t" |
|
2073 |
+ "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
|
2074 |
+ |
|
2075 |
+ :: "r" (dst), "r" (dstStride), "r" (src) |
|
2076 |
+ : "%eax", "%edx" |
|
2077 |
+ ); |
|
2078 |
+} |
|
2079 |
+#endif |
|
2080 |
+//static int test=0; |
|
2081 |
+ |
|
2082 |
+static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
|
2083 |
+ uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
|
2084 |
+{ /* Noise reduction for one 8x8 block, using the block at tempBlured (same stride) as reference. |
+ * A difference measure between src and tempBlured (sum of squared differences in the C path) is |
+ * smoothed with the entries at offsets -1, +1, -256 and +256 of tempBluredPast and then compared |
+ * with maxNoise[0..2]: small values blend src strongly towards tempBlured, larger values blend less, |
+ * and from maxNoise[2] upwards tempBlured is simply overwritten with src. |
+ * Blended results are written to both src and tempBlured; *tempBluredPast is updated as well. */ |
|
2085 |
+ // the thresholds are stored next to the error value so the asm can reach them through one pointer (saves a register; FIXME do this outside of the loops) |
|
2086 |
+ tempBluredPast[127]= maxNoise[0]; |
|
2087 |
+ tempBluredPast[128]= maxNoise[1]; |
|
2088 |
+ tempBluredPast[129]= maxNoise[2]; |
|
2089 |
+ |
|
2090 |
+#define FAST_L2_DIFF |
|
2091 |
+//#define L1_DIFF // you should change the thresholds too if you try that one |
|
2092 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
2093 |
+ asm volatile( |
|
2094 |
+ "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
|
2095 |
+ "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
|
2096 |
+ "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2097 |
+// line: 0 1 2 3 4 5 6 7 8 9 |
|
2098 |
+// address: %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
|
2099 |
+//FIXME reorder? |
|
2100 |
+#ifdef L1_DIFF //needs mmx2 |
|
2101 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2102 |
+ "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
|
2103 |
+ "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2104 |
+ "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
|
2105 |
+ "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2106 |
+ "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
|
2107 |
+ "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2108 |
+ "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
|
2109 |
+ |
|
2110 |
+ "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
2111 |
+ "paddw %%mm1, %%mm0 \n\t" |
|
2112 |
+ "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
|
2113 |
+ "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
2114 |
+ "paddw %%mm2, %%mm0 \n\t" |
|
2115 |
+ "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
|
2116 |
+ "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
2117 |
+ "paddw %%mm3, %%mm0 \n\t" |
|
2118 |
+ "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
|
2119 |
+ "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
2120 |
+ "paddw %%mm4, %%mm0 \n\t" |
|
2121 |
+ "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
|
2122 |
+ "paddw %%mm5, %%mm6 \n\t" |
|
2123 |
+ "paddw %%mm7, %%mm6 \n\t" |
|
2124 |
+ "paddw %%mm6, %%mm0 \n\t" |
|
2125 |
+#elif defined (FAST_L2_DIFF) |
|
2126 |
+ "pcmpeqb %%mm7, %%mm7 \n\t" |
|
2127 |
+ "movq "MANGLE(b80)", %%mm6 \n\t" |
|
2128 |
+ "pxor %%mm0, %%mm0 \n\t" |
|
2129 |
+#define L2_DIFF_CORE(a, b)\ |
|
2130 |
+ "movq " #a ", %%mm5 \n\t"\ |
|
2131 |
+ "movq " #b ", %%mm2 \n\t"\ |
|
2132 |
+ "pxor %%mm7, %%mm2 \n\t"\ |
|
2133 |
+ PAVGB(%%mm2, %%mm5)\ |
|
2134 |
+ "paddb %%mm6, %%mm5 \n\t"\ |
|
2135 |
+ "movq %%mm5, %%mm2 \n\t"\ |
|
2136 |
+ "psllw $8, %%mm5 \n\t"\ |
|
2137 |
+ "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
2138 |
+ "pmaddwd %%mm2, %%mm2 \n\t"\ |
|
2139 |
+ "paddd %%mm2, %%mm5 \n\t"\ |
|
2140 |
+ "psrld $14, %%mm5 \n\t"\ |
|
2141 |
+ "paddd %%mm5, %%mm0 \n\t" |
|
2142 |
+ /* one invocation per line of the block, lines 0..7; the result is accumulated in mm0 */ |
|
2143 |
+L2_DIFF_CORE((%0), (%1)) |
|
2144 |
+L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
2145 |
+L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
2146 |
+L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
2147 |
+L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
|
2148 |
+L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
2149 |
+L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
2150 |
+L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
2151 |
+ |
|
2152 |
+#else |
|
2153 |
+ "pxor %%mm7, %%mm7 \n\t" |
|
2154 |
+ "pxor %%mm0, %%mm0 \n\t" |
|
2155 |
+#define L2_DIFF_CORE(a, b)\ |
|
2156 |
+ "movq " #a ", %%mm5 \n\t"\ |
|
2157 |
+ "movq " #b ", %%mm2 \n\t"\ |
|
2158 |
+ "movq %%mm5, %%mm1 \n\t"\ |
|
2159 |
+ "movq %%mm2, %%mm3 \n\t"\ |
|
2160 |
+ "punpcklbw %%mm7, %%mm5 \n\t"\ |
|
2161 |
+ "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
2162 |
+ "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
2163 |
+ "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
2164 |
+ "psubw %%mm2, %%mm5 \n\t"\ |
|
2165 |
+ "psubw %%mm3, %%mm1 \n\t"\ |
|
2166 |
+ "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
2167 |
+ "pmaddwd %%mm1, %%mm1 \n\t"\ |
|
2168 |
+ "paddd %%mm1, %%mm5 \n\t"\ |
|
2169 |
+ "paddd %%mm5, %%mm0 \n\t" |
|
2170 |
+ /* exact squared differences, one invocation per line of the block, accumulated in mm0 */ |
|
2171 |
+L2_DIFF_CORE((%0), (%1)) |
|
2172 |
+L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
2173 |
+L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
2174 |
+L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
2175 |
+L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
|
2176 |
+L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
2177 |
+L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
2178 |
+L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
2179 |
+ |
|
2180 |
+#endif |
|
2181 |
+ /* add the two 32 bit halves of mm0 and move the total into ecx */ |
|
2182 |
+ "movq %%mm0, %%mm4 \n\t" |
|
2183 |
+ "psrlq $32, %%mm0 \n\t" |
|
2184 |
+ "paddd %%mm0, %%mm4 \n\t" |
|
2185 |
+ "movd %%mm4, %%ecx \n\t" |
|
2186 |
+ "shll $2, %%ecx \n\t" /* 4*d */ |
|
2187 |
+ "movl %3, %%edx \n\t" /* edx = tempBluredPast */ |
|
2188 |
+ "addl -4(%%edx), %%ecx \n\t" /* + tempBluredPast[-1] */ |
|
2189 |
+ "addl 4(%%edx), %%ecx \n\t" /* + tempBluredPast[1] */ |
|
2190 |
+ "addl -1024(%%edx), %%ecx \n\t" /* + tempBluredPast[-256] */ |
|
2191 |
+ "addl $4, %%ecx \n\t" /* rounding */ |
|
2192 |
+ "addl 1024(%%edx), %%ecx \n\t" /* + tempBluredPast[256] */ |
|
2193 |
+ "shrl $3, %%ecx \n\t" |
|
2194 |
+ "movl %%ecx, (%%edx) \n\t" /* the smoothed value is stored back to *tempBluredPast */ |
|
2195 |
+ |
|
2196 |
+// "movl %3, %%ecx \n\t" |
|
2197 |
+// "movl %%ecx, test \n\t" |
|
2198 |
+// "jmp 4f \n\t" |
|
2199 |
+ "cmpl 512(%%edx), %%ecx \n\t" /* tempBluredPast[128], set to maxNoise[1] above */ |
|
2200 |
+ " jb 2f \n\t" |
|
2201 |
+ "cmpl 516(%%edx), %%ecx \n\t" /* tempBluredPast[129], set to maxNoise[2] above */ |
|
2202 |
+ " jb 1f \n\t" |
|
2203 |
+ /* d >= maxNoise[1] and d >= maxNoise[2]: copy src to tempBlured unchanged */ |
|
2204 |
+ "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2205 |
+ "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2206 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2207 |
+ "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2208 |
+ "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2209 |
+ "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2210 |
+ "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
2211 |
+ "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
2212 |
+ "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
2213 |
+ "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
2214 |
+ "movq %%mm0, (%1) \n\t" // L0 |
|
2215 |
+ "movq %%mm1, (%1, %2) \n\t" // L1 |
|
2216 |
+ "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
|
2217 |
+ "movq %%mm3, (%1, %%eax) \n\t" // L3 |
|
2218 |
+ "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
|
2219 |
+ "movq %%mm5, (%1, %%edx) \n\t" // L5 |
|
2220 |
+ "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
|
2221 |
+ "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
|
2222 |
+ "jmp 4f \n\t" |
|
2223 |
+ |
|
2224 |
+ "1: \n\t" /* maxNoise[1] <= d < maxNoise[2]: one PAVGB step per line (PAVGB is a macro defined outside this view, presumably a byte average); result goes to both blocks */ |
|
2225 |
+ "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2226 |
+ "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2227 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2228 |
+ PAVGB((%1), %%mm0) // L0 |
|
2229 |
+ "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2230 |
+ PAVGB((%1, %2), %%mm1) // L1 |
|
2231 |
+ "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2232 |
+ PAVGB((%1, %2, 2), %%mm2) // L2 |
|
2233 |
+ "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2234 |
+ PAVGB((%1, %%eax), %%mm3) // L3 |
|
2235 |
+ "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
2236 |
+ PAVGB((%1, %2, 4), %%mm4) // L4 |
|
2237 |
+ "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
2238 |
+ PAVGB((%1, %%edx), %%mm5) // L5 |
|
2239 |
+ "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
2240 |
+ PAVGB((%1, %%eax, 2), %%mm6) // L6 |
|
2241 |
+ "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
2242 |
+ PAVGB((%1, %%ecx), %%mm7) // L7 |
|
2243 |
+ "movq %%mm0, (%1) \n\t" // R0 |
|
2244 |
+ "movq %%mm1, (%1, %2) \n\t" // R1 |
|
2245 |
+ "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
2246 |
+ "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
2247 |
+ "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
|
2248 |
+ "movq %%mm5, (%1, %%edx) \n\t" // R5 |
|
2249 |
+ "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
|
2250 |
+ "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
|
2251 |
+ "movq %%mm0, (%0) \n\t" // L0 |
|
2252 |
+ "movq %%mm1, (%0, %2) \n\t" // L1 |
|
2253 |
+ "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
2254 |
+ "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
2255 |
+ "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
|
2256 |
+ "movq %%mm5, (%0, %%edx) \n\t" // L5 |
|
2257 |
+ "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
|
2258 |
+ "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
|
2259 |
+ "jmp 4f \n\t" |
|
2260 |
+ |
|
2261 |
+ "2: \n\t" /* d < maxNoise[1] */ |
|
2262 |
+ "cmpl 508(%%edx), %%ecx \n\t" /* tempBluredPast[127], set to maxNoise[0] above */ |
|
2263 |
+ " jb 3f \n\t" |
|
2264 |
+ /* maxNoise[0] <= d < maxNoise[1]: two PAVGB steps with the tempBlured line, 4 lines at a time */ |
|
2265 |
+ "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2266 |
+ "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2267 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2268 |
+ "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2269 |
+ "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2270 |
+ "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2271 |
+ "movq (%1), %%mm4 \n\t" // R0 |
|
2272 |
+ "movq (%1, %2), %%mm5 \n\t" // R1 |
|
2273 |
+ "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
2274 |
+ "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
2275 |
+ PAVGB(%%mm4, %%mm0) |
|
2276 |
+ PAVGB(%%mm5, %%mm1) |
|
2277 |
+ PAVGB(%%mm6, %%mm2) |
|
2278 |
+ PAVGB(%%mm7, %%mm3) |
|
2279 |
+ PAVGB(%%mm4, %%mm0) |
|
2280 |
+ PAVGB(%%mm5, %%mm1) |
|
2281 |
+ PAVGB(%%mm6, %%mm2) |
|
2282 |
+ PAVGB(%%mm7, %%mm3) |
|
2283 |
+ "movq %%mm0, (%1) \n\t" // R0 |
|
2284 |
+ "movq %%mm1, (%1, %2) \n\t" // R1 |
|
2285 |
+ "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
2286 |
+ "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
2287 |
+ "movq %%mm0, (%0) \n\t" // L0 |
|
2288 |
+ "movq %%mm1, (%0, %2) \n\t" // L1 |
|
2289 |
+ "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
2290 |
+ "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
2291 |
+ |
|
2292 |
+ "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
|
2293 |
+ "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
2294 |
+ "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
2295 |
+ "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
2296 |
+ "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
|
2297 |
+ "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
2298 |
+ "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
2299 |
+ "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
2300 |
+ PAVGB(%%mm4, %%mm0) |
|
2301 |
+ PAVGB(%%mm5, %%mm1) |
|
2302 |
+ PAVGB(%%mm6, %%mm2) |
|
2303 |
+ PAVGB(%%mm7, %%mm3) |
|
2304 |
+ PAVGB(%%mm4, %%mm0) |
|
2305 |
+ PAVGB(%%mm5, %%mm1) |
|
2306 |
+ PAVGB(%%mm6, %%mm2) |
|
2307 |
+ PAVGB(%%mm7, %%mm3) |
|
2308 |
+ "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
|
2309 |
+ "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
2310 |
+ "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
2311 |
+ "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
2312 |
+ "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
|
2313 |
+ "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
2314 |
+ "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
2315 |
+ "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
2316 |
+ "jmp 4f \n\t" |
|
2317 |
+ |
|
2318 |
+ "3: \n\t" /* d < maxNoise[0]: three PAVGB steps with the tempBlured line, 4 lines at a time */ |
|
2319 |
+ "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2320 |
+ "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2321 |
+ "movq (%0), %%mm0 \n\t" // L0 |
|
2322 |
+ "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2323 |
+ "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2324 |
+ "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2325 |
+ "movq (%1), %%mm4 \n\t" // R0 |
|
2326 |
+ "movq (%1, %2), %%mm5 \n\t" // R1 |
|
2327 |
+ "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
2328 |
+ "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
2329 |
+ PAVGB(%%mm4, %%mm0) |
|
2330 |
+ PAVGB(%%mm5, %%mm1) |
|
2331 |
+ PAVGB(%%mm6, %%mm2) |
|
2332 |
+ PAVGB(%%mm7, %%mm3) |
|
2333 |
+ PAVGB(%%mm4, %%mm0) |
|
2334 |
+ PAVGB(%%mm5, %%mm1) |
|
2335 |
+ PAVGB(%%mm6, %%mm2) |
|
2336 |
+ PAVGB(%%mm7, %%mm3) |
|
2337 |
+ PAVGB(%%mm4, %%mm0) |
|
2338 |
+ PAVGB(%%mm5, %%mm1) |
|
2339 |
+ PAVGB(%%mm6, %%mm2) |
|
2340 |
+ PAVGB(%%mm7, %%mm3) |
|
2341 |
+ "movq %%mm0, (%1) \n\t" // R0 |
|
2342 |
+ "movq %%mm1, (%1, %2) \n\t" // R1 |
|
2343 |
+ "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
2344 |
+ "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
2345 |
+ "movq %%mm0, (%0) \n\t" // L0 |
|
2346 |
+ "movq %%mm1, (%0, %2) \n\t" // L1 |
|
2347 |
+ "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
2348 |
+ "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
2349 |
+ |
|
2350 |
+ "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
|
2351 |
+ "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
2352 |
+ "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
2353 |
+ "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
2354 |
+ "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
|
2355 |
+ "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
2356 |
+ "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
2357 |
+ "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
2358 |
+ PAVGB(%%mm4, %%mm0) |
|
2359 |
+ PAVGB(%%mm5, %%mm1) |
|
2360 |
+ PAVGB(%%mm6, %%mm2) |
|
2361 |
+ PAVGB(%%mm7, %%mm3) |
|
2362 |
+ PAVGB(%%mm4, %%mm0) |
|
2363 |
+ PAVGB(%%mm5, %%mm1) |
|
2364 |
+ PAVGB(%%mm6, %%mm2) |
|
2365 |
+ PAVGB(%%mm7, %%mm3) |
|
2366 |
+ PAVGB(%%mm4, %%mm0) |
|
2367 |
+ PAVGB(%%mm5, %%mm1) |
|
2368 |
+ PAVGB(%%mm6, %%mm2) |
|
2369 |
+ PAVGB(%%mm7, %%mm3) |
|
2370 |
+ "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
|
2371 |
+ "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
2372 |
+ "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
2373 |
+ "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
2374 |
+ "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
|
2375 |
+ "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
2376 |
+ "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
2377 |
+ "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
2378 |
+ |
|
2379 |
+ "4: \n\t" /* common exit of all four cases */ |
|
2380 |
+ |
|
2381 |
+ :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) /* %0 = src (L lines), %1 = tempBlured (R lines), %2 = stride, %3 = tempBluredPast */ |
|
2382 |
+ : "%eax", "%edx", "%ecx", "memory" |
|
2383 |
+ ); |
|
2384 |
+//printf("%d\n", test); |
|
2385 |
+#else |
|
2386 |
+{ /* plain C fallback */ |
|
2387 |
+ int y; |
|
2388 |
+ int d=0; |
|
2389 |
+ int sysd=0; /* sum of the signed differences; accumulated below but never read in this function */ |
|
2390 |
+ int i; |
|
2391 |
+ |
|
2392 |
+ for(y=0; y<8; y++) |
|
2393 |
+ { |
|
2394 |
+ int x; |
|
2395 |
+ for(x=0; x<8; x++) |
|
2396 |
+ { |
|
2397 |
+ int ref= tempBlured[ x + y*stride ]; |
|
2398 |
+ int cur= src[ x + y*stride ]; |
|
2399 |
+ int d1=ref - cur; |
|
2400 |
+// if(x==0 || x==7) d1+= d1>>1; |
|
2401 |
+// if(y==0 || y==7) d1+= d1>>1; |
|
2402 |
+// d+= ABS(d1); |
|
2403 |
+ d+= d1*d1; /* sum of squared differences over the 8x8 block */ |
|
2404 |
+ sysd+= d1; |
|
2405 |
+ } |
|
2406 |
+ } |
|
2407 |
+ i=d; /* keep the unsmoothed value */ |
|
2408 |
+ d= ( /* smoothed value: (4*d + the four neighbouring tempBluredPast entries + 4) >> 3 */ |
|
2409 |
+ 4*d |
|
2410 |
+ +(*(tempBluredPast-256)) |
|
2411 |
+ +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) |
|
2412 |
+ +(*(tempBluredPast+256)) |
|
2413 |
+ +4)>>3; |
|
2414 |
+ *tempBluredPast=i; /* NOTE(review): this stores the unsmoothed value, while the asm path above stores the smoothed one - confirm which is intended */ |
|
2415 |
+// ((*tempBluredPast)*3 + d + 2)>>2; |
|
2416 |
+ |
|
2417 |
+//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
|
2418 |
+/* |
|
2419 |
+Switch between |
|
2420 |
+ 1 0 0 0 0 0 0 (0) |
|
2421 |
+64 32 16 8 4 2 1 (1) |
|
2422 |
+64 48 36 27 20 15 11 (33) (approx) |
|
2423 |
+64 56 49 43 37 33 29 (200) (approx) |
|
2424 |
+*/ |
|
2425 |
+ if(d > maxNoise[1]) |
|
2426 |
+ { |
|
2427 |
+ if(d < maxNoise[2]) /* 1:1 blend of tempBlured and src, written to both */ |
|
2428 |
+ { |
|
2429 |
+ for(y=0; y<8; y++) |
|
2430 |
+ { |
|
2431 |
+ int x; |
|
2432 |
+ for(x=0; x<8; x++) |
|
2433 |
+ { |
|
2434 |
+ int ref= tempBlured[ x + y*stride ]; |
|
2435 |
+ int cur= src[ x + y*stride ]; |
|
2436 |
+ tempBlured[ x + y*stride ]= |
|
2437 |
+ src[ x + y*stride ]= |
|
2438 |
+ (ref + cur + 1)>>1; |
|
2439 |
+ } |
|
2440 |
+ } |
|
2441 |
+ } |
|
2442 |
+ else /* d >= maxNoise[2]: src is left untouched and tempBlured is overwritten with it */ |
|
2443 |
+ { |
|
2444 |
+ for(y=0; y<8; y++) |
|
2445 |
+ { |
|
2446 |
+ int x; |
|
2447 |
+ for(x=0; x<8; x++) |
|
2448 |
+ { |
|
2449 |
+ tempBlured[ x + y*stride ]= src[ x + y*stride ]; |
|
2450 |
+ } |
|
2451 |
+ } |
|
2452 |
+ } |
|
2453 |
+ } |
|
2454 |
+ else |
|
2455 |
+ { |
|
2456 |
+ if(d < maxNoise[0]) /* 7:1 blend in favour of tempBlured, written to both */ |
|
2457 |
+ { |
|
2458 |
+ for(y=0; y<8; y++) |
|
2459 |
+ { |
|
2460 |
+ int x; |
|
2461 |
+ for(x=0; x<8; x++) |
|
2462 |
+ { |
|
2463 |
+ int ref= tempBlured[ x + y*stride ]; |
|
2464 |
+ int cur= src[ x + y*stride ]; |
|
2465 |
+ tempBlured[ x + y*stride ]= |
|
2466 |
+ src[ x + y*stride ]= |
|
2467 |
+ (ref*7 + cur + 4)>>3; |
|
2468 |
+ } |
|
2469 |
+ } |
|
2470 |
+ } |
|
2471 |
+ else /* 3:1 blend in favour of tempBlured, written to both */ |
|
2472 |
+ { |
|
2473 |
+ for(y=0; y<8; y++) |
|
2474 |
+ { |
|
2475 |
+ int x; |
|
2476 |
+ for(x=0; x<8; x++) |
|
2477 |
+ { |
|
2478 |
+ int ref= tempBlured[ x + y*stride ]; |
|
2479 |
+ int cur= src[ x + y*stride ]; |
|
2480 |
+ tempBlured[ x + y*stride ]= |
|
2481 |
+ src[ x + y*stride ]= |
|
2482 |
+ (ref*3 + cur + 2)>>2; |
|
2483 |
+ } |
|
2484 |
+ } |
|
2485 |
+ } |
|
2486 |
+ } |
|
2487 |
+} |
|
2488 |
+#endif |
|
2489 |
+} |
|
2490 |
+ |
|
2491 |
+static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
2492 |
+ QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
|
2493 |
+ |
|
2494 |
+/** |
|
2495 |
+ * Copies an 8 line block from src to dst, optionally fixing the black level |
|
2496 |
+ * levelFix == 0 -> plain copy, brightness & contrast are not touched |
+ * levelFix != 0 -> the MMX code applies the offset at packedOffsetAndScale[0] and the scale at |
+ * packedOffsetAndScale[1] while copying; the C fallback ignores levelFix and does a plain copy |
|
2497 |
+ */ |
|
2498 |
+#undef SCALED_CPY |
|
2499 |
+ |
|
2500 |
+static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
|
2501 |
+ int levelFix, int64_t *packedOffsetAndScale) |
|
2502 |
+{ |
|
2503 |
+#ifndef HAVE_MMX |
|
2504 |
+ int i; |
|
2505 |
+#endif |
|
2506 |
+ if(levelFix) |
|
2507 |
+ { |
|
2508 |
+#ifdef HAVE_MMX |
|
2509 |
+ asm volatile( |
|
2510 |
+ "movq (%%eax), %%mm2 \n\t" // packedYOffset |
|
2511 |
+ "movq 8(%%eax), %%mm3 \n\t" // packedYScale |
|
2512 |
+ "leal (%2,%4), %%eax \n\t" /* eax = src + srcStride */ |
|
2513 |
+ "leal (%3,%5), %%edx \n\t" /* edx = dst + dstStride */ |
|
2514 |
+ "pxor %%mm4, %%mm4 \n\t" /* mm4 = 0, used by the non-MMX2 macro to unpack bytes to words */ |
|
2515 |
+#ifdef HAVE_MMX2 |
|
2516 |
+#define SCALED_CPY(src1, src2, dst1, dst2) \ |
|
2517 |
+ "movq " #src1 ", %%mm0 \n\t"\ |
|
2518 |
+ "movq " #src1 ", %%mm5 \n\t"\ |
|
2519 |
+ "movq " #src2 ", %%mm1 \n\t"\ |
|
2520 |
+ "movq " #src2 ", %%mm6 \n\t"\ |
|
2521 |
+ "punpcklbw %%mm0, %%mm0 \n\t"\ |
|
2522 |
+ "punpckhbw %%mm5, %%mm5 \n\t"\ |
|
2523 |
+ "punpcklbw %%mm1, %%mm1 \n\t"\ |
|
2524 |
+ "punpckhbw %%mm6, %%mm6 \n\t"\ |
|
2525 |
+ "pmulhuw %%mm3, %%mm0 \n\t"\ |
|
2526 |
+ "pmulhuw %%mm3, %%mm5 \n\t"\ |
|
2527 |
+ "pmulhuw %%mm3, %%mm1 \n\t"\ |
|
2528 |
+ "pmulhuw %%mm3, %%mm6 \n\t"\ |
|
2529 |
+ "psubw %%mm2, %%mm0 \n\t"\ |
|
2530 |
+ "psubw %%mm2, %%mm5 \n\t"\ |
|
2531 |
+ "psubw %%mm2, %%mm1 \n\t"\ |
|
2532 |
+ "psubw %%mm2, %%mm6 \n\t"\ |
|
2533 |
+ "packuswb %%mm5, %%mm0 \n\t"\ |
|
2534 |
+ "packuswb %%mm6, %%mm1 \n\t"\ |
|
2535 |
+ "movq %%mm0, " #dst1 " \n\t"\ |
|
2536 |
+ "movq %%mm1, " #dst2 " \n\t"\ |
|
2537 |
+ |
|
2538 |
+#else //HAVE_MMX2 |
|
2539 |
+#define SCALED_CPY(src1, src2, dst1, dst2) \ |
|
2540 |
+ "movq " #src1 ", %%mm0 \n\t"\ |
|
2541 |
+ "movq " #src1 ", %%mm5 \n\t"\ |
|
2542 |
+ "punpcklbw %%mm4, %%mm0 \n\t"\ |
|
2543 |
+ "punpckhbw %%mm4, %%mm5 \n\t"\ |
|
2544 |
+ "psubw %%mm2, %%mm0 \n\t"\ |
|
2545 |
+ "psubw %%mm2, %%mm5 \n\t"\ |
|
2546 |
+ "movq " #src2 ", %%mm1 \n\t"\ |
|
2547 |
+ "psllw $6, %%mm0 \n\t"\ |
|
2548 |
+ "psllw $6, %%mm5 \n\t"\ |
|
2549 |
+ "pmulhw %%mm3, %%mm0 \n\t"\ |
|
2550 |
+ "movq " #src2 ", %%mm6 \n\t"\ |
|
2551 |
+ "pmulhw %%mm3, %%mm5 \n\t"\ |
|
2552 |
+ "punpcklbw %%mm4, %%mm1 \n\t"\ |
|
2553 |
+ "punpckhbw %%mm4, %%mm6 \n\t"\ |
|
2554 |
+ "psubw %%mm2, %%mm1 \n\t"\ |
|
2555 |
+ "psubw %%mm2, %%mm6 \n\t"\ |
|
2556 |
+ "psllw $6, %%mm1 \n\t"\ |
|
2557 |
+ "psllw $6, %%mm6 \n\t"\ |
|
2558 |
+ "pmulhw %%mm3, %%mm1 \n\t"\ |
|
2559 |
+ "pmulhw %%mm3, %%mm6 \n\t"\ |
|
2560 |
+ "packuswb %%mm5, %%mm0 \n\t"\ |
|
2561 |
+ "packuswb %%mm6, %%mm1 \n\t"\ |
|
2562 |
+ "movq %%mm0, " #dst1 " \n\t"\ |
|
2563 |
+ "movq %%mm1, " #dst2 " \n\t"\ |
|
2564 |
+ |
|
2565 |
+#endif //!HAVE_MMX2 |
|
2566 |
+ |
|
2567 |
+SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) /* lines 0 and 1 */ |
|
2568 |
+SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) /* lines 2 and 3 */ |
|
2569 |
+SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) /* lines 4 and 5 */ |
|
2570 |
+ "leal (%%eax,%4,4), %%eax \n\t" /* eax = src + 5*srcStride */ |
|
2571 |
+ "leal (%%edx,%5,4), %%edx \n\t" /* edx = dst + 5*dstStride */ |
|
2572 |
+SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) /* lines 6 and 7 */ |
|
2573 |
+ |
|
2574 |
+ |
|
2575 |
+ : "=&a" (packedOffsetAndScale) /* eax is loaded with the pointer and then reused as a scratch register */ |
|
2576 |
+ : "0" (packedOffsetAndScale), |
|
2577 |
+ "r"(src), |
|
2578 |
+ "r"(dst), |
|
2579 |
+ "r" (srcStride), |
|
2580 |
+ "r" (dstStride) |
|
2581 |
+ : "%edx" |
|
2582 |
+ ); |
|
2583 |
+#else |
|
2584 |
+ for(i=0; i<8; i++) /* no level fix in the C fallback: same plain copy as the branch below */ |
|
2585 |
+ memcpy( &(dst[dstStride*i]), |
|
2586 |
+ &(src[srcStride*i]), BLOCK_SIZE); |
|
2587 |
+#endif |
|
2588 |
+ } |
|
2589 |
+ else |
|
2590 |
+ { |
|
2591 |
+#ifdef HAVE_MMX |
|
2592 |
+ asm volatile( |
|
2593 |
+ "leal (%0,%2), %%eax \n\t" /* eax = src + srcStride */ |
|
2594 |
+ "leal (%1,%3), %%edx \n\t" /* edx = dst + dstStride */ |
|
2595 |
+ |
|
2596 |
+#define SIMPLE_CPY(src1, src2, dst1, dst2) \ |
|
2597 |
+ "movq " #src1 ", %%mm0 \n\t"\ |
|
2598 |
+ "movq " #src2 ", %%mm1 \n\t"\ |
|
2599 |
+ "movq %%mm0, " #dst1 " \n\t"\ |
|
2600 |
+ "movq %%mm1, " #dst2 " \n\t"\ |
|
2601 |
+ |
|
2602 |
+SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) /* lines 0 and 1 */ |
|
2603 |
+SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) /* lines 2 and 3 */ |
|
2604 |
+SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) /* lines 4 and 5 */ |
|
2605 |
+ "leal (%%eax,%2,4), %%eax \n\t" /* eax = src + 5*srcStride */ |
|
2606 |
+ "leal (%%edx,%3,4), %%edx \n\t" /* edx = dst + 5*dstStride */ |
|
2607 |
+SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) /* lines 6 and 7 */ |
|
2608 |
+ |
|
2609 |
+ : : "r" (src), |
|
2610 |
+ "r" (dst), |
|
2611 |
+ "r" (srcStride), |
|
2612 |
+ "r" (dstStride) |
|
2613 |
+ : "%eax", "%edx" |
|
2614 |
+ ); |
|
2615 |
+#else |
|
2616 |
+ for(i=0; i<8; i++) /* 8 lines of BLOCK_SIZE bytes each */ |
|
2617 |
+ memcpy( &(dst[dstStride*i]), |
|
2618 |
+ &(src[srcStride*i]), BLOCK_SIZE); |
|
2619 |
+#endif |
|
2620 |
+ } |
|
2621 |
+} |
|
2622 |
+ |
|
2623 |
+/** |
|
2624 |
+ * Duplicates the 8 pixels at src into the 3 lines directly above it |
+ * (src - stride, src - 2*stride and src - 3*stride) |
|
2625 |
+ */ |
|
2626 |
+static inline void RENAME(duplicate)(uint8_t src[], int stride) |
|
2627 |
+{ |
|
2628 |
+#ifdef HAVE_MMX |
|
2629 |
+ asm volatile( |
|
2630 |
+ "movq (%0), %%mm0 \n\t" /* load the 8 source bytes */ |
|
2631 |
+ "addl %1, %0 \n\t" /* %1 is -stride, so %0 now points one line up */ |
|
2632 |
+ "movq %%mm0, (%0) \n\t" /* src - stride */ |
|
2633 |
+ "movq %%mm0, (%0, %1) \n\t" /* src - 2*stride */ |
|
2634 |
+ "movq %%mm0, (%0, %1, 2) \n\t" /* src - 3*stride */ |
|
2635 |
+ : "+r" (src) |
|
2636 |
+ : "r" (-stride) |
|
2637 |
+ ); |
|
2638 |
+#else |
|
2639 |
+ int i; |
|
2640 |
+ uint8_t *p=src; |
|
2641 |
+ for(i=0; i<3; i++) /* walk upwards, one line per iteration */ |
|
2642 |
+ { |
|
2643 |
+ p-= stride; |
|
2644 |
+ memcpy(p, src, 8); |
|
2645 |
+ } |
|
2646 |
+#endif |
|
2647 |
+} |
|
2648 |
+ |
|
2649 |
+/** |
|
2650 |
+ * Filters array of bytes (Y or U or V values) |
|
2651 |
+ */ |
|
2652 |
+static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
2653 |
+ QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
|
2654 |
+{ |
|
2655 |
+ PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
|
2656 |
+ int x,y; |
|
2657 |
+#ifdef COMPILE_TIME_MODE |
|
2658 |
+ const int mode= COMPILE_TIME_MODE; |
|
2659 |
+#else |
|
2660 |
+ const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
|
2661 |
+#endif |
|
2662 |
+ int black=0, white=255; // blackest black and whitest white in the picture |
|
2663 |
+ int QPCorrecture= 256*256; |
|
2664 |
+ |
|
2665 |
+ int copyAhead; |
|
2666 |
+#ifdef HAVE_MMX |
|
2667 |
+ int i; |
|
2668 |
+#endif |
|
2669 |
+ |
|
2670 |
+ const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
|
2671 |
+ const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; |
|
2672 |
+ |
|
2673 |
+ //FIXME remove |
|
2674 |
+ uint64_t * const yHistogram= c.yHistogram; |
|
2675 |
+ uint8_t * const tempSrc= c.tempSrc; |
|
2676 |
+ uint8_t * const tempDst= c.tempDst; |
|
2677 |
+ const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
|
2678 |
+ |
|
2679 |
+#ifdef HAVE_MMX |
|
2680 |
+ for(i=0; i<32; i++){ |
|
2681 |
+ int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
|
2682 |
+ int threshold= offset*2 + 1; |
|
2683 |
+ c.mmxDcOffset[i]= 0x7F - offset; |
|
2684 |
+ c.mmxDcThreshold[i]= 0x7F - threshold; |
|
2685 |
+ c.mmxDcOffset[i]*= 0x0101010101010101LL; |
|
2686 |
+ c.mmxDcThreshold[i]*= 0x0101010101010101LL; |
|
2687 |
+ } |
|
2688 |
+#endif |
|
2689 |
+ |
|
2690 |
+ if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
|
2691 |
+ else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
|
2692 |
+ || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14; |
|
2693 |
+ else if( (mode & V_DEBLOCK) |
|
2694 |
+ || (mode & LINEAR_IPOL_DEINT_FILTER) |
|
2695 |
+ || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; |
|
2696 |
+ else if(mode & V_X1_FILTER) copyAhead=11; |
|
2697 |
+// else if(mode & V_RK1_FILTER) copyAhead=10; |
|
2698 |
+ else if(mode & DERING) copyAhead=9; |
|
2699 |
+ else copyAhead=8; |
|
2700 |
+ |
|
2701 |
+ copyAhead-= 8; |
|
2702 |
+ |
|
2703 |
+ if(!isColor) |
|
2704 |
+ { |
|
2705 |
+ uint64_t sum= 0; |
|
2706 |
+ int i; |
|
2707 |
+ uint64_t maxClipped; |
|
2708 |
+ uint64_t clipped; |
|
2709 |
+ double scale; |
|
2710 |
+ |
|
2711 |
+ c.frameNum++; |
|
2712 |
+ // first frame is fscked so we ignore it |
|
2713 |
+ if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; |
|
2714 |
+ |
|
2715 |
+ for(i=0; i<256; i++) |
|
2716 |
+ { |
|
2717 |
+ sum+= yHistogram[i]; |
|
2718 |
+// printf("%d ", yHistogram[i]); |
|
2719 |
+ } |
|
2720 |
+// printf("\n\n"); |
|
2721 |
+ |
|
2722 |
+ /* we allways get a completly black picture first */ |
|
2723 |
+ maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
|
2724 |
+ |
|
2725 |
+ clipped= sum; |
|
2726 |
+ for(black=255; black>0; black--) |
|
2727 |
+ { |
|
2728 |
+ if(clipped < maxClipped) break; |
|
2729 |
+ clipped-= yHistogram[black]; |
|
2730 |
+ } |
|
2731 |
+ |
|
2732 |
+ clipped= sum; |
|
2733 |
+ for(white=0; white<256; white++) |
|
2734 |
+ { |
|
2735 |
+ if(clipped < maxClipped) break; |
|
2736 |
+ clipped-= yHistogram[white]; |
|
2737 |
+ } |
|
2738 |
+ |
|
2739 |
+ scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
|
2740 |
+ |
|
2741 |
+#ifdef HAVE_MMX2 |
|
2742 |
+ c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
|
2743 |
+ c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; |
|
2744 |
+#else |
|
2745 |
+ c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
|
2746 |
+ c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; |
|
2747 |
+#endif |
|
2748 |
+ |
|
2749 |
+ c.packedYOffset|= c.packedYOffset<<32; |
|
2750 |
+ c.packedYOffset|= c.packedYOffset<<16; |
|
2751 |
+ |
|
2752 |
+ c.packedYScale|= c.packedYScale<<32; |
|
2753 |
+ c.packedYScale|= c.packedYScale<<16; |
|
2754 |
+ |
|
2755 |
+ if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); |
|
2756 |
+ else QPCorrecture= 256*256; |
|
2757 |
+ } |
|
2758 |
+ else |
|
2759 |
+ { |
|
2760 |
+ c.packedYScale= 0x0100010001000100LL; |
|
2761 |
+ c.packedYOffset= 0; |
|
2762 |
+ QPCorrecture= 256*256; |
|
2763 |
+ } |
|
2764 |
+ |
|
2765 |
+ /* copy & deinterlace first row of blocks */ |
|
2766 |
+ y=-BLOCK_SIZE; |
|
2767 |
+ { |
|
2768 |
+ uint8_t *srcBlock= &(src[y*srcStride]); |
|
2769 |
+ uint8_t *dstBlock= tempDst + dstStride; |
|
2770 |
+ |
|
2771 |
+ // From this point on it is guranteed that we can read and write 16 lines downward |
|
2772 | ||
2773 |
+ // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
|
2774 |
+ for(x=0; x<width; x+=BLOCK_SIZE) |
|
2775 |
+ { |
|
2776 |
+ |
|
2777 |
+#ifdef HAVE_MMX2 |
|
2778 |
+/* |
|
2779 |
+ prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
|
2780 |
+ prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
|
2781 |
+ prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
|
2782 |
+ prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
|
2783 |
+*/ |
|
2784 |
+ |
|
2785 |
+ asm( |
|
2786 |
+ "movl %4, %%eax \n\t" |
|
2787 |
+ "shrl $2, %%eax \n\t" |
|
2788 |
+ "andl $6, %%eax \n\t" |
|
2789 |
+ "addl %5, %%eax \n\t" |
|
2790 |
+ "movl %%eax, %%edx \n\t" |
|
2791 |
+ "imul %1, %%eax \n\t" |
|
2792 |
+ "imul %3, %%edx \n\t" |
|
2793 |
+ "prefetchnta 32(%%eax, %0) \n\t" |
|
2794 |
+ "prefetcht0 32(%%edx, %2) \n\t" |
|
2795 |
+ "addl %1, %%eax \n\t" |
|
2796 |
+ "addl %3, %%edx \n\t" |
|
2797 |
+ "prefetchnta 32(%%eax, %0) \n\t" |
|
2798 |
+ "prefetcht0 32(%%edx, %2) \n\t" |
|
2799 |
+ :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
|
2800 |
+ "m" (x), "m" (copyAhead) |
|
2801 |
+ : "%eax", "%edx" |
|
2802 |
+ ); |
|
2803 |
+ |
|
2804 |
+#elif defined(HAVE_3DNOW) |
|
2805 |
+//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
|
2806 |
+/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
|
2807 |
+ prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
|
2808 |
+ prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
|
2809 |
+ prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
|
2810 |
+*/ |
|
2811 |
+#endif |
|
2812 |
+ |
|
2813 |
+ RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
|
2814 |
+ srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
2815 |
+ |
|
2816 |
+ RENAME(duplicate)(dstBlock + dstStride*8, dstStride); |
|
2817 |
+ |
|
2818 |
+ if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2819 |
+ RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
|
2820 |
+ else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2821 |
+ RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
|
2822 |
+ else if(mode & MEDIAN_DEINT_FILTER) |
|
2823 |
+ RENAME(deInterlaceMedian)(dstBlock, dstStride); |
|
2824 |
+ else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2825 |
+ RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
|
2826 |
+ else if(mode & FFMPEG_DEINT_FILTER) |
|
2827 |
+ RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); |
|
2828 |
+/* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2829 |
+ RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
2830 |
+*/ |
|
2831 |
+ dstBlock+=8; |
|
2832 |
+ srcBlock+=8; |
|
2833 |
+ } |
|
2834 |
+ if(width==dstStride) |
|
2835 |
+ memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride); |
|
2836 |
+ else |
|
2837 |
+ { |
|
2838 |
+ int i; |
|
2839 |
+ for(i=0; i<copyAhead; i++) |
|
2840 |
+ { |
|
2841 |
+ memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); |
|
2842 |
+ } |
|
2843 |
+ } |
|
2844 |
+ } |
|
2845 |
+ |
|
2846 |
+//printf("\n"); |
|
2847 |
+ for(y=0; y<height; y+=BLOCK_SIZE) |
|
2848 |
+ { |
|
2849 |
+ //1% speedup if these are here instead of the inner loop |
|
2850 |
+ uint8_t *srcBlock= &(src[y*srcStride]); |
|
2851 |
+ uint8_t *dstBlock= &(dst[y*dstStride]); |
|
2852 |
+#ifdef HAVE_MMX |
|
2853 |
+ uint8_t *tempBlock1= c.tempBlocks; |
|
2854 |
+ uint8_t *tempBlock2= c.tempBlocks + 8; |
|
2855 |
+#endif |
|
2856 |
+ int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
|
2857 |
+ int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*mbWidth]; |
|
2858 |
+ int QP=0; |
|
2859 |
+ /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
|
2860 |
+ if not than use a temporary buffer */ |
|
2861 |
+ if(y+15 >= height) |
|
2862 |
+ { |
|
2863 |
+ int i; |
|
2864 |
+ /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
|
2865 |
+ blockcopy to dst later */ |
|
2866 |
+ memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
|
2867 |
+ srcStride*MAX(height-y-copyAhead, 0) ); |
|
2868 |
+ |
|
2869 |
+ /* duplicate last line of src to fill the void upto line (copyAhead+7) */ |
|
2870 |
+ for(i=MAX(height-y, 8); i<copyAhead+8; i++) |
|
2871 |
+ memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
|
2872 |
+ |
|
2873 |
+ /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
|
2874 |
+ memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); |
|
2875 |
+ |
|
2876 |
+ /* duplicate last line of dst to fill the void upto line (copyAhead) */ |
|
2877 |
+ for(i=height-y+1; i<=copyAhead; i++) |
|
2878 |
+ memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
|
2879 |
+ |
|
2880 |
+ dstBlock= tempDst + dstStride; |
|
2881 |
+ srcBlock= tempSrc; |
|
2882 |
+ } |
|
2883 |
+//printf("\n"); |
|
2884 |
+ |
|
2885 |
+ // From this point on it is guranteed that we can read and write 16 lines downward |
|
2886 | ||
2887 |
+ // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
|
2888 |
+ for(x=0; x<width; x+=BLOCK_SIZE) |
|
2889 |
+ { |
|
2890 |
+ const int stride= dstStride; |
|
2891 |
+#ifdef HAVE_MMX |
|
2892 |
+ uint8_t *tmpXchg; |
|
2893 |
+#endif |
|
2894 |
+ if(isColor) |
|
2895 |
+ { |
|
2896 |
+ QP= QPptr[x>>qpHShift]; |
|
2897 |
+ c.nonBQP= nonBQPptr[x>>qpHShift]; |
|
2898 |
+ } |
|
2899 |
+ else |
|
2900 |
+ { |
|
2901 |
+ QP= QPptr[x>>4]; |
|
2902 |
+ QP= (QP* QPCorrecture + 256*128)>>16; |
|
2903 |
+ c.nonBQP= nonBQPptr[x>>4]; |
|
2904 |
+ c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; |
|
2905 |
+ yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
|
2906 |
+ } |
|
2907 |
+ c.QP= QP; |
|
2908 |
+#ifdef HAVE_MMX |
|
2909 |
+ asm volatile( |
|
2910 |
+ "movd %1, %%mm7 \n\t" |
|
2911 |
+ "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
|
2912 |
+ "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
|
2913 |
+ "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
|
2914 |
+ "movq %%mm7, %0 \n\t" |
|
2915 |
+ : "=m" (c.pQPb) |
|
2916 |
+ : "r" (QP) |
|
2917 |
+ ); |
|
2918 |
+#endif |
|
2919 |
+ |
|
2920 |
+ |
|
2921 |
+#ifdef HAVE_MMX2 |
|
2922 |
+/* |
|
2923 |
+ prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
|
2924 |
+ prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
|
2925 |
+ prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
|
2926 |
+ prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
|
2927 |
+*/ |
|
2928 |
+ |
|
2929 |
+ asm( |
|
2930 |
+ "movl %4, %%eax \n\t" |
|
2931 |
+ "shrl $2, %%eax \n\t" |
|
2932 |
+ "andl $6, %%eax \n\t" |
|
2933 |
+ "addl %5, %%eax \n\t" |
|
2934 |
+ "movl %%eax, %%edx \n\t" |
|
2935 |
+ "imul %1, %%eax \n\t" |
|
2936 |
+ "imul %3, %%edx \n\t" |
|
2937 |
+ "prefetchnta 32(%%eax, %0) \n\t" |
|
2938 |
+ "prefetcht0 32(%%edx, %2) \n\t" |
|
2939 |
+ "addl %1, %%eax \n\t" |
|
2940 |
+ "addl %3, %%edx \n\t" |
|
2941 |
+ "prefetchnta 32(%%eax, %0) \n\t" |
|
2942 |
+ "prefetcht0 32(%%edx, %2) \n\t" |
|
2943 |
+ :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
|
2944 |
+ "m" (x), "m" (copyAhead) |
|
2945 |
+ : "%eax", "%edx" |
|
2946 |
+ ); |
|
2947 |
+ |
|
2948 |
+#elif defined(HAVE_3DNOW) |
|
2949 |
+//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
|
2950 |
+/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
|
2951 |
+ prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
|
2952 |
+ prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
|
2953 |
+ prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
|
2954 |
+*/ |
|
2955 |
+#endif |
|
2956 |
+ |
|
2957 |
+ RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
|
2958 |
+ srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
2959 |
+ |
|
2960 |
+ if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2961 |
+ RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
|
2962 |
+ else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2963 |
+ RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
|
2964 |
+ else if(mode & MEDIAN_DEINT_FILTER) |
|
2965 |
+ RENAME(deInterlaceMedian)(dstBlock, dstStride); |
|
2966 |
+ else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2967 |
+ RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
|
2968 |
+ else if(mode & FFMPEG_DEINT_FILTER) |
|
2969 |
+ RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); |
|
2970 |
+/* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2971 |
+ RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
2972 |
+*/ |
|
2973 |
+ |
|
2974 |
+ /* only deblock if we have 2 blocks */ |
|
2975 |
+ if(y + 8 < height) |
|
2976 |
+ { |
|
2977 |
+ if(mode & V_X1_FILTER) |
|
2978 |
+ RENAME(vertX1Filter)(dstBlock, stride, &c); |
|
2979 |
+ else if(mode & V_DEBLOCK) |
|
2980 |
+ { |
|
2981 |
+ if( RENAME(isVertDC)(dstBlock, stride, &c)) |
|
2982 |
+ { |
|
2983 |
+ if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c)) |
|
2984 |
+ RENAME(doVertLowPass)(dstBlock, stride, &c); |
|
2985 |
+ } |
|
2986 |
+ else |
|
2987 |
+ RENAME(doVertDefFilter)(dstBlock, stride, &c); |
|
2988 |
+ } |
|
2989 |
+ } |
|
2990 |
+ |
|
2991 |
+#ifdef HAVE_MMX |
|
2992 |
+ RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
|
2993 |
+#endif |
|
2994 |
+ /* check if we have a previous block to deblock it with dstBlock */ |
|
2995 |
+ if(x - 8 >= 0) |
|
2996 |
+ { |
|
2997 |
+#ifdef HAVE_MMX |
|
2998 |
+ if(mode & H_X1_FILTER) |
|
2999 |
+ RENAME(vertX1Filter)(tempBlock1, 16, &c); |
|
3000 |
+ else if(mode & H_DEBLOCK) |
|
3001 |
+ { |
|
3002 |
+ if( RENAME(isVertDC)(tempBlock1, 16, &c)) |
|
3003 |
+ { |
|
3004 |
+ if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c)) |
|
3005 |
+ RENAME(doVertLowPass)(tempBlock1, 16, &c); |
|
3006 |
+ } |
|
3007 |
+ else |
|
3008 |
+ RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
|
3009 |
+ } |
|
3010 |
+ |
|
3011 |
+ RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
|
3012 |
+ |
|
3013 |
+#else |
|
3014 |
+ if(mode & H_X1_FILTER) |
|
3015 |
+ horizX1Filter(dstBlock-4, stride, QP); |
|
3016 |
+ else if(mode & H_DEBLOCK) |
|
3017 |
+ { |
|
3018 |
+ if( isHorizDC(dstBlock-4, stride, &c)) |
|
3019 |
+ { |
|
3020 |
+ if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
|
3021 |
+ doHorizLowPass(dstBlock-4, stride, QP); |
|
3022 |
+ } |
|
3023 |
+ else |
|
3024 |
+ doHorizDefFilter(dstBlock-4, stride, QP); |
|
3025 |
+ } |
|
3026 |
+#endif |
|
3027 |
+ if(mode & DERING) |
|
3028 |
+ { |
|
3029 |
+ //FIXME filter first line |
|
3030 |
+ if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
|
3031 |
+ } |
|
3032 |
+ |
|
3033 |
+ if(mode & TEMP_NOISE_FILTER) |
|
3034 |
+ { |
|
3035 |
+ RENAME(tempNoiseReducer)(dstBlock-8, stride, |
|
3036 |
+ c.tempBlured[isColor] + y*dstStride + x, |
|
3037 |
+ c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
|
3038 |
+ c.ppMode.maxTmpNoise); |
|
3039 |
+ } |
|
3040 |
+ } |
|
3041 |
+ |
|
3042 |
+ dstBlock+=8; |
|
3043 |
+ srcBlock+=8; |
|
3044 |
+ |
|
3045 |
+#ifdef HAVE_MMX |
|
3046 |
+ tmpXchg= tempBlock1; |
|
3047 |
+ tempBlock1= tempBlock2; |
|
3048 |
+ tempBlock2 = tmpXchg; |
|
3049 |
+#endif |
|
3050 |
+ } |
|
3051 |
+ |
|
3052 |
+ if(mode & DERING) |
|
3053 |
+ { |
|
3054 |
+ if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
|
3055 |
+ } |
|
3056 |
+ |
|
3057 |
+ if((mode & TEMP_NOISE_FILTER)) |
|
3058 |
+ { |
|
3059 |
+ RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
|
3060 |
+ c.tempBlured[isColor] + y*dstStride + x, |
|
3061 |
+ c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
|
3062 |
+ c.ppMode.maxTmpNoise); |
|
3063 |
+ } |
|
3064 |
+ |
|
3065 |
+ /* did we use a tmp buffer for the last lines*/ |
|
3066 |
+ if(y+15 >= height) |
|
3067 |
+ { |
|
3068 |
+ uint8_t *dstBlock= &(dst[y*dstStride]); |
|
3069 |
+ if(width==dstStride) |
|
3070 |
+ memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y)); |
|
3071 |
+ else |
|
3072 |
+ { |
|
3073 |
+ int i; |
|
3074 |
+ for(i=0; i<height-y; i++) |
|
3075 |
+ { |
|
3076 |
+ memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); |
|
3077 |
+ } |
|
3078 |
+ } |
|
3079 |
+ } |
|
3080 |
+/* |
|
3081 |
+ for(x=0; x<width; x+=32) |
|
3082 |
+ { |
|
3083 |
+ volatile int i; |
|
3084 |
+ i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
|
3085 |
+ + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] |
|
3086 |
+ + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
|
3087 |
+// + dstBlock[x +13*dstStride] |
|
3088 |
+// + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
|
3089 |
+ }*/ |
|
3090 |
+ } |
|
3091 |
+#ifdef HAVE_3DNOW |
|
3092 |
+ asm volatile("femms"); |
|
3093 |
+#elif defined (HAVE_MMX) |
|
3094 |
+ asm volatile("emms"); |
|
3095 |
+#endif |
|
3096 |
+ |
|
3097 |
+#ifdef DEBUG_BRIGHTNESS |
|
3098 |
+ if(!isColor) |
|
3099 |
+ { |
|
3100 |
+ int max=1; |
|
3101 |
+ int i; |
|
3102 |
+ for(i=0; i<256; i++) |
|
3103 |
+ if(yHistogram[i] > max) max=yHistogram[i]; |
|
3104 |
+ |
|
3105 |
+ for(i=1; i<256; i++) |
|
3106 |
+ { |
|
3107 |
+ int x; |
|
3108 |
+ int start=yHistogram[i-1]/(max/256+1); |
|
3109 |
+ int end=yHistogram[i]/(max/256+1); |
|
3110 |
+ int inc= end > start ? 1 : -1; |
|
3111 |
+ for(x=start; x!=end+inc; x+=inc) |
|
3112 |
+ dst[ i*dstStride + x]+=128; |
|
3113 |
+ } |
|
3114 |
+ |
|
3115 |
+ for(i=0; i<100; i+=2) |
|
3116 |
+ { |
|
3117 |
+ dst[ (white)*dstStride + i]+=128; |
|
3118 |
+ dst[ (black)*dstStride + i]+=128; |
|
3119 |
+ } |
|
3120 |
+ |
|
3121 |
+ } |
|
3122 |
+#endif |
|
3123 |
+ |
|
3124 |
+ *c2= c; //copy local context back |
|
3125 |
+ |
|
3126 |
+} |
... | ... |
@@ -2,16 +2,9 @@ |
2 | 2 |
include ../config.mak |
3 | 3 |
|
4 | 4 |
SWSLIB = libswscale.a |
5 |
-ifeq ($(SHARED_PP),yes) |
|
6 |
-SPPLIB = libpostproc.so |
|
7 |
-SPPVERSION = 0.0.1 |
|
8 |
-endif |
|
9 |
-PPLIB = libpostproc.a |
|
10 | 5 |
|
11 | 6 |
SWSSRCS=swscale.c rgb2rgb.c yuv2rgb.c |
12 | 7 |
SWSOBJS=$(SWSSRCS:.c=.o) |
13 |
-PPOBJS=postprocess.o |
|
14 |
-SPPOBJS=postprocess_pic.o |
|
15 | 8 |
CS_TEST_OBJS=cs_test.o rgb2rgb.o ../cpudetect.o ../mp_msg.o ../libvo/aclib.o |
16 | 9 |
|
17 | 10 |
CFLAGS = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC) |
... | ... |
@@ -24,7 +17,7 @@ CFLAGS = $(OPTFLAGS) $(MLIB_INC) -I. -I.. $(EXTRA_INC) |
24 | 24 |
.c.o: |
25 | 25 |
$(CC) -c $(CFLAGS) -I.. -o $@ $< |
26 | 26 |
|
27 |
-all: $(SWSLIB) $(PPLIB) $(SPPLIB) |
|
27 |
+all: $(SWSLIB) |
|
28 | 28 |
|
29 | 29 |
$(SWSLIB): $(SWSOBJS) |
30 | 30 |
$(AR) r $(SWSLIB) $(SWSOBJS) |
... | ... |
@@ -43,29 +36,6 @@ depend: |
43 | 43 |
cs_test: $(CS_TEST_OBJS) |
44 | 44 |
$(CC) $(CS_TEST_OBJS) -o cs_test |
45 | 45 |
|
46 |
-ifeq ($(SHARED_PP),yes) |
|
47 |
-postprocess_pic.o: postprocess.c |
|
48 |
- $(CC) -c $(CFLAGS) -fomit-frame-pointer -fPIC -DPIC -I.. -o $@ $< |
|
49 |
- |
|
50 |
-$(SPPLIB): $(SPPOBJS) |
|
51 |
- $(CC) -shared -Wl,-soname,$(SPPLIB).0 \ |
|
52 |
- -o $(SPPLIB) $(SPPOBJS) |
|
53 |
-endif |
|
54 |
- |
|
55 |
-$(PPLIB): $(PPOBJS) |
|
56 |
- $(AR) r $(PPLIB) $(PPOBJS) |
|
57 |
- |
|
58 |
-install: all |
|
59 |
-ifeq ($(SHARED_PP),yes) |
|
60 |
- install -d $(prefix)/lib |
|
61 |
- install -s -m 755 $(SPPLIB) $(prefix)/lib/$(SPPLIB).$(SPPVERSION) |
|
62 |
- ln -sf $(SPPLIB).$(SPPVERSION) $(prefix)/lib/$(SPPLIB) |
|
63 |
- ldconfig || true |
|
64 |
- mkdir -p $(prefix)/include/postproc |
|
65 |
- install -m 644 postprocess.h $(prefix)/include/postproc/postprocess.h |
|
66 |
-endif |
|
67 |
- |
|
68 |
- |
|
69 | 46 |
# |
70 | 47 |
# include dependency files if they exist |
71 | 48 |
# |
72 | 49 |
deleted file mode 100644 |
... | ... |
@@ -1,874 +0,0 @@ |
1 |
-/* |
|
2 |
- Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) |
|
3 |
- |
|
4 |
- This program is free software; you can redistribute it and/or modify |
|
5 |
- it under the terms of the GNU General Public License as published by |
|
6 |
- the Free Software Foundation; either version 2 of the License, or |
|
7 |
- (at your option) any later version. |
|
8 |
- |
|
9 |
- This program is distributed in the hope that it will be useful, |
|
10 |
- but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 |
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 |
- GNU General Public License for more details. |
|
13 |
- |
|
14 |
- You should have received a copy of the GNU General Public License |
|
15 |
- along with this program; if not, write to the Free Software |
|
16 |
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
17 |
-*/ |
|
18 |
- |
|
19 |
-/* |
|
20 |
- C MMX MMX2 3DNow |
|
21 |
-isVertDC Ec Ec |
|
22 |
-isVertMinMaxOk Ec Ec |
|
23 |
-doVertLowPass E e e |
|
24 |
-doVertDefFilter Ec Ec e e |
|
25 |
-isHorizDC Ec Ec |
|
26 |
-isHorizMinMaxOk a E |
|
27 |
-doHorizLowPass E e e |
|
28 |
-doHorizDefFilter Ec Ec e e |
|
29 |
-deRing E e e* |
|
30 |
-Vertical RKAlgo1 E a a |
|
31 |
-Horizontal RKAlgo1 a a |
|
32 |
-Vertical X1# a E E |
|
33 |
-Horizontal X1# a E E |
|
34 |
-LinIpolDeinterlace e E E* |
|
35 |
-CubicIpolDeinterlace a e e* |
|
36 |
-LinBlendDeinterlace e E E* |
|
37 |
-MedianDeinterlace# E Ec Ec |
|
38 |
-TempDeNoiser# E e e |
|
39 |
- |
|
40 |
-* I don't have a 3DNow CPU -> it's untested, but no one said it doesn't work, so it seems to work |
|
41 |
-# more or less self-invented filters, so the exactness isn't too meaningful |
|
42 |
-E = Exact implementation |
|
43 |
-e = almost exact implementation (slightly different rounding,...) |
|
44 |
-a = alternative / approximate impl |
|
45 |
-c = checked against the other implementations (-vo md5) |
|
46 |
-*/ |
|
47 |
- |
|
48 |
-/* |
|
49 |
-TODO: |
|
50 |
-reduce the time wasted on the mem transfer |
|
51 |
-unroll stuff if instructions depend too much on the prior one |
|
52 |
-move YScale thing to the end instead of fixing QP |
|
53 |
-write a faster and higher quality deblocking filter :) |
|
54 |
-make the mainloop more flexible (variable number of blocks at once |
|
55 |
- (the if/else stuff per block is slowing things down) |
|
56 |
-compare the quality & speed of all filters |
|
57 |
-split this huge file |
|
58 |
-optimize c versions |
|
59 |
-try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
|
60 |
-... |
|
61 |
-*/ |
|
62 |
- |
|
63 |
-//Changelog: use the CVS log |
|
64 |
- |
|
65 |
-#include "config.h" |
|
66 |
-#include <inttypes.h> |
|
67 |
-#include <stdio.h> |
|
68 |
-#include <stdlib.h> |
|
69 |
-#include <string.h> |
|
70 |
-#ifdef HAVE_MALLOC_H |
|
71 |
-#include <malloc.h> |
|
72 |
-#endif |
|
73 |
-//#undef HAVE_MMX2 |
|
74 |
-//#define HAVE_3DNOW |
|
75 |
-//#undef HAVE_MMX |
|
76 |
-//#undef ARCH_X86 |
|
77 |
-//#define DEBUG_BRIGHTNESS |
|
78 |
-#ifndef PIC |
|
79 |
-#include "../libvo/fastmemcpy.h" |
|
80 |
-#endif |
|
81 |
-#include "postprocess.h" |
|
82 |
-#include "postprocess_internal.h" |
|
83 |
-#include "mangle.h" |
|
84 |
- |
|
85 |
-#define MIN(a,b) ((a) > (b) ? (b) : (a)) |
|
86 |
-#define MAX(a,b) ((a) < (b) ? (b) : (a)) |
|
87 |
-#define ABS(a) ((a) > 0 ? (a) : (-(a))) |
|
88 |
-#define SIGN(a) ((a) > 0 ? 1 : -1) |
|
89 |
- |
|
90 |
-#define GET_MODE_BUFFER_SIZE 500 |
|
91 |
-#define OPTIONS_ARRAY_SIZE 10 |
|
92 |
-#define BLOCK_SIZE 8 |
|
93 |
-#define TEMP_STRIDE 8 |
|
94 |
-//#define NUM_BLOCKS_AT_ONCE 16 //not used yet |
|
95 |
- |
|
96 |
-#ifdef ARCH_X86 |
|
97 |
-static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL; |
|
98 |
-static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL; |
|
99 |
-static uint64_t __attribute__((aligned(8))) b00= 0x0000000000000000LL; |
|
100 |
-static uint64_t __attribute__((aligned(8))) b01= 0x0101010101010101LL; |
|
101 |
-static uint64_t __attribute__((aligned(8))) b02= 0x0202020202020202LL; |
|
102 |
-static uint64_t __attribute__((aligned(8))) b08= 0x0808080808080808LL; |
|
103 |
-static uint64_t __attribute__((aligned(8))) b80= 0x8080808080808080LL; |
|
104 |
-#endif |
|
105 |
- |
|
106 |
-static int verbose= 0; |
|
107 |
- |
|
108 |
-static const int deringThreshold= 20; |
|
109 |
- |
|
110 |
- |
|
111 |
-static struct PPFilter filters[]= |
|
112 |
-{ |
|
113 |
- {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK}, |
|
114 |
- {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK}, |
|
115 |
-/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER}, |
|
116 |
- {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/ |
|
117 |
- {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER}, |
|
118 |
- {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER}, |
|
119 |
- {"dr", "dering", 1, 5, 6, DERING}, |
|
120 |
- {"al", "autolevels", 0, 1, 2, LEVEL_FIX}, |
|
121 |
- {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER}, |
|
122 |
- {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER}, |
|
123 |
- {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER}, |
|
124 |
- {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER}, |
|
125 |
- {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER}, |
|
126 |
- {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER}, |
|
127 |
- {"fq", "forcequant", 1, 0, 0, FORCE_QUANT}, |
|
128 |
- {NULL, NULL,0,0,0,0} //End Marker |
|
129 |
-}; |
|
130 |
- |
|
131 |
-static char *replaceTable[]= |
|
132 |
-{ |
|
133 |
- "default", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
134 |
- "de", "hdeblock:a,vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
135 |
- "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
136 |
- "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
|
137 |
- NULL //End Marker |
|
138 |
-}; |
|
139 |
- |
|
140 |
-#ifdef ARCH_X86 |
|
141 |
-static inline void unusedVariableWarningFixer() |
|
142 |
-{ |
|
143 |
- if(w05 + w20 + b00 + b01 + b02 + b08 + b80 == 0) b00=0; |
|
144 |
-} |
|
145 |
-#endif |
|
146 |
- |
|
147 |
- |
|
148 |
-#ifdef ARCH_X86 |
|
149 |
-static inline void prefetchnta(void *p) |
|
150 |
-{ |
|
151 |
- asm volatile( "prefetchnta (%0)\n\t" |
|
152 |
- : : "r" (p) |
|
153 |
- ); |
|
154 |
-} |
|
155 |
- |
|
156 |
-static inline void prefetcht0(void *p) |
|
157 |
-{ |
|
158 |
- asm volatile( "prefetcht0 (%0)\n\t" |
|
159 |
- : : "r" (p) |
|
160 |
- ); |
|
161 |
-} |
|
162 |
- |
|
163 |
-static inline void prefetcht1(void *p) |
|
164 |
-{ |
|
165 |
- asm volatile( "prefetcht1 (%0)\n\t" |
|
166 |
- : : "r" (p) |
|
167 |
- ); |
|
168 |
-} |
|
169 |
- |
|
170 |
-static inline void prefetcht2(void *p) |
|
171 |
-{ |
|
172 |
- asm volatile( "prefetcht2 (%0)\n\t" |
|
173 |
- : : "r" (p) |
|
174 |
- ); |
|
175 |
-} |
|
176 |
-#endif |
|
177 |
- |
|
178 |
-// The horizontal functions exist only in C because the MMX code is faster with vertical filters and transposing |
|
179 |
- |
|
180 |
-/** |
|
181 |
- * Check if the given 8x8 Block is mostly "flat" |
|
182 |
- */ |
|
183 |
-static inline int isHorizDC(uint8_t src[], int stride, PPContext *c) |
|
184 |
-{ |
|
185 |
- int numEq= 0; |
|
186 |
- int y; |
|
187 |
- const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1; |
|
188 |
- const int dcThreshold= dcOffset*2 + 1; |
|
189 |
- for(y=0; y<BLOCK_SIZE; y++) |
|
190 |
- { |
|
191 |
- if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++; |
|
192 |
- if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++; |
|
193 |
- if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++; |
|
194 |
- if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++; |
|
195 |
- if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++; |
|
196 |
- if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++; |
|
197 |
- if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++; |
|
198 |
- src+= stride; |
|
199 |
- } |
|
200 |
- return numEq > c->ppMode.flatnessThreshold; |
|
201 |
-} |
|
202 |
- |
|
203 |
-/** |
|
204 |
- * Check if the middle 8x8 Block in the given 8x16 block is flat |
|
205 |
- */ |
|
206 |
-static inline int isVertDC_C(uint8_t src[], int stride, PPContext *c){ |
|
207 |
- int numEq= 0; |
|
208 |
- int y; |
|
209 |
- const int dcOffset= ((c->QP*c->ppMode.baseDcDiff)>>8) + 1; |
|
210 |
- const int dcThreshold= dcOffset*2 + 1; |
|
211 |
- src+= stride*4; // src points to begin of the 8x8 Block |
|
212 |
- for(y=0; y<BLOCK_SIZE-1; y++) |
|
213 |
- { |
|
214 |
- if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++; |
|
215 |
- if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++; |
|
216 |
- if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++; |
|
217 |
- if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++; |
|
218 |
- if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++; |
|
219 |
- if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++; |
|
220 |
- if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++; |
|
221 |
- if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++; |
|
222 |
- src+= stride; |
|
223 |
- } |
|
224 |
- return numEq > c->ppMode.flatnessThreshold; |
|
225 |
-} |
|
226 |
- |
|
227 |
/**
 * Range check used before the horizontal low pass filter.
 *
 * Only the first row is examined: the outermost pixels src[0] and src[7]
 * may differ by at most 2*QP.
 *
 * @param src    first pixel of the row to check
 * @param stride unused
 * @param QP     quantizer; the allowed difference is 2*QP
 * @return 1 if |src[0] - src[7]| <= 2*QP, 0 otherwise
 */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	const int spread= abs(src[0] - src[7]);

	(void)stride;
	return spread <= 2*QP;
}
|
233 |
- |
|
234 |
-static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) |
|
235 |
-{ |
|
236 |
- int y; |
|
237 |
- for(y=0; y<BLOCK_SIZE; y++) |
|
238 |
- { |
|
239 |
- const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); |
|
240 |
- |
|
241 |
- if(ABS(middleEnergy) < 8*QP) |
|
242 |
- { |
|
243 |
- const int q=(dst[3] - dst[4])/2; |
|
244 |
- const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); |
|
245 |
- const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); |
|
246 |
- |
|
247 |
- int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
248 |
- d= MAX(d, 0); |
|
249 |
- |
|
250 |
- d= (5*d + 32) >> 6; |
|
251 |
- d*= SIGN(-middleEnergy); |
|
252 |
- |
|
253 |
- if(q>0) |
|
254 |
- { |
|
255 |
- d= d<0 ? 0 : d; |
|
256 |
- d= d>q ? q : d; |
|
257 |
- } |
|
258 |
- else |
|
259 |
- { |
|
260 |
- d= d>0 ? 0 : d; |
|
261 |
- d= d<q ? q : d; |
|
262 |
- } |
|
263 |
- |
|
264 |
- dst[3]-= d; |
|
265 |
- dst[4]+= d; |
|
266 |
- } |
|
267 |
- dst+= stride; |
|
268 |
- } |
|
269 |
-} |
|
270 |
- |
|
271 |
-/** |
|
272 |
- * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) |
|
273 |
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
|
274 |
- */ |
|
275 |
-static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) |
|
276 |
-{ |
|
277 |
- |
|
278 |
- int y; |
|
279 |
- for(y=0; y<BLOCK_SIZE; y++) |
|
280 |
- { |
|
281 |
- const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; |
|
282 |
- const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; |
|
283 |
- |
|
284 |
- int sums[9]; |
|
285 |
- sums[0] = first + dst[0]; |
|
286 |
- sums[1] = dst[0] + dst[1]; |
|
287 |
- sums[2] = dst[1] + dst[2]; |
|
288 |
- sums[3] = dst[2] + dst[3]; |
|
289 |
- sums[4] = dst[3] + dst[4]; |
|
290 |
- sums[5] = dst[4] + dst[5]; |
|
291 |
- sums[6] = dst[5] + dst[6]; |
|
292 |
- sums[7] = dst[6] + dst[7]; |
|
293 |
- sums[8] = dst[7] + last; |
|
294 |
- |
|
295 |
- dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
|
296 |
- dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
|
297 |
- dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
|
298 |
- dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
|
299 |
- dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
|
300 |
- dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
|
301 |
- dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; |
|
302 |
- dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
|
303 |
- |
|
304 |
- dst+= stride; |
|
305 |
- } |
|
306 |
-} |
|
307 |
- |
|
308 |
-/** |
|
309 |
- * Experimental Filter 1 (Horizontal) |
|
310 |
- * will not damage linear gradients |
|
311 |
- * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
312 |
- * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
313 |
- * MMX2 version does correct clipping C version doesnt |
|
314 |
- * not identical with the vertical one |
|
315 |
- */ |
|
316 |
-static inline void horizX1Filter(uint8_t *src, int stride, int QP) |
|
317 |
-{ |
|
318 |
- int y; |
|
319 |
- static uint64_t *lut= NULL; |
|
320 |
- if(lut==NULL) |
|
321 |
- { |
|
322 |
- int i; |
|
323 |
- lut= (uint64_t*)memalign(8, 256*8); |
|
324 |
- for(i=0; i<256; i++) |
|
325 |
- { |
|
326 |
- int v= i < 128 ? 2*i : 2*(i-256); |
|
327 |
-/* |
|
328 |
-//Simulate 112242211 9-Tap filter |
|
329 |
- uint64_t a= (v/16) & 0xFF; |
|
330 |
- uint64_t b= (v/8) & 0xFF; |
|
331 |
- uint64_t c= (v/4) & 0xFF; |
|
332 |
- uint64_t d= (3*v/8) & 0xFF; |
|
333 |
-*/ |
|
334 |
-//Simulate piecewise linear interpolation |
|
335 |
- uint64_t a= (v/16) & 0xFF; |
|
336 |
- uint64_t b= (v*3/16) & 0xFF; |
|
337 |
- uint64_t c= (v*5/16) & 0xFF; |
|
338 |
- uint64_t d= (7*v/16) & 0xFF; |
|
339 |
- uint64_t A= (0x100 - a)&0xFF; |
|
340 |
- uint64_t B= (0x100 - b)&0xFF; |
|
341 |
- uint64_t C= (0x100 - c)&0xFF; |
|
342 |
- uint64_t D= (0x100 - c)&0xFF; |
|
343 |
- |
|
344 |
- lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | |
|
345 |
- (D<<24) | (C<<16) | (B<<8) | (A); |
|
346 |
- //lut[i] = (v<<32) | (v<<24); |
|
347 |
- } |
|
348 |
- } |
|
349 |
- |
|
350 |
- for(y=0; y<BLOCK_SIZE; y++) |
|
351 |
- { |
|
352 |
- int a= src[1] - src[2]; |
|
353 |
- int b= src[3] - src[4]; |
|
354 |
- int c= src[5] - src[6]; |
|
355 |
- |
|
356 |
- int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
|
357 |
- |
|
358 |
- if(d < QP) |
|
359 |
- { |
|
360 |
- int v = d * SIGN(-b); |
|
361 |
- |
|
362 |
- src[1] +=v/8; |
|
363 |
- src[2] +=v/4; |
|
364 |
- src[3] +=3*v/8; |
|
365 |
- src[4] -=3*v/8; |
|
366 |
- src[5] -=v/4; |
|
367 |
- src[6] -=v/8; |
|
368 |
- |
|
369 |
- } |
|
370 |
- src+=stride; |
|
371 |
- } |
|
372 |
-} |
|
373 |
- |
|
374 |
- |
|
375 |
-//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one |
|
376 |
-//Plain C versions |
|
377 |
-#if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT) |
|
378 |
-#define COMPILE_C |
|
379 |
-#endif |
|
380 |
- |
|
381 |
-#ifdef ARCH_X86 |
|
382 |
- |
|
383 |
-#if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) |
|
384 |
-#define COMPILE_MMX |
|
385 |
-#endif |
|
386 |
- |
|
387 |
-#if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT) |
|
388 |
-#define COMPILE_MMX2 |
|
389 |
-#endif |
|
390 |
- |
|
391 |
-#if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) |
|
392 |
-#define COMPILE_3DNOW |
|
393 |
-#endif |
|
394 |
-#endif //ARCH_X86 |
|
395 |
- |
|
396 |
-#undef HAVE_MMX |
|
397 |
-#undef HAVE_MMX2 |
|
398 |
-#undef HAVE_3DNOW |
|
399 |
-#undef ARCH_X86 |
|
400 |
- |
|
401 |
-#ifdef COMPILE_C |
|
402 |
-#undef HAVE_MMX |
|
403 |
-#undef HAVE_MMX2 |
|
404 |
-#undef HAVE_3DNOW |
|
405 |
-#undef ARCH_X86 |
|
406 |
-#define RENAME(a) a ## _C |
|
407 |
-#include "postprocess_template.c" |
|
408 |
-#endif |
|
409 |
- |
|
410 |
-//MMX versions |
|
411 |
-#ifdef COMPILE_MMX |
|
412 |
-#undef RENAME |
|
413 |
-#define HAVE_MMX |
|
414 |
-#undef HAVE_MMX2 |
|
415 |
-#undef HAVE_3DNOW |
|
416 |
-#define ARCH_X86 |
|
417 |
-#define RENAME(a) a ## _MMX |
|
418 |
-#include "postprocess_template.c" |
|
419 |
-#endif |
|
420 |
- |
|
421 |
-//MMX2 versions |
|
422 |
-#ifdef COMPILE_MMX2 |
|
423 |
-#undef RENAME |
|
424 |
-#define HAVE_MMX |
|
425 |
-#define HAVE_MMX2 |
|
426 |
-#undef HAVE_3DNOW |
|
427 |
-#define ARCH_X86 |
|
428 |
-#define RENAME(a) a ## _MMX2 |
|
429 |
-#include "postprocess_template.c" |
|
430 |
-#endif |
|
431 |
- |
|
432 |
-//3DNOW versions |
|
433 |
-#ifdef COMPILE_3DNOW |
|
434 |
-#undef RENAME |
|
435 |
-#define HAVE_MMX |
|
436 |
-#undef HAVE_MMX2 |
|
437 |
-#define HAVE_3DNOW |
|
438 |
-#define ARCH_X86 |
|
439 |
-#define RENAME(a) a ## _3DNow |
|
440 |
-#include "postprocess_template.c" |
|
441 |
-#endif |
|
442 |
- |
|
443 |
-// minor note: the HAVE_xyz defines are messed up after this line, so don't use them |
|
444 |
- |
|
445 |
-static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
446 |
- QP_STORE_T QPs[], int QPStride, int isColor, pp_mode_t *vm, pp_context_t *vc) |
|
447 |
-{ |
|
448 |
- PPContext *c= (PPContext *)vc; |
|
449 |
- PPMode *ppMode= (PPMode *)vm; |
|
450 |
- c->ppMode= *ppMode; //FIXME |
|
451 |
- |
|
452 |
- // useing ifs here as they are faster than function pointers allthough the |
|
453 |
- // difference wouldnt be messureable here but its much better because |
|
454 |
- // someone might exchange the cpu whithout restarting mplayer ;) |
|
455 |
-#ifdef RUNTIME_CPUDETECT |
|
456 |
-#ifdef ARCH_X86 |
|
457 |
- // ordered per speed fasterst first |
|
458 |
- if(c->cpuCaps & PP_CPU_CAPS_MMX2) |
|
459 |
- postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
460 |
- else if(c->cpuCaps & PP_CPU_CAPS_3DNOW) |
|
461 |
- postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
462 |
- else if(c->cpuCaps & PP_CPU_CAPS_MMX) |
|
463 |
- postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
464 |
- else |
|
465 |
- postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
466 |
-#else |
|
467 |
- postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
468 |
-#endif |
|
469 |
-#else //RUNTIME_CPUDETECT |
|
470 |
-#ifdef HAVE_MMX2 |
|
471 |
- postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
472 |
-#elif defined (HAVE_3DNOW) |
|
473 |
- postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
474 |
-#elif defined (HAVE_MMX) |
|
475 |
- postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
476 |
-#else |
|
477 |
- postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); |
|
478 |
-#endif |
|
479 |
-#endif //!RUNTIME_CPUDETECT |
|
480 |
-} |
|
481 |
- |
|
482 |
-//static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
483 |
-// QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); |
|
484 |
- |
|
485 |
-/* -pp Command line Help |
|
486 |
-*/ |
|
487 |
-char *pp_help= |
|
488 |
-"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n" |
|
489 |
-"long form example:\n" |
|
490 |
-"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n" |
|
491 |
-"short form example:\n" |
|
492 |
-"vb:a/hb:a/lb de,-vb\n" |
|
493 |
-"more examples:\n" |
|
494 |
-"tn:64:128:256\n" |
|
495 |
-"Filters Options\n" |
|
496 |
-"short long name short long option Description\n" |
|
497 |
-"* * a autoq cpu power dependant enabler\n" |
|
498 |
-" c chrom chrominance filtring enabled\n" |
|
499 |
-" y nochrom chrominance filtring disabled\n" |
|
500 |
-"hb hdeblock (2 Threshold) horizontal deblocking filter\n" |
|
501 |
-" 1. difference factor: default=64, higher -> more deblocking\n" |
|
502 |
-" 2. flatness threshold: default=40, lower -> more deblocking\n" |
|
503 |
-" the h & v deblocking filters share these\n" |
|
504 |
-" so u cant set different thresholds for h / v\n" |
|
505 |
-"vb vdeblock (2 Threshold) vertical deblocking filter\n" |
|
506 |
-"h1 x1hdeblock Experimental h deblock filter 1\n" |
|
507 |
-"v1 x1vdeblock Experimental v deblock filter 1\n" |
|
508 |
-"dr dering Deringing filter\n" |
|
509 |
-"al autolevels automatic brightness / contrast\n" |
|
510 |
-" f fullyrange stretch luminance to (0..255)\n" |
|
511 |
-"lb linblenddeint linear blend deinterlacer\n" |
|
512 |
-"li linipoldeint linear interpolating deinterlace\n" |
|
513 |
-"ci cubicipoldeint cubic interpolating deinterlacer\n" |
|
514 |
-"md mediandeint median deinterlacer\n" |
|
515 |
-"fd ffmpegdeint ffmpeg deinterlacer\n" |
|
516 |
-"de default hb:a,vb:a,dr:a,al\n" |
|
517 |
-"fa fast h1:a,v1:a,dr:a,al\n" |
|
518 |
-"tn tmpnoise (3 Thresholds) Temporal Noise Reducer\n" |
|
519 |
-" 1. <= 2. <= 3. larger -> stronger filtering\n" |
|
520 |
-"fq forceQuant <quantizer> Force quantizer\n" |
|
521 |
-; |
|
522 |
- |
|
523 |
-pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality) |
|
524 |
-{ |
|
525 |
- char temp[GET_MODE_BUFFER_SIZE]; |
|
526 |
- char *p= temp; |
|
527 |
- char *filterDelimiters= ",/"; |
|
528 |
- char *optionDelimiters= ":"; |
|
529 |
- struct PPMode *ppMode; |
|
530 |
- char *filterToken; |
|
531 |
- |
|
532 |
- ppMode= memalign(8, sizeof(PPMode)); |
|
533 |
- |
|
534 |
- ppMode->lumMode= 0; |
|
535 |
- ppMode->chromMode= 0; |
|
536 |
- ppMode->maxTmpNoise[0]= 700; |
|
537 |
- ppMode->maxTmpNoise[1]= 1500; |
|
538 |
- ppMode->maxTmpNoise[2]= 3000; |
|
539 |
- ppMode->maxAllowedY= 234; |
|
540 |
- ppMode->minAllowedY= 16; |
|
541 |
- ppMode->baseDcDiff= 256/4; |
|
542 |
- ppMode->flatnessThreshold= 56-16; |
|
543 |
- ppMode->maxClippedThreshold= 0.01; |
|
544 |
- ppMode->error=0; |
|
545 |
- |
|
546 |
- strncpy(temp, name, GET_MODE_BUFFER_SIZE); |
|
547 |
- |
|
548 |
- if(verbose>1) printf("pp: %s\n", name); |
|
549 |
- |
|
550 |
- for(;;){ |
|
551 |
- char *filterName; |
|
552 |
- int q= 1000000; //PP_QUALITY_MAX; |
|
553 |
- int chrom=-1; |
|
554 |
- char *option; |
|
555 |
- char *options[OPTIONS_ARRAY_SIZE]; |
|
556 |
- int i; |
|
557 |
- int filterNameOk=0; |
|
558 |
- int numOfUnknownOptions=0; |
|
559 |
- int enable=1; //does the user want us to enabled or disabled the filter |
|
560 |
- |
|
561 |
- filterToken= strtok(p, filterDelimiters); |
|
562 |
- if(filterToken == NULL) break; |
|
563 |
- p+= strlen(filterToken) + 1; // p points to next filterToken |
|
564 |
- filterName= strtok(filterToken, optionDelimiters); |
|
565 |
- if(verbose>1) printf("pp: %s::%s\n", filterToken, filterName); |
|
566 |
- |
|
567 |
- if(*filterName == '-') |
|
568 |
- { |
|
569 |
- enable=0; |
|
570 |
- filterName++; |
|
571 |
- } |
|
572 |
- |
|
573 |
- for(;;){ //for all options |
|
574 |
- option= strtok(NULL, optionDelimiters); |
|
575 |
- if(option == NULL) break; |
|
576 |
- |
|
577 |
- if(verbose>1) printf("pp: option: %s\n", option); |
|
578 |
- if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality; |
|
579 |
- else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0; |
|
580 |
- else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1; |
|
581 |
- else |
|
582 |
- { |
|
583 |
- options[numOfUnknownOptions] = option; |
|
584 |
- numOfUnknownOptions++; |
|
585 |
- } |
|
586 |
- if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break; |
|
587 |
- } |
|
588 |
- options[numOfUnknownOptions] = NULL; |
|
589 |
- |
|
590 |
- /* replace stuff from the replace Table */ |
|
591 |
- for(i=0; replaceTable[2*i]!=NULL; i++) |
|
592 |
- { |
|
593 |
- if(!strcmp(replaceTable[2*i], filterName)) |
|
594 |
- { |
|
595 |
- int newlen= strlen(replaceTable[2*i + 1]); |
|
596 |
- int plen; |
|
597 |
- int spaceLeft; |
|
598 |
- |
|
599 |
- if(p==NULL) p= temp, *p=0; //last filter |
|
600 |
- else p--, *p=','; //not last filter |
|
601 |
- |
|
602 |
- plen= strlen(p); |
|
603 |
- spaceLeft= p - temp + plen; |
|
604 |
- if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE) |
|
605 |
- { |
|
606 |
- ppMode->error++; |
|
607 |
- break; |
|
608 |
- } |
|
609 |
- memmove(p + newlen, p, plen+1); |
|
610 |
- memcpy(p, replaceTable[2*i + 1], newlen); |
|
611 |
- filterNameOk=1; |
|
612 |
- } |
|
613 |
- } |
|
614 |
- |
|
615 |
- for(i=0; filters[i].shortName!=NULL; i++) |
|
616 |
- { |
|
617 |
-// printf("Compareing %s, %s, %s\n", filters[i].shortName,filters[i].longName, filterName); |
|
618 |
- if( !strcmp(filters[i].longName, filterName) |
|
619 |
- || !strcmp(filters[i].shortName, filterName)) |
|
620 |
- { |
|
621 |
- ppMode->lumMode &= ~filters[i].mask; |
|
622 |
- ppMode->chromMode &= ~filters[i].mask; |
|
623 |
- |
|
624 |
- filterNameOk=1; |
|
625 |
- if(!enable) break; // user wants to disable it |
|
626 |
- |
|
627 |
- if(q >= filters[i].minLumQuality) |
|
628 |
- ppMode->lumMode|= filters[i].mask; |
|
629 |
- if(chrom==1 || (chrom==-1 && filters[i].chromDefault)) |
|
630 |
- if(q >= filters[i].minChromQuality) |
|
631 |
- ppMode->chromMode|= filters[i].mask; |
|
632 |
- |
|
633 |
- if(filters[i].mask == LEVEL_FIX) |
|
634 |
- { |
|
635 |
- int o; |
|
636 |
- ppMode->minAllowedY= 16; |
|
637 |
- ppMode->maxAllowedY= 234; |
|
638 |
- for(o=0; options[o]!=NULL; o++) |
|
639 |
- { |
|
640 |
- if( !strcmp(options[o],"fullyrange") |
|
641 |
- ||!strcmp(options[o],"f")) |
|
642 |
- { |
|
643 |
- ppMode->minAllowedY= 0; |
|
644 |
- ppMode->maxAllowedY= 255; |
|
645 |
- numOfUnknownOptions--; |
|
646 |
- } |
|
647 |
- } |
|
648 |
- } |
|
649 |
- else if(filters[i].mask == TEMP_NOISE_FILTER) |
|
650 |
- { |
|
651 |
- int o; |
|
652 |
- int numOfNoises=0; |
|
653 |
- |
|
654 |
- for(o=0; options[o]!=NULL; o++) |
|
655 |
- { |
|
656 |
- char *tail; |
|
657 |
- ppMode->maxTmpNoise[numOfNoises]= |
|
658 |
- strtol(options[o], &tail, 0); |
|
659 |
- if(tail!=options[o]) |
|
660 |
- { |
|
661 |
- numOfNoises++; |
|
662 |
- numOfUnknownOptions--; |
|
663 |
- if(numOfNoises >= 3) break; |
|
664 |
- } |
|
665 |
- } |
|
666 |
- } |
|
667 |
- else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK) |
|
668 |
- { |
|
669 |
- int o; |
|
670 |
- |
|
671 |
- for(o=0; options[o]!=NULL && o<2; o++) |
|
672 |
- { |
|
673 |
- char *tail; |
|
674 |
- int val= strtol(options[o], &tail, 0); |
|
675 |
- if(tail==options[o]) break; |
|
676 |
- |
|
677 |
- numOfUnknownOptions--; |
|
678 |
- if(o==0) ppMode->baseDcDiff= val; |
|
679 |
- else ppMode->flatnessThreshold= val; |
|
680 |
- } |
|
681 |
- } |
|
682 |
- else if(filters[i].mask == FORCE_QUANT) |
|
683 |
- { |
|
684 |
- int o; |
|
685 |
- ppMode->forcedQuant= 15; |
|
686 |
- |
|
687 |
- for(o=0; options[o]!=NULL && o<1; o++) |
|
688 |
- { |
|
689 |
- char *tail; |
|
690 |
- int val= strtol(options[o], &tail, 0); |
|
691 |
- if(tail==options[o]) break; |
|
692 |
- |
|
693 |
- numOfUnknownOptions--; |
|
694 |
- ppMode->forcedQuant= val; |
|
695 |
- } |
|
696 |
- } |
|
697 |
- } |
|
698 |
- } |
|
699 |
- if(!filterNameOk) ppMode->error++; |
|
700 |
- ppMode->error += numOfUnknownOptions; |
|
701 |
- } |
|
702 |
- |
|
703 |
- if(verbose>1) printf("pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode); |
|
704 |
- if(ppMode->error) |
|
705 |
- { |
|
706 |
- fprintf(stderr, "%d errors in postprocess string \"%s\"\n", ppMode->error, name); |
|
707 |
- free(ppMode); |
|
708 |
- return NULL; |
|
709 |
- } |
|
710 |
- return ppMode; |
|
711 |
-} |
|
712 |
- |
|
713 |
-void pp_free_mode(pp_mode_t *mode){ |
|
714 |
- if(mode) free(mode); |
|
715 |
-} |
|
716 |
- |
|
717 |
/**
 * Replace *p with a freshly allocated, zero-filled buffer.
 *
 * The previous buffer (if any) is freed; its contents are not preserved.
 *
 * @param p         address of the pointer to replace
 * @param alignment alignment passed to memalign()
 * @param size      number of bytes to allocate and clear
 *
 * If the allocation fails, *p is left NULL and nothing is cleared.
 */
static void reallocAlign(void **p, int alignment, int size){
    free(*p); // free(NULL) is a no-op
    *p= memalign(alignment, size);
    if(*p) memset(*p, 0, size);
}
|
722 |
- |
|
723 |
-static void reallocBuffers(PPContext *c, int width, int height, int stride){ |
|
724 |
- int mbWidth = (width+15)>>4; |
|
725 |
- int mbHeight= (height+15)>>4; |
|
726 |
- int i; |
|
727 |
- |
|
728 |
- c->stride= stride; |
|
729 |
- |
|
730 |
- reallocAlign((void **)&c->tempDst, 8, stride*24); |
|
731 |
- reallocAlign((void **)&c->tempSrc, 8, stride*24); |
|
732 |
- reallocAlign((void **)&c->tempBlocks, 8, 2*16*8); |
|
733 |
- reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t)); |
|
734 |
- for(i=0; i<256; i++) |
|
735 |
- c->yHistogram[i]= width*height/64*15/256; |
|
736 |
- |
|
737 |
- for(i=0; i<3; i++) |
|
738 |
- { |
|
739 |
- //Note:the +17*1024 is just there so i dont have to worry about r/w over te end |
|
740 |
- reallocAlign((void **)&c->tempBlured[i], 8, stride*mbHeight*16 + 17*1024); |
|
741 |
- reallocAlign((void **)&c->tempBluredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size |
|
742 |
- } |
|
743 |
- |
|
744 |
- reallocAlign((void **)&c->deintTemp, 8, width+16); |
|
745 |
- reallocAlign((void **)&c->nonBQPTable, 8, mbWidth*mbHeight*sizeof(QP_STORE_T)); |
|
746 |
- reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T)); |
|
747 |
-} |
|
748 |
- |
|
749 |
-pp_context_t *pp_get_context(int width, int height, int cpuCaps){ |
|
750 |
- PPContext *c= memalign(32, sizeof(PPContext)); |
|
751 |
- int i; |
|
752 |
- int stride= (width+15)&(~15); //assumed / will realloc if needed |
|
753 |
- |
|
754 |
- memset(c, 0, sizeof(PPContext)); |
|
755 |
- c->cpuCaps= cpuCaps; |
|
756 |
- if(cpuCaps&PP_FORMAT){ |
|
757 |
- c->hChromaSubSample= cpuCaps&0x3; |
|
758 |
- c->vChromaSubSample= (cpuCaps>>4)&0x3; |
|
759 |
- }else{ |
|
760 |
- c->hChromaSubSample= 1; |
|
761 |
- c->vChromaSubSample= 1; |
|
762 |
- } |
|
763 |
- |
|
764 |
- reallocBuffers(c, width, height, stride); |
|
765 |
- |
|
766 |
- c->frameNum=-1; |
|
767 |
- |
|
768 |
- return c; |
|
769 |
-} |
|
770 |
- |
|
771 |
-void pp_free_context(void *vc){ |
|
772 |
- PPContext *c = (PPContext*)vc; |
|
773 |
- int i; |
|
774 |
- |
|
775 |
- for(i=0; i<3; i++) free(c->tempBlured[i]); |
|
776 |
- for(i=0; i<3; i++) free(c->tempBluredPast[i]); |
|
777 |
- |
|
778 |
- free(c->tempBlocks); |
|
779 |
- free(c->yHistogram); |
|
780 |
- free(c->tempDst); |
|
781 |
- free(c->tempSrc); |
|
782 |
- free(c->deintTemp); |
|
783 |
- free(c->nonBQPTable); |
|
784 |
- free(c->forcedQPTable); |
|
785 |
- |
|
786 |
- memset(c, 0, sizeof(PPContext)); |
|
787 |
- |
|
788 |
- free(c); |
|
789 |
-} |
|
790 |
- |
|
791 |
-void pp_postprocess(uint8_t * src[3], int srcStride[3], |
|
792 |
- uint8_t * dst[3], int dstStride[3], |
|
793 |
- int width, int height, |
|
794 |
- QP_STORE_T *QP_store, int QPStride, |
|
795 |
- pp_mode_t *vm, void *vc, int pict_type) |
|
796 |
-{ |
|
797 |
- int mbWidth = (width+15)>>4; |
|
798 |
- int mbHeight= (height+15)>>4; |
|
799 |
- PPMode *mode = (PPMode*)vm; |
|
800 |
- PPContext *c = (PPContext*)vc; |
|
801 |
- int minStride= MAX(srcStride[0], dstStride[0]); |
|
802 |
- |
|
803 |
- if(c->stride < minStride) |
|
804 |
- reallocBuffers(c, width, height, minStride); |
|
805 |
- |
|
806 |
- if(QP_store==NULL || (mode->lumMode & FORCE_QUANT)) |
|
807 |
- { |
|
808 |
- int i; |
|
809 |
- QP_store= c->forcedQPTable; |
|
810 |
- QPStride= 0; |
|
811 |
- if(mode->lumMode & FORCE_QUANT) |
|
812 |
- for(i=0; i<mbWidth; i++) QP_store[i]= mode->forcedQuant; |
|
813 |
- else |
|
814 |
- for(i=0; i<mbWidth; i++) QP_store[i]= 1; |
|
815 |
- } |
|
816 |
-if(0){ |
|
817 |
-int x,y; |
|
818 |
-for(y=0; y<mbHeight; y++){ |
|
819 |
- for(x=0; x<mbWidth; x++){ |
|
820 |
- printf("%2d ", QP_store[x + y*QPStride]); |
|
821 |
- } |
|
822 |
- printf("\n"); |
|
823 |
-} |
|
824 |
- printf("\n"); |
|
825 |
-} |
|
826 |
-//printf("pict_type:%d\n", pict_type); |
|
827 |
- |
|
828 |
- if(pict_type!=3) |
|
829 |
- { |
|
830 |
- int x,y; |
|
831 |
- for(y=0; y<mbHeight; y++){ |
|
832 |
- for(x=0; x<mbWidth; x++){ |
|
833 |
- int qscale= QP_store[x + y*QPStride]; |
|
834 |
- if(qscale&~31) |
|
835 |
- qscale=31; |
|
836 |
- c->nonBQPTable[y*mbWidth + x]= qscale; |
|
837 |
- } |
|
838 |
- } |
|
839 |
- } |
|
840 |
- |
|
841 |
- if(verbose>2) |
|
842 |
- { |
|
843 |
- printf("using npp filters 0x%X/0x%X\n", mode->lumMode, mode->chromMode); |
|
844 |
- } |
|
845 |
- |
|
846 |
- postProcess(src[0], srcStride[0], dst[0], dstStride[0], |
|
847 |
- width, height, QP_store, QPStride, 0, mode, c); |
|
848 |
- |
|
849 |
- width = (width )>>c->hChromaSubSample; |
|
850 |
- height = (height)>>c->vChromaSubSample; |
|
851 |
- |
|
852 |
- if(mode->chromMode) |
|
853 |
- { |
|
854 |
- postProcess(src[1], srcStride[1], dst[1], dstStride[1], |
|
855 |
- width, height, QP_store, QPStride, 1, mode, c); |
|
856 |
- postProcess(src[2], srcStride[2], dst[2], dstStride[2], |
|
857 |
- width, height, QP_store, QPStride, 2, mode, c); |
|
858 |
- } |
|
859 |
- else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]) |
|
860 |
- { |
|
861 |
- memcpy(dst[1], src[1], srcStride[1]*height); |
|
862 |
- memcpy(dst[2], src[2], srcStride[2]*height); |
|
863 |
- } |
|
864 |
- else |
|
865 |
- { |
|
866 |
- int y; |
|
867 |
- for(y=0; y<height; y++) |
|
868 |
- { |
|
869 |
- memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width); |
|
870 |
- memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width); |
|
871 |
- } |
|
872 |
- } |
|
873 |
-} |
|
874 |
- |
875 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,73 +0,0 @@ |
1 |
-/* |
|
2 |
- Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) |
|
3 |
- |
|
4 |
- This program is free software; you can redistribute it and/or modify |
|
5 |
- it under the terms of the GNU General Public License as published by |
|
6 |
- the Free Software Foundation; either version 2 of the License, or |
|
7 |
- (at your option) any later version. |
|
8 |
- |
|
9 |
- This program is distributed in the hope that it will be useful, |
|
10 |
- but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 |
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 |
- GNU General Public License for more details. |
|
13 |
- |
|
14 |
- You should have received a copy of the GNU General Public License |
|
15 |
- along with this program; if not, write to the Free Software |
|
16 |
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
17 |
-*/ |
|
18 |
- |
|
19 |
#ifndef NEWPOSTPROCESS_H
#define NEWPOSTPROCESS_H

/**
 * @file postprocess.h
 * @brief
 *     external api for the pp stuff
 */

/* uint8_t / int8_t are used below, so pull them in here and keep the header
   usable on its own */
#include <inttypes.h>

#ifdef __cplusplus
extern "C" {
#endif

#define PP_QUALITY_MAX 6

#define QP_STORE_T int8_t

typedef void pp_context_t;
typedef void pp_mode_t;

extern char *pp_help; //a simple help text

void  pp_postprocess(uint8_t * src[3], int srcStride[3],
                 uint8_t * dst[3], int dstStride[3],
                 int horizontalSize, int verticalSize,
                 QP_STORE_T *QP_store,  int QP_stride,
                 pp_mode_t *mode, pp_context_t *ppContext, int pict_type);


/**
 * returns a pp_mode_t or NULL if an error occurred
 * name is the string after "-pp" on the command line
 * quality is a number from 0 to PP_QUALITY_MAX
 */
pp_mode_t *pp_get_mode_by_name_and_quality(char *name, int quality);
void pp_free_mode(pp_mode_t *mode);

pp_context_t *pp_get_context(int width, int height, int flags);
void pp_free_context(pp_context_t *ppContext);

/* CPU capability bits for the flags argument of pp_get_context() */
#define PP_CPU_CAPS_MMX   0x80000000
#define PP_CPU_CAPS_MMX2  0x20000000
#define PP_CPU_CAPS_3DNOW 0x40000000

/* Picture format bits for the same flags argument: PP_FORMAT marks the format
   field as present; the low hex digit is the horizontal and the next hex
   digit the vertical chroma subsampling value */
#define PP_FORMAT         0x00000008
#define PP_FORMAT_420    (0x00000011|PP_FORMAT)
#define PP_FORMAT_422    (0x00000001|PP_FORMAT)
#define PP_FORMAT_411    (0x00000002|PP_FORMAT)
#define PP_FORMAT_444    (0x00000000|PP_FORMAT)

#ifdef __cplusplus
}
#endif

#endif
74 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,128 +0,0 @@ |
1 |
-/* |
|
2 |
- Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
3 |
- |
|
4 |
- This program is free software; you can redistribute it and/or modify |
|
5 |
- it under the terms of the GNU General Public License as published by |
|
6 |
- the Free Software Foundation; either version 2 of the License, or |
|
7 |
- (at your option) any later version. |
|
8 |
- |
|
9 |
- This program is distributed in the hope that it will be useful, |
|
10 |
- but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 |
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 |
- GNU General Public License for more details. |
|
13 |
- |
|
14 |
- You should have received a copy of the GNU General Public License |
|
15 |
- along with this program; if not, write to the Free Software |
|
16 |
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
17 |
-*/ |
|
18 |
- |
|
19 |
/* Filter selection bits. */
#define V_DEBLOCK 0x01
#define H_DEBLOCK 0x02
#define DERING 0x04
#define LEVEL_FIX 0x08 /* Brightness & Contrast */

/* Luma names alias the base bits; chroma names are the same bits shifted left by 4. */
#define LUM_V_DEBLOCK V_DEBLOCK // 1
#define LUM_H_DEBLOCK H_DEBLOCK // 2
#define CHROM_V_DEBLOCK (V_DEBLOCK<<4) // 16
#define CHROM_H_DEBLOCK (H_DEBLOCK<<4) // 32
#define LUM_DERING DERING // 4
#define CHROM_DERING (DERING<<4) // 64
#define LUM_LEVEL_FIX LEVEL_FIX // 8
#define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128 (not implemented yet)

// Experimental vertical filters
#define V_X1_FILTER 0x0200 // 512

// Experimental horizontal filters
#define H_X1_FILTER 0x2000 // 8192

// select between full y range (255-0) or standard one (234-16)
#define FULL_Y_RANGE 0x8000 // 32768

//Deinterlacing Filters
#define LINEAR_IPOL_DEINT_FILTER 0x10000 // 65536
#define LINEAR_BLEND_DEINT_FILTER 0x20000 // 131072
// NOTE: CUBIC_BLEND_DEINT_FILTER has the same value as FULL_Y_RANGE (0x8000)
#define CUBIC_BLEND_DEINT_FILTER 0x8000 // (not implemented yet)
#define CUBIC_IPOL_DEINT_FILTER 0x40000 // 262144
#define MEDIAN_DEINT_FILTER 0x80000 // 524288
#define FFMPEG_DEINT_FILTER 0x400000

#define TEMP_NOISE_FILTER 0x100000
#define FORCE_QUANT 0x200000

//use if you want a faster postprocessing code
//can't differentiate between chroma & luma filters (both on or both off)
//obviously the -pp option at the commandline has no effect except turning the here selected
//filters on
//#define COMPILE_TIME_MODE 0x77
|
58 |
- |
|
59 |
/* Description of one named filter: its names, default quality thresholds and mode bit. */
struct PPFilter{
    char *shortName;
    char *longName;
    int chromDefault; // is chrominance filtering on by default if this filter is manually activated
    int minLumQuality; // minimum quality to turn luminance filtering on
    int minChromQuality; // minimum quality to turn chrominance filtering on
    int mask; // Bitmask to turn this filter on
};
|
67 |
- |
|
68 |
/* Parsed filter configuration: which filters are active and their parameters. */
typedef struct PPMode{
    int lumMode; // activates filters for luminance
    int chromMode; // activates filters for chrominance
    int error; // non zero on error

    int minAllowedY; // for brightness correction
    int maxAllowedY; // for brightness correction
    float maxClippedThreshold; // amount of "black" you are willing to lose to get a brightness corrected picture

    int maxTmpNoise[3]; // for Temporal Noise Reducing filter (Maximal sum of abs differences)

    int baseDcDiff;
    int flatnessThreshold;

    int forcedQuant; // quantizer if FORCE_QUANT is used
} PPMode;
|
84 |
- |
|
85 |
-typedef struct PPContext{ |
|
86 |
- uint8_t *tempBlocks; //used for the horizontal code |
|
87 |
- |
|
88 | ||
89 |
- after watching a black picture for 5 hours*/ |
|
90 |
- uint64_t *yHistogram; |
|
91 |
- |
|
92 |
- uint64_t __attribute__((aligned(8))) packedYOffset; |
|
93 |
- uint64_t __attribute__((aligned(8))) packedYScale; |
|
94 |
- |
|
95 |
- /* Temporal noise reducing buffers */ |
|
96 |
- uint8_t *tempBlured[3]; |
|
97 |
- int32_t *tempBluredPast[3]; |
|
98 |
- |
|
99 |
- /* Temporary buffers for handling the last row(s) */ |
|
100 |
- uint8_t *tempDst; |
|
101 |
- uint8_t *tempSrc; |
|
102 |
- |
|
103 |
- uint8_t *deintTemp; |
|
104 |
- |
|
105 |
- uint64_t __attribute__((aligned(8))) pQPb; |
|
106 |
- uint64_t __attribute__((aligned(8))) pQPb2; |
|
107 |
- |
|
108 |
- uint64_t __attribute__((aligned(8))) mmxDcOffset[32]; |
|
109 |
- uint64_t __attribute__((aligned(8))) mmxDcThreshold[32]; |
|
110 |
- |
|
111 |
- QP_STORE_T *nonBQPTable; |
|
112 |
- QP_STORE_T *forcedQPTable; |
|
113 |
- |
|
114 |
- int QP; |
|
115 |
- int nonBQP; |
|
116 |
- |
|
117 |
- int frameNum; |
|
118 |
- |
|
119 |
- int cpuCaps; |
|
120 |
- |
|
121 |
- int stride; //size of some buffers (needed to realloc them if needed) |
|
122 |
- |
|
123 |
- int hChromaSubSample; |
|
124 |
- int vChromaSubSample; |
|
125 |
- |
|
126 |
- PPMode ppMode; |
|
127 |
-} PPContext; |
|
128 |
- |
129 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,3127 +0,0 @@ |
1 |
-/* |
|
2 |
- Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
3 |
- |
|
4 |
- This program is free software; you can redistribute it and/or modify |
|
5 |
- it under the terms of the GNU General Public License as published by |
|
6 |
- the Free Software Foundation; either version 2 of the License, or |
|
7 |
- (at your option) any later version. |
|
8 |
- |
|
9 |
- This program is distributed in the hope that it will be useful, |
|
10 |
- but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 |
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
12 |
- GNU General Public License for more details. |
|
13 |
- |
|
14 |
- You should have received a copy of the GNU General Public License |
|
15 |
- along with this program; if not, write to the Free Software |
|
16 |
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
17 |
-*/ |
|
18 |
- |
|
19 |
/* Instruction-string helpers for the inline assembly below. They are
   undefined first so that each definition matches the HAVE_* macros that are
   set at this point. */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/* PAVGB(a,b): emits the byte-average instruction (pavgb or pavgusb) with
   operands a, b; not defined when neither MMX2 nor 3DNow is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* PMINUB: unsigned byte minimum. The MMX2 form emits pminub and ignores t;
   the plain MMX form is a three-instruction sequence that uses t as scratch
   and modifies its second argument. */
#ifdef HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
    "movq " #a ", " #t " \n\t"\
    "psubusb " #b ", " #t " \n\t"\
    "psubb " #t ", " #a " \n\t"
#endif

/* PMAXUB(a,b): unsigned byte maximum; pmaxub on MMX2, otherwise a
   saturating-subtract followed by an add, both writing to b. */
#ifdef HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
    "psubusb " #a ", " #b " \n\t"\
    "paddb " #a ", " #b " \n\t"
#endif
|
45 |
- |
|
46 |
- |
|
47 |
-//FIXME? |255-0| = 1 (shouldnt be a problem ...) |
|
48 |
-#ifdef HAVE_MMX |
|
49 |
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * For each of the 7 pairs of vertically adjacent lines in the 8x8 block the
 * byte-wise difference plus c->mmxDcOffset[c->nonBQP] is compared against
 * c->mmxDcThreshold[c->nonBQP]; the number of passing comparisons (modulo
 * 256) is then compared with c->ppMode.flatnessThreshold.
 *
 * @return non-zero if that count exceeds the flatness threshold
 */
static inline int RENAME(isVertDC)(uint8_t src[], int stride, PPContext *c){
    int numEq= 0;
    src+= stride*4; // src points to begin of the 8x8 Block
asm volatile(
        "leal (%1, %2), %%eax \n\t"
// 0 1 2 3 4 5 6 7 8 9
// %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
        "movq %3, %%mm7 \n\t"
        "movq %4, %%mm6 \n\t"

        "movq (%1), %%mm0 \n\t"
        "movq (%%eax), %%mm1 \n\t"
        "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
        "paddb %%mm7, %%mm0 \n\t"
        "pcmpgtb %%mm6, %%mm0 \n\t" // 0xFF (-1) in every byte that passes

        "movq (%%eax,%2), %%mm2 \n\t"
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t" // accumulate the -1 results per byte lane

        "movq (%%eax, %2, 2), %%mm1 \n\t"
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "leal (%%eax, %2, 4), %%eax \n\t"

        "movq (%1, %2, 4), %%mm2 \n\t"
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%eax), %%mm1 \n\t"
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "movq (%%eax, %2), %%mm2 \n\t"
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        "movq (%%eax, %2, 2), %%mm1 \n\t"
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        " \n\t"
// horizontal sum of the 8 byte lanes of mm0
#ifdef HAVE_MMX2
        "pxor %%mm7, %%mm7 \n\t"
        "psadbw %%mm7, %%mm0 \n\t"
#else
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $16, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $32, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
#endif
        "movd %%mm0, %0 \n\t"
        : "=r" (numEq)
        : "r" (src), "r" (stride), "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
        : "%eax"
        );
    // each passing comparison contributed -1, so negate and keep the low byte
    numEq= (-numEq) &0xFF;
    return numEq > c->ppMode.flatnessThreshold;
}
|
129 |
-#endif |
|
130 |
- |
|
131 |
/**
 * Check whether the block is smooth enough vertically, relative to the
 * quantizer.
 *
 * In the MMX path and the active C path, the lines at src+4*stride and
 * src+11*stride are compared column by column.
 *
 * @return 1 if no column differs by more than 2*QP, 0 otherwise
 */
static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, PPContext *c)
{
#ifdef HAVE_MMX
    int isOk;
    src+= stride*3;
    asm volatile(
        "movq (%1, %2), %%mm0 \n\t"
        "movq (%1, %2, 8), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "psubusb %%mm1, %%mm0 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm1, %%mm0 \n\t" // ABS Diff

        "movq %3, %%mm7 \n\t" // QP,..., QP
        "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
        "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
        "packssdw %%mm0, %%mm0 \n\t"
        "movd %%mm0, %0 \n\t"
        : "=r" (isOk)
        : "r" (src), "r" (stride), "m" (c->pQPb)
        );
    return isOk==0;
#else
#if 1
    int x;
    const int QP= c->QP;
    src+= stride*3;
    for(x=0; x<BLOCK_SIZE; x++)
    {
        // single unsigned compare, equivalent to testing |a - b| > 2*QP
        if((unsigned)(src[x + stride] - src[x + (stride<<3)] + 2*QP) > 4*QP) return 0;
    }

    return 1;
#else
    // disabled alternative: per-column min/max over 8 lines starting at src+4*stride
    int x;
    const int QP= c->QP;
    src+= stride*4;
    for(x=0; x<BLOCK_SIZE; x++)
    {
        int min=255;
        int max=0;
        int y;
        for(y=0; y<8; y++){
            int v= src[x + y*stride];
            if(v>max) max=v;
            if(v<min) min=v;
        }
        if(max-min > 2*QP) return 0;
    }
    return 1;
#endif
#endif
}
|
184 |
- |
|
185 |
-/** |
|
186 |
- * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
|
187 |
- * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
|
188 |
- */ |
|
189 |
-static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
|
190 |
-{ |
|
191 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
192 |
- src+= stride*3; |
|
193 |
- asm volatile( //"movv %0 %1 %2\n\t" |
|
194 |
- "movq %2, %%mm0 \n\t" // QP,..., QP |
|
195 |
- "pxor %%mm4, %%mm4 \n\t" |
|
196 |
- |
|
197 |
- "movq (%0), %%mm6 \n\t" |
|
198 |
- "movq (%0, %1), %%mm5 \n\t" |
|
199 |
- "movq %%mm5, %%mm1 \n\t" |
|
200 |
- "movq %%mm6, %%mm2 \n\t" |
|
201 |
- "psubusb %%mm6, %%mm5 \n\t" |
|
202 |
- "psubusb %%mm1, %%mm2 \n\t" |
|
203 |
- "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
204 |
- "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
|
205 |
- "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
206 |
- |
|
207 |
- "pand %%mm2, %%mm6 \n\t" |
|
208 |
- "pandn %%mm1, %%mm2 \n\t" |
|
209 |
- "por %%mm2, %%mm6 \n\t"// First Line to Filter |
|
210 |
- |
|
211 |
- "movq (%0, %1, 8), %%mm5 \n\t" |
|
212 |
- "leal (%0, %1, 4), %%eax \n\t" |
|
213 |
- "leal (%0, %1, 8), %%ecx \n\t" |
|
214 |
- "subl %1, %%ecx \n\t" |
|
215 |
- "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
|
216 |
- "movq (%0, %1, 8), %%mm7 \n\t" |
|
217 |
- "movq %%mm5, %%mm1 \n\t" |
|
218 |
- "movq %%mm7, %%mm2 \n\t" |
|
219 |
- "psubusb %%mm7, %%mm5 \n\t" |
|
220 |
- "psubusb %%mm1, %%mm2 \n\t" |
|
221 |
- "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
222 |
- "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
|
223 |
- "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
224 |
- |
|
225 |
- "pand %%mm2, %%mm7 \n\t" |
|
226 |
- "pandn %%mm1, %%mm2 \n\t" |
|
227 |
- "por %%mm2, %%mm7 \n\t" // First Line to Filter |
|
228 |
- |
|
229 |
- |
|
230 |
- // 1 2 3 4 5 6 7 8 |
|
231 |
- // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
|
232 |
- // 6 4 2 2 1 1 |
|
233 |
- // 6 4 4 2 |
|
234 |
- // 6 8 2 |
|
235 |
- |
|
236 |
- "movq (%0, %1), %%mm0 \n\t" // 1 |
|
237 |
- "movq %%mm0, %%mm1 \n\t" // 1 |
|
238 |
- PAVGB(%%mm6, %%mm0) //1 1 /2 |
|
239 |
- PAVGB(%%mm6, %%mm0) //3 1 /4 |
|
240 |
- |
|
241 |
- "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
|
242 |
- "movq %%mm2, %%mm5 \n\t" // 1 |
|
243 |
- PAVGB((%%eax), %%mm2) // 11 /2 |
|
244 |
- PAVGB((%0, %1, 2), %%mm2) // 211 /4 |
|
245 |
- "movq %%mm2, %%mm3 \n\t" // 211 /4 |
|
246 |
- "movq (%0), %%mm4 \n\t" // 1 |
|
247 |
- PAVGB(%%mm4, %%mm3) // 4 211 /8 |
|
248 |
- PAVGB(%%mm0, %%mm3) //642211 /16 |
|
249 |
- "movq %%mm3, (%0) \n\t" // X |
|
250 |
- // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
|
251 |
- "movq %%mm1, %%mm0 \n\t" // 1 |
|
252 |
- PAVGB(%%mm6, %%mm0) //1 1 /2 |
|
253 |
- "movq %%mm4, %%mm3 \n\t" // 1 |
|
254 |
- PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
|
255 |
- PAVGB((%%eax,%1,2), %%mm5) // 11 /2 |
|
256 |
- PAVGB((%%eax), %%mm5) // 211 /4 |
|
257 |
- PAVGB(%%mm5, %%mm3) // 2 2211 /8 |
|
258 |
- PAVGB(%%mm0, %%mm3) //4242211 /16 |
|
259 |
- "movq %%mm3, (%0,%1) \n\t" // X |
|
260 |
- // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
|
261 |
- PAVGB(%%mm4, %%mm6) //11 /2 |
|
262 |
- "movq (%%ecx), %%mm0 \n\t" // 1 |
|
263 |
- PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
|
264 |
- "movq %%mm0, %%mm3 \n\t" // 11/2 |
|
265 |
- PAVGB(%%mm1, %%mm0) // 2 11/4 |
|
266 |
- PAVGB(%%mm6, %%mm0) //222 11/8 |
|
267 |
- PAVGB(%%mm2, %%mm0) //22242211/16 |
|
268 |
- "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
|
269 |
- "movq %%mm0, (%0, %1, 2) \n\t" // X |
|
270 |
- // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
|
271 |
- "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
|
272 |
- PAVGB((%%ecx), %%mm0) // 11 /2 |
|
273 |
- PAVGB(%%mm0, %%mm6) //11 11 /4 |
|
274 |
- PAVGB(%%mm1, %%mm4) // 11 /2 |
|
275 |
- PAVGB(%%mm2, %%mm1) // 11 /2 |
|
276 |
- PAVGB(%%mm1, %%mm6) //1122 11 /8 |
|
277 |
- PAVGB(%%mm5, %%mm6) //112242211 /16 |
|
278 |
- "movq (%%eax), %%mm5 \n\t" // 1 |
|
279 |
- "movq %%mm6, (%%eax) \n\t" // X |
|
280 |
- // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
|
281 |
- "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
|
282 |
- PAVGB(%%mm7, %%mm6) // 11 /2 |
|
283 |
- PAVGB(%%mm4, %%mm6) // 11 11 /4 |
|
284 |
- PAVGB(%%mm3, %%mm6) // 11 2211 /8 |
|
285 |
- PAVGB(%%mm5, %%mm2) // 11 /2 |
|
286 |
- "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
|
287 |
- PAVGB(%%mm4, %%mm2) // 112 /4 |
|
288 |
- PAVGB(%%mm2, %%mm6) // 112242211 /16 |
|
289 |
- "movq %%mm6, (%0, %1, 4) \n\t" // X |
|
290 |
- // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
|
291 |
- PAVGB(%%mm7, %%mm1) // 11 2 /4 |
|
292 |
- PAVGB(%%mm4, %%mm5) // 11 /2 |
|
293 |
- PAVGB(%%mm5, %%mm0) // 11 11 /4 |
|
294 |
- "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
|
295 |
- PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
|
296 |
- PAVGB(%%mm0, %%mm1) // 11224222 /16 |
|
297 |
- "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
|
298 |
- // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
|
299 |
- PAVGB((%%ecx), %%mm2) // 112 4 /8 |
|
300 |
- "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
|
301 |
- PAVGB(%%mm0, %%mm6) // 1 1 /2 |
|
302 |
- PAVGB(%%mm7, %%mm6) // 1 12 /4 |
|
303 |
- PAVGB(%%mm2, %%mm6) // 1122424 /4 |
|
304 |
- "movq %%mm6, (%%ecx) \n\t" // X |
|
305 |
- // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
|
306 |
- PAVGB(%%mm7, %%mm5) // 11 2 /4 |
|
307 |
- PAVGB(%%mm7, %%mm5) // 11 6 /8 |
|
308 |
- |
|
309 |
- PAVGB(%%mm3, %%mm0) // 112 /4 |
|
310 |
- PAVGB(%%mm0, %%mm5) // 112246 /16 |
|
311 |
- "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
|
312 |
- "subl %1, %0 \n\t" |
|
313 |
- |
|
314 |
- : |
|
315 |
- : "r" (src), "r" (stride), "m" (c->pQPb) |
|
316 |
- : "%eax", "%ecx" |
|
317 |
- ); |
|
318 |
-#else |
|
319 |
- const int l1= stride; |
|
320 |
- const int l2= stride + l1; |
|
321 |
- const int l3= stride + l2; |
|
322 |
- const int l4= stride + l3; |
|
323 |
- const int l5= stride + l4; |
|
324 |
- const int l6= stride + l5; |
|
325 |
- const int l7= stride + l6; |
|
326 |
- const int l8= stride + l7; |
|
327 |
- const int l9= stride + l8; |
|
328 |
- int x; |
|
329 |
- src+= stride*3; |
|
330 |
- for(x=0; x<BLOCK_SIZE; x++) |
|
331 |
- { |
|
332 |
- const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
|
333 |
- const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; |
|
334 |
- |
|
335 |
- int sums[9]; |
|
336 |
- sums[0] = first + src[l1]; |
|
337 |
- sums[1] = src[l1] + src[l2]; |
|
338 |
- sums[2] = src[l2] + src[l3]; |
|
339 |
- sums[3] = src[l3] + src[l4]; |
|
340 |
- sums[4] = src[l4] + src[l5]; |
|
341 |
- sums[5] = src[l5] + src[l6]; |
|
342 |
- sums[6] = src[l6] + src[l7]; |
|
343 |
- sums[7] = src[l7] + src[l8]; |
|
344 |
- sums[8] = src[l8] + last; |
|
345 |
- |
|
346 |
- src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
|
347 |
- src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
|
348 |
- src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
|
349 |
- src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
|
350 |
- src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
|
351 |
- src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
|
352 |
- src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; |
|
353 |
- src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
|
354 |
- |
|
355 |
- src++; |
|
356 |
- } |
|
357 |
-#endif |
|
358 |
-} |
|
359 |
- |
|
360 |
#if 0
/**
 * Experimental vertical deblocking filter (Algorithm 1 of the paper by
 * Ramkishor & Karandikar). Compiled out: it is fast, but it introduces ugly
 * horizontal patterns wherever the picture contains a continuous gradient.
 *
 * With v = l5 - l4 (the step across the block edge), and only where |v| is
 * below about 1.25*QP, the four lines around the edge are changed:
 *   l3 += v/8,  l4 += v/2,  l5 -= v/2,  l6 -= v/8
 * Example column 0 8 16 24 (l3..l6): v = 8, v/2 = 4, v/8 = 1 -> 1 12 12 23
 *
 * The MMX2/3DNow path clips the results, the C path wraps around.
 *
 * @param src    top-left byte of the area to filter; the line numbers above
 *               are counted from src + 3*stride
 * @param stride distance in bytes between two lines
 * @param QP     quantizer of the block (used by the C path only; the asm path
 *               reads the packed pQPb constant instead)
 */
static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
// FIXME rounding
	asm volatile(
		"pxor %%mm7, %%mm7 \n\t"                 // mm7 = 0
		"movq "MANGLE(b80)", %%mm6 \n\t"         // mm6 = 0x80 in every byte
		"leal (%0, %1), %%eax \n\t"              // eax = line 1
		"leal (%%eax, %1, 4), %%ecx \n\t"        // ecx = line 5
//	line:  0    1    2       3        4       5    6       7        8       9
//	addr:  %0   eax  eax+%1  eax+2%1  %0+4%1  ecx  ecx+%1  ecx+2%1  %0+8%1  ecx+4%1

		/* threshold: QP + (QP + 2)/4 in every byte */
		"movq "MANGLE(pQPb)", %%mm0 \n\t"        // QP,..., QP
		"movq %%mm0, %%mm1 \n\t"                 // QP,..., QP
		"paddusb "MANGLE(b02)", %%mm0 \n\t"
		"psrlw $2, %%mm0 \n\t"
		"pand "MANGLE(b3F)", %%mm0 \n\t"         // QP/4,..., QP/4
		"paddusb %%mm1, %%mm0 \n\t"              // QP*1.25 ...

		/* signed half step across the edge */
		"movq (%0, %1, 4), %%mm2 \n\t"           // line 4
		"movq (%%ecx), %%mm3 \n\t"               // line 5
		"movq %%mm2, %%mm4 \n\t"                 // line 4
		"pcmpeqb %%mm5, %%mm5 \n\t"              // -1
		"pxor %%mm2, %%mm5 \n\t"                 // -line 4 - 1
		PAVGB(%%mm3, %%mm5)                      // (l5 - l4)/2 + 128
		"paddb %%mm6, %%mm5 \n\t"                // (l5-l4)/2

		/* keep it only where |l4 - l5| does not exceed the threshold */
		"psubusb %%mm3, %%mm4 \n\t"
		"psubusb %%mm2, %%mm3 \n\t"
		"por %%mm3, %%mm4 \n\t"                  // |l4 - l5|
		"psubusb %%mm0, %%mm4 \n\t"
		"pcmpeqb %%mm7, %%mm4 \n\t"              // mask: within threshold
		"pand %%mm4, %%mm5 \n\t"                 // d/2

		/* lines 4 and 5 move towards each other by d/2 */
		"paddb %%mm5, %%mm2 \n\t"
		"movq %%mm2, (%0,%1, 4) \n\t"

		"movq (%%ecx), %%mm2 \n\t"
		"psubb %%mm5, %%mm2 \n\t"
		"movq %%mm2, (%%ecx) \n\t"

		/* d/2 -> d/8, done as an unsigned shift on the 0x80-biased value */
		"paddb %%mm6, %%mm5 \n\t"
		"psrlw $2, %%mm5 \n\t"
		"pand "MANGLE(b3F)", %%mm5 \n\t"
		"psubb "MANGLE(b20)", %%mm5 \n\t"        // (l5-l4)/8

		/* lines 3 and 6: signed saturating add/sub on biased pixels */
		"movq (%%eax, %1, 2), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t"                // line 3 + 0x80
		"paddsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%eax, %1, 2) \n\t"

		"movq (%%ecx, %1), %%mm2 \n\t"
		"paddb %%mm6, %%mm2 \n\t"                // line 6 + 0x80
		"psubsb %%mm5, %%mm2 \n\t"
		"psubb %%mm6, %%mm2 \n\t"
		"movq %%mm2, (%%ecx, %1) \n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ecx"
	);
#else
	const int threshold= QP + (QP>>2);	// QP*1.25
	int col;

	src+= stride*3;
	for(col=0; col<BLOCK_SIZE; col++)
	{
		/* the four pixels of this column that surround the block edge */
		uint8_t * const p3= src + col + stride*3;
		uint8_t * const p4= p3 + stride;
		uint8_t * const p5= p4 + stride;
		uint8_t * const p6= p5 + stride;
		const int step= *p5 - *p4;

		if(ABS(step) >= threshold)
			continue;	// too large: treated as a real edge, left alone

		*p3 += step>>3;
		*p4 += step>>1;
		*p5 -= step>>1;
		*p6 -= step>>3;
	}
#endif
}
#endif
|
465 |
- |
|
466 |
-/** |
|
467 |
- * Experimental Filter 1 |
|
468 |
- * will not damage linear gradients |
|
469 |
- * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
470 |
- * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
471 |
- * MMX2 version does correct clipping C version doesnt |
|
472 |
- */ |
|
473 |
-static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
|
474 |
-{ |
|
475 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
476 |
- src+= stride*3; |
|
477 |
- |
|
478 |
- asm volatile( |
|
479 |
- "pxor %%mm7, %%mm7 \n\t" // 0 |
|
480 |
- "leal (%0, %1), %%eax \n\t" |
|
481 |
- "leal (%%eax, %1, 4), %%ecx \n\t" |
|
482 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
483 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
484 |
- "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
485 |
- "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
|
486 |
- "movq %%mm1, %%mm2 \n\t" // line 4 |
|
487 |
- "psubusb %%mm0, %%mm1 \n\t" |
|
488 |
- "psubusb %%mm2, %%mm0 \n\t" |
|
489 |
- "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
|
490 |
- "movq (%%ecx), %%mm3 \n\t" // line 5 |
|
491 |
- "movq (%%ecx, %1), %%mm4 \n\t" // line 6 |
|
492 |
- "movq %%mm3, %%mm5 \n\t" // line 5 |
|
493 |
- "psubusb %%mm4, %%mm3 \n\t" |
|
494 |
- "psubusb %%mm5, %%mm4 \n\t" |
|
495 |
- "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
|
496 |
- PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
|
497 |
- "movq %%mm2, %%mm1 \n\t" // line 4 |
|
498 |
- "psubusb %%mm5, %%mm2 \n\t" |
|
499 |
- "movq %%mm2, %%mm4 \n\t" |
|
500 |
- "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
|
501 |
- "psubusb %%mm1, %%mm5 \n\t" |
|
502 |
- "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
|
503 |
- "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
|
504 |
- "movq %%mm4, %%mm3 \n\t" // d |
|
505 |
- "movq %2, %%mm0 \n\t" |
|
506 |
- "paddusb %%mm0, %%mm0 \n\t" |
|
507 |
- "psubusb %%mm0, %%mm4 \n\t" |
|
508 |
- "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
|
509 |
- "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
510 |
- "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
|
511 |
- |
|
512 |
- PAVGB(%%mm7, %%mm3) // d/2 |
|
513 |
- "movq %%mm3, %%mm1 \n\t" // d/2 |
|
514 |
- PAVGB(%%mm7, %%mm3) // d/4 |
|
515 |
- PAVGB(%%mm1, %%mm3) // 3*d/8 |
|
516 |
- |
|
517 |
- "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
|
518 |
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
519 |
- "psubusb %%mm3, %%mm0 \n\t" |
|
520 |
- "pxor %%mm2, %%mm0 \n\t" |
|
521 |
- "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
|
522 |
- |
|
523 |
- "movq (%%ecx), %%mm0 \n\t" // line 5 |
|
524 |
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
525 |
- "paddusb %%mm3, %%mm0 \n\t" |
|
526 |
- "pxor %%mm2, %%mm0 \n\t" |
|
527 |
- "movq %%mm0, (%%ecx) \n\t" // line 5 |
|
528 |
- |
|
529 |
- PAVGB(%%mm7, %%mm1) // d/4 |
|
530 |
- |
|
531 |
- "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
532 |
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
533 |
- "psubusb %%mm1, %%mm0 \n\t" |
|
534 |
- "pxor %%mm2, %%mm0 \n\t" |
|
535 |
- "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
|
536 |
- |
|
537 |
- "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
|
538 |
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
539 |
- "paddusb %%mm1, %%mm0 \n\t" |
|
540 |
- "pxor %%mm2, %%mm0 \n\t" |
|
541 |
- "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
|
542 |
- |
|
543 |
- PAVGB(%%mm7, %%mm1) // d/8 |
|
544 |
- |
|
545 |
- "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
|
546 |
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
|
547 |
- "psubusb %%mm1, %%mm0 \n\t" |
|
548 |
- "pxor %%mm2, %%mm0 \n\t" |
|
549 |
- "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
|
550 |
- |
|
551 |
- "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
|
552 |
- "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
|
553 |
- "paddusb %%mm1, %%mm0 \n\t" |
|
554 |
- "pxor %%mm2, %%mm0 \n\t" |
|
555 |
- "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
|
556 |
- |
|
557 |
- : |
|
558 |
- : "r" (src), "r" (stride), "m" (co->pQPb) |
|
559 |
- : "%eax", "%ecx" |
|
560 |
- ); |
|
561 |
-#else |
|
562 |
- |
|
563 |
- const int l1= stride; |
|
564 |
- const int l2= stride + l1; |
|
565 |
- const int l3= stride + l2; |
|
566 |
- const int l4= stride + l3; |
|
567 |
- const int l5= stride + l4; |
|
568 |
- const int l6= stride + l5; |
|
569 |
- const int l7= stride + l6; |
|
570 |
-// const int l8= stride + l7; |
|
571 |
-// const int l9= stride + l8; |
|
572 |
- int x; |
|
573 |
- |
|
574 |
- src+= stride*3; |
|
575 |
- for(x=0; x<BLOCK_SIZE; x++) |
|
576 |
- { |
|
577 |
- int a= src[l3] - src[l4]; |
|
578 |
- int b= src[l4] - src[l5]; |
|
579 |
- int c= src[l5] - src[l6]; |
|
580 |
- |
|
581 |
- int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
|
582 |
- d= MAX(d, 0); |
|
583 |
- |
|
584 |
- if(d < co->QP*2) |
|
585 |
- { |
|
586 |
- int v = d * SIGN(-b); |
|
587 |
- |
|
588 |
- src[l2] +=v>>3; |
|
589 |
- src[l3] +=v>>2; |
|
590 |
- src[l4] +=(3*v)>>3; |
|
591 |
- src[l5] -=(3*v)>>3; |
|
592 |
- src[l6] -=v>>2; |
|
593 |
- src[l7] -=v>>3; |
|
594 |
- |
|
595 |
- } |
|
596 |
- src++; |
|
597 |
- } |
|
598 |
-#endif |
|
599 |
-} |
|
600 |
- |
|
601 |
-static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
|
602 |
-{ |
|
603 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
604 |
-/* |
|
605 |
- uint8_t tmp[16]; |
|
606 |
- const int l1= stride; |
|
607 |
- const int l2= stride + l1; |
|
608 |
- const int l3= stride + l2; |
|
609 |
- const int l4= (int)tmp - (int)src - stride*3; |
|
610 |
- const int l5= (int)tmp - (int)src - stride*3 + 8; |
|
611 |
- const int l6= stride*3 + l3; |
|
612 |
- const int l7= stride + l6; |
|
613 |
- const int l8= stride + l7; |
|
614 |
- |
|
615 |
- memcpy(tmp, src+stride*7, 8); |
|
616 |
- memcpy(tmp+8, src+stride*8, 8); |
|
617 |
-*/ |
|
618 |
- src+= stride*4; |
|
619 |
- asm volatile( |
|
620 |
- |
|
621 |
-#if 0 //sligtly more accurate and slightly slower |
|
622 |
- "pxor %%mm7, %%mm7 \n\t" // 0 |
|
623 |
- "leal (%0, %1), %%eax \n\t" |
|
624 |
- "leal (%%eax, %1, 4), %%ecx \n\t" |
|
625 |
-// 0 1 2 3 4 5 6 7 |
|
626 |
-// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
|
627 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
|
628 |
- |
|
629 |
- |
|
630 |
- "movq (%0, %1, 2), %%mm0 \n\t" // l2 |
|
631 |
- "movq (%0), %%mm1 \n\t" // l0 |
|
632 |
- "movq %%mm0, %%mm2 \n\t" // l2 |
|
633 |
- PAVGB(%%mm7, %%mm0) // ~l2/2 |
|
634 |
- PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 |
|
635 |
- PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 |
|
636 |
- |
|
637 |
- "movq (%%eax), %%mm1 \n\t" // l1 |
|
638 |
- "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 |
|
639 |
- "movq %%mm1, %%mm4 \n\t" // l1 |
|
640 |
- PAVGB(%%mm7, %%mm1) // ~l1/2 |
|
641 |
- PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 |
|
642 |
- PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 |
|
643 |
- |
|
644 |
- "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 |
|
645 |
- "psubusb %%mm1, %%mm0 \n\t" |
|
646 |
- "psubusb %%mm4, %%mm1 \n\t" |
|
647 |
- "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 |
|
648 |
-// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 |
|
649 |
- |
|
650 |
- "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
|
651 |
- "movq %%mm0, %%mm4 \n\t" // l4 |
|
652 |
- PAVGB(%%mm7, %%mm0) // ~l4/2 |
|
653 |
- PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 |
|
654 |
- PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 |
|
655 |
- |
|
656 |
- "movq (%%ecx), %%mm2 \n\t" // l5 |
|
657 |
- "movq %%mm3, %%mm5 \n\t" // l3 |
|
658 |
- PAVGB(%%mm7, %%mm3) // ~l3/2 |
|
659 |
- PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 |
|
660 |
- PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 |
|
661 |
- |
|
662 |
- "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 |
|
663 |
- "psubusb %%mm3, %%mm0 \n\t" |
|
664 |
- "psubusb %%mm6, %%mm3 \n\t" |
|
665 |
- "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 |
|
666 |
- "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) |
|
667 |
-// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 |
|
668 |
- |
|
669 |
- "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
|
670 |
- "movq %%mm6, %%mm5 \n\t" // l6 |
|
671 |
- PAVGB(%%mm7, %%mm6) // ~l6/2 |
|
672 |
- PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 |
|
673 |
- PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 |
|
674 |
- |
|
675 |
- "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
|
676 |
- "movq %%mm2, %%mm4 \n\t" // l5 |
|
677 |
- PAVGB(%%mm7, %%mm2) // ~l5/2 |
|
678 |
- PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 |
|
679 |
- PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 |
|
680 |
- |
|
681 |
- "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 |
|
682 |
- "psubusb %%mm2, %%mm6 \n\t" |
|
683 |
- "psubusb %%mm4, %%mm2 \n\t" |
|
684 |
- "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 |
|
685 |
-// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 |
|
686 |
- |
|
687 |
- |
|
688 |
- PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 |
|
689 |
- "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
|
690 |
- "paddusb "MANGLE(b01)", %%mm4 \n\t" |
|
691 |
- "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
|
692 |
- "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 |
|
693 |
- "pand %%mm4, %%mm3 \n\t" |
|
694 |
- |
|
695 |
- "movq %%mm3, %%mm1 \n\t" |
|
696 |
-// "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
697 |
- PAVGB(%%mm7, %%mm3) |
|
698 |
- PAVGB(%%mm7, %%mm3) |
|
699 |
- "paddusb %%mm1, %%mm3 \n\t" |
|
700 |
-// "paddusb "MANGLE(b01)", %%mm3 \n\t" |
|
701 |
- |
|
702 |
- "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 |
|
703 |
- "movq (%0, %1, 4), %%mm5 \n\t" //l4 |
|
704 |
- "movq (%0, %1, 4), %%mm4 \n\t" //l4 |
|
705 |
- "psubusb %%mm6, %%mm5 \n\t" |
|
706 |
- "psubusb %%mm4, %%mm6 \n\t" |
|
707 |
- "por %%mm6, %%mm5 \n\t" // |l3-l4| |
|
708 |
- "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) |
|
709 |
- "pxor %%mm6, %%mm0 \n\t" |
|
710 |
- "pand %%mm0, %%mm3 \n\t" |
|
711 |
- PMINUB(%%mm5, %%mm3, %%mm0) |
|
712 |
- |
|
713 |
- "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
714 |
- PAVGB(%%mm7, %%mm3) |
|
715 |
- |
|
716 |
- "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
717 |
- "movq (%0, %1, 4), %%mm2 \n\t" |
|
718 |
- "pxor %%mm6, %%mm0 \n\t" |
|
719 |
- "pxor %%mm6, %%mm2 \n\t" |
|
720 |
- "psubb %%mm3, %%mm0 \n\t" |
|
721 |
- "paddb %%mm3, %%mm2 \n\t" |
|
722 |
- "pxor %%mm6, %%mm0 \n\t" |
|
723 |
- "pxor %%mm6, %%mm2 \n\t" |
|
724 |
- "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
725 |
- "movq %%mm2, (%0, %1, 4) \n\t" |
|
726 |
-#endif |
|
727 |
- |
|
728 |
- "leal (%0, %1), %%eax \n\t" |
|
729 |
- "pcmpeqb %%mm6, %%mm6 \n\t" // -1 |
|
730 |
-// 0 1 2 3 4 5 6 7 |
|
731 |
-// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
|
732 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
|
733 |
- |
|
734 |
- |
|
735 |
- "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 |
|
736 |
- "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
|
737 |
- "pxor %%mm6, %%mm1 \n\t" // -l3-1 |
|
738 |
- PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 |
|
739 |
-// mm1=-l3-1, mm0=128-q |
|
740 |
- |
|
741 |
- "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 |
|
742 |
- "movq (%%eax, %1), %%mm3 \n\t" // l2 |
|
743 |
- "pxor %%mm6, %%mm2 \n\t" // -l5-1 |
|
744 |
- "movq %%mm2, %%mm5 \n\t" // -l5-1 |
|
745 |
- "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
|
746 |
- "leal (%%eax, %1, 4), %%ecx \n\t" |
|
747 |
- PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
|
748 |
- PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 |
|
749 |
- PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 |
|
750 |
- PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 |
|
751 |
-// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 |
|
752 |
- |
|
753 |
- "movq (%%eax), %%mm2 \n\t" // l1 |
|
754 |
- "pxor %%mm6, %%mm2 \n\t" // -l1-1 |
|
755 |
- PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 |
|
756 |
- PAVGB((%0), %%mm1) // (l0-l3+256)/2 |
|
757 |
- "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
|
758 |
- PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
|
759 |
- PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 |
|
760 |
- PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 |
|
761 |
-// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 |
|
762 |
- |
|
763 |
- PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
|
764 |
- "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 |
|
765 |
- "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
|
766 |
- PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 |
|
767 |
- "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
|
768 |
- PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
|
769 |
- PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 |
|
770 |
- PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 |
|
771 |
-// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 |
|
772 |
- |
|
773 |
- "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
|
774 |
- "movq "MANGLE(b00)", %%mm5 \n\t" // 0 |
|
775 |
- "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
|
776 |
- "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 |
|
777 |
- PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| |
|
778 |
- PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| |
|
779 |
- PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 |
|
780 |
- |
|
781 |
-// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 |
|
782 |
- |
|
783 |
- "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
|
784 |
- "movq %2, %%mm2 \n\t" // QP |
|
785 |
- PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
|
786 |
- "psubb %%mm6, %%mm2 \n\t" |
|
787 |
- |
|
788 |
- "movq %%mm4, %%mm1 \n\t" |
|
789 |
- "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) |
|
790 |
- "pxor %%mm1, %%mm4 \n\t" |
|
791 |
- "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 |
|
792 |
- "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 |
|
793 |
- "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 |
|
794 |
-// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 |
|
795 |
- |
|
796 |
- "movq %%mm4, %%mm3 \n\t" // d |
|
797 |
- "psubusb "MANGLE(b01)", %%mm4 \n\t" |
|
798 |
- PAVGB(%%mm7, %%mm4) // d/32 |
|
799 |
- PAVGB(%%mm7, %%mm4) // (d + 32)/64 |
|
800 |
- "paddb %%mm3, %%mm4 \n\t" // 5d/64 |
|
801 |
- "pand %%mm2, %%mm4 \n\t" |
|
802 |
- |
|
803 |
- "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
|
804 |
- "psubb %%mm0, %%mm5 \n\t" // q |
|
805 |
- "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding |
|
806 |
- "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) |
|
807 |
- "pxor %%mm7, %%mm5 \n\t" |
|
808 |
- |
|
809 |
- PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) |
|
810 |
- "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) |
|
811 |
- |
|
812 |
- "pand %%mm7, %%mm4 \n\t" |
|
813 |
- "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
814 |
- "movq (%0, %1, 4), %%mm2 \n\t" |
|
815 |
- "pxor %%mm1, %%mm0 \n\t" |
|
816 |
- "pxor %%mm1, %%mm2 \n\t" |
|
817 |
- "paddb %%mm4, %%mm0 \n\t" |
|
818 |
- "psubb %%mm4, %%mm2 \n\t" |
|
819 |
- "pxor %%mm1, %%mm0 \n\t" |
|
820 |
- "pxor %%mm1, %%mm2 \n\t" |
|
821 |
- "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
822 |
- "movq %%mm2, (%0, %1, 4) \n\t" |
|
823 |
- |
|
824 |
- : |
|
825 |
- : "r" (src), "r" (stride), "m" (c->pQPb) |
|
826 |
- : "%eax", "%ecx" |
|
827 |
- ); |
|
828 |
- |
|
829 |
-/* |
|
830 |
- { |
|
831 |
- int x; |
|
832 |
- src-= stride; |
|
833 |
- for(x=0; x<BLOCK_SIZE; x++) |
|
834 |
- { |
|
835 |
- const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
|
836 |
- if(ABS(middleEnergy)< 8*QP) |
|
837 |
- { |
|
838 |
- const int q=(src[l4] - src[l5])/2; |
|
839 |
- const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
840 |
- const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
841 |
- |
|
842 |
- int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
843 |
- d= MAX(d, 0); |
|
844 |
- |
|
845 |
- d= (5*d + 32) >> 6; |
|
846 |
- d*= SIGN(-middleEnergy); |
|
847 |
- |
|
848 |
- if(q>0) |
|
849 |
- { |
|
850 |
- d= d<0 ? 0 : d; |
|
851 |
- d= d>q ? q : d; |
|
852 |
- } |
|
853 |
- else |
|
854 |
- { |
|
855 |
- d= d>0 ? 0 : d; |
|
856 |
- d= d<q ? q : d; |
|
857 |
- } |
|
858 |
- |
|
859 |
- src[l4]-= d; |
|
860 |
- src[l5]+= d; |
|
861 |
- } |
|
862 |
- src++; |
|
863 |
- } |
|
864 |
-src-=8; |
|
865 |
- for(x=0; x<8; x++) |
|
866 |
- { |
|
867 |
- int y; |
|
868 |
- for(y=4; y<6; y++) |
|
869 |
- { |
|
870 |
- int d= src[x+y*stride] - tmp[x+(y-4)*8]; |
|
871 |
- int ad= ABS(d); |
|
872 |
- static int max=0; |
|
873 |
- static int sum=0; |
|
874 |
- static int num=0; |
|
875 |
- static int bias=0; |
|
876 |
- |
|
877 |
- if(max<ad) max=ad; |
|
878 |
- sum+= ad>3 ? 1 : 0; |
|
879 |
- if(ad>3) |
|
880 |
- { |
|
881 |
- src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; |
|
882 |
- } |
|
883 |
- if(y==4) bias+=d; |
|
884 |
- num++; |
|
885 |
- if(num%1000000 == 0) |
|
886 |
- { |
|
887 |
- printf(" %d %d %d %d\n", num, sum, max, bias); |
|
888 |
- } |
|
889 |
- } |
|
890 |
- } |
|
891 |
-} |
|
892 |
-*/ |
|
893 |
-#elif defined (HAVE_MMX) |
|
894 |
- src+= stride*4; |
|
895 |
- asm volatile( |
|
896 |
- "pxor %%mm7, %%mm7 \n\t" |
|
897 |
- "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
|
898 |
- "andl $0xFFFFFFF8, %%ecx \n\t" // align |
|
899 |
-// 0 1 2 3 4 5 6 7 |
|
900 |
-// %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
|
901 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 |
|
902 |
- |
|
903 |
- "movq (%0), %%mm0 \n\t" |
|
904 |
- "movq %%mm0, %%mm1 \n\t" |
|
905 |
- "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
906 |
- "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
907 |
- |
|
908 |
- "movq (%0, %1), %%mm2 \n\t" |
|
909 |
- "leal (%0, %1, 2), %%eax \n\t" |
|
910 |
- "movq %%mm2, %%mm3 \n\t" |
|
911 |
- "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
912 |
- "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
913 |
- |
|
914 |
- "movq (%%eax), %%mm4 \n\t" |
|
915 |
- "movq %%mm4, %%mm5 \n\t" |
|
916 |
- "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
917 |
- "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
918 |
- |
|
919 |
- "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
920 |
- "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
921 |
- "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
922 |
- "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
923 |
- "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
924 |
- "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
925 |
- |
|
926 |
- "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
927 |
- "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
928 |
- "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
929 |
- "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
930 |
- |
|
931 |
- "movq (%%eax, %1), %%mm2 \n\t" |
|
932 |
- "movq %%mm2, %%mm3 \n\t" |
|
933 |
- "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
934 |
- "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
935 |
- |
|
936 |
- "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
937 |
- "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
938 |
- "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
939 |
- "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
940 |
- "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
941 |
- "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
942 |
- |
|
943 |
- "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
944 |
- "movq %%mm0, %%mm1 \n\t" |
|
945 |
- "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
946 |
- "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
947 |
- |
|
948 |
- "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
949 |
- "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
|
950 |
- "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
|
951 |
- "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 |
|
952 |
- "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
953 |
- "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
954 |
- "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
955 |
- "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
956 |
- |
|
957 |
- "leal (%%eax, %1), %0 \n\t" |
|
958 |
- "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
959 |
- "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
960 |
- "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
961 |
- "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
962 |
-//50 opcodes so far |
|
963 |
- "movq (%0, %1, 2), %%mm2 \n\t" |
|
964 |
- "movq %%mm2, %%mm3 \n\t" |
|
965 |
- "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
966 |
- "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
967 |
- "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
968 |
- "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
969 |
- "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
970 |
- "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
971 |
- |
|
972 |
- "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
973 |
- "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
974 |
- "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
|
975 |
- "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
976 |
- "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
977 |
- "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
978 |
- |
|
979 |
- "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
980 |
- "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
981 |
- "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
982 |
- "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
983 |
- |
|
984 |
- "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
985 |
- "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
986 |
- "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
987 |
- "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
988 |
- |
|
989 |
- "movq (%0, %1, 4), %%mm2 \n\t" |
|
990 |
- "movq %%mm2, %%mm3 \n\t" |
|
991 |
- "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
992 |
- "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
993 |
- |
|
994 |
- "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
995 |
- "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
996 |
- "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
997 |
- "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
998 |
- |
|
999 |
- "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
1000 |
- "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
1001 |
- |
|
1002 |
-#ifdef HAVE_MMX2 |
|
1003 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1004 |
- "psubw %%mm0, %%mm6 \n\t" |
|
1005 |
- "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
1006 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1007 |
- "psubw %%mm1, %%mm6 \n\t" |
|
1008 |
- "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
1009 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1010 |
- "psubw %%mm2, %%mm6 \n\t" |
|
1011 |
- "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
1012 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1013 |
- "psubw %%mm3, %%mm6 \n\t" |
|
1014 |
- "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
|
1015 |
-#else |
|
1016 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1017 |
- "pcmpgtw %%mm0, %%mm6 \n\t" |
|
1018 |
- "pxor %%mm6, %%mm0 \n\t" |
|
1019 |
- "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
1020 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1021 |
- "pcmpgtw %%mm1, %%mm6 \n\t" |
|
1022 |
- "pxor %%mm6, %%mm1 \n\t" |
|
1023 |
- "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
1024 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1025 |
- "pcmpgtw %%mm2, %%mm6 \n\t" |
|
1026 |
- "pxor %%mm6, %%mm2 \n\t" |
|
1027 |
- "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
1028 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1029 |
- "pcmpgtw %%mm3, %%mm6 \n\t" |
|
1030 |
- "pxor %%mm6, %%mm3 \n\t" |
|
1031 |
- "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
|
1032 |
-#endif |
|
1033 |
- |
|
1034 |
-#ifdef HAVE_MMX2 |
|
1035 |
- "pminsw %%mm2, %%mm0 \n\t" |
|
1036 |
- "pminsw %%mm3, %%mm1 \n\t" |
|
1037 |
-#else |
|
1038 |
- "movq %%mm0, %%mm6 \n\t" |
|
1039 |
- "psubusw %%mm2, %%mm6 \n\t" |
|
1040 |
- "psubw %%mm6, %%mm0 \n\t" |
|
1041 |
- "movq %%mm1, %%mm6 \n\t" |
|
1042 |
- "psubusw %%mm3, %%mm6 \n\t" |
|
1043 |
- "psubw %%mm6, %%mm1 \n\t" |
|
1044 |
-#endif |
|
1045 |
- |
|
1046 |
- "movq %%mm7, %%mm6 \n\t" // 0 |
|
1047 |
- "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
1048 |
- "pxor %%mm6, %%mm4 \n\t" |
|
1049 |
- "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
1050 |
- "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
1051 |
- "pxor %%mm7, %%mm5 \n\t" |
|
1052 |
- "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
1053 |
-// 100 opcodes |
|
1054 |
- "movd %2, %%mm2 \n\t" // QP |
|
1055 |
- "psllw $3, %%mm2 \n\t" // 8QP |
|
1056 |
- "movq %%mm2, %%mm3 \n\t" // 8QP |
|
1057 |
- "pcmpgtw %%mm4, %%mm2 \n\t" |
|
1058 |
- "pcmpgtw %%mm5, %%mm3 \n\t" |
|
1059 |
- "pand %%mm2, %%mm4 \n\t" |
|
1060 |
- "pand %%mm3, %%mm5 \n\t" |
|
1061 |
- |
|
1062 |
- |
|
1063 |
- "psubusw %%mm0, %%mm4 \n\t" // hd |
|
1064 |
- "psubusw %%mm1, %%mm5 \n\t" // ld |
|
1065 |
- |
|
1066 |
- |
|
1067 |
- "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
1068 |
- "pmullw %%mm2, %%mm4 \n\t" |
|
1069 |
- "pmullw %%mm2, %%mm5 \n\t" |
|
1070 |
- "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
1071 |
- "paddw %%mm2, %%mm4 \n\t" |
|
1072 |
- "paddw %%mm2, %%mm5 \n\t" |
|
1073 |
- "psrlw $6, %%mm4 \n\t" |
|
1074 |
- "psrlw $6, %%mm5 \n\t" |
|
1075 |
- |
|
1076 |
- "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
|
1077 |
- "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 |
|
1078 |
- |
|
1079 |
- "pxor %%mm2, %%mm2 \n\t" |
|
1080 |
- "pxor %%mm3, %%mm3 \n\t" |
|
1081 |
- |
|
1082 |
- "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
1083 |
- "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
1084 |
- "pxor %%mm2, %%mm0 \n\t" |
|
1085 |
- "pxor %%mm3, %%mm1 \n\t" |
|
1086 |
- "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
1087 |
- "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
1088 |
- "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
1089 |
- "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
1090 |
- |
|
1091 |
- "pxor %%mm6, %%mm2 \n\t" |
|
1092 |
- "pxor %%mm7, %%mm3 \n\t" |
|
1093 |
- "pand %%mm2, %%mm4 \n\t" |
|
1094 |
- "pand %%mm3, %%mm5 \n\t" |
|
1095 |
- |
|
1096 |
-#ifdef HAVE_MMX2 |
|
1097 |
- "pminsw %%mm0, %%mm4 \n\t" |
|
1098 |
- "pminsw %%mm1, %%mm5 \n\t" |
|
1099 |
-#else |
|
1100 |
- "movq %%mm4, %%mm2 \n\t" |
|
1101 |
- "psubusw %%mm0, %%mm2 \n\t" |
|
1102 |
- "psubw %%mm2, %%mm4 \n\t" |
|
1103 |
- "movq %%mm5, %%mm2 \n\t" |
|
1104 |
- "psubusw %%mm1, %%mm2 \n\t" |
|
1105 |
- "psubw %%mm2, %%mm5 \n\t" |
|
1106 |
-#endif |
|
1107 |
- "pxor %%mm6, %%mm4 \n\t" |
|
1108 |
- "pxor %%mm7, %%mm5 \n\t" |
|
1109 |
- "psubw %%mm6, %%mm4 \n\t" |
|
1110 |
- "psubw %%mm7, %%mm5 \n\t" |
|
1111 |
- "packsswb %%mm5, %%mm4 \n\t" |
|
1112 |
- "movq (%0), %%mm0 \n\t" |
|
1113 |
- "paddb %%mm4, %%mm0 \n\t" |
|
1114 |
- "movq %%mm0, (%0) \n\t" |
|
1115 |
- "movq (%0, %1), %%mm0 \n\t" |
|
1116 |
- "psubb %%mm4, %%mm0 \n\t" |
|
1117 |
- "movq %%mm0, (%0, %1) \n\t" |
|
1118 |
- |
|
1119 |
- : "+r" (src) |
|
1120 |
- : "r" (stride), "m" (c->pQPb) |
|
1121 |
- : "%eax", "%ecx" |
|
1122 |
- ); |
|
1123 |
-#else |
|
1124 |
- const int l1= stride; |
|
1125 |
- const int l2= stride + l1; |
|
1126 |
- const int l3= stride + l2; |
|
1127 |
- const int l4= stride + l3; |
|
1128 |
- const int l5= stride + l4; |
|
1129 |
- const int l6= stride + l5; |
|
1130 |
- const int l7= stride + l6; |
|
1131 |
- const int l8= stride + l7; |
|
1132 |
-// const int l9= stride + l8; |
|
1133 |
- int x; |
|
1134 |
- src+= stride*3; |
|
1135 |
- for(x=0; x<BLOCK_SIZE; x++) |
|
1136 |
- { |
|
1137 |
- const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
|
1138 |
- if(ABS(middleEnergy) < 8*c->QP) |
|
1139 |
- { |
|
1140 |
- const int q=(src[l4] - src[l5])/2; |
|
1141 |
- const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
1142 |
- const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
1143 |
- |
|
1144 |
- int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
1145 |
- d= MAX(d, 0); |
|
1146 |
- |
|
1147 |
- d= (5*d + 32) >> 6; |
|
1148 |
- d*= SIGN(-middleEnergy); |
|
1149 |
- |
|
1150 |
- if(q>0) |
|
1151 |
- { |
|
1152 |
- d= d<0 ? 0 : d; |
|
1153 |
- d= d>q ? q : d; |
|
1154 |
- } |
|
1155 |
- else |
|
1156 |
- { |
|
1157 |
- d= d>0 ? 0 : d; |
|
1158 |
- d= d<q ? q : d; |
|
1159 |
- } |
|
1160 |
- |
|
1161 |
- src[l4]-= d; |
|
1162 |
- src[l5]+= d; |
|
1163 |
- } |
|
1164 |
- src++; |
|
1165 |
- } |
|
1166 |
-#endif |
|
1167 |
-} |
|
1168 |
- |
|
1169 |
-static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
|
1170 |
-{ |
|
1171 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1172 |
- asm volatile( |
|
1173 |
- "pxor %%mm6, %%mm6 \n\t" |
|
1174 |
- "pcmpeqb %%mm7, %%mm7 \n\t" |
|
1175 |
- "movq %2, %%mm0 \n\t" |
|
1176 |
- "punpcklbw %%mm6, %%mm0 \n\t" |
|
1177 |
- "psrlw $1, %%mm0 \n\t" |
|
1178 |
- "psubw %%mm7, %%mm0 \n\t" |
|
1179 |
- "packuswb %%mm0, %%mm0 \n\t" |
|
1180 |
- "movq %%mm0, %3 \n\t" |
|
1181 |
- |
|
1182 |
- "leal (%0, %1), %%eax \n\t" |
|
1183 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
1184 |
- |
|
1185 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
1186 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1187 |
- |
|
1188 |
-#undef FIND_MIN_MAX |
|
1189 |
-#ifdef HAVE_MMX2 |
|
1190 |
-#define FIND_MIN_MAX(addr)\ |
|
1191 |
- "movq " #addr ", %%mm0 \n\t"\ |
|
1192 |
- "pminub %%mm0, %%mm7 \n\t"\ |
|
1193 |
- "pmaxub %%mm0, %%mm6 \n\t" |
|
1194 |
-#else |
|
1195 |
-#define FIND_MIN_MAX(addr)\ |
|
1196 |
- "movq " #addr ", %%mm0 \n\t"\ |
|
1197 |
- "movq %%mm7, %%mm1 \n\t"\ |
|
1198 |
- "psubusb %%mm0, %%mm6 \n\t"\ |
|
1199 |
- "paddb %%mm0, %%mm6 \n\t"\ |
|
1200 |
- "psubusb %%mm0, %%mm1 \n\t"\ |
|
1201 |
- "psubb %%mm1, %%mm7 \n\t" |
|
1202 |
-#endif |
|
1203 |
- |
|
1204 |
-FIND_MIN_MAX((%%eax)) |
|
1205 |
-FIND_MIN_MAX((%%eax, %1)) |
|
1206 |
-FIND_MIN_MAX((%%eax, %1, 2)) |
|
1207 |
-FIND_MIN_MAX((%0, %1, 4)) |
|
1208 |
-FIND_MIN_MAX((%%edx)) |
|
1209 |
-FIND_MIN_MAX((%%edx, %1)) |
|
1210 |
-FIND_MIN_MAX((%%edx, %1, 2)) |
|
1211 |
-FIND_MIN_MAX((%0, %1, 8)) |
|
1212 |
- |
|
1213 |
- "movq %%mm7, %%mm4 \n\t" |
|
1214 |
- "psrlq $8, %%mm7 \n\t" |
|
1215 |
-#ifdef HAVE_MMX2 |
|
1216 |
- "pminub %%mm4, %%mm7 \n\t" // min of pixels |
|
1217 |
- "pshufw $0xF9, %%mm7, %%mm4 \n\t" |
|
1218 |
- "pminub %%mm4, %%mm7 \n\t" // min of pixels |
|
1219 |
- "pshufw $0xFE, %%mm7, %%mm4 \n\t" |
|
1220 |
- "pminub %%mm4, %%mm7 \n\t" |
|
1221 |
-#else |
|
1222 |
- "movq %%mm7, %%mm1 \n\t" |
|
1223 |
- "psubusb %%mm4, %%mm1 \n\t" |
|
1224 |
- "psubb %%mm1, %%mm7 \n\t" |
|
1225 |
- "movq %%mm7, %%mm4 \n\t" |
|
1226 |
- "psrlq $16, %%mm7 \n\t" |
|
1227 |
- "movq %%mm7, %%mm1 \n\t" |
|
1228 |
- "psubusb %%mm4, %%mm1 \n\t" |
|
1229 |
- "psubb %%mm1, %%mm7 \n\t" |
|
1230 |
- "movq %%mm7, %%mm4 \n\t" |
|
1231 |
- "psrlq $32, %%mm7 \n\t" |
|
1232 |
- "movq %%mm7, %%mm1 \n\t" |
|
1233 |
- "psubusb %%mm4, %%mm1 \n\t" |
|
1234 |
- "psubb %%mm1, %%mm7 \n\t" |
|
1235 |
-#endif |
|
1236 |
- |
|
1237 |
- |
|
1238 |
- "movq %%mm6, %%mm4 \n\t" |
|
1239 |
- "psrlq $8, %%mm6 \n\t" |
|
1240 |
-#ifdef HAVE_MMX2 |
|
1241 |
- "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
|
1242 |
- "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
|
1243 |
- "pmaxub %%mm4, %%mm6 \n\t" |
|
1244 |
- "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
|
1245 |
- "pmaxub %%mm4, %%mm6 \n\t" |
|
1246 |
-#else |
|
1247 |
- "psubusb %%mm4, %%mm6 \n\t" |
|
1248 |
- "paddb %%mm4, %%mm6 \n\t" |
|
1249 |
- "movq %%mm6, %%mm4 \n\t" |
|
1250 |
- "psrlq $16, %%mm6 \n\t" |
|
1251 |
- "psubusb %%mm4, %%mm6 \n\t" |
|
1252 |
- "paddb %%mm4, %%mm6 \n\t" |
|
1253 |
- "movq %%mm6, %%mm4 \n\t" |
|
1254 |
- "psrlq $32, %%mm6 \n\t" |
|
1255 |
- "psubusb %%mm4, %%mm6 \n\t" |
|
1256 |
- "paddb %%mm4, %%mm6 \n\t" |
|
1257 |
-#endif |
|
1258 |
- "movq %%mm6, %%mm0 \n\t" // max |
|
1259 |
- "psubb %%mm7, %%mm6 \n\t" // max - min |
|
1260 |
- "movd %%mm6, %%ecx \n\t" |
|
1261 |
- "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
|
1262 |
- " jb 1f \n\t" |
|
1263 |
- "leal -24(%%esp), %%ecx \n\t" |
|
1264 |
- "andl $0xFFFFFFF8, %%ecx \n\t" |
|
1265 |
- PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
|
1266 |
- "punpcklbw %%mm7, %%mm7 \n\t" |
|
1267 |
- "punpcklbw %%mm7, %%mm7 \n\t" |
|
1268 |
- "punpcklbw %%mm7, %%mm7 \n\t" |
|
1269 |
- "movq %%mm7, (%%ecx) \n\t" |
|
1270 |
- |
|
1271 |
- "movq (%0), %%mm0 \n\t" // L10 |
|
1272 |
- "movq %%mm0, %%mm1 \n\t" // L10 |
|
1273 |
- "movq %%mm0, %%mm2 \n\t" // L10 |
|
1274 |
- "psllq $8, %%mm1 \n\t" |
|
1275 |
- "psrlq $8, %%mm2 \n\t" |
|
1276 |
- "movd -4(%0), %%mm3 \n\t" |
|
1277 |
- "movd 8(%0), %%mm4 \n\t" |
|
1278 |
- "psrlq $24, %%mm3 \n\t" |
|
1279 |
- "psllq $56, %%mm4 \n\t" |
|
1280 |
- "por %%mm3, %%mm1 \n\t" // L00 |
|
1281 |
- "por %%mm4, %%mm2 \n\t" // L20 |
|
1282 |
- "movq %%mm1, %%mm3 \n\t" // L00 |
|
1283 |
- PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 |
|
1284 |
- PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 |
|
1285 |
- "psubusb %%mm7, %%mm0 \n\t" |
|
1286 |
- "psubusb %%mm7, %%mm2 \n\t" |
|
1287 |
- "psubusb %%mm7, %%mm3 \n\t" |
|
1288 |
- "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
|
1289 |
- "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 |
|
1290 |
- "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 |
|
1291 |
- "paddb %%mm2, %%mm0 \n\t" |
|
1292 |
- "paddb %%mm3, %%mm0 \n\t" |
|
1293 |
- |
|
1294 |
- "movq (%%eax), %%mm2 \n\t" // L11 |
|
1295 |
- "movq %%mm2, %%mm3 \n\t" // L11 |
|
1296 |
- "movq %%mm2, %%mm4 \n\t" // L11 |
|
1297 |
- "psllq $8, %%mm3 \n\t" |
|
1298 |
- "psrlq $8, %%mm4 \n\t" |
|
1299 |
- "movd -4(%%eax), %%mm5 \n\t" |
|
1300 |
- "movd 8(%%eax), %%mm6 \n\t" |
|
1301 |
- "psrlq $24, %%mm5 \n\t" |
|
1302 |
- "psllq $56, %%mm6 \n\t" |
|
1303 |
- "por %%mm5, %%mm3 \n\t" // L01 |
|
1304 |
- "por %%mm6, %%mm4 \n\t" // L21 |
|
1305 |
- "movq %%mm3, %%mm5 \n\t" // L01 |
|
1306 |
- PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 |
|
1307 |
- PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 |
|
1308 |
- "psubusb %%mm7, %%mm2 \n\t" |
|
1309 |
- "psubusb %%mm7, %%mm4 \n\t" |
|
1310 |
- "psubusb %%mm7, %%mm5 \n\t" |
|
1311 |
- "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
|
1312 |
- "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 |
|
1313 |
- "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 |
|
1314 |
- "paddb %%mm4, %%mm2 \n\t" |
|
1315 |
- "paddb %%mm5, %%mm2 \n\t" |
|
1316 |
-// 0, 2, 3, 1 |
|
1317 |
-#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
|
1318 |
- "movq " #src ", " #sx " \n\t" /* src[0] */\ |
|
1319 |
- "movq " #sx ", " #lx " \n\t" /* src[0] */\ |
|
1320 |
- "movq " #sx ", " #t0 " \n\t" /* src[0] */\ |
|
1321 |
- "psllq $8, " #lx " \n\t"\ |
|
1322 |
- "psrlq $8, " #t0 " \n\t"\ |
|
1323 |
- "movd -4" #src ", " #t1 " \n\t"\ |
|
1324 |
- "psrlq $24, " #t1 " \n\t"\ |
|
1325 |
- "por " #t1 ", " #lx " \n\t" /* src[-1] */\ |
|
1326 |
- "movd 8" #src ", " #t1 " \n\t"\ |
|
1327 |
- "psllq $56, " #t1 " \n\t"\ |
|
1328 |
- "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ |
|
1329 |
- "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ |
|
1330 |
- PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ |
|
1331 |
- PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ |
|
1332 |
- PAVGB(lx, pplx) \ |
|
1333 |
- "movq " #lx ", 8(%%ecx) \n\t"\ |
|
1334 |
- "movq (%%ecx), " #lx " \n\t"\ |
|
1335 |
- "psubusb " #lx ", " #t1 " \n\t"\ |
|
1336 |
- "psubusb " #lx ", " #t0 " \n\t"\ |
|
1337 |
- "psubusb " #lx ", " #sx " \n\t"\ |
|
1338 |
- "movq "MANGLE(b00)", " #lx " \n\t"\ |
|
1339 |
- "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
|
1340 |
- "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ |
|
1341 |
- "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ |
|
1342 |
- "paddb " #t1 ", " #t0 " \n\t"\ |
|
1343 |
- "paddb " #t0 ", " #sx " \n\t"\ |
|
1344 |
-\ |
|
1345 |
- PAVGB(plx, pplx) /* filtered */\ |
|
1346 |
- "movq " #dst ", " #t0 " \n\t" /* dst */\ |
|
1347 |
- "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
|
1348 |
- "psubusb %3, " #t0 " \n\t"\ |
|
1349 |
- "paddusb %3, " #t1 " \n\t"\ |
|
1350 |
- PMAXUB(t0, pplx)\ |
|
1351 |
- PMINUB(t1, pplx, t0)\ |
|
1352 |
- "paddb " #sx ", " #ppsx " \n\t"\ |
|
1353 |
- "paddb " #psx ", " #ppsx " \n\t"\ |
|
1354 |
- "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
|
1355 |
- "pand "MANGLE(b08)", " #ppsx " \n\t"\ |
|
1356 |
- "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
|
1357 |
- "pand " #ppsx ", " #pplx " \n\t"\ |
|
1358 |
- "pandn " #dst ", " #ppsx " \n\t"\ |
|
1359 |
- "por " #pplx ", " #ppsx " \n\t"\ |
|
1360 |
- "movq " #ppsx ", " #dst " \n\t"\ |
|
1361 |
- "movq 8(%%ecx), " #lx " \n\t" |
|
1362 |
- |
|
1363 |
-/* |
|
1364 |
-0000000 |
|
1365 |
-1111111 |
|
1366 |
- |
|
1367 |
-1111110 |
|
1368 |
-1111101 |
|
1369 |
-1111100 |
|
1370 |
-1111011 |
|
1371 |
-1111010 |
|
1372 |
-1111001 |
|
1373 |
- |
|
1374 |
-1111000 |
|
1375 |
-1110111 |
|
1376 |
- |
|
1377 |
-*/ |
|
1378 |
-//DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) |
|
1379 |
-DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
|
1380 |
-DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
|
1381 |
-DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
|
1382 |
-DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
|
1383 |
-DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
|
1384 |
-DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
|
1385 |
-DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
|
1386 |
-DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
|
1387 |
- |
|
1388 |
- "1: \n\t" |
|
1389 |
- : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
|
1390 |
- : "%eax", "%edx", "%ecx" |
|
1391 |
- ); |
|
1392 |
-#else |
|
1393 |
- int y; |
|
1394 |
- int min=255; |
|
1395 |
- int max=0; |
|
1396 |
- int avg; |
|
1397 |
- uint8_t *p; |
|
1398 |
- int s[10]; |
|
1399 |
- const int QP2= c->QP/2 + 1; |
|
1400 |
- |
|
1401 |
- for(y=1; y<9; y++) |
|
1402 |
- { |
|
1403 |
- int x; |
|
1404 |
- p= src + stride*y; |
|
1405 |
- for(x=1; x<9; x++) |
|
1406 |
- { |
|
1407 |
- p++; |
|
1408 |
- if(*p > max) max= *p; |
|
1409 |
- if(*p < min) min= *p; |
|
1410 |
- } |
|
1411 |
- } |
|
1412 |
- avg= (min + max + 1)>>1; |
|
1413 |
- |
|
1414 |
- if(max - min <deringThreshold) return; |
|
1415 |
- |
|
1416 |
- for(y=0; y<10; y++) |
|
1417 |
- { |
|
1418 |
- int t = 0; |
|
1419 |
- |
|
1420 |
- if(src[stride*y + 0] > avg) t+= 1; |
|
1421 |
- if(src[stride*y + 1] > avg) t+= 2; |
|
1422 |
- if(src[stride*y + 2] > avg) t+= 4; |
|
1423 |
- if(src[stride*y + 3] > avg) t+= 8; |
|
1424 |
- if(src[stride*y + 4] > avg) t+= 16; |
|
1425 |
- if(src[stride*y + 5] > avg) t+= 32; |
|
1426 |
- if(src[stride*y + 6] > avg) t+= 64; |
|
1427 |
- if(src[stride*y + 7] > avg) t+= 128; |
|
1428 |
- if(src[stride*y + 8] > avg) t+= 256; |
|
1429 |
- if(src[stride*y + 9] > avg) t+= 512; |
|
1430 |
- |
|
1431 |
- t |= (~t)<<16; |
|
1432 |
- t &= (t<<1) & (t>>1); |
|
1433 |
- s[y] = t; |
|
1434 |
- } |
|
1435 |
- |
|
1436 |
- for(y=1; y<9; y++) |
|
1437 |
- { |
|
1438 |
- int t = s[y-1] & s[y] & s[y+1]; |
|
1439 |
- t|= t>>16; |
|
1440 |
- s[y-1]= t; |
|
1441 |
- } |
|
1442 |
- |
|
1443 |
- for(y=1; y<9; y++) |
|
1444 |
- { |
|
1445 |
- int x; |
|
1446 |
- int t = s[y-1]; |
|
1447 |
- |
|
1448 |
- p= src + stride*y; |
|
1449 |
- for(x=1; x<9; x++) |
|
1450 |
- { |
|
1451 |
- p++; |
|
1452 |
- if(t & (1<<x)) |
|
1453 |
- { |
|
1454 |
- int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) |
|
1455 |
- +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) |
|
1456 |
- +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); |
|
1457 |
- f= (f + 8)>>4; |
|
1458 |
- |
|
1459 |
-#ifdef DEBUG_DERING_THRESHOLD |
|
1460 |
- asm volatile("emms\n\t":); |
|
1461 |
- { |
|
1462 |
- static long long numPixels=0; |
|
1463 |
- if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; |
|
1464 |
-// if((max-min)<20 || (max-min)*QP<200) |
|
1465 |
-// if((max-min)*QP < 500) |
|
1466 |
-// if(max-min<QP/2) |
|
1467 |
- if(max-min < 20) |
|
1468 |
- { |
|
1469 |
- static int numSkiped=0; |
|
1470 |
- static int errorSum=0; |
|
1471 |
- static int worstQP=0; |
|
1472 |
- static int worstRange=0; |
|
1473 |
- static int worstDiff=0; |
|
1474 |
- int diff= (f - *p); |
|
1475 |
- int absDiff= ABS(diff); |
|
1476 |
- int error= diff*diff; |
|
1477 |
- |
|
1478 |
- if(x==1 || x==8 || y==1 || y==8) continue; |
|
1479 |
- |
|
1480 |
- numSkiped++; |
|
1481 |
- if(absDiff > worstDiff) |
|
1482 |
- { |
|
1483 |
- worstDiff= absDiff; |
|
1484 |
- worstQP= QP; |
|
1485 |
- worstRange= max-min; |
|
1486 |
- } |
|
1487 |
- errorSum+= error; |
|
1488 |
- |
|
1489 |
- if(1024LL*1024LL*1024LL % numSkiped == 0) |
|
1490 |
- { |
|
1491 |
- printf( "sum:%1.3f, skip:%d, wQP:%d, " |
|
1492 |
- "wRange:%d, wDiff:%d, relSkip:%1.3f\n", |
|
1493 |
- (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, |
|
1494 |
- worstDiff, (float)numSkiped/numPixels); |
|
1495 |
- } |
|
1496 |
- } |
|
1497 |
- } |
|
1498 |
-#endif |
|
1499 |
- if (*p + QP2 < f) *p= *p + QP2; |
|
1500 |
- else if(*p - QP2 > f) *p= *p - QP2; |
|
1501 |
- else *p=f; |
|
1502 |
- } |
|
1503 |
- } |
|
1504 |
- } |
|
1505 |
-#ifdef DEBUG_DERING_THRESHOLD |
|
1506 |
- if(max-min < 20) |
|
1507 |
- { |
|
1508 |
- for(y=1; y<9; y++) |
|
1509 |
- { |
|
1510 |
- int x; |
|
1511 |
- int t = 0; |
|
1512 |
- p= src + stride*y; |
|
1513 |
- for(x=1; x<9; x++) |
|
1514 |
- { |
|
1515 |
- p++; |
|
1516 |
- *p = MIN(*p + 20, 255); |
|
1517 |
- } |
|
1518 |
- } |
|
1519 |
-// src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; |
|
1520 |
- } |
|
1521 |
-#endif |
|
1522 |
-#endif |
|
1523 |
-} |
|
1524 |
- |
|
1525 |
-/** |
|
1526 |
- * Deinterlaces the given block |
|
1527 |
- * will be called for every 8x8 block and can read & write from line 4-15 |
|
1528 |
- * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
|
1529 |
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1530 |
- */ |
|
1531 |
-static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
|
1532 |
-{ |
|
1533 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1534 |
- src+= 4*stride; |
|
1535 |
- asm volatile( |
|
1536 |
- "leal (%0, %1), %%eax \n\t" |
|
1537 |
- "leal (%%eax, %1, 4), %%ecx \n\t" |
|
1538 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
1539 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
1540 |
- |
|
1541 |
- "movq (%0), %%mm0 \n\t" |
|
1542 |
- "movq (%%eax, %1), %%mm1 \n\t" |
|
1543 |
- PAVGB(%%mm1, %%mm0) |
|
1544 |
- "movq %%mm0, (%%eax) \n\t" |
|
1545 |
- "movq (%0, %1, 4), %%mm0 \n\t" |
|
1546 |
- PAVGB(%%mm0, %%mm1) |
|
1547 |
- "movq %%mm1, (%%eax, %1, 2) \n\t" |
|
1548 |
- "movq (%%ecx, %1), %%mm1 \n\t" |
|
1549 |
- PAVGB(%%mm1, %%mm0) |
|
1550 |
- "movq %%mm0, (%%ecx) \n\t" |
|
1551 |
- "movq (%0, %1, 8), %%mm0 \n\t" |
|
1552 |
- PAVGB(%%mm0, %%mm1) |
|
1553 |
- "movq %%mm1, (%%ecx, %1, 2) \n\t" |
|
1554 |
- |
|
1555 |
- : : "r" (src), "r" (stride) |
|
1556 |
- : "%eax", "%ecx" |
|
1557 |
- ); |
|
1558 |
-#else |
|
1559 |
- int x; |
|
1560 |
- src+= 4*stride; |
|
1561 |
- for(x=0; x<8; x++) |
|
1562 |
- { |
|
1563 |
- src[stride] = (src[0] + src[stride*2])>>1; |
|
1564 |
- src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
|
1565 |
- src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
|
1566 |
- src[stride*7] = (src[stride*6] + src[stride*8])>>1; |
|
1567 |
- src++; |
|
1568 |
- } |
|
1569 |
-#endif |
|
1570 |
-} |
|
1571 |
- |
|
1572 |
-/** |
|
1573 |
- * Deinterlaces the given block |
|
1574 |
- * will be called for every 8x8 block and can read & write from line 4-15 |
|
1575 |
- * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
|
1576 |
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1577 |
- * this filter will read lines 3-15 and write 7-13 |
|
1578 |
- * no cliping in C version |
|
1579 |
- */ |
|
1580 |
-static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
|
1581 |
-{ |
|
1582 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1583 |
- src+= stride*3; |
|
1584 |
- asm volatile( |
|
1585 |
- "leal (%0, %1), %%eax \n\t" |
|
1586 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
1587 |
- "leal (%%edx, %1, 4), %%ecx \n\t" |
|
1588 |
- "addl %1, %%ecx \n\t" |
|
1589 |
- "pxor %%mm7, %%mm7 \n\t" |
|
1590 |
-// 0 1 2 3 4 5 6 7 8 9 10 |
|
1591 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
1592 |
- |
|
1593 |
-#define DEINT_CUBIC(a,b,c,d,e)\ |
|
1594 |
- "movq " #a ", %%mm0 \n\t"\ |
|
1595 |
- "movq " #b ", %%mm1 \n\t"\ |
|
1596 |
- "movq " #d ", %%mm2 \n\t"\ |
|
1597 |
- "movq " #e ", %%mm3 \n\t"\ |
|
1598 |
- PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ |
|
1599 |
- PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ |
|
1600 |
- "movq %%mm0, %%mm2 \n\t"\ |
|
1601 |
- "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
1602 |
- "punpckhbw %%mm7, %%mm2 \n\t"\ |
|
1603 |
- "movq %%mm1, %%mm3 \n\t"\ |
|
1604 |
- "punpcklbw %%mm7, %%mm1 \n\t"\ |
|
1605 |
- "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
1606 |
- "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ |
|
1607 |
- "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ |
|
1608 |
- "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ |
|
1609 |
- "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ |
|
1610 |
- "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ |
|
1611 |
- "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ |
|
1612 |
- "packuswb %%mm3, %%mm1 \n\t"\ |
|
1613 |
- "movq %%mm1, " #c " \n\t" |
|
1614 |
- |
|
1615 |
-DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
|
1616 |
-DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) |
|
1617 |
-DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) |
|
1618 |
-DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) |
|
1619 |
- |
|
1620 |
- : : "r" (src), "r" (stride) |
|
1621 |
- : "%eax", "%edx", "ecx" |
|
1622 |
- ); |
|
1623 |
-#else |
|
1624 |
- int x; |
|
1625 |
- src+= stride*3; |
|
1626 |
- for(x=0; x<8; x++) |
|
1627 |
- { |
|
1628 |
- src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
|
1629 |
- src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; |
|
1630 |
- src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; |
|
1631 |
- src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; |
|
1632 |
- src++; |
|
1633 |
- } |
|
1634 |
-#endif |
|
1635 |
-} |
|
1636 |
- |
|
1637 |
-/** |
|
1638 |
- * Deinterlaces the given block |
|
1639 |
- * will be called for every 8x8 block and can read & write from line 4-15 |
|
1640 |
- * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
|
1641 |
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1642 |
- * this filter will read lines 4-13 and write 5-11 |
|
1643 |
- * no cliping in C version |
|
1644 |
- */ |
|
1645 |
-static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) |
|
1646 |
-{ |
|
1647 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1648 |
- src+= stride*4; |
|
1649 |
- asm volatile( |
|
1650 |
- "leal (%0, %1), %%eax \n\t" |
|
1651 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
1652 |
- "pxor %%mm7, %%mm7 \n\t" |
|
1653 |
- "movq (%2), %%mm0 \n\t" |
|
1654 |
-// 0 1 2 3 4 5 6 7 8 9 10 |
|
1655 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
1656 |
- |
|
1657 |
-#define DEINT_FF(a,b,c,d)\ |
|
1658 |
- "movq " #a ", %%mm1 \n\t"\ |
|
1659 |
- "movq " #b ", %%mm2 \n\t"\ |
|
1660 |
- "movq " #c ", %%mm3 \n\t"\ |
|
1661 |
- "movq " #d ", %%mm4 \n\t"\ |
|
1662 |
- PAVGB(%%mm3, %%mm1) \ |
|
1663 |
- PAVGB(%%mm4, %%mm0) \ |
|
1664 |
- "movq %%mm0, %%mm3 \n\t"\ |
|
1665 |
- "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
1666 |
- "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
1667 |
- "movq %%mm1, %%mm4 \n\t"\ |
|
1668 |
- "punpcklbw %%mm7, %%mm1 \n\t"\ |
|
1669 |
- "punpckhbw %%mm7, %%mm4 \n\t"\ |
|
1670 |
- "psllw $2, %%mm1 \n\t"\ |
|
1671 |
- "psllw $2, %%mm4 \n\t"\ |
|
1672 |
- "psubw %%mm0, %%mm1 \n\t"\ |
|
1673 |
- "psubw %%mm3, %%mm4 \n\t"\ |
|
1674 |
- "movq %%mm2, %%mm5 \n\t"\ |
|
1675 |
- "movq %%mm2, %%mm0 \n\t"\ |
|
1676 |
- "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
1677 |
- "punpckhbw %%mm7, %%mm5 \n\t"\ |
|
1678 |
- "paddw %%mm2, %%mm1 \n\t"\ |
|
1679 |
- "paddw %%mm5, %%mm4 \n\t"\ |
|
1680 |
- "psraw $2, %%mm1 \n\t"\ |
|
1681 |
- "psraw $2, %%mm4 \n\t"\ |
|
1682 |
- "packuswb %%mm4, %%mm1 \n\t"\ |
|
1683 |
- "movq %%mm1, " #b " \n\t"\ |
|
1684 |
- |
|
1685 |
-DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) |
|
1686 |
-DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) |
|
1687 |
-DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) |
|
1688 |
-DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) |
|
1689 |
- |
|
1690 |
- "movq %%mm0, (%2) \n\t" |
|
1691 |
- : : "r" (src), "r" (stride), "r"(tmp) |
|
1692 |
- : "%eax", "%edx" |
|
1693 |
- ); |
|
1694 |
-#else |
|
1695 |
- int x; |
|
1696 |
- src+= stride*4; |
|
1697 |
- for(x=0; x<8; x++) |
|
1698 |
- { |
|
1699 |
- int t1= tmp[x]; |
|
1700 |
- int t2= src[stride*1]; |
|
1701 |
- |
|
1702 |
- src[stride*1]= (-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3; |
|
1703 |
- t1= src[stride*4]; |
|
1704 |
- src[stride*3]= (-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3; |
|
1705 |
- t2= src[stride*6]; |
|
1706 |
- src[stride*5]= (-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3; |
|
1707 |
- t1= src[stride*8]; |
|
1708 |
- src[stride*7]= (-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3; |
|
1709 |
- tmp[x]= t1; |
|
1710 |
- |
|
1711 |
- src++; |
|
1712 |
- } |
|
1713 |
-#endif |
|
1714 |
-} |
|
1715 |
- |
|
1716 |
-/** |
|
1717 |
- * Deinterlaces the given block |
|
1718 |
- * will be called for every 8x8 block and can read & write from line 4-15 |
|
1719 |
- * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
|
1720 |
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1721 |
- * will shift the image up by 1 line (FIXME if this is a problem) |
|
1722 |
- * this filter will read lines 4-13 and write 4-11 |
|
1723 |
- */ |
|
1724 |
-static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride) |
|
1725 |
-{ |
|
1726 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
1727 |
- src+= 4*stride; |
|
1728 |
- asm volatile( |
|
1729 |
- "leal (%0, %1), %%eax \n\t" |
|
1730 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
1731 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
1732 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1733 |
- |
|
1734 |
- "movq (%0), %%mm0 \n\t" // L0 |
|
1735 |
- "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
1736 |
- PAVGB(%%mm1, %%mm0) // L0+L2 |
|
1737 |
- "movq (%%eax), %%mm2 \n\t" // L1 |
|
1738 |
- PAVGB(%%mm2, %%mm0) |
|
1739 |
- "movq %%mm0, (%0) \n\t" |
|
1740 |
- "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
1741 |
- PAVGB(%%mm0, %%mm2) // L1+L3 |
|
1742 |
- PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
1743 |
- "movq %%mm2, (%%eax) \n\t" |
|
1744 |
- "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
1745 |
- PAVGB(%%mm2, %%mm1) // L2+L4 |
|
1746 |
- PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
1747 |
- "movq %%mm1, (%%eax, %1) \n\t" |
|
1748 |
- "movq (%%edx), %%mm1 \n\t" // L5 |
|
1749 |
- PAVGB(%%mm1, %%mm0) // L3+L5 |
|
1750 |
- PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
1751 |
- "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
1752 |
- "movq (%%edx, %1), %%mm0 \n\t" // L6 |
|
1753 |
- PAVGB(%%mm0, %%mm2) // L4+L6 |
|
1754 |
- PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
1755 |
- "movq %%mm2, (%0, %1, 4) \n\t" |
|
1756 |
- "movq (%%edx, %1, 2), %%mm2 \n\t" // L7 |
|
1757 |
- PAVGB(%%mm2, %%mm1) // L5+L7 |
|
1758 |
- PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
|
1759 |
- "movq %%mm1, (%%edx) \n\t" |
|
1760 |
- "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
|
1761 |
- PAVGB(%%mm1, %%mm0) // L6+L8 |
|
1762 |
- PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
|
1763 |
- "movq %%mm0, (%%edx, %1) \n\t" |
|
1764 |
- "movq (%%edx, %1, 4), %%mm0 \n\t" // L9 |
|
1765 |
- PAVGB(%%mm0, %%mm2) // L7+L9 |
|
1766 |
- PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
|
1767 |
- "movq %%mm2, (%%edx, %1, 2) \n\t" |
|
1768 |
- |
|
1769 |
- |
|
1770 |
- : : "r" (src), "r" (stride) |
|
1771 |
- : "%eax", "%edx" |
|
1772 |
- ); |
|
1773 |
-#else |
|
1774 |
- int x; |
|
1775 |
- src+= 4*stride; |
|
1776 |
- for(x=0; x<8; x++) |
|
1777 |
- { |
|
1778 |
- src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
1779 |
- src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
1780 |
- src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
1781 |
- src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
1782 |
- src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
1783 |
- src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
1784 |
- src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
1785 |
- src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
1786 |
- src++; |
|
1787 |
- } |
|
1788 |
-#endif |
|
1789 |
-} |
|
1790 |
- |
|
1791 |
-/** |
|
1792 |
- * Deinterlaces the given block |
|
1793 |
- * will be called for every 8x8 block and can read & write from line 4-15, |
|
1794 |
- * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too |
|
1795 |
- * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
|
1796 |
- */ |
|
1797 |
-static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
|
1798 |
-{ |
|
1799 |
-#ifdef HAVE_MMX |
|
1800 |
- src+= 4*stride; |
|
1801 |
-#ifdef HAVE_MMX2 |
|
1802 |
- asm volatile( |
|
1803 |
- "leal (%0, %1), %%eax \n\t" |
|
1804 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
1805 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
1806 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1807 |
- |
|
1808 |
- "movq (%0), %%mm0 \n\t" // |
|
1809 |
- "movq (%%eax, %1), %%mm2 \n\t" // |
|
1810 |
- "movq (%%eax), %%mm1 \n\t" // |
|
1811 |
- "movq %%mm0, %%mm3 \n\t" |
|
1812 |
- "pmaxub %%mm1, %%mm0 \n\t" // |
|
1813 |
- "pminub %%mm3, %%mm1 \n\t" // |
|
1814 |
- "pmaxub %%mm2, %%mm1 \n\t" // |
|
1815 |
- "pminub %%mm1, %%mm0 \n\t" |
|
1816 |
- "movq %%mm0, (%%eax) \n\t" |
|
1817 |
- |
|
1818 |
- "movq (%0, %1, 4), %%mm0 \n\t" // |
|
1819 |
- "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
1820 |
- "movq %%mm2, %%mm3 \n\t" |
|
1821 |
- "pmaxub %%mm1, %%mm2 \n\t" // |
|
1822 |
- "pminub %%mm3, %%mm1 \n\t" // |
|
1823 |
- "pmaxub %%mm0, %%mm1 \n\t" // |
|
1824 |
- "pminub %%mm1, %%mm2 \n\t" |
|
1825 |
- "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
1826 |
- |
|
1827 |
- "movq (%%edx), %%mm2 \n\t" // |
|
1828 |
- "movq (%%edx, %1), %%mm1 \n\t" // |
|
1829 |
- "movq %%mm2, %%mm3 \n\t" |
|
1830 |
- "pmaxub %%mm0, %%mm2 \n\t" // |
|
1831 |
- "pminub %%mm3, %%mm0 \n\t" // |
|
1832 |
- "pmaxub %%mm1, %%mm0 \n\t" // |
|
1833 |
- "pminub %%mm0, %%mm2 \n\t" |
|
1834 |
- "movq %%mm2, (%%edx) \n\t" |
|
1835 |
- |
|
1836 |
- "movq (%%edx, %1, 2), %%mm2 \n\t" // |
|
1837 |
- "movq (%0, %1, 8), %%mm0 \n\t" // |
|
1838 |
- "movq %%mm2, %%mm3 \n\t" |
|
1839 |
- "pmaxub %%mm0, %%mm2 \n\t" // |
|
1840 |
- "pminub %%mm3, %%mm0 \n\t" // |
|
1841 |
- "pmaxub %%mm1, %%mm0 \n\t" // |
|
1842 |
- "pminub %%mm0, %%mm2 \n\t" |
|
1843 |
- "movq %%mm2, (%%edx, %1, 2) \n\t" |
|
1844 |
- |
|
1845 |
- |
|
1846 |
- : : "r" (src), "r" (stride) |
|
1847 |
- : "%eax", "%edx" |
|
1848 |
- ); |
|
1849 |
- |
|
1850 |
-#else // MMX without MMX2 |
|
1851 |
- asm volatile( |
|
1852 |
- "leal (%0, %1), %%eax \n\t" |
|
1853 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
1854 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
1855 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1856 |
- "pxor %%mm7, %%mm7 \n\t" |
|
1857 |
- |
|
1858 |
-#define MEDIAN(a,b,c)\ |
|
1859 |
- "movq " #a ", %%mm0 \n\t"\ |
|
1860 |
- "movq " #b ", %%mm2 \n\t"\ |
|
1861 |
- "movq " #c ", %%mm1 \n\t"\ |
|
1862 |
- "movq %%mm0, %%mm3 \n\t"\ |
|
1863 |
- "movq %%mm1, %%mm4 \n\t"\ |
|
1864 |
- "movq %%mm2, %%mm5 \n\t"\ |
|
1865 |
- "psubusb %%mm1, %%mm3 \n\t"\ |
|
1866 |
- "psubusb %%mm2, %%mm4 \n\t"\ |
|
1867 |
- "psubusb %%mm0, %%mm5 \n\t"\ |
|
1868 |
- "pcmpeqb %%mm7, %%mm3 \n\t"\ |
|
1869 |
- "pcmpeqb %%mm7, %%mm4 \n\t"\ |
|
1870 |
- "pcmpeqb %%mm7, %%mm5 \n\t"\ |
|
1871 |
- "movq %%mm3, %%mm6 \n\t"\ |
|
1872 |
- "pxor %%mm4, %%mm3 \n\t"\ |
|
1873 |
- "pxor %%mm5, %%mm4 \n\t"\ |
|
1874 |
- "pxor %%mm6, %%mm5 \n\t"\ |
|
1875 |
- "por %%mm3, %%mm1 \n\t"\ |
|
1876 |
- "por %%mm4, %%mm2 \n\t"\ |
|
1877 |
- "por %%mm5, %%mm0 \n\t"\ |
|
1878 |
- "pand %%mm2, %%mm0 \n\t"\ |
|
1879 |
- "pand %%mm1, %%mm0 \n\t"\ |
|
1880 |
- "movq %%mm0, " #b " \n\t" |
|
1881 |
- |
|
1882 |
-MEDIAN((%0), (%%eax), (%%eax, %1)) |
|
1883 |
-MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) |
|
1884 |
-MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
|
1885 |
-MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) |
|
1886 |
- |
|
1887 |
- : : "r" (src), "r" (stride) |
|
1888 |
- : "%eax", "%edx" |
|
1889 |
- ); |
|
1890 |
-#endif // MMX |
|
1891 |
-#else |
|
1892 |
- int x, y; |
|
1893 |
- src+= 4*stride; |
|
1894 |
- // FIXME - there should be a way to do a few columns in parallel like w/mmx |
|
1895 |
- for(x=0; x<8; x++) |
|
1896 |
- { |
|
1897 |
- uint8_t *colsrc = src; |
|
1898 |
- for (y=0; y<4; y++) |
|
1899 |
- { |
|
1900 |
- int a, b, c, d, e, f; |
|
1901 |
- a = colsrc[0 ]; |
|
1902 |
- b = colsrc[stride ]; |
|
1903 |
- c = colsrc[stride*2]; |
|
1904 |
- d = (a-b)>>31; |
|
1905 |
- e = (b-c)>>31; |
|
1906 |
- f = (c-a)>>31; |
|
1907 |
- colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); |
|
1908 |
- colsrc += stride*2; |
|
1909 |
- } |
|
1910 |
- src++; |
|
1911 |
- } |
|
1912 |
-#endif |
|
1913 |
-} |
|
1914 |
- |
|
1915 |
-#ifdef HAVE_MMX |
|
1916 |
-/** |
|
1917 |
- * transposes and shift the given 8x8 Block into dst1 and dst2 |
|
1918 |
- */ |
|
1919 |
-static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
|
1920 |
-{ |
|
1921 |
- asm( |
|
1922 |
- "leal (%0, %1), %%eax \n\t" |
|
1923 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
1924 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
1925 |
- "movq (%0), %%mm0 \n\t" // 12345678 |
|
1926 |
- "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
1927 |
- "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
1928 |
- "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
1929 |
- "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
1930 |
- |
|
1931 |
- "movq (%%eax, %1), %%mm1 \n\t" |
|
1932 |
- "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
1933 |
- "movq %%mm1, %%mm4 \n\t" |
|
1934 |
- "punpcklbw %%mm3, %%mm1 \n\t" |
|
1935 |
- "punpckhbw %%mm3, %%mm4 \n\t" |
|
1936 |
- |
|
1937 |
- "movq %%mm0, %%mm3 \n\t" |
|
1938 |
- "punpcklwd %%mm1, %%mm0 \n\t" |
|
1939 |
- "punpckhwd %%mm1, %%mm3 \n\t" |
|
1940 |
- "movq %%mm2, %%mm1 \n\t" |
|
1941 |
- "punpcklwd %%mm4, %%mm2 \n\t" |
|
1942 |
- "punpckhwd %%mm4, %%mm1 \n\t" |
|
1943 |
- |
|
1944 |
- "movd %%mm0, 128(%2) \n\t" |
|
1945 |
- "psrlq $32, %%mm0 \n\t" |
|
1946 |
- "movd %%mm0, 144(%2) \n\t" |
|
1947 |
- "movd %%mm3, 160(%2) \n\t" |
|
1948 |
- "psrlq $32, %%mm3 \n\t" |
|
1949 |
- "movd %%mm3, 176(%2) \n\t" |
|
1950 |
- "movd %%mm3, 48(%3) \n\t" |
|
1951 |
- "movd %%mm2, 192(%2) \n\t" |
|
1952 |
- "movd %%mm2, 64(%3) \n\t" |
|
1953 |
- "psrlq $32, %%mm2 \n\t" |
|
1954 |
- "movd %%mm2, 80(%3) \n\t" |
|
1955 |
- "movd %%mm1, 96(%3) \n\t" |
|
1956 |
- "psrlq $32, %%mm1 \n\t" |
|
1957 |
- "movd %%mm1, 112(%3) \n\t" |
|
1958 |
- |
|
1959 |
- "leal (%%eax, %1, 4), %%eax \n\t" |
|
1960 |
- |
|
1961 |
- "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
|
1962 |
- "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
1963 |
- "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
1964 |
- "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
1965 |
- "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
1966 |
- |
|
1967 |
- "movq (%%eax, %1), %%mm1 \n\t" |
|
1968 |
- "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
1969 |
- "movq %%mm1, %%mm4 \n\t" |
|
1970 |
- "punpcklbw %%mm3, %%mm1 \n\t" |
|
1971 |
- "punpckhbw %%mm3, %%mm4 \n\t" |
|
1972 |
- |
|
1973 |
- "movq %%mm0, %%mm3 \n\t" |
|
1974 |
- "punpcklwd %%mm1, %%mm0 \n\t" |
|
1975 |
- "punpckhwd %%mm1, %%mm3 \n\t" |
|
1976 |
- "movq %%mm2, %%mm1 \n\t" |
|
1977 |
- "punpcklwd %%mm4, %%mm2 \n\t" |
|
1978 |
- "punpckhwd %%mm4, %%mm1 \n\t" |
|
1979 |
- |
|
1980 |
- "movd %%mm0, 132(%2) \n\t" |
|
1981 |
- "psrlq $32, %%mm0 \n\t" |
|
1982 |
- "movd %%mm0, 148(%2) \n\t" |
|
1983 |
- "movd %%mm3, 164(%2) \n\t" |
|
1984 |
- "psrlq $32, %%mm3 \n\t" |
|
1985 |
- "movd %%mm3, 180(%2) \n\t" |
|
1986 |
- "movd %%mm3, 52(%3) \n\t" |
|
1987 |
- "movd %%mm2, 196(%2) \n\t" |
|
1988 |
- "movd %%mm2, 68(%3) \n\t" |
|
1989 |
- "psrlq $32, %%mm2 \n\t" |
|
1990 |
- "movd %%mm2, 84(%3) \n\t" |
|
1991 |
- "movd %%mm1, 100(%3) \n\t" |
|
1992 |
- "psrlq $32, %%mm1 \n\t" |
|
1993 |
- "movd %%mm1, 116(%3) \n\t" |
|
1994 |
- |
|
1995 |
- |
|
1996 |
- :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
|
1997 |
- : "%eax" |
|
1998 |
- ); |
|
1999 |
-} |
|
2000 |
- |
|
2001 |
-/** |
|
2002 |
- * transposes the given 8x8 block |
|
2003 |
- */ |
|
2004 |
-static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
|
2005 |
-{ |
|
2006 |
- asm( |
|
2007 |
- "leal (%0, %1), %%eax \n\t" |
|
2008 |
- "leal (%%eax, %1, 4), %%edx \n\t" |
|
2009 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
2010 |
-// %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
2011 |
- "movq (%2), %%mm0 \n\t" // 12345678 |
|
2012 |
- "movq 16(%2), %%mm1 \n\t" // abcdefgh |
|
2013 |
- "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
2014 |
- "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
2015 |
- "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
2016 |
- |
|
2017 |
- "movq 32(%2), %%mm1 \n\t" |
|
2018 |
- "movq 48(%2), %%mm3 \n\t" |
|
2019 |
- "movq %%mm1, %%mm4 \n\t" |
|
2020 |
- "punpcklbw %%mm3, %%mm1 \n\t" |
|
2021 |
- "punpckhbw %%mm3, %%mm4 \n\t" |
|
2022 |
- |
|
2023 |
- "movq %%mm0, %%mm3 \n\t" |
|
2024 |
- "punpcklwd %%mm1, %%mm0 \n\t" |
|
2025 |
- "punpckhwd %%mm1, %%mm3 \n\t" |
|
2026 |
- "movq %%mm2, %%mm1 \n\t" |
|
2027 |
- "punpcklwd %%mm4, %%mm2 \n\t" |
|
2028 |
- "punpckhwd %%mm4, %%mm1 \n\t" |
|
2029 |
- |
|
2030 |
- "movd %%mm0, (%0) \n\t" |
|
2031 |
- "psrlq $32, %%mm0 \n\t" |
|
2032 |
- "movd %%mm0, (%%eax) \n\t" |
|
2033 |
- "movd %%mm3, (%%eax, %1) \n\t" |
|
2034 |
- "psrlq $32, %%mm3 \n\t" |
|
2035 |
- "movd %%mm3, (%%eax, %1, 2) \n\t" |
|
2036 |
- "movd %%mm2, (%0, %1, 4) \n\t" |
|
2037 |
- "psrlq $32, %%mm2 \n\t" |
|
2038 |
- "movd %%mm2, (%%edx) \n\t" |
|
2039 |
- "movd %%mm1, (%%edx, %1) \n\t" |
|
2040 |
- "psrlq $32, %%mm1 \n\t" |
|
2041 |
- "movd %%mm1, (%%edx, %1, 2) \n\t" |
|
2042 |
- |
|
2043 |
- |
|
2044 |
- "movq 64(%2), %%mm0 \n\t" // 12345678 |
|
2045 |
- "movq 80(%2), %%mm1 \n\t" // abcdefgh |
|
2046 |
- "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
2047 |
- "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
2048 |
- "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
2049 |
- |
|
2050 |
- "movq 96(%2), %%mm1 \n\t" |
|
2051 |
- "movq 112(%2), %%mm3 \n\t" |
|
2052 |
- "movq %%mm1, %%mm4 \n\t" |
|
2053 |
- "punpcklbw %%mm3, %%mm1 \n\t" |
|
2054 |
- "punpckhbw %%mm3, %%mm4 \n\t" |
|
2055 |
- |
|
2056 |
- "movq %%mm0, %%mm3 \n\t" |
|
2057 |
- "punpcklwd %%mm1, %%mm0 \n\t" |
|
2058 |
- "punpckhwd %%mm1, %%mm3 \n\t" |
|
2059 |
- "movq %%mm2, %%mm1 \n\t" |
|
2060 |
- "punpcklwd %%mm4, %%mm2 \n\t" |
|
2061 |
- "punpckhwd %%mm4, %%mm1 \n\t" |
|
2062 |
- |
|
2063 |
- "movd %%mm0, 4(%0) \n\t" |
|
2064 |
- "psrlq $32, %%mm0 \n\t" |
|
2065 |
- "movd %%mm0, 4(%%eax) \n\t" |
|
2066 |
- "movd %%mm3, 4(%%eax, %1) \n\t" |
|
2067 |
- "psrlq $32, %%mm3 \n\t" |
|
2068 |
- "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
|
2069 |
- "movd %%mm2, 4(%0, %1, 4) \n\t" |
|
2070 |
- "psrlq $32, %%mm2 \n\t" |
|
2071 |
- "movd %%mm2, 4(%%edx) \n\t" |
|
2072 |
- "movd %%mm1, 4(%%edx, %1) \n\t" |
|
2073 |
- "psrlq $32, %%mm1 \n\t" |
|
2074 |
- "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
|
2075 |
- |
|
2076 |
- :: "r" (dst), "r" (dstStride), "r" (src) |
|
2077 |
- : "%eax", "%edx" |
|
2078 |
- ); |
|
2079 |
-} |
|
2080 |
-#endif |
|
2081 |
-//static int test=0; |
|
2082 |
- |
|
2083 |
-static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
|
2084 |
- uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
|
2085 |
-{ |
|
2086 |
- // to save a register (FIXME do this outside of the loops) |
|
2087 |
- tempBluredPast[127]= maxNoise[0]; |
|
2088 |
- tempBluredPast[128]= maxNoise[1]; |
|
2089 |
- tempBluredPast[129]= maxNoise[2]; |
|
2090 |
- |
|
2091 |
-#define FAST_L2_DIFF |
|
2092 |
-//#define L1_DIFF //u should change the thresholds too if u try that one |
|
2093 |
-#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
2094 |
- asm volatile( |
|
2095 |
- "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
|
2096 |
- "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
|
2097 |
- "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2098 |
-// 0 1 2 3 4 5 6 7 8 9 |
|
2099 |
-// %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
|
2100 |
-//FIXME reorder? |
|
2101 |
-#ifdef L1_DIFF //needs mmx2 |
|
2102 |
- "movq (%0), %%mm0 \n\t" // L0 |
|
2103 |
- "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
|
2104 |
- "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2105 |
- "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
|
2106 |
- "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2107 |
- "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
|
2108 |
- "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2109 |
- "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
|
2110 |
- |
|
2111 |
- "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
2112 |
- "paddw %%mm1, %%mm0 \n\t" |
|
2113 |
- "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
|
2114 |
- "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
2115 |
- "paddw %%mm2, %%mm0 \n\t" |
|
2116 |
- "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
|
2117 |
- "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
2118 |
- "paddw %%mm3, %%mm0 \n\t" |
|
2119 |
- "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
|
2120 |
- "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
2121 |
- "paddw %%mm4, %%mm0 \n\t" |
|
2122 |
- "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
|
2123 |
- "paddw %%mm5, %%mm6 \n\t" |
|
2124 |
- "paddw %%mm7, %%mm6 \n\t" |
|
2125 |
- "paddw %%mm6, %%mm0 \n\t" |
|
2126 |
-#elif defined (FAST_L2_DIFF) |
|
2127 |
- "pcmpeqb %%mm7, %%mm7 \n\t" |
|
2128 |
- "movq "MANGLE(b80)", %%mm6 \n\t" |
|
2129 |
- "pxor %%mm0, %%mm0 \n\t" |
|
2130 |
-#define L2_DIFF_CORE(a, b)\ |
|
2131 |
- "movq " #a ", %%mm5 \n\t"\ |
|
2132 |
- "movq " #b ", %%mm2 \n\t"\ |
|
2133 |
- "pxor %%mm7, %%mm2 \n\t"\ |
|
2134 |
- PAVGB(%%mm2, %%mm5)\ |
|
2135 |
- "paddb %%mm6, %%mm5 \n\t"\ |
|
2136 |
- "movq %%mm5, %%mm2 \n\t"\ |
|
2137 |
- "psllw $8, %%mm5 \n\t"\ |
|
2138 |
- "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
2139 |
- "pmaddwd %%mm2, %%mm2 \n\t"\ |
|
2140 |
- "paddd %%mm2, %%mm5 \n\t"\ |
|
2141 |
- "psrld $14, %%mm5 \n\t"\ |
|
2142 |
- "paddd %%mm5, %%mm0 \n\t" |
|
2143 |
- |
|
2144 |
-L2_DIFF_CORE((%0), (%1)) |
|
2145 |
-L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
2146 |
-L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
2147 |
-L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
2148 |
-L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
|
2149 |
-L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
2150 |
-L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
2151 |
-L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
2152 |
- |
|
2153 |
-#else |
|
2154 |
- "pxor %%mm7, %%mm7 \n\t" |
|
2155 |
- "pxor %%mm0, %%mm0 \n\t" |
|
2156 |
-#define L2_DIFF_CORE(a, b)\ |
|
2157 |
- "movq " #a ", %%mm5 \n\t"\ |
|
2158 |
- "movq " #b ", %%mm2 \n\t"\ |
|
2159 |
- "movq %%mm5, %%mm1 \n\t"\ |
|
2160 |
- "movq %%mm2, %%mm3 \n\t"\ |
|
2161 |
- "punpcklbw %%mm7, %%mm5 \n\t"\ |
|
2162 |
- "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
2163 |
- "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
2164 |
- "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
2165 |
- "psubw %%mm2, %%mm5 \n\t"\ |
|
2166 |
- "psubw %%mm3, %%mm1 \n\t"\ |
|
2167 |
- "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
2168 |
- "pmaddwd %%mm1, %%mm1 \n\t"\ |
|
2169 |
- "paddd %%mm1, %%mm5 \n\t"\ |
|
2170 |
- "paddd %%mm5, %%mm0 \n\t" |
|
2171 |
- |
|
2172 |
-L2_DIFF_CORE((%0), (%1)) |
|
2173 |
-L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
2174 |
-L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
2175 |
-L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
2176 |
-L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
|
2177 |
-L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
2178 |
-L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
2179 |
-L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
2180 |
- |
|
2181 |
-#endif |
|
2182 |
- |
|
2183 |
- "movq %%mm0, %%mm4 \n\t" |
|
2184 |
- "psrlq $32, %%mm0 \n\t" |
|
2185 |
- "paddd %%mm0, %%mm4 \n\t" |
|
2186 |
- "movd %%mm4, %%ecx \n\t" |
|
2187 |
- "shll $2, %%ecx \n\t" |
|
2188 |
- "movl %3, %%edx \n\t" |
|
2189 |
- "addl -4(%%edx), %%ecx \n\t" |
|
2190 |
- "addl 4(%%edx), %%ecx \n\t" |
|
2191 |
- "addl -1024(%%edx), %%ecx \n\t" |
|
2192 |
- "addl $4, %%ecx \n\t" |
|
2193 |
- "addl 1024(%%edx), %%ecx \n\t" |
|
2194 |
- "shrl $3, %%ecx \n\t" |
|
2195 |
- "movl %%ecx, (%%edx) \n\t" |
|
2196 |
- |
|
2197 |
-// "movl %3, %%ecx \n\t" |
|
2198 |
-// "movl %%ecx, test \n\t" |
|
2199 |
-// "jmp 4f \n\t" |
|
2200 |
- "cmpl 512(%%edx), %%ecx \n\t" |
|
2201 |
- " jb 2f \n\t" |
|
2202 |
- "cmpl 516(%%edx), %%ecx \n\t" |
|
2203 |
- " jb 1f \n\t" |
|
2204 |
- |
|
2205 |
- "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2206 |
- "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2207 |
- "movq (%0), %%mm0 \n\t" // L0 |
|
2208 |
- "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2209 |
- "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2210 |
- "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2211 |
- "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
2212 |
- "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
2213 |
- "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
2214 |
- "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
2215 |
- "movq %%mm0, (%1) \n\t" // L0 |
|
2216 |
- "movq %%mm1, (%1, %2) \n\t" // L1 |
|
2217 |
- "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
|
2218 |
- "movq %%mm3, (%1, %%eax) \n\t" // L3 |
|
2219 |
- "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
|
2220 |
- "movq %%mm5, (%1, %%edx) \n\t" // L5 |
|
2221 |
- "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
|
2222 |
- "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
|
2223 |
- "jmp 4f \n\t" |
|
2224 |
- |
|
2225 |
- "1: \n\t" |
|
2226 |
- "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2227 |
- "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2228 |
- "movq (%0), %%mm0 \n\t" // L0 |
|
2229 |
- PAVGB((%1), %%mm0) // L0 |
|
2230 |
- "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2231 |
- PAVGB((%1, %2), %%mm1) // L1 |
|
2232 |
- "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2233 |
- PAVGB((%1, %2, 2), %%mm2) // L2 |
|
2234 |
- "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2235 |
- PAVGB((%1, %%eax), %%mm3) // L3 |
|
2236 |
- "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
2237 |
- PAVGB((%1, %2, 4), %%mm4) // L4 |
|
2238 |
- "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
2239 |
- PAVGB((%1, %%edx), %%mm5) // L5 |
|
2240 |
- "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
2241 |
- PAVGB((%1, %%eax, 2), %%mm6) // L6 |
|
2242 |
- "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
2243 |
- PAVGB((%1, %%ecx), %%mm7) // L7 |
|
2244 |
- "movq %%mm0, (%1) \n\t" // R0 |
|
2245 |
- "movq %%mm1, (%1, %2) \n\t" // R1 |
|
2246 |
- "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
2247 |
- "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
2248 |
- "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
|
2249 |
- "movq %%mm5, (%1, %%edx) \n\t" // R5 |
|
2250 |
- "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
|
2251 |
- "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
|
2252 |
- "movq %%mm0, (%0) \n\t" // L0 |
|
2253 |
- "movq %%mm1, (%0, %2) \n\t" // L1 |
|
2254 |
- "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
2255 |
- "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
2256 |
- "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
|
2257 |
- "movq %%mm5, (%0, %%edx) \n\t" // L5 |
|
2258 |
- "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
|
2259 |
- "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
|
2260 |
- "jmp 4f \n\t" |
|
2261 |
- |
|
2262 |
- "2: \n\t" |
|
2263 |
- "cmpl 508(%%edx), %%ecx \n\t" |
|
2264 |
- " jb 3f \n\t" |
|
2265 |
- |
|
2266 |
- "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2267 |
- "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2268 |
- "movq (%0), %%mm0 \n\t" // L0 |
|
2269 |
- "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2270 |
- "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2271 |
- "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2272 |
- "movq (%1), %%mm4 \n\t" // R0 |
|
2273 |
- "movq (%1, %2), %%mm5 \n\t" // R1 |
|
2274 |
- "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
2275 |
- "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
2276 |
- PAVGB(%%mm4, %%mm0) |
|
2277 |
- PAVGB(%%mm5, %%mm1) |
|
2278 |
- PAVGB(%%mm6, %%mm2) |
|
2279 |
- PAVGB(%%mm7, %%mm3) |
|
2280 |
- PAVGB(%%mm4, %%mm0) |
|
2281 |
- PAVGB(%%mm5, %%mm1) |
|
2282 |
- PAVGB(%%mm6, %%mm2) |
|
2283 |
- PAVGB(%%mm7, %%mm3) |
|
2284 |
- "movq %%mm0, (%1) \n\t" // R0 |
|
2285 |
- "movq %%mm1, (%1, %2) \n\t" // R1 |
|
2286 |
- "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
2287 |
- "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
2288 |
- "movq %%mm0, (%0) \n\t" // L0 |
|
2289 |
- "movq %%mm1, (%0, %2) \n\t" // L1 |
|
2290 |
- "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
2291 |
- "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
2292 |
- |
|
2293 |
- "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
|
2294 |
- "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
2295 |
- "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
2296 |
- "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
2297 |
- "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
|
2298 |
- "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
2299 |
- "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
2300 |
- "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
2301 |
- PAVGB(%%mm4, %%mm0) |
|
2302 |
- PAVGB(%%mm5, %%mm1) |
|
2303 |
- PAVGB(%%mm6, %%mm2) |
|
2304 |
- PAVGB(%%mm7, %%mm3) |
|
2305 |
- PAVGB(%%mm4, %%mm0) |
|
2306 |
- PAVGB(%%mm5, %%mm1) |
|
2307 |
- PAVGB(%%mm6, %%mm2) |
|
2308 |
- PAVGB(%%mm7, %%mm3) |
|
2309 |
- "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
|
2310 |
- "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
2311 |
- "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
2312 |
- "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
2313 |
- "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
|
2314 |
- "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
2315 |
- "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
2316 |
- "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
2317 |
- "jmp 4f \n\t" |
|
2318 |
- |
|
2319 |
- "3: \n\t" |
|
2320 |
- "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
|
2321 |
- "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride |
|
2322 |
- "movq (%0), %%mm0 \n\t" // L0 |
|
2323 |
- "movq (%0, %2), %%mm1 \n\t" // L1 |
|
2324 |
- "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
2325 |
- "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
2326 |
- "movq (%1), %%mm4 \n\t" // R0 |
|
2327 |
- "movq (%1, %2), %%mm5 \n\t" // R1 |
|
2328 |
- "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
2329 |
- "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
2330 |
- PAVGB(%%mm4, %%mm0) |
|
2331 |
- PAVGB(%%mm5, %%mm1) |
|
2332 |
- PAVGB(%%mm6, %%mm2) |
|
2333 |
- PAVGB(%%mm7, %%mm3) |
|
2334 |
- PAVGB(%%mm4, %%mm0) |
|
2335 |
- PAVGB(%%mm5, %%mm1) |
|
2336 |
- PAVGB(%%mm6, %%mm2) |
|
2337 |
- PAVGB(%%mm7, %%mm3) |
|
2338 |
- PAVGB(%%mm4, %%mm0) |
|
2339 |
- PAVGB(%%mm5, %%mm1) |
|
2340 |
- PAVGB(%%mm6, %%mm2) |
|
2341 |
- PAVGB(%%mm7, %%mm3) |
|
2342 |
- "movq %%mm0, (%1) \n\t" // R0 |
|
2343 |
- "movq %%mm1, (%1, %2) \n\t" // R1 |
|
2344 |
- "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
2345 |
- "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
2346 |
- "movq %%mm0, (%0) \n\t" // L0 |
|
2347 |
- "movq %%mm1, (%0, %2) \n\t" // L1 |
|
2348 |
- "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
2349 |
- "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
2350 |
- |
|
2351 |
- "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
|
2352 |
- "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
2353 |
- "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
2354 |
- "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
2355 |
- "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
|
2356 |
- "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
2357 |
- "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
2358 |
- "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
2359 |
- PAVGB(%%mm4, %%mm0) |
|
2360 |
- PAVGB(%%mm5, %%mm1) |
|
2361 |
- PAVGB(%%mm6, %%mm2) |
|
2362 |
- PAVGB(%%mm7, %%mm3) |
|
2363 |
- PAVGB(%%mm4, %%mm0) |
|
2364 |
- PAVGB(%%mm5, %%mm1) |
|
2365 |
- PAVGB(%%mm6, %%mm2) |
|
2366 |
- PAVGB(%%mm7, %%mm3) |
|
2367 |
- PAVGB(%%mm4, %%mm0) |
|
2368 |
- PAVGB(%%mm5, %%mm1) |
|
2369 |
- PAVGB(%%mm6, %%mm2) |
|
2370 |
- PAVGB(%%mm7, %%mm3) |
|
2371 |
- "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
|
2372 |
- "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
2373 |
- "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
2374 |
- "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
2375 |
- "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
|
2376 |
- "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
2377 |
- "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
2378 |
- "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
2379 |
- |
|
2380 |
- "4: \n\t" |
|
2381 |
- |
|
2382 |
- :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
|
2383 |
- : "%eax", "%edx", "%ecx", "memory" |
|
2384 |
- ); |
|
2385 |
-//printf("%d\n", test); |
|
2386 |
-#else |
|
2387 |
-{ |
|
2388 |
- int y; |
|
2389 |
- int d=0; |
|
2390 |
- int sysd=0; |
|
2391 |
- int i; |
|
2392 |
- |
|
2393 |
- for(y=0; y<8; y++) |
|
2394 |
- { |
|
2395 |
- int x; |
|
2396 |
- for(x=0; x<8; x++) |
|
2397 |
- { |
|
2398 |
- int ref= tempBlured[ x + y*stride ]; |
|
2399 |
- int cur= src[ x + y*stride ]; |
|
2400 |
- int d1=ref - cur; |
|
2401 |
-// if(x==0 || x==7) d1+= d1>>1; |
|
2402 |
-// if(y==0 || y==7) d1+= d1>>1; |
|
2403 |
-// d+= ABS(d1); |
|
2404 |
- d+= d1*d1; |
|
2405 |
- sysd+= d1; |
|
2406 |
- } |
|
2407 |
- } |
|
2408 |
- i=d; |
|
2409 |
- d= ( |
|
2410 |
- 4*d |
|
2411 |
- +(*(tempBluredPast-256)) |
|
2412 |
- +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) |
|
2413 |
- +(*(tempBluredPast+256)) |
|
2414 |
- +4)>>3; |
|
2415 |
- *tempBluredPast=i; |
|
2416 |
-// ((*tempBluredPast)*3 + d + 2)>>2; |
|
2417 |
- |
|
2418 |
-//printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
|
2419 |
-/* |
|
2420 |
-Switch between |
|
2421 |
- 1 0 0 0 0 0 0 (0) |
|
2422 |
-64 32 16 8 4 2 1 (1) |
|
2423 |
-64 48 36 27 20 15 11 (33) (approx) |
|
2424 |
-64 56 49 43 37 33 29 (200) (approx) |
|
2425 |
-*/ |
|
2426 |
- if(d > maxNoise[1]) |
|
2427 |
- { |
|
2428 |
- if(d < maxNoise[2]) |
|
2429 |
- { |
|
2430 |
- for(y=0; y<8; y++) |
|
2431 |
- { |
|
2432 |
- int x; |
|
2433 |
- for(x=0; x<8; x++) |
|
2434 |
- { |
|
2435 |
- int ref= tempBlured[ x + y*stride ]; |
|
2436 |
- int cur= src[ x + y*stride ]; |
|
2437 |
- tempBlured[ x + y*stride ]= |
|
2438 |
- src[ x + y*stride ]= |
|
2439 |
- (ref + cur + 1)>>1; |
|
2440 |
- } |
|
2441 |
- } |
|
2442 |
- } |
|
2443 |
- else |
|
2444 |
- { |
|
2445 |
- for(y=0; y<8; y++) |
|
2446 |
- { |
|
2447 |
- int x; |
|
2448 |
- for(x=0; x<8; x++) |
|
2449 |
- { |
|
2450 |
- tempBlured[ x + y*stride ]= src[ x + y*stride ]; |
|
2451 |
- } |
|
2452 |
- } |
|
2453 |
- } |
|
2454 |
- } |
|
2455 |
- else |
|
2456 |
- { |
|
2457 |
- if(d < maxNoise[0]) |
|
2458 |
- { |
|
2459 |
- for(y=0; y<8; y++) |
|
2460 |
- { |
|
2461 |
- int x; |
|
2462 |
- for(x=0; x<8; x++) |
|
2463 |
- { |
|
2464 |
- int ref= tempBlured[ x + y*stride ]; |
|
2465 |
- int cur= src[ x + y*stride ]; |
|
2466 |
- tempBlured[ x + y*stride ]= |
|
2467 |
- src[ x + y*stride ]= |
|
2468 |
- (ref*7 + cur + 4)>>3; |
|
2469 |
- } |
|
2470 |
- } |
|
2471 |
- } |
|
2472 |
- else |
|
2473 |
- { |
|
2474 |
- for(y=0; y<8; y++) |
|
2475 |
- { |
|
2476 |
- int x; |
|
2477 |
- for(x=0; x<8; x++) |
|
2478 |
- { |
|
2479 |
- int ref= tempBlured[ x + y*stride ]; |
|
2480 |
- int cur= src[ x + y*stride ]; |
|
2481 |
- tempBlured[ x + y*stride ]= |
|
2482 |
- src[ x + y*stride ]= |
|
2483 |
- (ref*3 + cur + 2)>>2; |
|
2484 |
- } |
|
2485 |
- } |
|
2486 |
- } |
|
2487 |
- } |
|
2488 |
-} |
|
2489 |
-#endif |
|
2490 |
-} |
|
2491 |
- |
|
2492 |
-static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
2493 |
- QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
|
2494 |
- |
|
2495 |
-/** |
|
2496 |
- * Copies a block from src to dst and fixes the blacklevel |
|
2497 |
- * levelFix == 0 -> dont touch the brighness & contrast |
|
2498 |
- */ |
|
2499 |
-#undef SCALED_CPY |
|
2500 |
- |
|
2501 |
-static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
|
2502 |
- int levelFix, int64_t *packedOffsetAndScale) |
|
2503 |
-{ |
|
2504 |
-#ifndef HAVE_MMX |
|
2505 |
- int i; |
|
2506 |
-#endif |
|
2507 |
- if(levelFix) |
|
2508 |
- { |
|
2509 |
-#ifdef HAVE_MMX |
|
2510 |
- asm volatile( |
|
2511 |
- "movq (%%eax), %%mm2 \n\t" // packedYOffset |
|
2512 |
- "movq 8(%%eax), %%mm3 \n\t" // packedYScale |
|
2513 |
- "leal (%2,%4), %%eax \n\t" |
|
2514 |
- "leal (%3,%5), %%edx \n\t" |
|
2515 |
- "pxor %%mm4, %%mm4 \n\t" |
|
2516 |
-#ifdef HAVE_MMX2 |
|
2517 |
-#define SCALED_CPY(src1, src2, dst1, dst2) \ |
|
2518 |
- "movq " #src1 ", %%mm0 \n\t"\ |
|
2519 |
- "movq " #src1 ", %%mm5 \n\t"\ |
|
2520 |
- "movq " #src2 ", %%mm1 \n\t"\ |
|
2521 |
- "movq " #src2 ", %%mm6 \n\t"\ |
|
2522 |
- "punpcklbw %%mm0, %%mm0 \n\t"\ |
|
2523 |
- "punpckhbw %%mm5, %%mm5 \n\t"\ |
|
2524 |
- "punpcklbw %%mm1, %%mm1 \n\t"\ |
|
2525 |
- "punpckhbw %%mm6, %%mm6 \n\t"\ |
|
2526 |
- "pmulhuw %%mm3, %%mm0 \n\t"\ |
|
2527 |
- "pmulhuw %%mm3, %%mm5 \n\t"\ |
|
2528 |
- "pmulhuw %%mm3, %%mm1 \n\t"\ |
|
2529 |
- "pmulhuw %%mm3, %%mm6 \n\t"\ |
|
2530 |
- "psubw %%mm2, %%mm0 \n\t"\ |
|
2531 |
- "psubw %%mm2, %%mm5 \n\t"\ |
|
2532 |
- "psubw %%mm2, %%mm1 \n\t"\ |
|
2533 |
- "psubw %%mm2, %%mm6 \n\t"\ |
|
2534 |
- "packuswb %%mm5, %%mm0 \n\t"\ |
|
2535 |
- "packuswb %%mm6, %%mm1 \n\t"\ |
|
2536 |
- "movq %%mm0, " #dst1 " \n\t"\ |
|
2537 |
- "movq %%mm1, " #dst2 " \n\t"\ |
|
2538 |
- |
|
2539 |
-#else //HAVE_MMX2 |
|
2540 |
-#define SCALED_CPY(src1, src2, dst1, dst2) \ |
|
2541 |
- "movq " #src1 ", %%mm0 \n\t"\ |
|
2542 |
- "movq " #src1 ", %%mm5 \n\t"\ |
|
2543 |
- "punpcklbw %%mm4, %%mm0 \n\t"\ |
|
2544 |
- "punpckhbw %%mm4, %%mm5 \n\t"\ |
|
2545 |
- "psubw %%mm2, %%mm0 \n\t"\ |
|
2546 |
- "psubw %%mm2, %%mm5 \n\t"\ |
|
2547 |
- "movq " #src2 ", %%mm1 \n\t"\ |
|
2548 |
- "psllw $6, %%mm0 \n\t"\ |
|
2549 |
- "psllw $6, %%mm5 \n\t"\ |
|
2550 |
- "pmulhw %%mm3, %%mm0 \n\t"\ |
|
2551 |
- "movq " #src2 ", %%mm6 \n\t"\ |
|
2552 |
- "pmulhw %%mm3, %%mm5 \n\t"\ |
|
2553 |
- "punpcklbw %%mm4, %%mm1 \n\t"\ |
|
2554 |
- "punpckhbw %%mm4, %%mm6 \n\t"\ |
|
2555 |
- "psubw %%mm2, %%mm1 \n\t"\ |
|
2556 |
- "psubw %%mm2, %%mm6 \n\t"\ |
|
2557 |
- "psllw $6, %%mm1 \n\t"\ |
|
2558 |
- "psllw $6, %%mm6 \n\t"\ |
|
2559 |
- "pmulhw %%mm3, %%mm1 \n\t"\ |
|
2560 |
- "pmulhw %%mm3, %%mm6 \n\t"\ |
|
2561 |
- "packuswb %%mm5, %%mm0 \n\t"\ |
|
2562 |
- "packuswb %%mm6, %%mm1 \n\t"\ |
|
2563 |
- "movq %%mm0, " #dst1 " \n\t"\ |
|
2564 |
- "movq %%mm1, " #dst2 " \n\t"\ |
|
2565 |
- |
|
2566 |
-#endif //!HAVE_MMX2 |
|
2567 |
- |
|
2568 |
-SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
|
2569 |
-SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) |
|
2570 |
-SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) |
|
2571 |
- "leal (%%eax,%4,4), %%eax \n\t" |
|
2572 |
- "leal (%%edx,%5,4), %%edx \n\t" |
|
2573 |
-SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) |
|
2574 |
- |
|
2575 |
- |
|
2576 |
- : "=&a" (packedOffsetAndScale) |
|
2577 |
- : "0" (packedOffsetAndScale), |
|
2578 |
- "r"(src), |
|
2579 |
- "r"(dst), |
|
2580 |
- "r" (srcStride), |
|
2581 |
- "r" (dstStride) |
|
2582 |
- : "%edx" |
|
2583 |
- ); |
|
2584 |
-#else |
|
2585 |
- for(i=0; i<8; i++) |
|
2586 |
- memcpy( &(dst[dstStride*i]), |
|
2587 |
- &(src[srcStride*i]), BLOCK_SIZE); |
|
2588 |
-#endif |
|
2589 |
- } |
|
2590 |
- else |
|
2591 |
- { |
|
2592 |
-#ifdef HAVE_MMX |
|
2593 |
- asm volatile( |
|
2594 |
- "leal (%0,%2), %%eax \n\t" |
|
2595 |
- "leal (%1,%3), %%edx \n\t" |
|
2596 |
- |
|
2597 |
-#define SIMPLE_CPY(src1, src2, dst1, dst2) \ |
|
2598 |
- "movq " #src1 ", %%mm0 \n\t"\ |
|
2599 |
- "movq " #src2 ", %%mm1 \n\t"\ |
|
2600 |
- "movq %%mm0, " #dst1 " \n\t"\ |
|
2601 |
- "movq %%mm1, " #dst2 " \n\t"\ |
|
2602 |
- |
|
2603 |
-SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) |
|
2604 |
-SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) |
|
2605 |
-SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) |
|
2606 |
- "leal (%%eax,%2,4), %%eax \n\t" |
|
2607 |
- "leal (%%edx,%3,4), %%edx \n\t" |
|
2608 |
-SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) |
|
2609 |
- |
|
2610 |
- : : "r" (src), |
|
2611 |
- "r" (dst), |
|
2612 |
- "r" (srcStride), |
|
2613 |
- "r" (dstStride) |
|
2614 |
- : "%eax", "%edx" |
|
2615 |
- ); |
|
2616 |
-#else |
|
2617 |
- for(i=0; i<8; i++) |
|
2618 |
- memcpy( &(dst[dstStride*i]), |
|
2619 |
- &(src[srcStride*i]), BLOCK_SIZE); |
|
2620 |
-#endif |
|
2621 |
- } |
|
2622 |
-} |
|
2623 |
- |
|
2624 |
-/** |
|
2625 |
- * Duplicates the given 8 src pixels ? times upward |
|
2626 |
- */ |
|
2627 |
-static inline void RENAME(duplicate)(uint8_t src[], int stride) |
|
2628 |
-{ |
|
2629 |
-#ifdef HAVE_MMX |
|
2630 |
- asm volatile( |
|
2631 |
- "movq (%0), %%mm0 \n\t" |
|
2632 |
- "addl %1, %0 \n\t" |
|
2633 |
- "movq %%mm0, (%0) \n\t" |
|
2634 |
- "movq %%mm0, (%0, %1) \n\t" |
|
2635 |
- "movq %%mm0, (%0, %1, 2) \n\t" |
|
2636 |
- : "+r" (src) |
|
2637 |
- : "r" (-stride) |
|
2638 |
- ); |
|
2639 |
-#else |
|
2640 |
- int i; |
|
2641 |
- uint8_t *p=src; |
|
2642 |
- for(i=0; i<3; i++) |
|
2643 |
- { |
|
2644 |
- p-= stride; |
|
2645 |
- memcpy(p, src, 8); |
|
2646 |
- } |
|
2647 |
-#endif |
|
2648 |
-} |
|
2649 |
- |
|
2650 |
-/** |
|
2651 |
- * Filters array of bytes (Y or U or V values) |
|
2652 |
- */ |
|
2653 |
-static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
|
2654 |
- QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
|
2655 |
-{ |
|
2656 |
- PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
|
2657 |
- int x,y; |
|
2658 |
-#ifdef COMPILE_TIME_MODE |
|
2659 |
- const int mode= COMPILE_TIME_MODE; |
|
2660 |
-#else |
|
2661 |
- const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
|
2662 |
-#endif |
|
2663 |
- int black=0, white=255; // blackest black and whitest white in the picture |
|
2664 |
- int QPCorrecture= 256*256; |
|
2665 |
- |
|
2666 |
- int copyAhead; |
|
2667 |
-#ifdef HAVE_MMX |
|
2668 |
- int i; |
|
2669 |
-#endif |
|
2670 |
- |
|
2671 |
- const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
|
2672 |
- const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; |
|
2673 |
- |
|
2674 |
- //FIXME remove |
|
2675 |
- uint64_t * const yHistogram= c.yHistogram; |
|
2676 |
- uint8_t * const tempSrc= c.tempSrc; |
|
2677 |
- uint8_t * const tempDst= c.tempDst; |
|
2678 |
- const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
|
2679 |
- |
|
2680 |
-#ifdef HAVE_MMX |
|
2681 |
- for(i=0; i<32; i++){ |
|
2682 |
- int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
|
2683 |
- int threshold= offset*2 + 1; |
|
2684 |
- c.mmxDcOffset[i]= 0x7F - offset; |
|
2685 |
- c.mmxDcThreshold[i]= 0x7F - threshold; |
|
2686 |
- c.mmxDcOffset[i]*= 0x0101010101010101LL; |
|
2687 |
- c.mmxDcThreshold[i]*= 0x0101010101010101LL; |
|
2688 |
- } |
|
2689 |
-#endif |
|
2690 |
- |
|
2691 |
- if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
|
2692 |
- else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
|
2693 |
- || (mode & FFMPEG_DEINT_FILTER)) copyAhead=14; |
|
2694 |
- else if( (mode & V_DEBLOCK) |
|
2695 |
- || (mode & LINEAR_IPOL_DEINT_FILTER) |
|
2696 |
- || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; |
|
2697 |
- else if(mode & V_X1_FILTER) copyAhead=11; |
|
2698 |
-// else if(mode & V_RK1_FILTER) copyAhead=10; |
|
2699 |
- else if(mode & DERING) copyAhead=9; |
|
2700 |
- else copyAhead=8; |
|
2701 |
- |
|
2702 |
- copyAhead-= 8; |
|
2703 |
- |
|
2704 |
- if(!isColor) |
|
2705 |
- { |
|
2706 |
- uint64_t sum= 0; |
|
2707 |
- int i; |
|
2708 |
- uint64_t maxClipped; |
|
2709 |
- uint64_t clipped; |
|
2710 |
- double scale; |
|
2711 |
- |
|
2712 |
- c.frameNum++; |
|
2713 |
- // first frame is fscked so we ignore it |
|
2714 |
- if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; |
|
2715 |
- |
|
2716 |
- for(i=0; i<256; i++) |
|
2717 |
- { |
|
2718 |
- sum+= yHistogram[i]; |
|
2719 |
-// printf("%d ", yHistogram[i]); |
|
2720 |
- } |
|
2721 |
-// printf("\n\n"); |
|
2722 |
- |
|
2723 |
- /* we allways get a completly black picture first */ |
|
2724 |
- maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
|
2725 |
- |
|
2726 |
- clipped= sum; |
|
2727 |
- for(black=255; black>0; black--) |
|
2728 |
- { |
|
2729 |
- if(clipped < maxClipped) break; |
|
2730 |
- clipped-= yHistogram[black]; |
|
2731 |
- } |
|
2732 |
- |
|
2733 |
- clipped= sum; |
|
2734 |
- for(white=0; white<256; white++) |
|
2735 |
- { |
|
2736 |
- if(clipped < maxClipped) break; |
|
2737 |
- clipped-= yHistogram[white]; |
|
2738 |
- } |
|
2739 |
- |
|
2740 |
- scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
|
2741 |
- |
|
2742 |
-#ifdef HAVE_MMX2 |
|
2743 |
- c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
|
2744 |
- c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; |
|
2745 |
-#else |
|
2746 |
- c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
|
2747 |
- c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; |
|
2748 |
-#endif |
|
2749 |
- |
|
2750 |
- c.packedYOffset|= c.packedYOffset<<32; |
|
2751 |
- c.packedYOffset|= c.packedYOffset<<16; |
|
2752 |
- |
|
2753 |
- c.packedYScale|= c.packedYScale<<32; |
|
2754 |
- c.packedYScale|= c.packedYScale<<16; |
|
2755 |
- |
|
2756 |
- if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); |
|
2757 |
- else QPCorrecture= 256*256; |
|
2758 |
- } |
|
2759 |
- else |
|
2760 |
- { |
|
2761 |
- c.packedYScale= 0x0100010001000100LL; |
|
2762 |
- c.packedYOffset= 0; |
|
2763 |
- QPCorrecture= 256*256; |
|
2764 |
- } |
|
2765 |
- |
|
2766 |
- /* copy & deinterlace first row of blocks */ |
|
2767 |
- y=-BLOCK_SIZE; |
|
2768 |
- { |
|
2769 |
- uint8_t *srcBlock= &(src[y*srcStride]); |
|
2770 |
- uint8_t *dstBlock= tempDst + dstStride; |
|
2771 |
- |
|
2772 |
- // From this point on it is guranteed that we can read and write 16 lines downward |
|
2773 | ||
2774 |
- // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
|
2775 |
- for(x=0; x<width; x+=BLOCK_SIZE) |
|
2776 |
- { |
|
2777 |
- |
|
2778 |
-#ifdef HAVE_MMX2 |
|
2779 |
-/* |
|
2780 |
- prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
|
2781 |
- prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
|
2782 |
- prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
|
2783 |
- prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
|
2784 |
-*/ |
|
2785 |
- |
|
2786 |
- asm( |
|
2787 |
- "movl %4, %%eax \n\t" |
|
2788 |
- "shrl $2, %%eax \n\t" |
|
2789 |
- "andl $6, %%eax \n\t" |
|
2790 |
- "addl %5, %%eax \n\t" |
|
2791 |
- "movl %%eax, %%edx \n\t" |
|
2792 |
- "imul %1, %%eax \n\t" |
|
2793 |
- "imul %3, %%edx \n\t" |
|
2794 |
- "prefetchnta 32(%%eax, %0) \n\t" |
|
2795 |
- "prefetcht0 32(%%edx, %2) \n\t" |
|
2796 |
- "addl %1, %%eax \n\t" |
|
2797 |
- "addl %3, %%edx \n\t" |
|
2798 |
- "prefetchnta 32(%%eax, %0) \n\t" |
|
2799 |
- "prefetcht0 32(%%edx, %2) \n\t" |
|
2800 |
- :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
|
2801 |
- "m" (x), "m" (copyAhead) |
|
2802 |
- : "%eax", "%edx" |
|
2803 |
- ); |
|
2804 |
- |
|
2805 |
-#elif defined(HAVE_3DNOW) |
|
2806 |
-//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
|
2807 |
-/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
|
2808 |
- prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
|
2809 |
- prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
|
2810 |
- prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
|
2811 |
-*/ |
|
2812 |
-#endif |
|
2813 |
- |
|
2814 |
- RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
|
2815 |
- srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
2816 |
- |
|
2817 |
- RENAME(duplicate)(dstBlock + dstStride*8, dstStride); |
|
2818 |
- |
|
2819 |
- if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2820 |
- RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
|
2821 |
- else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2822 |
- RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
|
2823 |
- else if(mode & MEDIAN_DEINT_FILTER) |
|
2824 |
- RENAME(deInterlaceMedian)(dstBlock, dstStride); |
|
2825 |
- else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2826 |
- RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
|
2827 |
- else if(mode & FFMPEG_DEINT_FILTER) |
|
2828 |
- RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); |
|
2829 |
-/* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2830 |
- RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
2831 |
-*/ |
|
2832 |
- dstBlock+=8; |
|
2833 |
- srcBlock+=8; |
|
2834 |
- } |
|
2835 |
- if(width==dstStride) |
|
2836 |
- memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride); |
|
2837 |
- else |
|
2838 |
- { |
|
2839 |
- int i; |
|
2840 |
- for(i=0; i<copyAhead; i++) |
|
2841 |
- { |
|
2842 |
- memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); |
|
2843 |
- } |
|
2844 |
- } |
|
2845 |
- } |
|
2846 |
- |
|
2847 |
-//printf("\n"); |
|
2848 |
- for(y=0; y<height; y+=BLOCK_SIZE) |
|
2849 |
- { |
|
2850 |
- //1% speedup if these are here instead of the inner loop |
|
2851 |
- uint8_t *srcBlock= &(src[y*srcStride]); |
|
2852 |
- uint8_t *dstBlock= &(dst[y*dstStride]); |
|
2853 |
-#ifdef HAVE_MMX |
|
2854 |
- uint8_t *tempBlock1= c.tempBlocks; |
|
2855 |
- uint8_t *tempBlock2= c.tempBlocks + 8; |
|
2856 |
-#endif |
|
2857 |
- int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
|
2858 |
- int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*mbWidth]; |
|
2859 |
- int QP=0; |
|
2860 |
- /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
|
2861 |
- if not than use a temporary buffer */ |
|
2862 |
- if(y+15 >= height) |
|
2863 |
- { |
|
2864 |
- int i; |
|
2865 |
- /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
|
2866 |
- blockcopy to dst later */ |
|
2867 |
- memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
|
2868 |
- srcStride*MAX(height-y-copyAhead, 0) ); |
|
2869 |
- |
|
2870 |
- /* duplicate last line of src to fill the void upto line (copyAhead+7) */ |
|
2871 |
- for(i=MAX(height-y, 8); i<copyAhead+8; i++) |
|
2872 |
- memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
|
2873 |
- |
|
2874 |
- /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
|
2875 |
- memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); |
|
2876 |
- |
|
2877 |
- /* duplicate last line of dst to fill the void upto line (copyAhead) */ |
|
2878 |
- for(i=height-y+1; i<=copyAhead; i++) |
|
2879 |
- memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
|
2880 |
- |
|
2881 |
- dstBlock= tempDst + dstStride; |
|
2882 |
- srcBlock= tempSrc; |
|
2883 |
- } |
|
2884 |
-//printf("\n"); |
|
2885 |
- |
|
2886 |
- // From this point on it is guranteed that we can read and write 16 lines downward |
|
2887 | ||
2888 |
- // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
|
2889 |
- for(x=0; x<width; x+=BLOCK_SIZE) |
|
2890 |
- { |
|
2891 |
- const int stride= dstStride; |
|
2892 |
-#ifdef HAVE_MMX |
|
2893 |
- uint8_t *tmpXchg; |
|
2894 |
-#endif |
|
2895 |
- if(isColor) |
|
2896 |
- { |
|
2897 |
- QP= QPptr[x>>qpHShift]; |
|
2898 |
- c.nonBQP= nonBQPptr[x>>qpHShift]; |
|
2899 |
- } |
|
2900 |
- else |
|
2901 |
- { |
|
2902 |
- QP= QPptr[x>>4]; |
|
2903 |
- QP= (QP* QPCorrecture + 256*128)>>16; |
|
2904 |
- c.nonBQP= nonBQPptr[x>>4]; |
|
2905 |
- c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; |
|
2906 |
- yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
|
2907 |
- } |
|
2908 |
- c.QP= QP; |
|
2909 |
-#ifdef HAVE_MMX |
|
2910 |
- asm volatile( |
|
2911 |
- "movd %1, %%mm7 \n\t" |
|
2912 |
- "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
|
2913 |
- "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
|
2914 |
- "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
|
2915 |
- "movq %%mm7, %0 \n\t" |
|
2916 |
- : "=m" (c.pQPb) |
|
2917 |
- : "r" (QP) |
|
2918 |
- ); |
|
2919 |
-#endif |
|
2920 |
- |
|
2921 |
- |
|
2922 |
-#ifdef HAVE_MMX2 |
|
2923 |
-/* |
|
2924 |
- prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
|
2925 |
- prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
|
2926 |
- prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
|
2927 |
- prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
|
2928 |
-*/ |
|
2929 |
- |
|
2930 |
- asm( |
|
2931 |
- "movl %4, %%eax \n\t" |
|
2932 |
- "shrl $2, %%eax \n\t" |
|
2933 |
- "andl $6, %%eax \n\t" |
|
2934 |
- "addl %5, %%eax \n\t" |
|
2935 |
- "movl %%eax, %%edx \n\t" |
|
2936 |
- "imul %1, %%eax \n\t" |
|
2937 |
- "imul %3, %%edx \n\t" |
|
2938 |
- "prefetchnta 32(%%eax, %0) \n\t" |
|
2939 |
- "prefetcht0 32(%%edx, %2) \n\t" |
|
2940 |
- "addl %1, %%eax \n\t" |
|
2941 |
- "addl %3, %%edx \n\t" |
|
2942 |
- "prefetchnta 32(%%eax, %0) \n\t" |
|
2943 |
- "prefetcht0 32(%%edx, %2) \n\t" |
|
2944 |
- :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
|
2945 |
- "m" (x), "m" (copyAhead) |
|
2946 |
- : "%eax", "%edx" |
|
2947 |
- ); |
|
2948 |
- |
|
2949 |
-#elif defined(HAVE_3DNOW) |
|
2950 |
-//FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
|
2951 |
-/* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
|
2952 |
- prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
|
2953 |
- prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
|
2954 |
- prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
|
2955 |
-*/ |
|
2956 |
-#endif |
|
2957 |
- |
|
2958 |
- RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
|
2959 |
- srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
2960 |
- |
|
2961 |
- if(mode & LINEAR_IPOL_DEINT_FILTER) |
|
2962 |
- RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
|
2963 |
- else if(mode & LINEAR_BLEND_DEINT_FILTER) |
|
2964 |
- RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
|
2965 |
- else if(mode & MEDIAN_DEINT_FILTER) |
|
2966 |
- RENAME(deInterlaceMedian)(dstBlock, dstStride); |
|
2967 |
- else if(mode & CUBIC_IPOL_DEINT_FILTER) |
|
2968 |
- RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
|
2969 |
- else if(mode & FFMPEG_DEINT_FILTER) |
|
2970 |
- RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); |
|
2971 |
-/* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
|
2972 |
- RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
2973 |
-*/ |
|
2974 |
- |
|
2975 |
- /* only deblock if we have 2 blocks */ |
|
2976 |
- if(y + 8 < height) |
|
2977 |
- { |
|
2978 |
- if(mode & V_X1_FILTER) |
|
2979 |
- RENAME(vertX1Filter)(dstBlock, stride, &c); |
|
2980 |
- else if(mode & V_DEBLOCK) |
|
2981 |
- { |
|
2982 |
- if( RENAME(isVertDC)(dstBlock, stride, &c)) |
|
2983 |
- { |
|
2984 |
- if(RENAME(isVertMinMaxOk)(dstBlock, stride, &c)) |
|
2985 |
- RENAME(doVertLowPass)(dstBlock, stride, &c); |
|
2986 |
- } |
|
2987 |
- else |
|
2988 |
- RENAME(doVertDefFilter)(dstBlock, stride, &c); |
|
2989 |
- } |
|
2990 |
- } |
|
2991 |
- |
|
2992 |
-#ifdef HAVE_MMX |
|
2993 |
- RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
|
2994 |
-#endif |
|
2995 |
- /* check if we have a previous block to deblock it with dstBlock */ |
|
2996 |
- if(x - 8 >= 0) |
|
2997 |
- { |
|
2998 |
-#ifdef HAVE_MMX |
|
2999 |
- if(mode & H_X1_FILTER) |
|
3000 |
- RENAME(vertX1Filter)(tempBlock1, 16, &c); |
|
3001 |
- else if(mode & H_DEBLOCK) |
|
3002 |
- { |
|
3003 |
- if( RENAME(isVertDC)(tempBlock1, 16, &c)) |
|
3004 |
- { |
|
3005 |
- if(RENAME(isVertMinMaxOk)(tempBlock1, 16, &c)) |
|
3006 |
- RENAME(doVertLowPass)(tempBlock1, 16, &c); |
|
3007 |
- } |
|
3008 |
- else |
|
3009 |
- RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
|
3010 |
- } |
|
3011 |
- |
|
3012 |
- RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
|
3013 |
- |
|
3014 |
-#else |
|
3015 |
- if(mode & H_X1_FILTER) |
|
3016 |
- horizX1Filter(dstBlock-4, stride, QP); |
|
3017 |
- else if(mode & H_DEBLOCK) |
|
3018 |
- { |
|
3019 |
- if( isHorizDC(dstBlock-4, stride, &c)) |
|
3020 |
- { |
|
3021 |
- if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
|
3022 |
- doHorizLowPass(dstBlock-4, stride, QP); |
|
3023 |
- } |
|
3024 |
- else |
|
3025 |
- doHorizDefFilter(dstBlock-4, stride, QP); |
|
3026 |
- } |
|
3027 |
-#endif |
|
3028 |
- if(mode & DERING) |
|
3029 |
- { |
|
3030 |
- //FIXME filter first line |
|
3031 |
- if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
|
3032 |
- } |
|
3033 |
- |
|
3034 |
- if(mode & TEMP_NOISE_FILTER) |
|
3035 |
- { |
|
3036 |
- RENAME(tempNoiseReducer)(dstBlock-8, stride, |
|
3037 |
- c.tempBlured[isColor] + y*dstStride + x, |
|
3038 |
- c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
|
3039 |
- c.ppMode.maxTmpNoise); |
|
3040 |
- } |
|
3041 |
- } |
|
3042 |
- |
|
3043 |
- dstBlock+=8; |
|
3044 |
- srcBlock+=8; |
|
3045 |
- |
|
3046 |
-#ifdef HAVE_MMX |
|
3047 |
- tmpXchg= tempBlock1; |
|
3048 |
- tempBlock1= tempBlock2; |
|
3049 |
- tempBlock2 = tmpXchg; |
|
3050 |
-#endif |
|
3051 |
- } |
|
3052 |
- |
|
3053 |
- if(mode & DERING) |
|
3054 |
- { |
|
3055 |
- if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
|
3056 |
- } |
|
3057 |
- |
|
3058 |
- if((mode & TEMP_NOISE_FILTER)) |
|
3059 |
- { |
|
3060 |
- RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
|
3061 |
- c.tempBlured[isColor] + y*dstStride + x, |
|
3062 |
- c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
|
3063 |
- c.ppMode.maxTmpNoise); |
|
3064 |
- } |
|
3065 |
- |
|
3066 |
- /* did we use a tmp buffer for the last lines*/ |
|
3067 |
- if(y+15 >= height) |
|
3068 |
- { |
|
3069 |
- uint8_t *dstBlock= &(dst[y*dstStride]); |
|
3070 |
- if(width==dstStride) |
|
3071 |
- memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y)); |
|
3072 |
- else |
|
3073 |
- { |
|
3074 |
- int i; |
|
3075 |
- for(i=0; i<height-y; i++) |
|
3076 |
- { |
|
3077 |
- memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); |
|
3078 |
- } |
|
3079 |
- } |
|
3080 |
- } |
|
3081 |
-/* |
|
3082 |
- for(x=0; x<width; x+=32) |
|
3083 |
- { |
|
3084 |
- volatile int i; |
|
3085 |
- i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
|
3086 |
- + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] |
|
3087 |
- + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
|
3088 |
-// + dstBlock[x +13*dstStride] |
|
3089 |
-// + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
|
3090 |
- }*/ |
|
3091 |
- } |
|
3092 |
-#ifdef HAVE_3DNOW |
|
3093 |
- asm volatile("femms"); |
|
3094 |
-#elif defined (HAVE_MMX) |
|
3095 |
- asm volatile("emms"); |
|
3096 |
-#endif |
|
3097 |
- |
|
3098 |
-#ifdef DEBUG_BRIGHTNESS |
|
3099 |
- if(!isColor) |
|
3100 |
- { |
|
3101 |
- int max=1; |
|
3102 |
- int i; |
|
3103 |
- for(i=0; i<256; i++) |
|
3104 |
- if(yHistogram[i] > max) max=yHistogram[i]; |
|
3105 |
- |
|
3106 |
- for(i=1; i<256; i++) |
|
3107 |
- { |
|
3108 |
- int x; |
|
3109 |
- int start=yHistogram[i-1]/(max/256+1); |
|
3110 |
- int end=yHistogram[i]/(max/256+1); |
|
3111 |
- int inc= end > start ? 1 : -1; |
|
3112 |
- for(x=start; x!=end+inc; x+=inc) |
|
3113 |
- dst[ i*dstStride + x]+=128; |
|
3114 |
- } |
|
3115 |
- |
|
3116 |
- for(i=0; i<100; i+=2) |
|
3117 |
- { |
|
3118 |
- dst[ (white)*dstStride + i]+=128; |
|
3119 |
- dst[ (black)*dstStride + i]+=128; |
|
3120 |
- } |
|
3121 |
- |
|
3122 |
- } |
|
3123 |
-#endif |
|
3124 |
- |
|
3125 |
- *c2= c; //copy local context back |
|
3126 |
- |
|
3127 |
-} |