Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>
... | ... |
@@ -151,11 +151,11 @@ static void store_slice2_c(uint8_t *dst, int16_t *src, |
151 | 151 |
} |
152 | 152 |
} |
153 | 153 |
|
154 |
-static void mul_thrmat_c(FSPPContext *p, int q) |
|
154 |
+static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q) |
|
155 | 155 |
{ |
156 | 156 |
int a; |
157 | 157 |
for (a = 0; a < 64; a++) |
158 |
- ((int16_t *)p->threshold_mtx)[a] = q * ((int16_t *)p->threshold_mtx_noq)[a];//ints faster in C |
|
158 |
+ thr_adr[a] = q * thr_adr_noq[a]; |
|
159 | 159 |
} |
160 | 160 |
|
161 | 161 |
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, |
... | ... |
@@ -220,7 +220,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src, |
220 | 220 |
t = qp_store[qy + (t >> qpsh)]; |
221 | 221 |
t = norm_qscale(t, p->qscale_type); |
222 | 222 |
|
223 |
- if (t != p->prev_q) p->prev_q = t, p->mul_thrmat(p, t); |
|
223 |
+ if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t); |
|
224 | 224 |
p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT |
225 | 225 |
} |
226 | 226 |
p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1)); |
... | ... |
@@ -378,7 +378,7 @@ static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int |
378 | 378 |
} |
379 | 379 |
} |
380 | 380 |
|
381 |
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt) |
|
381 |
+static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt) |
|
382 | 382 |
{ |
383 | 383 |
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
384 | 384 |
int_simd16_t tmp10, tmp11, tmp12, tmp13; |
... | ... |
@@ -440,7 +440,7 @@ static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_strid |
440 | 440 |
} |
441 | 441 |
} |
442 | 442 |
|
443 |
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt) |
|
443 |
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt) |
|
444 | 444 |
{ |
445 | 445 |
int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
446 | 446 |
int_simd16_t tmp10, tmp11, tmp12, tmp13; |
... | ... |
@@ -582,7 +582,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) |
582 | 582 |
} |
583 | 583 |
|
584 | 584 |
if (fspp->qp) |
585 |
- fspp->prev_q = fspp->qp, fspp->mul_thrmat(fspp, fspp->qp); |
|
585 |
+ fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp); |
|
586 | 586 |
|
587 | 587 |
/* if we are not in a constant user quantizer mode and we don't want to use |
588 | 588 |
* the quantizers from the B-frames (B-frames often have a higher QP), we |
... | ... |
@@ -79,16 +79,16 @@ typedef struct FSPPContext { |
79 | 79 |
ptrdiff_t dst_stride, ptrdiff_t src_stride, |
80 | 80 |
ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); |
81 | 81 |
|
82 |
- void (*mul_thrmat)(struct FSPPContext *fspp, int q); |
|
82 |
+ void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q); |
|
83 | 83 |
|
84 | 84 |
void (*column_fidct)(int16_t *thr_adr, int16_t *data, |
85 | 85 |
int16_t *output, int cnt); |
86 | 86 |
|
87 | 87 |
void (*row_idct)(int16_t *workspace, int16_t *output_adr, |
88 |
- int output_stride, int cnt); |
|
88 |
+ ptrdiff_t output_stride, int cnt); |
|
89 | 89 |
|
90 | 90 |
void (*row_fdct)(int16_t *data, const uint8_t *pixels, |
91 |
- int line_size, int cnt); |
|
91 |
+ ptrdiff_t line_size, int cnt); |
|
92 | 92 |
|
93 | 93 |
} FSPPContext; |
94 | 94 |
|
... | ... |
@@ -1,4 +1,4 @@ |
1 |
-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o |
|
1 |
+OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o |
|
2 | 2 |
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o |
3 | 3 |
OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o |
4 | 4 |
OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o |
... | ... |
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o |
10 | 10 |
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o |
11 | 11 |
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o |
12 | 12 |
|
13 |
+YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o |
|
13 | 14 |
YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o |
14 | 15 |
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o |
15 | 16 |
YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o |
16 | 17 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,727 @@ |
0 |
+;***************************************************************************** |
|
1 |
+;* x86-optimized functions for fspp filter |
|
2 |
+;* |
|
3 |
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
|
4 |
+;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru> |
|
5 |
+;* |
|
6 |
+;* This file is part of FFmpeg. |
|
7 |
+;* |
|
8 |
+;* FFmpeg is free software; you can redistribute it and/or modify |
|
9 |
+;* it under the terms of the GNU General Public License as published by |
|
10 |
+;* the Free Software Foundation; either version 2 of the License, or |
|
11 |
+;* (at your option) any later version. |
|
12 |
+;* |
|
13 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
14 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
16 |
+;* GNU General Public License for more details. |
|
17 |
+;* |
|
18 |
+;* You should have received a copy of the GNU General Public License along |
|
19 |
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc., |
|
20 |
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|
21 |
+;****************************************************************************** |
|
22 |
+ |
|
23 |
+%include "libavutil/x86/x86util.asm" |
|
24 |
+ |
|
25 |
+SECTION_RODATA |
|
26 |
+ |
|
27 |
+pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \ |
|
28 |
+ 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \ |
|
29 |
+ 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \ |
|
30 |
+ 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21 ; 64 dither bytes; store_slice/store_slice2 read one 8-byte row per output line |
|
31 |
+pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) |
|
32 |
+pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) |
|
33 |
+pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) |
|
34 |
+pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) |
|
35 |
+pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) |
|
36 |
+pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) |
|
37 |
+pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) |
|
38 |
+pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14) |
|
39 |
+pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14) |
|
40 |
+pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14) |
|
41 |
+pw_4: times 4 dw 4 ; rounding bias added before the >>3 in row_idct |
|
42 |
+pw_2: times 4 dw 2 ; rounding bias added before the >>2 in COLUMN_FDCT |
|
43 |
+ |
|
44 |
+SECTION .text |
|
45 |
+ |
|
46 |
+%define DCTSIZE 8 |
|
47 |
+ |
|
48 |
+INIT_MMX mmx |
|
49 |
+ |
|
50 |
+;void ff_store_slice_mmx(uint8_t *dst, int16_t *src, |
|
51 |
+; ptrdiff_t dst_stride, ptrdiff_t src_stride, |
|
52 |
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) |
|
53 |
+%if ARCH_X86_64 |
|
54 |
+cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 ; dst = packuswb((src + (dither >> log2_scale)) >> (6 - log2_scale)); clears the src rows it touches |
|
55 |
+%else |
|
56 |
+cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 ; x86_32: only dst/src are loaded here, the strides are used from their stack slots |
|
57 |
+%define dst_strideq r2m |
|
58 |
+%define src_strideq r3m |
|
59 |
+ mov widthq, r4m |
|
60 |
+ mov dither_heightq, r5m |
|
61 |
+ mov ditherq, r6m ; log2_scale |
|
62 |
+%endif |
|
63 |
+ add widthq, 7 ; together with the 'and' below: round width up to a multiple of 8 |
|
64 |
+ mov tmpq, src_strideq |
|
65 |
+ and widthq, ~7 |
|
66 |
+ sub dst_strideq, widthq ; dst_stride -= width: dst advance left over after the inner loop |
|
67 |
+ movd m5, ditherq ; m5 = log2_scale, the shift applied to the dither values |
|
68 |
+ xor ditherq, -1 ; ~log2_scale |
|
69 |
+ mov tmp2q, tmpq |
|
70 |
+ add ditherq, 7 ; ~log2_scale + 7 = 6 - log2_scale |
|
71 |
+ neg tmpq |
|
72 |
+ sub tmp2q, widthq |
|
73 |
+ movd m2, ditherq ; m2 = 6 - log2_scale, the final right shift |
|
74 |
+ add tmp2q, tmp2q ; tmp2 = 2*(src_stride - width): src advance in bytes left over after the inner loop |
|
75 |
+ lea ditherq, [pb_dither] ; the dither register is reused as the dither-table row pointer |
|
76 |
+ mov src_strideq, tmp2q |
|
77 |
+ shl tmpq, 4 ; tmp = -16*src_stride bytes, i.e. 8 int16 rows before src |
|
78 |
+ lea dither_heightq, [ditherq+dither_heightq*8] ; end pointer = pb_dither + height*8; NOTE(review): the table has 8 rows, so height is presumably <= 8 - confirm against callers |
|
79 |
+ |
|
80 |
+.loop_height: |
|
81 |
+ movq m3, [ditherq] ; 8 dither bytes for this output row |
|
82 |
+ movq m4, m3 |
|
83 |
+ pxor m7, m7 ; m7 = 0: used for byte->word unpacking and for clearing src |
|
84 |
+ punpcklbw m3, m7 |
|
85 |
+ punpckhbw m4, m7 |
|
86 |
+ mov tmp2q, widthq ; inner-loop counter |
|
87 |
+ psraw m3, m5 |
|
88 |
+ psraw m4, m5 |
|
89 |
+ |
|
90 |
+.loop_width: |
|
91 |
+ movq [srcq+tmpq], m7 ; clear the row 8 lines before the current one |
|
92 |
+ movq m0, [srcq] |
|
93 |
+ movq m1, [srcq+8] |
|
94 |
+ movq [srcq+tmpq+8], m7 |
|
95 |
+ paddw m0, m3 |
|
96 |
+ paddw m1, m4 |
|
97 |
+ movq [srcq], m7 ; clear the values that were just read |
|
98 |
+ psraw m0, m2 |
|
99 |
+ psraw m1, m2 |
|
100 |
+ movq [srcq+8], m7 |
|
101 |
+ packuswb m0, m1 ; 8 words -> 8 bytes with unsigned saturation |
|
102 |
+ add srcq, 16 |
|
103 |
+ movq [dstq], m0 |
|
104 |
+ add dstq, 8 |
|
105 |
+ sub tmp2q, 8 |
|
106 |
+ jg .loop_width |
|
107 |
+ |
|
108 |
+ add srcq, src_strideq |
|
109 |
+ add ditherq, 8 ; next dither row |
|
110 |
+ add dstq, dst_strideq |
|
111 |
+ cmp ditherq, dither_heightq |
|
112 |
+ jl .loop_height |
|
113 |
+ RET |
|
114 |
+ |
|
115 |
+;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, |
|
116 |
+; ptrdiff_t dst_stride, ptrdiff_t src_stride, |
|
117 |
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) |
|
118 |
+%if ARCH_X86_64 |
|
119 |
+cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 ; dst = packuswb((src + src[16 rows further] + (dither >> log2_scale)) >> (6 - log2_scale)); clears only the further row |
|
120 |
+%else |
|
121 |
+cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 ; x86_32: no arguments are auto-loaded, all are fetched from the stack below |
|
122 |
+%define dst_strideq r2m |
|
123 |
+%define src_strideq r3m |
|
124 |
+ mov dstq, dstm |
|
125 |
+ mov srcq, srcm |
|
126 |
+ mov widthq, r4m |
|
127 |
+ mov dither_heightq, r5m |
|
128 |
+ mov ditherq, r6m ; log2_scale |
|
129 |
+%endif |
|
130 |
+ add widthq, 7 ; together with the 'and' below: round width up to a multiple of 8 |
|
131 |
+ mov tmpq, src_strideq |
|
132 |
+ and widthq, ~7 |
|
133 |
+ sub dst_strideq, widthq ; dst_stride -= width: dst advance left over after the inner loop |
|
134 |
+ movd m5, ditherq ; m5 = log2_scale, the shift applied to the dither values |
|
135 |
+ xor ditherq, -1 ; ~log2_scale |
|
136 |
+ mov tmp2q, tmpq |
|
137 |
+ add ditherq, 7 ; ~log2_scale + 7 = 6 - log2_scale |
|
138 |
+ sub tmp2q, widthq |
|
139 |
+ movd m2, ditherq ; m2 = 6 - log2_scale, the final right shift |
|
140 |
+ add tmp2q, tmp2q ; tmp2 = 2*(src_stride - width): src advance in bytes left over after the inner loop |
|
141 |
+ lea ditherq, [pb_dither] ; the dither register is reused as the dither-table row pointer |
|
142 |
+ mov src_strideq, tmp2q |
|
143 |
+ shl tmpq, 5 ; tmp = 32*src_stride bytes, i.e. 16 int16 rows after src |
|
144 |
+ lea dither_heightq, [ditherq+dither_heightq*8] ; end pointer = pb_dither + height*8; NOTE(review): the table has 8 rows, so height is presumably <= 8 - confirm against callers |
|
145 |
+ |
|
146 |
+.loop_height: |
|
147 |
+ movq m3, [ditherq] ; 8 dither bytes for this output row |
|
148 |
+ movq m4, m3 |
|
149 |
+ pxor m7, m7 ; m7 = 0: used for byte->word unpacking and for clearing src |
|
150 |
+ punpcklbw m3, m7 |
|
151 |
+ punpckhbw m4, m7 |
|
152 |
+ mov tmp2q,widthq ; inner-loop counter |
|
153 |
+ psraw m3, m5 |
|
154 |
+ psraw m4, m5 |
|
155 |
+ |
|
156 |
+.loop_width: |
|
157 |
+ movq m0, [srcq] |
|
158 |
+ movq m1, [srcq+8] |
|
159 |
+ paddw m0, m3 |
|
160 |
+ paddw m0, [srcq+tmpq] ; add the row 16 lines further down |
|
161 |
+ paddw m1, m4 |
|
162 |
+ movq m6, [srcq+tmpq+8] |
|
163 |
+ movq [srcq+tmpq], m7 ; clear that further row; the current row is left untouched |
|
164 |
+ psraw m0, m2 |
|
165 |
+ paddw m1, m6 |
|
166 |
+ movq [srcq+tmpq+8], m7 |
|
167 |
+ psraw m1, m2 |
|
168 |
+ packuswb m0, m1 ; 8 words -> 8 bytes with unsigned saturation |
|
169 |
+ movq [dstq], m0 |
|
170 |
+ add srcq, 16 |
|
171 |
+ add dstq, 8 |
|
172 |
+ sub tmp2q, 8 |
|
173 |
+ jg .loop_width |
|
174 |
+ |
|
175 |
+ add srcq, src_strideq |
|
176 |
+ add ditherq, 8 ; next dither row |
|
177 |
+ add dstq, dst_strideq |
|
178 |
+ cmp ditherq, dither_heightq |
|
179 |
+ jl .loop_height |
|
180 |
+ RET |
|
181 |
+ |
|
182 |
+;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); |
|
183 |
+cglobal mul_thrmat, 3, 3, 0, thrn, thr, q ; thr[i] = low 16 bits of (q * thrn[i]) for 64 int16 values, fully unrolled |
|
184 |
+ movd m7, qd ; m7 = q |
|
185 |
+ movq m0, [thrnq] |
|
186 |
+ punpcklwd m7, m7 ; with the punpckldq below: broadcast the low word of q to all four words of m7 |
|
187 |
+ movq m1, [thrnq+8] |
|
188 |
+ punpckldq m7, m7 |
|
189 |
+ pmullw m0, m7 ; keeps the low 16 bits of each product |
|
190 |
+ movq m2, [thrnq+8*2] |
|
191 |
+ pmullw m1, m7 |
|
192 |
+ movq m3, [thrnq+8*3] |
|
193 |
+ pmullw m2, m7 |
|
194 |
+ movq [thrq], m0 |
|
195 |
+ movq m4, [thrnq+8*4] |
|
196 |
+ pmullw m3, m7 |
|
197 |
+ movq [thrq+8], m1 |
|
198 |
+ movq m5, [thrnq+8*5] |
|
199 |
+ pmullw m4, m7 |
|
200 |
+ movq [thrq+8*2], m2 |
|
201 |
+ movq m6, [thrnq+8*6] |
|
202 |
+ pmullw m5, m7 |
|
203 |
+ movq [thrq+8*3], m3 |
|
204 |
+ movq m0, [thrnq+8*7] |
|
205 |
+ pmullw m6, m7 |
|
206 |
+ movq [thrq+8*4], m4 |
|
207 |
+ movq m1, [thrnq+8*7+8] |
|
208 |
+ pmullw m0, m7 |
|
209 |
+ movq [thrq+8*5], m5 |
|
210 |
+ movq m2, [thrnq+8*7+8*2] |
|
211 |
+ pmullw m1, m7 |
|
212 |
+ movq [thrq+8*6], m6 |
|
213 |
+ movq m3, [thrnq+8*7+8*3] |
|
214 |
+ pmullw m2, m7 |
|
215 |
+ movq [thrq+8*7], m0 |
|
216 |
+ movq m4, [thrnq+8*7+8*4] |
|
217 |
+ pmullw m3, m7 |
|
218 |
+ movq [thrq+8*7+8], m1 |
|
219 |
+ movq m5, [thrnq+8*7+8*5] |
|
220 |
+ pmullw m4, m7 |
|
221 |
+ movq [thrq+8*7+8*2], m2 |
|
222 |
+ movq m6, [thrnq+8*7+8*6] |
|
223 |
+ pmullw m5, m7 |
|
224 |
+ movq [thrq+8*7+8*3], m3 |
|
225 |
+ movq m0, [thrnq+14*8] |
|
226 |
+ pmullw m6, m7 |
|
227 |
+ movq [thrq+8*7+8*4], m4 |
|
228 |
+ movq m1, [thrnq+14*8+8] |
|
229 |
+ pmullw m0, m7 |
|
230 |
+ movq [thrq+8*7+8*5], m5 |
|
231 |
+ pmullw m1, m7 |
|
232 |
+ movq [thrq+8*7+8*6], m6 |
|
233 |
+ movq [thrq+14*8], m0 |
|
234 |
+ movq [thrq+14*8+8], m1 ; 16th and last qword (bytes 120..127) |
|
235 |
+ RET |
|
236 |
+ |
|
237 |
+%macro COLUMN_FDCT 1-3 0, 0 ; %1 = label to jump to when the full COLUMN_IDCT is needed, %2 = byte offset added to every thr access, %3 = extra bytes added to the src/out advance |
|
238 |
+ movq m1, [srcq+DCTSIZE*0*2] ; each load takes 4 int16 columns of one row; rows are DCTSIZE*2 bytes apart |
|
239 |
+ movq m7, [srcq+DCTSIZE*3*2] |
|
240 |
+ movq m0, m1 |
|
241 |
+ paddw m1, [srcq+DCTSIZE*7*2] |
|
242 |
+ movq m3, m7 |
|
243 |
+ paddw m7, [srcq+DCTSIZE*4*2] |
|
244 |
+ movq m5, m1 |
|
245 |
+ movq m6, [srcq+DCTSIZE*1*2] |
|
246 |
+ psubw m1, m7 |
|
247 |
+ movq m2, [srcq+DCTSIZE*2*2] |
|
248 |
+ movq m4, m6 |
|
249 |
+ paddw m6, [srcq+DCTSIZE*6*2] |
|
250 |
+ paddw m5, m7 |
|
251 |
+ paddw m2, [srcq+DCTSIZE*5*2] |
|
252 |
+ movq m7, m6 |
|
253 |
+ paddw m6, m2 |
|
254 |
+ psubw m7, m2 |
|
255 |
+ movq m2, m5 |
|
256 |
+ paddw m5, m6 |
|
257 |
+ psubw m2, m6 |
|
258 |
+ paddw m7, m1 |
|
259 |
+ movq m6, [thrq+4*16+%2] |
|
260 |
+ psllw m7, 2 |
|
261 |
+ psubw m5, [thrq+%2] ; psubw/paddusw/paddw/psubusw against the same thr entry: presumably the threshold step - confirm against column_fidct_c |
|
262 |
+ psubw m2, m6 |
|
263 |
+ paddusw m5, [thrq+%2] |
|
264 |
+ paddusw m2, m6 |
|
265 |
+ pmulhw m7, [pw_2D41] |
|
266 |
+ paddw m5, [thrq+%2] |
|
267 |
+ paddw m2, m6 |
|
268 |
+ psubusw m5, [thrq+%2] |
|
269 |
+ psubusw m2, m6 |
|
270 |
+ paddw m5, [pw_2] ; rounding bias for the >>2 shifts below |
|
271 |
+ movq m6, m2 |
|
272 |
+ paddw m2, m5 |
|
273 |
+ psubw m5, m6 |
|
274 |
+ movq m6, m1 |
|
275 |
+ paddw m1, m7 |
|
276 |
+ psubw m1, [thrq+2*16+%2] |
|
277 |
+ psubw m6, m7 |
|
278 |
+ movq m7, [thrq+6*16+%2] |
|
279 |
+ psraw m5, 2 |
|
280 |
+ paddusw m1, [thrq+2*16+%2] |
|
281 |
+ psubw m6, m7 |
|
282 |
+ paddw m1, [thrq+2*16+%2] |
|
283 |
+ paddusw m6, m7 |
|
284 |
+ psubusw m1, [thrq+2*16+%2] |
|
285 |
+ paddw m6, m7 |
|
286 |
+ psubw m3, [srcq+DCTSIZE*4*2] |
|
287 |
+ psubusw m6, m7 |
|
288 |
+ movq m7, m1 |
|
289 |
+ psraw m2, 2 |
|
290 |
+ psubw m4, [srcq+DCTSIZE*6*2] |
|
291 |
+ psubw m1, m6 |
|
292 |
+ psubw m0, [srcq+DCTSIZE*7*2] |
|
293 |
+ paddw m6, m7 |
|
294 |
+ psraw m6, 2 |
|
295 |
+ movq m7, m2 |
|
296 |
+ pmulhw m1, [pw_5A82] |
|
297 |
+ paddw m2, m6 |
|
298 |
+ movq [rsp], m2 ; scratch slot 0; the four slots are read back by the shortcut path below and by COLUMN_IDCT |
|
299 |
+ psubw m7, m6 |
|
300 |
+ movq m2, [srcq+DCTSIZE*2*2] |
|
301 |
+ psubw m1, m6 |
|
302 |
+ psubw m2, [srcq+DCTSIZE*5*2] |
|
303 |
+ movq m6, m5 |
|
304 |
+ movq [rsp+8*3], m7 ; scratch slot 3 |
|
305 |
+ paddw m3, m2 |
|
306 |
+ paddw m2, m4 |
|
307 |
+ paddw m4, m0 |
|
308 |
+ movq m7, m3 |
|
309 |
+ psubw m3, m4 |
|
310 |
+ psllw m3, 2 |
|
311 |
+ psllw m7, 2 |
|
312 |
+ pmulhw m3, [pw_187E] |
|
313 |
+ psllw m4, 2 |
|
314 |
+ pmulhw m7, [pw_22A3] |
|
315 |
+ psllw m2, 2 |
|
316 |
+ pmulhw m4, [pw_539F] |
|
317 |
+ paddw m5, m1 |
|
318 |
+ pmulhw m2, [pw_2D41] |
|
319 |
+ psubw m6, m1 |
|
320 |
+ paddw m7, m3 |
|
321 |
+ movq [rsp+8], m5 ; scratch slot 1 |
|
322 |
+ paddw m4, m3 |
|
323 |
+ movq m3, [thrq+3*16+%2] |
|
324 |
+ movq m1, m0 |
|
325 |
+ movq [rsp+8*2], m6 ; scratch slot 2 |
|
326 |
+ psubw m1, m2 |
|
327 |
+ paddw m0, m2 |
|
328 |
+ movq m5, m1 |
|
329 |
+ movq m2, [thrq+5*16+%2] |
|
330 |
+ psubw m1, m7 |
|
331 |
+ paddw m5, m7 |
|
332 |
+ psubw m1, m3 |
|
333 |
+ movq m7, [thrq+16+%2] |
|
334 |
+ psubw m5, m2 |
|
335 |
+ movq m6, m0 |
|
336 |
+ paddw m0, m4 |
|
337 |
+ paddusw m1, m3 |
|
338 |
+ psubw m6, m4 |
|
339 |
+ movq m4, [thrq+7*16+%2] |
|
340 |
+ psubw m0, m7 |
|
341 |
+ psubw m6, m4 |
|
342 |
+ paddusw m5, m2 |
|
343 |
+ paddusw m6, m4 |
|
344 |
+ paddw m1, m3 |
|
345 |
+ paddw m5, m2 |
|
346 |
+ paddw m6, m4 |
|
347 |
+ psubusw m1, m3 |
|
348 |
+ psubusw m5, m2 |
|
349 |
+ psubusw m6, m4 |
|
350 |
+ movq m4, m1 ; m4 = m1 | m5 | m6: the values processed against thr entries 3, 5 and 7 |
|
351 |
+ por m4, m5 |
|
352 |
+ paddusw m0, m7 |
|
353 |
+ por m4, m6 |
|
354 |
+ paddw m0, m7 |
|
355 |
+ packssdw m4, m4 ; squeeze the 64-bit OR into 32 bits; a non-zero dword stays non-zero after saturation |
|
356 |
+ psubusw m0, m7 |
|
357 |
+ movd tmpd, m4 |
|
358 |
+ or tmpd, tmpd |
|
359 |
+ jnz %1 ; any non-zero word in m1/m5/m6: leave the macro for the full inverse transform |
|
360 |
+ movq m4, [rsp] ; shortcut path: uses only m0 and the four scratch slots |
|
361 |
+ movq m1, m0 |
|
362 |
+ pmulhw m0, [pw_3642] |
|
363 |
+ movq m2, m1 |
|
364 |
+ movq m5, [outq+DCTSIZE*0*2] |
|
365 |
+ movq m3, m2 |
|
366 |
+ pmulhw m1, [pw_2441] |
|
367 |
+ paddw m5, m4 |
|
368 |
+ movq m6, [rsp+8] |
|
369 |
+ psraw m3, 2 |
|
370 |
+ pmulhw m2, [pw_0CBB] |
|
371 |
+ psubw m4, m3 |
|
372 |
+ movq m7, [outq+DCTSIZE*1*2] |
|
373 |
+ paddw m5, m3 |
|
374 |
+ movq [outq+DCTSIZE*7*2], m4 ; out rows 7 and 6 are overwritten; rows 0-5 are added to what out already holds |
|
375 |
+ paddw m7, m6 |
|
376 |
+ movq m3, [rsp+8*2] |
|
377 |
+ psubw m6, m0 |
|
378 |
+ movq m4, [outq+DCTSIZE*2*2] |
|
379 |
+ paddw m7, m0 |
|
380 |
+ movq [outq], m5 |
|
381 |
+ paddw m4, m3 |
|
382 |
+ movq [outq+DCTSIZE*6*2], m6 |
|
383 |
+ psubw m3, m1 |
|
384 |
+ movq m5, [outq+DCTSIZE*5*2] |
|
385 |
+ paddw m4, m1 |
|
386 |
+ movq m6, [outq+DCTSIZE*3*2] |
|
387 |
+ paddw m5, m3 |
|
388 |
+ movq m0, [rsp+8*3] |
|
389 |
+ add srcq, 8+%3 ; advance to the next group of 4 columns (plus %3 extra bytes) |
|
390 |
+ movq [outq+DCTSIZE*1*2], m7 |
|
391 |
+ paddw m6, m0 |
|
392 |
+ movq [outq+DCTSIZE*2*2], m4 |
|
393 |
+ psubw m0, m2 |
|
394 |
+ movq m7, [outq+DCTSIZE*4*2] |
|
395 |
+ paddw m6, m2 |
|
396 |
+ movq [outq+DCTSIZE*5*2], m5 |
|
397 |
+ paddw m7, m0 |
|
398 |
+ movq [outq+DCTSIZE*3*2], m6 |
|
399 |
+ movq [outq+DCTSIZE*4*2], m7 |
|
400 |
+ add outq, 8+%3 |
|
401 |
+%endmacro |
|
402 |
+ |
|
403 |
+%macro COLUMN_IDCT 0-1 0 ; %1 = extra bytes added to the src/out advance; continues from the m0/m1/m5/m6 and [rsp..rsp+8*3] state left by COLUMN_FDCT when it jumps out |
|
404 |
+ movq m3, m5 |
|
405 |
+ psubw m5, m1 |
|
406 |
+ psllw m5, 1 |
|
407 |
+ paddw m3, m1 |
|
408 |
+ movq m2, m0 |
|
409 |
+ psubw m0, m6 |
|
410 |
+ movq m1, m5 |
|
411 |
+ psllw m0, 1 |
|
412 |
+ pmulhw m1, [pw_AC62] |
|
413 |
+ paddw m5, m0 |
|
414 |
+ pmulhw m5, [pw_3B21] |
|
415 |
+ paddw m2, m6 |
|
416 |
+ pmulhw m0, [pw_22A3] |
|
417 |
+ movq m7, m2 |
|
418 |
+ movq m4, [rsp] ; scratch slot 0 written by COLUMN_FDCT |
|
419 |
+ psubw m2, m3 |
|
420 |
+ psllw m2, 1 |
|
421 |
+ paddw m7, m3 |
|
422 |
+ pmulhw m2, [pw_2D41] |
|
423 |
+ movq m6, m4 |
|
424 |
+ psraw m7, 2 |
|
425 |
+ paddw m4, [outq] |
|
426 |
+ psubw m6, m7 |
|
427 |
+ movq m3, [rsp+8] |
|
428 |
+ paddw m4, m7 |
|
429 |
+ movq [outq+DCTSIZE*7*2], m6 ; out rows 7 and 6 are overwritten; rows 0-5 are added to what out already holds |
|
430 |
+ paddw m1, m5 |
|
431 |
+ movq [outq], m4 |
|
432 |
+ psubw m1, m7 |
|
433 |
+ movq m7, [rsp+8*2] |
|
434 |
+ psubw m0, m5 |
|
435 |
+ movq m6, [rsp+8*3] |
|
436 |
+ movq m5, m3 |
|
437 |
+ paddw m3, [outq+DCTSIZE*1*2] |
|
438 |
+ psubw m5, m1 |
|
439 |
+ psubw m2, m1 |
|
440 |
+ paddw m3, m1 |
|
441 |
+ movq [outq+DCTSIZE*6*2], m5 |
|
442 |
+ movq m4, m7 |
|
443 |
+ paddw m7, [outq+DCTSIZE*2*2] |
|
444 |
+ psubw m4, m2 |
|
445 |
+ paddw m4, [outq+DCTSIZE*5*2] |
|
446 |
+ paddw m7, m2 |
|
447 |
+ movq [outq+DCTSIZE*1*2], m3 |
|
448 |
+ paddw m0, m2 |
|
449 |
+ movq [outq+DCTSIZE*2*2], m7 |
|
450 |
+ movq m1, m6 |
|
451 |
+ paddw m6, [outq+DCTSIZE*4*2] |
|
452 |
+ psubw m1, m0 |
|
453 |
+ paddw m1, [outq+DCTSIZE*3*2] |
|
454 |
+ paddw m6, m0 |
|
455 |
+ movq [outq+DCTSIZE*5*2], m4 |
|
456 |
+ add srcq, 8+%1 ; same pointer advance that COLUMN_FDCT performs on its shortcut path |
|
457 |
+ movq [outq+DCTSIZE*4*2], m6 |
|
458 |
+ movq [outq+DCTSIZE*3*2], m1 |
|
459 |
+ add outq, 8+%1 |
|
460 |
+%endmacro |
|
461 |
+ |
|
462 |
+;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); |
|
463 |
+cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp ; 32 bytes of stack = the four qword scratch slots used by COLUMN_FDCT/COLUMN_IDCT |
|
464 |
+.fdct1: |
|
465 |
+ COLUMN_FDCT .idct1 ; first group of 4 columns; jumps to .idct1 when the full inverse transform is needed |
|
466 |
+ jmp .fdct2 ; shortcut path was taken inside the macro: skip the IDCT |
|
467 |
+ |
|
468 |
+.idct1: |
|
469 |
+ COLUMN_IDCT |
|
470 |
+ |
|
471 |
+.fdct2: |
|
472 |
+ COLUMN_FDCT .idct2, 8, 16 ; next 4 columns: thr accesses offset by 8 bytes, src/out advanced by 8+16 bytes |
|
473 |
+ sub cntd, 2 ; two column groups per iteration; NOTE(review): the loop only exits on cnt == 0, so cnt is presumably a positive even value - confirm against callers |
|
474 |
+ jnz .fdct1 |
|
475 |
+ RET |
|
476 |
+ |
|
477 |
+.idct2: |
|
478 |
+ COLUMN_IDCT 16 ; same extra 16-byte advance as the shortcut path of the second COLUMN_FDCT |
|
479 |
+ sub cntd, 2 |
|
480 |
+ jnz .fdct1 |
|
481 |
+ RET |
|
482 |
+ |
|
483 |
+;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); |
|
484 |
+cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3 ; srcq = workspace, dstq = output_adr; 16 bytes of stack = two qword scratch slots |
|
485 |
+ add strideq, strideq ; stride *= 2: int16 element stride -> byte stride |
|
486 |
+ lea stride3q, [strideq+strideq*2] ; stride3 = 3 * byte stride |
|
487 |
+.loop: |
|
488 |
+ movq m0, [srcq+DCTSIZE*0*2] ; each iteration reads 4 workspace rows of 8 int16 (first halves here, second halves at +8 below) |
|
489 |
+ movq m1, [srcq+DCTSIZE*1*2] |
|
490 |
+ movq m4, m0 |
|
491 |
+ movq m2, [srcq+DCTSIZE*2*2] |
|
492 |
+ punpcklwd m0, m1 ; the punpck* sequence transposes the 4x4 word tile |
|
493 |
+ movq m3, [srcq+DCTSIZE*3*2] |
|
494 |
+ punpckhwd m4, m1 |
|
495 |
+ movq m7, m2 |
|
496 |
+ punpcklwd m2, m3 |
|
497 |
+ movq m6, m0 |
|
498 |
+ punpckldq m0, m2 |
|
499 |
+ punpckhdq m6, m2 |
|
500 |
+ movq m5, m0 |
|
501 |
+ punpckhwd m7, m3 |
|
502 |
+ psubw m0, m6 |
|
503 |
+ pmulhw m0, [pw_5A82] |
|
504 |
+ movq m2, m4 |
|
505 |
+ punpckldq m4, m7 |
|
506 |
+ paddw m5, m6 |
|
507 |
+ punpckhdq m2, m7 |
|
508 |
+ movq m1, m4 |
|
509 |
+ psllw m0, 2 |
|
510 |
+ paddw m4, m2 |
|
511 |
+ movq m3, [srcq+DCTSIZE*0*2+8] |
|
512 |
+ psubw m1, m2 |
|
513 |
+ movq m2, [srcq+DCTSIZE*1*2+8] |
|
514 |
+ psubw m0, m5 |
|
515 |
+ movq m6, m4 |
|
516 |
+ paddw m4, m5 |
|
517 |
+ psubw m6, m5 |
|
518 |
+ movq m7, m1 |
|
519 |
+ movq m5, [srcq+DCTSIZE*2*2+8] |
|
520 |
+ paddw m1, m0 |
|
521 |
+ movq [rsp], m4 ; scratch slot 0 |
|
522 |
+ movq m4, m3 |
|
523 |
+ movq [rsp+8], m6 ; scratch slot 1 |
|
524 |
+ punpcklwd m3, m2 |
|
525 |
+ movq m6, [srcq+DCTSIZE*3*2+8] |
|
526 |
+ punpckhwd m4, m2 |
|
527 |
+ movq m2, m5 |
|
528 |
+ punpcklwd m5, m6 |
|
529 |
+ psubw m7, m0 |
|
530 |
+ punpckhwd m2, m6 |
|
531 |
+ movq m0, m3 |
|
532 |
+ punpckldq m3, m5 |
|
533 |
+ punpckhdq m0, m5 |
|
534 |
+ movq m5, m4 |
|
535 |
+ movq m6, m3 |
|
536 |
+ punpckldq m4, m2 |
|
537 |
+ psubw m3, m0 |
|
538 |
+ punpckhdq m5, m2 |
|
539 |
+ paddw m6, m0 |
|
540 |
+ movq m2, m4 |
|
541 |
+ movq m0, m3 |
|
542 |
+ psubw m4, m5 |
|
543 |
+ pmulhw m0, [pw_AC62] |
|
544 |
+ paddw m3, m4 |
|
545 |
+ pmulhw m3, [pw_3B21] |
|
546 |
+ paddw m2, m5 |
|
547 |
+ pmulhw m4, [pw_22A3] |
|
548 |
+ movq m5, m2 |
|
549 |
+ psubw m2, m6 |
|
550 |
+ paddw m5, m6 |
|
551 |
+ pmulhw m2, [pw_2D41] |
|
552 |
+ paddw m0, m3 |
|
553 |
+ psllw m0, 3 |
|
554 |
+ psubw m4, m3 |
|
555 |
+ movq m6, [rsp] |
|
556 |
+ movq m3, m1 |
|
557 |
+ psllw m4, 3 |
|
558 |
+ psubw m0, m5 |
|
559 |
+ psllw m2, 3 |
|
560 |
+ paddw m1, m0 |
|
561 |
+ psubw m2, m0 |
|
562 |
+ psubw m3, m0 |
|
563 |
+ paddw m4, m2 |
|
564 |
+ movq m0, m7 |
|
565 |
+ paddw m7, m2 |
|
566 |
+ psubw m0, m2 |
|
567 |
+ movq m2, [pw_4] ; m2 = rounding bias added to every result before its >>3 |
|
568 |
+ psubw m6, m5 |
|
569 |
+ paddw m5, [rsp] |
|
570 |
+ paddw m1, m2 |
|
571 |
+ paddw m5, m2 |
|
572 |
+ psraw m1, 3 |
|
573 |
+ paddw m7, m2 |
|
574 |
+ psraw m5, 3 |
|
575 |
+ paddw m5, [dstq] ; every result is added to the value already in dst (rows 0..7, 4 int16 each) |
|
576 |
+ psraw m7, 3 |
|
577 |
+ paddw m1, [dstq+strideq*1] |
|
578 |
+ paddw m0, m2 |
|
579 |
+ paddw m7, [dstq+strideq*2] |
|
580 |
+ paddw m3, m2 |
|
581 |
+ movq [dstq], m5 |
|
582 |
+ paddw m6, m2 |
|
583 |
+ movq [dstq+strideq*1], m1 |
|
584 |
+ psraw m0, 3 |
|
585 |
+ movq [dstq+strideq*2], m7 |
|
586 |
+ add dstq, stride3q ; dst now points at row 3; the offsets below are relative to it |
|
587 |
+ movq m5, [rsp+8] |
|
588 |
+ psraw m3, 3 |
|
589 |
+ paddw m0, [dstq+strideq*2] ; row 5 |
|
590 |
+ psubw m5, m4 |
|
591 |
+ paddw m3, [dstq+stride3q*1] ; row 6 |
|
592 |
+ psraw m6, 3 |
|
593 |
+ paddw m4, [rsp+8] |
|
594 |
+ paddw m5, m2 |
|
595 |
+ paddw m6, [dstq+strideq*4] ; row 7 |
|
596 |
+ paddw m4, m2 |
|
597 |
+ movq [dstq+strideq*2], m0 |
|
598 |
+ psraw m5, 3 |
|
599 |
+ paddw m5, [dstq] ; row 3 |
|
600 |
+ psraw m4, 3 |
|
601 |
+ paddw m4, [dstq+strideq*1] ; row 4 |
|
602 |
+ add srcq, DCTSIZE*2*4 ; next 4 workspace rows (64 bytes) |
|
603 |
+ movq [dstq+stride3q*1], m3 |
|
604 |
+ movq [dstq+strideq*4], m6 |
|
605 |
+ movq [dstq], m5 |
|
606 |
+ movq [dstq+strideq*1], m4 |
|
607 |
+ sub dstq, stride3q ; back to row 0 ... |
|
608 |
+ add dstq, 8 ; ... and 4 int16 columns to the right |
|
609 |
+ dec r3d ; r3 is the cnt argument |
|
610 |
+ jnz .loop |
|
611 |
+ RET |
|
612 |
+ |
|
613 |
+;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); |
|
614 |
+cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3 ; srcq = data (the int16 output, despite the name), pixq = pixels; 16 bytes of stack = two qword scratch slots |
|
615 |
+ lea stride3q, [strideq+strideq*2] ; stride3 = 3 * line_size |
|
616 |
+.loop: |
|
617 |
+ movd m0, [pixq] ; each iteration reads 4 pixels from each of 8 rows; this is row 0 |
|
618 |
+ pxor m7, m7 ; m7 = 0 for byte->word unpacking |
|
619 |
+ movd m1, [pixq+strideq*1] |
|
620 |
+ punpcklbw m0, m7 |
|
621 |
+ movd m2, [pixq+strideq*2] |
|
622 |
+ punpcklbw m1, m7 |
|
623 |
+ punpcklbw m2, m7 |
|
624 |
+ add pixq,stride3q ; pix now points at row 3; the offsets below are relative to it |
|
625 |
+ movq m5, m0 |
|
626 |
+ movd m3, [pixq+strideq*4] ; row 7 |
|
627 |
+ movq m6, m1 |
|
628 |
+ movd m4, [pixq+stride3q*1] ; row 6 |
|
629 |
+ punpcklbw m3, m7 |
|
630 |
+ psubw m5, m3 |
|
631 |
+ punpcklbw m4, m7 |
|
632 |
+ paddw m0, m3 |
|
633 |
+ psubw m6, m4 |
|
634 |
+ movd m3, [pixq+strideq*2] ; row 5 |
|
635 |
+ paddw m1, m4 |
|
636 |
+ movq [rsp], m5 ; scratch slot 0: row0 - row7 |
|
637 |
+ punpcklbw m3, m7 |
|
638 |
+ movq [rsp+8], m6 ; scratch slot 1: row1 - row6 |
|
639 |
+ movq m4, m2 |
|
640 |
+ movd m5, [pixq] ; row 3 |
|
641 |
+ paddw m2, m3 |
|
642 |
+ movd m6, [pixq+strideq*1] ; row 4 |
|
643 |
+ punpcklbw m5, m7 |
|
644 |
+ psubw m4, m3 |
|
645 |
+ punpcklbw m6, m7 |
|
646 |
+ movq m3, m5 |
|
647 |
+ paddw m5, m6 |
|
648 |
+ psubw m3, m6 |
|
649 |
+ movq m6, m0 |
|
650 |
+ movq m7, m1 |
|
651 |
+ psubw m0, m5 |
|
652 |
+ psubw m1, m2 |
|
653 |
+ paddw m7, m2 |
|
654 |
+ paddw m1, m0 |
|
655 |
+ movq m2, m7 |
|
656 |
+ psllw m1, 2 |
|
657 |
+ paddw m6, m5 |
|
658 |
+ pmulhw m1, [pw_2D41] |
|
659 |
+ paddw m7, m6 |
|
660 |
+ psubw m6, m2 |
|
661 |
+ movq m5, m0 |
|
662 |
+ movq m2, m7 |
|
663 |
+ punpcklwd m7, m6 ; the punpck* sequence transposes the 4x4 word tile before it is stored |
|
664 |
+ paddw m0, m1 |
|
665 |
+ punpckhwd m2, m6 |
|
666 |
+ psubw m5, m1 |
|
667 |
+ movq m6, m0 |
|
668 |
+ movq m1, [rsp+8] |
|
669 |
+ punpcklwd m0, m5 |
|
670 |
+ punpckhwd m6, m5 |
|
671 |
+ movq m5, m0 |
|
672 |
+ punpckldq m0, m7 |
|
673 |
+ paddw m3, m4 |
|
674 |
+ punpckhdq m5, m7 |
|
675 |
+ movq m7, m6 |
|
676 |
+ movq [srcq+DCTSIZE*0*2], m0 ; first halves of the 4 output rows |
|
677 |
+ punpckldq m6, m2 |
|
678 |
+ movq [srcq+DCTSIZE*1*2], m5 |
|
679 |
+ punpckhdq m7, m2 |
|
680 |
+ movq [srcq+DCTSIZE*2*2], m6 |
|
681 |
+ paddw m4, m1 |
|
682 |
+ movq [srcq+DCTSIZE*3*2], m7 |
|
683 |
+ psllw m3, 2 |
|
684 |
+ movq m2, [rsp] |
|
685 |
+ psllw m4, 2 |
|
686 |
+ pmulhw m4, [pw_2D41] |
|
687 |
+ paddw m1, m2 |
|
688 |
+ psllw m1, 2 |
|
689 |
+ movq m0, m3 |
|
690 |
+ pmulhw m0, [pw_22A3] |
|
691 |
+ psubw m3, m1 |
|
692 |
+ pmulhw m3, [pw_187E] |
|
693 |
+ movq m5, m2 |
|
694 |
+ pmulhw m1, [pw_539F] |
|
695 |
+ psubw m2, m4 |
|
696 |
+ paddw m5, m4 |
|
697 |
+ movq m6, m2 |
|
698 |
+ paddw m0, m3 |
|
699 |
+ movq m7, m5 |
|
700 |
+ paddw m2, m0 |
|
701 |
+ psubw m6, m0 |
|
702 |
+ movq m4, m2 |
|
703 |
+ paddw m1, m3 |
|
704 |
+ punpcklwd m2, m6 |
|
705 |
+ paddw m5, m1 |
|
706 |
+ punpckhwd m4, m6 |
|
707 |
+ psubw m7, m1 |
|
708 |
+ movq m6, m5 |
|
709 |
+ punpcklwd m5, m7 |
|
710 |
+ punpckhwd m6, m7 |
|
711 |
+ movq m7, m2 |
|
712 |
+ punpckldq m2, m5 |
|
713 |
+ sub pixq, stride3q ; back to row 0 |
|
714 |
+ punpckhdq m7, m5 |
|
715 |
+ movq m5, m4 |
|
716 |
+ movq [srcq+DCTSIZE*0*2+8], m2 ; second halves of the 4 output rows |
|
717 |
+ punpckldq m4, m6 |
|
718 |
+ movq [srcq+DCTSIZE*1*2+8], m7 |
|
719 |
+ punpckhdq m5, m6 |
|
720 |
+ movq [srcq+DCTSIZE*2*2+8], m4 |
|
721 |
+ add pixq, 4 ; next 4 pixel columns |
|
722 |
+ movq [srcq+DCTSIZE*3*2+8], m5 |
|
723 |
+ add srcq, DCTSIZE*4*2 ; next 4 output rows (64 bytes) |
|
724 |
+ dec cntd |
|
725 |
+ jnz .loop |
|
726 |
+ RET |
0 | 727 |
deleted file mode 100644 |
... | ... |
@@ -1,1409 +0,0 @@ |
1 |
-/* |
|
2 |
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
|
3 |
- * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru> |
|
4 |
- * |
|
5 |
- * This file is part of FFmpeg. |
|
6 |
- * |
|
7 |
- * FFmpeg is free software; you can redistribute it and/or modify |
|
8 |
- * it under the terms of the GNU General Public License as published by |
|
9 |
- * the Free Software Foundation; either version 2 of the License, or |
|
10 |
- * (at your option) any later version. |
|
11 |
- * |
|
12 |
- * FFmpeg is distributed in the hope that it will be useful, |
|
13 |
- * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
15 |
- * GNU General Public License for more details. |
|
16 |
- * |
|
17 |
- * You should have received a copy of the GNU General Public License along |
|
18 |
- * with FFmpeg; if not, write to the Free Software Foundation, Inc., |
|
19 |
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
- */ |
|
21 |
- |
|
22 |
-#include "libavutil/attributes.h" |
|
23 |
-#include "libavutil/cpu.h" |
|
24 |
-#include "libavutil/mem.h" |
|
25 |
-#include "libavutil/x86/asm.h" |
|
26 |
-#include "libavfilter/vf_fspp.h" |
|
27 |
- |
|
28 |
-#if HAVE_MMX_INLINE |
|
29 |
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { |
|
30 |
- { 0, 48, 12, 60, 3, 51, 15, 63, }, |
|
31 |
- { 32, 16, 44, 28, 35, 19, 47, 31, }, |
|
32 |
- { 8, 56, 4, 52, 11, 59, 7, 55, }, |
|
33 |
- { 40, 24, 36, 20, 43, 27, 39, 23, }, |
|
34 |
- { 2, 50, 14, 62, 1, 49, 13, 61, }, |
|
35 |
- { 34, 18, 46, 30, 33, 17, 45, 29, }, |
|
36 |
- { 10, 58, 6, 54, 9, 57, 5, 53, }, |
|
37 |
- { 42, 26, 38, 22, 41, 25, 37, 21, }, |
|
38 |
-}; |
|
39 |
- |
|
40 |
-//This func reads from 1 slice, 1 and clears 0 & 1 |
|
41 |
-static void store_slice_mmx(uint8_t *dst, int16_t *src, |
|
42 |
- ptrdiff_t dst_stride, ptrdiff_t src_stride, |
|
43 |
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) |
|
44 |
-{ |
|
45 |
- const uint8_t *od = &dither[0][0]; |
|
46 |
- const uint8_t *end = &dither[height][0]; |
|
47 |
- width = (width + 7) & ~7; |
|
48 |
- dst_stride -= width; |
|
49 |
- |
|
50 |
- __asm__ volatile( |
|
51 |
- "mov %5 , %%"REG_d" \n\t" |
|
52 |
- "mov %6 , %%"REG_S" \n\t" |
|
53 |
- "mov %7 , %%"REG_D" \n\t" |
|
54 |
- "mov %1 , %%"REG_a" \n\t" |
|
55 |
- "movd %%"REG_d" , %%mm5 \n\t" |
|
56 |
- "xor $-1 , %%"REG_d" \n\t" |
|
57 |
- "mov %%"REG_a" , %%"REG_c" \n\t" |
|
58 |
- "add $7 , %%"REG_d" \n\t" |
|
59 |
- "neg %%"REG_a" \n\t" |
|
60 |
- "sub %0 , %%"REG_c" \n\t" |
|
61 |
- "add %%"REG_c" , %%"REG_c" \n\t" |
|
62 |
- "movd %%"REG_d" , %%mm2 \n\t" |
|
63 |
- "mov %%"REG_c" , %1 \n\t" |
|
64 |
- "mov %2 , %%"REG_d" \n\t" |
|
65 |
- "shl $4 , %%"REG_a" \n\t" |
|
66 |
- |
|
67 |
- "2: \n\t" |
|
68 |
- "movq (%%"REG_d") , %%mm3 \n\t" |
|
69 |
- "movq %%mm3 , %%mm4 \n\t" |
|
70 |
- "pxor %%mm7 , %%mm7 \n\t" |
|
71 |
- "punpcklbw %%mm7 , %%mm3 \n\t" |
|
72 |
- "punpckhbw %%mm7 , %%mm4 \n\t" |
|
73 |
- "mov %0 , %%"REG_c" \n\t" |
|
74 |
- "psraw %%mm5 , %%mm3 \n\t" |
|
75 |
- "psraw %%mm5 , %%mm4 \n\t" |
|
76 |
- "1: \n\t" |
|
77 |
- "movq %%mm7, (%%"REG_S",%%"REG_a") \n\t" |
|
78 |
- "movq (%%"REG_S") , %%mm0 \n\t" |
|
79 |
- "movq 8(%%"REG_S"), %%mm1 \n\t" |
|
80 |
- |
|
81 |
- "movq %%mm7, 8(%%"REG_S",%%"REG_a")\n\t" |
|
82 |
- "paddw %%mm3, %%mm0 \n\t" |
|
83 |
- "paddw %%mm4, %%mm1 \n\t" |
|
84 |
- |
|
85 |
- "movq %%mm7, (%%"REG_S") \n\t" |
|
86 |
- "psraw %%mm2, %%mm0 \n\t" |
|
87 |
- "psraw %%mm2, %%mm1 \n\t" |
|
88 |
- |
|
89 |
- "movq %%mm7, 8(%%"REG_S") \n\t" |
|
90 |
- "packuswb %%mm1, %%mm0 \n\t" |
|
91 |
- "add $16, %%"REG_S" \n\t" |
|
92 |
- |
|
93 |
- "movq %%mm0, (%%"REG_D") \n\t" |
|
94 |
- "add $8, %%"REG_D" \n\t" |
|
95 |
- "sub $8, %%"REG_c" \n\t" |
|
96 |
- "jg 1b \n\t" |
|
97 |
- "add %1, %%"REG_S" \n\t" |
|
98 |
- "add $8, %%"REG_d" \n\t" |
|
99 |
- "add %3, %%"REG_D" \n\t" |
|
100 |
- "cmp %4, %%"REG_d" \n\t" |
|
101 |
- "jl 2b \n\t" |
|
102 |
- |
|
103 |
- : |
|
104 |
- : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end), |
|
105 |
- "m" (log2_scale), "m" (src), "m" (dst) //input |
|
106 |
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
|
107 |
- ); |
|
108 |
-} |
|
109 |
- |
|
110 |
-//This func reads from 2 slices, 0 & 2 and clears 2-nd |
|
111 |
-static void store_slice2_mmx(uint8_t *dst, int16_t *src, |
|
112 |
- ptrdiff_t dst_stride, ptrdiff_t src_stride, |
|
113 |
- ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) |
|
114 |
-{ |
|
115 |
- const uint8_t *od = &dither[0][0]; |
|
116 |
- const uint8_t *end = &dither[height][0]; |
|
117 |
- width = (width + 7) & ~7; |
|
118 |
- dst_stride -= width; |
|
119 |
- |
|
120 |
- __asm__ volatile( |
|
121 |
- "mov %5, %%"REG_d" \n\t" |
|
122 |
- "mov %6, %%"REG_S" \n\t" |
|
123 |
- "mov %7, %%"REG_D" \n\t" |
|
124 |
- "mov %1, %%"REG_a" \n\t" |
|
125 |
- "movd %%"REG_d", %%mm5 \n\t" |
|
126 |
- "xor $-1, %%"REG_d" \n\t" |
|
127 |
- "mov %%"REG_a", %%"REG_c" \n\t" |
|
128 |
- "add $7, %%"REG_d" \n\t" |
|
129 |
- "sub %0, %%"REG_c" \n\t" |
|
130 |
- "add %%"REG_c", %%"REG_c" \n\t" |
|
131 |
- "movd %%"REG_d", %%mm2 \n\t" |
|
132 |
- "mov %%"REG_c", %1 \n\t" |
|
133 |
- "mov %2, %%"REG_d" \n\t" |
|
134 |
- "shl $5, %%"REG_a" \n\t" |
|
135 |
- |
|
136 |
- "2: \n\t" |
|
137 |
- "movq (%%"REG_d"), %%mm3 \n\t" |
|
138 |
- "movq %%mm3, %%mm4 \n\t" |
|
139 |
- "pxor %%mm7, %%mm7 \n\t" |
|
140 |
- "punpcklbw %%mm7, %%mm3 \n\t" |
|
141 |
- "punpckhbw %%mm7, %%mm4 \n\t" |
|
142 |
- "mov %0, %%"REG_c" \n\t" |
|
143 |
- "psraw %%mm5, %%mm3 \n\t" |
|
144 |
- "psraw %%mm5, %%mm4 \n\t" |
|
145 |
- "1: \n\t" |
|
146 |
- "movq (%%"REG_S"), %%mm0 \n\t" |
|
147 |
- "movq 8(%%"REG_S"), %%mm1 \n\t" |
|
148 |
- "paddw %%mm3, %%mm0 \n\t" |
|
149 |
- |
|
150 |
- "paddw (%%"REG_S",%%"REG_a"),%%mm0\n\t" |
|
151 |
- "paddw %%mm4, %%mm1 \n\t" |
|
152 |
- "movq 8(%%"REG_S",%%"REG_a"),%%mm6\n\t" |
|
153 |
- |
|
154 |
- "movq %%mm7, (%%"REG_S",%%"REG_a")\n\t" |
|
155 |
- "psraw %%mm2, %%mm0 \n\t" |
|
156 |
- "paddw %%mm6, %%mm1 \n\t" |
|
157 |
- |
|
158 |
- "movq %%mm7,8(%%"REG_S",%%"REG_a")\n\t" |
|
159 |
- "psraw %%mm2, %%mm1 \n\t" |
|
160 |
- "packuswb %%mm1, %%mm0 \n\t" |
|
161 |
- |
|
162 |
- "movq %%mm0, (%%"REG_D") \n\t" |
|
163 |
- "add $16, %%"REG_S" \n\t" |
|
164 |
- "add $8, %%"REG_D" \n\t" |
|
165 |
- "sub $8, %%"REG_c" \n\t" |
|
166 |
- "jg 1b \n\t" |
|
167 |
- "add %1, %%"REG_S" \n\t" |
|
168 |
- "add $8, %%"REG_d" \n\t" |
|
169 |
- "add %3, %%"REG_D" \n\t" |
|
170 |
- "cmp %4, %%"REG_d" \n\t" |
|
171 |
- "jl 2b \n\t" |
|
172 |
- |
|
173 |
- : |
|
174 |
- : "m" (width), "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end), |
|
175 |
- "m" (log2_scale), "m" (src), "m" (dst) //input |
|
176 |
- : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S |
|
177 |
- ); |
|
178 |
-} |
|
179 |
- |
|
180 |
-static void mul_thrmat_mmx(FSPPContext *p, int q) |
|
181 |
-{ |
|
182 |
- uint64_t *adr = &p->threshold_mtx_noq[0]; |
|
183 |
- |
|
184 |
- __asm__ volatile( |
|
185 |
- "movd %0, %%mm7 \n\t" |
|
186 |
- "add $8*8*2, %%"REG_D" \n\t" |
|
187 |
- "movq 0*8(%%"REG_S"), %%mm0 \n\t" |
|
188 |
- "punpcklwd %%mm7, %%mm7 \n\t" |
|
189 |
- "movq 1*8(%%"REG_S"), %%mm1 \n\t" |
|
190 |
- "punpckldq %%mm7, %%mm7 \n\t" |
|
191 |
- "pmullw %%mm7, %%mm0 \n\t" |
|
192 |
- |
|
193 |
- "movq 2*8(%%"REG_S"), %%mm2 \n\t" |
|
194 |
- "pmullw %%mm7, %%mm1 \n\t" |
|
195 |
- |
|
196 |
- "movq 3*8(%%"REG_S"), %%mm3 \n\t" |
|
197 |
- "pmullw %%mm7, %%mm2 \n\t" |
|
198 |
- |
|
199 |
- "movq %%mm0, 0*8(%%"REG_D") \n\t" |
|
200 |
- "movq 4*8(%%"REG_S"), %%mm4 \n\t" |
|
201 |
- "pmullw %%mm7, %%mm3 \n\t" |
|
202 |
- |
|
203 |
- "movq %%mm1, 1*8(%%"REG_D") \n\t" |
|
204 |
- "movq 5*8(%%"REG_S"), %%mm5 \n\t" |
|
205 |
- "pmullw %%mm7, %%mm4 \n\t" |
|
206 |
- |
|
207 |
- "movq %%mm2, 2*8(%%"REG_D") \n\t" |
|
208 |
- "movq 6*8(%%"REG_S"), %%mm6 \n\t" |
|
209 |
- "pmullw %%mm7, %%mm5 \n\t" |
|
210 |
- |
|
211 |
- "movq %%mm3, 3*8(%%"REG_D") \n\t" |
|
212 |
- "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t" |
|
213 |
- "pmullw %%mm7, %%mm6 \n\t" |
|
214 |
- |
|
215 |
- "movq %%mm4, 4*8(%%"REG_D") \n\t" |
|
216 |
- "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t" |
|
217 |
- "pmullw %%mm7, %%mm0 \n\t" |
|
218 |
- |
|
219 |
- "movq %%mm5, 5*8(%%"REG_D") \n\t" |
|
220 |
- "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t" |
|
221 |
- "pmullw %%mm7, %%mm1 \n\t" |
|
222 |
- |
|
223 |
- "movq %%mm6, 6*8(%%"REG_D") \n\t" |
|
224 |
- "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t" |
|
225 |
- "pmullw %%mm7, %%mm2 \n\t" |
|
226 |
- |
|
227 |
- "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t" |
|
228 |
- "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t" |
|
229 |
- "pmullw %%mm7, %%mm3 \n\t" |
|
230 |
- |
|
231 |
- "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t" |
|
232 |
- "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t" |
|
233 |
- "pmullw %%mm7, %%mm4 \n\t" |
|
234 |
- |
|
235 |
- "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t" |
|
236 |
- "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t" |
|
237 |
- "pmullw %%mm7, %%mm5 \n\t" |
|
238 |
- |
|
239 |
- "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t" |
|
240 |
- "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t" |
|
241 |
- "pmullw %%mm7, %%mm6 \n\t" |
|
242 |
- |
|
243 |
- "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t" |
|
244 |
- "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t" |
|
245 |
- "pmullw %%mm7, %%mm0 \n\t" |
|
246 |
- |
|
247 |
- "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t" |
|
248 |
- "pmullw %%mm7, %%mm1 \n\t" |
|
249 |
- |
|
250 |
- "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t" |
|
251 |
- "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t" |
|
252 |
- "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t" |
|
253 |
- |
|
254 |
- : "+g" (q), "+S" (adr), "+D" (adr) |
|
255 |
- : |
|
256 |
- ); |
|
257 |
-} |
|
258 |
- |
|
259 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433) = FIX64(0.382683433, 14); |
|
260 |
-DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, 14); |
|
261 |
-DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, 14); |
|
262 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965) = FIX64(1.306562965, 14); |
|
263 |
- |
|
264 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, 14); |
|
265 |
- |
|
266 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065) = FIX64(1.847759065, 13); |
|
267 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930) = FIX64(-2.613125930, 13); |
|
268 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562) = FIX64(1.414213562, 13); |
|
269 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200) = FIX64(1.082392200, 13); |
|
270 |
-//for t3,t5,t7 == 0 shortcut |
|
271 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065) = FIX64(0.847759065, 14); |
|
272 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497) = FIX64(0.566454497, 14); |
|
273 |
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367) = FIX64(0.198912367, 14); |
|
274 |
- |
|
275 |
-DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND) = C64(4); |
|
276 |
-DECLARE_ASM_CONST(8, uint64_t, MM_2) = C64(2); |
|
277 |
- |
|
278 |
-static void column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt) |
|
279 |
-{ |
|
280 |
- DECLARE_ALIGNED(8, uint64_t, temps)[4]; |
|
281 |
- |
|
282 |
- __asm__ volatile( |
|
283 |
- |
|
284 |
- "1: \n\t" |
|
285 |
- "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t" |
|
286 |
- // |
|
287 |
- "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t" |
|
288 |
- "movq %%mm1, %%mm0 \n\t" |
|
289 |
- |
|
290 |
- "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1 \n\t" //t0 |
|
291 |
- "movq %%mm7, %%mm3 \n\t" |
|
292 |
- |
|
293 |
- "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7 \n\t" //t3 |
|
294 |
- "movq %%mm1, %%mm5 \n\t" |
|
295 |
- |
|
296 |
- "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t" |
|
297 |
- "psubw %%mm7, %%mm1 \n\t" //t13 |
|
298 |
- |
|
299 |
- "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" |
|
300 |
- "movq %%mm6, %%mm4 \n\t" |
|
301 |
- |
|
302 |
- "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6 \n\t" //t1 |
|
303 |
- "paddw %%mm7, %%mm5 \n\t" //t10 |
|
304 |
- |
|
305 |
- "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t2 |
|
306 |
- "movq %%mm6, %%mm7 \n\t" |
|
307 |
- |
|
308 |
- "paddw %%mm2, %%mm6 \n\t" //t11 |
|
309 |
- "psubw %%mm2, %%mm7 \n\t" //t12 |
|
310 |
- |
|
311 |
- "movq %%mm5, %%mm2 \n\t" |
|
312 |
- "paddw %%mm6, %%mm5 \n\t" //d0 |
|
313 |
- // i0 t13 t12 i3 i1 d0 - d4 |
|
314 |
- "psubw %%mm6, %%mm2 \n\t" //d4 |
|
315 |
- "paddw %%mm1, %%mm7 \n\t" |
|
316 |
- |
|
317 |
- "movq 4*16(%%"REG_d"), %%mm6 \n\t" |
|
318 |
- "psllw $2, %%mm7 \n\t" |
|
319 |
- |
|
320 |
- "psubw 0*16(%%"REG_d"), %%mm5 \n\t" |
|
321 |
- "psubw %%mm6, %%mm2 \n\t" |
|
322 |
- |
|
323 |
- "paddusw 0*16(%%"REG_d"), %%mm5 \n\t" |
|
324 |
- "paddusw %%mm6, %%mm2 \n\t" |
|
325 |
- |
|
326 |
- "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t" |
|
327 |
- // |
|
328 |
- "paddw 0*16(%%"REG_d"), %%mm5 \n\t" |
|
329 |
- "paddw %%mm6, %%mm2 \n\t" |
|
330 |
- |
|
331 |
- "psubusw 0*16(%%"REG_d"), %%mm5 \n\t" |
|
332 |
- "psubusw %%mm6, %%mm2 \n\t" |
|
333 |
- |
|
334 |
-//This func is totally compute-bound, operates at huge speed. So, DC shortcut |
|
335 |
-// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3). |
|
336 |
-//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. |
|
337 |
- "paddw "MANGLE(MM_2)", %%mm5 \n\t" |
|
338 |
- "movq %%mm2, %%mm6 \n\t" |
|
339 |
- |
|
340 |
- "paddw %%mm5, %%mm2 \n\t" |
|
341 |
- "psubw %%mm6, %%mm5 \n\t" |
|
342 |
- |
|
343 |
- "movq %%mm1, %%mm6 \n\t" |
|
344 |
- "paddw %%mm7, %%mm1 \n\t" //d2 |
|
345 |
- |
|
346 |
- "psubw 2*16(%%"REG_d"), %%mm1 \n\t" |
|
347 |
- "psubw %%mm7, %%mm6 \n\t" //d6 |
|
348 |
- |
|
349 |
- "movq 6*16(%%"REG_d"), %%mm7 \n\t" |
|
350 |
- "psraw $2, %%mm5 \n\t" |
|
351 |
- |
|
352 |
- "paddusw 2*16(%%"REG_d"), %%mm1 \n\t" |
|
353 |
- "psubw %%mm7, %%mm6 \n\t" |
|
354 |
- // t7 d2 /t11 t4 t6 - d6 /t10 |
|
355 |
- |
|
356 |
- "paddw 2*16(%%"REG_d"), %%mm1 \n\t" |
|
357 |
- "paddusw %%mm7, %%mm6 \n\t" |
|
358 |
- |
|
359 |
- "psubusw 2*16(%%"REG_d"), %%mm1 \n\t" |
|
360 |
- "paddw %%mm7, %%mm6 \n\t" |
|
361 |
- |
|
362 |
- "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3 \n\t" |
|
363 |
- "psubusw %%mm7, %%mm6 \n\t" |
|
364 |
- |
|
365 |
- //movq [edi+"DCTSIZE_S"*2*2], mm1 |
|
366 |
- //movq [edi+"DCTSIZE_S"*6*2], mm6 |
|
367 |
- "movq %%mm1, %%mm7 \n\t" |
|
368 |
- "psraw $2, %%mm2 \n\t" |
|
369 |
- |
|
370 |
- "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4 \n\t" |
|
371 |
- "psubw %%mm6, %%mm1 \n\t" |
|
372 |
- |
|
373 |
- "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0 \n\t" |
|
374 |
- "paddw %%mm7, %%mm6 \n\t" //'t13 |
|
375 |
- |
|
376 |
- "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- |
|
377 |
- "movq %%mm2, %%mm7 \n\t" |
|
378 |
- |
|
379 |
- "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" |
|
380 |
- "paddw %%mm6, %%mm2 \n\t" //'t0 |
|
381 |
- |
|
382 |
- "movq %%mm2, 0*8+%3 \n\t" //! |
|
383 |
- "psubw %%mm6, %%mm7 \n\t" //'t3 |
|
384 |
- |
|
385 |
- "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" |
|
386 |
- "psubw %%mm6, %%mm1 \n\t" //'t12 |
|
387 |
- |
|
388 |
- "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2 \n\t" //t5 |
|
389 |
- "movq %%mm5, %%mm6 \n\t" |
|
390 |
- |
|
391 |
- "movq %%mm7, 3*8+%3 \n\t" |
|
392 |
- "paddw %%mm2, %%mm3 \n\t" //t10 |
|
393 |
- |
|
394 |
- "paddw %%mm4, %%mm2 \n\t" //t11 |
|
395 |
- "paddw %%mm0, %%mm4 \n\t" //t12 |
|
396 |
- |
|
397 |
- "movq %%mm3, %%mm7 \n\t" |
|
398 |
- "psubw %%mm4, %%mm3 \n\t" |
|
399 |
- |
|
400 |
- "psllw $2, %%mm3 \n\t" |
|
401 |
- "psllw $2, %%mm7 \n\t" //opt for P6 |
|
402 |
- |
|
403 |
- "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" |
|
404 |
- "psllw $2, %%mm4 \n\t" |
|
405 |
- |
|
406 |
- "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t" |
|
407 |
- "psllw $2, %%mm2 \n\t" |
|
408 |
- |
|
409 |
- "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" |
|
410 |
- "paddw %%mm1, %%mm5 \n\t" //'t1 |
|
411 |
- |
|
412 |
- "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t" |
|
413 |
- "psubw %%mm1, %%mm6 \n\t" //'t2 |
|
414 |
- // t7 't12 't11 t4 t6 - 't13 't10 --- |
|
415 |
- |
|
416 |
- "paddw %%mm3, %%mm7 \n\t" //z2 |
|
417 |
- |
|
418 |
- "movq %%mm5, 1*8+%3 \n\t" |
|
419 |
- "paddw %%mm3, %%mm4 \n\t" //z4 |
|
420 |
- |
|
421 |
- "movq 3*16(%%"REG_d"), %%mm3 \n\t" |
|
422 |
- "movq %%mm0, %%mm1 \n\t" |
|
423 |
- |
|
424 |
- "movq %%mm6, 2*8+%3 \n\t" |
|
425 |
- "psubw %%mm2, %%mm1 \n\t" //z13 |
|
426 |
- |
|
427 |
-//=== |
|
428 |
- "paddw %%mm2, %%mm0 \n\t" //z11 |
|
429 |
- "movq %%mm1, %%mm5 \n\t" |
|
430 |
- |
|
431 |
- "movq 5*16(%%"REG_d"), %%mm2 \n\t" |
|
432 |
- "psubw %%mm7, %%mm1 \n\t" //d3 |
|
433 |
- |
|
434 |
- "paddw %%mm7, %%mm5 \n\t" //d5 |
|
435 |
- "psubw %%mm3, %%mm1 \n\t" |
|
436 |
- |
|
437 |
- "movq 1*16(%%"REG_d"), %%mm7 \n\t" |
|
438 |
- "psubw %%mm2, %%mm5 \n\t" |
|
439 |
- |
|
440 |
- "movq %%mm0, %%mm6 \n\t" |
|
441 |
- "paddw %%mm4, %%mm0 \n\t" //d1 |
|
442 |
- |
|
443 |
- "paddusw %%mm3, %%mm1 \n\t" |
|
444 |
- "psubw %%mm4, %%mm6 \n\t" //d7 |
|
445 |
- |
|
446 |
- // d1 d3 - - - d5 d7 - |
|
447 |
- "movq 7*16(%%"REG_d"), %%mm4 \n\t" |
|
448 |
- "psubw %%mm7, %%mm0 \n\t" |
|
449 |
- |
|
450 |
- "psubw %%mm4, %%mm6 \n\t" |
|
451 |
- "paddusw %%mm2, %%mm5 \n\t" |
|
452 |
- |
|
453 |
- "paddusw %%mm4, %%mm6 \n\t" |
|
454 |
- "paddw %%mm3, %%mm1 \n\t" |
|
455 |
- |
|
456 |
- "paddw %%mm2, %%mm5 \n\t" |
|
457 |
- "paddw %%mm4, %%mm6 \n\t" |
|
458 |
- |
|
459 |
- "psubusw %%mm3, %%mm1 \n\t" |
|
460 |
- "psubusw %%mm2, %%mm5 \n\t" |
|
461 |
- |
|
462 |
- "psubusw %%mm4, %%mm6 \n\t" |
|
463 |
- "movq %%mm1, %%mm4 \n\t" |
|
464 |
- |
|
465 |
- "por %%mm5, %%mm4 \n\t" |
|
466 |
- "paddusw %%mm7, %%mm0 \n\t" |
|
467 |
- |
|
468 |
- "por %%mm6, %%mm4 \n\t" |
|
469 |
- "paddw %%mm7, %%mm0 \n\t" |
|
470 |
- |
|
471 |
- "packssdw %%mm4, %%mm4 \n\t" |
|
472 |
- "psubusw %%mm7, %%mm0 \n\t" |
|
473 |
- |
|
474 |
- "movd %%mm4, %%"REG_a" \n\t" |
|
475 |
- "or %%"REG_a", %%"REG_a" \n\t" |
|
476 |
- "jnz 2f \n\t" |
|
477 |
- //movq [edi+"DCTSIZE_S"*3*2], mm1 |
|
478 |
- //movq [edi+"DCTSIZE_S"*5*2], mm5 |
|
479 |
- //movq [edi+"DCTSIZE_S"*1*2], mm0 |
|
480 |
- //movq [edi+"DCTSIZE_S"*7*2], mm6 |
|
481 |
- // t4 t5 - - - t6 t7 - |
|
482 |
- //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0 |
|
483 |
-//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile |
|
484 |
- "movq 0*8+%3, %%mm4 \n\t" |
|
485 |
- "movq %%mm0, %%mm1 \n\t" |
|
486 |
- |
|
487 |
- "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 |
|
488 |
- "movq %%mm1, %%mm2 \n\t" |
|
489 |
- |
|
490 |
- "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t" |
|
491 |
- "movq %%mm2, %%mm3 \n\t" |
|
492 |
- |
|
493 |
- "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 |
|
494 |
- "paddw %%mm4, %%mm5 \n\t" |
|
495 |
- |
|
496 |
- "movq 1*8+%3, %%mm6 \n\t" |
|
497 |
- //paddw mm3, MM_2 |
|
498 |
- "psraw $2, %%mm3 \n\t" //tmp7 |
|
499 |
- |
|
500 |
- "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 |
|
501 |
- "psubw %%mm3, %%mm4 \n\t" |
|
502 |
- |
|
503 |
- "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t" |
|
504 |
- "paddw %%mm3, %%mm5 \n\t" |
|
505 |
- |
|
506 |
- "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" |
|
507 |
- "paddw %%mm6, %%mm7 \n\t" |
|
508 |
- |
|
509 |
- "movq 2*8+%3, %%mm3 \n\t" |
|
510 |
- "psubw %%mm0, %%mm6 \n\t" |
|
511 |
- |
|
512 |
- "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t" |
|
513 |
- "paddw %%mm0, %%mm7 \n\t" |
|
514 |
- |
|
515 |
- "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" |
|
516 |
- "paddw %%mm3, %%mm4 \n\t" |
|
517 |
- |
|
518 |
- "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" |
|
519 |
- "psubw %%mm1, %%mm3 \n\t" |
|
520 |
- |
|
521 |
- "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t" |
|
522 |
- "paddw %%mm1, %%mm4 \n\t" |
|
523 |
- |
|
524 |
- "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t" |
|
525 |
- "paddw %%mm3, %%mm5 \n\t" |
|
526 |
- |
|
527 |
- "movq 3*8+%3, %%mm0 \n\t" |
|
528 |
- "add $8, %%"REG_S" \n\t" |
|
529 |
- |
|
530 |
- "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" |
|
531 |
- "paddw %%mm0, %%mm6 \n\t" |
|
532 |
- |
|
533 |
- "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" |
|
534 |
- "psubw %%mm2, %%mm0 \n\t" |
|
535 |
- |
|
536 |
- "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t" |
|
537 |
- "paddw %%mm2, %%mm6 \n\t" |
|
538 |
- |
|
539 |
- "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" |
|
540 |
- "paddw %%mm0, %%mm7 \n\t" |
|
541 |
- |
|
542 |
- "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" |
|
543 |
- |
|
544 |
- "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" |
|
545 |
- "add $8, %%"REG_D" \n\t" |
|
546 |
- "jmp 4f \n\t" |
|
547 |
- |
|
548 |
- "2: \n\t" |
|
549 |
- //--- non DC2 |
|
550 |
- //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) |
|
551 |
- //psraw mm5, 2 |
|
552 |
- //psraw mm0, 2 |
|
553 |
- //psraw mm6, 2 |
|
554 |
- "movq %%mm5, %%mm3 \n\t" |
|
555 |
- "psubw %%mm1, %%mm5 \n\t" |
|
556 |
- |
|
557 |
- "psllw $1, %%mm5 \n\t" //'z10 |
|
558 |
- "paddw %%mm1, %%mm3 \n\t" //'z13 |
|
559 |
- |
|
560 |
- "movq %%mm0, %%mm2 \n\t" |
|
561 |
- "psubw %%mm6, %%mm0 \n\t" |
|
562 |
- |
|
563 |
- "movq %%mm5, %%mm1 \n\t" |
|
564 |
- "psllw $1, %%mm0 \n\t" //'z12 |
|
565 |
- |
|
566 |
- "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- |
|
567 |
- "paddw %%mm0, %%mm5 \n\t" |
|
568 |
- |
|
569 |
- "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 |
|
570 |
- "paddw %%mm6, %%mm2 \n\t" //'z11 |
|
571 |
- |
|
572 |
- "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" |
|
573 |
- "movq %%mm2, %%mm7 \n\t" |
|
574 |
- |
|
575 |
- //--- |
|
576 |
- "movq 0*8+%3, %%mm4 \n\t" |
|
577 |
- "psubw %%mm3, %%mm2 \n\t" |
|
578 |
- |
|
579 |
- "psllw $1, %%mm2 \n\t" |
|
580 |
- "paddw %%mm3, %%mm7 \n\t" //'t7 |
|
581 |
- |
|
582 |
- "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 |
|
583 |
- "movq %%mm4, %%mm6 \n\t" |
|
584 |
- //paddw mm7, MM_2 |
|
585 |
- "psraw $2, %%mm7 \n\t" |
|
586 |
- |
|
587 |
- "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4\n\t" |
|
588 |
- "psubw %%mm7, %%mm6 \n\t" |
|
589 |
- |
|
590 |
- "movq 1*8+%3, %%mm3 \n\t" |
|
591 |
- "paddw %%mm7, %%mm4 \n\t" |
|
592 |
- |
|
593 |
- "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" |
|
594 |
- "paddw %%mm5, %%mm1 \n\t" //'t12 |
|
595 |
- |
|
596 |
- "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" |
|
597 |
- "psubw %%mm7, %%mm1 \n\t" //'t6 |
|
598 |
- |
|
599 |
- "movq 2*8+%3, %%mm7 \n\t" |
|
600 |
- "psubw %%mm5, %%mm0 \n\t" //'t10 |
|
601 |
- |
|
602 |
- "movq 3*8+%3, %%mm6 \n\t" |
|
603 |
- "movq %%mm3, %%mm5 \n\t" |
|
604 |
- |
|
605 |
- "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3\n\t" |
|
606 |
- "psubw %%mm1, %%mm5 \n\t" |
|
607 |
- |
|
608 |
- "psubw %%mm1, %%mm2 \n\t" //'t5 |
|
609 |
- "paddw %%mm1, %%mm3 \n\t" |
|
610 |
- |
|
611 |
- "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" |
|
612 |
- "movq %%mm7, %%mm4 \n\t" |
|
613 |
- |
|
614 |
- "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7\n\t" |
|
615 |
- "psubw %%mm2, %%mm4 \n\t" |
|
616 |
- |
|
617 |
- "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4\n\t" |
|
618 |
- "paddw %%mm2, %%mm7 \n\t" |
|
619 |
- |
|
620 |
- "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" |
|
621 |
- "paddw %%mm2, %%mm0 \n\t" //'t4 |
|
622 |
- |
|
623 |
- // 't4 't6 't5 - - - - 't7 |
|
624 |
- "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" |
|
625 |
- "movq %%mm6, %%mm1 \n\t" |
|
626 |
- |
|
627 |
- "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6\n\t" |
|
628 |
- "psubw %%mm0, %%mm1 \n\t" |
|
629 |
- |
|
630 |
- "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1\n\t" |
|
631 |
- "paddw %%mm0, %%mm6 \n\t" |
|
632 |
- |
|
633 |
- "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" |
|
634 |
- "add $8, %%"REG_S" \n\t" |
|
635 |
- |
|
636 |
- "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" |
|
637 |
- |
|
638 |
- "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" |
|
639 |
- "add $8, %%"REG_D" \n\t" |
|
640 |
- |
|
641 |
- "4: \n\t" |
|
642 |
- "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t" |
|
643 |
- // |
|
644 |
- "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t" |
|
645 |
- "movq %%mm1, %%mm0 \n\t" |
|
646 |
- |
|
647 |
- "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1\n\t" //t0 |
|
648 |
- "movq %%mm7, %%mm3 \n\t" |
|
649 |
- |
|
650 |
- "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7\n\t" //t3 |
|
651 |
- "movq %%mm1, %%mm5 \n\t" |
|
652 |
- |
|
653 |
- "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t" |
|
654 |
- "psubw %%mm7, %%mm1 \n\t" //t13 |
|
655 |
- |
|
656 |
- "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" |
|
657 |
- "movq %%mm6, %%mm4 \n\t" |
|
658 |
- |
|
659 |
- "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6\n\t" //t1 |
|
660 |
- "paddw %%mm7, %%mm5 \n\t" //t10 |
|
661 |
- |
|
662 |
- "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t2 |
|
663 |
- "movq %%mm6, %%mm7 \n\t" |
|
664 |
- |
|
665 |
- "paddw %%mm2, %%mm6 \n\t" //t11 |
|
666 |
- "psubw %%mm2, %%mm7 \n\t" //t12 |
|
667 |
- |
|
668 |
- "movq %%mm5, %%mm2 \n\t" |
|
669 |
- "paddw %%mm6, %%mm5 \n\t" //d0 |
|
670 |
- // i0 t13 t12 i3 i1 d0 - d4 |
|
671 |
- "psubw %%mm6, %%mm2 \n\t" //d4 |
|
672 |
- "paddw %%mm1, %%mm7 \n\t" |
|
673 |
- |
|
674 |
- "movq 1*8+4*16(%%"REG_d"), %%mm6 \n\t" |
|
675 |
- "psllw $2, %%mm7 \n\t" |
|
676 |
- |
|
677 |
- "psubw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" |
|
678 |
- "psubw %%mm6, %%mm2 \n\t" |
|
679 |
- |
|
680 |
- "paddusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" |
|
681 |
- "paddusw %%mm6, %%mm2 \n\t" |
|
682 |
- |
|
683 |
- "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t" |
|
684 |
- // |
|
685 |
- "paddw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" |
|
686 |
- "paddw %%mm6, %%mm2 \n\t" |
|
687 |
- |
|
688 |
- "psubusw 1*8+0*16(%%"REG_d"), %%mm5 \n\t" |
|
689 |
- "psubusw %%mm6, %%mm2 \n\t" |
|
690 |
- |
|
691 |
-//This func is totally compute-bound, operates at huge speed. So, DC shortcut |
|
692 |
-// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3). |
|
693 |
-//However, typical numbers: nondc - 29%%, dc - 46%%, zero - 25%%. All <> 0 case is very rare. |
|
694 |
- "paddw "MANGLE(MM_2)", %%mm5 \n\t" |
|
695 |
- "movq %%mm2, %%mm6 \n\t" |
|
696 |
- |
|
697 |
- "paddw %%mm5, %%mm2 \n\t" |
|
698 |
- "psubw %%mm6, %%mm5 \n\t" |
|
699 |
- |
|
700 |
- "movq %%mm1, %%mm6 \n\t" |
|
701 |
- "paddw %%mm7, %%mm1 \n\t" //d2 |
|
702 |
- |
|
703 |
- "psubw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" |
|
704 |
- "psubw %%mm7, %%mm6 \n\t" //d6 |
|
705 |
- |
|
706 |
- "movq 1*8+6*16(%%"REG_d"), %%mm7 \n\t" |
|
707 |
- "psraw $2, %%mm5 \n\t" |
|
708 |
- |
|
709 |
- "paddusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" |
|
710 |
- "psubw %%mm7, %%mm6 \n\t" |
|
711 |
- // t7 d2 /t11 t4 t6 - d6 /t10 |
|
712 |
- |
|
713 |
- "paddw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" |
|
714 |
- "paddusw %%mm7, %%mm6 \n\t" |
|
715 |
- |
|
716 |
- "psubusw 1*8+2*16(%%"REG_d"), %%mm1 \n\t" |
|
717 |
- "paddw %%mm7, %%mm6 \n\t" |
|
718 |
- |
|
719 |
- "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3\n\t" |
|
720 |
- "psubusw %%mm7, %%mm6 \n\t" |
|
721 |
- |
|
722 |
- //movq [edi+"DCTSIZE_S"*2*2], mm1 |
|
723 |
- //movq [edi+"DCTSIZE_S"*6*2], mm6 |
|
724 |
- "movq %%mm1, %%mm7 \n\t" |
|
725 |
- "psraw $2, %%mm2 \n\t" |
|
726 |
- |
|
727 |
- "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4\n\t" |
|
728 |
- "psubw %%mm6, %%mm1 \n\t" |
|
729 |
- |
|
730 |
- "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0\n\t" |
|
731 |
- "paddw %%mm7, %%mm6 \n\t" //'t13 |
|
732 |
- |
|
733 |
- "psraw $2, %%mm6 \n\t" //paddw mm6, MM_2 !! --- |
|
734 |
- "movq %%mm2, %%mm7 \n\t" |
|
735 |
- |
|
736 |
- "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t" |
|
737 |
- "paddw %%mm6, %%mm2 \n\t" //'t0 |
|
738 |
- |
|
739 |
- "movq %%mm2, 0*8+%3 \n\t" //! |
|
740 |
- "psubw %%mm6, %%mm7 \n\t" //'t3 |
|
741 |
- |
|
742 |
- "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" |
|
743 |
- "psubw %%mm6, %%mm1 \n\t" //'t12 |
|
744 |
- |
|
745 |
- "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t5 |
|
746 |
- "movq %%mm5, %%mm6 \n\t" |
|
747 |
- |
|
748 |
- "movq %%mm7, 3*8+%3 \n\t" |
|
749 |
- "paddw %%mm2, %%mm3 \n\t" //t10 |
|
750 |
- |
|
751 |
- "paddw %%mm4, %%mm2 \n\t" //t11 |
|
752 |
- "paddw %%mm0, %%mm4 \n\t" //t12 |
|
753 |
- |
|
754 |
- "movq %%mm3, %%mm7 \n\t" |
|
755 |
- "psubw %%mm4, %%mm3 \n\t" |
|
756 |
- |
|
757 |
- "psllw $2, %%mm3 \n\t" |
|
758 |
- "psllw $2, %%mm7 \n\t" //opt for P6 |
|
759 |
- |
|
760 |
- "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" |
|
761 |
- "psllw $2, %%mm4 \n\t" |
|
762 |
- |
|
763 |
- "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t" |
|
764 |
- "psllw $2, %%mm2 \n\t" |
|
765 |
- |
|
766 |
- "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t" |
|
767 |
- "paddw %%mm1, %%mm5 \n\t" //'t1 |
|
768 |
- |
|
769 |
- "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t" |
|
770 |
- "psubw %%mm1, %%mm6 \n\t" //'t2 |
|
771 |
- // t7 't12 't11 t4 t6 - 't13 't10 --- |
|
772 |
- |
|
773 |
- "paddw %%mm3, %%mm7 \n\t" //z2 |
|
774 |
- |
|
775 |
- "movq %%mm5, 1*8+%3 \n\t" |
|
776 |
- "paddw %%mm3, %%mm4 \n\t" //z4 |
|
777 |
- |
|
778 |
- "movq 1*8+3*16(%%"REG_d"), %%mm3 \n\t" |
|
779 |
- "movq %%mm0, %%mm1 \n\t" |
|
780 |
- |
|
781 |
- "movq %%mm6, 2*8+%3 \n\t" |
|
782 |
- "psubw %%mm2, %%mm1 \n\t" //z13 |
|
783 |
- |
|
784 |
-//=== |
|
785 |
- "paddw %%mm2, %%mm0 \n\t" //z11 |
|
786 |
- "movq %%mm1, %%mm5 \n\t" |
|
787 |
- |
|
788 |
- "movq 1*8+5*16(%%"REG_d"), %%mm2 \n\t" |
|
789 |
- "psubw %%mm7, %%mm1 \n\t" //d3 |
|
790 |
- |
|
791 |
- "paddw %%mm7, %%mm5 \n\t" //d5 |
|
792 |
- "psubw %%mm3, %%mm1 \n\t" |
|
793 |
- |
|
794 |
- "movq 1*8+1*16(%%"REG_d"), %%mm7 \n\t" |
|
795 |
- "psubw %%mm2, %%mm5 \n\t" |
|
796 |
- |
|
797 |
- "movq %%mm0, %%mm6 \n\t" |
|
798 |
- "paddw %%mm4, %%mm0 \n\t" //d1 |
|
799 |
- |
|
800 |
- "paddusw %%mm3, %%mm1 \n\t" |
|
801 |
- "psubw %%mm4, %%mm6 \n\t" //d7 |
|
802 |
- |
|
803 |
- // d1 d3 - - - d5 d7 - |
|
804 |
- "movq 1*8+7*16(%%"REG_d"), %%mm4 \n\t" |
|
805 |
- "psubw %%mm7, %%mm0 \n\t" |
|
806 |
- |
|
807 |
- "psubw %%mm4, %%mm6 \n\t" |
|
808 |
- "paddusw %%mm2, %%mm5 \n\t" |
|
809 |
- |
|
810 |
- "paddusw %%mm4, %%mm6 \n\t" |
|
811 |
- "paddw %%mm3, %%mm1 \n\t" |
|
812 |
- |
|
813 |
- "paddw %%mm2, %%mm5 \n\t" |
|
814 |
- "paddw %%mm4, %%mm6 \n\t" |
|
815 |
- |
|
816 |
- "psubusw %%mm3, %%mm1 \n\t" |
|
817 |
- "psubusw %%mm2, %%mm5 \n\t" |
|
818 |
- |
|
819 |
- "psubusw %%mm4, %%mm6 \n\t" |
|
820 |
- "movq %%mm1, %%mm4 \n\t" |
|
821 |
- |
|
822 |
- "por %%mm5, %%mm4 \n\t" |
|
823 |
- "paddusw %%mm7, %%mm0 \n\t" |
|
824 |
- |
|
825 |
- "por %%mm6, %%mm4 \n\t" |
|
826 |
- "paddw %%mm7, %%mm0 \n\t" |
|
827 |
- |
|
828 |
- "packssdw %%mm4, %%mm4 \n\t" |
|
829 |
- "psubusw %%mm7, %%mm0 \n\t" |
|
830 |
- |
|
831 |
- "movd %%mm4, %%"REG_a" \n\t" |
|
832 |
- "or %%"REG_a", %%"REG_a" \n\t" |
|
833 |
- "jnz 3f \n\t" |
|
834 |
- //movq [edi+"DCTSIZE_S"*3*2], mm1 |
|
835 |
- //movq [edi+"DCTSIZE_S"*5*2], mm5 |
|
836 |
- //movq [edi+"DCTSIZE_S"*1*2], mm0 |
|
837 |
- //movq [edi+"DCTSIZE_S"*7*2], mm6 |
|
838 |
- // t4 t5 - - - t6 t7 - |
|
839 |
- //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0 |
|
840 |
-//Typical numbers: nondc - 19%%, dc - 26%%, zero - 55%%. zero case alone isn't worthwhile |
|
841 |
- "movq 0*8+%3, %%mm4 \n\t" |
|
842 |
- "movq %%mm0, %%mm1 \n\t" |
|
843 |
- |
|
844 |
- "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6 |
|
845 |
- "movq %%mm1, %%mm2 \n\t" |
|
846 |
- |
|
847 |
- "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5\n\t" |
|
848 |
- "movq %%mm2, %%mm3 \n\t" |
|
849 |
- |
|
850 |
- "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5 |
|
851 |
- "paddw %%mm4, %%mm5 \n\t" |
|
852 |
- |
|
853 |
- "movq 1*8+%3, %%mm6 \n\t" |
|
854 |
- //paddw mm3, MM_2 |
|
855 |
- "psraw $2, %%mm3 \n\t" //tmp7 |
|
856 |
- |
|
857 |
- "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4 |
|
858 |
- "psubw %%mm3, %%mm4 \n\t" |
|
859 |
- |
|
860 |
- "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7\n\t" |
|
861 |
- "paddw %%mm3, %%mm5 \n\t" |
|
862 |
- |
|
863 |
- "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D")\n\t" |
|
864 |
- "paddw %%mm6, %%mm7 \n\t" |
|
865 |
- |
|
866 |
- "movq 2*8+%3, %%mm3 \n\t" |
|
867 |
- "psubw %%mm0, %%mm6 \n\t" |
|
868 |
- |
|
869 |
- "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4\n\t" |
|
870 |
- "paddw %%mm0, %%mm7 \n\t" |
|
871 |
- |
|
872 |
- "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D")\n\t" |
|
873 |
- "paddw %%mm3, %%mm4 \n\t" |
|
874 |
- |
|
875 |
- "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D")\n\t" |
|
876 |
- "psubw %%mm1, %%mm3 \n\t" |
|
877 |
- |
|
878 |
- "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5\n\t" |
|
879 |
- "paddw %%mm1, %%mm4 \n\t" |
|
880 |
- |
|
881 |
- "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6\n\t" |
|
882 |
- "paddw %%mm3, %%mm5 \n\t" |
|
883 |
- |
|
884 |
- "movq 3*8+%3, %%mm0 \n\t" |
|
885 |
- "add $24, %%"REG_S" \n\t" |
|
886 |
- |
|
887 |
- "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D")\n\t" |
|
888 |
- "paddw %%mm0, %%mm6 \n\t" |
|
889 |
- |
|
890 |
- "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D")\n\t" |
|
891 |
- "psubw %%mm2, %%mm0 \n\t" |
|
892 |
- |
|
893 |
- "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7\n\t" |
|
894 |
- "paddw %%mm2, %%mm6 \n\t" |
|
895 |
- |
|
896 |
- "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D")\n\t" |
|
897 |
- "paddw %%mm0, %%mm7 \n\t" |
|
898 |
- |
|
899 |
- "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D")\n\t" |
|
900 |
- |
|
901 |
- "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D")\n\t" |
|
902 |
- "add $24, %%"REG_D" \n\t" |
|
903 |
- "sub $2, %%"REG_c" \n\t" |
|
904 |
- "jnz 1b \n\t" |
|
905 |
- "jmp 5f \n\t" |
|
906 |
- |
|
907 |
- "3: \n\t" |
|
908 |
- //--- non DC2 |
|
909 |
- //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1 (actually thr1, thr1, thr1-1) |
|
910 |
- //psraw mm5, 2 |
|
911 |
- //psraw mm0, 2 |
|
912 |
- //psraw mm6, 2 |
|
913 |
- "movq %%mm5, %%mm3 \n\t" |
|
914 |
- "psubw %%mm1, %%mm5 \n\t" |
|
915 |
- |
|
916 |
- "psllw $1, %%mm5 \n\t" //'z10 |
|
917 |
- "paddw %%mm1, %%mm3 \n\t" //'z13 |
|
918 |
- |
|
919 |
- "movq %%mm0, %%mm2 \n\t" |
|
920 |
- "psubw %%mm6, %%mm0 \n\t" |
|
921 |
- |
|
922 |
- "movq %%mm5, %%mm1 \n\t" |
|
923 |
- "psllw $1, %%mm0 \n\t" //'z12 |
|
924 |
- |
|
925 |
- "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //- |
|
926 |
- "paddw %%mm0, %%mm5 \n\t" |
|
927 |
- |
|
928 |
- "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5 |
|
929 |
- "paddw %%mm6, %%mm2 \n\t" //'z11 |
|
930 |
- |
|
931 |
- "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t" |
|
932 |
- "movq %%mm2, %%mm7 \n\t" |
|
933 |
- |
|
934 |
- //--- |
|
935 |
- "movq 0*8+%3, %%mm4 \n\t" |
|
936 |
- "psubw %%mm3, %%mm2 \n\t" |
|
937 |
- |
|
938 |
- "psllw $1, %%mm2 \n\t" |
|
939 |
- "paddw %%mm3, %%mm7 \n\t" //'t7 |
|
940 |
- |
|
941 |
- "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11 |
|
942 |
- "movq %%mm4, %%mm6 \n\t" |
|
943 |
- //paddw mm7, MM_2 |
|
944 |
- "psraw $2, %%mm7 \n\t" |
|
945 |
- |
|
946 |
- "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t" |
|
947 |
- "psubw %%mm7, %%mm6 \n\t" |
|
948 |
- |
|
949 |
- "movq 1*8+%3, %%mm3 \n\t" |
|
950 |
- "paddw %%mm7, %%mm4 \n\t" |
|
951 |
- |
|
952 |
- "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t" |
|
953 |
- "paddw %%mm5, %%mm1 \n\t" //'t12 |
|
954 |
- |
|
955 |
- "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" |
|
956 |
- "psubw %%mm7, %%mm1 \n\t" //'t6 |
|
957 |
- |
|
958 |
- "movq 2*8+%3, %%mm7 \n\t" |
|
959 |
- "psubw %%mm5, %%mm0 \n\t" //'t10 |
|
960 |
- |
|
961 |
- "movq 3*8+%3, %%mm6 \n\t" |
|
962 |
- "movq %%mm3, %%mm5 \n\t" |
|
963 |
- |
|
964 |
- "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t" |
|
965 |
- "psubw %%mm1, %%mm5 \n\t" |
|
966 |
- |
|
967 |
- "psubw %%mm1, %%mm2 \n\t" //'t5 |
|
968 |
- "paddw %%mm1, %%mm3 \n\t" |
|
969 |
- |
|
970 |
- "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t" |
|
971 |
- "movq %%mm7, %%mm4 \n\t" |
|
972 |
- |
|
973 |
- "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t" |
|
974 |
- "psubw %%mm2, %%mm4 \n\t" |
|
975 |
- |
|
976 |
- "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t" |
|
977 |
- "paddw %%mm2, %%mm7 \n\t" |
|
978 |
- |
|
979 |
- "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" |
|
980 |
- "paddw %%mm2, %%mm0 \n\t" //'t4 |
|
981 |
- |
|
982 |
- // 't4 't6 't5 - - - - 't7 |
|
983 |
- "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" |
|
984 |
- "movq %%mm6, %%mm1 \n\t" |
|
985 |
- |
|
986 |
- "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t" |
|
987 |
- "psubw %%mm0, %%mm1 \n\t" |
|
988 |
- |
|
989 |
- "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t" |
|
990 |
- "paddw %%mm0, %%mm6 \n\t" |
|
991 |
- |
|
992 |
- "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t" |
|
993 |
- "add $24, %%"REG_S" \n\t" |
|
994 |
- |
|
995 |
- "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t" |
|
996 |
- |
|
997 |
- "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" |
|
998 |
- "add $24, %%"REG_D" \n\t" |
|
999 |
- "sub $2, %%"REG_c" \n\t" |
|
1000 |
- "jnz 1b \n\t" |
|
1001 |
- "5: \n\t" |
|
1002 |
- |
|
1003 |
- : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps) |
|
1004 |
- : "d"(thr_adr) |
|
1005 |
- NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, MM_2,MM_FIX_1_414213562_A, MM_FIX_1_414213562, MM_FIX_0_382683433, |
|
1006 |
- ff_MM_FIX_0_541196100, MM_FIX_1_306562965, MM_FIX_0_847759065) |
|
1007 |
- NAMED_CONSTRAINTS_ADD(MM_FIX_0_566454497, MM_FIX_0_198912367, MM_FIX_2_613125930, MM_FIX_1_847759065, |
|
1008 |
- MM_FIX_1_082392200) |
|
1009 |
- : "%"REG_a |
|
1010 |
- ); |
|
1011 |
-} |
|
1012 |
- |
|
1013 |
-static void row_idct_mmx (int16_t *workspace, int16_t *output_adr, int output_stride, int cnt) |
|
1014 |
-{ |
|
1015 |
- DECLARE_ALIGNED(8, uint64_t, temps)[4]; |
|
1016 |
- |
|
1017 |
- __asm__ volatile( |
|
1018 |
- "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t" |
|
1019 |
- "1: \n\t" |
|
1020 |
- "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0 \n\t" |
|
1021 |
- // |
|
1022 |
- |
|
1023 |
- "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1 \n\t" |
|
1024 |
- "movq %%mm0, %%mm4 \n\t" |
|
1025 |
- |
|
1026 |
- "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t" |
|
1027 |
- "punpcklwd %%mm1, %%mm0 \n\t" |
|
1028 |
- |
|
1029 |
- "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3 \n\t" |
|
1030 |
- "punpckhwd %%mm1, %%mm4 \n\t" |
|
1031 |
- |
|
1032 |
- //transpose 4x4 |
|
1033 |
- "movq %%mm2, %%mm7 \n\t" |
|
1034 |
- "punpcklwd %%mm3, %%mm2 \n\t" |
|
1035 |
- |
|
1036 |
- "movq %%mm0, %%mm6 \n\t" |
|
1037 |
- "punpckldq %%mm2, %%mm0 \n\t" //0 |
|
1038 |
- |
|
1039 |
- "punpckhdq %%mm2, %%mm6 \n\t" //1 |
|
1040 |
- "movq %%mm0, %%mm5 \n\t" |
|
1041 |
- |
|
1042 |
- "punpckhwd %%mm3, %%mm7 \n\t" |
|
1043 |
- "psubw %%mm6, %%mm0 \n\t" |
|
1044 |
- |
|
1045 |
- "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t" |
|
1046 |
- "movq %%mm4, %%mm2 \n\t" |
|
1047 |
- |
|
1048 |
- "punpckldq %%mm7, %%mm4 \n\t" //2 |
|
1049 |
- "paddw %%mm6, %%mm5 \n\t" |
|
1050 |
- |
|
1051 |
- "punpckhdq %%mm7, %%mm2 \n\t" //3 |
|
1052 |
- "movq %%mm4, %%mm1 \n\t" |
|
1053 |
- |
|
1054 |
- "psllw $2, %%mm0 \n\t" |
|
1055 |
- "paddw %%mm2, %%mm4 \n\t" //t10 |
|
1056 |
- |
|
1057 |
- "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t" |
|
1058 |
- "psubw %%mm2, %%mm1 \n\t" //t11 |
|
1059 |
- |
|
1060 |
- "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t" |
|
1061 |
- "psubw %%mm5, %%mm0 \n\t" |
|
1062 |
- |
|
1063 |
- "movq %%mm4, %%mm6 \n\t" |
|
1064 |
- "paddw %%mm5, %%mm4 \n\t" //t0 |
|
1065 |
- |
|
1066 |
- "psubw %%mm5, %%mm6 \n\t" //t3 |
|
1067 |
- "movq %%mm1, %%mm7 \n\t" |
|
1068 |
- |
|
1069 |
- "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t" |
|
1070 |
- "paddw %%mm0, %%mm1 \n\t" //t1 |
|
1071 |
- |
|
1072 |
- "movq %%mm4, 0*8+%3 \n\t" //t0 |
|
1073 |
- "movq %%mm3, %%mm4 \n\t" |
|
1074 |
- |
|
1075 |
- "movq %%mm6, 1*8+%3 \n\t" //t3 |
|
1076 |
- "punpcklwd %%mm2, %%mm3 \n\t" |
|
1077 |
- |
|
1078 |
- //transpose 4x4 |
|
1079 |
- "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t" |
|
1080 |
- "punpckhwd %%mm2, %%mm4 \n\t" |
|
1081 |
- |
|
1082 |
- "movq %%mm5, %%mm2 \n\t" |
|
1083 |
- "punpcklwd %%mm6, %%mm5 \n\t" |
|
1084 |
- |
|
1085 |
- "psubw %%mm0, %%mm7 \n\t" //t2 |
|
1086 |
- "punpckhwd %%mm6, %%mm2 \n\t" |
|
1087 |
- |
|
1088 |
- "movq %%mm3, %%mm0 \n\t" |
|
1089 |
- "punpckldq %%mm5, %%mm3 \n\t" //4 |
|
1090 |
- |
|
1091 |
- "punpckhdq %%mm5, %%mm0 \n\t" //5 |
|
1092 |
- "movq %%mm4, %%mm5 \n\t" |
|
1093 |
- |
|
1094 |
- // |
|
1095 |
- "movq %%mm3, %%mm6 \n\t" |
|
1096 |
- "punpckldq %%mm2, %%mm4 \n\t" //6 |
|
1097 |
- |
|
1098 |
- "psubw %%mm0, %%mm3 \n\t" //z10 |
|
1099 |
- "punpckhdq %%mm2, %%mm5 \n\t" //7 |
|
1100 |
- |
|
1101 |
- "paddw %%mm0, %%mm6 \n\t" //z13 |
|
1102 |
- "movq %%mm4, %%mm2 \n\t" |
|
1103 |
- |
|
1104 |
- "movq %%mm3, %%mm0 \n\t" |
|
1105 |
- "psubw %%mm5, %%mm4 \n\t" //z12 |
|
1106 |
- |
|
1107 |
- "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0\n\t" //- |
|
1108 |
- "paddw %%mm4, %%mm3 \n\t" |
|
1109 |
- |
|
1110 |
- "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3\n\t" //z5 |
|
1111 |
- "paddw %%mm5, %%mm2 \n\t" //z11 > |
|
1112 |
- |
|
1113 |
- "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4\n\t" |
|
1114 |
- "movq %%mm2, %%mm5 \n\t" |
|
1115 |
- |
|
1116 |
- "psubw %%mm6, %%mm2 \n\t" |
|
1117 |
- "paddw %%mm6, %%mm5 \n\t" //t7 |
|
1118 |
- |
|
1119 |
- "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2\n\t" //t11 |
|
1120 |
- "paddw %%mm3, %%mm0 \n\t" //t12 |
|
1121 |
- |
|
1122 |
- "psllw $3, %%mm0 \n\t" |
|
1123 |
- "psubw %%mm3, %%mm4 \n\t" //t10 |
|
1124 |
- |
|
1125 |
- "movq 0*8+%3, %%mm6 \n\t" |
|
1126 |
- "movq %%mm1, %%mm3 \n\t" |
|
1127 |
- |
|
1128 |
- "psllw $3, %%mm4 \n\t" |
|
1129 |
- "psubw %%mm5, %%mm0 \n\t" //t6 |
|
1130 |
- |
|
1131 |
- "psllw $3, %%mm2 \n\t" |
|
1132 |
- "paddw %%mm0, %%mm1 \n\t" //d1 |
|
1133 |
- |
|
1134 |
- "psubw %%mm0, %%mm2 \n\t" //t5 |
|
1135 |
- "psubw %%mm0, %%mm3 \n\t" //d6 |
|
1136 |
- |
|
1137 |
- "paddw %%mm2, %%mm4 \n\t" //t4 |
|
1138 |
- "movq %%mm7, %%mm0 \n\t" |
|
1139 |
- |
|
1140 |
- "paddw %%mm2, %%mm7 \n\t" //d2 |
|
1141 |
- "psubw %%mm2, %%mm0 \n\t" //d5 |
|
1142 |
- |
|
1143 |
- "movq "MANGLE(MM_DESCALE_RND)", %%mm2 \n\t" //4 |
|
1144 |
- "psubw %%mm5, %%mm6 \n\t" //d7 |
|
1145 |
- |
|
1146 |
- "paddw 0*8+%3, %%mm5 \n\t" //d0 |
|
1147 |
- "paddw %%mm2, %%mm1 \n\t" |
|
1148 |
- |
|
1149 |
- "paddw %%mm2, %%mm5 \n\t" |
|
1150 |
- "psraw $3, %%mm1 \n\t" |
|
1151 |
- |
|
1152 |
- "paddw %%mm2, %%mm7 \n\t" |
|
1153 |
- "psraw $3, %%mm5 \n\t" |
|
1154 |
- |
|
1155 |
- "paddw (%%"REG_D"), %%mm5 \n\t" |
|
1156 |
- "psraw $3, %%mm7 \n\t" |
|
1157 |
- |
|
1158 |
- "paddw (%%"REG_D",%%"REG_a"), %%mm1 \n\t" |
|
1159 |
- "paddw %%mm2, %%mm0 \n\t" |
|
1160 |
- |
|
1161 |
- "paddw (%%"REG_D",%%"REG_a",2), %%mm7 \n\t" |
|
1162 |
- "paddw %%mm2, %%mm3 \n\t" |
|
1163 |
- |
|
1164 |
- "movq %%mm5, (%%"REG_D") \n\t" |
|
1165 |
- "paddw %%mm2, %%mm6 \n\t" |
|
1166 |
- |
|
1167 |
- "movq %%mm1, (%%"REG_D",%%"REG_a") \n\t" |
|
1168 |
- "psraw $3, %%mm0 \n\t" |
|
1169 |
- |
|
1170 |
- "movq %%mm7, (%%"REG_D",%%"REG_a",2) \n\t" |
|
1171 |
- "add %%"REG_d", %%"REG_D" \n\t" //3*ls |
|
1172 |
- |
|
1173 |
- "movq 1*8+%3, %%mm5 \n\t" //t3 |
|
1174 |
- "psraw $3, %%mm3 \n\t" |
|
1175 |
- |
|
1176 |
- "paddw (%%"REG_D",%%"REG_a",2), %%mm0 \n\t" |
|
1177 |
- "psubw %%mm4, %%mm5 \n\t" //d3 |
|
1178 |
- |
|
1179 |
- "paddw (%%"REG_D",%%"REG_d"), %%mm3 \n\t" |
|
1180 |
- "psraw $3, %%mm6 \n\t" |
|
1181 |
- |
|
1182 |
- "paddw 1*8+%3, %%mm4 \n\t" //d4 |
|
1183 |
- "paddw %%mm2, %%mm5 \n\t" |
|
1184 |
- |
|
1185 |
- "paddw (%%"REG_D",%%"REG_a",4), %%mm6 \n\t" |
|
1186 |
- "paddw %%mm2, %%mm4 \n\t" |
|
1187 |
- |
|
1188 |
- "movq %%mm0, (%%"REG_D",%%"REG_a",2) \n\t" |
|
1189 |
- "psraw $3, %%mm5 \n\t" |
|
1190 |
- |
|
1191 |
- "paddw (%%"REG_D"), %%mm5 \n\t" |
|
1192 |
- "psraw $3, %%mm4 \n\t" |
|
1193 |
- |
|
1194 |
- "paddw (%%"REG_D",%%"REG_a"), %%mm4 \n\t" |
|
1195 |
- "add $"DCTSIZE_S"*2*4, %%"REG_S" \n\t" //4 rows |
|
1196 |
- |
|
1197 |
- "movq %%mm3, (%%"REG_D",%%"REG_d") \n\t" |
|
1198 |
- "movq %%mm6, (%%"REG_D",%%"REG_a",4) \n\t" |
|
1199 |
- "movq %%mm5, (%%"REG_D") \n\t" |
|
1200 |
- "movq %%mm4, (%%"REG_D",%%"REG_a") \n\t" |
|
1201 |
- |
|
1202 |
- "sub %%"REG_d", %%"REG_D" \n\t" |
|
1203 |
- "add $8, %%"REG_D" \n\t" |
|
1204 |
- "dec %%"REG_c" \n\t" |
|
1205 |
- "jnz 1b \n\t" |
|
1206 |
- |
|
1207 |
- : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps) |
|
1208 |
- : "a"(output_stride * sizeof(short)) |
|
1209 |
- NAMED_CONSTRAINTS_ADD(MM_FIX_1_414213562_A, MM_FIX_2_613125930, MM_FIX_1_847759065, MM_FIX_1_082392200, |
|
1210 |
- MM_FIX_1_414213562,MM_DESCALE_RND) |
|
1211 |
- : "%"REG_d |
|
1212 |
- ); |
|
1213 |
-} |
|
1214 |
- |
|
1215 |
-static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt) |
|
1216 |
-{ |
|
1217 |
- DECLARE_ALIGNED(8, uint64_t, temps)[4]; |
|
1218 |
- |
|
1219 |
- __asm__ volatile( |
|
1220 |
- "lea (%%"REG_a",%%"REG_a",2), %%"REG_d" \n\t" |
|
1221 |
- "6: \n\t" |
|
1222 |
- "movd (%%"REG_S"), %%mm0 \n\t" |
|
1223 |
- "pxor %%mm7, %%mm7 \n\t" |
|
1224 |
- |
|
1225 |
- "movd (%%"REG_S",%%"REG_a"), %%mm1 \n\t" |
|
1226 |
- "punpcklbw %%mm7, %%mm0 \n\t" |
|
1227 |
- |
|
1228 |
- "movd (%%"REG_S",%%"REG_a",2), %%mm2 \n\t" |
|
1229 |
- "punpcklbw %%mm7, %%mm1 \n\t" |
|
1230 |
- |
|
1231 |
- "punpcklbw %%mm7, %%mm2 \n\t" |
|
1232 |
- "add %%"REG_d", %%"REG_S" \n\t" |
|
1233 |
- |
|
1234 |
- "movq %%mm0, %%mm5 \n\t" |
|
1235 |
- // |
|
1236 |
- |
|
1237 |
- "movd (%%"REG_S",%%"REG_a",4), %%mm3 \n\t" //7 ;prefetch! |
|
1238 |
- "movq %%mm1, %%mm6 \n\t" |
|
1239 |
- |
|
1240 |
- "movd (%%"REG_S",%%"REG_d"), %%mm4 \n\t" //6 |
|
1241 |
- "punpcklbw %%mm7, %%mm3 \n\t" |
|
1242 |
- |
|
1243 |
- "psubw %%mm3, %%mm5 \n\t" |
|
1244 |
- "punpcklbw %%mm7, %%mm4 \n\t" |
|
1245 |
- |
|
1246 |
- "paddw %%mm3, %%mm0 \n\t" |
|
1247 |
- "psubw %%mm4, %%mm6 \n\t" |
|
1248 |
- |
|
1249 |
- "movd (%%"REG_S",%%"REG_a",2), %%mm3 \n\t" //5 |
|
1250 |
- "paddw %%mm4, %%mm1 \n\t" |
|
1251 |
- |
|
1252 |
- "movq %%mm5, %3 \n\t" //t7 |
|
1253 |
- "punpcklbw %%mm7, %%mm3 \n\t" |
|
1254 |
- |
|
1255 |
- "movq %%mm6, %4 \n\t" //t6 |
|
1256 |
- "movq %%mm2, %%mm4 \n\t" |
|
1257 |
- |
|
1258 |
- "movd (%%"REG_S"), %%mm5 \n\t" //3 |
|
1259 |
- "paddw %%mm3, %%mm2 \n\t" |
|
1260 |
- |
|
1261 |
- "movd (%%"REG_S",%%"REG_a"), %%mm6 \n\t" //4 |
|
1262 |
- "punpcklbw %%mm7, %%mm5 \n\t" |
|
1263 |
- |
|
1264 |
- "psubw %%mm3, %%mm4 \n\t" |
|
1265 |
- "punpcklbw %%mm7, %%mm6 \n\t" |
|
1266 |
- |
|
1267 |
- "movq %%mm5, %%mm3 \n\t" |
|
1268 |
- "paddw %%mm6, %%mm5 \n\t" //t3 |
|
1269 |
- |
|
1270 |
- "psubw %%mm6, %%mm3 \n\t" //t4 ; t0 t1 t2 t4 t5 t3 - - |
|
1271 |
- "movq %%mm0, %%mm6 \n\t" |
|
1272 |
- |
|
1273 |
- "movq %%mm1, %%mm7 \n\t" |
|
1274 |
- "psubw %%mm5, %%mm0 \n\t" //t13 |
|
1275 |
- |
|
1276 |
- "psubw %%mm2, %%mm1 \n\t" |
|
1277 |
- "paddw %%mm2, %%mm7 \n\t" //t11 |
|
1278 |
- |
|
1279 |
- "paddw %%mm0, %%mm1 \n\t" |
|
1280 |
- "movq %%mm7, %%mm2 \n\t" |
|
1281 |
- |
|
1282 |
- "psllw $2, %%mm1 \n\t" |
|
1283 |
- "paddw %%mm5, %%mm6 \n\t" //t10 |
|
1284 |
- |
|
1285 |
- "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t" |
|
1286 |
- "paddw %%mm6, %%mm7 \n\t" //d2 |
|
1287 |
- |
|
1288 |
- "psubw %%mm2, %%mm6 \n\t" //d3 |
|
1289 |
- "movq %%mm0, %%mm5 \n\t" |
|
1290 |
- |
|
1291 |
- //transpose 4x4 |
|
1292 |
- "movq %%mm7, %%mm2 \n\t" |
|
1293 |
- "punpcklwd %%mm6, %%mm7 \n\t" |
|
1294 |
- |
|
1295 |
- "paddw %%mm1, %%mm0 \n\t" //d0 |
|
1296 |
- "punpckhwd %%mm6, %%mm2 \n\t" |
|
1297 |
- |
|
1298 |
- "psubw %%mm1, %%mm5 \n\t" //d1 |
|
1299 |
- "movq %%mm0, %%mm6 \n\t" |
|
1300 |
- |
|
1301 |
- "movq %4, %%mm1 \n\t" |
|
1302 |
- "punpcklwd %%mm5, %%mm0 \n\t" |
|
1303 |
- |
|
1304 |
- "punpckhwd %%mm5, %%mm6 \n\t" |
|
1305 |
- "movq %%mm0, %%mm5 \n\t" |
|
1306 |
- |
|
1307 |
- "punpckldq %%mm7, %%mm0 \n\t" //0 |
|
1308 |
- "paddw %%mm4, %%mm3 \n\t" |
|
1309 |
- |
|
1310 |
- "punpckhdq %%mm7, %%mm5 \n\t" //1 |
|
1311 |
- "movq %%mm6, %%mm7 \n\t" |
|
1312 |
- |
|
1313 |
- "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D") \n\t" |
|
1314 |
- "punpckldq %%mm2, %%mm6 \n\t" //2 |
|
1315 |
- |
|
1316 |
- "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D") \n\t" |
|
1317 |
- "punpckhdq %%mm2, %%mm7 \n\t" //3 |
|
1318 |
- |
|
1319 |
- "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D") \n\t" |
|
1320 |
- "paddw %%mm1, %%mm4 \n\t" |
|
1321 |
- |
|
1322 |
- "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D") \n\t" |
|
1323 |
- "psllw $2, %%mm3 \n\t" //t10 |
|
1324 |
- |
|
1325 |
- "movq %3, %%mm2 \n\t" |
|
1326 |
- "psllw $2, %%mm4 \n\t" //t11 |
|
1327 |
- |
|
1328 |
- "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3 |
|
1329 |
- "paddw %%mm2, %%mm1 \n\t" |
|
1330 |
- |
|
1331 |
- "psllw $2, %%mm1 \n\t" //t12 |
|
1332 |
- "movq %%mm3, %%mm0 \n\t" |
|
1333 |
- |
|
1334 |
- "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t" |
|
1335 |
- "psubw %%mm1, %%mm3 \n\t" |
|
1336 |
- |
|
1337 |
- "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5 |
|
1338 |
- "movq %%mm2, %%mm5 \n\t" |
|
1339 |
- |
|
1340 |
- "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t" |
|
1341 |
- "psubw %%mm4, %%mm2 \n\t" //z13 |
|
1342 |
- |
|
1343 |
- "paddw %%mm4, %%mm5 \n\t" //z11 |
|
1344 |
- "movq %%mm2, %%mm6 \n\t" |
|
1345 |
- |
|
1346 |
- "paddw %%mm3, %%mm0 \n\t" //z2 |
|
1347 |
- "movq %%mm5, %%mm7 \n\t" |
|
1348 |
- |
|
1349 |
- "paddw %%mm0, %%mm2 \n\t" //d4 |
|
1350 |
- "psubw %%mm0, %%mm6 \n\t" //d5 |
|
1351 |
- |
|
1352 |
- "movq %%mm2, %%mm4 \n\t" |
|
1353 |
- "paddw %%mm3, %%mm1 \n\t" //z4 |
|
1354 |
- |
|
1355 |
- //transpose 4x4 |
|
1356 |
- "punpcklwd %%mm6, %%mm2 \n\t" |
|
1357 |
- "paddw %%mm1, %%mm5 \n\t" //d6 |
|
1358 |
- |
|
1359 |
- "punpckhwd %%mm6, %%mm4 \n\t" |
|
1360 |
- "psubw %%mm1, %%mm7 \n\t" //d7 |
|
1361 |
- |
|
1362 |
- "movq %%mm5, %%mm6 \n\t" |
|
1363 |
- "punpcklwd %%mm7, %%mm5 \n\t" |
|
1364 |
- |
|
1365 |
- "punpckhwd %%mm7, %%mm6 \n\t" |
|
1366 |
- "movq %%mm2, %%mm7 \n\t" |
|
1367 |
- |
|
1368 |
- "punpckldq %%mm5, %%mm2 \n\t" //4 |
|
1369 |
- "sub %%"REG_d", %%"REG_S" \n\t" |
|
1370 |
- |
|
1371 |
- "punpckhdq %%mm5, %%mm7 \n\t" //5 |
|
1372 |
- "movq %%mm4, %%mm5 \n\t" |
|
1373 |
- |
|
1374 |
- "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t" |
|
1375 |
- "punpckldq %%mm6, %%mm4 \n\t" //6 |
|
1376 |
- |
|
1377 |
- "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t" |
|
1378 |
- "punpckhdq %%mm6, %%mm5 \n\t" //7 |
|
1379 |
- |
|
1380 |
- "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t" |
|
1381 |
- "add $4, %%"REG_S" \n\t" |
|
1382 |
- |
|
1383 |
- "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t" |
|
1384 |
- "add $"DCTSIZE_S"*2*4, %%"REG_D" \n\t" //4 rows |
|
1385 |
- "dec %%"REG_c" \n\t" |
|
1386 |
- "jnz 6b \n\t" |
|
1387 |
- |
|
1388 |
- : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps), "=o"(temps[1]) |
|
1389 |
- : "a"(line_size) |
|
1390 |
- NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, ff_MM_FIX_0_541196100, MM_FIX_0_382683433, MM_FIX_1_306562965) |
|
1391 |
- : "%"REG_d); |
|
1392 |
-} |
|
1393 |
-#endif |
|
1394 |
- |
|
1395 |
-av_cold void ff_fspp_init_x86(FSPPContext *s) |
|
1396 |
-{ |
|
1397 |
-#if HAVE_MMX_INLINE |
|
1398 |
- int cpu_flags = av_get_cpu_flags(); |
|
1399 |
- |
|
1400 |
- if (HAVE_MMX_INLINE && cpu_flags & AV_CPU_FLAG_MMX) { |
|
1401 |
- s->store_slice = store_slice_mmx; |
|
1402 |
- s->store_slice2 = store_slice2_mmx; |
|
1403 |
- s->mul_thrmat = mul_thrmat_mmx; |
|
1404 |
- s->column_fidct = column_fidct_mmx; |
|
1405 |
- s->row_idct = row_idct_mmx; |
|
1406 |
- s->row_fdct = row_fdct_mmx; |
|
1407 |
- } |
|
1408 |
-#endif |
|
1409 |
-} |
1410 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,49 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
|
2 |
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru> |
|
3 |
+ * |
|
4 |
+ * This file is part of FFmpeg. |
|
5 |
+ * |
|
6 |
+ * FFmpeg is free software; you can redistribute it and/or modify |
|
7 |
+ * it under the terms of the GNU General Public License as published by |
|
8 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
9 |
+ * (at your option) any later version. |
|
10 |
+ * |
|
11 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
+ * GNU General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU General Public License along |
|
17 |
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc., |
|
18 |
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|
19 |
+ */ |
|
20 |
+ |
|
21 |
+#include "libavutil/attributes.h" |
|
22 |
+#include "libavutil/x86/cpu.h" |
|
23 |
+#include "libavfilter/vf_fspp.h" |
|
24 |
+ |
|
25 |
+void ff_store_slice_mmx(uint8_t *dst, int16_t *src, |
|
26 |
+ ptrdiff_t dst_stride, ptrdiff_t src_stride, |
|
27 |
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); |
|
28 |
+void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, |
|
29 |
+ ptrdiff_t dst_stride, ptrdiff_t src_stride, |
|
30 |
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); |
|
31 |
+void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); |
|
32 |
+void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); |
|
33 |
+void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); |
|
34 |
+void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); |
|
35 |
+ |
|
36 |
+av_cold void ff_fspp_init_x86(FSPPContext *s) |
|
37 |
+{ |
|
38 |
+ int cpu_flags = av_get_cpu_flags(); |
|
39 |
+ |
|
40 |
+ if (EXTERNAL_MMX(cpu_flags)) { |
|
41 |
+ s->store_slice = ff_store_slice_mmx; |
|
42 |
+ s->store_slice2 = ff_store_slice2_mmx; |
|
43 |
+ s->mul_thrmat = ff_mul_thrmat_mmx; |
|
44 |
+ s->column_fidct = ff_column_fidct_mmx; |
|
45 |
+ s->row_idct = ff_row_idct_mmx; |
|
46 |
+ s->row_fdct = ff_row_fdct_mmx; |
|
47 |
+ } |
|
48 |
+} |