Browse code

x86/vf_fspp: port inline asm to yasm

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>

James Almer authored on 2014/12/27 03:37:54
Showing 6 changed files
... ...
@@ -151,11 +151,11 @@ static void store_slice2_c(uint8_t *dst, int16_t *src,
151 151
     }
152 152
 }
153 153
 
154
-static void mul_thrmat_c(FSPPContext *p, int q)
154
+static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
155 155
 {
156 156
     int a;
157 157
     for (a = 0; a < 64; a++)
158
-        ((int16_t *)p->threshold_mtx)[a] = q * ((int16_t *)p->threshold_mtx_noq)[a];//ints faster in C
158
+        thr_adr[a] = q * thr_adr_noq[a];
159 159
 }
160 160
 
161 161
 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
... ...
@@ -220,7 +220,7 @@ static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
220 220
                     t = qp_store[qy + (t >> qpsh)];
221 221
                     t = norm_qscale(t, p->qscale_type);
222 222
 
223
-                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat(p, t);
223
+                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
224 224
                     p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
225 225
                 }
226 226
             p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
... ...
@@ -378,7 +378,7 @@ static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int
378 378
     }
379 379
 }
380 380
 
381
-static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_stride, int cnt)
381
+static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
382 382
 {
383 383
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
384 384
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
... ...
@@ -440,7 +440,7 @@ static void row_idct_c(int16_t *workspace, int16_t *output_adr, int output_strid
440 440
     }
441 441
 }
442 442
 
443
-static void row_fdct_c(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
443
+static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
444 444
 {
445 445
     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
446 446
     int_simd16_t tmp10, tmp11, tmp12, tmp13;
... ...
@@ -582,7 +582,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in)
582 582
     }
583 583
 
584 584
     if (fspp->qp)
585
-        fspp->prev_q = fspp->qp, fspp->mul_thrmat(fspp, fspp->qp);
585
+        fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
586 586
 
587 587
     /* if we are not in a constant user quantizer mode and we don't want to use
588 588
      * the quantizers from the B-frames (B-frames often have a higher QP), we
... ...
@@ -79,16 +79,16 @@ typedef struct FSPPContext {
79 79
                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
80 80
                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
81 81
 
82
-    void (*mul_thrmat)(struct FSPPContext *fspp, int q);
82
+    void (*mul_thrmat)(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
83 83
 
84 84
     void (*column_fidct)(int16_t *thr_adr, int16_t *data,
85 85
                          int16_t *output, int cnt);
86 86
 
87 87
     void (*row_idct)(int16_t *workspace, int16_t *output_adr,
88
-                     int output_stride, int cnt);
88
+                     ptrdiff_t output_stride, int cnt);
89 89
 
90 90
     void (*row_fdct)(int16_t *data, const uint8_t *pixels,
91
-                     int line_size, int cnt);
91
+                     ptrdiff_t line_size, int cnt);
92 92
 
93 93
 } FSPPContext;
94 94
 
... ...
@@ -1,4 +1,4 @@
1
-OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp.o
1
+OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
2 2
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
3 3
 OBJS-$(CONFIG_HQDN3D_FILTER)                 += x86/vf_hqdn3d_init.o
4 4
 OBJS-$(CONFIG_IDET_FILTER)                   += x86/vf_idet_init.o
... ...
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_TINTERLACE_FILTER)             += x86/vf_tinterlace_init.o
10 10
 OBJS-$(CONFIG_VOLUME_FILTER)                 += x86/af_volume_init.o
11 11
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
12 12
 
13
+YASM-OBJS-$(CONFIG_FSPP_FILTER)              += x86/vf_fspp.o
13 14
 YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
14 15
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
15 16
 YASM-OBJS-$(CONFIG_IDET_FILTER)              += x86/vf_idet.o
16 17
new file mode 100644
... ...
@@ -0,0 +1,727 @@
0
+;*****************************************************************************
1
+;* x86-optimized functions for fspp filter
2
+;*
3
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4
+;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
5
+;*
6
+;* This file is part of FFmpeg.
7
+;*
8
+;* FFmpeg is free software; you can redistribute it and/or modify
9
+;* it under the terms of the GNU General Public License as published by
10
+;* the Free Software Foundation; either version 2 of the License, or
11
+;* (at your option) any later version.
12
+;*
13
+;* FFmpeg is distributed in the hope that it will be useful,
14
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
+;* GNU General Public License for more details.
17
+;*
18
+;* You should have received a copy of the GNU General Public License along
19
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
20
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21
+;******************************************************************************
22
+
23
+%include "libavutil/x86/x86util.asm"
24
+
25
+SECTION_RODATA
26
+
27
+pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
28
+              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
29
+              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
30
+             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21 ; 64 dither bytes; store_slice/store_slice2 read them 8 at a time, one group per output row
31
+pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
32
+pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
33
+pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
34
+pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
35
+pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
36
+pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
37
+pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
38
+pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
39
+pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
40
+pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
41
+pw_4:    times 4 dw 4 ; rounding bias added before the psraw-by-3 in row_idct
42
+pw_2:    times 4 dw 2 ; rounding bias added before the psraw-by-2 in COLUMN_FDCT
43
+
44
+SECTION .text
45
+
46
+%define DCTSIZE 8
47
+
48
+INIT_MMX mmx
49
+
50
+;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
51
+;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
52
+;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
53
+%if ARCH_X86_64 ; store_slice: dst = sat8((src + (dither >> log2_scale)) >> (6 - log2_scale)); also zeroes the src words it read and the words 8 rows before them
54
+cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 ; 7 args in registers, 9 GPRs; height and log2_scale arrive in the registers named dither_height and dither
55
+%else
56
+cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 ; only dst and src are auto-loaded on x86-32
57
+%define dst_strideq r2m ; both strides stay in their stack argument slots
58
+%define src_strideq r3m
59
+    mov       widthq, r4m
60
+    mov       dither_heightq, r5m ; height
61
+    mov       ditherq, r6m ; log2_scale
62
+%endif
63
+    add       widthq, 7
64
+    mov       tmpq, src_strideq ; tmp = src_stride (in int16 elements)
65
+    and       widthq, ~7 ; width = (width + 7) & ~7, the inner loop handles 8 pixels per pass
66
+    sub       dst_strideq, widthq ; dst_stride becomes the gap from the end of one row to the start of the next
67
+    movd      m5, ditherq ; log2_scale -> m5, shift count applied to the dither words
68
+    xor       ditherq, -1 ; log2_scale
69
+    mov       tmp2q, tmpq
70
+    add       ditherq, 7 ; ~log2_scale + 7 == 6 - log2_scale
71
+    neg       tmpq
72
+    sub       tmp2q, widthq
73
+    movd      m2, ditherq ; m2 = 6 - log2_scale, the final right-shift count
74
+    add       tmp2q, tmp2q ; tmp2 = 2 * (src_stride - width): row gap of src in bytes
75
+    lea       ditherq, [pb_dither] ; from here on dither is a pointer into pb_dither
76
+    mov       src_strideq, tmp2q ; src_stride now holds that byte gap
77
+    shl       tmpq, 4 ; tmp = -16 * src_stride bytes, i.e. 8 int16 rows before srcq
78
+    lea       dither_heightq, [ditherq+dither_heightq*8] ; end pointer: one 8-byte dither group per output row
79
+
80
+.loop_height:
81
+    movq      m3, [ditherq] ; 8 dither bytes for this row
82
+    movq      m4, m3
83
+    pxor      m7, m7 ; m7 = 0, used for unpacking and for clearing src
84
+    punpcklbw m3, m7 ; m3 = dither bytes 0-3 as words
85
+    punpckhbw m4, m7 ; m4 = dither bytes 4-7 as words
86
+    mov       tmp2q, widthq ; tmp2 = remaining pixels in this row
87
+    psraw     m3, m5
88
+    psraw     m4, m5
89
+
90
+.loop_width:
91
+    movq      [srcq+tmpq], m7 ; clear 4 words located 8 rows before the current position
92
+    movq      m0, [srcq]
93
+    movq      m1, [srcq+8]
94
+    movq      [srcq+tmpq+8], m7
95
+    paddw     m0, m3
96
+    paddw     m1, m4
97
+    movq      [srcq], m7 ; clear the words that were just read
98
+    psraw     m0, m2
99
+    psraw     m1, m2
100
+    movq      [srcq+8], m7
101
+    packuswb  m0, m1 ; 8 words -> 8 bytes with unsigned saturation
102
+    add       srcq, 16
103
+    movq      [dstq], m0
104
+    add       dstq, 8
105
+    sub       tmp2q, 8
106
+    jg .loop_width
107
+
108
+    add       srcq, src_strideq ; skip to the start of the next src row
109
+    add       ditherq, 8 ; next dither group
110
+    add       dstq, dst_strideq
111
+    cmp       ditherq, dither_heightq
112
+    jl .loop_height
113
+    RET
114
+
115
+;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
116
+;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
117
+;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
118
+%if ARCH_X86_64 ; store_slice2: like store_slice, but also adds the src words 16 rows further on; it zeroes only those words and leaves the current row intact
119
+cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 ; height and log2_scale arrive in the registers named dither_height and dither
120
+%else
121
+cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 ; no argument is auto-loaded on x86-32
122
+%define dst_strideq r2m ; both strides stay in their stack argument slots
123
+%define src_strideq r3m
124
+    mov       dstq, dstm
125
+    mov       srcq, srcm
126
+    mov       widthq, r4m
127
+    mov       dither_heightq, r5m ; height
128
+    mov       ditherq, r6m ; log2_scale
129
+%endif
130
+    add       widthq, 7
131
+    mov       tmpq, src_strideq ; tmp = src_stride (in int16 elements)
132
+    and       widthq, ~7 ; width = (width + 7) & ~7
133
+    sub       dst_strideq, widthq ; dst_stride becomes the gap from the end of one row to the start of the next
134
+    movd      m5, ditherq ; log2_scale -> m5, shift count applied to the dither words
135
+    xor       ditherq, -1 ; log2_scale
136
+    mov       tmp2q, tmpq
137
+    add       ditherq, 7 ; ~log2_scale + 7 == 6 - log2_scale
138
+    sub       tmp2q, widthq
139
+    movd      m2, ditherq ; m2 = 6 - log2_scale, the final right-shift count
140
+    add       tmp2q, tmp2q ; tmp2 = 2 * (src_stride - width): row gap of src in bytes
141
+    lea       ditherq, [pb_dither] ; from here on dither is a pointer into pb_dither
142
+    mov       src_strideq, tmp2q ; src_stride now holds that byte gap
143
+    shl       tmpq, 5 ; tmp = +32 * src_stride bytes, i.e. 16 int16 rows after srcq
144
+    lea       dither_heightq, [ditherq+dither_heightq*8] ; end pointer: one 8-byte dither group per output row
145
+
146
+.loop_height:
147
+    movq      m3, [ditherq] ; 8 dither bytes for this row
148
+    movq      m4, m3
149
+    pxor      m7, m7 ; m7 = 0, used for unpacking and for clearing src
150
+    punpcklbw m3, m7 ; m3 = dither bytes 0-3 as words
151
+    punpckhbw m4, m7 ; m4 = dither bytes 4-7 as words
152
+    mov       tmp2q,widthq ; tmp2 = remaining pixels in this row
153
+    psraw     m3, m5
154
+    psraw     m4, m5
155
+
156
+.loop_width:
157
+    movq      m0, [srcq]
158
+    movq      m1, [srcq+8]
159
+    paddw     m0, m3
160
+    paddw     m0, [srcq+tmpq] ; add the matching words from 16 rows further on
161
+    paddw     m1, m4
162
+    movq      m6, [srcq+tmpq+8]
163
+    movq      [srcq+tmpq], m7 ; clear the far words once consumed; [srcq] itself is not cleared
164
+    psraw     m0, m2
165
+    paddw     m1, m6
166
+    movq      [srcq+tmpq+8], m7
167
+    psraw     m1, m2
168
+    packuswb  m0, m1 ; 8 words -> 8 bytes with unsigned saturation
169
+    movq      [dstq], m0
170
+    add       srcq, 16
171
+    add       dstq, 8
172
+    sub       tmp2q, 8
173
+    jg .loop_width
174
+
175
+    add       srcq, src_strideq ; skip to the start of the next src row
176
+    add       ditherq, 8 ; next dither group
177
+    add       dstq, dst_strideq
178
+    cmp       ditherq, dither_heightq
179
+    jl .loop_height
180
+    RET
181
+
182
+;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
183
+cglobal mul_thrmat, 3, 3, 0, thrn, thr, q ; thr[i] = low 16 bits of (q * thrn[i]) for 64 int16 entries (16 qwords), fully unrolled
184
+    movd      m7, qd
185
+    movq      m0, [thrnq]
186
+    punpcklwd m7, m7 ; duplicate the low word of q into words 0-1
187
+    movq      m1, [thrnq+8]
188
+    punpckldq m7, m7 ; m7 = low word of q in all four word lanes
189
+    pmullw    m0, m7 ; pmullw keeps the low 16 bits of each product
190
+    movq      m2, [thrnq+8*2]
191
+    pmullw    m1, m7
192
+    movq      m3, [thrnq+8*3]
193
+    pmullw    m2, m7
194
+    movq      [thrq], m0 ; loads, multiplies and stores of different qwords are interleaved from here on
195
+    movq      m4, [thrnq+8*4]
196
+    pmullw    m3, m7
197
+    movq      [thrq+8], m1
198
+    movq      m5, [thrnq+8*5]
199
+    pmullw    m4, m7
200
+    movq      [thrq+8*2], m2
201
+    movq      m6, [thrnq+8*6]
202
+    pmullw    m5, m7
203
+    movq      [thrq+8*3], m3
204
+    movq      m0, [thrnq+8*7]
205
+    pmullw    m6, m7
206
+    movq      [thrq+8*4], m4
207
+    movq      m1, [thrnq+8*7+8] ; offsets written as 8*7+8*k cover qwords 8-13
208
+    pmullw    m0, m7
209
+    movq      [thrq+8*5], m5
210
+    movq      m2, [thrnq+8*7+8*2]
211
+    pmullw    m1, m7
212
+    movq      [thrq+8*6], m6
213
+    movq      m3, [thrnq+8*7+8*3]
214
+    pmullw    m2, m7
215
+    movq      [thrq+8*7], m0
216
+    movq      m4, [thrnq+8*7+8*4]
217
+    pmullw    m3, m7
218
+    movq      [thrq+8*7+8], m1
219
+    movq      m5, [thrnq+8*7+8*5]
220
+    pmullw    m4, m7
221
+    movq      [thrq+8*7+8*2], m2
222
+    movq      m6, [thrnq+8*7+8*6]
223
+    pmullw    m5, m7
224
+    movq      [thrq+8*7+8*3], m3
225
+    movq      m0, [thrnq+14*8] ; qwords 14 and 15 finish the 128-byte table
226
+    pmullw    m6, m7
227
+    movq      [thrq+8*7+8*4], m4
228
+    movq      m1, [thrnq+14*8+8]
229
+    pmullw    m0, m7
230
+    movq      [thrq+8*7+8*5], m5
231
+    pmullw    m1, m7
232
+    movq      [thrq+8*7+8*6], m6
233
+    movq      [thrq+14*8], m0
234
+    movq      [thrq+14*8+8], m1
235
+    RET
236
+
237
+%macro COLUMN_FDCT 1-3 0, 0 ; %1 = label taken when the general inverse pass is needed, %2 = byte offset into thr (default 0), %3 = extra bytes added to the src/out advance (default 0)
238
+    movq      m1, [srcq+DCTSIZE*0*2] ; each movq covers 4 int16 columns; all 8 rows of src are read at a DCTSIZE*2-byte pitch
239
+    movq      m7, [srcq+DCTSIZE*3*2]
240
+    movq      m0, m1
241
+    paddw     m1, [srcq+DCTSIZE*7*2]
242
+    movq      m3, m7
243
+    paddw     m7, [srcq+DCTSIZE*4*2]
244
+    movq      m5, m1
245
+    movq      m6, [srcq+DCTSIZE*1*2]
246
+    psubw     m1, m7
247
+    movq      m2, [srcq+DCTSIZE*2*2]
248
+    movq      m4, m6
249
+    paddw     m6, [srcq+DCTSIZE*6*2]
250
+    paddw     m5, m7
251
+    paddw     m2, [srcq+DCTSIZE*5*2]
252
+    movq      m7, m6
253
+    paddw     m6, m2
254
+    psubw     m7, m2
255
+    movq      m2, m5
256
+    paddw     m5, m6
257
+    psubw     m2, m6
258
+    paddw     m7, m1
259
+    movq      m6, [thrq+4*16+%2] ; thr is addressed in 16-byte rows; %2 selects the 4-column group within a row
260
+    psllw     m7, 2
261
+    psubw     m5, [thrq+%2] ; NOTE(review): this psubw/paddusw/paddw/psubusw sequence against one thr word recurs for every coefficient below; it looks like a dead-zone threshold - confirm against column_fidct_c
262
+    psubw     m2, m6
263
+    paddusw   m5, [thrq+%2]
264
+    paddusw   m2, m6
265
+    pmulhw    m7, [pw_2D41]
266
+    paddw     m5, [thrq+%2]
267
+    paddw     m2, m6
268
+    psubusw   m5, [thrq+%2]
269
+    psubusw   m2, m6
270
+    paddw     m5, [pw_2] ; rounding bias for the psraw-by-2 results derived from m5 below
271
+    movq      m6, m2
272
+    paddw     m2, m5
273
+    psubw     m5, m6
274
+    movq      m6, m1
275
+    paddw     m1, m7
276
+    psubw     m1, [thrq+2*16+%2]
277
+    psubw     m6, m7
278
+    movq      m7, [thrq+6*16+%2]
279
+    psraw     m5, 2
280
+    paddusw   m1, [thrq+2*16+%2]
281
+    psubw     m6, m7
282
+    paddw     m1, [thrq+2*16+%2]
283
+    paddusw   m6, m7
284
+    psubusw   m1, [thrq+2*16+%2]
285
+    paddw     m6, m7
286
+    psubw     m3, [srcq+DCTSIZE*4*2]
287
+    psubusw   m6, m7
288
+    movq      m7, m1
289
+    psraw     m2, 2
290
+    psubw     m4, [srcq+DCTSIZE*6*2]
291
+    psubw     m1, m6
292
+    psubw     m0, [srcq+DCTSIZE*7*2]
293
+    paddw     m6, m7
294
+    psraw     m6, 2
295
+    movq      m7, m2
296
+    pmulhw    m1, [pw_5A82]
297
+    paddw     m2, m6
298
+    movq      [rsp], m2 ; [rsp], [rsp+8], [rsp+8*2], [rsp+8*3] are scratch slots; the invoking function must reserve 32 bytes of stack
299
+    psubw     m7, m6
300
+    movq      m2, [srcq+DCTSIZE*2*2]
301
+    psubw     m1, m6
302
+    psubw     m2, [srcq+DCTSIZE*5*2]
303
+    movq      m6, m5
304
+    movq      [rsp+8*3], m7
305
+    paddw     m3, m2
306
+    paddw     m2, m4
307
+    paddw     m4, m0
308
+    movq      m7, m3
309
+    psubw     m3, m4
310
+    psllw     m3, 2
311
+    psllw     m7, 2
312
+    pmulhw    m3, [pw_187E]
313
+    psllw     m4, 2
314
+    pmulhw    m7, [pw_22A3]
315
+    psllw     m2, 2
316
+    pmulhw    m4, [pw_539F]
317
+    paddw     m5, m1
318
+    pmulhw    m2, [pw_2D41]
319
+    psubw     m6, m1
320
+    paddw     m7, m3
321
+    movq      [rsp+8], m5
322
+    paddw     m4, m3
323
+    movq      m3, [thrq+3*16+%2]
324
+    movq      m1, m0
325
+    movq      [rsp+8*2], m6
326
+    psubw     m1, m2
327
+    paddw     m0, m2
328
+    movq      m5, m1
329
+    movq      m2, [thrq+5*16+%2]
330
+    psubw     m1, m7
331
+    paddw     m5, m7
332
+    psubw     m1, m3
333
+    movq      m7, [thrq+16+%2]
334
+    psubw     m5, m2
335
+    movq      m6, m0
336
+    paddw     m0, m4
337
+    paddusw   m1, m3
338
+    psubw     m6, m4
339
+    movq      m4, [thrq+7*16+%2]
340
+    psubw     m0, m7
341
+    psubw     m6, m4
342
+    paddusw   m5, m2
343
+    paddusw   m6, m4
344
+    paddw     m1, m3
345
+    paddw     m5, m2
346
+    paddw     m6, m4
347
+    psubusw   m1, m3
348
+    psubusw   m5, m2
349
+    psubusw   m6, m4
350
+    movq      m4, m1 ; m4 = m1 | m5 | m6, the values processed against thr rows 3, 5 and 7
351
+    por       m4, m5
352
+    paddusw   m0, m7
353
+    por       m4, m6
354
+    paddw     m0, m7
355
+    packssdw  m4, m4 ; fold all four word lanes into the low dword so one 32-bit test covers them
356
+    psubusw   m0, m7
357
+    movd      tmpd, m4
358
+    or        tmpd, tmpd
359
+    jnz %1 ; any lane non-zero -> leave the macro for the general path at %1
360
+    movq      m4, [rsp] ; shortcut path: m1, m5 and m6 are all zero, so only m0 and the four stacked values are used
361
+    movq      m1, m0
362
+    pmulhw    m0, [pw_3642]
363
+    movq      m2, m1
364
+    movq      m5, [outq+DCTSIZE*0*2]
365
+    movq      m3, m2
366
+    pmulhw    m1, [pw_2441]
367
+    paddw     m5, m4
368
+    movq      m6, [rsp+8]
369
+    psraw     m3, 2
370
+    pmulhw    m2, [pw_0CBB]
371
+    psubw     m4, m3
372
+    movq      m7, [outq+DCTSIZE*1*2]
373
+    paddw     m5, m3
374
+    movq      [outq+DCTSIZE*7*2], m4 ; row 7 (and row 6 below) overwrite out; rows 0-5 are added to the existing out contents
375
+    paddw     m7, m6
376
+    movq      m3, [rsp+8*2]
377
+    psubw     m6, m0
378
+    movq      m4, [outq+DCTSIZE*2*2]
379
+    paddw     m7, m0
380
+    movq      [outq], m5
381
+    paddw     m4, m3
382
+    movq      [outq+DCTSIZE*6*2], m6
383
+    psubw     m3, m1
384
+    movq      m5, [outq+DCTSIZE*5*2]
385
+    paddw     m4, m1
386
+    movq      m6, [outq+DCTSIZE*3*2]
387
+    paddw     m5, m3
388
+    movq      m0, [rsp+8*3]
389
+    add       srcq, 8+%3 ; advance src by 4 int16 columns plus the optional extra %3 bytes
390
+    movq      [outq+DCTSIZE*1*2], m7
391
+    paddw     m6, m0
392
+    movq      [outq+DCTSIZE*2*2], m4
393
+    psubw     m0, m2
394
+    movq      m7, [outq+DCTSIZE*4*2]
395
+    paddw     m6, m2
396
+    movq      [outq+DCTSIZE*5*2], m5
397
+    paddw     m7, m0
398
+    movq      [outq+DCTSIZE*3*2], m6
399
+    movq      [outq+DCTSIZE*4*2], m7
400
+    add       outq, 8+%3 ; out advances by the same amount as src
401
+%endmacro
402
+
403
+%macro COLUMN_IDCT 0-1 0 ; %1 = extra bytes added to the src/out advance (default 0)
404
+    movq      m3, m5 ; entered via COLUMN_FDCT's jnz: consumes m0, m1, m5, m6 as left there plus the four qwords saved at [rsp]..[rsp+8*3]
405
+    psubw     m5, m1
406
+    psllw     m5, 1
407
+    paddw     m3, m1
408
+    movq      m2, m0
409
+    psubw     m0, m6
410
+    movq      m1, m5
411
+    psllw     m0, 1
412
+    pmulhw    m1, [pw_AC62]
413
+    paddw     m5, m0
414
+    pmulhw    m5, [pw_3B21]
415
+    paddw     m2, m6
416
+    pmulhw    m0, [pw_22A3]
417
+    movq      m7, m2
418
+    movq      m4, [rsp]
419
+    psubw     m2, m3
420
+    psllw     m2, 1
421
+    paddw     m7, m3
422
+    pmulhw    m2, [pw_2D41]
423
+    movq      m6, m4
424
+    psraw     m7, 2
425
+    paddw     m4, [outq] ; rows 0-5 are added to the existing out contents
426
+    psubw     m6, m7
427
+    movq      m3, [rsp+8]
428
+    paddw     m4, m7
429
+    movq      [outq+DCTSIZE*7*2], m6 ; row 7 (and row 6 below) are stored without adding the previous out value
430
+    paddw     m1, m5
431
+    movq      [outq], m4
432
+    psubw     m1, m7
433
+    movq      m7, [rsp+8*2]
434
+    psubw     m0, m5
435
+    movq      m6, [rsp+8*3]
436
+    movq      m5, m3
437
+    paddw     m3, [outq+DCTSIZE*1*2]
438
+    psubw     m5, m1
439
+    psubw     m2, m1
440
+    paddw     m3, m1
441
+    movq      [outq+DCTSIZE*6*2], m5
442
+    movq      m4, m7
443
+    paddw     m7, [outq+DCTSIZE*2*2]
444
+    psubw     m4, m2
445
+    paddw     m4, [outq+DCTSIZE*5*2]
446
+    paddw     m7, m2
447
+    movq      [outq+DCTSIZE*1*2], m3
448
+    paddw     m0, m2
449
+    movq      [outq+DCTSIZE*2*2], m7
450
+    movq      m1, m6
451
+    paddw     m6, [outq+DCTSIZE*4*2]
452
+    psubw     m1, m0
453
+    paddw     m1, [outq+DCTSIZE*3*2]
454
+    paddw     m6, m0
455
+    movq      [outq+DCTSIZE*5*2], m4
456
+    add       srcq, 8+%1 ; same pointer advance as the shortcut path at the end of COLUMN_FDCT
457
+    movq      [outq+DCTSIZE*4*2], m6
458
+    movq      [outq+DCTSIZE*3*2], m1
459
+    add       outq, 8+%1
460
+%endmacro
461
+
462
+;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
463
+cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp ; 32 bytes of stack back the [rsp]..[rsp+8*3] scratch slots used by the two macros; data/output are named src/out here
464
+.fdct1:
465
+    COLUMN_FDCT .idct1 ; first 4-column group (thr offset 0); falls through on its shortcut path
466
+    jmp .fdct2
467
+
468
+.idct1:
469
+    COLUMN_IDCT ; general path for the first group, then continues into .fdct2
470
+
471
+.fdct2:
472
+    COLUMN_FDCT .idct2, 8, 16 ; second 4-column group: thr offset 8 bytes, src/out advance 8+16 bytes
473
+    sub    cntd, 2 ; each pass over both groups consumes 2 from cnt
474
+    jnz .fdct1
475
+    RET
476
+
477
+.idct2:
478
+    COLUMN_IDCT 16 ; general path for the second group, with the same 8+16 advance
479
+    sub    cntd, 2
480
+    jnz .fdct1
481
+    RET
482
+
483
+;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
484
+cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3 ; workspace/output_adr are named src/dst; 16 bytes of stack back [rsp] and [rsp+8]; stride3 is a temporary
485
+    add       strideq, strideq ; stride *= 2 (presumably int16 elements -> bytes; confirm against the caller)
486
+    lea       stride3q, [strideq+strideq*2] ; stride3 = 3 * the doubled stride
487
+.loop:
488
+    movq      m0, [srcq+DCTSIZE*0*2] ; each pass reads 4 workspace rows of 8 words (low halves here, +8 halves below)
489
+    movq      m1, [srcq+DCTSIZE*1*2]
490
+    movq      m4, m0
491
+    movq      m2, [srcq+DCTSIZE*2*2]
492
+    punpcklwd m0, m1
493
+    movq      m3, [srcq+DCTSIZE*3*2]
494
+    punpckhwd m4, m1
495
+    movq      m7, m2
496
+    punpcklwd m2, m3
497
+    movq      m6, m0
498
+    punpckldq m0, m2
499
+    punpckhdq m6, m2
500
+    movq      m5, m0
501
+    punpckhwd m7, m3
502
+    psubw     m0, m6
503
+    pmulhw    m0, [pw_5A82]
504
+    movq      m2, m4
505
+    punpckldq m4, m7
506
+    paddw     m5, m6
507
+    punpckhdq m2, m7
508
+    movq      m1, m4
509
+    psllw     m0, 2
510
+    paddw     m4, m2
511
+    movq      m3, [srcq+DCTSIZE*0*2+8]
512
+    psubw     m1, m2
513
+    movq      m2, [srcq+DCTSIZE*1*2+8]
514
+    psubw     m0, m5
515
+    movq      m6, m4
516
+    paddw     m4, m5
517
+    psubw     m6, m5
518
+    movq      m7, m1
519
+    movq      m5, [srcq+DCTSIZE*2*2+8]
520
+    paddw     m1, m0
521
+    movq      [rsp], m4 ; spill: all eight mm registers are in use
522
+    movq      m4, m3
523
+    movq      [rsp+8], m6
524
+    punpcklwd m3, m2
525
+    movq      m6, [srcq+DCTSIZE*3*2+8]
526
+    punpckhwd m4, m2
527
+    movq      m2, m5
528
+    punpcklwd m5, m6
529
+    psubw     m7, m0
530
+    punpckhwd m2, m6
531
+    movq      m0, m3
532
+    punpckldq m3, m5
533
+    punpckhdq m0, m5
534
+    movq      m5, m4
535
+    movq      m6, m3
536
+    punpckldq m4, m2
537
+    psubw     m3, m0
538
+    punpckhdq m5, m2
539
+    paddw     m6, m0
540
+    movq      m2, m4
541
+    movq      m0, m3
542
+    psubw     m4, m5
543
+    pmulhw    m0, [pw_AC62]
544
+    paddw     m3, m4
545
+    pmulhw    m3, [pw_3B21]
546
+    paddw     m2, m5
547
+    pmulhw    m4, [pw_22A3]
548
+    movq      m5, m2
549
+    psubw     m2, m6
550
+    paddw     m5, m6
551
+    pmulhw    m2, [pw_2D41]
552
+    paddw     m0, m3
553
+    psllw     m0, 3
554
+    psubw     m4, m3
555
+    movq      m6, [rsp]
556
+    movq      m3, m1
557
+    psllw     m4, 3
558
+    psubw     m0, m5
559
+    psllw     m2, 3
560
+    paddw     m1, m0
561
+    psubw     m2, m0
562
+    psubw     m3, m0
563
+    paddw     m4, m2
564
+    movq      m0, m7
565
+    paddw     m7, m2
566
+    psubw     m0, m2
567
+    movq      m2, [pw_4] ; rounding bias added to every result before its psraw by 3
568
+    psubw     m6, m5
569
+    paddw     m5, [rsp]
570
+    paddw     m1, m2
571
+    paddw     m5, m2
572
+    psraw     m1, 3
573
+    paddw     m7, m2
574
+    psraw     m5, 3
575
+    paddw     m5, [dstq] ; results are added to what dst already holds, then stored back
576
+    psraw     m7, 3
577
+    paddw     m1, [dstq+strideq*1]
578
+    paddw     m0, m2
579
+    paddw     m7, [dstq+strideq*2]
580
+    paddw     m3, m2
581
+    movq      [dstq], m5
582
+    paddw     m6, m2
583
+    movq      [dstq+strideq*1], m1
584
+    psraw     m0, 3
585
+    movq      [dstq+strideq*2], m7
586
+    add       dstq, stride3q ; dst temporarily moves 3 rows down so rows 3-7 can be reached with scales 1, 2, 4 and stride3
587
+    movq      m5, [rsp+8]
588
+    psraw     m3, 3
589
+    paddw     m0, [dstq+strideq*2]
590
+    psubw     m5, m4
591
+    paddw     m3, [dstq+stride3q*1]
592
+    psraw     m6, 3
593
+    paddw     m4, [rsp+8]
594
+    paddw     m5, m2
595
+    paddw     m6, [dstq+strideq*4]
596
+    paddw     m4, m2
597
+    movq      [dstq+strideq*2], m0
598
+    psraw     m5, 3
599
+    paddw     m5, [dstq]
600
+    psraw     m4, 3
601
+    paddw     m4, [dstq+strideq*1]
602
+    add       srcq, DCTSIZE*2*4 ; next 4 workspace rows (64 bytes)
603
+    movq      [dstq+stride3q*1], m3
604
+    movq      [dstq+strideq*4], m6
605
+    movq      [dstq], m5
606
+    movq      [dstq+strideq*1], m4
607
+    sub       dstq, stride3q ; back to row 0 ...
608
+    add       dstq, 8 ; ... and 4 int16 columns to the right
609
+    dec       r3d ; r3d is cntd (the 4th argument)
610
+    jnz .loop
611
+    RET
612
+
613
+;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
614
+cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3 ; NOTE: the register named src holds the first argument (data) and is the store target; 16 bytes of stack back [rsp] and [rsp+8]
615
+    lea       stride3q, [strideq+strideq*2] ; stride3 = 3 * line_size
616
+.loop:
617
+    movd      m0, [pixq] ; 4 pixels are read from each of 8 consecutive pixel rows
618
+    pxor      m7, m7 ; zero register for byte -> word unpacking
619
+    movd      m1, [pixq+strideq*1]
620
+    punpcklbw m0, m7
621
+    movd      m2, [pixq+strideq*2]
622
+    punpcklbw m1, m7
623
+    punpcklbw m2, m7
624
+    add       pixq,stride3q ; pix temporarily moves 3 rows down so rows 3-7 can be reached with scales 1, 2, 4 and stride3
625
+    movq      m5, m0
626
+    movd      m3, [pixq+strideq*4]
627
+    movq      m6, m1
628
+    movd      m4, [pixq+stride3q*1]
629
+    punpcklbw m3, m7
630
+    psubw     m5, m3
631
+    punpcklbw m4, m7
632
+    paddw     m0, m3
633
+    psubw     m6, m4
634
+    movd      m3, [pixq+strideq*2]
635
+    paddw     m1, m4
636
+    movq      [rsp], m5 ; spill two differences for the second half of the pass
637
+    punpcklbw m3, m7
638
+    movq      [rsp+8], m6
639
+    movq      m4, m2
640
+    movd      m5, [pixq]
641
+    paddw     m2, m3
642
+    movd      m6, [pixq+strideq*1]
643
+    punpcklbw m5, m7
644
+    psubw     m4, m3
645
+    punpcklbw m6, m7
646
+    movq      m3, m5
647
+    paddw     m5, m6
648
+    psubw     m3, m6
649
+    movq      m6, m0
650
+    movq      m7, m1 ; m7 stops being the zero register from here on
651
+    psubw     m0, m5
652
+    psubw     m1, m2
653
+    paddw     m7, m2
654
+    paddw     m1, m0
655
+    movq      m2, m7
656
+    psllw     m1, 2
657
+    paddw     m6, m5
658
+    pmulhw    m1, [pw_2D41]
659
+    paddw     m7, m6
660
+    psubw     m6, m2
661
+    movq      m5, m0
662
+    movq      m2, m7
663
+    punpcklwd m7, m6
664
+    paddw     m0, m1
665
+    punpckhwd m2, m6
666
+    psubw     m5, m1
667
+    movq      m6, m0
668
+    movq      m1, [rsp+8]
669
+    punpcklwd m0, m5
670
+    punpckhwd m6, m5
671
+    movq      m5, m0
672
+    punpckldq m0, m7
673
+    paddw     m3, m4
674
+    punpckhdq m5, m7
675
+    movq      m7, m6
676
+    movq      [srcq+DCTSIZE*0*2], m0 ; first 4 words of each of the 4 output rows
677
+    punpckldq m6, m2
678
+    movq      [srcq+DCTSIZE*1*2], m5
679
+    punpckhdq m7, m2
680
+    movq      [srcq+DCTSIZE*2*2], m6
681
+    paddw     m4, m1
682
+    movq      [srcq+DCTSIZE*3*2], m7
683
+    psllw     m3, 2
684
+    movq      m2, [rsp]
685
+    psllw     m4, 2
686
+    pmulhw    m4, [pw_2D41]
687
+    paddw     m1, m2
688
+    psllw     m1, 2
689
+    movq      m0, m3
690
+    pmulhw    m0, [pw_22A3]
691
+    psubw     m3, m1
692
+    pmulhw    m3, [pw_187E]
693
+    movq      m5, m2
694
+    pmulhw    m1, [pw_539F]
695
+    psubw     m2, m4
696
+    paddw     m5, m4
697
+    movq      m6, m2
698
+    paddw     m0, m3
699
+    movq      m7, m5
700
+    paddw     m2, m0
701
+    psubw     m6, m0
702
+    movq      m4, m2
703
+    paddw     m1, m3
704
+    punpcklwd m2, m6
705
+    paddw     m5, m1
706
+    punpckhwd m4, m6
707
+    psubw     m7, m1
708
+    movq      m6, m5
709
+    punpcklwd m5, m7
710
+    punpckhwd m6, m7
711
+    movq      m7, m2
712
+    punpckldq m2, m5
713
+    sub       pixq, stride3q ; undo the 3-row offset applied above
714
+    punpckhdq m7, m5
715
+    movq      m5, m4
716
+    movq      [srcq+DCTSIZE*0*2+8], m2 ; last 4 words of each of the 4 output rows
717
+    punpckldq m4, m6
718
+    movq      [srcq+DCTSIZE*1*2+8], m7
719
+    punpckhdq m5, m6
720
+    movq      [srcq+DCTSIZE*2*2+8], m4
721
+    add       pixq, 4 ; next 4 pixel columns
722
+    movq      [srcq+DCTSIZE*3*2+8], m5
723
+    add       srcq, DCTSIZE*4*2 ; next 4 output rows (64 bytes)
724
+    dec       cntd
725
+    jnz .loop
726
+    RET
0 727
deleted file mode 100644
... ...
@@ -1,1409 +0,0 @@
1
-/*
2
- * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3
- * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
4
- *
5
- * This file is part of FFmpeg.
6
- *
7
- * FFmpeg is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU General Public License as published by
9
- * the Free Software Foundation; either version 2 of the License, or
10
- * (at your option) any later version.
11
- *
12
- * FFmpeg is distributed in the hope that it will be useful,
13
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
- * GNU General Public License for more details.
16
- *
17
- * You should have received a copy of the GNU General Public License along
18
- * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
19
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20
- */
21
-
22
-#include "libavutil/attributes.h"
23
-#include "libavutil/cpu.h"
24
-#include "libavutil/mem.h"
25
-#include "libavutil/x86/asm.h"
26
-#include "libavfilter/vf_fspp.h"
27
-
28
-#if HAVE_MMX_INLINE
29
-DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = { // 8x8 table holding each value 0..63 once; store_slice_mmx and store_slice2_mmx load one 8-byte row per output line and add it (after a right shift) to the int16 sums
30
-    {  0,  48,  12,  60,   3,  51,  15,  63, },
31
-    { 32,  16,  44,  28,  35,  19,  47,  31, },
32
-    {  8,  56,   4,  52,  11,  59,   7,  55, },
33
-    { 40,  24,  36,  20,  43,  27,  39,  23, },
34
-    {  2,  50,  14,  62,   1,  49,  13,  61, },
35
-    { 34,  18,  46,  30,  33,  17,  45,  29, },
36
-    { 10,  58,   6,  54,   9,  57,   5,  53, },
37
-    { 42,  26,  38,  22,  41,  25,  37,  21, },
38
-};
39
-
40
-// Converts one slice of int16 sums to 8-bit pixels: loads the current src rows, adds the dither row, shifts right, saturates to uint8 and stores to dst; it zeroes both the src words it just read and the words 8*src_stride elements before them.
40
-static void store_slice_mmx(uint8_t *dst, int16_t *src,
41
-                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
42
-                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
43
-{
44
-    const uint8_t *od = &dither[0][0]; // first dither row
45
-    const uint8_t *end = &dither[height][0]; // the row loop stops once the dither pointer reaches this; dither has 8 rows, so this presumably expects height <= 8 -- TODO confirm against callers
46
-    width = (width + 7) & ~7; // round up to a multiple of 8: the inner loop always handles 8 pixels per pass
47
-    dst_stride -= width; // now the byte gap between the end of one written dst row and the start of the next
48
-
49
-    __asm__ volatile(
50
-        "mov %5 , %%"REG_d"                \n\t" // REG_d = log2_scale
51
-        "mov %6 , %%"REG_S"                \n\t" // REG_S = src
52
-        "mov %7 , %%"REG_D"                \n\t" // REG_D = dst
53
-        "mov %1 , %%"REG_a"                \n\t" // REG_a = src_stride
54
-        "movd %%"REG_d" , %%mm5            \n\t" // mm5 = log2_scale, shift count applied to the dither values
55
-        "xor $-1 , %%"REG_d"               \n\t" // REG_d = ~log2_scale
56
-        "mov %%"REG_a" , %%"REG_c"         \n\t"
57
-        "add $7 , %%"REG_d"                \n\t" // REG_d = 6 - log2_scale
58
-        "neg %%"REG_a"                     \n\t"
59
-        "sub %0 , %%"REG_c"                \n\t" // REG_c = src_stride - width
60
-        "add %%"REG_c" , %%"REG_c"         \n\t" // doubled: byte gap between src rows (int16 elements)
61
-        "movd %%"REG_d" , %%mm2            \n\t" // mm2 = 6 - log2_scale, final shift count
62
-        "mov %%"REG_c" , %1                \n\t" // NOTE(review): stores the src row gap into operand %1 (src_stride), which is declared as an input only
63
-        "mov %2 , %%"REG_d"                \n\t" // REG_d = od, pointer to the current dither row
64
-        "shl $4 , %%"REG_a"                \n\t" // REG_a = -16*src_stride bytes, i.e. 8 rows of int16 before the current position
65
-
66
-        "2:                                \n\t" // once per output row
67
-        "movq (%%"REG_d") , %%mm3          \n\t" // load the 8 dither bytes for this row
68
-        "movq %%mm3 , %%mm4                \n\t"
69
-        "pxor %%mm7 , %%mm7                \n\t" // mm7 = 0, used for unpacking and for clearing src
70
-        "punpcklbw %%mm7 , %%mm3           \n\t" // dither bytes 0-3 widened to words
71
-        "punpckhbw %%mm7 , %%mm4           \n\t" // dither bytes 4-7 widened to words
72
-        "mov %0 , %%"REG_c"                \n\t" // REG_c = width, counts down by 8
73
-        "psraw %%mm5 , %%mm3               \n\t" // dither >> log2_scale
74
-        "psraw %%mm5 , %%mm4               \n\t"
75
-        "1:                                \n\t" // once per 8 pixels
76
-        "movq %%mm7, (%%"REG_S",%%"REG_a") \n\t" // zero 4 words in the row 8 lines earlier
77
-        "movq (%%"REG_S") , %%mm0          \n\t" // load 8 int16 sums
78
-        "movq 8(%%"REG_S"), %%mm1          \n\t"
79
-
80
-        "movq %%mm7, 8(%%"REG_S",%%"REG_a")\n\t"
81
-        "paddw %%mm3, %%mm0                \n\t" // add dither
82
-        "paddw %%mm4, %%mm1                \n\t"
83
-
84
-        "movq %%mm7, (%%"REG_S")           \n\t" // zero the words that were just read
85
-        "psraw %%mm2, %%mm0                \n\t" // >> (6 - log2_scale)
86
-        "psraw %%mm2, %%mm1                \n\t"
87
-
88
-        "movq %%mm7, 8(%%"REG_S")          \n\t"
89
-        "packuswb %%mm1, %%mm0             \n\t" // pack to 8 bytes with unsigned saturation
90
-        "add $16, %%"REG_S"                \n\t"
91
-
92
-        "movq %%mm0, (%%"REG_D")           \n\t" // store 8 output pixels
93
-        "add $8, %%"REG_D"                 \n\t"
94
-        "sub $8, %%"REG_c"                 \n\t"
95
-        "jg 1b                             \n\t"
96
-        "add %1, %%"REG_S"                 \n\t" // skip the src row gap computed above
97
-        "add $8, %%"REG_d"                 \n\t" // advance to the next dither row
98
-        "add %3, %%"REG_D"                 \n\t" // skip the dst row gap (dst_stride - width)
99
-        "cmp %4, %%"REG_d"                 \n\t"
100
-        "jl 2b                             \n\t" // repeat while the dither pointer is below end
101
-
102
-        :
103
-        : "m" (width),      "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
104
-          "m" (log2_scale), "m" (src),        "m" (dst)                                     //input
105
-        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D // NOTE(review): no "memory" clobber and no MMX registers are listed although the asm writes src/dst memory and mm0-mm5, mm7
106
-        );
107
-}
109
-
110
-// Converts to 8-bit pixels the sum of two int16 slices: the current src rows plus the rows 16*src_stride elements after them; adds the dither row, shifts right, saturates to uint8 and stores to dst. Only the second set of rows is zeroed afterwards.
110
-static void store_slice2_mmx(uint8_t *dst, int16_t *src,
111
-                             ptrdiff_t dst_stride, ptrdiff_t src_stride,
112
-                             ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
113
-{
114
-    const uint8_t *od = &dither[0][0]; // first dither row
115
-    const uint8_t *end = &dither[height][0]; // the row loop stops once the dither pointer reaches this; dither has 8 rows, so this presumably expects height <= 8 -- TODO confirm against callers
116
-    width = (width + 7) & ~7; // round up to a multiple of 8: the inner loop always handles 8 pixels per pass
117
-    dst_stride -= width; // now the byte gap between the end of one written dst row and the start of the next
118
-
119
-    __asm__ volatile(
120
-        "mov %5, %%"REG_d"                \n\t" // REG_d = log2_scale
121
-        "mov %6, %%"REG_S"                \n\t" // REG_S = src
122
-        "mov %7, %%"REG_D"                \n\t" // REG_D = dst
123
-        "mov %1, %%"REG_a"                \n\t" // REG_a = src_stride
124
-        "movd %%"REG_d", %%mm5            \n\t" // mm5 = log2_scale, shift count applied to the dither values
125
-        "xor $-1, %%"REG_d"               \n\t" // REG_d = ~log2_scale
126
-        "mov %%"REG_a", %%"REG_c"         \n\t"
127
-        "add $7, %%"REG_d"                \n\t" // REG_d = 6 - log2_scale
128
-        "sub %0, %%"REG_c"                \n\t" // REG_c = src_stride - width
129
-        "add %%"REG_c", %%"REG_c"         \n\t" // doubled: byte gap between src rows (int16 elements)
130
-        "movd %%"REG_d", %%mm2            \n\t" // mm2 = 6 - log2_scale, final shift count
131
-        "mov %%"REG_c", %1                \n\t" // NOTE(review): stores the src row gap into operand %1 (src_stride), which is declared as an input only
132
-        "mov %2, %%"REG_d"                \n\t" // REG_d = od, pointer to the current dither row
133
-        "shl $5, %%"REG_a"                \n\t" // REG_a = 32*src_stride bytes, i.e. 16 rows of int16 after the current position
134
-
135
-        "2:                               \n\t" // once per output row
136
-        "movq (%%"REG_d"), %%mm3          \n\t" // load the 8 dither bytes for this row
137
-        "movq %%mm3, %%mm4                \n\t"
138
-        "pxor %%mm7, %%mm7                \n\t" // mm7 = 0, used for unpacking and for clearing src
139
-        "punpcklbw %%mm7, %%mm3           \n\t" // dither bytes 0-3 widened to words
140
-        "punpckhbw %%mm7, %%mm4           \n\t" // dither bytes 4-7 widened to words
141
-        "mov %0, %%"REG_c"                \n\t" // REG_c = width, counts down by 8
142
-        "psraw %%mm5, %%mm3               \n\t" // dither >> log2_scale
143
-        "psraw %%mm5, %%mm4               \n\t"
144
-        "1:                               \n\t" // once per 8 pixels
145
-        "movq (%%"REG_S"), %%mm0          \n\t" // load 8 int16 sums from the first slice
146
-        "movq 8(%%"REG_S"), %%mm1         \n\t"
147
-        "paddw %%mm3, %%mm0               \n\t" // add dither
148
-
149
-        "paddw (%%"REG_S",%%"REG_a"),%%mm0\n\t" // add the matching words of the second slice
150
-        "paddw %%mm4, %%mm1               \n\t"
151
-        "movq 8(%%"REG_S",%%"REG_a"),%%mm6\n\t"
152
-
153
-        "movq %%mm7, (%%"REG_S",%%"REG_a")\n\t" // zero the second-slice words that were just read
154
-        "psraw %%mm2, %%mm0               \n\t" // >> (6 - log2_scale)
155
-        "paddw %%mm6, %%mm1               \n\t"
156
-
157
-        "movq %%mm7,8(%%"REG_S",%%"REG_a")\n\t"
158
-        "psraw %%mm2, %%mm1               \n\t"
159
-        "packuswb %%mm1, %%mm0            \n\t" // pack to 8 bytes with unsigned saturation
160
-
161
-        "movq %%mm0, (%%"REG_D")          \n\t" // store 8 output pixels
162
-        "add $16, %%"REG_S"               \n\t"
163
-        "add $8, %%"REG_D"                \n\t"
164
-        "sub $8, %%"REG_c"                \n\t"
165
-        "jg 1b                            \n\t"
166
-        "add %1, %%"REG_S"                \n\t" // skip the src row gap computed above
167
-        "add $8, %%"REG_d"                \n\t" // advance to the next dither row
168
-        "add %3, %%"REG_D"                \n\t" // skip the dst row gap (dst_stride - width)
169
-        "cmp %4, %%"REG_d"                \n\t"
170
-        "jl 2b                            \n\t" // repeat while the dither pointer is below end
171
-
172
-        :
173
-        : "m" (width),      "m" (src_stride), "erm" (od), "m" (dst_stride), "erm" (end),
174
-          "m" (log2_scale), "m" (src),        "m" (dst)                                     //input
175
-        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_D, "%"REG_S // NOTE(review): no "memory" clobber and no MMX registers are listed although the asm writes src/dst memory and mm0-mm7
176
-        );
177
-}
179
-
180
-static void mul_thrmat_mmx(FSPPContext *p, int q) // multiplies 64 int16 values starting at p->threshold_mtx_noq by the low 16 bits of q and stores the low 16 bits of each product 128 bytes further on
181
-{
182
-    uint64_t *adr = &p->threshold_mtx_noq[0]; // source table; bound to both REG_S and REG_D by the constraints below
183
-
184
-    __asm__ volatile(
185
-        "movd %0, %%mm7                   \n\t" // mm7 = q in the low dword
186
-        "add $8*8*2, %%"REG_D"            \n\t" // destination = adr + 128 bytes; presumably p->threshold_mtx directly follows threshold_mtx_noq in FSPPContext -- TODO confirm
187
-        "movq 0*8(%%"REG_S"), %%mm0       \n\t"
188
-        "punpcklwd %%mm7, %%mm7           \n\t" // these two unpacks replicate the low word of q into all four words of mm7
189
-        "movq 1*8(%%"REG_S"), %%mm1       \n\t"
190
-        "punpckldq %%mm7, %%mm7           \n\t"
191
-        "pmullw %%mm7, %%mm0              \n\t" // low 16 bits of each word product
192
-
193
-        "movq 2*8(%%"REG_S"), %%mm2       \n\t" // loads, multiplies and stores are interleaved from here on; qwords 0-6 first
194
-        "pmullw %%mm7, %%mm1              \n\t"
195
-
196
-        "movq 3*8(%%"REG_S"), %%mm3       \n\t"
197
-        "pmullw %%mm7, %%mm2              \n\t"
198
-
199
-        "movq %%mm0, 0*8(%%"REG_D")       \n\t"
200
-        "movq 4*8(%%"REG_S"), %%mm4       \n\t"
201
-        "pmullw %%mm7, %%mm3              \n\t"
202
-
203
-        "movq %%mm1, 1*8(%%"REG_D")       \n\t"
204
-        "movq 5*8(%%"REG_S"), %%mm5       \n\t"
205
-        "pmullw %%mm7, %%mm4              \n\t"
206
-
207
-        "movq %%mm2, 2*8(%%"REG_D")       \n\t"
208
-        "movq 6*8(%%"REG_S"), %%mm6       \n\t"
209
-        "pmullw %%mm7, %%mm5              \n\t"
210
-
211
-        "movq %%mm3, 3*8(%%"REG_D")       \n\t"
212
-        "movq 7*8+0*8(%%"REG_S"), %%mm0   \n\t" // second group: qwords 7-13
213
-        "pmullw %%mm7, %%mm6              \n\t"
214
-
215
-        "movq %%mm4, 4*8(%%"REG_D")       \n\t"
216
-        "movq 7*8+1*8(%%"REG_S"), %%mm1   \n\t"
217
-        "pmullw %%mm7, %%mm0              \n\t"
218
-
219
-        "movq %%mm5, 5*8(%%"REG_D")       \n\t"
220
-        "movq 7*8+2*8(%%"REG_S"), %%mm2   \n\t"
221
-        "pmullw %%mm7, %%mm1              \n\t"
222
-
223
-        "movq %%mm6, 6*8(%%"REG_D")       \n\t"
224
-        "movq 7*8+3*8(%%"REG_S"), %%mm3   \n\t"
225
-        "pmullw %%mm7, %%mm2              \n\t"
226
-
227
-        "movq %%mm0, 7*8+0*8(%%"REG_D")   \n\t"
228
-        "movq 7*8+4*8(%%"REG_S"), %%mm4   \n\t"
229
-        "pmullw %%mm7, %%mm3              \n\t"
230
-
231
-        "movq %%mm1, 7*8+1*8(%%"REG_D")   \n\t"
232
-        "movq 7*8+5*8(%%"REG_S"), %%mm5   \n\t"
233
-        "pmullw %%mm7, %%mm4              \n\t"
234
-
235
-        "movq %%mm2, 7*8+2*8(%%"REG_D")   \n\t"
236
-        "movq 7*8+6*8(%%"REG_S"), %%mm6   \n\t"
237
-        "pmullw %%mm7, %%mm5              \n\t"
238
-
239
-        "movq %%mm3, 7*8+3*8(%%"REG_D")   \n\t"
240
-        "movq 14*8+0*8(%%"REG_S"), %%mm0  \n\t" // last group: qwords 14-15, for 16 qwords (64 int16) in total
241
-        "pmullw %%mm7, %%mm6              \n\t"
242
-
243
-        "movq %%mm4, 7*8+4*8(%%"REG_D")   \n\t"
244
-        "movq 14*8+1*8(%%"REG_S"), %%mm1  \n\t"
245
-        "pmullw %%mm7, %%mm0              \n\t"
246
-
247
-        "movq %%mm5, 7*8+5*8(%%"REG_D")   \n\t"
248
-        "pmullw %%mm7, %%mm1              \n\t"
249
-
250
-        "movq %%mm6, 7*8+6*8(%%"REG_D")   \n\t"
251
-        "movq %%mm0, 14*8+0*8(%%"REG_D")  \n\t"
252
-        "movq %%mm1, 14*8+1*8(%%"REG_D")  \n\t"
253
-
254
-        : "+g" (q), "+S" (adr), "+D" (adr) // NOTE(review): the same variable is bound read-write to both REG_S and REG_D, and no clobber list follows although mm0-mm7 and memory are written
255
-        :
256
-        );
257
-}
258
-
259
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)   = FIX64(0.382683433, 14);
260
-DECLARE_ALIGNED  (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, 14);
261
-DECLARE_ALIGNED  (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, 14);
262
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)   = FIX64(1.306562965, 14);
263
-
264
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, 14);
265
-
266
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)   = FIX64(1.847759065, 13);
267
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)   = FIX64(-2.613125930, 13);
268
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)   = FIX64(1.414213562, 13);
269
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)   = FIX64(1.082392200, 13);
270
-// The next three constants are used by the shortcut path of column_fidct_mmx, taken when the thresholded values in mm1, mm5 and mm6 are all zero (original note: "for t3,t5,t7 == 0 shortcut").
271
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)   = FIX64(0.847759065, 14);
272
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)   = FIX64(0.566454497, 14);
273
-DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)   = FIX64(0.198912367, 14);
274
-
275
-DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)       = C64(4);
276
-DECLARE_ASM_CONST(8, uint64_t, MM_2)                 = C64(2); // added with paddw in column_fidct_mmx ahead of its "psraw $2" shifts; presumably a rounding term replicated per 16-bit lane -- TODO confirm against the C64 definition
277
-
278
-static void column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
279
-{
280
-    DECLARE_ALIGNED(8, uint64_t, temps)[4];
281
-
282
-    __asm__ volatile(
283
-
284
-        "1:                                       \n\t"
285
-        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1   \n\t"
286
-        //
287
-        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7   \n\t"
288
-        "movq %%mm1, %%mm0                        \n\t"
289
-
290
-        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1  \n\t" //t0
291
-        "movq %%mm7, %%mm3                        \n\t"
292
-
293
-        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7  \n\t" //t3
294
-        "movq %%mm1, %%mm5             \n\t"
295
-
296
-        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6   \n\t"
297
-        "psubw %%mm7, %%mm1                       \n\t" //t13
298
-
299
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2   \n\t"
300
-        "movq %%mm6, %%mm4                        \n\t"
301
-
302
-        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6  \n\t" //t1
303
-        "paddw %%mm7, %%mm5                       \n\t" //t10
304
-
305
-        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2  \n\t" //t2
306
-        "movq %%mm6, %%mm7                        \n\t"
307
-
308
-        "paddw %%mm2, %%mm6                       \n\t" //t11
309
-        "psubw %%mm2, %%mm7                       \n\t" //t12
310
-
311
-        "movq %%mm5, %%mm2                        \n\t"
312
-        "paddw %%mm6, %%mm5                       \n\t" //d0
313
-        // i0 t13 t12 i3 i1 d0 - d4
314
-        "psubw %%mm6, %%mm2                       \n\t" //d4
315
-        "paddw %%mm1, %%mm7                       \n\t"
316
-
317
-        "movq  4*16(%%"REG_d"), %%mm6             \n\t"
318
-        "psllw $2, %%mm7                          \n\t"
319
-
320
-        "psubw 0*16(%%"REG_d"), %%mm5             \n\t"
321
-        "psubw %%mm6, %%mm2                       \n\t"
322
-
323
-        "paddusw 0*16(%%"REG_d"), %%mm5           \n\t"
324
-        "paddusw %%mm6, %%mm2                     \n\t"
325
-
326
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
327
-        //
328
-        "paddw 0*16(%%"REG_d"), %%mm5             \n\t"
329
-        "paddw %%mm6, %%mm2                       \n\t"
330
-
331
-        "psubusw 0*16(%%"REG_d"), %%mm5           \n\t"
332
-        "psubusw %%mm6, %%mm2                     \n\t"
333
-
334
-//This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
335
-// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
336
-//However,  typical numbers: nondc - 29%%,  dc - 46%%,  zero - 25%%. All <> 0 case is very rare.
337
-        "paddw "MANGLE(MM_2)", %%mm5              \n\t"
338
-        "movq %%mm2, %%mm6                        \n\t"
339
-
340
-        "paddw %%mm5, %%mm2                       \n\t"
341
-        "psubw %%mm6, %%mm5                       \n\t"
342
-
343
-        "movq %%mm1, %%mm6                        \n\t"
344
-        "paddw %%mm7, %%mm1                       \n\t" //d2
345
-
346
-        "psubw 2*16(%%"REG_d"), %%mm1             \n\t"
347
-        "psubw %%mm7, %%mm6                       \n\t" //d6
348
-
349
-        "movq 6*16(%%"REG_d"), %%mm7              \n\t"
350
-        "psraw $2, %%mm5                          \n\t"
351
-
352
-        "paddusw 2*16(%%"REG_d"), %%mm1           \n\t"
353
-        "psubw %%mm7, %%mm6                       \n\t"
354
-        // t7 d2 /t11 t4 t6 - d6 /t10
355
-
356
-        "paddw 2*16(%%"REG_d"), %%mm1             \n\t"
357
-        "paddusw %%mm7, %%mm6                     \n\t"
358
-
359
-        "psubusw 2*16(%%"REG_d"), %%mm1           \n\t"
360
-        "paddw %%mm7, %%mm6                       \n\t"
361
-
362
-        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3  \n\t"
363
-        "psubusw %%mm7, %%mm6                     \n\t"
364
-
365
-        //movq [edi+"DCTSIZE_S"*2*2], mm1
366
-        //movq [edi+"DCTSIZE_S"*6*2], mm6
367
-        "movq %%mm1, %%mm7                        \n\t"
368
-        "psraw $2, %%mm2                          \n\t"
369
-
370
-        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4  \n\t"
371
-        "psubw %%mm6, %%mm1                       \n\t"
372
-
373
-        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0  \n\t"
374
-        "paddw %%mm7, %%mm6                       \n\t" //'t13
375
-
376
-        "psraw $2, %%mm6                          \n\t" //paddw mm6, MM_2 !!    ---
377
-        "movq %%mm2, %%mm7                        \n\t"
378
-
379
-        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
380
-        "paddw %%mm6, %%mm2                       \n\t" //'t0
381
-
382
-        "movq %%mm2, 0*8+%3                       \n\t" //!
383
-        "psubw %%mm6, %%mm7                       \n\t" //'t3
384
-
385
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2   \n\t"
386
-        "psubw %%mm6, %%mm1                       \n\t" //'t12
387
-
388
-        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2  \n\t" //t5
389
-        "movq %%mm5, %%mm6                        \n\t"
390
-
391
-        "movq %%mm7, 3*8+%3                       \n\t"
392
-        "paddw %%mm2, %%mm3                       \n\t" //t10
393
-
394
-        "paddw %%mm4, %%mm2                       \n\t" //t11
395
-        "paddw %%mm0, %%mm4                       \n\t" //t12
396
-
397
-        "movq %%mm3, %%mm7                        \n\t"
398
-        "psubw %%mm4, %%mm3                       \n\t"
399
-
400
-        "psllw $2, %%mm3                          \n\t"
401
-        "psllw $2, %%mm7                          \n\t" //opt for P6
402
-
403
-        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
404
-        "psllw $2, %%mm4                          \n\t"
405
-
406
-        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
407
-        "psllw $2, %%mm2                          \n\t"
408
-
409
-        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
410
-        "paddw %%mm1, %%mm5                       \n\t" //'t1
411
-
412
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
413
-        "psubw %%mm1, %%mm6                       \n\t" //'t2
414
-        // t7 't12 't11 t4 t6 - 't13 't10   ---
415
-
416
-        "paddw %%mm3, %%mm7                       \n\t" //z2
417
-
418
-        "movq %%mm5, 1*8+%3                       \n\t"
419
-        "paddw %%mm3, %%mm4                       \n\t" //z4
420
-
421
-        "movq 3*16(%%"REG_d"), %%mm3              \n\t"
422
-        "movq %%mm0, %%mm1                        \n\t"
423
-
424
-        "movq %%mm6, 2*8+%3                       \n\t"
425
-        "psubw %%mm2, %%mm1                       \n\t" //z13
426
-
427
-//===
428
-        "paddw %%mm2, %%mm0                       \n\t" //z11
429
-        "movq %%mm1, %%mm5                        \n\t"
430
-
431
-        "movq 5*16(%%"REG_d"), %%mm2              \n\t"
432
-        "psubw %%mm7, %%mm1                       \n\t" //d3
433
-
434
-        "paddw %%mm7, %%mm5                       \n\t" //d5
435
-        "psubw %%mm3, %%mm1                       \n\t"
436
-
437
-        "movq 1*16(%%"REG_d"), %%mm7              \n\t"
438
-        "psubw %%mm2, %%mm5                       \n\t"
439
-
440
-        "movq %%mm0, %%mm6                        \n\t"
441
-        "paddw %%mm4, %%mm0                       \n\t" //d1
442
-
443
-        "paddusw %%mm3, %%mm1                     \n\t"
444
-        "psubw %%mm4, %%mm6                       \n\t" //d7
445
-
446
-        // d1 d3 - - - d5 d7 -
447
-        "movq 7*16(%%"REG_d"), %%mm4              \n\t"
448
-        "psubw %%mm7, %%mm0                       \n\t"
449
-
450
-        "psubw %%mm4, %%mm6                       \n\t"
451
-        "paddusw %%mm2, %%mm5                     \n\t"
452
-
453
-        "paddusw %%mm4, %%mm6                     \n\t"
454
-        "paddw %%mm3, %%mm1                       \n\t"
455
-
456
-        "paddw %%mm2, %%mm5                       \n\t"
457
-        "paddw %%mm4, %%mm6                       \n\t"
458
-
459
-        "psubusw %%mm3, %%mm1                     \n\t"
460
-        "psubusw %%mm2, %%mm5                     \n\t"
461
-
462
-        "psubusw %%mm4, %%mm6                     \n\t"
463
-        "movq %%mm1, %%mm4                        \n\t"
464
-
465
-        "por %%mm5, %%mm4                         \n\t"
466
-        "paddusw %%mm7, %%mm0                     \n\t"
467
-
468
-        "por %%mm6, %%mm4                         \n\t"
469
-        "paddw %%mm7, %%mm0                       \n\t"
470
-
471
-        "packssdw %%mm4, %%mm4                    \n\t"
472
-        "psubusw %%mm7, %%mm0                     \n\t"
473
-
474
-        "movd %%mm4, %%"REG_a"                    \n\t"
475
-        "or %%"REG_a", %%"REG_a"                  \n\t"
476
-        "jnz 2f                                   \n\t"
477
-        //movq [edi+"DCTSIZE_S"*3*2], mm1
478
-        //movq [edi+"DCTSIZE_S"*5*2], mm5
479
-        //movq [edi+"DCTSIZE_S"*1*2], mm0
480
-        //movq [edi+"DCTSIZE_S"*7*2], mm6
481
-        // t4 t5 - - - t6 t7 -
482
-        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
483
-//Typical numbers: nondc - 19%%,  dc - 26%%,  zero - 55%%. zero case alone isn't worthwhile
484
-        "movq 0*8+%3, %%mm4                      \n\t"
485
-        "movq %%mm0, %%mm1                       \n\t"
486
-
487
-        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
488
-        "movq %%mm1, %%mm2                       \n\t"
489
-
490
-        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5 \n\t"
491
-        "movq %%mm2, %%mm3                      \n\t"
492
-
493
-        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
494
-        "paddw %%mm4, %%mm5                     \n\t"
495
-
496
-        "movq 1*8+%3, %%mm6                     \n\t"
497
-        //paddw mm3, MM_2
498
-        "psraw $2, %%mm3                        \n\t" //tmp7
499
-
500
-        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
501
-        "psubw %%mm3, %%mm4                     \n\t"
502
-
503
-        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7 \n\t"
504
-        "paddw %%mm3, %%mm5                     \n\t"
505
-
506
-        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
507
-        "paddw %%mm6, %%mm7                     \n\t"
508
-
509
-        "movq 2*8+%3, %%mm3                     \n\t"
510
-        "psubw %%mm0, %%mm6                     \n\t"
511
-
512
-        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4 \n\t"
513
-        "paddw %%mm0, %%mm7                     \n\t"
514
-
515
-        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
516
-        "paddw %%mm3, %%mm4                     \n\t"
517
-
518
-        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
519
-        "psubw %%mm1, %%mm3                     \n\t"
520
-
521
-        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5 \n\t"
522
-        "paddw %%mm1, %%mm4                     \n\t"
523
-
524
-        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6 \n\t"
525
-        "paddw %%mm3, %%mm5                     \n\t"
526
-
527
-        "movq 3*8+%3, %%mm0                     \n\t"
528
-        "add $8, %%"REG_S"                      \n\t"
529
-
530
-        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
531
-        "paddw %%mm0, %%mm6                     \n\t"
532
-
533
-        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
534
-        "psubw %%mm2, %%mm0                     \n\t"
535
-
536
-        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7 \n\t"
537
-        "paddw %%mm2, %%mm6                     \n\t"
538
-
539
-        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
540
-        "paddw %%mm0, %%mm7                     \n\t"
541
-
542
-        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
543
-
544
-        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
545
-        "add $8, %%"REG_D"                      \n\t"
546
-        "jmp 4f                                 \n\t"
547
-
548
-        "2:                                     \n\t"
549
-        //--- non DC2
550
-        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
551
-        //psraw mm5, 2
552
-        //psraw mm0, 2
553
-        //psraw mm6, 2
554
-        "movq %%mm5, %%mm3                      \n\t"
555
-        "psubw %%mm1, %%mm5                     \n\t"
556
-
557
-        "psllw $1, %%mm5                        \n\t" //'z10
558
-        "paddw %%mm1, %%mm3                     \n\t" //'z13
559
-
560
-        "movq %%mm0, %%mm2                      \n\t"
561
-        "psubw %%mm6, %%mm0                     \n\t"
562
-
563
-        "movq %%mm5, %%mm1                      \n\t"
564
-        "psllw $1, %%mm0                        \n\t" //'z12
565
-
566
-        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
567
-        "paddw %%mm0, %%mm5                     \n\t"
568
-
569
-        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
570
-        "paddw %%mm6, %%mm2                     \n\t" //'z11
571
-
572
-        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
573
-        "movq %%mm2, %%mm7                      \n\t"
574
-
575
-        //---
576
-        "movq 0*8+%3, %%mm4                     \n\t"
577
-        "psubw %%mm3, %%mm2                     \n\t"
578
-
579
-        "psllw $1, %%mm2                        \n\t"
580
-        "paddw %%mm3, %%mm7                     \n\t" //'t7
581
-
582
-        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
583
-        "movq %%mm4, %%mm6                      \n\t"
584
-        //paddw mm7, MM_2
585
-        "psraw $2, %%mm7                        \n\t"
586
-
587
-        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4\n\t"
588
-        "psubw %%mm7, %%mm6                     \n\t"
589
-
590
-        "movq 1*8+%3, %%mm3                     \n\t"
591
-        "paddw %%mm7, %%mm4                     \n\t"
592
-
593
-        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
594
-        "paddw %%mm5, %%mm1                     \n\t" //'t12
595
-
596
-        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
597
-        "psubw %%mm7, %%mm1                     \n\t" //'t6
598
-
599
-        "movq 2*8+%3, %%mm7                     \n\t"
600
-        "psubw %%mm5, %%mm0                     \n\t" //'t10
601
-
602
-        "movq 3*8+%3, %%mm6                     \n\t"
603
-        "movq %%mm3, %%mm5                      \n\t"
604
-
605
-        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3\n\t"
606
-        "psubw %%mm1, %%mm5                     \n\t"
607
-
608
-        "psubw %%mm1, %%mm2                     \n\t" //'t5
609
-        "paddw %%mm1, %%mm3                     \n\t"
610
-
611
-        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
612
-        "movq %%mm7, %%mm4                      \n\t"
613
-
614
-        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7\n\t"
615
-        "psubw %%mm2, %%mm4                     \n\t"
616
-
617
-        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4\n\t"
618
-        "paddw %%mm2, %%mm7                     \n\t"
619
-
620
-        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
621
-        "paddw %%mm2, %%mm0                     \n\t" //'t4
622
-
623
-        // 't4 't6 't5 - - - - 't7
624
-        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
625
-        "movq %%mm6, %%mm1                      \n\t"
626
-
627
-        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6\n\t"
628
-        "psubw %%mm0, %%mm1                     \n\t"
629
-
630
-        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1\n\t"
631
-        "paddw %%mm0, %%mm6                     \n\t"
632
-
633
-        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
634
-        "add $8, %%"REG_S"                      \n\t"
635
-
636
-        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
637
-
638
-        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
639
-        "add $8, %%"REG_D"                      \n\t"
640
-
641
-        "4:                                     \n\t"
642
-        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm1 \n\t"
643
-        //
644
-        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm7 \n\t"
645
-        "movq %%mm1, %%mm0                      \n\t"
646
-
647
-        "paddw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm1\n\t" //t0
648
-        "movq %%mm7, %%mm3                      \n\t"
649
-
650
-        "paddw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm7\n\t" //t3
651
-        "movq %%mm1, %%mm5                      \n\t"
652
-
653
-        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm6 \n\t"
654
-        "psubw %%mm7, %%mm1                     \n\t" //t13
655
-
656
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
657
-        "movq %%mm6, %%mm4                      \n\t"
658
-
659
-        "paddw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm6\n\t" //t1
660
-        "paddw %%mm7, %%mm5                     \n\t" //t10
661
-
662
-        "paddw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t2
663
-        "movq %%mm6, %%mm7                      \n\t"
664
-
665
-        "paddw %%mm2, %%mm6                     \n\t" //t11
666
-        "psubw %%mm2, %%mm7                     \n\t" //t12
667
-
668
-        "movq %%mm5, %%mm2                      \n\t"
669
-        "paddw %%mm6, %%mm5                     \n\t" //d0
670
-        // i0 t13 t12 i3 i1 d0 - d4
671
-        "psubw %%mm6, %%mm2                     \n\t" //d4
672
-        "paddw %%mm1, %%mm7                     \n\t"
673
-
674
-        "movq  1*8+4*16(%%"REG_d"), %%mm6       \n\t"
675
-        "psllw $2, %%mm7                        \n\t"
676
-
677
-        "psubw 1*8+0*16(%%"REG_d"), %%mm5       \n\t"
678
-        "psubw %%mm6, %%mm2                     \n\t"
679
-
680
-        "paddusw 1*8+0*16(%%"REG_d"), %%mm5     \n\t"
681
-        "paddusw %%mm6, %%mm2                   \n\t"
682
-
683
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm7 \n\t"
684
-        //
685
-        "paddw 1*8+0*16(%%"REG_d"), %%mm5       \n\t"
686
-        "paddw %%mm6, %%mm2                     \n\t"
687
-
688
-        "psubusw 1*8+0*16(%%"REG_d"), %%mm5     \n\t"
689
-        "psubusw %%mm6, %%mm2                   \n\t"
690
-
691
-//This func is totally compute-bound,  operates at huge speed. So,  DC shortcut
692
-// at this place isn't worthwhile due to BTB miss penalty (checked on Pent. 3).
693
-//However,  typical numbers: nondc - 29%%,  dc - 46%%,  zero - 25%%. All <> 0 case is very rare.
694
-        "paddw "MANGLE(MM_2)", %%mm5            \n\t"
695
-        "movq %%mm2, %%mm6                      \n\t"
696
-
697
-        "paddw %%mm5, %%mm2                     \n\t"
698
-        "psubw %%mm6, %%mm5                     \n\t"
699
-
700
-        "movq %%mm1, %%mm6                      \n\t"
701
-        "paddw %%mm7, %%mm1                     \n\t" //d2
702
-
703
-        "psubw 1*8+2*16(%%"REG_d"), %%mm1       \n\t"
704
-        "psubw %%mm7, %%mm6                     \n\t" //d6
705
-
706
-        "movq 1*8+6*16(%%"REG_d"), %%mm7        \n\t"
707
-        "psraw $2, %%mm5                        \n\t"
708
-
709
-        "paddusw 1*8+2*16(%%"REG_d"), %%mm1     \n\t"
710
-        "psubw %%mm7, %%mm6                     \n\t"
711
-        // t7 d2 /t11 t4 t6 - d6 /t10
712
-
713
-        "paddw 1*8+2*16(%%"REG_d"), %%mm1       \n\t"
714
-        "paddusw %%mm7, %%mm6                   \n\t"
715
-
716
-        "psubusw 1*8+2*16(%%"REG_d"), %%mm1     \n\t"
717
-        "paddw %%mm7, %%mm6                     \n\t"
718
-
719
-        "psubw "DCTSIZE_S"*4*2(%%"REG_S"), %%mm3\n\t"
720
-        "psubusw %%mm7, %%mm6                   \n\t"
721
-
722
-        //movq [edi+"DCTSIZE_S"*2*2], mm1
723
-        //movq [edi+"DCTSIZE_S"*6*2], mm6
724
-        "movq %%mm1, %%mm7                      \n\t"
725
-        "psraw $2, %%mm2                        \n\t"
726
-
727
-        "psubw "DCTSIZE_S"*6*2(%%"REG_S"), %%mm4\n\t"
728
-        "psubw %%mm6, %%mm1                     \n\t"
729
-
730
-        "psubw "DCTSIZE_S"*7*2(%%"REG_S"), %%mm0\n\t"
731
-        "paddw %%mm7, %%mm6                     \n\t" //'t13
732
-
733
-        "psraw $2, %%mm6                        \n\t" //paddw mm6, MM_2 !!    ---
734
-        "movq %%mm2, %%mm7                      \n\t"
735
-
736
-        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm1 \n\t"
737
-        "paddw %%mm6, %%mm2                     \n\t" //'t0
738
-
739
-        "movq %%mm2, 0*8+%3                     \n\t" //!
740
-        "psubw %%mm6, %%mm7                     \n\t" //'t3
741
-
742
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2 \n\t"
743
-        "psubw %%mm6, %%mm1                     \n\t" //'t12
744
-
745
-        "psubw "DCTSIZE_S"*5*2(%%"REG_S"), %%mm2\n\t" //t5
746
-        "movq %%mm5, %%mm6                      \n\t"
747
-
748
-        "movq %%mm7, 3*8+%3                     \n\t"
749
-        "paddw %%mm2, %%mm3                     \n\t" //t10
750
-
751
-        "paddw %%mm4, %%mm2                     \n\t" //t11
752
-        "paddw %%mm0, %%mm4                     \n\t" //t12
753
-
754
-        "movq %%mm3, %%mm7                      \n\t"
755
-        "psubw %%mm4, %%mm3                     \n\t"
756
-
757
-        "psllw $2, %%mm3                        \n\t"
758
-        "psllw $2, %%mm7                        \n\t" //opt for P6
759
-
760
-        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t"
761
-        "psllw $2, %%mm4                        \n\t"
762
-
763
-        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm7 \n\t"
764
-        "psllw $2, %%mm2                        \n\t"
765
-
766
-        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm4 \n\t"
767
-        "paddw %%mm1, %%mm5                     \n\t" //'t1
768
-
769
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm2 \n\t"
770
-        "psubw %%mm1, %%mm6                     \n\t" //'t2
771
-        // t7 't12 't11 t4 t6 - 't13 't10   ---
772
-
773
-        "paddw %%mm3, %%mm7                     \n\t" //z2
774
-
775
-        "movq %%mm5, 1*8+%3                     \n\t"
776
-        "paddw %%mm3, %%mm4                     \n\t" //z4
777
-
778
-        "movq 1*8+3*16(%%"REG_d"), %%mm3        \n\t"
779
-        "movq %%mm0, %%mm1                      \n\t"
780
-
781
-        "movq %%mm6, 2*8+%3                     \n\t"
782
-        "psubw %%mm2, %%mm1                     \n\t" //z13
783
-
784
-//===
785
-        "paddw %%mm2, %%mm0                     \n\t" //z11
786
-        "movq %%mm1, %%mm5                      \n\t"
787
-
788
-        "movq 1*8+5*16(%%"REG_d"), %%mm2        \n\t"
789
-        "psubw %%mm7, %%mm1                     \n\t" //d3
790
-
791
-        "paddw %%mm7, %%mm5                     \n\t" //d5
792
-        "psubw %%mm3, %%mm1                     \n\t"
793
-
794
-        "movq 1*8+1*16(%%"REG_d"), %%mm7        \n\t"
795
-        "psubw %%mm2, %%mm5                     \n\t"
796
-
797
-        "movq %%mm0, %%mm6                      \n\t"
798
-        "paddw %%mm4, %%mm0                     \n\t" //d1
799
-
800
-        "paddusw %%mm3, %%mm1                   \n\t"
801
-        "psubw %%mm4, %%mm6                     \n\t" //d7
802
-
803
-        // d1 d3 - - - d5 d7 -
804
-        "movq 1*8+7*16(%%"REG_d"), %%mm4        \n\t"
805
-        "psubw %%mm7, %%mm0                     \n\t"
806
-
807
-        "psubw %%mm4, %%mm6                     \n\t"
808
-        "paddusw %%mm2, %%mm5                   \n\t"
809
-
810
-        "paddusw %%mm4, %%mm6                   \n\t"
811
-        "paddw %%mm3, %%mm1                     \n\t"
812
-
813
-        "paddw %%mm2, %%mm5                     \n\t"
814
-        "paddw %%mm4, %%mm6                     \n\t"
815
-
816
-        "psubusw %%mm3, %%mm1                   \n\t"
817
-        "psubusw %%mm2, %%mm5                   \n\t"
818
-
819
-        "psubusw %%mm4, %%mm6                   \n\t"
820
-        "movq %%mm1, %%mm4                      \n\t"
821
-
822
-        "por %%mm5, %%mm4                       \n\t"
823
-        "paddusw %%mm7, %%mm0                   \n\t"
824
-
825
-        "por %%mm6, %%mm4                       \n\t"
826
-        "paddw %%mm7, %%mm0                     \n\t"
827
-
828
-        "packssdw %%mm4, %%mm4                  \n\t"
829
-        "psubusw %%mm7, %%mm0                   \n\t"
830
-
831
-        "movd %%mm4, %%"REG_a"                  \n\t"
832
-        "or %%"REG_a", %%"REG_a"                \n\t"
833
-        "jnz 3f                                 \n\t"
834
-        //movq [edi+"DCTSIZE_S"*3*2], mm1
835
-        //movq [edi+"DCTSIZE_S"*5*2], mm5
836
-        //movq [edi+"DCTSIZE_S"*1*2], mm0
837
-        //movq [edi+"DCTSIZE_S"*7*2], mm6
838
-        // t4 t5 - - - t6 t7 -
839
-        //--- t4 (mm0) may be <>0; mm1, mm5, mm6 == 0
840
-//Typical numbers: nondc - 19%%,  dc - 26%%,  zero - 55%%. zero case alone isn't worthwhile
841
-        "movq 0*8+%3, %%mm4                    \n\t"
842
-        "movq %%mm0, %%mm1                     \n\t"
843
-
844
-        "pmulhw "MANGLE(MM_FIX_0_847759065)", %%mm0 \n\t" //tmp6
845
-        "movq %%mm1, %%mm2                     \n\t"
846
-
847
-        "movq "DCTSIZE_S"*0*2(%%"REG_D"), %%mm5\n\t"
848
-        "movq %%mm2, %%mm3                     \n\t"
849
-
850
-        "pmulhw "MANGLE(MM_FIX_0_566454497)", %%mm1 \n\t" //tmp5
851
-        "paddw %%mm4, %%mm5                    \n\t"
852
-
853
-        "movq 1*8+%3, %%mm6                    \n\t"
854
-        //paddw mm3, MM_2
855
-        "psraw $2, %%mm3                       \n\t" //tmp7
856
-
857
-        "pmulhw "MANGLE(MM_FIX_0_198912367)", %%mm2 \n\t" //-tmp4
858
-        "psubw %%mm3, %%mm4                    \n\t"
859
-
860
-        "movq "DCTSIZE_S"*1*2(%%"REG_D"), %%mm7\n\t"
861
-        "paddw %%mm3, %%mm5                    \n\t"
862
-
863
-        "movq %%mm4, "DCTSIZE_S"*7*2(%%"REG_D")\n\t"
864
-        "paddw %%mm6, %%mm7                    \n\t"
865
-
866
-        "movq 2*8+%3, %%mm3                    \n\t"
867
-        "psubw %%mm0, %%mm6                    \n\t"
868
-
869
-        "movq "DCTSIZE_S"*2*2(%%"REG_D"), %%mm4\n\t"
870
-        "paddw %%mm0, %%mm7                    \n\t"
871
-
872
-        "movq %%mm5, "DCTSIZE_S"*0*2(%%"REG_D")\n\t"
873
-        "paddw %%mm3, %%mm4                    \n\t"
874
-
875
-        "movq %%mm6, "DCTSIZE_S"*6*2(%%"REG_D")\n\t"
876
-        "psubw %%mm1, %%mm3                    \n\t"
877
-
878
-        "movq "DCTSIZE_S"*5*2(%%"REG_D"), %%mm5\n\t"
879
-        "paddw %%mm1, %%mm4                    \n\t"
880
-
881
-        "movq "DCTSIZE_S"*3*2(%%"REG_D"), %%mm6\n\t"
882
-        "paddw %%mm3, %%mm5                    \n\t"
883
-
884
-        "movq 3*8+%3, %%mm0                    \n\t"
885
-        "add $24, %%"REG_S"                    \n\t"
886
-
887
-        "movq %%mm7, "DCTSIZE_S"*1*2(%%"REG_D")\n\t"
888
-        "paddw %%mm0, %%mm6                    \n\t"
889
-
890
-        "movq %%mm4, "DCTSIZE_S"*2*2(%%"REG_D")\n\t"
891
-        "psubw %%mm2, %%mm0                    \n\t"
892
-
893
-        "movq "DCTSIZE_S"*4*2(%%"REG_D"), %%mm7\n\t"
894
-        "paddw %%mm2, %%mm6                    \n\t"
895
-
896
-        "movq %%mm5, "DCTSIZE_S"*5*2(%%"REG_D")\n\t"
897
-        "paddw %%mm0, %%mm7                    \n\t"
898
-
899
-        "movq %%mm6, "DCTSIZE_S"*3*2(%%"REG_D")\n\t"
900
-
901
-        "movq %%mm7, "DCTSIZE_S"*4*2(%%"REG_D")\n\t"
902
-        "add $24, %%"REG_D"                    \n\t"
903
-        "sub $2, %%"REG_c"                     \n\t"
904
-        "jnz 1b                                \n\t"
905
-        "jmp 5f                                \n\t"
906
-
907
-        "3:                                    \n\t"
908
-        //--- non DC2
909
-        //psraw mm1, 2 w/o it -> offset. thr1, thr1, thr1  (actually thr1, thr1, thr1-1)
910
-        //psraw mm5, 2
911
-        //psraw mm0, 2
912
-        //psraw mm6, 2
913
-        "movq %%mm5, %%mm3                    \n\t"
914
-        "psubw %%mm1, %%mm5                   \n\t"
915
-
916
-        "psllw $1, %%mm5                      \n\t" //'z10
917
-        "paddw %%mm1, %%mm3                   \n\t" //'z13
918
-
919
-        "movq %%mm0, %%mm2                    \n\t"
920
-        "psubw %%mm6, %%mm0                   \n\t"
921
-
922
-        "movq %%mm5, %%mm1                    \n\t"
923
-        "psllw $1, %%mm0                      \n\t" //'z12
924
-
925
-        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm1 \n\t" //-
926
-        "paddw %%mm0, %%mm5                   \n\t"
927
-
928
-        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm5 \n\t" //'z5
929
-        "paddw %%mm6, %%mm2                   \n\t" //'z11
930
-
931
-        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm0 \n\t"
932
-        "movq %%mm2, %%mm7                    \n\t"
933
-
934
-        //---
935
-        "movq 0*8+%3, %%mm4                   \n\t"
936
-        "psubw %%mm3, %%mm2                   \n\t"
937
-
938
-        "psllw $1, %%mm2                      \n\t"
939
-        "paddw %%mm3, %%mm7                   \n\t" //'t7
940
-
941
-        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2 \n\t" //'t11
942
-        "movq %%mm4, %%mm6                    \n\t"
943
-        //paddw mm7, MM_2
944
-        "psraw $2, %%mm7                      \n\t"
945
-
946
-        "paddw "DCTSIZE_S"*0*2(%%"REG_D"), %%mm4 \n\t"
947
-        "psubw %%mm7, %%mm6                   \n\t"
948
-
949
-        "movq 1*8+%3, %%mm3                   \n\t"
950
-        "paddw %%mm7, %%mm4                   \n\t"
951
-
952
-        "movq %%mm6, "DCTSIZE_S"*7*2(%%"REG_D") \n\t"
953
-        "paddw %%mm5, %%mm1                   \n\t" //'t12
954
-
955
-        "movq %%mm4, "DCTSIZE_S"*0*2(%%"REG_D") \n\t"
956
-        "psubw %%mm7, %%mm1                   \n\t" //'t6
957
-
958
-        "movq 2*8+%3, %%mm7                   \n\t"
959
-        "psubw %%mm5, %%mm0                   \n\t" //'t10
960
-
961
-        "movq 3*8+%3, %%mm6                   \n\t"
962
-        "movq %%mm3, %%mm5                    \n\t"
963
-
964
-        "paddw "DCTSIZE_S"*1*2(%%"REG_D"), %%mm3 \n\t"
965
-        "psubw %%mm1, %%mm5                   \n\t"
966
-
967
-        "psubw %%mm1, %%mm2                   \n\t" //'t5
968
-        "paddw %%mm1, %%mm3                   \n\t"
969
-
970
-        "movq %%mm5, "DCTSIZE_S"*6*2(%%"REG_D") \n\t"
971
-        "movq %%mm7, %%mm4                    \n\t"
972
-
973
-        "paddw "DCTSIZE_S"*2*2(%%"REG_D"), %%mm7 \n\t"
974
-        "psubw %%mm2, %%mm4                   \n\t"
975
-
976
-        "paddw "DCTSIZE_S"*5*2(%%"REG_D"), %%mm4 \n\t"
977
-        "paddw %%mm2, %%mm7                   \n\t"
978
-
979
-        "movq %%mm3, "DCTSIZE_S"*1*2(%%"REG_D") \n\t"
980
-        "paddw %%mm2, %%mm0                    \n\t" //'t4
981
-
982
-        // 't4 't6 't5 - - - - 't7
983
-        "movq %%mm7, "DCTSIZE_S"*2*2(%%"REG_D") \n\t"
984
-        "movq %%mm6, %%mm1                     \n\t"
985
-
986
-        "paddw "DCTSIZE_S"*4*2(%%"REG_D"), %%mm6 \n\t"
987
-        "psubw %%mm0, %%mm1                    \n\t"
988
-
989
-        "paddw "DCTSIZE_S"*3*2(%%"REG_D"), %%mm1 \n\t"
990
-        "paddw %%mm0, %%mm6                    \n\t"
991
-
992
-        "movq %%mm4, "DCTSIZE_S"*5*2(%%"REG_D") \n\t"
993
-        "add $24, %%"REG_S"                    \n\t"
994
-
995
-        "movq %%mm6, "DCTSIZE_S"*4*2(%%"REG_D") \n\t"
996
-
997
-        "movq %%mm1, "DCTSIZE_S"*3*2(%%"REG_D") \n\t"
998
-        "add $24, %%"REG_D"                    \n\t"
999
-        "sub $2, %%"REG_c"                     \n\t"
1000
-        "jnz 1b                                \n\t"
1001
-        "5:                                    \n\t"
1002
-
1003
-        : "+S"(data), "+D"(output), "+c"(cnt), "=o"(temps)
1004
-        : "d"(thr_adr)
1005
-          NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, MM_2,MM_FIX_1_414213562_A, MM_FIX_1_414213562, MM_FIX_0_382683433,
1006
-                                ff_MM_FIX_0_541196100, MM_FIX_1_306562965, MM_FIX_0_847759065)
1007
-          NAMED_CONSTRAINTS_ADD(MM_FIX_0_566454497, MM_FIX_0_198912367, MM_FIX_2_613125930, MM_FIX_1_847759065,
1008
-                                MM_FIX_1_082392200)
1009
-        : "%"REG_a
1010
-        );
1011
-}
1012
-
1013
-/*
- * MMX row IDCT that accumulates its result into the output buffer.
- *
- * Each loop iteration loads two 4x4 groups of 16-bit coefficients from
- * workspace, transposes them, runs the inverse transform, rounds with
- * MM_DESCALE_RND followed by an arithmetic right shift by 3, and adds the
- * eight result vectors onto eight consecutive output rows.
- *
- * workspace     : coefficient input; advanced by DCTSIZE_S*2*4 bytes per iteration
- * output_adr    : accumulation target; advanced by 8 bytes per iteration
- * output_stride : distance between output rows, in int16_t elements
- * cnt           : iteration count; the loop decrements before testing, so it must be > 0
- */
-static void row_idct_mmx (int16_t *workspace, int16_t *output_adr, int output_stride, int cnt)
1014
-{
1015
-    DECLARE_ALIGNED(8, uint64_t, temps)[4];
1016
-    /* temps[0] and temps[1] spill t0 and t3 between the two halves of the loop body. */
1017
-    __asm__ volatile(
1018
-        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
1019
-        "1:                     \n\t"
1020
-        "movq "DCTSIZE_S"*0*2(%%"REG_S"), %%mm0    \n\t"
1021
-        //
1022
-
1023
-        "movq "DCTSIZE_S"*1*2(%%"REG_S"), %%mm1    \n\t"
1024
-        "movq %%mm0, %%mm4                         \n\t"
1025
-
1026
-        "movq "DCTSIZE_S"*2*2(%%"REG_S"), %%mm2    \n\t"
1027
-        "punpcklwd %%mm1, %%mm0                    \n\t"
1028
-
1029
-        "movq "DCTSIZE_S"*3*2(%%"REG_S"), %%mm3    \n\t"
1030
-        "punpckhwd %%mm1, %%mm4                    \n\t"
1031
-
1032
-        //transpose 4x4
1033
-        "movq %%mm2, %%mm7                         \n\t"
1034
-        "punpcklwd %%mm3, %%mm2                    \n\t"
1035
-
1036
-        "movq %%mm0, %%mm6                         \n\t"
1037
-        "punpckldq %%mm2, %%mm0                    \n\t" //0
1038
-
1039
-        "punpckhdq %%mm2, %%mm6                    \n\t" //1
1040
-        "movq %%mm0, %%mm5                         \n\t"
1041
-
1042
-        "punpckhwd %%mm3, %%mm7                    \n\t"
1043
-        "psubw %%mm6, %%mm0                        \n\t"
1044
-
1045
-        "pmulhw "MANGLE(MM_FIX_1_414213562_A)", %%mm0 \n\t"
1046
-        "movq %%mm4, %%mm2                         \n\t"
1047
-
1048
-        "punpckldq %%mm7, %%mm4                    \n\t" //2
1049
-        "paddw %%mm6, %%mm5                        \n\t"
1050
-
1051
-        "punpckhdq %%mm7, %%mm2                    \n\t" //3
1052
-        "movq %%mm4, %%mm1                         \n\t"
1053
-
1054
-        "psllw $2, %%mm0                           \n\t"
1055
-        "paddw %%mm2, %%mm4                        \n\t" //t10
1056
-
1057
-        "movq "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_S"), %%mm3 \n\t"
1058
-        "psubw %%mm2, %%mm1                        \n\t" //t11
1059
-
1060
-        "movq "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_S"), %%mm2 \n\t"
1061
-        "psubw %%mm5, %%mm0                        \n\t"
1062
-
1063
-        "movq %%mm4, %%mm6                         \n\t"
1064
-        "paddw %%mm5, %%mm4                        \n\t" //t0
1065
-
1066
-        "psubw %%mm5, %%mm6                        \n\t" //t3
1067
-        "movq %%mm1, %%mm7                         \n\t"
1068
-
1069
-        "movq "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_S"), %%mm5 \n\t"
1070
-        "paddw %%mm0, %%mm1                        \n\t" //t1
1071
-
1072
-        "movq %%mm4, 0*8+%3                        \n\t" //t0
1073
-        "movq %%mm3, %%mm4                         \n\t"
1074
-
1075
-        "movq %%mm6, 1*8+%3                        \n\t" //t3
1076
-        "punpcklwd %%mm2, %%mm3                    \n\t"
1077
-
1078
-        //transpose 4x4
1079
-        "movq "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_S"), %%mm6 \n\t"
1080
-        "punpckhwd %%mm2, %%mm4                    \n\t"
1081
-
1082
-        "movq %%mm5, %%mm2                         \n\t"
1083
-        "punpcklwd %%mm6, %%mm5                    \n\t"
1084
-
1085
-        "psubw %%mm0, %%mm7                        \n\t" //t2
1086
-        "punpckhwd %%mm6, %%mm2                    \n\t"
1087
-
1088
-        "movq %%mm3, %%mm0                         \n\t"
1089
-        "punpckldq %%mm5, %%mm3                    \n\t" //4
1090
-
1091
-        "punpckhdq %%mm5, %%mm0                    \n\t" //5
1092
-        "movq %%mm4, %%mm5                         \n\t"
1093
-
1094
-        //
1095
-        "movq %%mm3, %%mm6                         \n\t"
1096
-        "punpckldq %%mm2, %%mm4                    \n\t" //6
1097
-
1098
-        "psubw %%mm0, %%mm3                        \n\t" //z10
1099
-        "punpckhdq %%mm2, %%mm5                    \n\t" //7
1100
-
1101
-        "paddw %%mm0, %%mm6                        \n\t" //z13
1102
-        "movq %%mm4, %%mm2                         \n\t"
1103
-
1104
-        "movq %%mm3, %%mm0                         \n\t"
1105
-        "psubw %%mm5, %%mm4                        \n\t" //z12
1106
-
1107
-        "pmulhw "MANGLE(MM_FIX_2_613125930)", %%mm0\n\t" //-
1108
-        "paddw %%mm4, %%mm3                        \n\t"
1109
-
1110
-        "pmulhw "MANGLE(MM_FIX_1_847759065)", %%mm3\n\t" //z5
1111
-        "paddw %%mm5, %%mm2                        \n\t" //z11  >
1112
-
1113
-        "pmulhw "MANGLE(MM_FIX_1_082392200)", %%mm4\n\t"
1114
-        "movq %%mm2, %%mm5                         \n\t"
1115
-
1116
-        "psubw %%mm6, %%mm2                        \n\t"
1117
-        "paddw %%mm6, %%mm5                        \n\t" //t7
1118
-
1119
-        "pmulhw "MANGLE(MM_FIX_1_414213562)", %%mm2\n\t" //t11
1120
-        "paddw %%mm3, %%mm0                        \n\t" //t12
1121
-
1122
-        "psllw $3, %%mm0                           \n\t"
1123
-        "psubw %%mm3, %%mm4                        \n\t" //t10
1124
-
1125
-        "movq 0*8+%3, %%mm6                        \n\t"
1126
-        "movq %%mm1, %%mm3                         \n\t"
1127
-
1128
-        "psllw $3, %%mm4                           \n\t"
1129
-        "psubw %%mm5, %%mm0                        \n\t" //t6
1130
-
1131
-        "psllw $3, %%mm2                           \n\t"
1132
-        "paddw %%mm0, %%mm1                        \n\t" //d1
1133
-
1134
-        "psubw %%mm0, %%mm2                        \n\t" //t5
1135
-        "psubw %%mm0, %%mm3                        \n\t" //d6
1136
-
1137
-        "paddw %%mm2, %%mm4                        \n\t" //t4
1138
-        "movq %%mm7, %%mm0                         \n\t"
1139
-
1140
-        "paddw %%mm2, %%mm7                        \n\t" //d2
1141
-        "psubw %%mm2, %%mm0                        \n\t" //d5
1142
-
1143
-        "movq "MANGLE(MM_DESCALE_RND)", %%mm2      \n\t" //4
1144
-        "psubw %%mm5, %%mm6                        \n\t" //d7
1145
-
1146
-        "paddw 0*8+%3, %%mm5                       \n\t" //d0
1147
-        "paddw %%mm2, %%mm1                        \n\t"
1148
-
1149
-        "paddw %%mm2, %%mm5                        \n\t"
1150
-        "psraw $3, %%mm1                           \n\t"
1151
-
1152
-        "paddw %%mm2, %%mm7                        \n\t"
1153
-        "psraw $3, %%mm5                           \n\t"
1154
-
1155
-        "paddw (%%"REG_D"), %%mm5                  \n\t"
1156
-        "psraw $3, %%mm7                           \n\t"
1157
-
1158
-        "paddw (%%"REG_D",%%"REG_a"), %%mm1        \n\t"
1159
-        "paddw %%mm2, %%mm0                        \n\t"
1160
-
1161
-        "paddw (%%"REG_D",%%"REG_a",2), %%mm7      \n\t"
1162
-        "paddw %%mm2, %%mm3                        \n\t"
1163
-
1164
-        "movq %%mm5, (%%"REG_D")                   \n\t"
1165
-        "paddw %%mm2, %%mm6                        \n\t"
1166
-
1167
-        "movq %%mm1, (%%"REG_D",%%"REG_a")         \n\t"
1168
-        "psraw $3, %%mm0                           \n\t"
1169
-
1170
-        "movq %%mm7, (%%"REG_D",%%"REG_a",2)       \n\t"
1171
-        "add %%"REG_d", %%"REG_D"                  \n\t" //3*ls
1172
-
1173
-        "movq 1*8+%3, %%mm5                        \n\t" //t3
1174
-        "psraw $3, %%mm3                           \n\t"
1175
-
1176
-        "paddw (%%"REG_D",%%"REG_a",2), %%mm0      \n\t"
1177
-        "psubw %%mm4, %%mm5                        \n\t" //d3
1178
-
1179
-        "paddw (%%"REG_D",%%"REG_d"), %%mm3        \n\t"
1180
-        "psraw $3, %%mm6                           \n\t"
1181
-
1182
-        "paddw 1*8+%3, %%mm4                       \n\t" //d4
1183
-        "paddw %%mm2, %%mm5                        \n\t"
1184
-
1185
-        "paddw (%%"REG_D",%%"REG_a",4), %%mm6      \n\t"
1186
-        "paddw %%mm2, %%mm4                        \n\t"
1187
-
1188
-        "movq %%mm0, (%%"REG_D",%%"REG_a",2)       \n\t"
1189
-        "psraw $3, %%mm5                           \n\t"
1190
-
1191
-        "paddw (%%"REG_D"), %%mm5                  \n\t"
1192
-        "psraw $3, %%mm4                           \n\t"
1193
-
1194
-        "paddw (%%"REG_D",%%"REG_a"), %%mm4        \n\t"
1195
-        "add $"DCTSIZE_S"*2*4, %%"REG_S"           \n\t" //4 rows
1196
-
1197
-        "movq %%mm3, (%%"REG_D",%%"REG_d")         \n\t"
1198
-        "movq %%mm6, (%%"REG_D",%%"REG_a",4)       \n\t"
1199
-        "movq %%mm5, (%%"REG_D")                   \n\t"
1200
-        "movq %%mm4, (%%"REG_D",%%"REG_a")         \n\t"
1201
-
1202
-        "sub %%"REG_d", %%"REG_D"                  \n\t"
1203
-        "add $8, %%"REG_D"                         \n\t"
1204
-        "dec %%"REG_c"                             \n\t"
1205
-        "jnz 1b                                    \n\t"
1206
-
1207
-        : "+S"(workspace), "+D"(output_adr), "+c"(cnt), "=o"(temps)
1208
-        : "a"(output_stride * sizeof(short))
1209
-        NAMED_CONSTRAINTS_ADD(MM_FIX_1_414213562_A, MM_FIX_2_613125930, MM_FIX_1_847759065, MM_FIX_1_082392200,
1210
-                              MM_FIX_1_414213562,MM_DESCALE_RND)
1211
-        : "%"REG_d, "memory" /* workspace is read and the output rows are updated through pointers only */
1212
-        );
1213
-}
1214
-
1215
-/*
- * MMX row forward DCT of 8-bit pixels into 16-bit coefficients.
- *
- * Each loop iteration reads 4 bytes from each of 8 pixel rows (line_size
- * bytes apart), widens them to 16 bit, transforms them, and stores the result
- * to data as two transposed 4x4 groups.
- *
- * data      : coefficient output; advanced by DCTSIZE_S*2*4 bytes per iteration
- * pixels    : source pixels; advanced by 4 bytes per iteration
- * line_size : distance between pixel rows, in bytes
- * cnt       : iteration count; the loop decrements before testing, so it must be > 0
- */
-static void row_fdct_mmx(int16_t *data, const uint8_t *pixels, int line_size, int cnt)
1216
-{
1217
-    DECLARE_ALIGNED(8, uint64_t, temps)[4];
1218
-    /* temps[0] keeps t7 and temps[1] keeps t6 until the second half of the loop body reloads them. */
1219
-    __asm__ volatile(
1220
-        "lea (%%"REG_a",%%"REG_a",2), %%"REG_d"    \n\t"
1221
-        "6:                                        \n\t"
1222
-        "movd (%%"REG_S"), %%mm0                   \n\t"
1223
-        "pxor %%mm7, %%mm7                         \n\t"
1224
-
1225
-        "movd (%%"REG_S",%%"REG_a"), %%mm1         \n\t"
1226
-        "punpcklbw %%mm7, %%mm0                    \n\t"
1227
-
1228
-        "movd (%%"REG_S",%%"REG_a",2), %%mm2       \n\t"
1229
-        "punpcklbw %%mm7, %%mm1                    \n\t"
1230
-
1231
-        "punpcklbw %%mm7, %%mm2                    \n\t"
1232
-        "add %%"REG_d", %%"REG_S"                  \n\t"
1233
-
1234
-        "movq %%mm0, %%mm5                         \n\t"
1235
-        //
1236
-
1237
-        "movd (%%"REG_S",%%"REG_a",4), %%mm3       \n\t" //7  ;prefetch!
1238
-        "movq %%mm1, %%mm6                         \n\t"
1239
-
1240
-        "movd (%%"REG_S",%%"REG_d"), %%mm4         \n\t" //6
1241
-        "punpcklbw %%mm7, %%mm3                    \n\t"
1242
-
1243
-        "psubw %%mm3, %%mm5                        \n\t"
1244
-        "punpcklbw %%mm7, %%mm4                    \n\t"
1245
-
1246
-        "paddw %%mm3, %%mm0                        \n\t"
1247
-        "psubw %%mm4, %%mm6                        \n\t"
1248
-
1249
-        "movd (%%"REG_S",%%"REG_a",2), %%mm3       \n\t" //5
1250
-        "paddw %%mm4, %%mm1                        \n\t"
1251
-
1252
-        "movq %%mm5, %3                            \n\t" //t7
1253
-        "punpcklbw %%mm7, %%mm3                    \n\t"
1254
-
1255
-        "movq %%mm6, %4                            \n\t" //t6
1256
-        "movq %%mm2, %%mm4                         \n\t"
1257
-
1258
-        "movd (%%"REG_S"), %%mm5                   \n\t" //3
1259
-        "paddw %%mm3, %%mm2                        \n\t"
1260
-
1261
-        "movd (%%"REG_S",%%"REG_a"), %%mm6         \n\t" //4
1262
-        "punpcklbw %%mm7, %%mm5                    \n\t"
1263
-
1264
-        "psubw %%mm3, %%mm4                        \n\t"
1265
-        "punpcklbw %%mm7, %%mm6                    \n\t"
1266
-
1267
-        "movq %%mm5, %%mm3                         \n\t"
1268
-        "paddw %%mm6, %%mm5                        \n\t" //t3
1269
-
1270
-        "psubw %%mm6, %%mm3                        \n\t" //t4  ; t0 t1 t2 t4 t5 t3 - -
1271
-        "movq %%mm0, %%mm6                         \n\t"
1272
-
1273
-        "movq %%mm1, %%mm7                         \n\t"
1274
-        "psubw %%mm5, %%mm0                        \n\t" //t13
1275
-
1276
-        "psubw %%mm2, %%mm1                        \n\t"
1277
-        "paddw %%mm2, %%mm7                        \n\t" //t11
1278
-
1279
-        "paddw %%mm0, %%mm1                        \n\t"
1280
-        "movq %%mm7, %%mm2                         \n\t"
1281
-
1282
-        "psllw $2, %%mm1                           \n\t"
1283
-        "paddw %%mm5, %%mm6                        \n\t" //t10
1284
-
1285
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm1 \n\t"
1286
-        "paddw %%mm6, %%mm7                        \n\t" //d2
1287
-
1288
-        "psubw %%mm2, %%mm6                        \n\t" //d3
1289
-        "movq %%mm0, %%mm5                         \n\t"
1290
-
1291
-        //transpose 4x4
1292
-        "movq %%mm7, %%mm2                         \n\t"
1293
-        "punpcklwd %%mm6, %%mm7                    \n\t"
1294
-
1295
-        "paddw %%mm1, %%mm0                        \n\t" //d0
1296
-        "punpckhwd %%mm6, %%mm2                    \n\t"
1297
-
1298
-        "psubw %%mm1, %%mm5                        \n\t" //d1
1299
-        "movq %%mm0, %%mm6                         \n\t"
1300
-
1301
-        "movq %4, %%mm1                            \n\t"
1302
-        "punpcklwd %%mm5, %%mm0                    \n\t"
1303
-
1304
-        "punpckhwd %%mm5, %%mm6                    \n\t"
1305
-        "movq %%mm0, %%mm5                         \n\t"
1306
-
1307
-        "punpckldq %%mm7, %%mm0                    \n\t" //0
1308
-        "paddw %%mm4, %%mm3                        \n\t"
1309
-
1310
-        "punpckhdq %%mm7, %%mm5                    \n\t" //1
1311
-        "movq %%mm6, %%mm7                         \n\t"
1312
-
1313
-        "movq %%mm0, "DCTSIZE_S"*0*2(%%"REG_D")    \n\t"
1314
-        "punpckldq %%mm2, %%mm6                    \n\t" //2
1315
-
1316
-        "movq %%mm5, "DCTSIZE_S"*1*2(%%"REG_D")    \n\t"
1317
-        "punpckhdq %%mm2, %%mm7                    \n\t" //3
1318
-
1319
-        "movq %%mm6, "DCTSIZE_S"*2*2(%%"REG_D")    \n\t"
1320
-        "paddw %%mm1, %%mm4                        \n\t"
1321
-
1322
-        "movq %%mm7, "DCTSIZE_S"*3*2(%%"REG_D")    \n\t"
1323
-        "psllw $2, %%mm3                           \n\t" //t10
1324
-
1325
-        "movq %3, %%mm2                            \n\t"
1326
-        "psllw $2, %%mm4                           \n\t" //t11
1327
-
1328
-        "pmulhw "MANGLE(ff_MM_FIX_0_707106781)", %%mm4 \n\t" //z3
1329
-        "paddw %%mm2, %%mm1                        \n\t"
1330
-
1331
-        "psllw $2, %%mm1                           \n\t" //t12
1332
-        "movq %%mm3, %%mm0                         \n\t"
1333
-
1334
-        "pmulhw "MANGLE(ff_MM_FIX_0_541196100)", %%mm0 \n\t"
1335
-        "psubw %%mm1, %%mm3                        \n\t"
1336
-
1337
-        "pmulhw "MANGLE(MM_FIX_0_382683433)", %%mm3 \n\t" //z5
1338
-        "movq %%mm2, %%mm5                         \n\t"
1339
-
1340
-        "pmulhw "MANGLE(MM_FIX_1_306562965)", %%mm1 \n\t"
1341
-        "psubw %%mm4, %%mm2                        \n\t" //z13
1342
-
1343
-        "paddw %%mm4, %%mm5                        \n\t" //z11
1344
-        "movq %%mm2, %%mm6                         \n\t"
1345
-
1346
-        "paddw %%mm3, %%mm0                        \n\t" //z2
1347
-        "movq %%mm5, %%mm7                         \n\t"
1348
-
1349
-        "paddw %%mm0, %%mm2                        \n\t" //d4
1350
-        "psubw %%mm0, %%mm6                        \n\t" //d5
1351
-
1352
-        "movq %%mm2, %%mm4                         \n\t"
1353
-        "paddw %%mm3, %%mm1                        \n\t" //z4
1354
-
1355
-        //transpose 4x4
1356
-        "punpcklwd %%mm6, %%mm2                    \n\t"
1357
-        "paddw %%mm1, %%mm5                        \n\t" //d6
1358
-
1359
-        "punpckhwd %%mm6, %%mm4                    \n\t"
1360
-        "psubw %%mm1, %%mm7                        \n\t" //d7
1361
-
1362
-        "movq %%mm5, %%mm6                         \n\t"
1363
-        "punpcklwd %%mm7, %%mm5                    \n\t"
1364
-
1365
-        "punpckhwd %%mm7, %%mm6                    \n\t"
1366
-        "movq %%mm2, %%mm7                         \n\t"
1367
-
1368
-        "punpckldq %%mm5, %%mm2                    \n\t" //4
1369
-        "sub %%"REG_d", %%"REG_S"                  \n\t"
1370
-
1371
-        "punpckhdq %%mm5, %%mm7                    \n\t" //5
1372
-        "movq %%mm4, %%mm5                         \n\t"
1373
-
1374
-        "movq %%mm2, "DCTSIZE_S"*0*2+"DCTSIZE_S"(%%"REG_D") \n\t"
1375
-        "punpckldq %%mm6, %%mm4                    \n\t" //6
1376
-
1377
-        "movq %%mm7, "DCTSIZE_S"*1*2+"DCTSIZE_S"(%%"REG_D") \n\t"
1378
-        "punpckhdq %%mm6, %%mm5                    \n\t" //7
1379
-
1380
-        "movq %%mm4, "DCTSIZE_S"*2*2+"DCTSIZE_S"(%%"REG_D") \n\t"
1381
-        "add $4, %%"REG_S"                         \n\t"
1382
-
1383
-        "movq %%mm5, "DCTSIZE_S"*3*2+"DCTSIZE_S"(%%"REG_D") \n\t"
1384
-        "add $"DCTSIZE_S"*2*4, %%"REG_D"           \n\t" //4 rows
1385
-        "dec %%"REG_c"                             \n\t"
1386
-        "jnz 6b                                    \n\t"
1387
-
1388
-        : "+S"(pixels), "+D"(data), "+c"(cnt), "=o"(temps), "=o"(temps[1])
1389
-        : "a"((x86_reg)line_size) /* widen first: the asm addresses with the full-width REG_a, not just its low 32 bits */
1390
-        NAMED_CONSTRAINTS_ADD(ff_MM_FIX_0_707106781, ff_MM_FIX_0_541196100, MM_FIX_0_382683433, MM_FIX_1_306562965)
1391
-        : "%"REG_d, "memory"); /* pixels are read and data is written through pointers only */
1392
-}
1393
-#endif
1394
-
1395
-av_cold void ff_fspp_init_x86(FSPPContext *s)
1396
-{
1397
-#if HAVE_MMX_INLINE
1398
-    /* Install the inline-asm MMX routines, but only when the running CPU reports MMX. */
1399
-    const int flags = av_get_cpu_flags();
1400
-    if (!(flags & AV_CPU_FLAG_MMX))
1401
-        return;
1402
-    s->row_fdct     = row_fdct_mmx;
1403
-    s->row_idct     = row_idct_mmx;
1404
-    s->column_fidct = column_fidct_mmx;
1405
-    s->mul_thrmat   = mul_thrmat_mmx;
1406
-    s->store_slice2 = store_slice2_mmx;
1407
-    s->store_slice  = store_slice_mmx;
1408
-#endif
1409
-}
1410 1
new file mode 100644
... ...
@@ -0,0 +1,49 @@
0
+/*
1
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
2
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
3
+ *
4
+ * This file is part of FFmpeg.
5
+ *
6
+ * FFmpeg is free software; you can redistribute it and/or modify
7
+ * it under the terms of the GNU General Public License as published by
8
+ * the Free Software Foundation; either version 2 of the License, or
9
+ * (at your option) any later version.
10
+ *
11
+ * FFmpeg is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
+ * GNU General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU General Public License along
17
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19
+ */
20
+
21
+#include "libavutil/attributes.h"
22
+#include "libavutil/x86/cpu.h"
23
+#include "libavfilter/vf_fspp.h"
24
+
25
+void ff_store_slice_mmx(uint8_t *dst, int16_t *src, /* Prototypes only: these six ff_*_mmx routines are declared here without a body. NOTE(review): presumably defined in the yasm source this commit adds -- confirm. */
26
+                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
27
+                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
28
+void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, /* same parameter list as ff_store_slice_mmx */
29
+                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
30
+                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
31
+void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); /* same signature as the reworked mul_thrmat_c: two int16_t pointers plus q */
32
+void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); /* same signature as column_fidct_c */
33
+void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); /* stride is ptrdiff_t, matching the updated row_idct_c */
34
+void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); /* assigned to s->row_fdct in ff_fspp_init_x86 */
35
+
36
+av_cold void ff_fspp_init_x86(FSPPContext *s) /* Points the six function pointers in s at the ff_*_mmx routines when EXTERNAL_MMX(cpu_flags) holds; otherwise s is left untouched. */
37
+{
38
+    int cpu_flags = av_get_cpu_flags(); /* CPU feature flags, passed to EXTERNAL_MMX below */
39
+
40
+    if (EXTERNAL_MMX(cpu_flags)) { /* NOTE(review): macro is not defined in this file, presumably from libavutil/x86/cpu.h -- confirm its exact build-time and run-time conditions there */
41
+        s->store_slice  = ff_store_slice_mmx;
42
+        s->store_slice2 = ff_store_slice2_mmx;
43
+        s->mul_thrmat   = ff_mul_thrmat_mmx;
44
+        s->column_fidct = ff_column_fidct_mmx;
45
+        s->row_idct     = ff_row_idct_mmx;
46
+        s->row_fdct     = ff_row_fdct_mmx;
47
+    }
48
+}