
vf_colorspace: x86-64 SIMD (SSE2) optimizations.

Ronald S. Bultje authored on 2016/04/07 03:09:08
Showing 9 changed files
... ...
@@ -128,4 +128,7 @@ void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp)
128 128
     init_yuv2yuv_fns(2, 12);
129 129
 
130 130
     dsp->multiply3x3 = multiply3x3_c;
131
+
132
+    if (ARCH_X86)
133
+        ff_colorspacedsp_x86_init(dsp);
131 134
 }
... ...
@@ -48,4 +48,7 @@ typedef struct ColorSpaceDSPContext {
48 48
 
49 49
 void ff_colorspacedsp_init(ColorSpaceDSPContext *dsp);
50 50
 
51
+/* internal */
52
+void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp);
53
+
51 54
 #endif /* AVFILTER_COLORSPACEDSP_H */
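The two hunks above (ff_colorspacedsp_init() in libavfilter/colorspacedsp.c and the new declaration in libavfilter/colorspacedsp.h) are the only generic-code changes: the x86 init is called after the C defaults are installed, so it can overwrite individual function pointers when SSE2 is usable. A minimal caller sketch (hypothetical, not part of the patch) of how the resulting table is used, with index meanings taken from the assignments and the checkasm test further down: depth index 0/1/2 selects 8/10/12 bits, format index 0/1/2 selects 4:4:4/4:2:2/4:2:0.

/* Hypothetical usage sketch: convert one 8-bit 4:2:0 frame to 8-bit 4:2:0
 * with new coefficients/offsets.  Table indices are
 * [in_depth][out_depth][subsampling], 0/1/2 = 8/10/12 bits and
 * 0/1/2 = 4:4:4/4:2:2/4:2:0. */
#include "libavfilter/colorspacedsp.h"

static void convert_frame_8bit_420(uint8_t *dst[3], ptrdiff_t dst_stride[3],
                                   uint8_t *src[3], ptrdiff_t src_stride[3],
                                   int w, int h,
                                   const int16_t coeffs[3][3][8],
                                   const int16_t offsets[2][8])
{
    ColorSpaceDSPContext dsp;

    ff_colorspacedsp_init(&dsp);   /* C defaults, then SSE2 overrides on x86-64 */
    dsp.yuv2yuv[0][0][2](dst, dst_stride, src, src_stride, w, h, coeffs, offsets);
}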
... ...
@@ -1,5 +1,6 @@
1 1
 OBJS-$(CONFIG_BLEND_FILTER)                  += x86/vf_blend_init.o
2 2
 OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
3
+OBJS-$(CONFIG_COLORSPACE_FILTER)             += x86/colorspacedsp_init.o
3 4
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
4 5
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
5 6
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
... ...
@@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
23 23
 
24 24
 YASM-OBJS-$(CONFIG_BLEND_FILTER)             += x86/vf_blend.o
25 25
 YASM-OBJS-$(CONFIG_BWDIF_FILTER)             += x86/vf_bwdif.o
26
+YASM-OBJS-$(CONFIG_COLORSPACE_FILTER)        += x86/colorspacedsp.o
26 27
 YASM-OBJS-$(CONFIG_FSPP_FILTER)              += x86/vf_fspp.o
27 28
 YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
28 29
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
29 30
new file mode 100644
... ...
@@ -0,0 +1,1097 @@
0
+;*****************************************************************************
1
+;* x86-optimized functions for colorspace filter
2
+;*
3
+;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
4
+;*
5
+;* This file is part of FFmpeg.
6
+;*
7
+;* FFmpeg is free software; you can redistribute it and/or
8
+;* modify it under the terms of the GNU Lesser General Public
9
+;* License as published by the Free Software Foundation; either
10
+;* version 2.1 of the License, or (at your option) any later version.
11
+;*
12
+;* FFmpeg is distributed in the hope that it will be useful,
13
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
+;* Lesser General Public License for more details.
16
+;*
17
+;* You should have received a copy of the GNU Lesser General Public
18
+;* License along with FFmpeg; if not, write to the Free Software
19
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+;******************************************************************************
21
+
22
+%include "libavutil/x86/x86util.asm"
23
+
24
+SECTION_RODATA
25
+
26
+pw_1: times 8 dw 1
27
+pw_2: times 8 dw 2
28
+pw_4: times 8 dw 4
29
+pw_8: times 8 dw 8
30
+pw_16: times 8 dw 16
31
+pw_64: times 8 dw 64
32
+pw_128: times 8 dw 128
33
+pw_256: times 8 dw 256
34
+pw_512: times 8 dw 512
35
+pw_1023: times 8 dw 1023
36
+pw_1024: times 8 dw 1024
37
+pw_2048: times 8 dw 2048
38
+pw_4095: times 8 dw 4095
39
+pw_8192: times 8 dw 8192
40
+pw_16384: times 8 dw 16384
41
+
42
+pd_1: times 4 dd 1
43
+pd_2: times 4 dd 2
44
+pd_128: times 4 dd 128
45
+pd_512: times 4 dd 512
46
+pd_2048: times 4 dd 2048
47
+pd_8192: times 4 dd 8192
48
+pd_32768: times 4 dd 32768
49
+pd_131072: times 4 dd 131072
50
+
51
+SECTION .text
52
+
53
+; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
54
+;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
55
+;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
56
+;                               const int16_t yuv_offset[2][8])
57
+
58
+%if ARCH_X86_64
59
+%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)
60
+
61
+%assign %%sh (14 + %1 - %2)
62
+%assign %%rnd (1 << (%%sh - 1))
63
+%assign %%uvinoff (128 << (%1 - 8))
64
+%assign %%uvoutoff (128 << (%2 - 8))
65
+%if %3 == 0
66
+%assign %%ss 444
67
+%elif %4 == 0
68
+%assign %%ss 422
69
+%else ; %4 == 1
70
+%assign %%ss 420
71
+%endif ; %3/%4
72
+%if %2 != 8
73
+%assign %%maxval (1 << %2) - 1
74
+%endif ; %2 != 8
75
+
76
+%assign %%ypsh %%sh - 1
77
+%if %%ypsh > 14
78
+%assign %%yoffsh %%ypsh - 13
79
+%assign %%ypsh 14
80
+%else
81
+%assign %%yoffsh 1
82
+%endif
83
+%assign %%yprnd (1 << (%%yoffsh - 1))
84
+%assign %%ypmul (1 << %%ypsh)
85
+
86
+cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
87
+                                     yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
88
+%if %3 == 1
89
+    inc             wd
90
+    sar             wd, 1
91
+%if %4 == 1
92
+    inc             hd
93
+    sar             hd, 1
94
+%endif ; %4 == 1
95
+%endif ; %3 == 1
96
+    mov [rsp+3*mmsize+0], wd
97
+    mov [rsp+3*mmsize+4], hd
98
+
99
+    mova           m10, [cq]
100
+    pxor           m11, m11
101
+    mova           m12, [pd_ %+ %%uvoutoff]
102
+    pslld          m12, %%sh
103
+    paddd          m12, [pd_ %+ %%rnd]
104
+    mova           m13, [pw_ %+ %%uvinoff]
105
+    mova           m14, [yoffq+ 0]      ; y_off_in
106
+    mova           m15, [yoffq+16]      ; y_off_out
107
+%if %%yoffsh != 0
108
+    psllw          m15, %%yoffsh
109
+%endif
110
+    paddw          m15, [pw_ %+ %%yprnd]
111
+    punpcklwd      m10, m15
112
+    mova           m15, [pw_ %+ %%ypmul]
113
+    movh            m0, [cq+1*16]       ; cyu
114
+    movh            m1, [cq+2*16]       ; cyv
115
+    movh            m2, [cq+4*16]       ; cuu
116
+    movh            m3, [cq+5*16]       ; cuv
117
+    movh            m4, [cq+7*16]       ; cvu
118
+    movh            m5, [cq+8*16]       ; cvv
119
+    punpcklwd       m0, m1
120
+    punpcklwd       m2, m3
121
+    punpcklwd       m4, m5
122
+    mova [rsp+0*mmsize], m0
123
+    mova [rsp+1*mmsize], m2
124
+    mova [rsp+2*mmsize], m4
125
+
126
+    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp
127
+
128
+    mov            uiq, [yiq+gprsize*1]
129
+    mov            viq, [yiq+gprsize*2]
130
+    mov            yiq, [yiq+gprsize*0]
131
+    mov            uoq, [yoq+gprsize*1]
132
+    mov            voq, [yoq+gprsize*2]
133
+    mov            yoq, [yoq+gprsize*0]
134
+    mov           uisq, [yisq+gprsize*1]
135
+    mov           visq, [yisq+gprsize*2]
136
+    mov           yisq, [yisq+gprsize*0]
137
+    mov           uosq, [yosq+gprsize*1]
138
+    mov           vosq, [yosq+gprsize*2]
139
+    mov           yosq, [yosq+gprsize*0]
140
+
141
+.loop_v:
142
+    xor             xq, xq
143
+
144
+.loop_h:
145
+%if %4 == 1
146
+    lea           tmpq, [yiq+yisq]
147
+%endif ; %4 == 1
148
+%if %1 == 8
149
+    movu            m0, [yiq+xq*(1<<%3)]        ; y00/01
150
+%if %4 == 1
151
+    movu            m2, [tmpq+xq*2]             ; y10/11
152
+%endif ; %4 == 1
153
+%if %3 == 1
154
+    movh            m4, [uiq+xq]                ; u
155
+    movh            m5, [viq+xq]                ; v
156
+%else ; %3 != 1
157
+    movu            m4, [uiq+xq]                ; u
158
+    movu            m5, [viq+xq]                ; v
159
+%endif ; %3 ==/!= 1
160
+    punpckhbw       m1, m0, m11
161
+    punpcklbw       m0, m11
162
+%if %4 == 1
163
+    punpckhbw       m3, m2, m11
164
+    punpcklbw       m2, m11
165
+%endif ; %4 == 1
166
+%if %3 == 0
167
+    punpckhbw       m2, m4, m11
168
+    punpckhbw       m3, m5, m11
169
+%endif ; %3 == 0
170
+    punpcklbw       m4, m11
171
+    punpcklbw       m5, m11
172
+%else ; %1 != 8
173
+    movu            m0, [yiq+xq*(2<<%3)]        ; y00/01
174
+    movu            m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
175
+%if %4 == 1
176
+    movu            m2, [tmpq+xq*4]             ; y10/11
177
+    movu            m3, [tmpq+xq*4+mmsize]      ; y10/11
178
+%endif ; %4 == 1
179
+    movu            m4, [uiq+xq*2]              ; u
180
+    movu            m5, [viq+xq*2]              ; v
181
+%if %3 == 0
182
+    movu            m2, [uiq+xq*2+mmsize]
183
+    movu            m3, [viq+xq*2+mmsize]
184
+%endif ; %3 == 0
185
+%endif ; %1 ==/!= 8
186
+    psubw           m0, m14
187
+    psubw           m1, m14
188
+%if %4 == 1
189
+    psubw           m2, m14
190
+    psubw           m3, m14
191
+%endif ; %4 == 1
192
+    psubw           m4, m13
193
+    psubw           m5, m13
194
+%if %3 == 0
195
+    psubw           m2, m13
196
+    psubw           m3, m13
197
+%endif ; %3 == 0
198
+
199
+    SBUTTERFLY   wd, 4, 5, 6
200
+    pmaddwd         m6, m4, [rsp+1*mmsize]
201
+    pmaddwd         m7, m5, [rsp+1*mmsize]
202
+%if %3 == 0
203
+    SBUTTERFLY   wd, 2, 3, 8
204
+    pmaddwd         m8, m2, [rsp+1*mmsize]
205
+    pmaddwd         m9, m3, [rsp+1*mmsize]
206
+%else ; %3 != 0
207
+    pmaddwd         m8, m4, [rsp+2*mmsize]
208
+    pmaddwd         m9, m5, [rsp+2*mmsize]
209
+%endif
210
+    paddd           m6, m12
211
+    paddd           m7, m12
212
+    paddd           m8, m12
213
+    paddd           m9, m12
214
+    psrad           m6, %%sh
215
+    psrad           m7, %%sh
216
+    psrad           m8, %%sh
217
+    psrad           m9, %%sh
218
+    packssdw        m6, m7
219
+    packssdw        m8, m9
220
+%if %2 == 8
221
+    packuswb        m6, m8
222
+%if %3 == 0
223
+    movu      [uoq+xq], m6
224
+%else ; %3 != 0
225
+    movh      [uoq+xq], m6
226
+    movhps    [voq+xq], m6
227
+%endif ; %3 ==/!= 0
228
+%else ; %2 != 8
229
+    CLIPW           m6, m11, [pw_ %+ %%maxval]
230
+    CLIPW           m8, m11, [pw_ %+ %%maxval]
231
+    movu    [uoq+xq*2], m6
232
+%if %3 == 0
233
+    movu    [uoq+xq*2+mmsize], m8
234
+%else ; %3 != 0
235
+    movu    [voq+xq*2], m8
236
+%endif ; %3 ==/!= 0
237
+%endif ; %2 ==/!= 8
238
+
239
+%if %3 == 0
240
+    pmaddwd         m6, m4, [rsp+2*mmsize]
241
+    pmaddwd         m7, m5, [rsp+2*mmsize]
242
+    pmaddwd         m8, m2, [rsp+2*mmsize]
243
+    pmaddwd         m9, m3, [rsp+2*mmsize]
244
+    paddd           m6, m12
245
+    paddd           m7, m12
246
+    paddd           m8, m12
247
+    paddd           m9, m12
248
+    psrad           m6, %%sh
249
+    psrad           m7, %%sh
250
+    psrad           m8, %%sh
251
+    psrad           m9, %%sh
252
+    packssdw        m6, m7
253
+    packssdw        m8, m9
254
+%if %2 == 8
255
+    packuswb        m6, m8
256
+    movu      [voq+xq], m6
257
+%else ; %2 != 8
258
+    CLIPW           m6, m11, [pw_ %+ %%maxval]
259
+    CLIPW           m8, m11, [pw_ %+ %%maxval]
260
+    movu    [voq+xq*2], m6
261
+    movu    [voq+xq*2+mmsize], m8
262
+%endif ; %2 ==/!= 8
263
+%endif ; %3 == 0
264
+
265
+    pmaddwd         m4, [rsp+0*mmsize]
266
+    pmaddwd         m5, [rsp+0*mmsize]          ; uv_val
267
+%if %3 == 0
268
+    pmaddwd         m2, [rsp+0*mmsize]
269
+    pmaddwd         m3, [rsp+0*mmsize]
270
+%endif ; %3 == 0
271
+
272
+    ; unpack y pixels with m15 (shifted round + offset), then multiply
273
+    ; by m10, add uv pixels, and we're done!
274
+%if %3 == 1
275
+    punpckhdq       m8, m4, m4
276
+    punpckldq       m4, m4
277
+    punpckhdq       m9, m5, m5
278
+    punpckldq       m5, m5
279
+%else ; %3 != 1
280
+    SWAP             8, 5, 2
281
+    SWAP             3, 9
282
+%endif ; %3 ==/!= 1
283
+%if %4 == 1
284
+    punpckhwd       m6, m2, m15
285
+    punpcklwd       m2, m15
286
+    punpckhwd       m7, m3, m15
287
+    punpcklwd       m3, m15
288
+    pmaddwd         m2, m10
289
+    pmaddwd         m6, m10
290
+    pmaddwd         m3, m10
291
+    pmaddwd         m7, m10
292
+    paddd           m2, m4
293
+    paddd           m6, m8
294
+    paddd           m3, m5
295
+    paddd           m7, m9
296
+    psrad           m2, %%sh
297
+    psrad           m6, %%sh
298
+    psrad           m3, %%sh
299
+    psrad           m7, %%sh
300
+    packssdw        m2, m6
301
+    packssdw        m3, m7
302
+
303
+    lea           tmpq, [yoq+yosq]
304
+%if %2 == 8
305
+    packuswb        m2, m3
306
+    movu   [tmpq+xq*2], m2
307
+%else ; %2 != 8
308
+    CLIPW           m2, m11, [pw_ %+ %%maxval]
309
+    CLIPW           m3, m11, [pw_ %+ %%maxval]
310
+    movu   [tmpq+xq*4], m2
311
+    movu [tmpq+xq*4+mmsize], m3
312
+%endif ; %2 ==/!= 8
313
+%endif ; %4 == 1
314
+
315
+    punpckhwd       m6, m0, m15
316
+    punpcklwd       m0, m15
317
+    punpckhwd       m7, m1, m15
318
+    punpcklwd       m1, m15
319
+    pmaddwd         m0, m10
320
+    pmaddwd         m6, m10
321
+    pmaddwd         m1, m10
322
+    pmaddwd         m7, m10
323
+    paddd           m0, m4
324
+    paddd           m6, m8
325
+    paddd           m1, m5
326
+    paddd           m7, m9
327
+    psrad           m0, %%sh
328
+    psrad           m6, %%sh
329
+    psrad           m1, %%sh
330
+    psrad           m7, %%sh
331
+    packssdw        m0, m6
332
+    packssdw        m1, m7
333
+
334
+%if %2 == 8
335
+    packuswb        m0, m1
336
+    movu    [yoq+xq*(1<<%3)], m0
337
+%else ; %2 != 8
338
+    CLIPW           m0, m11, [pw_ %+ %%maxval]
339
+    CLIPW           m1, m11, [pw_ %+ %%maxval]
340
+    movu  [yoq+xq*(2<<%3)], m0
341
+    movu [yoq+xq*(2<<%3)+mmsize], m1
342
+%endif ; %2 ==/!= 8
343
+
344
+    add             xq, mmsize >> %3
345
+    cmp             xd, dword [rsp+3*mmsize+0]
346
+    jl .loop_h
347
+
348
+%if %4 == 1
349
+    lea            yiq, [yiq+yisq*2]
350
+    lea            yoq, [yoq+yosq*2]
351
+%else ; %4 != 1
352
+    add            yiq, yisq
353
+    add            yoq, yosq
354
+%endif ; %4 ==/!= 1
355
+    add            uiq, uisq
356
+    add            viq, visq
357
+    add            uoq, uosq
358
+    add            voq, vosq
359
+    dec dword [rsp+3*mmsize+4]
360
+    jg .loop_v
361
+
362
+    RET
363
+%endmacro
364
+
365
+%macro YUV2YUV_FNS 2 ; ss_w, ss_h
366
+YUV2YUV_FN  8,  8, %1, %2
367
+YUV2YUV_FN 10,  8, %1, %2
368
+YUV2YUV_FN 12,  8, %1, %2
369
+YUV2YUV_FN  8, 10, %1, %2
370
+YUV2YUV_FN 10, 10, %1, %2
371
+YUV2YUV_FN 12, 10, %1, %2
372
+YUV2YUV_FN  8, 12, %1, %2
373
+YUV2YUV_FN 10, 12, %1, %2
374
+YUV2YUV_FN 12, 12, %1, %2
375
+%endmacro
376
+
377
+INIT_XMM sse2
378
+YUV2YUV_FNS 0, 0
379
+YUV2YUV_FNS 1, 0
380
+YUV2YUV_FNS 1, 1
381
+
382
+; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
383
+;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
384
+;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
385
+;                            const int16_t yuv_offset[8])
386
+%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
387
+%assign %%sh (%1 - 1)
388
+%assign %%rnd (1 << (%%sh - 1))
389
+%assign %%uvoff (1 << (%1 - 1))
390
+%if %2 == 0
391
+%assign %%ss 444
392
+%elif %3 == 0
393
+%assign %%ss 422
394
+%else ; %3 == 1
395
+%assign %%ss 420
396
+%endif ; %2/%3
397
+
398
+cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
399
+                                rgb, rgbs, yuv, yuvs, ww, h, c, yoff
400
+%if %2 == 1
401
+    inc            wwd
402
+    sar            wwd, 1
403
+%endif ; %2 == 1
404
+%if %3 == 1
405
+    inc             hd
406
+    sar             hd, 1
407
+%endif ; %3 == 1
408
+    pxor           m11, m11
409
+    mova           m15, [yoffq]                 ; yoff
410
+    movh           m14, [cq+  0]                ; cy
411
+    movh           m10, [cq+ 32]                ; crv
412
+    movh           m13, [cq+112]                ; cbu
413
+    movh           m12, [cq+ 64]                ; cgu
414
+    movh            m9, [cq+ 80]                ; cgv
415
+    punpcklwd      m14, [pw_ %+ %%rnd]          ; cy, rnd
416
+    punpcklwd      m13, m11                     ; cbu, 0
417
+    punpcklwd      m11, m10                     ; 0, crv
418
+    punpcklwd      m12, m9                      ; cgu, cgv
419
+    mova [rsp+0*mmsize], m11
420
+    mova [rsp+1*mmsize], m12
421
+    mova [rsp+2*mmsize], m13
422
+    mova [rsp+3*mmsize], m14
423
+    pxor           m14, m14
424
+
425
+    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp
426
+
427
+    mov             gq, [rq+1*gprsize]
428
+    mov             bq, [rq+2*gprsize]
429
+    mov             rq, [rq+0*gprsize]
430
+    mov             uq, [yq+1*gprsize]
431
+    mov             vq, [yq+2*gprsize]
432
+    mov             yq, [yq+0*gprsize]
433
+    mov            usq, [ysq+1*gprsize]
434
+    mov            vsq, [ysq+2*gprsize]
435
+    mov            ysq, [ysq+0*gprsize]
436
+
437
+.loop_v:
438
+    xor             xq, xq
439
+
440
+.loop_h:
441
+%if %3 == 1
442
+    lea           tmpq, [yq+ysq]
443
+%endif ; %3 == 1
444
+%if %1 == 8
445
+    movu            m0, [yq+xq*(1<<%2)]
446
+%if %3 == 1
447
+    movu            m2, [tmpq+xq*2]
448
+%endif ; %3 == 1
449
+%if %2 == 1
450
+    movh            m4, [uq+xq]
451
+    movh            m5, [vq+xq]
452
+%else ; %2 != 1
453
+    movu            m4, [uq+xq]
454
+    movu            m5, [vq+xq]
455
+%endif ; %2 ==/!= 1
456
+    punpckhbw       m1, m0, m14
457
+    punpcklbw       m0, m14
458
+%if %3 == 1
459
+    punpckhbw       m3, m2, m14
460
+    punpcklbw       m2, m14
461
+%endif ; %3 == 1
462
+%if %2 == 0
463
+    punpckhbw       m2, m4, m14
464
+    punpckhbw       m3, m5, m14
465
+%endif ; %2 == 0
466
+    punpcklbw       m4, m14
467
+    punpcklbw       m5, m14
468
+%else ; %1 != 8
469
+    movu            m0, [yq+xq*(2<<%2)]
470
+    movu            m1, [yq+xq*(2<<%2)+mmsize]
471
+%if %3 == 1
472
+    movu            m2, [tmpq+xq*4]
473
+    movu            m3, [tmpq+xq*4+mmsize]
474
+%endif ; %3 == 1
475
+    movu            m4, [uq+xq*2]
476
+    movu            m5, [vq+xq*2]
477
+%if %2 == 0
478
+    movu            m2, [uq+xq*2+mmsize]
479
+    movu            m3, [vq+xq*2+mmsize]
480
+%endif ; %2 == 0
481
+%endif ; %1 ==/!= 8
482
+    psubw           m0, m15
483
+    psubw           m1, m15
484
+%if %3 == 1
485
+    psubw           m2, m15
486
+    psubw           m3, m15
487
+%endif ; %3 == 1
488
+    psubw           m4, [pw_ %+ %%uvoff]
489
+    psubw           m5, [pw_ %+ %%uvoff]
490
+    SBUTTERFLY   wd, 4, 5, 6
491
+%if %2 == 0
492
+    psubw           m2, [pw_ %+ %%uvoff]
493
+    psubw           m3, [pw_ %+ %%uvoff]
494
+    SBUTTERFLY   wd, 2, 3, 6
495
+%endif ; %2 == 0
496
+
497
+    ; calculate y+rnd full-resolution [0-3,6-9]
498
+    punpckhwd       m6, m0, [pw_1]              ; y, 1
499
+    punpcklwd       m0, [pw_1]                  ; y, 1
500
+    punpckhwd       m7, m1, [pw_1]              ; y, 1
501
+    punpcklwd       m1, [pw_1]                  ; y, 1
502
+    pmaddwd         m0, [rsp+3*mmsize]
503
+    pmaddwd         m6, [rsp+3*mmsize]
504
+    pmaddwd         m1, [rsp+3*mmsize]
505
+    pmaddwd         m7, [rsp+3*mmsize]
506
+%if %3 == 1
507
+    punpckhwd       m8, m2, [pw_1]              ; y, 1
508
+    punpcklwd       m2, [pw_1]                  ; y, 1
509
+    punpckhwd       m9, m3, [pw_1]              ; y, 1
510
+    punpcklwd       m3, [pw_1]                  ; y, 1
511
+    pmaddwd         m2, [rsp+3*mmsize]
512
+    pmaddwd         m8, [rsp+3*mmsize]
513
+    pmaddwd         m3, [rsp+3*mmsize]
514
+    pmaddwd         m9, [rsp+3*mmsize]
515
+    mova [rsp+4*mmsize], m2
516
+    mova [rsp+5*mmsize], m8
517
+    mova [rsp+6*mmsize], m3
518
+    mova [rsp+7*mmsize], m9
519
+%endif ; %3 == 1
520
+
521
+    ; calculate r offsets (un-subsampled, then duplicate)
522
+    pmaddwd        m10, m4, [rsp+0*mmsize]
523
+%if %2 == 1
524
+    pmaddwd        m12, m5, [rsp+0*mmsize]
525
+    punpckhdq      m11, m10, m10
526
+    punpckldq      m10, m10
527
+    punpckhdq      m13, m12, m12
528
+    punpckldq      m12, m12
529
+%else ; %2 != 1
530
+    pmaddwd        m11, m5, [rsp+0*mmsize]
531
+    pmaddwd        m12, m2, [rsp+0*mmsize]
532
+    pmaddwd        m13, m3, [rsp+0*mmsize]
533
+%endif ; %2 ==/!= 1
534
+%if %3 == 1
535
+    paddd           m2, m10, [rsp+4*mmsize]
536
+    paddd           m3, m11, [rsp+5*mmsize]
537
+    paddd           m8, m12, [rsp+6*mmsize]
538
+    paddd           m9, m13, [rsp+7*mmsize]
539
+%endif
540
+    paddd          m10, m0
541
+    paddd          m11, m6
542
+    paddd          m12, m1
543
+    paddd          m13, m7
544
+%if %3 == 1
545
+    psrad           m2, %%sh
546
+    psrad           m3, %%sh
547
+    psrad           m8, %%sh
548
+    psrad           m9, %%sh
549
+%endif ; %3 == 1
550
+    psrad          m10, %%sh
551
+    psrad          m11, %%sh
552
+    psrad          m12, %%sh
553
+    psrad          m13, %%sh
554
+%if %3 == 1
555
+    lea           tmpq, [rq+rgbsq*2]
556
+    packssdw        m2, m3
557
+    packssdw        m8, m9
558
+    mova [tmpq+xq*4], m2
559
+    mova [tmpq+xq*4+mmsize], m8
560
+%endif ; %3 == 1
561
+    packssdw       m10, m11
562
+    packssdw       m12, m13
563
+    mova   [rq+xq*(2 << %2)], m10
564
+    mova   [rq+xq*(2 << %2)+mmsize], m12
565
+
566
+    ; calculate g offsets (un-subsampled, then duplicate)
567
+    pmaddwd        m10, m4, [rsp+1*mmsize]
568
+%if %2 == 1
569
+    pmaddwd        m12, m5, [rsp+1*mmsize]
570
+    punpckhdq      m11, m10, m10
571
+    punpckldq      m10, m10
572
+    punpckhdq      m13, m12, m12
573
+    punpckldq      m12, m12
574
+%else ; %2 != 1
575
+    pmaddwd        m11, m5, [rsp+1*mmsize]
576
+    pmaddwd        m12, m2, [rsp+1*mmsize]
577
+    pmaddwd        m13, m3, [rsp+1*mmsize]
578
+%endif ; %2 ==/!= 1
579
+%if %3 == 1
580
+    paddd           m2, m10, [rsp+4*mmsize]
581
+    paddd           m3, m11, [rsp+5*mmsize]
582
+    paddd           m8, m12, [rsp+6*mmsize]
583
+    paddd           m9, m13, [rsp+7*mmsize]
584
+%endif ; %3 == 1
585
+    paddd          m10, m0
586
+    paddd          m11, m6
587
+    paddd          m12, m1
588
+    paddd          m13, m7
589
+%if %3 == 1
590
+    psrad           m2, %%sh
591
+    psrad           m3, %%sh
592
+    psrad           m8, %%sh
593
+    psrad           m9, %%sh
594
+%endif ; %3 == 1
595
+    psrad          m10, %%sh
596
+    psrad          m11, %%sh
597
+    psrad          m12, %%sh
598
+    psrad          m13, %%sh
599
+%if %3 == 1
600
+    lea           tmpq, [gq+rgbsq*2]
601
+    packssdw        m2, m3
602
+    packssdw        m8, m9
603
+    mova [tmpq+xq*4], m2
604
+    mova [tmpq+xq*4+mmsize], m8
605
+%endif ; %3 == 1
606
+    packssdw       m10, m11
607
+    packssdw       m12, m13
608
+    mova   [gq+xq*(2 << %2)], m10
609
+    mova   [gq+xq*(2 << %2)+mmsize], m12
610
+
611
+    ; calculate b offsets (un-subsampled, then duplicate)
612
+    pmaddwd         m4, [rsp+2*mmsize]
613
+    pmaddwd         m5, [rsp+2*mmsize]
614
+%if %2 == 1
615
+    punpckhdq       m2, m4, m4
616
+    punpckldq       m4, m4
617
+    punpckhdq       m3, m5, m5
618
+    punpckldq       m5, m5
619
+%else ; %2 != 1
620
+    pmaddwd         m2, [rsp+2*mmsize]
621
+    pmaddwd         m3, [rsp+2*mmsize]
622
+    SWAP             2, 5
623
+%endif ; %2 ==/!= 1
624
+    paddd           m0, m4
625
+    paddd           m6, m2
626
+    paddd           m1, m5
627
+    paddd           m7, m3
628
+%if %3 == 1
629
+    paddd           m4, [rsp+4*mmsize]
630
+    paddd           m2, [rsp+5*mmsize]
631
+    paddd           m5, [rsp+6*mmsize]
632
+    paddd           m3, [rsp+7*mmsize]
633
+%endif ; %3 == 1
634
+    psrad           m0, %%sh
635
+    psrad           m6, %%sh
636
+    psrad           m1, %%sh
637
+    psrad           m7, %%sh
638
+%if %3 == 1
639
+    psrad           m4, %%sh
640
+    psrad           m2, %%sh
641
+    psrad           m5, %%sh
642
+    psrad           m3, %%sh
643
+%endif ; %3 == 1
644
+    packssdw        m0, m6
645
+    packssdw        m1, m7
646
+    movu   [bq+xq*(2 << %2)], m0
647
+    movu   [bq+xq*(2 << %2)+mmsize], m1
648
+%if %3 == 1
649
+    lea           tmpq, [bq+rgbsq*2]
650
+    packssdw        m4, m2
651
+    packssdw        m5, m3
652
+    movu [tmpq+xq*4], m4
653
+    movu [tmpq+xq*4+mmsize], m5
654
+%endif ; %3 == 1
655
+
656
+    add             xd, mmsize >> %2
657
+    cmp             xd, wwd
658
+    jl .loop_h
659
+
660
+    lea             rq, [rq+rgbsq*(2 << %3)]
661
+    lea             gq, [gq+rgbsq*(2 << %3)]
662
+    lea             bq, [bq+rgbsq*(2 << %3)]
663
+%if %3 == 1
664
+    lea             yq, [yq+ysq*2]
665
+%else ; %3 != 1
666
+    add             yq, ysq
667
+%endif ; %3 ==/!= 1
668
+    add             uq, usq
669
+    add             vq, vsq
670
+    dec             hd
671
+    jg .loop_v
672
+
673
+    RET
674
+%endmacro
675
+
676
+%macro YUV2RGB_FNS 2
677
+YUV2RGB_FN  8, %1, %2
678
+YUV2RGB_FN 10, %1, %2
679
+YUV2RGB_FN 12, %1, %2
680
+%endmacro
681
+
682
+INIT_XMM sse2
683
+YUV2RGB_FNS 0, 0
684
+YUV2RGB_FNS 1, 0
685
+YUV2RGB_FNS 1, 1
686
+
687
+%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
688
+%assign %%sh 29 - %1
689
+%assign %%rnd (1 << (%%sh - 15))
690
+%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
691
+%if %1 != 8
692
+%assign %%maxval ((1 << %1) - 1)
693
+%endif ; %1 != 8
694
+%if %2 == 0
695
+%assign %%ss 444
696
+%elif %3 == 0
697
+%assign %%ss 422
698
+%else ; %3 == 1
699
+%assign %%ss 420
700
+%endif ; %2/%3
701
+
702
+cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
703
+                                 yuv, yuvs, rgb, rgbs, ww, h, c, off
704
+%if %2 == 1
705
+    inc            wwd
706
+    sar            wwd, 1
707
+%endif ; %2 == 1
708
+%if %3 == 1
709
+    inc             hd
710
+    sar             hd, 1
711
+%endif ; %3 == 1
712
+
713
+    ; prepare coeffs
714
+    movh            m8, [offq]
715
+    movh            m9, [pw_ %+ %%uvrnd]
716
+    psllw           m8, %%sh - 14
717
+    paddw           m9, [pw_ %+ %%rnd]
718
+    paddw           m8, [pw_ %+ %%rnd]
719
+    movh            m0, [cq+  0]
720
+    movh            m1, [cq+ 16]
721
+    movh            m2, [cq+ 32]
722
+    movh            m3, [cq+ 48]
723
+    movh            m4, [cq+ 64]
724
+    movh            m5, [cq+ 80]
725
+    movh            m6, [cq+112]
726
+    movh            m7, [cq+128]
727
+    punpcklwd       m0, m1
728
+    punpcklwd       m2, m8
729
+    punpcklwd       m3, m4
730
+    punpcklwd       m4, m5, m9
731
+    punpcklwd       m5, m6
732
+    punpcklwd       m7, m9
733
+
734
+    mova [rsp+0*mmsize], m0                 ; cry, cgy
735
+    mova [rsp+1*mmsize], m2                 ; cby, off + rnd
736
+    mova [rsp+2*mmsize], m3                 ; cru, cgu
737
+    mova [rsp+3*mmsize], m4                 ; cburv, uvoff + rnd
738
+    mova [rsp+4*mmsize], m5                 ; cburv, cgv
739
+    mova [rsp+5*mmsize], m7                 ; cbv, uvoff + rnd
740
+
741
+
742
+    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
743
+    mov             gq, [rq+gprsize*1]
744
+    mov             bq, [rq+gprsize*2]
745
+    mov             rq, [rq+gprsize*0]
746
+    mov             uq, [yq+gprsize*1]
747
+    mov             vq, [yq+gprsize*2]
748
+    mov             yq, [yq+gprsize*0]
749
+    mov            usq, [ysq+gprsize*1]
750
+    mov            vsq, [ysq+gprsize*2]
751
+    mov            ysq, [ysq+gprsize*0]
752
+
753
+    pxor           m15, m15
754
+.loop_v:
755
+    xor             xd, xd
756
+
757
+.loop_h:
758
+    ; top line y
759
+    mova            m0, [rq+xq*(2<<%2)]
760
+    mova            m3, [rq+xq*(2<<%2)+mmsize]
761
+    mova            m1, [gq+xq*(2<<%2)]
762
+    mova            m4, [gq+xq*(2<<%2)+mmsize]
763
+    mova            m2, [bq+xq*(2<<%2)]
764
+    mova            m5, [bq+xq*(2<<%2)+mmsize]
765
+
766
+    punpcklwd       m6, m0, m1
767
+    punpckhwd       m7, m0, m1
768
+    punpcklwd       m8, m3, m4
769
+    punpckhwd       m9, m3, m4
770
+    punpcklwd      m10, m2, [pw_16384]
771
+    punpckhwd      m11, m2, [pw_16384]
772
+    punpcklwd      m12, m5, [pw_16384]
773
+    punpckhwd      m13, m5, [pw_16384]
774
+
775
+    pmaddwd         m6, [rsp+0*mmsize]
776
+    pmaddwd         m7, [rsp+0*mmsize]
777
+    pmaddwd         m8, [rsp+0*mmsize]
778
+    pmaddwd         m9, [rsp+0*mmsize]
779
+    pmaddwd        m10, [rsp+1*mmsize]
780
+    pmaddwd        m11, [rsp+1*mmsize]
781
+    pmaddwd        m12, [rsp+1*mmsize]
782
+    pmaddwd        m13, [rsp+1*mmsize]
783
+    paddd           m6, m10
784
+    paddd           m7, m11
785
+    paddd           m8, m12
786
+    paddd           m9, m13
787
+    psrad           m6, %%sh
788
+    psrad           m7, %%sh
789
+    psrad           m8, %%sh
790
+    psrad           m9, %%sh
791
+    packssdw        m6, m7
792
+    packssdw        m8, m9
793
+%if %1 == 8
794
+    packuswb        m6, m8
795
+    movu [yq+xq*(1<<%2)], m6
796
+%else
797
+    CLIPW           m6, m15, [pw_ %+ %%maxval]
798
+    CLIPW           m8, m15, [pw_ %+ %%maxval]
799
+    movu [yq+xq*(2<<%2)], m6
800
+    movu [yq+xq*(2<<%2)+mmsize], m8
801
+%endif
802
+
803
+%if %2 == 1
804
+    ; subsampling cached data
805
+    pmaddwd         m0, [pw_1]
806
+    pmaddwd         m1, [pw_1]
807
+    pmaddwd         m2, [pw_1]
808
+    pmaddwd         m3, [pw_1]
809
+    pmaddwd         m4, [pw_1]
810
+    pmaddwd         m5, [pw_1]
811
+
812
+%if %3 == 1
813
+    ; bottom line y, r/g portion only
814
+    lea           tmpq, [rgbsq+xq*2]
815
+    mova            m6, [rq+tmpq*2]
816
+    mova            m9, [rq+tmpq*2+mmsize]
817
+    mova            m7, [gq+tmpq*2]
818
+    mova           m10, [gq+tmpq*2+mmsize]
819
+    mova            m8, [bq+tmpq*2]
820
+    mova           m11, [bq+tmpq*2+mmsize]
821
+
822
+    punpcklwd      m12, m6, m7
823
+    punpckhwd      m13, m6, m7
824
+    punpcklwd      m14, m9, m10
825
+    punpckhwd      m15, m9, m10
826
+
827
+    ; release two more registers
828
+    pmaddwd         m6, [pw_1]
829
+    pmaddwd         m7, [pw_1]
830
+    pmaddwd         m9, [pw_1]
831
+    pmaddwd        m10, [pw_1]
832
+    paddd           m0, m6
833
+    paddd           m3, m9
834
+    paddd           m1, m7
835
+    paddd           m4, m10
836
+
837
+    ; bottom line y, b/rnd portion only
838
+    punpcklwd       m6, m8,  [pw_16384]
839
+    punpckhwd       m7, m8,  [pw_16384]
840
+    punpcklwd       m9, m11, [pw_16384]
841
+    punpckhwd      m10, m11, [pw_16384]
842
+
843
+    pmaddwd        m12, [rsp+0*mmsize]
844
+    pmaddwd        m13, [rsp+0*mmsize]
845
+    pmaddwd        m14, [rsp+0*mmsize]
846
+    pmaddwd        m15, [rsp+0*mmsize]
847
+    pmaddwd         m6, [rsp+1*mmsize]
848
+    pmaddwd         m7, [rsp+1*mmsize]
849
+    pmaddwd         m9, [rsp+1*mmsize]
850
+    pmaddwd        m10, [rsp+1*mmsize]
851
+    paddd          m12, m6
852
+    paddd          m13, m7
853
+    paddd          m14, m9
854
+    paddd          m15, m10
855
+    psrad          m12, %%sh
856
+    psrad          m13, %%sh
857
+    psrad          m14, %%sh
858
+    psrad          m15, %%sh
859
+    packssdw       m12, m13
860
+    packssdw       m14, m15
861
+    lea           tmpq, [yq+ysq]
862
+%if %1 == 8
863
+    packuswb       m12, m14
864
+    movu   [tmpq+xq*2], m12
865
+%else
866
+    pxor           m15, m15
867
+    CLIPW          m12, m15, [pw_ %+ %%maxval]
868
+    CLIPW          m14, m15, [pw_ %+ %%maxval]
869
+    movu   [tmpq+xq*4], m12
870
+    movu [tmpq+xq*4+mmsize], m14
871
+%endif
872
+
873
+    ; complete subsampling of r/g/b pixels for u/v
874
+    pmaddwd         m8, [pw_1]
875
+    pmaddwd        m11, [pw_1]
876
+    paddd           m2, m8
877
+    paddd           m5, m11
878
+    paddd           m0, [pd_2]
879
+    paddd           m1, [pd_2]
880
+    paddd           m2, [pd_2]
881
+    paddd           m3, [pd_2]
882
+    paddd           m4, [pd_2]
883
+    paddd           m5, [pd_2]
884
+    psrad           m0, 2
885
+    psrad           m1, 2
886
+    psrad           m2, 2
887
+    psrad           m3, 2
888
+    psrad           m4, 2
889
+    psrad           m5, 2
890
+%else ; %3 != 1
891
+    paddd           m0, [pd_1]
892
+    paddd           m1, [pd_1]
893
+    paddd           m2, [pd_1]
894
+    paddd           m3, [pd_1]
895
+    paddd           m4, [pd_1]
896
+    paddd           m5, [pd_1]
897
+    psrad           m0, 1
898
+    psrad           m1, 1
899
+    psrad           m2, 1
900
+    psrad           m3, 1
901
+    psrad           m4, 1
902
+    psrad           m5, 1
903
+%endif ; %3 ==/!= 1
904
+    packssdw        m0, m3
905
+    packssdw        m1, m4
906
+    packssdw        m2, m5
907
+%endif ; %2 == 1
908
+
909
+    ; convert u/v pixels
910
+    SBUTTERFLY   wd, 0, 1, 6
911
+    punpckhwd       m6, m2, [pw_16384]
912
+    punpcklwd       m2, [pw_16384]
913
+
914
+    pmaddwd         m7, m0, [rsp+2*mmsize]
915
+    pmaddwd         m8, m1, [rsp+2*mmsize]
916
+    pmaddwd         m9, m2, [rsp+3*mmsize]
917
+    pmaddwd        m10, m6, [rsp+3*mmsize]
918
+    pmaddwd         m0, [rsp+4*mmsize]
919
+    pmaddwd         m1, [rsp+4*mmsize]
920
+    pmaddwd         m2, [rsp+5*mmsize]
921
+    pmaddwd         m6, [rsp+5*mmsize]
922
+    paddd           m7, m9
923
+    paddd           m8, m10
924
+    paddd           m0, m2
925
+    paddd           m1, m6
926
+    psrad           m7, %%sh
927
+    psrad           m8, %%sh
928
+    psrad           m0, %%sh
929
+    psrad           m1, %%sh
930
+    packssdw        m7, m8
931
+    packssdw        m0, m1
932
+%if %2 == 1
933
+%if %1 == 8
934
+    packuswb        m7, m0
935
+    movh       [uq+xq], m7
936
+    movhps     [vq+xq], m7
937
+%else
938
+    CLIPW           m7, m15, [pw_ %+ %%maxval]
939
+    CLIPW           m0, m15, [pw_ %+ %%maxval]
940
+    movu     [uq+xq*2], m7
941
+    movu     [vq+xq*2], m0
942
+%endif
943
+%else ; %2 != 1
944
+    ; second set of u/v pixels
945
+    SBUTTERFLY   wd, 3, 4, 6
946
+    punpckhwd       m6, m5, [pw_16384]
947
+    punpcklwd       m5, [pw_16384]
948
+
949
+    pmaddwd         m8, m3, [rsp+2*mmsize]
950
+    pmaddwd         m9, m4, [rsp+2*mmsize]
951
+    pmaddwd        m10, m5, [rsp+3*mmsize]
952
+    pmaddwd        m11, m6, [rsp+3*mmsize]
953
+    pmaddwd         m3, [rsp+4*mmsize]
954
+    pmaddwd         m4, [rsp+4*mmsize]
955
+    pmaddwd         m5, [rsp+5*mmsize]
956
+    pmaddwd         m6, [rsp+5*mmsize]
957
+    paddd           m8, m10
958
+    paddd           m9, m11
959
+    paddd           m3, m5
960
+    paddd           m4, m6
961
+    psrad           m8, %%sh
962
+    psrad           m9, %%sh
963
+    psrad           m3, %%sh
964
+    psrad           m4, %%sh
965
+    packssdw        m8, m9
966
+    packssdw        m3, m4
967
+
968
+%if %1 == 8
969
+    packuswb        m7, m8
970
+    packuswb        m0, m3
971
+    movu       [uq+xq], m7
972
+    movu       [vq+xq], m0
973
+%else
974
+    CLIPW           m7, m15, [pw_ %+ %%maxval]
975
+    CLIPW           m0, m15, [pw_ %+ %%maxval]
976
+    CLIPW           m8, m15, [pw_ %+ %%maxval]
977
+    CLIPW           m3, m15, [pw_ %+ %%maxval]
978
+    movu     [uq+xq*2], m7
979
+    movu [uq+xq*2+mmsize], m8
980
+    movu     [vq+xq*2], m0
981
+    movu [vq+xq*2+mmsize], m3
982
+%endif
983
+%endif ; %2 ==/!= 1
984
+
985
+    add             xq, mmsize >> %2
986
+    cmp             xd, wwd
987
+    jl .loop_h
988
+
989
+%if %3 == 0
990
+    add             yq, ysq
991
+%else ; %3 != 0
992
+    lea             yq, [yq+ysq*2]
993
+%endif ; %3 ==/!= 0
994
+    add             uq, usq
995
+    add             vq, vsq
996
+    lea             rq, [rq+rgbsq*(2<<%3)]
997
+    lea             gq, [gq+rgbsq*(2<<%3)]
998
+    lea             bq, [bq+rgbsq*(2<<%3)]
999
+    dec             hd
1000
+    jg .loop_v
1001
+
1002
+    RET
1003
+%endmacro
1004
+
1005
+%macro RGB2YUV_FNS 2
1006
+RGB2YUV_FN  8, %1, %2
1007
+RGB2YUV_FN 10, %1, %2
1008
+RGB2YUV_FN 12, %1, %2
1009
+%endmacro
1010
+
1011
+INIT_XMM sse2
1012
+RGB2YUV_FNS 0, 0
1013
+RGB2YUV_FNS 1, 0
1014
+RGB2YUV_FNS 1, 1
1015
+
1016
+; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
1017
+;                          int w, int h, const int16_t coeff[3][3][8])
1018
+INIT_XMM sse2
1019
+cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
1020
+    movh            m0, [cq+  0]
1021
+    movh            m1, [cq+ 32]
1022
+    movh            m2, [cq+ 48]
1023
+    movh            m3, [cq+ 80]
1024
+    movh            m4, [cq+ 96]
1025
+    movh            m5, [cq+128]
1026
+    punpcklwd       m0, [cq+ 16]
1027
+    punpcklwd       m1, [pw_8192]
1028
+    punpcklwd       m2, [cq+ 64]
1029
+    punpcklwd       m3, [pw_8192]
1030
+    punpcklwd       m4, [cq+112]
1031
+    punpcklwd       m5, [pw_8192]
1032
+
1033
+    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
1034
+    shl        strideq, 1
1035
+    mov         data1q, [data0q+gprsize*1]
1036
+    mov         data2q, [data0q+gprsize*2]
1037
+    mov         data0q, [data0q+gprsize*0]
1038
+
1039
+.loop_v:
1040
+    xor             xd, xd
1041
+
1042
+.loop_h:
1043
+    mova            m6, [data0q+xq*2]
1044
+    mova            m7, [data1q+xq*2]
1045
+    mova            m8, [data2q+xq*2]
1046
+    SBUTTERFLY   wd, 6, 7, 9
1047
+    punpckhwd       m9, m8, [pw_1]
1048
+    punpcklwd       m8, [pw_1]
1049
+
1050
+    pmaddwd        m10, m6, m0
1051
+    pmaddwd        m11, m7, m0
1052
+    pmaddwd        m12, m8, m1
1053
+    pmaddwd        m13, m9, m1
1054
+    paddd          m10, m12
1055
+    paddd          m11, m13
1056
+    psrad          m10, 14
1057
+    psrad          m11, 14
1058
+
1059
+    pmaddwd        m12, m6, m2
1060
+    pmaddwd        m13, m7, m2
1061
+    pmaddwd        m14, m8, m3
1062
+    pmaddwd        m15, m9, m3
1063
+    paddd          m12, m14
1064
+    paddd          m13, m15
1065
+    psrad          m12, 14
1066
+    psrad          m13, 14
1067
+
1068
+    pmaddwd         m6, m4
1069
+    pmaddwd         m7, m4
1070
+    pmaddwd         m8, m5
1071
+    pmaddwd         m9, m5
1072
+    paddd           m6, m8
1073
+    paddd           m7, m9
1074
+    psrad           m6, 14
1075
+    psrad           m7, 14
1076
+
1077
+    packssdw       m10, m11
1078
+    packssdw       m12, m13
1079
+    packssdw        m6, m7
1080
+
1081
+    mova [data0q+xq*2], m10
1082
+    mova [data1q+xq*2], m12
1083
+    mova [data2q+xq*2], m6
1084
+
1085
+    add             xd, mmsize / 2
1086
+    cmp             xd, wwd
1087
+    jl .loop_h
1088
+
1089
+    add         data0q, strideq
1090
+    add         data1q, strideq
1091
+    add         data2q, strideq
1092
+    dec             hd
1093
+    jg .loop_v
1094
+
1095
+    RET
1096
+%endif
0 1097
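That is the whole of libavfilter/x86/colorspacedsp.asm (the YASM object added to the x86 Makefile above). The kernels use 14-bit fixed-point coefficients throughout; in the yuv2yuv case the shift %%sh = 14 + in_depth - out_depth folds the bit-depth change into the final rounding, 8-bit planes are written with saturating packs (packuswb) and 10/12-bit planes are clipped with CLIPW. A rough scalar model of the luma path is sketched below, illustration only and assuming the same definitions as the macro; the authoritative C reference is the filter's generic DSP code.

/* Scalar model of one luma sample in the yuv2yuv kernels (the chroma paths
 * are analogous, with the chroma output offset pre-shifted into the rounding
 * constant, m12 in the asm). */
#include "libavutil/common.h"   /* av_clip() */

static int yuv2yuv_luma_px(int y, int u, int v,
                           int in_depth, int out_depth,
                           int cyy, int cyu, int cyv,      /* Q14 coefficients */
                           int y_off_in, int y_off_out)
{
    const int sh        = 14 + in_depth - out_depth;       /* %%sh  */
    const int rnd       = 1 << (sh - 1);                   /* %%rnd */
    const int uv_in_off = 128 << (in_depth - 8);           /* %%uvinoff */
    int acc;

    acc  = cyy * (y - y_off_in);
    acc += cyu * (u - uv_in_off) + cyv * (v - uv_in_off);  /* "uv_val" in the asm */
    acc  = (acc + rnd + (y_off_out << sh)) >> sh;
    return av_clip(acc, 0, (1 << out_depth) - 1);          /* packuswb / CLIPW */
}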
new file mode 100644
... ...
@@ -0,0 +1,119 @@
0
+/*
1
+ * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "libavutil/x86/cpu.h"
21
+
22
+#include "libavfilter/colorspacedsp.h"
23
+
24
+#define decl_yuv2yuv_fn(t) \
25
+void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], \
26
+                           uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], \
27
+                           int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], \
28
+                           const int16_t yuv_offset[2][8])
29
+
30
+#define decl_yuv2yuv_fns(ss) \
31
+decl_yuv2yuv_fn(ss##p8to8); \
32
+decl_yuv2yuv_fn(ss##p10to8); \
33
+decl_yuv2yuv_fn(ss##p12to8); \
34
+decl_yuv2yuv_fn(ss##p8to10); \
35
+decl_yuv2yuv_fn(ss##p10to10); \
36
+decl_yuv2yuv_fn(ss##p12to10); \
37
+decl_yuv2yuv_fn(ss##p8to12); \
38
+decl_yuv2yuv_fn(ss##p10to12); \
39
+decl_yuv2yuv_fn(ss##p12to12)
40
+
41
+decl_yuv2yuv_fns(420);
42
+decl_yuv2yuv_fns(422);
43
+decl_yuv2yuv_fns(444);
44
+
45
+#define decl_yuv2rgb_fn(t) \
46
+void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \
47
+                           uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \
48
+                           int w, int h, const int16_t coeff[3][3][8], \
49
+                           const int16_t yuv_offset[8])
50
+
51
+#define decl_yuv2rgb_fns(ss) \
52
+decl_yuv2rgb_fn(ss##p8); \
53
+decl_yuv2rgb_fn(ss##p10); \
54
+decl_yuv2rgb_fn(ss##p12)
55
+
56
+decl_yuv2rgb_fns(420);
57
+decl_yuv2rgb_fns(422);
58
+decl_yuv2rgb_fns(444);
59
+
60
+#define decl_rgb2yuv_fn(t) \
61
+void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \
62
+                           int16_t *rgb_in[3], ptrdiff_t rgb_stride, \
63
+                           int w, int h, const int16_t coeff[3][3][8], \
64
+                           const int16_t yuv_offset[8])
65
+
66
+#define decl_rgb2yuv_fns(ss) \
67
+decl_rgb2yuv_fn(ss##p8); \
68
+decl_rgb2yuv_fn(ss##p10); \
69
+decl_rgb2yuv_fn(ss##p12)
70
+
71
+decl_rgb2yuv_fns(420);
72
+decl_rgb2yuv_fns(422);
73
+decl_rgb2yuv_fns(444);
74
+
75
+void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h,
76
+                         const int16_t coeff[3][3][8]);
77
+
78
+void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp)
79
+{
80
+    int cpu_flags = av_get_cpu_flags();
81
+
82
+    if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
83
+#define assign_yuv2yuv_fns(idx, ss) \
84
+        dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \
85
+        dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \
86
+        dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \
87
+        dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \
88
+        dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \
89
+        dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \
90
+        dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \
91
+        dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \
92
+        dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2
93
+
94
+        assign_yuv2yuv_fns(2, 420);
95
+        assign_yuv2yuv_fns(1, 422);
96
+        assign_yuv2yuv_fns(0, 444);
97
+
98
+#define assign_yuv2rgb_fns(idx, ss) \
99
+        dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \
100
+        dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \
101
+        dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2
102
+
103
+        assign_yuv2rgb_fns(2, 420);
104
+        assign_yuv2rgb_fns(1, 422);
105
+        assign_yuv2rgb_fns(0, 444);
106
+
107
+#define assign_rgb2yuv_fns(idx, ss) \
108
+        dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \
109
+        dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \
110
+        dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2
111
+
112
+        assign_rgb2yuv_fns(2, 420);
113
+        assign_rgb2yuv_fns(1, 422);
114
+        assign_rgb2yuv_fns(0, 444);
115
+
116
+        dsp->multiply3x3 = ff_multiply3x3_sse2;
117
+    }
118
+}
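libavfilter/x86/colorspacedsp_init.c above only installs the SSE2 pointers on x86-64 with working external asm (ARCH_X86_64 && EXTERNAL_SSE2): the kernels use up to 14 general-purpose and all 16 XMM registers, and the whole .asm file is correspondingly guarded by %if ARCH_X86_64. The table layout matches the generic init; a hedged sketch of the index mapping follows (the helper names are hypothetical, not part of the patch).

/* Hypothetical index helpers (illustration only): how a caller such as
 * vf_colorspace can map format properties onto the [depth][depth][ss]
 * and [depth][ss] tables filled in above. */
static inline int colorspace_depth_idx(int bit_depth)
{
    return (bit_depth - 8) >> 1;            /* 8 -> 0, 10 -> 1, 12 -> 2 */
}

static inline int colorspace_ss_idx(int log2_chroma_w, int log2_chroma_h)
{
    return log2_chroma_w + log2_chroma_h;   /* 4:4:4 -> 0, 4:2:2 -> 1, 4:2:0 -> 2 */
}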
... ...
@@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes)
16 16
 
17 17
 # libavfilter tests
18 18
 AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
19
+AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
19 20
 
20 21
 CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes)
21 22
 
... ...
@@ -106,6 +106,9 @@ static const struct {
106 106
     #if CONFIG_BLEND_FILTER
107 107
         { "vf_blend", checkasm_check_blend },
108 108
     #endif
109
+    #if CONFIG_COLORSPACE_FILTER
110
+        { "vf_colorspace", checkasm_check_colorspace },
111
+    #endif
109 112
 #endif
110 113
     { NULL }
111 114
 };
... ...
@@ -33,6 +33,7 @@
33 33
 void checkasm_check_alacdsp(void);
34 34
 void checkasm_check_blend(void);
35 35
 void checkasm_check_bswapdsp(void);
36
+void checkasm_check_colorspace(void);
36 37
 void checkasm_check_flacdsp(void);
37 38
 void checkasm_check_fmtconvert(void);
38 39
 void checkasm_check_h264pred(void);
39 40
new file mode 100644
... ...
@@ -0,0 +1,314 @@
0
+/*
1
+ * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com>
2
+ *
3
+ * This file is part of FFmpeg.
4
+ *
5
+ * FFmpeg is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * FFmpeg is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with FFmpeg; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include <string.h>
21
+#include "checkasm.h"
22
+#include "libavfilter/colorspacedsp.h"
23
+#include "libavutil/common.h"
24
+#include "libavutil/internal.h"
25
+#include "libavutil/intreadwrite.h"
26
+
27
+#define W 64
28
+#define H 64
29
+
30
+#define randomize_buffers()                     \
31
+    do {                                        \
32
+        unsigned mask = bpp_mask[idepth];       \
33
+        int n, m;                               \
34
+        int bpp = 1 + (!!idepth);               \
35
+        int buf_size = W * H * bpp;             \
36
+        for (m = 0; m < 3; m++) {               \
37
+            int ss = m ? ss_w + ss_h : 0;       \
38
+            int plane_sz = buf_size >> ss;      \
39
+            for (n = 0; n < plane_sz; n += 4) { \
40
+                unsigned r = rnd() & mask;      \
41
+                AV_WN32A(&src[m][n], r);        \
42
+            }                                   \
43
+        }                                       \
44
+    } while (0)
45
+
46
+static const char *format_string[] = {
47
+    "444", "422", "420"
48
+};
49
+
50
+static unsigned bpp_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
51
+
52
+static void check_yuv2yuv(void)
53
+{
54
+    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
55
+                 uint8_t *src[3], ptrdiff_t src_stride[3],
56
+                 int w, int h, const int16_t coeff[3][3][8],
57
+                 const int16_t off[2][8]);
58
+    ColorSpaceDSPContext dsp;
59
+    int idepth, odepth, fmt, n;
60
+    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
61
+    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
62
+    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
63
+    uint8_t *src[3] = { src_y, src_u, src_v };
64
+    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H * 2]);
65
+    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H * 2]);
66
+    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H * 2]);
67
+    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H * 2]);
68
+    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H * 2]);
69
+    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H * 2]);
70
+    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
71
+    LOCAL_ALIGNED_32(int16_t, offset_buf, [16]);
72
+    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
73
+    int16_t (*offset)[8] = (int16_t(*)[8]) offset_buf;
74
+    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
75
+
76
+    ff_colorspacedsp_init(&dsp);
77
+    for (n = 0; n < 8; n++) {
78
+        offset[0][n] = offset[1][n] = 16;
79
+
80
+        coeff[0][0][n] = (1 << 14) + (1 << 7) + 1;
81
+        coeff[0][1][n] = (1 << 7) - 1;
82
+        coeff[0][2][n] = -(1 << 8);
83
+        coeff[1][0][n] = coeff[2][0][n] = 0;
84
+        coeff[1][1][n] = (1 << 14) + (1 << 7);
85
+        coeff[1][2][n] = -(1 << 7);
86
+        coeff[2][2][n] = (1 << 14) - (1 << 6);
87
+        coeff[2][1][n] = 1 << 6;
88
+    }
89
+    for (idepth = 0; idepth < 3; idepth++) {
90
+        for (odepth = 0; odepth < 3; odepth++) {
91
+            for (fmt = 0; fmt < 3; fmt++) {
92
+                if (check_func(dsp.yuv2yuv[idepth][odepth][fmt],
93
+                               "ff_colorspacedsp_yuv2yuv_%sp%dto%d",
94
+                               format_string[fmt],
95
+                               idepth * 2 + 8, odepth * 2 + 8)) {
96
+                    int ss_w = !!fmt, ss_h = fmt == 2;
97
+                    int y_src_stride = W << !!idepth, y_dst_stride = W << !!odepth;
98
+                    int uv_src_stride = y_src_stride >> ss_w, uv_dst_stride = y_dst_stride >> ss_w;
99
+
100
+                    randomize_buffers();
101
+                    call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
102
+                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
103
+                             W, H, coeff, offset);
104
+                    call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
105
+                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
106
+                             W, H, coeff, offset);
107
+                    if (memcmp(dst0[0], dst1[0], y_dst_stride * H) ||
108
+                        memcmp(dst0[1], dst1[1], uv_dst_stride * H >> ss_h) ||
109
+                        memcmp(dst0[2], dst1[2], uv_dst_stride * H >> ss_h)) {
110
+                        fail();
111
+                    }
112
+                }
113
+            }
114
+        }
115
+    }
116
+
117
+    report("yuv2yuv");
118
+}
119
+
120
+static void check_yuv2rgb(void)
121
+{
122
+    declare_func(void, int16_t *dst[3], ptrdiff_t dst_stride,
123
+                 uint8_t *src[3], ptrdiff_t src_stride[3],
124
+                 int w, int h, const int16_t coeff[3][3][8],
125
+                 const int16_t off[8]);
126
+    ColorSpaceDSPContext dsp;
127
+    int idepth, fmt, n;
128
+    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
129
+    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
130
+    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
131
+    uint8_t *src[3] = { src_y, src_u, src_v };
132
+    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
133
+    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
134
+    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
135
+    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
136
+    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
137
+    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
138
+    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
139
+    LOCAL_ALIGNED_32(int16_t, offset, [8]);
140
+    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
141
+    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
142
+
143
+    ff_colorspacedsp_init(&dsp);
144
+    for (n = 0; n < 8; n++) {
145
+        offset[n] = 16;
146
+
147
+        coeff[0][0][n] = coeff[1][0][n] = coeff[2][0][n] = (1 << 14) | 1;
148
+        coeff[0][1][n] = coeff[2][2][n] = 0;
149
+        coeff[0][2][n] = 1 << 13;
150
+        coeff[1][1][n] = -(1 << 12);
151
+        coeff[1][2][n] = 1 << 12;
152
+        coeff[2][1][n] = 1 << 11;
153
+    }
154
+    for (idepth = 0; idepth < 3; idepth++) {
155
+        for (fmt = 0; fmt < 3; fmt++) {
156
+            if (check_func(dsp.yuv2rgb[idepth][fmt],
157
+                           "ff_colorspacedsp_yuv2rgb_%sp%d",
158
+                           format_string[fmt], idepth * 2 + 8)) {
159
+                int ss_w = !!fmt, ss_h = fmt == 2;
160
+                int y_src_stride = W << !!idepth;
161
+                int uv_src_stride = y_src_stride >> ss_w;
162
+
163
+                randomize_buffers();
164
+                call_ref(dst0, W, src,
165
+                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
166
+                         W, H, coeff, offset);
167
+                call_new(dst1, W, src,
168
+                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
169
+                         W, H, coeff, offset);
170
+                if (memcmp(dst0[0], dst1[0], W * H * sizeof(int16_t)) ||
171
+                    memcmp(dst0[1], dst1[1], W * H * sizeof(int16_t)) ||
172
+                    memcmp(dst0[2], dst1[2], W * H * sizeof(int16_t))) {
173
+                    fail();
174
+                }
175
+            }
176
+        }
177
+    }
178
+
179
+    report("yuv2rgb");
180
+}
181
+
182
+#undef randomize_buffers
183
+#define randomize_buffers()                     \
184
+    do {                                        \
185
+        int y, x, p;                            \
186
+        for (p = 0; p < 3; p++) {               \
187
+            for (y = 0; y < H; y++) {           \
188
+                for (x = 0; x < W; x++) {       \
189
+                    int r = rnd() & 0x7fff;     \
190
+                    r -= (32768 - 28672) >> 1;  \
191
+                    src[p][y * W + x] = r;      \
192
+                }                               \
193
+            }                                   \
194
+        }                                       \
195
+    } while (0)
196
+
197
+static void check_rgb2yuv(void)
198
+{
199
+    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
200
+                 int16_t *src[3], ptrdiff_t src_stride,
201
+                 int w, int h, const int16_t coeff[3][3][8],
202
+                 const int16_t off[8]);
203
+    ColorSpaceDSPContext dsp;
204
+    int odepth, fmt, n;
205
+    LOCAL_ALIGNED_32(int16_t, src_y, [W * H * 2]);
206
+    LOCAL_ALIGNED_32(int16_t, src_u, [W * H * 2]);
207
+    LOCAL_ALIGNED_32(int16_t, src_v, [W * H * 2]);
208
+    int16_t *src[3] = { src_y, src_u, src_v };
209
+    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H]);
210
+    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H]);
211
+    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H]);
212
+    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H]);
213
+    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H]);
214
+    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H]);
215
+    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
216
+    LOCAL_ALIGNED_32(int16_t, offset, [8]);
217
+    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
218
+    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
219
+
220
+    ff_colorspacedsp_init(&dsp);
221
+    for (n = 0; n < 8; n++) {
222
+        offset[n] = 16;
223
+
224
+        // these somewhat resemble bt601/smpte170m coefficients
225
+        coeff[0][0][n] = lrint(0.3 * (1 << 14));
226
+        coeff[0][1][n] = lrint(0.6 * (1 << 14));
227
+        coeff[0][2][n] = lrint(0.1 * (1 << 14));
228
+        coeff[1][0][n] = lrint(-0.15 * (1 << 14));
229
+        coeff[1][1][n] = lrint(-0.35 * (1 << 14));
230
+        coeff[1][2][n] = lrint(0.5 * (1 << 14));
231
+        coeff[2][0][n] = lrint(0.5 * (1 << 14));
232
+        coeff[2][1][n] = lrint(-0.42 * (1 << 14));
233
+        coeff[2][2][n] = lrint(-0.08 * (1 << 14));
234
+    }
235
+    for (odepth = 0; odepth < 3; odepth++) {
236
+        for (fmt = 0; fmt < 3; fmt++) {
237
+            if (check_func(dsp.rgb2yuv[odepth][fmt],
238
+                           "ff_colorspacedsp_rgb2yuv_%sp%d",
239
+                           format_string[fmt], odepth * 2 + 8)) {
240
+                int ss_w = !!fmt, ss_h = fmt == 2;
241
+                int y_dst_stride = W << !!odepth;
242
+                int uv_dst_stride = y_dst_stride >> ss_w;
243
+
244
+                randomize_buffers();
245
+                call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
246
+                         src, W, W, H, coeff, offset);
247
+                call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
248
+                         src, W, W, H, coeff, offset);
249
+                if (memcmp(dst0[0], dst1[0], H * y_dst_stride) ||
250
+                    memcmp(dst0[1], dst1[1], H * uv_dst_stride >> ss_h) ||
251
+                    memcmp(dst0[2], dst1[2], H * uv_dst_stride >> ss_h)) {
252
+                    fail();
253
+                }
254
+            }
255
+        }
256
+    }
257
+
258
+    report("rgb2yuv");
259
+}
260
+
261
+static void check_multiply3x3(void)
262
+{
263
+    declare_func(void, int16_t *data[3], ptrdiff_t stride,
264
+                 int w, int h, const int16_t coeff[3][3][8]);
265
+    ColorSpaceDSPContext dsp;
266
+    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
267
+    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
268
+    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
269
+    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
270
+    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
271
+    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
272
+    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
273
+    int16_t **src = dst0;
274
+    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
275
+    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
276
+    int n;
277
+
278
+    ff_colorspacedsp_init(&dsp);
279
+    for (n = 0; n < 8; n++) {
280
+        coeff[0][0][n] = lrint(0.85 * (1 << 14));
281
+        coeff[0][1][n] = lrint(0.10 * (1 << 14));
282
+        coeff[0][2][n] = lrint(0.05 * (1 << 14));
283
+        coeff[1][0][n] = lrint(-0.1 * (1 << 14));
284
+        coeff[1][1][n] = lrint(0.95 * (1 << 14));
285
+        coeff[1][2][n] = lrint(0.15 * (1 << 14));
286
+        coeff[2][0][n] = lrint(-0.2 * (1 << 14));
287
+        coeff[2][1][n] = lrint(0.30 * (1 << 14));
288
+        coeff[2][2][n] = lrint(0.90 * (1 << 14));
289
+    }
290
+    if (check_func(dsp.multiply3x3, "ff_colorspacedsp_multiply3x3")) {
291
+        randomize_buffers();
292
+        memcpy(dst1_y, dst0_y, W * H * sizeof(*dst1_y));
293
+        memcpy(dst1_u, dst0_u, W * H * sizeof(*dst1_u));
294
+        memcpy(dst1_v, dst0_v, W * H * sizeof(*dst1_v));
295
+        call_ref(dst0, W, W, H, coeff);
296
+        call_new(dst1, W, W, H, coeff);
297
+        if (memcmp(dst0[0], dst1[0], H * W * sizeof(*dst0_y)) ||
298
+            memcmp(dst0[1], dst1[1], H * W * sizeof(*dst0_u)) ||
299
+            memcmp(dst0[2], dst1[2], H * W * sizeof(*dst0_v))) {
300
+            fail();
301
+        }
302
+    }
303
+
304
+    report("multiply3x3");
305
+}
306
+
307
+void checkasm_check_colorspace(void)
308
+{
309
+    check_yuv2yuv();
310
+    check_yuv2rgb();
311
+    check_rgb2yuv();
312
+    check_multiply3x3();
313
+}
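With the checkasm.c table entry and the Makefile hook above in place, the new kernels should be verifiable against the C reference with FFmpeg's checkasm harness, for example by building the tool with "make checkasm" and running tests/checkasm/checkasm --test=vf_colorspace, optionally with --bench for timings; the exact invocation depends on the build setup.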