... | ... |
@@ -1,5 +1,6 @@ |
1 | 1 |
OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend_init.o |
2 | 2 |
OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif_init.o |
3 |
+OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp_init.o |
|
3 | 4 |
OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o |
4 | 5 |
OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o |
5 | 6 |
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o |
... | ... |
@@ -23,6 +24,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o |
23 | 23 |
|
24 | 24 |
YASM-OBJS-$(CONFIG_BLEND_FILTER) += x86/vf_blend.o |
25 | 25 |
YASM-OBJS-$(CONFIG_BWDIF_FILTER) += x86/vf_bwdif.o |
26 |
+YASM-OBJS-$(CONFIG_COLORSPACE_FILTER) += x86/colorspacedsp.o |
|
26 | 27 |
YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o |
27 | 28 |
YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o |
28 | 29 |
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o |
29 | 30 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,1097 @@ |
0 |
+;***************************************************************************** |
|
1 |
+;* x86-optimized functions for colorspace filter |
|
2 |
+;* |
|
3 |
+;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com> |
|
4 |
+;* |
|
5 |
+;* This file is part of FFmpeg. |
|
6 |
+;* |
|
7 |
+;* FFmpeg is free software; you can redistribute it and/or |
|
8 |
+;* modify it under the terms of the GNU Lesser General Public |
|
9 |
+;* License as published by the Free Software Foundation; either |
|
10 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
11 |
+;* |
|
12 |
+;* FFmpeg is distributed in the hope that it will be useful, |
|
13 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
+;* Lesser General Public License for more details. |
|
16 |
+;* |
|
17 |
+;* You should have received a copy of the GNU Lesser General Public |
|
18 |
+;* License along with FFmpeg; if not, write to the Free Software |
|
19 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
20 |
+;****************************************************************************** |
|
21 |
+ |
|
22 |
+%include "libavutil/x86/x86util.asm" |
|
23 |
+ |
|
24 |
+SECTION_RODATA |
|
25 |
+ |
|
26 |
+pw_1: times 8 dw 1 |
|
27 |
+pw_2: times 8 dw 2 |
|
28 |
+pw_4: times 8 dw 4 |
|
29 |
+pw_8: times 8 dw 8 |
|
30 |
+pw_16: times 8 dw 16 |
|
31 |
+pw_64: times 8 dw 64 |
|
32 |
+pw_128: times 8 dw 128 |
|
33 |
+pw_256: times 8 dw 256 |
|
34 |
+pw_512: times 8 dw 512 |
|
35 |
+pw_1023: times 8 dw 1023 |
|
36 |
+pw_1024: times 8 dw 1024 |
|
37 |
+pw_2048: times 8 dw 2048 |
|
38 |
+pw_4095: times 8 dw 4095 |
|
39 |
+pw_8192: times 8 dw 8192 |
|
40 |
+pw_16384: times 8 dw 16384 |
|
41 |
+ |
|
42 |
+pd_1: times 4 dd 1 |
|
43 |
+pd_2: times 4 dd 2 |
|
44 |
+pd_128: times 4 dd 128 |
|
45 |
+pd_512: times 4 dd 512 |
|
46 |
+pd_2048: times 4 dd 2048 |
|
47 |
+pd_8192: times 4 dd 8192 |
|
48 |
+pd_32768: times 4 dd 32768 |
|
49 |
+pd_131072: times 4 dd 131072 |
|
50 |
+ |
|
51 |
+SECTION .text |
|
52 |
+ |
|
53 |
+; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], |
|
54 |
+; uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], |
|
55 |
+; int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], |
|
56 |
+; const int16_t yuv_offset[2][8]) |
|
57 |
+ |
|
58 |
+%if ARCH_X86_64 |
|
59 |
+%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert) |
|
60 |
+ |
|
61 |
+%assign %%sh (14 + %1 - %2) |
|
62 |
+%assign %%rnd (1 << (%%sh - 1)) |
|
63 |
+%assign %%uvinoff (128 << (%1 - 8)) |
|
64 |
+%assign %%uvoutoff (128 << (%2 - 8)) |
|
65 |
+%if %3 == 0 |
|
66 |
+%assign %%ss 444 |
|
67 |
+%elif %4 == 0 |
|
68 |
+%assign %%ss 422 |
|
69 |
+%else ; %4 == 1 |
|
70 |
+%assign %%ss 420 |
|
71 |
+%endif ; %3/%4 |
|
72 |
+%if %2 != 8 |
|
73 |
+%assign %%maxval (1 << %2) - 1 |
|
74 |
+%endif ; %2 != 8 |
|
75 |
+ |
|
76 |
+%assign %%ypsh %%sh - 1 |
|
77 |
+%if %%ypsh > 14 |
|
78 |
+%assign %%yoffsh %%ypsh - 13 |
|
79 |
+%assign %%ypsh 14 |
|
80 |
+%else |
|
81 |
+%assign %%yoffsh 1 |
|
82 |
+%endif |
|
83 |
+%assign %%yprnd (1 << (%%yoffsh - 1)) |
|
84 |
+%assign %%ypmul (1 << %%ypsh) |
|
85 |
+ |
|
86 |
+cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \ |
|
87 |
+ yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo |
|
88 |
+%if %3 == 1 |
|
89 |
+ inc wd |
|
90 |
+ sar wd, 1 |
|
91 |
+%if %4 == 1 |
|
92 |
+ inc hd |
|
93 |
+ sar hd, 1 |
|
94 |
+%endif ; %4 == 1 |
|
95 |
+%endif ; %3 == 1 |
|
96 |
+ mov [rsp+3*mmsize+0], wd |
|
97 |
+ mov [rsp+3*mmsize+4], hd |
|
98 |
+ |
|
99 |
+ mova m10, [cq] |
|
100 |
+ pxor m11, m11 |
|
101 |
+ mova m12, [pd_ %+ %%uvoutoff] |
|
102 |
+ pslld m12, %%sh |
|
103 |
+ paddd m12, [pd_ %+ %%rnd] |
|
104 |
+ mova m13, [pw_ %+ %%uvinoff] |
|
105 |
+ mova m14, [yoffq+ 0] ; y_off_in |
|
106 |
+ mova m15, [yoffq+16] ; y_off_out |
|
107 |
+%if %%yoffsh != 0 |
|
108 |
+ psllw m15, %%yoffsh |
|
109 |
+%endif |
|
110 |
+ paddw m15, [pw_ %+ %%yprnd] |
|
111 |
+ punpcklwd m10, m15 |
|
112 |
+ mova m15, [pw_ %+ %%ypmul] |
|
113 |
+ movh m0, [cq+1*16] ; cyu |
|
114 |
+ movh m1, [cq+2*16] ; cyv |
|
115 |
+ movh m2, [cq+4*16] ; cuu |
|
116 |
+ movh m3, [cq+5*16] ; cuv |
|
117 |
+ movh m4, [cq+7*16] ; cvu |
|
118 |
+ movh m5, [cq+8*16] ; cvv |
|
119 |
+ punpcklwd m0, m1 |
|
120 |
+ punpcklwd m2, m3 |
|
121 |
+ punpcklwd m4, m5 |
|
122 |
+ mova [rsp+0*mmsize], m0 |
|
123 |
+ mova [rsp+1*mmsize], m2 |
|
124 |
+ mova [rsp+2*mmsize], m4 |
|
125 |
+ |
|
126 |
+ DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp |
|
127 |
+ |
|
128 |
+ mov uiq, [yiq+gprsize*1] |
|
129 |
+ mov viq, [yiq+gprsize*2] |
|
130 |
+ mov yiq, [yiq+gprsize*0] |
|
131 |
+ mov uoq, [yoq+gprsize*1] |
|
132 |
+ mov voq, [yoq+gprsize*2] |
|
133 |
+ mov yoq, [yoq+gprsize*0] |
|
134 |
+ mov uisq, [yisq+gprsize*1] |
|
135 |
+ mov visq, [yisq+gprsize*2] |
|
136 |
+ mov yisq, [yisq+gprsize*0] |
|
137 |
+ mov uosq, [yosq+gprsize*1] |
|
138 |
+ mov vosq, [yosq+gprsize*2] |
|
139 |
+ mov yosq, [yosq+gprsize*0] |
|
140 |
+ |
|
141 |
+.loop_v: |
|
142 |
+ xor xq, xq |
|
143 |
+ |
|
144 |
+.loop_h: |
|
145 |
+%if %4 == 1 |
|
146 |
+ lea tmpq, [yiq+yisq] |
|
147 |
+%endif ; %4 == 1 |
|
148 |
+%if %1 == 8 |
|
149 |
+ movu m0, [yiq+xq*(1<<%3)] ; y00/01 |
|
150 |
+%if %4 == 1 |
|
151 |
+ movu m2, [tmpq+xq*2] ; y10/11 |
|
152 |
+%endif ; %4 == 1 |
|
153 |
+%if %3 == 1 |
|
154 |
+ movh m4, [uiq+xq] ; u |
|
155 |
+ movh m5, [viq+xq] ; v |
|
156 |
+%else ; %3 != 1 |
|
157 |
+ movu m4, [uiq+xq] ; u |
|
158 |
+ movu m5, [viq+xq] ; v |
|
159 |
+%endif ; %3 ==/!= 1 |
|
160 |
+ punpckhbw m1, m0, m11 |
|
161 |
+ punpcklbw m0, m11 |
|
162 |
+%if %4 == 1 |
|
163 |
+ punpckhbw m3, m2, m11 |
|
164 |
+ punpcklbw m2, m11 |
|
165 |
+%endif ; %4 == 1 |
|
166 |
+%if %3 == 0 |
|
167 |
+ punpckhbw m2, m4, m11 |
|
168 |
+ punpckhbw m3, m5, m11 |
|
169 |
+%endif ; %3 == 0 |
|
170 |
+ punpcklbw m4, m11 |
|
171 |
+ punpcklbw m5, m11 |
|
172 |
+%else ; %1 != 8 |
|
173 |
+ movu m0, [yiq+xq*(2<<%3)] ; y00/01 |
|
174 |
+ movu m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01 |
|
175 |
+%if %4 == 1 |
|
176 |
+ movu m2, [tmpq+xq*4] ; y10/11 |
|
177 |
+ movu m3, [tmpq+xq*4+mmsize] ; y10/11 |
|
178 |
+%endif ; %4 == 1 |
|
179 |
+ movu m4, [uiq+xq*2] ; u |
|
180 |
+ movu m5, [viq+xq*2] ; v |
|
181 |
+%if %3 == 0 |
|
182 |
+ movu m2, [uiq+xq*2+mmsize] |
|
183 |
+ movu m3, [viq+xq*2+mmsize] |
|
184 |
+%endif ; %3 == 0 |
|
185 |
+%endif ; %1 ==/!= 8 |
|
186 |
+ psubw m0, m14 |
|
187 |
+ psubw m1, m14 |
|
188 |
+%if %4 == 1 |
|
189 |
+ psubw m2, m14 |
|
190 |
+ psubw m3, m14 |
|
191 |
+%endif ; %4 == 1 |
|
192 |
+ psubw m4, m13 |
|
193 |
+ psubw m5, m13 |
|
194 |
+%if %3 == 0 |
|
195 |
+ psubw m2, m13 |
|
196 |
+ psubw m3, m13 |
|
197 |
+%endif ; %3 == 0 |
|
198 |
+ |
|
199 |
+ SBUTTERFLY wd, 4, 5, 6 |
|
200 |
+ pmaddwd m6, m4, [rsp+1*mmsize] |
|
201 |
+ pmaddwd m7, m5, [rsp+1*mmsize] |
|
202 |
+%if %3 == 0 |
|
203 |
+ SBUTTERFLY wd, 2, 3, 8 |
|
204 |
+ pmaddwd m8, m2, [rsp+1*mmsize] |
|
205 |
+ pmaddwd m9, m3, [rsp+1*mmsize] |
|
206 |
+%else ; %3 != 0 |
|
207 |
+ pmaddwd m8, m4, [rsp+2*mmsize] |
|
208 |
+ pmaddwd m9, m5, [rsp+2*mmsize] |
|
209 |
+%endif |
|
210 |
+ paddd m6, m12 |
|
211 |
+ paddd m7, m12 |
|
212 |
+ paddd m8, m12 |
|
213 |
+ paddd m9, m12 |
|
214 |
+ psrad m6, %%sh |
|
215 |
+ psrad m7, %%sh |
|
216 |
+ psrad m8, %%sh |
|
217 |
+ psrad m9, %%sh |
|
218 |
+ packssdw m6, m7 |
|
219 |
+ packssdw m8, m9 |
|
220 |
+%if %2 == 8 |
|
221 |
+ packuswb m6, m8 |
|
222 |
+%if %3 == 0 |
|
223 |
+ movu [uoq+xq], m6 |
|
224 |
+%else ; %3 != 0 |
|
225 |
+ movh [uoq+xq], m6 |
|
226 |
+ movhps [voq+xq], m6 |
|
227 |
+%endif ; %3 ==/!= 0 |
|
228 |
+%else ; %2 != 8 |
|
229 |
+ CLIPW m6, m11, [pw_ %+ %%maxval] |
|
230 |
+ CLIPW m8, m11, [pw_ %+ %%maxval] |
|
231 |
+ movu [uoq+xq*2], m6 |
|
232 |
+%if %3 == 0 |
|
233 |
+ movu [uoq+xq*2+mmsize], m8 |
|
234 |
+%else ; %3 != 0 |
|
235 |
+ movu [voq+xq*2], m8 |
|
236 |
+%endif ; %3 ==/!= 0 |
|
237 |
+%endif ; %2 ==/!= 8 |
|
238 |
+ |
|
239 |
+%if %3 == 0 |
|
240 |
+ pmaddwd m6, m4, [rsp+2*mmsize] |
|
241 |
+ pmaddwd m7, m5, [rsp+2*mmsize] |
|
242 |
+ pmaddwd m8, m2, [rsp+2*mmsize] |
|
243 |
+ pmaddwd m9, m3, [rsp+2*mmsize] |
|
244 |
+ paddd m6, m12 |
|
245 |
+ paddd m7, m12 |
|
246 |
+ paddd m8, m12 |
|
247 |
+ paddd m9, m12 |
|
248 |
+ psrad m6, %%sh |
|
249 |
+ psrad m7, %%sh |
|
250 |
+ psrad m8, %%sh |
|
251 |
+ psrad m9, %%sh |
|
252 |
+ packssdw m6, m7 |
|
253 |
+ packssdw m8, m9 |
|
254 |
+%if %2 == 8 |
|
255 |
+ packuswb m6, m8 |
|
256 |
+ movu [voq+xq], m6 |
|
257 |
+%else ; %2 != 8 |
|
258 |
+ CLIPW m6, m11, [pw_ %+ %%maxval] |
|
259 |
+ CLIPW m8, m11, [pw_ %+ %%maxval] |
|
260 |
+ movu [voq+xq*2], m6 |
|
261 |
+ movu [voq+xq*2+mmsize], m8 |
|
262 |
+%endif ; %2 ==/!= 8 |
|
263 |
+%endif ; %3 == 0 |
|
264 |
+ |
|
265 |
+ pmaddwd m4, [rsp+0*mmsize] |
|
266 |
+ pmaddwd m5, [rsp+0*mmsize] ; uv_val |
|
267 |
+%if %3 == 0 |
|
268 |
+ pmaddwd m2, [rsp+0*mmsize] |
|
269 |
+ pmaddwd m3, [rsp+0*mmsize] |
|
270 |
+%endif ; %3 == 0 |
|
271 |
+ |
|
272 |
+ ; unpack y pixels with m15 (shifted round + offset), then multiply |
|
273 |
+ ; by m10, add uv pixels, and we're done! |
|
274 |
+%if %3 == 1 |
|
275 |
+ punpckhdq m8, m4, m4 |
|
276 |
+ punpckldq m4, m4 |
|
277 |
+ punpckhdq m9, m5, m5 |
|
278 |
+ punpckldq m5, m5 |
|
279 |
+%else ; %3 != 1 |
|
280 |
+ SWAP 8, 5, 2 |
|
281 |
+ SWAP 3, 9 |
|
282 |
+%endif ; %3 ==/!= 1 |
|
283 |
+%if %4 == 1 |
|
284 |
+ punpckhwd m6, m2, m15 |
|
285 |
+ punpcklwd m2, m15 |
|
286 |
+ punpckhwd m7, m3, m15 |
|
287 |
+ punpcklwd m3, m15 |
|
288 |
+ pmaddwd m2, m10 |
|
289 |
+ pmaddwd m6, m10 |
|
290 |
+ pmaddwd m3, m10 |
|
291 |
+ pmaddwd m7, m10 |
|
292 |
+ paddd m2, m4 |
|
293 |
+ paddd m6, m8 |
|
294 |
+ paddd m3, m5 |
|
295 |
+ paddd m7, m9 |
|
296 |
+ psrad m2, %%sh |
|
297 |
+ psrad m6, %%sh |
|
298 |
+ psrad m3, %%sh |
|
299 |
+ psrad m7, %%sh |
|
300 |
+ packssdw m2, m6 |
|
301 |
+ packssdw m3, m7 |
|
302 |
+ |
|
303 |
+ lea tmpq, [yoq+yosq] |
|
304 |
+%if %2 == 8 |
|
305 |
+ packuswb m2, m3 |
|
306 |
+ movu [tmpq+xq*2], m2 |
|
307 |
+%else ; %2 != 8 |
|
308 |
+ CLIPW m2, m11, [pw_ %+ %%maxval] |
|
309 |
+ CLIPW m3, m11, [pw_ %+ %%maxval] |
|
310 |
+ movu [tmpq+xq*4], m2 |
|
311 |
+ movu [tmpq+xq*4+mmsize], m3 |
|
312 |
+%endif ; %2 ==/!= 8 |
|
313 |
+%endif ; %4 == 1 |
|
314 |
+ |
|
315 |
+ punpckhwd m6, m0, m15 |
|
316 |
+ punpcklwd m0, m15 |
|
317 |
+ punpckhwd m7, m1, m15 |
|
318 |
+ punpcklwd m1, m15 |
|
319 |
+ pmaddwd m0, m10 |
|
320 |
+ pmaddwd m6, m10 |
|
321 |
+ pmaddwd m1, m10 |
|
322 |
+ pmaddwd m7, m10 |
|
323 |
+ paddd m0, m4 |
|
324 |
+ paddd m6, m8 |
|
325 |
+ paddd m1, m5 |
|
326 |
+ paddd m7, m9 |
|
327 |
+ psrad m0, %%sh |
|
328 |
+ psrad m6, %%sh |
|
329 |
+ psrad m1, %%sh |
|
330 |
+ psrad m7, %%sh |
|
331 |
+ packssdw m0, m6 |
|
332 |
+ packssdw m1, m7 |
|
333 |
+ |
|
334 |
+%if %2 == 8 |
|
335 |
+ packuswb m0, m1 |
|
336 |
+ movu [yoq+xq*(1<<%3)], m0 |
|
337 |
+%else ; %2 != 8 |
|
338 |
+ CLIPW m0, m11, [pw_ %+ %%maxval] |
|
339 |
+ CLIPW m1, m11, [pw_ %+ %%maxval] |
|
340 |
+ movu [yoq+xq*(2<<%3)], m0 |
|
341 |
+ movu [yoq+xq*(2<<%3)+mmsize], m1 |
|
342 |
+%endif ; %2 ==/!= 8 |
|
343 |
+ |
|
344 |
+ add xq, mmsize >> %3 |
|
345 |
+ cmp xd, dword [rsp+3*mmsize+0] |
|
346 |
+ jl .loop_h |
|
347 |
+ |
|
348 |
+%if %4 == 1 |
|
349 |
+ lea yiq, [yiq+yisq*2] |
|
350 |
+ lea yoq, [yoq+yosq*2] |
|
351 |
+%else ; %4 != 1 |
|
352 |
+ add yiq, yisq |
|
353 |
+ add yoq, yosq |
|
354 |
+%endif ; %4 ==/!= 1 |
|
355 |
+ add uiq, uisq |
|
356 |
+ add viq, visq |
|
357 |
+ add uoq, uosq |
|
358 |
+ add voq, vosq |
|
359 |
+ dec dword [rsp+3*mmsize+4] |
|
360 |
+ jg .loop_v |
|
361 |
+ |
|
362 |
+ RET |
|
363 |
+%endmacro |
|
364 |
+ |
|
365 |
+%macro YUV2YUV_FNS 2 ; ss_w, ss_h |
|
366 |
+YUV2YUV_FN 8, 8, %1, %2 |
|
367 |
+YUV2YUV_FN 10, 8, %1, %2 |
|
368 |
+YUV2YUV_FN 12, 8, %1, %2 |
|
369 |
+YUV2YUV_FN 8, 10, %1, %2 |
|
370 |
+YUV2YUV_FN 10, 10, %1, %2 |
|
371 |
+YUV2YUV_FN 12, 10, %1, %2 |
|
372 |
+YUV2YUV_FN 8, 12, %1, %2 |
|
373 |
+YUV2YUV_FN 10, 12, %1, %2 |
|
374 |
+YUV2YUV_FN 12, 12, %1, %2 |
|
375 |
+%endmacro |
|
376 |
+ |
|
377 |
+INIT_XMM sse2 |
|
378 |
+YUV2YUV_FNS 0, 0 |
|
379 |
+YUV2YUV_FNS 1, 0 |
|
380 |
+YUV2YUV_FNS 1, 1 |
|
381 |
+ |
|
382 |
+; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride, |
|
383 |
+; uint8_t *yuv[3], ptrdiff_t yuv_stride[3], |
|
384 |
+; int w, int h, const int16_t yuv2rgb_coeffs[3][3][8], |
|
385 |
+; const int16_t yuv_offset[8]) |
|
386 |
+%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert) |
|
387 |
+%assign %%sh (%1 - 1) |
|
388 |
+%assign %%rnd (1 << (%%sh - 1)) |
|
389 |
+%assign %%uvoff (1 << (%1 - 1)) |
|
390 |
+%if %2 == 0 |
|
391 |
+%assign %%ss 444 |
|
392 |
+%elif %3 == 0 |
|
393 |
+%assign %%ss 422 |
|
394 |
+%else ; %3 == 1 |
|
395 |
+%assign %%ss 420 |
|
396 |
+%endif ; %2/%3 |
|
397 |
+ |
|
398 |
+cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \ |
|
399 |
+ rgb, rgbs, yuv, yuvs, ww, h, c, yoff |
|
400 |
+%if %2 == 1 |
|
401 |
+ inc wwd |
|
402 |
+ sar wwd, 1 |
|
403 |
+%endif ; %2 == 1 |
|
404 |
+%if %3 == 1 |
|
405 |
+ inc hd |
|
406 |
+ sar hd, 1 |
|
407 |
+%endif ; %3 == 1 |
|
408 |
+ pxor m11, m11 |
|
409 |
+ mova m15, [yoffq] ; yoff |
|
410 |
+ movh m14, [cq+ 0] ; cy |
|
411 |
+ movh m10, [cq+ 32] ; crv |
|
412 |
+ movh m13, [cq+112] ; cbu |
|
413 |
+ movh m12, [cq+ 64] ; cgu |
|
414 |
+ movh m9, [cq+ 80] ; cgv |
|
415 |
+ punpcklwd m14, [pw_ %+ %%rnd] ; cy, rnd |
|
416 |
+ punpcklwd m13, m11 ; cbu, 0 |
|
417 |
+ punpcklwd m11, m10 ; 0, crv |
|
418 |
+ punpcklwd m12, m9 ; cgu, cgv |
|
419 |
+ mova [rsp+0*mmsize], m11 |
|
420 |
+ mova [rsp+1*mmsize], m12 |
|
421 |
+ mova [rsp+2*mmsize], m13 |
|
422 |
+ mova [rsp+3*mmsize], m14 |
|
423 |
+ pxor m14, m14 |
|
424 |
+ |
|
425 |
+ DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp |
|
426 |
+ |
|
427 |
+ mov gq, [rq+1*gprsize] |
|
428 |
+ mov bq, [rq+2*gprsize] |
|
429 |
+ mov rq, [rq+0*gprsize] |
|
430 |
+ mov uq, [yq+1*gprsize] |
|
431 |
+ mov vq, [yq+2*gprsize] |
|
432 |
+ mov yq, [yq+0*gprsize] |
|
433 |
+ mov usq, [ysq+1*gprsize] |
|
434 |
+ mov vsq, [ysq+2*gprsize] |
|
435 |
+ mov ysq, [ysq+0*gprsize] |
|
436 |
+ |
|
437 |
+.loop_v: |
|
438 |
+ xor xq, xq |
|
439 |
+ |
|
440 |
+.loop_h: |
|
441 |
+%if %3 == 1 |
|
442 |
+ lea tmpq, [yq+ysq] |
|
443 |
+%endif ; %3 == 1 |
|
444 |
+%if %1 == 8 |
|
445 |
+ movu m0, [yq+xq*(1<<%2)] |
|
446 |
+%if %3 == 1 |
|
447 |
+ movu m2, [tmpq+xq*2] |
|
448 |
+%endif ; %3 == 1 |
|
449 |
+%if %2 == 1 |
|
450 |
+ movh m4, [uq+xq] |
|
451 |
+ movh m5, [vq+xq] |
|
452 |
+%else ; %2 != 1 |
|
453 |
+ movu m4, [uq+xq] |
|
454 |
+ movu m5, [vq+xq] |
|
455 |
+%endif ; %2 ==/!= 1 |
|
456 |
+ punpckhbw m1, m0, m14 |
|
457 |
+ punpcklbw m0, m14 |
|
458 |
+%if %3 == 1 |
|
459 |
+ punpckhbw m3, m2, m14 |
|
460 |
+ punpcklbw m2, m14 |
|
461 |
+%endif ; %3 == 1 |
|
462 |
+%if %2 == 0 |
|
463 |
+ punpckhbw m2, m4, m14 |
|
464 |
+ punpckhbw m3, m5, m14 |
|
465 |
+%endif ; %2 == 0 |
|
466 |
+ punpcklbw m4, m14 |
|
467 |
+ punpcklbw m5, m14 |
|
468 |
+%else ; %1 != 8 |
|
469 |
+ movu m0, [yq+xq*(2<<%2)] |
|
470 |
+ movu m1, [yq+xq*(2<<%2)+mmsize] |
|
471 |
+%if %3 == 1 |
|
472 |
+ movu m2, [tmpq+xq*4] |
|
473 |
+ movu m3, [tmpq+xq*4+mmsize] |
|
474 |
+%endif ; %3 == 1 |
|
475 |
+ movu m4, [uq+xq*2] |
|
476 |
+ movu m5, [vq+xq*2] |
|
477 |
+%if %2 == 0 |
|
478 |
+ movu m2, [uq+xq*2+mmsize] |
|
479 |
+ movu m3, [vq+xq*2+mmsize] |
|
480 |
+%endif ; %2 == 0 |
|
481 |
+%endif ; %1 ==/!= 8 |
|
482 |
+ psubw m0, m15 |
|
483 |
+ psubw m1, m15 |
|
484 |
+%if %3 == 1 |
|
485 |
+ psubw m2, m15 |
|
486 |
+ psubw m3, m15 |
|
487 |
+%endif ; %3 == 1 |
|
488 |
+ psubw m4, [pw_ %+ %%uvoff] |
|
489 |
+ psubw m5, [pw_ %+ %%uvoff] |
|
490 |
+ SBUTTERFLY wd, 4, 5, 6 |
|
491 |
+%if %2 == 0 |
|
492 |
+ psubw m2, [pw_ %+ %%uvoff] |
|
493 |
+ psubw m3, [pw_ %+ %%uvoff] |
|
494 |
+ SBUTTERFLY wd, 2, 3, 6 |
|
495 |
+%endif ; %2 == 0 |
|
496 |
+ |
|
497 |
+ ; calculate y+rnd full-resolution [0-3,6-9] |
|
498 |
+ punpckhwd m6, m0, [pw_1] ; y, 1 |
|
499 |
+ punpcklwd m0, [pw_1] ; y, 1 |
|
500 |
+ punpckhwd m7, m1, [pw_1] ; y, 1 |
|
501 |
+ punpcklwd m1, [pw_1] ; y, 1 |
|
502 |
+ pmaddwd m0, [rsp+3*mmsize] |
|
503 |
+ pmaddwd m6, [rsp+3*mmsize] |
|
504 |
+ pmaddwd m1, [rsp+3*mmsize] |
|
505 |
+ pmaddwd m7, [rsp+3*mmsize] |
|
506 |
+%if %3 == 1 |
|
507 |
+ punpckhwd m8, m2, [pw_1] ; y, 1 |
|
508 |
+ punpcklwd m2, [pw_1] ; y, 1 |
|
509 |
+ punpckhwd m9, m3, [pw_1] ; y, 1 |
|
510 |
+ punpcklwd m3, [pw_1] ; y, 1 |
|
511 |
+ pmaddwd m2, [rsp+3*mmsize] |
|
512 |
+ pmaddwd m8, [rsp+3*mmsize] |
|
513 |
+ pmaddwd m3, [rsp+3*mmsize] |
|
514 |
+ pmaddwd m9, [rsp+3*mmsize] |
|
515 |
+ mova [rsp+4*mmsize], m2 |
|
516 |
+ mova [rsp+5*mmsize], m8 |
|
517 |
+ mova [rsp+6*mmsize], m3 |
|
518 |
+ mova [rsp+7*mmsize], m9 |
|
519 |
+%endif ; %3 == 1 |
|
520 |
+ |
|
521 |
+ ; calculate r offsets (un-subsampled, then duplicate) |
|
522 |
+ pmaddwd m10, m4, [rsp+0*mmsize] |
|
523 |
+%if %2 == 1 |
|
524 |
+ pmaddwd m12, m5, [rsp+0*mmsize] |
|
525 |
+ punpckhdq m11, m10, m10 |
|
526 |
+ punpckldq m10, m10 |
|
527 |
+ punpckhdq m13, m12, m12 |
|
528 |
+ punpckldq m12, m12 |
|
529 |
+%else ; %2 != 1 |
|
530 |
+ pmaddwd m11, m5, [rsp+0*mmsize] |
|
531 |
+ pmaddwd m12, m2, [rsp+0*mmsize] |
|
532 |
+ pmaddwd m13, m3, [rsp+0*mmsize] |
|
533 |
+%endif ; %2 ==/!= 1 |
|
534 |
+%if %3 == 1 |
|
535 |
+ paddd m2, m10, [rsp+4*mmsize] |
|
536 |
+ paddd m3, m11, [rsp+5*mmsize] |
|
537 |
+ paddd m8, m12, [rsp+6*mmsize] |
|
538 |
+ paddd m9, m13, [rsp+7*mmsize] |
|
539 |
+%endif |
|
540 |
+ paddd m10, m0 |
|
541 |
+ paddd m11, m6 |
|
542 |
+ paddd m12, m1 |
|
543 |
+ paddd m13, m7 |
|
544 |
+%if %3 == 1 |
|
545 |
+ psrad m2, %%sh |
|
546 |
+ psrad m3, %%sh |
|
547 |
+ psrad m8, %%sh |
|
548 |
+ psrad m9, %%sh |
|
549 |
+%endif ; %3 == 1 |
|
550 |
+ psrad m10, %%sh |
|
551 |
+ psrad m11, %%sh |
|
552 |
+ psrad m12, %%sh |
|
553 |
+ psrad m13, %%sh |
|
554 |
+%if %3 == 1 |
|
555 |
+ lea tmpq, [rq+rgbsq*2] |
|
556 |
+ packssdw m2, m3 |
|
557 |
+ packssdw m8, m9 |
|
558 |
+ mova [tmpq+xq*4], m2 |
|
559 |
+ mova [tmpq+xq*4+mmsize], m8 |
|
560 |
+%endif ; %3 == 1 |
|
561 |
+ packssdw m10, m11 |
|
562 |
+ packssdw m12, m13 |
|
563 |
+ mova [rq+xq*(2 << %2)], m10 |
|
564 |
+ mova [rq+xq*(2 << %2)+mmsize], m12 |
|
565 |
+ |
|
566 |
+ ; calculate g offsets (un-subsampled, then duplicate) |
|
567 |
+ pmaddwd m10, m4, [rsp+1*mmsize] |
|
568 |
+%if %2 == 1 |
|
569 |
+ pmaddwd m12, m5, [rsp+1*mmsize] |
|
570 |
+ punpckhdq m11, m10, m10 |
|
571 |
+ punpckldq m10, m10 |
|
572 |
+ punpckhdq m13, m12, m12 |
|
573 |
+ punpckldq m12, m12 |
|
574 |
+%else ; %2 != 1 |
|
575 |
+ pmaddwd m11, m5, [rsp+1*mmsize] |
|
576 |
+ pmaddwd m12, m2, [rsp+1*mmsize] |
|
577 |
+ pmaddwd m13, m3, [rsp+1*mmsize] |
|
578 |
+%endif ; %2 ==/!= 1 |
|
579 |
+%if %3 == 1 |
|
580 |
+ paddd m2, m10, [rsp+4*mmsize] |
|
581 |
+ paddd m3, m11, [rsp+5*mmsize] |
|
582 |
+ paddd m8, m12, [rsp+6*mmsize] |
|
583 |
+ paddd m9, m13, [rsp+7*mmsize] |
|
584 |
+%endif ; %3 == 1 |
|
585 |
+ paddd m10, m0 |
|
586 |
+ paddd m11, m6 |
|
587 |
+ paddd m12, m1 |
|
588 |
+ paddd m13, m7 |
|
589 |
+%if %3 == 1 |
|
590 |
+ psrad m2, %%sh |
|
591 |
+ psrad m3, %%sh |
|
592 |
+ psrad m8, %%sh |
|
593 |
+ psrad m9, %%sh |
|
594 |
+%endif ; %3 == 1 |
|
595 |
+ psrad m10, %%sh |
|
596 |
+ psrad m11, %%sh |
|
597 |
+ psrad m12, %%sh |
|
598 |
+ psrad m13, %%sh |
|
599 |
+%if %3 == 1 |
|
600 |
+ lea tmpq, [gq+rgbsq*2] |
|
601 |
+ packssdw m2, m3 |
|
602 |
+ packssdw m8, m9 |
|
603 |
+ mova [tmpq+xq*4], m2 |
|
604 |
+ mova [tmpq+xq*4+mmsize], m8 |
|
605 |
+%endif ; %3 == 1 |
|
606 |
+ packssdw m10, m11 |
|
607 |
+ packssdw m12, m13 |
|
608 |
+ mova [gq+xq*(2 << %2)], m10 |
|
609 |
+ mova [gq+xq*(2 << %2)+mmsize], m12 |
|
610 |
+ |
|
611 |
+ ; calculate b offsets (un-subsampled, then duplicate) |
|
612 |
+ pmaddwd m4, [rsp+2*mmsize] |
|
613 |
+ pmaddwd m5, [rsp+2*mmsize] |
|
614 |
+%if %2 == 1 |
|
615 |
+ punpckhdq m2, m4, m4 |
|
616 |
+ punpckldq m4, m4 |
|
617 |
+ punpckhdq m3, m5, m5 |
|
618 |
+ punpckldq m5, m5 |
|
619 |
+%else ; %2 != 1 |
|
620 |
+ pmaddwd m2, [rsp+2*mmsize] |
|
621 |
+ pmaddwd m3, [rsp+2*mmsize] |
|
622 |
+ SWAP 2, 5 |
|
623 |
+%endif ; %2 ==/!= 1 |
|
624 |
+ paddd m0, m4 |
|
625 |
+ paddd m6, m2 |
|
626 |
+ paddd m1, m5 |
|
627 |
+ paddd m7, m3 |
|
628 |
+%if %3 == 1 |
|
629 |
+ paddd m4, [rsp+4*mmsize] |
|
630 |
+ paddd m2, [rsp+5*mmsize] |
|
631 |
+ paddd m5, [rsp+6*mmsize] |
|
632 |
+ paddd m3, [rsp+7*mmsize] |
|
633 |
+%endif ; %3 == 1 |
|
634 |
+ psrad m0, %%sh |
|
635 |
+ psrad m6, %%sh |
|
636 |
+ psrad m1, %%sh |
|
637 |
+ psrad m7, %%sh |
|
638 |
+%if %3 == 1 |
|
639 |
+ psrad m4, %%sh |
|
640 |
+ psrad m2, %%sh |
|
641 |
+ psrad m5, %%sh |
|
642 |
+ psrad m3, %%sh |
|
643 |
+%endif ; %3 == 1 |
|
644 |
+ packssdw m0, m6 |
|
645 |
+ packssdw m1, m7 |
|
646 |
+ movu [bq+xq*(2 << %2)], m0 |
|
647 |
+ movu [bq+xq*(2 << %2)+mmsize], m1 |
|
648 |
+%if %3 == 1 |
|
649 |
+ lea tmpq, [bq+rgbsq*2] |
|
650 |
+ packssdw m4, m2 |
|
651 |
+ packssdw m5, m3 |
|
652 |
+ movu [tmpq+xq*4], m4 |
|
653 |
+ movu [tmpq+xq*4+mmsize], m5 |
|
654 |
+%endif ; %3 == 1 |
|
655 |
+ |
|
656 |
+ add xd, mmsize >> %2 |
|
657 |
+ cmp xd, wwd |
|
658 |
+ jl .loop_h |
|
659 |
+ |
|
660 |
+ lea rq, [rq+rgbsq*(2 << %3)] |
|
661 |
+ lea gq, [gq+rgbsq*(2 << %3)] |
|
662 |
+ lea bq, [bq+rgbsq*(2 << %3)] |
|
663 |
+%if %3 == 1 |
|
664 |
+ lea yq, [yq+ysq*2] |
|
665 |
+%else ; %3 != 0 |
|
666 |
+ add yq, ysq |
|
667 |
+%endif ; %3 ==/!= 1 |
|
668 |
+ add uq, usq |
|
669 |
+ add vq, vsq |
|
670 |
+ dec hd |
|
671 |
+ jg .loop_v |
|
672 |
+ |
|
673 |
+ RET |
|
674 |
+%endmacro |
|
675 |
+ |
|
676 |
+%macro YUV2RGB_FNS 2 |
|
677 |
+YUV2RGB_FN 8, %1, %2 |
|
678 |
+YUV2RGB_FN 10, %1, %2 |
|
679 |
+YUV2RGB_FN 12, %1, %2 |
|
680 |
+%endmacro |
|
681 |
+ |
|
682 |
+INIT_XMM sse2 |
|
683 |
+YUV2RGB_FNS 0, 0 |
|
684 |
+YUV2RGB_FNS 1, 0 |
|
685 |
+YUV2RGB_FNS 1, 1 |
|
686 |
+ |
|
687 |
+%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert) |
|
688 |
+%assign %%sh 29 - %1 |
|
689 |
+%assign %%rnd (1 << (%%sh - 15)) |
|
690 |
+%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14)) |
|
691 |
+%if %1 != 8 |
|
692 |
+%assign %%maxval ((1 << %1) - 1) |
|
693 |
+%endif ; %1 != 8 |
|
694 |
+%if %2 == 0 |
|
695 |
+%assign %%ss 444 |
|
696 |
+%elif %3 == 0 |
|
697 |
+%assign %%ss 422 |
|
698 |
+%else ; %3 == 1 |
|
699 |
+%assign %%ss 420 |
|
700 |
+%endif ; %2/%3 |
|
701 |
+ |
|
702 |
+cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \ |
|
703 |
+ yuv, yuvs, rgb, rgbs, ww, h, c, off |
|
704 |
+%if %2 == 1 |
|
705 |
+ inc wwd |
|
706 |
+ sar wwd, 1 |
|
707 |
+%endif ; %2 == 1 |
|
708 |
+%if %3 == 1 |
|
709 |
+ inc hd |
|
710 |
+ sar hd, 1 |
|
711 |
+%endif ; %3 == 1 |
|
712 |
+ |
|
713 |
+ ; prepare coeffs |
|
714 |
+ movh m8, [offq] |
|
715 |
+ movh m9, [pw_ %+ %%uvrnd] |
|
716 |
+ psllw m8, %%sh - 14 |
|
717 |
+ paddw m9, [pw_ %+ %%rnd] |
|
718 |
+ paddw m8, [pw_ %+ %%rnd] |
|
719 |
+ movh m0, [cq+ 0] |
|
720 |
+ movh m1, [cq+ 16] |
|
721 |
+ movh m2, [cq+ 32] |
|
722 |
+ movh m3, [cq+ 48] |
|
723 |
+ movh m4, [cq+ 64] |
|
724 |
+ movh m5, [cq+ 80] |
|
725 |
+ movh m6, [cq+112] |
|
726 |
+ movh m7, [cq+128] |
|
727 |
+ punpcklwd m0, m1 |
|
728 |
+ punpcklwd m2, m8 |
|
729 |
+ punpcklwd m3, m4 |
|
730 |
+ punpcklwd m4, m5, m9 |
|
731 |
+ punpcklwd m5, m6 |
|
732 |
+ punpcklwd m7, m9 |
|
733 |
+ |
|
734 |
+ mova [rsp+0*mmsize], m0 ; cry, cgy |
|
735 |
+ mova [rsp+1*mmsize], m2 ; cby, off + rnd |
|
736 |
+ mova [rsp+2*mmsize], m3 ; cru, cgu |
|
737 |
+ mova [rsp+3*mmsize], m4 ; cburv, uvoff + rnd |
|
738 |
+ mova [rsp+4*mmsize], m5 ; cburv, cgv |
|
739 |
+ mova [rsp+5*mmsize], m7 ; cbv, uvoff + rnd |
|
740 |
+ |
|
741 |
+ |
|
742 |
+ DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x |
|
743 |
+ mov gq, [rq+gprsize*1] |
|
744 |
+ mov bq, [rq+gprsize*2] |
|
745 |
+ mov rq, [rq+gprsize*0] |
|
746 |
+ mov uq, [yq+gprsize*1] |
|
747 |
+ mov vq, [yq+gprsize*2] |
|
748 |
+ mov yq, [yq+gprsize*0] |
|
749 |
+ mov usq, [ysq+gprsize*1] |
|
750 |
+ mov vsq, [ysq+gprsize*2] |
|
751 |
+ mov ysq, [ysq+gprsize*0] |
|
752 |
+ |
|
753 |
+ pxor m15, m15 |
|
754 |
+.loop_v: |
|
755 |
+ xor xd, xd |
|
756 |
+ |
|
757 |
+.loop_h: |
|
758 |
+ ; top line y |
|
759 |
+ mova m0, [rq+xq*(2<<%2)] |
|
760 |
+ mova m3, [rq+xq*(2<<%2)+mmsize] |
|
761 |
+ mova m1, [gq+xq*(2<<%2)] |
|
762 |
+ mova m4, [gq+xq*(2<<%2)+mmsize] |
|
763 |
+ mova m2, [bq+xq*(2<<%2)] |
|
764 |
+ mova m5, [bq+xq*(2<<%2)+mmsize] |
|
765 |
+ |
|
766 |
+ punpcklwd m6, m0, m1 |
|
767 |
+ punpckhwd m7, m0, m1 |
|
768 |
+ punpcklwd m8, m3, m4 |
|
769 |
+ punpckhwd m9, m3, m4 |
|
770 |
+ punpcklwd m10, m2, [pw_16384] |
|
771 |
+ punpckhwd m11, m2, [pw_16384] |
|
772 |
+ punpcklwd m12, m5, [pw_16384] |
|
773 |
+ punpckhwd m13, m5, [pw_16384] |
|
774 |
+ |
|
775 |
+ pmaddwd m6, [rsp+0*mmsize] |
|
776 |
+ pmaddwd m7, [rsp+0*mmsize] |
|
777 |
+ pmaddwd m8, [rsp+0*mmsize] |
|
778 |
+ pmaddwd m9, [rsp+0*mmsize] |
|
779 |
+ pmaddwd m10, [rsp+1*mmsize] |
|
780 |
+ pmaddwd m11, [rsp+1*mmsize] |
|
781 |
+ pmaddwd m12, [rsp+1*mmsize] |
|
782 |
+ pmaddwd m13, [rsp+1*mmsize] |
|
783 |
+ paddd m6, m10 |
|
784 |
+ paddd m7, m11 |
|
785 |
+ paddd m8, m12 |
|
786 |
+ paddd m9, m13 |
|
787 |
+ psrad m6, %%sh |
|
788 |
+ psrad m7, %%sh |
|
789 |
+ psrad m8, %%sh |
|
790 |
+ psrad m9, %%sh |
|
791 |
+ packssdw m6, m7 |
|
792 |
+ packssdw m8, m9 |
|
793 |
+%if %1 == 8 |
|
794 |
+ packuswb m6, m8 |
|
795 |
+ movu [yq+xq*(1<<%2)], m6 |
|
796 |
+%else |
|
797 |
+ CLIPW m6, m15, [pw_ %+ %%maxval] |
|
798 |
+ CLIPW m8, m15, [pw_ %+ %%maxval] |
|
799 |
+ movu [yq+xq*(2<<%2)], m6 |
|
800 |
+ movu [yq+xq*(2<<%2)+mmsize], m8 |
|
801 |
+%endif |
|
802 |
+ |
|
803 |
+%if %2 == 1 |
|
804 |
+ ; subsampling cached data |
|
805 |
+ pmaddwd m0, [pw_1] |
|
806 |
+ pmaddwd m1, [pw_1] |
|
807 |
+ pmaddwd m2, [pw_1] |
|
808 |
+ pmaddwd m3, [pw_1] |
|
809 |
+ pmaddwd m4, [pw_1] |
|
810 |
+ pmaddwd m5, [pw_1] |
|
811 |
+ |
|
812 |
+%if %3 == 1 |
|
813 |
+ ; bottom line y, r/g portion only |
|
814 |
+ lea tmpq, [rgbsq+xq*2] |
|
815 |
+ mova m6, [rq+tmpq*2] |
|
816 |
+ mova m9, [rq+tmpq*2+mmsize] |
|
817 |
+ mova m7, [gq+tmpq*2] |
|
818 |
+ mova m10, [gq+tmpq*2+mmsize] |
|
819 |
+ mova m8, [bq+tmpq*2] |
|
820 |
+ mova m11, [bq+tmpq*2+mmsize] |
|
821 |
+ |
|
822 |
+ punpcklwd m12, m6, m7 |
|
823 |
+ punpckhwd m13, m6, m7 |
|
824 |
+ punpcklwd m14, m9, m10 |
|
825 |
+ punpckhwd m15, m9, m10 |
|
826 |
+ |
|
827 |
+ ; release two more registers |
|
828 |
+ pmaddwd m6, [pw_1] |
|
829 |
+ pmaddwd m7, [pw_1] |
|
830 |
+ pmaddwd m9, [pw_1] |
|
831 |
+ pmaddwd m10, [pw_1] |
|
832 |
+ paddd m0, m6 |
|
833 |
+ paddd m3, m9 |
|
834 |
+ paddd m1, m7 |
|
835 |
+ paddd m4, m10 |
|
836 |
+ |
|
837 |
+ ; bottom line y, b/rnd portion only |
|
838 |
+ punpcklwd m6, m8, [pw_16384] |
|
839 |
+ punpckhwd m7, m8, [pw_16384] |
|
840 |
+ punpcklwd m9, m11, [pw_16384] |
|
841 |
+ punpckhwd m10, m11, [pw_16384] |
|
842 |
+ |
|
843 |
+ pmaddwd m12, [rsp+0*mmsize] |
|
844 |
+ pmaddwd m13, [rsp+0*mmsize] |
|
845 |
+ pmaddwd m14, [rsp+0*mmsize] |
|
846 |
+ pmaddwd m15, [rsp+0*mmsize] |
|
847 |
+ pmaddwd m6, [rsp+1*mmsize] |
|
848 |
+ pmaddwd m7, [rsp+1*mmsize] |
|
849 |
+ pmaddwd m9, [rsp+1*mmsize] |
|
850 |
+ pmaddwd m10, [rsp+1*mmsize] |
|
851 |
+ paddd m12, m6 |
|
852 |
+ paddd m13, m7 |
|
853 |
+ paddd m14, m9 |
|
854 |
+ paddd m15, m10 |
|
855 |
+ psrad m12, %%sh |
|
856 |
+ psrad m13, %%sh |
|
857 |
+ psrad m14, %%sh |
|
858 |
+ psrad m15, %%sh |
|
859 |
+ packssdw m12, m13 |
|
860 |
+ packssdw m14, m15 |
|
861 |
+ lea tmpq, [yq+ysq] |
|
862 |
+%if %1 == 8 |
|
863 |
+ packuswb m12, m14 |
|
864 |
+ movu [tmpq+xq*2], m12 |
|
865 |
+%else |
|
866 |
+ pxor m15, m15 |
|
867 |
+ CLIPW m12, m15, [pw_ %+ %%maxval] |
|
868 |
+ CLIPW m14, m15, [pw_ %+ %%maxval] |
|
869 |
+ movu [tmpq+xq*4], m12 |
|
870 |
+ movu [tmpq+xq*4+mmsize], m14 |
|
871 |
+%endif |
|
872 |
+ |
|
873 |
+ ; complete subsampling of r/g/b pixels for u/v |
|
874 |
+ pmaddwd m8, [pw_1] |
|
875 |
+ pmaddwd m11, [pw_1] |
|
876 |
+ paddd m2, m8 |
|
877 |
+ paddd m5, m11 |
|
878 |
+ paddd m0, [pd_2] |
|
879 |
+ paddd m1, [pd_2] |
|
880 |
+ paddd m2, [pd_2] |
|
881 |
+ paddd m3, [pd_2] |
|
882 |
+ paddd m4, [pd_2] |
|
883 |
+ paddd m5, [pd_2] |
|
884 |
+ psrad m0, 2 |
|
885 |
+ psrad m1, 2 |
|
886 |
+ psrad m2, 2 |
|
887 |
+ psrad m3, 2 |
|
888 |
+ psrad m4, 2 |
|
889 |
+ psrad m5, 2 |
|
890 |
+%else ; %3 != 1 |
|
891 |
+ paddd m0, [pd_1] |
|
892 |
+ paddd m1, [pd_1] |
|
893 |
+ paddd m2, [pd_1] |
|
894 |
+ paddd m3, [pd_1] |
|
895 |
+ paddd m4, [pd_1] |
|
896 |
+ paddd m5, [pd_1] |
|
897 |
+ psrad m0, 1 |
|
898 |
+ psrad m1, 1 |
|
899 |
+ psrad m2, 1 |
|
900 |
+ psrad m3, 1 |
|
901 |
+ psrad m4, 1 |
|
902 |
+ psrad m5, 1 |
|
903 |
+%endif ; %3 ==/!= 1 |
|
904 |
+ packssdw m0, m3 |
|
905 |
+ packssdw m1, m4 |
|
906 |
+ packssdw m2, m5 |
|
907 |
+%endif ; %2 == 1 |
|
908 |
+ |
|
909 |
+ ; convert u/v pixels |
|
910 |
+ SBUTTERFLY wd, 0, 1, 6 |
|
911 |
+ punpckhwd m6, m2, [pw_16384] |
|
912 |
+ punpcklwd m2, [pw_16384] |
|
913 |
+ |
|
914 |
+ pmaddwd m7, m0, [rsp+2*mmsize] |
|
915 |
+ pmaddwd m8, m1, [rsp+2*mmsize] |
|
916 |
+ pmaddwd m9, m2, [rsp+3*mmsize] |
|
917 |
+ pmaddwd m10, m6, [rsp+3*mmsize] |
|
918 |
+ pmaddwd m0, [rsp+4*mmsize] |
|
919 |
+ pmaddwd m1, [rsp+4*mmsize] |
|
920 |
+ pmaddwd m2, [rsp+5*mmsize] |
|
921 |
+ pmaddwd m6, [rsp+5*mmsize] |
|
922 |
+ paddd m7, m9 |
|
923 |
+ paddd m8, m10 |
|
924 |
+ paddd m0, m2 |
|
925 |
+ paddd m1, m6 |
|
926 |
+ psrad m7, %%sh |
|
927 |
+ psrad m8, %%sh |
|
928 |
+ psrad m0, %%sh |
|
929 |
+ psrad m1, %%sh |
|
930 |
+ packssdw m7, m8 |
|
931 |
+ packssdw m0, m1 |
|
932 |
+%if %2 == 1 |
|
933 |
+%if %1 == 8 |
|
934 |
+ packuswb m7, m0 |
|
935 |
+ movh [uq+xq], m7 |
|
936 |
+ movhps [vq+xq], m7 |
|
937 |
+%else |
|
938 |
+ CLIPW m7, m15, [pw_ %+ %%maxval] |
|
939 |
+ CLIPW m0, m15, [pw_ %+ %%maxval] |
|
940 |
+ movu [uq+xq*2], m7 |
|
941 |
+ movu [vq+xq*2], m0 |
|
942 |
+%endif |
|
943 |
+%else ; %2 != 1 |
|
944 |
+ ; second set of u/v pixels |
|
945 |
+ SBUTTERFLY wd, 3, 4, 6 |
|
946 |
+ punpckhwd m6, m5, [pw_16384] |
|
947 |
+ punpcklwd m5, [pw_16384] |
|
948 |
+ |
|
949 |
+ pmaddwd m8, m3, [rsp+2*mmsize] |
|
950 |
+ pmaddwd m9, m4, [rsp+2*mmsize] |
|
951 |
+ pmaddwd m10, m5, [rsp+3*mmsize] |
|
952 |
+ pmaddwd m11, m6, [rsp+3*mmsize] |
|
953 |
+ pmaddwd m3, [rsp+4*mmsize] |
|
954 |
+ pmaddwd m4, [rsp+4*mmsize] |
|
955 |
+ pmaddwd m5, [rsp+5*mmsize] |
|
956 |
+ pmaddwd m6, [rsp+5*mmsize] |
|
957 |
+ paddd m8, m10 |
|
958 |
+ paddd m9, m11 |
|
959 |
+ paddd m3, m5 |
|
960 |
+ paddd m4, m6 |
|
961 |
+ psrad m8, %%sh |
|
962 |
+ psrad m9, %%sh |
|
963 |
+ psrad m3, %%sh |
|
964 |
+ psrad m4, %%sh |
|
965 |
+ packssdw m8, m9 |
|
966 |
+ packssdw m3, m4 |
|
967 |
+ |
|
968 |
+%if %1 == 8 |
|
969 |
+ packuswb m7, m8 |
|
970 |
+ packuswb m0, m3 |
|
971 |
+ movu [uq+xq], m7 |
|
972 |
+ movu [vq+xq], m0 |
|
973 |
+%else |
|
974 |
+ CLIPW m7, m15, [pw_ %+ %%maxval] |
|
975 |
+ CLIPW m0, m15, [pw_ %+ %%maxval] |
|
976 |
+ CLIPW m8, m15, [pw_ %+ %%maxval] |
|
977 |
+ CLIPW m3, m15, [pw_ %+ %%maxval] |
|
978 |
+ movu [uq+xq*2], m7 |
|
979 |
+ movu [uq+xq*2+mmsize], m8 |
|
980 |
+ movu [vq+xq*2], m0 |
|
981 |
+ movu [vq+xq*2+mmsize], m3 |
|
982 |
+%endif |
|
983 |
+%endif ; %2 ==/!= 1 |
|
984 |
+ |
|
985 |
+ add xq, mmsize >> %2 |
|
986 |
+ cmp xd, wwd |
|
987 |
+ jl .loop_h |
|
988 |
+ |
|
989 |
+%if %3 == 0 |
|
990 |
+ add yq, ysq |
|
991 |
+%else ; %3 != 0 |
|
992 |
+ lea yq, [yq+ysq*2] |
|
993 |
+%endif ; %3 ==/!= 0 |
|
994 |
+ add uq, usq |
|
995 |
+ add vq, vsq |
|
996 |
+ lea rq, [rq+rgbsq*(2<<%3)] |
|
997 |
+ lea gq, [gq+rgbsq*(2<<%3)] |
|
998 |
+ lea bq, [bq+rgbsq*(2<<%3)] |
|
999 |
+ dec hd |
|
1000 |
+ jg .loop_v |
|
1001 |
+ |
|
1002 |
+ RET |
|
1003 |
+%endmacro |
|
1004 |
+ |
|
1005 |
+%macro RGB2YUV_FNS 2 |
|
1006 |
+RGB2YUV_FN 8, %1, %2 |
|
1007 |
+RGB2YUV_FN 10, %1, %2 |
|
1008 |
+RGB2YUV_FN 12, %1, %2 |
|
1009 |
+%endmacro |
|
1010 |
+ |
|
1011 |
+INIT_XMM sse2 |
|
1012 |
+RGB2YUV_FNS 0, 0 |
|
1013 |
+RGB2YUV_FNS 1, 0 |
|
1014 |
+RGB2YUV_FNS 1, 1 |
|
1015 |
+ |
|
1016 |
+; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, |
|
1017 |
+; int w, int h, const int16_t coeff[3][3][8]) |
|
1018 |
+INIT_XMM sse2 |
|
1019 |
+cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c |
|
1020 |
+ movh m0, [cq+ 0] |
|
1021 |
+ movh m1, [cq+ 32] |
|
1022 |
+ movh m2, [cq+ 48] |
|
1023 |
+ movh m3, [cq+ 80] |
|
1024 |
+ movh m4, [cq+ 96] |
|
1025 |
+ movh m5, [cq+128] |
|
1026 |
+ punpcklwd m0, [cq+ 16] |
|
1027 |
+ punpcklwd m1, [pw_8192] |
|
1028 |
+ punpcklwd m2, [cq+ 64] |
|
1029 |
+ punpcklwd m3, [pw_8192] |
|
1030 |
+ punpcklwd m4, [cq+112] |
|
1031 |
+ punpcklwd m5, [pw_8192] |
|
1032 |
+ |
|
1033 |
+ DEFINE_ARGS data0, stride, ww, h, data1, data2, x |
|
1034 |
+ shl strideq, 1 |
|
1035 |
+ mov data1q, [data0q+gprsize*1] |
|
1036 |
+ mov data2q, [data0q+gprsize*2] |
|
1037 |
+ mov data0q, [data0q+gprsize*0] |
|
1038 |
+ |
|
1039 |
+.loop_v: |
|
1040 |
+ xor xd, xd |
|
1041 |
+ |
|
1042 |
+.loop_h: |
|
1043 |
+ mova m6, [data0q+xq*2] |
|
1044 |
+ mova m7, [data1q+xq*2] |
|
1045 |
+ mova m8, [data2q+xq*2] |
|
1046 |
+ SBUTTERFLY wd, 6, 7, 9 |
|
1047 |
+ punpckhwd m9, m8, [pw_1] |
|
1048 |
+ punpcklwd m8, [pw_1] |
|
1049 |
+ |
|
1050 |
+ pmaddwd m10, m6, m0 |
|
1051 |
+ pmaddwd m11, m7, m0 |
|
1052 |
+ pmaddwd m12, m8, m1 |
|
1053 |
+ pmaddwd m13, m9, m1 |
|
1054 |
+ paddd m10, m12 |
|
1055 |
+ paddd m11, m13 |
|
1056 |
+ psrad m10, 14 |
|
1057 |
+ psrad m11, 14 |
|
1058 |
+ |
|
1059 |
+ pmaddwd m12, m6, m2 |
|
1060 |
+ pmaddwd m13, m7, m2 |
|
1061 |
+ pmaddwd m14, m8, m3 |
|
1062 |
+ pmaddwd m15, m9, m3 |
|
1063 |
+ paddd m12, m14 |
|
1064 |
+ paddd m13, m15 |
|
1065 |
+ psrad m12, 14 |
|
1066 |
+ psrad m13, 14 |
|
1067 |
+ |
|
1068 |
+ pmaddwd m6, m4 |
|
1069 |
+ pmaddwd m7, m4 |
|
1070 |
+ pmaddwd m8, m5 |
|
1071 |
+ pmaddwd m9, m5 |
|
1072 |
+ paddd m6, m8 |
|
1073 |
+ paddd m7, m9 |
|
1074 |
+ psrad m6, 14 |
|
1075 |
+ psrad m7, 14 |
|
1076 |
+ |
|
1077 |
+ packssdw m10, m11 |
|
1078 |
+ packssdw m12, m13 |
|
1079 |
+ packssdw m6, m7 |
|
1080 |
+ |
|
1081 |
+ mova [data0q+xq*2], m10 |
|
1082 |
+ mova [data1q+xq*2], m12 |
|
1083 |
+ mova [data2q+xq*2], m6 |
|
1084 |
+ |
|
1085 |
+ add xd, mmsize / 2 |
|
1086 |
+ cmp xd, wwd |
|
1087 |
+ jl .loop_h |
|
1088 |
+ |
|
1089 |
+ add data0q, strideq |
|
1090 |
+ add data1q, strideq |
|
1091 |
+ add data2q, strideq |
|
1092 |
+ dec hd |
|
1093 |
+ jg .loop_v |
|
1094 |
+ |
|
1095 |
+ RET |
|
1096 |
+%endif |
0 | 1097 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,119 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com> |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include "libavutil/x86/cpu.h" |
|
21 |
+ |
|
22 |
+#include "libavfilter/colorspacedsp.h" |
|
23 |
+ |
|
24 |
+#define decl_yuv2yuv_fn(t) \ |
|
25 |
+void ff_yuv2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3], \ |
|
26 |
+ uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3], \ |
|
27 |
+ int w, int h, const int16_t yuv2yuv_coeffs[3][3][8], \ |
|
28 |
+ const int16_t yuv_offset[2][8]) |
|
29 |
+ |
|
30 |
+#define decl_yuv2yuv_fns(ss) \ |
|
31 |
+decl_yuv2yuv_fn(ss##p8to8); \ |
|
32 |
+decl_yuv2yuv_fn(ss##p10to8); \ |
|
33 |
+decl_yuv2yuv_fn(ss##p12to8); \ |
|
34 |
+decl_yuv2yuv_fn(ss##p8to10); \ |
|
35 |
+decl_yuv2yuv_fn(ss##p10to10); \ |
|
36 |
+decl_yuv2yuv_fn(ss##p12to10); \ |
|
37 |
+decl_yuv2yuv_fn(ss##p8to12); \ |
|
38 |
+decl_yuv2yuv_fn(ss##p10to12); \ |
|
39 |
+decl_yuv2yuv_fn(ss##p12to12) |
|
40 |
+ |
|
41 |
+decl_yuv2yuv_fns(420); |
|
42 |
+decl_yuv2yuv_fns(422); |
|
43 |
+decl_yuv2yuv_fns(444); |
|
44 |
+ |
|
45 |
+#define decl_yuv2rgb_fn(t) \ |
|
46 |
+void ff_yuv2rgb_##t##_sse2(int16_t *rgb_out[3], ptrdiff_t rgb_stride, \ |
|
47 |
+ uint8_t *yuv_in[3], ptrdiff_t yuv_stride[3], \ |
|
48 |
+ int w, int h, const int16_t coeff[3][3][8], \ |
|
49 |
+ const int16_t yuv_offset[8]) |
|
50 |
+ |
|
51 |
+#define decl_yuv2rgb_fns(ss) \ |
|
52 |
+decl_yuv2rgb_fn(ss##p8); \ |
|
53 |
+decl_yuv2rgb_fn(ss##p10); \ |
|
54 |
+decl_yuv2rgb_fn(ss##p12) |
|
55 |
+ |
|
56 |
+decl_yuv2rgb_fns(420); |
|
57 |
+decl_yuv2rgb_fns(422); |
|
58 |
+decl_yuv2rgb_fns(444); |
|
59 |
+ |
|
60 |
+#define decl_rgb2yuv_fn(t) \ |
|
61 |
+void ff_rgb2yuv_##t##_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_stride[3], \ |
|
62 |
+ int16_t *rgb_in[3], ptrdiff_t rgb_stride, \ |
|
63 |
+ int w, int h, const int16_t coeff[3][3][8], \ |
|
64 |
+ const int16_t yuv_offset[8]) |
|
65 |
+ |
|
66 |
+#define decl_rgb2yuv_fns(ss) \ |
|
67 |
+decl_rgb2yuv_fn(ss##p8); \ |
|
68 |
+decl_rgb2yuv_fn(ss##p10); \ |
|
69 |
+decl_rgb2yuv_fn(ss##p12) |
|
70 |
+ |
|
71 |
+decl_rgb2yuv_fns(420); |
|
72 |
+decl_rgb2yuv_fns(422); |
|
73 |
+decl_rgb2yuv_fns(444); |
|
74 |
+ |
|
75 |
+void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride, int w, int h, |
|
76 |
+ const int16_t coeff[3][3][8]); |
|
77 |
+ |
|
78 |
+void ff_colorspacedsp_x86_init(ColorSpaceDSPContext *dsp) |
|
79 |
+{ |
|
80 |
+ int cpu_flags = av_get_cpu_flags(); |
|
81 |
+ |
|
82 |
+ if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) { |
|
83 |
+#define assign_yuv2yuv_fns(idx, ss) \ |
|
84 |
+ dsp->yuv2yuv[0][0][idx] = ff_yuv2yuv_##ss##p8to8_sse2; \ |
|
85 |
+ dsp->yuv2yuv[0][1][idx] = ff_yuv2yuv_##ss##p8to10_sse2; \ |
|
86 |
+ dsp->yuv2yuv[0][2][idx] = ff_yuv2yuv_##ss##p8to12_sse2; \ |
|
87 |
+ dsp->yuv2yuv[1][0][idx] = ff_yuv2yuv_##ss##p10to8_sse2; \ |
|
88 |
+ dsp->yuv2yuv[1][1][idx] = ff_yuv2yuv_##ss##p10to10_sse2; \ |
|
89 |
+ dsp->yuv2yuv[1][2][idx] = ff_yuv2yuv_##ss##p10to12_sse2; \ |
|
90 |
+ dsp->yuv2yuv[2][0][idx] = ff_yuv2yuv_##ss##p12to8_sse2; \ |
|
91 |
+ dsp->yuv2yuv[2][1][idx] = ff_yuv2yuv_##ss##p12to10_sse2; \ |
|
92 |
+ dsp->yuv2yuv[2][2][idx] = ff_yuv2yuv_##ss##p12to12_sse2 |
|
93 |
+ |
|
94 |
+ assign_yuv2yuv_fns(2, 420); |
|
95 |
+ assign_yuv2yuv_fns(1, 422); |
|
96 |
+ assign_yuv2yuv_fns(0, 444); |
|
97 |
+ |
|
98 |
+#define assign_yuv2rgb_fns(idx, ss) \ |
|
99 |
+ dsp->yuv2rgb[0][idx] = ff_yuv2rgb_##ss##p8_sse2; \ |
|
100 |
+ dsp->yuv2rgb[1][idx] = ff_yuv2rgb_##ss##p10_sse2; \ |
|
101 |
+ dsp->yuv2rgb[2][idx] = ff_yuv2rgb_##ss##p12_sse2 |
|
102 |
+ |
|
103 |
+ assign_yuv2rgb_fns(2, 420); |
|
104 |
+ assign_yuv2rgb_fns(1, 422); |
|
105 |
+ assign_yuv2rgb_fns(0, 444); |
|
106 |
+ |
|
107 |
+#define assign_rgb2yuv_fns(idx, ss) \ |
|
108 |
+ dsp->rgb2yuv[0][idx] = ff_rgb2yuv_##ss##p8_sse2; \ |
|
109 |
+ dsp->rgb2yuv[1][idx] = ff_rgb2yuv_##ss##p10_sse2; \ |
|
110 |
+ dsp->rgb2yuv[2][idx] = ff_rgb2yuv_##ss##p12_sse2 |
|
111 |
+ |
|
112 |
+ assign_rgb2yuv_fns(2, 420); |
|
113 |
+ assign_rgb2yuv_fns(1, 422); |
|
114 |
+ assign_rgb2yuv_fns(0, 444); |
|
115 |
+ |
|
116 |
+ dsp->multiply3x3 = ff_multiply3x3_sse2; |
|
117 |
+ } |
|
118 |
+} |
... | ... |
@@ -16,6 +16,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC) += $(AVCODECOBJS-yes) |
16 | 16 |
|
17 | 17 |
# libavfilter tests |
18 | 18 |
AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o |
19 |
+AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o |
|
19 | 20 |
|
20 | 21 |
CHECKASMOBJS-$(CONFIG_AVFILTER) += $(AVFILTEROBJS-yes) |
21 | 22 |
|
... | ... |
@@ -33,6 +33,7 @@ |
33 | 33 |
void checkasm_check_alacdsp(void); |
34 | 34 |
void checkasm_check_blend(void); |
35 | 35 |
void checkasm_check_bswapdsp(void); |
36 |
+void checkasm_check_colorspace(void); |
|
36 | 37 |
void checkasm_check_flacdsp(void); |
37 | 38 |
void checkasm_check_fmtconvert(void); |
38 | 39 |
void checkasm_check_h264pred(void); |
39 | 40 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,314 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2016 Ronald S. Bultje <rsbultje@gmail.com> |
|
2 |
+ * |
|
3 |
+ * This file is part of FFmpeg. |
|
4 |
+ * |
|
5 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
7 |
+ * License as published by the Free Software Foundation; either |
|
8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
9 |
+ * |
|
10 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13 |
+ * Lesser General Public License for more details. |
|
14 |
+ * |
|
15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
16 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18 |
+ */ |
|
19 |
+ |
|
20 |
+#include <string.h> |
|
21 |
+#include "checkasm.h" |
|
22 |
+#include "libavfilter/colorspacedsp.h" |
|
23 |
+#include "libavutil/common.h" |
|
24 |
+#include "libavutil/internal.h" |
|
25 |
+#include "libavutil/intreadwrite.h" |
|
26 |
+ |
|
27 |
/* Test frame dimensions. */
#define W 64
#define H 64

/* Fill the three YUV source planes with random samples valid for the given
 * input depth (masked via bpp_mask). Requires src[3], idepth, ss_w and ss_h
 * to be in scope at the expansion site; chroma planes are shrunk by the
 * combined horizontal+vertical subsampling shift. */
#define randomize_buffers()                     \
    do {                                        \
        unsigned mask = bpp_mask[idepth];       \
        int n, m;                               \
        int bpp = 1 + (!!idepth);               \
        int buf_size = W * H * bpp;             \
        for (m = 0; m < 3; m++) {               \
            int ss = m ? ss_w + ss_h : 0;       \
            int plane_sz = buf_size >> ss;      \
            for (n = 0; n < plane_sz; n += 4) { \
                unsigned r = rnd() & mask;      \
                AV_WN32A(&src[m][n], r);        \
            }                                   \
        }                                       \
    } while (0)

/* Subsampling name by format index: 0 = 4:4:4, 1 = 4:2:2, 2 = 4:2:0. */
static const char *const format_string[] = {
    "444", "422", "420"
};

/* Per-depth masks keeping random samples in range, replicated across the
 * 32-bit store: index 0 = 8 bit (bytes, all bits valid), 1 = 10 bit,
 * 2 = 12 bit. */
static const unsigned bpp_mask[] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
|
51 |
+ |
|
52 |
/**
 * Test the yuv2yuv (bit-depth conversion) DSP functions.
 *
 * Every (input depth, output depth, subsampling) combination is run through
 * both the reference and the optimized implementation with identical random
 * input and a fixed coefficient set; any output mismatch fails the test.
 */
static void check_yuv2yuv(void)
{
    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
                 uint8_t *src[3], ptrdiff_t src_stride[3],
                 int w, int h, const int16_t coeff[3][3][8],
                 const int16_t off[2][8]);
    ColorSpaceDSPContext dsp;
    int idepth, odepth, fmt, n;
    /* Planes sized for the worst case: 2 bytes/sample, no subsampling. */
    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
    uint8_t *src[3] = { src_y, src_u, src_v };
    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H * 2]);
    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
    /* Aligned backing buffers reinterpreted as [2][8] offsets and a
     * [3][3][8] coefficient matrix (each value replicated 8x per SIMD lane). */
    LOCAL_ALIGNED_32(int16_t, offset_buf, [16]);
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*offset)[8] = (int16_t(*)[8]) offset_buf;
    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;

    ff_colorspacedsp_init(&dsp);
    /* Fixed coefficients scaled by 1 << 14, chosen so each output channel
     * mixes multiple input channels (non-trivial off-diagonal terms). */
    for (n = 0; n < 8; n++) {
        offset[0][n] = offset[1][n] = 16;

        coeff[0][0][n] = (1 << 14) + (1 << 7) + 1;
        coeff[0][1][n] = (1 << 7) - 1;
        coeff[0][2][n] = -(1 << 8);
        coeff[1][0][n] = coeff[2][0][n] = 0;
        coeff[1][1][n] = (1 << 14) + (1 << 7);
        coeff[1][2][n] = -(1 << 7);
        coeff[2][2][n] = (1 << 14) - (1 << 6);
        coeff[2][1][n] = 1 << 6;
    }
    for (idepth = 0; idepth < 3; idepth++) {
        for (odepth = 0; odepth < 3; odepth++) {
            for (fmt = 0; fmt < 3; fmt++) {
                /* depth index 0/1/2 -> 8/10/12 bits in the reported name */
                if (check_func(dsp.yuv2yuv[idepth][odepth][fmt],
                               "ff_colorspacedsp_yuv2yuv_%sp%dto%d",
                               format_string[fmt],
                               idepth * 2 + 8, odepth * 2 + 8)) {
                    int ss_w = !!fmt, ss_h = fmt == 2;
                    /* Strides in bytes: 1 or 2 bytes per sample by depth;
                     * chroma stride halved when horizontally subsampled. */
                    int y_src_stride = W << !!idepth, y_dst_stride = W << !!odepth;
                    int uv_src_stride = y_src_stride >> ss_w, uv_dst_stride = y_dst_stride >> ss_w;

                    randomize_buffers();
                    call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                             W, H, coeff, offset);
                    call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                             src, (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                             W, H, coeff, offset);
                    /* Chroma planes are compared over H >> ss_h rows. */
                    if (memcmp(dst0[0], dst1[0], y_dst_stride * H) ||
                        memcmp(dst0[1], dst1[1], uv_dst_stride * H >> ss_h) ||
                        memcmp(dst0[2], dst1[2], uv_dst_stride * H >> ss_h)) {
                        fail();
                    }
                }
            }
        }
    }

    report("yuv2yuv");
}
|
119 |
+ |
|
120 |
/**
 * Test the yuv2rgb DSP functions (YUV planes in, int16_t RGB planes out).
 *
 * All (input depth, subsampling) combinations are compared between the
 * reference and the optimized implementation on identical random input.
 */
static void check_yuv2rgb(void)
{
    declare_func(void, int16_t *dst[3], ptrdiff_t dst_stride,
                 uint8_t *src[3], ptrdiff_t src_stride[3],
                 int w, int h, const int16_t coeff[3][3][8],
                 const int16_t off[8]);
    ColorSpaceDSPContext dsp;
    int idepth, fmt, n;
    /* YUV input sized for 2 bytes/sample, no subsampling (worst case). */
    LOCAL_ALIGNED_32(uint8_t, src_y, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_u, [W * H * 2]);
    LOCAL_ALIGNED_32(uint8_t, src_v, [W * H * 2]);
    uint8_t *src[3] = { src_y, src_u, src_v };
    /* RGB output is always full-resolution int16_t. */
    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
    LOCAL_ALIGNED_32(int16_t, offset, [8]);
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;

    ff_colorspacedsp_init(&dsp);
    /* Fixed coefficients scaled by 1 << 14; each value replicated 8x. */
    for (n = 0; n < 8; n++) {
        offset[n] = 16;

        coeff[0][0][n] = coeff[1][0][n] = coeff[2][0][n] = (1 << 14) | 1;
        coeff[0][1][n] = coeff[2][2][n] = 0;
        coeff[0][2][n] = 1 << 13;
        coeff[1][1][n] = -(1 << 12);
        coeff[1][2][n] = 1 << 12;
        coeff[2][1][n] = 1 << 11;
    }
    for (idepth = 0; idepth < 3; idepth++) {
        for (fmt = 0; fmt < 3; fmt++) {
            if (check_func(dsp.yuv2rgb[idepth][fmt],
                           "ff_colorspacedsp_yuv2rgb_%sp%d",
                           format_string[fmt], idepth * 2 + 8)) {
                /* ss_h is not used directly here, but randomize_buffers()
                 * reads both ss_w and ss_h from this scope. */
                int ss_w = !!fmt, ss_h = fmt == 2;
                int y_src_stride = W << !!idepth;
                int uv_src_stride = y_src_stride >> ss_w;

                randomize_buffers();
                /* NOTE(review): the RGB dst stride W appears to be counted
                 * in int16_t samples (matches the W*H element comparison
                 * below) — confirm against the asm implementation. */
                call_ref(dst0, W, src,
                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                         W, H, coeff, offset);
                call_new(dst1, W, src,
                         (ptrdiff_t[3]) { y_src_stride, uv_src_stride, uv_src_stride },
                         W, H, coeff, offset);
                if (memcmp(dst0[0], dst1[0], W * H * sizeof(int16_t)) ||
                    memcmp(dst0[1], dst1[1], W * H * sizeof(int16_t)) ||
                    memcmp(dst0[2], dst1[2], W * H * sizeof(int16_t))) {
                    fail();
                }
            }
        }
    }

    report("yuv2rgb");
}
|
181 |
+ |
|
182 |
#undef randomize_buffers
/* Variant for int16_t RGB input planes: fills three full-resolution planes
 * with values in [-2048, 30719] (a random 15-bit value shifted down by half
 * of 32768 - 28672). Requires int16_t *src[3] in the calling scope. */
#define randomize_buffers() \
    do { \
        int y, x, p; \
        for (p = 0; p < 3; p++) { \
            for (y = 0; y < H; y++) { \
                for (x = 0; x < W; x++) { \
                    int r = rnd() & 0x7fff; \
                    r -= (32768 - 28672) >> 1; \
                    src[p][y * W + x] = r; \
                } \
            } \
        } \
    } while (0)
|
196 |
+ |
|
197 |
/**
 * Test the rgb2yuv DSP functions (int16_t RGB planes in, YUV planes out).
 *
 * All (output depth, subsampling) combinations are compared between the
 * reference and the optimized implementation on identical random input.
 */
static void check_rgb2yuv(void)
{
    declare_func(void, uint8_t *dst[3], ptrdiff_t dst_stride[3],
                 int16_t *src[3], ptrdiff_t src_stride,
                 int w, int h, const int16_t coeff[3][3][8],
                 const int16_t off[8]);
    ColorSpaceDSPContext dsp;
    int odepth, fmt, n;
    LOCAL_ALIGNED_32(int16_t, src_y, [W * H * 2]);
    LOCAL_ALIGNED_32(int16_t, src_u, [W * H * 2]);
    LOCAL_ALIGNED_32(int16_t, src_v, [W * H * 2]);
    int16_t *src[3] = { src_y, src_u, src_v };
    LOCAL_ALIGNED_32(uint8_t, dst0_y, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst0_u, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst0_v, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst1_y, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst1_u, [W * H]);
    LOCAL_ALIGNED_32(uint8_t, dst1_v, [W * H]);
    uint8_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
    LOCAL_ALIGNED_32(int16_t, offset, [8]);
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;

    ff_colorspacedsp_init(&dsp);
    for (n = 0; n < 8; n++) {
        offset[n] = 16;

        // these somewhat resemble bt601/smpte170m coefficients
        // (scaled by 1 << 14, replicated 8x per SIMD lane)
        coeff[0][0][n] = lrint(0.3 * (1 << 14));
        coeff[0][1][n] = lrint(0.6 * (1 << 14));
        coeff[0][2][n] = lrint(0.1 * (1 << 14));
        coeff[1][0][n] = lrint(-0.15 * (1 << 14));
        coeff[1][1][n] = lrint(-0.35 * (1 << 14));
        coeff[1][2][n] = lrint(0.5 * (1 << 14));
        coeff[2][0][n] = lrint(0.5 * (1 << 14));
        coeff[2][1][n] = lrint(-0.42 * (1 << 14));
        coeff[2][2][n] = lrint(-0.08 * (1 << 14));
    }
    for (odepth = 0; odepth < 3; odepth++) {
        for (fmt = 0; fmt < 3; fmt++) {
            if (check_func(dsp.rgb2yuv[odepth][fmt],
                           "ff_colorspacedsp_rgb2yuv_%sp%d",
                           format_string[fmt], odepth * 2 + 8)) {
                int ss_w = !!fmt, ss_h = fmt == 2;
                /* Output strides in bytes: 1 or 2 bytes/sample by depth;
                 * chroma stride halved when horizontally subsampled. */
                int y_dst_stride = W << !!odepth;
                int uv_dst_stride = y_dst_stride >> ss_w;

                randomize_buffers();
                call_ref(dst0, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                         src, W, W, H, coeff, offset);
                call_new(dst1, (ptrdiff_t[3]) { y_dst_stride, uv_dst_stride, uv_dst_stride },
                         src, W, W, H, coeff, offset);
                /* Chroma planes are compared over H >> ss_h rows. */
                if (memcmp(dst0[0], dst1[0], H * y_dst_stride) ||
                    memcmp(dst0[1], dst1[1], H * uv_dst_stride >> ss_h) ||
                    memcmp(dst0[2], dst1[2], H * uv_dst_stride >> ss_h)) {
                    fail();
                }
            }
        }
    }

    report("rgb2yuv");
}
|
260 |
+ |
|
261 |
/**
 * Test the in-place 3x3 matrix multiply over three int16_t planes.
 *
 * The operation works in place, so the test copies the randomized input
 * into a second set of buffers and runs reference and optimized versions
 * on their own copies before comparing.
 */
static void check_multiply3x3(void)
{
    declare_func(void, int16_t *data[3], ptrdiff_t stride,
                 int w, int h, const int16_t coeff[3][3][8]);
    ColorSpaceDSPContext dsp;
    LOCAL_ALIGNED_32(int16_t, dst0_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst0_v, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_y, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_u, [W * H]);
    LOCAL_ALIGNED_32(int16_t, dst1_v, [W * H]);
    int16_t *dst0[3] = { dst0_y, dst0_u, dst0_v }, *dst1[3] = { dst1_y, dst1_u, dst1_v };
    /* Alias the reference buffers as "src" so the randomize_buffers()
     * macro (which writes through src[3]) fills dst0 directly. */
    int16_t **src = dst0;
    LOCAL_ALIGNED_32(int16_t, coeff_buf, [3 * 3 * 8]);
    int16_t (*coeff)[3][8] = (int16_t(*)[3][8]) coeff_buf;
    int n;

    ff_colorspacedsp_init(&dsp);
    /* Fixed matrix scaled by 1 << 14, replicated 8x per SIMD lane. */
    for (n = 0; n < 8; n++) {
        coeff[0][0][n] = lrint(0.85 * (1 << 14));
        coeff[0][1][n] = lrint(0.10 * (1 << 14));
        coeff[0][2][n] = lrint(0.05 * (1 << 14));
        coeff[1][0][n] = lrint(-0.1 * (1 << 14));
        coeff[1][1][n] = lrint(0.95 * (1 << 14));
        coeff[1][2][n] = lrint(0.15 * (1 << 14));
        coeff[2][0][n] = lrint(-0.2 * (1 << 14));
        coeff[2][1][n] = lrint(0.30 * (1 << 14));
        coeff[2][2][n] = lrint(0.90 * (1 << 14));
    }
    if (check_func(dsp.multiply3x3, "ff_colorspacedsp_multiply3x3")) {
        randomize_buffers();
        /* Give the optimized version an identical copy of the input. */
        memcpy(dst1_y, dst0_y, W * H * sizeof(*dst1_y));
        memcpy(dst1_u, dst0_u, W * H * sizeof(*dst1_u));
        memcpy(dst1_v, dst0_v, W * H * sizeof(*dst1_v));
        call_ref(dst0, W, W, H, coeff);
        call_new(dst1, W, W, H, coeff);
        if (memcmp(dst0[0], dst1[0], H * W * sizeof(*dst0_y)) ||
            memcmp(dst0[1], dst1[1], H * W * sizeof(*dst0_u)) ||
            memcmp(dst0[2], dst1[2], H * W * sizeof(*dst0_v))) {
            fail();
        }
    }

    report("multiply3x3");
}
|
306 |
+ |
|
307 |
/* Entry point invoked by the checkasm harness: runs every colorspace
 * DSP self-test in turn (each one prints its own report line). */
void checkasm_check_colorspace(void)
{
    check_yuv2yuv();
    check_yuv2rgb();
    check_rgb2yuv();
    check_multiply3x3();
}