SSE2 version 4%-35% faster than MMX depending on the width.
AVX2 version 1%-13% faster than SSE2 depending on the width.
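
For reference, diff_bytes computes a plain byte-wise difference, dst[i] = src1[i] - src2[i], with per-byte wraparound. A minimal C model of the operation being vectorized (equivalent to the C fallback; not part of this patch):

#include <stdint.h>

/* psubb in the asm below does exactly this, 8/16/32 bytes per
 * instruction: independent byte subtraction, no carry between lanes. */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1,
                         const uint8_t *src2, intptr_t w)
{
    intptr_t i;
    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}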
...
@@ -60,12 +60,12 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
             }
             return left;
         } else {
-            for (i = 0; i < 16; i++) {
+            for (i = 0; i < 32; i++) {
                 const int temp = src[i];
                 dst[i] = temp - left;
                 left   = temp;
             }
-            s->hencdsp.diff_bytes(dst + 16, src + 16, src + 15, w - 16);
+            s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
             return src[w-1];
         }
     } else {
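
The scalar prologue grows from 16 to 32 bytes so the bulk call starts at the width of the largest new register (32-byte ymm for AVX2) and `src + 31` is always a valid shifted operand. What the rewritten tail computes, as a hedged C sketch (hypothetical helper, assuming this branch is only taken for w >= 32):

#include <stdint.h>

/* Equivalent of s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32):
 * left prediction is a diff of the buffer against itself shifted back one
 * byte, so the vector path needs no special handling of 'left'. */
static void sub_left_tail(uint8_t *dst, const uint8_t *src, int w)
{
    int i;
    for (i = 32; i < w; i++)
        dst[i] = src[i] - src[i - 1];
}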
...
@@ -27,9 +27,9 @@
 
 section .text
 
-INIT_MMX mmx
 ; void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
 ;                        intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
 %if ARCH_X86_32
 cglobal diff_bytes, 3,5,2, dst, src1, src2
 %define wq r4q
...
@@ -40,34 +40,108 @@ cglobal diff_bytes, 4,5,2, dst, src1, src2, w
 DECLARE_REG_TMP 4
 %endif ; ARCH_X86_32
 %define i t0q
+%endmacro
+
+; label to jump to if w < regsize
+%macro DIFF_BYTES_LOOP_PREP 1
     mov     i, wq
-    and     i, -2 * mmsize
-    jz      .setup_loop2
+    and     i, -2 * regsize
+    jz      %1
     add     dstq, i
     add     src1q, i
     add     src2q, i
     neg     i
-.loop:
-    mova    m0, [src1q + i]
-    mova    m1, [src1q + i + mmsize]
-    psubb   m0, [src2q + i]
-    psubb   m1, [src2q + i + mmsize]
-    mova    [dstq + i], m0
-    mova    [mmsize + dstq + i], m1
-    add     i, 2 * mmsize
-    jl      .loop
-.setup_loop2:
-    and     wq, 2 * mmsize - 1
-    jz      .end
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if regsize != 16
+    mov%1   %3, [src1q + i]
+    mov%1   %4, [src1q + i + regsize]
+    psubb   %3, [src2q + i]
+    psubb   %4, [src2q + i + regsize]
+    mov%2   [dstq + i], %3
+    mov%2   [regsize + dstq + i], %4
+%else
+    ; SSE enforces alignment of psubb operand
+    mov%1   %3, [src1q + i]
+    movu    %4, [src2q + i]
+    psubb   %3, %4
+    mov%2   [dstq + i], %3
+    mov%1   %3, [src1q + i + regsize]
+    movu    %4, [src2q + i + regsize]
+    psubb   %3, %4
+    mov%2   [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+    %define regsize mmsize
+.loop_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+    add     i, 2 * regsize
+    jl      .loop_%1%2
+.skip_main_%1%2:
+    and     wq, 2 * regsize - 1
+    jz      .end_%1%2
+%if mmsize > 16
+    ; fall back to narrower xmm
+    %define regsize mmsize / 2
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+    add     i, 2 * regsize
+    jl      .loop2_%1%2
+.setup_loop_gpr_%1%2:
+    and     wq, 2 * regsize - 1
+    jz      .end_%1%2
+%endif
     add     dstq, wq
     add     src1q, wq
     add     src2q, wq
     neg     wq
-.loop2:
+.loop_gpr_%1%2:
     mov     t0b, [src1q + wq]
     sub     t0b, [src2q + wq]
     mov     [dstq + wq], t0b
     inc     wq
-    jl      .loop2
-.end:
+    jl      .loop_gpr_%1%2
+.end_%1%2:
     REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    DIFF_BYTES_BODY a, a
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
+    test    dstq, regsize - 1
+    jnz     .loop_uu
+    test    src1q, regsize - 1
+    jnz     .loop_ua
+    DIFF_BYTES_BODY a, a
+    DIFF_BYTES_BODY u, a
+    DIFF_BYTES_BODY u, u
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+    %define regsize mmsize
+    ; Directly using unaligned SSE2 version is marginally faster than
+    ; branching based on arguments.
+    DIFF_BYTES_LOOP_PREP .skip_main_uu
+    test    dstq, regsize - 1
+    jnz     .loop_uu
+    test    src1q, regsize - 1
+    jnz     .loop_ua
+    DIFF_BYTES_BODY a, a
+    DIFF_BYTES_BODY u, a
+    DIFF_BYTES_BODY u, u
+%endif
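
To clarify the control flow, a C model of one DIFF_BYTES_BODY expansion (a sketch, assuming regsize is the vector width in bytes: 8 for MMX, 16 for SSE2, 32 for AVX2; the aligned/unaligned mov%1/mov%2 distinction is dropped since it does not change the arithmetic):

#include <stdint.h>

static void diff_bytes_model(uint8_t *dst, const uint8_t *src1,
                             const uint8_t *src2, intptr_t w,
                             intptr_t regsize)
{
    intptr_t i = 0;
    intptr_t vec_end = w & -(2 * regsize);    /* and i, -2 * regsize */

    /* main loop: two full vector registers per iteration */
    for (; i < vec_end; i++)
        dst[i] = src1[i] - src2[i];

    if (regsize > 16) {
        /* AVX2 only: one more pass over the leftovers with xmm pairs
         * (regsize / 2) before falling back to scalar code */
        intptr_t xmm_end = i + ((w - i) & -(2 * (regsize / 2)));
        for (; i < xmm_end; i++)
            dst[i] = src1[i] - src2[i];
    }

    /* .loop_gpr: remaining bytes one at a time in a general-purpose reg */
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}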
...
@@ -31,6 +31,10 @@
 
 void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+                        intptr_t w);
 
 #if HAVE_INLINE_ASM
 
...
@@ -80,7 +84,7 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_mmx;
     }
 
...
@@ -89,4 +93,12 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
         c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
     }
 #endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_sse2;
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        c->diff_bytes = ff_diff_bytes_avx2;
+    }
 }
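
Usage note, as a hypothetical sketch (the single-argument ff_huffyuvencdsp_init() is assumed here): callers only ever go through the function pointer, and since the AVX2 check runs after the SSE2 one, the last matching assignment leaves the widest supported version installed.

#include "huffyuvencdsp.h"

static void diff_row(uint8_t *dst, const uint8_t *src1,
                     const uint8_t *src2, intptr_t width)
{
    HuffYUVEncDSPContext c;
    ff_huffyuvencdsp_init(&c);            /* runs ff_huffyuvencdsp_init_x86() on x86 */
    c.diff_bytes(dst, src1, src2, width); /* MMX, SSE2 or AVX2 per CPU flags */
}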