Signed-off-by: James Almer <jamrial@gmail.com>
James Almer authored on 2017/01/08 09:10:46
@@ -2430,7 +2430,7 @@ hap_encoder_deps="libsnappy"
 hap_encoder_select="texturedspenc"
 hevc_decoder_select="bswapdsp cabac golomb videodsp"
 huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
-huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llviddsp"
+huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp"
 iac_decoder_select="imc_decoder"
 imc_decoder_select="bswapdsp fft mdct sinewin"
 indeo3_decoder_select="hpeldsp"
...
@@ -43,7 +43,7 @@ static inline void diff_bytes(HYuvContext *s, uint8_t *dst,
     if (s->bps <= 8) {
         s->hencdsp.diff_bytes(dst, src0, src1, w);
     } else {
-        s->llviddsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
+        s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
     }
 }
 
...
@@ -84,7 +84,7 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
             dst16[i] = temp - left;
             left     = temp;
         }
-        s->llviddsp.diff_int16(dst16 + 16, src16 + 16, src16 + 15, s->n - 1, w - 16);
+        s->hencdsp.diff_int16(dst16 + 16, src16 + 16, src16 + 15, s->n - 1, w - 16);
         return src16[w-1];
     }
 }
...
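For rows wider than 16 samples, the left prediction handled by this call is simply a difference against the previous sample, which is why the second source pointer is offset by one element (src16 + 15). A scalar equivalent of the call above, with illustrative names (s->n - 1 is the sample mask, not part of the patch):

    /* same result as s->hencdsp.diff_int16(dst16 + 16, src16 + 16,
     * src16 + 15, s->n - 1, w - 16), done one sample at a time */
    static void left_pred_tail(uint16_t *dst16, const uint16_t *src16,
                               unsigned mask, int w)
    {
        for (int i = 16; i < w; i++)
            dst16[i] = (src16[i] - src16[i - 1]) & mask;
    }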
@@ -158,7 +158,7 @@ static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *s
     if (s->bps <= 8) {
         s->hencdsp.sub_hfyu_median_pred(dst, src1, src2, w , left, left_top);
     } else {
-        s->llviddsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
+        s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
     }
 }
 
...
@@ -217,7 +217,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     ff_huffyuv_common_init(avctx);
-    ff_huffyuvencdsp_init(&s->hencdsp);
+    ff_huffyuvencdsp_init(&s->hencdsp, avctx);
 
     avctx->extradata = av_mallocz(3*MAX_N + 4);
     if (s->flags&AV_CODEC_FLAG_PASS1) {
...
@@ -53,6 +53,32 @@ static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
         dst[i + 0] = src1[i + 0] - src2[i + 0];
 }
 
+static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
+    long i;
+#if !HAVE_FAST_UNALIGNED
+    if((long)src2 & (sizeof(long)-1)){
+        for(i=0; i+3<w; i+=4){
+            dst[i+0] = (src1[i+0]-src2[i+0]) & mask;
+            dst[i+1] = (src1[i+1]-src2[i+1]) & mask;
+            dst[i+2] = (src1[i+2]-src2[i+2]) & mask;
+            dst[i+3] = (src1[i+3]-src2[i+3]) & mask;
+        }
+    }else
+#endif
+    {
+        unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL;
+        unsigned long pw_msb = pw_lsb + 0x0001000100010001ULL;
+
+        for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
+            long a = *(long*)(src1+i);
+            long b = *(long*)(src2+i);
+            *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb);
+        }
+    }
+    for (; i<w; i++)
+        dst[i] = (src1[i] - src2[i]) & mask;
+}
+
 static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
                                    const uint8_t *src2, intptr_t w,
                                    int *left, int *left_top)
...
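The packed-word branch added above subtracts several 16-bit samples per machine word without letting borrows cross lane boundaries. A minimal standalone check of that arithmetic on a single 64-bit word, with illustrative names (a sketch, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* per-lane (a - b) & mask, same formula as the C fallback above */
    static uint64_t swar_sub_masked(uint64_t a, uint64_t b, unsigned mask)
    {
        uint64_t pw_lsb = (uint64_t)(mask >> 1) * 0x0001000100010001ULL;
        uint64_t pw_msb = pw_lsb + 0x0001000100010001ULL;
        return ((a | pw_msb) - (b & pw_lsb)) ^ ((a ^ b ^ pw_msb) & pw_msb);
    }

    int main(void)
    {
        const unsigned mask = 0x3FF;                /* e.g. 10-bit samples */
        uint16_t s1[4] = { 0x001, 0x3FF, 0x123, 0x000 };
        uint16_t s2[4] = { 0x3FF, 0x001, 0x222, 0x001 };
        uint64_t a = 0, b = 0;

        for (int i = 0; i < 4; i++) {               /* pack four lanes */
            a |= (uint64_t)s1[i] << (16 * i);
            b |= (uint64_t)s2[i] << (16 * i);
        }
        uint64_t d = swar_sub_masked(a, b, mask);
        for (int i = 0; i < 4; i++) {
            unsigned lane = (uint16_t)(d >> (16 * i));
            unsigned ref  = (s1[i] - s2[i]) & mask;
            printf("lane %d: swar=%03x ref=%03x%s\n", i, lane, ref,
                   lane == ref ? "" : "  MISMATCH");
        }
        return 0;
    }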
@@ -74,11 +100,31 @@ static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
     *left_top = lt;
 }
 
-av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c)
+static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
+    int i;
+    uint16_t l, lt;
+
+    l  = *left;
+    lt = *left_top;
+
+    for(i=0; i<w; i++){
+        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & mask);
+        lt = src1[i];
+        l  = src2[i];
+        dst[i] = (l - pred) & mask;
+    }
+
+    *left     = l;
+    *left_top = lt;
+}
+
+av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
 {
     c->diff_bytes = diff_bytes_c;
+    c->diff_int16 = diff_int16_c;
     c->sub_hfyu_median_pred = sub_hfyu_median_pred_c;
+    c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
 
     if (ARCH_X86)
-        ff_huffyuvencdsp_init_x86(c);
+        ff_huffyuvencdsp_init_x86(c, avctx);
 }
...
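sub_hfyu_median_pred_int16_c is the encoder-side counterpart of add_hfyu_median_pred_int16_c, which stays behind in lossless_videodsp: the decoder rebuilds each sample by adding the stored difference back onto the same median predictor. A rough round-trip sketch with a local stand-in for mid_pred(), illustrative names only, assuming mask is 2^bits-1 and every sample fits in it:

    #include <stdint.h>
    #include <stdio.h>

    static int med3(int a, int b, int c)        /* median of three, like mid_pred() */
    {
        if (a > b) { int t = a; a = b; b = t; }
        return c < a ? a : (c > b ? b : c);
    }

    /* encode: diff[i] = (cur[i] - median_pred) & mask, as in the new function */
    static void sub_med(uint16_t *diff, const uint16_t *top, const uint16_t *cur,
                        unsigned mask, int w, int left, int left_top)
    {
        for (int i = 0; i < w; i++) {
            int pred = med3(left, top[i], (left + top[i] - left_top) & mask);
            left_top = top[i];
            left     = cur[i];
            diff[i]  = (cur[i] - pred) & mask;
        }
    }

    /* decode: the matching add step recovers cur[] from top[] and diff[] */
    static void add_med(uint16_t *out, const uint16_t *top, const uint16_t *diff,
                        unsigned mask, int w, int left, int left_top)
    {
        for (int i = 0; i < w; i++) {
            int pred = med3(left, top[i], (left + top[i] - left_top) & mask);
            left     = (pred + diff[i]) & mask;
            left_top = top[i];
            out[i]   = left;
        }
    }

    int main(void)
    {
        const unsigned mask = 0x3FF;            /* 10-bit samples */
        uint16_t top[4]  = { 0x010, 0x3F0, 0x200, 0x123 };
        uint16_t cur[4]  = { 0x012, 0x3F4, 0x1FE, 0x125 };
        uint16_t diff[4], out[4];

        sub_med(diff, top, cur, mask, 4, 0x011, 0x00F);
        add_med(out,  top, diff, mask, 4, 0x011, 0x00F);
        for (int i = 0; i < 4; i++)
            printf("%03x -> %03x -> %03x%s\n", cur[i], diff[i], out[i],
                   out[i] == cur[i] ? "" : "  MISMATCH");
        return 0;
    }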
@@ -21,11 +21,18 @@
 
 #include <stdint.h>
 
+#include "avcodec.h"
+
 typedef struct HuffYUVEncDSPContext {
     void (*diff_bytes)(uint8_t *dst /* align 16 */,
                        const uint8_t *src1 /* align 16 */,
                        const uint8_t *src2 /* align 1 */,
                        intptr_t w);
+    void (*diff_int16)(uint16_t *dst /* align 16 */,
+                       const uint16_t *src1 /* align 16 */,
+                       const uint16_t *src2 /* align 1 */,
+                       unsigned mask, int w);
+
     /**
      * Subtract HuffYUV's variant of median prediction.
      * Note, this might read from src1[-1], src2[-1].
...
@@ -33,9 +40,12 @@ typedef struct HuffYUVEncDSPContext {
     void (*sub_hfyu_median_pred)(uint8_t *dst, const uint8_t *src1,
                                  const uint8_t *src2, intptr_t w,
                                  int *left, int *left_top);
+    void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1,
+                                       const uint16_t *src2, unsigned mask,
+                                       int w, int *left, int *left_top);
 } HuffYUVEncDSPContext;
 
-void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c);
-void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c);
+void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx);
+void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx);
 
 #endif /* AVCODEC_HUFFYUVENCDSP_H */
...
@@ -92,32 +92,6 @@ static void add_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w
         dst[i] = (dst[i] + src[i]) & mask;
 }
 
-static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
-    long i;
-#if !HAVE_FAST_UNALIGNED
-    if((long)src2 & (sizeof(long)-1)){
-        for(i=0; i+3<w; i+=4){
-            dst[i+0] = (src1[i+0]-src2[i+0]) & mask;
-            dst[i+1] = (src1[i+1]-src2[i+1]) & mask;
-            dst[i+2] = (src1[i+2]-src2[i+2]) & mask;
-            dst[i+3] = (src1[i+3]-src2[i+3]) & mask;
-        }
-    }else
-#endif
-    {
-        unsigned long pw_lsb = (mask >> 1) * 0x0001000100010001ULL;
-        unsigned long pw_msb = pw_lsb + 0x0001000100010001ULL;
-
-        for (i = 0; i <= w - (int)sizeof(long)/2; i += sizeof(long)/2) {
-            long a = *(long*)(src1+i);
-            long b = *(long*)(src2+i);
-            *(long*)(dst+i) = ((a|pw_msb) - (b&pw_lsb)) ^ ((a^b^pw_msb)&pw_msb);
-        }
-    }
-    for (; i<w; i++)
-        dst[i] = (src1[i] - src2[i]) & mask;
-}
-
 static void add_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top){
     int i;
     uint16_t l, lt;
...
@@ -135,24 +109,6 @@ static void add_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src, con
     *left_top = lt;
 }
 
-static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
-    int i;
-    uint16_t l, lt;
-
-    l  = *left;
-    lt = *left_top;
-
-    for(i=0; i<w; i++){
-        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & mask);
-        lt = src1[i];
-        l  = src2[i];
-        dst[i] = (l - pred) & mask;
-    }
-
-    *left     = l;
-    *left_top = lt;
-}
-
 static int add_hfyu_left_pred_int16_c(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc){
     int i;
 
...
@@ -180,10 +136,8 @@ void ff_llviddsp_init(LLVidDSPContext *c, AVCodecContext *avctx)
     c->add_left_pred = add_left_pred_c;
 
     c->add_int16 = add_int16_c;
-    c->diff_int16= diff_int16_c;
     c->add_hfyu_left_pred_int16 = add_hfyu_left_pred_int16_c;
     c->add_hfyu_median_pred_int16 = add_hfyu_median_pred_int16_c;
-    c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
 
     if (ARCH_X86)
         ff_llviddsp_init_x86(c, avctx);
...
@@ -35,9 +35,7 @@ typedef struct LLVidDSPContext {
                          intptr_t w, int left);
 
     void (*add_int16)(uint16_t *dst/*align 16*/, const uint16_t *src/*align 16*/, unsigned mask, int w);
-    void (*diff_int16)(uint16_t *dst/*align 16*/, const uint16_t *src1/*align 16*/, const uint16_t *src2/*align 1*/, unsigned mask, int w);
 
-    void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
     void (*add_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
     int (*add_hfyu_left_pred_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned left);
 } LLVidDSPContext;
...
@@ -1015,7 +1015,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
 FF_ENABLE_DEPRECATION_WARNINGS
 #endif
 
-    ff_huffyuvencdsp_init(&s->hdsp);
+    ff_huffyuvencdsp_init(&s->hdsp, avctx);
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
...
@@ -120,7 +120,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
     }
 
     ff_bswapdsp_init(&c->bdsp);
-    ff_huffyuvencdsp_init(&c->hdsp);
+    ff_huffyuvencdsp_init(&c->hdsp, avctx);
 
 #if FF_API_PRIVATE_OPT
 FF_DISABLE_DEPRECATION_WARNINGS
...
@@ -148,3 +148,116 @@ DIFF_BYTES_PROLOGUE
     DIFF_BYTES_BODY u, u
 %undef i
 %endif
+
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+    movd    m4, maskd
+    SPLATW  m4, m4
+    add     wd, wd
+    test    wq, 2*mmsize - 1
+    jz %%.tomainloop
+    push tmpq
+%%.wordloop:
+    sub     wq, 2
+%ifidn %2, add
+    mov     tmpw, [srcq+wq]
+    add     tmpw, [dstq+wq]
+%else
+    mov     tmpw, [src1q+wq]
+    sub     tmpw, [src2q+wq]
+%endif
+    and     tmpw, maskw
+    mov     [dstq+wq], tmpw
+    test    wq, 2*mmsize - 1
+    jnz %%.wordloop
+    pop tmpq
+%%.tomainloop:
+%ifidn %2, add
+    add     srcq, wq
+%else
+    add     src1q, wq
+    add     src2q, wq
+%endif
+    add     dstq, wq
+    neg     wq
+    jz %%.end
+%%.loop:
+%ifidn %2, add
+    mov%1   m0, [srcq+wq]
+    mov%1   m1, [dstq+wq]
+    mov%1   m2, [srcq+wq+mmsize]
+    mov%1   m3, [dstq+wq+mmsize]
+%else
+    mov%1   m0, [src1q+wq]
+    mov%1   m1, [src2q+wq]
+    mov%1   m2, [src1q+wq+mmsize]
+    mov%1   m3, [src2q+wq+mmsize]
+%endif
+    p%2w    m0, m1
+    p%2w    m2, m3
+    pand    m0, m4
+    pand    m2, m4
+    mov%1   [dstq+wq], m0
+    mov%1   [dstq+wq+mmsize], m2
+    add     wq, 2*mmsize
+    jl %%.loop
+%%.end:
+    RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+    INT16_LOOP a, sub
+%endif
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+    test src1q, mmsize-1
+    jnz .unaligned
+    test src2q, mmsize-1
+    jnz .unaligned
+    test dstq, mmsize-1
+    jnz .unaligned
+    INT16_LOOP a, sub
+.unaligned:
+    INT16_LOOP u, sub
+
+INIT_MMX mmxext
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+    add     wd, wd
+    movd    mm7, maskd
+    SPLATW  mm7, mm7
+    movq    mm0, [src1q]
+    movq    mm2, [src2q]
+    psllq   mm0, 16
+    psllq   mm2, 16
+    movd    mm6, [left_topq]
+    por     mm0, mm6
+    movd    mm6, [leftq]
+    por     mm2, mm6
+    xor     maskq, maskq
+.loop:
+    movq    mm1, [src1q + maskq]
+    movq    mm3, [src2q + maskq]
+    movq    mm4, mm2
+    psubw   mm2, mm0
+    paddw   mm2, mm1
+    pand    mm2, mm7
+    movq    mm5, mm4
+    pmaxsw  mm4, mm1
+    pminsw  mm1, mm5
+    pminsw  mm4, mm2
+    pmaxsw  mm4, mm1
+    psubw   mm3, mm4
+    pand    mm3, mm7
+    movq    [dstq + maskq], mm3
+    add     maskq, 8
+    movq    mm0, [src1q + maskq - 2]
+    movq    mm2, [src2q + maskq - 2]
+    cmp     maskq, wq
+    jb .loop
+    movzx   maskd, word [src1q + wq - 2]
+    mov     [left_topq], maskd
+    movzx   maskd, word [src2q + wq - 2]
+    mov     [leftq], maskd
+    RET
...
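INT16_LOOP above first peels trailing samples one word at a time until the remaining byte count is a multiple of 2*mmsize, then runs the SIMD body over full blocks using negative offsets from the buffer ends. Roughly the same control flow in C for the "sub" case (illustrative only; VEC stands in for the per-iteration element count and the vector body is written as a scalar loop):

    static void diff_int16_like(uint16_t *dst, const uint16_t *s1,
                                const uint16_t *s2, unsigned mask, int w)
    {
        const int VEC = 16;        /* 2*mmsize bytes = 16 samples for SSE2 */
        int i = w;

        /* peel from the end until the rest is a whole number of blocks
         * (mirrors %%.wordloop) */
        while (i & (VEC - 1)) {
            i--;
            dst[i] = (s1[i] - s2[i]) & mask;
        }
        /* full blocks, one psubw/pand block per VEC samples (mirrors %%.loop) */
        for (int j = 0; j < i; j += VEC)
            for (int k = 0; k < VEC; k++)
                dst[j + k] = (s1[j + k] - s2[j + k]) & mask;
    }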
@@ -24,6 +24,7 @@
 
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/pixdesc.h"
 #include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/huffyuvencdsp.h"
...
@@ -35,6 +36,12 @@ void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                        intptr_t w);
 void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
                         intptr_t w);
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                        unsigned mask, int w);
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
+                                          unsigned mask, int w, int *left, int *left_top);
 
 #if HAVE_INLINE_ASM
 
...
@@ -80,12 +87,14 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
 
 #endif /* HAVE_INLINE_ASM */
 
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
 
     if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_mmx;
+        c->diff_int16 = ff_diff_int16_mmx;
     }
 
 #if HAVE_INLINE_ASM
...
@@ -94,8 +103,13 @@ av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c)
     }
 #endif /* HAVE_INLINE_ASM */
 
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
+    }
+
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->diff_bytes = ff_diff_bytes_sse2;
+        c->diff_int16 = ff_diff_int16_sse2;
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
...
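The depth < 16 guard matches the one already used for add_hfyu_median_pred_int16 in the lossless_videodsp init below: the MMXEXT routine builds its median out of pmaxsw/pminsw, which compare lanes as signed 16-bit values, so full 16-bit samples would presumably sort incorrectly and those depths stay on the C path. A tiny illustration of the difference (standalone sketch, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint16_t a = 0x8000, b = 0x7FFF;
        int16_t  sa = (int16_t)a, sb = (int16_t)b;   /* what pmaxsw sees */

        printf("unsigned max: %#x\n", (unsigned)(a > b ? a : b));            /* 0x8000 */
        printf("signed   max: %#x\n", (unsigned)(uint16_t)(sa > sb ? sa : sb)); /* 0x7fff */
        return 0;
    }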
@@ -288,25 +288,6 @@ cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
 .unaligned:
     INT16_LOOP u, add
 
-%if ARCH_X86_32
-INIT_MMX mmx
-cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
-    INT16_LOOP a, sub
-%endif
-
-INIT_XMM sse2
-cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
-    test src1q, mmsize-1
-    jnz .unaligned
-    test src2q, mmsize-1
-    jnz .unaligned
-    test dstq, mmsize-1
-    jnz .unaligned
-    INT16_LOOP a, sub
-.unaligned:
-    INT16_LOOP u, sub
-
-
 %macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
     add     wd, wd
     add     srcq, wq
...
@@ -443,42 +424,3 @@ cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_t
     movzx   r2d, word [topq-2]
     mov     [left_topq], r2d
     RET
-
-cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
-    add     wd, wd
-    movd    mm7, maskd
-    SPLATW  mm7, mm7
-    movq    mm0, [src1q]
-    movq    mm2, [src2q]
-    psllq   mm0, 16
-    psllq   mm2, 16
-    movd    mm6, [left_topq]
-    por     mm0, mm6
-    movd    mm6, [leftq]
-    por     mm2, mm6
-    xor     maskq, maskq
-.loop:
-    movq    mm1, [src1q + maskq]
-    movq    mm3, [src2q + maskq]
-    movq    mm4, mm2
-    psubw   mm2, mm0
-    paddw   mm2, mm1
-    pand    mm2, mm7
-    movq    mm5, mm4
-    pmaxsw  mm4, mm1
-    pminsw  mm1, mm5
-    pminsw  mm4, mm2
-    pmaxsw  mm4, mm1
-    psubw   mm3, mm4
-    pand    mm3, mm7
-    movq    [dstq + maskq], mm3
-    add     maskq, 8
-    movq    mm0, [src1q + maskq - 2]
-    movq    mm2, [src2q + maskq - 2]
-    cmp     maskq, wq
-    jb .loop
-    movzx   maskd, word [src1q + wq - 2]
-    mov     [left_topq], maskd
-    movzx   maskd, word [src2q + wq - 2]
-    mov     [leftq], maskd
-    RET
...
@@ -41,12 +41,9 @@ int ff_add_left_pred_sse4(uint8_t *dst, const uint8_t *src,
 
 void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
 void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
-void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
-void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w);
 int ff_add_hfyu_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
 int ff_add_hfyu_left_pred_int16_sse4(uint16_t *dst, const uint16_t *src, unsigned mask, int w, unsigned acc);
 void ff_add_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *top, const uint16_t *diff, unsigned mask, int w, int *left, int *left_top);
-void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top);
 
 #if HAVE_INLINE_ASM && HAVE_7REGS && ARCH_X86_32
 static void add_median_pred_cmov(uint8_t *dst, const uint8_t *top,
...
@@ -98,9 +95,7 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
 
     if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
        c->add_bytes = ff_add_bytes_mmx;
-
        c->add_int16 = ff_add_int16_mmx;
-       c->diff_int16 = ff_diff_int16_mmx;
     }
 
     if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags)) {
...
@@ -111,7 +106,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx)
 
     if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
         c->add_hfyu_median_pred_int16 = ff_add_hfyu_median_pred_int16_mmxext;
-        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
     }
 
     if (EXTERNAL_SSE2(cpu_flags)) {
...
... | ... |
@@ -119,7 +113,6 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c, AVCodecContext *avctx) |
119 | 119 |
c->add_median_pred = ff_add_median_pred_sse2; |
120 | 120 |
|
121 | 121 |
c->add_int16 = ff_add_int16_sse2; |
122 |
- c->diff_int16 = ff_diff_int16_sse2; |
|
123 | 122 |
} |
124 | 123 |
|
125 | 124 |
if (EXTERNAL_SSSE3(cpu_flags)) { |