This allows supporting files for which the image stride is smaller than
the maximum block size plus the number of subpel MC taps, e.g. a 64x64
VP9 file or a 16x16 VP8 file with -fflags +emu_edge.
... | ... |
@@ -405,7 +405,8 @@ static inline void mc_dir_part(AVSContext *h, AVFrame *pic, |
405 | 405 |
|| full_my < 0-extra_height |
406 | 406 |
|| full_mx + 16/*FIXME*/ > pic_width + extra_width |
407 | 407 |
|| full_my + 16/*FIXME*/ > pic_height + extra_height){ |
408 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_y - 2 - 2*h->l_stride, h->l_stride, |
|
408 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->l_stride, |
|
409 |
+ src_y - 2 - 2*h->l_stride, h->l_stride, |
|
409 | 410 |
16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height); |
410 | 411 |
src_y= h->edge_emu_buffer + 2 + 2*h->l_stride; |
411 | 412 |
emu=1; |
... | ... |
@@ -414,14 +415,14 @@ static inline void mc_dir_part(AVSContext *h, AVFrame *pic, |
414 | 414 |
qpix_op[luma_xy](dest_y, src_y, h->l_stride); //FIXME try variable height perhaps? |
415 | 415 |
|
416 | 416 |
if(emu){ |
417 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cb, h->c_stride, |
|
417 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->c_stride, src_cb, h->c_stride, |
|
418 | 418 |
9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); |
419 | 419 |
src_cb= h->edge_emu_buffer; |
420 | 420 |
} |
421 | 421 |
chroma_op(dest_cb, src_cb, h->c_stride, chroma_height, mx&7, my&7); |
422 | 422 |
|
423 | 423 |
if(emu){ |
424 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cr, h->c_stride, |
|
424 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->c_stride, src_cr, h->c_stride, |
|
425 | 425 |
9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1); |
426 | 426 |
src_cr= h->edge_emu_buffer; |
427 | 427 |
} |
... | ... |
@@ -1414,7 +1414,8 @@ static int mc_subpel(DiracContext *s, DiracBlock *block, const uint8_t *src[5], |
1414 | 1414 |
y + p->yblen > p->height+EDGE_WIDTH/2 || |
1415 | 1415 |
x < 0 || y < 0) { |
1416 | 1416 |
for (i = 0; i < nplanes; i++) { |
1417 |
- ff_emulated_edge_mc(s->edge_emu_buffer[i], src[i], p->stride, |
|
1417 |
+ ff_emulated_edge_mc(s->edge_emu_buffer[i], p->stride, |
|
1418 |
+ src[i], p->stride, |
|
1418 | 1419 |
p->xblen, p->yblen, x, y, |
1419 | 1420 |
p->width+EDGE_WIDTH/2, p->height+EDGE_WIDTH/2); |
1420 | 1421 |
src[i] = s->edge_emu_buffer[i]; |
... | ... |
@@ -932,7 +932,7 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic, |
932 | 932 |
full_my < 0 - extra_height || |
933 | 933 |
full_mx + 16 /*FIXME*/ > pic_width + extra_width || |
934 | 934 |
full_my + 16 /*FIXME*/ > pic_height + extra_height) { |
935 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, |
|
935 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->mb_linesize, |
|
936 | 936 |
src_y - (2 << pixel_shift) - 2 * h->mb_linesize, |
937 | 937 |
h->mb_linesize, |
938 | 938 |
16 + 5, 16 + 5 /*FIXME*/, full_mx - 2, |
... | ... |
@@ -951,7 +951,7 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic, |
951 | 951 |
if (chroma_idc == 3 /* yuv444 */) { |
952 | 952 |
src_cb = pic->f.data[1] + offset; |
953 | 953 |
if (emu) { |
954 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, |
|
954 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->mb_linesize, |
|
955 | 955 |
src_cb - (2 << pixel_shift) - 2 * h->mb_linesize, |
956 | 956 |
h->mb_linesize, |
957 | 957 |
16 + 5, 16 + 5 /*FIXME*/, |
... | ... |
@@ -965,7 +965,7 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic, |
965 | 965 |
|
966 | 966 |
src_cr = pic->f.data[2] + offset; |
967 | 967 |
if (emu) { |
968 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, |
|
968 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->mb_linesize, |
|
969 | 969 |
src_cr - (2 << pixel_shift) - 2 * h->mb_linesize, |
970 | 970 |
h->mb_linesize, |
971 | 971 |
16 + 5, 16 + 5 /*FIXME*/, |
... | ... |
@@ -992,7 +992,7 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic, |
992 | 992 |
(my >> ysh) * h->mb_uvlinesize; |
993 | 993 |
|
994 | 994 |
if (emu) { |
995 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cb, h->mb_uvlinesize, |
|
995 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->mb_uvlinesize, src_cb, h->mb_uvlinesize, |
|
996 | 996 |
9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), |
997 | 997 |
pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); |
998 | 998 |
src_cb = h->edge_emu_buffer; |
... | ... |
@@ -1002,7 +1002,7 @@ static av_always_inline void mc_dir_part(H264Context *h, Picture *pic, |
1002 | 1002 |
mx & 7, (my << (chroma_idc == 2 /* yuv422 */)) & 7); |
1003 | 1003 |
|
1004 | 1004 |
if (emu) { |
1005 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src_cr, h->mb_uvlinesize, |
|
1005 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->mb_uvlinesize, src_cr, h->mb_uvlinesize, |
|
1006 | 1006 |
9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh), |
1007 | 1007 |
pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */)); |
1008 | 1008 |
src_cr = h->edge_emu_buffer; |
... | ... |
@@ -2204,7 +2204,8 @@ static inline int hpel_motion_lowres(MpegEncContext *s, |
2204 | 2204 |
|
2205 | 2205 |
if ((unsigned)src_x > FFMAX( h_edge_pos - (!!sx) - w, 0) || |
2206 | 2206 |
(unsigned)src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) { |
2207 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, w + 1, |
|
2207 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
2208 |
+ src, s->linesize, w + 1, |
|
2208 | 2209 |
(h + 1) << field_based, src_x, |
2209 | 2210 |
src_y << field_based, |
2210 | 2211 |
h_edge_pos, |
... | ... |
@@ -2306,18 +2307,20 @@ static av_always_inline void mpeg_motion_lowres(MpegEncContext *s, |
2306 | 2306 |
|
2307 | 2307 |
if ((unsigned) src_x > FFMAX( h_edge_pos - (!!sx) - 2 * block_s, 0) || uvsrc_y<0 || |
2308 | 2308 |
(unsigned) src_y > FFMAX((v_edge_pos >> field_based) - (!!sy) - h, 0)) { |
2309 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, |
|
2309 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, linesize >> field_based, ptr_y, |
|
2310 | 2310 |
linesize >> field_based, 17, 17 + field_based, |
2311 | 2311 |
src_x, src_y << field_based, h_edge_pos, |
2312 | 2312 |
v_edge_pos); |
2313 | 2313 |
ptr_y = s->edge_emu_buffer; |
2314 | 2314 |
if (!CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { |
2315 | 2315 |
uint8_t *uvbuf = s->edge_emu_buffer + 18 * s->linesize; |
2316 |
- s->vdsp.emulated_edge_mc(uvbuf , ptr_cb, uvlinesize >> field_based, 9, |
|
2316 |
+ s->vdsp.emulated_edge_mc(uvbuf, uvlinesize >> field_based, |
|
2317 |
+ ptr_cb, uvlinesize >> field_based, 9, |
|
2317 | 2318 |
9 + field_based, |
2318 | 2319 |
uvsrc_x, uvsrc_y << field_based, |
2319 | 2320 |
h_edge_pos >> 1, v_edge_pos >> 1); |
2320 |
- s->vdsp.emulated_edge_mc(uvbuf + 16, ptr_cr, uvlinesize >> field_based, 9, |
|
2321 |
+ s->vdsp.emulated_edge_mc(uvbuf + 16, uvlinesize >> field_based, |
|
2322 |
+ ptr_cr, uvlinesize >> field_based, 9, |
|
2321 | 2323 |
9 + field_based, |
2322 | 2324 |
uvsrc_x, uvsrc_y << field_based, |
2323 | 2325 |
h_edge_pos >> 1, v_edge_pos >> 1); |
... | ... |
@@ -2390,7 +2393,7 @@ static inline void chroma_4mv_motion_lowres(MpegEncContext *s, |
2390 | 2390 |
ptr = ref_picture[1] + offset; |
2391 | 2391 |
if ((unsigned) src_x > FFMAX(h_edge_pos - (!!sx) - block_s, 0) || |
2392 | 2392 |
(unsigned) src_y > FFMAX(v_edge_pos - (!!sy) - block_s, 0)) { |
2393 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, |
|
2393 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, ptr, s->uvlinesize, |
|
2394 | 2394 |
9, 9, src_x, src_y, h_edge_pos, v_edge_pos); |
2395 | 2395 |
ptr = s->edge_emu_buffer; |
2396 | 2396 |
emu = 1; |
... | ... |
@@ -2401,7 +2404,8 @@ static inline void chroma_4mv_motion_lowres(MpegEncContext *s, |
2401 | 2401 |
|
2402 | 2402 |
ptr = ref_picture[2] + offset; |
2403 | 2403 |
if (emu) { |
2404 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, |
|
2404 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, |
|
2405 |
+ ptr, s->uvlinesize, 9, 9, |
|
2405 | 2406 |
src_x, src_y, h_edge_pos, v_edge_pos); |
2406 | 2407 |
ptr = s->edge_emu_buffer; |
2407 | 2408 |
} |
... | ... |
@@ -1819,14 +1819,14 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, |
1819 | 1819 |
uint8_t *ebuf = s->edge_emu_buffer + 32; |
1820 | 1820 |
int cw = (s->width + s->chroma_x_shift) >> s->chroma_x_shift; |
1821 | 1821 |
int ch = (s->height + s->chroma_y_shift) >> s->chroma_y_shift; |
1822 |
- s->vdsp.emulated_edge_mc(ebuf, ptr_y, wrap_y, 16, 16, mb_x * 16, |
|
1822 |
+ s->vdsp.emulated_edge_mc(ebuf, wrap_y, ptr_y, wrap_y, 16, 16, mb_x * 16, |
|
1823 | 1823 |
mb_y * 16, s->width, s->height); |
1824 | 1824 |
ptr_y = ebuf; |
1825 |
- s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y, ptr_cb, wrap_c, mb_block_width, |
|
1825 |
+ s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y, wrap_c, ptr_cb, wrap_c, mb_block_width, |
|
1826 | 1826 |
mb_block_height, mb_x * mb_block_width, mb_y * mb_block_height, |
1827 | 1827 |
cw, ch); |
1828 | 1828 |
ptr_cb = ebuf + 18 * wrap_y; |
1829 |
- s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y + 16, ptr_cr, wrap_c, mb_block_width, |
|
1829 |
+ s->vdsp.emulated_edge_mc(ebuf + 18 * wrap_y + 16, wrap_c, ptr_cr, wrap_c, mb_block_width, |
|
1830 | 1830 |
mb_block_height, mb_x * mb_block_width, mb_y * mb_block_height, |
1831 | 1831 |
cw, ch); |
1832 | 1832 |
ptr_cr = ebuf + 18 * wrap_y + 16; |
... | ... |
@@ -63,7 +63,9 @@ static void gmc1_motion(MpegEncContext *s, |
63 | 63 |
|
64 | 64 |
if( (unsigned)src_x >= FFMAX(s->h_edge_pos - 17, 0) |
65 | 65 |
|| (unsigned)src_y >= FFMAX(s->v_edge_pos - 17, 0)){ |
66 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, linesize, 17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos); |
|
66 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, linesize, |
|
67 |
+ ptr, linesize, 17, 17, src_x, src_y, |
|
68 |
+ s->h_edge_pos, s->v_edge_pos); |
|
67 | 69 |
ptr= s->edge_emu_buffer; |
68 | 70 |
} |
69 | 71 |
|
... | ... |
@@ -100,7 +102,9 @@ static void gmc1_motion(MpegEncContext *s, |
100 | 100 |
ptr = ref_picture[1] + offset; |
101 | 101 |
if( (unsigned)src_x >= FFMAX((s->h_edge_pos>>1) - 9, 0) |
102 | 102 |
|| (unsigned)src_y >= FFMAX((s->v_edge_pos>>1) - 9, 0)){ |
103 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
103 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, uvlinesize, |
|
104 |
+ ptr, uvlinesize, 9, 9, src_x, src_y, |
|
105 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
104 | 106 |
ptr= s->edge_emu_buffer; |
105 | 107 |
emu=1; |
106 | 108 |
} |
... | ... |
@@ -108,7 +112,9 @@ static void gmc1_motion(MpegEncContext *s, |
108 | 108 |
|
109 | 109 |
ptr = ref_picture[2] + offset; |
110 | 110 |
if(emu){ |
111 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, uvlinesize, 9, 9, src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
111 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, uvlinesize, |
|
112 |
+ ptr, uvlinesize, 9, 9, src_x, src_y, |
|
113 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
112 | 114 |
ptr= s->edge_emu_buffer; |
113 | 115 |
} |
114 | 116 |
s->dsp.gmc1(dest_cr, ptr, uvlinesize, 8, motion_x&15, motion_y&15, 128 - s->no_rounding); |
... | ... |
@@ -196,8 +202,9 @@ static inline int hpel_motion(MpegEncContext *s, |
196 | 196 |
if(s->unrestricted_mv && (s->flags&CODEC_FLAG_EMU_EDGE)){ |
197 | 197 |
if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&1) - 8, 0) |
198 | 198 |
|| (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y&1) - 8, 0)){ |
199 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, 9, 9, |
|
200 |
- src_x, src_y, s->h_edge_pos, s->v_edge_pos); |
|
199 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
200 |
+ src, s->linesize, 9, 9, |
|
201 |
+ src_x, src_y, s->h_edge_pos, s->v_edge_pos); |
|
201 | 202 |
src= s->edge_emu_buffer; |
202 | 203 |
emu=1; |
203 | 204 |
} |
... | ... |
@@ -287,19 +294,19 @@ if(s->quarter_sample) |
287 | 287 |
"MPEG motion vector out of boundary (%d %d)\n", src_x, src_y); |
288 | 288 |
return; |
289 | 289 |
} |
290 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize, |
|
291 |
- 17, 17+field_based, |
|
292 |
- src_x, src_y<<field_based, |
|
293 |
- s->h_edge_pos, s->v_edge_pos); |
|
290 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
291 |
+ ptr_y, s->linesize, 17, 17+field_based, |
|
292 |
+ src_x, src_y<<field_based, |
|
293 |
+ s->h_edge_pos, s->v_edge_pos); |
|
294 | 294 |
ptr_y = s->edge_emu_buffer; |
295 | 295 |
if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ |
296 | 296 |
uint8_t *uvbuf= s->edge_emu_buffer+18*s->linesize; |
297 |
- s->vdsp.emulated_edge_mc(uvbuf , |
|
297 |
+ s->vdsp.emulated_edge_mc(uvbuf, s->uvlinesize, |
|
298 | 298 |
ptr_cb, s->uvlinesize, |
299 | 299 |
9, 9+field_based, |
300 | 300 |
uvsrc_x, uvsrc_y<<field_based, |
301 | 301 |
s->h_edge_pos>>1, s->v_edge_pos>>1); |
302 |
- s->vdsp.emulated_edge_mc(uvbuf+16, |
|
302 |
+ s->vdsp.emulated_edge_mc(uvbuf+16, s->uvlinesize, |
|
303 | 303 |
ptr_cr, s->uvlinesize, |
304 | 304 |
9, 9+field_based, |
305 | 305 |
uvsrc_x, uvsrc_y<<field_based, |
... | ... |
@@ -501,20 +508,23 @@ static inline void qpel_motion(MpegEncContext *s, |
501 | 501 |
|
502 | 502 |
if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&3) - 16, 0) |
503 | 503 |
|| (unsigned)src_y > FFMAX( v_edge_pos - (motion_y&3) - h , 0)){ |
504 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr_y, s->linesize, |
|
505 |
- 17, 17+field_based, src_x, src_y<<field_based, |
|
506 |
- s->h_edge_pos, s->v_edge_pos); |
|
504 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
505 |
+ ptr_y, s->linesize, |
|
506 |
+ 17, 17+field_based, src_x, src_y<<field_based, |
|
507 |
+ s->h_edge_pos, s->v_edge_pos); |
|
507 | 508 |
ptr_y= s->edge_emu_buffer; |
508 | 509 |
if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ |
509 | 510 |
uint8_t *uvbuf= s->edge_emu_buffer + 18*s->linesize; |
510 |
- s->vdsp.emulated_edge_mc(uvbuf, ptr_cb, s->uvlinesize, |
|
511 |
- 9, 9 + field_based, |
|
512 |
- uvsrc_x, uvsrc_y<<field_based, |
|
513 |
- s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
514 |
- s->vdsp.emulated_edge_mc(uvbuf + 16, ptr_cr, s->uvlinesize, |
|
515 |
- 9, 9 + field_based, |
|
516 |
- uvsrc_x, uvsrc_y<<field_based, |
|
517 |
- s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
511 |
+ s->vdsp.emulated_edge_mc(uvbuf, s->uvlinesize, |
|
512 |
+ ptr_cb, s->uvlinesize, |
|
513 |
+ 9, 9 + field_based, |
|
514 |
+ uvsrc_x, uvsrc_y<<field_based, |
|
515 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
516 |
+ s->vdsp.emulated_edge_mc(uvbuf + 16, s->uvlinesize, |
|
517 |
+ ptr_cr, s->uvlinesize, |
|
518 |
+ 9, 9 + field_based, |
|
519 |
+ uvsrc_x, uvsrc_y<<field_based, |
|
520 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
518 | 521 |
ptr_cb= uvbuf; |
519 | 522 |
ptr_cr= uvbuf + 16; |
520 | 523 |
} |
... | ... |
@@ -581,9 +591,9 @@ static void chroma_4mv_motion(MpegEncContext *s, |
581 | 581 |
if(s->flags&CODEC_FLAG_EMU_EDGE){ |
582 | 582 |
if( (unsigned)src_x > FFMAX((s->h_edge_pos>>1) - (dxy &1) - 8, 0) |
583 | 583 |
|| (unsigned)src_y > FFMAX((s->v_edge_pos>>1) - (dxy>>1) - 8, 0)){ |
584 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, |
|
585 |
- 9, 9, src_x, src_y, |
|
586 |
- s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
584 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, |
|
585 |
+ ptr, s->uvlinesize, 9, 9, src_x, src_y, |
|
586 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
587 | 587 |
ptr= s->edge_emu_buffer; |
588 | 588 |
emu=1; |
589 | 589 |
} |
... | ... |
@@ -592,9 +602,9 @@ static void chroma_4mv_motion(MpegEncContext *s, |
592 | 592 |
|
593 | 593 |
ptr = ref_picture[2] + offset; |
594 | 594 |
if(emu){ |
595 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, |
|
596 |
- 9, 9, src_x, src_y, |
|
597 |
- s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
595 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, |
|
596 |
+ ptr, s->uvlinesize, 9, 9, src_x, src_y, |
|
597 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
598 | 598 |
ptr= s->edge_emu_buffer; |
599 | 599 |
} |
600 | 600 |
pix_op[dxy](dest_cr, ptr, s->uvlinesize, 8); |
... | ... |
@@ -761,10 +771,10 @@ static av_always_inline void MPV_motion_internal(MpegEncContext *s, |
761 | 761 |
if(s->flags&CODEC_FLAG_EMU_EDGE){ |
762 | 762 |
if( (unsigned)src_x > FFMAX(s->h_edge_pos - (motion_x&3) - 8, 0) |
763 | 763 |
|| (unsigned)src_y > FFMAX(s->v_edge_pos - (motion_y&3) - 8, 0)){ |
764 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, |
|
765 |
- s->linesize, 9, 9, |
|
766 |
- src_x, src_y, |
|
767 |
- s->h_edge_pos, s->v_edge_pos); |
|
764 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
765 |
+ ptr, s->linesize, 9, 9, |
|
766 |
+ src_x, src_y, |
|
767 |
+ s->h_edge_pos, s->v_edge_pos); |
|
768 | 768 |
ptr= s->edge_emu_buffer; |
769 | 769 |
} |
770 | 770 |
} |
... | ... |
@@ -724,13 +724,16 @@ static inline void rv34_mc(RV34DecContext *r, const int block_type, |
724 | 724 |
uint8_t *uvbuf = s->edge_emu_buffer + 22 * s->linesize; |
725 | 725 |
|
726 | 726 |
srcY -= 2 + 2*s->linesize; |
727 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, (width<<3)+6, (height<<3)+6, |
|
728 |
- src_x - 2, src_y - 2, s->h_edge_pos, s->v_edge_pos); |
|
727 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, srcY, s->linesize, |
|
728 |
+ (width<<3)+6, (height<<3)+6, src_x - 2, src_y - 2, |
|
729 |
+ s->h_edge_pos, s->v_edge_pos); |
|
729 | 730 |
srcY = s->edge_emu_buffer + 2 + 2*s->linesize; |
730 |
- s->vdsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, (width<<2)+1, (height<<2)+1, |
|
731 |
- uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1); |
|
732 |
- s->vdsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, (width<<2)+1, (height<<2)+1, |
|
733 |
- uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, s->v_edge_pos >> 1); |
|
731 |
+ s->vdsp.emulated_edge_mc(uvbuf, s->uvlinesize, srcU, s->uvlinesize, |
|
732 |
+ (width<<2)+1, (height<<2)+1, uvsrc_x, uvsrc_y, |
|
733 |
+ s->h_edge_pos >> 1, s->v_edge_pos >> 1); |
|
734 |
+ s->vdsp.emulated_edge_mc(uvbuf + 16, s->uvlinesize, srcV, s->uvlinesize, |
|
735 |
+ (width<<2)+1, (height<<2)+1, uvsrc_x, uvsrc_y, |
|
736 |
+ s->h_edge_pos >> 1, s->v_edge_pos >> 1); |
|
734 | 737 |
srcU = uvbuf; |
735 | 738 |
srcV = uvbuf + 16; |
736 | 739 |
} |
... | ... |
@@ -349,7 +349,9 @@ void ff_snow_pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, ptrdiff_t st |
349 | 349 |
src += sx + sy*stride; |
350 | 350 |
if( (unsigned)sx >= FFMAX(w - b_w - (HTAPS_MAX-2), 0) |
351 | 351 |
|| (unsigned)sy >= FFMAX(h - b_h - (HTAPS_MAX-2), 0)){ |
352 |
- s->vdsp.emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1, sx, sy, w, h); |
|
352 |
+ s->vdsp.emulated_edge_mc(tmp + MB_SIZE, stride, src, stride, |
|
353 |
+ b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1, |
|
354 |
+ sx, sy, w, h); |
|
353 | 355 |
src= tmp + MB_SIZE; |
354 | 356 |
} |
355 | 357 |
|
... | ... |
@@ -317,7 +317,8 @@ static inline void svq3_mc_dir_part(SVQ3Context *s, |
317 | 317 |
src = pic->f.data[0] + mx + my * h->linesize; |
318 | 318 |
|
319 | 319 |
if (emu) { |
320 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src, h->linesize, |
|
320 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->linesize, |
|
321 |
+ src, h->linesize, |
|
321 | 322 |
width + 1, height + 1, |
322 | 323 |
mx, my, s->h_edge_pos, s->v_edge_pos); |
323 | 324 |
src = h->edge_emu_buffer; |
... | ... |
@@ -343,7 +344,8 @@ static inline void svq3_mc_dir_part(SVQ3Context *s, |
343 | 343 |
src = pic->f.data[i] + mx + my * h->uvlinesize; |
344 | 344 |
|
345 | 345 |
if (emu) { |
346 |
- h->vdsp.emulated_edge_mc(h->edge_emu_buffer, src, h->uvlinesize, |
|
346 |
+ h->vdsp.emulated_edge_mc(h->edge_emu_buffer, h->uvlinesize, |
|
347 |
+ src, h->uvlinesize, |
|
347 | 348 |
width + 1, height + 1, |
348 | 349 |
mx, my, (s->h_edge_pos >> 1), |
349 | 350 |
s->v_edge_pos >> 1); |
... | ... |
@@ -454,15 +454,18 @@ static void vc1_mc_1mv(VC1Context *v, int dir) |
454 | 454 |
uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize; |
455 | 455 |
|
456 | 456 |
srcY -= s->mspel * (1 + s->linesize); |
457 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, |
|
457 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
458 |
+ srcY, s->linesize, |
|
458 | 459 |
17 + s->mspel * 2, 17 + s->mspel * 2, |
459 | 460 |
src_x - s->mspel, src_y - s->mspel, |
460 | 461 |
s->h_edge_pos, v_edge_pos); |
461 | 462 |
srcY = s->edge_emu_buffer; |
462 |
- s->vdsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, 8 + 1, 8 + 1, |
|
463 |
- uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
464 |
- s->vdsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8 + 1, 8 + 1, |
|
465 |
- uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
463 |
+ s->vdsp.emulated_edge_mc(uvbuf, s->uvlinesize, srcU, s->uvlinesize, |
|
464 |
+ 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
465 |
+ s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
466 |
+ s->vdsp.emulated_edge_mc(uvbuf + 16, s->uvlinesize, srcV, s->uvlinesize, |
|
467 |
+ 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
468 |
+ s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
466 | 469 |
srcU = uvbuf; |
467 | 470 |
srcV = uvbuf + 16; |
468 | 471 |
/* if we deal with range reduction we need to scale source blocks */ |
... | ... |
@@ -697,7 +700,7 @@ static void vc1_mc_4mv_luma(VC1Context *v, int n, int dir, int avg) |
697 | 697 |
|| (unsigned)(src_y - (s->mspel << fieldmv)) > v_edge_pos - (my & 3) - ((8 + s->mspel * 2) << fieldmv)) { |
698 | 698 |
srcY -= s->mspel * (1 + (s->linesize << fieldmv)); |
699 | 699 |
/* check emulate edge stride and offset */ |
700 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, |
|
700 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, srcY, s->linesize, |
|
701 | 701 |
9 + s->mspel * 2, (9 + s->mspel * 2) << fieldmv, |
702 | 702 |
src_x - s->mspel, src_y - (s->mspel << fieldmv), |
703 | 703 |
s->h_edge_pos, v_edge_pos); |
... | ... |
@@ -912,11 +915,11 @@ static void vc1_mc_4mv_chroma(VC1Context *v, int dir) |
912 | 912 |
|| s->h_edge_pos < 18 || v_edge_pos < 18 |
913 | 913 |
|| (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 9 |
914 | 914 |
|| (unsigned)uvsrc_y > (v_edge_pos >> 1) - 9) { |
915 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer , srcU, s->uvlinesize, |
|
916 |
- 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
915 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, srcU, |
|
916 |
+ s->uvlinesize, 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
917 | 917 |
s->h_edge_pos >> 1, v_edge_pos >> 1); |
918 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer + 16, srcV, s->uvlinesize, |
|
919 |
- 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
918 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer + 16, s->uvlinesize, srcV, |
|
919 |
+ s->uvlinesize, 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
920 | 920 |
s->h_edge_pos >> 1, v_edge_pos >> 1); |
921 | 921 |
srcU = s->edge_emu_buffer; |
922 | 922 |
srcV = s->edge_emu_buffer + 16; |
... | ... |
@@ -1033,12 +1036,12 @@ static void vc1_mc_4mv_chroma4(VC1Context *v, int dir, int dir2, int avg) |
1033 | 1033 |
|| s->h_edge_pos < 10 || v_edge_pos < (5 << fieldmv) |
1034 | 1034 |
|| (unsigned)uvsrc_x > (s->h_edge_pos >> 1) - 5 |
1035 | 1035 |
|| (unsigned)uvsrc_y > v_edge_pos - (5 << fieldmv)) { |
1036 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcU, s->uvlinesize, |
|
1037 |
- 5, (5 << fieldmv), uvsrc_x, uvsrc_y, |
|
1038 |
- s->h_edge_pos >> 1, v_edge_pos); |
|
1039 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer + 16, srcV, s->uvlinesize, |
|
1040 |
- 5, (5 << fieldmv), uvsrc_x, uvsrc_y, |
|
1041 |
- s->h_edge_pos >> 1, v_edge_pos); |
|
1036 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, srcU, |
|
1037 |
+ s->uvlinesize, 5, (5 << fieldmv), uvsrc_x, |
|
1038 |
+ uvsrc_y, s->h_edge_pos >> 1, v_edge_pos); |
|
1039 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer + 16, s->uvlinesize, srcV, |
|
1040 |
+ s->uvlinesize, 5, (5 << fieldmv), uvsrc_x, |
|
1041 |
+ uvsrc_y, s->h_edge_pos >> 1, v_edge_pos); |
|
1042 | 1042 |
srcU = s->edge_emu_buffer; |
1043 | 1043 |
srcV = s->edge_emu_buffer + 16; |
1044 | 1044 |
|
... | ... |
@@ -1966,15 +1969,17 @@ static void vc1_interp_mc(VC1Context *v) |
1966 | 1966 |
uint8_t *uvbuf = s->edge_emu_buffer + 19 * s->linesize; |
1967 | 1967 |
|
1968 | 1968 |
srcY -= s->mspel * (1 + s->linesize); |
1969 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, srcY, s->linesize, |
|
1969 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, srcY, s->linesize, |
|
1970 | 1970 |
17 + s->mspel * 2, 17 + s->mspel * 2, |
1971 | 1971 |
src_x - s->mspel, src_y - s->mspel, |
1972 | 1972 |
s->h_edge_pos, v_edge_pos); |
1973 | 1973 |
srcY = s->edge_emu_buffer; |
1974 |
- s->vdsp.emulated_edge_mc(uvbuf , srcU, s->uvlinesize, 8 + 1, 8 + 1, |
|
1975 |
- uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
1976 |
- s->vdsp.emulated_edge_mc(uvbuf + 16, srcV, s->uvlinesize, 8 + 1, 8 + 1, |
|
1977 |
- uvsrc_x, uvsrc_y, s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
1974 |
+ s->vdsp.emulated_edge_mc(uvbuf, s->uvlinesize, srcU, s->uvlinesize, |
|
1975 |
+ 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
1976 |
+ s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
1977 |
+ s->vdsp.emulated_edge_mc(uvbuf + 16, s->uvlinesize, srcV, s->uvlinesize, |
|
1978 |
+ 8 + 1, 8 + 1, uvsrc_x, uvsrc_y, |
|
1979 |
+ s->h_edge_pos >> 1, v_edge_pos >> 1); |
|
1978 | 1980 |
srcU = uvbuf; |
1979 | 1981 |
srcV = uvbuf + 16; |
1980 | 1982 |
/* if we deal with range reduction we need to scale source blocks */ |
... | ... |
@@ -30,9 +30,10 @@ |
30 | 30 |
#include <stdint.h> |
31 | 31 |
|
32 | 32 |
#define EMULATED_EDGE(depth) \ |
33 |
-void ff_emulated_edge_mc_ ## depth (uint8_t *buf, const uint8_t *src, ptrdiff_t linesize,\ |
|
34 |
- int block_w, int block_h,\ |
|
35 |
- int src_x, int src_y, int w, int h); |
|
33 |
+void ff_emulated_edge_mc_ ## depth(uint8_t *dst, ptrdiff_t dst_stride, \ |
|
34 |
+ const uint8_t *src, ptrdiff_t src_stride, \ |
|
35 |
+ int block_w, int block_h,\ |
|
36 |
+ int src_x, int src_y, int w, int h); |
|
36 | 37 |
|
37 | 38 |
EMULATED_EDGE(8) |
38 | 39 |
EMULATED_EDGE(16) |
... | ... |
@@ -42,10 +43,12 @@ typedef struct VideoDSPContext { |
42 | 42 |
* Copy a rectangular area of samples to a temporary buffer and replicate |
43 | 43 |
* the border samples. |
44 | 44 |
* |
45 |
- * @param buf destination buffer |
|
45 |
+ * @param dst destination buffer |
|
46 |
+ * @param dst_stride number of bytes between 2 vertically adjacent samples |
|
47 |
+ * in destination buffer |
|
46 | 48 |
* @param src source buffer |
47 |
- * @param linesize number of bytes between 2 vertically adjacent samples |
|
48 |
- * in both the source and destination buffers |
|
49 |
+ * @param src_stride number of bytes between 2 vertically adjacent samples |
|
50 |
+ * in source buffer |
|
49 | 51 |
* @param block_w width of block |
50 | 52 |
* @param block_h height of block |
51 | 53 |
* @param src_x x coordinate of the top left sample of the block in the |
... | ... |
@@ -55,8 +58,9 @@ typedef struct VideoDSPContext { |
55 | 55 |
* @param w width of the source buffer |
56 | 56 |
* @param h height of the source buffer |
57 | 57 |
*/ |
58 |
- void (*emulated_edge_mc)(uint8_t *buf, const uint8_t *src, |
|
59 |
- ptrdiff_t linesize, int block_w, int block_h, |
|
58 |
+ void (*emulated_edge_mc)(uint8_t *dst, ptrdiff_t dst_stride, |
|
59 |
+ const uint8_t *src, ptrdiff_t src_stride, |
|
60 |
+ int block_w, int block_h, |
|
60 | 61 |
int src_x, int src_y, int w, int h); |
61 | 62 |
|
62 | 63 |
/** |
... | ... |
@@ -20,10 +20,10 @@ |
20 | 20 |
*/ |
21 | 21 |
|
22 | 22 |
#include "bit_depth_template.c" |
23 |
-void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, |
|
24 |
- ptrdiff_t linesize, |
|
25 |
- int block_w, int block_h, |
|
26 |
- int src_x, int src_y, int w, int h) |
|
23 |
+void FUNC(ff_emulated_edge_mc)(uint8_t *buf, ptrdiff_t buf_stride, |
|
24 |
+ const uint8_t *src, ptrdiff_t src_stride, |
|
25 |
+ int block_w, int block_h, |
|
26 |
+ int src_x, int src_y, int w, int h) |
|
27 | 27 |
{ |
28 | 28 |
int x, y; |
29 | 29 |
int start_y, start_x, end_y, end_x; |
... | ... |
@@ -32,12 +32,12 @@ void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, |
32 | 32 |
return; |
33 | 33 |
|
34 | 34 |
if (src_y >= h) { |
35 |
- src -= src_y * linesize; |
|
36 |
- src += (h - 1) * linesize; |
|
35 |
+ src -= src_y * src_stride; |
|
36 |
+ src += (h - 1) * src_stride; |
|
37 | 37 |
src_y = h - 1; |
38 | 38 |
} else if (src_y <= -block_h) { |
39 |
- src -= src_y * linesize; |
|
40 |
- src += (1 - block_h) * linesize; |
|
39 |
+ src -= src_y * src_stride; |
|
40 |
+ src += (1 - block_h) * src_stride; |
|
41 | 41 |
src_y = 1 - block_h; |
42 | 42 |
} |
43 | 43 |
if (src_x >= w) { |
... | ... |
@@ -56,30 +56,30 @@ void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, |
56 | 56 |
av_assert2(start_x < end_x && block_w); |
57 | 57 |
|
58 | 58 |
w = end_x - start_x; |
59 |
- src += start_y * linesize + start_x * sizeof(pixel); |
|
59 |
+ src += start_y * src_stride + start_x * sizeof(pixel); |
|
60 | 60 |
buf += start_x * sizeof(pixel); |
61 | 61 |
|
62 | 62 |
// top |
63 | 63 |
for (y = 0; y < start_y; y++) { |
64 | 64 |
memcpy(buf, src, w * sizeof(pixel)); |
65 |
- buf += linesize; |
|
65 |
+ buf += buf_stride; |
|
66 | 66 |
} |
67 | 67 |
|
68 | 68 |
// copy existing part |
69 | 69 |
for (; y < end_y; y++) { |
70 | 70 |
memcpy(buf, src, w * sizeof(pixel)); |
71 |
- src += linesize; |
|
72 |
- buf += linesize; |
|
71 |
+ src += src_stride; |
|
72 |
+ buf += buf_stride; |
|
73 | 73 |
} |
74 | 74 |
|
75 | 75 |
// bottom |
76 |
- src -= linesize; |
|
76 |
+ src -= src_stride; |
|
77 | 77 |
for (; y < block_h; y++) { |
78 | 78 |
memcpy(buf, src, w * sizeof(pixel)); |
79 |
- buf += linesize; |
|
79 |
+ buf += buf_stride; |
|
80 | 80 |
} |
81 | 81 |
|
82 |
- buf -= block_h * linesize + start_x * sizeof(pixel); |
|
82 |
+ buf -= block_h * buf_stride + start_x * sizeof(pixel); |
|
83 | 83 |
while (block_h--) { |
84 | 84 |
pixel *bufp = (pixel *) buf; |
85 | 85 |
|
... | ... |
@@ -92,6 +92,6 @@ void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, |
92 | 92 |
for (x = end_x; x < block_w; x++) { |
93 | 93 |
bufp[x] = bufp[end_x - 1]; |
94 | 94 |
} |
95 |
- buf += linesize; |
|
95 |
+ buf += buf_stride; |
|
96 | 96 |
} |
97 | 97 |
} |
... | ... |
@@ -1549,7 +1549,10 @@ static void render_slice(Vp3DecodeContext *s, int slice) |
1549 | 1549 |
uint8_t *temp= s->edge_emu_buffer; |
1550 | 1550 |
if(stride<0) temp -= 8*stride; |
1551 | 1551 |
|
1552 |
- s->vdsp.emulated_edge_mc(temp, motion_source, stride, 9, 9, src_x, src_y, plane_width, plane_height); |
|
1552 |
+ s->vdsp.emulated_edge_mc(temp, stride, |
|
1553 |
+ motion_source, stride, |
|
1554 |
+ 9, 9, src_x, src_y, |
|
1555 |
+ plane_width, plane_height); |
|
1553 | 1556 |
motion_source= temp; |
1554 | 1557 |
} |
1555 | 1558 |
} |
... | ... |
@@ -339,7 +339,7 @@ static void vp56_mc(VP56Context *s, int b, int plane, uint8_t *src, |
339 | 339 |
|
340 | 340 |
if (x<0 || x+12>=s->plane_width[plane] || |
341 | 341 |
y<0 || y+12>=s->plane_height[plane]) { |
342 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, |
|
342 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, stride, |
|
343 | 343 |
src + s->block_offset[b] + (dy-2)*stride + (dx-2), |
344 | 344 |
stride, 12, 12, x, y, |
345 | 345 |
s->plane_width[plane], |
... | ... |
@@ -45,7 +45,6 @@ static void free_buffers(VP8Context *s) |
45 | 45 |
pthread_mutex_destroy(&s->thread_data[i].lock); |
46 | 46 |
#endif |
47 | 47 |
av_freep(&s->thread_data[i].filter_strength); |
48 |
- av_freep(&s->thread_data[i].edge_emu_buffer); |
|
49 | 48 |
} |
50 | 49 |
av_freep(&s->thread_data); |
51 | 50 |
av_freep(&s->macroblocks_base); |
... | ... |
@@ -1186,7 +1185,7 @@ void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst, |
1186 | 1186 |
uint8_t *src = ref->f->data[0]; |
1187 | 1187 |
|
1188 | 1188 |
if (AV_RN32A(mv)) { |
1189 |
- |
|
1189 |
+ int src_linesize = linesize; |
|
1190 | 1190 |
int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx]; |
1191 | 1191 |
int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my]; |
1192 | 1192 |
|
... | ... |
@@ -1198,12 +1197,15 @@ void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst, |
1198 | 1198 |
src += y_off * linesize + x_off; |
1199 | 1199 |
if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] || |
1200 | 1200 |
y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) { |
1201 |
- s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize, |
|
1202 |
- block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my], |
|
1201 |
+ s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32, |
|
1202 |
+ src - my_idx * linesize - mx_idx, linesize, |
|
1203 |
+ block_w + subpel_idx[1][mx], |
|
1204 |
+ block_h + subpel_idx[1][my], |
|
1203 | 1205 |
x_off - mx_idx, y_off - my_idx, width, height); |
1204 |
- src = td->edge_emu_buffer + mx_idx + linesize * my_idx; |
|
1206 |
+ src = td->edge_emu_buffer + mx_idx + 32 * my_idx; |
|
1207 |
+ src_linesize = 32; |
|
1205 | 1208 |
} |
1206 |
- mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my); |
|
1209 |
+ mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my); |
|
1207 | 1210 |
} else { |
1208 | 1211 |
ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0); |
1209 | 1212 |
mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0); |
... | ... |
@@ -1248,17 +1250,21 @@ void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst |
1248 | 1248 |
ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0); |
1249 | 1249 |
if (x_off < mx_idx || x_off >= width - block_w - subpel_idx[2][mx] || |
1250 | 1250 |
y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) { |
1251 |
- s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize, |
|
1252 |
- block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my], |
|
1251 |
+ s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32, |
|
1252 |
+ src1 - my_idx * linesize - mx_idx, linesize, |
|
1253 |
+ block_w + subpel_idx[1][mx], |
|
1254 |
+ block_h + subpel_idx[1][my], |
|
1253 | 1255 |
x_off - mx_idx, y_off - my_idx, width, height); |
1254 |
- src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx; |
|
1255 |
- mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my); |
|
1256 |
+ src1 = td->edge_emu_buffer + mx_idx + 32 * my_idx; |
|
1257 |
+ mc_func[my_idx][mx_idx](dst1, linesize, src1, 32, block_h, mx, my); |
|
1256 | 1258 |
|
1257 |
- s->vdsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize, |
|
1258 |
- block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my], |
|
1259 |
+ s->vdsp.emulated_edge_mc(td->edge_emu_buffer, 32, |
|
1260 |
+ src2 - my_idx * linesize - mx_idx, linesize, |
|
1261 |
+ block_w + subpel_idx[1][mx], |
|
1262 |
+ block_h + subpel_idx[1][my], |
|
1259 | 1263 |
x_off - mx_idx, y_off - my_idx, width, height); |
1260 |
- src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx; |
|
1261 |
- mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my); |
|
1264 |
+ src2 = td->edge_emu_buffer + mx_idx + 32 * my_idx; |
|
1265 |
+ mc_func[my_idx][mx_idx](dst2, linesize, src2, 32, block_h, mx, my); |
|
1262 | 1266 |
} else { |
1263 | 1267 |
mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my); |
1264 | 1268 |
mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my); |
... | ... |
@@ -1944,10 +1950,6 @@ int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, |
1944 | 1944 |
s->linesize = curframe->tf.f->linesize[0]; |
1945 | 1945 |
s->uvlinesize = curframe->tf.f->linesize[1]; |
1946 | 1946 |
|
1947 |
- if (!s->thread_data[0].edge_emu_buffer) |
|
1948 |
- for (i = 0; i < MAX_THREADS; i++) |
|
1949 |
- s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize); |
|
1950 |
- |
|
1951 | 1947 |
memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz)); |
1952 | 1948 |
/* Zero macroblock structures for top/top-left prediction from outside the frame. */ |
1953 | 1949 |
if (!s->mb_layout) |
... | ... |
@@ -122,7 +122,7 @@ typedef struct VP8ThreadData { |
122 | 122 |
#endif |
123 | 123 |
int thread_mb_pos; // (mb_y << 16) | (mb_x & 0xFFFF) |
124 | 124 |
int wait_mb_pos; // What the current thread is waiting on. |
125 |
- uint8_t *edge_emu_buffer; |
|
125 |
+ DECLARE_ALIGNED(16, uint8_t, edge_emu_buffer)[21*32]; |
|
126 | 126 |
VP8FilterStrength *filter_strength; |
127 | 127 |
} VP8ThreadData; |
128 | 128 |
|
... | ... |
@@ -119,8 +119,10 @@ void ff_mspel_motion(MpegEncContext *s, |
119 | 119 |
|
120 | 120 |
if(src_x<1 || src_y<1 || src_x + 17 >= s->h_edge_pos |
121 | 121 |
|| src_y + h+1 >= v_edge_pos){ |
122 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr - 1 - s->linesize, s->linesize, 19, 19, |
|
123 |
- src_x-1, src_y-1, s->h_edge_pos, s->v_edge_pos); |
|
122 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->linesize, |
|
123 |
+ ptr - 1 - s->linesize, s->linesize, 19, 19, |
|
124 |
+ src_x-1, src_y-1, |
|
125 |
+ s->h_edge_pos, s->v_edge_pos); |
|
124 | 126 |
ptr= s->edge_emu_buffer + 1 + s->linesize; |
125 | 127 |
emu=1; |
126 | 128 |
} |
... | ... |
@@ -159,16 +161,18 @@ void ff_mspel_motion(MpegEncContext *s, |
159 | 159 |
offset = (src_y * uvlinesize) + src_x; |
160 | 160 |
ptr = ref_picture[1] + offset; |
161 | 161 |
if(emu){ |
162 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, |
|
163 |
- src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
162 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, |
|
163 |
+ ptr, s->uvlinesize, 9, 9, src_x, src_y, |
|
164 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
164 | 165 |
ptr= s->edge_emu_buffer; |
165 | 166 |
} |
166 | 167 |
pix_op[1][dxy](dest_cb, ptr, uvlinesize, h >> 1); |
167 | 168 |
|
168 | 169 |
ptr = ref_picture[2] + offset; |
169 | 170 |
if(emu){ |
170 |
- s->vdsp.emulated_edge_mc(s->edge_emu_buffer, ptr, s->uvlinesize, 9, 9, |
|
171 |
- src_x, src_y, s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
171 |
+ s->vdsp.emulated_edge_mc(s->edge_emu_buffer, s->uvlinesize, |
|
172 |
+ ptr, s->uvlinesize, 9, 9, src_x, src_y, |
|
173 |
+ s->h_edge_pos>>1, s->v_edge_pos>>1); |
|
172 | 174 |
ptr= s->edge_emu_buffer; |
173 | 175 |
} |
174 | 176 |
pix_op[1][dxy](dest_cr, ptr, uvlinesize, h >> 1); |
... | ... |
@@ -405,8 +405,9 @@ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, |
405 | 405 |
} |
406 | 406 |
} |
407 | 407 |
|
408 |
-typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, |
|
409 |
- ptrdiff_t linesize, int block_w, int block_h, |
|
408 |
+typedef void emulated_edge_mc_func(uint8_t *dst, ptrdiff_t dst_stride, |
|
409 |
+ const uint8_t *src, ptrdiff_t src_linesize, |
|
410 |
+ int block_w, int block_h, |
|
410 | 411 |
int src_x, int src_y, int w, int h); |
411 | 412 |
|
412 | 413 |
static av_always_inline void gmc(uint8_t *dst, uint8_t *src, |
... | ... |
@@ -454,7 +455,7 @@ static av_always_inline void gmc(uint8_t *dst, uint8_t *src, |
454 | 454 |
|
455 | 455 |
src += ix + iy * stride; |
456 | 456 |
if (need_emu) { |
457 |
- emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height); |
|
457 |
+ emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height); |
|
458 | 458 |
src = edge_buf; |
459 | 459 |
} |
460 | 460 |
|
... | ... |
@@ -23,576 +23,394 @@ |
23 | 23 |
|
24 | 24 |
SECTION .text |
25 | 25 |
|
26 |
-; void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize, |
|
27 |
-; x86_reg start_y, x86_reg end_y, x86_reg block_h, |
|
28 |
-; x86_reg start_x, x86_reg end_x, x86_reg block_w); |
|
29 |
-; |
|
30 |
-; The actual function itself is below. It basically wraps a very simple |
|
31 |
-; w = end_x - start_x |
|
32 |
-; if (w) { |
|
33 |
-; if (w > 22) { |
|
34 |
-; jump to the slow loop functions |
|
35 |
-; } else { |
|
36 |
-; jump to the fast loop functions |
|
37 |
-; } |
|
38 |
-; } |
|
39 |
-; |
|
40 |
-; ... and then the same for left/right extend also. See below for loop |
|
41 |
-; function implementations. Fast are fixed-width, slow is variable-width |
|
42 |
- |
|
43 |
-%macro EMU_EDGE_FUNC 0 |
|
44 |
-%if ARCH_X86_64 |
|
45 |
-%define w_reg r7 |
|
46 |
-cglobal emu_edge_core, 6, 9, 1 |
|
47 |
- mov r8, r5 ; save block_h |
|
48 |
-%else |
|
49 |
-%define w_reg r6 |
|
50 |
-cglobal emu_edge_core, 2, 7, 0 |
|
51 |
- mov r4, r4m ; end_y |
|
52 |
- mov r5, r5m ; block_h |
|
53 |
-%endif |
|
54 |
- |
|
55 |
- ; start with vertical extend (top/bottom) and body pixel copy |
|
56 |
- mov w_reg, r7m |
|
57 |
- sub w_reg, r6m ; w = start_x - end_x |
|
58 |
- sub r5, r4 |
|
59 |
-%if ARCH_X86_64 |
|
60 |
- sub r4, r3 |
|
61 |
-%else |
|
62 |
- sub r4, dword r3m |
|
63 |
-%endif |
|
64 |
- cmp w_reg, 22 |
|
65 |
- jg .slow_v_extend_loop |
|
66 |
-%if ARCH_X86_32 |
|
67 |
- mov r2, r2m ; linesize |
|
68 |
-%endif |
|
69 |
- sal w_reg, 7 ; w * 128 |
|
70 |
-%ifdef PIC |
|
71 |
- lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)] |
|
72 |
- add w_reg, rax |
|
73 |
-%else |
|
74 |
- lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg] |
|
75 |
-%endif |
|
76 |
- call w_reg ; fast top extend, body copy and bottom extend |
|
77 |
-.v_extend_end: |
|
26 |
+; slow vertical extension loop function. Works with variable-width, and |
|
27 |
+; does per-line reading/writing of source data |
|
28 |
+ |
|
29 |
+%macro V_COPY_ROW 2 ; type (top/body/bottom), h |
|
30 |
+.%1_y_loop: ; do { |
|
31 |
+ mov wq, r7mp ; initialize w (r7mp = wmp) |
|
32 |
+.%1_x_loop: ; do { |
|
33 |
+ movu m0, [srcq+wq] ; m0 = read($mmsize) |
|
34 |
+ movu [dstq+wq], m0 ; write(m0, $mmsize) |
|
35 |
+ add wq, mmsize ; w -= $mmsize |
|
36 |
+ cmp wq, -mmsize ; } while (w > $mmsize); |
|
37 |
+ jl .%1_x_loop |
|
38 |
+ movu m0, [srcq-mmsize] ; m0 = read($mmsize) |
|
39 |
+ movu [dstq-mmsize], m0 ; write(m0, $mmsize) |
|
40 |
+%ifidn %1, body ; if ($type == body) { |
|
41 |
+ add srcq, src_strideq ; src += src_stride |
|
42 |
+%endif ; } |
|
43 |
+ add dstq, dst_strideq ; dst += dst_stride |
|
44 |
+ dec %2 ; } while (--$h); |
|
45 |
+ jnz .%1_y_loop |
|
46 |
+%endmacro |
|
78 | 47 |
|
79 |
- ; horizontal extend (left/right) |
|
80 |
- mov w_reg, r6m ; start_x |
|
81 |
- sub r0, w_reg |
|
48 |
+%macro vvar_fn 0 |
|
49 |
+; .----. <- zero |
|
50 |
+; | | <- top is copied from first line in body of source |
|
51 |
+; |----| <- start_y |
|
52 |
+; | | <- body is copied verbatim (line-by-line) from source |
|
53 |
+; |----| <- end_y |
|
54 |
+; | | <- bottom is copied from last line in body of source |
|
55 |
+; '----' <- bh |
|
82 | 56 |
%if ARCH_X86_64 |
83 |
- mov r3, r0 ; backup of buf+block_h*linesize |
|
84 |
- mov r5, r8 |
|
85 |
-%else |
|
86 |
- mov r0m, r0 ; backup of buf+block_h*linesize |
|
87 |
- mov r5, r5m |
|
57 |
+cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ |
|
58 |
+ start_y, end_y, bh, w |
|
59 |
+%else ; x86-32 |
|
60 |
+cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w |
|
61 |
+%define src_strideq r3mp |
|
62 |
+%define dst_strideq r1mp |
|
63 |
+ mov srcq, r2mp |
|
64 |
+ mov start_yq, r4mp |
|
65 |
+ mov end_yq, r5mp |
|
66 |
+ mov bhq, r6mp |
|
88 | 67 |
%endif |
89 |
- test w_reg, w_reg |
|
90 |
- jz .right_extend |
|
91 |
- cmp w_reg, 22 |
|
92 |
- jg .slow_left_extend_loop |
|
93 |
- mov r1, w_reg |
|
94 |
- dec w_reg |
|
95 |
- ; FIXME we can do a if size == 1 here if that makes any speed difference, test me |
|
96 |
- sar w_reg, 1 |
|
97 |
- sal w_reg, 6 |
|
98 |
- ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs |
|
99 |
- ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h |
|
100 |
-%ifdef PIC |
|
101 |
- lea rax, [.emuedge_extend_left_2] |
|
102 |
- add w_reg, rax |
|
103 |
-%else |
|
104 |
- lea w_reg, [.emuedge_extend_left_2+w_reg] |
|
105 |
-%endif |
|
106 |
- call w_reg |
|
68 |
+ sub bhq, end_yq ; bh -= end_q |
|
69 |
+ sub end_yq, start_yq ; end_q -= start_q |
|
70 |
+ add srcq, r7mp ; (r7mp = wmp) |
|
71 |
+ add dstq, r7mp ; (r7mp = wmp) |
|
72 |
+ neg r7mp ; (r7mp = wmp) |
|
73 |
+ test start_yq, start_yq ; if (start_q) { |
|
74 |
+ jz .body |
|
75 |
+ V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) |
|
76 |
+.body: ; } |
|
77 |
+ V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) |
|
78 |
+ test bhq, bhq ; if (bh) { |
|
79 |
+ jz .end |
|
80 |
+ sub srcq, src_strideq ; src -= src_stride |
|
81 |
+ V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) |
|
82 |
+.end: ; } |
|
83 |
+ RET |
|
84 |
+%endmacro |
|
107 | 85 |
|
108 |
- ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w |
|
109 |
-.right_extend: |
|
110 | 86 |
%if ARCH_X86_32 |
111 |
- mov r0, r0m |
|
112 |
- mov r5, r5m |
|
87 |
+INIT_MMX mmx |
|
88 |
+vvar_fn |
|
113 | 89 |
%endif |
114 |
- mov w_reg, r7m ; end_x |
|
115 |
- mov r1, r8m ; block_w |
|
116 |
- mov r4, r1 |
|
117 |
- sub r1, w_reg |
|
118 |
- jz .h_extend_end ; if (end_x == block_w) goto h_extend_end |
|
119 |
- cmp r1, 22 |
|
120 |
- jg .slow_right_extend_loop |
|
121 |
- dec r1 |
|
122 |
- ; FIXME we can do a if size == 1 here if that makes any speed difference, test me |
|
123 |
- sar r1, 1 |
|
124 |
- sal r1, 6 |
|
125 |
-%ifdef PIC |
|
126 |
- lea rax, [.emuedge_extend_right_2] |
|
127 |
- add r1, rax |
|
128 |
-%else |
|
129 |
- lea r1, [.emuedge_extend_right_2+r1] |
|
130 |
-%endif |
|
131 |
- call r1 |
|
132 |
-.h_extend_end: |
|
90 |
+ |
|
91 |
+INIT_XMM sse |
|
92 |
+vvar_fn |
|
93 |
+ |
|
94 |
+%macro hvar_fn 0 |
|
95 |
+cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w |
|
96 |
+ lea dstq, [dstq+n_wordsq*2] |
|
97 |
+ neg n_wordsq |
|
98 |
+ lea start_xq, [start_xq+n_wordsq*2] |
|
99 |
+.y_loop: ; do { |
|
100 |
+ ; FIXME also write a ssse3 version using pshufb |
|
101 |
+ movzx wd, byte [dstq+start_xq] ; w = read(1) |
|
102 |
+ imul wd, 0x01010101 ; w *= 0x01010101 |
|
103 |
+ movd m0, wd |
|
104 |
+ mov wq, n_wordsq ; initialize w |
|
105 |
+%if cpuflag(sse) |
|
106 |
+ shufps m0, m0, q0000 ; splat |
|
107 |
+%else ; mmx |
|
108 |
+ punpckldq m0, m0 ; splat |
|
109 |
+%endif ; mmx/sse |
|
110 |
+.x_loop: ; do { |
|
111 |
+ movu [dstq+wq*2], m0 ; write($reg, $mmsize) |
|
112 |
+ add wq, mmsize/2 ; w -= $mmsize/2 |
|
113 |
+ cmp wq, -mmsize/2 ; } while (w > $mmsize/2) |
|
114 |
+ jl .x_loop |
|
115 |
+ movu [dstq-mmsize], m0 ; write($reg, $mmsize) |
|
116 |
+ add dstq, dst_strideq ; dst += dst_stride |
|
117 |
+ dec hq ; } while (h--) |
|
118 |
+ jnz .y_loop |
|
133 | 119 |
RET |
120 |
+%endmacro |
|
134 | 121 |
|
135 |
-%if ARCH_X86_64 |
|
136 |
-%define vall al |
|
137 |
-%define valh ah |
|
138 |
-%define valw ax |
|
139 |
-%define valw2 r7w |
|
140 |
-%define valw3 r3w |
|
141 |
-%if WIN64 |
|
142 |
-%define valw4 r7w |
|
143 |
-%else ; unix64 |
|
144 |
-%define valw4 r3w |
|
145 |
-%endif |
|
146 |
-%define vald eax |
|
147 |
-%else |
|
148 |
-%define vall bl |
|
149 |
-%define valh bh |
|
150 |
-%define valw bx |
|
151 |
-%define valw2 r6w |
|
152 |
-%define valw3 valw2 |
|
153 |
-%define valw4 valw3 |
|
154 |
-%define vald ebx |
|
155 |
-%define stack_offset 0x14 |
|
122 |
+%if ARCH_X86_32 |
|
123 |
+INIT_MMX mmx |
|
124 |
+hvar_fn |
|
156 | 125 |
%endif |
157 | 126 |
|
158 |
-%endmacro |
|
127 |
+INIT_XMM sse |
|
128 |
+hvar_fn |
|
159 | 129 |
|
160 | 130 |
; macro to read/write a horizontal number of pixels (%2) to/from registers |
161 |
-; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels |
|
162 |
-; - if (%2 & 15 == 8) fills the last 8 bytes into rax |
|
163 |
-; - else if (%2 & 8) fills 8 bytes into mm0 |
|
164 |
-; - if (%2 & 7 == 4) fills the last 4 bytes into rax |
|
165 |
-; - else if (%2 & 4) fills 4 bytes into mm0-1 |
|
166 |
-; - if (%2 & 3 == 3) fills 2 bytes into r7/r3, and 1 into eax |
|
167 |
-; (note that we're using r3 for body/bottom because it's a shorter |
|
168 |
-; opcode, and then the loop fits in 128 bytes) |
|
169 |
-; - else fills remaining bytes into rax |
|
170 |
-; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels |
|
171 |
-; - if (%2 & 7 == 4) fills 4 bytes into ebx |
|
172 |
-; - else if (%2 & 4) fills 4 bytes into mm0-7 |
|
173 |
-; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx |
|
174 |
-; - else fills remaining bytes into ebx |
|
131 |
+; on sse, - fills xmm0-15 for consecutive sets of 16 pixels |
|
132 |
+; - if (%2 & 8) fills 8 bytes into xmm$next |
|
133 |
+; - if (%2 & 4) fills 4 bytes into xmm$next |
|
134 |
+; - if (%2 & 3) fills 1, 2 or 4 bytes in eax |
|
135 |
+; on mmx, - fills mm0-7 for consecutive sets of 8 pixels |
|
136 |
+; - if (%2 & 4) fills 4 bytes into mm$next |
|
137 |
+; - if (%2 & 3) fills 1, 2 or 4 bytes in eax |
|
175 | 138 |
; writing data out is in the same way |
176 | 139 |
%macro READ_NUM_BYTES 2 |
177 |
-%assign %%src_off 0 ; offset in source buffer |
|
178 |
-%assign %%smidx 0 ; mmx register idx |
|
179 |
-%assign %%sxidx 0 ; xmm register idx |
|
180 |
- |
|
181 |
-%if cpuflag(sse) |
|
182 |
-%rep %2/16 |
|
183 |
- movups xmm %+ %%sxidx, [r1+%%src_off] |
|
184 |
-%assign %%src_off %%src_off+16 |
|
185 |
-%assign %%sxidx %%sxidx+1 |
|
186 |
-%endrep ; %2/16 |
|
140 |
+%assign %%off 0 ; offset in source buffer |
|
141 |
+%assign %%idx 0 ; mmx/xmm register index |
|
142 |
+ |
|
143 |
+%rep %2/mmsize |
|
144 |
+ movu m %+ %%idx, [srcq+%%off] |
|
145 |
+%assign %%off %%off+mmsize |
|
146 |
+%assign %%idx %%idx+1 |
|
147 |
+%endrep ; %2/mmsize |
|
148 |
+ |
|
149 |
+%if mmsize == 16 |
|
150 |
+%if (%2-%%off) >= 8 |
|
151 |
+%if %2 > 16 && (%2-%%off) > 8 |
|
152 |
+ movu m %+ %%idx, [srcq+%2-16] |
|
153 |
+%assign %%off %2 |
|
154 |
+%else |
|
155 |
+ movq m %+ %%idx, [srcq+%%off] |
|
156 |
+%assign %%off %%off+8 |
|
157 |
+%endif |
|
158 |
+%assign %%idx %%idx+1 |
|
159 |
+%endif ; (%2-%%off) >= 8 |
|
187 | 160 |
%endif |
188 | 161 |
|
189 |
-%if ARCH_X86_64 |
|
190 |
-%if (%2-%%src_off) == 8 |
|
191 |
- mov rax, [r1+%%src_off] |
|
192 |
-%assign %%src_off %%src_off+8 |
|
193 |
-%endif ; (%2-%%src_off) == 8 |
|
194 |
-%endif ; x86-64 |
|
195 |
- |
|
196 |
-%rep (%2-%%src_off)/8 |
|
197 |
- movq mm %+ %%smidx, [r1+%%src_off] |
|
198 |
-%assign %%src_off %%src_off+8 |
|
199 |
-%assign %%smidx %%smidx+1 |
|
200 |
-%endrep ; (%2-%%dst_off)/8 |
|
201 |
- |
|
202 |
-%if (%2-%%src_off) == 4 |
|
203 |
- mov vald, [r1+%%src_off] |
|
204 |
-%elif (%2-%%src_off) & 4 |
|
205 |
- movd mm %+ %%smidx, [r1+%%src_off] |
|
206 |
-%assign %%src_off %%src_off+4 |
|
207 |
-%endif ; (%2-%%src_off) ==/& 4 |
|
208 |
- |
|
209 |
-%if (%2-%%src_off) == 1 |
|
210 |
- mov vall, [r1+%%src_off] |
|
211 |
-%elif (%2-%%src_off) == 2 |
|
212 |
- mov valw, [r1+%%src_off] |
|
213 |
-%elif (%2-%%src_off) == 3 |
|
214 |
-%ifidn %1, top |
|
215 |
- mov valw2, [r1+%%src_off] |
|
162 |
+%if (%2-%%off) >= 4 |
|
163 |
+%if %2 > 8 && (%2-%%off) > 4 |
|
164 |
+ movq m %+ %%idx, [srcq+%2-8] |
|
165 |
+%assign %%off %2 |
|
166 |
+%else |
|
167 |
+ movd m %+ %%idx, [srcq+%%off] |
|
168 |
+%assign %%off %%off+4 |
|
169 |
+%endif |
|
170 |
+%assign %%idx %%idx+1 |
|
171 |
+%endif ; (%2-%%off) >= 4 |
|
172 |
+ |
|
173 |
+%if (%2-%%off) >= 1 |
|
174 |
+%if %2 >= 4 |
|
175 |
+ movd m %+ %%idx, [srcq+%2-4] |
|
176 |
+%elif (%2-%%off) == 1 |
|
177 |
+ mov valb, [srcq+%2-1] |
|
178 |
+%elif (%2-%%off) == 2 |
|
179 |
+ mov valw, [srcq+%2-2] |
|
216 | 180 |
%elifidn %1, body |
217 |
- mov valw3, [r1+%%src_off] |
|
218 |
-%elifidn %1, bottom |
|
219 |
- mov valw4, [r1+%%src_off] |
|
220 |
-%endif ; %1 ==/!= top |
|
221 |
- mov vall, [r1+%%src_off+2] |
|
222 |
-%endif ; (%2-%%src_off) == 1/2/3 |
|
181 |
+ mov vald, [srcq+%2-3] |
|
182 |
+%else |
|
183 |
+ movd m %+ %%idx, [srcq+%2-3] |
|
184 |
+%endif |
|
185 |
+%endif ; (%2-%%off) >= 1 |
|
223 | 186 |
%endmacro ; READ_NUM_BYTES |
224 | 187 |
|
225 | 188 |
%macro WRITE_NUM_BYTES 2 |
226 |
-%assign %%dst_off 0 ; offset in destination buffer |
|
227 |
-%assign %%dmidx 0 ; mmx register idx |
|
228 |
-%assign %%dxidx 0 ; xmm register idx |
|
229 |
- |
|
230 |
-%if cpuflag(sse) |
|
231 |
-%rep %2/16 |
|
232 |
- movups [r0+%%dst_off], xmm %+ %%dxidx |
|
233 |
-%assign %%dst_off %%dst_off+16 |
|
234 |
-%assign %%dxidx %%dxidx+1 |
|
235 |
-%endrep ; %2/16 |
|
189 |
+%assign %%off 0 ; offset in destination buffer |
|
190 |
+%assign %%idx 0 ; mmx/xmm register index |
|
191 |
+ |
|
192 |
+%rep %2/mmsize |
|
193 |
+ movu [dstq+%%off], m %+ %%idx |
|
194 |
+%assign %%off %%off+mmsize |
|
195 |
+%assign %%idx %%idx+1 |
|
196 |
+%endrep ; %2/mmsize |
|
197 |
+ |
|
198 |
+%if mmsize == 16 |
|
199 |
+%if (%2-%%off) >= 8 |
|
200 |
+%if %2 > 16 && (%2-%%off) > 8 |
|
201 |
+ movu [dstq+%2-16], m %+ %%idx |
|
202 |
+%assign %%off %2 |
|
203 |
+%else |
|
204 |
+ movq [dstq+%%off], m %+ %%idx |
|
205 |
+%assign %%off %%off+8 |
|
206 |
+%endif |
|
207 |
+%assign %%idx %%idx+1 |
|
208 |
+%endif ; (%2-%%off) >= 8 |
|
236 | 209 |
%endif |
237 | 210 |
|
238 |
-%if ARCH_X86_64 |
|
239 |
-%if (%2-%%dst_off) == 8 |
|
240 |
- mov [r0+%%dst_off], rax |
|
241 |
-%assign %%dst_off %%dst_off+8 |
|
242 |
-%endif ; (%2-%%dst_off) == 8 |
|
243 |
-%endif ; x86-64 |
|
244 |
- |
|
245 |
-%rep (%2-%%dst_off)/8 |
|
246 |
- movq [r0+%%dst_off], mm %+ %%dmidx |
|
247 |
-%assign %%dst_off %%dst_off+8 |
|
248 |
-%assign %%dmidx %%dmidx+1 |
|
249 |
-%endrep ; (%2-%%dst_off)/8 |
|
250 |
- |
|
251 |
-%if (%2-%%dst_off) == 4 |
|
252 |
- mov [r0+%%dst_off], vald |
|
253 |
-%elif (%2-%%dst_off) & 4 |
|
254 |
- movd [r0+%%dst_off], mm %+ %%dmidx |
|
255 |
-%assign %%dst_off %%dst_off+4 |
|
256 |
-%endif ; (%2-%%dst_off) ==/& 4 |
|
257 |
- |
|
258 |
-%if (%2-%%dst_off) == 1 |
|
259 |
- mov [r0+%%dst_off], vall |
|
260 |
-%elif (%2-%%dst_off) == 2 |
|
261 |
- mov [r0+%%dst_off], valw |
|
262 |
-%elif (%2-%%dst_off) == 3 |
|
263 |
-%ifidn %1, top |
|
264 |
- mov [r0+%%dst_off], valw2 |
|
211 |
+%if (%2-%%off) >= 4 |
|
212 |
+%if %2 > 8 && (%2-%%off) > 4 |
|
213 |
+ movq [dstq+%2-8], m %+ %%idx |
|
214 |
+%assign %%off %2 |
|
215 |
+%else |
|
216 |
+ movd [dstq+%%off], m %+ %%idx |
|
217 |
+%assign %%off %%off+4 |
|
218 |
+%endif |
|
219 |
+%assign %%idx %%idx+1 |
|
220 |
+%endif ; (%2-%%off) >= 4 |
|
221 |
+ |
|
222 |
+%if (%2-%%off) >= 1 |
|
223 |
+%if %2 >= 4 |
|
224 |
+ movd [dstq+%2-4], m %+ %%idx |
|
225 |
+%elif (%2-%%off) == 1 |
|
226 |
+ mov [dstq+%2-1], valb |
|
227 |
+%elif (%2-%%off) == 2 |
|
228 |
+ mov [dstq+%2-2], valw |
|
265 | 229 |
%elifidn %1, body |
266 |
- mov [r0+%%dst_off], valw3 |
|
267 |
-%elifidn %1, bottom |
|
268 |
- mov [r0+%%dst_off], valw4 |
|
269 |
-%endif ; %1 ==/!= top |
|
270 |
- mov [r0+%%dst_off+2], vall |
|
271 |
-%endif ; (%2-%%dst_off) == 1/2/3 |
|
230 |
+ mov [dstq+%2-3], valw |
|
231 |
+ shr vald, 16 |
|
232 |
+ mov [dstq+%2-1], valb |
|
233 |
+%else |
|
234 |
+ movd vald, m %+ %%idx |
|
235 |
+ mov [dstq+%2-3], valw |
|
236 |
+ shr vald, 16 |
|
237 |
+ mov [dstq+%2-1], valb |
|
238 |
+%endif |
|
239 |
+%endif ; (%2-%%off) >= 1 |
|
272 | 240 |
%endmacro ; WRITE_NUM_BYTES |
273 | 241 |
|
274 | 242 |
; vertical top/bottom extend and body copy fast loops |
275 | 243 |
; these are function pointers to set-width line copy functions, i.e. |
276 | 244 |
; they read a fixed number of pixels into set registers, and write |
277 | 245 |
; those out into the destination buffer |
278 |
-; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h |
|
279 |
-; r6(eax/64)/r3(ebx/32)=val_reg |
|
280 |
-%macro VERTICAL_EXTEND 0 |
|
281 |
-%assign %%n 1 |
|
282 |
-%rep 22 |
|
283 |
-ALIGN 128 |
|
284 |
-.emuedge_v_extend_ %+ %%n: |
|
285 |
- ; extend pixels above body |
|
246 |
+%macro VERTICAL_EXTEND 2 |
|
247 |
+%assign %%n %1 |
|
248 |
+%rep 1+%2-%1 |
|
249 |
+%if %%n <= 3 |
|
286 | 250 |
%if ARCH_X86_64 |
287 |
- test r3 , r3 ; if (!start_y) |
|
288 |
- jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body |
|
289 |
-%else ; ARCH_X86_32 |
|
290 |
- cmp dword r3m, 0 |
|
291 |
- je .emuedge_copy_body_ %+ %%n %+ _loop |
|
292 |
-%endif ; ARCH_X86_64/32 |
|
293 |
- READ_NUM_BYTES top, %%n ; read bytes |
|
294 |
-.emuedge_extend_top_ %+ %%n %+ _loop: ; do { |
|
295 |
- WRITE_NUM_BYTES top, %%n ; write bytes |
|
296 |
- add r0 , r2 ; dst += linesize |
|
251 |
+cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ |
|
252 |
+ start_y, end_y, val, bh |
|
253 |
+ mov bhq, r6mp ; r6mp = bhmp |
|
254 |
+%else ; x86-32 |
|
255 |
+cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh |
|
256 |
+ mov dstq, r0mp |
|
257 |
+ mov srcq, r2mp |
|
258 |
+ mov start_yq, r4mp |
|
259 |
+ mov end_yq, r5mp |
|
260 |
+ mov bhq, r6mp |
|
261 |
+%define dst_strideq r1mp |
|
262 |
+%define src_strideq r3mp |
|
263 |
+%endif ; x86-64/32 |
|
264 |
+%else |
|
297 | 265 |
%if ARCH_X86_64 |
298 |
- dec r3d |
|
299 |
-%else ; ARCH_X86_32 |
|
300 |
- dec dword r3m |
|
301 |
-%endif ; ARCH_X86_64/32 |
|
302 |
- jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y) |
|
266 |
+cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ |
|
267 |
+ start_y, end_y, bh |
|
268 |
+%else ; x86-32 |
|
269 |
+cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh |
|
270 |
+ mov srcq, r2mp |
|
271 |
+ mov start_yq, r4mp |
|
272 |
+ mov end_yq, r5mp |
|
273 |
+ mov bhq, r6mp |
|
274 |
+%define dst_strideq r1mp |
|
275 |
+%define src_strideq r3mp |
|
276 |
+%endif ; x86-64/32 |
|
277 |
+%endif |
|
278 |
+ ; FIXME move this to c wrapper? |
|
279 |
+ sub bhq, end_yq ; bh -= end_y |
|
280 |
+ sub end_yq, start_yq ; end_y -= start_y |
|
281 |
+ |
|
282 |
+ ; extend pixels above body |
|
283 |
+ test start_yq, start_yq ; if (start_y) { |
|
284 |
+ jz .body_loop |
|
285 |
+ READ_NUM_BYTES top, %%n ; $variable_regs = read($n) |
|
286 |
+.top_loop: ; do { |
|
287 |
+ WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n) |
|
288 |
+ add dstq, dst_strideq ; dst += linesize |
|
289 |
+ dec start_yq ; } while (--start_y) |
|
290 |
+ jnz .top_loop ; } |
|
303 | 291 |
|
304 | 292 |
; copy body pixels |
305 |
-.emuedge_copy_body_ %+ %%n %+ _loop: ; do { |
|
306 |
- READ_NUM_BYTES body, %%n ; read bytes |
|
307 |
- WRITE_NUM_BYTES body, %%n ; write bytes |
|
308 |
- add r0 , r2 ; dst += linesize |
|
309 |
- add r1 , r2 ; src += linesize |
|
310 |
- dec r4d |
|
311 |
- jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y) |
|
293 |
+.body_loop: ; do { |
|
294 |
+ READ_NUM_BYTES body, %%n ; $variable_regs = read($n) |
|
295 |
+ WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n) |
|
296 |
+ add dstq, dst_strideq ; dst += dst_stride |
|
297 |
+ add srcq, src_strideq ; src += src_stride |
|
298 |
+ dec end_yq ; } while (--end_y) |
|
299 |
+ jnz .body_loop |
|
312 | 300 |
|
313 | 301 |
; copy bottom pixels |
314 |
- test r5 , r5 ; if (!block_h) |
|
315 |
- jz .emuedge_v_extend_end_ %+ %%n ; goto end |
|
316 |
- sub r1 , r2 ; src -= linesize |
|
317 |
- READ_NUM_BYTES bottom, %%n ; read bytes |
|
318 |
-.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do { |
|
319 |
- WRITE_NUM_BYTES bottom, %%n ; write bytes |
|
320 |
- add r0 , r2 ; dst += linesize |
|
321 |
- dec r5d |
|
322 |
- jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h) |
|
323 |
- |
|
324 |
-.emuedge_v_extend_end_ %+ %%n: |
|
325 |
-%if ARCH_X86_64 |
|
326 |
- ret |
|
327 |
-%else ; ARCH_X86_32 |
|
328 |
- rep ret |
|
329 |
-%endif ; ARCH_X86_64/32 |
|
302 |
+ test bhq, bhq ; if (block_h) { |
|
303 |
+ jz .end |
|
304 |
+ sub srcq, src_strideq ; src -= linesize |
|
305 |
+ READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n) |
|
306 |
+.bottom_loop: ; do { |
|
307 |
+ WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n) |
|
308 |
+ add dstq, dst_strideq ; dst += linesize |
|
309 |
+ dec bhq ; } while (--bh) |
|
310 |
+ jnz .bottom_loop ; } |
|
311 |
+ |
|
312 |
+.end: |
|
313 |
+ RET |
|
330 | 314 |
%assign %%n %%n+1 |
331 |
-%endrep |
|
332 |
-%endmacro VERTICAL_EXTEND |
|
315 |
+%endrep ; 1+%2-%1 |
|
316 |
+%endmacro ; VERTICAL_EXTEND |
|
317 |
+ |
|
318 |
+INIT_MMX mmx |
|
319 |
+VERTICAL_EXTEND 1, 15 |
|
320 |
+%if ARCH_X86_32 |
|
321 |
+VERTICAL_EXTEND 16, 22 |
|
322 |
+%endif |
|
323 |
+ |
|
324 |
+INIT_XMM sse |
|
325 |
+VERTICAL_EXTEND 16, 22 |
|
333 | 326 |
|
334 | 327 |
; left/right (horizontal) fast extend functions |
335 | 328 |
; these are essentially identical to the vertical extend ones above, |
336 | 329 |
; just left/right separated because number of pixels to extend is |
337 | 330 |
; obviously not the same on both sides. |
338 |
-; for reading, pixels are placed in eax (x86-64) or ebx (x86-64) in the |
|
339 |
-; lowest two bytes of the register (so val*0x0101), and are splatted |
|
340 |
-; into each byte of mm0 as well if n_pixels >= 8 |
|
341 | 331 |
|
342 | 332 |
%macro READ_V_PIXEL 2 |
343 |
- mov vall, %2 |
|
344 |
- mov valh, vall |
|
345 |
-%if %1 >= 8 |
|
346 |
- movd mm0, vald |
|
347 |
-%if cpuflag(mmxext) |
|
348 |
- pshufw mm0, mm0, 0 |
|
349 |
-%else ; mmx |
|
350 |
- punpcklwd mm0, mm0 |
|
351 |
- punpckldq mm0, mm0 |
|
352 |
-%endif ; sse |
|
353 |
-%endif ; %1 >= 8 |
|
354 |
-%endmacro |
|
355 |
- |
|
356 |
-%macro WRITE_V_PIXEL 2 |
|
357 |
-%assign %%dst_off 0 |
|
358 |
-%rep %1/8 |
|
359 |
- movq [%2+%%dst_off], mm0 |
|
360 |
-%assign %%dst_off %%dst_off+8 |
|
361 |
-%endrep |
|
362 |
-%if %1 & 4 |
|
333 |
+%if %1 == 2 |
|
334 |
+ movzx valw, byte %2 |
|
335 |
+ imul valw, 0x0101 |
|
336 |
+%else |
|
337 |
+ movzx vald, byte %2 |
|
338 |
+ imul vald, 0x01010101 |
|
363 | 339 |
%if %1 >= 8 |
364 |
- movd [%2+%%dst_off], mm0 |
|
365 |
-%else ; %1 < 8 |
|
366 |
- mov [%2+%%dst_off] , valw |
|
367 |
- mov [%2+%%dst_off+2], valw |
|
368 |
-%endif ; %1 >=/< 8 |
|
369 |
-%assign %%dst_off %%dst_off+4 |
|
370 |
-%endif ; %1 & 4 |
|
371 |
-%if %1&2 |
|
372 |
- mov [%2+%%dst_off], valw |
|
373 |
-%endif ; %1 & 2 |
|
374 |
-%endmacro |
|
375 |
- |
|
376 |
-; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val |
|
377 |
-%macro LEFT_EXTEND 0 |
|
378 |
-%assign %%n 2 |
|
379 |
-%rep 11 |
|
380 |
-ALIGN 64 |
|
381 |
-.emuedge_extend_left_ %+ %%n: ; do { |
|
382 |
- sub r0, r2 ; dst -= linesize |
|
383 |
- READ_V_PIXEL %%n, [r0+r1] ; read pixels |
|
384 |
- WRITE_V_PIXEL %%n, r0 ; write pixels |
|
385 |
- dec r5 |
|
386 |
- jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h) |
|
387 |
-%if ARCH_X86_64 |
|
388 |
- ret |
|
389 |
-%else ; ARCH_X86_32 |
|
390 |
- rep ret |
|
391 |
-%endif ; ARCH_X86_64/32 |
|
392 |
-%assign %%n %%n+2 |
|
393 |
-%endrep |
|
394 |
-%endmacro ; LEFT_EXTEND |
|
395 |
- |
|
396 |
-; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val |
|
397 |
-%macro RIGHT_EXTEND 0 |
|
398 |
-%assign %%n 2 |
|
399 |
-%rep 11 |
|
400 |
-ALIGN 64 |
|
401 |
-.emuedge_extend_right_ %+ %%n: ; do { |
|
402 |
-%if ARCH_X86_64 |
|
403 |
- sub r3, r2 ; dst -= linesize |
|
404 |
- READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels |
|
405 |
- WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels |
|
406 |
- dec r8 |
|
407 |
-%else ; ARCH_X86_32 |
|
408 |
- sub r0, r2 ; dst -= linesize |
|
409 |
- READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels |
|
410 |
- WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels |
|
411 |
- dec r5 |
|
412 |
-%endif ; ARCH_X86_64/32 |
|
413 |
- jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h) |
|
414 |
-%if ARCH_X86_64 |
|
415 |
- ret |
|
416 |
-%else ; ARCH_X86_32 |
|
417 |
- rep ret |
|
418 |
-%endif ; ARCH_X86_64/32 |
|
419 |
-%assign %%n %%n+2 |
|
420 |
-%endrep |
|
421 |
- |
|
422 |
-%if ARCH_X86_32 |
|
423 |
-%define stack_offset 0x10 |
|
424 |
-%endif |
|
425 |
-%endmacro ; RIGHT_EXTEND |
|
426 |
- |
|
427 |
-; below follow the "slow" copy/extend functions, these act on a non-fixed |
|
428 |
-; width specified in a register, and run a loop to copy the full amount |
|
429 |
-; of bytes. They are optimized for copying of large amounts of pixels per |
|
430 |
-; line, so they unconditionally splat data into mm registers to copy 8 |
|
431 |
-; bytes per loop iteration. It could be considered to use xmm for x86-64 |
|
432 |
-; also, but I haven't optimized this as much (i.e. FIXME) |
|
433 |
-%macro V_COPY_NPX 4-5 |
|
434 |
-%if %0 == 4 |
|
435 |
- test w_reg, %4 |
|
436 |
- jz .%1_skip_%4_px |
|
437 |
-%else ; %0 == 5 |
|
438 |
-.%1_%4_px_loop: |
|
340 |
+ movd m0, vald |
|
341 |
+%if mmsize == 16 |
|
342 |
+ shufps m0, m0, q0000 |
|
343 |
+%else |
|
344 |
+ punpckldq m0, m0 |
|
439 | 345 |
%endif |
440 |
- %3 %2, [r1+cnt_reg] |
|
441 |
- %3 [r0+cnt_reg], %2 |
|
442 |
- add cnt_reg, %4 |
|
443 |
-%if %0 == 5 |
|
444 |
- sub w_reg, %4 |
|
445 |
- test w_reg, %5 |
|
446 |
- jnz .%1_%4_px_loop |
|
346 |
+%endif ; %1 >= 8 |
|
447 | 347 |
%endif |
448 |
-.%1_skip_%4_px: |
|
449 |
-%endmacro |
|
348 |
+%endmacro ; READ_V_PIXEL |
|
450 | 349 |
|
451 |
-%macro V_COPY_ROW 2 |
|
452 |
-%ifidn %1, bottom |
|
453 |
- sub r1, linesize |
|
350 |
+%macro WRITE_V_PIXEL 2 |
|
351 |
+%assign %%off 0 |
|
352 |
+%rep %1/mmsize |
|
353 |
+ movu [%2+%%off], m0 |
|
354 |
+%assign %%off %%off+mmsize |
|
355 |
+%endrep ; %1/mmsize |
|
356 |
+ |
|
357 |
+%if mmsize == 16 |
|
358 |
+%if %1-%%off >= 8 |
|
359 |
+%if %1 > 16 && %1-%%off > 8 |
|
360 |
+ movu [%2+%1-16], m0 |
|
361 |
+%assign %%off %1 |
|
362 |
+%else |
|
363 |
+ movq [%2+%%off], m0 |
|
364 |
+%assign %%off %%off+8 |
|
454 | 365 |
%endif |
455 |
-.%1_copy_loop: |
|
456 |
- xor cnt_reg, cnt_reg |
|
457 |
-%if notcpuflag(sse) |
|
458 |
-%define linesize r2m |
|
459 |
- V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8 |
|
460 |
-%else ; sse |
|
461 |
- V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0 |
|
462 |
-%if ARCH_X86_64 |
|
463 |
-%define linesize r2 |
|
464 |
- V_COPY_NPX %1, rax , mov, 8 |
|
465 |
-%else ; ARCH_X86_32 |
|
466 |
-%define linesize r2m |
|
467 |
- V_COPY_NPX %1, mm0, movq, 8 |
|
468 |
-%endif ; ARCH_X86_64/32 |
|
469 |
-%endif ; sse |
|
470 |
- V_COPY_NPX %1, vald, mov, 4 |
|
471 |
- V_COPY_NPX %1, valw, mov, 2 |
|
472 |
- V_COPY_NPX %1, vall, mov, 1 |
|
473 |
- mov w_reg, cnt_reg |
|
474 |
-%ifidn %1, body |
|
475 |
- add r1, linesize |
|
366 |
+%endif ; %1-%%off >= 8 |
|
476 | 367 |
%endif |
477 |
- add r0, linesize |
|
478 |
- dec %2 |
|
479 |
- jnz .%1_copy_loop |
|
480 |
-%endmacro |
|
481 | 368 |
|
482 |
-%macro SLOW_V_EXTEND 0 |
|
483 |
-.slow_v_extend_loop: |
|
484 |
-; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h |
|
485 |
-; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x |
|
486 |
-%if ARCH_X86_64 |
|
487 |
- push r8 ; save old value of block_h |
|
488 |
- test r3, r3 |
|
489 |
-%define cnt_reg r8 |
|
490 |
- jz .do_body_copy ; if (!start_y) goto do_body_copy |
|
491 |
- V_COPY_ROW top, r3 |
|
369 |
+%if %1-%%off >= 4 |
|
370 |
+%if %1 > 8 %% %1-%%off > 4 |
|
371 |
+ movq [%2+%1-8], m0 |
|
372 |
+%assign %%off %1 |
|
373 |
+%elif %1 >= 8 && %1-%%off >= 4 |
|
374 |
+ movd [%2+%%off], m0 |
|
375 |
+%assign %%off %%off+4 |
|
492 | 376 |
%else |
493 |
- cmp dword r3m, 0 |
|
494 |
-%define cnt_reg r2 |
|
495 |
- je .do_body_copy ; if (!start_y) goto do_body_copy |
|
496 |
- V_COPY_ROW top, dword r3m |
|
377 |
+ mov [%2+%%off], vald |
|
378 |
+%assign %%off %%off+4 |
|
497 | 379 |
%endif |
380 |
+%endif ; %1-%%off >= 4 |
|
498 | 381 |
|
499 |
-.do_body_copy: |
|
500 |
- V_COPY_ROW body, r4 |
|
501 |
- |
|
502 |
-%if ARCH_X86_64 |
|
503 |
- pop r8 ; restore old value of block_h |
|
504 |
-%define cnt_reg r3 |
|
505 |
-%endif |
|
506 |
- test r5, r5 |
|
507 |
-%if ARCH_X86_64 |
|
508 |
- jz .v_extend_end |
|
382 |
+%if %1-%%off >= 2 |
|
383 |
+%if %1 >= 8 |
|
384 |
+ movd [%2+%1-4], m0 |
|
509 | 385 |
%else |
510 |
- jz .skip_bottom_extend |
|
386 |
+ mov [%2+%%off], valw |
|
511 | 387 |
%endif |
512 |
- V_COPY_ROW bottom, r5 |
|
513 |
-%if ARCH_X86_32 |
|
514 |
-.skip_bottom_extend: |
|
515 |
- mov r2, r2m |
|
516 |
-%endif |
|
517 |
- jmp .v_extend_end |
|
518 |
-%endmacro |
|
388 |
+%endif ; (%1-%%off)/2 |
|
389 |
+%endmacro ; WRITE_V_PIXEL |
|
390 |
+ |
|
391 |
+%macro H_EXTEND 2 |
|
392 |
+%assign %%n %1 |
|
393 |
+%rep 1+(%2-%1)/2 |
|
394 |
+cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val |
|
395 |
+.loop_y: ; do { |
|
396 |
+ READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) |
|
397 |
+ WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) |
|
398 |
+ add dstq, dst_strideq ; dst += dst_stride |
|
399 |
+ dec bhq ; } while (--bh) |
|
400 |
+ jnz .loop_y |
|
401 |
+ RET |
|
402 |
+%assign %%n %%n+2 |
|
403 |
+%endrep ; 1+(%2-%1)/2 |
|
404 |
+%endmacro ; H_EXTEND |
|
519 | 405 |
|
520 |
-%macro SLOW_LEFT_EXTEND 0 |
|
521 |
-.slow_left_extend_loop: |
|
522 |
-; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x |
|
523 |
- mov r4, 8 |
|
524 |
- sub r0, linesize |
|
525 |
- READ_V_PIXEL 8, [r0+w_reg] |
|
526 |
-.left_extend_8px_loop: |
|
527 |
- movq [r0+r4-8], mm0 |
|
528 |
- add r4, 8 |
|
529 |
- cmp r4, w_reg |
|
530 |
- jle .left_extend_8px_loop |
|
531 |
- sub r4, 8 |
|
532 |
- cmp r4, w_reg |
|
533 |
- jge .left_extend_loop_end |
|
534 |
-.left_extend_2px_loop: |
|
535 |
- mov [r0+r4], valw |
|
536 |
- add r4, 2 |
|
537 |
- cmp r4, w_reg |
|
538 |
- jl .left_extend_2px_loop |
|
539 |
-.left_extend_loop_end: |
|
540 |
- dec r5 |
|
541 |
- jnz .slow_left_extend_loop |
|
406 |
+INIT_MMX mmx |
|
407 |
+H_EXTEND 2, 14 |
|
542 | 408 |
%if ARCH_X86_32 |
543 |
- mov r2, r2m |
|
544 |
-%endif |
|
545 |
- jmp .right_extend |
|
546 |
-%endmacro |
|
547 |
- |
|
548 |
-%macro SLOW_RIGHT_EXTEND 0 |
|
549 |
-.slow_right_extend_loop: |
|
550 |
-; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h, |
|
551 |
-; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr |
|
552 |
-%if ARCH_X86_64 |
|
553 |
-%define buf_reg r3 |
|
554 |
-%define bh_reg r8 |
|
555 |
-%else |
|
556 |
-%define buf_reg r0 |
|
557 |
-%define bh_reg r5 |
|
409 |
+H_EXTEND 16, 22 |
|
558 | 410 |
%endif |
559 |
- lea r1, [r4-8] |
|
560 |
- sub buf_reg, linesize |
|
561 |
- READ_V_PIXEL 8, [buf_reg+w_reg-1] |
|
562 |
-.right_extend_8px_loop: |
|
563 |
- movq [buf_reg+r1], mm0 |
|
564 |
- sub r1, 8 |
|
565 |
- cmp r1, w_reg |
|
566 |
- jge .right_extend_8px_loop |
|
567 |
- add r1, 8 |
|
568 |
- cmp r1, w_reg |
|
569 |
- je .right_extend_loop_end |
|
570 |
-.right_extend_2px_loop: |
|
571 |
- sub r1, 2 |
|
572 |
- mov [buf_reg+r1], valw |
|
573 |
- cmp r1, w_reg |
|
574 |
- jg .right_extend_2px_loop |
|
575 |
-.right_extend_loop_end: |
|
576 |
- dec bh_reg |
|
577 |
- jnz .slow_right_extend_loop |
|
578 |
- jmp .h_extend_end |
|
579 |
-%endmacro |
|
580 |
- |
|
581 |
-%macro emu_edge 1 |
|
582 |
-INIT_XMM %1 |
|
583 |
-EMU_EDGE_FUNC |
|
584 |
-VERTICAL_EXTEND |
|
585 |
-LEFT_EXTEND |
|
586 |
-RIGHT_EXTEND |
|
587 |
-SLOW_V_EXTEND |
|
588 |
-SLOW_LEFT_EXTEND |
|
589 |
-SLOW_RIGHT_EXTEND |
|
590 |
-%endmacro |
|
591 | 411 |
|
592 |
-emu_edge sse |
|
593 |
-%if ARCH_X86_32 |
|
594 |
-emu_edge mmx |
|
595 |
-%endif |
|
412 |
+INIT_XMM sse |
|
413 |
+H_EXTEND 16, 22 |
|
596 | 414 |
|
597 | 415 |
%macro PREFETCH_FN 1 |
598 | 416 |
cglobal prefetch, 3, 3, 0, buf, stride, h |
... | ... |
@@ -30,32 +30,126 @@ |
30 | 30 |
#include "libavcodec/videodsp.h" |
31 | 31 |
|
32 | 32 |
#if HAVE_YASM |
33 |
-typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, |
|
34 |
- x86_reg linesize, x86_reg start_y, |
|
35 |
- x86_reg end_y, x86_reg block_h, |
|
36 |
- x86_reg start_x, x86_reg end_x, |
|
37 |
- x86_reg block_w); |
|
38 |
-extern emu_edge_core_func ff_emu_edge_core_mmx; |
|
39 |
-extern emu_edge_core_func ff_emu_edge_core_sse; |
|
40 |
- |
|
41 |
-static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src, |
|
42 |
- ptrdiff_t linesize, |
|
43 |
- int block_w, int block_h, |
|
44 |
- int src_x, int src_y, |
|
45 |
- int w, int h, |
|
46 |
- emu_edge_core_func *core_fn) |
|
33 |
+typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, |
|
34 |
+ const uint8_t *src, x86_reg src_stride, |
|
35 |
+ x86_reg start_y, x86_reg end_y, x86_reg bh); |
|
36 |
+typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, |
|
37 |
+ const uint8_t *src, x86_reg src_stride, |
|
38 |
+ x86_reg start_y, x86_reg end_y, x86_reg bh, |
|
39 |
+ x86_reg w); |
|
40 |
+ |
|
41 |
+extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; |
|
42 |
+extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; |
|
43 |
+extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; |
|
44 |
+extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; |
|
45 |
+extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx; |
|
46 |
+extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; |
|
47 |
+extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; |
|
48 |
+extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; |
|
49 |
+extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; |
|
50 |
+extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; |
|
51 |
+extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; |
|
52 |
+extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; |
|
53 |
+extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; |
|
54 |
+extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; |
|
55 |
+extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; |
|
56 |
+extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx; |
|
57 |
+extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx; |
|
58 |
+extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx; |
|
59 |
+extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx; |
|
60 |
+extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx; |
|
61 |
+extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx; |
|
62 |
+extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx; |
|
63 |
+#if ARCH_X86_32 |
|
64 |
+static emu_edge_vfix_func *vfixtbl_mmx[22] = { |
|
65 |
+ &ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx, |
|
66 |
+ &ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx, |
|
67 |
+ &ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx, |
|
68 |
+ &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx, |
|
69 |
+ &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx, |
|
70 |
+ &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx, |
|
71 |
+ &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx, |
|
72 |
+ &ff_emu_edge_vfix22_mmx |
|
73 |
+}; |
|
74 |
+#endif |
|
75 |
+extern emu_edge_vvar_func ff_emu_edge_vvar_mmx; |
|
76 |
+extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; |
|
77 |
+extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; |
|
78 |
+extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; |
|
79 |
+extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; |
|
80 |
+extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; |
|
81 |
+extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; |
|
82 |
+extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; |
|
83 |
+static emu_edge_vfix_func *vfixtbl_sse[22] = { |
|
84 |
+ ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, |
|
85 |
+ ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, |
|
86 |
+ ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, |
|
87 |
+ ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, |
|
88 |
+ ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, |
|
89 |
+ ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, |
|
90 |
+ ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, |
|
91 |
+ ff_emu_edge_vfix22_sse |
|
92 |
+}; |
|
93 |
+extern emu_edge_vvar_func ff_emu_edge_vvar_sse; |
|
94 |
+ |
|
95 |
+typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, |
|
96 |
+ x86_reg start_x, x86_reg bh); |
|
97 |
+typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, |
|
98 |
+ x86_reg start_x, x86_reg n_words, x86_reg bh); |
|
99 |
+ |
|
100 |
+extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; |
|
101 |
+extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; |
|
102 |
+extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; |
|
103 |
+extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx; |
|
104 |
+extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; |
|
105 |
+extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; |
|
106 |
+extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; |
|
107 |
+extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx; |
|
108 |
+extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx; |
|
109 |
+extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx; |
|
110 |
+extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx; |
|
111 |
+#if ARCH_X86_32 |
|
112 |
+static emu_edge_hfix_func *hfixtbl_mmx[11] = { |
|
113 |
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, |
|
114 |
+ ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, |
|
115 |
+ ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx, |
|
116 |
+ ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx |
|
117 |
+}; |
|
118 |
+#endif |
|
119 |
+extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; |
|
120 |
+extern emu_edge_hfix_func ff_emu_edge_hfix16_sse; |
|
121 |
+extern emu_edge_hfix_func ff_emu_edge_hfix18_sse; |
|
122 |
+extern emu_edge_hfix_func ff_emu_edge_hfix20_sse; |
|
123 |
+extern emu_edge_hfix_func ff_emu_edge_hfix22_sse; |
|
124 |
+static emu_edge_hfix_func *hfixtbl_sse[11] = { |
|
125 |
+ ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, |
|
126 |
+ ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, |
|
127 |
+ ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse, ff_emu_edge_hfix18_sse, |
|
128 |
+ ff_emu_edge_hfix20_sse, ff_emu_edge_hfix22_sse |
|
129 |
+}; |
|
130 |
+extern emu_edge_hvar_func ff_emu_edge_hvar_sse; |
|
131 |
+ |
|
132 |
+static av_always_inline void emulated_edge_mc(uint8_t *dst, ptrdiff_t dst_stride, |
|
133 |
+ const uint8_t *src, ptrdiff_t src_stride, |
|
134 |
+ x86_reg block_w, x86_reg block_h, |
|
135 |
+ x86_reg src_x, x86_reg src_y, |
|
136 |
+ x86_reg w, x86_reg h, |
|
137 |
+ emu_edge_vfix_func **vfix_tbl, |
|
138 |
+ emu_edge_vvar_func *v_extend_var, |
|
139 |
+ emu_edge_hfix_func **hfix_tbl, |
|
140 |
+ emu_edge_hvar_func *h_extend_var) |
|
47 | 141 |
{ |
48 |
- int start_y, start_x, end_y, end_x, src_y_add = 0; |
|
142 |
+ x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; |
|
49 | 143 |
|
50 | 144 |
if(!w || !h) |
51 | 145 |
return; |
52 | 146 |
|
53 | 147 |
if (src_y >= h) { |
54 |
- src -= src_y*linesize; |
|
148 |
+ src -= src_y*src_stride; |
|
55 | 149 |
src_y_add = h - 1; |
56 | 150 |
src_y = h - 1; |
57 | 151 |
} else if (src_y <= -block_h) { |
58 |
- src -= src_y*linesize; |
|
152 |
+ src -= src_y*src_stride; |
|
59 | 153 |
src_y_add = 1 - block_h; |
60 | 154 |
src_y = 1 - block_h; |
61 | 155 |
} |
... | ... |
@@ -75,30 +169,59 @@ static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src, |
75 | 75 |
av_assert2(start_y < end_y && block_h > 0); |
76 | 76 |
|
77 | 77 |
// fill in the to-be-copied part plus all above/below |
78 |
- src += (src_y_add + start_y) * linesize + start_x; |
|
79 |
- buf += start_x; |
|
80 |
- core_fn(buf, src, linesize, start_y, end_y, |
|
81 |
- block_h, start_x, end_x, block_w); |
|
78 |
+ src += (src_y_add + start_y) * src_stride + start_x; |
|
79 |
+ w = end_x - start_x; |
|
80 |
+ if (w <= 22) { |
|
81 |
+ vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, |
|
82 |
+ start_y, end_y, block_h); |
|
83 |
+ } else { |
|
84 |
+ v_extend_var(dst + start_x, dst_stride, src, src_stride, |
|
85 |
+ start_y, end_y, block_h, w); |
|
86 |
+ } |
|
87 |
+ |
|
88 |
+ // fill left |
|
89 |
+ if (start_x) { |
|
90 |
+ if (start_x <= 22) { |
|
91 |
+ hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); |
|
92 |
+ } else { |
|
93 |
+ h_extend_var(dst, dst_stride, |
|
94 |
+ start_x, (start_x + 1) >> 1, block_h); |
|
95 |
+ } |
|
96 |
+ } |
|
97 |
+ |
|
98 |
+ // fill right |
|
99 |
+ p = block_w - end_x; |
|
100 |
+ if (p) { |
|
101 |
+ if (p <= 22) { |
|
102 |
+ hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, |
|
103 |
+ -!(p & 1), block_h); |
|
104 |
+ } else { |
|
105 |
+ h_extend_var(dst + end_x - (p & 1), dst_stride, |
|
106 |
+ -!(p & 1), (p + 1) >> 1, block_h); |
|
107 |
+ } |
|
108 |
+ } |
|
82 | 109 |
} |
83 | 110 |
|
84 | 111 |
#if ARCH_X86_32 |
85 |
-static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, |
|
86 |
- ptrdiff_t linesize, |
|
112 |
+static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, ptrdiff_t buf_stride, |
|
113 |
+ const uint8_t *src, ptrdiff_t src_stride, |
|
87 | 114 |
int block_w, int block_h, |
88 | 115 |
int src_x, int src_y, int w, int h) |
89 | 116 |
{ |
90 |
- emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, |
|
91 |
- w, h, &ff_emu_edge_core_mmx); |
|
117 |
+ emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, |
|
118 |
+ src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, |
|
119 |
+ hfixtbl_mmx, &ff_emu_edge_hvar_mmx); |
|
92 | 120 |
} |
93 | 121 |
#endif |
94 | 122 |
|
95 |
-static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, |
|
96 |
- ptrdiff_t linesize, |
|
123 |
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, ptrdiff_t buf_stride, |
|
124 |
+ const uint8_t *src, ptrdiff_t src_stride, |
|
97 | 125 |
int block_w, int block_h, |
98 | 126 |
int src_x, int src_y, int w, int h) |
99 | 127 |
{ |
100 |
- emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, |
|
101 |
- w, h, &ff_emu_edge_core_sse); |
|
128 |
+ emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, |
|
129 |
+ src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, |
|
130 |
+ hfixtbl_sse, &ff_emu_edge_hvar_sse); |
|
102 | 131 |
} |
103 | 132 |
#endif /* HAVE_YASM */ |
104 | 133 |
|