Also implement MMX/MMX2 versions and SSE4 versions.
| ... | ... |
@@ -29,8 +29,11 @@ max_19bit_int: times 4 dd 0x7ffff |
| 29 | 29 |
max_19bit_flt: times 4 dd 524287.0 |
| 30 | 30 |
minshort: times 8 dw 0x8000 |
| 31 | 31 |
unicoeff: times 4 dd 0x20000000 |
| 32 |
+yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000 |
|
| 32 | 33 |
yuv2yuvX_10_start: times 4 dd 0x10000 |
| 34 |
+yuv2yuvX_9_start: times 4 dd 0x20000 |
|
| 33 | 35 |
yuv2yuvX_10_upper: times 8 dw 0x3ff |
| 36 |
+yuv2yuvX_9_upper: times 8 dw 0x1ff |
|
| 34 | 37 |
|
| 35 | 38 |
SECTION .text |
| 36 | 39 |
|
| ... | ... |
@@ -447,58 +450,216 @@ SCALE_FUNCS2 sse4, 6, 6, 8 |
| 447 | 447 |
; of 2. $offset is either 0 or 3. $dither holds 8 values. |
| 448 | 448 |
;----------------------------------------------------------------------------- |
| 449 | 449 |
|
| 450 |
-%macro yuv2planeX10 1 |
|
| 450 |
+%macro yuv2planeX_fn 4 |
|
| 451 | 451 |
|
| 452 | 452 |
%ifdef ARCH_X86_32 |
| 453 | 453 |
%define cntr_reg r1 |
| 454 |
+%define movsx mov |
|
| 454 | 455 |
%else |
| 455 | 456 |
%define cntr_reg r11 |
| 457 |
+%define movsx movsxd |
|
| 456 | 458 |
%endif |
| 457 | 459 |
|
| 458 |
-cglobal yuv2planeX10_%1, 7, 7 |
|
| 459 |
- xor r5, r5 |
|
| 460 |
-.pixelloop |
|
| 461 |
- mova m1, [yuv2yuvX_10_start] |
|
| 462 |
- mova m2, m1 |
|
| 463 |
- movsxdifnidn cntr_reg, r1d |
|
| 464 |
-.filterloop |
|
| 465 |
- pxor m0, m0 |
|
| 466 |
- |
|
| 467 |
- mov r6, [r2+gprsize*cntr_reg-2*gprsize] |
|
| 468 |
- mova m3, [r6+r5] |
|
| 469 |
- |
|
| 470 |
- mov r6, [r2+gprsize*cntr_reg-gprsize] |
|
| 471 |
- mova m4, [r6+r5] |
|
| 472 |
- |
|
| 473 |
- punpcklwd m5, m3, m4 |
|
| 474 |
- punpckhwd m3, m4 |
|
| 475 |
- |
|
| 476 |
- movd m0, [r0+2*cntr_reg-4] |
|
| 477 |
- SPLATD m0, m0 |
|
| 478 |
- |
|
| 479 |
- pmaddwd m5, m0 |
|
| 480 |
- pmaddwd m3, m0 |
|
| 481 |
- |
|
| 482 |
- paddd m2, m5 |
|
| 483 |
- paddd m1, m3 |
|
| 484 |
- |
|
| 485 |
- sub cntr_reg, 2 |
|
| 486 |
- jg .filterloop |
|
| 460 |
+cglobal yuv2planeX_%2_%1, %4, 7, %3 |
|
| 461 |
+%if %2 == 8 || %2 == 9 || %2 == 10 |
|
| 462 |
+ pxor m6, m6 |
|
| 463 |
+%endif ; %2 == 8/9/10 |
|
| 487 | 464 |
|
| 488 |
- psrad m2, 17 |
|
| 489 |
- psrad m1, 17 |
|
| 465 |
+%if %2 == 8 |
|
| 466 |
+%ifdef ARCH_X86_32 |
|
| 467 |
+%assign pad 0x2c - (stack_offset & 15) |
|
| 468 |
+ SUB rsp, pad |
|
| 469 |
+%define m_dith m7 |
|
| 470 |
+%else ; x86-64 |
|
| 471 |
+%define m_dith m9 |
|
| 472 |
+%endif ; x86-32 |
|
| 473 |
+ |
|
| 474 |
+ ; create registers holding dither |
|
| 475 |
+ movq m_dith, [r5] ; dither |
|
| 476 |
+ test r6d, r6d |
|
| 477 |
+ jz .no_rot |
|
| 478 |
+%if mmsize == 16 |
|
| 479 |
+ punpcklqdq m_dith, m_dith |
|
| 480 |
+%endif ; mmsize == 16 |
|
| 481 |
+ PALIGNR m_dith, m_dith, 3, m0 |
|
| 482 |
+.no_rot: |
|
| 483 |
+%if mmsize == 16 |
|
| 484 |
+ punpcklbw m_dith, m6 |
|
| 485 |
+%ifdef ARCH_X86_64 |
|
| 486 |
+ punpcklwd m8, m_dith, m6 |
|
| 487 |
+ pslld m8, 12 |
|
| 488 |
+%else ; x86-32 |
|
| 489 |
+ punpcklwd m5, m_dith, m6 |
|
| 490 |
+ pslld m5, 12 |
|
| 491 |
+%endif ; x86-32/64 |
|
| 492 |
+ punpckhwd m_dith, m6 |
|
| 493 |
+ pslld m_dith, 12 |
|
| 494 |
+%ifdef ARCH_X86_32 |
|
| 495 |
+ mova [rsp+ 0], m5 |
|
| 496 |
+ mova [rsp+16], m_dith |
|
| 497 |
+%endif |
|
| 498 |
+%else ; mmsize == 8 |
|
| 499 |
+ punpcklbw m5, m_dith, m6 |
|
| 500 |
+ punpckhbw m_dith, m6 |
|
| 501 |
+ punpcklwd m4, m5, m6 |
|
| 502 |
+ punpckhwd m5, m6 |
|
| 503 |
+ punpcklwd m3, m_dith, m6 |
|
| 504 |
+ punpckhwd m_dith, m6 |
|
| 505 |
+ pslld m4, 12 |
|
| 506 |
+ pslld m5, 12 |
|
| 507 |
+ pslld m3, 12 |
|
| 508 |
+ pslld m_dith, 12 |
|
| 509 |
+ mova [rsp+ 0], m4 |
|
| 510 |
+ mova [rsp+ 8], m5 |
|
| 511 |
+ mova [rsp+16], m3 |
|
| 512 |
+ mova [rsp+24], m_dith |
|
| 513 |
+%endif ; mmsize == 8/16 |
|
| 514 |
+%endif ; %2 == 8 |
|
| 490 | 515 |
|
| 491 |
- packusdw m2, m1 |
|
| 492 |
- pminsw m2, [yuv2yuvX_10_upper] |
|
| 493 |
- mova [r3+r5], m2 |
|
| 516 |
+ xor r5, r5 |
|
| 494 | 517 |
|
| 495 |
- add r5, mmsize |
|
| 496 |
- sub r4d, mmsize/2 |
|
| 518 |
+.pixelloop |
|
| 519 |
+%assign %%i 0 |
|
| 520 |
+ ; the rep here is for the 8bit output mmx case, where dither covers |
|
| 521 |
+ ; 8 pixels but we can only handle 2 pixels per register, and thus 4 |
|
| 522 |
+ ; pixels per iteration. In order to not have to keep track of where |
|
| 523 |
+ ; we are w.r.t. dithering, we unroll the mmx/8bit loop x2. |
|
| 524 |
+%if %2 == 8 |
|
| 525 |
+%rep 16/mmsize |
|
| 526 |
+%endif ; %2 == 8 |
|
| 527 |
+ |
|
| 528 |
+%if %2 == 8 |
|
| 529 |
+%ifdef ARCH_X86_32 |
|
| 530 |
+ mova m2, [rsp+mmsize*(0+%%i)] |
|
| 531 |
+ mova m1, [rsp+mmsize*(1+%%i)] |
|
| 532 |
+%else ; x86-64 |
|
| 533 |
+ mova m2, m8 |
|
| 534 |
+ mova m1, m_dith |
|
| 535 |
+%endif ; x86-32/64 |
|
| 536 |
+%else ; %2 == 9/10/16 |
|
| 537 |
+ mova m1, [yuv2yuvX_%2_start] |
|
| 538 |
+ mova m2, m1 |
|
| 539 |
+%endif ; %2 == 8/9/10/16 |
|
| 540 |
+ movsx cntr_reg, r1m |
|
| 541 |
+.filterloop_ %+ %%i |
|
| 542 |
+ ; input pixels |
|
| 543 |
+ mov r6, [r2+gprsize*cntr_reg-2*gprsize] |
|
| 544 |
+%if %2 == 16 |
|
| 545 |
+ mova m3, [r6+r5*4] |
|
| 546 |
+ mova m5, [r6+r5*4+mmsize] |
|
| 547 |
+%else ; %2 == 8/9/10 |
|
| 548 |
+ mova m3, [r6+r5*2] |
|
| 549 |
+%endif ; %2 == 8/9/10/16 |
|
| 550 |
+ mov r6, [r2+gprsize*cntr_reg-gprsize] |
|
| 551 |
+%if %2 == 16 |
|
| 552 |
+ mova m4, [r6+r5*4] |
|
| 553 |
+ mova m6, [r6+r5*4+mmsize] |
|
| 554 |
+%else ; %2 == 8/9/10 |
|
| 555 |
+ mova m4, [r6+r5*2] |
|
| 556 |
+%endif ; %2 == 8/9/10/16 |
|
| 557 |
+ |
|
| 558 |
+ ; coefficients |
|
| 559 |
+ movd m0, [r0+2*cntr_reg-4]; coeff[0], coeff[1] |
|
| 560 |
+%if %2 == 16 |
|
| 561 |
+ pshuflw m7, m0, 0 ; coeff[0] |
|
| 562 |
+ pshuflw m0, m0, 0x55 ; coeff[1] |
|
| 563 |
+ pmovsxwd m7, m7 ; word -> dword |
|
| 564 |
+ pmovsxwd m0, m0 ; word -> dword |
|
| 565 |
+ |
|
| 566 |
+ pmulld m3, m7 |
|
| 567 |
+ pmulld m5, m7 |
|
| 568 |
+ pmulld m4, m0 |
|
| 569 |
+ pmulld m6, m0 |
|
| 570 |
+ |
|
| 571 |
+ paddd m2, m3 |
|
| 572 |
+ paddd m1, m5 |
|
| 573 |
+ paddd m2, m4 |
|
| 574 |
+ paddd m1, m6 |
|
| 575 |
+%else ; %2 == 10/9/8 |
|
| 576 |
+ punpcklwd m5, m3, m4 |
|
| 577 |
+ punpckhwd m3, m4 |
|
| 578 |
+ SPLATD m0, m0 |
|
| 579 |
+ |
|
| 580 |
+ pmaddwd m5, m0 |
|
| 581 |
+ pmaddwd m3, m0 |
|
| 582 |
+ |
|
| 583 |
+ paddd m2, m5 |
|
| 584 |
+ paddd m1, m3 |
|
| 585 |
+%endif ; %2 == 8/9/10/16 |
|
| 586 |
+ |
|
| 587 |
+ sub cntr_reg, 2 |
|
| 588 |
+ jg .filterloop_ %+ %%i |
|
| 589 |
+ |
|
| 590 |
+%if %2 == 16 |
|
| 591 |
+ psrad m2, 31 - %2 |
|
| 592 |
+ psrad m1, 31 - %2 |
|
| 593 |
+%else ; %2 == 10/9/8 |
|
| 594 |
+ psrad m2, 27 - %2 |
|
| 595 |
+ psrad m1, 27 - %2 |
|
| 596 |
+%endif ; %2 == 8/9/10/16 |
|
| 597 |
+ |
|
| 598 |
+%if %2 == 8 |
|
| 599 |
+ packssdw m2, m1 |
|
| 600 |
+ packuswb m2, m2 |
|
| 601 |
+ movh [r3+r5*1], m2 |
|
| 602 |
+%else ; %2 == 9/10/16 |
|
| 603 |
+%if %2 == 16 |
|
| 604 |
+ packssdw m2, m1 |
|
| 605 |
+ paddw m2, [minshort] |
|
| 606 |
+%else ; %2 == 9/10 |
|
| 607 |
+%ifidn %1, sse4 |
|
| 608 |
+ packusdw m2, m1 |
|
| 609 |
+%elifidn %1, avx |
|
| 610 |
+ packusdw m2, m1 |
|
| 611 |
+%else ; mmx2/sse2 |
|
| 612 |
+ packssdw m2, m1 |
|
| 613 |
+ pmaxsw m2, m6 |
|
| 614 |
+%endif ; mmx2/sse2/sse4/avx |
|
| 615 |
+ pminsw m2, [yuv2yuvX_%2_upper] |
|
| 616 |
+%endif ; %2 == 9/10/16 |
|
| 617 |
+ mova [r3+r5*2], m2 |
|
| 618 |
+%endif ; %2 == 8/9/10/16 |
|
| 619 |
+ |
|
| 620 |
+ add r5, mmsize/2 |
|
| 621 |
+ sub r4d, mmsize/2 |
|
| 622 |
+%if %2 == 8 |
|
| 623 |
+%assign %%i %%i+2 |
|
| 624 |
+%endrep |
|
| 625 |
+%endif ; %2 == 8 |
|
| 497 | 626 |
jg .pixelloop |
| 627 |
+ |
|
| 628 |
+%if %2 == 8 |
|
| 629 |
+%ifdef ARCH_X86_32 |
|
| 630 |
+ ADD rsp, pad |
|
| 631 |
+ RET |
|
| 632 |
+%else ; x86-64 |
|
| 633 |
+ REP_RET |
|
| 634 |
+%endif ; x86-32/64 |
|
| 635 |
+%else ; %2 == 9/10/16 |
|
| 498 | 636 |
REP_RET |
| 637 |
+%endif ; %2 == 8/9/10/16 |
|
| 499 | 638 |
%endmacro |
| 500 | 639 |
|
| 640 |
+%define PALIGNR PALIGNR_MMX |
|
| 641 |
+%ifdef ARCH_X86_32 |
|
| 642 |
+INIT_MMX |
|
| 643 |
+yuv2planeX_fn mmx, 8, 0, 7 |
|
| 644 |
+yuv2planeX_fn mmx2, 9, 0, 5 |
|
| 645 |
+yuv2planeX_fn mmx2, 10, 0, 5 |
|
| 646 |
+%endif |
|
| 647 |
+ |
|
| 501 | 648 |
INIT_XMM |
| 502 |
-yuv2planeX10 sse4 |
|
| 649 |
+yuv2planeX_fn sse2, 8, 10, 7 |
|
| 650 |
+yuv2planeX_fn sse2, 9, 7, 5 |
|
| 651 |
+yuv2planeX_fn sse2, 10, 7, 5 |
|
| 652 |
+ |
|
| 653 |
+%define PALIGNR PALIGNR_SSSE3 |
|
| 654 |
+yuv2planeX_fn sse4, 8, 10, 7 |
|
| 655 |
+yuv2planeX_fn sse4, 9, 7, 5 |
|
| 656 |
+yuv2planeX_fn sse4, 10, 7, 5 |
|
| 657 |
+yuv2planeX_fn sse4, 16, 8, 5 |
|
| 658 |
+ |
|
| 503 | 659 |
INIT_AVX |
| 504 |
-yuv2planeX10 avx |
|
| 660 |
+yuv2planeX_fn avx, 8, 10, 7 |
|
| 661 |
+yuv2planeX_fn avx, 9, 7, 5 |
|
| 662 |
+yuv2planeX_fn avx, 10, 7, 5 |
| ... | ... |
@@ -211,13 +211,22 @@ SCALE_FUNCS_SSE(sse2); |
| 211 | 211 |
SCALE_FUNCS_SSE(ssse3); |
| 212 | 212 |
SCALE_FUNCS_SSE(sse4); |
| 213 | 213 |
|
| 214 |
-extern void ff_yuv2planeX10_sse4(const int16_t *filter, int filterSize, |
|
| 215 |
- const int16_t **src, uint8_t *dest, int dstW, |
|
| 216 |
- const uint8_t *dither, int offset); |
|
| 214 |
+#define VSCALEX_FUNC(size, opt) \ |
|
| 215 |
+extern void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \ |
|
| 216 |
+ const int16_t **src, uint8_t *dest, int dstW, \ |
|
| 217 |
+ const uint8_t *dither, int offset) |
|
| 218 |
+#define VSCALEX_FUNCS(opt1, opt2) \ |
|
| 219 |
+ VSCALEX_FUNC(8, opt1); \ |
|
| 220 |
+ VSCALEX_FUNC(9, opt2); \ |
|
| 221 |
+ VSCALEX_FUNC(10, opt2) |
|
| 217 | 222 |
|
| 218 |
-extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize, |
|
| 219 |
- const int16_t **src, uint8_t *dest, int dstW, |
|
| 220 |
- const uint8_t *dither, int offset); |
|
| 223 |
+#if ARCH_X86_32 |
|
| 224 |
+VSCALEX_FUNCS(mmx, mmx2); |
|
| 225 |
+#endif |
|
| 226 |
+VSCALEX_FUNCS(sse2, sse2); |
|
| 227 |
+VSCALEX_FUNCS(sse4, sse4); |
|
| 228 |
+VSCALEX_FUNC(16, sse4); |
|
| 229 |
+VSCALEX_FUNCS(avx, avx); |
|
| 221 | 230 |
|
| 222 | 231 |
void ff_sws_init_swScale_mmx(SwsContext *c) |
| 223 | 232 |
{
|
| ... | ... |
@@ -252,10 +261,18 @@ void ff_sws_init_swScale_mmx(SwsContext *c) |
| 252 | 252 |
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \ |
| 253 | 253 |
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \ |
| 254 | 254 |
} |
| 255 |
+#define ASSIGN_VSCALEX_FUNC(vscalefn, opt1, opt2, opt2chk, do_16_case) \ |
|
| 256 |
+switch(c->dstBpc){ \
|
|
| 257 |
+ case 16: do_16_case; break; \ |
|
| 258 |
+ case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2planeX_10_ ## opt2; break; \ |
|
| 259 |
+ case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2planeX_9_ ## opt2; break; \ |
|
| 260 |
+ default: vscalefn = ff_yuv2planeX_8_ ## opt1; break; \ |
|
| 261 |
+ } |
|
| 255 | 262 |
#if ARCH_X86_32 |
| 256 | 263 |
if (cpu_flags & AV_CPU_FLAG_MMX) {
|
| 257 | 264 |
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx); |
| 258 | 265 |
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx); |
| 266 |
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmx, mmx2, cpu_flags & AV_CPU_FLAG_MMX2,); |
|
| 259 | 267 |
} |
| 260 | 268 |
#endif |
| 261 | 269 |
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \ |
| ... | ... |
@@ -269,6 +286,7 @@ void ff_sws_init_swScale_mmx(SwsContext *c) |
| 269 | 269 |
if (cpu_flags & AV_CPU_FLAG_SSE2) {
|
| 270 | 270 |
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2); |
| 271 | 271 |
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2); |
| 272 |
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, sse2, 1,); |
|
| 272 | 273 |
} |
| 273 | 274 |
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
|
| 274 | 275 |
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3); |
| ... | ... |
@@ -278,13 +296,12 @@ void ff_sws_init_swScale_mmx(SwsContext *c) |
| 278 | 278 |
/* Xto15 don't need special sse4 functions */ |
| 279 | 279 |
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3); |
| 280 | 280 |
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3); |
| 281 |
- if (c->dstBpc == 10 && !isBE(c->dstFormat)) |
|
| 282 |
- c->yuv2planeX = ff_yuv2planeX10_sse4; |
|
| 281 |
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4, sse4, 1, |
|
| 282 |
+ if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4); |
|
| 283 | 283 |
} |
| 284 | 284 |
|
| 285 | 285 |
if (cpu_flags & AV_CPU_FLAG_AVX) {
|
| 286 |
- if (c->dstBpc == 10 && !isBE(c->dstFormat)) |
|
| 287 |
- c->yuv2planeX = ff_yuv2planeX10_avx; |
|
| 286 |
+ ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, avx, 1,); |
|
| 288 | 287 |
} |
| 289 | 288 |
#endif |
| 290 | 289 |
} |
| ... | ... |
@@ -35,41 +35,6 @@ |
| 35 | 35 |
#endif |
| 36 | 36 |
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b) |
| 37 | 37 |
|
| 38 |
-#define YSCALEYUV2YV12X(offset, dest, end, pos) \ |
|
| 39 |
- __asm__ volatile(\ |
|
| 40 |
- "movq "DITHER16"+0(%0), %%mm3 \n\t"\ |
|
| 41 |
- "movq "DITHER16"+8(%0), %%mm4 \n\t"\ |
|
| 42 |
- "lea " offset "(%0), %%"REG_d" \n\t"\ |
|
| 43 |
- "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 44 |
- ".p2align 4 \n\t" /* FIXME Unroll? */\ |
|
| 45 |
- "1: \n\t"\ |
|
| 46 |
- "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ |
|
| 47 |
- "movq (%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\ |
|
| 48 |
- "movq 8(%%"REG_S", %3, 2), %%mm5 \n\t" /* srcData */\ |
|
| 49 |
- "add $16, %%"REG_d" \n\t"\ |
|
| 50 |
- "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 51 |
- "test %%"REG_S", %%"REG_S" \n\t"\ |
|
| 52 |
- "pmulhw %%mm0, %%mm2 \n\t"\ |
|
| 53 |
- "pmulhw %%mm0, %%mm5 \n\t"\ |
|
| 54 |
- "paddw %%mm2, %%mm3 \n\t"\ |
|
| 55 |
- "paddw %%mm5, %%mm4 \n\t"\ |
|
| 56 |
- " jnz 1b \n\t"\ |
|
| 57 |
- "psraw $3, %%mm3 \n\t"\ |
|
| 58 |
- "psraw $3, %%mm4 \n\t"\ |
|
| 59 |
- "packuswb %%mm4, %%mm3 \n\t"\ |
|
| 60 |
- MOVNTQ(%%mm3, (%1, %3))\ |
|
| 61 |
- "add $8, %3 \n\t"\ |
|
| 62 |
- "cmp %2, %3 \n\t"\ |
|
| 63 |
- "movq "DITHER16"+0(%0), %%mm3 \n\t"\ |
|
| 64 |
- "movq "DITHER16"+8(%0), %%mm4 \n\t"\ |
|
| 65 |
- "lea " offset "(%0), %%"REG_d" \n\t"\ |
|
| 66 |
- "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 67 |
- "jb 1b \n\t"\ |
|
| 68 |
- :: "r" (&c->redDither),\ |
|
| 69 |
- "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\ |
|
| 70 |
- : "%"REG_d, "%"REG_S\ |
|
| 71 |
- ); |
|
| 72 |
- |
|
| 73 | 38 |
#if !COMPILE_TEMPLATE_MMX2 |
| 74 | 39 |
static av_always_inline void |
| 75 | 40 |
dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot) |
| ... | ... |
@@ -106,175 +71,6 @@ dither_8to16(SwsContext *c, const uint8_t *srcDither, int rot) |
| 106 | 106 |
} |
| 107 | 107 |
#endif |
| 108 | 108 |
|
| 109 |
-static void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, |
|
| 110 |
- const int16_t **lumSrc, int lumFilterSize, |
|
| 111 |
- const int16_t *chrFilter, const int16_t **chrUSrc, |
|
| 112 |
- const int16_t **chrVSrc, |
|
| 113 |
- int chrFilterSize, const int16_t **alpSrc, |
|
| 114 |
- uint8_t *dest[4], int dstW, int chrDstW) |
|
| 115 |
-{
|
|
| 116 |
- uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], |
|
| 117 |
- *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; |
|
| 118 |
- const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; |
|
| 119 |
- |
|
| 120 |
- if (uDest) {
|
|
| 121 |
- x86_reg uv_off = c->uv_off_byte >> 1; |
|
| 122 |
- dither_8to16(c, chrDither, 0); |
|
| 123 |
- YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) |
|
| 124 |
- dither_8to16(c, chrDither, 1); |
|
| 125 |
- YSCALEYUV2YV12X(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) |
|
| 126 |
- } |
|
| 127 |
- dither_8to16(c, lumDither, 0); |
|
| 128 |
- if (CONFIG_SWSCALE_ALPHA && aDest) {
|
|
| 129 |
- YSCALEYUV2YV12X(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) |
|
| 130 |
- } |
|
| 131 |
- |
|
| 132 |
- YSCALEYUV2YV12X(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0) |
|
| 133 |
-} |
|
| 134 |
- |
|
| 135 |
-#define YSCALEYUV2YV12X_ACCURATE(offset, dest, end, pos) \ |
|
| 136 |
- __asm__ volatile(\ |
|
| 137 |
- "lea " offset "(%0), %%"REG_d" \n\t"\ |
|
| 138 |
- "movq "DITHER32"+0(%0), %%mm4 \n\t"\ |
|
| 139 |
- "movq "DITHER32"+8(%0), %%mm5 \n\t"\ |
|
| 140 |
- "movq "DITHER32"+16(%0), %%mm6 \n\t"\ |
|
| 141 |
- "movq "DITHER32"+24(%0), %%mm7 \n\t"\ |
|
| 142 |
- "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 143 |
- ".p2align 4 \n\t"\ |
|
| 144 |
- "1: \n\t"\ |
|
| 145 |
- "movq (%%"REG_S", %3, 2), %%mm0 \n\t" /* srcData */\ |
|
| 146 |
- "movq 8(%%"REG_S", %3, 2), %%mm2 \n\t" /* srcData */\ |
|
| 147 |
- "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 148 |
- "movq (%%"REG_S", %3, 2), %%mm1 \n\t" /* srcData */\ |
|
| 149 |
- "movq %%mm0, %%mm3 \n\t"\ |
|
| 150 |
- "punpcklwd %%mm1, %%mm0 \n\t"\ |
|
| 151 |
- "punpckhwd %%mm1, %%mm3 \n\t"\ |
|
| 152 |
- "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ |
|
| 153 |
- "pmaddwd %%mm1, %%mm0 \n\t"\ |
|
| 154 |
- "pmaddwd %%mm1, %%mm3 \n\t"\ |
|
| 155 |
- "paddd %%mm0, %%mm4 \n\t"\ |
|
| 156 |
- "paddd %%mm3, %%mm5 \n\t"\ |
|
| 157 |
- "movq 8(%%"REG_S", %3, 2), %%mm3 \n\t" /* srcData */\ |
|
| 158 |
- "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 159 |
- "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ |
|
| 160 |
- "test %%"REG_S", %%"REG_S" \n\t"\ |
|
| 161 |
- "movq %%mm2, %%mm0 \n\t"\ |
|
| 162 |
- "punpcklwd %%mm3, %%mm2 \n\t"\ |
|
| 163 |
- "punpckhwd %%mm3, %%mm0 \n\t"\ |
|
| 164 |
- "pmaddwd %%mm1, %%mm2 \n\t"\ |
|
| 165 |
- "pmaddwd %%mm1, %%mm0 \n\t"\ |
|
| 166 |
- "paddd %%mm2, %%mm6 \n\t"\ |
|
| 167 |
- "paddd %%mm0, %%mm7 \n\t"\ |
|
| 168 |
- " jnz 1b \n\t"\ |
|
| 169 |
- "psrad $16, %%mm4 \n\t"\ |
|
| 170 |
- "psrad $16, %%mm5 \n\t"\ |
|
| 171 |
- "psrad $16, %%mm6 \n\t"\ |
|
| 172 |
- "psrad $16, %%mm7 \n\t"\ |
|
| 173 |
- "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ |
|
| 174 |
- "packssdw %%mm5, %%mm4 \n\t"\ |
|
| 175 |
- "packssdw %%mm7, %%mm6 \n\t"\ |
|
| 176 |
- "paddw %%mm0, %%mm4 \n\t"\ |
|
| 177 |
- "paddw %%mm0, %%mm6 \n\t"\ |
|
| 178 |
- "psraw $3, %%mm4 \n\t"\ |
|
| 179 |
- "psraw $3, %%mm6 \n\t"\ |
|
| 180 |
- "packuswb %%mm6, %%mm4 \n\t"\ |
|
| 181 |
- MOVNTQ(%%mm4, (%1, %3))\ |
|
| 182 |
- "add $8, %3 \n\t"\ |
|
| 183 |
- "cmp %2, %3 \n\t"\ |
|
| 184 |
- "lea " offset "(%0), %%"REG_d" \n\t"\ |
|
| 185 |
- "movq "DITHER32"+0(%0), %%mm4 \n\t"\ |
|
| 186 |
- "movq "DITHER32"+8(%0), %%mm5 \n\t"\ |
|
| 187 |
- "movq "DITHER32"+16(%0), %%mm6 \n\t"\ |
|
| 188 |
- "movq "DITHER32"+24(%0), %%mm7 \n\t"\ |
|
| 189 |
- "mov (%%"REG_d"), %%"REG_S" \n\t"\ |
|
| 190 |
- "jb 1b \n\t"\ |
|
| 191 |
- :: "r" (&c->redDither),\ |
|
| 192 |
- "r" (dest), "g" ((x86_reg)(end)), "r"((x86_reg)(pos))\ |
|
| 193 |
- : "%"REG_a, "%"REG_d, "%"REG_S\ |
|
| 194 |
- ); |
|
| 195 |
- |
|
| 196 |
-#if !COMPILE_TEMPLATE_MMX2 |
|
| 197 |
-static av_always_inline void |
|
| 198 |
-dither_8to32(SwsContext *c, const uint8_t *srcDither, int rot) |
|
| 199 |
-{
|
|
| 200 |
- if (rot) {
|
|
| 201 |
- __asm__ volatile("pxor %%mm0, %%mm0\n\t"
|
|
| 202 |
- "movq (%0), %%mm4\n\t" |
|
| 203 |
- "movq %%mm4, %%mm5\n\t" |
|
| 204 |
- "psrlq $24, %%mm4\n\t" |
|
| 205 |
- "psllq $40, %%mm5\n\t" |
|
| 206 |
- "por %%mm5, %%mm4\n\t" |
|
| 207 |
- "movq %%mm4, %%mm6\n\t" |
|
| 208 |
- "punpcklbw %%mm0, %%mm4\n\t" |
|
| 209 |
- "punpckhbw %%mm0, %%mm6\n\t" |
|
| 210 |
- "movq %%mm4, %%mm5\n\t" |
|
| 211 |
- "movq %%mm6, %%mm7\n\t" |
|
| 212 |
- "punpcklwd %%mm0, %%mm4\n\t" |
|
| 213 |
- "punpckhwd %%mm0, %%mm5\n\t" |
|
| 214 |
- "punpcklwd %%mm0, %%mm6\n\t" |
|
| 215 |
- "punpckhwd %%mm0, %%mm7\n\t" |
|
| 216 |
- "pslld $12, %%mm4\n\t" |
|
| 217 |
- "pslld $12, %%mm5\n\t" |
|
| 218 |
- "pslld $12, %%mm6\n\t" |
|
| 219 |
- "pslld $12, %%mm7\n\t" |
|
| 220 |
- "movq %%mm4, "DITHER32"+0(%1)\n\t" |
|
| 221 |
- "movq %%mm5, "DITHER32"+8(%1)\n\t" |
|
| 222 |
- "movq %%mm6, "DITHER32"+16(%1)\n\t" |
|
| 223 |
- "movq %%mm7, "DITHER32"+24(%1)\n\t" |
|
| 224 |
- :: "r"(srcDither), "r"(&c->redDither) |
|
| 225 |
- ); |
|
| 226 |
- } else {
|
|
| 227 |
- __asm__ volatile("pxor %%mm0, %%mm0\n\t"
|
|
| 228 |
- "movq (%0), %%mm4\n\t" |
|
| 229 |
- "movq %%mm4, %%mm6\n\t" |
|
| 230 |
- "punpcklbw %%mm0, %%mm4\n\t" |
|
| 231 |
- "punpckhbw %%mm0, %%mm6\n\t" |
|
| 232 |
- "movq %%mm4, %%mm5\n\t" |
|
| 233 |
- "movq %%mm6, %%mm7\n\t" |
|
| 234 |
- "punpcklwd %%mm0, %%mm4\n\t" |
|
| 235 |
- "punpckhwd %%mm0, %%mm5\n\t" |
|
| 236 |
- "punpcklwd %%mm0, %%mm6\n\t" |
|
| 237 |
- "punpckhwd %%mm0, %%mm7\n\t" |
|
| 238 |
- "pslld $12, %%mm4\n\t" |
|
| 239 |
- "pslld $12, %%mm5\n\t" |
|
| 240 |
- "pslld $12, %%mm6\n\t" |
|
| 241 |
- "pslld $12, %%mm7\n\t" |
|
| 242 |
- "movq %%mm4, "DITHER32"+0(%1)\n\t" |
|
| 243 |
- "movq %%mm5, "DITHER32"+8(%1)\n\t" |
|
| 244 |
- "movq %%mm6, "DITHER32"+16(%1)\n\t" |
|
| 245 |
- "movq %%mm7, "DITHER32"+24(%1)\n\t" |
|
| 246 |
- :: "r"(srcDither), "r"(&c->redDither) |
|
| 247 |
- ); |
|
| 248 |
- } |
|
| 249 |
-} |
|
| 250 |
-#endif |
|
| 251 |
- |
|
| 252 |
-static void RENAME(yuv2yuvX_ar)(SwsContext *c, const int16_t *lumFilter, |
|
| 253 |
- const int16_t **lumSrc, int lumFilterSize, |
|
| 254 |
- const int16_t *chrFilter, const int16_t **chrUSrc, |
|
| 255 |
- const int16_t **chrVSrc, |
|
| 256 |
- int chrFilterSize, const int16_t **alpSrc, |
|
| 257 |
- uint8_t *dest[4], int dstW, int chrDstW) |
|
| 258 |
-{
|
|
| 259 |
- uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2], |
|
| 260 |
- *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL; |
|
| 261 |
- const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8; |
|
| 262 |
- |
|
| 263 |
- if (uDest) {
|
|
| 264 |
- x86_reg uv_off = c->uv_off_byte >> 1; |
|
| 265 |
- dither_8to32(c, chrDither, 0); |
|
| 266 |
- YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, uDest, chrDstW, 0) |
|
| 267 |
- dither_8to32(c, chrDither, 1); |
|
| 268 |
- YSCALEYUV2YV12X_ACCURATE(CHR_MMX_FILTER_OFFSET, vDest - uv_off, chrDstW + uv_off, uv_off) |
|
| 269 |
- } |
|
| 270 |
- dither_8to32(c, lumDither, 0); |
|
| 271 |
- if (CONFIG_SWSCALE_ALPHA && aDest) {
|
|
| 272 |
- YSCALEYUV2YV12X_ACCURATE(ALP_MMX_FILTER_OFFSET, aDest, dstW, 0) |
|
| 273 |
- } |
|
| 274 |
- |
|
| 275 |
- YSCALEYUV2YV12X_ACCURATE(LUM_MMX_FILTER_OFFSET, yDest, dstW, 0) |
|
| 276 |
-} |
|
| 277 |
- |
|
| 278 | 109 |
static void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, |
| 279 | 110 |
const int16_t *chrUSrc, const int16_t *chrVSrc, |
| 280 | 111 |
const int16_t *alpSrc, |
| ... | ... |
@@ -2104,7 +1900,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) |
| 2104 | 2104 |
if (!(c->flags & SWS_BITEXACT)) {
|
| 2105 | 2105 |
if (c->flags & SWS_ACCURATE_RND) {
|
| 2106 | 2106 |
//c->yuv2yuv1 = RENAME(yuv2yuv1_ar ); |
| 2107 |
- //c->yuv2yuvX = RENAME(yuv2yuvX_ar ); |
|
| 2108 | 2107 |
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
|
| 2109 | 2108 |
switch (c->dstFormat) {
|
| 2110 | 2109 |
case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; |
| ... | ... |
@@ -2117,7 +1912,6 @@ static av_cold void RENAME(sws_init_swScale)(SwsContext *c) |
| 2117 | 2117 |
} |
| 2118 | 2118 |
} else {
|
| 2119 | 2119 |
//c->yuv2yuv1 = RENAME(yuv2yuv1 ); |
| 2120 |
- //c->yuv2yuvX = RENAME(yuv2yuvX ); |
|
| 2121 | 2120 |
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
|
| 2122 | 2121 |
switch (c->dstFormat) {
|
| 2123 | 2122 |
case PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; |