Neon parts by Mans Rullgard <mans@mansr.com>.
Ronald S. Bultje authored on 2011/10/21 16:00:39... | ... |
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, |
32 | 32 |
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, |
33 | 33 |
int beta, int8_t *tc0); |
34 | 34 |
|
35 |
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den, |
|
36 |
- int weight, int offset); |
|
37 |
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den, |
|
38 |
- int weight, int offset); |
|
39 |
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den, |
|
40 |
- int weight, int offset); |
|
41 |
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den, |
|
42 |
- int weight, int offset); |
|
43 |
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den, |
|
44 |
- int weight, int offset); |
|
45 |
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den, |
|
46 |
- int weight, int offset); |
|
47 |
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den, |
|
48 |
- int weight, int offset); |
|
49 |
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den, |
|
50 |
- int weight, int offset); |
|
35 |
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, |
|
36 |
+ int log2_den, int weight, int offset); |
|
37 |
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, |
|
38 |
+ int log2_den, int weight, int offset); |
|
39 |
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, |
|
40 |
+ int log2_den, int weight, int offset); |
|
51 | 41 |
|
52 |
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, |
|
53 |
- int log2_den, int weightd, int weights, |
|
54 |
- int offset); |
|
55 |
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride, |
|
56 |
- int log2_den, int weightd, int weights, |
|
57 |
- int offset); |
|
58 |
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride, |
|
59 |
- int log2_den, int weightd, int weights, |
|
60 |
- int offset); |
|
61 |
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride, |
|
62 |
- int log2_den, int weightd, int weights, |
|
63 |
- int offset); |
|
64 |
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride, |
|
65 |
- int log2_den, int weightd, int weights, |
|
66 |
- int offset); |
|
67 |
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride, |
|
68 |
- int log2_den, int weightd, int weights, |
|
69 |
- int offset); |
|
70 |
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride, |
|
71 |
- int log2_den, int weightd, int weights, |
|
72 |
- int offset); |
|
73 |
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride, |
|
74 |
- int log2_den, int weightd, int weights, |
|
75 |
- int offset); |
|
42 |
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, |
|
43 |
+ int height, int log2_den, int weightd, |
|
44 |
+ int weights, int offset); |
|
45 |
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, |
|
46 |
+ int height, int log2_den, int weightd, |
|
47 |
+ int weights, int offset); |
|
48 |
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, |
|
49 |
+ int height, int log2_den, int weightd, |
|
50 |
+ int weights, int offset); |
|
76 | 51 |
|
77 | 52 |
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride); |
78 | 53 |
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride); |
... | ... |
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i |
100 | 100 |
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; |
101 | 101 |
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; |
102 | 102 |
|
103 |
- c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon; |
|
104 |
- c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon; |
|
105 |
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon; |
|
106 |
- c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon; |
|
107 |
- c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon; |
|
108 |
- c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon; |
|
109 |
- c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon; |
|
110 |
- c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon; |
|
103 |
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; |
|
104 |
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; |
|
105 |
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; |
|
111 | 106 |
|
112 |
- c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon; |
|
113 |
- c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon; |
|
114 |
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon; |
|
115 |
- c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon; |
|
116 |
- c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon; |
|
117 |
- c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon; |
|
118 |
- c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon; |
|
119 |
- c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon; |
|
107 |
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; |
|
108 |
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; |
|
109 |
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; |
|
120 | 110 |
|
121 | 111 |
c->h264_idct_add = ff_h264_idct_add_neon; |
122 | 112 |
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; |
... | ... |
@@ -1592,7 +1592,7 @@ endfunc |
1592 | 1592 |
vdup.8 d1, r5 |
1593 | 1593 |
vmov q2, q8 |
1594 | 1594 |
vmov q3, q8 |
1595 |
-1: subs ip, ip, #2 |
|
1595 |
+1: subs r3, r3, #2 |
|
1596 | 1596 |
vld1.8 {d20-d21},[r0,:128], r2 |
1597 | 1597 |
\macd q2, d0, d20 |
1598 | 1598 |
pld [r0] |
... | ... |
@@ -1632,7 +1632,7 @@ endfunc |
1632 | 1632 |
vdup.8 d1, r5 |
1633 | 1633 |
vmov q1, q8 |
1634 | 1634 |
vmov q10, q8 |
1635 |
-1: subs ip, ip, #2 |
|
1635 |
+1: subs r3, r3, #2 |
|
1636 | 1636 |
vld1.8 {d4},[r0,:64], r2 |
1637 | 1637 |
\macd q1, d0, d4 |
1638 | 1638 |
pld [r0] |
... | ... |
@@ -1662,7 +1662,7 @@ endfunc |
1662 | 1662 |
vdup.8 d1, r5 |
1663 | 1663 |
vmov q1, q8 |
1664 | 1664 |
vmov q10, q8 |
1665 |
-1: subs ip, ip, #4 |
|
1665 |
+1: subs r3, r3, #4 |
|
1666 | 1666 |
vld1.32 {d4[0]},[r0,:32], r2 |
1667 | 1667 |
vld1.32 {d4[1]},[r0,:32], r2 |
1668 | 1668 |
\macd q1, d0, d4 |
... | ... |
@@ -1700,16 +1700,17 @@ endfunc |
1700 | 1700 |
.endm |
1701 | 1701 |
|
1702 | 1702 |
.macro biweight_func w |
1703 |
-function biweight_h264_pixels_\w\()_neon |
|
1703 |
+function ff_biweight_h264_pixels_\w\()_neon, export=1 |
|
1704 | 1704 |
push {r4-r6, lr} |
1705 |
- add r4, sp, #16 |
|
1705 |
+ ldr r12, [sp, #16] |
|
1706 |
+ add r4, sp, #20 |
|
1706 | 1707 |
ldm r4, {r4-r6} |
1707 | 1708 |
lsr lr, r4, #31 |
1708 | 1709 |
add r6, r6, #1 |
1709 | 1710 |
eors lr, lr, r5, lsr #30 |
1710 | 1711 |
orr r6, r6, #1 |
1711 |
- vdup.16 q9, r3 |
|
1712 |
- lsl r6, r6, r3 |
|
1712 |
+ vdup.16 q9, r12 |
|
1713 |
+ lsl r6, r6, r12 |
|
1713 | 1714 |
vmvn q9, q9 |
1714 | 1715 |
vdup.16 q8, r6 |
1715 | 1716 |
mov r6, r0 |
... | ... |
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon |
1730 | 1730 |
endfunc |
1731 | 1731 |
.endm |
1732 | 1732 |
|
1733 |
- .macro biweight_entry w, h, b=1 |
|
1734 |
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1 |
|
1735 |
- mov ip, #\h |
|
1736 |
-.if \b |
|
1737 |
- b biweight_h264_pixels_\w\()_neon |
|
1738 |
-.endif |
|
1739 |
-endfunc |
|
1740 |
- .endm |
|
1741 |
- |
|
1742 |
- biweight_entry 16, 8 |
|
1743 |
- biweight_entry 16, 16, b=0 |
|
1744 | 1733 |
biweight_func 16 |
1745 |
- |
|
1746 |
- biweight_entry 8, 16 |
|
1747 |
- biweight_entry 8, 4 |
|
1748 |
- biweight_entry 8, 8, b=0 |
|
1749 | 1734 |
biweight_func 8 |
1750 |
- |
|
1751 |
- biweight_entry 4, 8 |
|
1752 |
- biweight_entry 4, 2 |
|
1753 |
- biweight_entry 4, 4, b=0 |
|
1754 | 1735 |
biweight_func 4 |
1755 | 1736 |
|
1756 | 1737 |
@ Weighted prediction |
1757 | 1738 |
|
1758 | 1739 |
.macro weight_16 add |
1759 |
- vdup.8 d0, r3 |
|
1760 |
-1: subs ip, ip, #2 |
|
1740 |
+ vdup.8 d0, r12 |
|
1741 |
+1: subs r2, r2, #2 |
|
1761 | 1742 |
vld1.8 {d20-d21},[r0,:128], r1 |
1762 | 1743 |
vmull.u8 q2, d0, d20 |
1763 | 1744 |
pld [r0] |
... | ... |
@@ -1785,8 +1767,8 @@ endfunc |
1785 | 1785 |
.endm |
1786 | 1786 |
|
1787 | 1787 |
.macro weight_8 add |
1788 |
- vdup.8 d0, r3 |
|
1789 |
-1: subs ip, ip, #2 |
|
1788 |
+ vdup.8 d0, r12 |
|
1789 |
+1: subs r2, r2, #2 |
|
1790 | 1790 |
vld1.8 {d4},[r0,:64], r1 |
1791 | 1791 |
vmull.u8 q1, d0, d4 |
1792 | 1792 |
pld [r0] |
... | ... |
@@ -1806,10 +1788,10 @@ endfunc |
1806 | 1806 |
.endm |
1807 | 1807 |
|
1808 | 1808 |
.macro weight_4 add |
1809 |
- vdup.8 d0, r3 |
|
1809 |
+ vdup.8 d0, r12 |
|
1810 | 1810 |
vmov q1, q8 |
1811 | 1811 |
vmov q10, q8 |
1812 |
-1: subs ip, ip, #4 |
|
1812 |
+1: subs r2, r2, #4 |
|
1813 | 1813 |
vld1.32 {d4[0]},[r0,:32], r1 |
1814 | 1814 |
vld1.32 {d4[1]},[r0,:32], r1 |
1815 | 1815 |
vmull.u8 q1, d0, d4 |
... | ... |
@@ -1842,50 +1824,32 @@ endfunc |
1842 | 1842 |
.endm |
1843 | 1843 |
|
1844 | 1844 |
.macro weight_func w |
1845 |
-function weight_h264_pixels_\w\()_neon |
|
1845 |
+function ff_weight_h264_pixels_\w\()_neon, export=1 |
|
1846 | 1846 |
push {r4, lr} |
1847 |
- ldr r4, [sp, #8] |
|
1848 |
- cmp r2, #1 |
|
1849 |
- lsl r4, r4, r2 |
|
1847 |
+ ldr r12, [sp, #8] |
|
1848 |
+ ldr r4, [sp, #12] |
|
1849 |
+ cmp r3, #1 |
|
1850 |
+ lsl r4, r4, r3 |
|
1850 | 1851 |
vdup.16 q8, r4 |
1851 | 1852 |
mov r4, r0 |
1852 | 1853 |
ble 20f |
1853 |
- rsb lr, r2, #1 |
|
1854 |
+ rsb lr, r3, #1 |
|
1854 | 1855 |
vdup.16 q9, lr |
1855 |
- cmp r3, #0 |
|
1856 |
+ cmp r12, #0 |
|
1856 | 1857 |
blt 10f |
1857 | 1858 |
weight_\w vhadd.s16 |
1858 |
-10: rsb r3, r3, #0 |
|
1859 |
+10: rsb r12, r12, #0 |
|
1859 | 1860 |
weight_\w vhsub.s16 |
1860 |
-20: rsb lr, r2, #0 |
|
1861 |
+20: rsb lr, r3, #0 |
|
1861 | 1862 |
vdup.16 q9, lr |
1862 |
- cmp r3, #0 |
|
1863 |
+ cmp r12, #0 |
|
1863 | 1864 |
blt 10f |
1864 | 1865 |
weight_\w vadd.s16 |
1865 |
-10: rsb r3, r3, #0 |
|
1866 |
+10: rsb r12, r12, #0 |
|
1866 | 1867 |
weight_\w vsub.s16 |
1867 | 1868 |
endfunc |
1868 | 1869 |
.endm |
1869 | 1870 |
|
1870 |
- .macro weight_entry w, h, b=1 |
|
1871 |
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1 |
|
1872 |
- mov ip, #\h |
|
1873 |
-.if \b |
|
1874 |
- b weight_h264_pixels_\w\()_neon |
|
1875 |
-.endif |
|
1876 |
-endfunc |
|
1877 |
- .endm |
|
1878 |
- |
|
1879 |
- weight_entry 16, 8 |
|
1880 |
- weight_entry 16, 16, b=0 |
|
1881 | 1871 |
weight_func 16 |
1882 |
- |
|
1883 |
- weight_entry 8, 16 |
|
1884 |
- weight_entry 8, 4 |
|
1885 |
- weight_entry 8, 8, b=0 |
|
1886 | 1872 |
weight_func 8 |
1887 |
- |
|
1888 |
- weight_entry 4, 8 |
|
1889 |
- weight_entry 4, 2 |
|
1890 |
- weight_entry 4, 4, b=0 |
|
1891 | 1873 |
weight_func 4 |
... | ... |
@@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){ |
438 | 438 |
} |
439 | 439 |
#endif |
440 | 440 |
|
441 |
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, |
|
441 |
+static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, |
|
442 |
+ int height, int delta, int list, |
|
442 | 443 |
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, |
443 | 444 |
int src_x_offset, int src_y_offset, |
444 | 445 |
qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op, |
... | ... |
@@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, |
518 | 518 |
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); |
519 | 519 |
src_cb= s->edge_emu_buffer; |
520 | 520 |
} |
521 |
- chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); |
|
521 |
+ chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); |
|
522 | 522 |
|
523 | 523 |
if(emu){ |
524 | 524 |
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422)); |
525 | 525 |
src_cr= s->edge_emu_buffer; |
526 | 526 |
} |
527 |
- chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); |
|
527 |
+ chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7); |
|
528 | 528 |
} |
529 | 529 |
|
530 |
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta, |
|
530 |
+static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta, |
|
531 | 531 |
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, |
532 | 532 |
int x_offset, int y_offset, |
533 | 533 |
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, |
... | ... |
@@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei |
553 | 553 |
|
554 | 554 |
if(list0){ |
555 | 555 |
Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ]; |
556 |
- mc_dir_part(h, ref, n, square, chroma_height, delta, 0, |
|
556 |
+ mc_dir_part(h, ref, n, square, height, delta, 0, |
|
557 | 557 |
dest_y, dest_cb, dest_cr, x_offset, y_offset, |
558 | 558 |
qpix_op, chroma_op, pixel_shift, chroma444); |
559 | 559 |
|
... | ... |
@@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei |
563 | 563 |
|
564 | 564 |
if(list1){ |
565 | 565 |
Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ]; |
566 |
- mc_dir_part(h, ref, n, square, chroma_height, delta, 1, |
|
566 |
+ mc_dir_part(h, ref, n, square, height, delta, 1, |
|
567 | 567 |
dest_y, dest_cb, dest_cr, x_offset, y_offset, |
568 | 568 |
qpix_op, chroma_op, pixel_shift, chroma444); |
569 | 569 |
} |
570 | 570 |
} |
571 | 571 |
|
572 |
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta, |
|
572 |
+static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta, |
|
573 | 573 |
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, |
574 | 574 |
int x_offset, int y_offset, |
575 | 575 |
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, |
... | ... |
@@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom |
577 | 577 |
h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg, |
578 | 578 |
int list0, int list1, int pixel_shift, int chroma444){ |
579 | 579 |
MpegEncContext * const s = &h->s; |
580 |
+ int chroma_height; |
|
580 | 581 |
|
581 | 582 |
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; |
582 | 583 |
if(chroma444){ |
584 |
+ chroma_height = height; |
|
583 | 585 |
chroma_weight_avg = luma_weight_avg; |
584 | 586 |
chroma_weight_op = luma_weight_op; |
585 | 587 |
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; |
586 | 588 |
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize; |
587 | 589 |
} else if (CHROMA422) { |
590 |
+ chroma_height = height; |
|
588 | 591 |
dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; |
589 | 592 |
dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize; |
590 | 593 |
}else{ |
594 |
+ chroma_height = height >> 1; |
|
591 | 595 |
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; |
592 | 596 |
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize; |
593 | 597 |
} |
... | ... |
@@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom |
603 | 603 |
int refn0 = h->ref_cache[0][ scan8[n] ]; |
604 | 604 |
int refn1 = h->ref_cache[1][ scan8[n] ]; |
605 | 605 |
|
606 |
- mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0, |
|
606 |
+ mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0, |
|
607 | 607 |
dest_y, dest_cb, dest_cr, |
608 | 608 |
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); |
609 |
- mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1, |
|
609 |
+ mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1, |
|
610 | 610 |
tmp_y, tmp_cb, tmp_cr, |
611 | 611 |
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444); |
612 | 612 |
|
613 | 613 |
if(h->use_weight == 2){ |
614 | 614 |
int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1]; |
615 | 615 |
int weight1 = 64 - weight0; |
616 |
- luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0); |
|
617 |
- chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0); |
|
618 |
- chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0); |
|
619 |
- if (CHROMA422) { |
|
620 |
- chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, |
|
621 |
- tmp_cb + chroma_height * h->mb_uvlinesize, |
|
622 |
- h->mb_uvlinesize, 5, weight0, weight1, 0); |
|
623 |
- chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, |
|
624 |
- tmp_cr + chroma_height * h->mb_uvlinesize, |
|
625 |
- h->mb_uvlinesize, 5, weight0, weight1, 0); |
|
626 |
- } |
|
616 |
+ luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, |
|
617 |
+ height, 5, weight0, weight1, 0); |
|
618 |
+ chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, |
|
619 |
+ chroma_height, 5, weight0, weight1, 0); |
|
620 |
+ chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, |
|
621 |
+ chroma_height, 5, weight0, weight1, 0); |
|
627 | 622 |
}else{ |
628 |
- luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom, |
|
623 |
+ luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom, |
|
629 | 624 |
h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0], |
630 | 625 |
h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]); |
631 |
- chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
626 |
+ chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, |
|
632 | 627 |
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], |
633 | 628 |
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); |
634 |
- chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
629 |
+ chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, |
|
635 | 630 |
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], |
636 | 631 |
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); |
637 |
- if (CHROMA422) { |
|
638 |
- chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize, |
|
639 |
- tmp_cb + chroma_height * h->mb_uvlinesize, |
|
640 |
- h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
641 |
- h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0], |
|
642 |
- h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]); |
|
643 |
- chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize, |
|
644 |
- tmp_cr + chroma_height * h->mb_uvlinesize, |
|
645 |
- h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
646 |
- h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0], |
|
647 |
- h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]); |
|
648 |
- } |
|
649 | 632 |
} |
650 | 633 |
}else{ |
651 | 634 |
int list = list1 ? 1 : 0; |
652 | 635 |
int refn = h->ref_cache[list][ scan8[n] ]; |
653 | 636 |
Picture *ref= &h->ref_list[list][refn]; |
654 |
- mc_dir_part(h, ref, n, square, chroma_height, delta, list, |
|
637 |
+ mc_dir_part(h, ref, n, square, height, delta, list, |
|
655 | 638 |
dest_y, dest_cb, dest_cr, x_offset, y_offset, |
656 | 639 |
qpix_put, chroma_put, pixel_shift, chroma444); |
657 | 640 |
|
658 |
- luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom, |
|
641 |
+ luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom, |
|
659 | 642 |
h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]); |
660 | 643 |
if(h->use_weight_chroma){ |
661 |
- chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
644 |
+ chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, |
|
662 | 645 |
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); |
663 |
- chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
646 |
+ chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom, |
|
664 | 647 |
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); |
665 |
- if (CHROMA422) { |
|
666 |
- chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize, |
|
667 |
- h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
668 |
- h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]); |
|
669 |
- chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize, |
|
670 |
- h->mb_uvlinesize, h->chroma_log2_weight_denom, |
|
671 |
- h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]); |
|
672 |
- } |
|
673 | 648 |
} |
674 | 649 |
} |
675 | 650 |
} |
676 | 651 |
|
677 |
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta, |
|
652 |
+static inline void mc_part(H264Context *h, int n, int square, int height, int delta, |
|
678 | 653 |
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, |
679 | 654 |
int x_offset, int y_offset, |
680 | 655 |
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put, |
... | ... |
@@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height, |
684 | 684 |
if((h->use_weight==2 && list0 && list1 |
685 | 685 |
&& (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32)) |
686 | 686 |
|| h->use_weight==1) |
687 |
- mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, |
|
687 |
+ mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr, |
|
688 | 688 |
x_offset, y_offset, qpix_put, chroma_put, |
689 |
- weight_op[0], weight_op[3], weight_avg[0], |
|
690 |
- weight_avg[3], list0, list1, pixel_shift, chroma444); |
|
689 |
+ weight_op[0], weight_op[1], weight_avg[0], |
|
690 |
+ weight_avg[1], list0, list1, pixel_shift, chroma444); |
|
691 | 691 |
else |
692 |
- mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr, |
|
692 |
+ mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr, |
|
693 | 693 |
x_offset, y_offset, qpix_put, chroma_put, qpix_avg, |
694 | 694 |
chroma_avg, list0, list1, pixel_shift, chroma444); |
695 | 695 |
} |
... | ... |
@@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t |
731 | 731 |
prefetch_motion(h, 0, pixel_shift, chroma444); |
732 | 732 |
|
733 | 733 |
if(IS_16X16(mb_type)){ |
734 |
- mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0, |
|
734 |
+ mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0, |
|
735 | 735 |
qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0], |
736 | 736 |
weight_op, weight_avg, |
737 | 737 |
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), |
738 | 738 |
pixel_shift, chroma444); |
739 | 739 |
}else if(IS_16X8(mb_type)){ |
740 |
- mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, |
|
740 |
+ mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0, |
|
741 | 741 |
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], |
742 |
- &weight_op[1], &weight_avg[1], |
|
742 |
+ weight_op, weight_avg, |
|
743 | 743 |
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), |
744 | 744 |
pixel_shift, chroma444); |
745 |
- mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, |
|
745 |
+ mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4, |
|
746 | 746 |
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0], |
747 |
- &weight_op[1], &weight_avg[1], |
|
747 |
+ weight_op, weight_avg, |
|
748 | 748 |
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), |
749 | 749 |
pixel_shift, chroma444); |
750 | 750 |
}else if(IS_8X16(mb_type)){ |
751 |
- mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, |
|
751 |
+ mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0, |
|
752 | 752 |
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], |
753 |
- &weight_op[2], &weight_avg[2], |
|
753 |
+ &weight_op[1], &weight_avg[1], |
|
754 | 754 |
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), |
755 | 755 |
pixel_shift, chroma444); |
756 |
- mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, |
|
756 |
+ mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0, |
|
757 | 757 |
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], |
758 |
- &weight_op[2], &weight_avg[2], |
|
758 |
+ &weight_op[1], &weight_avg[1], |
|
759 | 759 |
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), |
760 | 760 |
pixel_shift, chroma444); |
761 | 761 |
}else{ |
... | ... |
@@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t |
770 | 770 |
int y_offset= (i&2)<<1; |
771 | 771 |
|
772 | 772 |
if(IS_SUB_8X8(sub_mb_type)){ |
773 |
- mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, |
|
773 |
+ mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset, |
|
774 | 774 |
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1], |
775 |
- &weight_op[3], &weight_avg[3], |
|
775 |
+ &weight_op[1], &weight_avg[1], |
|
776 | 776 |
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), |
777 | 777 |
pixel_shift, chroma444); |
778 | 778 |
}else if(IS_SUB_8X4(sub_mb_type)){ |
779 |
- mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, |
|
779 |
+ mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset, |
|
780 | 780 |
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], |
781 |
- &weight_op[4], &weight_avg[4], |
|
781 |
+ &weight_op[1], &weight_avg[1], |
|
782 | 782 |
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), |
783 | 783 |
pixel_shift, chroma444); |
784 |
- mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, |
|
784 |
+ mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2, |
|
785 | 785 |
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1], |
786 |
- &weight_op[4], &weight_avg[4], |
|
786 |
+ &weight_op[1], &weight_avg[1], |
|
787 | 787 |
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), |
788 | 788 |
pixel_shift, chroma444); |
789 | 789 |
}else if(IS_SUB_4X8(sub_mb_type)){ |
790 |
- mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, |
|
790 |
+ mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset, |
|
791 | 791 |
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], |
792 |
- &weight_op[5], &weight_avg[5], |
|
792 |
+ &weight_op[2], &weight_avg[2], |
|
793 | 793 |
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), |
794 | 794 |
pixel_shift, chroma444); |
795 |
- mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, |
|
795 |
+ mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset, |
|
796 | 796 |
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], |
797 |
- &weight_op[5], &weight_avg[5], |
|
797 |
+ &weight_op[2], &weight_avg[2], |
|
798 | 798 |
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), |
799 | 799 |
pixel_shift, chroma444); |
800 | 800 |
}else{ |
... | ... |
@@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t |
803 | 803 |
for(j=0; j<4; j++){ |
804 | 804 |
int sub_x_offset= x_offset + 2*(j&1); |
805 | 805 |
int sub_y_offset= y_offset + (j&2); |
806 |
- mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, |
|
806 |
+ mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset, |
|
807 | 807 |
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2], |
808 |
- &weight_op[6], &weight_avg[6], |
|
808 |
+ &weight_op[2], &weight_avg[2], |
|
809 | 809 |
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1), |
810 | 810 |
pixel_shift, chroma444); |
811 | 811 |
} |
... | ... |
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo |
64 | 64 |
else\ |
65 | 65 |
c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\ |
66 | 66 |
\ |
67 |
- c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\ |
|
68 |
- c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\ |
|
69 |
- c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\ |
|
70 |
- c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\ |
|
71 |
- c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\ |
|
72 |
- c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\ |
|
73 |
- c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\ |
|
74 |
- c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\ |
|
75 |
- c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\ |
|
76 |
- c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\ |
|
77 |
- c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\ |
|
78 |
- c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\ |
|
79 |
- c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\ |
|
80 |
- c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\ |
|
81 |
- c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\ |
|
82 |
- c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\ |
|
83 |
- c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\ |
|
84 |
- c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\ |
|
85 |
- c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\ |
|
86 |
- c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\ |
|
67 |
+ c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\ |
|
68 |
+ c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\ |
|
69 |
+ c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\ |
|
70 |
+ c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\ |
|
71 |
+ c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\ |
|
72 |
+ c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\ |
|
73 |
+ c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\ |
|
74 |
+ c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\ |
|
87 | 75 |
\ |
88 | 76 |
c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\ |
89 | 77 |
c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\ |
... | ... |
@@ -31,16 +31,18 @@ |
31 | 31 |
#include "dsputil.h" |
32 | 32 |
|
33 | 33 |
//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y); |
34 |
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset); |
|
35 |
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset); |
|
34 |
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height, |
|
35 |
+ int log2_denom, int weight, int offset); |
|
36 |
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height, |
|
37 |
+ int log2_denom, int weightd, int weights, int offset); |
|
36 | 38 |
|
37 | 39 |
/** |
38 | 40 |
* Context for storing H.264 DSP functions |
39 | 41 |
*/ |
40 | 42 |
typedef struct H264DSPContext{ |
41 | 43 |
/* weighted MC */ |
42 |
- h264_weight_func weight_h264_pixels_tab[10]; |
|
43 |
- h264_biweight_func biweight_h264_pixels_tab[10]; |
|
44 |
+ h264_weight_func weight_h264_pixels_tab[4]; |
|
45 |
+ h264_biweight_func biweight_h264_pixels_tab[4]; |
|
44 | 46 |
|
45 | 47 |
/* loop filter */ |
46 | 48 |
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0); |
... | ... |
@@ -29,14 +29,16 @@ |
29 | 29 |
|
30 | 30 |
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom ) |
31 | 31 |
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) |
32 |
-#define H264_WEIGHT(W,H) \ |
|
33 |
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \ |
|
32 |
+#define H264_WEIGHT(W) \ |
|
33 |
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \ |
|
34 |
+ int log2_denom, int weight, int offset) \ |
|
35 |
+{ \ |
|
34 | 36 |
int y; \ |
35 | 37 |
pixel *block = (pixel*)_block; \ |
36 | 38 |
stride /= sizeof(pixel); \ |
37 | 39 |
offset <<= (log2_denom + (BIT_DEPTH-8)); \ |
38 | 40 |
if(log2_denom) offset += 1<<(log2_denom-1); \ |
39 |
- for(y=0; y<H; y++, block += stride){ \ |
|
41 |
+ for (y = 0; y < height; y++, block += stride) { \ |
|
40 | 42 |
op_scale1(0); \ |
41 | 43 |
op_scale1(1); \ |
42 | 44 |
if(W==2) continue; \ |
... | ... |
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride |
58 | 58 |
op_scale1(15); \ |
59 | 59 |
} \ |
60 | 60 |
} \ |
61 |
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \ |
|
61 |
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \ |
|
62 |
+ int log2_denom, int weightd, int weights, int offset) \ |
|
63 |
+{ \ |
|
62 | 64 |
int y; \ |
63 | 65 |
pixel *dst = (pixel*)_dst; \ |
64 | 66 |
pixel *src = (pixel*)_src; \ |
65 | 67 |
stride /= sizeof(pixel); \ |
66 | 68 |
offset <<= (BIT_DEPTH-8); \ |
67 | 69 |
offset = ((offset + 1) | 1) << log2_denom; \ |
68 |
- for(y=0; y<H; y++, dst += stride, src += stride){ \ |
|
70 |
+ for (y = 0; y < height; y++, dst += stride, src += stride) { \ |
|
69 | 71 |
op_scale2(0); \ |
70 | 72 |
op_scale2(1); \ |
71 | 73 |
if(W==2) continue; \ |
... | ... |
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_ |
88 | 88 |
} \ |
89 | 89 |
} |
90 | 90 |
|
91 |
-H264_WEIGHT(16,16) |
|
92 |
-H264_WEIGHT(16,8) |
|
93 |
-H264_WEIGHT(8,16) |
|
94 |
-H264_WEIGHT(8,8) |
|
95 |
-H264_WEIGHT(8,4) |
|
96 |
-H264_WEIGHT(4,8) |
|
97 |
-H264_WEIGHT(4,4) |
|
98 |
-H264_WEIGHT(4,2) |
|
99 |
-H264_WEIGHT(2,4) |
|
100 |
-H264_WEIGHT(2,2) |
|
91 |
+H264_WEIGHT(16) |
|
92 |
+H264_WEIGHT(8) |
|
93 |
+H264_WEIGHT(4) |
|
94 |
+H264_WEIGHT(2) |
|
101 | 95 |
|
102 | 96 |
#undef op_scale1 |
103 | 97 |
#undef op_scale2 |
... | ... |
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, |
843 | 843 |
} |
844 | 844 |
|
845 | 845 |
static av_always_inline |
846 |
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h) |
|
846 |
+void weight_h264_W_altivec(uint8_t *block, int stride, int height, |
|
847 |
+ int log2_denom, int weight, int offset, int w) |
|
847 | 848 |
{ |
848 | 849 |
int y, aligned; |
849 | 850 |
vec_u8 vblock; |
... | ... |
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei |
864 | 864 |
voffset = vec_splat(vtemp, 5); |
865 | 865 |
aligned = !((unsigned long)block & 0xf); |
866 | 866 |
|
867 |
- for (y=0; y<h; y++) { |
|
867 |
+ for (y = 0; y < height; y++) { |
|
868 | 868 |
vblock = vec_ld(0, block); |
869 | 869 |
|
870 | 870 |
v0 = (vec_s16)vec_mergeh(zero_u8v, vblock); |
... | ... |
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei |
888 | 888 |
} |
889 | 889 |
|
890 | 890 |
static av_always_inline |
891 |
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, |
|
892 |
- int weightd, int weights, int offset, int w, int h) |
|
891 |
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height, |
|
892 |
+ int log2_denom, int weightd, int weights, int offset, int w) |
|
893 | 893 |
{ |
894 | 894 |
int y, dst_aligned, src_aligned; |
895 | 895 |
vec_u8 vsrc, vdst; |
... | ... |
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_ |
912 | 912 |
dst_aligned = !((unsigned long)dst & 0xf); |
913 | 913 |
src_aligned = !((unsigned long)src & 0xf); |
914 | 914 |
|
915 |
- for (y=0; y<h; y++) { |
|
915 |
+ for (y = 0; y < height; y++) { |
|
916 | 916 |
vdst = vec_ld(0, dst); |
917 | 917 |
vsrc = vec_ld(0, src); |
918 | 918 |
|
... | ... |
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_ |
952 | 952 |
} |
953 | 953 |
} |
954 | 954 |
|
955 |
-#define H264_WEIGHT(W,H) \ |
|
956 |
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ |
|
957 |
- weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \ |
|
955 |
+#define H264_WEIGHT(W) \ |
|
956 |
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \ |
|
957 |
+ int log2_denom, int weight, int offset){ \ |
|
958 |
+ weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \ |
|
958 | 959 |
}\ |
959 |
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ |
|
960 |
- biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ |
|
960 |
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \ |
|
961 |
+ int log2_denom, int weightd, int weights, int offset){ \ |
|
962 |
+ biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \ |
|
961 | 963 |
} |
962 | 964 |
|
963 |
-H264_WEIGHT(16,16) |
|
964 |
-H264_WEIGHT(16, 8) |
|
965 |
-H264_WEIGHT( 8,16) |
|
966 |
-H264_WEIGHT( 8, 8) |
|
967 |
-H264_WEIGHT( 8, 4) |
|
965 |
+H264_WEIGHT(16) |
|
966 |
+H264_WEIGHT( 8) |
|
968 | 967 |
|
969 | 968 |
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { |
970 | 969 |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
... | ... |
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom |
1015 | 1015 |
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; |
1016 | 1016 |
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; |
1017 | 1017 |
|
1018 |
- c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec; |
|
1019 |
- c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec; |
|
1020 |
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec; |
|
1021 |
- c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec; |
|
1022 |
- c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec; |
|
1023 |
- c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec; |
|
1024 |
- c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec; |
|
1025 |
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec; |
|
1026 |
- c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec; |
|
1027 |
- c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec; |
|
1018 |
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec; |
|
1019 |
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec; |
|
1020 |
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec; |
|
1021 |
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec; |
|
1028 | 1022 |
} |
1029 | 1023 |
} |
1030 | 1024 |
} |
... | ... |
@@ -28,21 +28,20 @@ SECTION .text |
28 | 28 |
;----------------------------------------------------------------------------- |
29 | 29 |
; biweight pred: |
30 | 30 |
; |
31 |
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, |
|
32 |
-; int log2_denom, int weightd, int weights, |
|
33 |
-; int offset); |
|
31 |
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, |
|
32 |
+; int height, int log2_denom, int weightd, |
|
33 |
+; int weights, int offset); |
|
34 | 34 |
; and |
35 |
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride, |
|
36 |
-; int log2_denom, int weight, |
|
37 |
-; int offset); |
|
35 |
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, |
|
36 |
+; int log2_denom, int weight, int offset); |
|
38 | 37 |
;----------------------------------------------------------------------------- |
39 | 38 |
|
40 | 39 |
%macro WEIGHT_SETUP 0 |
41 |
- add r4, r4 |
|
42 |
- inc r4 |
|
43 |
- movd m3, r3d |
|
44 |
- movd m5, r4d |
|
45 |
- movd m6, r2d |
|
40 |
+ add r5, r5 |
|
41 |
+ inc r5 |
|
42 |
+ movd m3, r4d |
|
43 |
+ movd m5, r5d |
|
44 |
+ movd m6, r3d |
|
46 | 45 |
pslld m5, m6 |
47 | 46 |
psrld m5, 1 |
48 | 47 |
%if mmsize == 16 |
... | ... |
@@ -71,60 +70,41 @@ SECTION .text |
71 | 71 |
packuswb m0, m1 |
72 | 72 |
%endmacro |
73 | 73 |
|
74 |
-%macro WEIGHT_FUNC_DBL_MM 1 |
|
75 |
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0 |
|
74 |
+INIT_MMX |
|
75 |
+cglobal h264_weight_16_mmx2, 6, 6, 0 |
|
76 | 76 |
WEIGHT_SETUP |
77 |
- mov r2, %1 |
|
78 |
-%if %1 == 16 |
|
79 | 77 |
.nextrow |
80 | 78 |
WEIGHT_OP 0, 4 |
81 | 79 |
mova [r0 ], m0 |
82 | 80 |
WEIGHT_OP 8, 12 |
83 | 81 |
mova [r0+8], m0 |
84 | 82 |
add r0, r1 |
85 |
- dec r2 |
|
83 |
+ dec r2d |
|
86 | 84 |
jnz .nextrow |
87 | 85 |
REP_RET |
88 |
-%else |
|
89 |
- jmp mangle(ff_h264_weight_16x16_mmx2.nextrow) |
|
90 |
-%endif |
|
91 |
-%endmacro |
|
92 | 86 |
|
93 |
-INIT_MMX |
|
94 |
-WEIGHT_FUNC_DBL_MM 16 |
|
95 |
-WEIGHT_FUNC_DBL_MM 8 |
|
96 |
- |
|
97 |
-%macro WEIGHT_FUNC_MM 4 |
|
98 |
-cglobal h264_weight_%1x%2_%4, 7, 7, %3 |
|
87 |
+%macro WEIGHT_FUNC_MM 3 |
|
88 |
+cglobal h264_weight_%1_%3, 6, 6, %2 |
|
99 | 89 |
WEIGHT_SETUP |
100 |
- mov r2, %2 |
|
101 |
-%if %2 == 16 |
|
102 | 90 |
.nextrow |
103 | 91 |
WEIGHT_OP 0, mmsize/2 |
104 | 92 |
mova [r0], m0 |
105 | 93 |
add r0, r1 |
106 |
- dec r2 |
|
94 |
+ dec r2d |
|
107 | 95 |
jnz .nextrow |
108 | 96 |
REP_RET |
109 |
-%else |
|
110 |
- jmp mangle(ff_h264_weight_%1x16_%4.nextrow) |
|
111 |
-%endif |
|
112 | 97 |
%endmacro |
113 | 98 |
|
114 | 99 |
INIT_MMX |
115 |
-WEIGHT_FUNC_MM 8, 16, 0, mmx2 |
|
116 |
-WEIGHT_FUNC_MM 8, 8, 0, mmx2 |
|
117 |
-WEIGHT_FUNC_MM 8, 4, 0, mmx2 |
|
100 |
+WEIGHT_FUNC_MM 8, 0, mmx2 |
|
118 | 101 |
INIT_XMM |
119 |
-WEIGHT_FUNC_MM 16, 16, 8, sse2 |
|
120 |
-WEIGHT_FUNC_MM 16, 8, 8, sse2 |
|
102 |
+WEIGHT_FUNC_MM 16, 8, sse2 |
|
121 | 103 |
|
122 |
-%macro WEIGHT_FUNC_HALF_MM 5 |
|
123 |
-cglobal h264_weight_%1x%2_%5, 5, 5, %4 |
|
104 |
+%macro WEIGHT_FUNC_HALF_MM 3 |
|
105 |
+cglobal h264_weight_%1_%3, 6, 6, %2 |
|
124 | 106 |
WEIGHT_SETUP |
125 |
- mov r2, %2/2 |
|
107 |
+ sar r2d, 1 |
|
126 | 108 |
lea r3, [r1*2] |
127 |
-%if %2 == mmsize |
|
128 | 109 |
.nextrow |
129 | 110 |
WEIGHT_OP 0, r1 |
130 | 111 |
movh [r0], m0 |
... | ... |
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4 |
135 | 135 |
movh [r0+r1], m0 |
136 | 136 |
%endif |
137 | 137 |
add r0, r3 |
138 |
- dec r2 |
|
138 |
+ dec r2d |
|
139 | 139 |
jnz .nextrow |
140 | 140 |
REP_RET |
141 |
-%else |
|
142 |
- jmp mangle(ff_h264_weight_%1x%3_%5.nextrow) |
|
143 |
-%endif |
|
144 | 141 |
%endmacro |
145 | 142 |
|
146 | 143 |
INIT_MMX |
147 |
-WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 |
|
148 |
-WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 |
|
149 |
-WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 |
|
144 |
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2 |
|
145 |
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2 |
|
146 |
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2 |
|
150 | 147 |
INIT_XMM |
151 |
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 |
|
152 |
-WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 |
|
153 |
-WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 |
|
148 |
+WEIGHT_FUNC_HALF_MM 8, 8, sse2 |
|
149 |
+WEIGHT_FUNC_HALF_MM 8, 8, sse2 |
|
150 |
+WEIGHT_FUNC_HALF_MM 8, 8, sse2 |
|
154 | 151 |
|
155 | 152 |
%macro BIWEIGHT_SETUP 0 |
156 |
- add r6, 1 |
|
157 |
- or r6, 1 |
|
158 |
- add r3, 1 |
|
159 |
- movd m3, r4d |
|
160 |
- movd m4, r5d |
|
161 |
- movd m5, r6d |
|
162 |
- movd m6, r3d |
|
153 |
+%ifdef ARCH_X86_64 |
|
154 |
+%define off_regd r11d |
|
155 |
+%else |
|
156 |
+%define off_regd r3d |
|
157 |
+%endif |
|
158 |
+ mov off_regd, r7m |
|
159 |
+ add off_regd, 1 |
|
160 |
+ or off_regd, 1 |
|
161 |
+ add r4, 1 |
|
162 |
+ movd m3, r5d |
|
163 |
+ movd m4, r6d |
|
164 |
+ movd m5, off_regd |
|
165 |
+ movd m6, r4d |
|
163 | 166 |
pslld m5, m6 |
164 | 167 |
psrld m5, 1 |
165 | 168 |
%if mmsize == 16 |
... | ... |
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 |
195 | 195 |
packuswb m0, m1 |
196 | 196 |
%endmacro |
197 | 197 |
|
198 |
-%macro BIWEIGHT_FUNC_DBL_MM 1 |
|
199 |
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 |
|
198 |
+INIT_MMX |
|
199 |
+cglobal h264_biweight_16_mmx2, 7, 7, 0 |
|
200 | 200 |
BIWEIGHT_SETUP |
201 |
- mov r3, %1 |
|
202 |
-%if %1 == 16 |
|
201 |
+ movifnidn r3d, r3m |
|
203 | 202 |
.nextrow |
204 | 203 |
BIWEIGHT_STEPA 0, 1, 0 |
205 | 204 |
BIWEIGHT_STEPA 1, 2, 4 |
... | ... |
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0 |
211 | 211 |
mova [r0+8], m0 |
212 | 212 |
add r0, r2 |
213 | 213 |
add r1, r2 |
214 |
- dec r3 |
|
214 |
+ dec r3d |
|
215 | 215 |
jnz .nextrow |
216 | 216 |
REP_RET |
217 |
-%else |
|
218 |
- jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow) |
|
219 |
-%endif |
|
220 |
-%endmacro |
|
221 | 217 |
|
222 |
-INIT_MMX |
|
223 |
-BIWEIGHT_FUNC_DBL_MM 16 |
|
224 |
-BIWEIGHT_FUNC_DBL_MM 8 |
|
225 |
- |
|
226 |
-%macro BIWEIGHT_FUNC_MM 4 |
|
227 |
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3 |
|
218 |
+%macro BIWEIGHT_FUNC_MM 3 |
|
219 |
+cglobal h264_biweight_%1_%3, 7, 7, %2 |
|
228 | 220 |
BIWEIGHT_SETUP |
229 |
- mov r3, %2 |
|
230 |
-%if %2 == 16 |
|
221 |
+ movifnidn r3d, r3m |
|
231 | 222 |
.nextrow |
232 | 223 |
BIWEIGHT_STEPA 0, 1, 0 |
233 | 224 |
BIWEIGHT_STEPA 1, 2, mmsize/2 |
... | ... |
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3 |
235 | 235 |
mova [r0], m0 |
236 | 236 |
add r0, r2 |
237 | 237 |
add r1, r2 |
238 |
- dec r3 |
|
238 |
+ dec r3d |
|
239 | 239 |
jnz .nextrow |
240 | 240 |
REP_RET |
241 |
-%else |
|
242 |
- jmp mangle(ff_h264_biweight_%1x16_%4.nextrow) |
|
243 |
-%endif |
|
244 | 241 |
%endmacro |
245 | 242 |
|
246 | 243 |
INIT_MMX |
247 |
-BIWEIGHT_FUNC_MM 8, 16, 0, mmx2 |
|
248 |
-BIWEIGHT_FUNC_MM 8, 8, 0, mmx2 |
|
249 |
-BIWEIGHT_FUNC_MM 8, 4, 0, mmx2 |
|
244 |
+BIWEIGHT_FUNC_MM 8, 0, mmx2 |
|
250 | 245 |
INIT_XMM |
251 |
-BIWEIGHT_FUNC_MM 16, 16, 8, sse2 |
|
252 |
-BIWEIGHT_FUNC_MM 16, 8, 8, sse2 |
|
246 |
+BIWEIGHT_FUNC_MM 16, 8, sse2 |
|
253 | 247 |
|
254 |
-%macro BIWEIGHT_FUNC_HALF_MM 5 |
|
255 |
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4 |
|
248 |
+%macro BIWEIGHT_FUNC_HALF_MM 3 |
|
249 |
+cglobal h264_biweight_%1_%3, 7, 7, %2 |
|
256 | 250 |
BIWEIGHT_SETUP |
257 |
- mov r3, %2/2 |
|
251 |
+ movifnidn r3d, r3m |
|
252 |
+ sar r3, 1 |
|
258 | 253 |
lea r4, [r2*2] |
259 |
-%if %2 == mmsize |
|
260 | 254 |
.nextrow |
261 | 255 |
BIWEIGHT_STEPA 0, 1, 0 |
262 | 256 |
BIWEIGHT_STEPA 1, 2, r2 |
... | ... |
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4 |
270 | 270 |
%endif |
271 | 271 |
add r0, r4 |
272 | 272 |
add r1, r4 |
273 |
- dec r3 |
|
273 |
+ dec r3d |
|
274 | 274 |
jnz .nextrow |
275 | 275 |
REP_RET |
276 |
-%else |
|
277 |
- jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow) |
|
278 |
-%endif |
|
279 | 276 |
%endmacro |
280 | 277 |
|
281 | 278 |
INIT_MMX |
282 |
-BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2 |
|
283 |
-BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2 |
|
284 |
-BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2 |
|
279 |
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2 |
|
285 | 280 |
INIT_XMM |
286 |
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2 |
|
287 |
-BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2 |
|
288 |
-BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 |
|
281 |
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2 |
|
289 | 282 |
|
290 | 283 |
%macro BIWEIGHT_SSSE3_SETUP 0 |
291 |
- add r6, 1 |
|
292 |
- or r6, 1 |
|
293 |
- add r3, 1 |
|
294 |
- movd m4, r4d |
|
295 |
- movd m0, r5d |
|
296 |
- movd m5, r6d |
|
297 |
- movd m6, r3d |
|
284 |
+%ifdef ARCH_X86_64 |
|
285 |
+%define off_regd r11d |
|
286 |
+%else |
|
287 |
+%define off_regd r3d |
|
288 |
+%endif |
|
289 |
+ mov off_regd, r7m |
|
290 |
+ add off_regd, 1 |
|
291 |
+ or off_regd, 1 |
|
292 |
+ add r4, 1 |
|
293 |
+ movd m4, r5d |
|
294 |
+ movd m0, r6d |
|
295 |
+ movd m5, off_regd |
|
296 |
+ movd m6, r4d |
|
298 | 297 |
pslld m5, m6 |
299 | 298 |
psrld m5, 1 |
300 | 299 |
punpcklbw m4, m0 |
... | ... |
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2 |
314 | 314 |
packuswb m0, m2 |
315 | 315 |
%endmacro |
316 | 316 |
|
317 |
-%macro BIWEIGHT_SSSE3_16 1 |
|
318 |
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 |
|
317 |
+INIT_XMM |
|
318 |
+cglobal h264_biweight_16_ssse3, 7, 7, 8 |
|
319 | 319 |
BIWEIGHT_SSSE3_SETUP |
320 |
- mov r3, %1 |
|
320 |
+ movifnidn r3d, r3m |
|
321 | 321 |
|
322 |
-%if %1 == 16 |
|
323 | 322 |
.nextrow |
324 | 323 |
movh m0, [r0] |
325 | 324 |
movh m2, [r0+8] |
... | ... |
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8 |
330 | 330 |
mova [r0], m0 |
331 | 331 |
add r0, r2 |
332 | 332 |
add r1, r2 |
333 |
- dec r3 |
|
333 |
+ dec r3d |
|
334 | 334 |
jnz .nextrow |
335 | 335 |
REP_RET |
336 |
-%else |
|
337 |
- jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow) |
|
338 |
-%endif |
|
339 |
-%endmacro |
|
340 | 336 |
|
341 | 337 |
INIT_XMM |
342 |
-BIWEIGHT_SSSE3_16 16 |
|
343 |
-BIWEIGHT_SSSE3_16 8 |
|
344 |
- |
|
345 |
-%macro BIWEIGHT_SSSE3_8 1 |
|
346 |
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 |
|
338 |
+cglobal h264_biweight_8_ssse3, 7, 7, 8 |
|
347 | 339 |
BIWEIGHT_SSSE3_SETUP |
348 |
- mov r3, %1/2 |
|
340 |
+ movifnidn r3d, r3m |
|
341 |
+ sar r3, 1 |
|
349 | 342 |
lea r4, [r2*2] |
350 | 343 |
|
351 |
-%if %1 == 16 |
|
352 | 344 |
.nextrow |
353 | 345 |
movh m0, [r0] |
354 | 346 |
movh m1, [r1] |
... | ... |
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8 |
361 | 361 |
movhps [r0+r2], m0 |
362 | 362 |
add r0, r4 |
363 | 363 |
add r1, r4 |
364 |
- dec r3 |
|
364 |
+ dec r3d |
|
365 | 365 |
jnz .nextrow |
366 | 366 |
REP_RET |
367 |
-%else |
|
368 |
- jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow) |
|
369 |
-%endif |
|
370 |
-%endmacro |
|
371 |
- |
|
372 |
-INIT_XMM |
|
373 |
-BIWEIGHT_SSSE3_8 16 |
|
374 |
-BIWEIGHT_SSSE3_8 8 |
|
375 |
-BIWEIGHT_SSSE3_8 4 |
... | ... |
@@ -36,33 +36,26 @@ cextern pw_1 |
36 | 36 |
SECTION .text |
37 | 37 |
|
38 | 38 |
;----------------------------------------------------------------------------- |
39 |
-; void h264_weight(uint8_t *dst, int stride, int log2_denom, |
|
39 |
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, |
|
40 | 40 |
; int weight, int offset); |
41 | 41 |
;----------------------------------------------------------------------------- |
42 |
-%ifdef ARCH_X86_32 |
|
43 |
-DECLARE_REG_TMP 2 |
|
44 |
-%else |
|
45 |
-DECLARE_REG_TMP 10 |
|
46 |
-%endif |
|
47 |
- |
|
48 |
-%macro WEIGHT_PROLOGUE 1 |
|
49 |
- mov t0, %1 |
|
42 |
+%macro WEIGHT_PROLOGUE 0 |
|
50 | 43 |
.prologue |
51 |
- PROLOGUE 0,5,8 |
|
44 |
+ PROLOGUE 0,6,8 |
|
52 | 45 |
movifnidn r0, r0mp |
53 | 46 |
movifnidn r1d, r1m |
54 |
- movifnidn r3d, r3m |
|
55 | 47 |
movifnidn r4d, r4m |
48 |
+ movifnidn r5d, r5m |
|
56 | 49 |
%endmacro |
57 | 50 |
|
58 | 51 |
%macro WEIGHT_SETUP 1 |
59 | 52 |
mova m0, [pw_1] |
60 |
- movd m2, r2m |
|
53 |
+ movd m2, r3m |
|
61 | 54 |
pslld m0, m2 ; 1<<log2_denom |
62 | 55 |
SPLATW m0, m0 |
63 |
- shl r4, 19 ; *8, move to upper half of dword |
|
64 |
- lea r4, [r4+r3*2+0x10000] |
|
65 |
- movd m3, r4d ; weight<<1 | 1+(offset<<(3)) |
|
56 |
+ shl r5, 19 ; *8, move to upper half of dword |
|
57 |
+ lea r5, [r5+r4*2+0x10000] |
|
58 |
+ movd m3, r5d ; weight<<1 | 1+(offset<<(3)) |
|
66 | 59 |
pshufd m3, m3, 0 |
67 | 60 |
mova m4, [pw_pixel_max] |
68 | 61 |
paddw m2, [sq_1] ; log2_denom+1 |
... | ... |
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10 |
96 | 96 |
%endmacro |
97 | 97 |
|
98 | 98 |
%macro WEIGHT_FUNC_DBL 1 |
99 |
-cglobal h264_weight_16x16_10_%1 |
|
100 |
- WEIGHT_PROLOGUE 16 |
|
99 |
+cglobal h264_weight_16_10_%1 |
|
100 |
+ WEIGHT_PROLOGUE |
|
101 | 101 |
WEIGHT_SETUP %1 |
102 | 102 |
.nextrow |
103 | 103 |
WEIGHT_OP %1, 0 |
... | ... |
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1 |
105 | 105 |
WEIGHT_OP %1, 16 |
106 | 106 |
mova [r0+16], m5 |
107 | 107 |
add r0, r1 |
108 |
- dec t0 |
|
108 |
+ dec r2d |
|
109 | 109 |
jnz .nextrow |
110 | 110 |
REP_RET |
111 |
- |
|
112 |
-cglobal h264_weight_16x8_10_%1 |
|
113 |
- mov t0, 8 |
|
114 |
- jmp mangle(ff_h264_weight_16x16_10_%1.prologue) |
|
115 | 111 |
%endmacro |
116 | 112 |
|
117 | 113 |
INIT_XMM |
... | ... |
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4 |
120 | 120 |
|
121 | 121 |
|
122 | 122 |
%macro WEIGHT_FUNC_MM 1 |
123 |
-cglobal h264_weight_8x16_10_%1 |
|
124 |
- WEIGHT_PROLOGUE 16 |
|
123 |
+cglobal h264_weight_8_10_%1 |
|
124 |
+ WEIGHT_PROLOGUE |
|
125 | 125 |
WEIGHT_SETUP %1 |
126 | 126 |
.nextrow |
127 | 127 |
WEIGHT_OP %1, 0 |
128 | 128 |
mova [r0], m5 |
129 | 129 |
add r0, r1 |
130 |
- dec t0 |
|
130 |
+ dec r2d |
|
131 | 131 |
jnz .nextrow |
132 | 132 |
REP_RET |
133 |
- |
|
134 |
-cglobal h264_weight_8x8_10_%1 |
|
135 |
- mov t0, 8 |
|
136 |
- jmp mangle(ff_h264_weight_8x16_10_%1.prologue) |
|
137 |
- |
|
138 |
-cglobal h264_weight_8x4_10_%1 |
|
139 |
- mov t0, 4 |
|
140 |
- jmp mangle(ff_h264_weight_8x16_10_%1.prologue) |
|
141 | 133 |
%endmacro |
142 | 134 |
|
143 | 135 |
INIT_XMM |
... | ... |
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4 |
146 | 146 |
|
147 | 147 |
|
148 | 148 |
%macro WEIGHT_FUNC_HALF_MM 1 |
149 |
-cglobal h264_weight_4x8_10_%1 |
|
150 |
- WEIGHT_PROLOGUE 4 |
|
149 |
+cglobal h264_weight_4_10_%1 |
|
150 |
+ WEIGHT_PROLOGUE |
|
151 |
+ sar r2d, 1 |
|
151 | 152 |
WEIGHT_SETUP %1 |
152 | 153 |
lea r3, [r1*2] |
153 | 154 |
.nextrow |
... | ... |
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1 |
155 | 155 |
movh [r0], m5 |
156 | 156 |
movhps [r0+r1], m5 |
157 | 157 |
add r0, r3 |
158 |
- dec t0 |
|
158 |
+ dec r2d |
|
159 | 159 |
jnz .nextrow |
160 | 160 |
REP_RET |
161 |
- |
|
162 |
-cglobal h264_weight_4x4_10_%1 |
|
163 |
- mov t0, 2 |
|
164 |
- jmp mangle(ff_h264_weight_4x8_10_%1.prologue) |
|
165 |
- |
|
166 |
-cglobal h264_weight_4x2_10_%1 |
|
167 |
- mov t0, 1 |
|
168 |
- jmp mangle(ff_h264_weight_4x8_10_%1.prologue) |
|
169 | 161 |
%endmacro |
170 | 162 |
|
171 | 163 |
INIT_XMM |
... | ... |
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4 |
174 | 174 |
|
175 | 175 |
|
176 | 176 |
;----------------------------------------------------------------------------- |
177 |
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom, |
|
178 |
-; int weightd, int weights, int offset); |
|
177 |
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, |
|
178 |
+; int log2_denom, int weightd, int weights, int offset); |
|
179 | 179 |
;----------------------------------------------------------------------------- |
180 | 180 |
%ifdef ARCH_X86_32 |
181 |
-DECLARE_REG_TMP 2,3 |
|
181 |
+DECLARE_REG_TMP 3 |
|
182 | 182 |
%else |
183 |
-DECLARE_REG_TMP 10,2 |
|
183 |
+DECLARE_REG_TMP 10 |
|
184 | 184 |
%endif |
185 | 185 |
|
186 |
-%macro BIWEIGHT_PROLOGUE 1 |
|
187 |
- mov t0, %1 |
|
186 |
+%macro BIWEIGHT_PROLOGUE 0 |
|
188 | 187 |
.prologue |
189 | 188 |
PROLOGUE 0,7,8 |
190 | 189 |
movifnidn r0, r0mp |
191 | 190 |
movifnidn r1, r1mp |
192 |
- movifnidn t1d, r2m |
|
193 |
- movifnidn r4d, r4m |
|
191 |
+ movifnidn r2d, r2m |
|
194 | 192 |
movifnidn r5d, r5m |
195 | 193 |
movifnidn r6d, r6m |
194 |
+ movifnidn t0d, r7m |
|
196 | 195 |
%endmacro |
197 | 196 |
|
198 | 197 |
%macro BIWEIGHT_SETUP 1 |
199 |
- lea r6, [r6*4+1] ; (offset<<2)+1 |
|
200 |
- or r6, 1 |
|
201 |
- shl r5, 16 |
|
202 |
- or r4, r5 |
|
203 |
- movd m4, r4d ; weightd | weights |
|
204 |
- movd m5, r6d ; (offset+1)|1 |
|
205 |
- movd m6, r3m ; log2_denom |
|
198 |
+ lea t0, [t0*4+1] ; (offset<<2)+1 |
|
199 |
+ or t0, 1 |
|
200 |
+ shl r6, 16 |
|
201 |
+ or r5, r6 |
|
202 |
+ movd m4, r5d ; weightd | weights |
|
203 |
+ movd m5, t0d ; (offset+1)|1 |
|
204 |
+ movd m6, r4m ; log2_denom |
|
206 | 205 |
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom |
207 | 206 |
paddd m6, [sq_1] |
208 | 207 |
pshufd m4, m4, 0 |
209 | 208 |
pshufd m5, m5, 0 |
210 | 209 |
mova m3, [pw_pixel_max] |
210 |
+ movifnidn r3d, r3m |
|
211 | 211 |
%ifnidn %1, sse4 |
212 | 212 |
pxor m7, m7 |
213 | 213 |
%endif |
... | ... |
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2 |
243 | 243 |
%endmacro |
244 | 244 |
|
245 | 245 |
%macro BIWEIGHT_FUNC_DBL 1 |
246 |
-cglobal h264_biweight_16x16_10_%1 |
|
247 |
- BIWEIGHT_PROLOGUE 16 |
|
246 |
+cglobal h264_biweight_16_10_%1 |
|
247 |
+ BIWEIGHT_PROLOGUE |
|
248 | 248 |
BIWEIGHT_SETUP %1 |
249 | 249 |
.nextrow |
250 | 250 |
BIWEIGHT %1, 0 |
251 | 251 |
mova [r0 ], m0 |
252 | 252 |
BIWEIGHT %1, 16 |
253 | 253 |
mova [r0+16], m0 |
254 |
- add r0, t1 |
|
255 |
- add r1, t1 |
|
256 |
- dec t0 |
|
254 |
+ add r0, r2 |
|
255 |
+ add r1, r2 |
|
256 |
+ dec r3d |
|
257 | 257 |
jnz .nextrow |
258 | 258 |
REP_RET |
259 |
- |
|
260 |
-cglobal h264_biweight_16x8_10_%1 |
|
261 |
- mov t0, 8 |
|
262 |
- jmp mangle(ff_h264_biweight_16x16_10_%1.prologue) |
|
263 | 259 |
%endmacro |
264 | 260 |
|
265 | 261 |
INIT_XMM |
... | ... |
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2 |
267 | 267 |
BIWEIGHT_FUNC_DBL sse4 |
268 | 268 |
|
269 | 269 |
%macro BIWEIGHT_FUNC 1 |
270 |
-cglobal h264_biweight_8x16_10_%1 |
|
271 |
- BIWEIGHT_PROLOGUE 16 |
|
270 |
+cglobal h264_biweight_8_10_%1 |
|
271 |
+ BIWEIGHT_PROLOGUE |
|
272 | 272 |
BIWEIGHT_SETUP %1 |
273 | 273 |
.nextrow |
274 | 274 |
BIWEIGHT %1, 0 |
275 | 275 |
mova [r0], m0 |
276 |
- add r0, t1 |
|
277 |
- add r1, t1 |
|
278 |
- dec t0 |
|
276 |
+ add r0, r2 |
|
277 |
+ add r1, r2 |
|
278 |
+ dec r3d |
|
279 | 279 |
jnz .nextrow |
280 | 280 |
REP_RET |
281 |
- |
|
282 |
-cglobal h264_biweight_8x8_10_%1 |
|
283 |
- mov t0, 8 |
|
284 |
- jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) |
|
285 |
- |
|
286 |
-cglobal h264_biweight_8x4_10_%1 |
|
287 |
- mov t0, 4 |
|
288 |
- jmp mangle(ff_h264_biweight_8x16_10_%1.prologue) |
|
289 | 281 |
%endmacro |
290 | 282 |
|
291 | 283 |
INIT_XMM |
... | ... |
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2 |
293 | 293 |
BIWEIGHT_FUNC sse4 |
294 | 294 |
|
295 | 295 |
%macro BIWEIGHT_FUNC_HALF 1 |
296 |
-cglobal h264_biweight_4x8_10_%1 |
|
297 |
- BIWEIGHT_PROLOGUE 4 |
|
296 |
+cglobal h264_biweight_4_10_%1 |
|
297 |
+ BIWEIGHT_PROLOGUE |
|
298 | 298 |
BIWEIGHT_SETUP %1 |
299 |
- lea r4, [t1*2] |
|
299 |
+ sar r3d, 1 |
|
300 |
+ lea r4, [r2*2] |
|
300 | 301 |
.nextrow |
301 |
- BIWEIGHT %1, 0, t1 |
|
302 |
+ BIWEIGHT %1, 0, r2 |
|
302 | 303 |
movh [r0 ], m0 |
303 |
- movhps [r0+t1], m0 |
|
304 |
+ movhps [r0+r2], m0 |
|
304 | 305 |
add r0, r4 |
305 | 306 |
add r1, r4 |
306 |
- dec t0 |
|
307 |
+ dec r3d |
|
307 | 308 |
jnz .nextrow |
308 | 309 |
REP_RET |
309 |
- |
|
310 |
-cglobal h264_biweight_4x4_10_%1 |
|
311 |
- mov t0, 2 |
|
312 |
- jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) |
|
313 |
- |
|
314 |
-cglobal h264_biweight_4x2_10_%1 |
|
315 |
- mov t0, 1 |
|
316 |
- jmp mangle(ff_h264_biweight_4x8_10_%1.prologue) |
|
317 | 310 |
%endmacro |
318 | 311 |
|
319 | 312 |
INIT_XMM |
... | ... |
@@ -298,57 +298,47 @@ LF_IFUNC(v, luma_intra, 10, mmxext) |
298 | 298 |
/***********************************/ |
299 | 299 |
/* weighted prediction */ |
300 | 300 |
|
301 |
-#define H264_WEIGHT(W, H, OPT) \ |
|
302 |
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ |
|
303 |
- int stride, int log2_denom, int weight, int offset); |
|
301 |
+#define H264_WEIGHT(W, OPT) \ |
|
302 |
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \ |
|
303 |
+ int stride, int height, int log2_denom, int weight, int offset); |
|
304 | 304 |
|
305 |
-#define H264_BIWEIGHT(W, H, OPT) \ |
|
306 |
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ |
|
307 |
- uint8_t *src, int stride, int log2_denom, int weightd, \ |
|
305 |
+#define H264_BIWEIGHT(W, OPT) \ |
|
306 |
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \ |
|
307 |
+ uint8_t *src, int stride, int height, int log2_denom, int weightd, \ |
|
308 | 308 |
int weights, int offset); |
309 | 309 |
|
310 |
-#define H264_BIWEIGHT_MMX(W,H) \ |
|
311 |
-H264_WEIGHT (W, H, mmx2) \ |
|
312 |
-H264_BIWEIGHT(W, H, mmx2) |
|
313 |
- |
|
314 |
-#define H264_BIWEIGHT_MMX_SSE(W,H) \ |
|
315 |
-H264_BIWEIGHT_MMX(W, H) \ |
|
316 |
-H264_WEIGHT (W, H, sse2) \ |
|
317 |
-H264_BIWEIGHT (W, H, sse2) \ |
|
318 |
-H264_BIWEIGHT (W, H, ssse3) |
|
319 |
- |
|
320 |
-H264_BIWEIGHT_MMX_SSE(16, 16) |
|
321 |
-H264_BIWEIGHT_MMX_SSE(16, 8) |
|
322 |
-H264_BIWEIGHT_MMX_SSE( 8, 16) |
|
323 |
-H264_BIWEIGHT_MMX_SSE( 8, 8) |
|
324 |
-H264_BIWEIGHT_MMX_SSE( 8, 4) |
|
325 |
-H264_BIWEIGHT_MMX ( 4, 8) |
|
326 |
-H264_BIWEIGHT_MMX ( 4, 4) |
|
327 |
-H264_BIWEIGHT_MMX ( 4, 2) |
|
328 |
- |
|
329 |
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \ |
|
330 |
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ |
|
331 |
- int stride, int log2_denom, int weight, int offset); |
|
332 |
- |
|
333 |
-#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \ |
|
334 |
-void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \ |
|
335 |
- (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \ |
|
310 |
+#define H264_BIWEIGHT_MMX(W) \ |
|
311 |
+H264_WEIGHT (W, mmx2) \ |
|
312 |
+H264_BIWEIGHT(W, mmx2) |
|
313 |
+ |
|
314 |
+#define H264_BIWEIGHT_MMX_SSE(W) \ |
|
315 |
+H264_BIWEIGHT_MMX(W) \ |
|
316 |
+H264_WEIGHT (W, sse2) \ |
|
317 |
+H264_BIWEIGHT (W, sse2) \ |
|
318 |
+H264_BIWEIGHT (W, ssse3) |
|
319 |
+ |
|
320 |
+H264_BIWEIGHT_MMX_SSE(16) |
|
321 |
+H264_BIWEIGHT_MMX_SSE( 8) |
|
322 |
+H264_BIWEIGHT_MMX ( 4) |
|
323 |
+ |
|
324 |
+#define H264_WEIGHT_10(W, DEPTH, OPT) \ |
|
325 |
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ |
|
326 |
+ int stride, int height, int log2_denom, int weight, int offset); |
|
327 |
+ |
|
328 |
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \ |
|
329 |
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \ |
|
330 |
+ (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \ |
|
336 | 331 |
int weightd, int weights, int offset); |
337 | 332 |
|
338 |
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \ |
|
339 |
-H264_WEIGHT_10 (W, H, DEPTH, sse2) \ |
|
340 |
-H264_WEIGHT_10 (W, H, DEPTH, sse4) \ |
|
341 |
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \ |
|
342 |
-H264_BIWEIGHT_10(W, H, DEPTH, sse4) |
|
343 |
- |
|
344 |
-H264_BIWEIGHT_10_SSE(16, 16, 10) |
|
345 |
-H264_BIWEIGHT_10_SSE(16, 8, 10) |
|
346 |
-H264_BIWEIGHT_10_SSE( 8, 16, 10) |
|
347 |
-H264_BIWEIGHT_10_SSE( 8, 8, 10) |
|
348 |
-H264_BIWEIGHT_10_SSE( 8, 4, 10) |
|
349 |
-H264_BIWEIGHT_10_SSE( 4, 8, 10) |
|
350 |
-H264_BIWEIGHT_10_SSE( 4, 4, 10) |
|
351 |
-H264_BIWEIGHT_10_SSE( 4, 2, 10) |
|
333 |
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \ |
|
334 |
+H264_WEIGHT_10 (W, DEPTH, sse2) \ |
|
335 |
+H264_WEIGHT_10 (W, DEPTH, sse4) \ |
|
336 |
+H264_BIWEIGHT_10(W, DEPTH, sse2) \ |
|
337 |
+H264_BIWEIGHT_10(W, DEPTH, sse4) |
|
338 |
+ |
|
339 |
+H264_BIWEIGHT_10_SSE(16, 10) |
|
340 |
+H264_BIWEIGHT_10_SSE( 8, 10) |
|
341 |
+H264_BIWEIGHT_10_SSE( 4, 10) |
|
352 | 342 |
|
353 | 343 |
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) |
354 | 344 |
{ |
... | ... |
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom |
394 | 394 |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; |
395 | 395 |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; |
396 | 396 |
#endif |
397 |
- c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; |
|
398 |
- c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; |
|
399 |
- c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; |
|
400 |
- c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; |
|
401 |
- c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; |
|
402 |
- c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; |
|
403 |
- c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; |
|
404 |
- c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; |
|
405 |
- |
|
406 |
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; |
|
407 |
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; |
|
408 |
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; |
|
409 |
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; |
|
410 |
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; |
|
411 |
- c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; |
|
412 |
- c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; |
|
413 |
- c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; |
|
397 |
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2; |
|
398 |
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2; |
|
399 |
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2; |
|
400 |
+ |
|
401 |
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2; |
|
402 |
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2; |
|
403 |
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2; |
|
414 | 404 |
|
415 | 405 |
if (mm_flags&AV_CPU_FLAG_SSE2) { |
416 | 406 |
c->h264_idct8_add = ff_h264_idct8_add_8_sse2; |
... | ... |
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom |
422 | 422 |
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; |
423 | 423 |
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; |
424 | 424 |
|
425 |
- c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; |
|
426 |
- c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; |
|
427 |
- c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; |
|
428 |
- c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; |
|
429 |
- c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; |
|
425 |
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2; |
|
426 |
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2; |
|
430 | 427 |
|
431 |
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; |
|
432 |
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; |
|
433 |
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; |
|
434 |
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; |
|
435 |
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; |
|
428 |
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2; |
|
429 |
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2; |
|
436 | 430 |
|
437 | 431 |
#if HAVE_ALIGNED_STACK |
438 | 432 |
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; |
... | ... |
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom |
442 | 442 |
#endif |
443 | 443 |
} |
444 | 444 |
if (mm_flags&AV_CPU_FLAG_SSSE3) { |
445 |
- c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; |
|
446 |
- c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; |
|
447 |
- c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; |
|
448 |
- c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; |
|
449 |
- c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; |
|
445 |
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3; |
|
446 |
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3; |
|
450 | 447 |
} |
451 | 448 |
if (mm_flags&AV_CPU_FLAG_AVX) { |
452 | 449 |
#if HAVE_ALIGNED_STACK |
... | ... |
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom |
485 | 485 |
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; |
486 | 486 |
#endif |
487 | 487 |
|
488 |
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2; |
|
489 |
- c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2; |
|
490 |
- c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2; |
|
491 |
- c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2; |
|
492 |
- c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2; |
|
493 |
- c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2; |
|
494 |
- c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2; |
|
495 |
- c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2; |
|
496 |
- |
|
497 |
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2; |
|
498 |
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2; |
|
499 |
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2; |
|
500 |
- c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2; |
|
501 |
- c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2; |
|
502 |
- c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2; |
|
503 |
- c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2; |
|
504 |
- c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2; |
|
488 |
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; |
|
489 |
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; |
|
490 |
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; |
|
491 |
+ |
|
492 |
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; |
|
493 |
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; |
|
494 |
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; |
|
505 | 495 |
|
506 | 496 |
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; |
507 | 497 |
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; |
... | ... |
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom |
513 | 513 |
#endif |
514 | 514 |
} |
515 | 515 |
if (mm_flags&AV_CPU_FLAG_SSE4) { |
516 |
- c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4; |
|
517 |
- c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4; |
|
518 |
- c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4; |
|
519 |
- c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4; |
|
520 |
- c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4; |
|
521 |
- c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4; |
|
522 |
- c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4; |
|
523 |
- c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4; |
|
524 |
- |
|
525 |
- c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4; |
|
526 |
- c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4; |
|
527 |
- c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4; |
|
528 |
- c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4; |
|
529 |
- c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4; |
|
530 |
- c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4; |
|
531 |
- c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4; |
|
532 |
- c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4; |
|
516 |
+ c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; |
|
517 |
+ c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; |
|
518 |
+ c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; |
|
519 |
+ |
|
520 |
+ c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; |
|
521 |
+ c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; |
|
522 |
+ c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; |
|
533 | 523 |
} |
534 | 524 |
#if HAVE_AVX |
535 | 525 |
if (mm_flags&AV_CPU_FLAG_AVX) { |