Browse code

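H264: change weight/biweight functions to take a height argument.

The per-block-size variants (16x16, 16x8, 8x16, ...) are replaced by one function per block width (16, 8, 4), with the block height passed as a parameter; this also removes the extra second-half chroma calls that 4:2:2 previously needed.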

Neon parts by Mans Rullgard <mans@mansr.com>.

Ronald S. Bultje authored on 2011/10/21 16:00:39
Showing 10 changed files
... ...
@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
32 32
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
33 33
                                        int beta, int8_t *tc0);
34 34
 
35
-void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
36
-                                      int weight, int offset);
37
-void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
38
-                                     int weight, int offset);
39
-void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
40
-                                     int weight, int offset);
41
-void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
42
-                                    int weight, int offset);
43
-void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
44
-                                    int weight, int offset);
45
-void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
46
-                                    int weight, int offset);
47
-void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
48
-                                    int weight, int offset);
49
-void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
50
-                                    int weight, int offset);
35
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
36
+                                   int log2_den, int weight, int offset);
37
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
38
+                                  int log2_den, int weight, int offset);
39
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
40
+                                  int log2_den, int weight, int offset);
51 41
 
52
-void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
53
-                                        int log2_den, int weightd, int weights,
54
-                                        int offset);
55
-void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
56
-                                       int log2_den, int weightd, int weights,
57
-                                       int offset);
58
-void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
59
-                                       int log2_den, int weightd, int weights,
60
-                                       int offset);
61
-void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
62
-                                      int log2_den, int weightd, int weights,
63
-                                      int offset);
64
-void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
65
-                                      int log2_den, int weightd, int weights,
66
-                                      int offset);
67
-void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
68
-                                      int log2_den, int weightd, int weights,
69
-                                      int offset);
70
-void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
71
-                                      int log2_den, int weightd, int weights,
72
-                                      int offset);
73
-void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
74
-                                      int log2_den, int weightd, int weights,
75
-                                      int offset);
42
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
43
+                                     int height, int log2_den, int weightd,
44
+                                     int weights, int offset);
45
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
46
+                                    int height, int log2_den, int weightd,
47
+                                    int weights, int offset);
48
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
49
+                                    int height, int log2_den, int weightd,
50
+                                    int weights, int offset);
76 51
 
77 52
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
78 53
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
... ...
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
100 100
     c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
101 101
     c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
102 102
 
103
-    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
104
-    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
105
-    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
106
-    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
107
-    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
108
-    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
109
-    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
110
-    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
103
+    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
104
+    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
105
+    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
111 106
 
112
-    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
113
-    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
114
-    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
115
-    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
116
-    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
117
-    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
118
-    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
119
-    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
107
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
108
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
109
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
120 110
 
121 111
     c->h264_idct_add        = ff_h264_idct_add_neon;
122 112
     c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
... ...
@@ -1592,7 +1592,7 @@ endfunc
1592 1592
         vdup.8          d1,  r5
1593 1593
         vmov            q2,  q8
1594 1594
         vmov            q3,  q8
1595
-1:      subs            ip,  ip,  #2
1595
+1:      subs            r3,  r3,  #2
1596 1596
         vld1.8          {d20-d21},[r0,:128], r2
1597 1597
         \macd           q2,  d0,  d20
1598 1598
         pld             [r0]
... ...
@@ -1632,7 +1632,7 @@ endfunc
1632 1632
         vdup.8          d1,  r5
1633 1633
         vmov            q1,  q8
1634 1634
         vmov            q10, q8
1635
-1:      subs            ip,  ip,  #2
1635
+1:      subs            r3,  r3,  #2
1636 1636
         vld1.8          {d4},[r0,:64], r2
1637 1637
         \macd           q1,  d0,  d4
1638 1638
         pld             [r0]
... ...
@@ -1662,7 +1662,7 @@ endfunc
1662 1662
         vdup.8          d1,  r5
1663 1663
         vmov            q1,  q8
1664 1664
         vmov            q10, q8
1665
-1:      subs            ip,  ip,  #4
1665
+1:      subs            r3,  r3,  #4
1666 1666
         vld1.32         {d4[0]},[r0,:32], r2
1667 1667
         vld1.32         {d4[1]},[r0,:32], r2
1668 1668
         \macd           q1,  d0,  d4
... ...
@@ -1700,16 +1700,17 @@ endfunc
1700 1700
         .endm
1701 1701
 
1702 1702
         .macro  biweight_func w
1703
-function biweight_h264_pixels_\w\()_neon
1703
+function ff_biweight_h264_pixels_\w\()_neon, export=1
1704 1704
         push            {r4-r6, lr}
1705
-        add             r4,  sp,  #16
1705
+        ldr             r12, [sp, #16]
1706
+        add             r4,  sp,  #20
1706 1707
         ldm             r4,  {r4-r6}
1707 1708
         lsr             lr,  r4,  #31
1708 1709
         add             r6,  r6,  #1
1709 1710
         eors            lr,  lr,  r5,  lsr #30
1710 1711
         orr             r6,  r6,  #1
1711
-        vdup.16         q9,  r3
1712
-        lsl             r6,  r6,  r3
1712
+        vdup.16         q9,  r12
1713
+        lsl             r6,  r6,  r12
1713 1714
         vmvn            q9,  q9
1714 1715
         vdup.16         q8,  r6
1715 1716
         mov             r6,  r0
... ...
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
1730 1730
 endfunc
1731 1731
         .endm
1732 1732
 
1733
-        .macro  biweight_entry w, h, b=1
1734
-function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1735
-        mov             ip,  #\h
1736
-.if \b
1737
-        b               biweight_h264_pixels_\w\()_neon
1738
-.endif
1739
-endfunc
1740
-        .endm
1741
-
1742
-        biweight_entry  16, 8
1743
-        biweight_entry  16, 16, b=0
1744 1733
         biweight_func   16
1745
-
1746
-        biweight_entry  8,  16
1747
-        biweight_entry  8,  4
1748
-        biweight_entry  8,  8,  b=0
1749 1734
         biweight_func   8
1750
-
1751
-        biweight_entry  4,  8
1752
-        biweight_entry  4,  2
1753
-        biweight_entry  4,  4,  b=0
1754 1735
         biweight_func   4
1755 1736
 
1756 1737
 @ Weighted prediction
1757 1738
 
1758 1739
         .macro  weight_16 add
1759
-        vdup.8          d0,  r3
1760
-1:      subs            ip,  ip,  #2
1740
+        vdup.8          d0,  r12
1741
+1:      subs            r2,  r2,  #2
1761 1742
         vld1.8          {d20-d21},[r0,:128], r1
1762 1743
         vmull.u8        q2,  d0,  d20
1763 1744
         pld             [r0]
... ...
@@ -1785,8 +1767,8 @@ endfunc
1785 1785
         .endm
1786 1786
 
1787 1787
         .macro  weight_8 add
1788
-        vdup.8          d0,  r3
1789
-1:      subs            ip,  ip,  #2
1788
+        vdup.8          d0,  r12
1789
+1:      subs            r2,  r2,  #2
1790 1790
         vld1.8          {d4},[r0,:64], r1
1791 1791
         vmull.u8        q1,  d0,  d4
1792 1792
         pld             [r0]
... ...
@@ -1806,10 +1788,10 @@ endfunc
1806 1806
         .endm
1807 1807
 
1808 1808
         .macro  weight_4 add
1809
-        vdup.8          d0,  r3
1809
+        vdup.8          d0,  r12
1810 1810
         vmov            q1,  q8
1811 1811
         vmov            q10, q8
1812
-1:      subs            ip,  ip,  #4
1812
+1:      subs            r2,  r2,  #4
1813 1813
         vld1.32         {d4[0]},[r0,:32], r1
1814 1814
         vld1.32         {d4[1]},[r0,:32], r1
1815 1815
         vmull.u8        q1,  d0,  d4
... ...
@@ -1842,50 +1824,32 @@ endfunc
1842 1842
         .endm
1843 1843
 
1844 1844
         .macro  weight_func w
1845
-function weight_h264_pixels_\w\()_neon
1845
+function ff_weight_h264_pixels_\w\()_neon, export=1
1846 1846
         push            {r4, lr}
1847
-        ldr             r4,  [sp, #8]
1848
-        cmp             r2,  #1
1849
-        lsl             r4,  r4,  r2
1847
+        ldr             r12, [sp, #8]
1848
+        ldr             r4,  [sp, #12]
1849
+        cmp             r3,  #1
1850
+        lsl             r4,  r4,  r3
1850 1851
         vdup.16         q8,  r4
1851 1852
         mov             r4,  r0
1852 1853
         ble             20f
1853
-        rsb             lr,  r2,  #1
1854
+        rsb             lr,  r3,  #1
1854 1855
         vdup.16         q9,  lr
1855
-        cmp             r3,  #0
1856
+        cmp             r12, #0
1856 1857
         blt             10f
1857 1858
         weight_\w       vhadd.s16
1858
-10:     rsb             r3,  r3,  #0
1859
+10:     rsb             r12, r12, #0
1859 1860
         weight_\w       vhsub.s16
1860
-20:     rsb             lr,  r2,  #0
1861
+20:     rsb             lr,  r3,  #0
1861 1862
         vdup.16         q9,  lr
1862
-        cmp             r3,  #0
1863
+        cmp             r12, #0
1863 1864
         blt             10f
1864 1865
         weight_\w       vadd.s16
1865
-10:     rsb             r3,  r3,  #0
1866
+10:     rsb             r12, r12, #0
1866 1867
         weight_\w       vsub.s16
1867 1868
 endfunc
1868 1869
         .endm
1869 1870
 
1870
-        .macro  weight_entry w, h, b=1
1871
-function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1872
-        mov             ip,  #\h
1873
-.if \b
1874
-        b               weight_h264_pixels_\w\()_neon
1875
-.endif
1876
-endfunc
1877
-        .endm
1878
-
1879
-        weight_entry    16, 8
1880
-        weight_entry    16, 16, b=0
1881 1871
         weight_func     16
1882
-
1883
-        weight_entry    8,  16
1884
-        weight_entry    8,  4
1885
-        weight_entry    8,  8,  b=0
1886 1872
         weight_func     8
1887
-
1888
-        weight_entry    4,  8
1889
-        weight_entry    4,  2
1890
-        weight_entry    4,  4,  b=0
1891 1873
         weight_func     4
... ...
@@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
438 438
 }
439 439
 #endif
440 440
 
441
-static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
441
+static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
442
+                               int height, int delta, int list,
442 443
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
443 444
                            int src_x_offset, int src_y_offset,
444 445
                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
... ...
@@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
518 518
         s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
519 519
             src_cb= s->edge_emu_buffer;
520 520
     }
521
-    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
521
+    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
522 522
 
523 523
     if(emu){
524 524
         s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
525 525
             src_cr= s->edge_emu_buffer;
526 526
     }
527
-    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
527
+    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
528 528
 }
529 529
 
530
-static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
530
+static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
531 531
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
532 532
                            int x_offset, int y_offset,
533 533
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
... ...
@@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
553 553
 
554 554
     if(list0){
555 555
         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
556
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
556
+        mc_dir_part(h, ref, n, square, height, delta, 0,
557 557
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
558 558
                            qpix_op, chroma_op, pixel_shift, chroma444);
559 559
 
... ...
@@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
563 563
 
564 564
     if(list1){
565 565
         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
566
-        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
566
+        mc_dir_part(h, ref, n, square, height, delta, 1,
567 567
                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
568 568
                            qpix_op, chroma_op, pixel_shift, chroma444);
569 569
     }
570 570
 }
571 571
 
572
-static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
572
+static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
573 573
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
574 574
                            int x_offset, int y_offset,
575 575
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
... ...
@@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
577 577
                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
578 578
                            int list0, int list1, int pixel_shift, int chroma444){
579 579
     MpegEncContext * const s = &h->s;
580
+    int chroma_height;
580 581
 
581 582
     dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
582 583
     if(chroma444){
584
+        chroma_height = height;
583 585
         chroma_weight_avg = luma_weight_avg;
584 586
         chroma_weight_op = luma_weight_op;
585 587
         dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
586 588
         dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
587 589
     } else if (CHROMA422) {
590
+        chroma_height = height;
588 591
         dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
589 592
         dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
590 593
     }else{
594
+        chroma_height = height >> 1;
591 595
         dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
592 596
         dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
593 597
     }
... ...
@@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
603 603
         int refn0 = h->ref_cache[0][ scan8[n] ];
604 604
         int refn1 = h->ref_cache[1][ scan8[n] ];
605 605
 
606
-        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
606
+        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
607 607
                     dest_y, dest_cb, dest_cr,
608 608
                     x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
609
-        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
609
+        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
610 610
                     tmp_y, tmp_cb, tmp_cr,
611 611
                     x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
612 612
 
613 613
         if(h->use_weight == 2){
614 614
             int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
615 615
             int weight1 = 64 - weight0;
616
-            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
617
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
618
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
619
-            if (CHROMA422) {
620
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
621
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
622
-                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
623
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
624
-                                  tmp_cr + chroma_height * h->mb_uvlinesize,
625
-                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
626
-            }
616
+            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
617
+                              height,        5, weight0, weight1, 0);
618
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
619
+                              chroma_height, 5, weight0, weight1, 0);
620
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
621
+                              chroma_height, 5, weight0, weight1, 0);
627 622
         }else{
628
-            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
623
+            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
629 624
                             h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
630 625
                             h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
631
-            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
626
+            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
632 627
                             h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
633 628
                             h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
634
-            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
629
+            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
635 630
                             h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
636 631
                             h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
637
-            if (CHROMA422) {
638
-                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
639
-                                  tmp_cb + chroma_height * h->mb_uvlinesize,
640
-                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
641
-                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
642
-                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
643
-                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
644
-                                  tmp_cr + chroma_height * h->mb_uvlinesize,
645
-                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
646
-                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
647
-                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
648
-            }
649 632
         }
650 633
     }else{
651 634
         int list = list1 ? 1 : 0;
652 635
         int refn = h->ref_cache[list][ scan8[n] ];
653 636
         Picture *ref= &h->ref_list[list][refn];
654
-        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
637
+        mc_dir_part(h, ref, n, square, height, delta, list,
655 638
                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
656 639
                     qpix_put, chroma_put, pixel_shift, chroma444);
657 640
 
658
-        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
641
+        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
659 642
                        h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
660 643
         if(h->use_weight_chroma){
661
-            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
644
+            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
662 645
                              h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
663
-            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
646
+            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
664 647
                              h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
665
-            if (CHROMA422) {
666
-                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
667
-                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
668
-                                 h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
669
-                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
670
-                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
671
-                                 h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
672
-            }
673 648
         }
674 649
     }
675 650
 }
676 651
 
677
-static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
652
+static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
678 653
                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
679 654
                            int x_offset, int y_offset,
680 655
                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
... ...
@@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
684 684
     if((h->use_weight==2 && list0 && list1
685 685
         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
686 686
        || h->use_weight==1)
687
-        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
687
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
688 688
                          x_offset, y_offset, qpix_put, chroma_put,
689
-                         weight_op[0], weight_op[3], weight_avg[0],
690
-                         weight_avg[3], list0, list1, pixel_shift, chroma444);
689
+                         weight_op[0], weight_op[1], weight_avg[0],
690
+                         weight_avg[1], list0, list1, pixel_shift, chroma444);
691 691
     else
692
-        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
692
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
693 693
                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
694 694
                     chroma_avg, list0, list1, pixel_shift, chroma444);
695 695
 }
... ...
@@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
731 731
     prefetch_motion(h, 0, pixel_shift, chroma444);
732 732
 
733 733
     if(IS_16X16(mb_type)){
734
-        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
734
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
735 735
                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
736 736
                 weight_op, weight_avg,
737 737
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
738 738
                 pixel_shift, chroma444);
739 739
     }else if(IS_16X8(mb_type)){
740
-        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
740
+        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
741 741
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
742
-                &weight_op[1], &weight_avg[1],
742
+                weight_op, weight_avg,
743 743
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
744 744
                 pixel_shift, chroma444);
745
-        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
745
+        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
746 746
                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
747
-                &weight_op[1], &weight_avg[1],
747
+                weight_op, weight_avg,
748 748
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
749 749
                 pixel_shift, chroma444);
750 750
     }else if(IS_8X16(mb_type)){
751
-        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
751
+        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
752 752
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
753
-                &weight_op[2], &weight_avg[2],
753
+                &weight_op[1], &weight_avg[1],
754 754
                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
755 755
                 pixel_shift, chroma444);
756
-        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
756
+        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
757 757
                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
758
-                &weight_op[2], &weight_avg[2],
758
+                &weight_op[1], &weight_avg[1],
759 759
                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
760 760
                 pixel_shift, chroma444);
761 761
     }else{
... ...
@@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
770 770
             int y_offset= (i&2)<<1;
771 771
 
772 772
             if(IS_SUB_8X8(sub_mb_type)){
773
-                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
773
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
774 774
                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
775
-                    &weight_op[3], &weight_avg[3],
775
+                    &weight_op[1], &weight_avg[1],
776 776
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
777 777
                     pixel_shift, chroma444);
778 778
             }else if(IS_SUB_8X4(sub_mb_type)){
779
-                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
779
+                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
780 780
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
781
-                    &weight_op[4], &weight_avg[4],
781
+                    &weight_op[1], &weight_avg[1],
782 782
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
783 783
                     pixel_shift, chroma444);
784
-                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
784
+                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
785 785
                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
786
-                    &weight_op[4], &weight_avg[4],
786
+                    &weight_op[1], &weight_avg[1],
787 787
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
788 788
                     pixel_shift, chroma444);
789 789
             }else if(IS_SUB_4X8(sub_mb_type)){
790
-                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
790
+                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
791 791
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
792
-                    &weight_op[5], &weight_avg[5],
792
+                    &weight_op[2], &weight_avg[2],
793 793
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
794 794
                     pixel_shift, chroma444);
795
-                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
795
+                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
796 796
                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
797
-                    &weight_op[5], &weight_avg[5],
797
+                    &weight_op[2], &weight_avg[2],
798 798
                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
799 799
                     pixel_shift, chroma444);
800 800
             }else{
... ...
@@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
803 803
                 for(j=0; j<4; j++){
804 804
                     int sub_x_offset= x_offset + 2*(j&1);
805 805
                     int sub_y_offset= y_offset +   (j&2);
806
-                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
806
+                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
807 807
                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
808
-                        &weight_op[6], &weight_avg[6],
808
+                        &weight_op[2], &weight_avg[2],
809 809
                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
810 810
                         pixel_shift, chroma444);
811 811
                 }
... ...
@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
64 64
     else\
65 65
         c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
66 66
 \
67
-    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
68
-    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
69
-    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
70
-    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
71
-    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
72
-    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
73
-    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
74
-    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
75
-    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
76
-    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
77
-    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
78
-    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
79
-    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
80
-    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
81
-    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
82
-    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
83
-    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
84
-    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
85
-    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
86
-    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
67
+    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
68
+    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
69
+    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
70
+    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
71
+    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
72
+    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
73
+    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
74
+    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
87 75
 \
88 76
     c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
89 77
     c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
... ...
@@ -31,16 +31,18 @@
31 31
 #include "dsputil.h"
32 32
 
33 33
 //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
34
-typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
35
-typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
34
+typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
35
+                                 int log2_denom, int weight, int offset);
36
+typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
37
+                                   int log2_denom, int weightd, int weights, int offset);
36 38
 
37 39
 /**
38 40
  * Context for storing H.264 DSP functions
39 41
  */
40 42
 typedef struct H264DSPContext{
41 43
     /* weighted MC */
42
-    h264_weight_func weight_h264_pixels_tab[10];
43
-    h264_biweight_func biweight_h264_pixels_tab[10];
44
+    h264_weight_func weight_h264_pixels_tab[4];
45
+    h264_biweight_func biweight_h264_pixels_tab[4];
44 46
 
45 47
     /* loop filter */
46 48
     void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
... ...
@@ -29,14 +29,16 @@
29 29
 
30 30
 #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
31 31
 #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
32
-#define H264_WEIGHT(W,H) \
33
-static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
32
+#define H264_WEIGHT(W) \
33
+static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
34
+                                           int log2_denom, int weight, int offset) \
35
+{ \
34 36
     int y; \
35 37
     pixel *block = (pixel*)_block; \
36 38
     stride /= sizeof(pixel); \
37 39
     offset <<= (log2_denom + (BIT_DEPTH-8)); \
38 40
     if(log2_denom) offset += 1<<(log2_denom-1); \
39
-    for(y=0; y<H; y++, block += stride){ \
41
+    for (y = 0; y < height; y++, block += stride) { \
40 42
         op_scale1(0); \
41 43
         op_scale1(1); \
42 44
         if(W==2) continue; \
... ...
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
58 58
         op_scale1(15); \
59 59
     } \
60 60
 } \
61
-static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
61
+static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
62
+                                             int log2_denom, int weightd, int weights, int offset) \
63
+{ \
62 64
     int y; \
63 65
     pixel *dst = (pixel*)_dst; \
64 66
     pixel *src = (pixel*)_src; \
65 67
     stride /= sizeof(pixel); \
66 68
     offset <<= (BIT_DEPTH-8); \
67 69
     offset = ((offset + 1) | 1) << log2_denom; \
68
-    for(y=0; y<H; y++, dst += stride, src += stride){ \
70
+    for (y = 0; y < height; y++, dst += stride, src += stride) { \
69 71
         op_scale2(0); \
70 72
         op_scale2(1); \
71 73
         if(W==2) continue; \
... ...
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
88 88
     } \
89 89
 }
90 90
 
91
-H264_WEIGHT(16,16)
92
-H264_WEIGHT(16,8)
93
-H264_WEIGHT(8,16)
94
-H264_WEIGHT(8,8)
95
-H264_WEIGHT(8,4)
96
-H264_WEIGHT(4,8)
97
-H264_WEIGHT(4,4)
98
-H264_WEIGHT(4,2)
99
-H264_WEIGHT(2,4)
100
-H264_WEIGHT(2,2)
91
+H264_WEIGHT(16)
92
+H264_WEIGHT(8)
93
+H264_WEIGHT(4)
94
+H264_WEIGHT(2)
101 95
 
102 96
 #undef op_scale1
103 97
 #undef op_scale2
... ...
@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
843 843
 }
844 844
 
845 845
 static av_always_inline
846
-void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
846
+void weight_h264_W_altivec(uint8_t *block, int stride, int height,
847
+                           int log2_denom, int weight, int offset, int w)
847 848
 {
848 849
     int y, aligned;
849 850
     vec_u8 vblock;
... ...
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
864 864
     voffset = vec_splat(vtemp, 5);
865 865
     aligned = !((unsigned long)block & 0xf);
866 866
 
867
-    for (y=0; y<h; y++) {
867
+    for (y = 0; y < height; y++) {
868 868
         vblock = vec_ld(0, block);
869 869
 
870 870
         v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
... ...
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
888 888
 }
889 889
 
890 890
 static av_always_inline
891
-void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
892
-                               int weightd, int weights, int offset, int w, int h)
891
+void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
892
+                             int log2_denom, int weightd, int weights, int offset, int w)
893 893
 {
894 894
     int y, dst_aligned, src_aligned;
895 895
     vec_u8 vsrc, vdst;
... ...
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
912 912
     dst_aligned = !((unsigned long)dst & 0xf);
913 913
     src_aligned = !((unsigned long)src & 0xf);
914 914
 
915
-    for (y=0; y<h; y++) {
915
+    for (y = 0; y < height; y++) {
916 916
         vdst = vec_ld(0, dst);
917 917
         vsrc = vec_ld(0, src);
918 918
 
... ...
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
952 952
     }
953 953
 }
954 954
 
955
-#define H264_WEIGHT(W,H) \
956
-static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
957
-    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
955
+#define H264_WEIGHT(W) \
956
+static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
957
+                                                   int log2_denom, int weight, int offset){ \
958
+    weight_h264_WxH_altivec(block, stride, height, log2_denom, weight, offset, W); \
958 959
 }\
959
-static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
960
-    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
960
+static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
961
+                                                     int log2_denom, int weightd, int weights, int offset){ \
962
+    biweight_h264_WxH_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
961 963
 }
962 964
 
963
-H264_WEIGHT(16,16)
964
-H264_WEIGHT(16, 8)
965
-H264_WEIGHT( 8,16)
966
-H264_WEIGHT( 8, 8)
967
-H264_WEIGHT( 8, 4)
965
+H264_WEIGHT(16)
966
+H264_WEIGHT( 8)
968 967
 
969 968
 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
970 969
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
... ...
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
1015 1015
         c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
1016 1016
         c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
1017 1017
 
1018
-        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
1019
-        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
1020
-        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
1021
-        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
1022
-        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
1023
-        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
1024
-        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
1025
-        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
1026
-        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
1027
-        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
1018
+        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
1019
+        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
1020
+        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
1021
+        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
1028 1022
     }
1029 1023
     }
1030 1024
 }
... ...
@@ -28,21 +28,20 @@ SECTION .text
28 28
 ;-----------------------------------------------------------------------------
29 29
 ; biweight pred:
30 30
 ;
31
-; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
32
-;                               int log2_denom, int weightd, int weights,
33
-;                               int offset);
31
+; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
32
+;                            int height, int log2_denom, int weightd,
33
+;                            int weights, int offset);
34 34
 ; and
35
-; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
36
-;                             int log2_denom, int weight,
37
-;                             int offset);
35
+; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
36
+;                          int log2_denom, int weight, int offset);
38 37
 ;-----------------------------------------------------------------------------
39 38
 
40 39
 %macro WEIGHT_SETUP 0
41
-    add        r4, r4
42
-    inc        r4
43
-    movd       m3, r3d
44
-    movd       m5, r4d
45
-    movd       m6, r2d
40
+    add        r5, r5
41
+    inc        r5
42
+    movd       m3, r4d
43
+    movd       m5, r5d
44
+    movd       m6, r3d
46 45
     pslld      m5, m6
47 46
     psrld      m5, 1
48 47
 %if mmsize == 16
... ...
@@ -71,60 +70,41 @@ SECTION .text
71 71
     packuswb      m0, m1
72 72
 %endmacro
73 73
 
74
-%macro WEIGHT_FUNC_DBL_MM 1
75
-cglobal h264_weight_16x%1_mmx2, 5, 5, 0
74
+INIT_MMX
75
+cglobal h264_weight_16_mmx2, 6, 6, 0
76 76
     WEIGHT_SETUP
77
-    mov        r2, %1
78
-%if %1 == 16
79 77
 .nextrow
80 78
     WEIGHT_OP 0,  4
81 79
     mova     [r0  ], m0
82 80
     WEIGHT_OP 8, 12
83 81
     mova     [r0+8], m0
84 82
     add        r0, r1
85
-    dec        r2
83
+    dec        r2d
86 84
     jnz .nextrow
87 85
     REP_RET
88
-%else
89
-    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
90
-%endif
91
-%endmacro
92 86
 
93
-INIT_MMX
94
-WEIGHT_FUNC_DBL_MM 16
95
-WEIGHT_FUNC_DBL_MM  8
96
-
97
-%macro WEIGHT_FUNC_MM 4
98
-cglobal h264_weight_%1x%2_%4, 7, 7, %3
87
+%macro WEIGHT_FUNC_MM 3
88
+cglobal h264_weight_%1_%3, 6, 6, %2
99 89
     WEIGHT_SETUP
100
-    mov        r2, %2
101
-%if %2 == 16
102 90
 .nextrow
103 91
     WEIGHT_OP 0, mmsize/2
104 92
     mova     [r0], m0
105 93
     add        r0, r1
106
-    dec        r2
94
+    dec        r2d
107 95
     jnz .nextrow
108 96
     REP_RET
109
-%else
110
-    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
111
-%endif
112 97
 %endmacro
113 98
 
114 99
 INIT_MMX
115
-WEIGHT_FUNC_MM  8, 16,  0, mmx2
116
-WEIGHT_FUNC_MM  8,  8,  0, mmx2
117
-WEIGHT_FUNC_MM  8,  4,  0, mmx2
100
+WEIGHT_FUNC_MM  8, 0, mmx2
118 101
 INIT_XMM
119
-WEIGHT_FUNC_MM 16, 16,  8, sse2
120
-WEIGHT_FUNC_MM 16,  8,  8, sse2
102
+WEIGHT_FUNC_MM 16, 8, sse2
121 103
 
122
-%macro WEIGHT_FUNC_HALF_MM 5
123
-cglobal h264_weight_%1x%2_%5, 5, 5, %4
104
+%macro WEIGHT_FUNC_HALF_MM 3
105
+cglobal h264_weight_%1_%3, 6, 6, %2
124 106
     WEIGHT_SETUP
125
-    mov        r2, %2/2
107
+    sar       r2d, 1
126 108
     lea        r3, [r1*2]
127
-%if %2 == mmsize
128 109
 .nextrow
129 110
     WEIGHT_OP 0, r1
130 111
     movh     [r0], m0
... ...
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
135 135
     movh     [r0+r1], m0
136 136
 %endif
137 137
     add        r0, r3
138
-    dec        r2
138
+    dec        r2d
139 139
     jnz .nextrow
140 140
     REP_RET
141
-%else
142
-    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
143
-%endif
144 141
 %endmacro
145 142
 
146 143
 INIT_MMX
147
-WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
148
-WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
149
-WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
144
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
145
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
146
+WEIGHT_FUNC_HALF_MM 4, 0, mmx2
150 147
 INIT_XMM
151
-WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
152
-WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
153
-WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
148
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
149
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
150
+WEIGHT_FUNC_HALF_MM 8, 8, sse2
154 151
 
155 152
 %macro BIWEIGHT_SETUP 0
156
-    add        r6, 1
157
-    or         r6, 1
158
-    add        r3, 1
159
-    movd       m3, r4d
160
-    movd       m4, r5d
161
-    movd       m5, r6d
162
-    movd       m6, r3d
153
+%ifdef ARCH_X86_64
154
+%define off_regd r11d
155
+%else
156
+%define off_regd r3d
157
+%endif
158
+    mov  off_regd, r7m
159
+    add  off_regd, 1
160
+    or   off_regd, 1
161
+    add        r4, 1
162
+    movd       m3, r5d
163
+    movd       m4, r6d
164
+    movd       m5, off_regd
165
+    movd       m6, r4d
163 166
     pslld      m5, m6
164 167
     psrld      m5, 1
165 168
 %if mmsize == 16
... ...
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
195 195
     packuswb   m0, m1
196 196
 %endmacro
197 197
 
198
-%macro BIWEIGHT_FUNC_DBL_MM 1
199
-cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
198
+INIT_MMX
199
+cglobal h264_biweight_16_mmx2, 7, 7, 0
200 200
     BIWEIGHT_SETUP
201
-    mov        r3, %1
202
-%if %1 == 16
201
+    movifnidn r3d, r3m
203 202
 .nextrow
204 203
     BIWEIGHT_STEPA 0, 1, 0
205 204
     BIWEIGHT_STEPA 1, 2, 4
... ...
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
211 211
     mova     [r0+8], m0
212 212
     add        r0, r2
213 213
     add        r1, r2
214
-    dec        r3
214
+    dec        r3d
215 215
     jnz .nextrow
216 216
     REP_RET
217
-%else
218
-    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
219
-%endif
220
-%endmacro
221 217
 
222
-INIT_MMX
223
-BIWEIGHT_FUNC_DBL_MM 16
224
-BIWEIGHT_FUNC_DBL_MM  8
225
-
226
-%macro BIWEIGHT_FUNC_MM 4
227
-cglobal h264_biweight_%1x%2_%4, 7, 7, %3
218
+%macro BIWEIGHT_FUNC_MM 3
219
+cglobal h264_biweight_%1_%3, 7, 7, %2
228 220
     BIWEIGHT_SETUP
229
-    mov        r3, %2
230
-%if %2 == 16
221
+    movifnidn r3d, r3m
231 222
 .nextrow
232 223
     BIWEIGHT_STEPA 0, 1, 0
233 224
     BIWEIGHT_STEPA 1, 2, mmsize/2
... ...
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
235 235
     mova       [r0], m0
236 236
     add        r0, r2
237 237
     add        r1, r2
238
-    dec        r3
238
+    dec        r3d
239 239
     jnz .nextrow
240 240
     REP_RET
241
-%else
242
-    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
243
-%endif
244 241
 %endmacro
245 242
 
246 243
 INIT_MMX
247
-BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
248
-BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
249
-BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
244
+BIWEIGHT_FUNC_MM  8, 0, mmx2
250 245
 INIT_XMM
251
-BIWEIGHT_FUNC_MM 16, 16,  8, sse2
252
-BIWEIGHT_FUNC_MM 16,  8,  8, sse2
246
+BIWEIGHT_FUNC_MM 16, 8, sse2
253 247
 
254
-%macro BIWEIGHT_FUNC_HALF_MM 5
255
-cglobal h264_biweight_%1x%2_%5, 7, 7, %4
248
+%macro BIWEIGHT_FUNC_HALF_MM 3
249
+cglobal h264_biweight_%1_%3, 7, 7, %2
256 250
     BIWEIGHT_SETUP
257
-    mov        r3, %2/2
251
+    movifnidn r3d, r3m
252
+    sar        r3, 1
258 253
     lea        r4, [r2*2]
259
-%if %2 == mmsize
260 254
 .nextrow
261 255
     BIWEIGHT_STEPA 0, 1, 0
262 256
     BIWEIGHT_STEPA 1, 2, r2
... ...
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
270 270
 %endif
271 271
     add        r0, r4
272 272
     add        r1, r4
273
-    dec        r3
273
+    dec        r3d
274 274
     jnz .nextrow
275 275
     REP_RET
276
-%else
277
-    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
278
-%endif
279 276
 %endmacro
280 277
 
281 278
 INIT_MMX
282
-BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
283
-BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
284
-BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
279
+BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
285 280
 INIT_XMM
286
-BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
287
-BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
288
-BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
281
+BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
289 282
 
290 283
 %macro BIWEIGHT_SSSE3_SETUP 0
291
-    add        r6, 1
292
-    or         r6, 1
293
-    add        r3, 1
294
-    movd       m4, r4d
295
-    movd       m0, r5d
296
-    movd       m5, r6d
297
-    movd       m6, r3d
284
+%ifdef ARCH_X86_64
285
+%define off_regd r11d
286
+%else
287
+%define off_regd r3d
288
+%endif
289
+    mov  off_regd, r7m
290
+    add  off_regd, 1
291
+    or   off_regd, 1
292
+    add        r4, 1
293
+    movd       m4, r5d
294
+    movd       m0, r6d
295
+    movd       m5, off_regd
296
+    movd       m6, r4d
298 297
     pslld      m5, m6
299 298
     psrld      m5, 1
300 299
     punpcklbw  m4, m0
... ...
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
314 314
     packuswb   m0, m2
315 315
 %endmacro
316 316
 
317
-%macro BIWEIGHT_SSSE3_16 1
318
-cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
317
+INIT_XMM
318
+cglobal h264_biweight_16_ssse3, 7, 7, 8
319 319
     BIWEIGHT_SSSE3_SETUP
320
-    mov        r3, %1
320
+    movifnidn r3d, r3m
321 321
 
322
-%if %1 == 16
323 322
 .nextrow
324 323
     movh       m0, [r0]
325 324
     movh       m2, [r0+8]
... ...
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
330 330
     mova       [r0], m0
331 331
     add        r0, r2
332 332
     add        r1, r2
333
-    dec        r3
333
+    dec        r3d
334 334
     jnz .nextrow
335 335
     REP_RET
336
-%else
337
-    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
338
-%endif
339
-%endmacro
340 336
 
341 337
 INIT_XMM
342
-BIWEIGHT_SSSE3_16 16
343
-BIWEIGHT_SSSE3_16  8
344
-
345
-%macro BIWEIGHT_SSSE3_8 1
346
-cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
338
+cglobal h264_biweight_8_ssse3, 7, 7, 8
347 339
     BIWEIGHT_SSSE3_SETUP
348
-    mov        r3, %1/2
340
+    movifnidn r3d, r3m
341
+    sar        r3, 1
349 342
     lea        r4, [r2*2]
350 343
 
351
-%if %1 == 16
352 344
 .nextrow
353 345
     movh       m0, [r0]
354 346
     movh       m1, [r1]
... ...
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
361 361
     movhps     [r0+r2], m0
362 362
     add        r0, r4
363 363
     add        r1, r4
364
-    dec        r3
364
+    dec        r3d
365 365
     jnz .nextrow
366 366
     REP_RET
367
-%else
368
-    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
369
-%endif
370
-%endmacro
371
-
372
-INIT_XMM
373
-BIWEIGHT_SSSE3_8 16
374
-BIWEIGHT_SSSE3_8  8
375
-BIWEIGHT_SSSE3_8  4
... ...
@@ -36,33 +36,26 @@ cextern pw_1
36 36
 SECTION .text
37 37
 
38 38
 ;-----------------------------------------------------------------------------
39
-; void h264_weight(uint8_t *dst, int stride, int log2_denom,
39
+; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
40 40
 ;                  int weight, int offset);
41 41
 ;-----------------------------------------------------------------------------
42
-%ifdef ARCH_X86_32
43
-DECLARE_REG_TMP 2
44
-%else
45
-DECLARE_REG_TMP 10
46
-%endif
47
-
48
-%macro WEIGHT_PROLOGUE 1
49
-    mov t0, %1
42
+%macro WEIGHT_PROLOGUE 0
50 43
 .prologue
51
-    PROLOGUE 0,5,8
44
+    PROLOGUE 0,6,8
52 45
     movifnidn  r0, r0mp
53 46
     movifnidn r1d, r1m
54
-    movifnidn r3d, r3m
55 47
     movifnidn r4d, r4m
48
+    movifnidn r5d, r5m
56 49
 %endmacro
57 50
 
58 51
 %macro WEIGHT_SETUP 1
59 52
     mova       m0, [pw_1]
60
-    movd       m2, r2m
53
+    movd       m2, r3m
61 54
     pslld      m0, m2       ; 1<<log2_denom
62 55
     SPLATW     m0, m0
63
-    shl        r4, 19       ; *8, move to upper half of dword
64
-    lea        r4, [r4+r3*2+0x10000]
65
-    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
56
+    shl        r5, 19       ; *8, move to upper half of dword
57
+    lea        r5, [r5+r4*2+0x10000]
58
+    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
66 59
     pshufd     m3, m3, 0
67 60
     mova       m4, [pw_pixel_max]
68 61
     paddw      m2, [sq_1]   ; log2_denom+1
... ...
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
96 96
 %endmacro
97 97
 
98 98
 %macro WEIGHT_FUNC_DBL 1
99
-cglobal h264_weight_16x16_10_%1
100
-    WEIGHT_PROLOGUE 16
99
+cglobal h264_weight_16_10_%1
100
+    WEIGHT_PROLOGUE
101 101
     WEIGHT_SETUP %1
102 102
 .nextrow
103 103
     WEIGHT_OP %1,  0
... ...
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
105 105
     WEIGHT_OP %1, 16
106 106
     mova [r0+16], m5
107 107
     add       r0, r1
108
-    dec       t0
108
+    dec       r2d
109 109
     jnz .nextrow
110 110
     REP_RET
111
-
112
-cglobal h264_weight_16x8_10_%1
113
-    mov t0, 8
114
-    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
115 111
 %endmacro
116 112
 
117 113
 INIT_XMM
... ...
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
120 120
 
121 121
 
122 122
 %macro WEIGHT_FUNC_MM 1
123
-cglobal h264_weight_8x16_10_%1
124
-    WEIGHT_PROLOGUE 16
123
+cglobal h264_weight_8_10_%1
124
+    WEIGHT_PROLOGUE
125 125
     WEIGHT_SETUP %1
126 126
 .nextrow
127 127
     WEIGHT_OP  %1, 0
128 128
     mova     [r0], m5
129 129
     add        r0, r1
130
-    dec        t0
130
+    dec        r2d
131 131
     jnz .nextrow
132 132
     REP_RET
133
-
134
-cglobal h264_weight_8x8_10_%1
135
-    mov t0, 8
136
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
137
-
138
-cglobal h264_weight_8x4_10_%1
139
-    mov t0, 4
140
-    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
141 133
 %endmacro
142 134
 
143 135
 INIT_XMM
... ...
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
146 146
 
147 147
 
148 148
 %macro WEIGHT_FUNC_HALF_MM 1
149
-cglobal h264_weight_4x8_10_%1
150
-    WEIGHT_PROLOGUE 4
149
+cglobal h264_weight_4_10_%1
150
+    WEIGHT_PROLOGUE
151
+    sar         r2d, 1
151 152
     WEIGHT_SETUP %1
152 153
     lea         r3, [r1*2]
153 154
 .nextrow
... ...
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
155 155
     movh      [r0], m5
156 156
     movhps [r0+r1], m5
157 157
     add         r0, r3
158
-    dec         t0
158
+    dec         r2d
159 159
     jnz .nextrow
160 160
     REP_RET
161
-
162
-cglobal h264_weight_4x4_10_%1
163
-    mov t0, 2
164
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
165
-
166
-cglobal h264_weight_4x2_10_%1
167
-    mov t0, 1
168
-    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
169 161
 %endmacro
170 162
 
171 163
 INIT_XMM
... ...
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
174 174
 
175 175
 
176 176
 ;-----------------------------------------------------------------------------
177
-; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
178
-;                    int weightd, int weights, int offset);
177
+; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
178
+;                    int log2_denom, int weightd, int weights, int offset);
179 179
 ;-----------------------------------------------------------------------------
180 180
 %ifdef ARCH_X86_32
181
-DECLARE_REG_TMP 2,3
181
+DECLARE_REG_TMP 3
182 182
 %else
183
-DECLARE_REG_TMP 10,2
183
+DECLARE_REG_TMP 10
184 184
 %endif
185 185
 
186
-%macro BIWEIGHT_PROLOGUE 1
187
-    mov t0, %1
186
+%macro BIWEIGHT_PROLOGUE 0
188 187
 .prologue
189 188
     PROLOGUE 0,7,8
190 189
     movifnidn  r0, r0mp
191 190
     movifnidn  r1, r1mp
192
-    movifnidn t1d, r2m
193
-    movifnidn r4d, r4m
191
+    movifnidn r2d, r2m
194 192
     movifnidn r5d, r5m
195 193
     movifnidn r6d, r6m
194
+    movifnidn t0d, r7m
196 195
 %endmacro
197 196
 
198 197
 %macro BIWEIGHT_SETUP 1
199
-    lea        r6, [r6*4+1] ; (offset<<2)+1
200
-    or         r6, 1
201
-    shl        r5, 16
202
-    or         r4, r5
203
-    movd       m4, r4d      ; weightd | weights
204
-    movd       m5, r6d      ; (offset+1)|1
205
-    movd       m6, r3m      ; log2_denom
198
+    lea        t0, [t0*4+1] ; (offset<<2)+1
199
+    or         t0, 1
200
+    shl        r6, 16
201
+    or         r5, r6
202
+    movd       m4, r5d      ; weightd | weights
203
+    movd       m5, t0d      ; (offset+1)|1
204
+    movd       m6, r4m      ; log2_denom
206 205
     pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
207 206
     paddd      m6, [sq_1]
208 207
     pshufd     m4, m4, 0
209 208
     pshufd     m5, m5, 0
210 209
     mova       m3, [pw_pixel_max]
210
+    movifnidn r3d, r3m
211 211
 %ifnidn %1, sse4
212 212
     pxor       m7, m7
213 213
 %endif
... ...
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
243 243
 %endmacro
244 244
 
245 245
 %macro BIWEIGHT_FUNC_DBL 1
246
-cglobal h264_biweight_16x16_10_%1
247
-    BIWEIGHT_PROLOGUE 16
246
+cglobal h264_biweight_16_10_%1
247
+    BIWEIGHT_PROLOGUE
248 248
     BIWEIGHT_SETUP %1
249 249
 .nextrow
250 250
     BIWEIGHT  %1,  0
251 251
     mova [r0   ], m0
252 252
     BIWEIGHT  %1, 16
253 253
     mova [r0+16], m0
254
-    add       r0, t1
255
-    add       r1, t1
256
-    dec       t0
254
+    add       r0, r2
255
+    add       r1, r2
256
+    dec       r3d
257 257
     jnz .nextrow
258 258
     REP_RET
259
-
260
-cglobal h264_biweight_16x8_10_%1
261
-    mov t0, 8
262
-    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
263 259
 %endmacro
264 260
 
265 261
 INIT_XMM
... ...
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
267 267
 BIWEIGHT_FUNC_DBL sse4
268 268
 
269 269
 %macro BIWEIGHT_FUNC 1
270
-cglobal h264_biweight_8x16_10_%1
271
-    BIWEIGHT_PROLOGUE 16
270
+cglobal h264_biweight_8_10_%1
271
+    BIWEIGHT_PROLOGUE
272 272
     BIWEIGHT_SETUP %1
273 273
 .nextrow
274 274
     BIWEIGHT %1, 0
275 275
     mova   [r0], m0
276
-    add      r0, t1
277
-    add      r1, t1
278
-    dec      t0
276
+    add      r0, r2
277
+    add      r1, r2
278
+    dec      r3d
279 279
     jnz .nextrow
280 280
     REP_RET
281
-
282
-cglobal h264_biweight_8x8_10_%1
283
-    mov t0, 8
284
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
285
-
286
-cglobal h264_biweight_8x4_10_%1
287
-    mov t0, 4
288
-    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
289 281
 %endmacro
290 282
 
291 283
 INIT_XMM
... ...
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
293 293
 BIWEIGHT_FUNC sse4
294 294
 
295 295
 %macro BIWEIGHT_FUNC_HALF 1
296
-cglobal h264_biweight_4x8_10_%1
297
-    BIWEIGHT_PROLOGUE 4
296
+cglobal h264_biweight_4_10_%1
297
+    BIWEIGHT_PROLOGUE
298 298
     BIWEIGHT_SETUP %1
299
-    lea        r4, [t1*2]
299
+    sar        r3d, 1
300
+    lea        r4, [r2*2]
300 301
 .nextrow
301
-    BIWEIGHT    %1, 0, t1
302
+    BIWEIGHT    %1, 0, r2
302 303
     movh   [r0   ], m0
303
-    movhps [r0+t1], m0
304
+    movhps [r0+r2], m0
304 305
     add         r0, r4
305 306
     add         r1, r4
306
-    dec         t0
307
+    dec         r3d
307 308
     jnz .nextrow
308 309
     REP_RET
309
-
310
-cglobal h264_biweight_4x4_10_%1
311
-    mov t0, 2
312
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
313
-
314
-cglobal h264_biweight_4x2_10_%1
315
-    mov t0, 1
316
-    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
317 310
 %endmacro
318 311
 
319 312
 INIT_XMM
... ...
@@ -298,57 +298,47 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
298 298
 /***********************************/
299 299
 /* weighted prediction */
300 300
 
301
-#define H264_WEIGHT(W, H, OPT) \
302
-void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
303
-    int stride, int log2_denom, int weight, int offset);
301
+#define H264_WEIGHT(W, OPT) \
302
+void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
303
+    int stride, int height, int log2_denom, int weight, int offset);
304 304
 
305
-#define H264_BIWEIGHT(W, H, OPT) \
306
-void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
307
-    uint8_t *src, int stride, int log2_denom, int weightd, \
305
+#define H264_BIWEIGHT(W, OPT) \
306
+void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
307
+    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
308 308
     int weights, int offset);
309 309
 
310
-#define H264_BIWEIGHT_MMX(W,H) \
311
-H264_WEIGHT  (W, H, mmx2) \
312
-H264_BIWEIGHT(W, H, mmx2)
313
-
314
-#define H264_BIWEIGHT_MMX_SSE(W,H) \
315
-H264_BIWEIGHT_MMX(W, H) \
316
-H264_WEIGHT      (W, H, sse2) \
317
-H264_BIWEIGHT    (W, H, sse2) \
318
-H264_BIWEIGHT    (W, H, ssse3)
319
-
320
-H264_BIWEIGHT_MMX_SSE(16, 16)
321
-H264_BIWEIGHT_MMX_SSE(16,  8)
322
-H264_BIWEIGHT_MMX_SSE( 8, 16)
323
-H264_BIWEIGHT_MMX_SSE( 8,  8)
324
-H264_BIWEIGHT_MMX_SSE( 8,  4)
325
-H264_BIWEIGHT_MMX    ( 4,  8)
326
-H264_BIWEIGHT_MMX    ( 4,  4)
327
-H264_BIWEIGHT_MMX    ( 4,  2)
328
-
329
-#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
330
-void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
331
-    int stride, int log2_denom, int weight, int offset);
332
-
333
-#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
334
-void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
335
-    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
310
+#define H264_BIWEIGHT_MMX(W) \
311
+H264_WEIGHT  (W, mmx2) \
312
+H264_BIWEIGHT(W, mmx2)
313
+
314
+#define H264_BIWEIGHT_MMX_SSE(W) \
315
+H264_BIWEIGHT_MMX(W) \
316
+H264_WEIGHT      (W, sse2) \
317
+H264_BIWEIGHT    (W, sse2) \
318
+H264_BIWEIGHT    (W, ssse3)
319
+
320
+H264_BIWEIGHT_MMX_SSE(16)
321
+H264_BIWEIGHT_MMX_SSE( 8)
322
+H264_BIWEIGHT_MMX    ( 4)
323
+
324
+#define H264_WEIGHT_10(W, DEPTH, OPT) \
325
+void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
326
+    int stride, int height, int log2_denom, int weight, int offset);
327
+
328
+#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
329
+void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
330
+    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
336 331
      int weightd, int weights, int offset);
337 332
 
338
-#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
339
-H264_WEIGHT_10  (W, H, DEPTH, sse2) \
340
-H264_WEIGHT_10  (W, H, DEPTH, sse4) \
341
-H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
342
-H264_BIWEIGHT_10(W, H, DEPTH, sse4)
343
-
344
-H264_BIWEIGHT_10_SSE(16, 16, 10)
345
-H264_BIWEIGHT_10_SSE(16,  8, 10)
346
-H264_BIWEIGHT_10_SSE( 8, 16, 10)
347
-H264_BIWEIGHT_10_SSE( 8,  8, 10)
348
-H264_BIWEIGHT_10_SSE( 8,  4, 10)
349
-H264_BIWEIGHT_10_SSE( 4,  8, 10)
350
-H264_BIWEIGHT_10_SSE( 4,  4, 10)
351
-H264_BIWEIGHT_10_SSE( 4,  2, 10)
333
+#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
334
+H264_WEIGHT_10  (W, DEPTH, sse2) \
335
+H264_WEIGHT_10  (W, DEPTH, sse4) \
336
+H264_BIWEIGHT_10(W, DEPTH, sse2) \
337
+H264_BIWEIGHT_10(W, DEPTH, sse4)
338
+
339
+H264_BIWEIGHT_10_SSE(16, 10)
340
+H264_BIWEIGHT_10_SSE( 8, 10)
341
+H264_BIWEIGHT_10_SSE( 4, 10)
352 342
 
353 343
 void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
354 344
 {
... ...
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
394 394
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
395 395
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
396 396
 #endif
397
-            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
398
-            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
399
-            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
400
-            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
401
-            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
402
-            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
403
-            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
404
-            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
405
-
406
-            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
407
-            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
408
-            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
409
-            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
410
-            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
411
-            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
412
-            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
413
-            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
397
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
398
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
399
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
400
+
401
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
402
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
403
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
414 404
 
415 405
             if (mm_flags&AV_CPU_FLAG_SSE2) {
416 406
                 c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
... ...
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
422 422
                 c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
423 423
                 c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
424 424
 
425
-                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
426
-                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
427
-                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
428
-                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
429
-                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
425
+                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
426
+                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
430 427
 
431
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
432
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
433
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
434
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
435
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
428
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
429
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
436 430
 
437 431
 #if HAVE_ALIGNED_STACK
438 432
                 c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
... ...
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
442 442
 #endif
443 443
             }
444 444
             if (mm_flags&AV_CPU_FLAG_SSSE3) {
445
-                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
446
-                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
447
-                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
448
-                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
449
-                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
445
+                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
446
+                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
450 447
             }
451 448
             if (mm_flags&AV_CPU_FLAG_AVX) {
452 449
 #if HAVE_ALIGNED_STACK
... ...
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
485 485
                 c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
486 486
 #endif
487 487
 
488
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
489
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
490
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
491
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
492
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
493
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
494
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
495
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
496
-
497
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
498
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
499
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
500
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
501
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
502
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
503
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
504
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
488
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
489
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
490
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
491
+
492
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
493
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
494
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
505 495
 
506 496
                 c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
507 497
                 c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
... ...
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
513 513
 #endif
514 514
             }
515 515
             if (mm_flags&AV_CPU_FLAG_SSE4) {
516
-                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
517
-                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
518
-                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
519
-                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
520
-                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
521
-                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
522
-                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
523
-                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
524
-
525
-                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
526
-                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
527
-                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
528
-                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
529
-                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
530
-                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
531
-                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
532
-                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
516
+                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
517
+                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
518
+                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
519
+
520
+                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
521
+                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
522
+                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
533 523
             }
534 524
 #if HAVE_AVX
535 525
             if (mm_flags&AV_CPU_FLAG_AVX) {