GitList

Browse code

H264: change weight/biweight functions to take a height argument.

Neon parts by Mans Rullgard <mans@mansr.com>.

Ronald S. Bultje authored on 2011/10/21 16:00:39
Showing 10 changed files

libavcodec/arm/h264dsp_init_arm.c index c1ca217..1c331a4 100644
libavcodec/arm/h264dsp_neon.S index 0fa4a6b..3d2c674 100644
libavcodec/h264.c index 8d652f1..7306828 100644
libavcodec/h264dsp.c index 19ad2db..ba96707 100644
libavcodec/h264dsp.h index 7337f17..7cae215 100644
libavcodec/h264dsp_template.c index ee4bbe5..3d99cfc 100644
libavcodec/ppc/h264_altivec.c index a915378..edc043c 100644
libavcodec/x86/h264_weight.asm index d80ca32..bc8bfd6 100644
libavcodec/x86/h264_weight_10bit.asm index 1c58d72..20df6fb 100644
libavcodec/x86/h264dsp_mmx.c index 06ee7ca..dcd9180 100644

@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                      void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                                             int beta, int8_t *tc0);
                     -void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
                     -                                      int weight, int offset);
                     -void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
                     -                                     int weight, int offset);
                     -void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
                     -                                     int weight, int offset);
                     -void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
                     -                                    int weight, int offset);
                     -void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
                     -                                    int weight, int offset);
                     -void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
                     -                                    int weight, int offset);
                     -void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
                     -                                    int weight, int offset);
                     -void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
                     -                                    int weight, int offset);
                     +void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
                     +                                   int log2_den, int weight, int offset);
                     +void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
                     +                                  int log2_den, int weight, int offset);
                     +void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
                     +                                  int log2_den, int weight, int offset);
                     -void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                        int log2_den, int weightd, int weights,
                     -                                        int offset);
                     -void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                       int log2_den, int weightd, int weights,
                     -                                       int offset);
                     -void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                       int log2_den, int weightd, int weights,
                     -                                       int offset);
                     -void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                      int log2_den, int weightd, int weights,
                     -                                      int offset);
                     -void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                      int log2_den, int weightd, int weights,
                     -                                      int offset);
                     -void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                      int log2_den, int weightd, int weights,
                     -                                      int offset);
                     -void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                      int log2_den, int weightd, int weights,
                     -                                      int offset);
                     -void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
                     -                                      int log2_den, int weightd, int weights,
                     -                                      int offset);
                     +void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
                     +                                     int height, int log2_den, int weightd,
                     +                                     int weights, int offset);
                     +void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
                     +                                    int height, int log2_den, int weightd,
                     +                                    int weights, int offset);
                     +void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
                     +                                    int height, int log2_den, int weightd,
                     +                                    int weights, int offset);
                      void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
                      void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
                          c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
                          c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
                     -    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
                     -    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
                     -    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
                     -    c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
                     -    c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
                     -    c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
                     -    c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
                     -    c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
                     +    c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
                     +    c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
                     +    c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
                     -    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
                     -    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
                     -    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
                     -    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
                     -    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
                     -    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
                     -    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
                     -    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
                     +    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
                     +    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
                     +    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
                          c->h264_idct_add        = ff_h264_idct_add_neon;
                          c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;

libavcodec/arm/h264dsp_neon.S

History View file @ c2d3374

@@ -1592,7 +1592,7 @@ endfunc
                              vdup.8          d1,  r5
                              vmov            q2,  q8
                              vmov            q3,  q8
                     -1:      subs            ip,  ip,  #2
                     +1:      subs            r3,  r3,  #2
                              vld1.8          {d20-d21},[r0,:128], r2
                              \macd           q2,  d0,  d20
                              pld             [r0]
@@ -1632,7 +1632,7 @@ endfunc
                              vdup.8          d1,  r5
                              vmov            q1,  q8
                              vmov            q10, q8
                     -1:      subs            ip,  ip,  #2
                     +1:      subs            r3,  r3,  #2
                              vld1.8          {d4},[r0,:64], r2
                              \macd           q1,  d0,  d4
                              pld             [r0]
@@ -1662,7 +1662,7 @@ endfunc
                              vdup.8          d1,  r5
                              vmov            q1,  q8
                              vmov            q10, q8
                     -1:      subs            ip,  ip,  #4
                     +1:      subs            r3,  r3,  #4
                              vld1.32         {d4[0]},[r0,:32], r2
                              vld1.32         {d4[1]},[r0,:32], r2
                              \macd           q1,  d0,  d4
@@ -1700,16 +1700,17 @@ endfunc
                              .endm
                              .macro  biweight_func w
                     -function biweight_h264_pixels_\w\()_neon
                     +function ff_biweight_h264_pixels_\w\()_neon, export=1
                              push            {r4-r6, lr}
                     -        add             r4,  sp,  #16
                     +        ldr             r12, [sp, #16]
                     +        add             r4,  sp,  #20
                              ldm             r4,  {r4-r6}
                              lsr             lr,  r4,  #31
                              add             r6,  r6,  #1
                              eors            lr,  lr,  r5,  lsr #30
                              orr             r6,  r6,  #1
                     -        vdup.16         q9,  r3
                     -        lsl             r6,  r6,  r3
                     +        vdup.16         q9,  r12
                     +        lsl             r6,  r6,  r12
                              vmvn            q9,  q9
                              vdup.16         q8,  r6
                              mov             r6,  r0
@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
                      endfunc
                              .endm
                     -        .macro  biweight_entry w, h, b=1
                     -function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
                     -        mov             ip,  #\h
                     -.if \b
                     -        b               biweight_h264_pixels_\w\()_neon
                     -.endif
                     -endfunc
                     -        .endm
                     -
                     -        biweight_entry  16, 8
                     -        biweight_entry  16, 16, b=0
                              biweight_func   16
                     -
                     -        biweight_entry  8,  16
                     -        biweight_entry  8,  4
                     -        biweight_entry  8,  8,  b=0
                              biweight_func   8
                     -
                     -        biweight_entry  4,  8
                     -        biweight_entry  4,  2
                     -        biweight_entry  4,  4,  b=0
                              biweight_func   4
                      @ Weighted prediction
                              .macro  weight_16 add
                     -        vdup.8          d0,  r3
                     -1:      subs            ip,  ip,  #2
                     +        vdup.8          d0,  r12
                     +1:      subs            r2,  r2,  #2
                              vld1.8          {d20-d21},[r0,:128], r1
                              vmull.u8        q2,  d0,  d20
                              pld             [r0]
@@ -1785,8 +1767,8 @@ endfunc
                              .endm
                              .macro  weight_8 add
                     -        vdup.8          d0,  r3
                     -1:      subs            ip,  ip,  #2
                     +        vdup.8          d0,  r12
                     +1:      subs            r2,  r2,  #2
                              vld1.8          {d4},[r0,:64], r1
                              vmull.u8        q1,  d0,  d4
                              pld             [r0]
@@ -1806,10 +1788,10 @@ endfunc
                              .endm
                              .macro  weight_4 add
                     -        vdup.8          d0,  r3
                     +        vdup.8          d0,  r12
                              vmov            q1,  q8
                              vmov            q10, q8
                     -1:      subs            ip,  ip,  #4
                     +1:      subs            r2,  r2,  #4
                              vld1.32         {d4[0]},[r0,:32], r1
                              vld1.32         {d4[1]},[r0,:32], r1
                              vmull.u8        q1,  d0,  d4
@@ -1842,50 +1824,32 @@ endfunc
                              .endm
                              .macro  weight_func w
                     -function weight_h264_pixels_\w\()_neon
                     +function ff_weight_h264_pixels_\w\()_neon, export=1
                              push            {r4, lr}
                     -        ldr             r4,  [sp, #8]
                     -        cmp             r2,  #1
                     -        lsl             r4,  r4,  r2
                     +        ldr             r12, [sp, #8]
                     +        ldr             r4,  [sp, #12]
                     +        cmp             r3,  #1
                     +        lsl             r4,  r4,  r3
                              vdup.16         q8,  r4
                              mov             r4,  r0
                              ble             20f
                     -        rsb             lr,  r2,  #1
                     +        rsb             lr,  r3,  #1
                              vdup.16         q9,  lr
                     -        cmp             r3,  #0
                     +        cmp             r12, #0
                              blt             10f
                              weight_\w       vhadd.s16
                     -10:     rsb             r3,  r3,  #0
                     +10:     rsb             r12, r12, #0
                              weight_\w       vhsub.s16
                     -20:     rsb             lr,  r2,  #0
                     +20:     rsb             lr,  r3,  #0
                              vdup.16         q9,  lr
                     -        cmp             r3,  #0
                     +        cmp             r12, #0
                              blt             10f
                              weight_\w       vadd.s16
                     -10:     rsb             r3,  r3,  #0
                     +10:     rsb             r12, r12, #0
                              weight_\w       vsub.s16
                      endfunc
                              .endm
                     -        .macro  weight_entry w, h, b=1
                     -function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
                     -        mov             ip,  #\h
                     -.if \b
                     -        b               weight_h264_pixels_\w\()_neon
                     -.endif
                     -endfunc
                     -        .endm
                     -
                     -        weight_entry    16, 8
                     -        weight_entry    16, 16, b=0
                              weight_func     16
                     -
                     -        weight_entry    8,  16
                     -        weight_entry    8,  4
                     -        weight_entry    8,  8,  b=0
                              weight_func     8
                     -
                     -        weight_entry    4,  8
                     -        weight_entry    4,  2
                     -        weight_entry    4,  4,  b=0
                              weight_func     4

libavcodec/h264.c

History View file @ c2d3374

@@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
                      }
                      #endif
                     -static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                     +static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                     +                               int height, int delta, int list,
                                                 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                                 int src_x_offset, int src_y_offset,
                                                 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
@@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
                              s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
                                  src_cb= s->edge_emu_buffer;
                          }
                     -    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
                     +    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
                          if(emu){
                              s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
                                  src_cr= s->edge_emu_buffer;
                          }
                     -    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
                     +    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
                      }
                     -static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
                     +static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
                                                 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                                 int x_offset, int y_offset,
                                                 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
                          if(list0){
                              Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
                     -        mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
                     +        mc_dir_part(h, ref, n, square, height, delta, 0,
                                                 dest_y, dest_cb, dest_cr, x_offset, y_offset,
                                                 qpix_op, chroma_op, pixel_shift, chroma444);
@@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
                          if(list1){
                              Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
                     -        mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
                     +        mc_dir_part(h, ref, n, square, height, delta, 1,
                                                 dest_y, dest_cb, dest_cr, x_offset, y_offset,
                                                 qpix_op, chroma_op, pixel_shift, chroma444);
                          }
                      }
                     -static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
                     +static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
                                                 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                                 int x_offset, int y_offset,
                                                 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                                                 h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
                                                 int list0, int list1, int pixel_shift, int chroma444){
                          MpegEncContext * const s = &h->s;
                     +    int chroma_height;
                          dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
                          if(chroma444){
                     +        chroma_height = height;
                              chroma_weight_avg = luma_weight_avg;
                              chroma_weight_op = luma_weight_op;
                              dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
                              dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
                          } else if (CHROMA422) {
                     +        chroma_height = height;
                              dest_cb += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
                              dest_cr += (  x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
                          }else{
                     +        chroma_height = height >> 1;
                              dest_cb += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
                              dest_cr += (  x_offset << pixel_shift) +   y_offset*h->mb_uvlinesize;
                          }
@@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
                              int refn0 = h->ref_cache[0][ scan8[n] ];
                              int refn1 = h->ref_cache[1][ scan8[n] ];
                     -        mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
                     +        mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
                                          dest_y, dest_cb, dest_cr,
                                          x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
                     -        mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
                     +        mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
                                          tmp_y, tmp_cb, tmp_cr,
                                          x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
                              if(h->use_weight == 2){
                                  int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
                                  int weight1 = 64 - weight0;
                     -            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
                     -            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
                     -            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
                     -            if (CHROMA422) {
                     -                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
                     -                                  tmp_cb + chroma_height * h->mb_uvlinesize,
                     -                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
                     -                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
                     -                                  tmp_cr + chroma_height * h->mb_uvlinesize,
                     -                                  h->mb_uvlinesize, 5, weight0, weight1, 0);
                     -            }
                     +            luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize,
                     +                              height,        5, weight0, weight1, 0);
                     +            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
                     +                              chroma_height, 5, weight0, weight1, 0);
                     +            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
                     +                              chroma_height, 5, weight0, weight1, 0);
                              }else{
                     -            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
                     +            luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                                                  h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
                                                  h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
                     -            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     +            chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
                     -            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     +            chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
                     -            if (CHROMA422) {
                     -                chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
                     -                                  tmp_cb + chroma_height * h->mb_uvlinesize,
                     -                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     -                                  h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
                     -                                  h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
                     -                chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
                     -                                  tmp_cr + chroma_height * h->mb_uvlinesize,
                     -                                  h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     -                                  h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
                     -                                  h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
                     -            }
                              }
                          }else{
                              int list = list1 ? 1 : 0;
                              int refn = h->ref_cache[list][ scan8[n] ];
                              Picture *ref= &h->ref_list[list][refn];
                     -        mc_dir_part(h, ref, n, square, chroma_height, delta, list,
                     +        mc_dir_part(h, ref, n, square, height, delta, list,
                                          dest_y, dest_cb, dest_cr, x_offset, y_offset,
                                          qpix_put, chroma_put, pixel_shift, chroma444);
                     -        luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
                     +        luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
                                             h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
                              if(h->use_weight_chroma){
                     -            chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     +            chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                                                   h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
                     -            chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     +            chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
                                                   h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
                     -            if (CHROMA422) {
                     -                chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
                     -                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     -                                 h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
                     -                chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
                     -                                 h->mb_uvlinesize, h->chroma_log2_weight_denom,
                     -                                 h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
                     -            }
                              }
                          }
                      }
                     -static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
                     +static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
                                                 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                                                 int x_offset, int y_offset,
                                                 qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
@@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
                          if((h->use_weight==2 && list0 && list1
                              && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
                             || h->use_weight==1)
                     -        mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                     +        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                                               x_offset, y_offset, qpix_put, chroma_put,
                     -                         weight_op[0], weight_op[3], weight_avg[0],
                     -                         weight_avg[3], list0, list1, pixel_shift, chroma444);
                     +                         weight_op[0], weight_op[1], weight_avg[0],
                     +                         weight_avg[1], list0, list1, pixel_shift, chroma444);
                          else
                     -        mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
                     +        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
                                          x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
                                          chroma_avg, list0, list1, pixel_shift, chroma444);
+                     }
@@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                          prefetch_motion(h, 0, pixel_shift, chroma444);
                          if(IS_16X16(mb_type)){
                     -        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                     +        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
                                      qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                                      weight_op, weight_avg,
                                      IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                                      pixel_shift, chroma444);
                          }else if(IS_16X8(mb_type)){
                     -        mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                     +        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
                                      qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                     -                &weight_op[1], &weight_avg[1],
                     +                weight_op, weight_avg,
                                      IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                                      pixel_shift, chroma444);
                     -        mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                     +        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
                                      qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                     -                &weight_op[1], &weight_avg[1],
                     +                weight_op, weight_avg,
                                      IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                                      pixel_shift, chroma444);
                          }else if(IS_8X16(mb_type)){
                     -        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                     +        mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                                      qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                     -                &weight_op[2], &weight_avg[2],
                     +                &weight_op[1], &weight_avg[1],
                                      IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
                                      pixel_shift, chroma444);
                     -        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                     +        mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                                      qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                     -                &weight_op[2], &weight_avg[2],
                     +                &weight_op[1], &weight_avg[1],
                                      IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
                                      pixel_shift, chroma444);
                          }else{
@@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                                  int y_offset= (i&2)<<1;
                                  if(IS_SUB_8X8(sub_mb_type)){
                     -                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     +                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                                          qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                     -                    &weight_op[3], &weight_avg[3],
                     +                    &weight_op[1], &weight_avg[1],
                                          IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                                          pixel_shift, chroma444);
                                  }else if(IS_SUB_8X4(sub_mb_type)){
                     -                mc_part(h, n  , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     +                mc_part(h, n  , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                                          qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     -                    &weight_op[4], &weight_avg[4],
                     +                    &weight_op[1], &weight_avg[1],
                                          IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                                          pixel_shift, chroma444);
                     -                mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                     +                mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                                          qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                     -                    &weight_op[4], &weight_avg[4],
                     +                    &weight_op[1], &weight_avg[1],
                                          IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                                          pixel_shift, chroma444);
                                  }else if(IS_SUB_4X8(sub_mb_type)){
                     -                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                     +                mc_part(h, n  , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                                          qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     -                    &weight_op[5], &weight_avg[5],
                     +                    &weight_op[2], &weight_avg[2],
                                          IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                                          pixel_shift, chroma444);
                     -                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                     +                mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                                          qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     -                    &weight_op[5], &weight_avg[5],
                     +                    &weight_op[2], &weight_avg[2],
                                          IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                                          pixel_shift, chroma444);
                                  }else{
@@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
                                      for(j=0; j<4; j++){
                                          int sub_x_offset= x_offset + 2*(j&1);
                                          int sub_y_offset= y_offset +   (j&2);
                     -                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                     +                    mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                                              qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                     -                        &weight_op[6], &weight_avg[6],
                     +                        &weight_op[2], &weight_avg[2],
                                              IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
                                              pixel_shift, chroma444);
+                                     }

libavcodec/h264dsp.c

History View file @ c2d3374

@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
                          else\
                              c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
+                     \
                     -    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
                     -    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
                     -    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
                     -    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
                     -    c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
                     -    c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
                     -    c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
                     -    c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
                     -    c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
                     -    c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
                     -    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
                     -    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
                     -    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
                     -    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
                     -    c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
                     -    c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
                     -    c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
                     -    c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
                     -    c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
                     -    c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
                     +    c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
                     +    c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
                     +    c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
                     +    c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
                     +    c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
                     +    c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
                     +    c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
                     +    c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
+                     \
                          c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
                          c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\

libavcodec/h264dsp.h

History View file @ c2d3374

@@ -31,16 +31,18 @@
                      #include "dsputil.h"
                      //typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
                     -typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
                     -typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
                     +typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
                     +                                 int log2_denom, int weight, int offset);
                     +typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
                     +                                   int log2_denom, int weightd, int weights, int offset);
                      /**
                       * Context for storing H.264 DSP functions
                       */
                      typedef struct H264DSPContext{
                          /* weighted MC */
                     -    h264_weight_func weight_h264_pixels_tab[10];
                     -    h264_biweight_func biweight_h264_pixels_tab[10];
                     +    h264_weight_func weight_h264_pixels_tab[4];
                     +    h264_biweight_func biweight_h264_pixels_tab[4];
                          /* loop filter */
                          void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);

libavcodec/h264dsp_template.c

History View file @ c2d3374

@@ -29,14 +29,16 @@
                      #define op_scale1(x)  block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
                      #define op_scale2(x)  dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
                     -#define H264_WEIGHT(W,H) \
                     -static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
                     +#define H264_WEIGHT(W) \
                     +static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
                     +                                           int log2_denom, int weight, int offset) \
                     +{ \
                          int y; \
                          pixel *block = (pixel*)_block; \
                          stride /= sizeof(pixel); \
                          offset <<= (log2_denom + (BIT_DEPTH-8)); \
                          if(log2_denom) offset += 1<<(log2_denom-1); \
                     -    for(y=0; y<H; y++, block += stride){ \
                     +    for (y = 0; y < height; y++, block += stride) { \
                              op_scale1(0); \
                              op_scale1(1); \
                              if(W==2) continue; \
@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
                              op_scale1(15); \
                          } \
                      } \
                     -static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
                     +static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
                     +                                             int log2_denom, int weightd, int weights, int offset) \
                     +{ \
                          int y; \
                          pixel *dst = (pixel*)_dst; \
                          pixel *src = (pixel*)_src; \
                          stride /= sizeof(pixel); \
                          offset <<= (BIT_DEPTH-8); \
                          offset = ((offset + 1) | 1) << log2_denom; \
                     -    for(y=0; y<H; y++, dst += stride, src += stride){ \
                     +    for (y = 0; y < height; y++, dst += stride, src += stride) { \
                              op_scale2(0); \
                              op_scale2(1); \
                              if(W==2) continue; \
@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
                          } \
+                     }
                     -H264_WEIGHT(16,16)
                     -H264_WEIGHT(16,8)
                     -H264_WEIGHT(8,16)
                     -H264_WEIGHT(8,8)
                     -H264_WEIGHT(8,4)
                     -H264_WEIGHT(4,8)
                     -H264_WEIGHT(4,4)
                     -H264_WEIGHT(4,2)
                     -H264_WEIGHT(2,4)
                     -H264_WEIGHT(2,2)
                     +H264_WEIGHT(16)
                     +H264_WEIGHT(8)
                     +H264_WEIGHT(4)
                     +H264_WEIGHT(2)
                      #undef op_scale1
                      #undef op_scale2

libavcodec/ppc/h264_altivec.c

History View file @ c2d3374

@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
+                     }
                      static av_always_inline
                     -void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
                     +void weight_h264_W_altivec(uint8_t *block, int stride, int height,
                     +                           int log2_denom, int weight, int offset, int w)
+                     {
                          int y, aligned;
                          vec_u8 vblock;
@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
                          voffset = vec_splat(vtemp, 5);
                          aligned = !((unsigned long)block & 0xf);
                     -    for (y=0; y<h; y++) {
                     +    for (y = 0; y < height; y++) {
                              vblock = vec_ld(0, block);
                              v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
+                     }
                      static av_always_inline
                     -void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
                     -                               int weightd, int weights, int offset, int w, int h)
                     +void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
                     +                             int log2_denom, int weightd, int weights, int offset, int w)
+                     {
                          int y, dst_aligned, src_aligned;
                          vec_u8 vsrc, vdst;
@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
                          dst_aligned = !((unsigned long)dst & 0xf);
                          src_aligned = !((unsigned long)src & 0xf);
                     -    for (y=0; y<h; y++) {
                     +    for (y = 0; y < height; y++) {
                              vdst = vec_ld(0, dst);
                              vsrc = vec_ld(0, src);
@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
+                         }
+                     }
                     -#define H264_WEIGHT(W,H) \
                     -static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
                     -    weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
                     +#define H264_WEIGHT(W) \
                     +static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
                     +                                                   int log2_denom, int weight, int offset){ \
                     +    weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
                      }\
                     -static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
                     -    biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
                     +static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
                     +                                                     int log2_denom, int weightd, int weights, int offset){ \
                     +    biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
+                     }
                     -H264_WEIGHT(16,16)
                     -H264_WEIGHT(16, 8)
                     -H264_WEIGHT( 8,16)
                     -H264_WEIGHT( 8, 8)
                     -H264_WEIGHT( 8, 4)
                     +H264_WEIGHT(16)
                     +H264_WEIGHT( 8)
                      void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
                          const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
                              c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
                              c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
                     -        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
                     -        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
                     -        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
                     -        c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
                     -        c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
                     -        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
                     -        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
                     -        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
                     -        c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
                     -        c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
                     +        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
                     +        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
                     +        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
                     +        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
+                         }
+                         }
+                     }

libavcodec/x86/h264_weight.asm

History View file @ c2d3374

@@ -28,21 +28,20 @@ SECTION .text
                      ;-----------------------------------------------------------------------------
                      ; biweight pred:
+                     ;
                     -; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
                     -;                               int log2_denom, int weightd, int weights,
                     -;                               int offset);
                     +; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
                     +;                            int height, int log2_denom, int weightd,
                     +;                            int weights, int offset);
                      ; and
                     -; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
                     -;                             int log2_denom, int weight,
                     -;                             int offset);
                     +; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
                     +;                          int log2_denom, int weight, int offset);
                      ;-----------------------------------------------------------------------------
                      %macro WEIGHT_SETUP 0
                     -    add        r4, r4
                     -    inc        r4
                     -    movd       m3, r3d
                     -    movd       m5, r4d
                     -    movd       m6, r2d
                     +    add        r5, r5
                     +    inc        r5
                     +    movd       m3, r4d
                     +    movd       m5, r5d
                     +    movd       m6, r3d
                          pslld      m5, m6
                          psrld      m5, 1
                      %if mmsize == 16
@@ -71,60 +70,41 @@ SECTION .text
                          packuswb      m0, m1
                      %endmacro
                     -%macro WEIGHT_FUNC_DBL_MM 1
                     -cglobal h264_weight_16x%1_mmx2, 5, 5, 0
                     +INIT_MMX
                     +cglobal h264_weight_16_mmx2, 6, 6, 0
                          WEIGHT_SETUP
                     -    mov        r2, %1
                     -%if %1 == 16
                      .nextrow
                          WEIGHT_OP 0,  4
                          mova     [r0  ], m0
                          WEIGHT_OP 8, 12
                          mova     [r0+8], m0
                          add        r0, r1
                     -    dec        r2
                     +    dec        r2d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
                     -%endif
                     -%endmacro
                     -INIT_MMX
                     -WEIGHT_FUNC_DBL_MM 16
                     -WEIGHT_FUNC_DBL_MM  8
+                    -
                     -%macro WEIGHT_FUNC_MM 4
                     -cglobal h264_weight_%1x%2_%4, 7, 7, %3
                     +%macro WEIGHT_FUNC_MM 3
                     +cglobal h264_weight_%1_%3, 6, 6, %2
                          WEIGHT_SETUP
                     -    mov        r2, %2
                     -%if %2 == 16
                      .nextrow
                          WEIGHT_OP 0, mmsize/2
                          mova     [r0], m0
                          add        r0, r1
                     -    dec        r2
                     +    dec        r2d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
                     -%endif
                      %endmacro
                      INIT_MMX
                     -WEIGHT_FUNC_MM  8, 16,  0, mmx2
                     -WEIGHT_FUNC_MM  8,  8,  0, mmx2
                     -WEIGHT_FUNC_MM  8,  4,  0, mmx2
                     +WEIGHT_FUNC_MM  8, 0, mmx2
                      INIT_XMM
                     -WEIGHT_FUNC_MM 16, 16,  8, sse2
                     -WEIGHT_FUNC_MM 16,  8,  8, sse2
                     +WEIGHT_FUNC_MM 16, 8, sse2
                     -%macro WEIGHT_FUNC_HALF_MM 5
                     -cglobal h264_weight_%1x%2_%5, 5, 5, %4
                     +%macro WEIGHT_FUNC_HALF_MM 3
                     +cglobal h264_weight_%1_%3, 6, 6, %2
                          WEIGHT_SETUP
                     -    mov        r2, %2/2
                     +    sar       r2d, 1
                          lea        r3, [r1*2]
                     -%if %2 == mmsize
                      .nextrow
                          WEIGHT_OP 0, r1
                          movh     [r0], m0
@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
                          movh     [r0+r1], m0
                      %endif
                          add        r0, r3
                     -    dec        r2
                     +    dec        r2d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
                     -%endif
                      %endmacro
                      INIT_MMX
                     -WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
                     -WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
                     -WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
                     +WEIGHT_FUNC_HALF_MM 4, 0, mmx2
                     +WEIGHT_FUNC_HALF_MM 4, 0, mmx2
                     +WEIGHT_FUNC_HALF_MM 4, 0, mmx2
                      INIT_XMM
                     -WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
                     -WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
                     -WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
                     +WEIGHT_FUNC_HALF_MM 8, 8, sse2
                     +WEIGHT_FUNC_HALF_MM 8, 8, sse2
                     +WEIGHT_FUNC_HALF_MM 8, 8, sse2
                      %macro BIWEIGHT_SETUP 0
                     -    add        r6, 1
                     -    or         r6, 1
                     -    add        r3, 1
                     -    movd       m3, r4d
                     -    movd       m4, r5d
                     -    movd       m5, r6d
                     -    movd       m6, r3d
                     +%ifdef ARCH_X86_64
                     +%define off_regd r11d
                     +%else
                     +%define off_regd r3d
                     +%endif
                     +    mov  off_regd, r7m
                     +    add  off_regd, 1
                     +    or   off_regd, 1
                     +    add        r4, 1
                     +    movd       m3, r5d
                     +    movd       m4, r6d
                     +    movd       m5, off_regd
                     +    movd       m6, r4d
                          pslld      m5, m6
                          psrld      m5, 1
                      %if mmsize == 16
@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
                          packuswb   m0, m1
                      %endmacro
                     -%macro BIWEIGHT_FUNC_DBL_MM 1
                     -cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
                     +INIT_MMX
                     +cglobal h264_biweight_16_mmx2, 7, 7, 0
                          BIWEIGHT_SETUP
                     -    mov        r3, %1
                     -%if %1 == 16
                     +    movifnidn r3d, r3m
                      .nextrow
                          BIWEIGHT_STEPA 0, 1, 0
                          BIWEIGHT_STEPA 1, 2, 4
@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
                          mova     [r0+8], m0
                          add        r0, r2
                          add        r1, r2
                     -    dec        r3
                     +    dec        r3d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
                     -%endif
                     -%endmacro
                     -INIT_MMX
                     -BIWEIGHT_FUNC_DBL_MM 16
                     -BIWEIGHT_FUNC_DBL_MM  8
+                    -
                     -%macro BIWEIGHT_FUNC_MM 4
                     -cglobal h264_biweight_%1x%2_%4, 7, 7, %3
                     +%macro BIWEIGHT_FUNC_MM 3
                     +cglobal h264_biweight_%1_%3, 7, 7, %2
                          BIWEIGHT_SETUP
                     -    mov        r3, %2
                     -%if %2 == 16
                     +    movifnidn r3d, r3m
                      .nextrow
                          BIWEIGHT_STEPA 0, 1, 0
                          BIWEIGHT_STEPA 1, 2, mmsize/2
@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
                          mova       [r0], m0
                          add        r0, r2
                          add        r1, r2
                     -    dec        r3
                     +    dec        r3d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
                     -%endif
                      %endmacro
                      INIT_MMX
                     -BIWEIGHT_FUNC_MM  8, 16,  0, mmx2
                     -BIWEIGHT_FUNC_MM  8,  8,  0, mmx2
                     -BIWEIGHT_FUNC_MM  8,  4,  0, mmx2
                     +BIWEIGHT_FUNC_MM  8, 0, mmx2
                      INIT_XMM
                     -BIWEIGHT_FUNC_MM 16, 16,  8, sse2
                     -BIWEIGHT_FUNC_MM 16,  8,  8, sse2
                     +BIWEIGHT_FUNC_MM 16, 8, sse2
                     -%macro BIWEIGHT_FUNC_HALF_MM 5
                     -cglobal h264_biweight_%1x%2_%5, 7, 7, %4
                     +%macro BIWEIGHT_FUNC_HALF_MM 3
                     +cglobal h264_biweight_%1_%3, 7, 7, %2
                          BIWEIGHT_SETUP
                     -    mov        r3, %2/2
                     +    movifnidn r3d, r3m
                     +    sar        r3, 1
                          lea        r4, [r2*2]
                     -%if %2 == mmsize
                      .nextrow
                          BIWEIGHT_STEPA 0, 1, 0
                          BIWEIGHT_STEPA 1, 2, r2
@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
                      %endif
                          add        r0, r4
                          add        r1, r4
                     -    dec        r3
                     +    dec        r3d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
                     -%endif
                      %endmacro
                      INIT_MMX
                     -BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
                     -BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
                     -BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
                     +BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
                      INIT_XMM
                     -BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
                     -BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
                     -BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
                     +BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
                      %macro BIWEIGHT_SSSE3_SETUP 0
                     -    add        r6, 1
                     -    or         r6, 1
                     -    add        r3, 1
                     -    movd       m4, r4d
                     -    movd       m0, r5d
                     -    movd       m5, r6d
                     -    movd       m6, r3d
                     +%ifdef ARCH_X86_64
                     +%define off_regd r11d
                     +%else
                     +%define off_regd r3d
                     +%endif
                     +    mov  off_regd, r7m
                     +    add  off_regd, 1
                     +    or   off_regd, 1
                     +    add        r4, 1
                     +    movd       m4, r5d
                     +    movd       m0, r6d
                     +    movd       m5, off_regd
                     +    movd       m6, r4d
                          pslld      m5, m6
                          psrld      m5, 1
                          punpcklbw  m4, m0
@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
                          packuswb   m0, m2
                      %endmacro
                     -%macro BIWEIGHT_SSSE3_16 1
                     -cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
                     +INIT_XMM
                     +cglobal h264_biweight_16_ssse3, 7, 7, 8
                          BIWEIGHT_SSSE3_SETUP
                     -    mov        r3, %1
                     +    movifnidn r3d, r3m
                     -%if %1 == 16
                      .nextrow
                          movh       m0, [r0]
                          movh       m2, [r0+8]
@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
                          mova       [r0], m0
                          add        r0, r2
                          add        r1, r2
                     -    dec        r3
                     +    dec        r3d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
                     -%endif
                     -%endmacro
                      INIT_XMM
                     -BIWEIGHT_SSSE3_16 16
                     -BIWEIGHT_SSSE3_16  8
+                    -
                     -%macro BIWEIGHT_SSSE3_8 1
                     -cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
                     +cglobal h264_biweight_8_ssse3, 7, 7, 8
                          BIWEIGHT_SSSE3_SETUP
                     -    mov        r3, %1/2
                     +    movifnidn r3d, r3m
                     +    sar        r3, 1
                          lea        r4, [r2*2]
                     -%if %1 == 16
                      .nextrow
                          movh       m0, [r0]
                          movh       m1, [r1]
@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
                          movhps     [r0+r2], m0
                          add        r0, r4
                          add        r1, r4
                     -    dec        r3
                     +    dec        r3d
                          jnz .nextrow
                          REP_RET
                     -%else
                     -    jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
                     -%endif
                     -%endmacro
+                    -
                     -INIT_XMM
                     -BIWEIGHT_SSSE3_8 16
                     -BIWEIGHT_SSSE3_8  8
                     -BIWEIGHT_SSSE3_8  4

libavcodec/x86/h264_weight_10bit.asm

History View file @ c2d3374

@@ -36,33 +36,26 @@ cextern pw_1
                      SECTION .text
                      ;-----------------------------------------------------------------------------
                     -; void h264_weight(uint8_t *dst, int stride, int log2_denom,
                     +; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
                      ;                  int weight, int offset);
                      ;-----------------------------------------------------------------------------
                     -%ifdef ARCH_X86_32
                     -DECLARE_REG_TMP 2
                     -%else
                     -DECLARE_REG_TMP 10
                     -%endif
+                    -
                     -%macro WEIGHT_PROLOGUE 1
                     -    mov t0, %1
                     +%macro WEIGHT_PROLOGUE 0
                      .prologue
                     -    PROLOGUE 0,5,8
                     +    PROLOGUE 0,6,8
                          movifnidn  r0, r0mp
                          movifnidn r1d, r1m
                     -    movifnidn r3d, r3m
                          movifnidn r4d, r4m
                     +    movifnidn r5d, r5m
                      %endmacro
                      %macro WEIGHT_SETUP 1
                          mova       m0, [pw_1]
                     -    movd       m2, r2m
                     +    movd       m2, r3m
                          pslld      m0, m2       ; 1<<log2_denom
                          SPLATW     m0, m0
                     -    shl        r4, 19       ; *8, move to upper half of dword
                     -    lea        r4, [r4+r3*2+0x10000]
                     -    movd       m3, r4d      ; weight<<1 | 1+(offset<<(3))
                     +    shl        r5, 19       ; *8, move to upper half of dword
                     +    lea        r5, [r5+r4*2+0x10000]
                     +    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
                          pshufd     m3, m3, 0
                          mova       m4, [pw_pixel_max]
                          paddw      m2, [sq_1]   ; log2_denom+1
@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
                      %endmacro
                      %macro WEIGHT_FUNC_DBL 1
                     -cglobal h264_weight_16x16_10_%1
                     -    WEIGHT_PROLOGUE 16
                     +cglobal h264_weight_16_10_%1
                     +    WEIGHT_PROLOGUE
                          WEIGHT_SETUP %1
                      .nextrow
                          WEIGHT_OP %1,  0
@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
                          WEIGHT_OP %1, 16
                          mova [r0+16], m5
                          add       r0, r1
                     -    dec       t0
                     +    dec       r2d
                          jnz .nextrow
                          REP_RET
+                    -
                     -cglobal h264_weight_16x8_10_%1
                     -    mov t0, 8
                     -    jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
                      %endmacro
                      INIT_XMM
@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
                      %macro WEIGHT_FUNC_MM 1
                     -cglobal h264_weight_8x16_10_%1
                     -    WEIGHT_PROLOGUE 16
                     +cglobal h264_weight_8_10_%1
                     +    WEIGHT_PROLOGUE
                          WEIGHT_SETUP %1
                      .nextrow
                          WEIGHT_OP  %1, 0
                          mova     [r0], m5
                          add        r0, r1
                     -    dec        t0
                     +    dec        r2d
                          jnz .nextrow
                          REP_RET
+                    -
                     -cglobal h264_weight_8x8_10_%1
                     -    mov t0, 8
                     -    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
+                    -
                     -cglobal h264_weight_8x4_10_%1
                     -    mov t0, 4
                     -    jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
                      %endmacro
                      INIT_XMM
@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
                      %macro WEIGHT_FUNC_HALF_MM 1
                     -cglobal h264_weight_4x8_10_%1
                     -    WEIGHT_PROLOGUE 4
                     +cglobal h264_weight_4_10_%1
                     +    WEIGHT_PROLOGUE
                     +    sar         r2d, 1
                          WEIGHT_SETUP %1
                          lea         r3, [r1*2]
                      .nextrow
@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
                          movh      [r0], m5
                          movhps [r0+r1], m5
                          add         r0, r3
                     -    dec         t0
                     +    dec         r2d
                          jnz .nextrow
                          REP_RET
+                    -
                     -cglobal h264_weight_4x4_10_%1
                     -    mov t0, 2
                     -    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
+                    -
                     -cglobal h264_weight_4x2_10_%1
                     -    mov t0, 1
                     -    jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
                      %endmacro
                      INIT_XMM
@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
                      ;-----------------------------------------------------------------------------
                     -; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
                     -;                    int weightd, int weights, int offset);
                     +; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
                     +;                    int log2_denom, int weightd, int weights, int offset);
                      ;-----------------------------------------------------------------------------
                      %ifdef ARCH_X86_32
                     -DECLARE_REG_TMP 2,3
                     +DECLARE_REG_TMP 3
                      %else
                     -DECLARE_REG_TMP 10,2
                     +DECLARE_REG_TMP 10
                      %endif
                     -%macro BIWEIGHT_PROLOGUE 1
                     -    mov t0, %1
                     +%macro BIWEIGHT_PROLOGUE 0
                      .prologue
                          PROLOGUE 0,7,8
                          movifnidn  r0, r0mp
                          movifnidn  r1, r1mp
                     -    movifnidn t1d, r2m
                     -    movifnidn r4d, r4m
                     +    movifnidn r2d, r2m
                          movifnidn r5d, r5m
                          movifnidn r6d, r6m
                     +    movifnidn t0d, r7m
                      %endmacro
                      %macro BIWEIGHT_SETUP 1
                     -    lea        r6, [r6*4+1] ; (offset<<2)+1
                     -    or         r6, 1
                     -    shl        r5, 16
                     -    or         r4, r5
                     -    movd       m4, r4d      ; weightd | weights
                     -    movd       m5, r6d      ; (offset+1)|1
                     -    movd       m6, r3m      ; log2_denom
                     +    lea        t0, [t0*4+1] ; (offset<<2)+1
                     +    or         t0, 1
                     +    shl        r6, 16
                     +    or         r5, r6
                     +    movd       m4, r5d      ; weightd | weights
                     +    movd       m5, t0d      ; (offset+1)|1
                     +    movd       m6, r4m      ; log2_denom
                          pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
                          paddd      m6, [sq_1]
                          pshufd     m4, m4, 0
                          pshufd     m5, m5, 0
                          mova       m3, [pw_pixel_max]
                     +    movifnidn r3d, r3m
                      %ifnidn %1, sse4
                          pxor       m7, m7
                      %endif
@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
                      %endmacro
                      %macro BIWEIGHT_FUNC_DBL 1
                     -cglobal h264_biweight_16x16_10_%1
                     -    BIWEIGHT_PROLOGUE 16
                     +cglobal h264_biweight_16_10_%1
                     +    BIWEIGHT_PROLOGUE
                          BIWEIGHT_SETUP %1
                      .nextrow
                          BIWEIGHT  %1,  0
                          mova [r0   ], m0
                          BIWEIGHT  %1, 16
                          mova [r0+16], m0
                     -    add       r0, t1
                     -    add       r1, t1
                     -    dec       t0
                     +    add       r0, r2
                     +    add       r1, r2
                     +    dec       r3d
                          jnz .nextrow
                          REP_RET
+                    -
                     -cglobal h264_biweight_16x8_10_%1
                     -    mov t0, 8
                     -    jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
                      %endmacro
                      INIT_XMM
@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
                      BIWEIGHT_FUNC_DBL sse4
                      %macro BIWEIGHT_FUNC 1
                     -cglobal h264_biweight_8x16_10_%1
                     -    BIWEIGHT_PROLOGUE 16
                     +cglobal h264_biweight_8_10_%1
                     +    BIWEIGHT_PROLOGUE
                          BIWEIGHT_SETUP %1
                      .nextrow
                          BIWEIGHT %1, 0
                          mova   [r0], m0
                     -    add      r0, t1
                     -    add      r1, t1
                     -    dec      t0
                     +    add      r0, r2
                     +    add      r1, r2
                     +    dec      r3d
                          jnz .nextrow
                          REP_RET
+                    -
                     -cglobal h264_biweight_8x8_10_%1
                     -    mov t0, 8
                     -    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
+                    -
                     -cglobal h264_biweight_8x4_10_%1
                     -    mov t0, 4
                     -    jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
                      %endmacro
                      INIT_XMM
@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
                      BIWEIGHT_FUNC sse4
                      %macro BIWEIGHT_FUNC_HALF 1
                     -cglobal h264_biweight_4x8_10_%1
                     -    BIWEIGHT_PROLOGUE 4
                     +cglobal h264_biweight_4_10_%1
                     +    BIWEIGHT_PROLOGUE
                          BIWEIGHT_SETUP %1
                     -    lea        r4, [t1*2]
                     +    sar        r3d, 1
                     +    lea        r4, [r2*2]
                      .nextrow
                     -    BIWEIGHT    %1, 0, t1
                     +    BIWEIGHT    %1, 0, r2
                          movh   [r0   ], m0
                     -    movhps [r0+t1], m0
                     +    movhps [r0+r2], m0
                          add         r0, r4
                          add         r1, r4
                     -    dec         t0
                     +    dec         r3d
                          jnz .nextrow
                          REP_RET
+                    -
                     -cglobal h264_biweight_4x4_10_%1
                     -    mov t0, 2
                     -    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
+                    -
                     -cglobal h264_biweight_4x2_10_%1
                     -    mov t0, 1
                     -    jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
                      %endmacro
                      INIT_XMM

libavcodec/x86/h264dsp_mmx.c

History View file @ c2d3374

@@ -298,57 +298,47 @@ LF_IFUNC(v,  luma_intra,      10, mmxext)
                      /***********************************/
                      /* weighted prediction */
                     -#define H264_WEIGHT(W, H, OPT) \
                     -void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
                     -    int stride, int log2_denom, int weight, int offset);
                     +#define H264_WEIGHT(W, OPT) \
                     +void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
                     +    int stride, int height, int log2_denom, int weight, int offset);
                     -#define H264_BIWEIGHT(W, H, OPT) \
                     -void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
                     -    uint8_t *src, int stride, int log2_denom, int weightd, \
                     +#define H264_BIWEIGHT(W, OPT) \
                     +void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
                     +    uint8_t *src, int stride, int height, int log2_denom, int weightd, \
                          int weights, int offset);
                     -#define H264_BIWEIGHT_MMX(W,H) \
                     -H264_WEIGHT  (W, H, mmx2) \
                     -H264_BIWEIGHT(W, H, mmx2)
+                    -
                     -#define H264_BIWEIGHT_MMX_SSE(W,H) \
                     -H264_BIWEIGHT_MMX(W, H) \
                     -H264_WEIGHT      (W, H, sse2) \
                     -H264_BIWEIGHT    (W, H, sse2) \
                     -H264_BIWEIGHT    (W, H, ssse3)
+                    -
                     -H264_BIWEIGHT_MMX_SSE(16, 16)
                     -H264_BIWEIGHT_MMX_SSE(16,  8)
                     -H264_BIWEIGHT_MMX_SSE( 8, 16)
                     -H264_BIWEIGHT_MMX_SSE( 8,  8)
                     -H264_BIWEIGHT_MMX_SSE( 8,  4)
                     -H264_BIWEIGHT_MMX    ( 4,  8)
                     -H264_BIWEIGHT_MMX    ( 4,  4)
                     -H264_BIWEIGHT_MMX    ( 4,  2)
+                    -
                     -#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
                     -void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                     -    int stride, int log2_denom, int weight, int offset);
+                    -
                     -#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
                     -void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
                     -    (uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
                     +#define H264_BIWEIGHT_MMX(W) \
                     +H264_WEIGHT  (W, mmx2) \
                     +H264_BIWEIGHT(W, mmx2)
+                    +
                     +#define H264_BIWEIGHT_MMX_SSE(W) \
                     +H264_BIWEIGHT_MMX(W) \
                     +H264_WEIGHT      (W, sse2) \
                     +H264_BIWEIGHT    (W, sse2) \
                     +H264_BIWEIGHT    (W, ssse3)
+                    +
                     +H264_BIWEIGHT_MMX_SSE(16)
                     +H264_BIWEIGHT_MMX_SSE( 8)
                     +H264_BIWEIGHT_MMX    ( 4)
+                    +
                     +#define H264_WEIGHT_10(W, DEPTH, OPT) \
                     +void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                     +    int stride, int height, int log2_denom, int weight, int offset);
+                    +
                     +#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
                     +void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
                     +    (uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
                           int weightd, int weights, int offset);
                     -#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
                     -H264_WEIGHT_10  (W, H, DEPTH, sse2) \
                     -H264_WEIGHT_10  (W, H, DEPTH, sse4) \
                     -H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
                     -H264_BIWEIGHT_10(W, H, DEPTH, sse4)
+                    -
                     -H264_BIWEIGHT_10_SSE(16, 16, 10)
                     -H264_BIWEIGHT_10_SSE(16,  8, 10)
                     -H264_BIWEIGHT_10_SSE( 8, 16, 10)
                     -H264_BIWEIGHT_10_SSE( 8,  8, 10)
                     -H264_BIWEIGHT_10_SSE( 8,  4, 10)
                     -H264_BIWEIGHT_10_SSE( 4,  8, 10)
                     -H264_BIWEIGHT_10_SSE( 4,  4, 10)
                     -H264_BIWEIGHT_10_SSE( 4,  2, 10)
                     +#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
                     +H264_WEIGHT_10  (W, DEPTH, sse2) \
                     +H264_WEIGHT_10  (W, DEPTH, sse4) \
                     +H264_BIWEIGHT_10(W, DEPTH, sse2) \
                     +H264_BIWEIGHT_10(W, DEPTH, sse4)
+                    +
                     +H264_BIWEIGHT_10_SSE(16, 10)
                     +H264_BIWEIGHT_10_SSE( 8, 10)
                     +H264_BIWEIGHT_10_SSE( 4, 10)
                      void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
+                     {
@@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                                  c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
                                  c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
                      #endif
                     -            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
                     -            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
                     -            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
                     -            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
                     -            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
                     -            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
                     -            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
                     -            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+                    -
                     -            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
                     -            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
                     -            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
                     -            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
                     -            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
                     -            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
                     -            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
                     -            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
                     +            c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
                     +            c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
                     +            c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
+                    +
                     +            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
                     +            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
                     +            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
                                  if (mm_flags&AV_CPU_FLAG_SSE2) {
                                      c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
@@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                                      c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
                                      c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
                     -                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                     -                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
                     -                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                     -                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                     -                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
                     +                c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
                     +                c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
                     -                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
                     -                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
                     -                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                     -                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                     -                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
                     +                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
                     +                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
                      #if HAVE_ALIGNED_STACK
                                      c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
@@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                      #endif
+                                 }
                                  if (mm_flags&AV_CPU_FLAG_SSSE3) {
                     -                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
                     -                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
                     -                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                     -                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                     -                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
                     +                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
                     +                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
+                                 }
                                  if (mm_flags&AV_CPU_FLAG_AVX) {
                      #if HAVE_ALIGNED_STACK
@@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                                      c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
                      #endif
                     -                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
                     -                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
                     -                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
                     -                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
                     -                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
                     -                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
                     -                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
                     -                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
+                    -
                     -                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
                     -                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
                     -                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
                     -                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
                     -                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
                     -                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
                     -                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
                     -                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
                     +                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                     +                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                     +                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
+                    +
                     +                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                     +                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                     +                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
                                      c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                                      c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
@@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
                      #endif
                                  }
                                  if (mm_flags&AV_CPU_FLAG_SSE4) {
                     -                c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
                     -                c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
                     -                c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
                     -                c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
                     -                c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
                     -                c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
                     -                c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
                     -                c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
                     -
                     -                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
                     -                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
                     -                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
                     -                c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
                     -                c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
                     -                c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
                     -                c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
                     -                c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
                     +                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                     +                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                     +                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
                     +
                     +                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                     +                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                     +                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
                                  }
                      #if HAVE_AVX
                                  if (mm_flags&AV_CPU_FLAG_AVX) {