Browse code

h264: use templates to avoid excessive inlining

Instead of inlining everything into ff_h264_hl_decode_mb(), use
explicit templating to create versions of the called functions
with constant parameters filled in. This greatly speeds up
compilation of h264.c and reduces the code size without any
measurable impact on performance.

Compilation time for h264.c on an i7 goes from 30s to 5.5s.
Code size is reduced by 430kB.

Signed-off-by: Mans Rullgard <mans@mansr.com>

Mans Rullgard authored on 2012/07/04 07:16:11
Showing 3 changed files
... ...
@@ -714,33 +714,6 @@ static av_always_inline void mc_part_weighted(H264Context *h, int n, int square,
714 714
     }
715 715
 }
716 716
 
717
-static av_always_inline void mc_part(H264Context *h, int n, int square,
718
-                                     int height, int delta,
719
-                                     uint8_t *dest_y, uint8_t *dest_cb,
720
-                                     uint8_t *dest_cr,
721
-                                     int x_offset, int y_offset,
722
-                                     qpel_mc_func *qpix_put,
723
-                                     h264_chroma_mc_func chroma_put,
724
-                                     qpel_mc_func *qpix_avg,
725
-                                     h264_chroma_mc_func chroma_avg,
726
-                                     h264_weight_func *weight_op,
727
-                                     h264_biweight_func *weight_avg,
728
-                                     int list0, int list1,
729
-                                     int pixel_shift, int chroma_idc)
730
-{
731
-    if ((h->use_weight == 2 && list0 && list1 &&
732
-         (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) ||
733
-        h->use_weight == 1)
734
-        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
735
-                         x_offset, y_offset, qpix_put, chroma_put,
736
-                         weight_op[0], weight_op[1], weight_avg[0],
737
-                         weight_avg[1], list0, list1, pixel_shift, chroma_idc);
738
-    else
739
-        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
740
-                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
741
-                    chroma_avg, list0, list1, pixel_shift, chroma_idc);
742
-}
743
-
744 717
 static av_always_inline void prefetch_motion(H264Context *h, int list,
745 718
                                              int pixel_shift, int chroma_idc)
746 719
 {
... ...
@@ -768,146 +741,6 @@ static av_always_inline void prefetch_motion(H264Context *h, int list,
768 768
     }
769 769
 }
770 770
 
771
-static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y,
772
-                                       uint8_t *dest_cb, uint8_t *dest_cr,
773
-                                       qpel_mc_func(*qpix_put)[16],
774
-                                       h264_chroma_mc_func(*chroma_put),
775
-                                       qpel_mc_func(*qpix_avg)[16],
776
-                                       h264_chroma_mc_func(*chroma_avg),
777
-                                       h264_weight_func *weight_op,
778
-                                       h264_biweight_func *weight_avg,
779
-                                       int pixel_shift, int chroma_idc)
780
-{
781
-    MpegEncContext *const s = &h->s;
782
-    const int mb_xy   = h->mb_xy;
783
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
784
-
785
-    assert(IS_INTER(mb_type));
786
-
787
-    if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
788
-        await_references(h);
789
-    prefetch_motion(h, 0, pixel_shift, chroma_idc);
790
-
791
-    if (IS_16X16(mb_type)) {
792
-        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
793
-                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
794
-                weight_op, weight_avg,
795
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
796
-                pixel_shift, chroma_idc);
797
-    } else if (IS_16X8(mb_type)) {
798
-        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
799
-                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
800
-                weight_op, weight_avg,
801
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
802
-                pixel_shift, chroma_idc);
803
-        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
804
-                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
805
-                weight_op, weight_avg,
806
-                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
807
-                pixel_shift, chroma_idc);
808
-    } else if (IS_8X16(mb_type)) {
809
-        mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
810
-                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
811
-                &weight_op[1], &weight_avg[1],
812
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
813
-                pixel_shift, chroma_idc);
814
-        mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
815
-                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
816
-                &weight_op[1], &weight_avg[1],
817
-                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
818
-                pixel_shift, chroma_idc);
819
-    } else {
820
-        int i;
821
-
822
-        assert(IS_8X8(mb_type));
823
-
824
-        for (i = 0; i < 4; i++) {
825
-            const int sub_mb_type = h->sub_mb_type[i];
826
-            const int n  = 4 * i;
827
-            int x_offset = (i & 1) << 2;
828
-            int y_offset = (i & 2) << 1;
829
-
830
-            if (IS_SUB_8X8(sub_mb_type)) {
831
-                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr,
832
-                        x_offset, y_offset,
833
-                        qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
834
-                        &weight_op[1], &weight_avg[1],
835
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
836
-                        pixel_shift, chroma_idc);
837
-            } else if (IS_SUB_8X4(sub_mb_type)) {
838
-                mc_part(h, n, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr,
839
-                        x_offset, y_offset,
840
-                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
841
-                        &weight_op[1], &weight_avg[1],
842
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
843
-                        pixel_shift, chroma_idc);
844
-                mc_part(h, n + 2, 0, 4, 4 << pixel_shift,
845
-                        dest_y, dest_cb, dest_cr, x_offset, y_offset + 2,
846
-                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
847
-                        &weight_op[1], &weight_avg[1],
848
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
849
-                        pixel_shift, chroma_idc);
850
-            } else if (IS_SUB_4X8(sub_mb_type)) {
851
-                mc_part(h, n, 0, 8, 4 * h->mb_linesize,
852
-                        dest_y, dest_cb, dest_cr, x_offset, y_offset,
853
-                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
854
-                        &weight_op[2], &weight_avg[2],
855
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
856
-                        pixel_shift, chroma_idc);
857
-                mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize,
858
-                        dest_y, dest_cb, dest_cr, x_offset + 2, y_offset,
859
-                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
860
-                        &weight_op[2], &weight_avg[2],
861
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
862
-                        pixel_shift, chroma_idc);
863
-            } else {
864
-                int j;
865
-                assert(IS_SUB_4X4(sub_mb_type));
866
-                for (j = 0; j < 4; j++) {
867
-                    int sub_x_offset = x_offset + 2 * (j & 1);
868
-                    int sub_y_offset = y_offset + (j & 2);
869
-                    mc_part(h, n + j, 1, 4, 0,
870
-                            dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
871
-                            qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
872
-                            &weight_op[2], &weight_avg[2],
873
-                            IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
874
-                            pixel_shift, chroma_idc);
875
-                }
876
-            }
877
-        }
878
-    }
879
-
880
-    prefetch_motion(h, 1, pixel_shift, chroma_idc);
881
-}
882
-
883
-static av_always_inline void hl_motion_420(H264Context *h, uint8_t *dest_y,
884
-                                           uint8_t *dest_cb, uint8_t *dest_cr,
885
-                                           qpel_mc_func(*qpix_put)[16],
886
-                                           h264_chroma_mc_func(*chroma_put),
887
-                                           qpel_mc_func(*qpix_avg)[16],
888
-                                           h264_chroma_mc_func(*chroma_avg),
889
-                                           h264_weight_func *weight_op,
890
-                                           h264_biweight_func *weight_avg,
891
-                                           int pixel_shift)
892
-{
893
-    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
894
-              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
895
-}
896
-
897
-static av_always_inline void hl_motion_422(H264Context *h, uint8_t *dest_y,
898
-                                           uint8_t *dest_cb, uint8_t *dest_cr,
899
-                                           qpel_mc_func(*qpix_put)[16],
900
-                                           h264_chroma_mc_func(*chroma_put),
901
-                                           qpel_mc_func(*qpix_avg)[16],
902
-                                           h264_chroma_mc_func(*chroma_avg),
903
-                                           h264_weight_func *weight_op,
904
-                                           h264_biweight_func *weight_avg,
905
-                                           int pixel_shift)
906
-{
907
-    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
908
-              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
909
-}
910
-
911 771
 static void free_tables(H264Context *h, int free_rbsp)
912 772
 {
913 773
     int i;
... ...
@@ -2077,373 +1910,17 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
2077 2077
     }
2078 2078
 }
2079 2079
 
2080
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple,
2081
-                                                   int pixel_shift)
2082
-{
2083
-    MpegEncContext *const s = &h->s;
2084
-    const int mb_x    = s->mb_x;
2085
-    const int mb_y    = s->mb_y;
2086
-    const int mb_xy   = h->mb_xy;
2087
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
2088
-    uint8_t *dest_y, *dest_cb, *dest_cr;
2089
-    int linesize, uvlinesize /*dct_offset*/;
2090
-    int i, j;
2091
-    int *block_offset = &h->block_offset[0];
2092
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2093
-    /* is_h264 should always be true if SVQ3 is disabled. */
2094
-    const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2095
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2096
-    const int block_h   = 16 >> s->chroma_y_shift;
2097
-    const int chroma422 = CHROMA422;
2098
-
2099
-    dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift)     + mb_y * s->linesize)  * 16;
2100
-    dest_cb = s->current_picture.f.data[1] +  (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h;
2101
-    dest_cr = s->current_picture.f.data[2] +  (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h;
2102
-
2103
-    s->dsp.prefetch(dest_y  + (s->mb_x & 3) * 4 * s->linesize   + (64 << pixel_shift), s->linesize,       4);
2104
-    s->dsp.prefetch(dest_cb + (s->mb_x & 7)     * s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2);
2105
-
2106
-    h->list_counts[mb_xy] = h->list_count;
2107
-
2108
-    if (!simple && MB_FIELD) {
2109
-        linesize     = h->mb_linesize = s->linesize * 2;
2110
-        uvlinesize   = h->mb_uvlinesize = s->uvlinesize * 2;
2111
-        block_offset = &h->block_offset[48];
2112
-        if (mb_y & 1) { // FIXME move out of this function?
2113
-            dest_y  -= s->linesize * 15;
2114
-            dest_cb -= s->uvlinesize * (block_h - 1);
2115
-            dest_cr -= s->uvlinesize * (block_h - 1);
2116
-        }
2117
-        if (FRAME_MBAFF) {
2118
-            int list;
2119
-            for (list = 0; list < h->list_count; list++) {
2120
-                if (!USES_LIST(mb_type, list))
2121
-                    continue;
2122
-                if (IS_16X16(mb_type)) {
2123
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
2124
-                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
2125
-                } else {
2126
-                    for (i = 0; i < 16; i += 4) {
2127
-                        int ref = h->ref_cache[list][scan8[i]];
2128
-                        if (ref >= 0)
2129
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
2130
-                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
2131
-                    }
2132
-                }
2133
-            }
2134
-        }
2135
-    } else {
2136
-        linesize   = h->mb_linesize   = s->linesize;
2137
-        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2138
-        // dct_offset = s->linesize * 16;
2139
-    }
2140
-
2141
-    if (!simple && IS_INTRA_PCM(mb_type)) {
2142
-        if (pixel_shift) {
2143
-            const int bit_depth = h->sps.bit_depth_luma;
2144
-            int j;
2145
-            GetBitContext gb;
2146
-            init_get_bits(&gb, (uint8_t *)h->mb,
2147
-                          ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
2148
-
2149
-            for (i = 0; i < 16; i++) {
2150
-                uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize);
2151
-                for (j = 0; j < 16; j++)
2152
-                    tmp_y[j] = get_bits(&gb, bit_depth);
2153
-            }
2154
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
2155
-                if (!h->sps.chroma_format_idc) {
2156
-                    for (i = 0; i < block_h; i++) {
2157
-                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
2158
-                        for (j = 0; j < 8; j++)
2159
-                            tmp_cb[j] = 1 << (bit_depth - 1);
2160
-                    }
2161
-                    for (i = 0; i < block_h; i++) {
2162
-                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
2163
-                        for (j = 0; j < 8; j++)
2164
-                            tmp_cr[j] = 1 << (bit_depth - 1);
2165
-                    }
2166
-                } else {
2167
-                    for (i = 0; i < block_h; i++) {
2168
-                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
2169
-                        for (j = 0; j < 8; j++)
2170
-                            tmp_cb[j] = get_bits(&gb, bit_depth);
2171
-                    }
2172
-                    for (i = 0; i < block_h; i++) {
2173
-                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
2174
-                        for (j = 0; j < 8; j++)
2175
-                            tmp_cr[j] = get_bits(&gb, bit_depth);
2176
-                    }
2177
-                }
2178
-            }
2179
-        } else {
2180
-            for (i = 0; i < 16; i++)
2181
-                memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
2182
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
2183
-                if (!h->sps.chroma_format_idc) {
2184
-                    for (i = 0; i < block_h; i++) {
2185
-                        memset(dest_cb + i * uvlinesize, 128, 8);
2186
-                        memset(dest_cr + i * uvlinesize, 128, 8);
2187
-                    }
2188
-                } else {
2189
-                    uint8_t *src_cb = (uint8_t *)h->mb + 256;
2190
-                    uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8;
2191
-                    for (i = 0; i < block_h; i++) {
2192
-                        memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
2193
-                        memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
2194
-                    }
2195
-                }
2196
-            }
2197
-        }
2198
-    } else {
2199
-        if (IS_INTRA(mb_type)) {
2200
-            if (h->deblocking_filter)
2201
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
2202
-                               uvlinesize, 1, 0, simple, pixel_shift);
2203
-
2204
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
2205
-                h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize);
2206
-                h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize);
2207
-            }
2208
-
2209
-            hl_decode_mb_predict_luma(h, mb_type, is_h264, simple,
2210
-                                      transform_bypass, pixel_shift,
2211
-                                      block_offset, linesize, dest_y, 0);
2212
-
2213
-            if (h->deblocking_filter)
2214
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
2215
-                               uvlinesize, 0, 0, simple, pixel_shift);
2216
-        } else if (is_h264) {
2217
-            if (chroma422) {
2218
-                hl_motion_422(h, dest_y, dest_cb, dest_cr,
2219
-                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2220
-                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2221
-                              h->h264dsp.weight_h264_pixels_tab,
2222
-                              h->h264dsp.biweight_h264_pixels_tab,
2223
-                              pixel_shift);
2224
-            } else {
2225
-                hl_motion_420(h, dest_y, dest_cb, dest_cr,
2226
-                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2227
-                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2228
-                              h->h264dsp.weight_h264_pixels_tab,
2229
-                              h->h264dsp.biweight_h264_pixels_tab,
2230
-                              pixel_shift);
2231
-            }
2232
-        }
2233
-
2234
-        hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass,
2235
-                               pixel_shift, block_offset, linesize, dest_y, 0);
2236
-
2237
-        if ((simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) &&
2238
-            (h->cbp & 0x30)) {
2239
-            uint8_t *dest[2] = { dest_cb, dest_cr };
2240
-            if (transform_bypass) {
2241
-                if (IS_INTRA(mb_type) && h->sps.profile_idc == 244 &&
2242
-                    (h->chroma_pred_mode == VERT_PRED8x8 ||
2243
-                     h->chroma_pred_mode == HOR_PRED8x8)) {
2244
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0],
2245
-                                                            block_offset + 16,
2246
-                                                            h->mb + (16 * 16 * 1 << pixel_shift),
2247
-                                                            uvlinesize);
2248
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1],
2249
-                                                            block_offset + 32,
2250
-                                                            h->mb + (16 * 16 * 2 << pixel_shift),
2251
-                                                            uvlinesize);
2252
-                } else {
2253
-                    idct_add = s->dsp.add_pixels4;
2254
-                    for (j = 1; j < 3; j++) {
2255
-                        for (i = j * 16; i < j * 16 + 4; i++)
2256
-                            if (h->non_zero_count_cache[scan8[i]] ||
2257
-                                dctcoef_get(h->mb, pixel_shift, i * 16))
2258
-                                idct_add(dest[j - 1] + block_offset[i],
2259
-                                         h->mb + (i * 16 << pixel_shift),
2260
-                                         uvlinesize);
2261
-                        if (chroma422) {
2262
-                            for (i = j * 16 + 4; i < j * 16 + 8; i++)
2263
-                                if (h->non_zero_count_cache[scan8[i + 4]] ||
2264
-                                    dctcoef_get(h->mb, pixel_shift, i * 16))
2265
-                                    idct_add(dest[j - 1] + block_offset[i + 4],
2266
-                                             h->mb + (i * 16 << pixel_shift),
2267
-                                             uvlinesize);
2268
-                        }
2269
-                    }
2270
-                }
2271
-            } else {
2272
-                if (is_h264) {
2273
-                    int qp[2];
2274
-                    if (chroma422) {
2275
-                        qp[0] = h->chroma_qp[0] + 3;
2276
-                        qp[1] = h->chroma_qp[1] + 3;
2277
-                    } else {
2278
-                        qp[0] = h->chroma_qp[0];
2279
-                        qp[1] = h->chroma_qp[1];
2280
-                    }
2281
-                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]])
2282
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << pixel_shift),
2283
-                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]);
2284
-                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]])
2285
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << pixel_shift),
2286
-                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]);
2287
-                    h->h264dsp.h264_idct_add8(dest, block_offset,
2288
-                                              h->mb, uvlinesize,
2289
-                                              h->non_zero_count_cache);
2290
-                } else if (CONFIG_SVQ3_DECODER) {
2291
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1,
2292
-                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]);
2293
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2,
2294
-                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][h->chroma_qp[1]][0]);
2295
-                    for (j = 1; j < 3; j++) {
2296
-                        for (i = j * 16; i < j * 16 + 4; i++)
2297
-                            if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) {
2298
-                                uint8_t *const ptr = dest[j - 1] + block_offset[i];
2299
-                                ff_svq3_add_idct_c(ptr, h->mb + i * 16,
2300
-                                                   uvlinesize,
2301
-                                                   ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
2302
-                            }
2303
-                    }
2304
-                }
2305
-            }
2306
-        }
2307
-    }
2308
-    if (h->cbp || IS_INTRA(mb_type)) {
2309
-        s->dsp.clear_blocks(h->mb);
2310
-        s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift));
2311
-    }
2312
-}
2313
-
2314
-static av_always_inline void hl_decode_mb_444_internal(H264Context *h,
2315
-                                                       int simple,
2316
-                                                       int pixel_shift)
2317
-{
2318
-    MpegEncContext *const s = &h->s;
2319
-    const int mb_x    = s->mb_x;
2320
-    const int mb_y    = s->mb_y;
2321
-    const int mb_xy   = h->mb_xy;
2322
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
2323
-    uint8_t *dest[3];
2324
-    int linesize;
2325
-    int i, j, p;
2326
-    int *block_offset = &h->block_offset[0];
2327
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2328
-    const int plane_count      = (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1;
2329
-
2330
-    for (p = 0; p < plane_count; p++) {
2331
-        dest[p] = s->current_picture.f.data[p] +
2332
-                  ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
2333
-        s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << pixel_shift),
2334
-                        s->linesize, 4);
2335
-    }
2336
-
2337
-    h->list_counts[mb_xy] = h->list_count;
2338
-
2339
-    if (!simple && MB_FIELD) {
2340
-        linesize     = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
2341
-        block_offset = &h->block_offset[48];
2342
-        if (mb_y & 1) // FIXME move out of this function?
2343
-            for (p = 0; p < 3; p++)
2344
-                dest[p] -= s->linesize * 15;
2345
-        if (FRAME_MBAFF) {
2346
-            int list;
2347
-            for (list = 0; list < h->list_count; list++) {
2348
-                if (!USES_LIST(mb_type, list))
2349
-                    continue;
2350
-                if (IS_16X16(mb_type)) {
2351
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
2352
-                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
2353
-                } else {
2354
-                    for (i = 0; i < 16; i += 4) {
2355
-                        int ref = h->ref_cache[list][scan8[i]];
2356
-                        if (ref >= 0)
2357
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
2358
-                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
2359
-                    }
2360
-                }
2361
-            }
2362
-        }
2363
-    } else {
2364
-        linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize;
2365
-    }
2366
-
2367
-    if (!simple && IS_INTRA_PCM(mb_type)) {
2368
-        if (pixel_shift) {
2369
-            const int bit_depth = h->sps.bit_depth_luma;
2370
-            GetBitContext gb;
2371
-            init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth);
2372
-
2373
-            for (p = 0; p < plane_count; p++)
2374
-                for (i = 0; i < 16; i++) {
2375
-                    uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize);
2376
-                    for (j = 0; j < 16; j++)
2377
-                        tmp[j] = get_bits(&gb, bit_depth);
2378
-                }
2379
-        } else {
2380
-            for (p = 0; p < plane_count; p++)
2381
-                for (i = 0; i < 16; i++)
2382
-                    memcpy(dest[p] + i * linesize,
2383
-                           (uint8_t *)h->mb + p * 256 + i * 16, 16);
2384
-        }
2385
-    } else {
2386
-        if (IS_INTRA(mb_type)) {
2387
-            if (h->deblocking_filter)
2388
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
2389
-                               linesize, 1, 1, simple, pixel_shift);
2390
-
2391
-            for (p = 0; p < plane_count; p++)
2392
-                hl_decode_mb_predict_luma(h, mb_type, 1, simple,
2393
-                                          transform_bypass, pixel_shift,
2394
-                                          block_offset, linesize, dest[p], p);
2395
-
2396
-            if (h->deblocking_filter)
2397
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
2398
-                               linesize, 0, 1, simple, pixel_shift);
2399
-        } else {
2400
-            hl_motion(h, dest[0], dest[1], dest[2],
2401
-                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2402
-                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2403
-                      h->h264dsp.weight_h264_pixels_tab,
2404
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
2405
-        }
2406
-
2407
-        for (p = 0; p < plane_count; p++)
2408
-            hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass,
2409
-                                   pixel_shift, block_offset, linesize,
2410
-                                   dest[p], p);
2411
-    }
2412
-    if (h->cbp || IS_INTRA(mb_type)) {
2413
-        s->dsp.clear_blocks(h->mb);
2414
-        s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift));
2415
-    }
2416
-}
2417
-
2418
-/**
2419
- * Process a macroblock; this case avoids checks for expensive uncommon cases.
2420
- */
2421
-#define hl_decode_mb_simple(sh, bits)                          \
2422
-static void hl_decode_mb_simple_ ## bits(H264Context *h)       \
2423
-{                                                              \
2424
-    hl_decode_mb_internal(h, 1, sh);                           \
2425
-}
2080
+#define BITS   8
2081
+#define SIMPLE 1
2082
+#include "h264_mb_template.c"
2426 2083
 
2427
-hl_decode_mb_simple(0, 8)
2428
-hl_decode_mb_simple(1, 16)
2084
+#undef  BITS
2085
+#define BITS   16
2086
+#include "h264_mb_template.c"
2429 2087
 
2430
-/**
2431
- * Process a macroblock; this handles edge cases, such as interlacing.
2432
- */
2433
-static av_noinline void hl_decode_mb_complex(H264Context *h)
2434
-{
2435
-    hl_decode_mb_internal(h, 0, h->pixel_shift);
2436
-}
2437
-
2438
-static av_noinline void hl_decode_mb_444_complex(H264Context *h)
2439
-{
2440
-    hl_decode_mb_444_internal(h, 0, h->pixel_shift);
2441
-}
2442
-
2443
-static av_noinline void hl_decode_mb_444_simple(H264Context *h)
2444
-{
2445
-    hl_decode_mb_444_internal(h, 1, 0);
2446
-}
2088
+#undef  SIMPLE
2089
+#define SIMPLE 0
2090
+#include "h264_mb_template.c"
2447 2091
 
2448 2092
 void ff_h264_hl_decode_mb(H264Context *h)
2449 2093
 {
... ...
@@ -2456,7 +1933,7 @@ void ff_h264_hl_decode_mb(H264Context *h)
2456 2456
         if (is_complex || h->pixel_shift)
2457 2457
             hl_decode_mb_444_complex(h);
2458 2458
         else
2459
-            hl_decode_mb_444_simple(h);
2459
+            hl_decode_mb_444_simple_8(h);
2460 2460
     } else if (is_complex) {
2461 2461
         hl_decode_mb_complex(h);
2462 2462
     } else if (h->pixel_shift) {
2463 2463
new file mode 100644
... ...
@@ -0,0 +1,380 @@
0
+/*
1
+ * H.26L/H.264/AVC/JVT/14496-10/... decoder
2
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3
+ *
4
+ * This file is part of Libav.
5
+ *
6
+ * Libav is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * Libav is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with Libav; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#undef FUNC /* this file is included several times; drop the previous instantiation's macros first */
22
+#undef PIXEL_SHIFT
23
+
24
+#if SIMPLE
25
+#   define FUNC(n) AV_JOIN(n ## _simple_, BITS) /* e.g. FUNC(foo) -> foo_simple_8 when BITS is 8 */
26
+#   define PIXEL_SHIFT (BITS >> 4) /* constant: 0 for BITS == 8, 1 for BITS == 16 */
27
+#else
28
+#   define FUNC(n) n ## _complex /* single non-SIMPLE version, independent of BITS */
29
+#   define PIXEL_SHIFT h->pixel_shift /* run-time value; requires a variable named h in scope */
30
+#endif
31
+
32
+#undef  CHROMA_IDC
33
+#define CHROMA_IDC 1 /* selects the _420 suffix in h264_mc_template.c */
34
+#include "h264_mc_template.c" /* motion-compensation helpers for CHROMA_IDC 1 */
35
+
36
+#undef  CHROMA_IDC
37
+#define CHROMA_IDC 2 /* selects the _422 suffix in h264_mc_template.c */
38
+#include "h264_mc_template.c" /* motion-compensation helpers for CHROMA_IDC 2 */
39
+
40
+static av_noinline void FUNC(hl_decode_mb)(H264Context *h) /* Reconstructs the current macroblock into the current picture using separate Cb/Cr handling (4:2:0 / 4:2:2 motion paths). */
41
+{
42
+    MpegEncContext *const s = &h->s;
43
+    const int mb_x    = s->mb_x;
44
+    const int mb_y    = s->mb_y;
45
+    const int mb_xy   = h->mb_xy;
46
+    const int mb_type = s->current_picture.f.mb_type[mb_xy];
47
+    uint8_t *dest_y, *dest_cb, *dest_cr;
48
+    int linesize, uvlinesize /*dct_offset*/;
49
+    int i, j;
50
+    int *block_offset = &h->block_offset[0]; /* default offset table; replaced by entry 48 onward for MB_FIELD below */
51
+    const int transform_bypass = !SIMPLE && (s->qscale == 0 && h->sps.transform_bypass); /* constant 0 in SIMPLE instantiations */
52
+    /* is_h264 should always be true if SVQ3 is disabled. */
53
+    const int is_h264 = !CONFIG_SVQ3_DECODER || SIMPLE || s->codec_id == CODEC_ID_H264;
54
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
55
+    const int block_h   = 16 >> s->chroma_y_shift; /* chroma rows handled per macroblock */
56
+    const int chroma422 = CHROMA422;
57
+
58
+    dest_y  = s->current_picture.f.data[0] + ((mb_x << PIXEL_SHIFT)     + mb_y * s->linesize)  * 16; /* luma destination: 16 columns (scaled by PIXEL_SHIFT) and 16 rows per macroblock */
59
+    dest_cb = s->current_picture.f.data[1] +  (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h; /* chroma destination: 8 columns and block_h rows per macroblock */
60
+    dest_cr = s->current_picture.f.data[2] +  (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h;
61
+
62
+    s->dsp.prefetch(dest_y  + (s->mb_x & 3) * 4 * s->linesize   + (64 << PIXEL_SHIFT), s->linesize,       4); /* presumably a cache hint for memory 64 samples ahead; exact semantics are defined by dsp.prefetch */
63
+    s->dsp.prefetch(dest_cb + (s->mb_x & 7)     * s->uvlinesize + (64 << PIXEL_SHIFT), dest_cr - dest_cb, 2); /* stride dest_cr - dest_cb with count 2 covers both chroma planes */
64
+
65
+    h->list_counts[mb_xy] = h->list_count; /* remember this macroblock's list count */
66
+
67
+    if (!SIMPLE && MB_FIELD) { /* field path: never compiled into SIMPLE instantiations */
68
+        linesize     = h->mb_linesize = s->linesize * 2; /* doubled strides */
69
+        uvlinesize   = h->mb_uvlinesize = s->uvlinesize * 2;
70
+        block_offset = &h->block_offset[48]; /* second set of block offsets */
71
+        if (mb_y & 1) { // FIXME move out of this function?
72
+            dest_y  -= s->linesize * 15; /* odd macroblock row: move the destinations up */
73
+            dest_cb -= s->uvlinesize * (block_h - 1);
74
+            dest_cr -= s->uvlinesize * (block_h - 1);
75
+        }
76
+        if (FRAME_MBAFF) { /* rewrite the cached reference indices of every used list */
77
+            int list;
78
+            for (list = 0; list < h->list_count; list++) {
79
+                if (!USES_LIST(mb_type, list))
80
+                    continue;
81
+                if (IS_16X16(mb_type)) {
82
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
83
+                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1); /* whole 4x4 cache area gets (16 + ref) ^ row parity */
84
+                } else {
85
+                    for (i = 0; i < 16; i += 4) { /* one step per group of four block indices */
86
+                        int ref = h->ref_cache[list][scan8[i]];
87
+                        if (ref >= 0) /* negative entries are left untouched */
88
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
89
+                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
90
+                    }
91
+                }
92
+            }
93
+        }
94
+    } else {
95
+        linesize   = h->mb_linesize   = s->linesize;
96
+        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
97
+        // dct_offset = s->linesize * 16;
98
+    }
99
+
100
+    if (!SIMPLE && IS_INTRA_PCM(mb_type)) { /* PCM: samples held in h->mb are written out directly, no prediction or IDCT */
101
+        if (PIXEL_SHIFT) { /* 16-bit sample storage: unpack bit_depth bits per sample */
102
+            const int bit_depth = h->sps.bit_depth_luma;
103
+            int j; /* shadows the function-level j */
104
+            GetBitContext gb;
105
+            init_get_bits(&gb, (uint8_t *)h->mb,
106
+                          ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth); /* reader size in bits: table entry times bit depth */
107
+
108
+            for (i = 0; i < 16; i++) { /* 16x16 luma samples */
109
+                uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize);
110
+                for (j = 0; j < 16; j++)
111
+                    tmp_y[j] = get_bits(&gb, bit_depth);
112
+            }
113
+            if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { /* SIMPLE is 0 on this path, so this is the gray-flag test */
114
+                if (!h->sps.chroma_format_idc) { /* chroma_format_idc 0: fill both chroma planes with the mid value */
115
+                    for (i = 0; i < block_h; i++) {
116
+                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
117
+                        for (j = 0; j < 8; j++)
118
+                            tmp_cb[j] = 1 << (bit_depth - 1);
119
+                    }
120
+                    for (i = 0; i < block_h; i++) {
121
+                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
122
+                        for (j = 0; j < 8; j++)
123
+                            tmp_cr[j] = 1 << (bit_depth - 1);
124
+                    }
125
+                } else { /* chroma samples follow luma in the same bit reader: all Cb rows, then all Cr rows */
126
+                    for (i = 0; i < block_h; i++) {
127
+                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
128
+                        for (j = 0; j < 8; j++)
129
+                            tmp_cb[j] = get_bits(&gb, bit_depth);
130
+                    }
131
+                    for (i = 0; i < block_h; i++) {
132
+                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
133
+                        for (j = 0; j < 8; j++)
134
+                            tmp_cr[j] = get_bits(&gb, bit_depth);
135
+                    }
136
+                }
137
+            }
138
+        } else { /* 8-bit storage: plain row copies */
139
+            for (i = 0; i < 16; i++)
140
+                memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
141
+            if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
142
+                if (!h->sps.chroma_format_idc) { /* chroma_format_idc 0: fill chroma with 128 */
143
+                    for (i = 0; i < block_h; i++) {
144
+                        memset(dest_cb + i * uvlinesize, 128, 8);
145
+                        memset(dest_cr + i * uvlinesize, 128, 8);
146
+                    }
147
+                } else {
148
+                    uint8_t *src_cb = (uint8_t *)h->mb + 256; /* Cb bytes start after the 256 luma bytes */
149
+                    uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8; /* Cr starts after block_h rows of 8 Cb bytes */
150
+                    for (i = 0; i < block_h; i++) {
151
+                        memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
152
+                        memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
153
+                    }
154
+                }
155
+            }
156
+        }
157
+    } else {
158
+        if (IS_INTRA(mb_type)) {
159
+            if (h->deblocking_filter) /* border exchange before prediction (7th argument 1); undone after prediction below */
160
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
161
+                               uvlinesize, 1, 0, SIMPLE, PIXEL_SHIFT);
162
+
163
+            if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { /* chroma is skipped only when CONFIG_GRAY is built in and CODEC_FLAG_GRAY is set */
164
+                h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize);
165
+                h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize);
166
+            }
167
+
168
+            hl_decode_mb_predict_luma(h, mb_type, is_h264, SIMPLE,
169
+                                      transform_bypass, PIXEL_SHIFT,
170
+                                      block_offset, linesize, dest_y, 0); /* last argument 0: plane index (the 4:4:4 variant passes p here) */
171
+
172
+            if (h->deblocking_filter) /* same call with 7th argument 0 */
173
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
174
+                               uvlinesize, 0, 0, SIMPLE, PIXEL_SHIFT);
175
+        } else if (is_h264) { /* non-intra: motion compensation through the template-generated helper */
176
+            if (chroma422) {
177
+                FUNC(hl_motion_422)(h, dest_y, dest_cb, dest_cr,
178
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
179
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
180
+                              h->h264dsp.weight_h264_pixels_tab,
181
+                              h->h264dsp.biweight_h264_pixels_tab);
182
+            } else {
183
+                FUNC(hl_motion_420)(h, dest_y, dest_cb, dest_cr,
184
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
185
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
186
+                              h->h264dsp.weight_h264_pixels_tab,
187
+                              h->h264dsp.biweight_h264_pixels_tab);
188
+            }
189
+        }
190
+
191
+        hl_decode_mb_idct_luma(h, mb_type, is_h264, SIMPLE, transform_bypass,
192
+                               PIXEL_SHIFT, block_offset, linesize, dest_y, 0); /* luma residual, for intra and inter alike */
193
+
194
+        if ((SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) &&
195
+            (h->cbp & 0x30)) { /* chroma residual only when cbp has bits in 0x30 */
196
+            uint8_t *dest[2] = { dest_cb, dest_cr };
197
+            if (transform_bypass) { /* bypass: coefficients are added without dequant/IDCT */
198
+                if (IS_INTRA(mb_type) && h->sps.profile_idc == 244 &&
199
+                    (h->chroma_pred_mode == VERT_PRED8x8 ||
200
+                     h->chroma_pred_mode == HOR_PRED8x8)) { /* profile 244 with vertical/horizontal chroma prediction uses pred8x8_add */
201
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0],
202
+                                                            block_offset + 16,
203
+                                                            h->mb + (16 * 16 * 1 << PIXEL_SHIFT),
204
+                                                            uvlinesize);
205
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1],
206
+                                                            block_offset + 32,
207
+                                                            h->mb + (16 * 16 * 2 << PIXEL_SHIFT),
208
+                                                            uvlinesize);
209
+                } else {
210
+                    idct_add = s->dsp.add_pixels4;
211
+                    for (j = 1; j < 3; j++) { /* j = 1: Cb (blocks 16..), j = 2: Cr (blocks 32..) */
212
+                        for (i = j * 16; i < j * 16 + 4; i++)
213
+                            if (h->non_zero_count_cache[scan8[i]] ||
214
+                                dctcoef_get(h->mb, PIXEL_SHIFT, i * 16)) /* skip blocks with no cached count and a zero first coefficient */
215
+                                idct_add(dest[j - 1] + block_offset[i],
216
+                                         h->mb + (i * 16 << PIXEL_SHIFT),
217
+                                         uvlinesize);
218
+                        if (chroma422) { /* four more blocks per plane; cache and offset tables are indexed at i + 4 */
219
+                            for (i = j * 16 + 4; i < j * 16 + 8; i++)
220
+                                if (h->non_zero_count_cache[scan8[i + 4]] ||
221
+                                    dctcoef_get(h->mb, PIXEL_SHIFT, i * 16))
222
+                                    idct_add(dest[j - 1] + block_offset[i + 4],
223
+                                             h->mb + (i * 16 << PIXEL_SHIFT),
224
+                                             uvlinesize);
225
+                        }
226
+                    }
227
+                }
228
+            } else {
229
+                if (is_h264) {
230
+                    int qp[2];
231
+                    if (chroma422) { /* chroma422 uses chroma_qp + 3 for the DC dequant */
232
+                        qp[0] = h->chroma_qp[0] + 3;
233
+                        qp[1] = h->chroma_qp[1] + 3;
234
+                    } else {
235
+                        qp[0] = h->chroma_qp[0];
236
+                        qp[1] = h->chroma_qp[1];
237
+                    }
238
+                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]]) /* Cb DC; dequant table 1 for intra, 4 otherwise */
239
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << PIXEL_SHIFT),
240
+                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]);
241
+                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]]) /* Cr DC; dequant table 2 for intra, 5 otherwise */
242
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << PIXEL_SHIFT),
243
+                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]);
244
+                    h->h264dsp.h264_idct_add8(dest, block_offset,
245
+                                              h->mb, uvlinesize,
246
+                                              h->non_zero_count_cache); /* receives both chroma destinations via dest[] */
247
+                } else if (CONFIG_SVQ3_DECODER) { /* non-H.264 codec id, only reachable when the SVQ3 decoder is built in */
248
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1,
249
+                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]);
250
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2,
251
+                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][h->chroma_qp[1]][0]);
252
+                    for (j = 1; j < 3; j++) {
253
+                        for (i = j * 16; i < j * 16 + 4; i++)
254
+                            if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) {
255
+                                uint8_t *const ptr = dest[j - 1] + block_offset[i];
256
+                                ff_svq3_add_idct_c(ptr, h->mb + i * 16,
257
+                                                   uvlinesize,
258
+                                                   ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
259
+                            }
260
+                    }
261
+                }
262
+            }
263
+        }
264
+    }
265
+    if (h->cbp || IS_INTRA(mb_type)) { /* clear the coefficient buffer in two clear_blocks calls */
266
+        s->dsp.clear_blocks(h->mb);
267
+        s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
268
+    }
269
+}
270
+
271
+#if !SIMPLE || BITS == 8 /* the 4:4:4 variant exists only as ..._444_simple_8 and ..._444_complex */
272
+
273
+#undef  CHROMA_IDC
274
+#define CHROMA_IDC 3 /* selects the _444 suffix in h264_mc_template.c */
275
+#include "h264_mc_template.c"
276
+
277
+static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h) /* Reconstructs the current macroblock when every plane is handled like luma (same stride and 16x16 size for all planes). */
278
+{
279
+    MpegEncContext *const s = &h->s;
280
+    const int mb_x    = s->mb_x;
281
+    const int mb_y    = s->mb_y;
282
+    const int mb_xy   = h->mb_xy;
283
+    const int mb_type = s->current_picture.f.mb_type[mb_xy];
284
+    uint8_t *dest[3] = { 0 }; /* zero-filled so planes beyond plane_count never hold indeterminate pointers */
285
+    int linesize;
286
+    int i, j, p;
287
+    int *block_offset = &h->block_offset[0]; /* default offset table; replaced by entry 48 onward for MB_FIELD below */
288
+    const int transform_bypass = !SIMPLE && (s->qscale == 0 && h->sps.transform_bypass); /* constant 0 in SIMPLE instantiations */
289
+    const int plane_count      = (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1; /* 1 only when CONFIG_GRAY is built in and CODEC_FLAG_GRAY is set */
290
+
291
+    for (p = 0; p < plane_count; p++) { /* only the planes that will be decoded get a destination */
292
+        dest[p] = s->current_picture.f.data[p] +
293
+                  ((mb_x << PIXEL_SHIFT) + mb_y * s->linesize) * 16;
294
+        s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT),
295
+                        s->linesize, 4); /* presumably a cache hint; exact semantics are defined by dsp.prefetch */
296
+    }
297
+
298
+    h->list_counts[mb_xy] = h->list_count; /* remember this macroblock's list count */
299
+
300
+    if (!SIMPLE && MB_FIELD) { /* field path: never compiled into SIMPLE instantiations */
301
+        linesize     = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2; /* one doubled stride shared by all planes */
302
+        block_offset = &h->block_offset[48]; /* second set of block offsets */
303
+        if (mb_y & 1) // FIXME move out of this function?
304
+            for (p = 0; p < plane_count; p++) /* bounded by plane_count: dest[] entries past it were never assigned a plane address */
305
+                dest[p] -= s->linesize * 15;
306
+        if (FRAME_MBAFF) { /* rewrite the cached reference indices of every used list */
307
+            int list;
308
+            for (list = 0; list < h->list_count; list++) {
309
+                if (!USES_LIST(mb_type, list))
310
+                    continue;
311
+                if (IS_16X16(mb_type)) {
312
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
313
+                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1); /* whole 4x4 cache area gets (16 + ref) ^ row parity */
314
+                } else {
315
+                    for (i = 0; i < 16; i += 4) {
316
+                        int ref = h->ref_cache[list][scan8[i]];
317
+                        if (ref >= 0) /* negative entries are left untouched */
318
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
319
+                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
320
+                    }
321
+                }
322
+            }
323
+        }
324
+    } else {
325
+        linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize;
326
+    }
327
+
328
+    if (!SIMPLE && IS_INTRA_PCM(mb_type)) { /* PCM: samples held in h->mb are written out directly */
329
+        if (PIXEL_SHIFT) { /* 16-bit sample storage: unpack bit_depth bits per sample */
330
+            const int bit_depth = h->sps.bit_depth_luma;
331
+            GetBitContext gb;
332
+            init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth); /* 768 = 3 * 16 * 16 samples, presumably all three planes */
333
+
334
+            for (p = 0; p < plane_count; p++)
335
+                for (i = 0; i < 16; i++) {
336
+                    uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize);
337
+                    for (j = 0; j < 16; j++)
338
+                        tmp[j] = get_bits(&gb, bit_depth);
339
+                }
340
+        } else { /* 8-bit storage: plane p starts at byte p * 256 of h->mb */
341
+            for (p = 0; p < plane_count; p++)
342
+                for (i = 0; i < 16; i++)
343
+                    memcpy(dest[p] + i * linesize,
344
+                           (uint8_t *)h->mb + p * 256 + i * 16, 16);
345
+        }
346
+    } else {
347
+        if (IS_INTRA(mb_type)) {
348
+            if (h->deblocking_filter) /* border exchange before prediction (7th argument 1); 8th argument 1 distinguishes this from the non-4:4:4 caller */
349
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
350
+                               linesize, 1, 1, SIMPLE, PIXEL_SHIFT);
351
+
352
+            for (p = 0; p < plane_count; p++) /* every decoded plane goes through the luma predictor with plane index p */
353
+                hl_decode_mb_predict_luma(h, mb_type, 1, SIMPLE,
354
+                                          transform_bypass, PIXEL_SHIFT,
355
+                                          block_offset, linesize, dest[p], p);
356
+
357
+            if (h->deblocking_filter) /* same call with 7th argument 0 */
358
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
359
+                               linesize, 0, 1, SIMPLE, PIXEL_SHIFT);
360
+        } else { /* non-intra: motion compensation through the template-generated helper */
361
+            FUNC(hl_motion_444)(h, dest[0], dest[1], dest[2],
362
+                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
363
+                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
364
+                      h->h264dsp.weight_h264_pixels_tab,
365
+                      h->h264dsp.biweight_h264_pixels_tab);
366
+        }
367
+
368
+        for (p = 0; p < plane_count; p++) /* residual for every decoded plane, using the luma IDCT helper */
369
+            hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
370
+                                   PIXEL_SHIFT, block_offset, linesize,
371
+                                   dest[p], p);
372
+    }
373
+    if (h->cbp || IS_INTRA(mb_type)) { /* clear the coefficient buffer in two clear_blocks calls */
374
+        s->dsp.clear_blocks(h->mb);
375
+        s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
376
+    }
377
+}
378
+
379
+#endif
0 380
new file mode 100644
... ...
@@ -0,0 +1,160 @@
0
+/*
1
+ * H.26L/H.264/AVC/JVT/14496-10/... decoder
2
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
3
+ *
4
+ * This file is part of Libav.
5
+ *
6
+ * Libav is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * Libav is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with Libav; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#undef MCFUNC /* this file is included once per CHROMA_IDC value; drop the previous definition first */
22
+
23
+#if   CHROMA_IDC == 1
24
+#   define MCFUNC(n) FUNC(n ## _420) /* name gets _420 plus whatever suffix the includer's FUNC() appends */
25
+#elif CHROMA_IDC == 2
26
+#   define MCFUNC(n) FUNC(n ## _422)
27
+#elif CHROMA_IDC == 3
28
+#   define MCFUNC(n) FUNC(n ## _444)
29
+#endif
30
+
31
+#undef  mc_part
32
+#define mc_part MCFUNC(mc_part) /* plain mc_part(...) below resolves to this instantiation's uniquely named function */
33
+
34
+static void mc_part(H264Context *h, int n, int square, /* Motion-compensates one partition, forwarding to the weighted or the standard helper. */
35
+                    int height, int delta,
36
+                    uint8_t *dest_y, uint8_t *dest_cb,
37
+                    uint8_t *dest_cr,
38
+                    int x_offset, int y_offset,
39
+                    qpel_mc_func *qpix_put,
40
+                    h264_chroma_mc_func chroma_put,
41
+                    qpel_mc_func *qpix_avg,
42
+                    h264_chroma_mc_func chroma_avg,
43
+                    h264_weight_func *weight_op,
44
+                    h264_biweight_func *weight_avg,
45
+                    int list0, int list1) /* list0/list1: non-zero when the partition uses that reference list (callers pass IS_DIR results) */
46
+{
47
+    if ((h->use_weight == 2 && list0 && list1 && /* use_weight 2: weighted only when both lists are used ... */
48
+         (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) || /* ... and the implicit weight for this reference pair is not 32 */
49
+        h->use_weight == 1) /* use_weight 1: always weighted */
50
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
51
+                         x_offset, y_offset, qpix_put, chroma_put,
52
+                         weight_op[0], weight_op[1], weight_avg[0],
53
+                         weight_avg[1], list0, list1, PIXEL_SHIFT, CHROMA_IDC); /* qpix_avg/chroma_avg are not forwarded on this path */
54
+    else
55
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
56
+                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
57
+                    chroma_avg, list0, list1, PIXEL_SHIFT, CHROMA_IDC); /* PIXEL_SHIFT and CHROMA_IDC come from the including template */
58
+}
59
+
60
+static void MCFUNC(hl_motion)(H264Context *h, uint8_t *dest_y, /* Motion-compensates the current inter macroblock by issuing one mc_part call per partition. */
61
+                              uint8_t *dest_cb, uint8_t *dest_cr,
62
+                              qpel_mc_func(*qpix_put)[16],
63
+                              h264_chroma_mc_func(*chroma_put),
64
+                              qpel_mc_func(*qpix_avg)[16],
65
+                              h264_chroma_mc_func(*chroma_avg),
66
+                              h264_weight_func *weight_op,
67
+                              h264_biweight_func *weight_avg)
68
+{
69
+    MpegEncContext *const s = &h->s;
70
+    const int mb_xy   = h->mb_xy;
71
+    const int mb_type = s->current_picture.f.mb_type[mb_xy];
72
+
73
+    assert(IS_INTER(mb_type));
74
+
75
+    if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME)) /* only with frame threading active */
76
+        await_references(h); /* presumably waits until the referenced pictures are available -- confirm in h264.c */
77
+    prefetch_motion(h, 0, PIXEL_SHIFT, CHROMA_IDC); /* called for list argument 0 here and 1 at the end */
78
+
79
+    if (IS_16X16(mb_type)) { /* one square partition of height 16, function tables at index 0 */
80
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
81
+                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
82
+                weight_op, weight_avg,
83
+                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
84
+    } else if (IS_16X8(mb_type)) { /* two partitions of height 8: n 0 at y_offset 0 and n 8 at y_offset 4 */
85
+        mc_part(h, 0, 0, 8, 8 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, 0, 0,
86
+                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
87
+                weight_op, weight_avg,
88
+                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
89
+        mc_part(h, 8, 0, 8, 8 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, 0, 4,
90
+                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
91
+                weight_op, weight_avg,
92
+                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1)); /* direction flags of the second partition */
93
+    } else if (IS_8X16(mb_type)) { /* two partitions of height 16: n 0 at x_offset 0 and n 4 at x_offset 4 */
94
+        mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
95
+                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
96
+                &weight_op[1], &weight_avg[1],
97
+                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
98
+        mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
99
+                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
100
+                &weight_op[1], &weight_avg[1],
101
+                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
102
+    } else { /* 8x8 mode: each of the four quadrants has its own sub-type */
103
+        int i;
104
+
105
+        assert(IS_8X8(mb_type));
106
+
107
+        for (i = 0; i < 4; i++) {
108
+            const int sub_mb_type = h->sub_mb_type[i];
109
+            const int n  = 4 * i; /* first block index of quadrant i */
110
+            int x_offset = (i & 1) << 2; /* 0 or 4 */
111
+            int y_offset = (i & 2) << 1; /* 0 or 4 */
112
+
113
+            if (IS_SUB_8X8(sub_mb_type)) { /* one square partition of height 8 */
114
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr,
115
+                        x_offset, y_offset,
116
+                        qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
117
+                        &weight_op[1], &weight_avg[1],
118
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
119
+            } else if (IS_SUB_8X4(sub_mb_type)) { /* two partitions of height 4: n and n + 2, the second at y_offset + 2 */
120
+                mc_part(h, n, 0, 4, 4 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr,
121
+                        x_offset, y_offset,
122
+                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
123
+                        &weight_op[1], &weight_avg[1],
124
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
125
+                mc_part(h, n + 2, 0, 4, 4 << PIXEL_SHIFT,
126
+                        dest_y, dest_cb, dest_cr, x_offset, y_offset + 2,
127
+                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
128
+                        &weight_op[1], &weight_avg[1],
129
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
130
+            } else if (IS_SUB_4X8(sub_mb_type)) { /* two partitions of height 8: n and n + 1, the second at x_offset + 2 */
131
+                mc_part(h, n, 0, 8, 4 * h->mb_linesize,
132
+                        dest_y, dest_cb, dest_cr, x_offset, y_offset,
133
+                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
134
+                        &weight_op[2], &weight_avg[2],
135
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
136
+                mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize,
137
+                        dest_y, dest_cb, dest_cr, x_offset + 2, y_offset,
138
+                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
139
+                        &weight_op[2], &weight_avg[2],
140
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
141
+            } else { /* remaining case: four square partitions of height 4 */
142
+                int j;
143
+                assert(IS_SUB_4X4(sub_mb_type));
144
+                for (j = 0; j < 4; j++) {
145
+                    int sub_x_offset = x_offset + 2 * (j & 1); /* +0 or +2 */
146
+                    int sub_y_offset = y_offset + (j & 2); /* +0 or +2 */
147
+                    mc_part(h, n + j, 1, 4, 0,
148
+                            dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
149
+                            qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
150
+                            &weight_op[2], &weight_avg[2],
151
+                            IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
152
+                }
153
+            }
154
+        }
155
+    }
156
+
157
+    prefetch_motion(h, 1, PIXEL_SHIFT, CHROMA_IDC);
158
+}
159
+