Browse code

H.264: faster write_back_*

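Avoid aliasing by caching the table and cache base pointers in locals, unroll the per-row and per-list loops, and force-inline more functions (av_always_inline).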

Jason Garrett-Glaser authored on 2011/06/30 05:27:36
Showing 5 changed files
... ...
@@ -60,15 +60,6 @@ static const enum PixelFormat hwaccel_pixfmt_list_h264_jpeg_420[] = {
60 60
     PIX_FMT_NONE
61 61
 };
62 62
 
63
-void ff_h264_write_back_intra_pred_mode(H264Context *h){
64
-    int8_t *mode= h->intra4x4_pred_mode + h->mb2br_xy[h->mb_xy];
65
-
66
-    AV_COPY32(mode, h->intra4x4_pred_mode_cache + 4 + 8*4);
67
-    mode[4]= h->intra4x4_pred_mode_cache[7+8*3];
68
-    mode[5]= h->intra4x4_pred_mode_cache[7+8*2];
69
-    mode[6]= h->intra4x4_pred_mode_cache[7+8*1];
70
-}
71
-
72 63
 /**
73 64
  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
74 65
  */
... ...
@@ -658,7 +658,6 @@ int ff_h264_check_intra4x4_pred_mode(H264Context *h);
658 658
  */
659 659
 int ff_h264_check_intra_pred_mode(H264Context *h, int mode);
660 660
 
661
-void ff_h264_write_back_intra_pred_mode(H264Context *h);
662 661
 void ff_h264_hl_decode_mb(H264Context *h);
663 662
 int ff_h264_frame_start(H264Context *h);
664 663
 int ff_h264_decode_extradata(H264Context *h);
... ...
@@ -1185,7 +1184,7 @@ static void fill_decode_caches(H264Context *h, int mb_type){
1185 1185
 /**
1186 1186
  * gets the predicted intra4x4 prediction mode.
1187 1187
  */
1188
-static inline int pred_intra_mode(H264Context *h, int n){
1188
+static av_always_inline int pred_intra_mode(H264Context *h, int n){
1189 1189
     const int index8= scan8[n];
1190 1190
     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1191 1191
     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
... ...
@@ -1197,69 +1196,83 @@ static inline int pred_intra_mode(H264Context *h, int n){
1197 1197
     else      return min;
1198 1198
 }
1199 1199
 
1200
-static inline void write_back_non_zero_count(H264Context *h){
1201
-    const int mb_xy= h->mb_xy;
1200
+static av_always_inline void write_back_intra_pred_mode(H264Context *h){
1201
+    int8_t *i4x4= h->intra4x4_pred_mode + h->mb2br_xy[h->mb_xy];
1202
+    int8_t *i4x4_cache= h->intra4x4_pred_mode_cache;
1203
+
1204
+    AV_COPY32(i4x4, i4x4_cache + 4 + 8*4);
1205
+    i4x4[4]= i4x4_cache[7+8*3];
1206
+    i4x4[5]= i4x4_cache[7+8*2];
1207
+    i4x4[6]= i4x4_cache[7+8*1];
1208
+}
1202 1209
 
1203
-    AV_COPY32(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[4+8* 1]);
1204
-    AV_COPY32(&h->non_zero_count[mb_xy][ 4], &h->non_zero_count_cache[4+8* 2]);
1205
-    AV_COPY32(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[4+8* 3]);
1206
-    AV_COPY32(&h->non_zero_count[mb_xy][12], &h->non_zero_count_cache[4+8* 4]);
1207
-    AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[4+8* 6]);
1208
-    AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8* 7]);
1209
-    AV_COPY32(&h->non_zero_count[mb_xy][32], &h->non_zero_count_cache[4+8*11]);
1210
-    AV_COPY32(&h->non_zero_count[mb_xy][36], &h->non_zero_count_cache[4+8*12]);
1210
+static av_always_inline void write_back_non_zero_count(H264Context *h){
1211
+    const int mb_xy= h->mb_xy;
1212
+    uint8_t *nnz = h->non_zero_count[mb_xy];
1213
+    uint8_t *nnz_cache = h->non_zero_count_cache;
1214
+
1215
+    AV_COPY32(&nnz[ 0], &nnz_cache[4+8* 1]);
1216
+    AV_COPY32(&nnz[ 4], &nnz_cache[4+8* 2]);
1217
+    AV_COPY32(&nnz[ 8], &nnz_cache[4+8* 3]);
1218
+    AV_COPY32(&nnz[12], &nnz_cache[4+8* 4]);
1219
+    AV_COPY32(&nnz[16], &nnz_cache[4+8* 6]);
1220
+    AV_COPY32(&nnz[20], &nnz_cache[4+8* 7]);
1221
+    AV_COPY32(&nnz[32], &nnz_cache[4+8*11]);
1222
+    AV_COPY32(&nnz[36], &nnz_cache[4+8*12]);
1211 1223
 
1212 1224
     if(CHROMA444){
1213
-        AV_COPY32(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[4+8* 8]);
1214
-        AV_COPY32(&h->non_zero_count[mb_xy][28], &h->non_zero_count_cache[4+8* 9]);
1215
-        AV_COPY32(&h->non_zero_count[mb_xy][40], &h->non_zero_count_cache[4+8*13]);
1216
-        AV_COPY32(&h->non_zero_count[mb_xy][44], &h->non_zero_count_cache[4+8*14]);
1225
+        AV_COPY32(&nnz[24], &nnz_cache[4+8* 8]);
1226
+        AV_COPY32(&nnz[28], &nnz_cache[4+8* 9]);
1227
+        AV_COPY32(&nnz[40], &nnz_cache[4+8*13]);
1228
+        AV_COPY32(&nnz[44], &nnz_cache[4+8*14]);
1229
+    }
1230
+}
1231
+
1232
+static av_always_inline void write_back_motion_list(H264Context *h, MpegEncContext * const s, int b_stride,
1233
+                                                    int b_xy, int b8_xy, int mb_type, int list )
1234
+{
1235
+    int16_t (*mv_dst)[2] = &s->current_picture.motion_val[list][b_xy];
1236
+    int16_t (*mv_src)[2] = &h->mv_cache[list][scan8[0]];
1237
+    AV_COPY128(mv_dst + 0*b_stride, mv_src + 8*0);
1238
+    AV_COPY128(mv_dst + 1*b_stride, mv_src + 8*1);
1239
+    AV_COPY128(mv_dst + 2*b_stride, mv_src + 8*2);
1240
+    AV_COPY128(mv_dst + 3*b_stride, mv_src + 8*3);
1241
+    if( CABAC ) {
1242
+        uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 8*h->mb_xy : h->mb2br_xy[h->mb_xy]];
1243
+        uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
1244
+        if(IS_SKIP(mb_type))
1245
+            AV_ZERO128(mvd_dst);
1246
+        else{
1247
+            AV_COPY64(mvd_dst, mvd_src + 8*3);
1248
+            AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
1249
+            AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
1250
+            AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
1251
+        }
1252
+    }
1253
+
1254
+    {
1255
+        int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1256
+        int8_t *ref_cache = h->ref_cache[list];
1257
+        ref_index[0+0*2]= ref_cache[scan8[0]];
1258
+        ref_index[1+0*2]= ref_cache[scan8[4]];
1259
+        ref_index[0+1*2]= ref_cache[scan8[8]];
1260
+        ref_index[1+1*2]= ref_cache[scan8[12]];
1217 1261
     }
1218 1262
 }
1219 1263
 
1220
-static inline void write_back_motion(H264Context *h, int mb_type){
1264
+static av_always_inline void write_back_motion(H264Context *h, int mb_type){
1221 1265
     MpegEncContext * const s = &h->s;
1266
+    const int b_stride = h->b_stride;
1222 1267
     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; //try mb2b(8)_xy
1223 1268
     const int b8_xy= 4*h->mb_xy;
1224
-    int list;
1225 1269
 
1226
-    if(!USES_LIST(mb_type, 0))
1270
+    if(USES_LIST(mb_type, 0)){
1271
+        write_back_motion_list(h, s, b_stride, b_xy, b8_xy, mb_type, 0);
1272
+    }else{
1227 1273
         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1);
1228
-
1229
-    for(list=0; list<h->list_count; list++){
1230
-        int y, b_stride;
1231
-        int16_t (*mv_dst)[2];
1232
-        int16_t (*mv_src)[2];
1233
-
1234
-        if(!USES_LIST(mb_type, list))
1235
-            continue;
1236
-
1237
-        b_stride = h->b_stride;
1238
-        mv_dst   = &s->current_picture.motion_val[list][b_xy];
1239
-        mv_src   = &h->mv_cache[list][scan8[0]];
1240
-        for(y=0; y<4; y++){
1241
-            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
1242
-        }
1243
-        if( CABAC ) {
1244
-            uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 8*h->mb_xy : h->mb2br_xy[h->mb_xy]];
1245
-            uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
1246
-            if(IS_SKIP(mb_type))
1247
-                AV_ZERO128(mvd_dst);
1248
-            else{
1249
-            AV_COPY64(mvd_dst, mvd_src + 8*3);
1250
-                AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0);
1251
-                AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1);
1252
-                AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2);
1253
-            }
1254
-        }
1255
-
1256
-        {
1257
-            int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1258
-            ref_index[0+0*2]= h->ref_cache[list][scan8[0]];
1259
-            ref_index[1+0*2]= h->ref_cache[list][scan8[4]];
1260
-            ref_index[0+1*2]= h->ref_cache[list][scan8[8]];
1261
-            ref_index[1+1*2]= h->ref_cache[list][scan8[12]];
1262
-        }
1274
+    }
1275
+    if(USES_LIST(mb_type, 1)){
1276
+        write_back_motion_list(h, s, b_stride, b_xy, b8_xy, mb_type, 1);
1263 1277
     }
1264 1278
 
1265 1279
     if(h->slice_type_nos == AV_PICTURE_TYPE_B && CABAC){
... ...
@@ -1272,7 +1285,7 @@ static inline void write_back_motion(H264Context *h, int mb_type){
1272 1272
     }
1273 1273
 }
1274 1274
 
1275
-static inline int get_dct8x8_allowed(H264Context *h){
1275
+static av_always_inline int get_dct8x8_allowed(H264Context *h){
1276 1276
     if(h->sps.direct_8x8_inference_flag)
1277 1277
         return !(AV_RN64A(h->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
1278 1278
     else
... ...
@@ -1999,7 +1999,7 @@ decode_intra_mb:
1999 1999
                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
2000 2000
                 }
2001 2001
             }
2002
-            ff_h264_write_back_intra_pred_mode(h);
2002
+            write_back_intra_pred_mode(h);
2003 2003
             if( ff_h264_check_intra4x4_pred_mode(h) < 0 ) return -1;
2004 2004
         } else {
2005 2005
             h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode( h, h->intra16x16_pred_mode );
... ...
@@ -2248,21 +2248,22 @@ decode_intra_mb:
2248 2248
      * the transform mode of the current macroblock there. */
2249 2249
     if (CHROMA444 && IS_8x8DCT(mb_type)){
2250 2250
         int i;
2251
+        uint8_t *nnz_cache = h->non_zero_count_cache;
2251 2252
         for (i = 0; i < 2; i++){
2252 2253
             if (h->left_type[i] && !IS_8x8DCT(h->left_type[i])){
2253
-                h->non_zero_count_cache[3+8* 1 + 2*8*i]=
2254
-                h->non_zero_count_cache[3+8* 2 + 2*8*i]=
2255
-                h->non_zero_count_cache[3+8* 6 + 2*8*i]=
2256
-                h->non_zero_count_cache[3+8* 7 + 2*8*i]=
2257
-                h->non_zero_count_cache[3+8*11 + 2*8*i]=
2258
-                h->non_zero_count_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0;
2254
+                nnz_cache[3+8* 1 + 2*8*i]=
2255
+                nnz_cache[3+8* 2 + 2*8*i]=
2256
+                nnz_cache[3+8* 6 + 2*8*i]=
2257
+                nnz_cache[3+8* 7 + 2*8*i]=
2258
+                nnz_cache[3+8*11 + 2*8*i]=
2259
+                nnz_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0;
2259 2260
             }
2260 2261
         }
2261 2262
         if (h->top_type && !IS_8x8DCT(h->top_type)){
2262 2263
             uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040;
2263
-            AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty);
2264
-            AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty);
2265
-            AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty);
2264
+            AV_WN32A(&nnz_cache[4+8* 0], top_empty);
2265
+            AV_WN32A(&nnz_cache[4+8* 5], top_empty);
2266
+            AV_WN32A(&nnz_cache[4+8*10], top_empty);
2266 2267
         }
2267 2268
     }
2268 2269
     s->current_picture.mb_type[mb_xy]= mb_type;
... ...
@@ -731,7 +731,7 @@ decode_intra_mb:
731 731
                 else
732 732
                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
733 733
             }
734
-            ff_h264_write_back_intra_pred_mode(h);
734
+            write_back_intra_pred_mode(h);
735 735
             if( ff_h264_check_intra4x4_pred_mode(h) < 0)
736 736
                 return -1;
737 737
         }else{
... ...
@@ -589,7 +589,7 @@ static int svq3_decode_mb(SVQ3Context *svq3, unsigned int mb_type)
589 589
             }
590 590
         }
591 591
 
592
-        ff_h264_write_back_intra_pred_mode(h);
592
+        write_back_intra_pred_mode(h);
593 593
 
594 594
         if (mb_type == 8) {
595 595
             ff_h264_check_intra4x4_pred_mode(h);