Avoid aliasing, unroll loops, and inline more functions.
| ... | ... |
@@ -60,15 +60,6 @@ static const enum PixelFormat hwaccel_pixfmt_list_h264_jpeg_420[] = {
|
| 60 | 60 |
PIX_FMT_NONE |
| 61 | 61 |
}; |
| 62 | 62 |
|
| 63 |
-void ff_h264_write_back_intra_pred_mode(H264Context *h){
|
|
| 64 |
- int8_t *mode= h->intra4x4_pred_mode + h->mb2br_xy[h->mb_xy]; |
|
| 65 |
- |
|
| 66 |
- AV_COPY32(mode, h->intra4x4_pred_mode_cache + 4 + 8*4); |
|
| 67 |
- mode[4]= h->intra4x4_pred_mode_cache[7+8*3]; |
|
| 68 |
- mode[5]= h->intra4x4_pred_mode_cache[7+8*2]; |
|
| 69 |
- mode[6]= h->intra4x4_pred_mode_cache[7+8*1]; |
|
| 70 |
-} |
|
| 71 |
- |
|
| 72 | 63 |
/** |
| 73 | 64 |
* Checks whether the top and left blocks are available where needed, and changes the DC mode so that it uses only the available blocks. |
| 74 | 65 |
*/ |
| ... | ... |
@@ -658,7 +658,6 @@ int ff_h264_check_intra4x4_pred_mode(H264Context *h); |
| 658 | 658 |
*/ |
| 659 | 659 |
int ff_h264_check_intra_pred_mode(H264Context *h, int mode); |
| 660 | 660 |
|
| 661 |
-void ff_h264_write_back_intra_pred_mode(H264Context *h); |
|
| 662 | 661 |
void ff_h264_hl_decode_mb(H264Context *h); |
| 663 | 662 |
int ff_h264_frame_start(H264Context *h); |
| 664 | 663 |
int ff_h264_decode_extradata(H264Context *h); |
| ... | ... |
@@ -1185,7 +1184,7 @@ static void fill_decode_caches(H264Context *h, int mb_type){
|
| 1185 | 1185 |
/** |
| 1186 | 1186 |
* Gets the predicted intra4x4 prediction mode. |
| 1187 | 1187 |
*/ |
| 1188 |
-static inline int pred_intra_mode(H264Context *h, int n){
|
|
| 1188 |
+static av_always_inline int pred_intra_mode(H264Context *h, int n){
|
|
| 1189 | 1189 |
const int index8= scan8[n]; |
| 1190 | 1190 |
const int left= h->intra4x4_pred_mode_cache[index8 - 1]; |
| 1191 | 1191 |
const int top = h->intra4x4_pred_mode_cache[index8 - 8]; |
| ... | ... |
@@ -1197,69 +1196,83 @@ static inline int pred_intra_mode(H264Context *h, int n){
|
| 1197 | 1197 |
else return min; |
| 1198 | 1198 |
} |
| 1199 | 1199 |
|
| 1200 |
-static inline void write_back_non_zero_count(H264Context *h){
|
|
| 1201 |
- const int mb_xy= h->mb_xy; |
|
| 1200 |
+static av_always_inline void write_back_intra_pred_mode(H264Context *h){
|
|
| 1201 |
+ int8_t *i4x4= h->intra4x4_pred_mode + h->mb2br_xy[h->mb_xy]; |
|
| 1202 |
+ int8_t *i4x4_cache= h->intra4x4_pred_mode_cache; |
|
| 1203 |
+ |
|
| 1204 |
+ AV_COPY32(i4x4, i4x4_cache + 4 + 8*4); |
|
| 1205 |
+ i4x4[4]= i4x4_cache[7+8*3]; |
|
| 1206 |
+ i4x4[5]= i4x4_cache[7+8*2]; |
|
| 1207 |
+ i4x4[6]= i4x4_cache[7+8*1]; |
|
| 1208 |
+} |
|
| 1202 | 1209 |
|
| 1203 |
- AV_COPY32(&h->non_zero_count[mb_xy][ 0], &h->non_zero_count_cache[4+8* 1]); |
|
| 1204 |
- AV_COPY32(&h->non_zero_count[mb_xy][ 4], &h->non_zero_count_cache[4+8* 2]); |
|
| 1205 |
- AV_COPY32(&h->non_zero_count[mb_xy][ 8], &h->non_zero_count_cache[4+8* 3]); |
|
| 1206 |
- AV_COPY32(&h->non_zero_count[mb_xy][12], &h->non_zero_count_cache[4+8* 4]); |
|
| 1207 |
- AV_COPY32(&h->non_zero_count[mb_xy][16], &h->non_zero_count_cache[4+8* 6]); |
|
| 1208 |
- AV_COPY32(&h->non_zero_count[mb_xy][20], &h->non_zero_count_cache[4+8* 7]); |
|
| 1209 |
- AV_COPY32(&h->non_zero_count[mb_xy][32], &h->non_zero_count_cache[4+8*11]); |
|
| 1210 |
- AV_COPY32(&h->non_zero_count[mb_xy][36], &h->non_zero_count_cache[4+8*12]); |
|
| 1210 |
+static av_always_inline void write_back_non_zero_count(H264Context *h){
|
|
| 1211 |
+ const int mb_xy= h->mb_xy; |
|
| 1212 |
+ uint8_t *nnz = h->non_zero_count[mb_xy]; |
|
| 1213 |
+ uint8_t *nnz_cache = h->non_zero_count_cache; |
|
| 1214 |
+ |
|
| 1215 |
+ AV_COPY32(&nnz[ 0], &nnz_cache[4+8* 1]); |
|
| 1216 |
+ AV_COPY32(&nnz[ 4], &nnz_cache[4+8* 2]); |
|
| 1217 |
+ AV_COPY32(&nnz[ 8], &nnz_cache[4+8* 3]); |
|
| 1218 |
+ AV_COPY32(&nnz[12], &nnz_cache[4+8* 4]); |
|
| 1219 |
+ AV_COPY32(&nnz[16], &nnz_cache[4+8* 6]); |
|
| 1220 |
+ AV_COPY32(&nnz[20], &nnz_cache[4+8* 7]); |
|
| 1221 |
+ AV_COPY32(&nnz[32], &nnz_cache[4+8*11]); |
|
| 1222 |
+ AV_COPY32(&nnz[36], &nnz_cache[4+8*12]); |
|
| 1211 | 1223 |
|
| 1212 | 1224 |
if(CHROMA444){
|
| 1213 |
- AV_COPY32(&h->non_zero_count[mb_xy][24], &h->non_zero_count_cache[4+8* 8]); |
|
| 1214 |
- AV_COPY32(&h->non_zero_count[mb_xy][28], &h->non_zero_count_cache[4+8* 9]); |
|
| 1215 |
- AV_COPY32(&h->non_zero_count[mb_xy][40], &h->non_zero_count_cache[4+8*13]); |
|
| 1216 |
- AV_COPY32(&h->non_zero_count[mb_xy][44], &h->non_zero_count_cache[4+8*14]); |
|
| 1225 |
+ AV_COPY32(&nnz[24], &nnz_cache[4+8* 8]); |
|
| 1226 |
+ AV_COPY32(&nnz[28], &nnz_cache[4+8* 9]); |
|
| 1227 |
+ AV_COPY32(&nnz[40], &nnz_cache[4+8*13]); |
|
| 1228 |
+ AV_COPY32(&nnz[44], &nnz_cache[4+8*14]); |
|
| 1229 |
+ } |
|
| 1230 |
+} |
|
| 1231 |
+ |
|
| 1232 |
+static av_always_inline void write_back_motion_list(H264Context *h, MpegEncContext * const s, int b_stride, |
|
| 1233 |
+ int b_xy, int b8_xy, int mb_type, int list ) |
|
| 1234 |
+{
|
|
| 1235 |
+ int16_t (*mv_dst)[2] = &s->current_picture.motion_val[list][b_xy]; |
|
| 1236 |
+ int16_t (*mv_src)[2] = &h->mv_cache[list][scan8[0]]; |
|
| 1237 |
+ AV_COPY128(mv_dst + 0*b_stride, mv_src + 8*0); |
|
| 1238 |
+ AV_COPY128(mv_dst + 1*b_stride, mv_src + 8*1); |
|
| 1239 |
+ AV_COPY128(mv_dst + 2*b_stride, mv_src + 8*2); |
|
| 1240 |
+ AV_COPY128(mv_dst + 3*b_stride, mv_src + 8*3); |
|
| 1241 |
+ if( CABAC ) {
|
|
| 1242 |
+ uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 8*h->mb_xy : h->mb2br_xy[h->mb_xy]]; |
|
| 1243 |
+ uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; |
|
| 1244 |
+ if(IS_SKIP(mb_type)) |
|
| 1245 |
+ AV_ZERO128(mvd_dst); |
|
| 1246 |
+ else{
|
|
| 1247 |
+ AV_COPY64(mvd_dst, mvd_src + 8*3); |
|
| 1248 |
+ AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); |
|
| 1249 |
+ AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); |
|
| 1250 |
+ AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); |
|
| 1251 |
+ } |
|
| 1252 |
+ } |
|
| 1253 |
+ |
|
| 1254 |
+ {
|
|
| 1255 |
+ int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; |
|
| 1256 |
+ int8_t *ref_cache = h->ref_cache[list]; |
|
| 1257 |
+ ref_index[0+0*2]= ref_cache[scan8[0]]; |
|
| 1258 |
+ ref_index[1+0*2]= ref_cache[scan8[4]]; |
|
| 1259 |
+ ref_index[0+1*2]= ref_cache[scan8[8]]; |
|
| 1260 |
+ ref_index[1+1*2]= ref_cache[scan8[12]]; |
|
| 1217 | 1261 |
} |
| 1218 | 1262 |
} |
| 1219 | 1263 |
|
| 1220 |
-static inline void write_back_motion(H264Context *h, int mb_type){
|
|
| 1264 |
+static av_always_inline void write_back_motion(H264Context *h, int mb_type){
|
|
| 1221 | 1265 |
MpegEncContext * const s = &h->s; |
| 1266 |
+ const int b_stride = h->b_stride; |
|
| 1222 | 1267 |
const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; //try mb2b(8)_xy |
| 1223 | 1268 |
const int b8_xy= 4*h->mb_xy; |
| 1224 |
- int list; |
|
| 1225 | 1269 |
|
| 1226 |
- if(!USES_LIST(mb_type, 0)) |
|
| 1270 |
+ if(USES_LIST(mb_type, 0)){
|
|
| 1271 |
+ write_back_motion_list(h, s, b_stride, b_xy, b8_xy, mb_type, 0); |
|
| 1272 |
+ }else{
|
|
| 1227 | 1273 |
fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, 2, (uint8_t)LIST_NOT_USED, 1); |
| 1228 |
- |
|
| 1229 |
- for(list=0; list<h->list_count; list++){
|
|
| 1230 |
- int y, b_stride; |
|
| 1231 |
- int16_t (*mv_dst)[2]; |
|
| 1232 |
- int16_t (*mv_src)[2]; |
|
| 1233 |
- |
|
| 1234 |
- if(!USES_LIST(mb_type, list)) |
|
| 1235 |
- continue; |
|
| 1236 |
- |
|
| 1237 |
- b_stride = h->b_stride; |
|
| 1238 |
- mv_dst = &s->current_picture.motion_val[list][b_xy]; |
|
| 1239 |
- mv_src = &h->mv_cache[list][scan8[0]]; |
|
| 1240 |
- for(y=0; y<4; y++){
|
|
| 1241 |
- AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); |
|
| 1242 |
- } |
|
| 1243 |
- if( CABAC ) {
|
|
| 1244 |
- uint8_t (*mvd_dst)[2] = &h->mvd_table[list][FMO ? 8*h->mb_xy : h->mb2br_xy[h->mb_xy]]; |
|
| 1245 |
- uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; |
|
| 1246 |
- if(IS_SKIP(mb_type)) |
|
| 1247 |
- AV_ZERO128(mvd_dst); |
|
| 1248 |
- else{
|
|
| 1249 |
- AV_COPY64(mvd_dst, mvd_src + 8*3); |
|
| 1250 |
- AV_COPY16(mvd_dst + 3 + 3, mvd_src + 3 + 8*0); |
|
| 1251 |
- AV_COPY16(mvd_dst + 3 + 2, mvd_src + 3 + 8*1); |
|
| 1252 |
- AV_COPY16(mvd_dst + 3 + 1, mvd_src + 3 + 8*2); |
|
| 1253 |
- } |
|
| 1254 |
- } |
|
| 1255 |
- |
|
| 1256 |
- {
|
|
| 1257 |
- int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy]; |
|
| 1258 |
- ref_index[0+0*2]= h->ref_cache[list][scan8[0]]; |
|
| 1259 |
- ref_index[1+0*2]= h->ref_cache[list][scan8[4]]; |
|
| 1260 |
- ref_index[0+1*2]= h->ref_cache[list][scan8[8]]; |
|
| 1261 |
- ref_index[1+1*2]= h->ref_cache[list][scan8[12]]; |
|
| 1262 |
- } |
|
| 1274 |
+ } |
|
| 1275 |
+ if(USES_LIST(mb_type, 1)){
|
|
| 1276 |
+ write_back_motion_list(h, s, b_stride, b_xy, b8_xy, mb_type, 1); |
|
| 1263 | 1277 |
} |
| 1264 | 1278 |
|
| 1265 | 1279 |
if(h->slice_type_nos == AV_PICTURE_TYPE_B && CABAC){
|
| ... | ... |
@@ -1272,7 +1285,7 @@ static inline void write_back_motion(H264Context *h, int mb_type){
|
| 1272 | 1272 |
} |
| 1273 | 1273 |
} |
| 1274 | 1274 |
|
| 1275 |
-static inline int get_dct8x8_allowed(H264Context *h){
|
|
| 1275 |
+static av_always_inline int get_dct8x8_allowed(H264Context *h){
|
|
| 1276 | 1276 |
if(h->sps.direct_8x8_inference_flag) |
| 1277 | 1277 |
return !(AV_RN64A(h->sub_mb_type) & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8 )*0x0001000100010001ULL)); |
| 1278 | 1278 |
else |
| ... | ... |
@@ -1999,7 +1999,7 @@ decode_intra_mb: |
| 1999 | 1999 |
//av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] ); |
| 2000 | 2000 |
} |
| 2001 | 2001 |
} |
| 2002 |
- ff_h264_write_back_intra_pred_mode(h); |
|
| 2002 |
+ write_back_intra_pred_mode(h); |
|
| 2003 | 2003 |
if( ff_h264_check_intra4x4_pred_mode(h) < 0 ) return -1; |
| 2004 | 2004 |
} else {
|
| 2005 | 2005 |
h->intra16x16_pred_mode= ff_h264_check_intra_pred_mode( h, h->intra16x16_pred_mode ); |
| ... | ... |
@@ -2248,21 +2248,22 @@ decode_intra_mb: |
| 2248 | 2248 |
* the transform mode of the current macroblock there. */ |
| 2249 | 2249 |
if (CHROMA444 && IS_8x8DCT(mb_type)){
|
| 2250 | 2250 |
int i; |
| 2251 |
+ uint8_t *nnz_cache = h->non_zero_count_cache; |
|
| 2251 | 2252 |
for (i = 0; i < 2; i++){
|
| 2252 | 2253 |
if (h->left_type[i] && !IS_8x8DCT(h->left_type[i])){
|
| 2253 |
- h->non_zero_count_cache[3+8* 1 + 2*8*i]= |
|
| 2254 |
- h->non_zero_count_cache[3+8* 2 + 2*8*i]= |
|
| 2255 |
- h->non_zero_count_cache[3+8* 6 + 2*8*i]= |
|
| 2256 |
- h->non_zero_count_cache[3+8* 7 + 2*8*i]= |
|
| 2257 |
- h->non_zero_count_cache[3+8*11 + 2*8*i]= |
|
| 2258 |
- h->non_zero_count_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0; |
|
| 2254 |
+ nnz_cache[3+8* 1 + 2*8*i]= |
|
| 2255 |
+ nnz_cache[3+8* 2 + 2*8*i]= |
|
| 2256 |
+ nnz_cache[3+8* 6 + 2*8*i]= |
|
| 2257 |
+ nnz_cache[3+8* 7 + 2*8*i]= |
|
| 2258 |
+ nnz_cache[3+8*11 + 2*8*i]= |
|
| 2259 |
+ nnz_cache[3+8*12 + 2*8*i]= IS_INTRA(mb_type) ? 64 : 0; |
|
| 2259 | 2260 |
} |
| 2260 | 2261 |
} |
| 2261 | 2262 |
if (h->top_type && !IS_8x8DCT(h->top_type)){
|
| 2262 | 2263 |
uint32_t top_empty = CABAC && !IS_INTRA(mb_type) ? 0 : 0x40404040; |
| 2263 |
- AV_WN32A(&h->non_zero_count_cache[4+8* 0], top_empty); |
|
| 2264 |
- AV_WN32A(&h->non_zero_count_cache[4+8* 5], top_empty); |
|
| 2265 |
- AV_WN32A(&h->non_zero_count_cache[4+8*10], top_empty); |
|
| 2264 |
+ AV_WN32A(&nnz_cache[4+8* 0], top_empty); |
|
| 2265 |
+ AV_WN32A(&nnz_cache[4+8* 5], top_empty); |
|
| 2266 |
+ AV_WN32A(&nnz_cache[4+8*10], top_empty); |
|
| 2266 | 2267 |
} |
| 2267 | 2268 |
} |
| 2268 | 2269 |
s->current_picture.mb_type[mb_xy]= mb_type; |