Browse code

vp9: split out reconstruction functions in their own source file.

Ronald S. Bultje authored on 2017/03/28 06:32:20
Showing 7 changed files
... ...
@@ -610,7 +610,7 @@ OBJS-$(CONFIG_VP8_DECODER)             += vp8.o vp56rac.o
610 610
 OBJS-$(CONFIG_VP8_CUVID_DECODER)       += cuvid.o
611 611
 OBJS-$(CONFIG_VP8_MEDIACODEC_DECODER)  += mediacodecdec.o
612 612
 OBJS-$(CONFIG_VP8_VAAPI_ENCODER)       += vaapi_encode_vp8.o
613
-OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o vp9lpf.o \
613
+OBJS-$(CONFIG_VP9_DECODER)             += vp9.o vp9data.o vp9dsp.o vp9lpf.o vp9recon.o \
614 614
                                           vp9block.o vp9prob.o vp9mvs.o vp56rac.o \
615 615
                                           vp9dsp_8bpp.o vp9dsp_10bpp.o vp9dsp_12bpp.o
616 616
 OBJS-$(CONFIG_VP9_CUVID_DECODER)       += cuvid.o
... ...
@@ -405,8 +405,10 @@ static void FN(inter_pred)(AVCodecContext *avctx)
405 405
         }
406 406
     } else {
407 407
         int bwl = bwlog_tab[0][b->bs];
408
-        int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
409
-        int uvbw = bwh_tab[s->ss_h][b->bs][0] * 4, uvbh = bwh_tab[s->ss_v][b->bs][1] * 4;
408
+        int bw = ff_vp9_bwh_tab[0][b->bs][0] * 4;
409
+        int bh = ff_vp9_bwh_tab[0][b->bs][1] * 4;
410
+        int uvbw = ff_vp9_bwh_tab[s->ss_h][b->bs][0] * 4;
411
+        int uvbh = ff_vp9_bwh_tab[s->ss_v][b->bs][1] * 4;
410 412
 
411 413
         mc_luma_dir(s, mc[bwl][b->filter][0], s->dst[0], ls_y,
412 414
                     ref1->data[0], ref1->linesize[0], tref1,
... ...
@@ -31,16 +31,6 @@
31 31
 #include "vp9data.h"
32 32
 #include "vp9dec.h"
33 33
 
34
-static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
35
-    {
36
-        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
37
-        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
38
-    }, {
39
-        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
40
-        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
41
-    }
42
-};
43
-
44 34
 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
45 35
                                        ptrdiff_t stride, int v)
46 36
 {
... ...
@@ -103,8 +93,8 @@ static void decode_mode(AVCodecContext *avctx)
103 103
     VP9Block *b = s->b;
104 104
     int row = s->row, col = s->col, row7 = s->row7;
105 105
     enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
106
-    int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
107
-    int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
106
+    int bw4 = ff_vp9_bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
107
+    int bh4 = ff_vp9_bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
108 108
     int have_a = row > 0, have_l = col > s->tile_col_start;
109 109
     int vref, filter_id;
110 110
 
... ...
@@ -272,8 +262,8 @@ static void decode_mode(AVCodecContext *avctx)
272 272
             b->mode[2] =
273 273
             b->mode[1] = b->mode[0];
274 274
             // FIXME this can probably be optimized
275
-            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
276
-            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
275
+            memset(a, b->mode[0], ff_vp9_bwh_tab[0][b->bs][0]);
276
+            memset(l, b->mode[0], ff_vp9_bwh_tab[0][b->bs][1]);
277 277
         }
278 278
         b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
279 279
                                      ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
... ...
@@ -725,7 +715,7 @@ static void decode_mode(AVCodecContext *avctx)
725 725
     }
726 726
 #endif
727 727
 
728
-    switch (bwh_tab[1][b->bs][0]) {
728
+    switch (ff_vp9_bwh_tab[1][b->bs][0]) {
729 729
 #define SET_CTXS(dir, off, n) \
730 730
     do { \
731 731
         SPLAT_CTX(s->dir##_skip_ctx[off],      b->skip,          n); \
... ...
@@ -748,7 +738,7 @@ static void decode_mode(AVCodecContext *avctx)
748 748
     case 4: SET_CTXS(above, col, 4); break;
749 749
     case 8: SET_CTXS(above, col, 8); break;
750 750
     }
751
-    switch (bwh_tab[1][b->bs][1]) {
751
+    switch (ff_vp9_bwh_tab[1][b->bs][1]) {
752 752
     case 1: SET_CTXS(left, row7, 1); break;
753 753
     case 2: SET_CTXS(left, row7, 2); break;
754 754
     case 4: SET_CTXS(left, row7, 4); break;
... ...
@@ -983,7 +973,7 @@ static av_always_inline int decode_coeffs(AVCodecContext *avctx, int is8bitsperp
983 983
     uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
984 984
     unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
985 985
     unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
986
-    int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
986
+    int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1;
987 987
     int end_x = FFMIN(2 * (s->cols - col), w4);
988 988
     int end_y = FFMIN(2 * (s->rows - row), h4);
989 989
     int n, pl, x, y, ret;
... ...
@@ -1152,615 +1142,6 @@ static int decode_coeffs_16bpp(AVCodecContext *avctx)
1152 1152
     return decode_coeffs(avctx, 0);
1153 1153
 }
1154 1154
 
1155
-static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
1156
-                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
1157
-                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
1158
-                                             uint8_t *l, int col, int x, int w,
1159
-                                             int row, int y, enum TxfmMode tx,
1160
-                                             int p, int ss_h, int ss_v, int bytesperpixel)
1161
-{
1162
-    int have_top = row > 0 || y > 0;
1163
-    int have_left = col > s->tile_col_start || x > 0;
1164
-    int have_right = x < w - 1;
1165
-    int bpp = s->s.h.bpp;
1166
-    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
1167
-        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
1168
-                                   { DC_127_PRED,          VERT_PRED            } },
1169
-        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
1170
-                                   { HOR_PRED,             HOR_PRED             } },
1171
-        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
1172
-                                   { LEFT_DC_PRED,         DC_PRED              } },
1173
-        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
1174
-                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
1175
-        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
1176
-                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
1177
-        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
1178
-                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
1179
-        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
1180
-                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
1181
-        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
1182
-                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
1183
-        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
1184
-                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
1185
-        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
1186
-                                   { HOR_PRED,             TM_VP8_PRED          } },
1187
-    };
1188
-    static const struct {
1189
-        uint8_t needs_left:1;
1190
-        uint8_t needs_top:1;
1191
-        uint8_t needs_topleft:1;
1192
-        uint8_t needs_topright:1;
1193
-        uint8_t invert_left:1;
1194
-    } edges[N_INTRA_PRED_MODES] = {
1195
-        [VERT_PRED]            = { .needs_top  = 1 },
1196
-        [HOR_PRED]             = { .needs_left = 1 },
1197
-        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
1198
-        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
1199
-        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
1200
-                                   .needs_topleft = 1 },
1201
-        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
1202
-                                   .needs_topleft = 1 },
1203
-        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
1204
-                                   .needs_topleft = 1 },
1205
-        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
1206
-        [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
1207
-        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
1208
-                                   .needs_topleft = 1 },
1209
-        [LEFT_DC_PRED]         = { .needs_left = 1 },
1210
-        [TOP_DC_PRED]          = { .needs_top  = 1 },
1211
-        [DC_128_PRED]          = { 0 },
1212
-        [DC_127_PRED]          = { 0 },
1213
-        [DC_129_PRED]          = { 0 }
1214
-    };
1215
-
1216
-    av_assert2(mode >= 0 && mode < 10);
1217
-    mode = mode_conv[mode][have_left][have_top];
1218
-    if (edges[mode].needs_top) {
1219
-        uint8_t *top, *topleft;
1220
-        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
1221
-        int n_px_need_tr = 0;
1222
-
1223
-        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
1224
-            n_px_need_tr = 4;
1225
-
1226
-        // if top of sb64-row, use s->intra_pred_data[] instead of
1227
-        // dst[-stride] for intra prediction (it contains pre- instead of
1228
-        // post-loopfilter data)
1229
-        if (have_top) {
1230
-            top = !(row & 7) && !y ?
1231
-                s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
1232
-                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
1233
-            if (have_left)
1234
-                topleft = !(row & 7) && !y ?
1235
-                    s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
1236
-                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
1237
-                    &dst_inner[-stride_inner];
1238
-        }
1239
-
1240
-        if (have_top &&
1241
-            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
1242
-            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
1243
-            n_px_need + n_px_need_tr <= n_px_have) {
1244
-            *a = top;
1245
-        } else {
1246
-            if (have_top) {
1247
-                if (n_px_need <= n_px_have) {
1248
-                    memcpy(*a, top, n_px_need * bytesperpixel);
1249
-                } else {
1250
-#define memset_bpp(c, i1, v, i2, num) do { \
1251
-    if (bytesperpixel == 1) { \
1252
-        memset(&(c)[(i1)], (v)[(i2)], (num)); \
1253
-    } else { \
1254
-        int n, val = AV_RN16A(&(v)[(i2) * 2]); \
1255
-        for (n = 0; n < (num); n++) { \
1256
-            AV_WN16A(&(c)[((i1) + n) * 2], val); \
1257
-        } \
1258
-    } \
1259
-} while (0)
1260
-                    memcpy(*a, top, n_px_have * bytesperpixel);
1261
-                    memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
1262
-                }
1263
-            } else {
1264
-#define memset_val(c, val, num) do { \
1265
-    if (bytesperpixel == 1) { \
1266
-        memset((c), (val), (num)); \
1267
-    } else { \
1268
-        int n; \
1269
-        for (n = 0; n < (num); n++) { \
1270
-            AV_WN16A(&(c)[n * 2], (val)); \
1271
-        } \
1272
-    } \
1273
-} while (0)
1274
-                memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
1275
-            }
1276
-            if (edges[mode].needs_topleft) {
1277
-                if (have_left && have_top) {
1278
-#define assign_bpp(c, i1, v, i2) do { \
1279
-    if (bytesperpixel == 1) { \
1280
-        (c)[(i1)] = (v)[(i2)]; \
1281
-    } else { \
1282
-        AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
1283
-    } \
1284
-} while (0)
1285
-                    assign_bpp(*a, -1, topleft, -1);
1286
-                } else {
1287
-#define assign_val(c, i, v) do { \
1288
-    if (bytesperpixel == 1) { \
1289
-        (c)[(i)] = (v); \
1290
-    } else { \
1291
-        AV_WN16A(&(c)[(i) * 2], (v)); \
1292
-    } \
1293
-} while (0)
1294
-                    assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
1295
-                }
1296
-            }
1297
-            if (tx == TX_4X4 && edges[mode].needs_topright) {
1298
-                if (have_top && have_right &&
1299
-                    n_px_need + n_px_need_tr <= n_px_have) {
1300
-                    memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
1301
-                } else {
1302
-                    memset_bpp(*a, 4, *a, 3, 4);
1303
-                }
1304
-            }
1305
-        }
1306
-    }
1307
-    if (edges[mode].needs_left) {
1308
-        if (have_left) {
1309
-            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
1310
-            uint8_t *dst = x == 0 ? dst_edge : dst_inner;
1311
-            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
1312
-
1313
-            if (edges[mode].invert_left) {
1314
-                if (n_px_need <= n_px_have) {
1315
-                    for (i = 0; i < n_px_need; i++)
1316
-                        assign_bpp(l, i, &dst[i * stride], -1);
1317
-                } else {
1318
-                    for (i = 0; i < n_px_have; i++)
1319
-                        assign_bpp(l, i, &dst[i * stride], -1);
1320
-                    memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
1321
-                }
1322
-            } else {
1323
-                if (n_px_need <= n_px_have) {
1324
-                    for (i = 0; i < n_px_need; i++)
1325
-                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
1326
-                } else {
1327
-                    for (i = 0; i < n_px_have; i++)
1328
-                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
1329
-                    memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
1330
-                }
1331
-            }
1332
-        } else {
1333
-            memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
1334
-        }
1335
-    }
1336
-
1337
-    return mode;
1338
-}
1339
-
1340
-static av_always_inline void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off,
1341
-                                         ptrdiff_t uv_off, int bytesperpixel)
1342
-{
1343
-    VP9Context *s = avctx->priv_data;
1344
-    VP9Block *b = s->b;
1345
-    int row = s->row, col = s->col;
1346
-    int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
1347
-    int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
1348
-    int end_x = FFMIN(2 * (s->cols - col), w4);
1349
-    int end_y = FFMIN(2 * (s->rows - row), h4);
1350
-    int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
1351
-    int uvstep1d = 1 << b->uvtx, p;
1352
-    uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
1353
-    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
1354
-    LOCAL_ALIGNED_32(uint8_t, l, [64]);
1355
-
1356
-    for (n = 0, y = 0; y < end_y; y += step1d) {
1357
-        uint8_t *ptr = dst, *ptr_r = dst_r;
1358
-        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
1359
-                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
1360
-            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
1361
-                               y * 2 + x : 0];
1362
-            uint8_t *a = &a_buf[32];
1363
-            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
1364
-            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
1365
-
1366
-            mode = check_intra_mode(s, mode, &a, ptr_r,
1367
-                                    s->s.frames[CUR_FRAME].tf.f->linesize[0],
1368
-                                    ptr, s->y_stride, l,
1369
-                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
1370
-            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
1371
-            if (eob)
1372
-                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
1373
-                                           s->block + 16 * n * bytesperpixel, eob);
1374
-        }
1375
-        dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
1376
-        dst   += 4 * step1d * s->y_stride;
1377
-    }
1378
-
1379
-    // U/V
1380
-    w4    >>= s->ss_h;
1381
-    end_x >>= s->ss_h;
1382
-    end_y >>= s->ss_v;
1383
-    step = 1 << (b->uvtx * 2);
1384
-    for (p = 0; p < 2; p++) {
1385
-        dst   = s->dst[1 + p];
1386
-        dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
1387
-        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
1388
-            uint8_t *ptr = dst, *ptr_r = dst_r;
1389
-            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
1390
-                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
1391
-                int mode = b->uvmode;
1392
-                uint8_t *a = &a_buf[32];
1393
-                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
1394
-
1395
-                mode = check_intra_mode(s, mode, &a, ptr_r,
1396
-                                        s->s.frames[CUR_FRAME].tf.f->linesize[1],
1397
-                                        ptr, s->uv_stride, l, col, x, w4, row, y,
1398
-                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
1399
-                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
1400
-                if (eob)
1401
-                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
1402
-                                                    s->uvblock[p] + 16 * n * bytesperpixel, eob);
1403
-            }
1404
-            dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
1405
-            dst   += 4 * uvstep1d * s->uv_stride;
1406
-        }
1407
-    }
1408
-}
1409
-
1410
-static void intra_recon_8bpp(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
1411
-{
1412
-    intra_recon(avctx, y_off, uv_off, 1);
1413
-}
1414
-
1415
-static void intra_recon_16bpp(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
1416
-{
1417
-    intra_recon(avctx, y_off, uv_off, 2);
1418
-}
1419
-
1420
-static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
1421
-                                              uint8_t *dst, ptrdiff_t dst_stride,
1422
-                                              const uint8_t *ref, ptrdiff_t ref_stride,
1423
-                                              ThreadFrame *ref_frame,
1424
-                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
1425
-                                              int bw, int bh, int w, int h, int bytesperpixel)
1426
-{
1427
-    int mx = mv->x, my = mv->y, th;
1428
-
1429
-    y += my >> 3;
1430
-    x += mx >> 3;
1431
-    ref += y * ref_stride + x * bytesperpixel;
1432
-    mx &= 7;
1433
-    my &= 7;
1434
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
1435
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
1436
-    // the longest loopfilter of the next sbrow
1437
-    th = (y + bh + 4 * !!my + 7) >> 6;
1438
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
1439
-    // The arm/aarch64 _hv filters read one more row than what actually is
1440
-    // needed, so switch to emulated edge one pixel sooner vertically
1441
-    // (!!my * 5) than horizontally (!!mx * 4).
1442
-    if (x < !!mx * 3 || y < !!my * 3 ||
1443
-        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
1444
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
1445
-                                 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
1446
-                                 160, ref_stride,
1447
-                                 bw + !!mx * 7, bh + !!my * 7,
1448
-                                 x - !!mx * 3, y - !!my * 3, w, h);
1449
-        ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
1450
-        ref_stride = 160;
1451
-    }
1452
-    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
1453
-}
1454
-
1455
-static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
1456
-                                                uint8_t *dst_u, uint8_t *dst_v,
1457
-                                                ptrdiff_t dst_stride,
1458
-                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
1459
-                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
1460
-                                                ThreadFrame *ref_frame,
1461
-                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
1462
-                                                int bw, int bh, int w, int h, int bytesperpixel)
1463
-{
1464
-    int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
1465
-
1466
-    y += my >> 4;
1467
-    x += mx >> 4;
1468
-    ref_u += y * src_stride_u + x * bytesperpixel;
1469
-    ref_v += y * src_stride_v + x * bytesperpixel;
1470
-    mx &= 15;
1471
-    my &= 15;
1472
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
1473
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
1474
-    // the longest loopfilter of the next sbrow
1475
-    th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
1476
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
1477
-    // The arm/aarch64 _hv filters read one more row than what actually is
1478
-    // needed, so switch to emulated edge one pixel sooner vertically
1479
-    // (!!my * 5) than horizontally (!!mx * 4).
1480
-    if (x < !!mx * 3 || y < !!my * 3 ||
1481
-        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
1482
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
1483
-                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
1484
-                                 160, src_stride_u,
1485
-                                 bw + !!mx * 7, bh + !!my * 7,
1486
-                                 x - !!mx * 3, y - !!my * 3, w, h);
1487
-        ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
1488
-        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
1489
-
1490
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
1491
-                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
1492
-                                 160, src_stride_v,
1493
-                                 bw + !!mx * 7, bh + !!my * 7,
1494
-                                 x - !!mx * 3, y - !!my * 3, w, h);
1495
-        ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
1496
-        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
1497
-    } else {
1498
-        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
1499
-        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
1500
-    }
1501
-}
1502
-
1503
-#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
1504
-                    px, py, pw, ph, bw, bh, w, h, i) \
1505
-    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
1506
-                     mv, bw, bh, w, h, bytesperpixel)
1507
-#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
1508
-                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
1509
-    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
1510
-                       row, col, mv, bw, bh, w, h, bytesperpixel)
1511
-#define SCALED 0
1512
-#define FN(x) x##_8bpp
1513
-#define BYTES_PER_PIXEL 1
1514
-#include "vp9_mc_template.c"
1515
-#undef FN
1516
-#undef BYTES_PER_PIXEL
1517
-#define FN(x) x##_16bpp
1518
-#define BYTES_PER_PIXEL 2
1519
-#include "vp9_mc_template.c"
1520
-#undef mc_luma_dir
1521
-#undef mc_chroma_dir
1522
-#undef FN
1523
-#undef BYTES_PER_PIXEL
1524
-#undef SCALED
1525
-
1526
-static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
1527
-                                            vp9_mc_func (*mc)[2],
1528
-                                            uint8_t *dst, ptrdiff_t dst_stride,
1529
-                                            const uint8_t *ref, ptrdiff_t ref_stride,
1530
-                                            ThreadFrame *ref_frame,
1531
-                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
1532
-                                            int px, int py, int pw, int ph,
1533
-                                            int bw, int bh, int w, int h, int bytesperpixel,
1534
-                                            const uint16_t *scale, const uint8_t *step)
1535
-{
1536
-    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
1537
-        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
1538
-        mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
1539
-                         y, x, in_mv, bw, bh, w, h, bytesperpixel);
1540
-    } else {
1541
-#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
1542
-    int mx, my;
1543
-    int refbw_m1, refbh_m1;
1544
-    int th;
1545
-    VP56mv mv;
1546
-
1547
-    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
1548
-    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
1549
-    // BUG libvpx seems to scale the two components separately. This introduces
1550
-    // rounding errors but we have to reproduce them to be exactly compatible
1551
-    // with the output from libvpx...
1552
-    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
1553
-    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
1554
-
1555
-    y = my >> 4;
1556
-    x = mx >> 4;
1557
-    ref += y * ref_stride + x * bytesperpixel;
1558
-    mx &= 15;
1559
-    my &= 15;
1560
-    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
1561
-    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
1562
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
1563
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
1564
-    // the longest loopfilter of the next sbrow
1565
-    th = (y + refbh_m1 + 4 + 7) >> 6;
1566
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
1567
-    // The arm/aarch64 _hv filters read one more row than what actually is
1568
-    // needed, so switch to emulated edge one pixel sooner vertically
1569
-    // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
1570
-    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
1571
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
1572
-                                 ref - 3 * ref_stride - 3 * bytesperpixel,
1573
-                                 288, ref_stride,
1574
-                                 refbw_m1 + 8, refbh_m1 + 8,
1575
-                                 x - 3, y - 3, w, h);
1576
-        ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
1577
-        ref_stride = 288;
1578
-    }
1579
-    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
1580
-    }
1581
-}
1582
-
1583
-static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
1584
-                                              vp9_mc_func (*mc)[2],
1585
-                                              uint8_t *dst_u, uint8_t *dst_v,
1586
-                                              ptrdiff_t dst_stride,
1587
-                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
1588
-                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
1589
-                                              ThreadFrame *ref_frame,
1590
-                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
1591
-                                              int px, int py, int pw, int ph,
1592
-                                              int bw, int bh, int w, int h, int bytesperpixel,
1593
-                                              const uint16_t *scale, const uint8_t *step)
1594
-{
1595
-    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
1596
-        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
1597
-        mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
1598
-                           ref_v, src_stride_v, ref_frame,
1599
-                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
1600
-    } else {
1601
-    int mx, my;
1602
-    int refbw_m1, refbh_m1;
1603
-    int th;
1604
-    VP56mv mv;
1605
-
1606
-    if (s->ss_h) {
1607
-        // BUG https://code.google.com/p/webm/issues/detail?id=820
1608
-        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
1609
-        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
1610
-    } else {
1611
-        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
1612
-        mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
1613
-    }
1614
-    if (s->ss_v) {
1615
-        // BUG https://code.google.com/p/webm/issues/detail?id=820
1616
-        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
1617
-        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
1618
-    } else {
1619
-        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
1620
-        my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
1621
-    }
1622
-#undef scale_mv
1623
-    y = my >> 4;
1624
-    x = mx >> 4;
1625
-    ref_u += y * src_stride_u + x * bytesperpixel;
1626
-    ref_v += y * src_stride_v + x * bytesperpixel;
1627
-    mx &= 15;
1628
-    my &= 15;
1629
-    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
1630
-    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
1631
-    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
1632
-    // we use +7 because the last 7 pixels of each sbrow can be changed in
1633
-    // the longest loopfilter of the next sbrow
1634
-    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
1635
-    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
1636
-    // The arm/aarch64 _hv filters read one more row than what actually is
1637
-    // needed, so switch to emulated edge one pixel sooner vertically
1638
-    // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
1639
-    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
1640
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
1641
-                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
1642
-                                 288, src_stride_u,
1643
-                                 refbw_m1 + 8, refbh_m1 + 8,
1644
-                                 x - 3, y - 3, w, h);
1645
-        ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
1646
-        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
1647
-
1648
-        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
1649
-                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
1650
-                                 288, src_stride_v,
1651
-                                 refbw_m1 + 8, refbh_m1 + 8,
1652
-                                 x - 3, y - 3, w, h);
1653
-        ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
1654
-        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
1655
-    } else {
1656
-        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
1657
-        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
1658
-    }
1659
-    }
1660
-}
1661
-
1662
// Glue for instantiating the *scaled* inter-prediction functions from
// vp9_mc_template.c.  mc_luma_dir/mc_chroma_dir are the hook names the
// template expands; here they route to the mc_*_scaled() helpers, passing the
// per-reference mvscale/mvstep tables.  The template is included twice to
// produce inter_pred_scaled_8bpp() and inter_pred_scaled_16bpp().
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
                    px, py, pw, ph, bw, bh, w, h, i) \
    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define SCALED 1
// 8bpp instantiation: inter_pred_scaled_8bpp()
#define FN(x) x##_scaled_8bpp
#define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c"
#undef FN
#undef BYTES_PER_PIXEL
// 16bpp instantiation: inter_pred_scaled_16bpp()
#define FN(x) x##_scaled_16bpp
#define BYTES_PER_PIXEL 2
#include "vp9_mc_template.c"
#undef mc_luma_dir
#undef mc_chroma_dir
#undef FN
#undef BYTES_PER_PIXEL
#undef SCALED
1686
-
1687
/**
 * Reconstruct one inter-coded block: run motion-compensated prediction for
 * the whole block, then (unless the block is skipped) add the inverse
 * transform of the coded residual, luma first and then both chroma planes.
 *
 * bytesperpixel is a compile-time constant (1 or 8-bit, 2 for 10/12-bit);
 * av_always_inline lets the compiler specialize the two wrappers below.
 */
static av_always_inline void inter_recon(AVCodecContext *avctx, int bytesperpixel)
{
    VP9Context *s = avctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;

    // A non-zero mvscale[ref][0] marks a reference whose resolution differs
    // from the current frame, which requires the scaled MC path.
    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
        if (bytesperpixel == 1) {
            inter_pred_scaled_8bpp(avctx);
        } else {
            inter_pred_scaled_16bpp(avctx);
        }
    } else {
        if (bytesperpixel == 1) {
            inter_pred_8bpp(avctx);
        } else {
            inter_pred_16bpp(avctx);
        }
    }

    if (!b->skip) {
        /* mostly copied intra_recon() */

        // w4/h4: block size in 4-pixel units; step1d: transform size in the
        // same units; step: number of 4x4 sub-blocks per transform block.
        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
        // Clamp the loop bounds so blocks overhanging the frame edge only
        // reconstruct the part inside the frame.
        int end_x = FFMIN(2 * (s->cols - col), w4);
        int end_y = FFMIN(2 * (s->rows - row), h4);
        int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
        int uvstep1d = 1 << b->uvtx, p;
        uint8_t *dst = s->dst[0];

        // y itxfm add
        for (n = 0, y = 0; y < end_y; y += step1d) {
            uint8_t *ptr = dst;
            for (x = 0; x < end_x; x += step1d,
                 ptr += 4 * step1d * bytesperpixel, n += step) {
                // For transforms larger than 8x8 the eob counter is 16 bits.
                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

                if (eob)
                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
                                                  s->block + 16 * n * bytesperpixel, eob);
            }
            dst += 4 * s->y_stride * step1d;
        }

        // uv itxfm add
        end_x >>= s->ss_h;
        end_y >>= s->ss_v;
        step = 1 << (b->uvtx * 2);
        for (p = 0; p < 2; p++) {
            dst = s->dst[p + 1];
            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
                uint8_t *ptr = dst;
                for (x = 0; x < end_x; x += uvstep1d,
                     ptr += 4 * uvstep1d * bytesperpixel, n += step) {
                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                    if (eob)
                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                        s->uvblock[p] + 16 * n * bytesperpixel, eob);
                }
                dst += 4 * uvstep1d * s->uv_stride;
            }
        }
    }
}
1753
-
1754
/* 8 bits-per-component entry point; forces the bytesperpixel == 1
 * specialization of inter_recon() to be compiled out-of-line. */
static void inter_recon_8bpp(AVCodecContext *avctx)
{
    inter_recon(avctx, 1);
}
1758
-
1759
/* 10/12 bits-per-component entry point (2 bytes per pixel). */
static void inter_recon_16bpp(AVCodecContext *avctx)
{
    inter_recon(avctx, 2);
}
1763
-
1764 1155
 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
1765 1156
                                         int row_and_7, int col_and_7,
1766 1157
                                         int w, int h, int col_end, int row_end,
... ...
@@ -1891,7 +1272,7 @@ void ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
1891 1891
     VP9Block *b = s->b;
1892 1892
     enum BlockSize bs = bl * 3 + bp;
1893 1893
     int bytesperpixel = s->bytesperpixel;
1894
-    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
1894
+    int w4 = ff_vp9_bwh_tab[1][bs][0], h4 = ff_vp9_bwh_tab[1][bs][1], lvl;
1895 1895
     int emu[2];
1896 1896
     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
1897 1897
 
... ...
@@ -2001,15 +1382,15 @@ void ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
2001 2001
     }
2002 2002
     if (b->intra) {
2003 2003
         if (s->s.h.bpp > 8) {
2004
-            intra_recon_16bpp(avctx, yoff, uvoff);
2004
+            ff_vp9_intra_recon_16bpp(avctx, yoff, uvoff);
2005 2005
         } else {
2006
-            intra_recon_8bpp(avctx, yoff, uvoff);
2006
+            ff_vp9_intra_recon_8bpp(avctx, yoff, uvoff);
2007 2007
         }
2008 2008
     } else {
2009 2009
         if (s->s.h.bpp > 8) {
2010
-            inter_recon_16bpp(avctx);
2010
+            ff_vp9_inter_recon_16bpp(avctx);
2011 2011
         } else {
2012
-            inter_recon_8bpp(avctx);
2012
+            ff_vp9_inter_recon_8bpp(avctx);
2013 2013
         }
2014 2014
     }
2015 2015
     if (emu[0]) {
... ...
@@ -22,6 +22,16 @@
22 22
 #include "vp9.h"
23 23
 #include "vp9data.h"
24 24
 
25
/* Block width/height lookup, per block size, in 4-pixel units:
 * [row 0] full-resolution (luma) dimensions,
 * [row 1] dimensions halved once, as used for a subsampled plane
 * (never smaller than 1).  Indexed [subsampled][block size][0=w, 1=h]. */
const uint8_t ff_vp9_bwh_tab[2][N_BS_SIZES][2] = {
    {
        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
        {  4,  4 }, {  4, 2 }, { 2,  4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
    }, {
        {  8,  8 }, {  8, 4 }, { 4,  8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
        {  2,  2 }, {  2, 1 }, { 1,  2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
    }
};
34
+
25 35
 const int8_t ff_vp9_partition_tree[3][2] = {
26 36
     { -PARTITION_NONE, 1 },                      // '0'
27 37
         { -PARTITION_H, 2 },                     // '10'
... ...
@@ -26,6 +26,7 @@
26 26
 
27 27
 #include "vp9dec.h"
28 28
 
29
+extern const uint8_t ff_vp9_bwh_tab[2][N_BS_SIZES][2];
29 30
 extern const int8_t ff_vp9_partition_tree[3][2];
30 31
 extern const uint8_t ff_vp9_default_kf_partition_probs[4][4][3];
31 32
 extern const int8_t ff_vp9_segmentation_tree[7][2];
... ...
@@ -206,4 +206,11 @@ void ff_vp9_decode_block(AVCodecContext *ctx, int row, int col,
206 206
 void ff_vp9_loopfilter_sb(AVCodecContext *avctx, VP9Filter *lflvl,
207 207
                           int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff);
208 208
 
209
+void ff_vp9_intra_recon_8bpp(AVCodecContext *avctx,
210
+                             ptrdiff_t y_off, ptrdiff_t uv_off);
211
+void ff_vp9_intra_recon_16bpp(AVCodecContext *avctx,
212
+                              ptrdiff_t y_off, ptrdiff_t uv_off);
213
+void ff_vp9_inter_recon_8bpp(AVCodecContext *avctx);
214
+void ff_vp9_inter_recon_16bpp(AVCodecContext *avctx);
215
+
209 216
 #endif /* AVCODEC_VP9DEC_H */
210 217
new file mode 100644
... ...
@@ -0,0 +1,639 @@
0
+/*
1
+ * VP9 compatible video decoder
2
+ *
3
+ * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
4
+ * Copyright (C) 2013 Clément Bœsch <u pkh me>
5
+ *
6
+ * This file is part of FFmpeg.
7
+ *
8
+ * FFmpeg is free software; you can redistribute it and/or
9
+ * modify it under the terms of the GNU Lesser General Public
10
+ * License as published by the Free Software Foundation; either
11
+ * version 2.1 of the License, or (at your option) any later version.
12
+ *
13
+ * FFmpeg is distributed in the hope that it will be useful,
14
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
+ * Lesser General Public License for more details.
17
+ *
18
+ * You should have received a copy of the GNU Lesser General Public
19
+ * License along with FFmpeg; if not, write to the Free Software
20
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ */
22
+
23
+#include "libavutil/avassert.h"
24
+
25
+#include "avcodec.h"
26
+#include "internal.h"
27
+#include "videodsp.h"
28
+#include "vp9data.h"
29
+#include "vp9dec.h"
30
+
31
/**
 * Prepare the prediction edges for one intra transform block and return the
 * prediction mode that is actually usable.
 *
 * Fills *a (top edge, may be redirected to point straight into the source
 * picture when all needed pixels are available) and l (left edge buffer).
 * Unavailable neighbours cause the mode to be converted via mode_conv[]
 * (e.g. VERT_PRED with no top row becomes DC_127_PRED) and missing edge
 * pixels to be synthesized by replication or fixed fill values.
 *
 * @param a      in/out: pointer to the top-edge pointer; on input points at a
 *               caller-provided scratch buffer, on output may alias the frame.
 * @param l      left-edge buffer, filled bottom-up (or top-down for HOR_UP).
 * @param p      plane index (0 = luma, 1/2 = chroma).
 * @return the converted intra prediction mode to use with dsp.intra_pred[].
 */
static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
                                             uint8_t *dst_edge, ptrdiff_t stride_edge,
                                             uint8_t *dst_inner, ptrdiff_t stride_inner,
                                             uint8_t *l, int col, int x, int w,
                                             int row, int y, enum TxfmMode tx,
                                             int p, int ss_h, int ss_v, int bytesperpixel)
{
    // Neighbour availability: top exists unless we are on the very first
    // block row; left exists unless we sit on the current tile's left edge.
    int have_top = row > 0 || y > 0;
    int have_left = col > s->tile_col_start || x > 0;
    int have_right = x < w - 1;
    int bpp = s->s.h.bpp;
    // Mode fall-backs when the left and/or top neighbours are unavailable.
    static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
        [VERT_PRED]            = { { DC_127_PRED,          VERT_PRED            },
                                   { DC_127_PRED,          VERT_PRED            } },
        [HOR_PRED]             = { { DC_129_PRED,          DC_129_PRED          },
                                   { HOR_PRED,             HOR_PRED             } },
        [DC_PRED]              = { { DC_128_PRED,          TOP_DC_PRED          },
                                   { LEFT_DC_PRED,         DC_PRED              } },
        [DIAG_DOWN_LEFT_PRED]  = { { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  },
                                   { DC_127_PRED,          DIAG_DOWN_LEFT_PRED  } },
        [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
                                   { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
        [VERT_RIGHT_PRED]      = { { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      },
                                   { VERT_RIGHT_PRED,      VERT_RIGHT_PRED      } },
        [HOR_DOWN_PRED]        = { { HOR_DOWN_PRED,        HOR_DOWN_PRED        },
                                   { HOR_DOWN_PRED,        HOR_DOWN_PRED        } },
        [VERT_LEFT_PRED]       = { { DC_127_PRED,          VERT_LEFT_PRED       },
                                   { DC_127_PRED,          VERT_LEFT_PRED       } },
        [HOR_UP_PRED]          = { { DC_129_PRED,          DC_129_PRED          },
                                   { HOR_UP_PRED,          HOR_UP_PRED          } },
        [TM_VP8_PRED]          = { { DC_129_PRED,          VERT_PRED            },
                                   { HOR_PRED,             TM_VP8_PRED          } },
    };
    // Which edges each (converted) mode reads.
    static const struct {
        uint8_t needs_left:1;
        uint8_t needs_top:1;
        uint8_t needs_topleft:1;
        uint8_t needs_topright:1;
        uint8_t invert_left:1;
    } edges[N_INTRA_PRED_MODES] = {
        [VERT_PRED]            = { .needs_top  = 1 },
        [HOR_PRED]             = { .needs_left = 1 },
        [DC_PRED]              = { .needs_top  = 1, .needs_left = 1 },
        [DIAG_DOWN_LEFT_PRED]  = { .needs_top  = 1, .needs_topright = 1 },
        [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
                                   .needs_topleft = 1 },
        [VERT_RIGHT_PRED]      = { .needs_left = 1, .needs_top = 1,
                                   .needs_topleft = 1 },
        [HOR_DOWN_PRED]        = { .needs_left = 1, .needs_top = 1,
                                   .needs_topleft = 1 },
        [VERT_LEFT_PRED]       = { .needs_top  = 1, .needs_topright = 1 },
        [HOR_UP_PRED]          = { .needs_left = 1, .invert_left = 1 },
        [TM_VP8_PRED]          = { .needs_left = 1, .needs_top = 1,
                                   .needs_topleft = 1 },
        [LEFT_DC_PRED]         = { .needs_left = 1 },
        [TOP_DC_PRED]          = { .needs_top  = 1 },
        [DC_128_PRED]          = { 0 },
        [DC_127_PRED]          = { 0 },
        [DC_129_PRED]          = { 0 }
    };

    av_assert2(mode >= 0 && mode < 10);
    mode = mode_conv[mode][have_left][have_top];
    if (edges[mode].needs_top) {
        uint8_t *top, *topleft;
        int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
        int n_px_need_tr = 0;

        if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
            n_px_need_tr = 4;

        // if top of sb64-row, use s->intra_pred_data[] instead of
        // dst[-stride] for intra prediction (it contains pre- instead of
        // post-loopfilter data)
        if (have_top) {
            top = !(row & 7) && !y ?
                s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
                y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
            if (have_left)
                topleft = !(row & 7) && !y ?
                    s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
                    y == 0 || x == 0 ? &dst_edge[-stride_edge] :
                    &dst_inner[-stride_inner];
        }

        // Fast path: every pixel the predictor needs is already contiguous in
        // the picture, so just point *a at it instead of copying.
        if (have_top &&
            (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
            (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
            n_px_need + n_px_need_tr <= n_px_have) {
            *a = top;
        } else {
            if (have_top) {
                if (n_px_need <= n_px_have) {
                    memcpy(*a, top, n_px_need * bytesperpixel);
                } else {
// Bpp-generic helpers: each compiles to a plain byte operation for 8-bit
// content and a 16-bit (aligned) variant for 10/12-bit content.
#define memset_bpp(c, i1, v, i2, num) do { \
    if (bytesperpixel == 1) { \
        memset(&(c)[(i1)], (v)[(i2)], (num)); \
    } else { \
        int n, val = AV_RN16A(&(v)[(i2) * 2]); \
        for (n = 0; n < (num); n++) { \
            AV_WN16A(&(c)[((i1) + n) * 2], val); \
        } \
    } \
} while (0)
                    // Not enough real pixels: replicate the last available one.
                    memcpy(*a, top, n_px_have * bytesperpixel);
                    memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
                }
            } else {
#define memset_val(c, val, num) do { \
    if (bytesperpixel == 1) { \
        memset((c), (val), (num)); \
    } else { \
        int n; \
        for (n = 0; n < (num); n++) { \
            AV_WN16A(&(c)[n * 2], (val)); \
        } \
    } \
} while (0)
                // No top row at all: fill with the "127" constant (scaled to bpp).
                memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
            }
            if (edges[mode].needs_topleft) {
                if (have_left && have_top) {
#define assign_bpp(c, i1, v, i2) do { \
    if (bytesperpixel == 1) { \
        (c)[(i1)] = (v)[(i2)]; \
    } else { \
        AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
    } \
} while (0)
                    assign_bpp(*a, -1, topleft, -1);
                } else {
#define assign_val(c, i, v) do { \
    if (bytesperpixel == 1) { \
        (c)[(i)] = (v); \
    } else { \
        AV_WN16A(&(c)[(i) * 2], (v)); \
    } \
} while (0)
                    // Synthetic top-left: 129 if a top row exists, 127 otherwise.
                    assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
                }
            }
            if (tx == TX_4X4 && edges[mode].needs_topright) {
                if (have_top && have_right &&
                    n_px_need + n_px_need_tr <= n_px_have) {
                    memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
                } else {
                    // Replicate the last top pixel into the top-right segment.
                    memset_bpp(*a, 4, *a, 3, 4);
                }
            }
        }
    }
    if (edges[mode].needs_left) {
        if (have_left) {
            int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
            uint8_t *dst = x == 0 ? dst_edge : dst_inner;
            ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;

            // HOR_UP reads the left edge top-to-bottom (invert_left), every
            // other mode reads it bottom-to-top; pad by replication when the
            // block overhangs the picture.
            if (edges[mode].invert_left) {
                if (n_px_need <= n_px_have) {
                    for (i = 0; i < n_px_need; i++)
                        assign_bpp(l, i, &dst[i * stride], -1);
                } else {
                    for (i = 0; i < n_px_have; i++)
                        assign_bpp(l, i, &dst[i * stride], -1);
                    memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
                }
            } else {
                if (n_px_need <= n_px_have) {
                    for (i = 0; i < n_px_need; i++)
                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
                } else {
                    for (i = 0; i < n_px_have; i++)
                        assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
                    memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
                }
            }
        } else {
            // No left column: fill with the "129" constant (scaled to bpp).
            memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
        }
    }

    return mode;
}
215
+
216
/**
 * Reconstruct one intra-coded block: for every transform block inside it,
 * build the prediction edges (check_intra_mode), run the intra predictor,
 * and, when coefficients were coded (eob != 0), add the inverse transform of
 * the residual.  Luma plane first, then both chroma planes.
 *
 * @param y_off / uv_off  byte offsets of this block into the current frame's
 *                        luma / chroma planes (used for the "edge" pointers
 *                        that read neighbouring, already-reconstructed pixels).
 * @param bytesperpixel   1 for 8-bit, 2 for 10/12-bit; compile-time constant.
 */
static av_always_inline void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off,
                                         ptrdiff_t uv_off, int bytesperpixel)
{
    VP9Context *s = avctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;
    // w4/h4: block dimensions in 4-pixel units; step1d: transform size in the
    // same units; step: number of 4x4 sub-blocks covered per transform block.
    int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
    int h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
    // Clamp so blocks overhanging the frame edge only reconstruct the visible part.
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
    int uvstep1d = 1 << b->uvtx, p;
    // dst walks the (possibly temporary) working buffer, dst_r the real frame.
    uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
    // Scratch buffers for the top (a_buf) and left (l) prediction edges.
    LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
    LOCAL_ALIGNED_32(uint8_t, l, [64]);

    for (n = 0, y = 0; y < end_y; y += step1d) {
        uint8_t *ptr = dst, *ptr_r = dst_r;
        for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
                               ptr_r += 4 * step1d * bytesperpixel, n += step) {
            // Sub-8x8 blocks with 4x4 transforms carry one mode per 4x4 unit.
            int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
                               y * 2 + x : 0];
            uint8_t *a = &a_buf[32];
            enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
            // 16-bit eob load for transforms larger than 8x8.
            int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

            mode = check_intra_mode(s, mode, &a, ptr_r,
                                    s->s.frames[CUR_FRAME].tf.f->linesize[0],
                                    ptr, s->y_stride, l,
                                    col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
            s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
            if (eob)
                s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
                                           s->block + 16 * n * bytesperpixel, eob);
        }
        dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
        dst   += 4 * step1d * s->y_stride;
    }

    // U/V
    w4    >>= s->ss_h;
    end_x >>= s->ss_h;
    end_y >>= s->ss_v;
    step = 1 << (b->uvtx * 2);
    for (p = 0; p < 2; p++) {
        dst   = s->dst[1 + p];
        dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
        for (n = 0, y = 0; y < end_y; y += uvstep1d) {
            uint8_t *ptr = dst, *ptr_r = dst_r;
            for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
                                   ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
                // Chroma uses a single mode for the whole block, and DCT_DCT only.
                int mode = b->uvmode;
                uint8_t *a = &a_buf[32];
                int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                mode = check_intra_mode(s, mode, &a, ptr_r,
                                        s->s.frames[CUR_FRAME].tf.f->linesize[1],
                                        ptr, s->uv_stride, l, col, x, w4, row, y,
                                        b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
                s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
                if (eob)
                    s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                    s->uvblock[p] + 16 * n * bytesperpixel, eob);
            }
            dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
            dst   += 4 * uvstep1d * s->uv_stride;
        }
    }
}
285
+
286
/* Public 8 bits-per-component entry point; instantiates the
 * bytesperpixel == 1 specialization of intra_recon(). */
void ff_vp9_intra_recon_8bpp(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
{
    intra_recon(avctx, y_off, uv_off, 1);
}
290
+
291
/* Public 10/12 bits-per-component entry point (2 bytes per pixel). */
void ff_vp9_intra_recon_16bpp(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
{
    intra_recon(avctx, y_off, uv_off, 2);
}
295
+
296
/**
 * Motion compensation for one luma block when the reference frame has the
 * same resolution as the current frame (no MV scaling).
 *
 * Waits (frame-threading) until the reference has decoded enough rows, then
 * either filters straight from the reference or, when the subpel filter
 * would read outside the picture, from an edge-emulated copy in
 * s->edge_emu_buffer (stride 160).
 *
 * mv is in 1/8-pel units; mc[!!mx][!!my] selects the h/v/hv/copy filter.
 */
static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
                                              uint8_t *dst, ptrdiff_t dst_stride,
                                              const uint8_t *ref, ptrdiff_t ref_stride,
                                              ThreadFrame *ref_frame,
                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
                                              int bw, int bh, int w, int h, int bytesperpixel)
{
    int mx = mv->x, my = mv->y, th;

    // Integer part of the MV moves the read position ...
    y += my >> 3;
    x += mx >> 3;
    ref += y * ref_stride + x * bytesperpixel;
    // ... the fractional part (1/8-pel) selects the subpel filter phase.
    mx &= 7;
    my &= 7;
    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
    // we use +7 because the last 7 pixels of each sbrow can be changed in
    // the longest loopfilter of the next sbrow
    th = (y + bh + 4 * !!my + 7) >> 6;
    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    // The arm/aarch64 _hv filters read one more row than what actually is
    // needed, so switch to emulated edge one pixel sooner vertically
    // (!!my * 5) than horizontally (!!mx * 4).
    if (x < !!mx * 3 || y < !!my * 3 ||
        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
                                 160, ref_stride,
                                 bw + !!mx * 7, bh + !!my * 7,
                                 x - !!mx * 3, y - !!my * 3, w, h);
        ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
        ref_stride = 160;
    }
    // Filter phases are passed in 1/16-pel units, hence the << 1.
    mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
}
330
+
331
/**
 * Motion compensation for both chroma planes (U and V share the same MV and
 * geometry) when the reference frame matches the current resolution.
 *
 * The luma MV is promoted to chroma precision per-axis: on a non-subsampled
 * axis it is doubled so that the fractional part is in 1/16-pel units either
 * way.  Edge handling mirrors mc_luma_unscaled(), done once per plane.
 */
static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
                                                uint8_t *dst_u, uint8_t *dst_v,
                                                ptrdiff_t dst_stride,
                                                const uint8_t *ref_u, ptrdiff_t src_stride_u,
                                                const uint8_t *ref_v, ptrdiff_t src_stride_v,
                                                ThreadFrame *ref_frame,
                                                ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
                                                int bw, int bh, int w, int h, int bytesperpixel)
{
    // Chroma MVs are in 1/16-pel on subsampled axes; double on full-res axes.
    int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;

    y += my >> 4;
    x += mx >> 4;
    ref_u += y * src_stride_u + x * bytesperpixel;
    ref_v += y * src_stride_v + x * bytesperpixel;
    mx &= 15;
    my &= 15;
    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
    // we use +7 because the last 7 pixels of each sbrow can be changed in
    // the longest loopfilter of the next sbrow
    th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    // The arm/aarch64 _hv filters read one more row than what actually is
    // needed, so switch to emulated edge one pixel sooner vertically
    // (!!my * 5) than horizontally (!!mx * 4).
    if (x < !!mx * 3 || y < !!my * 3 ||
        x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
        // Out-of-picture reads: run each plane through the edge emulator
        // (stride 160), reusing the same scratch buffer sequentially.
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
                                 160, src_stride_u,
                                 bw + !!mx * 7, bh + !!my * 7,
                                 x - !!mx * 3, y - !!my * 3, w, h);
        ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
        mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);

        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
                                 160, src_stride_v,
                                 bw + !!mx * 7, bh + !!my * 7,
                                 x - !!mx * 3, y - !!my * 3, w, h);
        ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
        mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
    } else {
        mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
        mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
    }
}
378
+
379
// Glue for instantiating the *unscaled* inter-prediction functions from
// vp9_mc_template.c.  mc_luma_dir/mc_chroma_dir are the hook names the
// template expands; here they drop the scaling-related parameters and call
// the mc_*_unscaled() helpers.  The template is included twice to produce
// inter_pred_8bpp() and inter_pred_16bpp().
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
                    px, py, pw, ph, bw, bh, w, h, i) \
    mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                     mv, bw, bh, w, h, bytesperpixel)
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
    mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                       row, col, mv, bw, bh, w, h, bytesperpixel)
#define SCALED 0
// 8bpp instantiation: inter_pred_8bpp()
#define FN(x) x##_8bpp
#define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c"
#undef FN
#undef BYTES_PER_PIXEL
// 16bpp instantiation: inter_pred_16bpp()
#define FN(x) x##_16bpp
#define BYTES_PER_PIXEL 2
#include "vp9_mc_template.c"
#undef mc_luma_dir
#undef mc_chroma_dir
#undef FN
#undef BYTES_PER_PIXEL
#undef SCALED
401
+
402
/**
 * Motion compensation for one luma block when the reference frame may have a
 * different resolution than the current frame.
 *
 * If the dimensions actually match, this falls through to the unscaled path.
 * Otherwise the MV is clipped to a window around the visible block
 * (px/py/pw/ph describe the block position/size in 8-pel units), mapped into
 * reference coordinates with the 14-bit scale[] factors, and the block is
 * filtered with the scaled-MC function smc using per-axis step[] increments.
 * Edge emulation uses a 288-byte stride because the scaled source footprint
 * (refbw_m1/refbh_m1 + 8 taps) can exceed the unscaled 160-byte one.
 */
static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
                                            vp9_mc_func (*mc)[2],
                                            uint8_t *dst, ptrdiff_t dst_stride,
                                            const uint8_t *ref, ptrdiff_t ref_stride,
                                            ThreadFrame *ref_frame,
                                            ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
                                            int px, int py, int pw, int ph,
                                            int bw, int bh, int w, int h, int bytesperpixel,
                                            const uint16_t *scale, const uint8_t *step)
{
    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
        // Same resolution after all: no MV scaling needed.
        mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
                         y, x, in_mv, bw, bh, w, h, bytesperpixel);
    } else {
// Map a current-frame coordinate/MV component into reference-frame units
// using the Q14 scale factor for that axis (dim: 0 = x, 1 = y).
#define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
    int mx, my;
    int refbw_m1, refbh_m1;
    int th;
    VP56mv mv;

    mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
    mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
    // BUG libvpx seems to scale the two components separately. This introduces
    // rounding errors but we have to reproduce them to be exactly compatible
    // with the output from libvpx...
    mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
    my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);

    // Split into integer position and 1/16-pel filter phase.
    y = my >> 4;
    x = mx >> 4;
    ref += y * ref_stride + x * bytesperpixel;
    mx &= 15;
    my &= 15;
    // Last reference pixel (minus one) the scaled filter will touch per axis.
    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
    // we use +7 because the last 7 pixels of each sbrow can be changed in
    // the longest loopfilter of the next sbrow
    th = (y + refbh_m1 + 4 + 7) >> 6;
    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    // The arm/aarch64 _hv filters read one more row than what actually is
    // needed, so switch to emulated edge one pixel sooner vertically
    // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref - 3 * ref_stride - 3 * bytesperpixel,
                                 288, ref_stride,
                                 refbw_m1 + 8, refbh_m1 + 8,
                                 x - 3, y - 3, w, h);
        ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
        ref_stride = 288;
    }
    smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
    }
}
458
+
459
/**
 * Motion compensation for both chroma planes of one inter block when the
 * reference frame may be a different resolution than the current frame.
 *
 * If the reference and current frames have identical dimensions, this is a
 * plain forward to mc_chroma_unscaled(). Otherwise the scaled path is taken:
 * the motion vector is clipped against the visible block extent, scaled into
 * reference-frame coordinates (via the scale_mv macro defined earlier in this
 * file), reference-row decoding progress is awaited for frame threading, and
 * edge emulation is used when the interpolation filter would read outside the
 * reference picture.
 *
 * @param smc           scaled MC function (per filter/size)
 * @param mc            unscaled MC function table (used by the fallback path)
 * @param dst_u/dst_v   destination chroma planes (shared stride dst_stride)
 * @param ref_u/ref_v   reference chroma planes with their own strides
 * @param ref_frame     reference ThreadFrame, awaited for row progress
 * @param y, x          block position in 8x8-chroma units (luma-derived)
 * @param in_mv         input motion vector (luma precision)
 * @param px,py,pw,ph   prediction-block offset/size used for MV clipping
 * @param bw, bh        block width/height in pixels
 * @param w, h          chroma plane dimensions of the reference
 * @param scale, step   per-reference scaling factors and per-pixel step;
 *                      consumed by scale_mv/smc (defined before this block)
 */
static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
                                              vp9_mc_func (*mc)[2],
                                              uint8_t *dst_u, uint8_t *dst_v,
                                              ptrdiff_t dst_stride,
                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
                                              ThreadFrame *ref_frame,
                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
                                              int px, int py, int pw, int ph,
                                              int bw, int bh, int w, int h, int bytesperpixel,
                                              const uint16_t *scale, const uint8_t *step)
{
    // Same-size reference: no scaling needed, use the cheaper unscaled path.
    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
        mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
                           ref_v, src_stride_v, ref_frame,
                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
    } else {
    int mx, my;
    int refbw_m1, refbh_m1;
    int th;
    VP56mv mv;

    // Horizontal component: clip the MV, then scale into reference
    // coordinates. The subsampled (ss_h) and non-subsampled cases differ
    // to reproduce libvpx behavior exactly.
    if (s->ss_h) {
        // BUG https://code.google.com/p/webm/issues/detail?id=820
        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
        mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
    } else {
        mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
        mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
    }
    // Vertical component, mirroring the horizontal logic with ss_v.
    if (s->ss_v) {
        // BUG https://code.google.com/p/webm/issues/detail?id=820
        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
        my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
    } else {
        mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
        my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
    }
#undef scale_mv
    // Split the scaled position into integer pel (y/x) and 1/16-pel
    // subpel phase (mx/my), and advance the reference pointers.
    y = my >> 4;
    x = mx >> 4;
    ref_u += y * src_stride_u + x * bytesperpixel;
    ref_v += y * src_stride_v + x * bytesperpixel;
    mx &= 15;
    my &= 15;
    // Last reference sample (minus one) touched by the stepped filter run.
    refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
    refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
    // FIXME bilinear filter only needs 0/1 pixels, not 3/4
    // we use +7 because the last 7 pixels of each sbrow can be changed in
    // the longest loopfilter of the next sbrow
    // Shift by (6 - ss_v): progress is tracked in 64-pixel luma sbrows,
    // so subsampled chroma rows cover twice as many chroma lines.
    th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
    ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    // The arm/aarch64 _hv filters read one more row than what actually is
    // needed, so switch to emulated edge one pixel sooner vertically
    // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
    if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
        // Filter would read outside the reference plane: build a padded
        // copy (stride 288) in edge_emu_buffer for each plane in turn.
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref_u - 3 * src_stride_u - 3 * bytesperpixel,
                                 288, src_stride_u,
                                 refbw_m1 + 8, refbh_m1 + 8,
                                 x - 3, y - 3, w, h);
        ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
        smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);

        // The buffer is reused for V; U must be filtered before this copy.
        s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                 ref_v - 3 * src_stride_v - 3 * bytesperpixel,
                                 288, src_stride_v,
                                 refbw_m1 + 8, refbh_m1 + 8,
                                 x - 3, y - 3, w, h);
        ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
        smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
    } else {
        smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
        smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
    }
    }
}
537
+
538
/*
 * Instantiate the scaled inter-prediction functions from the shared MC
 * template. The mc_luma_dir/mc_chroma_dir macros bind the template's generic
 * MC calls to the *_scaled helpers above, passing the per-reference mvscale
 * and mvstep tables. The template is included twice, once per bit depth,
 * producing inter_pred_scaled_8bpp() and inter_pred_scaled_16bpp().
 */
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
                    px, py, pw, ph, bw, bh, w, h, i) \
    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define SCALED 1
#define FN(x) x##_scaled_8bpp
#define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c"
#undef FN
#undef BYTES_PER_PIXEL
#define FN(x) x##_scaled_16bpp
#define BYTES_PER_PIXEL 2
#include "vp9_mc_template.c"
#undef mc_luma_dir
#undef mc_chroma_dir
#undef FN
#undef BYTES_PER_PIXEL
#undef SCALED
562
+
563
/**
 * Reconstruct one inter-coded block: run motion-compensated prediction,
 * then (unless the block is skipped) add the decoded residual via the
 * inverse transforms for the luma and both chroma planes.
 *
 * Dispatches to the scaled prediction path when any used reference frame
 * has a nonzero mvscale (i.e. differs in size from the current frame),
 * and to the 8bpp/16bpp variant by bytesperpixel. Always inlined so the
 * bytesperpixel compare is resolved at compile time in the two public
 * wrappers below.
 */
static av_always_inline void inter_recon(AVCodecContext *avctx, int bytesperpixel)
{
    VP9Context *s = avctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;

    // Scaled path if reference 0 (or reference 1 for compound prediction)
    // uses MV scaling.
    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
        if (bytesperpixel == 1) {
            inter_pred_scaled_8bpp(avctx);
        } else {
            inter_pred_scaled_16bpp(avctx);
        }
    } else {
        if (bytesperpixel == 1) {
            inter_pred_8bpp(avctx);
        } else {
            inter_pred_16bpp(avctx);
        }
    }

    if (!b->skip) {
        /* mostly copied intra_recon() */

        // Block extent in 4x4 units, clipped to the frame edge; step1d is
        // the transform size in 4x4 units, step the coefficient stride per
        // transform.
        int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
        int h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
        int end_x = FFMIN(2 * (s->cols - col), w4);
        int end_y = FFMIN(2 * (s->rows - row), h4);
        // Lossless mode selects a separate (WHT) entry in the itxfm table.
        int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
        int uvstep1d = 1 << b->uvtx, p;
        uint8_t *dst = s->dst[0];

        // y itxfm add
        for (n = 0, y = 0; y < end_y; y += step1d) {
            uint8_t *ptr = dst;
            for (x = 0; x < end_x; x += step1d,
                 ptr += 4 * step1d * bytesperpixel, n += step) {
                // For transforms > 8x8 the eob is stored as a 16-bit value.
                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

                if (eob)
                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
                                                  s->block + 16 * n * bytesperpixel, eob);
            }
            dst += 4 * s->y_stride * step1d;
        }

        // uv itxfm add
        end_x >>= s->ss_h;
        end_y >>= s->ss_v;
        step = 1 << (b->uvtx * 2);
        for (p = 0; p < 2; p++) {
            dst = s->dst[p + 1];
            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
                uint8_t *ptr = dst;
                for (x = 0; x < end_x; x += uvstep1d,
                     ptr += 4 * uvstep1d * bytesperpixel, n += step) {
                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                    if (eob)
                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                        s->uvblock[p] + 16 * n * bytesperpixel, eob);
                }
                dst += 4 * uvstep1d * s->uv_stride;
            }
        }
    }
}
629
+
630
/* Public 8-bits-per-component entry point; the constant bytesperpixel lets
 * the always-inline inter_recon() specialize at compile time. */
void ff_vp9_inter_recon_8bpp(AVCodecContext *avctx)
{
    inter_recon(avctx, 1);
}
634
+
635
/* Public 16-bits-per-component (10/12 bpp content) entry point; constant
 * bytesperpixel = 2 specializes the inlined inter_recon(). */
void ff_vp9_inter_recon_16bpp(AVCodecContext *avctx)
{
    inter_recon(avctx, 2);
}