Browse code

Merge remote-tracking branch 'qatar/master'

* qatar/master:
dnxhddec: avoid a branch in 10-bit decode_dct_block()
H.264: Add optimizations to predict x86 assembly.
riff: Add mpgv MPEG-2 fourcc
add Flash Screen Video 2 decoder

Conflicts:
configure
doc/general.texi
libavcodec/Makefile
libavcodec/allcodecs.c
libavcodec/version.h

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2011/07/23 23:47:10
Showing 12 changed files
... ...
@@ -6,6 +6,7 @@ version next:
6 6
 - openal input device added
7 7
 - boxblur filter added
8 8
 - BWF muxer
9
+- Flash Screen Video 2 decoder
9 10
 
10 11
 
11 12
 version 0.8:
... ...
@@ -1280,6 +1280,7 @@ flac_encoder_select="golomb lpc"
1280 1280
 flashsv_decoder_select="zlib"
1281 1281
 flashsv_encoder_select="zlib"
1282 1282
 flashsv2_encoder_select="zlib"
1283
+flashsv2_decoder_select="zlib"
1283 1284
 flv_decoder_select="h263_decoder"
1284 1285
 flv_encoder_select="h263_encoder"
1285 1286
 fraps_decoder_select="huffman"
... ...
@@ -401,7 +401,7 @@ following image formats are supported:
401 401
     @tab experimental lossless codec (fourcc: FFV1)
402 402
 @item Flash Screen Video v1  @tab  X  @tab  X
403 403
     @tab fourcc: FSV1
404
-@item Flash Screen Video v2  @tab  X
404
+@item Flash Screen Video v2  @tab  X  @tab  X
405 405
 @item Flash Video (FLV)      @tab  X  @tab  X
406 406
     @tab Sorenson H.263 used in Flash
407 407
 @item Fraps                  @tab     @tab  X
... ...
@@ -153,6 +153,7 @@ OBJS-$(CONFIG_FLAC_ENCODER)            += flacenc.o flacdata.o flac.o vorbis_dat
153 153
 OBJS-$(CONFIG_FLASHSV_DECODER)         += flashsv.o
154 154
 OBJS-$(CONFIG_FLASHSV_ENCODER)         += flashsvenc.o
155 155
 OBJS-$(CONFIG_FLASHSV2_ENCODER)        += flashsv2enc.o
156
+OBJS-$(CONFIG_FLASHSV2_DECODER)        += flashsv.o
156 157
 OBJS-$(CONFIG_FLIC_DECODER)            += flicvideo.o
157 158
 OBJS-$(CONFIG_FOURXM_DECODER)          += 4xm.o
158 159
 OBJS-$(CONFIG_FRAPS_DECODER)           += fraps.o
... ...
@@ -109,7 +109,7 @@ void avcodec_register_all(void)
109 109
     REGISTER_ENCDEC  (FFV1, ffv1);
110 110
     REGISTER_ENCDEC  (FFVHUFF, ffvhuff);
111 111
     REGISTER_ENCDEC  (FLASHSV, flashsv);
112
-    REGISTER_ENCODER (FLASHSV2, flashsv2);
112
+    REGISTER_ENCDEC (FLASHSV2, flashsv2);
113 113
     REGISTER_DECODER (FLIC, flic);
114 114
     REGISTER_ENCDEC  (FLV, flv);
115 115
     REGISTER_DECODER (FOURXM, fourxm);
... ...
@@ -246,7 +246,7 @@ static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx,
246 246
         //av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j);
247 247
         //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weight %d\n", level, weight_matrix[i]);
248 248
         level = (2*level+1) * qscale * weight_matrix[i];
249
-        if (weight_matrix[i] != level_bias)
249
+        if (level_bias < 32 || weight_matrix[i] != level_bias)
250 250
             level += level_bias;
251 251
         level >>= level_shift;
252 252
 
... ...
@@ -25,6 +25,8 @@
25 25
  * Flash Screen Video decoder
26 26
  * @author Alex Beregszaszi
27 27
  * @author Benjamin Larsson
28
+ * @author Daniel Verkamp
29
+ * @author Konstantin Shishkov
28 30
  *
29 31
  * A description of the bitstream format for Flash Screen Video version 1/2
30 32
  * is part of the SWF File Format Specification (version 10), which can be
... ...
@@ -35,9 +37,17 @@
35 35
 #include <stdlib.h>
36 36
 #include <zlib.h>
37 37
 
38
+#include "libavutil/intreadwrite.h"
38 39
 #include "avcodec.h"
40
+#include "bytestream.h"
39 41
 #include "get_bits.h"
40 42
 
43
+typedef struct BlockInfo {
44
+    uint8_t *pos;
45
+    int      size;
46
+    int      unp_size;
47
+} BlockInfo;
48
+
41 49
 typedef struct FlashSVContext {
42 50
     AVCodecContext *avctx;
43 51
     AVFrame         frame;
... ...
@@ -46,9 +56,50 @@ typedef struct FlashSVContext {
46 46
     uint8_t        *tmpblock;
47 47
     int             block_size;
48 48
     z_stream        zstream;
49
+    int             ver;
50
+    const uint32_t *pal;
51
+    int             is_keyframe;
52
+    uint8_t        *keyframedata;
53
+    uint8_t        *keyframe;
54
+    BlockInfo      *blocks;
55
+    uint8_t        *deflate_block;
56
+    int             deflate_block_size;
57
+    int             color_depth;
58
+    int             zlibprime_curr, zlibprime_prev;
59
+    int             diff_start, diff_height;
49 60
 } FlashSVContext;
50 61
 
51 62
 
63
+static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
64
+                         int h, int w, int stride, const uint32_t *pal)
65
+{
66
+    int x, y;
67
+    const uint8_t *orig_src = sptr;
68
+
69
+    for (y = dx+h; y > dx; y--) {
70
+        uint8_t *dst = dptr + (y * stride) + dy * 3;
71
+        for (x = 0; x < w; x++) {
72
+            if (*sptr & 0x80) {
73
+                /* 15-bit color */
74
+                unsigned c = AV_RB16(sptr) & ~0x8000;
75
+                unsigned b =  c        & 0x1F;
76
+                unsigned g = (c >>  5) & 0x1F;
77
+                unsigned r =  c >> 10;
78
+                /* 000aaabb -> aaabbaaa  */
79
+                *dst++ = (b << 3) | (b >> 2);
80
+                *dst++ = (g << 3) | (g >> 2);
81
+                *dst++ = (r << 3) | (r >> 2);
82
+                sptr += 2;
83
+            } else {
84
+                /* palette index */
85
+                uint32_t c = pal[*sptr++];
86
+                bytestream_put_le24(&dst, c);
87
+            }
88
+        }
89
+    }
90
+    return sptr - orig_src;
91
+}
92
+
52 93
 static av_cold int flashsv_decode_init(AVCodecContext *avctx)
53 94
 {
54 95
     FlashSVContext *s = avctx->priv_data;
... ...
@@ -71,9 +122,42 @@ static av_cold int flashsv_decode_init(AVCodecContext *avctx)
71 71
 }
72 72
 
73 73
 
74
+static void flashsv2_prime(FlashSVContext *s, uint8_t *src,
75
+                           int size, int unp_size)
76
+{
77
+    z_stream zs;
78
+
79
+    zs.zalloc = NULL;
80
+    zs.zfree  = NULL;
81
+    zs.opaque = NULL;
82
+
83
+    s->zstream.next_in   = src;
84
+    s->zstream.avail_in  = size;
85
+    s->zstream.next_out  = s->tmpblock;
86
+    s->zstream.avail_out = s->block_size * 3;
87
+    inflate(&s->zstream, Z_SYNC_FLUSH);
88
+
89
+    deflateInit(&zs, 0);
90
+    zs.next_in   = s->tmpblock;
91
+    zs.avail_in  = s->block_size * 3 - s->zstream.avail_out;
92
+    zs.next_out  = s->deflate_block;
93
+    zs.avail_out = s->deflate_block_size;
94
+    deflate(&zs, Z_SYNC_FLUSH);
95
+    deflateEnd(&zs);
96
+
97
+    inflateReset(&s->zstream);
98
+
99
+    s->zstream.next_in   = s->deflate_block;
100
+    s->zstream.avail_in  = s->deflate_block_size - zs.avail_out;
101
+    s->zstream.next_out  = s->tmpblock;
102
+    s->zstream.avail_out = s->block_size * 3;
103
+    inflate(&s->zstream, Z_SYNC_FLUSH);
104
+}
105
+
74 106
 static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
75 107
                                 GetBitContext *gb, int block_size,
76
-                                int width, int height, int x_pos, int y_pos)
108
+                                int width, int height, int x_pos, int y_pos,
109
+                                int blk_idx)
77 110
 {
78 111
     struct FlashSVContext *s = avctx->priv_data;
79 112
     uint8_t *line = s->tmpblock;
... ...
@@ -82,6 +166,10 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
82 82
     if (ret != Z_OK) {
83 83
         //return -1;
84 84
     }
85
+    if (s->zlibprime_curr || s->zlibprime_prev) {
86
+        flashsv2_prime(s, s->blocks[blk_idx].pos, s->blocks[blk_idx].size,
87
+                       s->blocks[blk_idx].unp_size);
88
+    }
85 89
     s->zstream.next_in   = avpkt->data + get_bits_count(gb) / 8;
86 90
     s->zstream.avail_in  = block_size;
87 91
     s->zstream.next_out  = s->tmpblock;
... ...
@@ -96,19 +184,48 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt,
96 96
     if (ret != Z_OK && ret != Z_STREAM_END) {
97 97
         //return -1;
98 98
     }
99
-    /* Flash Screen Video stores the image upside down, so copy
100
-     * lines to destination in reverse order. */
101
-    for (k = 1; k <= height; k++) {
102
-        memcpy(s->frame.data[0] + x_pos * 3 +
103
-               (s->image_height - y_pos - k) * s->frame.linesize[0],
104
-               line, width * 3);
105
-        /* advance source pointer to next line */
106
-        line += width * 3;
99
+
100
+    if (s->is_keyframe) {
101
+        s->blocks[blk_idx].pos      = s->keyframedata + (get_bits_count(gb) / 8);
102
+        s->blocks[blk_idx].size     = block_size;
103
+        s->blocks[blk_idx].unp_size = s->block_size * 3 - s->zstream.avail_out;
104
+    }
105
+    if (!s->color_depth) {
106
+        /* Flash Screen Video stores the image upside down, so copy
107
+         * lines to destination in reverse order. */
108
+        for (k = 1; k <= s->diff_height; k++) {
109
+            memcpy(s->frame.data[0] + x_pos * 3 +
110
+                   (s->image_height - y_pos - s->diff_start - k) * s->frame.linesize[0],
111
+                   line, width * 3);
112
+            /* advance source pointer to next line */
113
+            line += width * 3;
114
+        }
115
+    } else {
116
+        /* hybrid 15-bit/palette mode */
117
+        decode_hybrid(s->tmpblock, s->frame.data[0],
118
+                      s->image_height - (y_pos + 1 + s->diff_start + s->diff_height),
119
+                      x_pos, s->diff_height, width,
120
+                      s->frame.linesize[0], s->pal);
107 121
     }
108 122
     skip_bits_long(gb, 8 * block_size); /* skip the consumed bits */
109 123
     return 0;
110 124
 }
111 125
 
126
+static int calc_deflate_block_size(int tmpblock_size)
127
+{
128
+    z_stream zstream;
129
+    int size;
130
+
131
+    zstream.zalloc = Z_NULL;
132
+    zstream.zfree  = Z_NULL;
133
+    zstream.opaque = Z_NULL;
134
+    if (deflateInit(&zstream, 0) != Z_OK)
135
+        return -1;
136
+    size = deflateBound(&zstream, tmpblock_size);
137
+    deflateEnd(&zstream);
138
+
139
+    return size;
140
+}
112 141
 
113 142
 static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
114 143
                                 int *data_size, AVPacket *avpkt)
... ...
@@ -132,6 +249,18 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
132 132
     s->block_height = 16 * (get_bits(&gb,  4) + 1);
133 133
     s->image_height =       get_bits(&gb, 12);
134 134
 
135
+    if (s->ver == 2) {
136
+        skip_bits(&gb, 6);
137
+        if (get_bits1(&gb)) {
138
+            av_log_missing_feature(avctx, "iframe", 1);
139
+            return AVERROR_PATCHWELCOME;
140
+        }
141
+        if (get_bits1(&gb)) {
142
+            av_log_missing_feature(avctx, "custom palette", 1);
143
+            return AVERROR_PATCHWELCOME;
144
+        }
145
+    }
146
+
135 147
     /* calculate number of blocks and size of border (partial) blocks */
136 148
     h_blocks = s->image_width  / s->block_width;
137 149
     h_part   = s->image_width  % s->block_width;
... ...
@@ -141,11 +270,25 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
141 141
     /* the block size could change between frames, make sure the buffer
142 142
      * is large enough, if not, get a larger one */
143 143
     if (s->block_size < s->block_width * s->block_height) {
144
-        av_free(s->tmpblock);
145
-        if ((s->tmpblock = av_malloc(3 * s->block_width * s->block_height)) == NULL) {
144
+        int tmpblock_size = 3 * s->block_width * s->block_height;
145
+
146
+        s->tmpblock = av_realloc(s->tmpblock, tmpblock_size);
147
+        if (!s->tmpblock) {
146 148
             av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n");
147 149
             return AVERROR(ENOMEM);
148 150
         }
151
+        if (s->ver == 2) {
152
+            s->deflate_block_size = calc_deflate_block_size(tmpblock_size);
153
+            if (s->deflate_block_size <= 0) {
154
+                av_log(avctx, AV_LOG_ERROR, "Can't determine deflate buffer size.\n");
155
+                return -1;
156
+            }
157
+            s->deflate_block = av_realloc(s->deflate_block, s->deflate_block_size);
158
+            if (!s->deflate_block) {
159
+                av_log(avctx, AV_LOG_ERROR, "Can't allocate deflate buffer.\n");
160
+                return AVERROR(ENOMEM);
161
+            }
162
+        }
149 163
     }
150 164
     s->block_size = s->block_width * s->block_height;
151 165
 
... ...
@@ -164,6 +307,16 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
164 164
         return AVERROR_INVALIDDATA;
165 165
     }
166 166
 
167
+    /* we care about keyframes only in Screen Video v2 */
168
+    s->is_keyframe = (avpkt->flags & AV_PKT_FLAG_KEY) && (s->ver == 2);
169
+    if (s->is_keyframe) {
170
+        s->keyframedata = av_realloc(s->keyframedata, avpkt->size);
171
+        memcpy(s->keyframedata, avpkt->data, avpkt->size);
172
+        s->blocks = av_realloc(s->blocks,
173
+                               (v_blocks + !!v_part) * (h_blocks + !!h_part)
174
+                               * sizeof(s->blocks[0]));
175
+    }
176
+
167 177
     av_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n",
168 178
             s->image_width, s->image_height, s->block_width, s->block_height,
169 179
             h_blocks, v_blocks, h_part, v_part);
... ...
@@ -187,25 +340,90 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data,
187 187
         for (i = 0; i < h_blocks + (h_part ? 1 : 0); i++) {
188 188
             int x_pos = i * s->block_width; // horizontal position in frame
189 189
             int cur_blk_width = (i < h_blocks) ? s->block_width : h_part;
190
+            int has_diff = 0;
190 191
 
191 192
             /* get the size of the compressed zlib chunk */
192 193
             int size = get_bits(&gb, 16);
194
+
195
+            s->color_depth    = 0;
196
+            s->zlibprime_curr = 0;
197
+            s->zlibprime_prev = 0;
198
+            s->diff_start     = 0;
199
+            s->diff_height    = cur_blk_height;
200
+
193 201
             if (8 * size > get_bits_left(&gb)) {
194 202
                 avctx->release_buffer(avctx, &s->frame);
195 203
                 s->frame.data[0] = NULL;
196 204
                 return AVERROR_INVALIDDATA;
197 205
             }
198 206
 
207
+            if (s->ver == 2 && size) {
208
+                skip_bits(&gb, 3);
209
+                s->color_depth    = get_bits(&gb, 2);
210
+                has_diff          = get_bits1(&gb);
211
+                s->zlibprime_curr = get_bits1(&gb);
212
+                s->zlibprime_prev = get_bits1(&gb);
213
+
214
+                if (s->color_depth != 0 && s->color_depth != 2) {
215
+                    av_log(avctx, AV_LOG_ERROR,
216
+                           "%dx%d invalid color depth %d\n", i, j, s->color_depth);
217
+                    return -1;
218
+                }
219
+
220
+                if (has_diff) {
221
+                    s->diff_start  = get_bits(&gb, 8);
222
+                    s->diff_height = get_bits(&gb, 8);
223
+                    av_log(avctx, AV_LOG_DEBUG,
224
+                           "%dx%d diff start %d height %d\n",
225
+                           i, j, s->diff_start, s->diff_height);
226
+                    size -= 2;
227
+                }
228
+
229
+                if (s->zlibprime_prev)
230
+                    av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_prev\n", i, j);
231
+
232
+                if (s->zlibprime_curr) {
233
+                    int col = get_bits(&gb, 8);
234
+                    int row = get_bits(&gb, 8);
235
+                    av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_curr %dx%d\n", i, j, col, row);
236
+                    size -= 2;
237
+                    av_log_missing_feature(avctx, "zlibprime_curr", 1);
238
+                    return AVERROR_PATCHWELCOME;
239
+                }
240
+                size--; // account for flags byte
241
+            }
242
+
243
+            if (has_diff) {
244
+                int k;
245
+                int off = (s->image_height - y_pos - 1) * s->frame.linesize[0];
246
+
247
+                for (k = 0; k < cur_blk_height; k++)
248
+                    memcpy(s->frame.data[0] + off - k*s->frame.linesize[0] + x_pos*3,
249
+                           s->keyframe + off - k*s->frame.linesize[0] + x_pos*3,
250
+                           cur_blk_width * 3);
251
+            }
252
+
199 253
             /* skip unchanged blocks, which have size 0 */
200 254
             if (size) {
201 255
                 if (flashsv_decode_block(avctx, avpkt, &gb, size,
202 256
                                          cur_blk_width, cur_blk_height,
203
-                                         x_pos, y_pos))
257
+                                         x_pos, y_pos,
258
+                                         i + j * (h_blocks + !!h_part)))
204 259
                     av_log(avctx, AV_LOG_ERROR,
205 260
                            "error in decompression of block %dx%d\n", i, j);
206 261
             }
207 262
         }
208 263
     }
264
+    if (s->is_keyframe && s->ver == 2) {
265
+        if (!s->keyframe) {
266
+            s->keyframe = av_malloc(s->frame.linesize[0] * avctx->height);
267
+            if (!s->keyframe) {
268
+                av_log(avctx, AV_LOG_ERROR, "Cannot allocate image data\n");
269
+                return AVERROR(ENOMEM);
270
+            }
271
+        }
272
+        memcpy(s->keyframe, s->frame.data[0], s->frame.linesize[0] * avctx->height);
273
+    }
209 274
 
210 275
     *data_size = sizeof(AVFrame);
211 276
     *(AVFrame*)data = s->frame;
... ...
@@ -234,6 +452,7 @@ static av_cold int flashsv_decode_end(AVCodecContext *avctx)
234 234
 }
235 235
 
236 236
 
237
+#if CONFIG_FLASHSV_DECODER
237 238
 AVCodec ff_flashsv_decoder = {
238 239
     .name           = "flashsv",
239 240
     .type           = AVMEDIA_TYPE_VIDEO,
... ...
@@ -246,3 +465,67 @@ AVCodec ff_flashsv_decoder = {
246 246
     .pix_fmts       = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE},
247 247
     .long_name      = NULL_IF_CONFIG_SMALL("Flash Screen Video v1"),
248 248
 };
249
+#endif /* CONFIG_FLASHSV_DECODER */
250
+
251
+#if CONFIG_FLASHSV2_DECODER
252
+static const uint32_t ff_flashsv2_default_palette[128] = {
253
+    0x000000, 0x333333, 0x666666, 0x999999, 0xCCCCCC, 0xFFFFFF,
254
+    0x330000, 0x660000, 0x990000, 0xCC0000, 0xFF0000, 0x003300,
255
+    0x006600, 0x009900, 0x00CC00, 0x00FF00, 0x000033, 0x000066,
256
+    0x000099, 0x0000CC, 0x0000FF, 0x333300, 0x666600, 0x999900,
257
+    0xCCCC00, 0xFFFF00, 0x003333, 0x006666, 0x009999, 0x00CCCC,
258
+    0x00FFFF, 0x330033, 0x660066, 0x990099, 0xCC00CC, 0xFF00FF,
259
+    0xFFFF33, 0xFFFF66, 0xFFFF99, 0xFFFFCC, 0xFF33FF, 0xFF66FF,
260
+    0xFF99FF, 0xFFCCFF, 0x33FFFF, 0x66FFFF, 0x99FFFF, 0xCCFFFF,
261
+    0xCCCC33, 0xCCCC66, 0xCCCC99, 0xCCCCFF, 0xCC33CC, 0xCC66CC,
262
+    0xCC99CC, 0xCCFFCC, 0x33CCCC, 0x66CCCC, 0x99CCCC, 0xFFCCCC,
263
+    0x999933, 0x999966, 0x9999CC, 0x9999FF, 0x993399, 0x996699,
264
+    0x99CC99, 0x99FF99, 0x339999, 0x669999, 0xCC9999, 0xFF9999,
265
+    0x666633, 0x666699, 0x6666CC, 0x6666FF, 0x663366, 0x669966,
266
+    0x66CC66, 0x66FF66, 0x336666, 0x996666, 0xCC6666, 0xFF6666,
267
+    0x333366, 0x333399, 0x3333CC, 0x3333FF, 0x336633, 0x339933,
268
+    0x33CC33, 0x33FF33, 0x663333, 0x993333, 0xCC3333, 0xFF3333,
269
+    0x003366, 0x336600, 0x660033, 0x006633, 0x330066, 0x663300,
270
+    0x336699, 0x669933, 0x993366, 0x339966, 0x663399, 0x996633,
271
+    0x6699CC, 0x99CC66, 0xCC6699, 0x66CC99, 0x9966CC, 0xCC9966,
272
+    0x99CCFF, 0xCCFF99, 0xFF99CC, 0x99FFCC, 0xCC99FF, 0xFFCC99,
273
+    0x111111, 0x222222, 0x444444, 0x555555, 0xAAAAAA, 0xBBBBBB,
274
+    0xDDDDDD, 0xEEEEEE
275
+};
276
+
277
+static av_cold int flashsv2_decode_init(AVCodecContext *avctx)
278
+{
279
+    FlashSVContext *s = avctx->priv_data;
280
+    flashsv_decode_init(avctx);
281
+    s->pal = ff_flashsv2_default_palette;
282
+    s->ver = 2;
283
+
284
+    return 0;
285
+}
286
+
287
+static av_cold int flashsv2_decode_end(AVCodecContext *avctx)
288
+{
289
+    FlashSVContext *s = avctx->priv_data;
290
+
291
+    av_freep(&s->keyframedata);
292
+    av_freep(&s->blocks);
293
+    av_freep(&s->keyframe);
294
+    av_freep(&s->deflate_block);
295
+    flashsv_decode_end(avctx);
296
+
297
+    return 0;
298
+}
299
+
300
+AVCodec ff_flashsv2_decoder = {
301
+    .name           = "flashsv2",
302
+    .type           = AVMEDIA_TYPE_VIDEO,
303
+    .id             = CODEC_ID_FLASHSV2,
304
+    .priv_data_size = sizeof(FlashSVContext),
305
+    .init           = flashsv2_decode_init,
306
+    .close          = flashsv2_decode_end,
307
+    .decode         = flashsv_decode_frame,
308
+    .capabilities   = CODEC_CAP_DR1,
309
+    .pix_fmts       = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE},
310
+    .long_name      = NULL_IF_CONFIG_SMALL("Flash Screen Video v2"),
311
+};
312
+#endif /* CONFIG_FLASHSV2_DECODER */
... ...
@@ -21,7 +21,7 @@
21 21
 #define AVCODEC_VERSION_H
22 22
 
23 23
 #define LIBAVCODEC_VERSION_MAJOR 53
24
-#define LIBAVCODEC_VERSION_MINOR  8
24
+#define LIBAVCODEC_VERSION_MINOR  9
25 25
 #define LIBAVCODEC_VERSION_MICRO  0
26 26
 
27 27
 #define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
... ...
@@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3
2611 2611
     punpckldq m1, [r1]
2612 2612
     movq      m2, m1
2613 2613
     movq      m3, m1
2614
-    movq      m4, m1
2615 2614
     psllq     m1, 8
2616 2615
     pxor      m2, m1
2617 2616
     psrlq     m2, 8
2618
-    pxor      m3, m2
2619
-    PRED4x4_LOWPASS m0, m1, m3, m4, m5
2617
+    pxor      m2, m3
2618
+    PRED4x4_LOWPASS m0, m1, m2, m3, m4
2620 2619
     lea       r1, [r0+r2*2]
2621 2620
     psrlq     m0, 8
2622 2621
     movd      [r0+r2*1], m0
... ...
@@ -27,8 +27,6 @@
27 27
 
28 28
 SECTION_RODATA
29 29
 
30
-SECTION .text
31
-
32 30
 cextern pw_16
33 31
 cextern pw_8
34 32
 cextern pw_4
... ...
@@ -42,6 +40,8 @@ pw_512:       times 8 dw 512
42 42
 pd_17:        times 4 dd 17
43 43
 pd_16:        times 4 dd 16
44 44
 
45
+SECTION .text
46
+
45 47
 ; dest, left, right, src
46 48
 ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
47 49
 %macro PRED4x4_LOWPASS 4
... ...
@@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3
64 64
     movq      m3, [r0]
65 65
     punpckhdq m1, m2
66 66
     PALIGNR   m3, m1, 10, m1
67
-    mova      m1, m3
68 67
     movhps    m4, [r1+r2*1-8]
69
-    PALIGNR   m3, m4, 14, m4
70
-    mova      m2, m3
68
+    PALIGNR   m0, m3, m4, 14, m4
71 69
     movhps    m4, [r1+r2*2-8]
72
-    PALIGNR   m3, m4, 14, m4
73
-    PRED4x4_LOWPASS m0, m3, m1, m2
70
+    PALIGNR   m2, m0, m4, 14, m4
71
+    PRED4x4_LOWPASS m0, m2, m3, m0
74 72
     movq      [r1+r2*2], m0
75 73
     psrldq    m0, 2
76 74
     movq      [r1+r2*1], m0
... ...
@@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6
104 104
     pavgw   m5, m0
105 105
     movhps  m1, [r0+r2*1-8]
106 106
     PALIGNR m0, m1, 14, m1      ; ....t3t2t1t0ltl0
107
-    mova    m1, m0
108 107
     movhps  m2, [r0+r2*2-8]
109
-    PALIGNR m0, m2, 14, m2      ; ..t3t2t1t0ltl0l1
110
-    mova    m2, m0
108
+    PALIGNR m1, m0, m2, 14, m2  ; ..t3t2t1t0ltl0l1
111 109
     movhps  m3, [r1+r2*1-8]
112
-    PALIGNR m0, m3, 14, m3      ; t3t2t1t0ltl0l1l2
113
-    PRED4x4_LOWPASS m3, m1, m0, m2
114
-    pslldq  m1, m3, 12
115
-    psrldq  m3, 4
110
+    PALIGNR m2, m1, m3, 14, m3  ; t3t2t1t0ltl0l1l2
111
+    PRED4x4_LOWPASS m1, m0, m2, m1
112
+    pslldq  m0, m1, 12
113
+    psrldq  m1, 4
116 114
     movq    [r0+r2*1], m5
117
-    movq    [r0+r2*2], m3
118
-    PALIGNR m5, m1, 14, m2
119
-    pslldq  m1, 2
115
+    movq    [r0+r2*2], m1
116
+    PALIGNR m5, m0, 14, m2
117
+    pslldq  m0, 2
120 118
     movq    [r1+r2*1], m5
121
-    PALIGNR m3, m1, 14, m1
122
-    movq    [r1+r2*2], m3
119
+    PALIGNR m1, m0, 14, m0
120
+    movq    [r1+r2*2], m1
123 121
     RET
124 122
 %endmacro
125 123
 
... ...
@@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3
152 152
     punpckhdq  m1, m2          ; l0 l1 l2 l3
153 153
     punpckhqdq m1, m0          ; t2 t1 t0 lt l0 l1 l2 l3
154 154
     psrldq     m0, m1, 4       ; .. .. t2 t1 t0 lt l0 l1
155
-    psrldq     m2, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
156
-    pavgw      m5, m1, m2
157
-    PRED4x4_LOWPASS m3, m1, m0, m2
155
+    psrldq     m3, m1, 2       ; .. t2 t1 t0 lt l0 l1 l2
156
+    pavgw      m5, m1, m3
157
+    PRED4x4_LOWPASS m3, m1, m0, m3
158 158
     punpcklwd  m5, m3
159 159
     psrldq     m3, 8
160 160
     PALIGNR    m3, m5, 12, m4
... ...
@@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3
220 220
 ;-----------------------------------------------------------------------------
221 221
 ; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
222 222
 ;-----------------------------------------------------------------------------
223
-;TODO: more AVX here
224 223
 %macro PRED4x4_DL 1
225 224
 cglobal pred4x4_down_left_10_%1, 3,3
226 225
     sub        r0, r2
227
-    movq       m1, [r0]
228
-    movhps     m1, [r1]
229
-    pslldq     m5, m1, 2
230
-    pxor       m2, m5, m1
231
-    psrldq     m2, 2
232
-    pxor       m3, m1, m2
233
-    PRED4x4_LOWPASS m0, m5, m3, m1
226
+    movq       m0, [r0]
227
+    movhps     m0, [r1]
228
+    psrldq     m2, m0, 2
229
+    pslldq     m3, m0, 2
230
+    pshufhw    m2, m2, 10100100b
231
+    PRED4x4_LOWPASS m0, m3, m2, m0
234 232
     lea        r1, [r0+r2*2]
235 233
     movhps     [r1+r2*2], m0
236 234
     psrldq     m0, 2
... ...
@@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3
257 257
     sub        r0, r2
258 258
     movu       m1, [r0]
259 259
     movhps     m1, [r1]
260
-    psrldq     m3, m1, 2
260
+    psrldq     m0, m1, 2
261 261
     psrldq     m2, m1, 4
262
-    pavgw      m4, m3, m1
263
-    PRED4x4_LOWPASS m0, m1, m2, m3
262
+    pavgw      m4, m0, m1
263
+    PRED4x4_LOWPASS m0, m1, m2, m0
264 264
     lea        r1, [r0+r2*2]
265 265
     movq       [r0+r2*1], m4
266 266
     movq       [r0+r2*2], m0
... ...
@@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3
298 298
     pavgw     m2, m0
299 299
 
300 300
     pshufw    m5, m0, 11111110b
301
-    PRED4x4_LOWPASS m3, m0, m5, m1
301
+    PRED4x4_LOWPASS m1, m0, m5, m1
302 302
     movq      m6, m2
303
-    punpcklwd m6, m3
303
+    punpcklwd m6, m1
304 304
     movq      [r0+r2*1], m6
305 305
     psrlq     m2, 16
306
-    psrlq     m3, 16
307
-    punpcklwd m2, m3
306
+    psrlq     m1, 16
307
+    punpcklwd m2, m1
308 308
     movq      [r0+r2*2], m2
309 309
     psrlq     m2, 32
310 310
     movd      [r1+r2*1], m2
... ...
@@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2
333 333
 ;-----------------------------------------------------------------------------
334 334
 INIT_XMM
335 335
 cglobal pred8x8_horizontal_10_sse2, 2,3
336
-    mov          r2, 4
336
+    mov         r2d, 4
337 337
 .loop:
338 338
     movq         m0, [r0+r1*0-8]
339 339
     movq         m1, [r0+r1*1-8]
... ...
@@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
344 344
     mova  [r0+r1*0], m0
345 345
     mova  [r0+r1*1], m1
346 346
     lea          r0, [r0+r1*2]
347
-    dec          r2
347
+    dec          r2d
348 348
     jg .loop
349 349
     REP_RET
350 350
 
... ...
@@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3
362 362
 %endmacro
363 363
 
364 364
 %macro PRED8x8_DC 2
365
-cglobal pred8x8_dc_10_%1, 2,4
366
-%ifdef ARCH_X86_64
367
-%define t0 r10
368
-%else
369
-%define t0 r0m
370
-%endif
365
+cglobal pred8x8_dc_10_%1, 2,6
371 366
     sub         r0, r1
372 367
     pxor        m4, m4
373 368
     movq        m0, [r0+0]
374 369
     movq        m1, [r0+8]
375
-    HADDW       m0, m2
376
-    mov         t0, r0
377
-    HADDW       m1, m2
370
+%if mmsize==16
371
+    punpcklwd   m0, m1
372
+    movhlps     m1, m0
373
+    paddw       m0, m1
374
+%else
375
+    pshufw      m2, m0, 00001110b
376
+    pshufw      m3, m1, 00001110b
377
+    paddw       m0, m2
378
+    paddw       m1, m3
379
+    punpcklwd   m0, m1
380
+%endif
381
+    %2          m2, m0, 00001110b
382
+    paddw       m0, m2
378 383
 
384
+    lea         r5, [r1*3]
385
+    lea         r4, [r0+r1*4]
379 386
     movzx      r2d, word [r0+r1*1-2]
380 387
     movzx      r3d, word [r0+r1*2-2]
381
-    lea         r0, [r0+r1*2]
382 388
     add        r2d, r3d
383
-    movzx      r3d, word [r0+r1*1-2]
389
+    movzx      r3d, word [r0+r5*1-2]
384 390
     add        r2d, r3d
385
-    movzx      r3d, word [r0+r1*2-2]
391
+    movzx      r3d, word [r4-2]
386 392
     add        r2d, r3d
387
-    lea         r0, [r0+r1*2]
388 393
     movd        m2, r2d            ; s2
389 394
 
390
-    movzx      r2d, word [r0+r1*1-2]
391
-    movzx      r3d, word [r0+r1*2-2]
392
-    lea         r0, [r0+r1*2]
395
+    movzx      r2d, word [r4+r1*1-2]
396
+    movzx      r3d, word [r4+r1*2-2]
393 397
     add        r2d, r3d
394
-    movzx      r3d, word [r0+r1*1-2]
398
+    movzx      r3d, word [r4+r5*1-2]
395 399
     add        r2d, r3d
396
-    movzx      r3d, word [r0+r1*2-2]
400
+    movzx      r3d, word [r4+r1*4-2]
397 401
     add        r2d, r3d
398 402
     movd        m3, r2d            ; s3
399 403
 
400
-    punpcklwd   m0, m1
401
-    mov         r0, t0
402 404
     punpcklwd   m2, m3
403 405
     punpckldq   m0, m2            ; s0, s1, s2, s3
404 406
     %2          m3, m0, 11110110b ; s2, s1, s3, s3
405
-    lea         r2, [r1+r1*2]
406 407
     %2          m0, m0, 01110100b ; s0, s1, s3, s1
407 408
     paddw       m0, m3
408
-    lea         r3, [r0+r1*4]
409 409
     psrlw       m0, 2
410 410
     pavgw       m0, m4            ; s0+s2, s1, s3, s1+s3
411
-%ifidn %1, sse2
411
+%if mmsize==16
412 412
     punpcklwd   m0, m0
413 413
     pshufd      m3, m0, 11111010b
414 414
     punpckldq   m0, m0
... ...
@@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4
421 421
 %endif
422 422
     MOV8   r0+r1*1, m1, m2
423 423
     MOV8   r0+r1*2, m1, m2
424
-    MOV8   r0+r2*1, m1, m2
424
+    MOV8   r0+r5*1, m1, m2
425 425
     MOV8   r0+r1*4, m1, m2
426
-    MOV8   r3+r1*1, m3, m4
427
-    MOV8   r3+r1*2, m3, m4
428
-    MOV8   r3+r2*1, m3, m4
429
-    MOV8   r3+r1*4, m3, m4
426
+    MOV8   r4+r1*1, m3, m4
427
+    MOV8   r4+r1*2, m3, m4
428
+    MOV8   r4+r5*1, m3, m4
429
+    MOV8   r4+r1*4, m3, m4
430 430
     RET
431 431
 %endmacro
432 432
 
... ...
@@ -438,39 +432,29 @@ PRED8x8_DC sse2  , pshuflw
438 438
 ;-----------------------------------------------------------------------------
439 439
 ; void pred8x8_top_dc(pixel *src, int stride)
440 440
 ;-----------------------------------------------------------------------------
441
-%macro PRED8x8_TOP_DC 2
442
-cglobal pred8x8_top_dc_10_%1, 2,4
441
+INIT_XMM
442
+cglobal pred8x8_top_dc_10_sse2, 2,4
443 443
     sub         r0, r1
444
-    movq        m0, [r0+0]
445
-    movq        m1, [r0+8]
446
-    HADDW       m0, m2
447
-    HADDW       m1, m3
448
-    lea         r2, [r1+r1*2]
449
-    paddw       m0, [pw_2]
450
-    paddw       m1, [pw_2]
444
+    mova        m0, [r0]
445
+    pshuflw     m1, m0, 0x4e
446
+    pshufhw     m1, m1, 0x4e
447
+    paddw       m0, m1
448
+    pshuflw     m1, m0, 0xb1
449
+    pshufhw     m1, m1, 0xb1
450
+    paddw       m0, m1
451
+    lea         r2, [r1*3]
451 452
     lea         r3, [r0+r1*4]
453
+    paddw       m0, [pw_2]
452 454
     psrlw       m0, 2
453
-    psrlw       m1, 2
454
-    %2          m0, m0, 0
455
-    %2          m1, m1, 0
456
-%ifidn %1, sse2
457
-    punpcklqdq  m0, m1
458
-%endif
459
-    MOV8   r0+r1*1, m0, m1
460
-    MOV8   r0+r1*2, m0, m1
461
-    MOV8   r0+r2*1, m0, m1
462
-    MOV8   r0+r1*4, m0, m1
463
-    MOV8   r3+r1*1, m0, m1
464
-    MOV8   r3+r1*2, m0, m1
465
-    MOV8   r3+r2*1, m0, m1
466
-    MOV8   r3+r1*4, m0, m1
455
+    mova [r0+r1*1], m0
456
+    mova [r0+r1*2], m0
457
+    mova [r0+r2*1], m0
458
+    mova [r0+r1*4], m0
459
+    mova [r3+r1*1], m0
460
+    mova [r3+r1*2], m0
461
+    mova [r3+r2*1], m0
462
+    mova [r3+r1*4], m0
467 463
     RET
468
-%endmacro
469
-
470
-INIT_MMX
471
-PRED8x8_TOP_DC mmxext, pshufw
472
-INIT_XMM
473
-PRED8x8_TOP_DC sse2  , pshuflw
474 464
 
475 465
 ;-----------------------------------------------------------------------------
476 466
 ; void pred8x8_plane(pixel *src, int stride)
... ...
@@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2  , pshuflw
478 478
 INIT_XMM
479 479
 cglobal pred8x8_plane_10_sse2, 2,7,7
480 480
     sub       r0, r1
481
-    lea       r2, [r1+r1*2]
481
+    lea       r2, [r1*3]
482 482
     lea       r3, [r0+r1*4]
483 483
     mova      m2, [r0]
484 484
     pmaddwd   m2, [pw_m32101234]
... ...
@@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
500 500
     movzx    r5d, word [r3+r2*1-2] ; src[6*stride-1]
501 501
     movzx    r6d, word [r0+r1*1-2] ; src[0*stride-1]
502 502
     sub      r5d, r6d
503
-    lea      r5d, [r5+r5*2]
503
+    lea      r5d, [r5*3]
504 504
     add      r4d, r5d
505 505
     movzx    r6d, word [r3+r1*4-2] ; src[7*stride-1]
506 506
     movzx    r5d, word [r0+r1*0-2] ; src[ -stride-1]
... ...
@@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7
540 540
 ;-----------------------------------------------------------------------------
541 541
 %macro PRED8x8L_128_DC 1
542 542
 cglobal pred8x8l_128_dc_10_%1, 4,4
543
-    mova      m0, [pw_512]
544
-    lea       r1, [r3+r3*2]
543
+    mova      m0, [pw_512] ; (1<<(BIT_DEPTH-1))
544
+    lea       r1, [r3*3]
545 545
     lea       r2, [r0+r3*4]
546 546
     MOV8 r0+r3*0, m0, m0
547 547
     MOV8 r0+r3*1, m0, m0
... ...
@@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2
565 565
 %macro PRED8x8L_TOP_DC 1
566 566
 cglobal pred8x8l_top_dc_10_%1, 4,4,6
567 567
     sub         r0, r3
568
-    pxor        m7, m7
569
-    mova        m0, [r0-16]
570
-    mova        m3, [r0]
571
-    mova        m1, [r0+16]
572
-    mova        m2, m3
573
-    mova        m4, m3
574
-    PALIGNR     m2, m0, 14, m0
575
-    PALIGNR     m1, m4,  2, m4
576
-    test        r1, r1 ; top_left
577
-    jz .fix_lt_2
578
-    test        r2, r2 ; top_right
579
-    jz .fix_tr_1
580
-    jmp .body
581
-.fix_lt_2:
582
-    mova        m5, m3
583
-    pxor        m5, m2
584
-    pslldq      m5, 14
585
-    psrldq      m5, 14
586
-    pxor        m2, m5
587
-    test        r2, r2 ; top_right
588
-    jnz .body
589
-.fix_tr_1:
590
-    mova        m5, m3
591
-    pxor        m5, m1
592
-    psrldq      m5, 14
593
-    pslldq      m5, 14
594
-    pxor        m1, m5
595
-.body
596
-    lea         r1, [r3+r3*2]
568
+    mova        m0, [r0]
569
+    shr        r1d, 14
570
+    shr        r2d, 13
571
+    neg         r1
572
+    pslldq      m1, m0, 2
573
+    psrldq      m2, m0, 2
574
+    pinsrw      m1, [r0+r1], 0
575
+    pinsrw      m2, [r0+r2+14], 7
576
+    lea         r1, [r3*3]
597 577
     lea         r2, [r0+r3*4]
598
-    PRED4x4_LOWPASS m0, m2, m1, m3
578
+    PRED4x4_LOWPASS m0, m2, m1, m0
599 579
     HADDW       m0, m1
600 580
     paddw       m0, [pw_4]
601 581
     psrlw       m0, 3
... ...
@@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6
612 612
 %endmacro
613 613
 
614 614
 INIT_XMM
615
-%define PALIGNR PALIGNR_MMX
616 615
 PRED8x8L_TOP_DC sse2
617
-%define PALIGNR PALIGNR_SSSE3
618
-PRED8x8L_TOP_DC ssse3
616
+%ifdef HAVE_AVX
617
+INIT_AVX
618
+PRED8x8L_TOP_DC avx
619
+%endif
619 620
 
620 621
 ;-----------------------------------------------------------------------------
621 622
 ;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
622 623
 ;-----------------------------------------------------------------------------
623 624
 ;TODO: see if scalar is faster
624 625
 %macro PRED8x8L_DC 1
625
-cglobal pred8x8l_dc_10_%1, 4,5,8
626
+cglobal pred8x8l_dc_10_%1, 4,6,6
626 627
     sub         r0, r3
627
-    lea         r4, [r0+r3*2]
628
-    mova        m0, [r0+r3*1-16]
629
-    punpckhwd   m0, [r0+r3*0-16]
630
-    mova        m1, [r4+r3*1-16]
631
-    punpckhwd   m1, [r0+r3*2-16]
632
-    mov         r4, r0
628
+    lea         r4, [r0+r3*4]
629
+    lea         r5, [r3*3]
630
+    mova        m0, [r0+r3*2-16]
631
+    punpckhwd   m0, [r0+r3*1-16]
632
+    mova        m1, [r4+r3*0-16]
633
+    punpckhwd   m1, [r0+r5*1-16]
633 634
     punpckhdq   m1, m0
634
-    lea         r0, [r0+r3*4]
635
-    mova        m2, [r0+r3*1-16]
636
-    punpckhwd   m2, [r0+r3*0-16]
637
-    lea         r0, [r0+r3*2]
638
-    mova        m3, [r0+r3*1-16]
639
-    punpckhwd   m3, [r0+r3*0-16]
635
+    mova        m2, [r4+r3*2-16]
636
+    punpckhwd   m2, [r4+r3*1-16]
637
+    mova        m3, [r4+r3*4-16]
638
+    punpckhwd   m3, [r4+r5*1-16]
640 639
     punpckhdq   m3, m2
641 640
     punpckhqdq  m3, m1
642
-    lea         r0, [r0+r3*2]
643
-    mova        m0, [r0+r3*0-16]
644
-    mova        m1, [r4]
645
-    mov         r0, r4
646
-    mova        m4, m3
647
-    mova        m2, m3
648
-    PALIGNR     m4, m0, 14, m0
649
-    PALIGNR     m1, m2,  2, m2
650
-    test        r1, r1
651
-    jnz .do_left
652
-.fix_lt_1:
653
-    mova        m5, m3
654
-    pxor        m5, m4
655
-    psrldq      m5, 14
656
-    pslldq      m5, 12
657
-    pxor        m1, m5
658
-    jmp .do_left
659
-.fix_lt_2:
660
-    mova        m5, m3
661
-    pxor        m5, m2
662
-    pslldq      m5, 14
663
-    psrldq      m5, 14
664
-    pxor        m2, m5
665
-    test        r2, r2
666
-    jnz .body
667
-.fix_tr_1:
668
-    mova        m5, m3
669
-    pxor        m5, m1
670
-    psrldq      m5, 14
671
-    pslldq      m5, 14
672
-    pxor        m1, m5
673
-    jmp .body
674
-.do_left:
675
-    mova        m0, m4
676
-    PRED4x4_LOWPASS m2, m1, m4, m3
677
-    mova        m4, m0
678
-    mova        m7, m2
679
-    PRED4x4_LOWPASS m1, m3, m0, m4
680
-    pslldq      m1, 14
681
-    PALIGNR     m7, m1, 14, m3
682
-    mova        m0, [r0-16]
683
-    mova        m3, [r0]
684
-    mova        m1, [r0+16]
685
-    mova        m2, m3
686
-    mova        m4, m3
687
-    PALIGNR     m2, m0, 14, m0
688
-    PALIGNR     m1, m4,  2, m4
689
-    test        r1, r1
690
-    jz .fix_lt_2
691
-    test        r2, r2
692
-    jz .fix_tr_1
693
-.body
694
-    lea         r1, [r3+r3*2]
695
-    PRED4x4_LOWPASS m6, m2, m1, m3
696
-    HADDW       m7, m0
697
-    HADDW       m6, m0
698
-    lea         r2, [r0+r3*4]
699
-    paddw       m7, [pw_8]
700
-    paddw       m7, m6
701
-    psrlw       m7, 4
702
-    SPLATW      m7, m7
703
-    mova [r0+r3*1], m7
704
-    mova [r0+r3*2], m7
705
-    mova [r0+r1*1], m7
706
-    mova [r0+r3*4], m7
707
-    mova [r2+r3*1], m7
708
-    mova [r2+r3*2], m7
709
-    mova [r2+r1*1], m7
710
-    mova [r2+r3*4], m7
641
+    mova        m0, [r0]
642
+    shr        r1d, 14
643
+    shr        r2d, 13
644
+    neg         r1
645
+    pslldq      m1, m0, 2
646
+    psrldq      m2, m0, 2
647
+    pinsrw      m1, [r0+r1], 0
648
+    pinsrw      m2, [r0+r2+14], 7
649
+    not         r1
650
+    and         r1, r3
651
+    pslldq      m4, m3, 2
652
+    psrldq      m5, m3, 2
653
+    pshuflw     m4, m4, 11100101b
654
+    pinsrw      m5, [r0+r1-2], 7
655
+    PRED4x4_LOWPASS m3, m4, m5, m3
656
+    PRED4x4_LOWPASS m0, m2, m1, m0
657
+    paddw       m0, m3
658
+    HADDW       m0, m1
659
+    paddw       m0, [pw_8]
660
+    psrlw       m0, 4
661
+    SPLATW      m0, m0
662
+    mova [r0+r3*1], m0
663
+    mova [r0+r3*2], m0
664
+    mova [r0+r5*1], m0
665
+    mova [r0+r3*4], m0
666
+    mova [r4+r3*1], m0
667
+    mova [r4+r3*2], m0
668
+    mova [r4+r5*1], m0
669
+    mova [r4+r3*4], m0
711 670
     RET
712 671
 %endmacro
713 672
 
714 673
 INIT_XMM
715
-%define PALIGNR PALIGNR_MMX
716 674
 PRED8x8L_DC sse2
717
-%define PALIGNR PALIGNR_SSSE3
718
-PRED8x8L_DC ssse3
675
+%ifdef HAVE_AVX
676
+INIT_AVX
677
+PRED8x8L_DC avx
678
+%endif
719 679
 
720 680
 ;-----------------------------------------------------------------------------
721 681
 ; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
... ...
@@ -723,36 +647,17 @@ PRED8x8L_DC ssse3
723 723
 %macro PRED8x8L_VERTICAL 1
724 724
 cglobal pred8x8l_vertical_10_%1, 4,4,6
725 725
     sub         r0, r3
726
-    mova        m0, [r0-16]
727
-    mova        m3, [r0]
728
-    mova        m1, [r0+16]
729
-    mova        m2, m3
730
-    mova        m4, m3
731
-    PALIGNR     m2, m0, 14, m0
732
-    PALIGNR     m1, m4,  2, m4
733
-    test        r1, r1 ; top_left
734
-    jz .fix_lt_2
735
-    test        r2, r2 ; top_right
736
-    jz .fix_tr_1
737
-    jmp .body
738
-.fix_lt_2:
739
-    mova        m5, m3
740
-    pxor        m5, m2
741
-    pslldq      m5, 14
742
-    psrldq      m5, 14
743
-    pxor        m2, m5
744
-    test        r2, r2 ; top_right
745
-    jnz .body
746
-.fix_tr_1:
747
-    mova        m5, m3
748
-    pxor        m5, m1
749
-    psrldq      m5, 14
750
-    pslldq      m5, 14
751
-    pxor        m1, m5
752
-.body
753
-    lea         r1, [r3+r3*2]
726
+    mova        m0, [r0]
727
+    shr        r1d, 14
728
+    shr        r2d, 13
729
+    neg         r1
730
+    pslldq      m1, m0, 2
731
+    psrldq      m2, m0, 2
732
+    pinsrw      m1, [r0+r1], 0
733
+    pinsrw      m2, [r0+r2+14], 7
734
+    lea         r1, [r3*3]
754 735
     lea         r2, [r0+r3*4]
755
-    PRED4x4_LOWPASS m0, m2, m1, m3
736
+    PRED4x4_LOWPASS m0, m2, m1, m0
756 737
     mova [r0+r3*1], m0
757 738
     mova [r0+r3*2], m0
758 739
     mova [r0+r1*1], m0
... ...
@@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6
765 765
 %endmacro
766 766
 
767 767
 INIT_XMM
768
-%define PALIGNR PALIGNR_MMX
769 768
 PRED8x8L_VERTICAL sse2
770
-%define PALIGNR PALIGNR_SSSE3
771
-PRED8x8L_VERTICAL ssse3
769
+%ifdef HAVE_AVX
770
+INIT_AVX
771
+PRED8x8L_VERTICAL avx
772
+%endif
772 773
 
773 774
 ;-----------------------------------------------------------------------------
774 775
 ; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
775 776
 ;-----------------------------------------------------------------------------
776 777
 %macro PRED8x8L_HORIZONTAL 1
777
-cglobal pred8x8l_horizontal_10_%1, 4,4,8
778
-    sub         r0, r3
779
-    lea         r2, [r0+r3*2]
780
-    mova        m0, [r0+r3*1-16]
781
-    test        r1, r1
782
-    lea         r1, [r0+r3]
783
-    cmovnz      r1, r0
784
-    punpckhwd   m0, [r1+r3*0-16]
785
-    mova        m1, [r2+r3*1-16]
786
-    punpckhwd   m1, [r0+r3*2-16]
787
-    mov         r2, r0
778
+cglobal pred8x8l_horizontal_10_%1, 4,4,5
779
+    mova        m0, [r0-16]
780
+    shr        r1d, 14
781
+    dec         r1
782
+    and         r1, r3
783
+    sub         r1, r3
784
+    punpckhwd   m0, [r0+r1-16]
785
+    mova        m1, [r0+r3*2-16]
786
+    punpckhwd   m1, [r0+r3*1-16]
787
+    lea         r2, [r0+r3*4]
788
+    lea         r1, [r3*3]
788 789
     punpckhdq   m1, m0
789
-    lea         r0, [r0+r3*4]
790
-    mova        m2, [r0+r3*1-16]
791
-    punpckhwd   m2, [r0+r3*0-16]
792
-    lea         r0, [r0+r3*2]
793
-    mova        m3, [r0+r3*1-16]
794
-    punpckhwd   m3, [r0+r3*0-16]
790
+    mova        m2, [r2+r3*0-16]
791
+    punpckhwd   m2, [r0+r1-16]
792
+    mova        m3, [r2+r3*2-16]
793
+    punpckhwd   m3, [r2+r3*1-16]
795 794
     punpckhdq   m3, m2
796 795
     punpckhqdq  m3, m1
797
-    lea         r0, [r0+r3*2]
798
-    mova        m0, [r0+r3*0-16]
799
-    mova        m1, [r1+r3*0-16]
800
-    mov         r0, r2
801
-    mova        m4, m3
802
-    mova        m2, m3
803
-    PALIGNR     m4, m0, 14, m0
804
-    PALIGNR     m1, m2,  2, m2
805
-    mova        m0, m4
806
-    PRED4x4_LOWPASS m2, m1, m4, m3
807
-    mova        m4, m0
808
-    mova        m7, m2
809
-    PRED4x4_LOWPASS m1, m3, m0, m4
810
-    pslldq      m1, 14
811
-    PALIGNR     m7, m1, 14, m3
812
-    lea         r1, [r3+r3*2]
813
-    punpckhwd   m3, m7, m7
814
-    punpcklwd   m7, m7
796
+    PALIGNR     m4, m3, [r2+r1-16], 14, m0
797
+    pslldq      m0, m4, 2
798
+    pshuflw     m0, m0, 11100101b
799
+    PRED4x4_LOWPASS m4, m3, m0, m4
800
+    punpckhwd   m3, m4, m4
801
+    punpcklwd   m4, m4
815 802
     pshufd      m0, m3, 0xff
816 803
     pshufd      m1, m3, 0xaa
817
-    lea         r2, [r0+r3*4]
818 804
     pshufd      m2, m3, 0x55
819 805
     pshufd      m3, m3, 0x00
820
-    pshufd      m4, m7, 0xff
821
-    pshufd      m5, m7, 0xaa
822
-    pshufd      m6, m7, 0x55
823
-    pshufd      m7, m7, 0x00
824
-    mova [r0+r3*1], m0
825
-    mova [r0+r3*2], m1
826
-    mova [r0+r1*1], m2
827
-    mova [r0+r3*4], m3
828
-    mova [r2+r3*1], m4
829
-    mova [r2+r3*2], m5
830
-    mova [r2+r1*1], m6
831
-    mova [r2+r3*4], m7
806
+    mova [r0+r3*0], m0
807
+    mova [r0+r3*1], m1
808
+    mova [r0+r3*2], m2
809
+    mova [r0+r1*1], m3
810
+    pshufd      m0, m4, 0xff
811
+    pshufd      m1, m4, 0xaa
812
+    pshufd      m2, m4, 0x55
813
+    pshufd      m3, m4, 0x00
814
+    mova [r2+r3*0], m0
815
+    mova [r2+r3*1], m1
816
+    mova [r2+r3*2], m2
817
+    mova [r2+r1*1], m3
832 818
     RET
833 819
 %endmacro
834 820
 
... ...
@@ -837,116 +728,68 @@ INIT_XMM
837 837
 PRED8x8L_HORIZONTAL sse2
838 838
 %define PALIGNR PALIGNR_SSSE3
839 839
 PRED8x8L_HORIZONTAL ssse3
840
+%ifdef HAVE_AVX
841
+INIT_AVX
842
+PRED8x8L_HORIZONTAL avx
843
+%endif
840 844
 
841 845
 ;-----------------------------------------------------------------------------
842 846
 ;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
843 847
 ;-----------------------------------------------------------------------------
844 848
 %macro PRED8x8L_DOWN_LEFT 1
845
-cglobal pred8x8l_down_left_10_%1, 4,4,8
849
+cglobal pred8x8l_down_left_10_%1, 4,4,7
846 850
     sub         r0, r3
847
-    mova        m0, [r0-16]
848 851
     mova        m3, [r0]
852
+    shr        r1d, 14
853
+    neg         r1
854
+    shr        r2d, 13
855
+    pslldq      m1, m3, 2
856
+    psrldq      m2, m3, 2
857
+    pinsrw      m1, [r0+r1], 0
858
+    pinsrw      m2, [r0+r2+14], 7
859
+    PRED4x4_LOWPASS m6, m2, m1, m3
860
+    jz .fix_tr ; flags from shr r2d
849 861
     mova        m1, [r0+16]
850
-    mova        m2, m3
851
-    mova        m4, m3
852
-    PALIGNR     m2, m0, 14, m0
853
-    PALIGNR     m1, m4,  2, m4
854
-    test        r1, r1
855
-    jz .fix_lt_2
856
-    test        r2, r2
857
-    jz .fix_tr_1
858
-    jmp .do_top
859
-.fix_lt_2:
860
-    mova        m5, m3
861
-    pxor        m5, m2
862
-    pslldq      m5, 14
863
-    psrldq      m5, 14
864
-    pxor        m2, m5
865
-    test        r2, r2
866
-    jnz .do_top
867
-.fix_tr_1:
868
-    mova        m5, m3
869
-    pxor        m5, m1
870
-    psrldq      m5, 14
871
-    pslldq      m5, 14
872
-    pxor        m1, m5
873
-    jmp .do_top
874
-.fix_tr_2:
875
-    punpckhwd   m3, m3
876
-    pshufd      m1, m3, 0xFF
877
-    jmp .do_topright
878
-.do_top:
879
-    PRED4x4_LOWPASS m4, m2, m1, m3
880
-    mova        m7, m4
881
-    test        r2, r2
882
-    jz .fix_tr_2
883
-    mova        m0, [r0+16]
884
-    mova        m5, m0
885
-    mova        m2, m0
886
-    mova        m4, m0
887
-    psrldq      m5, 14
888
-    PALIGNR     m2, m3, 14, m3
889
-    PALIGNR     m5, m4,  2, m4
890
-    PRED4x4_LOWPASS m1, m2, m5, m0
862
+    psrldq      m5, m1, 2
863
+    PALIGNR     m2, m1, m3, 14, m3
864
+    pshufhw     m5, m5, 10100100b
865
+    PRED4x4_LOWPASS m1, m2, m5, m1
891 866
 .do_topright:
892
-    lea         r1, [r3+r3*2]
893
-    mova        m6, m1
894
-    psrldq      m1, 14
895
-    mova        m4, m1
867
+    lea         r1, [r3*3]
868
+    psrldq      m5, m1, 14
896 869
     lea         r2, [r0+r3*4]
897
-    mova        m2, m6
898
-    PALIGNR     m2, m7,  2, m0
899
-    mova        m3, m6
900
-    PALIGNR     m3, m7, 14, m0
901
-    PALIGNR     m4, m6,  2, m0
902
-    mova        m5, m7
903
-    mova        m1, m7
904
-    mova        m7, m6
905
-    pslldq      m1, 2
906
-    PRED4x4_LOWPASS m0, m1, m2, m5
907
-    PRED4x4_LOWPASS m1, m3, m4, m7
870
+    PALIGNR     m2, m1, m6,  2, m0
871
+    PALIGNR     m3, m1, m6, 14, m0
872
+    PALIGNR     m5, m1,  2, m0
873
+    pslldq      m4, m6, 2
874
+    PRED4x4_LOWPASS m6, m4, m2, m6
875
+    PRED4x4_LOWPASS m1, m3, m5, m1
908 876
     mova [r2+r3*4], m1
909
-    mova        m2, m0
910
-    pslldq      m1, 2
911
-    psrldq      m2, 14
912
-    pslldq      m0, 2
913
-    por         m1, m2
877
+    PALIGNR     m1, m6, 14, m2
878
+    pslldq      m6, 2
914 879
     mova [r2+r1*1], m1
915
-    mova        m2, m0
916
-    pslldq      m1, 2
917
-    psrldq      m2, 14
918
-    pslldq      m0, 2
919
-    por         m1, m2
880
+    PALIGNR     m1, m6, 14, m2
881
+    pslldq      m6, 2
920 882
     mova [r2+r3*2], m1
921
-    mova        m2, m0
922
-    pslldq      m1, 2
923
-    psrldq      m2, 14
924
-    pslldq      m0, 2
925
-    por         m1, m2
883
+    PALIGNR     m1, m6, 14, m2
884
+    pslldq      m6, 2
926 885
     mova [r2+r3*1], m1
927
-    mova        m2, m0
928
-    pslldq      m1, 2
929
-    psrldq      m2, 14
930
-    pslldq      m0, 2
931
-    por         m1, m2
886
+    PALIGNR     m1, m6, 14, m2
887
+    pslldq      m6, 2
932 888
     mova [r0+r3*4], m1
933
-    mova        m2, m0
934
-    pslldq      m1, 2
935
-    psrldq      m2, 14
936
-    pslldq      m0, 2
937
-    por         m1, m2
889
+    PALIGNR     m1, m6, 14, m2
890
+    pslldq      m6, 2
938 891
     mova [r0+r1*1], m1
939
-    mova        m2, m0
940
-    pslldq      m1, 2
941
-    psrldq      m2, 14
942
-    pslldq      m0, 2
943
-    por         m1, m2
892
+    PALIGNR     m1, m6, 14, m2
893
+    pslldq      m6, 2
944 894
     mova [r0+r3*2], m1
945
-    pslldq      m1, 2
946
-    psrldq      m0, 14
947
-    por         m1, m0
895
+    PALIGNR     m1, m6, 14, m6
948 896
     mova [r0+r3*1], m1
949 897
     RET
898
+.fix_tr:
899
+    punpckhwd   m3, m3
900
+    pshufd      m1, m3, 0xFF
901
+    jmp .do_topright
950 902
 %endmacro
951 903
 
952 904
 INIT_XMM
... ...
@@ -954,139 +797,73 @@ INIT_XMM
954 954
 PRED8x8L_DOWN_LEFT sse2
955 955
 %define PALIGNR PALIGNR_SSSE3
956 956
 PRED8x8L_DOWN_LEFT ssse3
957
+%ifdef HAVE_AVX
958
+INIT_AVX
959
+PRED8x8L_DOWN_LEFT avx
960
+%endif
957 961
 
958 962
 ;-----------------------------------------------------------------------------
959
-;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride)
963
+;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
960 964
 ;-----------------------------------------------------------------------------
961 965
 %macro PRED8x8L_DOWN_RIGHT 1
966
+; The H.264 standard forbids this mode when has_topleft is false,
967
+; so has_topleft does not need to be checked here.
962 968
 cglobal pred8x8l_down_right_10_%1, 4,5,8
963 969
     sub         r0, r3
964
-    lea         r4, [r0+r3*2]
970
+    lea         r4, [r0+r3*4]
971
+    lea         r1, [r3*3]
965 972
     mova        m0, [r0+r3*1-16]
966 973
     punpckhwd   m0, [r0+r3*0-16]
967
-    mova        m1, [r4+r3*1-16]
974
+    mova        m1, [r0+r1*1-16]
968 975
     punpckhwd   m1, [r0+r3*2-16]
969
-    mov         r4, r0
970 976
     punpckhdq   m1, m0
971
-    lea         r0, [r0+r3*4]
972
-    mova        m2, [r0+r3*1-16]
973
-    punpckhwd   m2, [r0+r3*0-16]
974
-    lea         r0, [r0+r3*2]
975
-    mova        m3, [r0+r3*1-16]
976
-    punpckhwd   m3, [r0+r3*0-16]
977
+    mova        m2, [r4+r3*1-16]
978
+    punpckhwd   m2, [r4+r3*0-16]
979
+    mova        m3, [r4+r1*1-16]
980
+    punpckhwd   m3, [r4+r3*2-16]
977 981
     punpckhdq   m3, m2
978 982
     punpckhqdq  m3, m1
979
-    lea         r0, [r0+r3*2]
980
-    mova        m0, [r0+r3*0-16]
981
-    mova        m1, [r4]
982
-    mov         r0, r4
983
-    mova        m4, m3
984
-    mova        m2, m3
985
-    PALIGNR     m4, m0, 14, m0
986
-    PALIGNR     m1, m2,  2, m2
987
-    test        r1, r1 ; top_left
988
-    jz .fix_lt_1
989
-.do_left:
990
-    mova        m0, m4
991
-    PRED4x4_LOWPASS m2, m1, m4, m3
992
-    mova        m4, m0
993
-    mova        m7, m2
994
-    mova        m6, m2
995
-    PRED4x4_LOWPASS m1, m3, m0, m4
996
-    pslldq      m1, 14
997
-    PALIGNR     m7, m1, 14, m3
998
-    mova        m0, [r0-16]
983
+    mova        m0, [r4+r3*4-16]
984
+    mova        m1, [r0]
985
+    PALIGNR     m4, m3, m0, 14, m0
986
+    PALIGNR     m1, m3,  2, m2
987
+    pslldq      m0, m4, 2
988
+    pshuflw     m0, m0, 11100101b
989
+    PRED4x4_LOWPASS m6, m1, m4, m3
990
+    PRED4x4_LOWPASS m4, m3, m0, m4
999 991
     mova        m3, [r0]
1000
-    mova        m1, [r0+16]
1001
-    mova        m2, m3
1002
-    mova        m4, m3
1003
-    PALIGNR     m2, m0, 14, m0
1004
-    PALIGNR     m1, m4,  2, m4
1005
-    test        r1, r1 ; top_left
1006
-    jz .fix_lt_2
1007
-    test        r2, r2 ; top_right
1008
-    jz .fix_tr_1
1009
-.do_top:
1010
-    PRED4x4_LOWPASS m4, m2, m1, m3
1011
-    mova        m5, m4
1012
-    jmp .body
1013
-.fix_lt_1:
1014
-    mova        m5, m3
1015
-    pxor        m5, m4
1016
-    psrldq      m5, 14
1017
-    pslldq      m5, 12
1018
-    pxor        m1, m5
1019
-    jmp .do_left
1020
-.fix_lt_2:
1021
-    mova        m5, m3
1022
-    pxor        m5, m2
1023
-    pslldq      m5, 14
1024
-    psrldq      m5, 14
1025
-    pxor        m2, m5
1026
-    test        r2, r2 ; top_right
1027
-    jnz .do_top
1028
-.fix_tr_1:
1029
-    mova        m5, m3
1030
-    pxor        m5, m1
1031
-    psrldq      m5, 14
1032
-    pslldq      m5, 14
1033
-    pxor        m1, m5
1034
-    jmp .do_top
1035
-.body
1036
-    lea         r1, [r3+r3*2]
1037
-    mova        m1, m7
1038
-    mova        m7, m5
1039
-    mova        m5, m6
1040
-    mova        m2, m7
1041
-    lea         r2, [r0+r3*4]
1042
-    PALIGNR     m2, m6,  2, m0
1043
-    mova        m3, m7
1044
-    PALIGNR     m3, m6, 14, m0
1045
-    mova        m4, m7
1046
-    psrldq      m4, 2
1047
-    PRED4x4_LOWPASS m0, m1, m2, m5
1048
-    PRED4x4_LOWPASS m1, m3, m4, m7
1049
-    mova [r2+r3*4], m0
1050
-    mova        m2, m1
1051
-    psrldq      m0, 2
1052
-    pslldq      m2, 14
1053
-    psrldq      m1, 2
1054
-    por         m0, m2
1055
-    mova [r2+r1*1], m0
1056
-    mova        m2, m1
1057
-    psrldq      m0, 2
1058
-    pslldq      m2, 14
1059
-    psrldq      m1, 2
1060
-    por         m0, m2
1061
-    mova [r2+r3*2], m0
1062
-    mova        m2, m1
1063
-    psrldq      m0, 2
1064
-    pslldq      m2, 14
1065
-    psrldq      m1, 2
1066
-    por         m0, m2
1067
-    mova [r2+r3*1], m0
1068
-    mova        m2, m1
1069
-    psrldq      m0, 2
1070
-    pslldq      m2, 14
1071
-    psrldq      m1, 2
1072
-    por         m0, m2
1073
-    mova [r0+r3*4], m0
1074
-    mova        m2, m1
1075
-    psrldq      m0, 2
1076
-    pslldq      m2, 14
1077
-    psrldq      m1, 2
1078
-    por         m0, m2
1079
-    mova [r0+r1*1], m0
1080
-    mova        m2, m1
1081
-    psrldq      m0, 2
1082
-    pslldq      m2, 14
1083
-    psrldq      m1, 2
1084
-    por         m0, m2
1085
-    mova [r0+r3*2], m0
1086
-    psrldq      m0, 2
1087
-    pslldq      m1, 14
1088
-    por         m0, m1
1089
-    mova [r0+r3*1], m0
992
+    shr        r2d, 13
993
+    pslldq      m1, m3, 2
994
+    psrldq      m2, m3, 2
995
+    pinsrw      m1, [r0-2], 0
996
+    pinsrw      m2, [r0+r2+14], 7
997
+    PRED4x4_LOWPASS m3, m2, m1, m3
998
+    PALIGNR     m2, m3, m6,  2, m0
999
+    PALIGNR     m5, m3, m6, 14, m0
1000
+    psrldq      m7, m3, 2
1001
+    PRED4x4_LOWPASS m6, m4, m2, m6
1002
+    PRED4x4_LOWPASS m3, m5, m7, m3
1003
+    mova [r4+r3*4], m6
1004
+    PALIGNR     m3, m6, 14, m2
1005
+    pslldq      m6, 2
1006
+    mova [r0+r3*1], m3
1007
+    PALIGNR     m3, m6, 14, m2
1008
+    pslldq      m6, 2
1009
+    mova [r0+r3*2], m3
1010
+    PALIGNR     m3, m6, 14, m2
1011
+    pslldq      m6, 2
1012
+    mova [r0+r1*1], m3
1013
+    PALIGNR     m3, m6, 14, m2
1014
+    pslldq      m6, 2
1015
+    mova [r0+r3*4], m3
1016
+    PALIGNR     m3, m6, 14, m2
1017
+    pslldq      m6, 2
1018
+    mova [r4+r3*1], m3
1019
+    PALIGNR     m3, m6, 14, m2
1020
+    pslldq      m6, 2
1021
+    mova [r4+r3*2], m3
1022
+    PALIGNR     m3, m6, 14, m6
1023
+    mova [r4+r1*1], m3
1090 1024
     RET
1091 1025
 %endmacro
1092 1026
 
... ...
@@ -1095,114 +872,69 @@ INIT_XMM
1095 1095
 PRED8x8L_DOWN_RIGHT sse2
1096 1096
 %define PALIGNR PALIGNR_SSSE3
1097 1097
 PRED8x8L_DOWN_RIGHT ssse3
1098
+%ifdef HAVE_AVX
1099
+INIT_AVX
1100
+PRED8x8L_DOWN_RIGHT avx
1101
+%endif
1098 1102
 
1099 1103
 ;-----------------------------------------------------------------------------
1100 1104
 ; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
1101 1105
 ;-----------------------------------------------------------------------------
1102 1106
 %macro PRED8x8L_VERTICAL_RIGHT 1
1103
-cglobal pred8x8l_vertical_right_10_%1, 4,5,8
1107
+; As with pred8x8l_down_right, the standard forbids this mode when has_topleft is false, so it is not checked.
1108
+cglobal pred8x8l_vertical_right_10_%1, 4,5,7
1104 1109
     sub         r0, r3
1105
-    lea         r4, [r0+r3*2]
1110
+    lea         r4, [r0+r3*4]
1111
+    lea         r1, [r3*3]
1106 1112
     mova        m0, [r0+r3*1-16]
1107 1113
     punpckhwd   m0, [r0+r3*0-16]
1108
-    mova        m1, [r4+r3*1-16]
1114
+    mova        m1, [r0+r1*1-16]
1109 1115
     punpckhwd   m1, [r0+r3*2-16]
1110
-    mov         r4, r0
1111 1116
     punpckhdq   m1, m0
1112
-    lea         r0, [r0+r3*4]
1113
-    mova        m2, [r0+r3*1-16]
1114
-    punpckhwd   m2, [r0+r3*0-16]
1115
-    lea         r0, [r0+r3*2]
1116
-    mova        m3, [r0+r3*1-16]
1117
-    punpckhwd   m3, [r0+r3*0-16]
1117
+    mova        m2, [r4+r3*1-16]
1118
+    punpckhwd   m2, [r4+r3*0-16]
1119
+    mova        m3, [r4+r1*1-16]
1120
+    punpckhwd   m3, [r4+r3*2-16]
1118 1121
     punpckhdq   m3, m2
1119 1122
     punpckhqdq  m3, m1
1120
-    lea         r0, [r0+r3*2]
1121
-    mova        m0, [r0+r3*0-16]
1122
-    mova        m1, [r4]
1123
-    mov         r0, r4
1124
-    mova        m4, m3
1125
-    mova        m2, m3
1126
-    PALIGNR     m4, m0, 14, m0
1127
-    PALIGNR     m1, m2,  2, m2
1128
-    test        r1, r1
1129
-    jz .fix_lt_1
1130
-    jmp .do_left
1131
-.fix_lt_1:
1132
-    mova        m5, m3
1133
-    pxor        m5, m4
1134
-    psrldq      m5, 14
1135
-    pslldq      m5, 12
1136
-    pxor        m1, m5
1137
-    jmp .do_left
1138
-.fix_lt_2:
1139
-    mova        m5, m3
1140
-    pxor        m5, m2
1141
-    pslldq      m5, 14
1142
-    psrldq      m5, 14
1143
-    pxor        m2, m5
1144
-    test        r2, r2
1145
-    jnz .do_top
1146
-.fix_tr_1:
1147
-    mova        m5, m3
1148
-    pxor        m5, m1
1149
-    psrldq      m5, 14
1150
-    pslldq      m5, 14
1151
-    pxor        m1, m5
1152
-    jmp .do_top
1153
-.do_left:
1154
-    mova        m0, m4
1155
-    PRED4x4_LOWPASS m2, m1, m4, m3
1156
-    mova        m7, m2
1157
-    mova        m0, [r0-16]
1158
-    mova        m3, [r0]
1159
-    mova        m1, [r0+16]
1160
-    mova        m2, m3
1161
-    mova        m4, m3
1162
-    PALIGNR     m2, m0, 14, m0
1163
-    PALIGNR     m1, m4,  2, m4
1164
-    test        r1, r1
1165
-    jz .fix_lt_2
1166
-    test        r2, r2
1167
-    jz .fix_tr_1
1168
-.do_top
1169
-    PRED4x4_LOWPASS m6, m2, m1, m3
1170
-    lea         r1, [r3+r3*2]
1171
-    mova        m2, m6
1172
-    mova        m3, m6
1173
-    PALIGNR     m3, m7, 14, m0
1174
-    PALIGNR     m6, m7, 12, m1
1175
-    mova        m4, m3
1176
-    pavgw       m3, m2
1177
-    lea         r2, [r0+r3*4]
1178
-    PRED4x4_LOWPASS m0, m6, m2, m4
1179
-    mova [r0+r3*1], m3
1123
+    mova        m0, [r4+r3*4-16]
1124
+    mova        m1, [r0]
1125
+    PALIGNR     m4, m3, m0, 14, m0
1126
+    PALIGNR     m1, m3,  2, m2
1127
+    PRED4x4_LOWPASS m3, m1, m4, m3
1128
+    mova        m2, [r0]
1129
+    shr        r2d, 13
1130
+    pslldq      m1, m2, 2
1131
+    psrldq      m5, m2, 2
1132
+    pinsrw      m1, [r0-2], 0
1133
+    pinsrw      m5, [r0+r2+14], 7
1134
+    PRED4x4_LOWPASS m2, m5, m1, m2
1135
+    PALIGNR     m6, m2, m3, 12, m1
1136
+    PALIGNR     m5, m2, m3, 14, m0
1137
+    PRED4x4_LOWPASS m0, m6, m2, m5
1138
+    pavgw       m2, m5
1180 1139
     mova [r0+r3*2], m0
1181
-    mova        m5, m0
1182
-    mova        m6, m3
1183
-    mova        m1, m7
1184
-    mova        m2, m1
1185
-    pslldq      m2, 2
1186
-    mova        m3, m1
1187
-    pslldq      m3, 4
1188
-    PRED4x4_LOWPASS m0, m1, m3, m2
1189
-    PALIGNR     m6, m0, 14, m2
1190
-    mova [r0+r1*1], m6
1191
-    pslldq      m0, 2
1192
-    PALIGNR     m5, m0, 14, m1
1193
-    mova [r0+r3*4], m5
1194
-    pslldq      m0, 2
1195
-    PALIGNR     m6, m0, 14, m2
1196
-    mova [r2+r3*1], m6
1197
-    pslldq      m0, 2
1198
-    PALIGNR     m5, m0, 14, m1
1199
-    mova [r2+r3*2], m5
1200
-    pslldq      m0, 2
1201
-    PALIGNR     m6, m0, 14, m2
1202
-    mova [r2+r1*1], m6
1203
-    pslldq      m0, 2
1204
-    PALIGNR     m5, m0, 14, m1
1205
-    mova [r2+r3*4], m5
1140
+    mova [r0+r3*1], m2
1141
+    pslldq      m6, m3, 4
1142
+    pslldq      m1, m3, 2
1143
+    PRED4x4_LOWPASS m1, m3, m6, m1
1144
+    PALIGNR     m2, m1, 14, m4
1145
+    mova [r0+r1*1], m2
1146
+    pslldq      m1, 2
1147
+    PALIGNR     m0, m1, 14, m3
1148
+    mova [r0+r3*4], m0
1149
+    pslldq      m1, 2
1150
+    PALIGNR     m2, m1, 14, m4
1151
+    mova [r4+r3*1], m2
1152
+    pslldq      m1, 2
1153
+    PALIGNR     m0, m1, 14, m3
1154
+    mova [r4+r3*2], m0
1155
+    pslldq      m1, 2
1156
+    PALIGNR     m2, m1, 14, m4
1157
+    mova [r4+r1*1], m2
1158
+    pslldq      m1, 2
1159
+    PALIGNR     m0, m1, 14, m1
1160
+    mova [r4+r3*4], m0
1206 1161
     RET
1207 1162
 %endmacro
1208 1163
 
... ...
@@ -1211,84 +943,60 @@ INIT_XMM
1211 1211
 PRED8x8L_VERTICAL_RIGHT sse2
1212 1212
 %define PALIGNR PALIGNR_SSSE3
1213 1213
 PRED8x8L_VERTICAL_RIGHT ssse3
1214
+%ifdef HAVE_AVX
1215
+INIT_AVX
1216
+PRED8x8L_VERTICAL_RIGHT avx
1217
+%endif
1214 1218
 
1215 1219
 ;-----------------------------------------------------------------------------
1216 1220
 ; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
1217 1221
 ;-----------------------------------------------------------------------------
1218 1222
 %macro PRED8x8L_HORIZONTAL_UP 1
1219
-cglobal pred8x8l_horizontal_up_10_%1, 4,4,8
1220
-    sub         r0, r3
1221
-    lea         r2, [r0+r3*2]
1222
-    mova        m0, [r0+r3*1-16]
1223
-    test        r1, r1
1224
-    lea         r1, [r0+r3]
1225
-    cmovnz      r1, r0
1226
-    punpckhwd   m0, [r1+r3*0-16]
1227
-    mova        m1, [r2+r3*1-16]
1228
-    punpckhwd   m1, [r0+r3*2-16]
1229
-    mov         r2, r0
1230
-    punpckhdq   m1, m0
1231
-    lea         r0, [r0+r3*4]
1232
-    mova        m2, [r0+r3*1-16]
1233
-    punpckhwd   m2, [r0+r3*0-16]
1234
-    lea         r0, [r0+r3*2]
1235
-    mova        m3, [r0+r3*1-16]
1236
-    punpckhwd   m3, [r0+r3*0-16]
1237
-    punpckhdq   m3, m2
1238
-    punpckhqdq  m3, m1
1239
-    lea         r0, [r0+r3*2]
1223
+cglobal pred8x8l_horizontal_up_10_%1, 4,4,6
1240 1224
     mova        m0, [r0+r3*0-16]
1241
-    mova        m1, [r1+r3*0-16]
1242
-    mov         r0, r2
1243
-    mova        m4, m3
1244
-    mova        m2, m3
1245
-    PALIGNR     m4, m0, 14, m0
1246
-    PALIGNR     m1, m2,  2, m2
1247
-    mova        m0, m4
1248
-    PRED4x4_LOWPASS m2, m1, m4, m3
1249
-    mova        m4, m0
1250
-    mova        m7, m2
1251
-    PRED4x4_LOWPASS m1, m3, m0, m4
1252
-    pslldq      m1, 14
1253
-    PALIGNR     m7, m1, 14, m3
1254
-    lea         r1, [r3+r3*2]
1255
-    pshufd      m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
1256
-    pslldq      m7, 14             ; l7 .. .. .. .. .. .. ..
1257
-    mova        m2, m0
1258
-    pslld       m0, 16
1259
-    psrld       m2, 16
1260
-    por         m2, m0            ; l7 l6 l5 l4 l3 l2 l1 l0
1261
-    mova        m3, m2
1262
-    mova        m4, m2
1263
-    mova        m5, m2
1264
-    psrldq      m2, 2
1265
-    psrldq      m3, 4
1225
+    punpckhwd   m0, [r0+r3*1-16]
1226
+    shr        r1d, 14
1227
+    dec         r1
1228
+    and         r1, r3
1229
+    sub         r1, r3
1230
+    mova        m4, [r0+r1*1-16]
1231
+    lea         r1, [r3*3]
1266 1232
     lea         r2, [r0+r3*4]
1267
-    por         m2, m7            ; l7 l7 l6 l5 l4 l3 l2 l1
1268
-    punpckhwd   m7, m7
1269
-    por         m3, m7            ; l7 l7 l7 l6 l5 l4 l3 l2
1270
-    pavgw       m4, m2
1271
-    PRED4x4_LOWPASS m1, m3, m5, m2
1272
-    mova        m5, m4
1273
-    punpcklwd   m4, m1            ; p4 p3 p2 p1
1274
-    punpckhwd   m5, m1            ; p8 p7 p6 p5
1275
-    mova        m6, m5
1276
-    mova        m7, m5
1277
-    mova        m0, m5
1278
-    PALIGNR     m5, m4, 4, m1
1279
-    pshufd      m1, m6, 11111001b
1280
-    PALIGNR     m6, m4, 8, m2
1281
-    pshufd      m2, m7, 11111110b
1282
-    PALIGNR     m7, m4, 12, m3
1283
-    pshufd      m3, m0, 11111111b
1284
-    mova [r0+r3*1], m4
1285
-    mova [r0+r3*2], m5
1286
-    mova [r0+r1*1], m6
1287
-    mova [r0+r3*4], m7
1233
+    mova        m1, [r0+r3*2-16]
1234
+    punpckhwd   m1, [r0+r1*1-16]
1235
+    punpckhdq   m0, m1
1236
+    mova        m2, [r2+r3*0-16]
1237
+    punpckhwd   m2, [r2+r3*1-16]
1238
+    mova        m3, [r2+r3*2-16]
1239
+    punpckhwd   m3, [r2+r1*1-16]
1240
+    punpckhdq   m2, m3
1241
+    punpckhqdq  m0, m2
1242
+    PALIGNR     m1, m0, m4, 14, m4
1243
+    psrldq      m2, m0, 2
1244
+    pshufhw     m2, m2, 10100100b
1245
+    PRED4x4_LOWPASS m0, m1, m2, m0
1246
+    psrldq      m1, m0, 2
1247
+    psrldq      m2, m0, 4
1248
+    pshufhw     m1, m1, 10100100b
1249
+    pshufhw     m2, m2, 01010100b
1250
+    pavgw       m4, m0, m1
1251
+    PRED4x4_LOWPASS m1, m2, m0, m1
1252
+    punpckhwd   m5, m4, m1
1253
+    punpcklwd   m4, m1
1254
+    mova [r2+r3*0], m5
1255
+    mova [r0+r3*0], m4
1256
+    pshufd      m0, m5, 11111001b
1257
+    pshufd      m1, m5, 11111110b
1258
+    pshufd      m2, m5, 11111111b
1288 1259
     mova [r2+r3*1], m0
1289 1260
     mova [r2+r3*2], m1
1290 1261
     mova [r2+r1*1], m2
1291
-    mova [r2+r3*4], m3
1262
+    PALIGNR     m2, m5, m4, 4, m0
1263
+    PALIGNR     m3, m5, m4, 8, m1
1264
+    PALIGNR     m5, m5, m4, 12, m4
1265
+    mova [r0+r3*1], m2
1266
+    mova [r0+r3*2], m3
1267
+    mova [r0+r1*1], m5
1292 1268
     RET
1293 1269
 %endmacro
1294 1270
 
... ...
@@ -1297,7 +1005,10 @@ INIT_XMM
1297 1297
 PRED8x8L_HORIZONTAL_UP sse2
1298 1298
 %define PALIGNR PALIGNR_SSSE3
1299 1299
 PRED8x8L_HORIZONTAL_UP ssse3
1300
-
1300
+%ifdef HAVE_AVX
1301
+INIT_AVX
1302
+PRED8x8L_HORIZONTAL_UP avx
1303
+%endif
1301 1304
 
1302 1305
 
1303 1306
 ;-----------------------------------------------------------------------------
... ...
@@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3
1315 1315
 %macro PRED16x16_VERTICAL 1
1316 1316
 cglobal pred16x16_vertical_10_%1, 2,3
1317 1317
     sub   r0, r1
1318
-    mov   r2, 8
1318
+    mov  r2d, 8
1319 1319
     mova  m0, [r0+ 0]
1320 1320
     mova  m1, [r0+mmsize]
1321 1321
 %if mmsize==8
... ...
@@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3
1326 1326
     MOV16 r0+r1*1, m0, m1, m2, m3
1327 1327
     MOV16 r0+r1*2, m0, m1, m2, m3
1328 1328
     lea   r0, [r0+r1*2]
1329
-    dec   r2
1329
+    dec   r2d
1330 1330
     jg .loop
1331 1331
     REP_RET
1332 1332
 %endmacro
... ...
@@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2
1341 1341
 ;-----------------------------------------------------------------------------
1342 1342
 %macro PRED16x16_HORIZONTAL 1
1343 1343
 cglobal pred16x16_horizontal_10_%1, 2,3
1344
-    mov    r2, 8
1344
+    mov   r2d, 8
1345 1345
 .vloop:
1346 1346
     movd   m0, [r0+r1*0-4]
1347 1347
     movd   m1, [r0+r1*1-4]
... ...
@@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3
1350 1350
     MOV16  r0+r1*0, m0, m0, m0, m0
1351 1351
     MOV16  r0+r1*1, m1, m1, m1, m1
1352 1352
     lea    r0, [r0+r1*2]
1353
-    dec    r2
1353
+    dec    r2d
1354 1354
     jg .vloop
1355 1355
     REP_RET
1356 1356
 %endmacro
... ...
@@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2
1364 1364
 ; void pred16x16_dc(pixel *src, int stride)
1365 1365
 ;-----------------------------------------------------------------------------
1366 1366
 %macro PRED16x16_DC 1
1367
-cglobal pred16x16_dc_10_%1, 2,7
1368
-    mov        r4, r0
1367
+cglobal pred16x16_dc_10_%1, 2,6
1368
+    mov        r5, r0
1369 1369
     sub        r0, r1
1370 1370
     mova       m0, [r0+0]
1371 1371
     paddw      m0, [r0+mmsize]
... ...
@@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7
1375 1375
 %endif
1376 1376
     HADDW      m0, m2
1377 1377
 
1378
-    sub        r0, 2
1379
-    movzx     r3d, word [r0+r1*1]
1380
-    movzx     r5d, word [r0+r1*2]
1378
+    lea        r0, [r0+r1-2]
1379
+    movzx     r3d, word [r0]
1380
+    movzx     r4d, word [r0+r1]
1381 1381
 %rep 7
1382 1382
     lea        r0, [r0+r1*2]
1383
-    movzx     r2d, word [r0+r1*1]
1383
+    movzx     r2d, word [r0]
1384 1384
     add       r3d, r2d
1385
-    movzx     r2d, word [r0+r1*2]
1386
-    add       r5d, r2d
1385
+    movzx     r2d, word [r0+r1]
1386
+    add       r4d, r2d
1387 1387
 %endrep
1388
-    lea       r3d, [r3+r5+16]
1388
+    lea       r3d, [r3+r4+16]
1389 1389
 
1390 1390
     movd       m1, r3d
1391 1391
     paddw      m0, m1
... ...
@@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7
1393 1393
     SPLATW     m0, m0
1394 1394
     mov       r3d, 8
1395 1395
 .loop:
1396
-    MOV16 r4+r1*0, m0, m0, m0, m0
1397
-    MOV16 r4+r1*1, m0, m0, m0, m0
1398
-    lea        r4, [r4+r1*2]
1396
+    MOV16 r5+r1*0, m0, m0, m0, m0
1397
+    MOV16 r5+r1*1, m0, m0, m0, m0
1398
+    lea        r5, [r5+r1*2]
1399 1399
     dec       r3d
1400 1400
     jg .loop
1401 1401
     REP_RET
... ...
@@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2
1442 1442
 ; void pred16x16_left_dc(pixel *src, int stride)
1443 1443
 ;-----------------------------------------------------------------------------
1444 1444
 %macro PRED16x16_LEFT_DC 1
1445
-cglobal pred16x16_left_dc_10_%1, 2,7
1446
-    mov        r4, r0
1445
+cglobal pred16x16_left_dc_10_%1, 2,6
1446
+    mov        r5, r0
1447 1447
 
1448 1448
     sub        r0, 2
1449
-    movzx     r5d, word [r0+r1*0]
1450
-    movzx     r6d, word [r0+r1*1]
1449
+    movzx     r3d, word [r0]
1450
+    movzx     r4d, word [r0+r1]
1451 1451
 %rep 7
1452 1452
     lea        r0, [r0+r1*2]
1453
-    movzx     r2d, word [r0+r1*0]
1454
-    movzx     r3d, word [r0+r1*1]
1455
-    add       r5d, r2d
1456
-    add       r6d, r3d
1453
+    movzx     r2d, word [r0]
1454
+    add       r3d, r2d
1455
+    movzx     r2d, word [r0+r1]
1456
+    add       r4d, r2d
1457 1457
 %endrep
1458
-    lea       r2d, [r5+r6+8]
1459
-    shr       r2d, 4
1458
+    lea       r3d, [r3+r4+8]
1459
+    shr       r3d, 4
1460 1460
 
1461
-    movd       m0, r2d
1461
+    movd       m0, r3d
1462 1462
     SPLATW     m0, m0
1463 1463
     mov       r3d, 8
1464 1464
 .loop:
1465
-    MOV16 r4+r1*0, m0, m0, m0, m0
1466
-    MOV16 r4+r1*1, m0, m0, m0, m0
1467
-    lea        r4, [r4+r1*2]
1465
+    MOV16 r5+r1*0, m0, m0, m0, m0
1466
+    MOV16 r5+r1*1, m0, m0, m0, m0
1467
+    lea        r5, [r5+r1*2]
1468 1468
     dec       r3d
1469 1469
     jg .loop
1470 1470
     REP_RET
... ...
@@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
45 45
 
46 46
 PRED8x8(dc, 10, mmxext)
47 47
 PRED8x8(dc, 10, sse2)
48
-PRED8x8(top_dc, 10, mmxext)
49 48
 PRED8x8(top_dc, 10, sse2)
50 49
 PRED8x8(plane, 10, sse2)
51 50
 PRED8x8(vertical, 10, sse2)
... ...
@@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2)
55 55
 void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride);
56 56
 
57 57
 PRED8x8L(dc, 10, sse2)
58
-PRED8x8L(dc, 10, ssse3)
58
+PRED8x8L(dc, 10, avx)
59 59
 PRED8x8L(128_dc, 10, mmxext)
60 60
 PRED8x8L(128_dc, 10, sse2)
61 61
 PRED8x8L(top_dc, 10, sse2)
62
-PRED8x8L(top_dc, 10, ssse3)
62
+PRED8x8L(top_dc, 10, avx)
63 63
 PRED8x8L(vertical, 10, sse2)
64
-PRED8x8L(vertical, 10, ssse3)
64
+PRED8x8L(vertical, 10, avx)
65 65
 PRED8x8L(horizontal, 10, sse2)
66 66
 PRED8x8L(horizontal, 10, ssse3)
67
+PRED8x8L(horizontal, 10, avx)
67 68
 PRED8x8L(down_left, 10, sse2)
68 69
 PRED8x8L(down_left, 10, ssse3)
70
+PRED8x8L(down_left, 10, avx)
69 71
 PRED8x8L(down_right, 10, sse2)
70 72
 PRED8x8L(down_right, 10, ssse3)
73
+PRED8x8L(down_right, 10, avx)
71 74
 PRED8x8L(vertical_right, 10, sse2)
72 75
 PRED8x8L(vertical_right, 10, ssse3)
76
+PRED8x8L(vertical_right, 10, avx)
73 77
 PRED8x8L(horizontal_up, 10, sse2)
74 78
 PRED8x8L(horizontal_up, 10, ssse3)
79
+PRED8x8L(horizontal_up, 10, avx)
75 80
 
76 81
 #define PRED16x16(TYPE, DEPTH, OPT)\
77 82
 void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride);
... ...
@@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
298 298
             h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;
299 299
 
300 300
             h->pred8x8[DC_PRED8x8          ] = ff_pred8x8_dc_10_mmxext;
301
-            h->pred8x8[TOP_DC_PRED8x8      ] = ff_pred8x8_top_dc_10_mmxext;
302 301
 
303 302
             h->pred8x8l[DC_128_PRED        ] = ff_pred8x8l_128_dc_10_mmxext;
304 303
 
... ...
@@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth
344 344
             h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_ssse3;
345 345
             h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_ssse3;
346 346
 
347
-            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_ssse3;
348 347
             h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_ssse3;
349
-            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_ssse3;
350
-            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_ssse3;
351 348
             h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
349
+            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
350
+            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_ssse3;
351
+            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_ssse3;
352 352
         }
353 353
 #if HAVE_AVX
354 354
         if (mm_flags & AV_CPU_FLAG_AVX) {
355 355
             h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
356 356
             h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
357
+            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_avx;
357 358
             h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_avx;
358 359
             h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_avx;
360
+
361
+            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_avx;
362
+            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_avx;
363
+            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_avx;
364
+            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_avx;
365
+            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
366
+            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
367
+            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_avx;
368
+            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_avx;
359 369
         }
360 370
 #endif /* HAVE_AVX */
361 371
     }
... ...
@@ -131,6 +131,7 @@ const AVCodecTag ff_codec_bmp_tags[] = {
131 131
     { CODEC_ID_MPEG2VIDEO,   MKTAG('s', 'l', 'i', 'f') },
132 132
     { CODEC_ID_MPEG2VIDEO,   MKTAG('E', 'M', '2', 'V') },
133 133
     { CODEC_ID_MPEG2VIDEO,   MKTAG('M', '7', '0', '1') }, /* Matrox MPEG2 intra-only */
134
+    { CODEC_ID_MPEG2VIDEO,   MKTAG('m', 'p', 'g', 'v') },
134 135
     { CODEC_ID_MJPEG,        MKTAG('M', 'J', 'P', 'G') },
135 136
     { CODEC_ID_MJPEG,        MKTAG('L', 'J', 'P', 'G') },
136 137
     { CODEC_ID_MJPEG,        MKTAG('d', 'm', 'b', '1') },