* qatar/master:
dnxhddec: avoid a branch in 10-bit decode_dct_block()
H.264: Add optimizations to predict x86 assembly.
riff: Add mpgv MPEG-2 fourcc
add Flash Screen Video 2 decoder
Conflicts:
configure
doc/general.texi
libavcodec/Makefile
libavcodec/allcodecs.c
libavcodec/version.h
Merged-by: Michael Niedermayer <michaelni@gmx.at>
| ... | ... |
@@ -1280,6 +1280,7 @@ flac_encoder_select="golomb lpc" |
| 1280 | 1280 |
flashsv_decoder_select="zlib" |
| 1281 | 1281 |
flashsv_encoder_select="zlib" |
| 1282 | 1282 |
flashsv2_encoder_select="zlib" |
| 1283 |
+flashsv2_decoder_select="zlib" |
|
| 1283 | 1284 |
flv_decoder_select="h263_decoder" |
| 1284 | 1285 |
flv_encoder_select="h263_encoder" |
| 1285 | 1286 |
fraps_decoder_select="huffman" |
| ... | ... |
@@ -401,7 +401,7 @@ following image formats are supported: |
| 401 | 401 |
@tab experimental lossless codec (fourcc: FFV1) |
| 402 | 402 |
@item Flash Screen Video v1 @tab X @tab X |
| 403 | 403 |
@tab fourcc: FSV1 |
| 404 |
-@item Flash Screen Video v2 @tab X |
|
| 404 |
+@item Flash Screen Video v2 @tab X @tab X |
|
| 405 | 405 |
@item Flash Video (FLV) @tab X @tab X |
| 406 | 406 |
@tab Sorenson H.263 used in Flash |
| 407 | 407 |
@item Fraps @tab @tab X |
| ... | ... |
@@ -153,6 +153,7 @@ OBJS-$(CONFIG_FLAC_ENCODER) += flacenc.o flacdata.o flac.o vorbis_dat |
| 153 | 153 |
OBJS-$(CONFIG_FLASHSV_DECODER) += flashsv.o |
| 154 | 154 |
OBJS-$(CONFIG_FLASHSV_ENCODER) += flashsvenc.o |
| 155 | 155 |
OBJS-$(CONFIG_FLASHSV2_ENCODER) += flashsv2enc.o |
| 156 |
+OBJS-$(CONFIG_FLASHSV2_DECODER) += flashsv.o |
|
| 156 | 157 |
OBJS-$(CONFIG_FLIC_DECODER) += flicvideo.o |
| 157 | 158 |
OBJS-$(CONFIG_FOURXM_DECODER) += 4xm.o |
| 158 | 159 |
OBJS-$(CONFIG_FRAPS_DECODER) += fraps.o |
| ... | ... |
@@ -109,7 +109,7 @@ void avcodec_register_all(void) |
| 109 | 109 |
REGISTER_ENCDEC (FFV1, ffv1); |
| 110 | 110 |
REGISTER_ENCDEC (FFVHUFF, ffvhuff); |
| 111 | 111 |
REGISTER_ENCDEC (FLASHSV, flashsv); |
| 112 |
- REGISTER_ENCODER (FLASHSV2, flashsv2); |
|
| 112 |
+ REGISTER_ENCDEC (FLASHSV2, flashsv2); |
|
| 113 | 113 |
REGISTER_DECODER (FLIC, flic); |
| 114 | 114 |
REGISTER_ENCDEC (FLV, flv); |
| 115 | 115 |
REGISTER_DECODER (FOURXM, fourxm); |
| ... | ... |
@@ -246,7 +246,7 @@ static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx, |
| 246 | 246 |
//av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j); |
| 247 | 247 |
//av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weight %d\n", level, weight_matrix[i]); |
| 248 | 248 |
level = (2*level+1) * qscale * weight_matrix[i]; |
| 249 |
- if (weight_matrix[i] != level_bias) |
|
| 249 |
+ if (level_bias < 32 || weight_matrix[i] != level_bias) |
|
| 250 | 250 |
level += level_bias; |
| 251 | 251 |
level >>= level_shift; |
| 252 | 252 |
|
| ... | ... |
@@ -25,6 +25,8 @@ |
| 25 | 25 |
* Flash Screen Video decoder |
| 26 | 26 |
* @author Alex Beregszaszi |
| 27 | 27 |
* @author Benjamin Larsson |
| 28 |
+ * @author Daniel Verkamp |
|
| 29 |
+ * @author Konstantin Shishkov |
|
| 28 | 30 |
* |
| 29 | 31 |
* A description of the bitstream format for Flash Screen Video version 1/2 |
| 30 | 32 |
* is part of the SWF File Format Specification (version 10), which can be |
| ... | ... |
@@ -35,9 +37,17 @@ |
| 35 | 35 |
#include <stdlib.h> |
| 36 | 36 |
#include <zlib.h> |
| 37 | 37 |
|
| 38 |
+#include "libavutil/intreadwrite.h" |
|
| 38 | 39 |
#include "avcodec.h" |
| 40 |
+#include "bytestream.h" |
|
| 39 | 41 |
#include "get_bits.h" |
| 40 | 42 |
|
| 43 |
/**
 * Per-block record kept for Flash Screen Video v2 keyframes.
 *
 * One entry exists per image block; entries are filled in while a
 * keyframe is being decoded and are later handed to flashsv2_prime().
 */
typedef struct BlockInfo {
    uint8_t *pos;      /**< start of the block's compressed data inside the
                        *   saved copy of the keyframe packet */
    int      size;     /**< compressed size of the block in bytes */
    int      unp_size; /**< number of bytes the block inflated to */
} BlockInfo;
|
| 48 |
+ |
|
| 41 | 49 |
typedef struct FlashSVContext {
|
| 42 | 50 |
AVCodecContext *avctx; |
| 43 | 51 |
AVFrame frame; |
| ... | ... |
@@ -46,9 +56,50 @@ typedef struct FlashSVContext {
|
| 46 | 46 |
uint8_t *tmpblock; |
| 47 | 47 |
int block_size; |
| 48 | 48 |
z_stream zstream; |
| 49 |
+ int ver; |
|
| 50 |
+ const uint32_t *pal; |
|
| 51 |
+ int is_keyframe; |
|
| 52 |
+ uint8_t *keyframedata; |
|
| 53 |
+ uint8_t *keyframe; |
|
| 54 |
+ BlockInfo *blocks; |
|
| 55 |
+ uint8_t *deflate_block; |
|
| 56 |
+ int deflate_block_size; |
|
| 57 |
+ int color_depth; |
|
| 58 |
+ int zlibprime_curr, zlibprime_prev; |
|
| 59 |
+ int diff_start, diff_height; |
|
| 49 | 60 |
} FlashSVContext; |
| 50 | 61 |
|
| 51 | 62 |
|
| 63 |
/**
 * Unpack a block stored in the hybrid 15-bit/palette format into a
 * 3-bytes-per-pixel destination picture.
 *
 * Every source pixel is either two bytes (top bit of the first byte set,
 * followed by a big-endian 15-bit colour) or a single byte that indexes
 * @p pal. Rows are written bottom-up: the first source row lands on
 * destination row dx + h, the last one on row dx + 1.
 *
 * @param sptr   packed source data
 * @param dptr   base of the destination picture
 * @param dx     destination row offset (note: despite the name this is
 *               used as the vertical coordinate)
 * @param dy     destination column offset in pixels (used horizontally)
 * @param h      number of rows to unpack
 * @param w      number of pixels per row
 * @param stride destination line size in bytes
 * @param pal    palette used for single-byte pixels
 * @return number of source bytes consumed
 */
static int decode_hybrid(const uint8_t *sptr, uint8_t *dptr, int dx, int dy,
                         int h, int w, int stride, const uint32_t *pal)
{
    const uint8_t *const start = sptr;
    int row, col;

    for (row = dx + h; row > dx; row--) {
        uint8_t *out = dptr + (row * stride) + dy * 3;

        for (col = 0; col < w; col++) {
            if (!(*sptr & 0x80)) {
                /* single byte: palette index, stored as 24-bit little endian */
                uint32_t entry = pal[*sptr++];
                bytestream_put_le24(&out, entry);
                continue;
            }

            {
                /* two bytes: 15-bit colour, 5 bits per component */
                unsigned rgb15 = AV_RB16(sptr) & 0x7FFF;
                unsigned blue  =  rgb15        & 0x1F;
                unsigned green = (rgb15 >>  5) & 0x1F;
                unsigned red   =  rgb15 >> 10;

                /* widen 5 -> 8 bits: the value fills the high bits and its
                 * own top three bits are repeated in the low bits */
                out[0] = (blue  << 3) | (blue  >> 2);
                out[1] = (green << 3) | (green >> 2);
                out[2] = (red   << 3) | (red   >> 2);
                out   += 3;
                sptr  += 2;
            }
        }
    }

    return sptr - start;
}
|
| 92 |
+ |
|
| 52 | 93 |
static av_cold int flashsv_decode_init(AVCodecContext *avctx) |
| 53 | 94 |
{
|
| 54 | 95 |
FlashSVContext *s = avctx->priv_data; |
| ... | ... |
@@ -71,9 +122,42 @@ static av_cold int flashsv_decode_init(AVCodecContext *avctx) |
| 71 | 71 |
} |
| 72 | 72 |
|
| 73 | 73 |
|
| 74 |
+static void flashsv2_prime(FlashSVContext *s, uint8_t *src, |
|
| 75 |
+ int size, int unp_size) |
|
| 76 |
+{
|
|
| 77 |
+ z_stream zs; |
|
| 78 |
+ |
|
| 79 |
+ zs.zalloc = NULL; |
|
| 80 |
+ zs.zfree = NULL; |
|
| 81 |
+ zs.opaque = NULL; |
|
| 82 |
+ |
|
| 83 |
+ s->zstream.next_in = src; |
|
| 84 |
+ s->zstream.avail_in = size; |
|
| 85 |
+ s->zstream.next_out = s->tmpblock; |
|
| 86 |
+ s->zstream.avail_out = s->block_size * 3; |
|
| 87 |
+ inflate(&s->zstream, Z_SYNC_FLUSH); |
|
| 88 |
+ |
|
| 89 |
+ deflateInit(&zs, 0); |
|
| 90 |
+ zs.next_in = s->tmpblock; |
|
| 91 |
+ zs.avail_in = s->block_size * 3 - s->zstream.avail_out; |
|
| 92 |
+ zs.next_out = s->deflate_block; |
|
| 93 |
+ zs.avail_out = s->deflate_block_size; |
|
| 94 |
+ deflate(&zs, Z_SYNC_FLUSH); |
|
| 95 |
+ deflateEnd(&zs); |
|
| 96 |
+ |
|
| 97 |
+ inflateReset(&s->zstream); |
|
| 98 |
+ |
|
| 99 |
+ s->zstream.next_in = s->deflate_block; |
|
| 100 |
+ s->zstream.avail_in = s->deflate_block_size - zs.avail_out; |
|
| 101 |
+ s->zstream.next_out = s->tmpblock; |
|
| 102 |
+ s->zstream.avail_out = s->block_size * 3; |
|
| 103 |
+ inflate(&s->zstream, Z_SYNC_FLUSH); |
|
| 104 |
+} |
|
| 105 |
+ |
|
| 74 | 106 |
static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt, |
| 75 | 107 |
GetBitContext *gb, int block_size, |
| 76 |
- int width, int height, int x_pos, int y_pos) |
|
| 108 |
+ int width, int height, int x_pos, int y_pos, |
|
| 109 |
+ int blk_idx) |
|
| 77 | 110 |
{
|
| 78 | 111 |
struct FlashSVContext *s = avctx->priv_data; |
| 79 | 112 |
uint8_t *line = s->tmpblock; |
| ... | ... |
@@ -82,6 +166,10 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt, |
| 82 | 82 |
if (ret != Z_OK) {
|
| 83 | 83 |
//return -1; |
| 84 | 84 |
} |
| 85 |
+ if (s->zlibprime_curr || s->zlibprime_prev) {
|
|
| 86 |
+ flashsv2_prime(s, s->blocks[blk_idx].pos, s->blocks[blk_idx].size, |
|
| 87 |
+ s->blocks[blk_idx].unp_size); |
|
| 88 |
+ } |
|
| 85 | 89 |
s->zstream.next_in = avpkt->data + get_bits_count(gb) / 8; |
| 86 | 90 |
s->zstream.avail_in = block_size; |
| 87 | 91 |
s->zstream.next_out = s->tmpblock; |
| ... | ... |
@@ -96,19 +184,48 @@ static int flashsv_decode_block(AVCodecContext *avctx, AVPacket *avpkt, |
| 96 | 96 |
if (ret != Z_OK && ret != Z_STREAM_END) {
|
| 97 | 97 |
//return -1; |
| 98 | 98 |
} |
| 99 |
- /* Flash Screen Video stores the image upside down, so copy |
|
| 100 |
- * lines to destination in reverse order. */ |
|
| 101 |
- for (k = 1; k <= height; k++) {
|
|
| 102 |
- memcpy(s->frame.data[0] + x_pos * 3 + |
|
| 103 |
- (s->image_height - y_pos - k) * s->frame.linesize[0], |
|
| 104 |
- line, width * 3); |
|
| 105 |
- /* advance source pointer to next line */ |
|
| 106 |
- line += width * 3; |
|
| 99 |
+ |
|
| 100 |
+ if (s->is_keyframe) {
|
|
| 101 |
+ s->blocks[blk_idx].pos = s->keyframedata + (get_bits_count(gb) / 8); |
|
| 102 |
+ s->blocks[blk_idx].size = block_size; |
|
| 103 |
+ s->blocks[blk_idx].unp_size = s->block_size * 3 - s->zstream.avail_out; |
|
| 104 |
+ } |
|
| 105 |
+ if (!s->color_depth) {
|
|
| 106 |
+ /* Flash Screen Video stores the image upside down, so copy |
|
| 107 |
+ * lines to destination in reverse order. */ |
|
| 108 |
+ for (k = 1; k <= s->diff_height; k++) {
|
|
| 109 |
+ memcpy(s->frame.data[0] + x_pos * 3 + |
|
| 110 |
+ (s->image_height - y_pos - s->diff_start - k) * s->frame.linesize[0], |
|
| 111 |
+ line, width * 3); |
|
| 112 |
+ /* advance source pointer to next line */ |
|
| 113 |
+ line += width * 3; |
|
| 114 |
+ } |
|
| 115 |
+ } else {
|
|
| 116 |
+ /* hybrid 15-bit/palette mode */ |
|
| 117 |
+ decode_hybrid(s->tmpblock, s->frame.data[0], |
|
| 118 |
+ s->image_height - (y_pos + 1 + s->diff_start + s->diff_height), |
|
| 119 |
+ x_pos, s->diff_height, width, |
|
| 120 |
+ s->frame.linesize[0], s->pal); |
|
| 107 | 121 |
} |
| 108 | 122 |
skip_bits_long(gb, 8 * block_size); /* skip the consumed bits */ |
| 109 | 123 |
return 0; |
| 110 | 124 |
} |
| 111 | 125 |
|
| 126 |
+static int calc_deflate_block_size(int tmpblock_size) |
|
| 127 |
+{
|
|
| 128 |
+ z_stream zstream; |
|
| 129 |
+ int size; |
|
| 130 |
+ |
|
| 131 |
+ zstream.zalloc = Z_NULL; |
|
| 132 |
+ zstream.zfree = Z_NULL; |
|
| 133 |
+ zstream.opaque = Z_NULL; |
|
| 134 |
+ if (deflateInit(&zstream, 0) != Z_OK) |
|
| 135 |
+ return -1; |
|
| 136 |
+ size = deflateBound(&zstream, tmpblock_size); |
|
| 137 |
+ deflateEnd(&zstream); |
|
| 138 |
+ |
|
| 139 |
+ return size; |
|
| 140 |
+} |
|
| 112 | 141 |
|
| 113 | 142 |
static int flashsv_decode_frame(AVCodecContext *avctx, void *data, |
| 114 | 143 |
int *data_size, AVPacket *avpkt) |
| ... | ... |
@@ -132,6 +249,18 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, |
| 132 | 132 |
s->block_height = 16 * (get_bits(&gb, 4) + 1); |
| 133 | 133 |
s->image_height = get_bits(&gb, 12); |
| 134 | 134 |
|
| 135 |
+ if (s->ver == 2) {
|
|
| 136 |
+ skip_bits(&gb, 6); |
|
| 137 |
+ if (get_bits1(&gb)) {
|
|
| 138 |
+ av_log_missing_feature(avctx, "iframe", 1); |
|
| 139 |
+ return AVERROR_PATCHWELCOME; |
|
| 140 |
+ } |
|
| 141 |
+ if (get_bits1(&gb)) {
|
|
| 142 |
+ av_log_missing_feature(avctx, "custom palette", 1); |
|
| 143 |
+ return AVERROR_PATCHWELCOME; |
|
| 144 |
+ } |
|
| 145 |
+ } |
|
| 146 |
+ |
|
| 135 | 147 |
/* calculate number of blocks and size of border (partial) blocks */ |
| 136 | 148 |
h_blocks = s->image_width / s->block_width; |
| 137 | 149 |
h_part = s->image_width % s->block_width; |
| ... | ... |
@@ -141,11 +270,25 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, |
| 141 | 141 |
/* the block size could change between frames, make sure the buffer |
| 142 | 142 |
* is large enough, if not, get a larger one */ |
| 143 | 143 |
if (s->block_size < s->block_width * s->block_height) {
|
| 144 |
- av_free(s->tmpblock); |
|
| 145 |
- if ((s->tmpblock = av_malloc(3 * s->block_width * s->block_height)) == NULL) {
|
|
| 144 |
+ int tmpblock_size = 3 * s->block_width * s->block_height; |
|
| 145 |
+ |
|
| 146 |
+ s->tmpblock = av_realloc(s->tmpblock, tmpblock_size); |
|
| 147 |
+ if (!s->tmpblock) {
|
|
| 146 | 148 |
av_log(avctx, AV_LOG_ERROR, "Can't allocate decompression buffer.\n"); |
| 147 | 149 |
return AVERROR(ENOMEM); |
| 148 | 150 |
} |
| 151 |
+ if (s->ver == 2) {
|
|
| 152 |
+ s->deflate_block_size = calc_deflate_block_size(tmpblock_size); |
|
| 153 |
+ if (s->deflate_block_size <= 0) {
|
|
| 154 |
+ av_log(avctx, AV_LOG_ERROR, "Can't determine deflate buffer size.\n"); |
|
| 155 |
+ return -1; |
|
| 156 |
+ } |
|
| 157 |
+ s->deflate_block = av_realloc(s->deflate_block, s->deflate_block_size); |
|
| 158 |
+ if (!s->deflate_block) {
|
|
| 159 |
+ av_log(avctx, AV_LOG_ERROR, "Can't allocate deflate buffer.\n"); |
|
| 160 |
+ return AVERROR(ENOMEM); |
|
| 161 |
+ } |
|
| 162 |
+ } |
|
| 149 | 163 |
} |
| 150 | 164 |
s->block_size = s->block_width * s->block_height; |
| 151 | 165 |
|
| ... | ... |
@@ -164,6 +307,16 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, |
| 164 | 164 |
return AVERROR_INVALIDDATA; |
| 165 | 165 |
} |
| 166 | 166 |
|
| 167 |
+ /* we care for keyframes only in Screen Video v2 */ |
|
| 168 |
+ s->is_keyframe = (avpkt->flags & AV_PKT_FLAG_KEY) && (s->ver == 2); |
|
| 169 |
+ if (s->is_keyframe) {
|
|
| 170 |
+ s->keyframedata = av_realloc(s->keyframedata, avpkt->size); |
|
| 171 |
+ memcpy(s->keyframedata, avpkt->data, avpkt->size); |
|
| 172 |
+ s->blocks = av_realloc(s->blocks, |
|
| 173 |
+ (v_blocks + !!v_part) * (h_blocks + !!h_part) |
|
| 174 |
+ * sizeof(s->blocks[0])); |
|
| 175 |
+ } |
|
| 176 |
+ |
|
| 167 | 177 |
av_dlog(avctx, "image: %dx%d block: %dx%d num: %dx%d part: %dx%d\n", |
| 168 | 178 |
s->image_width, s->image_height, s->block_width, s->block_height, |
| 169 | 179 |
h_blocks, v_blocks, h_part, v_part); |
| ... | ... |
@@ -187,25 +340,90 @@ static int flashsv_decode_frame(AVCodecContext *avctx, void *data, |
| 187 | 187 |
for (i = 0; i < h_blocks + (h_part ? 1 : 0); i++) {
|
| 188 | 188 |
int x_pos = i * s->block_width; // horizontal position in frame |
| 189 | 189 |
int cur_blk_width = (i < h_blocks) ? s->block_width : h_part; |
| 190 |
+ int has_diff = 0; |
|
| 190 | 191 |
|
| 191 | 192 |
/* get the size of the compressed zlib chunk */ |
| 192 | 193 |
int size = get_bits(&gb, 16); |
| 194 |
+ |
|
| 195 |
+ s->color_depth = 0; |
|
| 196 |
+ s->zlibprime_curr = 0; |
|
| 197 |
+ s->zlibprime_prev = 0; |
|
| 198 |
+ s->diff_start = 0; |
|
| 199 |
+ s->diff_height = cur_blk_height; |
|
| 200 |
+ |
|
| 193 | 201 |
if (8 * size > get_bits_left(&gb)) {
|
| 194 | 202 |
avctx->release_buffer(avctx, &s->frame); |
| 195 | 203 |
s->frame.data[0] = NULL; |
| 196 | 204 |
return AVERROR_INVALIDDATA; |
| 197 | 205 |
} |
| 198 | 206 |
|
| 207 |
+ if (s->ver == 2 && size) {
|
|
| 208 |
+ skip_bits(&gb, 3); |
|
| 209 |
+ s->color_depth = get_bits(&gb, 2); |
|
| 210 |
+ has_diff = get_bits1(&gb); |
|
| 211 |
+ s->zlibprime_curr = get_bits1(&gb); |
|
| 212 |
+ s->zlibprime_prev = get_bits1(&gb); |
|
| 213 |
+ |
|
| 214 |
+ if (s->color_depth != 0 && s->color_depth != 2) {
|
|
| 215 |
+ av_log(avctx, AV_LOG_ERROR, |
|
| 216 |
+ "%dx%d invalid color depth %d\n", i, j, s->color_depth); |
|
| 217 |
+ return -1; |
|
| 218 |
+ } |
|
| 219 |
+ |
|
| 220 |
+ if (has_diff) {
|
|
| 221 |
+ s->diff_start = get_bits(&gb, 8); |
|
| 222 |
+ s->diff_height = get_bits(&gb, 8); |
|
| 223 |
+ av_log(avctx, AV_LOG_DEBUG, |
|
| 224 |
+ "%dx%d diff start %d height %d\n", |
|
| 225 |
+ i, j, s->diff_start, s->diff_height); |
|
| 226 |
+ size -= 2; |
|
| 227 |
+ } |
|
| 228 |
+ |
|
| 229 |
+ if (s->zlibprime_prev) |
|
| 230 |
+ av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_prev\n", i, j); |
|
| 231 |
+ |
|
| 232 |
+ if (s->zlibprime_curr) {
|
|
| 233 |
+ int col = get_bits(&gb, 8); |
|
| 234 |
+ int row = get_bits(&gb, 8); |
|
| 235 |
+ av_log(avctx, AV_LOG_DEBUG, "%dx%d zlibprime_curr %dx%d\n", i, j, col, row); |
|
| 236 |
+ size -= 2; |
|
| 237 |
+ av_log_missing_feature(avctx, "zlibprime_curr", 1); |
|
| 238 |
+ return AVERROR_PATCHWELCOME; |
|
| 239 |
+ } |
|
| 240 |
+ size--; // account for flags byte |
|
| 241 |
+ } |
|
| 242 |
+ |
|
| 243 |
+ if (has_diff) {
|
|
| 244 |
+ int k; |
|
| 245 |
+ int off = (s->image_height - y_pos - 1) * s->frame.linesize[0]; |
|
| 246 |
+ |
|
| 247 |
+ for (k = 0; k < cur_blk_height; k++) |
|
| 248 |
+ memcpy(s->frame.data[0] + off - k*s->frame.linesize[0] + x_pos*3, |
|
| 249 |
+ s->keyframe + off - k*s->frame.linesize[0] + x_pos*3, |
|
| 250 |
+ cur_blk_width * 3); |
|
| 251 |
+ } |
|
| 252 |
+ |
|
| 199 | 253 |
/* skip unchanged blocks, which have size 0 */ |
| 200 | 254 |
if (size) {
|
| 201 | 255 |
if (flashsv_decode_block(avctx, avpkt, &gb, size, |
| 202 | 256 |
cur_blk_width, cur_blk_height, |
| 203 |
- x_pos, y_pos)) |
|
| 257 |
+ x_pos, y_pos, |
|
| 258 |
+ i + j * (h_blocks + !!h_part))) |
|
| 204 | 259 |
av_log(avctx, AV_LOG_ERROR, |
| 205 | 260 |
"error in decompression of block %dx%d\n", i, j); |
| 206 | 261 |
} |
| 207 | 262 |
} |
| 208 | 263 |
} |
| 264 |
+ if (s->is_keyframe && s->ver == 2) {
|
|
| 265 |
+ if (!s->keyframe) {
|
|
| 266 |
+ s->keyframe = av_malloc(s->frame.linesize[0] * avctx->height); |
|
| 267 |
+ if (!s->keyframe) {
|
|
| 268 |
+ av_log(avctx, AV_LOG_ERROR, "Cannot allocate image data\n"); |
|
| 269 |
+ return AVERROR(ENOMEM); |
|
| 270 |
+ } |
|
| 271 |
+ } |
|
| 272 |
+ memcpy(s->keyframe, s->frame.data[0], s->frame.linesize[0] * avctx->height); |
|
| 273 |
+ } |
|
| 209 | 274 |
|
| 210 | 275 |
*data_size = sizeof(AVFrame); |
| 211 | 276 |
*(AVFrame*)data = s->frame; |
| ... | ... |
@@ -234,6 +452,7 @@ static av_cold int flashsv_decode_end(AVCodecContext *avctx) |
| 234 | 234 |
} |
| 235 | 235 |
|
| 236 | 236 |
|
| 237 |
+#if CONFIG_FLASHSV_DECODER |
|
| 237 | 238 |
AVCodec ff_flashsv_decoder = {
|
| 238 | 239 |
.name = "flashsv", |
| 239 | 240 |
.type = AVMEDIA_TYPE_VIDEO, |
| ... | ... |
@@ -246,3 +465,67 @@ AVCodec ff_flashsv_decoder = {
|
| 246 | 246 |
.pix_fmts = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE},
|
| 247 | 247 |
.long_name = NULL_IF_CONFIG_SMALL("Flash Screen Video v1"),
|
| 248 | 248 |
}; |
| 249 |
+#endif /* CONFIG_FLASHSV_DECODER */ |
|
| 250 |
+ |
|
| 251 |
+#if CONFIG_FLASHSV2_DECODER |
|
| 252 |
+static const uint32_t ff_flashsv2_default_palette[128] = {
|
|
| 253 |
+ 0x000000, 0x333333, 0x666666, 0x999999, 0xCCCCCC, 0xFFFFFF, |
|
| 254 |
+ 0x330000, 0x660000, 0x990000, 0xCC0000, 0xFF0000, 0x003300, |
|
| 255 |
+ 0x006600, 0x009900, 0x00CC00, 0x00FF00, 0x000033, 0x000066, |
|
| 256 |
+ 0x000099, 0x0000CC, 0x0000FF, 0x333300, 0x666600, 0x999900, |
|
| 257 |
+ 0xCCCC00, 0xFFFF00, 0x003333, 0x006666, 0x009999, 0x00CCCC, |
|
| 258 |
+ 0x00FFFF, 0x330033, 0x660066, 0x990099, 0xCC00CC, 0xFF00FF, |
|
| 259 |
+ 0xFFFF33, 0xFFFF66, 0xFFFF99, 0xFFFFCC, 0xFF33FF, 0xFF66FF, |
|
| 260 |
+ 0xFF99FF, 0xFFCCFF, 0x33FFFF, 0x66FFFF, 0x99FFFF, 0xCCFFFF, |
|
| 261 |
+ 0xCCCC33, 0xCCCC66, 0xCCCC99, 0xCCCCFF, 0xCC33CC, 0xCC66CC, |
|
| 262 |
+ 0xCC99CC, 0xCCFFCC, 0x33CCCC, 0x66CCCC, 0x99CCCC, 0xFFCCCC, |
|
| 263 |
+ 0x999933, 0x999966, 0x9999CC, 0x9999FF, 0x993399, 0x996699, |
|
| 264 |
+ 0x99CC99, 0x99FF99, 0x339999, 0x669999, 0xCC9999, 0xFF9999, |
|
| 265 |
+ 0x666633, 0x666699, 0x6666CC, 0x6666FF, 0x663366, 0x669966, |
|
| 266 |
+ 0x66CC66, 0x66FF66, 0x336666, 0x996666, 0xCC6666, 0xFF6666, |
|
| 267 |
+ 0x333366, 0x333399, 0x3333CC, 0x3333FF, 0x336633, 0x339933, |
|
| 268 |
+ 0x33CC33, 0x33FF33, 0x663333, 0x993333, 0xCC3333, 0xFF3333, |
|
| 269 |
+ 0x003366, 0x336600, 0x660033, 0x006633, 0x330066, 0x663300, |
|
| 270 |
+ 0x336699, 0x669933, 0x993366, 0x339966, 0x663399, 0x996633, |
|
| 271 |
+ 0x6699CC, 0x99CC66, 0xCC6699, 0x66CC99, 0x9966CC, 0xCC9966, |
|
| 272 |
+ 0x99CCFF, 0xCCFF99, 0xFF99CC, 0x99FFCC, 0xCC99FF, 0xFFCC99, |
|
| 273 |
+ 0x111111, 0x222222, 0x444444, 0x555555, 0xAAAAAA, 0xBBBBBB, |
|
| 274 |
+ 0xDDDDDD, 0xEEEEEE |
|
| 275 |
+}; |
|
| 276 |
+ |
|
| 277 |
+static av_cold int flashsv2_decode_init(AVCodecContext *avctx) |
|
| 278 |
+{
|
|
| 279 |
+ FlashSVContext *s = avctx->priv_data; |
|
| 280 |
+ flashsv_decode_init(avctx); |
|
| 281 |
+ s->pal = ff_flashsv2_default_palette; |
|
| 282 |
+ s->ver = 2; |
|
| 283 |
+ |
|
| 284 |
+ return 0; |
|
| 285 |
+} |
|
| 286 |
+ |
|
| 287 |
+static av_cold int flashsv2_decode_end(AVCodecContext *avctx) |
|
| 288 |
+{
|
|
| 289 |
+ FlashSVContext *s = avctx->priv_data; |
|
| 290 |
+ |
|
| 291 |
+ av_freep(&s->keyframedata); |
|
| 292 |
+ av_freep(&s->blocks); |
|
| 293 |
+ av_freep(&s->keyframe); |
|
| 294 |
+ av_freep(&s->deflate_block); |
|
| 295 |
+ flashsv_decode_end(avctx); |
|
| 296 |
+ |
|
| 297 |
+ return 0; |
|
| 298 |
+} |
|
| 299 |
+ |
|
| 300 |
+AVCodec ff_flashsv2_decoder = {
|
|
| 301 |
+ .name = "flashsv2", |
|
| 302 |
+ .type = AVMEDIA_TYPE_VIDEO, |
|
| 303 |
+ .id = CODEC_ID_FLASHSV2, |
|
| 304 |
+ .priv_data_size = sizeof(FlashSVContext), |
|
| 305 |
+ .init = flashsv2_decode_init, |
|
| 306 |
+ .close = flashsv2_decode_end, |
|
| 307 |
+ .decode = flashsv_decode_frame, |
|
| 308 |
+ .capabilities = CODEC_CAP_DR1, |
|
| 309 |
+ .pix_fmts = (const enum PixelFormat[]){PIX_FMT_BGR24, PIX_FMT_NONE},
|
|
| 310 |
+ .long_name = NULL_IF_CONFIG_SMALL("Flash Screen Video v2"),
|
|
| 311 |
+}; |
|
| 312 |
+#endif /* CONFIG_FLASHSV2_DECODER */ |
| ... | ... |
@@ -21,7 +21,7 @@ |
| 21 | 21 |
#define AVCODEC_VERSION_H |
| 22 | 22 |
|
| 23 | 23 |
#define LIBAVCODEC_VERSION_MAJOR 53 |
| 24 |
-#define LIBAVCODEC_VERSION_MINOR 8 |
|
| 24 |
+#define LIBAVCODEC_VERSION_MINOR 9 |
|
| 25 | 25 |
#define LIBAVCODEC_VERSION_MICRO 0 |
| 26 | 26 |
|
| 27 | 27 |
#define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ |
| ... | ... |
@@ -2611,12 +2611,11 @@ cglobal pred4x4_down_left_mmxext, 3,3 |
| 2611 | 2611 |
punpckldq m1, [r1] |
| 2612 | 2612 |
movq m2, m1 |
| 2613 | 2613 |
movq m3, m1 |
| 2614 |
- movq m4, m1 |
|
| 2615 | 2614 |
psllq m1, 8 |
| 2616 | 2615 |
pxor m2, m1 |
| 2617 | 2616 |
psrlq m2, 8 |
| 2618 |
- pxor m3, m2 |
|
| 2619 |
- PRED4x4_LOWPASS m0, m1, m3, m4, m5 |
|
| 2617 |
+ pxor m2, m3 |
|
| 2618 |
+ PRED4x4_LOWPASS m0, m1, m2, m3, m4 |
|
| 2620 | 2619 |
lea r1, [r0+r2*2] |
| 2621 | 2620 |
psrlq m0, 8 |
| 2622 | 2621 |
movd [r0+r2*1], m0 |
| ... | ... |
@@ -27,8 +27,6 @@ |
| 27 | 27 |
|
| 28 | 28 |
SECTION_RODATA |
| 29 | 29 |
|
| 30 |
-SECTION .text |
|
| 31 |
- |
|
| 32 | 30 |
cextern pw_16 |
| 33 | 31 |
cextern pw_8 |
| 34 | 32 |
cextern pw_4 |
| ... | ... |
@@ -42,6 +40,8 @@ pw_512: times 8 dw 512 |
| 42 | 42 |
pd_17: times 4 dd 17 |
| 43 | 43 |
pd_16: times 4 dd 16 |
| 44 | 44 |
|
| 45 |
+SECTION .text |
|
| 46 |
+ |
|
| 45 | 47 |
; dest, left, right, src |
| 46 | 48 |
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 |
| 47 | 49 |
%macro PRED4x4_LOWPASS 4 |
| ... | ... |
@@ -64,13 +64,11 @@ cglobal pred4x4_down_right_10_%1, 3,3 |
| 64 | 64 |
movq m3, [r0] |
| 65 | 65 |
punpckhdq m1, m2 |
| 66 | 66 |
PALIGNR m3, m1, 10, m1 |
| 67 |
- mova m1, m3 |
|
| 68 | 67 |
movhps m4, [r1+r2*1-8] |
| 69 |
- PALIGNR m3, m4, 14, m4 |
|
| 70 |
- mova m2, m3 |
|
| 68 |
+ PALIGNR m0, m3, m4, 14, m4 |
|
| 71 | 69 |
movhps m4, [r1+r2*2-8] |
| 72 |
- PALIGNR m3, m4, 14, m4 |
|
| 73 |
- PRED4x4_LOWPASS m0, m3, m1, m2 |
|
| 70 |
+ PALIGNR m2, m0, m4, 14, m4 |
|
| 71 |
+ PRED4x4_LOWPASS m0, m2, m3, m0 |
|
| 74 | 72 |
movq [r1+r2*2], m0 |
| 75 | 73 |
psrldq m0, 2 |
| 76 | 74 |
movq [r1+r2*1], m0 |
| ... | ... |
@@ -104,22 +102,20 @@ cglobal pred4x4_vertical_right_10_%1, 3,3,6 |
| 104 | 104 |
pavgw m5, m0 |
| 105 | 105 |
movhps m1, [r0+r2*1-8] |
| 106 | 106 |
PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 |
| 107 |
- mova m1, m0 |
|
| 108 | 107 |
movhps m2, [r0+r2*2-8] |
| 109 |
- PALIGNR m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 |
|
| 110 |
- mova m2, m0 |
|
| 108 |
+ PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 |
|
| 111 | 109 |
movhps m3, [r1+r2*1-8] |
| 112 |
- PALIGNR m0, m3, 14, m3 ; t3t2t1t0ltl0l1l2 |
|
| 113 |
- PRED4x4_LOWPASS m3, m1, m0, m2 |
|
| 114 |
- pslldq m1, m3, 12 |
|
| 115 |
- psrldq m3, 4 |
|
| 110 |
+ PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 |
|
| 111 |
+ PRED4x4_LOWPASS m1, m0, m2, m1 |
|
| 112 |
+ pslldq m0, m1, 12 |
|
| 113 |
+ psrldq m1, 4 |
|
| 116 | 114 |
movq [r0+r2*1], m5 |
| 117 |
- movq [r0+r2*2], m3 |
|
| 118 |
- PALIGNR m5, m1, 14, m2 |
|
| 119 |
- pslldq m1, 2 |
|
| 115 |
+ movq [r0+r2*2], m1 |
|
| 116 |
+ PALIGNR m5, m0, 14, m2 |
|
| 117 |
+ pslldq m0, 2 |
|
| 120 | 118 |
movq [r1+r2*1], m5 |
| 121 |
- PALIGNR m3, m1, 14, m1 |
|
| 122 |
- movq [r1+r2*2], m3 |
|
| 119 |
+ PALIGNR m1, m0, 14, m0 |
|
| 120 |
+ movq [r1+r2*2], m1 |
|
| 123 | 121 |
RET |
| 124 | 122 |
%endmacro |
| 125 | 123 |
|
| ... | ... |
@@ -152,9 +148,9 @@ cglobal pred4x4_horizontal_down_10_%1, 3,3 |
| 152 | 152 |
punpckhdq m1, m2 ; l0 l1 l2 l3 |
| 153 | 153 |
punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 |
| 154 | 154 |
psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 |
| 155 |
- psrldq m2, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 |
|
| 156 |
- pavgw m5, m1, m2 |
|
| 157 |
- PRED4x4_LOWPASS m3, m1, m0, m2 |
|
| 155 |
+ psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 |
|
| 156 |
+ pavgw m5, m1, m3 |
|
| 157 |
+ PRED4x4_LOWPASS m3, m1, m0, m3 |
|
| 158 | 158 |
punpcklwd m5, m3 |
| 159 | 159 |
psrldq m3, 8 |
| 160 | 160 |
PALIGNR m3, m5, 12, m4 |
| ... | ... |
@@ -220,17 +216,15 @@ cglobal pred4x4_dc_10_mmxext, 3,3 |
| 220 | 220 |
;----------------------------------------------------------------------------- |
| 221 | 221 |
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride) |
| 222 | 222 |
;----------------------------------------------------------------------------- |
| 223 |
-;TODO: more AVX here |
|
| 224 | 223 |
%macro PRED4x4_DL 1 |
| 225 | 224 |
cglobal pred4x4_down_left_10_%1, 3,3 |
| 226 | 225 |
sub r0, r2 |
| 227 |
- movq m1, [r0] |
|
| 228 |
- movhps m1, [r1] |
|
| 229 |
- pslldq m5, m1, 2 |
|
| 230 |
- pxor m2, m5, m1 |
|
| 231 |
- psrldq m2, 2 |
|
| 232 |
- pxor m3, m1, m2 |
|
| 233 |
- PRED4x4_LOWPASS m0, m5, m3, m1 |
|
| 226 |
+ movq m0, [r0] |
|
| 227 |
+ movhps m0, [r1] |
|
| 228 |
+ psrldq m2, m0, 2 |
|
| 229 |
+ pslldq m3, m0, 2 |
|
| 230 |
+ pshufhw m2, m2, 10100100b |
|
| 231 |
+ PRED4x4_LOWPASS m0, m3, m2, m0 |
|
| 234 | 232 |
lea r1, [r0+r2*2] |
| 235 | 233 |
movhps [r1+r2*2], m0 |
| 236 | 234 |
psrldq m0, 2 |
| ... | ... |
@@ -257,10 +251,10 @@ cglobal pred4x4_vertical_left_10_%1, 3,3 |
| 257 | 257 |
sub r0, r2 |
| 258 | 258 |
movu m1, [r0] |
| 259 | 259 |
movhps m1, [r1] |
| 260 |
- psrldq m3, m1, 2 |
|
| 260 |
+ psrldq m0, m1, 2 |
|
| 261 | 261 |
psrldq m2, m1, 4 |
| 262 |
- pavgw m4, m3, m1 |
|
| 263 |
- PRED4x4_LOWPASS m0, m1, m2, m3 |
|
| 262 |
+ pavgw m4, m0, m1 |
|
| 263 |
+ PRED4x4_LOWPASS m0, m1, m2, m0 |
|
| 264 | 264 |
lea r1, [r0+r2*2] |
| 265 | 265 |
movq [r0+r2*1], m4 |
| 266 | 266 |
movq [r0+r2*2], m0 |
| ... | ... |
@@ -298,13 +292,13 @@ cglobal pred4x4_horizontal_up_10_mmxext, 3,3 |
| 298 | 298 |
pavgw m2, m0 |
| 299 | 299 |
|
| 300 | 300 |
pshufw m5, m0, 11111110b |
| 301 |
- PRED4x4_LOWPASS m3, m0, m5, m1 |
|
| 301 |
+ PRED4x4_LOWPASS m1, m0, m5, m1 |
|
| 302 | 302 |
movq m6, m2 |
| 303 |
- punpcklwd m6, m3 |
|
| 303 |
+ punpcklwd m6, m1 |
|
| 304 | 304 |
movq [r0+r2*1], m6 |
| 305 | 305 |
psrlq m2, 16 |
| 306 |
- psrlq m3, 16 |
|
| 307 |
- punpcklwd m2, m3 |
|
| 306 |
+ psrlq m1, 16 |
|
| 307 |
+ punpcklwd m2, m1 |
|
| 308 | 308 |
movq [r0+r2*2], m2 |
| 309 | 309 |
psrlq m2, 32 |
| 310 | 310 |
movd [r1+r2*1], m2 |
| ... | ... |
@@ -333,7 +327,7 @@ cglobal pred8x8_vertical_10_sse2, 2,2 |
| 333 | 333 |
;----------------------------------------------------------------------------- |
| 334 | 334 |
INIT_XMM |
| 335 | 335 |
cglobal pred8x8_horizontal_10_sse2, 2,3 |
| 336 |
- mov r2, 4 |
|
| 336 |
+ mov r2d, 4 |
|
| 337 | 337 |
.loop: |
| 338 | 338 |
movq m0, [r0+r1*0-8] |
| 339 | 339 |
movq m1, [r0+r1*1-8] |
| ... | ... |
@@ -344,7 +338,7 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 |
| 344 | 344 |
mova [r0+r1*0], m0 |
| 345 | 345 |
mova [r0+r1*1], m1 |
| 346 | 346 |
lea r0, [r0+r1*2] |
| 347 |
- dec r2 |
|
| 347 |
+ dec r2d |
|
| 348 | 348 |
jg .loop |
| 349 | 349 |
REP_RET |
| 350 | 350 |
|
| ... | ... |
@@ -362,53 +356,53 @@ cglobal pred8x8_horizontal_10_sse2, 2,3 |
| 362 | 362 |
%endmacro |
| 363 | 363 |
|
| 364 | 364 |
%macro PRED8x8_DC 2 |
| 365 |
-cglobal pred8x8_dc_10_%1, 2,4 |
|
| 366 |
-%ifdef ARCH_X86_64 |
|
| 367 |
-%define t0 r10 |
|
| 368 |
-%else |
|
| 369 |
-%define t0 r0m |
|
| 370 |
-%endif |
|
| 365 |
+cglobal pred8x8_dc_10_%1, 2,6 |
|
| 371 | 366 |
sub r0, r1 |
| 372 | 367 |
pxor m4, m4 |
| 373 | 368 |
movq m0, [r0+0] |
| 374 | 369 |
movq m1, [r0+8] |
| 375 |
- HADDW m0, m2 |
|
| 376 |
- mov t0, r0 |
|
| 377 |
- HADDW m1, m2 |
|
| 370 |
+%if mmsize==16 |
|
| 371 |
+ punpcklwd m0, m1 |
|
| 372 |
+ movhlps m1, m0 |
|
| 373 |
+ paddw m0, m1 |
|
| 374 |
+%else |
|
| 375 |
+ pshufw m2, m0, 00001110b |
|
| 376 |
+ pshufw m3, m1, 00001110b |
|
| 377 |
+ paddw m0, m2 |
|
| 378 |
+ paddw m1, m3 |
|
| 379 |
+ punpcklwd m0, m1 |
|
| 380 |
+%endif |
|
| 381 |
+ %2 m2, m0, 00001110b |
|
| 382 |
+ paddw m0, m2 |
|
| 378 | 383 |
|
| 384 |
+ lea r5, [r1*3] |
|
| 385 |
+ lea r4, [r0+r1*4] |
|
| 379 | 386 |
movzx r2d, word [r0+r1*1-2] |
| 380 | 387 |
movzx r3d, word [r0+r1*2-2] |
| 381 |
- lea r0, [r0+r1*2] |
|
| 382 | 388 |
add r2d, r3d |
| 383 |
- movzx r3d, word [r0+r1*1-2] |
|
| 389 |
+ movzx r3d, word [r0+r5*1-2] |
|
| 384 | 390 |
add r2d, r3d |
| 385 |
- movzx r3d, word [r0+r1*2-2] |
|
| 391 |
+ movzx r3d, word [r4-2] |
|
| 386 | 392 |
add r2d, r3d |
| 387 |
- lea r0, [r0+r1*2] |
|
| 388 | 393 |
movd m2, r2d ; s2 |
| 389 | 394 |
|
| 390 |
- movzx r2d, word [r0+r1*1-2] |
|
| 391 |
- movzx r3d, word [r0+r1*2-2] |
|
| 392 |
- lea r0, [r0+r1*2] |
|
| 395 |
+ movzx r2d, word [r4+r1*1-2] |
|
| 396 |
+ movzx r3d, word [r4+r1*2-2] |
|
| 393 | 397 |
add r2d, r3d |
| 394 |
- movzx r3d, word [r0+r1*1-2] |
|
| 398 |
+ movzx r3d, word [r4+r5*1-2] |
|
| 395 | 399 |
add r2d, r3d |
| 396 |
- movzx r3d, word [r0+r1*2-2] |
|
| 400 |
+ movzx r3d, word [r4+r1*4-2] |
|
| 397 | 401 |
add r2d, r3d |
| 398 | 402 |
movd m3, r2d ; s3 |
| 399 | 403 |
|
| 400 |
- punpcklwd m0, m1 |
|
| 401 |
- mov r0, t0 |
|
| 402 | 404 |
punpcklwd m2, m3 |
| 403 | 405 |
punpckldq m0, m2 ; s0, s1, s2, s3 |
| 404 | 406 |
%2 m3, m0, 11110110b ; s2, s1, s3, s3 |
| 405 |
- lea r2, [r1+r1*2] |
|
| 406 | 407 |
%2 m0, m0, 01110100b ; s0, s1, s3, s1 |
| 407 | 408 |
paddw m0, m3 |
| 408 |
- lea r3, [r0+r1*4] |
|
| 409 | 409 |
psrlw m0, 2 |
| 410 | 410 |
pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 |
| 411 |
-%ifidn %1, sse2 |
|
| 411 |
+%if mmsize==16 |
|
| 412 | 412 |
punpcklwd m0, m0 |
| 413 | 413 |
pshufd m3, m0, 11111010b |
| 414 | 414 |
punpckldq m0, m0 |
| ... | ... |
@@ -421,12 +415,12 @@ cglobal pred8x8_dc_10_%1, 2,4 |
| 421 | 421 |
%endif |
| 422 | 422 |
MOV8 r0+r1*1, m1, m2 |
| 423 | 423 |
MOV8 r0+r1*2, m1, m2 |
| 424 |
- MOV8 r0+r2*1, m1, m2 |
|
| 424 |
+ MOV8 r0+r5*1, m1, m2 |
|
| 425 | 425 |
MOV8 r0+r1*4, m1, m2 |
| 426 |
- MOV8 r3+r1*1, m3, m4 |
|
| 427 |
- MOV8 r3+r1*2, m3, m4 |
|
| 428 |
- MOV8 r3+r2*1, m3, m4 |
|
| 429 |
- MOV8 r3+r1*4, m3, m4 |
|
| 426 |
+ MOV8 r4+r1*1, m3, m4 |
|
| 427 |
+ MOV8 r4+r1*2, m3, m4 |
|
| 428 |
+ MOV8 r4+r5*1, m3, m4 |
|
| 429 |
+ MOV8 r4+r1*4, m3, m4 |
|
| 430 | 430 |
RET |
| 431 | 431 |
%endmacro |
| 432 | 432 |
|
| ... | ... |
@@ -438,39 +432,29 @@ PRED8x8_DC sse2 , pshuflw |
| 438 | 438 |
;----------------------------------------------------------------------------- |
| 439 | 439 |
; void pred8x8_top_dc(pixel *src, int stride) |
| 440 | 440 |
;----------------------------------------------------------------------------- |
| 441 |
-%macro PRED8x8_TOP_DC 2 |
|
| 442 |
-cglobal pred8x8_top_dc_10_%1, 2,4 |
|
| 441 |
+INIT_XMM |
|
| 442 |
+cglobal pred8x8_top_dc_10_sse2, 2,4 |
|
| 443 | 443 |
sub r0, r1 |
| 444 |
- movq m0, [r0+0] |
|
| 445 |
- movq m1, [r0+8] |
|
| 446 |
- HADDW m0, m2 |
|
| 447 |
- HADDW m1, m3 |
|
| 448 |
- lea r2, [r1+r1*2] |
|
| 449 |
- paddw m0, [pw_2] |
|
| 450 |
- paddw m1, [pw_2] |
|
| 444 |
+ mova m0, [r0] |
|
| 445 |
+ pshuflw m1, m0, 0x4e |
|
| 446 |
+ pshufhw m1, m1, 0x4e |
|
| 447 |
+ paddw m0, m1 |
|
| 448 |
+ pshuflw m1, m0, 0xb1 |
|
| 449 |
+ pshufhw m1, m1, 0xb1 |
|
| 450 |
+ paddw m0, m1 |
|
| 451 |
+ lea r2, [r1*3] |
|
| 451 | 452 |
lea r3, [r0+r1*4] |
| 453 |
+ paddw m0, [pw_2] |
|
| 452 | 454 |
psrlw m0, 2 |
| 453 |
- psrlw m1, 2 |
|
| 454 |
- %2 m0, m0, 0 |
|
| 455 |
- %2 m1, m1, 0 |
|
| 456 |
-%ifidn %1, sse2 |
|
| 457 |
- punpcklqdq m0, m1 |
|
| 458 |
-%endif |
|
| 459 |
- MOV8 r0+r1*1, m0, m1 |
|
| 460 |
- MOV8 r0+r1*2, m0, m1 |
|
| 461 |
- MOV8 r0+r2*1, m0, m1 |
|
| 462 |
- MOV8 r0+r1*4, m0, m1 |
|
| 463 |
- MOV8 r3+r1*1, m0, m1 |
|
| 464 |
- MOV8 r3+r1*2, m0, m1 |
|
| 465 |
- MOV8 r3+r2*1, m0, m1 |
|
| 466 |
- MOV8 r3+r1*4, m0, m1 |
|
| 455 |
+ mova [r0+r1*1], m0 |
|
| 456 |
+ mova [r0+r1*2], m0 |
|
| 457 |
+ mova [r0+r2*1], m0 |
|
| 458 |
+ mova [r0+r1*4], m0 |
|
| 459 |
+ mova [r3+r1*1], m0 |
|
| 460 |
+ mova [r3+r1*2], m0 |
|
| 461 |
+ mova [r3+r2*1], m0 |
|
| 462 |
+ mova [r3+r1*4], m0 |
|
| 467 | 463 |
RET |
| 468 |
-%endmacro |
|
| 469 |
- |
|
| 470 |
-INIT_MMX |
|
| 471 |
-PRED8x8_TOP_DC mmxext, pshufw |
|
| 472 |
-INIT_XMM |
|
| 473 |
-PRED8x8_TOP_DC sse2 , pshuflw |
|
| 474 | 464 |
|
| 475 | 465 |
;----------------------------------------------------------------------------- |
| 476 | 466 |
; void pred8x8_plane(pixel *src, int stride) |
| ... | ... |
@@ -478,7 +462,7 @@ PRED8x8_TOP_DC sse2 , pshuflw |
| 478 | 478 |
INIT_XMM |
| 479 | 479 |
cglobal pred8x8_plane_10_sse2, 2,7,7 |
| 480 | 480 |
sub r0, r1 |
| 481 |
- lea r2, [r1+r1*2] |
|
| 481 |
+ lea r2, [r1*3] |
|
| 482 | 482 |
lea r3, [r0+r1*4] |
| 483 | 483 |
mova m2, [r0] |
| 484 | 484 |
pmaddwd m2, [pw_m32101234] |
| ... | ... |
@@ -500,7 +484,7 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 |
| 500 | 500 |
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] |
| 501 | 501 |
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] |
| 502 | 502 |
sub r5d, r6d |
| 503 |
- lea r5d, [r5+r5*2] |
|
| 503 |
+ lea r5d, [r5*3] |
|
| 504 | 504 |
add r4d, r5d |
| 505 | 505 |
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] |
| 506 | 506 |
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] |
| ... | ... |
@@ -540,8 +524,8 @@ cglobal pred8x8_plane_10_sse2, 2,7,7 |
| 540 | 540 |
;----------------------------------------------------------------------------- |
| 541 | 541 |
%macro PRED8x8L_128_DC 1 |
| 542 | 542 |
cglobal pred8x8l_128_dc_10_%1, 4,4 |
| 543 |
- mova m0, [pw_512] |
|
| 544 |
- lea r1, [r3+r3*2] |
|
| 543 |
+ mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) |
|
| 544 |
+ lea r1, [r3*3] |
|
| 545 | 545 |
lea r2, [r0+r3*4] |
| 546 | 546 |
MOV8 r0+r3*0, m0, m0 |
| 547 | 547 |
MOV8 r0+r3*1, m0, m0 |
| ... | ... |
@@ -565,37 +549,17 @@ PRED8x8L_128_DC sse2 |
| 565 | 565 |
%macro PRED8x8L_TOP_DC 1 |
| 566 | 566 |
cglobal pred8x8l_top_dc_10_%1, 4,4,6 |
| 567 | 567 |
sub r0, r3 |
| 568 |
- pxor m7, m7 |
|
| 569 |
- mova m0, [r0-16] |
|
| 570 |
- mova m3, [r0] |
|
| 571 |
- mova m1, [r0+16] |
|
| 572 |
- mova m2, m3 |
|
| 573 |
- mova m4, m3 |
|
| 574 |
- PALIGNR m2, m0, 14, m0 |
|
| 575 |
- PALIGNR m1, m4, 2, m4 |
|
| 576 |
- test r1, r1 ; top_left |
|
| 577 |
- jz .fix_lt_2 |
|
| 578 |
- test r2, r2 ; top_right |
|
| 579 |
- jz .fix_tr_1 |
|
| 580 |
- jmp .body |
|
| 581 |
-.fix_lt_2: |
|
| 582 |
- mova m5, m3 |
|
| 583 |
- pxor m5, m2 |
|
| 584 |
- pslldq m5, 14 |
|
| 585 |
- psrldq m5, 14 |
|
| 586 |
- pxor m2, m5 |
|
| 587 |
- test r2, r2 ; top_right |
|
| 588 |
- jnz .body |
|
| 589 |
-.fix_tr_1: |
|
| 590 |
- mova m5, m3 |
|
| 591 |
- pxor m5, m1 |
|
| 592 |
- psrldq m5, 14 |
|
| 593 |
- pslldq m5, 14 |
|
| 594 |
- pxor m1, m5 |
|
| 595 |
-.body |
|
| 596 |
- lea r1, [r3+r3*2] |
|
| 568 |
+ mova m0, [r0] |
|
| 569 |
+ shr r1d, 14 |
|
| 570 |
+ shr r2d, 13 |
|
| 571 |
+ neg r1 |
|
| 572 |
+ pslldq m1, m0, 2 |
|
| 573 |
+ psrldq m2, m0, 2 |
|
| 574 |
+ pinsrw m1, [r0+r1], 0 |
|
| 575 |
+ pinsrw m2, [r0+r2+14], 7 |
|
| 576 |
+ lea r1, [r3*3] |
|
| 597 | 577 |
lea r2, [r0+r3*4] |
| 598 |
- PRED4x4_LOWPASS m0, m2, m1, m3 |
|
| 578 |
+ PRED4x4_LOWPASS m0, m2, m1, m0 |
|
| 599 | 579 |
HADDW m0, m1 |
| 600 | 580 |
paddw m0, [pw_4] |
| 601 | 581 |
psrlw m0, 3 |
| ... | ... |
@@ -612,110 +576,70 @@ cglobal pred8x8l_top_dc_10_%1, 4,4,6 |
| 612 | 612 |
%endmacro |
| 613 | 613 |
|
| 614 | 614 |
INIT_XMM |
| 615 |
-%define PALIGNR PALIGNR_MMX |
|
| 616 | 615 |
PRED8x8L_TOP_DC sse2 |
| 617 |
-%define PALIGNR PALIGNR_SSSE3 |
|
| 618 |
-PRED8x8L_TOP_DC ssse3 |
|
| 616 |
+%ifdef HAVE_AVX |
|
| 617 |
+INIT_AVX |
|
| 618 |
+PRED8x8L_TOP_DC avx |
|
| 619 |
+%endif |
|
| 619 | 620 |
|
| 620 | 621 |
;----------------------------------------------------------------------------- |
| 621 | 622 |
;void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) |
| 622 | 623 |
;----------------------------------------------------------------------------- |
| 623 | 624 |
;TODO: see if scalar is faster |
| 624 | 625 |
%macro PRED8x8L_DC 1 |
| 625 |
-cglobal pred8x8l_dc_10_%1, 4,5,8 |
|
| 626 |
+cglobal pred8x8l_dc_10_%1, 4,6,6 |
|
| 626 | 627 |
sub r0, r3 |
| 627 |
- lea r4, [r0+r3*2] |
|
| 628 |
- mova m0, [r0+r3*1-16] |
|
| 629 |
- punpckhwd m0, [r0+r3*0-16] |
|
| 630 |
- mova m1, [r4+r3*1-16] |
|
| 631 |
- punpckhwd m1, [r0+r3*2-16] |
|
| 632 |
- mov r4, r0 |
|
| 628 |
+ lea r4, [r0+r3*4] |
|
| 629 |
+ lea r5, [r3*3] |
|
| 630 |
+ mova m0, [r0+r3*2-16] |
|
| 631 |
+ punpckhwd m0, [r0+r3*1-16] |
|
| 632 |
+ mova m1, [r4+r3*0-16] |
|
| 633 |
+ punpckhwd m1, [r0+r5*1-16] |
|
| 633 | 634 |
punpckhdq m1, m0 |
| 634 |
- lea r0, [r0+r3*4] |
|
| 635 |
- mova m2, [r0+r3*1-16] |
|
| 636 |
- punpckhwd m2, [r0+r3*0-16] |
|
| 637 |
- lea r0, [r0+r3*2] |
|
| 638 |
- mova m3, [r0+r3*1-16] |
|
| 639 |
- punpckhwd m3, [r0+r3*0-16] |
|
| 635 |
+ mova m2, [r4+r3*2-16] |
|
| 636 |
+ punpckhwd m2, [r4+r3*1-16] |
|
| 637 |
+ mova m3, [r4+r3*4-16] |
|
| 638 |
+ punpckhwd m3, [r4+r5*1-16] |
|
| 640 | 639 |
punpckhdq m3, m2 |
| 641 | 640 |
punpckhqdq m3, m1 |
| 642 |
- lea r0, [r0+r3*2] |
|
| 643 |
- mova m0, [r0+r3*0-16] |
|
| 644 |
- mova m1, [r4] |
|
| 645 |
- mov r0, r4 |
|
| 646 |
- mova m4, m3 |
|
| 647 |
- mova m2, m3 |
|
| 648 |
- PALIGNR m4, m0, 14, m0 |
|
| 649 |
- PALIGNR m1, m2, 2, m2 |
|
| 650 |
- test r1, r1 |
|
| 651 |
- jnz .do_left |
|
| 652 |
-.fix_lt_1: |
|
| 653 |
- mova m5, m3 |
|
| 654 |
- pxor m5, m4 |
|
| 655 |
- psrldq m5, 14 |
|
| 656 |
- pslldq m5, 12 |
|
| 657 |
- pxor m1, m5 |
|
| 658 |
- jmp .do_left |
|
| 659 |
-.fix_lt_2: |
|
| 660 |
- mova m5, m3 |
|
| 661 |
- pxor m5, m2 |
|
| 662 |
- pslldq m5, 14 |
|
| 663 |
- psrldq m5, 14 |
|
| 664 |
- pxor m2, m5 |
|
| 665 |
- test r2, r2 |
|
| 666 |
- jnz .body |
|
| 667 |
-.fix_tr_1: |
|
| 668 |
- mova m5, m3 |
|
| 669 |
- pxor m5, m1 |
|
| 670 |
- psrldq m5, 14 |
|
| 671 |
- pslldq m5, 14 |
|
| 672 |
- pxor m1, m5 |
|
| 673 |
- jmp .body |
|
| 674 |
-.do_left: |
|
| 675 |
- mova m0, m4 |
|
| 676 |
- PRED4x4_LOWPASS m2, m1, m4, m3 |
|
| 677 |
- mova m4, m0 |
|
| 678 |
- mova m7, m2 |
|
| 679 |
- PRED4x4_LOWPASS m1, m3, m0, m4 |
|
| 680 |
- pslldq m1, 14 |
|
| 681 |
- PALIGNR m7, m1, 14, m3 |
|
| 682 |
- mova m0, [r0-16] |
|
| 683 |
- mova m3, [r0] |
|
| 684 |
- mova m1, [r0+16] |
|
| 685 |
- mova m2, m3 |
|
| 686 |
- mova m4, m3 |
|
| 687 |
- PALIGNR m2, m0, 14, m0 |
|
| 688 |
- PALIGNR m1, m4, 2, m4 |
|
| 689 |
- test r1, r1 |
|
| 690 |
- jz .fix_lt_2 |
|
| 691 |
- test r2, r2 |
|
| 692 |
- jz .fix_tr_1 |
|
| 693 |
-.body |
|
| 694 |
- lea r1, [r3+r3*2] |
|
| 695 |
- PRED4x4_LOWPASS m6, m2, m1, m3 |
|
| 696 |
- HADDW m7, m0 |
|
| 697 |
- HADDW m6, m0 |
|
| 698 |
- lea r2, [r0+r3*4] |
|
| 699 |
- paddw m7, [pw_8] |
|
| 700 |
- paddw m7, m6 |
|
| 701 |
- psrlw m7, 4 |
|
| 702 |
- SPLATW m7, m7 |
|
| 703 |
- mova [r0+r3*1], m7 |
|
| 704 |
- mova [r0+r3*2], m7 |
|
| 705 |
- mova [r0+r1*1], m7 |
|
| 706 |
- mova [r0+r3*4], m7 |
|
| 707 |
- mova [r2+r3*1], m7 |
|
| 708 |
- mova [r2+r3*2], m7 |
|
| 709 |
- mova [r2+r1*1], m7 |
|
| 710 |
- mova [r2+r3*4], m7 |
|
| 641 |
+ mova m0, [r0] |
|
| 642 |
+ shr r1d, 14 |
|
| 643 |
+ shr r2d, 13 |
|
| 644 |
+ neg r1 |
|
| 645 |
+ pslldq m1, m0, 2 |
|
| 646 |
+ psrldq m2, m0, 2 |
|
| 647 |
+ pinsrw m1, [r0+r1], 0 |
|
| 648 |
+ pinsrw m2, [r0+r2+14], 7 |
|
| 649 |
+ not r1 |
|
| 650 |
+ and r1, r3 |
|
| 651 |
+ pslldq m4, m3, 2 |
|
| 652 |
+ psrldq m5, m3, 2 |
|
| 653 |
+ pshuflw m4, m4, 11100101b |
|
| 654 |
+ pinsrw m5, [r0+r1-2], 7 |
|
| 655 |
+ PRED4x4_LOWPASS m3, m4, m5, m3 |
|
| 656 |
+ PRED4x4_LOWPASS m0, m2, m1, m0 |
|
| 657 |
+ paddw m0, m3 |
|
| 658 |
+ HADDW m0, m1 |
|
| 659 |
+ paddw m0, [pw_8] |
|
| 660 |
+ psrlw m0, 4 |
|
| 661 |
+ SPLATW m0, m0 |
|
| 662 |
+ mova [r0+r3*1], m0 |
|
| 663 |
+ mova [r0+r3*2], m0 |
|
| 664 |
+ mova [r0+r5*1], m0 |
|
| 665 |
+ mova [r0+r3*4], m0 |
|
| 666 |
+ mova [r4+r3*1], m0 |
|
| 667 |
+ mova [r4+r3*2], m0 |
|
| 668 |
+ mova [r4+r5*1], m0 |
|
| 669 |
+ mova [r4+r3*4], m0 |
|
| 711 | 670 |
RET |
| 712 | 671 |
%endmacro |
| 713 | 672 |
|
| 714 | 673 |
INIT_XMM |
| 715 |
-%define PALIGNR PALIGNR_MMX |
|
| 716 | 674 |
PRED8x8L_DC sse2 |
| 717 |
-%define PALIGNR PALIGNR_SSSE3 |
|
| 718 |
-PRED8x8L_DC ssse3 |
|
| 675 |
+%ifdef HAVE_AVX |
|
| 676 |
+INIT_AVX |
|
| 677 |
+PRED8x8L_DC avx |
|
| 678 |
+%endif |
|
| 719 | 679 |
|
| 720 | 680 |
;----------------------------------------------------------------------------- |
| 721 | 681 |
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) |
| ... | ... |
@@ -723,36 +647,17 @@ PRED8x8L_DC ssse3 |
| 723 | 723 |
%macro PRED8x8L_VERTICAL 1 |
| 724 | 724 |
cglobal pred8x8l_vertical_10_%1, 4,4,6 |
| 725 | 725 |
sub r0, r3 |
| 726 |
- mova m0, [r0-16] |
|
| 727 |
- mova m3, [r0] |
|
| 728 |
- mova m1, [r0+16] |
|
| 729 |
- mova m2, m3 |
|
| 730 |
- mova m4, m3 |
|
| 731 |
- PALIGNR m2, m0, 14, m0 |
|
| 732 |
- PALIGNR m1, m4, 2, m4 |
|
| 733 |
- test r1, r1 ; top_left |
|
| 734 |
- jz .fix_lt_2 |
|
| 735 |
- test r2, r2 ; top_right |
|
| 736 |
- jz .fix_tr_1 |
|
| 737 |
- jmp .body |
|
| 738 |
-.fix_lt_2: |
|
| 739 |
- mova m5, m3 |
|
| 740 |
- pxor m5, m2 |
|
| 741 |
- pslldq m5, 14 |
|
| 742 |
- psrldq m5, 14 |
|
| 743 |
- pxor m2, m5 |
|
| 744 |
- test r2, r2 ; top_right |
|
| 745 |
- jnz .body |
|
| 746 |
-.fix_tr_1: |
|
| 747 |
- mova m5, m3 |
|
| 748 |
- pxor m5, m1 |
|
| 749 |
- psrldq m5, 14 |
|
| 750 |
- pslldq m5, 14 |
|
| 751 |
- pxor m1, m5 |
|
| 752 |
-.body |
|
| 753 |
- lea r1, [r3+r3*2] |
|
| 726 |
+ mova m0, [r0] |
|
| 727 |
+ shr r1d, 14 |
|
| 728 |
+ shr r2d, 13 |
|
| 729 |
+ neg r1 |
|
| 730 |
+ pslldq m1, m0, 2 |
|
| 731 |
+ psrldq m2, m0, 2 |
|
| 732 |
+ pinsrw m1, [r0+r1], 0 |
|
| 733 |
+ pinsrw m2, [r0+r2+14], 7 |
|
| 734 |
+ lea r1, [r3*3] |
|
| 754 | 735 |
lea r2, [r0+r3*4] |
| 755 |
- PRED4x4_LOWPASS m0, m2, m1, m3 |
|
| 736 |
+ PRED4x4_LOWPASS m0, m2, m1, m0 |
|
| 756 | 737 |
mova [r0+r3*1], m0 |
| 757 | 738 |
mova [r0+r3*2], m0 |
| 758 | 739 |
mova [r0+r1*1], m0 |
| ... | ... |
@@ -765,70 +670,56 @@ cglobal pred8x8l_vertical_10_%1, 4,4,6 |
| 765 | 765 |
%endmacro |
| 766 | 766 |
|
| 767 | 767 |
INIT_XMM |
| 768 |
-%define PALIGNR PALIGNR_MMX |
|
| 769 | 768 |
PRED8x8L_VERTICAL sse2 |
| 770 |
-%define PALIGNR PALIGNR_SSSE3 |
|
| 771 |
-PRED8x8L_VERTICAL ssse3 |
|
| 769 |
+%ifdef HAVE_AVX |
|
| 770 |
+INIT_AVX |
|
| 771 |
+PRED8x8L_VERTICAL avx |
|
| 772 |
+%endif |
|
| 772 | 773 |
|
| 773 | 774 |
;----------------------------------------------------------------------------- |
| 774 | 775 |
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride) |
| 775 | 776 |
;----------------------------------------------------------------------------- |
| 776 | 777 |
%macro PRED8x8L_HORIZONTAL 1 |
| 777 |
-cglobal pred8x8l_horizontal_10_%1, 4,4,8 |
|
| 778 |
- sub r0, r3 |
|
| 779 |
- lea r2, [r0+r3*2] |
|
| 780 |
- mova m0, [r0+r3*1-16] |
|
| 781 |
- test r1, r1 |
|
| 782 |
- lea r1, [r0+r3] |
|
| 783 |
- cmovnz r1, r0 |
|
| 784 |
- punpckhwd m0, [r1+r3*0-16] |
|
| 785 |
- mova m1, [r2+r3*1-16] |
|
| 786 |
- punpckhwd m1, [r0+r3*2-16] |
|
| 787 |
- mov r2, r0 |
|
| 778 |
+cglobal pred8x8l_horizontal_10_%1, 4,4,5 |
|
| 779 |
+ mova m0, [r0-16] |
|
| 780 |
+ shr r1d, 14 |
|
| 781 |
+ dec r1 |
|
| 782 |
+ and r1, r3 |
|
| 783 |
+ sub r1, r3 |
|
| 784 |
+ punpckhwd m0, [r0+r1-16] |
|
| 785 |
+ mova m1, [r0+r3*2-16] |
|
| 786 |
+ punpckhwd m1, [r0+r3*1-16] |
|
| 787 |
+ lea r2, [r0+r3*4] |
|
| 788 |
+ lea r1, [r3*3] |
|
| 788 | 789 |
punpckhdq m1, m0 |
| 789 |
- lea r0, [r0+r3*4] |
|
| 790 |
- mova m2, [r0+r3*1-16] |
|
| 791 |
- punpckhwd m2, [r0+r3*0-16] |
|
| 792 |
- lea r0, [r0+r3*2] |
|
| 793 |
- mova m3, [r0+r3*1-16] |
|
| 794 |
- punpckhwd m3, [r0+r3*0-16] |
|
| 790 |
+ mova m2, [r2+r3*0-16] |
|
| 791 |
+ punpckhwd m2, [r0+r1-16] |
|
| 792 |
+ mova m3, [r2+r3*2-16] |
|
| 793 |
+ punpckhwd m3, [r2+r3*1-16] |
|
| 795 | 794 |
punpckhdq m3, m2 |
| 796 | 795 |
punpckhqdq m3, m1 |
| 797 |
- lea r0, [r0+r3*2] |
|
| 798 |
- mova m0, [r0+r3*0-16] |
|
| 799 |
- mova m1, [r1+r3*0-16] |
|
| 800 |
- mov r0, r2 |
|
| 801 |
- mova m4, m3 |
|
| 802 |
- mova m2, m3 |
|
| 803 |
- PALIGNR m4, m0, 14, m0 |
|
| 804 |
- PALIGNR m1, m2, 2, m2 |
|
| 805 |
- mova m0, m4 |
|
| 806 |
- PRED4x4_LOWPASS m2, m1, m4, m3 |
|
| 807 |
- mova m4, m0 |
|
| 808 |
- mova m7, m2 |
|
| 809 |
- PRED4x4_LOWPASS m1, m3, m0, m4 |
|
| 810 |
- pslldq m1, 14 |
|
| 811 |
- PALIGNR m7, m1, 14, m3 |
|
| 812 |
- lea r1, [r3+r3*2] |
|
| 813 |
- punpckhwd m3, m7, m7 |
|
| 814 |
- punpcklwd m7, m7 |
|
| 796 |
+ PALIGNR m4, m3, [r2+r1-16], 14, m0 |
|
| 797 |
+ pslldq m0, m4, 2 |
|
| 798 |
+ pshuflw m0, m0, 11100101b |
|
| 799 |
+ PRED4x4_LOWPASS m4, m3, m0, m4 |
|
| 800 |
+ punpckhwd m3, m4, m4 |
|
| 801 |
+ punpcklwd m4, m4 |
|
| 815 | 802 |
pshufd m0, m3, 0xff |
| 816 | 803 |
pshufd m1, m3, 0xaa |
| 817 |
- lea r2, [r0+r3*4] |
|
| 818 | 804 |
pshufd m2, m3, 0x55 |
| 819 | 805 |
pshufd m3, m3, 0x00 |
| 820 |
- pshufd m4, m7, 0xff |
|
| 821 |
- pshufd m5, m7, 0xaa |
|
| 822 |
- pshufd m6, m7, 0x55 |
|
| 823 |
- pshufd m7, m7, 0x00 |
|
| 824 |
- mova [r0+r3*1], m0 |
|
| 825 |
- mova [r0+r3*2], m1 |
|
| 826 |
- mova [r0+r1*1], m2 |
|
| 827 |
- mova [r0+r3*4], m3 |
|
| 828 |
- mova [r2+r3*1], m4 |
|
| 829 |
- mova [r2+r3*2], m5 |
|
| 830 |
- mova [r2+r1*1], m6 |
|
| 831 |
- mova [r2+r3*4], m7 |
|
| 806 |
+ mova [r0+r3*0], m0 |
|
| 807 |
+ mova [r0+r3*1], m1 |
|
| 808 |
+ mova [r0+r3*2], m2 |
|
| 809 |
+ mova [r0+r1*1], m3 |
|
| 810 |
+ pshufd m0, m4, 0xff |
|
| 811 |
+ pshufd m1, m4, 0xaa |
|
| 812 |
+ pshufd m2, m4, 0x55 |
|
| 813 |
+ pshufd m3, m4, 0x00 |
|
| 814 |
+ mova [r2+r3*0], m0 |
|
| 815 |
+ mova [r2+r3*1], m1 |
|
| 816 |
+ mova [r2+r3*2], m2 |
|
| 817 |
+ mova [r2+r1*1], m3 |
|
| 832 | 818 |
RET |
| 833 | 819 |
%endmacro |
| 834 | 820 |
|
| ... | ... |
@@ -837,116 +728,68 @@ INIT_XMM |
| 837 | 837 |
PRED8x8L_HORIZONTAL sse2 |
| 838 | 838 |
%define PALIGNR PALIGNR_SSSE3 |
| 839 | 839 |
PRED8x8L_HORIZONTAL ssse3 |
| 840 |
+%ifdef HAVE_AVX |
|
| 841 |
+INIT_AVX |
|
| 842 |
+PRED8x8L_HORIZONTAL avx |
|
| 843 |
+%endif |
|
| 840 | 844 |
|
| 841 | 845 |
;----------------------------------------------------------------------------- |
| 842 | 846 |
;void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride) |
| 843 | 847 |
;----------------------------------------------------------------------------- |
| 844 | 848 |
%macro PRED8x8L_DOWN_LEFT 1 |
| 845 |
-cglobal pred8x8l_down_left_10_%1, 4,4,8 |
|
| 849 |
+cglobal pred8x8l_down_left_10_%1, 4,4,7 |
|
| 846 | 850 |
sub r0, r3 |
| 847 |
- mova m0, [r0-16] |
|
| 848 | 851 |
mova m3, [r0] |
| 852 |
+ shr r1d, 14 |
|
| 853 |
+ neg r1 |
|
| 854 |
+ shr r2d, 13 |
|
| 855 |
+ pslldq m1, m3, 2 |
|
| 856 |
+ psrldq m2, m3, 2 |
|
| 857 |
+ pinsrw m1, [r0+r1], 0 |
|
| 858 |
+ pinsrw m2, [r0+r2+14], 7 |
|
| 859 |
+ PRED4x4_LOWPASS m6, m2, m1, m3 |
|
| 860 |
+ jz .fix_tr ; flags from shr r2d |
|
| 849 | 861 |
mova m1, [r0+16] |
| 850 |
- mova m2, m3 |
|
| 851 |
- mova m4, m3 |
|
| 852 |
- PALIGNR m2, m0, 14, m0 |
|
| 853 |
- PALIGNR m1, m4, 2, m4 |
|
| 854 |
- test r1, r1 |
|
| 855 |
- jz .fix_lt_2 |
|
| 856 |
- test r2, r2 |
|
| 857 |
- jz .fix_tr_1 |
|
| 858 |
- jmp .do_top |
|
| 859 |
-.fix_lt_2: |
|
| 860 |
- mova m5, m3 |
|
| 861 |
- pxor m5, m2 |
|
| 862 |
- pslldq m5, 14 |
|
| 863 |
- psrldq m5, 14 |
|
| 864 |
- pxor m2, m5 |
|
| 865 |
- test r2, r2 |
|
| 866 |
- jnz .do_top |
|
| 867 |
-.fix_tr_1: |
|
| 868 |
- mova m5, m3 |
|
| 869 |
- pxor m5, m1 |
|
| 870 |
- psrldq m5, 14 |
|
| 871 |
- pslldq m5, 14 |
|
| 872 |
- pxor m1, m5 |
|
| 873 |
- jmp .do_top |
|
| 874 |
-.fix_tr_2: |
|
| 875 |
- punpckhwd m3, m3 |
|
| 876 |
- pshufd m1, m3, 0xFF |
|
| 877 |
- jmp .do_topright |
|
| 878 |
-.do_top: |
|
| 879 |
- PRED4x4_LOWPASS m4, m2, m1, m3 |
|
| 880 |
- mova m7, m4 |
|
| 881 |
- test r2, r2 |
|
| 882 |
- jz .fix_tr_2 |
|
| 883 |
- mova m0, [r0+16] |
|
| 884 |
- mova m5, m0 |
|
| 885 |
- mova m2, m0 |
|
| 886 |
- mova m4, m0 |
|
| 887 |
- psrldq m5, 14 |
|
| 888 |
- PALIGNR m2, m3, 14, m3 |
|
| 889 |
- PALIGNR m5, m4, 2, m4 |
|
| 890 |
- PRED4x4_LOWPASS m1, m2, m5, m0 |
|
| 862 |
+ psrldq m5, m1, 2 |
|
| 863 |
+ PALIGNR m2, m1, m3, 14, m3 |
|
| 864 |
+ pshufhw m5, m5, 10100100b |
|
| 865 |
+ PRED4x4_LOWPASS m1, m2, m5, m1 |
|
| 891 | 866 |
.do_topright: |
| 892 |
- lea r1, [r3+r3*2] |
|
| 893 |
- mova m6, m1 |
|
| 894 |
- psrldq m1, 14 |
|
| 895 |
- mova m4, m1 |
|
| 867 |
+ lea r1, [r3*3] |
|
| 868 |
+ psrldq m5, m1, 14 |
|
| 896 | 869 |
lea r2, [r0+r3*4] |
| 897 |
- mova m2, m6 |
|
| 898 |
- PALIGNR m2, m7, 2, m0 |
|
| 899 |
- mova m3, m6 |
|
| 900 |
- PALIGNR m3, m7, 14, m0 |
|
| 901 |
- PALIGNR m4, m6, 2, m0 |
|
| 902 |
- mova m5, m7 |
|
| 903 |
- mova m1, m7 |
|
| 904 |
- mova m7, m6 |
|
| 905 |
- pslldq m1, 2 |
|
| 906 |
- PRED4x4_LOWPASS m0, m1, m2, m5 |
|
| 907 |
- PRED4x4_LOWPASS m1, m3, m4, m7 |
|
| 870 |
+ PALIGNR m2, m1, m6, 2, m0 |
|
| 871 |
+ PALIGNR m3, m1, m6, 14, m0 |
|
| 872 |
+ PALIGNR m5, m1, 2, m0 |
|
| 873 |
+ pslldq m4, m6, 2 |
|
| 874 |
+ PRED4x4_LOWPASS m6, m4, m2, m6 |
|
| 875 |
+ PRED4x4_LOWPASS m1, m3, m5, m1 |
|
| 908 | 876 |
mova [r2+r3*4], m1 |
| 909 |
- mova m2, m0 |
|
| 910 |
- pslldq m1, 2 |
|
| 911 |
- psrldq m2, 14 |
|
| 912 |
- pslldq m0, 2 |
|
| 913 |
- por m1, m2 |
|
| 877 |
+ PALIGNR m1, m6, 14, m2 |
|
| 878 |
+ pslldq m6, 2 |
|
| 914 | 879 |
mova [r2+r1*1], m1 |
| 915 |
- mova m2, m0 |
|
| 916 |
- pslldq m1, 2 |
|
| 917 |
- psrldq m2, 14 |
|
| 918 |
- pslldq m0, 2 |
|
| 919 |
- por m1, m2 |
|
| 880 |
+ PALIGNR m1, m6, 14, m2 |
|
| 881 |
+ pslldq m6, 2 |
|
| 920 | 882 |
mova [r2+r3*2], m1 |
| 921 |
- mova m2, m0 |
|
| 922 |
- pslldq m1, 2 |
|
| 923 |
- psrldq m2, 14 |
|
| 924 |
- pslldq m0, 2 |
|
| 925 |
- por m1, m2 |
|
| 883 |
+ PALIGNR m1, m6, 14, m2 |
|
| 884 |
+ pslldq m6, 2 |
|
| 926 | 885 |
mova [r2+r3*1], m1 |
| 927 |
- mova m2, m0 |
|
| 928 |
- pslldq m1, 2 |
|
| 929 |
- psrldq m2, 14 |
|
| 930 |
- pslldq m0, 2 |
|
| 931 |
- por m1, m2 |
|
| 886 |
+ PALIGNR m1, m6, 14, m2 |
|
| 887 |
+ pslldq m6, 2 |
|
| 932 | 888 |
mova [r0+r3*4], m1 |
| 933 |
- mova m2, m0 |
|
| 934 |
- pslldq m1, 2 |
|
| 935 |
- psrldq m2, 14 |
|
| 936 |
- pslldq m0, 2 |
|
| 937 |
- por m1, m2 |
|
| 889 |
+ PALIGNR m1, m6, 14, m2 |
|
| 890 |
+ pslldq m6, 2 |
|
| 938 | 891 |
mova [r0+r1*1], m1 |
| 939 |
- mova m2, m0 |
|
| 940 |
- pslldq m1, 2 |
|
| 941 |
- psrldq m2, 14 |
|
| 942 |
- pslldq m0, 2 |
|
| 943 |
- por m1, m2 |
|
| 892 |
+ PALIGNR m1, m6, 14, m2 |
|
| 893 |
+ pslldq m6, 2 |
|
| 944 | 894 |
mova [r0+r3*2], m1 |
| 945 |
- pslldq m1, 2 |
|
| 946 |
- psrldq m0, 14 |
|
| 947 |
- por m1, m0 |
|
| 895 |
+ PALIGNR m1, m6, 14, m6 |
|
| 948 | 896 |
mova [r0+r3*1], m1 |
| 949 | 897 |
RET |
| 898 |
+.fix_tr: |
|
| 899 |
+ punpckhwd m3, m3 |
|
| 900 |
+ pshufd m1, m3, 0xFF |
|
| 901 |
+ jmp .do_topright |
|
| 950 | 902 |
%endmacro |
| 951 | 903 |
|
| 952 | 904 |
INIT_XMM |
| ... | ... |
@@ -954,139 +797,73 @@ INIT_XMM |
| 954 | 954 |
PRED8x8L_DOWN_LEFT sse2 |
| 955 | 955 |
%define PALIGNR PALIGNR_SSSE3 |
| 956 | 956 |
PRED8x8L_DOWN_LEFT ssse3 |
| 957 |
+%ifdef HAVE_AVX |
|
| 958 |
+INIT_AVX |
|
| 959 |
+PRED8x8L_DOWN_LEFT avx |
|
| 960 |
+%endif |
|
| 957 | 961 |
|
| 958 | 962 |
;----------------------------------------------------------------------------- |
| 959 |
-;void pred8x8l_down_right_mxext(pixel *src, int has_topleft, int has_topright, int stride) |
|
| 963 |
+;void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride) |
|
| 960 | 964 |
;----------------------------------------------------------------------------- |
| 961 | 965 |
%macro PRED8x8L_DOWN_RIGHT 1 |
| 966 |
+; The H.264 standard forbids this mode when has_topleft is false, |
|
| 967 |
+; so has_topleft does not need to be checked here. |
|
| 962 | 968 |
cglobal pred8x8l_down_right_10_%1, 4,5,8 |
| 963 | 969 |
sub r0, r3 |
| 964 |
- lea r4, [r0+r3*2] |
|
| 970 |
+ lea r4, [r0+r3*4] |
|
| 971 |
+ lea r1, [r3*3] |
|
| 965 | 972 |
mova m0, [r0+r3*1-16] |
| 966 | 973 |
punpckhwd m0, [r0+r3*0-16] |
| 967 |
- mova m1, [r4+r3*1-16] |
|
| 974 |
+ mova m1, [r0+r1*1-16] |
|
| 968 | 975 |
punpckhwd m1, [r0+r3*2-16] |
| 969 |
- mov r4, r0 |
|
| 970 | 976 |
punpckhdq m1, m0 |
| 971 |
- lea r0, [r0+r3*4] |
|
| 972 |
- mova m2, [r0+r3*1-16] |
|
| 973 |
- punpckhwd m2, [r0+r3*0-16] |
|
| 974 |
- lea r0, [r0+r3*2] |
|
| 975 |
- mova m3, [r0+r3*1-16] |
|
| 976 |
- punpckhwd m3, [r0+r3*0-16] |
|
| 977 |
+ mova m2, [r4+r3*1-16] |
|
| 978 |
+ punpckhwd m2, [r4+r3*0-16] |
|
| 979 |
+ mova m3, [r4+r1*1-16] |
|
| 980 |
+ punpckhwd m3, [r4+r3*2-16] |
|
| 977 | 981 |
punpckhdq m3, m2 |
| 978 | 982 |
punpckhqdq m3, m1 |
| 979 |
- lea r0, [r0+r3*2] |
|
| 980 |
- mova m0, [r0+r3*0-16] |
|
| 981 |
- mova m1, [r4] |
|
| 982 |
- mov r0, r4 |
|
| 983 |
- mova m4, m3 |
|
| 984 |
- mova m2, m3 |
|
| 985 |
- PALIGNR m4, m0, 14, m0 |
|
| 986 |
- PALIGNR m1, m2, 2, m2 |
|
| 987 |
- test r1, r1 ; top_left |
|
| 988 |
- jz .fix_lt_1 |
|
| 989 |
-.do_left: |
|
| 990 |
- mova m0, m4 |
|
| 991 |
- PRED4x4_LOWPASS m2, m1, m4, m3 |
|
| 992 |
- mova m4, m0 |
|
| 993 |
- mova m7, m2 |
|
| 994 |
- mova m6, m2 |
|
| 995 |
- PRED4x4_LOWPASS m1, m3, m0, m4 |
|
| 996 |
- pslldq m1, 14 |
|
| 997 |
- PALIGNR m7, m1, 14, m3 |
|
| 998 |
- mova m0, [r0-16] |
|
| 983 |
+ mova m0, [r4+r3*4-16] |
|
| 984 |
+ mova m1, [r0] |
|
| 985 |
+ PALIGNR m4, m3, m0, 14, m0 |
|
| 986 |
+ PALIGNR m1, m3, 2, m2 |
|
| 987 |
+ pslldq m0, m4, 2 |
|
| 988 |
+ pshuflw m0, m0, 11100101b |
|
| 989 |
+ PRED4x4_LOWPASS m6, m1, m4, m3 |
|
| 990 |
+ PRED4x4_LOWPASS m4, m3, m0, m4 |
|
| 999 | 991 |
mova m3, [r0] |
| 1000 |
- mova m1, [r0+16] |
|
| 1001 |
- mova m2, m3 |
|
| 1002 |
- mova m4, m3 |
|
| 1003 |
- PALIGNR m2, m0, 14, m0 |
|
| 1004 |
- PALIGNR m1, m4, 2, m4 |
|
| 1005 |
- test r1, r1 ; top_left |
|
| 1006 |
- jz .fix_lt_2 |
|
| 1007 |
- test r2, r2 ; top_right |
|
| 1008 |
- jz .fix_tr_1 |
|
| 1009 |
-.do_top: |
|
| 1010 |
- PRED4x4_LOWPASS m4, m2, m1, m3 |
|
| 1011 |
- mova m5, m4 |
|
| 1012 |
- jmp .body |
|
| 1013 |
-.fix_lt_1: |
|
| 1014 |
- mova m5, m3 |
|
| 1015 |
- pxor m5, m4 |
|
| 1016 |
- psrldq m5, 14 |
|
| 1017 |
- pslldq m5, 12 |
|
| 1018 |
- pxor m1, m5 |
|
| 1019 |
- jmp .do_left |
|
| 1020 |
-.fix_lt_2: |
|
| 1021 |
- mova m5, m3 |
|
| 1022 |
- pxor m5, m2 |
|
| 1023 |
- pslldq m5, 14 |
|
| 1024 |
- psrldq m5, 14 |
|
| 1025 |
- pxor m2, m5 |
|
| 1026 |
- test r2, r2 ; top_right |
|
| 1027 |
- jnz .do_top |
|
| 1028 |
-.fix_tr_1: |
|
| 1029 |
- mova m5, m3 |
|
| 1030 |
- pxor m5, m1 |
|
| 1031 |
- psrldq m5, 14 |
|
| 1032 |
- pslldq m5, 14 |
|
| 1033 |
- pxor m1, m5 |
|
| 1034 |
- jmp .do_top |
|
| 1035 |
-.body |
|
| 1036 |
- lea r1, [r3+r3*2] |
|
| 1037 |
- mova m1, m7 |
|
| 1038 |
- mova m7, m5 |
|
| 1039 |
- mova m5, m6 |
|
| 1040 |
- mova m2, m7 |
|
| 1041 |
- lea r2, [r0+r3*4] |
|
| 1042 |
- PALIGNR m2, m6, 2, m0 |
|
| 1043 |
- mova m3, m7 |
|
| 1044 |
- PALIGNR m3, m6, 14, m0 |
|
| 1045 |
- mova m4, m7 |
|
| 1046 |
- psrldq m4, 2 |
|
| 1047 |
- PRED4x4_LOWPASS m0, m1, m2, m5 |
|
| 1048 |
- PRED4x4_LOWPASS m1, m3, m4, m7 |
|
| 1049 |
- mova [r2+r3*4], m0 |
|
| 1050 |
- mova m2, m1 |
|
| 1051 |
- psrldq m0, 2 |
|
| 1052 |
- pslldq m2, 14 |
|
| 1053 |
- psrldq m1, 2 |
|
| 1054 |
- por m0, m2 |
|
| 1055 |
- mova [r2+r1*1], m0 |
|
| 1056 |
- mova m2, m1 |
|
| 1057 |
- psrldq m0, 2 |
|
| 1058 |
- pslldq m2, 14 |
|
| 1059 |
- psrldq m1, 2 |
|
| 1060 |
- por m0, m2 |
|
| 1061 |
- mova [r2+r3*2], m0 |
|
| 1062 |
- mova m2, m1 |
|
| 1063 |
- psrldq m0, 2 |
|
| 1064 |
- pslldq m2, 14 |
|
| 1065 |
- psrldq m1, 2 |
|
| 1066 |
- por m0, m2 |
|
| 1067 |
- mova [r2+r3*1], m0 |
|
| 1068 |
- mova m2, m1 |
|
| 1069 |
- psrldq m0, 2 |
|
| 1070 |
- pslldq m2, 14 |
|
| 1071 |
- psrldq m1, 2 |
|
| 1072 |
- por m0, m2 |
|
| 1073 |
- mova [r0+r3*4], m0 |
|
| 1074 |
- mova m2, m1 |
|
| 1075 |
- psrldq m0, 2 |
|
| 1076 |
- pslldq m2, 14 |
|
| 1077 |
- psrldq m1, 2 |
|
| 1078 |
- por m0, m2 |
|
| 1079 |
- mova [r0+r1*1], m0 |
|
| 1080 |
- mova m2, m1 |
|
| 1081 |
- psrldq m0, 2 |
|
| 1082 |
- pslldq m2, 14 |
|
| 1083 |
- psrldq m1, 2 |
|
| 1084 |
- por m0, m2 |
|
| 1085 |
- mova [r0+r3*2], m0 |
|
| 1086 |
- psrldq m0, 2 |
|
| 1087 |
- pslldq m1, 14 |
|
| 1088 |
- por m0, m1 |
|
| 1089 |
- mova [r0+r3*1], m0 |
|
| 992 |
+ shr r2d, 13 |
|
| 993 |
+ pslldq m1, m3, 2 |
|
| 994 |
+ psrldq m2, m3, 2 |
|
| 995 |
+ pinsrw m1, [r0-2], 0 |
|
| 996 |
+ pinsrw m2, [r0+r2+14], 7 |
|
| 997 |
+ PRED4x4_LOWPASS m3, m2, m1, m3 |
|
| 998 |
+ PALIGNR m2, m3, m6, 2, m0 |
|
| 999 |
+ PALIGNR m5, m3, m6, 14, m0 |
|
| 1000 |
+ psrldq m7, m3, 2 |
|
| 1001 |
+ PRED4x4_LOWPASS m6, m4, m2, m6 |
|
| 1002 |
+ PRED4x4_LOWPASS m3, m5, m7, m3 |
|
| 1003 |
+ mova [r4+r3*4], m6 |
|
| 1004 |
+ PALIGNR m3, m6, 14, m2 |
|
| 1005 |
+ pslldq m6, 2 |
|
| 1006 |
+ mova [r0+r3*1], m3 |
|
| 1007 |
+ PALIGNR m3, m6, 14, m2 |
|
| 1008 |
+ pslldq m6, 2 |
|
| 1009 |
+ mova [r0+r3*2], m3 |
|
| 1010 |
+ PALIGNR m3, m6, 14, m2 |
|
| 1011 |
+ pslldq m6, 2 |
|
| 1012 |
+ mova [r0+r1*1], m3 |
|
| 1013 |
+ PALIGNR m3, m6, 14, m2 |
|
| 1014 |
+ pslldq m6, 2 |
|
| 1015 |
+ mova [r0+r3*4], m3 |
|
| 1016 |
+ PALIGNR m3, m6, 14, m2 |
|
| 1017 |
+ pslldq m6, 2 |
|
| 1018 |
+ mova [r4+r3*1], m3 |
|
| 1019 |
+ PALIGNR m3, m6, 14, m2 |
|
| 1020 |
+ pslldq m6, 2 |
|
| 1021 |
+ mova [r4+r3*2], m3 |
|
| 1022 |
+ PALIGNR m3, m6, 14, m6 |
|
| 1023 |
+ mova [r4+r1*1], m3 |
|
| 1090 | 1024 |
RET |
| 1091 | 1025 |
%endmacro |
| 1092 | 1026 |
|
| ... | ... |
@@ -1095,114 +872,69 @@ INIT_XMM |
| 1095 | 1095 |
PRED8x8L_DOWN_RIGHT sse2 |
| 1096 | 1096 |
%define PALIGNR PALIGNR_SSSE3 |
| 1097 | 1097 |
PRED8x8L_DOWN_RIGHT ssse3 |
| 1098 |
+%ifdef HAVE_AVX |
|
| 1099 |
+INIT_AVX |
|
| 1100 |
+PRED8x8L_DOWN_RIGHT avx |
|
| 1101 |
+%endif |
|
| 1098 | 1102 |
|
| 1099 | 1103 |
;----------------------------------------------------------------------------- |
| 1100 | 1104 |
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) |
| 1101 | 1105 |
;----------------------------------------------------------------------------- |
| 1102 | 1106 |
%macro PRED8x8L_VERTICAL_RIGHT 1 |
| 1103 |
-cglobal pred8x8l_vertical_right_10_%1, 4,5,8 |
|
| 1107 |
+; As with pred8x8l_down_right, this mode is only valid when has_topleft is set, so it is not checked. |
|
| 1108 |
+cglobal pred8x8l_vertical_right_10_%1, 4,5,7 |
|
| 1104 | 1109 |
sub r0, r3 |
| 1105 |
- lea r4, [r0+r3*2] |
|
| 1110 |
+ lea r4, [r0+r3*4] |
|
| 1111 |
+ lea r1, [r3*3] |
|
| 1106 | 1112 |
mova m0, [r0+r3*1-16] |
| 1107 | 1113 |
punpckhwd m0, [r0+r3*0-16] |
| 1108 |
- mova m1, [r4+r3*1-16] |
|
| 1114 |
+ mova m1, [r0+r1*1-16] |
|
| 1109 | 1115 |
punpckhwd m1, [r0+r3*2-16] |
| 1110 |
- mov r4, r0 |
|
| 1111 | 1116 |
punpckhdq m1, m0 |
| 1112 |
- lea r0, [r0+r3*4] |
|
| 1113 |
- mova m2, [r0+r3*1-16] |
|
| 1114 |
- punpckhwd m2, [r0+r3*0-16] |
|
| 1115 |
- lea r0, [r0+r3*2] |
|
| 1116 |
- mova m3, [r0+r3*1-16] |
|
| 1117 |
- punpckhwd m3, [r0+r3*0-16] |
|
| 1117 |
+ mova m2, [r4+r3*1-16] |
|
| 1118 |
+ punpckhwd m2, [r4+r3*0-16] |
|
| 1119 |
+ mova m3, [r4+r1*1-16] |
|
| 1120 |
+ punpckhwd m3, [r4+r3*2-16] |
|
| 1118 | 1121 |
punpckhdq m3, m2 |
| 1119 | 1122 |
punpckhqdq m3, m1 |
| 1120 |
- lea r0, [r0+r3*2] |
|
| 1121 |
- mova m0, [r0+r3*0-16] |
|
| 1122 |
- mova m1, [r4] |
|
| 1123 |
- mov r0, r4 |
|
| 1124 |
- mova m4, m3 |
|
| 1125 |
- mova m2, m3 |
|
| 1126 |
- PALIGNR m4, m0, 14, m0 |
|
| 1127 |
- PALIGNR m1, m2, 2, m2 |
|
| 1128 |
- test r1, r1 |
|
| 1129 |
- jz .fix_lt_1 |
|
| 1130 |
- jmp .do_left |
|
| 1131 |
-.fix_lt_1: |
|
| 1132 |
- mova m5, m3 |
|
| 1133 |
- pxor m5, m4 |
|
| 1134 |
- psrldq m5, 14 |
|
| 1135 |
- pslldq m5, 12 |
|
| 1136 |
- pxor m1, m5 |
|
| 1137 |
- jmp .do_left |
|
| 1138 |
-.fix_lt_2: |
|
| 1139 |
- mova m5, m3 |
|
| 1140 |
- pxor m5, m2 |
|
| 1141 |
- pslldq m5, 14 |
|
| 1142 |
- psrldq m5, 14 |
|
| 1143 |
- pxor m2, m5 |
|
| 1144 |
- test r2, r2 |
|
| 1145 |
- jnz .do_top |
|
| 1146 |
-.fix_tr_1: |
|
| 1147 |
- mova m5, m3 |
|
| 1148 |
- pxor m5, m1 |
|
| 1149 |
- psrldq m5, 14 |
|
| 1150 |
- pslldq m5, 14 |
|
| 1151 |
- pxor m1, m5 |
|
| 1152 |
- jmp .do_top |
|
| 1153 |
-.do_left: |
|
| 1154 |
- mova m0, m4 |
|
| 1155 |
- PRED4x4_LOWPASS m2, m1, m4, m3 |
|
| 1156 |
- mova m7, m2 |
|
| 1157 |
- mova m0, [r0-16] |
|
| 1158 |
- mova m3, [r0] |
|
| 1159 |
- mova m1, [r0+16] |
|
| 1160 |
- mova m2, m3 |
|
| 1161 |
- mova m4, m3 |
|
| 1162 |
- PALIGNR m2, m0, 14, m0 |
|
| 1163 |
- PALIGNR m1, m4, 2, m4 |
|
| 1164 |
- test r1, r1 |
|
| 1165 |
- jz .fix_lt_2 |
|
| 1166 |
- test r2, r2 |
|
| 1167 |
- jz .fix_tr_1 |
|
| 1168 |
-.do_top |
|
| 1169 |
- PRED4x4_LOWPASS m6, m2, m1, m3 |
|
| 1170 |
- lea r1, [r3+r3*2] |
|
| 1171 |
- mova m2, m6 |
|
| 1172 |
- mova m3, m6 |
|
| 1173 |
- PALIGNR m3, m7, 14, m0 |
|
| 1174 |
- PALIGNR m6, m7, 12, m1 |
|
| 1175 |
- mova m4, m3 |
|
| 1176 |
- pavgw m3, m2 |
|
| 1177 |
- lea r2, [r0+r3*4] |
|
| 1178 |
- PRED4x4_LOWPASS m0, m6, m2, m4 |
|
| 1179 |
- mova [r0+r3*1], m3 |
|
| 1123 |
+ mova m0, [r4+r3*4-16] |
|
| 1124 |
+ mova m1, [r0] |
|
| 1125 |
+ PALIGNR m4, m3, m0, 14, m0 |
|
| 1126 |
+ PALIGNR m1, m3, 2, m2 |
|
| 1127 |
+ PRED4x4_LOWPASS m3, m1, m4, m3 |
|
| 1128 |
+ mova m2, [r0] |
|
| 1129 |
+ shr r2d, 13 |
|
| 1130 |
+ pslldq m1, m2, 2 |
|
| 1131 |
+ psrldq m5, m2, 2 |
|
| 1132 |
+ pinsrw m1, [r0-2], 0 |
|
| 1133 |
+ pinsrw m5, [r0+r2+14], 7 |
|
| 1134 |
+ PRED4x4_LOWPASS m2, m5, m1, m2 |
|
| 1135 |
+ PALIGNR m6, m2, m3, 12, m1 |
|
| 1136 |
+ PALIGNR m5, m2, m3, 14, m0 |
|
| 1137 |
+ PRED4x4_LOWPASS m0, m6, m2, m5 |
|
| 1138 |
+ pavgw m2, m5 |
|
| 1180 | 1139 |
mova [r0+r3*2], m0 |
| 1181 |
- mova m5, m0 |
|
| 1182 |
- mova m6, m3 |
|
| 1183 |
- mova m1, m7 |
|
| 1184 |
- mova m2, m1 |
|
| 1185 |
- pslldq m2, 2 |
|
| 1186 |
- mova m3, m1 |
|
| 1187 |
- pslldq m3, 4 |
|
| 1188 |
- PRED4x4_LOWPASS m0, m1, m3, m2 |
|
| 1189 |
- PALIGNR m6, m0, 14, m2 |
|
| 1190 |
- mova [r0+r1*1], m6 |
|
| 1191 |
- pslldq m0, 2 |
|
| 1192 |
- PALIGNR m5, m0, 14, m1 |
|
| 1193 |
- mova [r0+r3*4], m5 |
|
| 1194 |
- pslldq m0, 2 |
|
| 1195 |
- PALIGNR m6, m0, 14, m2 |
|
| 1196 |
- mova [r2+r3*1], m6 |
|
| 1197 |
- pslldq m0, 2 |
|
| 1198 |
- PALIGNR m5, m0, 14, m1 |
|
| 1199 |
- mova [r2+r3*2], m5 |
|
| 1200 |
- pslldq m0, 2 |
|
| 1201 |
- PALIGNR m6, m0, 14, m2 |
|
| 1202 |
- mova [r2+r1*1], m6 |
|
| 1203 |
- pslldq m0, 2 |
|
| 1204 |
- PALIGNR m5, m0, 14, m1 |
|
| 1205 |
- mova [r2+r3*4], m5 |
|
| 1140 |
+ mova [r0+r3*1], m2 |
|
| 1141 |
+ pslldq m6, m3, 4 |
|
| 1142 |
+ pslldq m1, m3, 2 |
|
| 1143 |
+ PRED4x4_LOWPASS m1, m3, m6, m1 |
|
| 1144 |
+ PALIGNR m2, m1, 14, m4 |
|
| 1145 |
+ mova [r0+r1*1], m2 |
|
| 1146 |
+ pslldq m1, 2 |
|
| 1147 |
+ PALIGNR m0, m1, 14, m3 |
|
| 1148 |
+ mova [r0+r3*4], m0 |
|
| 1149 |
+ pslldq m1, 2 |
|
| 1150 |
+ PALIGNR m2, m1, 14, m4 |
|
| 1151 |
+ mova [r4+r3*1], m2 |
|
| 1152 |
+ pslldq m1, 2 |
|
| 1153 |
+ PALIGNR m0, m1, 14, m3 |
|
| 1154 |
+ mova [r4+r3*2], m0 |
|
| 1155 |
+ pslldq m1, 2 |
|
| 1156 |
+ PALIGNR m2, m1, 14, m4 |
|
| 1157 |
+ mova [r4+r1*1], m2 |
|
| 1158 |
+ pslldq m1, 2 |
|
| 1159 |
+ PALIGNR m0, m1, 14, m1 |
|
| 1160 |
+ mova [r4+r3*4], m0 |
|
| 1206 | 1161 |
RET |
| 1207 | 1162 |
%endmacro |
| 1208 | 1163 |
|
| ... | ... |
@@ -1211,84 +943,60 @@ INIT_XMM |
| 1211 | 1211 |
PRED8x8L_VERTICAL_RIGHT sse2 |
| 1212 | 1212 |
%define PALIGNR PALIGNR_SSSE3 |
| 1213 | 1213 |
PRED8x8L_VERTICAL_RIGHT ssse3 |
| 1214 |
+%ifdef HAVE_AVX |
|
| 1215 |
+INIT_AVX |
|
| 1216 |
+PRED8x8L_VERTICAL_RIGHT avx |
|
| 1217 |
+%endif |
|
| 1214 | 1218 |
|
| 1215 | 1219 |
;----------------------------------------------------------------------------- |
| 1216 | 1220 |
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) |
| 1217 | 1221 |
;----------------------------------------------------------------------------- |
| 1218 | 1222 |
%macro PRED8x8L_HORIZONTAL_UP 1 |
| 1219 |
-cglobal pred8x8l_horizontal_up_10_%1, 4,4,8 |
|
| 1220 |
- sub r0, r3 |
|
| 1221 |
- lea r2, [r0+r3*2] |
|
| 1222 |
- mova m0, [r0+r3*1-16] |
|
| 1223 |
- test r1, r1 |
|
| 1224 |
- lea r1, [r0+r3] |
|
| 1225 |
- cmovnz r1, r0 |
|
| 1226 |
- punpckhwd m0, [r1+r3*0-16] |
|
| 1227 |
- mova m1, [r2+r3*1-16] |
|
| 1228 |
- punpckhwd m1, [r0+r3*2-16] |
|
| 1229 |
- mov r2, r0 |
|
| 1230 |
- punpckhdq m1, m0 |
|
| 1231 |
- lea r0, [r0+r3*4] |
|
| 1232 |
- mova m2, [r0+r3*1-16] |
|
| 1233 |
- punpckhwd m2, [r0+r3*0-16] |
|
| 1234 |
- lea r0, [r0+r3*2] |
|
| 1235 |
- mova m3, [r0+r3*1-16] |
|
| 1236 |
- punpckhwd m3, [r0+r3*0-16] |
|
| 1237 |
- punpckhdq m3, m2 |
|
| 1238 |
- punpckhqdq m3, m1 |
|
| 1239 |
- lea r0, [r0+r3*2] |
|
| 1223 |
+cglobal pred8x8l_horizontal_up_10_%1, 4,4,6 |
|
| 1240 | 1224 |
mova m0, [r0+r3*0-16] |
| 1241 |
- mova m1, [r1+r3*0-16] |
|
| 1242 |
- mov r0, r2 |
|
| 1243 |
- mova m4, m3 |
|
| 1244 |
- mova m2, m3 |
|
| 1245 |
- PALIGNR m4, m0, 14, m0 |
|
| 1246 |
- PALIGNR m1, m2, 2, m2 |
|
| 1247 |
- mova m0, m4 |
|
| 1248 |
- PRED4x4_LOWPASS m2, m1, m4, m3 |
|
| 1249 |
- mova m4, m0 |
|
| 1250 |
- mova m7, m2 |
|
| 1251 |
- PRED4x4_LOWPASS m1, m3, m0, m4 |
|
| 1252 |
- pslldq m1, 14 |
|
| 1253 |
- PALIGNR m7, m1, 14, m3 |
|
| 1254 |
- lea r1, [r3+r3*2] |
|
| 1255 |
- pshufd m0, m7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 |
|
| 1256 |
- pslldq m7, 14 ; l7 .. .. .. .. .. .. .. |
|
| 1257 |
- mova m2, m0 |
|
| 1258 |
- pslld m0, 16 |
|
| 1259 |
- psrld m2, 16 |
|
| 1260 |
- por m2, m0 ; l7 l6 l5 l4 l3 l2 l1 l0 |
|
| 1261 |
- mova m3, m2 |
|
| 1262 |
- mova m4, m2 |
|
| 1263 |
- mova m5, m2 |
|
| 1264 |
- psrldq m2, 2 |
|
| 1265 |
- psrldq m3, 4 |
|
| 1225 |
+ punpckhwd m0, [r0+r3*1-16] |
|
| 1226 |
+ shr r1d, 14 |
|
| 1227 |
+ dec r1 |
|
| 1228 |
+ and r1, r3 |
|
| 1229 |
+ sub r1, r3 |
|
| 1230 |
+ mova m4, [r0+r1*1-16] |
|
| 1231 |
+ lea r1, [r3*3] |
|
| 1266 | 1232 |
lea r2, [r0+r3*4] |
| 1267 |
- por m2, m7 ; l7 l7 l6 l5 l4 l3 l2 l1 |
|
| 1268 |
- punpckhwd m7, m7 |
|
| 1269 |
- por m3, m7 ; l7 l7 l7 l6 l5 l4 l3 l2 |
|
| 1270 |
- pavgw m4, m2 |
|
| 1271 |
- PRED4x4_LOWPASS m1, m3, m5, m2 |
|
| 1272 |
- mova m5, m4 |
|
| 1273 |
- punpcklwd m4, m1 ; p4 p3 p2 p1 |
|
| 1274 |
- punpckhwd m5, m1 ; p8 p7 p6 p5 |
|
| 1275 |
- mova m6, m5 |
|
| 1276 |
- mova m7, m5 |
|
| 1277 |
- mova m0, m5 |
|
| 1278 |
- PALIGNR m5, m4, 4, m1 |
|
| 1279 |
- pshufd m1, m6, 11111001b |
|
| 1280 |
- PALIGNR m6, m4, 8, m2 |
|
| 1281 |
- pshufd m2, m7, 11111110b |
|
| 1282 |
- PALIGNR m7, m4, 12, m3 |
|
| 1283 |
- pshufd m3, m0, 11111111b |
|
| 1284 |
- mova [r0+r3*1], m4 |
|
| 1285 |
- mova [r0+r3*2], m5 |
|
| 1286 |
- mova [r0+r1*1], m6 |
|
| 1287 |
- mova [r0+r3*4], m7 |
|
| 1233 |
+ mova m1, [r0+r3*2-16] |
|
| 1234 |
+ punpckhwd m1, [r0+r1*1-16] |
|
| 1235 |
+ punpckhdq m0, m1 |
|
| 1236 |
+ mova m2, [r2+r3*0-16] |
|
| 1237 |
+ punpckhwd m2, [r2+r3*1-16] |
|
| 1238 |
+ mova m3, [r2+r3*2-16] |
|
| 1239 |
+ punpckhwd m3, [r2+r1*1-16] |
|
| 1240 |
+ punpckhdq m2, m3 |
|
| 1241 |
+ punpckhqdq m0, m2 |
|
| 1242 |
+ PALIGNR m1, m0, m4, 14, m4 |
|
| 1243 |
+ psrldq m2, m0, 2 |
|
| 1244 |
+ pshufhw m2, m2, 10100100b |
|
| 1245 |
+ PRED4x4_LOWPASS m0, m1, m2, m0 |
|
| 1246 |
+ psrldq m1, m0, 2 |
|
| 1247 |
+ psrldq m2, m0, 4 |
|
| 1248 |
+ pshufhw m1, m1, 10100100b |
|
| 1249 |
+ pshufhw m2, m2, 01010100b |
|
| 1250 |
+ pavgw m4, m0, m1 |
|
| 1251 |
+ PRED4x4_LOWPASS m1, m2, m0, m1 |
|
| 1252 |
+ punpckhwd m5, m4, m1 |
|
| 1253 |
+ punpcklwd m4, m1 |
|
| 1254 |
+ mova [r2+r3*0], m5 |
|
| 1255 |
+ mova [r0+r3*0], m4 |
|
| 1256 |
+ pshufd m0, m5, 11111001b |
|
| 1257 |
+ pshufd m1, m5, 11111110b |
|
| 1258 |
+ pshufd m2, m5, 11111111b |
|
| 1288 | 1259 |
mova [r2+r3*1], m0 |
| 1289 | 1260 |
mova [r2+r3*2], m1 |
| 1290 | 1261 |
mova [r2+r1*1], m2 |
| 1291 |
- mova [r2+r3*4], m3 |
|
| 1262 |
+ PALIGNR m2, m5, m4, 4, m0 |
|
| 1263 |
+ PALIGNR m3, m5, m4, 8, m1 |
|
| 1264 |
+ PALIGNR m5, m5, m4, 12, m4 |
|
| 1265 |
+ mova [r0+r3*1], m2 |
|
| 1266 |
+ mova [r0+r3*2], m3 |
|
| 1267 |
+ mova [r0+r1*1], m5 |
|
| 1292 | 1268 |
RET |
| 1293 | 1269 |
%endmacro |
| 1294 | 1270 |
|
| ... | ... |
@@ -1297,7 +1005,10 @@ INIT_XMM |
| 1297 | 1297 |
PRED8x8L_HORIZONTAL_UP sse2 |
| 1298 | 1298 |
%define PALIGNR PALIGNR_SSSE3 |
| 1299 | 1299 |
PRED8x8L_HORIZONTAL_UP ssse3 |
| 1300 |
- |
|
| 1300 |
+%ifdef HAVE_AVX |
|
| 1301 |
+INIT_AVX |
|
| 1302 |
+PRED8x8L_HORIZONTAL_UP avx |
|
| 1303 |
+%endif |
|
| 1301 | 1304 |
|
| 1302 | 1305 |
|
| 1303 | 1306 |
;----------------------------------------------------------------------------- |
| ... | ... |
@@ -1315,7 +1026,7 @@ PRED8x8L_HORIZONTAL_UP ssse3 |
| 1315 | 1315 |
%macro PRED16x16_VERTICAL 1 |
| 1316 | 1316 |
cglobal pred16x16_vertical_10_%1, 2,3 |
| 1317 | 1317 |
sub r0, r1 |
| 1318 |
- mov r2, 8 |
|
| 1318 |
+ mov r2d, 8 |
|
| 1319 | 1319 |
mova m0, [r0+ 0] |
| 1320 | 1320 |
mova m1, [r0+mmsize] |
| 1321 | 1321 |
%if mmsize==8 |
| ... | ... |
@@ -1326,7 +1037,7 @@ cglobal pred16x16_vertical_10_%1, 2,3 |
| 1326 | 1326 |
MOV16 r0+r1*1, m0, m1, m2, m3 |
| 1327 | 1327 |
MOV16 r0+r1*2, m0, m1, m2, m3 |
| 1328 | 1328 |
lea r0, [r0+r1*2] |
| 1329 |
- dec r2 |
|
| 1329 |
+ dec r2d |
|
| 1330 | 1330 |
jg .loop |
| 1331 | 1331 |
REP_RET |
| 1332 | 1332 |
%endmacro |
| ... | ... |
@@ -1341,7 +1052,7 @@ PRED16x16_VERTICAL sse2 |
| 1341 | 1341 |
;----------------------------------------------------------------------------- |
| 1342 | 1342 |
%macro PRED16x16_HORIZONTAL 1 |
| 1343 | 1343 |
cglobal pred16x16_horizontal_10_%1, 2,3 |
| 1344 |
- mov r2, 8 |
|
| 1344 |
+ mov r2d, 8 |
|
| 1345 | 1345 |
.vloop: |
| 1346 | 1346 |
movd m0, [r0+r1*0-4] |
| 1347 | 1347 |
movd m1, [r0+r1*1-4] |
| ... | ... |
@@ -1350,7 +1061,7 @@ cglobal pred16x16_horizontal_10_%1, 2,3 |
| 1350 | 1350 |
MOV16 r0+r1*0, m0, m0, m0, m0 |
| 1351 | 1351 |
MOV16 r0+r1*1, m1, m1, m1, m1 |
| 1352 | 1352 |
lea r0, [r0+r1*2] |
| 1353 |
- dec r2 |
|
| 1353 |
+ dec r2d |
|
| 1354 | 1354 |
jg .vloop |
| 1355 | 1355 |
REP_RET |
| 1356 | 1356 |
%endmacro |
| ... | ... |
@@ -1364,8 +1075,8 @@ PRED16x16_HORIZONTAL sse2 |
| 1364 | 1364 |
; void pred16x16_dc(pixel *src, int stride) |
| 1365 | 1365 |
;----------------------------------------------------------------------------- |
| 1366 | 1366 |
%macro PRED16x16_DC 1 |
| 1367 |
-cglobal pred16x16_dc_10_%1, 2,7 |
|
| 1368 |
- mov r4, r0 |
|
| 1367 |
+cglobal pred16x16_dc_10_%1, 2,6 |
|
| 1368 |
+ mov r5, r0 |
|
| 1369 | 1369 |
sub r0, r1 |
| 1370 | 1370 |
mova m0, [r0+0] |
| 1371 | 1371 |
paddw m0, [r0+mmsize] |
| ... | ... |
@@ -1375,17 +1086,17 @@ cglobal pred16x16_dc_10_%1, 2,7 |
| 1375 | 1375 |
%endif |
| 1376 | 1376 |
HADDW m0, m2 |
| 1377 | 1377 |
|
| 1378 |
- sub r0, 2 |
|
| 1379 |
- movzx r3d, word [r0+r1*1] |
|
| 1380 |
- movzx r5d, word [r0+r1*2] |
|
| 1378 |
+ lea r0, [r0+r1-2] |
|
| 1379 |
+ movzx r3d, word [r0] |
|
| 1380 |
+ movzx r4d, word [r0+r1] |
|
| 1381 | 1381 |
%rep 7 |
| 1382 | 1382 |
lea r0, [r0+r1*2] |
| 1383 |
- movzx r2d, word [r0+r1*1] |
|
| 1383 |
+ movzx r2d, word [r0] |
|
| 1384 | 1384 |
add r3d, r2d |
| 1385 |
- movzx r2d, word [r0+r1*2] |
|
| 1386 |
- add r5d, r2d |
|
| 1385 |
+ movzx r2d, word [r0+r1] |
|
| 1386 |
+ add r4d, r2d |
|
| 1387 | 1387 |
%endrep |
| 1388 |
- lea r3d, [r3+r5+16] |
|
| 1388 |
+ lea r3d, [r3+r4+16] |
|
| 1389 | 1389 |
|
| 1390 | 1390 |
movd m1, r3d |
| 1391 | 1391 |
paddw m0, m1 |
| ... | ... |
@@ -1393,9 +1104,9 @@ cglobal pred16x16_dc_10_%1, 2,7 |
| 1393 | 1393 |
SPLATW m0, m0 |
| 1394 | 1394 |
mov r3d, 8 |
| 1395 | 1395 |
.loop: |
| 1396 |
- MOV16 r4+r1*0, m0, m0, m0, m0 |
|
| 1397 |
- MOV16 r4+r1*1, m0, m0, m0, m0 |
|
| 1398 |
- lea r4, [r4+r1*2] |
|
| 1396 |
+ MOV16 r5+r1*0, m0, m0, m0, m0 |
|
| 1397 |
+ MOV16 r5+r1*1, m0, m0, m0, m0 |
|
| 1398 |
+ lea r5, [r5+r1*2] |
|
| 1399 | 1399 |
dec r3d |
| 1400 | 1400 |
jg .loop |
| 1401 | 1401 |
REP_RET |
| ... | ... |
@@ -1442,29 +1153,29 @@ PRED16x16_TOP_DC sse2 |
| 1442 | 1442 |
; void pred16x16_left_dc(pixel *src, int stride) |
| 1443 | 1443 |
;----------------------------------------------------------------------------- |
| 1444 | 1444 |
%macro PRED16x16_LEFT_DC 1 |
| 1445 |
-cglobal pred16x16_left_dc_10_%1, 2,7 |
|
| 1446 |
- mov r4, r0 |
|
| 1445 |
+cglobal pred16x16_left_dc_10_%1, 2,6 |
|
| 1446 |
+ mov r5, r0 |
|
| 1447 | 1447 |
|
| 1448 | 1448 |
sub r0, 2 |
| 1449 |
- movzx r5d, word [r0+r1*0] |
|
| 1450 |
- movzx r6d, word [r0+r1*1] |
|
| 1449 |
+ movzx r3d, word [r0] |
|
| 1450 |
+ movzx r4d, word [r0+r1] |
|
| 1451 | 1451 |
%rep 7 |
| 1452 | 1452 |
lea r0, [r0+r1*2] |
| 1453 |
- movzx r2d, word [r0+r1*0] |
|
| 1454 |
- movzx r3d, word [r0+r1*1] |
|
| 1455 |
- add r5d, r2d |
|
| 1456 |
- add r6d, r3d |
|
| 1453 |
+ movzx r2d, word [r0] |
|
| 1454 |
+ add r3d, r2d |
|
| 1455 |
+ movzx r2d, word [r0+r1] |
|
| 1456 |
+ add r4d, r2d |
|
| 1457 | 1457 |
%endrep |
| 1458 |
- lea r2d, [r5+r6+8] |
|
| 1459 |
- shr r2d, 4 |
|
| 1458 |
+ lea r3d, [r3+r4+8] |
|
| 1459 |
+ shr r3d, 4 |
|
| 1460 | 1460 |
|
| 1461 |
- movd m0, r2d |
|
| 1461 |
+ movd m0, r3d |
|
| 1462 | 1462 |
SPLATW m0, m0 |
| 1463 | 1463 |
mov r3d, 8 |
| 1464 | 1464 |
.loop: |
| 1465 |
- MOV16 r4+r1*0, m0, m0, m0, m0 |
|
| 1466 |
- MOV16 r4+r1*1, m0, m0, m0, m0 |
|
| 1467 |
- lea r4, [r4+r1*2] |
|
| 1465 |
+ MOV16 r5+r1*0, m0, m0, m0, m0 |
|
| 1466 |
+ MOV16 r5+r1*1, m0, m0, m0, m0 |
|
| 1467 |
+ lea r5, [r5+r1*2] |
|
| 1468 | 1468 |
dec r3d |
| 1469 | 1469 |
jg .loop |
| 1470 | 1470 |
REP_RET |
| ... | ... |
@@ -45,7 +45,6 @@ void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); |
| 45 | 45 |
|
| 46 | 46 |
PRED8x8(dc, 10, mmxext) |
| 47 | 47 |
PRED8x8(dc, 10, sse2) |
| 48 |
-PRED8x8(top_dc, 10, mmxext) |
|
| 49 | 48 |
PRED8x8(top_dc, 10, sse2) |
| 50 | 49 |
PRED8x8(plane, 10, sse2) |
| 51 | 50 |
PRED8x8(vertical, 10, sse2) |
| ... | ... |
@@ -55,23 +54,28 @@ PRED8x8(horizontal, 10, sse2) |
| 55 | 55 |
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int has_topleft, int has_topright, int stride); |
| 56 | 56 |
|
| 57 | 57 |
PRED8x8L(dc, 10, sse2) |
| 58 |
-PRED8x8L(dc, 10, ssse3) |
|
| 58 |
+PRED8x8L(dc, 10, avx) |
|
| 59 | 59 |
PRED8x8L(128_dc, 10, mmxext) |
| 60 | 60 |
PRED8x8L(128_dc, 10, sse2) |
| 61 | 61 |
PRED8x8L(top_dc, 10, sse2) |
| 62 |
-PRED8x8L(top_dc, 10, ssse3) |
|
| 62 |
+PRED8x8L(top_dc, 10, avx) |
|
| 63 | 63 |
PRED8x8L(vertical, 10, sse2) |
| 64 |
-PRED8x8L(vertical, 10, ssse3) |
|
| 64 |
+PRED8x8L(vertical, 10, avx) |
|
| 65 | 65 |
PRED8x8L(horizontal, 10, sse2) |
| 66 | 66 |
PRED8x8L(horizontal, 10, ssse3) |
| 67 |
+PRED8x8L(horizontal, 10, avx) |
|
| 67 | 68 |
PRED8x8L(down_left, 10, sse2) |
| 68 | 69 |
PRED8x8L(down_left, 10, ssse3) |
| 70 |
+PRED8x8L(down_left, 10, avx) |
|
| 69 | 71 |
PRED8x8L(down_right, 10, sse2) |
| 70 | 72 |
PRED8x8L(down_right, 10, ssse3) |
| 73 |
+PRED8x8L(down_right, 10, avx) |
|
| 71 | 74 |
PRED8x8L(vertical_right, 10, sse2) |
| 72 | 75 |
PRED8x8L(vertical_right, 10, ssse3) |
| 76 |
+PRED8x8L(vertical_right, 10, avx) |
|
| 73 | 77 |
PRED8x8L(horizontal_up, 10, sse2) |
| 74 | 78 |
PRED8x8L(horizontal_up, 10, ssse3) |
| 79 |
+PRED8x8L(horizontal_up, 10, avx) |
|
| 75 | 80 |
|
| 76 | 81 |
#define PRED16x16(TYPE, DEPTH, OPT)\ |
| 77 | 82 |
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, int stride); |
| ... | ... |
@@ -298,7 +302,6 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth |
| 298 | 298 |
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; |
| 299 | 299 |
|
| 300 | 300 |
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; |
| 301 |
- h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_mmxext; |
|
| 302 | 301 |
|
| 303 | 302 |
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; |
| 304 | 303 |
|
| ... | ... |
@@ -344,18 +347,28 @@ void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth |
| 344 | 344 |
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; |
| 345 | 345 |
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; |
| 346 | 346 |
|
| 347 |
- h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_ssse3; |
|
| 348 | 347 |
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; |
| 349 |
- h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_ssse3; |
|
| 350 |
- h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_ssse3; |
|
| 351 | 348 |
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; |
| 349 |
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; |
|
| 350 |
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; |
|
| 351 |
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; |
|
| 352 | 352 |
} |
| 353 | 353 |
#if HAVE_AVX |
| 354 | 354 |
if (mm_flags & AV_CPU_FLAG_AVX) {
|
| 355 | 355 |
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; |
| 356 | 356 |
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; |
| 357 |
+ h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; |
|
| 357 | 358 |
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; |
| 358 | 359 |
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; |
| 360 |
+ |
|
| 361 |
+ h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; |
|
| 362 |
+ h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; |
|
| 363 |
+ h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; |
|
| 364 |
+ h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; |
|
| 365 |
+ h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; |
|
| 366 |
+ h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; |
|
| 367 |
+ h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; |
|
| 368 |
+ h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; |
|
| 359 | 369 |
} |
| 360 | 370 |
#endif /* HAVE_AVX */ |
| 361 | 371 |
} |
| ... | ... |
@@ -131,6 +131,7 @@ const AVCodecTag ff_codec_bmp_tags[] = {
|
| 131 | 131 |
{ CODEC_ID_MPEG2VIDEO, MKTAG('s', 'l', 'i', 'f') },
|
| 132 | 132 |
{ CODEC_ID_MPEG2VIDEO, MKTAG('E', 'M', '2', 'V') },
|
| 133 | 133 |
{ CODEC_ID_MPEG2VIDEO, MKTAG('M', '7', '0', '1') }, /* Matrox MPEG2 intra-only */
|
| 134 |
+ { CODEC_ID_MPEG2VIDEO, MKTAG('m', 'p', 'g', 'v') },
|
|
| 134 | 135 |
{ CODEC_ID_MJPEG, MKTAG('M', 'J', 'P', 'G') },
|
| 135 | 136 |
{ CODEC_ID_MJPEG, MKTAG('L', 'J', 'P', 'G') },
|
| 136 | 137 |
{ CODEC_ID_MJPEG, MKTAG('d', 'm', 'b', '1') },
|