* qatar/master:
x86: dsputil: prettyprint gcc inline asm
x86: K&R prettyprinting cosmetics for dsputil_mmx.c
x86: conditionally compile H.264 QPEL optimizations
dsputil_mmx: Surround QPEL macros by "do { } while (0);" blocks.
Ignore generated files below doc/.
dpcm: convert to bytestream2.
interplayvideo: convert to bytestream2.
movenc: Merge if statements
h264: fix memleak in error path.
pthread: Immediately release all frames in ff_thread_flush()
h264: Add check for invalid chroma_format_idc
utvideo: port header reading to bytestream2.
Conflicts:
.gitignore
configure
libavcodec/h264_ps.c
libavcodec/interplayvideo.c
libavcodec/pthread.c
libavcodec/x86/dsputil_mmx.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
... | ... |
@@ -1266,6 +1266,7 @@ CONFIG_EXTRA=" |
1266 | 1266 |
h264chroma |
1267 | 1267 |
h264dsp |
1268 | 1268 |
h264pred |
1269 |
+ h264qpel |
|
1269 | 1270 |
huffman |
1270 | 1271 |
lgplv3 |
1271 | 1272 |
lpc |
... | ... |
@@ -1424,7 +1425,7 @@ h263_vaapi_hwaccel_select="vaapi h263_decoder" |
1424 | 1424 |
h263i_decoder_select="h263_decoder" |
1425 | 1425 |
h263p_encoder_select="h263_encoder" |
1426 | 1426 |
h264_crystalhd_decoder_select="crystalhd h264_mp4toannexb_bsf h264_parser" |
1427 |
-h264_decoder_select="golomb h264chroma h264dsp h264pred" |
|
1427 |
+h264_decoder_select="golomb h264chroma h264dsp h264pred h264qpel" |
|
1428 | 1428 |
h264_dxva2_hwaccel_deps="dxva2api_h" |
1429 | 1429 |
h264_dxva2_hwaccel_select="dxva2 h264_decoder" |
1430 | 1430 |
h264_vaapi_hwaccel_select="vaapi h264_decoder" |
... | ... |
@@ -1485,8 +1486,8 @@ rv10_decoder_select="h263_decoder" |
1485 | 1485 |
rv10_encoder_select="h263_encoder" |
1486 | 1486 |
rv20_decoder_select="h263_decoder" |
1487 | 1487 |
rv20_encoder_select="h263_encoder" |
1488 |
-rv30_decoder_select="golomb h264chroma h264pred" |
|
1489 |
-rv40_decoder_select="golomb h264chroma h264pred" |
|
1488 |
+rv30_decoder_select="golomb h264chroma h264pred h264qpel" |
|
1489 |
+rv40_decoder_select="golomb h264chroma h264pred h264qpel" |
|
1490 | 1490 |
shorten_decoder_select="golomb" |
1491 | 1491 |
sipr_decoder_select="lsp" |
1492 | 1492 |
snow_decoder_select="dwt" |
... | ... |
@@ -1495,7 +1496,7 @@ sonic_decoder_select="golomb" |
1495 | 1495 |
sonic_encoder_select="golomb" |
1496 | 1496 |
sonic_ls_encoder_select="golomb" |
1497 | 1497 |
svq1_encoder_select="aandct" |
1498 |
-svq3_decoder_select="golomb h264chroma h264dsp h264pred" |
|
1498 |
+svq3_decoder_select="golomb h264chroma h264dsp h264pred h264qpel" |
|
1499 | 1499 |
svq3_decoder_suggest="zlib" |
1500 | 1500 |
theora_decoder_select="vp3_decoder" |
1501 | 1501 |
tiff_decoder_suggest="zlib" |
... | ... |
@@ -1504,7 +1505,7 @@ truehd_decoder_select="mlp_decoder" |
1504 | 1504 |
tscc_decoder_select="zlib" |
1505 | 1505 |
twinvq_decoder_select="mdct lsp sinewin" |
1506 | 1506 |
vc1_crystalhd_decoder_select="crystalhd" |
1507 |
-vc1_decoder_select="h263_decoder h264chroma" |
|
1507 |
+vc1_decoder_select="h263_decoder h264chroma h264qpel" |
|
1508 | 1508 |
vc1_dxva2_hwaccel_deps="dxva2api_h" |
1509 | 1509 |
vc1_dxva2_hwaccel_select="dxva2 vc1_decoder" |
1510 | 1510 |
vc1_vaapi_hwaccel_select="vaapi vc1_decoder" |
... | ... |
@@ -1515,7 +1516,7 @@ vorbis_encoder_select="mdct" |
1515 | 1515 |
vp6_decoder_select="huffman" |
1516 | 1516 |
vp6a_decoder_select="vp6_decoder" |
1517 | 1517 |
vp6f_decoder_select="vp6_decoder" |
1518 |
-vp8_decoder_select="h264pred" |
|
1518 |
+vp8_decoder_select="h264pred h264qpel" |
|
1519 | 1519 |
wmapro_decoder_select="mdct sinewin" |
1520 | 1520 |
wmav1_decoder_select="mdct sinewin" |
1521 | 1521 |
wmav1_encoder_select="mdct sinewin" |
... | ... |
@@ -1544,7 +1545,7 @@ vda_deps="VideoDecodeAcceleration_VDADecoder_h pthreads" |
1544 | 1544 |
vdpau_deps="vdpau_vdpau_h vdpau_vdpau_x11_h" |
1545 | 1545 |
|
1546 | 1546 |
# parsers |
1547 |
-h264_parser_select="golomb h264chroma h264dsp h264pred" |
|
1547 |
+h264_parser_select="golomb h264chroma h264dsp h264pred h264qpel" |
|
1548 | 1548 |
|
1549 | 1549 |
# external libraries |
1550 | 1550 |
libaacplus_encoder_deps="libaacplus" |
... | ... |
@@ -40,6 +40,7 @@ |
40 | 40 |
#include "libavutil/intreadwrite.h" |
41 | 41 |
#include "avcodec.h" |
42 | 42 |
#include "bytestream.h" |
43 |
+#include "mathops.h" |
|
43 | 44 |
|
44 | 45 |
typedef struct DPCMContext { |
45 | 46 |
AVFrame frame; |
... | ... |
@@ -173,20 +174,18 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx) |
173 | 173 |
static int dpcm_decode_frame(AVCodecContext *avctx, void *data, |
174 | 174 |
int *got_frame_ptr, AVPacket *avpkt) |
175 | 175 |
{ |
176 |
- const uint8_t *buf = avpkt->data; |
|
177 | 176 |
int buf_size = avpkt->size; |
178 |
- const uint8_t *buf_end = buf + buf_size; |
|
179 | 177 |
DPCMContext *s = avctx->priv_data; |
180 | 178 |
int out = 0, ret; |
181 | 179 |
int predictor[2]; |
182 | 180 |
int ch = 0; |
183 | 181 |
int stereo = s->channels - 1; |
184 |
- int16_t *output_samples; |
|
182 |
+ int16_t *output_samples, *samples_end; |
|
183 |
+ GetByteContext gb; |
|
185 | 184 |
|
186 |
- if (stereo && (buf_size & 1)) { |
|
185 |
+ if (stereo && (buf_size & 1)) |
|
187 | 186 |
buf_size--; |
188 |
- buf_end--; |
|
189 |
- } |
|
187 |
+ bytestream2_init(&gb, avpkt->data, buf_size); |
|
190 | 188 |
|
191 | 189 |
/* calculate output size */ |
192 | 190 |
switch(avctx->codec->id) { |
... | ... |
@@ -221,22 +220,23 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, |
221 | 221 |
return ret; |
222 | 222 |
} |
223 | 223 |
output_samples = (int16_t *)s->frame.data[0]; |
224 |
+ samples_end = output_samples + out; |
|
224 | 225 |
|
225 | 226 |
switch(avctx->codec->id) { |
226 | 227 |
|
227 | 228 |
case CODEC_ID_ROQ_DPCM: |
228 |
- buf += 6; |
|
229 |
+ bytestream2_skipu(&gb, 6); |
|
229 | 230 |
|
230 | 231 |
if (stereo) { |
231 |
- predictor[1] = (int16_t)(bytestream_get_byte(&buf) << 8); |
|
232 |
- predictor[0] = (int16_t)(bytestream_get_byte(&buf) << 8); |
|
232 |
+ predictor[1] = sign_extend(bytestream2_get_byteu(&gb) << 8, 16); |
|
233 |
+ predictor[0] = sign_extend(bytestream2_get_byteu(&gb) << 8, 16); |
|
233 | 234 |
} else { |
234 |
- predictor[0] = (int16_t)bytestream_get_le16(&buf); |
|
235 |
+ predictor[0] = sign_extend(bytestream2_get_le16u(&gb), 16); |
|
235 | 236 |
} |
236 | 237 |
|
237 | 238 |
/* decode the samples */ |
238 |
- while (buf < buf_end) { |
|
239 |
- predictor[ch] += s->roq_square_array[*buf++]; |
|
239 |
+ while (output_samples < samples_end) { |
|
240 |
+ predictor[ch] += s->roq_square_array[bytestream2_get_byteu(&gb)]; |
|
240 | 241 |
predictor[ch] = av_clip_int16(predictor[ch]); |
241 | 242 |
*output_samples++ = predictor[ch]; |
242 | 243 |
|
... | ... |
@@ -246,16 +246,16 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, |
246 | 246 |
break; |
247 | 247 |
|
248 | 248 |
case CODEC_ID_INTERPLAY_DPCM: |
249 |
- buf += 6; /* skip over the stream mask and stream length */ |
|
249 |
+ bytestream2_skipu(&gb, 6); /* skip over the stream mask and stream length */ |
|
250 | 250 |
|
251 | 251 |
for (ch = 0; ch < s->channels; ch++) { |
252 |
- predictor[ch] = (int16_t)bytestream_get_le16(&buf); |
|
252 |
+ predictor[ch] = sign_extend(bytestream2_get_le16u(&gb), 16); |
|
253 | 253 |
*output_samples++ = predictor[ch]; |
254 | 254 |
} |
255 | 255 |
|
256 | 256 |
ch = 0; |
257 |
- while (buf < buf_end) { |
|
258 |
- predictor[ch] += interplay_delta_table[*buf++]; |
|
257 |
+ while (output_samples < samples_end) { |
|
258 |
+ predictor[ch] += interplay_delta_table[bytestream2_get_byteu(&gb)]; |
|
259 | 259 |
predictor[ch] = av_clip_int16(predictor[ch]); |
260 | 260 |
*output_samples++ = predictor[ch]; |
261 | 261 |
|
... | ... |
@@ -269,16 +269,19 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, |
269 | 269 |
int shift[2] = { 4, 4 }; |
270 | 270 |
|
271 | 271 |
for (ch = 0; ch < s->channels; ch++) |
272 |
- predictor[ch] = (int16_t)bytestream_get_le16(&buf); |
|
272 |
+ predictor[ch] = sign_extend(bytestream2_get_le16u(&gb), 16); |
|
273 | 273 |
|
274 | 274 |
ch = 0; |
275 |
- while (buf < buf_end) { |
|
276 |
- uint8_t n = *buf++; |
|
277 |
- int16_t diff = (n & 0xFC) << 8; |
|
278 |
- if ((n & 0x03) == 3) |
|
275 |
+ while (output_samples < samples_end) { |
|
276 |
+ int diff = bytestream2_get_byteu(&gb); |
|
277 |
+ int n = diff & 3; |
|
278 |
+ |
|
279 |
+ if (n == 3) |
|
279 | 280 |
shift[ch]++; |
280 | 281 |
else |
281 |
- shift[ch] -= (2 * (n & 3)); |
|
282 |
+ shift[ch] -= (2 * n); |
|
283 |
+ diff = sign_extend((diff &~ 3) << 8, 16); |
|
284 |
+ |
|
282 | 285 |
/* saturate the shifter to a lower limit of 0 */ |
283 | 286 |
if (shift[ch] < 0) |
284 | 287 |
shift[ch] = 0; |
... | ... |
@@ -296,9 +299,10 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, |
296 | 296 |
} |
297 | 297 |
case CODEC_ID_SOL_DPCM: |
298 | 298 |
if (avctx->codec_tag != 3) { |
299 |
- uint8_t *output_samples_u8 = s->frame.data[0]; |
|
300 |
- while (buf < buf_end) { |
|
301 |
- uint8_t n = *buf++; |
|
299 |
+ uint8_t *output_samples_u8 = s->frame.data[0], |
|
300 |
+ *samples_end_u8 = output_samples_u8 + out; |
|
301 |
+ while (output_samples_u8 < samples_end_u8) { |
|
302 |
+ int n = bytestream2_get_byteu(&gb); |
|
302 | 303 |
|
303 | 304 |
s->sample[0] += s->sol_table[n >> 4]; |
304 | 305 |
s->sample[0] = av_clip_uint8(s->sample[0]); |
... | ... |
@@ -309,8 +313,8 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data, |
309 | 309 |
*output_samples_u8++ = s->sample[stereo]; |
310 | 310 |
} |
311 | 311 |
} else { |
312 |
- while (buf < buf_end) { |
|
313 |
- uint8_t n = *buf++; |
|
312 |
+ while (output_samples < samples_end) { |
|
313 |
+ int n = bytestream2_get_byteu(&gb); |
|
314 | 314 |
if (n & 0x80) s->sample[ch] -= sol_table_16[n & 0x7F]; |
315 | 315 |
else s->sample[ch] += sol_table_16[n & 0x7F]; |
316 | 316 |
s->sample[ch] = av_clip_int16(s->sample[ch]); |
... | ... |
@@ -352,9 +352,9 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ |
352 | 352 |
if (sps->chroma_format_idc > 3U) { |
353 | 353 |
av_log(h->s.avctx, AV_LOG_ERROR, "chroma_format_idc %d is illegal\n", sps->chroma_format_idc); |
354 | 354 |
goto fail; |
355 |
- } |
|
356 |
- if(sps->chroma_format_idc == 3) |
|
355 |
+ } else if(sps->chroma_format_idc == 3) { |
|
357 | 356 |
sps->residual_color_transform_flag = get_bits1(&s->gb); |
357 |
+ } |
|
358 | 358 |
sps->bit_depth_luma = get_ue_golomb(&s->gb) + 8; |
359 | 359 |
sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8; |
360 | 360 |
if (sps->bit_depth_luma > 12U || sps->bit_depth_chroma > 12U) { |
... | ... |
@@ -56,14 +56,8 @@ typedef struct IpvideoContext { |
56 | 56 |
const unsigned char *decoding_map; |
57 | 57 |
int decoding_map_size; |
58 | 58 |
|
59 |
- const unsigned char *buf; |
|
60 |
- int size; |
|
61 |
- |
|
62 | 59 |
int is_16bpp; |
63 |
- const unsigned char *stream_ptr; |
|
64 |
- const unsigned char *stream_end; |
|
65 |
- const uint8_t *mv_ptr; |
|
66 |
- const uint8_t *mv_end; |
|
60 |
+ GetByteContext stream_ptr, mv_ptr; |
|
67 | 61 |
unsigned char *pixel_ptr; |
68 | 62 |
int line_inc; |
69 | 63 |
int stride; |
... | ... |
@@ -72,13 +66,6 @@ typedef struct IpvideoContext { |
72 | 72 |
uint32_t pal[256]; |
73 | 73 |
} IpvideoContext; |
74 | 74 |
|
75 |
-#define CHECK_STREAM_PTR(stream_ptr, stream_end, n) \ |
|
76 |
- if (stream_end - stream_ptr < n) { \ |
|
77 |
- av_log(s->avctx, AV_LOG_ERROR, "stream_ptr out of bounds (%p >= %p)\n", \ |
|
78 |
- stream_ptr + n, stream_end); \ |
|
79 |
- return -1; \ |
|
80 |
- } |
|
81 |
- |
|
82 | 75 |
static int copy_from(IpvideoContext *s, AVFrame *src, int delta_x, int delta_y) |
83 | 76 |
{ |
84 | 77 |
int current_offset = s->pixel_ptr - s->current_frame.data[0]; |
... | ... |
@@ -118,11 +105,9 @@ static int ipvideo_decode_block_opcode_0x2(IpvideoContext *s) |
118 | 118 |
|
119 | 119 |
/* copy block from 2 frames ago using a motion vector; need 1 more byte */ |
120 | 120 |
if (!s->is_16bpp) { |
121 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); |
|
122 |
- B = *s->stream_ptr++; |
|
121 |
+ B = bytestream2_get_byte(&s->stream_ptr); |
|
123 | 122 |
} else { |
124 |
- CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1); |
|
125 |
- B = *s->mv_ptr++; |
|
123 |
+ B = bytestream2_get_byte(&s->mv_ptr); |
|
126 | 124 |
} |
127 | 125 |
|
128 | 126 |
if (B < 56) { |
... | ... |
@@ -146,11 +131,9 @@ static int ipvideo_decode_block_opcode_0x3(IpvideoContext *s) |
146 | 146 |
|
147 | 147 |
/* need 1 more byte for motion */ |
148 | 148 |
if (!s->is_16bpp) { |
149 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); |
|
150 |
- B = *s->stream_ptr++; |
|
149 |
+ B = bytestream2_get_byte(&s->stream_ptr); |
|
151 | 150 |
} else { |
152 |
- CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1); |
|
153 |
- B = *s->mv_ptr++; |
|
151 |
+ B = bytestream2_get_byte(&s->mv_ptr); |
|
154 | 152 |
} |
155 | 153 |
|
156 | 154 |
if (B < 56) { |
... | ... |
@@ -172,11 +155,9 @@ static int ipvideo_decode_block_opcode_0x4(IpvideoContext *s) |
172 | 172 |
|
173 | 173 |
/* copy a block from the previous frame; need 1 more byte */ |
174 | 174 |
if (!s->is_16bpp) { |
175 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); |
|
176 |
- B = *s->stream_ptr++; |
|
175 |
+ B = bytestream2_get_byte(&s->stream_ptr); |
|
177 | 176 |
} else { |
178 |
- CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1); |
|
179 |
- B = *s->mv_ptr++; |
|
177 |
+ B = bytestream2_get_byte(&s->mv_ptr); |
|
180 | 178 |
} |
181 | 179 |
|
182 | 180 |
BL = B & 0x0F; |
... | ... |
@@ -194,10 +175,8 @@ static int ipvideo_decode_block_opcode_0x5(IpvideoContext *s) |
194 | 194 |
|
195 | 195 |
/* copy a block from the previous frame using an expanded range; |
196 | 196 |
* need 2 more bytes */ |
197 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
198 |
- |
|
199 |
- x = *s->stream_ptr++; |
|
200 |
- y = *s->stream_ptr++; |
|
197 |
+ x = bytestream2_get_byte(&s->stream_ptr); |
|
198 |
+ y = bytestream2_get_byte(&s->stream_ptr); |
|
201 | 199 |
|
202 | 200 |
av_dlog(s->avctx, "motion bytes = %d, %d\n", x, y); |
203 | 201 |
return copy_from(s, &s->last_frame, x, y); |
... | ... |
@@ -219,18 +198,14 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s) |
219 | 219 |
unsigned int flags; |
220 | 220 |
|
221 | 221 |
/* 2-color encoding */ |
222 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
223 |
- |
|
224 |
- P[0] = *s->stream_ptr++; |
|
225 |
- P[1] = *s->stream_ptr++; |
|
222 |
+ P[0] = bytestream2_get_byte(&s->stream_ptr); |
|
223 |
+ P[1] = bytestream2_get_byte(&s->stream_ptr); |
|
226 | 224 |
|
227 | 225 |
if (P[0] <= P[1]) { |
228 | 226 |
|
229 | 227 |
/* need 8 more bytes from the stream */ |
230 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); |
|
231 |
- |
|
232 | 228 |
for (y = 0; y < 8; y++) { |
233 |
- flags = *s->stream_ptr++ | 0x100; |
|
229 |
+ flags = bytestream2_get_byte(&s->stream_ptr) | 0x100; |
|
234 | 230 |
for (; flags != 1; flags >>= 1) |
235 | 231 |
*s->pixel_ptr++ = P[flags & 1]; |
236 | 232 |
s->pixel_ptr += s->line_inc; |
... | ... |
@@ -239,9 +214,7 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s) |
239 | 239 |
} else { |
240 | 240 |
|
241 | 241 |
/* need 2 more bytes from the stream */ |
242 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
243 |
- |
|
244 |
- flags = bytestream_get_le16(&s->stream_ptr); |
|
242 |
+ flags = bytestream2_get_le16(&s->stream_ptr); |
|
245 | 243 |
for (y = 0; y < 8; y += 2) { |
246 | 244 |
for (x = 0; x < 8; x += 2, flags >>= 1) { |
247 | 245 |
s->pixel_ptr[x ] = |
... | ... |
@@ -260,26 +233,23 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s) |
260 | 260 |
static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) |
261 | 261 |
{ |
262 | 262 |
int x, y; |
263 |
- unsigned char P[2]; |
|
263 |
+ unsigned char P[4]; |
|
264 | 264 |
unsigned int flags = 0; |
265 | 265 |
|
266 | 266 |
/* 2-color encoding for each 4x4 quadrant, or 2-color encoding on |
267 | 267 |
* either top and bottom or left and right halves */ |
268 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
269 |
- |
|
270 |
- P[0] = *s->stream_ptr++; |
|
271 |
- P[1] = *s->stream_ptr++; |
|
268 |
+ P[0] = bytestream2_get_byte(&s->stream_ptr); |
|
269 |
+ P[1] = bytestream2_get_byte(&s->stream_ptr); |
|
272 | 270 |
|
273 | 271 |
if (P[0] <= P[1]) { |
274 |
- |
|
275 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 14); |
|
276 |
- s->stream_ptr -= 2; |
|
277 |
- |
|
278 | 272 |
for (y = 0; y < 16; y++) { |
279 | 273 |
// new values for each 4x4 block |
280 | 274 |
if (!(y & 3)) { |
281 |
- P[0] = *s->stream_ptr++; P[1] = *s->stream_ptr++; |
|
282 |
- flags = bytestream_get_le16(&s->stream_ptr); |
|
275 |
+ if (y) { |
|
276 |
+ P[0] = bytestream2_get_byte(&s->stream_ptr); |
|
277 |
+ P[1] = bytestream2_get_byte(&s->stream_ptr); |
|
278 |
+ } |
|
279 |
+ flags = bytestream2_get_le16(&s->stream_ptr); |
|
283 | 280 |
} |
284 | 281 |
|
285 | 282 |
for (x = 0; x < 4; x++, flags >>= 1) |
... | ... |
@@ -290,13 +260,11 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) |
290 | 290 |
} |
291 | 291 |
|
292 | 292 |
} else { |
293 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
294 |
+ P[2] = bytestream2_get_byte(&s->stream_ptr); |
|
295 |
+ P[3] = bytestream2_get_byte(&s->stream_ptr); |
|
293 | 296 |
|
294 |
- /* need 10 more bytes */ |
|
295 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 10); |
|
296 |
- |
|
297 |
- if (s->stream_ptr[4] <= s->stream_ptr[5]) { |
|
298 |
- |
|
299 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
297 |
+ if (P[2] <= P[3]) { |
|
300 | 298 |
|
301 | 299 |
/* vertical split; left & right halves are 2-color encoded */ |
302 | 300 |
|
... | ... |
@@ -307,8 +275,9 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) |
307 | 307 |
// switch to right half |
308 | 308 |
if (y == 7) { |
309 | 309 |
s->pixel_ptr -= 8 * s->stride - 4; |
310 |
- P[0] = *s->stream_ptr++; P[1] = *s->stream_ptr++; |
|
311 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
310 |
+ P[0] = P[2]; |
|
311 |
+ P[1] = P[3]; |
|
312 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
312 | 313 |
} |
313 | 314 |
} |
314 | 315 |
|
... | ... |
@@ -318,12 +287,12 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s) |
318 | 318 |
|
319 | 319 |
for (y = 0; y < 8; y++) { |
320 | 320 |
if (y == 4) { |
321 |
- P[0] = *s->stream_ptr++; |
|
322 |
- P[1] = *s->stream_ptr++; |
|
321 |
+ P[0] = P[2]; |
|
322 |
+ P[1] = P[3]; |
|
323 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
323 | 324 |
} |
324 |
- flags = *s->stream_ptr++ | 0x100; |
|
325 | 325 |
|
326 |
- for (; flags != 1; flags >>= 1) |
|
326 |
+ for (x = 0; x < 8; x++, flags >>= 1) |
|
327 | 327 |
*s->pixel_ptr++ = P[flags & 1]; |
328 | 328 |
s->pixel_ptr += s->line_inc; |
329 | 329 |
} |
... | ... |
@@ -340,20 +309,15 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) |
340 | 340 |
unsigned char P[4]; |
341 | 341 |
|
342 | 342 |
/* 4-color encoding */ |
343 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); |
|
344 |
- |
|
345 |
- memcpy(P, s->stream_ptr, 4); |
|
346 |
- s->stream_ptr += 4; |
|
343 |
+ bytestream2_get_buffer(&s->stream_ptr, P, 4); |
|
347 | 344 |
|
348 | 345 |
if (P[0] <= P[1]) { |
349 | 346 |
if (P[2] <= P[3]) { |
350 | 347 |
|
351 | 348 |
/* 1 of 4 colors for each pixel, need 16 more bytes */ |
352 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16); |
|
353 |
- |
|
354 | 349 |
for (y = 0; y < 8; y++) { |
355 | 350 |
/* get the next set of 8 2-bit flags */ |
356 |
- int flags = bytestream_get_le16(&s->stream_ptr); |
|
351 |
+ int flags = bytestream2_get_le16(&s->stream_ptr); |
|
357 | 352 |
for (x = 0; x < 8; x++, flags >>= 2) |
358 | 353 |
*s->pixel_ptr++ = P[flags & 0x03]; |
359 | 354 |
s->pixel_ptr += s->line_inc; |
... | ... |
@@ -363,9 +327,7 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) |
363 | 363 |
uint32_t flags; |
364 | 364 |
|
365 | 365 |
/* 1 of 4 colors for each 2x2 block, need 4 more bytes */ |
366 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); |
|
367 |
- |
|
368 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
366 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
369 | 367 |
|
370 | 368 |
for (y = 0; y < 8; y += 2) { |
371 | 369 |
for (x = 0; x < 8; x += 2, flags >>= 2) { |
... | ... |
@@ -382,9 +344,7 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) |
382 | 382 |
uint64_t flags; |
383 | 383 |
|
384 | 384 |
/* 1 of 4 colors for each 2x1 or 1x2 block, need 8 more bytes */ |
385 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); |
|
386 |
- |
|
387 |
- flags = bytestream_get_le64(&s->stream_ptr); |
|
385 |
+ flags = bytestream2_get_le64(&s->stream_ptr); |
|
388 | 386 |
if (P[2] <= P[3]) { |
389 | 387 |
for (y = 0; y < 8; y++) { |
390 | 388 |
for (x = 0; x < 8; x += 2, flags >>= 2) { |
... | ... |
@@ -411,24 +371,21 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s) |
411 | 411 |
static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s) |
412 | 412 |
{ |
413 | 413 |
int x, y; |
414 |
- unsigned char P[4]; |
|
414 |
+ unsigned char P[8]; |
|
415 | 415 |
int flags = 0; |
416 | 416 |
|
417 |
+ bytestream2_get_buffer(&s->stream_ptr, P, 4); |
|
418 |
+ |
|
417 | 419 |
/* 4-color encoding for each 4x4 quadrant, or 4-color encoding on |
418 | 420 |
* either top and bottom or left and right halves */ |
419 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24); |
|
420 |
- |
|
421 |
- if (s->stream_ptr[0] <= s->stream_ptr[1]) { |
|
421 |
+ if (P[0] <= P[1]) { |
|
422 | 422 |
|
423 | 423 |
/* 4-color encoding for each quadrant; need 32 bytes */ |
424 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 32); |
|
425 |
- |
|
426 | 424 |
for (y = 0; y < 16; y++) { |
427 | 425 |
// new values for each 4x4 block |
428 | 426 |
if (!(y & 3)) { |
429 |
- memcpy(P, s->stream_ptr, 4); |
|
430 |
- s->stream_ptr += 4; |
|
431 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
427 |
+ if (y) bytestream2_get_buffer(&s->stream_ptr, P, 4); |
|
428 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
432 | 429 |
} |
433 | 430 |
|
434 | 431 |
for (x = 0; x < 4; x++, flags >>= 2) |
... | ... |
@@ -441,20 +398,16 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s) |
441 | 441 |
|
442 | 442 |
} else { |
443 | 443 |
// vertical split? |
444 |
- int vert = s->stream_ptr[12] <= s->stream_ptr[13]; |
|
445 |
- uint64_t flags = 0; |
|
444 |
+ int vert; |
|
445 |
+ uint64_t flags = bytestream2_get_le64(&s->stream_ptr); |
|
446 |
+ |
|
447 |
+ bytestream2_get_buffer(&s->stream_ptr, P + 4, 4); |
|
448 |
+ vert = P[4] <= P[5]; |
|
446 | 449 |
|
447 | 450 |
/* 4-color encoding for either left and right or top and bottom |
448 | 451 |
* halves */ |
449 | 452 |
|
450 | 453 |
for (y = 0; y < 16; y++) { |
451 |
- // load values for each half |
|
452 |
- if (!(y & 7)) { |
|
453 |
- memcpy(P, s->stream_ptr, 4); |
|
454 |
- s->stream_ptr += 4; |
|
455 |
- flags = bytestream_get_le64(&s->stream_ptr); |
|
456 |
- } |
|
457 |
- |
|
458 | 454 |
for (x = 0; x < 4; x++, flags >>= 2) |
459 | 455 |
*s->pixel_ptr++ = P[flags & 0x03]; |
460 | 456 |
|
... | ... |
@@ -463,6 +416,12 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s) |
463 | 463 |
// switch to right half |
464 | 464 |
if (y == 7) s->pixel_ptr -= 8 * s->stride - 4; |
465 | 465 |
} else if (y & 1) s->pixel_ptr += s->line_inc; |
466 |
+ |
|
467 |
+ // load values for second half |
|
468 |
+ if (y == 7) { |
|
469 |
+ memcpy(P, P + 4, 4); |
|
470 |
+ flags = bytestream2_get_le64(&s->stream_ptr); |
|
471 |
+ } |
|
466 | 472 |
} |
467 | 473 |
} |
468 | 474 |
|
... | ... |
@@ -475,11 +434,8 @@ static int ipvideo_decode_block_opcode_0xB(IpvideoContext *s) |
475 | 475 |
int y; |
476 | 476 |
|
477 | 477 |
/* 64-color encoding (each pixel in block is a different color) */ |
478 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 64); |
|
479 |
- |
|
480 | 478 |
for (y = 0; y < 8; y++) { |
481 |
- memcpy(s->pixel_ptr, s->stream_ptr, 8); |
|
482 |
- s->stream_ptr += 8; |
|
479 |
+ bytestream2_get_buffer(&s->stream_ptr, s->pixel_ptr, 8); |
|
483 | 480 |
s->pixel_ptr += s->stride; |
484 | 481 |
} |
485 | 482 |
|
... | ... |
@@ -492,14 +448,12 @@ static int ipvideo_decode_block_opcode_0xC(IpvideoContext *s) |
492 | 492 |
int x, y; |
493 | 493 |
|
494 | 494 |
/* 16-color block encoding: each 2x2 block is a different color */ |
495 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16); |
|
496 |
- |
|
497 | 495 |
for (y = 0; y < 8; y += 2) { |
498 | 496 |
for (x = 0; x < 8; x += 2) { |
499 | 497 |
s->pixel_ptr[x ] = |
500 | 498 |
s->pixel_ptr[x + 1 ] = |
501 | 499 |
s->pixel_ptr[x + s->stride] = |
502 |
- s->pixel_ptr[x + 1 + s->stride] = *s->stream_ptr++; |
|
500 |
+ s->pixel_ptr[x + 1 + s->stride] = bytestream2_get_byte(&s->stream_ptr); |
|
503 | 501 |
} |
504 | 502 |
s->pixel_ptr += s->stride * 2; |
505 | 503 |
} |
... | ... |
@@ -514,12 +468,10 @@ static int ipvideo_decode_block_opcode_0xD(IpvideoContext *s) |
514 | 514 |
unsigned char P[2]; |
515 | 515 |
|
516 | 516 |
/* 4-color block encoding: each 4x4 block is a different color */ |
517 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); |
|
518 |
- |
|
519 | 517 |
for (y = 0; y < 8; y++) { |
520 | 518 |
if (!(y & 3)) { |
521 |
- P[0] = *s->stream_ptr++; |
|
522 |
- P[1] = *s->stream_ptr++; |
|
519 |
+ P[0] = bytestream2_get_byte(&s->stream_ptr); |
|
520 |
+ P[1] = bytestream2_get_byte(&s->stream_ptr); |
|
523 | 521 |
} |
524 | 522 |
memset(s->pixel_ptr, P[0], 4); |
525 | 523 |
memset(s->pixel_ptr + 4, P[1], 4); |
... | ... |
@@ -536,8 +488,7 @@ static int ipvideo_decode_block_opcode_0xE(IpvideoContext *s) |
536 | 536 |
unsigned char pix; |
537 | 537 |
|
538 | 538 |
/* 1-color encoding: the whole block is 1 solid color */ |
539 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1); |
|
540 |
- pix = *s->stream_ptr++; |
|
539 |
+ pix = bytestream2_get_byte(&s->stream_ptr); |
|
541 | 540 |
|
542 | 541 |
for (y = 0; y < 8; y++) { |
543 | 542 |
memset(s->pixel_ptr, pix, 8); |
... | ... |
@@ -554,9 +505,8 @@ static int ipvideo_decode_block_opcode_0xF(IpvideoContext *s) |
554 | 554 |
unsigned char sample[2]; |
555 | 555 |
|
556 | 556 |
/* dithered encoding */ |
557 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
558 |
- sample[0] = *s->stream_ptr++; |
|
559 |
- sample[1] = *s->stream_ptr++; |
|
557 |
+ sample[0] = bytestream2_get_byte(&s->stream_ptr); |
|
558 |
+ sample[1] = bytestream2_get_byte(&s->stream_ptr); |
|
560 | 559 |
|
561 | 560 |
for (y = 0; y < 8; y++) { |
562 | 561 |
for (x = 0; x < 8; x += 2) { |
... | ... |
@@ -575,10 +525,8 @@ static int ipvideo_decode_block_opcode_0x6_16(IpvideoContext *s) |
575 | 575 |
signed char x, y; |
576 | 576 |
|
577 | 577 |
/* copy a block from the second last frame using an expanded range */ |
578 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
579 |
- |
|
580 |
- x = *s->stream_ptr++; |
|
581 |
- y = *s->stream_ptr++; |
|
578 |
+ x = bytestream2_get_byte(&s->stream_ptr); |
|
579 |
+ y = bytestream2_get_byte(&s->stream_ptr); |
|
582 | 580 |
|
583 | 581 |
av_dlog(s->avctx, "motion bytes = %d, %d\n", x, y); |
584 | 582 |
return copy_from(s, &s->second_last_frame, x, y); |
... | ... |
@@ -592,17 +540,13 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s) |
592 | 592 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
593 | 593 |
|
594 | 594 |
/* 2-color encoding */ |
595 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); |
|
596 |
- |
|
597 |
- P[0] = bytestream_get_le16(&s->stream_ptr); |
|
598 |
- P[1] = bytestream_get_le16(&s->stream_ptr); |
|
595 |
+ P[0] = bytestream2_get_le16(&s->stream_ptr); |
|
596 |
+ P[1] = bytestream2_get_le16(&s->stream_ptr); |
|
599 | 597 |
|
600 | 598 |
if (!(P[0] & 0x8000)) { |
601 | 599 |
|
602 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); |
|
603 |
- |
|
604 | 600 |
for (y = 0; y < 8; y++) { |
605 |
- flags = *s->stream_ptr++ | 0x100; |
|
601 |
+ flags = bytestream2_get_byte(&s->stream_ptr) | 0x100; |
|
606 | 602 |
for (; flags != 1; flags >>= 1) |
607 | 603 |
*pixel_ptr++ = P[flags & 1]; |
608 | 604 |
pixel_ptr += s->line_inc; |
... | ... |
@@ -610,9 +554,7 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s) |
610 | 610 |
|
611 | 611 |
} else { |
612 | 612 |
|
613 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
614 |
- |
|
615 |
- flags = bytestream_get_le16(&s->stream_ptr); |
|
613 |
+ flags = bytestream2_get_le16(&s->stream_ptr); |
|
616 | 614 |
for (y = 0; y < 8; y += 2) { |
617 | 615 |
for (x = 0; x < 8; x += 2, flags >>= 1) { |
618 | 616 |
pixel_ptr[x ] = |
... | ... |
@@ -630,28 +572,25 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s) |
630 | 630 |
static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) |
631 | 631 |
{ |
632 | 632 |
int x, y; |
633 |
- uint16_t P[2]; |
|
633 |
+ uint16_t P[4]; |
|
634 | 634 |
unsigned int flags = 0; |
635 | 635 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
636 | 636 |
|
637 | 637 |
/* 2-color encoding for each 4x4 quadrant, or 2-color encoding on |
638 | 638 |
* either top and bottom or left and right halves */ |
639 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); |
|
640 |
- |
|
641 |
- P[0] = bytestream_get_le16(&s->stream_ptr); |
|
642 |
- P[1] = bytestream_get_le16(&s->stream_ptr); |
|
639 |
+ P[0] = bytestream2_get_le16(&s->stream_ptr); |
|
640 |
+ P[1] = bytestream2_get_le16(&s->stream_ptr); |
|
643 | 641 |
|
644 | 642 |
if (!(P[0] & 0x8000)) { |
645 | 643 |
|
646 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24); |
|
647 |
- s->stream_ptr -= 4; |
|
648 |
- |
|
649 | 644 |
for (y = 0; y < 16; y++) { |
650 | 645 |
// new values for each 4x4 block |
651 | 646 |
if (!(y & 3)) { |
652 |
- P[0] = bytestream_get_le16(&s->stream_ptr); |
|
653 |
- P[1] = bytestream_get_le16(&s->stream_ptr); |
|
654 |
- flags = bytestream_get_le16(&s->stream_ptr); |
|
647 |
+ if (y) { |
|
648 |
+ P[0] = bytestream2_get_le16(&s->stream_ptr); |
|
649 |
+ P[1] = bytestream2_get_le16(&s->stream_ptr); |
|
650 |
+ } |
|
651 |
+ flags = bytestream2_get_le16(&s->stream_ptr); |
|
655 | 652 |
} |
656 | 653 |
|
657 | 654 |
for (x = 0; x < 4; x++, flags >>= 1) |
... | ... |
@@ -663,11 +602,11 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) |
663 | 663 |
|
664 | 664 |
} else { |
665 | 665 |
|
666 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 12); |
|
667 |
- |
|
668 |
- if (!(AV_RL16(s->stream_ptr + 4) & 0x8000)) { |
|
666 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
667 |
+ P[2] = bytestream2_get_le16(&s->stream_ptr); |
|
668 |
+ P[3] = bytestream2_get_le16(&s->stream_ptr); |
|
669 | 669 |
|
670 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
670 |
+ if (!(P[2] & 0x8000)) { |
|
671 | 671 |
|
672 | 672 |
/* vertical split; left & right halves are 2-color encoded */ |
673 | 673 |
|
... | ... |
@@ -678,9 +617,9 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) |
678 | 678 |
// switch to right half |
679 | 679 |
if (y == 7) { |
680 | 680 |
pixel_ptr -= 8 * s->stride - 4; |
681 |
- P[0] = bytestream_get_le16(&s->stream_ptr); |
|
682 |
- P[1] = bytestream_get_le16(&s->stream_ptr); |
|
683 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
681 |
+ P[0] = P[2]; |
|
682 |
+ P[1] = P[3]; |
|
683 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
684 | 684 |
} |
685 | 685 |
} |
686 | 686 |
|
... | ... |
@@ -690,12 +629,12 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s) |
690 | 690 |
|
691 | 691 |
for (y = 0; y < 8; y++) { |
692 | 692 |
if (y == 4) { |
693 |
- P[0] = bytestream_get_le16(&s->stream_ptr); |
|
694 |
- P[1] = bytestream_get_le16(&s->stream_ptr); |
|
693 |
+ P[0] = P[2]; |
|
694 |
+ P[1] = P[3]; |
|
695 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
695 | 696 |
} |
696 |
- flags = *s->stream_ptr++ | 0x100; |
|
697 | 697 |
|
698 |
- for (; flags != 1; flags >>= 1) |
|
698 |
+ for (x = 0; x < 8; x++, flags >>= 1) |
|
699 | 699 |
*pixel_ptr++ = P[flags & 1]; |
700 | 700 |
pixel_ptr += s->line_inc; |
701 | 701 |
} |
... | ... |
@@ -713,20 +652,16 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) |
713 | 713 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
714 | 714 |
|
715 | 715 |
/* 4-color encoding */ |
716 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); |
|
717 |
- |
|
718 | 716 |
for (x = 0; x < 4; x++) |
719 |
- P[x] = bytestream_get_le16(&s->stream_ptr); |
|
717 |
+ P[x] = bytestream2_get_le16(&s->stream_ptr); |
|
720 | 718 |
|
721 | 719 |
if (!(P[0] & 0x8000)) { |
722 | 720 |
if (!(P[2] & 0x8000)) { |
723 | 721 |
|
724 | 722 |
/* 1 of 4 colors for each pixel */ |
725 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16); |
|
726 |
- |
|
727 | 723 |
for (y = 0; y < 8; y++) { |
728 | 724 |
/* get the next set of 8 2-bit flags */ |
729 |
- int flags = bytestream_get_le16(&s->stream_ptr); |
|
725 |
+ int flags = bytestream2_get_le16(&s->stream_ptr); |
|
730 | 726 |
for (x = 0; x < 8; x++, flags >>= 2) |
731 | 727 |
*pixel_ptr++ = P[flags & 0x03]; |
732 | 728 |
pixel_ptr += s->line_inc; |
... | ... |
@@ -736,9 +671,7 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) |
736 | 736 |
uint32_t flags; |
737 | 737 |
|
738 | 738 |
/* 1 of 4 colors for each 2x2 block */ |
739 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4); |
|
740 |
- |
|
741 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
739 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
742 | 740 |
|
743 | 741 |
for (y = 0; y < 8; y += 2) { |
744 | 742 |
for (x = 0; x < 8; x += 2, flags >>= 2) { |
... | ... |
@@ -755,9 +688,7 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) |
755 | 755 |
uint64_t flags; |
756 | 756 |
|
757 | 757 |
/* 1 of 4 colors for each 2x1 or 1x2 block */ |
758 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); |
|
759 |
- |
|
760 |
- flags = bytestream_get_le64(&s->stream_ptr); |
|
758 |
+ flags = bytestream2_get_le64(&s->stream_ptr); |
|
761 | 759 |
if (!(P[2] & 0x8000)) { |
762 | 760 |
for (y = 0; y < 8; y++) { |
763 | 761 |
for (x = 0; x < 8; x += 2, flags >>= 2) { |
... | ... |
@@ -784,25 +715,25 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s) |
784 | 784 |
static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s) |
785 | 785 |
{ |
786 | 786 |
int x, y; |
787 |
- uint16_t P[4]; |
|
787 |
+ uint16_t P[8]; |
|
788 | 788 |
int flags = 0; |
789 | 789 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
790 | 790 |
|
791 |
+ for (x = 0; x < 4; x++) |
|
792 |
+ P[x] = bytestream2_get_le16(&s->stream_ptr); |
|
793 |
+ |
|
791 | 794 |
/* 4-color encoding for each 4x4 quadrant, or 4-color encoding on |
792 | 795 |
* either top and bottom or left and right halves */ |
793 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24); |
|
794 |
- |
|
795 |
- if (!(AV_RL16(s->stream_ptr) & 0x8000)) { |
|
796 |
+ if (!(P[0] & 0x8000)) { |
|
796 | 797 |
|
797 | 798 |
/* 4-color encoding for each quadrant */ |
798 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 48); |
|
799 |
- |
|
800 | 799 |
for (y = 0; y < 16; y++) { |
801 | 800 |
// new values for each 4x4 block |
802 | 801 |
if (!(y & 3)) { |
803 |
- for (x = 0; x < 4; x++) |
|
804 |
- P[x] = bytestream_get_le16(&s->stream_ptr); |
|
805 |
- flags = bytestream_get_le32(&s->stream_ptr); |
|
802 |
+ if (y) |
|
803 |
+ for (x = 0; x < 4; x++) |
|
804 |
+ P[x] = bytestream2_get_le16(&s->stream_ptr); |
|
805 |
+ flags = bytestream2_get_le32(&s->stream_ptr); |
|
806 | 806 |
} |
807 | 807 |
|
808 | 808 |
for (x = 0; x < 4; x++, flags >>= 2) |
... | ... |
@@ -815,20 +746,17 @@ static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s) |
815 | 815 |
|
816 | 816 |
} else { |
817 | 817 |
// vertical split? |
818 |
- int vert = !(AV_RL16(s->stream_ptr + 16) & 0x8000); |
|
819 |
- uint64_t flags = 0; |
|
818 |
+ int vert; |
|
819 |
+ uint64_t flags = bytestream2_get_le64(&s->stream_ptr); |
|
820 |
+ |
|
821 |
+ for (x = 4; x < 8; x++) |
|
822 |
+ P[x] = bytestream2_get_le16(&s->stream_ptr); |
|
823 |
+ vert = !(P[4] & 0x8000); |
|
820 | 824 |
|
821 | 825 |
/* 4-color encoding for either left and right or top and bottom |
822 | 826 |
* halves */ |
823 | 827 |
|
824 | 828 |
for (y = 0; y < 16; y++) { |
825 |
- // load values for each half |
|
826 |
- if (!(y & 7)) { |
|
827 |
- for (x = 0; x < 4; x++) |
|
828 |
- P[x] = bytestream_get_le16(&s->stream_ptr); |
|
829 |
- flags = bytestream_get_le64(&s->stream_ptr); |
|
830 |
- } |
|
831 |
- |
|
832 | 829 |
for (x = 0; x < 4; x++, flags >>= 2) |
833 | 830 |
*pixel_ptr++ = P[flags & 0x03]; |
834 | 831 |
|
... | ... |
@@ -837,6 +765,12 @@ static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s) |
837 | 837 |
// switch to right half |
838 | 838 |
if (y == 7) pixel_ptr -= 8 * s->stride - 4; |
839 | 839 |
} else if (y & 1) pixel_ptr += s->line_inc; |
840 |
+ |
|
841 |
+ // load values for second half |
|
842 |
+ if (y == 7) { |
|
843 |
+ memcpy(P, P + 4, 8); |
|
844 |
+ flags = bytestream2_get_le64(&s->stream_ptr); |
|
845 |
+ } |
|
840 | 846 |
} |
841 | 847 |
} |
842 | 848 |
|
... | ... |
@@ -850,11 +784,9 @@ static int ipvideo_decode_block_opcode_0xB_16(IpvideoContext *s) |
850 | 850 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
851 | 851 |
|
852 | 852 |
/* 64-color encoding (each pixel in block is a different color) */ |
853 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 128); |
|
854 |
- |
|
855 | 853 |
for (y = 0; y < 8; y++) { |
856 | 854 |
for (x = 0; x < 8; x++) |
857 |
- pixel_ptr[x] = bytestream_get_le16(&s->stream_ptr); |
|
855 |
+ pixel_ptr[x] = bytestream2_get_le16(&s->stream_ptr); |
|
858 | 856 |
pixel_ptr += s->stride; |
859 | 857 |
} |
860 | 858 |
|
... | ... |
@@ -868,14 +800,12 @@ static int ipvideo_decode_block_opcode_0xC_16(IpvideoContext *s) |
868 | 868 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
869 | 869 |
|
870 | 870 |
/* 16-color block encoding: each 2x2 block is a different color */ |
871 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 32); |
|
872 |
- |
|
873 | 871 |
for (y = 0; y < 8; y += 2) { |
874 | 872 |
for (x = 0; x < 8; x += 2) { |
875 | 873 |
pixel_ptr[x ] = |
876 | 874 |
pixel_ptr[x + 1 ] = |
877 | 875 |
pixel_ptr[x + s->stride] = |
878 |
- pixel_ptr[x + 1 + s->stride] = bytestream_get_le16(&s->stream_ptr); |
|
876 |
+ pixel_ptr[x + 1 + s->stride] = bytestream2_get_le16(&s->stream_ptr); |
|
879 | 877 |
} |
880 | 878 |
pixel_ptr += s->stride * 2; |
881 | 879 |
} |
... | ... |
@@ -891,12 +821,10 @@ static int ipvideo_decode_block_opcode_0xD_16(IpvideoContext *s) |
891 | 891 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
892 | 892 |
|
893 | 893 |
/* 4-color block encoding: each 4x4 block is a different color */ |
894 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8); |
|
895 |
- |
|
896 | 894 |
for (y = 0; y < 8; y++) { |
897 | 895 |
if (!(y & 3)) { |
898 |
- P[0] = bytestream_get_le16(&s->stream_ptr); |
|
899 |
- P[1] = bytestream_get_le16(&s->stream_ptr); |
|
896 |
+ P[0] = bytestream2_get_le16(&s->stream_ptr); |
|
897 |
+ P[1] = bytestream2_get_le16(&s->stream_ptr); |
|
900 | 898 |
} |
901 | 899 |
for (x = 0; x < 8; x++) |
902 | 900 |
pixel_ptr[x] = P[x >> 2]; |
... | ... |
@@ -914,8 +842,7 @@ static int ipvideo_decode_block_opcode_0xE_16(IpvideoContext *s) |
914 | 914 |
uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr; |
915 | 915 |
|
916 | 916 |
/* 1-color encoding: the whole block is 1 solid color */ |
917 |
- CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2); |
|
918 |
- pix = bytestream_get_le16(&s->stream_ptr); |
|
917 |
+ pix = bytestream2_get_le16(&s->stream_ptr); |
|
919 | 918 |
|
920 | 919 |
for (y = 0; y < 8; y++) { |
921 | 920 |
for (x = 0; x < 8; x++) |
... | ... |
@@ -960,19 +887,16 @@ static void ipvideo_decode_opcodes(IpvideoContext *s) |
960 | 960 |
av_dlog(s->avctx, "frame %d\n", frame); |
961 | 961 |
frame++; |
962 | 962 |
|
963 |
+ bytestream2_skip(&s->stream_ptr, 14); /* data starts 14 bytes in */ |
|
963 | 964 |
if (!s->is_16bpp) { |
964 | 965 |
/* this is PAL8, so make the palette available */ |
965 | 966 |
memcpy(s->current_frame.data[1], s->pal, AVPALETTE_SIZE); |
966 | 967 |
|
967 | 968 |
s->stride = s->current_frame.linesize[0]; |
968 |
- s->stream_ptr = s->buf + 14; /* data starts 14 bytes in */ |
|
969 |
- s->stream_end = s->buf + s->size; |
|
970 | 969 |
} else { |
971 | 970 |
s->stride = s->current_frame.linesize[0] >> 1; |
972 |
- s->stream_ptr = s->buf + 16; |
|
973 |
- s->stream_end = |
|
974 |
- s->mv_ptr = s->buf + 14 + AV_RL16(s->buf+14); |
|
975 |
- s->mv_end = s->buf + s->size; |
|
971 |
+ s->mv_ptr = s->stream_ptr; |
|
972 |
+ bytestream2_skip(&s->mv_ptr, bytestream2_get_le16(&s->stream_ptr)); |
|
976 | 973 |
} |
977 | 974 |
s->line_inc = s->stride - 8; |
978 | 975 |
s->upper_motion_limit_offset = (s->avctx->height - 8) * s->current_frame.linesize[0] |
... | ... |
@@ -1002,9 +926,10 @@ static void ipvideo_decode_opcodes(IpvideoContext *s) |
1002 | 1002 |
} |
1003 | 1003 |
} |
1004 | 1004 |
} |
1005 |
- if (s->stream_end - s->stream_ptr > 1) { |
|
1006 |
- av_log(s->avctx, AV_LOG_ERROR, "decode finished with %td bytes left over\n", |
|
1007 |
- s->stream_end - s->stream_ptr); |
|
1005 |
+ if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) { |
|
1006 |
+ av_log(s->avctx, AV_LOG_ERROR, |
|
1007 |
+ "decode finished with %d bytes left over\n", |
|
1008 |
+ bytestream2_get_bytes_left(&s->stream_ptr)); |
|
1008 | 1009 |
} |
1009 | 1010 |
} |
1010 | 1011 |
|
... | ... |
@@ -1046,8 +971,8 @@ static int ipvideo_decode_frame(AVCodecContext *avctx, |
1046 | 1046 |
return buf_size; |
1047 | 1047 |
|
1048 | 1048 |
s->decoding_map = buf; |
1049 |
- s->buf = buf + s->decoding_map_size; |
|
1050 |
- s->size = buf_size - s->decoding_map_size; |
|
1049 |
+ bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size, |
|
1050 |
+ buf_size - s->decoding_map_size); |
|
1051 | 1051 |
|
1052 | 1052 |
s->current_frame.reference = 3; |
1053 | 1053 |
if (avctx->get_buffer(avctx, &s->current_frame)) { |
... | ... |
@@ -907,9 +907,13 @@ void ff_thread_flush(AVCodecContext *avctx) |
907 | 907 |
fctx->next_decoding = fctx->next_finished = 0; |
908 | 908 |
fctx->delaying = 1; |
909 | 909 |
fctx->prev_thread = NULL; |
910 |
- // Make sure decode flush calls with size=0 won't return old frames |
|
911 |
- for (i = 0; i < avctx->thread_count; i++) |
|
912 |
- fctx->threads[i].got_frame = 0; |
|
910 |
+ for (i = 0; i < avctx->thread_count; i++) { |
|
911 |
+ PerThreadContext *p = &fctx->threads[i]; |
|
912 |
+ // Make sure decode flush calls with size=0 won't return old frames |
|
913 |
+ p->got_frame = 0; |
|
914 |
+ |
|
915 |
+ release_delayed_buffers(p); |
|
916 |
+ } |
|
913 | 917 |
} |
914 | 918 |
|
915 | 919 |
static int *allocate_progress(PerThreadContext *p) |
... | ... |
@@ -358,13 +358,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac |
358 | 358 |
{ |
359 | 359 |
const uint8_t *buf = avpkt->data; |
360 | 360 |
int buf_size = avpkt->size; |
361 |
- const uint8_t *buf_end = buf + buf_size; |
|
362 | 361 |
UtvideoContext *c = avctx->priv_data; |
363 |
- const uint8_t *ptr; |
|
364 | 362 |
int i, j; |
365 | 363 |
const uint8_t *plane_start[5]; |
366 | 364 |
int plane_size, max_slice_size = 0, slice_start, slice_end, slice_size; |
367 | 365 |
int ret; |
366 |
+ GetByteContext gb; |
|
368 | 367 |
|
369 | 368 |
if (c->pic.data[0]) |
370 | 369 |
ff_thread_release_buffer(avctx, &c->pic); |
... | ... |
@@ -377,20 +376,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac |
377 | 377 |
} |
378 | 378 |
|
379 | 379 |
/* parse plane structure to retrieve frame flags and validate slice offsets */ |
380 |
- ptr = buf; |
|
380 |
+ bytestream2_init(&gb, buf, buf_size); |
|
381 | 381 |
for (i = 0; i < c->planes; i++) { |
382 |
- plane_start[i] = ptr; |
|
383 |
- if (buf_end - ptr < 256 + 4 * c->slices) { |
|
382 |
+ plane_start[i] = gb.buffer; |
|
383 |
+ if (bytestream2_get_bytes_left(&gb) < 256 + 4 * c->slices) { |
|
384 | 384 |
av_log(avctx, AV_LOG_ERROR, "Insufficient data for a plane\n"); |
385 | 385 |
return AVERROR_INVALIDDATA; |
386 | 386 |
} |
387 |
- ptr += 256; |
|
387 |
+ bytestream2_skipu(&gb, 256); |
|
388 | 388 |
slice_start = 0; |
389 | 389 |
slice_end = 0; |
390 | 390 |
for (j = 0; j < c->slices; j++) { |
391 |
- slice_end = bytestream_get_le32(&ptr); |
|
391 |
+ slice_end = bytestream2_get_le32u(&gb); |
|
392 | 392 |
slice_size = slice_end - slice_start; |
393 |
- if (slice_size < 0) { |
|
393 |
+ if (slice_end <= 0 || slice_size <= 0 || |
|
394 |
+ bytestream2_get_bytes_left(&gb) < slice_end) { |
|
394 | 395 |
av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n"); |
395 | 396 |
return AVERROR_INVALIDDATA; |
396 | 397 |
} |
... | ... |
@@ -398,18 +398,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac |
398 | 398 |
max_slice_size = FFMAX(max_slice_size, slice_size); |
399 | 399 |
} |
400 | 400 |
plane_size = slice_end; |
401 |
- if (buf_end - ptr < plane_size) { |
|
402 |
- av_log(avctx, AV_LOG_ERROR, "Plane size is bigger than available data\n"); |
|
403 |
- return AVERROR_INVALIDDATA; |
|
404 |
- } |
|
405 |
- ptr += plane_size; |
|
401 |
+ bytestream2_skipu(&gb, plane_size); |
|
406 | 402 |
} |
407 |
- plane_start[c->planes] = ptr; |
|
408 |
- if (buf_end - ptr < c->frame_info_size) { |
|
403 |
+ plane_start[c->planes] = gb.buffer; |
|
404 |
+ if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) { |
|
409 | 405 |
av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n"); |
410 | 406 |
return AVERROR_INVALIDDATA; |
411 | 407 |
} |
412 |
- c->frame_info = AV_RL32(ptr); |
|
408 |
+ c->frame_info = bytestream2_get_le32u(&gb); |
|
413 | 409 |
av_log(avctx, AV_LOG_DEBUG, "frame information flags %X\n", c->frame_info); |
414 | 410 |
|
415 | 411 |
c->frame_pred = (c->frame_info >> 8) & 3; |
... | ... |
@@ -25,6 +25,7 @@ YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ |
25 | 25 |
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ |
26 | 26 |
x86/h264_intrapred_10bit.o |
27 | 27 |
MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o |
28 |
+YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_10bit.o |
|
28 | 29 |
|
29 | 30 |
MMX-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o |
30 | 31 |
YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o |
... | ... |
@@ -71,7 +72,6 @@ MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o |
71 | 71 |
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ |
72 | 72 |
x86/deinterlace.o \ |
73 | 73 |
x86/fmtconvert.o \ |
74 |
- x86/h264_qpel_10bit.o \ |
|
75 | 74 |
$(YASM-OBJS-yes) |
76 | 75 |
|
77 | 76 |
MMX-OBJS-$(CONFIG_FFT) += x86/fft.o |
... | ... |
@@ -41,129 +41,129 @@ DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL; |
41 | 41 |
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; |
42 | 42 |
|
43 | 43 |
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = |
44 |
-{0x8000000080000000ULL, 0x8000000080000000ULL}; |
|
45 |
- |
|
46 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1 ) = {0x0001000100010001ULL, 0x0001000100010001ULL}; |
|
47 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL}; |
|
48 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; |
|
49 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; |
|
50 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; |
|
51 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL}; |
|
52 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL}; |
|
53 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL; |
|
54 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL}; |
|
55 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL}; |
|
56 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL}; |
|
57 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL; |
|
58 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL}; |
|
59 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL}; |
|
60 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL}; |
|
61 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL; |
|
62 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL; |
|
63 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL}; |
|
64 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL}; |
|
65 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL; |
|
66 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; |
|
67 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; |
|
68 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL}; |
|
69 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL}; |
|
70 |
- |
|
71 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL}; |
|
72 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL}; |
|
73 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL}; |
|
74 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL}; |
|
75 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL; |
|
76 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL; |
|
77 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL; |
|
78 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL}; |
|
79 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL; |
|
80 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL}; |
|
81 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL}; |
|
82 |
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL; |
|
83 |
-DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL}; |
|
44 |
+ { 0x8000000080000000ULL, 0x8000000080000000ULL }; |
|
45 |
+ |
|
46 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL }; |
|
47 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL }; |
|
48 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; |
|
49 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL }; |
|
50 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; |
|
51 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; |
|
52 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; |
|
53 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; |
|
54 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; |
|
55 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; |
|
56 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; |
|
57 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; |
|
58 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL }; |
|
59 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL }; |
|
60 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; |
|
61 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; |
|
62 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; |
|
63 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL }; |
|
64 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; |
|
65 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; |
|
66 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; |
|
67 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; |
|
68 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; |
|
69 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; |
|
70 |
+ |
|
71 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL }; |
|
72 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL }; |
|
73 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL }; |
|
74 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL }; |
|
75 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL; |
|
76 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL; |
|
77 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL; |
|
78 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL }; |
|
79 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL; |
|
80 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL }; |
|
81 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL }; |
|
82 |
+DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL; |
|
83 |
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL }; |
|
84 | 84 |
|
85 | 85 |
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; |
86 | 86 |
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; |
87 | 87 |
|
88 |
-#define JUMPALIGN() __asm__ volatile (".p2align 3"::) |
|
89 |
-#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::) |
|
88 |
+#define JUMPALIGN() __asm__ volatile (".p2align 3"::) |
|
89 |
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) |
|
90 | 90 |
|
91 |
-#define MOVQ_BFE(regd) \ |
|
92 |
- __asm__ volatile ( \ |
|
93 |
- "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ |
|
94 |
- "paddb %%" #regd ", %%" #regd " \n\t" ::) |
|
91 |
+#define MOVQ_BFE(regd) \ |
|
92 |
+ __asm__ volatile ( \ |
|
93 |
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \ |
|
94 |
+ "paddb %%"#regd", %%"#regd" \n\t" ::) |
|
95 | 95 |
|
96 | 96 |
#ifndef PIC |
97 |
-#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone)) |
|
98 |
-#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo)) |
|
97 |
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone)) |
|
98 |
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) |
|
99 | 99 |
#else |
100 | 100 |
// for shared library it's better to use this way for accessing constants |
101 | 101 |
// pcmpeqd -> -1 |
102 |
-#define MOVQ_BONE(regd) \ |
|
103 |
- __asm__ volatile ( \ |
|
104 |
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
|
105 |
- "psrlw $15, %%" #regd " \n\t" \ |
|
106 |
- "packuswb %%" #regd ", %%" #regd " \n\t" ::) |
|
107 |
- |
|
108 |
-#define MOVQ_WTWO(regd) \ |
|
109 |
- __asm__ volatile ( \ |
|
110 |
- "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
|
111 |
- "psrlw $15, %%" #regd " \n\t" \ |
|
112 |
- "psllw $1, %%" #regd " \n\t"::) |
|
102 |
+#define MOVQ_BONE(regd) \ |
|
103 |
+ __asm__ volatile ( \ |
|
104 |
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \ |
|
105 |
+ "psrlw $15, %%"#regd" \n\t" \ |
|
106 |
+ "packuswb %%"#regd", %%"#regd" \n\t" ::) |
|
107 |
+ |
|
108 |
+#define MOVQ_WTWO(regd) \ |
|
109 |
+ __asm__ volatile ( \ |
|
110 |
+ "pcmpeqd %%"#regd", %%"#regd" \n\t" \ |
|
111 |
+ "psrlw $15, %%"#regd" \n\t" \ |
|
112 |
+ "psllw $1, %%"#regd" \n\t"::) |
|
113 | 113 |
|
114 | 114 |
#endif |
115 | 115 |
|
116 | 116 |
// using regr as temporary and for the output result |
117 | 117 |
// first argument is unmodified and second is trashed |
118 | 118 |
// regfe is supposed to contain 0xfefefefefefefefe |
119 |
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ |
|
120 |
- "movq " #rega ", " #regr " \n\t"\ |
|
121 |
- "pand " #regb ", " #regr " \n\t"\ |
|
122 |
- "pxor " #rega ", " #regb " \n\t"\ |
|
123 |
- "pand " #regfe "," #regb " \n\t"\ |
|
124 |
- "psrlq $1, " #regb " \n\t"\ |
|
125 |
- "paddb " #regb ", " #regr " \n\t" |
|
126 |
- |
|
127 |
-#define PAVGB_MMX(rega, regb, regr, regfe) \ |
|
128 |
- "movq " #rega ", " #regr " \n\t"\ |
|
129 |
- "por " #regb ", " #regr " \n\t"\ |
|
130 |
- "pxor " #rega ", " #regb " \n\t"\ |
|
131 |
- "pand " #regfe "," #regb " \n\t"\ |
|
132 |
- "psrlq $1, " #regb " \n\t"\ |
|
133 |
- "psubb " #regb ", " #regr " \n\t" |
|
119 |
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ |
|
120 |
+ "movq "#rega", "#regr" \n\t" \ |
|
121 |
+ "pand "#regb", "#regr" \n\t" \ |
|
122 |
+ "pxor "#rega", "#regb" \n\t" \ |
|
123 |
+ "pand "#regfe", "#regb" \n\t" \ |
|
124 |
+ "psrlq $1, "#regb" \n\t" \ |
|
125 |
+ "paddb "#regb", "#regr" \n\t" |
|
126 |
+ |
|
127 |
+#define PAVGB_MMX(rega, regb, regr, regfe) \ |
|
128 |
+ "movq "#rega", "#regr" \n\t" \ |
|
129 |
+ "por "#regb", "#regr" \n\t" \ |
|
130 |
+ "pxor "#rega", "#regb" \n\t" \ |
|
131 |
+ "pand "#regfe", "#regb" \n\t" \ |
|
132 |
+ "psrlq $1, "#regb" \n\t" \ |
|
133 |
+ "psubb "#regb", "#regr" \n\t" |
|
134 | 134 |
|
135 | 135 |
// mm6 is supposed to contain 0xfefefefefefefefe |
136 |
-#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
|
137 |
- "movq " #rega ", " #regr " \n\t"\ |
|
138 |
- "movq " #regc ", " #regp " \n\t"\ |
|
139 |
- "pand " #regb ", " #regr " \n\t"\ |
|
140 |
- "pand " #regd ", " #regp " \n\t"\ |
|
141 |
- "pxor " #rega ", " #regb " \n\t"\ |
|
142 |
- "pxor " #regc ", " #regd " \n\t"\ |
|
143 |
- "pand %%mm6, " #regb " \n\t"\ |
|
144 |
- "pand %%mm6, " #regd " \n\t"\ |
|
145 |
- "psrlq $1, " #regb " \n\t"\ |
|
146 |
- "psrlq $1, " #regd " \n\t"\ |
|
147 |
- "paddb " #regb ", " #regr " \n\t"\ |
|
148 |
- "paddb " #regd ", " #regp " \n\t" |
|
149 |
- |
|
150 |
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
|
151 |
- "movq " #rega ", " #regr " \n\t"\ |
|
152 |
- "movq " #regc ", " #regp " \n\t"\ |
|
153 |
- "por " #regb ", " #regr " \n\t"\ |
|
154 |
- "por " #regd ", " #regp " \n\t"\ |
|
155 |
- "pxor " #rega ", " #regb " \n\t"\ |
|
156 |
- "pxor " #regc ", " #regd " \n\t"\ |
|
157 |
- "pand %%mm6, " #regb " \n\t"\ |
|
158 |
- "pand %%mm6, " #regd " \n\t"\ |
|
159 |
- "psrlq $1, " #regd " \n\t"\ |
|
160 |
- "psrlq $1, " #regb " \n\t"\ |
|
161 |
- "psubb " #regb ", " #regr " \n\t"\ |
|
162 |
- "psubb " #regd ", " #regp " \n\t" |
|
136 |
+#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
|
137 |
+ "movq "#rega", "#regr" \n\t" \ |
|
138 |
+ "movq "#regc", "#regp" \n\t" \ |
|
139 |
+ "pand "#regb", "#regr" \n\t" \ |
|
140 |
+ "pand "#regd", "#regp" \n\t" \ |
|
141 |
+ "pxor "#rega", "#regb" \n\t" \ |
|
142 |
+ "pxor "#regc", "#regd" \n\t" \ |
|
143 |
+ "pand %%mm6, "#regb" \n\t" \ |
|
144 |
+ "pand %%mm6, "#regd" \n\t" \ |
|
145 |
+ "psrlq $1, "#regb" \n\t" \ |
|
146 |
+ "psrlq $1, "#regd" \n\t" \ |
|
147 |
+ "paddb "#regb", "#regr" \n\t" \ |
|
148 |
+ "paddb "#regd", "#regp" \n\t" |
|
149 |
+ |
|
150 |
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
|
151 |
+ "movq "#rega", "#regr" \n\t" \ |
|
152 |
+ "movq "#regc", "#regp" \n\t" \ |
|
153 |
+ "por "#regb", "#regr" \n\t" \ |
|
154 |
+ "por "#regd", "#regp" \n\t" \ |
|
155 |
+ "pxor "#rega", "#regb" \n\t" \ |
|
156 |
+ "pxor "#regc", "#regd" \n\t" \ |
|
157 |
+ "pand %%mm6, "#regb" \n\t" \ |
|
158 |
+ "pand %%mm6, "#regd" \n\t" \ |
|
159 |
+ "psrlq $1, "#regd" \n\t" \ |
|
160 |
+ "psrlq $1, "#regb" \n\t" \ |
|
161 |
+ "psubb "#regb", "#regr" \n\t" \ |
|
162 |
+ "psubb "#regd", "#regp" \n\t" |
|
163 | 163 |
|
164 | 164 |
/***********************************/ |
165 | 165 |
/* MMX no rounding */ |
166 |
-#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx |
|
166 |
+#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx |
|
167 | 167 |
#define SET_RND MOVQ_WONE |
168 | 168 |
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) |
169 | 169 |
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) |
... | ... |
@@ -178,7 +178,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; |
178 | 178 |
/***********************************/ |
179 | 179 |
/* MMX rounding */ |
180 | 180 |
|
181 |
-#define DEF(x, y) x ## _ ## y ##_mmx |
|
181 |
+#define DEF(x, y) x ## _ ## y ## _mmx |
|
182 | 182 |
#define SET_RND MOVQ_WTWO |
183 | 183 |
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) |
184 | 184 |
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) |
... | ... |
@@ -235,537 +235,552 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; |
235 | 235 |
/***********************************/ |
236 | 236 |
/* standard MMX */ |
237 | 237 |
|
238 |
-void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
|
238 |
+void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, |
|
239 |
+ int line_size) |
|
239 | 240 |
{ |
240 | 241 |
const DCTELEM *p; |
241 | 242 |
uint8_t *pix; |
242 | 243 |
|
243 | 244 |
/* read the pixels */ |
244 |
- p = block; |
|
245 |
+ p = block; |
|
245 | 246 |
pix = pixels; |
246 | 247 |
/* unrolled loop */ |
247 |
- __asm__ volatile( |
|
248 |
- "movq %3, %%mm0 \n\t" |
|
249 |
- "movq 8%3, %%mm1 \n\t" |
|
250 |
- "movq 16%3, %%mm2 \n\t" |
|
251 |
- "movq 24%3, %%mm3 \n\t" |
|
252 |
- "movq 32%3, %%mm4 \n\t" |
|
253 |
- "movq 40%3, %%mm5 \n\t" |
|
254 |
- "movq 48%3, %%mm6 \n\t" |
|
255 |
- "movq 56%3, %%mm7 \n\t" |
|
256 |
- "packuswb %%mm1, %%mm0 \n\t" |
|
257 |
- "packuswb %%mm3, %%mm2 \n\t" |
|
258 |
- "packuswb %%mm5, %%mm4 \n\t" |
|
259 |
- "packuswb %%mm7, %%mm6 \n\t" |
|
260 |
- "movq %%mm0, (%0) \n\t" |
|
261 |
- "movq %%mm2, (%0, %1) \n\t" |
|
262 |
- "movq %%mm4, (%0, %1, 2) \n\t" |
|
263 |
- "movq %%mm6, (%0, %2) \n\t" |
|
264 |
- ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p) |
|
265 |
- :"memory"); |
|
266 |
- pix += line_size*4; |
|
267 |
- p += 32; |
|
248 |
+ __asm__ volatile ( |
|
249 |
+ "movq %3, %%mm0 \n\t" |
|
250 |
+ "movq 8%3, %%mm1 \n\t" |
|
251 |
+ "movq 16%3, %%mm2 \n\t" |
|
252 |
+ "movq 24%3, %%mm3 \n\t" |
|
253 |
+ "movq 32%3, %%mm4 \n\t" |
|
254 |
+ "movq 40%3, %%mm5 \n\t" |
|
255 |
+ "movq 48%3, %%mm6 \n\t" |
|
256 |
+ "movq 56%3, %%mm7 \n\t" |
|
257 |
+ "packuswb %%mm1, %%mm0 \n\t" |
|
258 |
+ "packuswb %%mm3, %%mm2 \n\t" |
|
259 |
+ "packuswb %%mm5, %%mm4 \n\t" |
|
260 |
+ "packuswb %%mm7, %%mm6 \n\t" |
|
261 |
+ "movq %%mm0, (%0) \n\t" |
|
262 |
+ "movq %%mm2, (%0, %1) \n\t" |
|
263 |
+ "movq %%mm4, (%0, %1, 2) \n\t" |
|
264 |
+ "movq %%mm6, (%0, %2) \n\t" |
|
265 |
+ :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), |
|
266 |
+ "m"(*p) |
|
267 |
+ : "memory"); |
|
268 |
+ pix += line_size * 4; |
|
269 |
+ p += 32; |
|
268 | 270 |
|
269 | 271 |
// if this were an exact copy of the code above, |
270 | 272 |
// the compiler would generate some very strange code, |
271 | 273 |
// thus using "r" |
272 |
- __asm__ volatile( |
|
273 |
- "movq (%3), %%mm0 \n\t" |
|
274 |
- "movq 8(%3), %%mm1 \n\t" |
|
275 |
- "movq 16(%3), %%mm2 \n\t" |
|
276 |
- "movq 24(%3), %%mm3 \n\t" |
|
277 |
- "movq 32(%3), %%mm4 \n\t" |
|
278 |
- "movq 40(%3), %%mm5 \n\t" |
|
279 |
- "movq 48(%3), %%mm6 \n\t" |
|
280 |
- "movq 56(%3), %%mm7 \n\t" |
|
281 |
- "packuswb %%mm1, %%mm0 \n\t" |
|
282 |
- "packuswb %%mm3, %%mm2 \n\t" |
|
283 |
- "packuswb %%mm5, %%mm4 \n\t" |
|
284 |
- "packuswb %%mm7, %%mm6 \n\t" |
|
285 |
- "movq %%mm0, (%0) \n\t" |
|
286 |
- "movq %%mm2, (%0, %1) \n\t" |
|
287 |
- "movq %%mm4, (%0, %1, 2) \n\t" |
|
288 |
- "movq %%mm6, (%0, %2) \n\t" |
|
289 |
- ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p) |
|
290 |
- :"memory"); |
|
291 |
-} |
|
292 |
- |
|
293 |
-#define put_signed_pixels_clamped_mmx_half(off) \ |
|
294 |
- "movq "#off"(%2), %%mm1 \n\t"\ |
|
295 |
- "movq 16+"#off"(%2), %%mm2 \n\t"\ |
|
296 |
- "movq 32+"#off"(%2), %%mm3 \n\t"\ |
|
297 |
- "movq 48+"#off"(%2), %%mm4 \n\t"\ |
|
298 |
- "packsswb 8+"#off"(%2), %%mm1 \n\t"\ |
|
299 |
- "packsswb 24+"#off"(%2), %%mm2 \n\t"\ |
|
300 |
- "packsswb 40+"#off"(%2), %%mm3 \n\t"\ |
|
301 |
- "packsswb 56+"#off"(%2), %%mm4 \n\t"\ |
|
302 |
- "paddb %%mm0, %%mm1 \n\t"\ |
|
303 |
- "paddb %%mm0, %%mm2 \n\t"\ |
|
304 |
- "paddb %%mm0, %%mm3 \n\t"\ |
|
305 |
- "paddb %%mm0, %%mm4 \n\t"\ |
|
306 |
- "movq %%mm1, (%0) \n\t"\ |
|
307 |
- "movq %%mm2, (%0, %3) \n\t"\ |
|
308 |
- "movq %%mm3, (%0, %3, 2) \n\t"\ |
|
309 |
- "movq %%mm4, (%0, %1) \n\t" |
|
310 |
- |
|
311 |
-void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
|
274 |
+ __asm__ volatile ( |
|
275 |
+ "movq (%3), %%mm0 \n\t" |
|
276 |
+ "movq 8(%3), %%mm1 \n\t" |
|
277 |
+ "movq 16(%3), %%mm2 \n\t" |
|
278 |
+ "movq 24(%3), %%mm3 \n\t" |
|
279 |
+ "movq 32(%3), %%mm4 \n\t" |
|
280 |
+ "movq 40(%3), %%mm5 \n\t" |
|
281 |
+ "movq 48(%3), %%mm6 \n\t" |
|
282 |
+ "movq 56(%3), %%mm7 \n\t" |
|
283 |
+ "packuswb %%mm1, %%mm0 \n\t" |
|
284 |
+ "packuswb %%mm3, %%mm2 \n\t" |
|
285 |
+ "packuswb %%mm5, %%mm4 \n\t" |
|
286 |
+ "packuswb %%mm7, %%mm6 \n\t" |
|
287 |
+ "movq %%mm0, (%0) \n\t" |
|
288 |
+ "movq %%mm2, (%0, %1) \n\t" |
|
289 |
+ "movq %%mm4, (%0, %1, 2) \n\t" |
|
290 |
+ "movq %%mm6, (%0, %2) \n\t" |
|
291 |
+ :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p) |
|
292 |
+ : "memory"); |
|
293 |
+} |
|
294 |
+ |
|
295 |
+#define put_signed_pixels_clamped_mmx_half(off) \ |
|
296 |
+ "movq "#off"(%2), %%mm1 \n\t" \ |
|
297 |
+ "movq 16 + "#off"(%2), %%mm2 \n\t" \ |
|
298 |
+ "movq 32 + "#off"(%2), %%mm3 \n\t" \ |
|
299 |
+ "movq 48 + "#off"(%2), %%mm4 \n\t" \ |
|
300 |
+ "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ |
|
301 |
+ "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ |
|
302 |
+ "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ |
|
303 |
+ "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ |
|
304 |
+ "paddb %%mm0, %%mm1 \n\t" \ |
|
305 |
+ "paddb %%mm0, %%mm2 \n\t" \ |
|
306 |
+ "paddb %%mm0, %%mm3 \n\t" \ |
|
307 |
+ "paddb %%mm0, %%mm4 \n\t" \ |
|
308 |
+ "movq %%mm1, (%0) \n\t" \ |
|
309 |
+ "movq %%mm2, (%0, %3) \n\t" \ |
|
310 |
+ "movq %%mm3, (%0, %3, 2) \n\t" \ |
|
311 |
+ "movq %%mm4, (%0, %1) \n\t" |
|
312 |
+ |
|
313 |
+void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, |
|
314 |
+ int line_size) |
|
312 | 315 |
{ |
313 | 316 |
x86_reg line_skip = line_size; |
314 | 317 |
x86_reg line_skip3; |
315 | 318 |
|
316 | 319 |
__asm__ volatile ( |
317 |
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" |
|
318 |
- "lea (%3, %3, 2), %1 \n\t" |
|
319 |
- put_signed_pixels_clamped_mmx_half(0) |
|
320 |
- "lea (%0, %3, 4), %0 \n\t" |
|
321 |
- put_signed_pixels_clamped_mmx_half(64) |
|
322 |
- :"+&r" (pixels), "=&r" (line_skip3) |
|
323 |
- :"r" (block), "r"(line_skip) |
|
324 |
- :"memory"); |
|
325 |
-} |
|
326 |
- |
|
327 |
-void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) |
|
320 |
+ "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" |
|
321 |
+ "lea (%3, %3, 2), %1 \n\t" |
|
322 |
+ put_signed_pixels_clamped_mmx_half(0) |
|
323 |
+ "lea (%0, %3, 4), %0 \n\t" |
|
324 |
+ put_signed_pixels_clamped_mmx_half(64) |
|
325 |
+ : "+&r"(pixels), "=&r"(line_skip3) |
|
326 |
+ : "r"(block), "r"(line_skip) |
|
327 |
+ : "memory"); |
|
328 |
+} |
|
329 |
+ |
|
330 |
+void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, |
|
331 |
+ int line_size) |
|
328 | 332 |
{ |
329 | 333 |
const DCTELEM *p; |
330 | 334 |
uint8_t *pix; |
331 | 335 |
int i; |
332 | 336 |
|
333 | 337 |
/* read the pixels */ |
334 |
- p = block; |
|
338 |
+ p = block; |
|
335 | 339 |
pix = pixels; |
336 | 340 |
MOVQ_ZERO(mm7); |
337 | 341 |
i = 4; |
338 | 342 |
do { |
339 |
- __asm__ volatile( |
|
340 |
- "movq (%2), %%mm0 \n\t" |
|
341 |
- "movq 8(%2), %%mm1 \n\t" |
|
342 |
- "movq 16(%2), %%mm2 \n\t" |
|
343 |
- "movq 24(%2), %%mm3 \n\t" |
|
344 |
- "movq %0, %%mm4 \n\t" |
|
345 |
- "movq %1, %%mm6 \n\t" |
|
346 |
- "movq %%mm4, %%mm5 \n\t" |
|
347 |
- "punpcklbw %%mm7, %%mm4 \n\t" |
|
348 |
- "punpckhbw %%mm7, %%mm5 \n\t" |
|
349 |
- "paddsw %%mm4, %%mm0 \n\t" |
|
350 |
- "paddsw %%mm5, %%mm1 \n\t" |
|
351 |
- "movq %%mm6, %%mm5 \n\t" |
|
352 |
- "punpcklbw %%mm7, %%mm6 \n\t" |
|
353 |
- "punpckhbw %%mm7, %%mm5 \n\t" |
|
354 |
- "paddsw %%mm6, %%mm2 \n\t" |
|
355 |
- "paddsw %%mm5, %%mm3 \n\t" |
|
356 |
- "packuswb %%mm1, %%mm0 \n\t" |
|
357 |
- "packuswb %%mm3, %%mm2 \n\t" |
|
358 |
- "movq %%mm0, %0 \n\t" |
|
359 |
- "movq %%mm2, %1 \n\t" |
|
360 |
- :"+m"(*pix), "+m"(*(pix+line_size)) |
|
361 |
- :"r"(p) |
|
362 |
- :"memory"); |
|
363 |
- pix += line_size*2; |
|
364 |
- p += 16; |
|
343 |
+ __asm__ volatile ( |
|
344 |
+ "movq (%2), %%mm0 \n\t" |
|
345 |
+ "movq 8(%2), %%mm1 \n\t" |
|
346 |
+ "movq 16(%2), %%mm2 \n\t" |
|
347 |
+ "movq 24(%2), %%mm3 \n\t" |
|
348 |
+ "movq %0, %%mm4 \n\t" |
|
349 |
+ "movq %1, %%mm6 \n\t" |
|
350 |
+ "movq %%mm4, %%mm5 \n\t" |
|
351 |
+ "punpcklbw %%mm7, %%mm4 \n\t" |
|
352 |
+ "punpckhbw %%mm7, %%mm5 \n\t" |
|
353 |
+ "paddsw %%mm4, %%mm0 \n\t" |
|
354 |
+ "paddsw %%mm5, %%mm1 \n\t" |
|
355 |
+ "movq %%mm6, %%mm5 \n\t" |
|
356 |
+ "punpcklbw %%mm7, %%mm6 \n\t" |
|
357 |
+ "punpckhbw %%mm7, %%mm5 \n\t" |
|
358 |
+ "paddsw %%mm6, %%mm2 \n\t" |
|
359 |
+ "paddsw %%mm5, %%mm3 \n\t" |
|
360 |
+ "packuswb %%mm1, %%mm0 \n\t" |
|
361 |
+ "packuswb %%mm3, %%mm2 \n\t" |
|
362 |
+ "movq %%mm0, %0 \n\t" |
|
363 |
+ "movq %%mm2, %1 \n\t" |
|
364 |
+ : "+m"(*pix), "+m"(*(pix + line_size)) |
|
365 |
+ : "r"(p) |
|
366 |
+ : "memory"); |
|
367 |
+ pix += line_size * 2; |
|
368 |
+ p += 16; |
|
365 | 369 |
} while (--i); |
366 | 370 |
} |
367 | 371 |
|
368 |
-static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
372 |
+static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, |
|
373 |
+ int line_size, int h) |
|
369 | 374 |
{ |
370 |
- __asm__ volatile( |
|
371 |
- "lea (%3, %3), %%"REG_a" \n\t" |
|
372 |
- ".p2align 3 \n\t" |
|
373 |
- "1: \n\t" |
|
374 |
- "movd (%1), %%mm0 \n\t" |
|
375 |
- "movd (%1, %3), %%mm1 \n\t" |
|
376 |
- "movd %%mm0, (%2) \n\t" |
|
377 |
- "movd %%mm1, (%2, %3) \n\t" |
|
378 |
- "add %%"REG_a", %1 \n\t" |
|
379 |
- "add %%"REG_a", %2 \n\t" |
|
380 |
- "movd (%1), %%mm0 \n\t" |
|
381 |
- "movd (%1, %3), %%mm1 \n\t" |
|
382 |
- "movd %%mm0, (%2) \n\t" |
|
383 |
- "movd %%mm1, (%2, %3) \n\t" |
|
384 |
- "add %%"REG_a", %1 \n\t" |
|
385 |
- "add %%"REG_a", %2 \n\t" |
|
386 |
- "subl $4, %0 \n\t" |
|
387 |
- "jnz 1b \n\t" |
|
388 |
- : "+g"(h), "+r" (pixels), "+r" (block) |
|
389 |
- : "r"((x86_reg)line_size) |
|
390 |
- : "%"REG_a, "memory" |
|
375 |
+ __asm__ volatile ( |
|
376 |
+ "lea (%3, %3), %%"REG_a" \n\t" |
|
377 |
+ ".p2align 3 \n\t" |
|
378 |
+ "1: \n\t" |
|
379 |
+ "movd (%1 ), %%mm0 \n\t" |
|
380 |
+ "movd (%1, %3), %%mm1 \n\t" |
|
381 |
+ "movd %%mm0, (%2) \n\t" |
|
382 |
+ "movd %%mm1, (%2, %3) \n\t" |
|
383 |
+ "add %%"REG_a", %1 \n\t" |
|
384 |
+ "add %%"REG_a", %2 \n\t" |
|
385 |
+ "movd (%1 ), %%mm0 \n\t" |
|
386 |
+ "movd (%1, %3), %%mm1 \n\t" |
|
387 |
+ "movd %%mm0, (%2) \n\t" |
|
388 |
+ "movd %%mm1, (%2, %3) \n\t" |
|
389 |
+ "add %%"REG_a", %1 \n\t" |
|
390 |
+ "add %%"REG_a", %2 \n\t" |
|
391 |
+ "subl $4, %0 \n\t" |
|
392 |
+ "jnz 1b \n\t" |
|
393 |
+ : "+g"(h), "+r"(pixels), "+r"(block) |
|
394 |
+ : "r"((x86_reg)line_size) |
|
395 |
+ : "%"REG_a, "memory" |
|
391 | 396 |
); |
392 | 397 |
} |
393 | 398 |
|
394 |
-static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
399 |
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, |
|
400 |
+ int line_size, int h) |
|
395 | 401 |
{ |
396 |
- __asm__ volatile( |
|
397 |
- "lea (%3, %3), %%"REG_a" \n\t" |
|
398 |
- ".p2align 3 \n\t" |
|
399 |
- "1: \n\t" |
|
400 |
- "movq (%1), %%mm0 \n\t" |
|
401 |
- "movq (%1, %3), %%mm1 \n\t" |
|
402 |
- "movq %%mm0, (%2) \n\t" |
|
403 |
- "movq %%mm1, (%2, %3) \n\t" |
|
404 |
- "add %%"REG_a", %1 \n\t" |
|
405 |
- "add %%"REG_a", %2 \n\t" |
|
406 |
- "movq (%1), %%mm0 \n\t" |
|
407 |
- "movq (%1, %3), %%mm1 \n\t" |
|
408 |
- "movq %%mm0, (%2) \n\t" |
|
409 |
- "movq %%mm1, (%2, %3) \n\t" |
|
410 |
- "add %%"REG_a", %1 \n\t" |
|
411 |
- "add %%"REG_a", %2 \n\t" |
|
412 |
- "subl $4, %0 \n\t" |
|
413 |
- "jnz 1b \n\t" |
|
414 |
- : "+g"(h), "+r" (pixels), "+r" (block) |
|
415 |
- : "r"((x86_reg)line_size) |
|
416 |
- : "%"REG_a, "memory" |
|
402 |
+ __asm__ volatile ( |
|
403 |
+ "lea (%3, %3), %%"REG_a" \n\t" |
|
404 |
+ ".p2align 3 \n\t" |
|
405 |
+ "1: \n\t" |
|
406 |
+ "movq (%1 ), %%mm0 \n\t" |
|
407 |
+ "movq (%1, %3), %%mm1 \n\t" |
|
408 |
+ "movq %%mm0, (%2) \n\t" |
|
409 |
+ "movq %%mm1, (%2, %3) \n\t" |
|
410 |
+ "add %%"REG_a", %1 \n\t" |
|
411 |
+ "add %%"REG_a", %2 \n\t" |
|
412 |
+ "movq (%1 ), %%mm0 \n\t" |
|
413 |
+ "movq (%1, %3), %%mm1 \n\t" |
|
414 |
+ "movq %%mm0, (%2) \n\t" |
|
415 |
+ "movq %%mm1, (%2, %3) \n\t" |
|
416 |
+ "add %%"REG_a", %1 \n\t" |
|
417 |
+ "add %%"REG_a", %2 \n\t" |
|
418 |
+ "subl $4, %0 \n\t" |
|
419 |
+ "jnz 1b \n\t" |
|
420 |
+ : "+g"(h), "+r"(pixels), "+r"(block) |
|
421 |
+ : "r"((x86_reg)line_size) |
|
422 |
+ : "%"REG_a, "memory" |
|
417 | 423 |
); |
418 | 424 |
} |
419 | 425 |
|
420 |
-static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
426 |
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, |
|
427 |
+ int line_size, int h) |
|
421 | 428 |
{ |
422 |
- __asm__ volatile( |
|
423 |
- "lea (%3, %3), %%"REG_a" \n\t" |
|
424 |
- ".p2align 3 \n\t" |
|
425 |
- "1: \n\t" |
|
426 |
- "movq (%1), %%mm0 \n\t" |
|
427 |
- "movq 8(%1), %%mm4 \n\t" |
|
428 |
- "movq (%1, %3), %%mm1 \n\t" |
|
429 |
- "movq 8(%1, %3), %%mm5 \n\t" |
|
430 |
- "movq %%mm0, (%2) \n\t" |
|
431 |
- "movq %%mm4, 8(%2) \n\t" |
|
432 |
- "movq %%mm1, (%2, %3) \n\t" |
|
433 |
- "movq %%mm5, 8(%2, %3) \n\t" |
|
434 |
- "add %%"REG_a", %1 \n\t" |
|
435 |
- "add %%"REG_a", %2 \n\t" |
|
436 |
- "movq (%1), %%mm0 \n\t" |
|
437 |
- "movq 8(%1), %%mm4 \n\t" |
|
438 |
- "movq (%1, %3), %%mm1 \n\t" |
|
439 |
- "movq 8(%1, %3), %%mm5 \n\t" |
|
440 |
- "movq %%mm0, (%2) \n\t" |
|
441 |
- "movq %%mm4, 8(%2) \n\t" |
|
442 |
- "movq %%mm1, (%2, %3) \n\t" |
|
443 |
- "movq %%mm5, 8(%2, %3) \n\t" |
|
444 |
- "add %%"REG_a", %1 \n\t" |
|
445 |
- "add %%"REG_a", %2 \n\t" |
|
446 |
- "subl $4, %0 \n\t" |
|
447 |
- "jnz 1b \n\t" |
|
448 |
- : "+g"(h), "+r" (pixels), "+r" (block) |
|
449 |
- : "r"((x86_reg)line_size) |
|
450 |
- : "%"REG_a, "memory" |
|
429 |
+ __asm__ volatile ( |
|
430 |
+ "lea (%3, %3), %%"REG_a" \n\t" |
|
431 |
+ ".p2align 3 \n\t" |
|
432 |
+ "1: \n\t" |
|
433 |
+ "movq (%1 ), %%mm0 \n\t" |
|
434 |
+ "movq 8(%1 ), %%mm4 \n\t" |
|
435 |
+ "movq (%1, %3), %%mm1 \n\t" |
|
436 |
+ "movq 8(%1, %3), %%mm5 \n\t" |
|
437 |
+ "movq %%mm0, (%2) \n\t" |
|
438 |
+ "movq %%mm4, 8(%2) \n\t" |
|
439 |
+ "movq %%mm1, (%2, %3) \n\t" |
|
440 |
+ "movq %%mm5, 8(%2, %3) \n\t" |
|
441 |
+ "add %%"REG_a", %1 \n\t" |
|
442 |
+ "add %%"REG_a", %2 \n\t" |
|
443 |
+ "movq (%1 ), %%mm0 \n\t" |
|
444 |
+ "movq 8(%1 ), %%mm4 \n\t" |
|
445 |
+ "movq (%1, %3), %%mm1 \n\t" |
|
446 |
+ "movq 8(%1, %3), %%mm5 \n\t" |
|
447 |
+ "movq %%mm0, (%2) \n\t" |
|
448 |
+ "movq %%mm4, 8(%2) \n\t" |
|
449 |
+ "movq %%mm1, (%2, %3) \n\t" |
|
450 |
+ "movq %%mm5, 8(%2, %3) \n\t" |
|
451 |
+ "add %%"REG_a", %1 \n\t" |
|
452 |
+ "add %%"REG_a", %2 \n\t" |
|
453 |
+ "subl $4, %0 \n\t" |
|
454 |
+ "jnz 1b \n\t" |
|
455 |
+ : "+g"(h), "+r"(pixels), "+r"(block) |
|
456 |
+ : "r"((x86_reg)line_size) |
|
457 |
+ : "%"REG_a, "memory" |
|
451 | 458 |
); |
452 | 459 |
} |
453 | 460 |
|
454 |
-static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
461 |
+static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, |
|
462 |
+ int line_size, int h) |
|
455 | 463 |
{ |
456 |
- __asm__ volatile( |
|
457 |
- "1: \n\t" |
|
458 |
- "movdqu (%1), %%xmm0 \n\t" |
|
459 |
- "movdqu (%1,%3), %%xmm1 \n\t" |
|
460 |
- "movdqu (%1,%3,2), %%xmm2 \n\t" |
|
461 |
- "movdqu (%1,%4), %%xmm3 \n\t" |
|
462 |
- "lea (%1,%3,4), %1 \n\t" |
|
463 |
- "movdqa %%xmm0, (%2) \n\t" |
|
464 |
- "movdqa %%xmm1, (%2,%3) \n\t" |
|
465 |
- "movdqa %%xmm2, (%2,%3,2) \n\t" |
|
466 |
- "movdqa %%xmm3, (%2,%4) \n\t" |
|
467 |
- "subl $4, %0 \n\t" |
|
468 |
- "lea (%2,%3,4), %2 \n\t" |
|
469 |
- "jnz 1b \n\t" |
|
470 |
- : "+g"(h), "+r" (pixels), "+r" (block) |
|
471 |
- : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) |
|
472 |
- : "memory" |
|
464 |
+ __asm__ volatile ( |
|
465 |
+ "1: \n\t" |
|
466 |
+ "movdqu (%1 ), %%xmm0 \n\t" |
|
467 |
+ "movdqu (%1, %3 ), %%xmm1 \n\t" |
|
468 |
+ "movdqu (%1, %3, 2), %%xmm2 \n\t" |
|
469 |
+ "movdqu (%1, %4 ), %%xmm3 \n\t" |
|
470 |
+ "lea (%1, %3, 4), %1 \n\t" |
|
471 |
+ "movdqa %%xmm0, (%2) \n\t" |
|
472 |
+ "movdqa %%xmm1, (%2, %3) \n\t" |
|
473 |
+ "movdqa %%xmm2, (%2, %3, 2) \n\t" |
|
474 |
+ "movdqa %%xmm3, (%2, %4) \n\t" |
|
475 |
+ "subl $4, %0 \n\t" |
|
476 |
+ "lea (%2, %3, 4), %2 \n\t" |
|
477 |
+ "jnz 1b \n\t" |
|
478 |
+ : "+g"(h), "+r"(pixels), "+r"(block) |
|
479 |
+ : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size) |
|
480 |
+ : "memory" |
|
473 | 481 |
); |
474 | 482 |
} |
475 | 483 |
|
476 |
-static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
484 |
+static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, |
|
485 |
+ int line_size, int h) |
|
477 | 486 |
{ |
478 |
- __asm__ volatile( |
|
479 |
- "1: \n\t" |
|
480 |
- "movdqu (%1), %%xmm0 \n\t" |
|
481 |
- "movdqu (%1,%3), %%xmm1 \n\t" |
|
482 |
- "movdqu (%1,%3,2), %%xmm2 \n\t" |
|
483 |
- "movdqu (%1,%4), %%xmm3 \n\t" |
|
484 |
- "lea (%1,%3,4), %1 \n\t" |
|
485 |
- "pavgb (%2), %%xmm0 \n\t" |
|
486 |
- "pavgb (%2,%3), %%xmm1 \n\t" |
|
487 |
- "pavgb (%2,%3,2), %%xmm2 \n\t" |
|
488 |
- "pavgb (%2,%4), %%xmm3 \n\t" |
|
489 |
- "movdqa %%xmm0, (%2) \n\t" |
|
490 |
- "movdqa %%xmm1, (%2,%3) \n\t" |
|
491 |
- "movdqa %%xmm2, (%2,%3,2) \n\t" |
|
492 |
- "movdqa %%xmm3, (%2,%4) \n\t" |
|
493 |
- "subl $4, %0 \n\t" |
|
494 |
- "lea (%2,%3,4), %2 \n\t" |
|
495 |
- "jnz 1b \n\t" |
|
496 |
- : "+g"(h), "+r" (pixels), "+r" (block) |
|
497 |
- : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size) |
|
498 |
- : "memory" |
|
487 |
+ __asm__ volatile ( |
|
488 |
+ "1: \n\t" |
|
489 |
+ "movdqu (%1 ), %%xmm0 \n\t" |
|
490 |
+ "movdqu (%1, %3 ), %%xmm1 \n\t" |
|
491 |
+ "movdqu (%1, %3, 2), %%xmm2 \n\t" |
|
492 |
+ "movdqu (%1, %4 ), %%xmm3 \n\t" |
|
493 |
+ "lea (%1, %3, 4), %1 \n\t" |
|
494 |
+ "pavgb (%2 ), %%xmm0 \n\t" |
|
495 |
+ "pavgb (%2, %3 ), %%xmm1 \n\t" |
|
496 |
+ "pavgb (%2, %3, 2), %%xmm2 \n\t" |
|
497 |
+ "pavgb (%2, %4), %%xmm3 \n\t" |
|
498 |
+ "movdqa %%xmm0, (%2) \n\t" |
|
499 |
+ "movdqa %%xmm1, (%2, %3) \n\t" |
|
500 |
+ "movdqa %%xmm2, (%2, %3, 2) \n\t" |
|
501 |
+ "movdqa %%xmm3, (%2, %4) \n\t" |
|
502 |
+ "subl $4, %0 \n\t" |
|
503 |
+ "lea (%2, %3, 4), %2 \n\t" |
|
504 |
+ "jnz 1b \n\t" |
|
505 |
+ : "+g"(h), "+r"(pixels), "+r"(block) |
|
506 |
+ : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size) |
|
507 |
+ : "memory" |
|
499 | 508 |
); |
500 | 509 |
} |
501 | 510 |
|
502 |
-#define CLEAR_BLOCKS(name,n) \ |
|
503 |
-static void name(DCTELEM *blocks)\ |
|
504 |
-{\ |
|
505 |
- __asm__ volatile(\ |
|
506 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
507 |
- "mov %1, %%"REG_a" \n\t"\ |
|
508 |
- "1: \n\t"\ |
|
509 |
- "movq %%mm7, (%0, %%"REG_a") \n\t"\ |
|
510 |
- "movq %%mm7, 8(%0, %%"REG_a") \n\t"\ |
|
511 |
- "movq %%mm7, 16(%0, %%"REG_a") \n\t"\ |
|
512 |
- "movq %%mm7, 24(%0, %%"REG_a") \n\t"\ |
|
513 |
- "add $32, %%"REG_a" \n\t"\ |
|
514 |
- " js 1b \n\t"\ |
|
515 |
- : : "r" (((uint8_t *)blocks)+128*n),\ |
|
516 |
- "i" (-128*n)\ |
|
517 |
- : "%"REG_a\ |
|
518 |
- );\ |
|
511 |
+#define CLEAR_BLOCKS(name, n) \ |
|
512 |
+static void name(DCTELEM *blocks) \ |
|
513 |
+{ \ |
|
514 |
+ __asm__ volatile ( \ |
|
515 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
516 |
+ "mov %1, %%"REG_a" \n\t" \ |
|
517 |
+ "1: \n\t" \ |
|
518 |
+ "movq %%mm7, (%0, %%"REG_a") \n\t" \ |
|
519 |
+ "movq %%mm7, 8(%0, %%"REG_a") \n\t" \ |
|
520 |
+ "movq %%mm7, 16(%0, %%"REG_a") \n\t" \ |
|
521 |
+ "movq %%mm7, 24(%0, %%"REG_a") \n\t" \ |
|
522 |
+ "add $32, %%"REG_a" \n\t" \ |
|
523 |
+ "js 1b \n\t" \ |
|
524 |
+ :: "r"(((uint8_t *)blocks) + 128 * n), \ |
|
525 |
+ "i"(-128 * n) \ |
|
526 |
+ : "%"REG_a \ |
|
527 |
+ ); \ |
|
519 | 528 |
} |
520 | 529 |
CLEAR_BLOCKS(clear_blocks_mmx, 6) |
521 | 530 |
CLEAR_BLOCKS(clear_block_mmx, 1) |
522 | 531 |
|
523 | 532 |
static void clear_block_sse(DCTELEM *block) |
524 | 533 |
{ |
525 |
- __asm__ volatile( |
|
526 |
- "xorps %%xmm0, %%xmm0 \n" |
|
527 |
- "movaps %%xmm0, (%0) \n" |
|
528 |
- "movaps %%xmm0, 16(%0) \n" |
|
529 |
- "movaps %%xmm0, 32(%0) \n" |
|
530 |
- "movaps %%xmm0, 48(%0) \n" |
|
531 |
- "movaps %%xmm0, 64(%0) \n" |
|
532 |
- "movaps %%xmm0, 80(%0) \n" |
|
533 |
- "movaps %%xmm0, 96(%0) \n" |
|
534 |
- "movaps %%xmm0, 112(%0) \n" |
|
534 |
+ __asm__ volatile ( |
|
535 |
+ "xorps %%xmm0, %%xmm0 \n" |
|
536 |
+ "movaps %%xmm0, (%0) \n" |
|
537 |
+ "movaps %%xmm0, 16(%0) \n" |
|
538 |
+ "movaps %%xmm0, 32(%0) \n" |
|
539 |
+ "movaps %%xmm0, 48(%0) \n" |
|
540 |
+ "movaps %%xmm0, 64(%0) \n" |
|
541 |
+ "movaps %%xmm0, 80(%0) \n" |
|
542 |
+ "movaps %%xmm0, 96(%0) \n" |
|
543 |
+ "movaps %%xmm0, 112(%0) \n" |
|
535 | 544 |
:: "r"(block) |
536 | 545 |
: "memory" |
537 | 546 |
); |
538 | 547 |
} |
539 | 548 |
|
540 | 549 |
static void clear_blocks_sse(DCTELEM *blocks) |
541 |
-{\ |
|
542 |
- __asm__ volatile( |
|
543 |
- "xorps %%xmm0, %%xmm0 \n" |
|
544 |
- "mov %1, %%"REG_a" \n" |
|
545 |
- "1: \n" |
|
546 |
- "movaps %%xmm0, (%0, %%"REG_a") \n" |
|
547 |
- "movaps %%xmm0, 16(%0, %%"REG_a") \n" |
|
548 |
- "movaps %%xmm0, 32(%0, %%"REG_a") \n" |
|
549 |
- "movaps %%xmm0, 48(%0, %%"REG_a") \n" |
|
550 |
- "movaps %%xmm0, 64(%0, %%"REG_a") \n" |
|
551 |
- "movaps %%xmm0, 80(%0, %%"REG_a") \n" |
|
552 |
- "movaps %%xmm0, 96(%0, %%"REG_a") \n" |
|
553 |
- "movaps %%xmm0, 112(%0, %%"REG_a") \n" |
|
554 |
- "add $128, %%"REG_a" \n" |
|
555 |
- " js 1b \n" |
|
556 |
- : : "r" (((uint8_t *)blocks)+128*6), |
|
557 |
- "i" (-128*6) |
|
550 |
+{ |
|
551 |
+ __asm__ volatile ( |
|
552 |
+ "xorps %%xmm0, %%xmm0 \n" |
|
553 |
+ "mov %1, %%"REG_a" \n" |
|
554 |
+ "1: \n" |
|
555 |
+ "movaps %%xmm0, (%0, %%"REG_a") \n" |
|
556 |
+ "movaps %%xmm0, 16(%0, %%"REG_a") \n" |
|
557 |
+ "movaps %%xmm0, 32(%0, %%"REG_a") \n" |
|
558 |
+ "movaps %%xmm0, 48(%0, %%"REG_a") \n" |
|
559 |
+ "movaps %%xmm0, 64(%0, %%"REG_a") \n" |
|
560 |
+ "movaps %%xmm0, 80(%0, %%"REG_a") \n" |
|
561 |
+ "movaps %%xmm0, 96(%0, %%"REG_a") \n" |
|
562 |
+ "movaps %%xmm0, 112(%0, %%"REG_a") \n" |
|
563 |
+ "add $128, %%"REG_a" \n" |
|
564 |
+ "js 1b \n" |
|
565 |
+ :: "r"(((uint8_t *)blocks) + 128 * 6), |
|
566 |
+ "i"(-128 * 6) |
|
558 | 567 |
: "%"REG_a |
559 | 568 |
); |
560 | 569 |
} |
561 | 570 |
|
562 |
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ |
|
563 |
- x86_reg i=0; |
|
564 |
- __asm__ volatile( |
|
565 |
- "jmp 2f \n\t" |
|
571 |
+static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) |
|
572 |
+{ |
|
573 |
+ x86_reg i = 0; |
|
574 |
+ __asm__ volatile ( |
|
575 |
+ "jmp 2f \n\t" |
|
566 | 576 |
"1: \n\t" |
567 |
- "movq (%1, %0), %%mm0 \n\t" |
|
568 |
- "movq (%2, %0), %%mm1 \n\t" |
|
569 |
- "paddb %%mm0, %%mm1 \n\t" |
|
570 |
- "movq %%mm1, (%2, %0) \n\t" |
|
571 |
- "movq 8(%1, %0), %%mm0 \n\t" |
|
572 |
- "movq 8(%2, %0), %%mm1 \n\t" |
|
573 |
- "paddb %%mm0, %%mm1 \n\t" |
|
574 |
- "movq %%mm1, 8(%2, %0) \n\t" |
|
575 |
- "add $16, %0 \n\t" |
|
577 |
+ "movq (%1, %0), %%mm0 \n\t" |
|
578 |
+ "movq (%2, %0), %%mm1 \n\t" |
|
579 |
+ "paddb %%mm0, %%mm1 \n\t" |
|
580 |
+ "movq %%mm1, (%2, %0) \n\t" |
|
581 |
+ "movq 8(%1, %0), %%mm0 \n\t" |
|
582 |
+ "movq 8(%2, %0), %%mm1 \n\t" |
|
583 |
+ "paddb %%mm0, %%mm1 \n\t" |
|
584 |
+ "movq %%mm1, 8(%2, %0) \n\t" |
|
585 |
+ "add $16, %0 \n\t" |
|
576 | 586 |
"2: \n\t" |
577 |
- "cmp %3, %0 \n\t" |
|
578 |
- " js 1b \n\t" |
|
579 |
- : "+r" (i) |
|
580 |
- : "r"(src), "r"(dst), "r"((x86_reg)w-15) |
|
587 |
+ "cmp %3, %0 \n\t" |
|
588 |
+ "js 1b \n\t" |
|
589 |
+ : "+r"(i) |
|
590 |
+ : "r"(src), "r"(dst), "r"((x86_reg)w - 15) |
|
581 | 591 |
); |
582 |
- for(; i<w; i++) |
|
583 |
- dst[i+0] += src[i+0]; |
|
592 |
+ for ( ; i < w; i++) |
|
593 |
+ dst[i + 0] += src[i + 0]; |
|
584 | 594 |
} |
585 | 595 |
|
586 | 596 |
#if HAVE_7REGS |
587 |
-static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) { |
|
597 |
+static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, |
|
598 |
+ const uint8_t *diff, int w, |
|
599 |
+ int *left, int *left_top) |
|
600 |
+{ |
|
588 | 601 |
x86_reg w2 = -w; |
589 | 602 |
x86_reg x; |
590 |
- int l = *left & 0xff; |
|
603 |
+ int l = *left & 0xff; |
|
591 | 604 |
int tl = *left_top & 0xff; |
592 | 605 |
int t; |
593 |
- __asm__ volatile( |
|
594 |
- "mov %7, %3 \n" |
|
595 |
- "1: \n" |
|
596 |
- "movzbl (%3,%4), %2 \n" |
|
597 |
- "mov %2, %k3 \n" |
|
598 |
- "sub %b1, %b3 \n" |
|
599 |
- "add %b0, %b3 \n" |
|
600 |
- "mov %2, %1 \n" |
|
601 |
- "cmp %0, %2 \n" |
|
602 |
- "cmovg %0, %2 \n" |
|
603 |
- "cmovg %1, %0 \n" |
|
604 |
- "cmp %k3, %0 \n" |
|
605 |
- "cmovg %k3, %0 \n" |
|
606 |
- "mov %7, %3 \n" |
|
607 |
- "cmp %2, %0 \n" |
|
608 |
- "cmovl %2, %0 \n" |
|
609 |
- "add (%6,%4), %b0 \n" |
|
610 |
- "mov %b0, (%5,%4) \n" |
|
611 |
- "inc %4 \n" |
|
612 |
- "jl 1b \n" |
|
613 |
- :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) |
|
614 |
- :"r"(dst+w), "r"(diff+w), "rm"(top+w) |
|
606 |
+ __asm__ volatile ( |
|
607 |
+ "mov %7, %3 \n" |
|
608 |
+ "1: \n" |
|
609 |
+ "movzbl (%3, %4), %2 \n" |
|
610 |
+ "mov %2, %k3 \n" |
|
611 |
+ "sub %b1, %b3 \n" |
|
612 |
+ "add %b0, %b3 \n" |
|
613 |
+ "mov %2, %1 \n" |
|
614 |
+ "cmp %0, %2 \n" |
|
615 |
+ "cmovg %0, %2 \n" |
|
616 |
+ "cmovg %1, %0 \n" |
|
617 |
+ "cmp %k3, %0 \n" |
|
618 |
+ "cmovg %k3, %0 \n" |
|
619 |
+ "mov %7, %3 \n" |
|
620 |
+ "cmp %2, %0 \n" |
|
621 |
+ "cmovl %2, %0 \n" |
|
622 |
+ "add (%6, %4), %b0 \n" |
|
623 |
+ "mov %b0, (%5, %4) \n" |
|
624 |
+ "inc %4 \n" |
|
625 |
+ "jl 1b \n" |
|
626 |
+ : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) |
|
627 |
+ : "r"(dst + w), "r"(diff + w), "rm"(top + w) |
|
615 | 628 |
); |
616 |
- *left = l; |
|
629 |
+ *left = l; |
|
617 | 630 |
*left_top = tl; |
618 | 631 |
} |
619 | 632 |
#endif |
620 | 633 |
|
621 |
-#define H263_LOOP_FILTER \ |
|
622 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
623 |
- "movq %0, %%mm0 \n\t"\ |
|
624 |
- "movq %0, %%mm1 \n\t"\ |
|
625 |
- "movq %3, %%mm2 \n\t"\ |
|
626 |
- "movq %3, %%mm3 \n\t"\ |
|
627 |
- "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
628 |
- "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
629 |
- "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
630 |
- "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
631 |
- "psubw %%mm2, %%mm0 \n\t"\ |
|
632 |
- "psubw %%mm3, %%mm1 \n\t"\ |
|
633 |
- "movq %1, %%mm2 \n\t"\ |
|
634 |
- "movq %1, %%mm3 \n\t"\ |
|
635 |
- "movq %2, %%mm4 \n\t"\ |
|
636 |
- "movq %2, %%mm5 \n\t"\ |
|
637 |
- "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
638 |
- "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
639 |
- "punpcklbw %%mm7, %%mm4 \n\t"\ |
|
640 |
- "punpckhbw %%mm7, %%mm5 \n\t"\ |
|
641 |
- "psubw %%mm2, %%mm4 \n\t"\ |
|
642 |
- "psubw %%mm3, %%mm5 \n\t"\ |
|
643 |
- "psllw $2, %%mm4 \n\t"\ |
|
644 |
- "psllw $2, %%mm5 \n\t"\ |
|
645 |
- "paddw %%mm0, %%mm4 \n\t"\ |
|
646 |
- "paddw %%mm1, %%mm5 \n\t"\ |
|
647 |
- "pxor %%mm6, %%mm6 \n\t"\ |
|
648 |
- "pcmpgtw %%mm4, %%mm6 \n\t"\ |
|
649 |
- "pcmpgtw %%mm5, %%mm7 \n\t"\ |
|
650 |
- "pxor %%mm6, %%mm4 \n\t"\ |
|
651 |
- "pxor %%mm7, %%mm5 \n\t"\ |
|
652 |
- "psubw %%mm6, %%mm4 \n\t"\ |
|
653 |
- "psubw %%mm7, %%mm5 \n\t"\ |
|
654 |
- "psrlw $3, %%mm4 \n\t"\ |
|
655 |
- "psrlw $3, %%mm5 \n\t"\ |
|
656 |
- "packuswb %%mm5, %%mm4 \n\t"\ |
|
657 |
- "packsswb %%mm7, %%mm6 \n\t"\ |
|
658 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
659 |
- "movd %4, %%mm2 \n\t"\ |
|
660 |
- "punpcklbw %%mm2, %%mm2 \n\t"\ |
|
661 |
- "punpcklbw %%mm2, %%mm2 \n\t"\ |
|
662 |
- "punpcklbw %%mm2, %%mm2 \n\t"\ |
|
663 |
- "psubusb %%mm4, %%mm2 \n\t"\ |
|
664 |
- "movq %%mm2, %%mm3 \n\t"\ |
|
665 |
- "psubusb %%mm4, %%mm3 \n\t"\ |
|
666 |
- "psubb %%mm3, %%mm2 \n\t"\ |
|
667 |
- "movq %1, %%mm3 \n\t"\ |
|
668 |
- "movq %2, %%mm4 \n\t"\ |
|
669 |
- "pxor %%mm6, %%mm3 \n\t"\ |
|
670 |
- "pxor %%mm6, %%mm4 \n\t"\ |
|
671 |
- "paddusb %%mm2, %%mm3 \n\t"\ |
|
672 |
- "psubusb %%mm2, %%mm4 \n\t"\ |
|
673 |
- "pxor %%mm6, %%mm3 \n\t"\ |
|
674 |
- "pxor %%mm6, %%mm4 \n\t"\ |
|
675 |
- "paddusb %%mm2, %%mm2 \n\t"\ |
|
676 |
- "packsswb %%mm1, %%mm0 \n\t"\ |
|
677 |
- "pcmpgtb %%mm0, %%mm7 \n\t"\ |
|
678 |
- "pxor %%mm7, %%mm0 \n\t"\ |
|
679 |
- "psubb %%mm7, %%mm0 \n\t"\ |
|
680 |
- "movq %%mm0, %%mm1 \n\t"\ |
|
681 |
- "psubusb %%mm2, %%mm0 \n\t"\ |
|
682 |
- "psubb %%mm0, %%mm1 \n\t"\ |
|
683 |
- "pand %5, %%mm1 \n\t"\ |
|
684 |
- "psrlw $2, %%mm1 \n\t"\ |
|
685 |
- "pxor %%mm7, %%mm1 \n\t"\ |
|
686 |
- "psubb %%mm7, %%mm1 \n\t"\ |
|
687 |
- "movq %0, %%mm5 \n\t"\ |
|
688 |
- "movq %3, %%mm6 \n\t"\ |
|
689 |
- "psubb %%mm1, %%mm5 \n\t"\ |
|
690 |
- "paddb %%mm1, %%mm6 \n\t" |
|
691 |
- |
|
692 |
-static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ |
|
693 |
- if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
|
694 |
- const int strength= ff_h263_loop_filter_strength[qscale]; |
|
695 |
- |
|
696 |
- __asm__ volatile( |
|
697 |
- |
|
698 |
- H263_LOOP_FILTER |
|
699 |
- |
|
700 |
- "movq %%mm3, %1 \n\t" |
|
701 |
- "movq %%mm4, %2 \n\t" |
|
702 |
- "movq %%mm5, %0 \n\t" |
|
703 |
- "movq %%mm6, %3 \n\t" |
|
704 |
- : "+m" (*(uint64_t*)(src - 2*stride)), |
|
705 |
- "+m" (*(uint64_t*)(src - 1*stride)), |
|
706 |
- "+m" (*(uint64_t*)(src + 0*stride)), |
|
707 |
- "+m" (*(uint64_t*)(src + 1*stride)) |
|
708 |
- : "g" (2*strength), "m"(ff_pb_FC) |
|
709 |
- ); |
|
634 |
+#define H263_LOOP_FILTER \ |
|
635 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
636 |
+ "movq %0, %%mm0 \n\t" \ |
|
637 |
+ "movq %0, %%mm1 \n\t" \ |
|
638 |
+ "movq %3, %%mm2 \n\t" \ |
|
639 |
+ "movq %3, %%mm3 \n\t" \ |
|
640 |
+ "punpcklbw %%mm7, %%mm0 \n\t" \ |
|
641 |
+ "punpckhbw %%mm7, %%mm1 \n\t" \ |
|
642 |
+ "punpcklbw %%mm7, %%mm2 \n\t" \ |
|
643 |
+ "punpckhbw %%mm7, %%mm3 \n\t" \ |
|
644 |
+ "psubw %%mm2, %%mm0 \n\t" \ |
|
645 |
+ "psubw %%mm3, %%mm1 \n\t" \ |
|
646 |
+ "movq %1, %%mm2 \n\t" \ |
|
647 |
+ "movq %1, %%mm3 \n\t" \ |
|
648 |
+ "movq %2, %%mm4 \n\t" \ |
|
649 |
+ "movq %2, %%mm5 \n\t" \ |
|
650 |
+ "punpcklbw %%mm7, %%mm2 \n\t" \ |
|
651 |
+ "punpckhbw %%mm7, %%mm3 \n\t" \ |
|
652 |
+ "punpcklbw %%mm7, %%mm4 \n\t" \ |
|
653 |
+ "punpckhbw %%mm7, %%mm5 \n\t" \ |
|
654 |
+ "psubw %%mm2, %%mm4 \n\t" \ |
|
655 |
+ "psubw %%mm3, %%mm5 \n\t" \ |
|
656 |
+ "psllw $2, %%mm4 \n\t" \ |
|
657 |
+ "psllw $2, %%mm5 \n\t" \ |
|
658 |
+ "paddw %%mm0, %%mm4 \n\t" \ |
|
659 |
+ "paddw %%mm1, %%mm5 \n\t" \ |
|
660 |
+ "pxor %%mm6, %%mm6 \n\t" \ |
|
661 |
+ "pcmpgtw %%mm4, %%mm6 \n\t" \ |
|
662 |
+ "pcmpgtw %%mm5, %%mm7 \n\t" \ |
|
663 |
+ "pxor %%mm6, %%mm4 \n\t" \ |
|
664 |
+ "pxor %%mm7, %%mm5 \n\t" \ |
|
665 |
+ "psubw %%mm6, %%mm4 \n\t" \ |
|
666 |
+ "psubw %%mm7, %%mm5 \n\t" \ |
|
667 |
+ "psrlw $3, %%mm4 \n\t" \ |
|
668 |
+ "psrlw $3, %%mm5 \n\t" \ |
|
669 |
+ "packuswb %%mm5, %%mm4 \n\t" \ |
|
670 |
+ "packsswb %%mm7, %%mm6 \n\t" \ |
|
671 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
672 |
+ "movd %4, %%mm2 \n\t" \ |
|
673 |
+ "punpcklbw %%mm2, %%mm2 \n\t" \ |
|
674 |
+ "punpcklbw %%mm2, %%mm2 \n\t" \ |
|
675 |
+ "punpcklbw %%mm2, %%mm2 \n\t" \ |
|
676 |
+ "psubusb %%mm4, %%mm2 \n\t" \ |
|
677 |
+ "movq %%mm2, %%mm3 \n\t" \ |
|
678 |
+ "psubusb %%mm4, %%mm3 \n\t" \ |
|
679 |
+ "psubb %%mm3, %%mm2 \n\t" \ |
|
680 |
+ "movq %1, %%mm3 \n\t" \ |
|
681 |
+ "movq %2, %%mm4 \n\t" \ |
|
682 |
+ "pxor %%mm6, %%mm3 \n\t" \ |
|
683 |
+ "pxor %%mm6, %%mm4 \n\t" \ |
|
684 |
+ "paddusb %%mm2, %%mm3 \n\t" \ |
|
685 |
+ "psubusb %%mm2, %%mm4 \n\t" \ |
|
686 |
+ "pxor %%mm6, %%mm3 \n\t" \ |
|
687 |
+ "pxor %%mm6, %%mm4 \n\t" \ |
|
688 |
+ "paddusb %%mm2, %%mm2 \n\t" \ |
|
689 |
+ "packsswb %%mm1, %%mm0 \n\t" \ |
|
690 |
+ "pcmpgtb %%mm0, %%mm7 \n\t" \ |
|
691 |
+ "pxor %%mm7, %%mm0 \n\t" \ |
|
692 |
+ "psubb %%mm7, %%mm0 \n\t" \ |
|
693 |
+ "movq %%mm0, %%mm1 \n\t" \ |
|
694 |
+ "psubusb %%mm2, %%mm0 \n\t" \ |
|
695 |
+ "psubb %%mm0, %%mm1 \n\t" \ |
|
696 |
+ "pand %5, %%mm1 \n\t" \ |
|
697 |
+ "psrlw $2, %%mm1 \n\t" \ |
|
698 |
+ "pxor %%mm7, %%mm1 \n\t" \ |
|
699 |
+ "psubb %%mm7, %%mm1 \n\t" \ |
|
700 |
+ "movq %0, %%mm5 \n\t" \ |
|
701 |
+ "movq %3, %%mm6 \n\t" \ |
|
702 |
+ "psubb %%mm1, %%mm5 \n\t" \ |
|
703 |
+ "paddb %%mm1, %%mm6 \n\t" |
|
704 |
+ |
|
705 |
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale) |
|
706 |
+{ |
|
707 |
+ if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
|
708 |
+ const int strength = ff_h263_loop_filter_strength[qscale]; |
|
709 |
+ |
|
710 |
+ __asm__ volatile ( |
|
711 |
+ H263_LOOP_FILTER |
|
712 |
+ |
|
713 |
+ "movq %%mm3, %1 \n\t" |
|
714 |
+ "movq %%mm4, %2 \n\t" |
|
715 |
+ "movq %%mm5, %0 \n\t" |
|
716 |
+ "movq %%mm6, %3 \n\t" |
|
717 |
+ : "+m"(*(uint64_t*)(src - 2 * stride)), |
|
718 |
+ "+m"(*(uint64_t*)(src - 1 * stride)), |
|
719 |
+ "+m"(*(uint64_t*)(src + 0 * stride)), |
|
720 |
+ "+m"(*(uint64_t*)(src + 1 * stride)) |
|
721 |
+ : "g"(2 * strength), "m"(ff_pb_FC) |
|
722 |
+ ); |
|
710 | 723 |
} |
711 | 724 |
} |
712 | 725 |
|
713 |
-static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ |
|
714 |
- if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
|
715 |
- const int strength= ff_h263_loop_filter_strength[qscale]; |
|
716 |
- DECLARE_ALIGNED(8, uint64_t, temp)[4]; |
|
717 |
- uint8_t *btemp= (uint8_t*)temp; |
|
718 |
- |
|
719 |
- src -= 2; |
|
720 |
- |
|
721 |
- transpose4x4(btemp , src , 8, stride); |
|
722 |
- transpose4x4(btemp+4, src + 4*stride, 8, stride); |
|
723 |
- __asm__ volatile( |
|
724 |
- H263_LOOP_FILTER // 5 3 4 6 |
|
725 |
- |
|
726 |
- : "+m" (temp[0]), |
|
727 |
- "+m" (temp[1]), |
|
728 |
- "+m" (temp[2]), |
|
729 |
- "+m" (temp[3]) |
|
730 |
- : "g" (2*strength), "m"(ff_pb_FC) |
|
731 |
- ); |
|
726 |
+static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale) |
|
727 |
+{ |
|
728 |
+ if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
|
729 |
+ const int strength = ff_h263_loop_filter_strength[qscale]; |
|
730 |
+ DECLARE_ALIGNED(8, uint64_t, temp)[4]; |
|
731 |
+ uint8_t *btemp = (uint8_t*)temp; |
|
732 |
+ |
|
733 |
+ src -= 2; |
|
734 |
+ |
|
735 |
+ transpose4x4(btemp, src, 8, stride); |
|
736 |
+ transpose4x4(btemp + 4, src + 4 * stride, 8, stride); |
|
737 |
+ __asm__ volatile ( |
|
738 |
+ H263_LOOP_FILTER // 5 3 4 6 |
|
739 |
+ |
|
740 |
+ : "+m"(temp[0]), |
|
741 |
+ "+m"(temp[1]), |
|
742 |
+ "+m"(temp[2]), |
|
743 |
+ "+m"(temp[3]) |
|
744 |
+ : "g"(2 * strength), "m"(ff_pb_FC) |
|
745 |
+ ); |
|
732 | 746 |
|
733 |
- __asm__ volatile( |
|
734 |
- "movq %%mm5, %%mm1 \n\t" |
|
735 |
- "movq %%mm4, %%mm0 \n\t" |
|
736 |
- "punpcklbw %%mm3, %%mm5 \n\t" |
|
737 |
- "punpcklbw %%mm6, %%mm4 \n\t" |
|
738 |
- "punpckhbw %%mm3, %%mm1 \n\t" |
|
739 |
- "punpckhbw %%mm6, %%mm0 \n\t" |
|
740 |
- "movq %%mm5, %%mm3 \n\t" |
|
741 |
- "movq %%mm1, %%mm6 \n\t" |
|
742 |
- "punpcklwd %%mm4, %%mm5 \n\t" |
|
743 |
- "punpcklwd %%mm0, %%mm1 \n\t" |
|
744 |
- "punpckhwd %%mm4, %%mm3 \n\t" |
|
745 |
- "punpckhwd %%mm0, %%mm6 \n\t" |
|
746 |
- "movd %%mm5, (%0) \n\t" |
|
747 |
- "punpckhdq %%mm5, %%mm5 \n\t" |
|
748 |
- "movd %%mm5, (%0,%2) \n\t" |
|
749 |
- "movd %%mm3, (%0,%2,2) \n\t" |
|
750 |
- "punpckhdq %%mm3, %%mm3 \n\t" |
|
751 |
- "movd %%mm3, (%0,%3) \n\t" |
|
752 |
- "movd %%mm1, (%1) \n\t" |
|
753 |
- "punpckhdq %%mm1, %%mm1 \n\t" |
|
754 |
- "movd %%mm1, (%1,%2) \n\t" |
|
755 |
- "movd %%mm6, (%1,%2,2) \n\t" |
|
756 |
- "punpckhdq %%mm6, %%mm6 \n\t" |
|
757 |
- "movd %%mm6, (%1,%3) \n\t" |
|
758 |
- :: "r" (src), |
|
759 |
- "r" (src + 4*stride), |
|
760 |
- "r" ((x86_reg) stride ), |
|
761 |
- "r" ((x86_reg)(3*stride)) |
|
762 |
- ); |
|
747 |
+ __asm__ volatile ( |
|
748 |
+ "movq %%mm5, %%mm1 \n\t" |
|
749 |
+ "movq %%mm4, %%mm0 \n\t" |
|
750 |
+ "punpcklbw %%mm3, %%mm5 \n\t" |
|
751 |
+ "punpcklbw %%mm6, %%mm4 \n\t" |
|
752 |
+ "punpckhbw %%mm3, %%mm1 \n\t" |
|
753 |
+ "punpckhbw %%mm6, %%mm0 \n\t" |
|
754 |
+ "movq %%mm5, %%mm3 \n\t" |
|
755 |
+ "movq %%mm1, %%mm6 \n\t" |
|
756 |
+ "punpcklwd %%mm4, %%mm5 \n\t" |
|
757 |
+ "punpcklwd %%mm0, %%mm1 \n\t" |
|
758 |
+ "punpckhwd %%mm4, %%mm3 \n\t" |
|
759 |
+ "punpckhwd %%mm0, %%mm6 \n\t" |
|
760 |
+ "movd %%mm5, (%0) \n\t" |
|
761 |
+ "punpckhdq %%mm5, %%mm5 \n\t" |
|
762 |
+ "movd %%mm5, (%0, %2) \n\t" |
|
763 |
+ "movd %%mm3, (%0, %2, 2) \n\t" |
|
764 |
+ "punpckhdq %%mm3, %%mm3 \n\t" |
|
765 |
+ "movd %%mm3, (%0, %3) \n\t" |
|
766 |
+ "movd %%mm1, (%1) \n\t" |
|
767 |
+ "punpckhdq %%mm1, %%mm1 \n\t" |
|
768 |
+ "movd %%mm1, (%1, %2) \n\t" |
|
769 |
+ "movd %%mm6, (%1, %2, 2) \n\t" |
|
770 |
+ "punpckhdq %%mm6, %%mm6 \n\t" |
|
771 |
+ "movd %%mm6, (%1, %3) \n\t" |
|
772 |
+ :: "r"(src), |
|
773 |
+ "r"(src + 4 * stride), |
|
774 |
+ "r"((x86_reg)stride), |
|
775 |
+ "r"((x86_reg)(3 * stride)) |
|
776 |
+ ); |
|
763 | 777 |
} |
764 | 778 |
} |
765 | 779 |
|
766 |
-/* draw the edges of width 'w' of an image of size width, height |
|
767 |
- this mmx version can only handle w==8 || w==16 */ |
|
768 |
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides) |
|
780 |
+/* Draw the edges of width 'w' of an image of size width, height |
|
781 |
+ * this MMX version can only handle w == 8 || w == 16. */ |
|
782 |
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, |
|
783 |
+ int w, int h, int sides) |
|
769 | 784 |
{ |
770 | 785 |
uint8_t *ptr, *last_line; |
771 | 786 |
int i; |
... | ... |
@@ -773,794 +788,1000 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, |
773 | 773 |
last_line = buf + (height - 1) * wrap; |
774 | 774 |
/* left and right */ |
775 | 775 |
ptr = buf; |
776 |
- if(w==8) |
|
777 |
- { |
|
778 |
- __asm__ volatile( |
|
779 |
- "1: \n\t" |
|
780 |
- "movd (%0), %%mm0 \n\t" |
|
781 |
- "punpcklbw %%mm0, %%mm0 \n\t" |
|
782 |
- "punpcklwd %%mm0, %%mm0 \n\t" |
|
783 |
- "punpckldq %%mm0, %%mm0 \n\t" |
|
784 |
- "movq %%mm0, -8(%0) \n\t" |
|
785 |
- "movq -8(%0, %2), %%mm1 \n\t" |
|
786 |
- "punpckhbw %%mm1, %%mm1 \n\t" |
|
787 |
- "punpckhwd %%mm1, %%mm1 \n\t" |
|
788 |
- "punpckhdq %%mm1, %%mm1 \n\t" |
|
789 |
- "movq %%mm1, (%0, %2) \n\t" |
|
790 |
- "add %1, %0 \n\t" |
|
791 |
- "cmp %3, %0 \n\t" |
|
792 |
- " jb 1b \n\t" |
|
793 |
- : "+r" (ptr) |
|
794 |
- : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) |
|
795 |
- ); |
|
796 |
- } |
|
797 |
- else |
|
798 |
- { |
|
799 |
- __asm__ volatile( |
|
800 |
- "1: \n\t" |
|
801 |
- "movd (%0), %%mm0 \n\t" |
|
802 |
- "punpcklbw %%mm0, %%mm0 \n\t" |
|
803 |
- "punpcklwd %%mm0, %%mm0 \n\t" |
|
804 |
- "punpckldq %%mm0, %%mm0 \n\t" |
|
805 |
- "movq %%mm0, -8(%0) \n\t" |
|
806 |
- "movq %%mm0, -16(%0) \n\t" |
|
807 |
- "movq -8(%0, %2), %%mm1 \n\t" |
|
808 |
- "punpckhbw %%mm1, %%mm1 \n\t" |
|
809 |
- "punpckhwd %%mm1, %%mm1 \n\t" |
|
810 |
- "punpckhdq %%mm1, %%mm1 \n\t" |
|
811 |
- "movq %%mm1, (%0, %2) \n\t" |
|
812 |
- "movq %%mm1, 8(%0, %2) \n\t" |
|
813 |
- "add %1, %0 \n\t" |
|
814 |
- "cmp %3, %0 \n\t" |
|
815 |
- " jb 1b \n\t" |
|
816 |
- : "+r" (ptr) |
|
817 |
- : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height) |
|
818 |
- ); |
|
776 |
+ if (w == 8) { |
|
777 |
+ __asm__ volatile ( |
|
778 |
+ "1: \n\t" |
|
779 |
+ "movd (%0), %%mm0 \n\t" |
|
780 |
+ "punpcklbw %%mm0, %%mm0 \n\t" |
|
781 |
+ "punpcklwd %%mm0, %%mm0 \n\t" |
|
782 |
+ "punpckldq %%mm0, %%mm0 \n\t" |
|
783 |
+ "movq %%mm0, -8(%0) \n\t" |
|
784 |
+ "movq -8(%0, %2), %%mm1 \n\t" |
|
785 |
+ "punpckhbw %%mm1, %%mm1 \n\t" |
|
786 |
+ "punpckhwd %%mm1, %%mm1 \n\t" |
|
787 |
+ "punpckhdq %%mm1, %%mm1 \n\t" |
|
788 |
+ "movq %%mm1, (%0, %2) \n\t" |
|
789 |
+ "add %1, %0 \n\t" |
|
790 |
+ "cmp %3, %0 \n\t" |
|
791 |
+ "jb 1b \n\t" |
|
792 |
+ : "+r"(ptr) |
|
793 |
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) |
|
794 |
+ ); |
|
795 |
+ } else { |
|
796 |
+ __asm__ volatile ( |
|
797 |
+ "1: \n\t" |
|
798 |
+ "movd (%0), %%mm0 \n\t" |
|
799 |
+ "punpcklbw %%mm0, %%mm0 \n\t" |
|
800 |
+ "punpcklwd %%mm0, %%mm0 \n\t" |
|
801 |
+ "punpckldq %%mm0, %%mm0 \n\t" |
|
802 |
+ "movq %%mm0, -8(%0) \n\t" |
|
803 |
+ "movq %%mm0, -16(%0) \n\t" |
|
804 |
+ "movq -8(%0, %2), %%mm1 \n\t" |
|
805 |
+ "punpckhbw %%mm1, %%mm1 \n\t" |
|
806 |
+ "punpckhwd %%mm1, %%mm1 \n\t" |
|
807 |
+ "punpckhdq %%mm1, %%mm1 \n\t" |
|
808 |
+ "movq %%mm1, (%0, %2) \n\t" |
|
809 |
+ "movq %%mm1, 8(%0, %2) \n\t" |
|
810 |
+ "add %1, %0 \n\t" |
|
811 |
+ "cmp %3, %0 \n\t" |
|
812 |
+ "jb 1b \n\t" |
|
813 |
+ : "+r"(ptr) |
|
814 |
+ : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) |
|
815 |
+ ); |
|
819 | 816 |
} |
820 | 817 |
|
821 | 818 |
/* top and bottom (and hopefully also the corners) */ |
822 |
- if (sides&EDGE_TOP) { |
|
823 |
- for(i = 0; i < h; i += 4) { |
|
824 |
- ptr= buf - (i + 1) * wrap - w; |
|
825 |
- __asm__ volatile( |
|
826 |
- "1: \n\t" |
|
827 |
- "movq (%1, %0), %%mm0 \n\t" |
|
828 |
- "movq %%mm0, (%0) \n\t" |
|
829 |
- "movq %%mm0, (%0, %2) \n\t" |
|
830 |
- "movq %%mm0, (%0, %2, 2) \n\t" |
|
831 |
- "movq %%mm0, (%0, %3) \n\t" |
|
832 |
- "add $8, %0 \n\t" |
|
833 |
- "cmp %4, %0 \n\t" |
|
834 |
- " jb 1b \n\t" |
|
835 |
- : "+r" (ptr) |
|
836 |
- : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w) |
|
837 |
- ); |
|
819 |
+ if (sides & EDGE_TOP) { |
|
820 |
+ for (i = 0; i < h; i += 4) { |
|
821 |
+ ptr = buf - (i + 1) * wrap - w; |
|
822 |
+ __asm__ volatile ( |
|
823 |
+ "1: \n\t" |
|
824 |
+ "movq (%1, %0), %%mm0 \n\t" |
|
825 |
+ "movq %%mm0, (%0) \n\t" |
|
826 |
+ "movq %%mm0, (%0, %2) \n\t" |
|
827 |
+ "movq %%mm0, (%0, %2, 2) \n\t" |
|
828 |
+ "movq %%mm0, (%0, %3) \n\t" |
|
829 |
+ "add $8, %0 \n\t" |
|
830 |
+ "cmp %4, %0 \n\t" |
|
831 |
+ "jb 1b \n\t" |
|
832 |
+ : "+r"(ptr) |
|
833 |
+ : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap), |
|
834 |
+ "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w) |
|
835 |
+ ); |
|
838 | 836 |
} |
839 | 837 |
} |
840 | 838 |
|
841 |
- if (sides&EDGE_BOTTOM) { |
|
842 |
- for(i = 0; i < h; i += 4) { |
|
843 |
- ptr= last_line + (i + 1) * wrap - w; |
|
844 |
- __asm__ volatile( |
|
845 |
- "1: \n\t" |
|
846 |
- "movq (%1, %0), %%mm0 \n\t" |
|
847 |
- "movq %%mm0, (%0) \n\t" |
|
848 |
- "movq %%mm0, (%0, %2) \n\t" |
|
849 |
- "movq %%mm0, (%0, %2, 2) \n\t" |
|
850 |
- "movq %%mm0, (%0, %3) \n\t" |
|
851 |
- "add $8, %0 \n\t" |
|
852 |
- "cmp %4, %0 \n\t" |
|
853 |
- " jb 1b \n\t" |
|
854 |
- : "+r" (ptr) |
|
855 |
- : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w) |
|
856 |
- ); |
|
839 |
+ if (sides & EDGE_BOTTOM) { |
|
840 |
+ for (i = 0; i < h; i += 4) { |
|
841 |
+ ptr = last_line + (i + 1) * wrap - w; |
|
842 |
+ __asm__ volatile ( |
|
843 |
+ "1: \n\t" |
|
844 |
+ "movq (%1, %0), %%mm0 \n\t" |
|
845 |
+ "movq %%mm0, (%0) \n\t" |
|
846 |
+ "movq %%mm0, (%0, %2) \n\t" |
|
847 |
+ "movq %%mm0, (%0, %2, 2) \n\t" |
|
848 |
+ "movq %%mm0, (%0, %3) \n\t" |
|
849 |
+ "add $8, %0 \n\t" |
|
850 |
+ "cmp %4, %0 \n\t" |
|
851 |
+ "jb 1b \n\t" |
|
852 |
+ : "+r"(ptr) |
|
853 |
+ : "r"((x86_reg)last_line - (x86_reg)ptr - w), |
|
854 |
+ "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3), |
|
855 |
+ "r"(ptr + width + 2 * w) |
|
856 |
+ ); |
|
857 | 857 |
} |
858 | 858 |
} |
859 | 859 |
} |
860 | 860 |
|
861 |
-#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
|
862 |
- "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
|
863 |
- "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
|
864 |
- "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
|
865 |
- "movq "#in7", " #m3 " \n\t" /* d */\ |
|
866 |
- "movq "#in0", %%mm5 \n\t" /* D */\ |
|
867 |
- "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ |
|
868 |
- "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ |
|
869 |
- "movq "#in1", %%mm5 \n\t" /* C */\ |
|
870 |
- "movq "#in2", %%mm6 \n\t" /* B */\ |
|
871 |
- "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ |
|
872 |
- "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ |
|
873 |
- "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ |
|
874 |
- "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ |
|
875 |
- "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
|
876 |
- "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
|
877 |
- "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ |
|
878 |
- "psraw $5, %%mm5 \n\t"\ |
|
879 |
- "packuswb %%mm5, %%mm5 \n\t"\ |
|
880 |
- OP(%%mm5, out, %%mm7, d) |
|
881 |
- |
|
882 |
-#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ |
|
883 |
-static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
|
884 |
- uint64_t temp;\ |
|
885 |
-\ |
|
886 |
- __asm__ volatile(\ |
|
887 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
888 |
- "1: \n\t"\ |
|
889 |
- "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ |
|
890 |
- "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ |
|
891 |
- "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ |
|
892 |
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ |
|
893 |
- "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ |
|
894 |
- "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ |
|
895 |
- "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ |
|
896 |
- "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ |
|
897 |
- "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ |
|
898 |
- "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ |
|
899 |
- "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ |
|
900 |
- "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ |
|
901 |
- "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ |
|
902 |
- "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ |
|
903 |
- "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ |
|
904 |
- "paddw %%mm3, %%mm5 \n\t" /* b */\ |
|
905 |
- "paddw %%mm2, %%mm6 \n\t" /* c */\ |
|
906 |
- "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|
907 |
- "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
|
908 |
- "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
|
909 |
- "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
|
910 |
- "paddw %%mm4, %%mm0 \n\t" /* a */\ |
|
911 |
- "paddw %%mm1, %%mm5 \n\t" /* d */\ |
|
912 |
- "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
|
913 |
- "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
|
914 |
- "paddw %6, %%mm6 \n\t"\ |
|
915 |
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|
916 |
- "psraw $5, %%mm0 \n\t"\ |
|
917 |
- "movq %%mm0, %5 \n\t"\ |
|
918 |
- /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
|
919 |
- \ |
|
920 |
- "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ |
|
921 |
- "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ |
|
922 |
- "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ |
|
923 |
- "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ |
|
924 |
- "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ |
|
925 |
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ |
|
926 |
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ |
|
927 |
- "paddw %%mm0, %%mm2 \n\t" /* b */\ |
|
928 |
- "paddw %%mm5, %%mm3 \n\t" /* c */\ |
|
929 |
- "paddw %%mm2, %%mm2 \n\t" /* 2b */\ |
|
930 |
- "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ |
|
931 |
- "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ |
|
932 |
- "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ |
|
933 |
- "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ |
|
934 |
- "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ |
|
935 |
- "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
|
936 |
- "paddw %%mm2, %%mm1 \n\t" /* a */\ |
|
937 |
- "paddw %%mm6, %%mm4 \n\t" /* d */\ |
|
938 |
- "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
|
939 |
- "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
|
940 |
- "paddw %6, %%mm1 \n\t"\ |
|
941 |
- "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
|
942 |
- "psraw $5, %%mm3 \n\t"\ |
|
943 |
- "movq %5, %%mm1 \n\t"\ |
|
944 |
- "packuswb %%mm3, %%mm1 \n\t"\ |
|
945 |
- OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
|
946 |
- /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
|
947 |
- \ |
|
948 |
- "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ |
|
949 |
- "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ |
|
950 |
- "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ |
|
951 |
- "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ |
|
952 |
- "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ |
|
953 |
- "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ |
|
954 |
- "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ |
|
955 |
- "paddw %%mm1, %%mm5 \n\t" /* b */\ |
|
956 |
- "paddw %%mm4, %%mm0 \n\t" /* c */\ |
|
957 |
- "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|
958 |
- "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ |
|
959 |
- "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ |
|
960 |
- "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ |
|
961 |
- "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ |
|
962 |
- "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
|
963 |
- "paddw %%mm3, %%mm2 \n\t" /* d */\ |
|
964 |
- "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ |
|
965 |
- "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ |
|
966 |
- "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ |
|
967 |
- "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ |
|
968 |
- "paddw %%mm2, %%mm6 \n\t" /* a */\ |
|
969 |
- "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ |
|
970 |
- "paddw %6, %%mm0 \n\t"\ |
|
971 |
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|
972 |
- "psraw $5, %%mm0 \n\t"\ |
|
973 |
- /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ |
|
974 |
- \ |
|
975 |
- "paddw %%mm5, %%mm3 \n\t" /* a */\ |
|
976 |
- "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ |
|
977 |
- "paddw %%mm4, %%mm6 \n\t" /* b */\ |
|
978 |
- "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ |
|
979 |
- "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ |
|
980 |
- "paddw %%mm1, %%mm4 \n\t" /* c */\ |
|
981 |
- "paddw %%mm2, %%mm5 \n\t" /* d */\ |
|
982 |
- "paddw %%mm6, %%mm6 \n\t" /* 2b */\ |
|
983 |
- "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ |
|
984 |
- "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ |
|
985 |
- "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ |
|
986 |
- "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
|
987 |
- "paddw %6, %%mm4 \n\t"\ |
|
988 |
- "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
|
989 |
- "psraw $5, %%mm4 \n\t"\ |
|
990 |
- "packuswb %%mm4, %%mm0 \n\t"\ |
|
991 |
- OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ |
|
992 |
- \ |
|
993 |
- "add %3, %0 \n\t"\ |
|
994 |
- "add %4, %1 \n\t"\ |
|
995 |
- "decl %2 \n\t"\ |
|
996 |
- " jnz 1b \n\t"\ |
|
997 |
- : "+a"(src), "+c"(dst), "+D"(h)\ |
|
998 |
- : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
|
999 |
- : "memory"\ |
|
1000 |
- );\ |
|
1001 |
-}\ |
|
1002 |
-\ |
|
1003 |
-static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
|
1004 |
- int i;\ |
|
1005 |
- int16_t temp[16];\ |
|
1006 |
- /* quick HACK, XXX FIXME MUST be optimized */\ |
|
1007 |
- for(i=0; i<h; i++)\ |
|
1008 |
- {\ |
|
1009 |
- temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ |
|
1010 |
- temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ |
|
1011 |
- temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ |
|
1012 |
- temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ |
|
1013 |
- temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ |
|
1014 |
- temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\ |
|
1015 |
- temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\ |
|
1016 |
- temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\ |
|
1017 |
- temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\ |
|
1018 |
- temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\ |
|
1019 |
- temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\ |
|
1020 |
- temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\ |
|
1021 |
- temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\ |
|
1022 |
- temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\ |
|
1023 |
- temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\ |
|
1024 |
- temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\ |
|
1025 |
- __asm__ volatile(\ |
|
1026 |
- "movq (%0), %%mm0 \n\t"\ |
|
1027 |
- "movq 8(%0), %%mm1 \n\t"\ |
|
1028 |
- "paddw %2, %%mm0 \n\t"\ |
|
1029 |
- "paddw %2, %%mm1 \n\t"\ |
|
1030 |
- "psraw $5, %%mm0 \n\t"\ |
|
1031 |
- "psraw $5, %%mm1 \n\t"\ |
|
1032 |
- "packuswb %%mm1, %%mm0 \n\t"\ |
|
1033 |
- OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
|
1034 |
- "movq 16(%0), %%mm0 \n\t"\ |
|
1035 |
- "movq 24(%0), %%mm1 \n\t"\ |
|
1036 |
- "paddw %2, %%mm0 \n\t"\ |
|
1037 |
- "paddw %2, %%mm1 \n\t"\ |
|
1038 |
- "psraw $5, %%mm0 \n\t"\ |
|
1039 |
- "psraw $5, %%mm1 \n\t"\ |
|
1040 |
- "packuswb %%mm1, %%mm0 \n\t"\ |
|
1041 |
- OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\ |
|
1042 |
- :: "r"(temp), "r"(dst), "m"(ROUNDER)\ |
|
1043 |
- : "memory"\ |
|
1044 |
- );\ |
|
1045 |
- dst+=dstStride;\ |
|
1046 |
- src+=srcStride;\ |
|
1047 |
- }\ |
|
1048 |
-}\ |
|
1049 |
-\ |
|
1050 |
-static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
|
1051 |
- __asm__ volatile(\ |
|
1052 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
1053 |
- "1: \n\t"\ |
|
1054 |
- "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ |
|
1055 |
- "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ |
|
1056 |
- "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ |
|
1057 |
- "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ |
|
1058 |
- "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ |
|
1059 |
- "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ |
|
1060 |
- "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ |
|
1061 |
- "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ |
|
1062 |
- "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ |
|
1063 |
- "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ |
|
1064 |
- "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ |
|
1065 |
- "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ |
|
1066 |
- "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ |
|
1067 |
- "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ |
|
1068 |
- "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ |
|
1069 |
- "paddw %%mm3, %%mm5 \n\t" /* b */\ |
|
1070 |
- "paddw %%mm2, %%mm6 \n\t" /* c */\ |
|
1071 |
- "paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
|
1072 |
- "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
|
1073 |
- "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
|
1074 |
- "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
|
1075 |
- "paddw %%mm4, %%mm0 \n\t" /* a */\ |
|
1076 |
- "paddw %%mm1, %%mm5 \n\t" /* d */\ |
|
1077 |
- "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
|
1078 |
- "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
|
1079 |
- "paddw %5, %%mm6 \n\t"\ |
|
1080 |
- "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
|
1081 |
- "psraw $5, %%mm0 \n\t"\ |
|
1082 |
- /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
|
1083 |
- \ |
|
1084 |
- "movd 5(%0), %%mm5 \n\t" /* FGHI */\ |
|
1085 |
- "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\ |
|
1086 |
- "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\ |
|
1087 |
- "paddw %%mm5, %%mm1 \n\t" /* a */\ |
|
1088 |
- "paddw %%mm6, %%mm2 \n\t" /* b */\ |
|
1089 |
- "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\ |
|
1090 |
- "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\ |
|
1091 |
- "paddw %%mm6, %%mm3 \n\t" /* c */\ |
|
1092 |
- "paddw %%mm5, %%mm4 \n\t" /* d */\ |
|
1093 |
- "paddw %%mm2, %%mm2 \n\t" /* 2b */\ |
|
1094 |
- "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ |
|
1095 |
- "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
|
1096 |
- "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
|
1097 |
- "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
|
1098 |
- "paddw %5, %%mm1 \n\t"\ |
|
1099 |
- "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
|
1100 |
- "psraw $5, %%mm3 \n\t"\ |
|
1101 |
- "packuswb %%mm3, %%mm0 \n\t"\ |
|
1102 |
- OP_MMX2(%%mm0, (%1), %%mm4, q)\ |
|
1103 |
- \ |
|
1104 |
- "add %3, %0 \n\t"\ |
|
1105 |
- "add %4, %1 \n\t"\ |
|
1106 |
- "decl %2 \n\t"\ |
|
1107 |
- " jnz 1b \n\t"\ |
|
1108 |
- : "+a"(src), "+c"(dst), "+d"(h)\ |
|
1109 |
- : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\ |
|
1110 |
- : "memory"\ |
|
1111 |
- );\ |
|
1112 |
-}\ |
|
1113 |
-\ |
|
1114 |
-static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
|
1115 |
- int i;\ |
|
1116 |
- int16_t temp[8];\ |
|
1117 |
- /* quick HACK, XXX FIXME MUST be optimized */\ |
|
1118 |
- for(i=0; i<h; i++)\ |
|
1119 |
- {\ |
|
1120 |
- temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\ |
|
1121 |
- temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\ |
|
1122 |
- temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\ |
|
1123 |
- temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\ |
|
1124 |
- temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\ |
|
1125 |
- temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\ |
|
1126 |
- temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\ |
|
1127 |
- temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\ |
|
1128 |
- __asm__ volatile(\ |
|
1129 |
- "movq (%0), %%mm0 \n\t"\ |
|
1130 |
- "movq 8(%0), %%mm1 \n\t"\ |
|
1131 |
- "paddw %2, %%mm0 \n\t"\ |
|
1132 |
- "paddw %2, %%mm1 \n\t"\ |
|
1133 |
- "psraw $5, %%mm0 \n\t"\ |
|
1134 |
- "psraw $5, %%mm1 \n\t"\ |
|
1135 |
- "packuswb %%mm1, %%mm0 \n\t"\ |
|
1136 |
- OP_3DNOW(%%mm0, (%1), %%mm1, q)\ |
|
1137 |
- :: "r"(temp), "r"(dst), "m"(ROUNDER)\ |
|
1138 |
- :"memory"\ |
|
1139 |
- );\ |
|
1140 |
- dst+=dstStride;\ |
|
1141 |
- src+=srcStride;\ |
|
1142 |
- }\ |
|
1143 |
-} |
|
1144 |
- |
|
1145 |
-#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\ |
|
1146 |
-\ |
|
1147 |
-static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
|
1148 |
- uint64_t temp[17*4];\ |
|
1149 |
- uint64_t *temp_ptr= temp;\ |
|
1150 |
- int count= 17;\ |
|
1151 |
-\ |
|
1152 |
- /*FIXME unroll */\ |
|
1153 |
- __asm__ volatile(\ |
|
1154 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
1155 |
- "1: \n\t"\ |
|
1156 |
- "movq (%0), %%mm0 \n\t"\ |
|
1157 |
- "movq (%0), %%mm1 \n\t"\ |
|
1158 |
- "movq 8(%0), %%mm2 \n\t"\ |
|
1159 |
- "movq 8(%0), %%mm3 \n\t"\ |
|
1160 |
- "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
1161 |
- "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
1162 |
- "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
1163 |
- "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
1164 |
- "movq %%mm0, (%1) \n\t"\ |
|
1165 |
- "movq %%mm1, 17*8(%1) \n\t"\ |
|
1166 |
- "movq %%mm2, 2*17*8(%1) \n\t"\ |
|
1167 |
- "movq %%mm3, 3*17*8(%1) \n\t"\ |
|
1168 |
- "add $8, %1 \n\t"\ |
|
1169 |
- "add %3, %0 \n\t"\ |
|
1170 |
- "decl %2 \n\t"\ |
|
1171 |
- " jnz 1b \n\t"\ |
|
1172 |
- : "+r" (src), "+r" (temp_ptr), "+r"(count)\ |
|
1173 |
- : "r" ((x86_reg)srcStride)\ |
|
1174 |
- : "memory"\ |
|
1175 |
- );\ |
|
1176 |
- \ |
|
1177 |
- temp_ptr= temp;\ |
|
1178 |
- count=4;\ |
|
1179 |
- \ |
|
1180 |
-/*FIXME reorder for speed */\ |
|
1181 |
- __asm__ volatile(\ |
|
1182 |
- /*"pxor %%mm7, %%mm7 \n\t"*/\ |
|
1183 |
- "1: \n\t"\ |
|
1184 |
- "movq (%0), %%mm0 \n\t"\ |
|
1185 |
- "movq 8(%0), %%mm1 \n\t"\ |
|
1186 |
- "movq 16(%0), %%mm2 \n\t"\ |
|
1187 |
- "movq 24(%0), %%mm3 \n\t"\ |
|
1188 |
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|
1189 |
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|
1190 |
- "add %4, %1 \n\t"\ |
|
1191 |
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|
1192 |
- \ |
|
1193 |
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|
1194 |
- "add %4, %1 \n\t"\ |
|
1195 |
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|
1196 |
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
|
1197 |
- "add %4, %1 \n\t"\ |
|
1198 |
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
|
1199 |
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
|
1200 |
- "add %4, %1 \n\t"\ |
|
1201 |
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
|
1202 |
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
|
1203 |
- "add %4, %1 \n\t"\ |
|
1204 |
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
|
1205 |
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
|
1206 |
- "add %4, %1 \n\t"\ |
|
1207 |
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
|
1208 |
- \ |
|
1209 |
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
|
1210 |
- "add %4, %1 \n\t" \ |
|
1211 |
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
|
1212 |
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
|
1213 |
- \ |
|
1214 |
- "add $136, %0 \n\t"\ |
|
1215 |
- "add %6, %1 \n\t"\ |
|
1216 |
- "decl %2 \n\t"\ |
|
1217 |
- " jnz 1b \n\t"\ |
|
1218 |
- \ |
|
1219 |
- : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
|
1220 |
- : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\ |
|
1221 |
- :"memory"\ |
|
1222 |
- );\ |
|
1223 |
-}\ |
|
1224 |
-\ |
|
1225 |
-static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
|
1226 |
- uint64_t temp[9*2];\ |
|
1227 |
- uint64_t *temp_ptr= temp;\ |
|
1228 |
- int count= 9;\ |
|
1229 |
-\ |
|
1230 |
- /*FIXME unroll */\ |
|
1231 |
- __asm__ volatile(\ |
|
1232 |
- "pxor %%mm7, %%mm7 \n\t"\ |
|
1233 |
- "1: \n\t"\ |
|
1234 |
- "movq (%0), %%mm0 \n\t"\ |
|
1235 |
- "movq (%0), %%mm1 \n\t"\ |
|
1236 |
- "punpcklbw %%mm7, %%mm0 \n\t"\ |
|
1237 |
- "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
1238 |
- "movq %%mm0, (%1) \n\t"\ |
|
1239 |
- "movq %%mm1, 9*8(%1) \n\t"\ |
|
1240 |
- "add $8, %1 \n\t"\ |
|
1241 |
- "add %3, %0 \n\t"\ |
|
1242 |
- "decl %2 \n\t"\ |
|
1243 |
- " jnz 1b \n\t"\ |
|
1244 |
- : "+r" (src), "+r" (temp_ptr), "+r"(count)\ |
|
1245 |
- : "r" ((x86_reg)srcStride)\ |
|
1246 |
- : "memory"\ |
|
1247 |
- );\ |
|
1248 |
- \ |
|
1249 |
- temp_ptr= temp;\ |
|
1250 |
- count=2;\ |
|
1251 |
- \ |
|
1252 |
-/*FIXME reorder for speed */\ |
|
1253 |
- __asm__ volatile(\ |
|
1254 |
- /*"pxor %%mm7, %%mm7 \n\t"*/\ |
|
1255 |
- "1: \n\t"\ |
|
1256 |
- "movq (%0), %%mm0 \n\t"\ |
|
1257 |
- "movq 8(%0), %%mm1 \n\t"\ |
|
1258 |
- "movq 16(%0), %%mm2 \n\t"\ |
|
1259 |
- "movq 24(%0), %%mm3 \n\t"\ |
|
1260 |
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
|
1261 |
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
|
1262 |
- "add %4, %1 \n\t"\ |
|
1263 |
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
|
1264 |
- \ |
|
1265 |
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
|
1266 |
- "add %4, %1 \n\t"\ |
|
1267 |
- QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
|
1268 |
- \ |
|
1269 |
- QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
|
1270 |
- "add %4, %1 \n\t"\ |
|
1271 |
- QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
|
1272 |
- QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ |
|
1273 |
- \ |
|
1274 |
- "add $72, %0 \n\t"\ |
|
1275 |
- "add %6, %1 \n\t"\ |
|
1276 |
- "decl %2 \n\t"\ |
|
1277 |
- " jnz 1b \n\t"\ |
|
1278 |
- \ |
|
1279 |
- : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
|
1280 |
- : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\ |
|
1281 |
- : "memory"\ |
|
1282 |
- );\ |
|
1283 |
-}\ |
|
1284 |
-\ |
|
1285 |
-static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
|
1286 |
- OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\ |
|
1287 |
-}\ |
|
1288 |
-\ |
|
1289 |
-static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1290 |
- uint64_t temp[8];\ |
|
1291 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1292 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ |
|
1293 |
- OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
|
1294 |
-}\ |
|
1295 |
-\ |
|
1296 |
-static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1297 |
- OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ |
|
1298 |
-}\ |
|
1299 |
-\ |
|
1300 |
-static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1301 |
- uint64_t temp[8];\ |
|
1302 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1303 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ |
|
1304 |
- OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\ |
|
1305 |
-}\ |
|
1306 |
-\ |
|
1307 |
-static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1308 |
- uint64_t temp[8];\ |
|
1309 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1310 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
|
1311 |
- OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\ |
|
1312 |
-}\ |
|
1313 |
-\ |
|
1314 |
-static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1315 |
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
|
1316 |
-}\ |
|
1317 |
-\ |
|
1318 |
-static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1319 |
- uint64_t temp[8];\ |
|
1320 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1321 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ |
|
1322 |
- OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\ |
|
1323 |
-}\ |
|
1324 |
-static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1325 |
- uint64_t half[8 + 9];\ |
|
1326 |
- uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
|
1327 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1328 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1329 |
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
|
1330 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
|
1331 |
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
|
1332 |
-}\ |
|
1333 |
-static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1334 |
- uint64_t half[8 + 9];\ |
|
1335 |
- uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
|
1336 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1337 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1338 |
- put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
|
1339 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
|
1340 |
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
|
1341 |
-}\ |
|
1342 |
-static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1343 |
- uint64_t half[8 + 9];\ |
|
1344 |
- uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
|
1345 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1346 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1347 |
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
|
1348 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
|
1349 |
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
|
1350 |
-}\ |
|
1351 |
-static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1352 |
- uint64_t half[8 + 9];\ |
|
1353 |
- uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
|
1354 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1355 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1356 |
- put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
|
1357 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
|
1358 |
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
|
1359 |
-}\ |
|
1360 |
-static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1361 |
- uint64_t half[8 + 9];\ |
|
1362 |
- uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
|
1363 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1364 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1365 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
|
1366 |
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\ |
|
1367 |
-}\ |
|
1368 |
-static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1369 |
- uint64_t half[8 + 9];\ |
|
1370 |
- uint8_t * const halfH= ((uint8_t*)half) + 64;\ |
|
1371 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1372 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1373 |
- put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
|
1374 |
- OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\ |
|
1375 |
-}\ |
|
1376 |
-static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1377 |
- uint64_t half[8 + 9];\ |
|
1378 |
- uint8_t * const halfH= ((uint8_t*)half);\ |
|
1379 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1380 |
- put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\ |
|
1381 |
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
|
1382 |
-}\ |
|
1383 |
-static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1384 |
- uint64_t half[8 + 9];\ |
|
1385 |
- uint8_t * const halfH= ((uint8_t*)half);\ |
|
1386 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1387 |
- put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\ |
|
1388 |
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
|
1389 |
-}\ |
|
1390 |
-static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1391 |
- uint64_t half[9];\ |
|
1392 |
- uint8_t * const halfH= ((uint8_t*)half);\ |
|
1393 |
- put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ |
|
1394 |
- OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ |
|
1395 |
-}\ |
|
1396 |
-static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ |
|
1397 |
- OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\ |
|
1398 |
-}\ |
|
1399 |
-\ |
|
1400 |
-static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1401 |
- uint64_t temp[32];\ |
|
1402 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1403 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ |
|
1404 |
- OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ |
|
1405 |
-}\ |
|
1406 |
-\ |
|
1407 |
-static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1408 |
- OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ |
|
1409 |
-}\ |
|
1410 |
-\ |
|
1411 |
-static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1412 |
- uint64_t temp[32];\ |
|
1413 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1414 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ |
|
1415 |
- OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\ |
|
1416 |
-}\ |
|
1417 |
-\ |
|
1418 |
-static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1419 |
- uint64_t temp[32];\ |
|
1420 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1421 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
|
1422 |
- OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\ |
|
1423 |
-}\ |
|
1424 |
-\ |
|
1425 |
-static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1426 |
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ |
|
1427 |
-}\ |
|
1428 |
-\ |
|
1429 |
-static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1430 |
- uint64_t temp[32];\ |
|
1431 |
- uint8_t * const half= (uint8_t*)temp;\ |
|
1432 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ |
|
1433 |
- OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\ |
|
1434 |
-}\ |
|
1435 |
-static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1436 |
- uint64_t half[16*2 + 17*2];\ |
|
1437 |
- uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
|
1438 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1439 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1440 |
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
|
1441 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
|
1442 |
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
|
1443 |
-}\ |
|
1444 |
-static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1445 |
- uint64_t half[16*2 + 17*2];\ |
|
1446 |
- uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
|
1447 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1448 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1449 |
- put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
|
1450 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
|
1451 |
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
|
1452 |
-}\ |
|
1453 |
-static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1454 |
- uint64_t half[16*2 + 17*2];\ |
|
1455 |
- uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
|
1456 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1457 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1458 |
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
|
1459 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
|
1460 |
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
|
1461 |
-}\ |
|
1462 |
-static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1463 |
- uint64_t half[16*2 + 17*2];\ |
|
1464 |
- uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
|
1465 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1466 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1467 |
- put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
|
1468 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
|
1469 |
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
|
1470 |
-}\ |
|
1471 |
-static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1472 |
- uint64_t half[16*2 + 17*2];\ |
|
1473 |
- uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
|
1474 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1475 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1476 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
|
1477 |
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\ |
|
1478 |
-}\ |
|
1479 |
-static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1480 |
- uint64_t half[16*2 + 17*2];\ |
|
1481 |
- uint8_t * const halfH= ((uint8_t*)half) + 256;\ |
|
1482 |
- uint8_t * const halfHV= ((uint8_t*)half);\ |
|
1483 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1484 |
- put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ |
|
1485 |
- OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\ |
|
1486 |
-}\ |
|
1487 |
-static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1488 |
- uint64_t half[17*2];\ |
|
1489 |
- uint8_t * const halfH= ((uint8_t*)half);\ |
|
1490 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1491 |
- put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\ |
|
1492 |
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
|
1493 |
-}\ |
|
1494 |
-static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1495 |
- uint64_t half[17*2];\ |
|
1496 |
- uint8_t * const halfH= ((uint8_t*)half);\ |
|
1497 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1498 |
- put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\ |
|
1499 |
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
|
1500 |
-}\ |
|
1501 |
-static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1502 |
- uint64_t half[17*2];\ |
|
1503 |
- uint8_t * const halfH= ((uint8_t*)half);\ |
|
1504 |
- put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ |
|
1505 |
- OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ |
|
1506 |
-} |
|
1507 |
- |
|
1508 |
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" |
|
1509 |
-#define AVG_3DNOW_OP(a,b,temp, size) \ |
|
1510 |
-"mov" #size " " #b ", " #temp " \n\t"\ |
|
1511 |
-"pavgusb " #temp ", " #a " \n\t"\ |
|
1512 |
-"mov" #size " " #a ", " #b " \n\t" |
|
1513 |
-#define AVG_MMX2_OP(a,b,temp, size) \ |
|
1514 |
-"mov" #size " " #b ", " #temp " \n\t"\ |
|
1515 |
-"pavgb " #temp ", " #a " \n\t"\ |
|
1516 |
-"mov" #size " " #a ", " #b " \n\t" |
|
1517 |
- |
|
1518 |
-QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP) |
|
1519 |
-QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP) |
|
1520 |
-QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) |
|
1521 |
-QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow) |
|
1522 |
-QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow) |
|
1523 |
-QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) |
|
1524 |
-QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) |
|
1525 |
-QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) |
|
1526 |
-QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
|
861 |
+#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \ |
|
862 |
+ in0, in1, in2, in7, out, OP) \ |
|
863 |
+ "paddw "#m4", "#m3" \n\t" /* x1 */ \ |
|
864 |
+ "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \ |
|
865 |
+ "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \ |
|
866 |
+ "movq "#in7", "#m3" \n\t" /* d */ \ |
|
867 |
+ "movq "#in0", %%mm5 \n\t" /* D */ \ |
|
868 |
+ "paddw "#m3", %%mm5 \n\t" /* x4 */ \ |
|
869 |
+ "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \ |
|
870 |
+ "movq "#in1", %%mm5 \n\t" /* C */ \ |
|
871 |
+ "movq "#in2", %%mm6 \n\t" /* B */ \ |
|
872 |
+ "paddw "#m6", %%mm5 \n\t" /* x3 */ \ |
|
873 |
+ "paddw "#m5", %%mm6 \n\t" /* x2 */ \ |
|
874 |
+ "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \ |
|
875 |
+ "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \ |
|
876 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \ |
|
877 |
+ "paddw "#rnd", %%mm4 \n\t" /* x2 */ \ |
|
878 |
+ "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \ |
|
879 |
+ "psraw $5, %%mm5 \n\t" \ |
|
880 |
+ "packuswb %%mm5, %%mm5 \n\t" \ |
|
881 |
+ OP(%%mm5, out, %%mm7, d) |
|
882 |
+ |
|
883 |
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \ |
|
884 |
+static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \ |
|
885 |
+ uint8_t *src, \ |
|
886 |
+ int dstStride, \ |
|
887 |
+ int srcStride, \ |
|
888 |
+ int h) \ |
|
889 |
+{ \ |
|
890 |
+ uint64_t temp; \ |
|
891 |
+ \ |
|
892 |
+ __asm__ volatile ( \ |
|
893 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
894 |
+ "1: \n\t" \ |
|
895 |
+ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \ |
|
896 |
+ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \ |
|
897 |
+ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \ |
|
898 |
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \ |
|
899 |
+ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \ |
|
900 |
+ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \ |
|
901 |
+ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \ |
|
902 |
+ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \ |
|
903 |
+ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \ |
|
904 |
+ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \ |
|
905 |
+ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \ |
|
906 |
+ "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \ |
|
907 |
+ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \ |
|
908 |
+ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \ |
|
909 |
+ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \ |
|
910 |
+ "paddw %%mm3, %%mm5 \n\t" /* b */ \ |
|
911 |
+ "paddw %%mm2, %%mm6 \n\t" /* c */ \ |
|
912 |
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ |
|
913 |
+ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \ |
|
914 |
+ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \ |
|
915 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \ |
|
916 |
+ "paddw %%mm4, %%mm0 \n\t" /* a */ \ |
|
917 |
+ "paddw %%mm1, %%mm5 \n\t" /* d */ \ |
|
918 |
+ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \ |
|
919 |
+ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \ |
|
920 |
+ "paddw %6, %%mm6 \n\t" \ |
|
921 |
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ |
|
922 |
+ "psraw $5, %%mm0 \n\t" \ |
|
923 |
+ "movq %%mm0, %5 \n\t" \ |
|
924 |
+ /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \ |
|
925 |
+ \ |
|
926 |
+ "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \ |
|
927 |
+ "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \ |
|
928 |
+ "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \ |
|
929 |
+ "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \ |
|
930 |
+ "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \ |
|
931 |
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \ |
|
932 |
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \ |
|
933 |
+ "paddw %%mm0, %%mm2 \n\t" /* b */ \ |
|
934 |
+ "paddw %%mm5, %%mm3 \n\t" /* c */ \ |
|
935 |
+ "paddw %%mm2, %%mm2 \n\t" /* 2b */ \ |
|
936 |
+ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \ |
|
937 |
+ "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \ |
|
938 |
+ "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \ |
|
939 |
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \ |
|
940 |
+ "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \ |
|
941 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \ |
|
942 |
+ "paddw %%mm2, %%mm1 \n\t" /* a */ \ |
|
943 |
+ "paddw %%mm6, %%mm4 \n\t" /* d */ \ |
|
944 |
+ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \ |
|
945 |
+ "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \ |
|
946 |
+ "paddw %6, %%mm1 \n\t" \ |
|
947 |
+ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \ |
|
948 |
+ "psraw $5, %%mm3 \n\t" \ |
|
949 |
+ "movq %5, %%mm1 \n\t" \ |
|
950 |
+ "packuswb %%mm3, %%mm1 \n\t" \ |
|
951 |
+ OP_MMX2(%%mm1, (%1), %%mm4, q) \ |
|
952 |
+ /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \ |
|
953 |
+ \ |
|
954 |
+ "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \ |
|
955 |
+ "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \ |
|
956 |
+ "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \ |
|
957 |
+ "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \ |
|
958 |
+ "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \ |
|
959 |
+ "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \ |
|
960 |
+ "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \ |
|
961 |
+ "paddw %%mm1, %%mm5 \n\t" /* b */ \ |
|
962 |
+ "paddw %%mm4, %%mm0 \n\t" /* c */ \ |
|
963 |
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ |
|
964 |
+ "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \ |
|
965 |
+ "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \ |
|
966 |
+ "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \ |
|
967 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \ |
|
968 |
+ "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \ |
|
969 |
+ "paddw %%mm3, %%mm2 \n\t" /* d */ \ |
|
970 |
+ "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \ |
|
971 |
+ "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \ |
|
972 |
+ "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \ |
|
973 |
+ "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \ |
|
974 |
+ "paddw %%mm2, %%mm6 \n\t" /* a */ \ |
|
975 |
+ "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \ |
|
976 |
+ "paddw %6, %%mm0 \n\t" \ |
|
977 |
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ |
|
978 |
+ "psraw $5, %%mm0 \n\t" \ |
|
979 |
+ /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \ |
|
980 |
+ /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \ |
|
981 |
+ \ |
|
982 |
+ "paddw %%mm5, %%mm3 \n\t" /* a */ \ |
|
983 |
+ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \ |
|
984 |
+ "paddw %%mm4, %%mm6 \n\t" /* b */ \ |
|
985 |
+ "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \ |
|
986 |
+ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \ |
|
987 |
+ "paddw %%mm1, %%mm4 \n\t" /* c */ \ |
|
988 |
+ "paddw %%mm2, %%mm5 \n\t" /* d */ \ |
|
989 |
+ "paddw %%mm6, %%mm6 \n\t" /* 2b */ \ |
|
990 |
+ "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \ |
|
991 |
+ "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \ |
|
992 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \ |
|
993 |
+ "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \ |
|
994 |
+ "paddw %6, %%mm4 \n\t" \ |
|
995 |
+ "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \ |
|
996 |
+ "psraw $5, %%mm4 \n\t" \ |
|
997 |
+ "packuswb %%mm4, %%mm0 \n\t" \ |
|
998 |
+ OP_MMX2(%%mm0, 8(%1), %%mm4, q) \ |
|
999 |
+ \ |
|
1000 |
+ "add %3, %0 \n\t" \ |
|
1001 |
+ "add %4, %1 \n\t" \ |
|
1002 |
+ "decl %2 \n\t" \ |
|
1003 |
+ "jnz 1b \n\t" \ |
|
1004 |
+ : "+a"(src), "+c"(dst), "+D"(h) \ |
|
1005 |
+ : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \ |
|
1006 |
+ /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \ |
|
1007 |
+ : "memory" \ |
|
1008 |
+ ); \ |
|
1009 |
+} \ |
|
1010 |
+ \ |
|
1011 |
+static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \ |
|
1012 |
+ uint8_t *src, \ |
|
1013 |
+ int dstStride, \ |
|
1014 |
+ int srcStride, \ |
|
1015 |
+ int h) \ |
|
1016 |
+{ \ |
|
1017 |
+ int i; \ |
|
1018 |
+ int16_t temp[16]; \ |
|
1019 |
+ /* quick HACK, XXX FIXME MUST be optimized */ \ |
|
1020 |
+ for (i = 0; i < h; i++) { \ |
|
1021 |
+ temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \ |
|
1022 |
+ (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \ |
|
1023 |
+ temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \ |
|
1024 |
+ (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \ |
|
1025 |
+ temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \ |
|
1026 |
+ (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \ |
|
1027 |
+ temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \ |
|
1028 |
+ (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \ |
|
1029 |
+ temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \ |
|
1030 |
+ (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \ |
|
1031 |
+ temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \ |
|
1032 |
+ (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \ |
|
1033 |
+ temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \ |
|
1034 |
+ (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \ |
|
1035 |
+ temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \ |
|
1036 |
+ (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \ |
|
1037 |
+ temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \ |
|
1038 |
+ (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \ |
|
1039 |
+ temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \ |
|
1040 |
+ (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \ |
|
1041 |
+ temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \ |
|
1042 |
+ (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \ |
|
1043 |
+ temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \ |
|
1044 |
+ (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \ |
|
1045 |
+ temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \ |
|
1046 |
+ (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \ |
|
1047 |
+ temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \ |
|
1048 |
+ (src[11] + src[16]) * 3 - (src[10] + src[16]); \ |
|
1049 |
+ temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \ |
|
1050 |
+ (src[12] + src[16]) * 3 - (src[11] + src[15]); \ |
|
1051 |
+ temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \ |
|
1052 |
+ (src[13] + src[15]) * 3 - (src[12] + src[14]); \ |
|
1053 |
+ __asm__ volatile ( \ |
|
1054 |
+ "movq (%0), %%mm0 \n\t" \ |
|
1055 |
+ "movq 8(%0), %%mm1 \n\t" \ |
|
1056 |
+ "paddw %2, %%mm0 \n\t" \ |
|
1057 |
+ "paddw %2, %%mm1 \n\t" \ |
|
1058 |
+ "psraw $5, %%mm0 \n\t" \ |
|
1059 |
+ "psraw $5, %%mm1 \n\t" \ |
|
1060 |
+ "packuswb %%mm1, %%mm0 \n\t" \ |
|
1061 |
+ OP_3DNOW(%%mm0, (%1), %%mm1, q) \ |
|
1062 |
+ "movq 16(%0), %%mm0 \n\t" \ |
|
1063 |
+ "movq 24(%0), %%mm1 \n\t" \ |
|
1064 |
+ "paddw %2, %%mm0 \n\t" \ |
|
1065 |
+ "paddw %2, %%mm1 \n\t" \ |
|
1066 |
+ "psraw $5, %%mm0 \n\t" \ |
|
1067 |
+ "psraw $5, %%mm1 \n\t" \ |
|
1068 |
+ "packuswb %%mm1, %%mm0 \n\t" \ |
|
1069 |
+ OP_3DNOW(%%mm0, 8(%1), %%mm1, q) \ |
|
1070 |
+ :: "r"(temp), "r"(dst), "m"(ROUNDER) \ |
|
1071 |
+ : "memory" \ |
|
1072 |
+ ); \ |
|
1073 |
+ dst += dstStride; \ |
|
1074 |
+ src += srcStride; \ |
|
1075 |
+ } \ |
|
1076 |
+} \ |
|
1077 |
+ \ |
|
1078 |
+static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \ |
|
1079 |
+ uint8_t *src, \ |
|
1080 |
+ int dstStride, \ |
|
1081 |
+ int srcStride, \ |
|
1082 |
+ int h) \ |
|
1083 |
+{ \ |
|
1084 |
+ __asm__ volatile ( \ |
|
1085 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
1086 |
+ "1: \n\t" \ |
|
1087 |
+ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \ |
|
1088 |
+ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \ |
|
1089 |
+ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \ |
|
1090 |
+ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \ |
|
1091 |
+ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \ |
|
1092 |
+ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \ |
|
1093 |
+ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \ |
|
1094 |
+ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \ |
|
1095 |
+ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \ |
|
1096 |
+ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \ |
|
1097 |
+ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \ |
|
1098 |
+ "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \ |
|
1099 |
+ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \ |
|
1100 |
+ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \ |
|
1101 |
+ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \ |
|
1102 |
+ "paddw %%mm3, %%mm5 \n\t" /* b */ \ |
|
1103 |
+ "paddw %%mm2, %%mm6 \n\t" /* c */ \ |
|
1104 |
+ "paddw %%mm5, %%mm5 \n\t" /* 2b */ \ |
|
1105 |
+ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \ |
|
1106 |
+ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \ |
|
1107 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \ |
|
1108 |
+ "paddw %%mm4, %%mm0 \n\t" /* a */ \ |
|
1109 |
+ "paddw %%mm1, %%mm5 \n\t" /* d */ \ |
|
1110 |
+ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \ |
|
1111 |
+ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \ |
|
1112 |
+ "paddw %5, %%mm6 \n\t" \ |
|
1113 |
+ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \ |
|
1114 |
+ "psraw $5, %%mm0 \n\t" \ |
|
1115 |
+ /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \ |
|
1116 |
+ \ |
|
1117 |
+ "movd 5(%0), %%mm5 \n\t" /* FGHI */ \ |
|
1118 |
+ "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \ |
|
1119 |
+ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \ |
|
1120 |
+ "paddw %%mm5, %%mm1 \n\t" /* a */ \ |
|
1121 |
+ "paddw %%mm6, %%mm2 \n\t" /* b */ \ |
|
1122 |
+ "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \ |
|
1123 |
+ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \ |
|
1124 |
+ "paddw %%mm6, %%mm3 \n\t" /* c */ \ |
|
1125 |
+ "paddw %%mm5, %%mm4 \n\t" /* d */ \ |
|
1126 |
+ "paddw %%mm2, %%mm2 \n\t" /* 2b */ \ |
|
1127 |
+ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \ |
|
1128 |
+ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \ |
|
1129 |
+ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \ |
|
1130 |
+ "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \ |
|
1131 |
+ "paddw %5, %%mm1 \n\t" \ |
|
1132 |
+ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \ |
|
1133 |
+ "psraw $5, %%mm3 \n\t" \ |
|
1134 |
+ "packuswb %%mm3, %%mm0 \n\t" \ |
|
1135 |
+ OP_MMX2(%%mm0, (%1), %%mm4, q) \ |
|
1136 |
+ \ |
|
1137 |
+ "add %3, %0 \n\t" \ |
|
1138 |
+ "add %4, %1 \n\t" \ |
|
1139 |
+ "decl %2 \n\t" \ |
|
1140 |
+ "jnz 1b \n\t" \ |
|
1141 |
+ : "+a"(src), "+c"(dst), "+d"(h) \ |
|
1142 |
+ : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \ |
|
1143 |
+ /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \ |
|
1144 |
+ : "memory" \ |
|
1145 |
+ ); \ |
|
1146 |
+} \ |
|
1147 |
+ \ |
|
1148 |
+static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, \ |
|
1149 |
+ uint8_t *src, \ |
|
1150 |
+ int dstStride, \ |
|
1151 |
+ int srcStride, \ |
|
1152 |
+ int h) \ |
|
1153 |
+{ \ |
|
1154 |
+ int i; \ |
|
1155 |
+ int16_t temp[8]; \ |
|
1156 |
+ /* quick HACK, XXX FIXME MUST be optimized */ \ |
|
1157 |
+ for (i = 0; i < h; i++) { \ |
|
1158 |
+ temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + \ |
|
1159 |
+ (src[1] + src[3]) * 3 - (src[2] + src[4]); \ |
|
1160 |
+ temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \ |
|
1161 |
+ (src[0] + src[4]) * 3 - (src[1] + src[5]); \ |
|
1162 |
+ temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \ |
|
1163 |
+ (src[0] + src[5]) * 3 - (src[0] + src[6]); \ |
|
1164 |
+ temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \ |
|
1165 |
+ (src[1] + src[6]) * 3 - (src[0] + src[7]); \ |
|
1166 |
+ temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \ |
|
1167 |
+ (src[2] + src[7]) * 3 - (src[1] + src[8]); \ |
|
1168 |
+ temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + \ |
|
1169 |
+ (src[3] + src[8]) * 3 - (src[2] + src[8]); \ |
|
1170 |
+ temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \ |
|
1171 |
+ (src[4] + src[8]) * 3 - (src[3] + src[7]); \ |
|
1172 |
+ temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \ |
|
1173 |
+ (src[5] + src[7]) * 3 - (src[4] + src[6]); \ |
|
1174 |
+ __asm__ volatile ( \ |
|
1175 |
+ "movq (%0), %%mm0 \n\t" \ |
|
1176 |
+ "movq 8(%0), %%mm1 \n\t" \ |
|
1177 |
+ "paddw %2, %%mm0 \n\t" \ |
|
1178 |
+ "paddw %2, %%mm1 \n\t" \ |
|
1179 |
+ "psraw $5, %%mm0 \n\t" \ |
|
1180 |
+ "psraw $5, %%mm1 \n\t" \ |
|
1181 |
+ "packuswb %%mm1, %%mm0 \n\t" \ |
|
1182 |
+ OP_3DNOW(%%mm0, (%1), %%mm1, q) \ |
|
1183 |
+ :: "r"(temp), "r"(dst), "m"(ROUNDER) \ |
|
1184 |
+ : "memory" \ |
|
1185 |
+ ); \ |
|
1186 |
+ dst += dstStride; \ |
|
1187 |
+ src += srcStride; \ |
|
1188 |
+ } \ |
|
1189 |
+} |
|
1190 |
+ |
|
1191 |
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \ |
|
1192 |
+static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \ |
|
1193 |
+ uint8_t *src, \ |
|
1194 |
+ int dstStride, \ |
|
1195 |
+ int srcStride) \ |
|
1196 |
+{ \ |
|
1197 |
+ uint64_t temp[17 * 4]; \ |
|
1198 |
+ uint64_t *temp_ptr = temp; \ |
|
1199 |
+ int count = 17; \ |
|
1200 |
+ \ |
|
1201 |
+ /* FIXME unroll */ \ |
|
1202 |
+ __asm__ volatile ( \ |
|
1203 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
1204 |
+ "1: \n\t" \ |
|
1205 |
+ "movq (%0), %%mm0 \n\t" \ |
|
1206 |
+ "movq (%0), %%mm1 \n\t" \ |
|
1207 |
+ "movq 8(%0), %%mm2 \n\t" \ |
|
1208 |
+ "movq 8(%0), %%mm3 \n\t" \ |
|
1209 |
+ "punpcklbw %%mm7, %%mm0 \n\t" \ |
|
1210 |
+ "punpckhbw %%mm7, %%mm1 \n\t" \ |
|
1211 |
+ "punpcklbw %%mm7, %%mm2 \n\t" \ |
|
1212 |
+ "punpckhbw %%mm7, %%mm3 \n\t" \ |
|
1213 |
+ "movq %%mm0, (%1) \n\t" \ |
|
1214 |
+ "movq %%mm1, 17 * 8(%1) \n\t" \ |
|
1215 |
+ "movq %%mm2, 2 * 17 * 8(%1) \n\t" \ |
|
1216 |
+ "movq %%mm3, 3 * 17 * 8(%1) \n\t" \ |
|
1217 |
+ "add $8, %1 \n\t" \ |
|
1218 |
+ "add %3, %0 \n\t" \ |
|
1219 |
+ "decl %2 \n\t" \ |
|
1220 |
+ "jnz 1b \n\t" \ |
|
1221 |
+ : "+r"(src), "+r"(temp_ptr), "+r"(count) \ |
|
1222 |
+ : "r"((x86_reg)srcStride) \ |
|
1223 |
+ : "memory" \ |
|
1224 |
+ ); \ |
|
1225 |
+ \ |
|
1226 |
+ temp_ptr = temp; \ |
|
1227 |
+ count = 4; \ |
|
1228 |
+ \ |
|
1229 |
+ /* FIXME reorder for speed */ \ |
|
1230 |
+ __asm__ volatile ( \ |
|
1231 |
+ /* "pxor %%mm7, %%mm7 \n\t" */ \ |
|
1232 |
+ "1: \n\t" \ |
|
1233 |
+ "movq (%0), %%mm0 \n\t" \ |
|
1234 |
+ "movq 8(%0), %%mm1 \n\t" \ |
|
1235 |
+ "movq 16(%0), %%mm2 \n\t" \ |
|
1236 |
+ "movq 24(%0), %%mm3 \n\t" \ |
|
1237 |
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \ |
|
1238 |
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \ |
|
1239 |
+ "add %4, %1 \n\t" \ |
|
1240 |
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \ |
|
1241 |
+ \ |
|
1242 |
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \ |
|
1243 |
+ "add %4, %1 \n\t" \ |
|
1244 |
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \ |
|
1245 |
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \ |
|
1246 |
+ "add %4, %1 \n\t" \ |
|
1247 |
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \ |
|
1248 |
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \ |
|
1249 |
+ "add %4, %1 \n\t" \ |
|
1250 |
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \ |
|
1251 |
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \ |
|
1252 |
+ "add %4, %1 \n\t" \ |
|
1253 |
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \ |
|
1254 |
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \ |
|
1255 |
+ "add %4, %1 \n\t" \ |
|
1256 |
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \ |
|
1257 |
+ \ |
|
1258 |
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \ |
|
1259 |
+ "add %4, %1 \n\t" \ |
|
1260 |
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \ |
|
1261 |
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \ |
|
1262 |
+ \ |
|
1263 |
+ "add $136, %0 \n\t" \ |
|
1264 |
+ "add %6, %1 \n\t" \ |
|
1265 |
+ "decl %2 \n\t" \ |
|
1266 |
+ "jnz 1b \n\t" \ |
|
1267 |
+ \ |
|
1268 |
+ : "+r"(temp_ptr), "+r"(dst), "+g"(count) \ |
|
1269 |
+ : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \ |
|
1270 |
+ /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \ |
|
1271 |
+ "g"(4 - 14 * (x86_reg)dstStride) \ |
|
1272 |
+ : "memory" \ |
|
1273 |
+ ); \ |
|
1274 |
+} \ |
|
1275 |
+ \ |
|
1276 |
+static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \ |
|
1277 |
+ uint8_t *src, \ |
|
1278 |
+ int dstStride, \ |
|
1279 |
+ int srcStride) \ |
|
1280 |
+{ \ |
|
1281 |
+ uint64_t temp[9 * 2]; \ |
|
1282 |
+ uint64_t *temp_ptr = temp; \ |
|
1283 |
+ int count = 9; \ |
|
1284 |
+ \ |
|
1285 |
+ /* FIXME unroll */ \ |
|
1286 |
+ __asm__ volatile ( \ |
|
1287 |
+ "pxor %%mm7, %%mm7 \n\t" \ |
|
1288 |
+ "1: \n\t" \ |
|
1289 |
+ "movq (%0), %%mm0 \n\t" \ |
|
1290 |
+ "movq (%0), %%mm1 \n\t" \ |
|
1291 |
+ "punpcklbw %%mm7, %%mm0 \n\t" \ |
|
1292 |
+ "punpckhbw %%mm7, %%mm1 \n\t" \ |
|
1293 |
+ "movq %%mm0, (%1) \n\t" \ |
|
1294 |
+ "movq %%mm1, 9*8(%1) \n\t" \ |
|
1295 |
+ "add $8, %1 \n\t" \ |
|
1296 |
+ "add %3, %0 \n\t" \ |
|
1297 |
+ "decl %2 \n\t" \ |
|
1298 |
+ "jnz 1b \n\t" \ |
|
1299 |
+ : "+r"(src), "+r"(temp_ptr), "+r"(count) \ |
|
1300 |
+ : "r"((x86_reg)srcStride) \ |
|
1301 |
+ : "memory" \ |
|
1302 |
+ ); \ |
|
1303 |
+ \ |
|
1304 |
+ temp_ptr = temp; \ |
|
1305 |
+ count = 2; \ |
|
1306 |
+ \ |
|
1307 |
+ /* FIXME reorder for speed */ \ |
|
1308 |
+ __asm__ volatile ( \ |
|
1309 |
+ /* "pxor %%mm7, %%mm7 \n\t" */ \ |
|
1310 |
+ "1: \n\t" \ |
|
1311 |
+ "movq (%0), %%mm0 \n\t" \ |
|
1312 |
+ "movq 8(%0), %%mm1 \n\t" \ |
|
1313 |
+ "movq 16(%0), %%mm2 \n\t" \ |
|
1314 |
+ "movq 24(%0), %%mm3 \n\t" \ |
|
1315 |
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \ |
|
1316 |
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \ |
|
1317 |
+ "add %4, %1 \n\t" \ |
|
1318 |
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \ |
|
1319 |
+ \ |
|
1320 |
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \ |
|
1321 |
+ "add %4, %1 \n\t" \ |
|
1322 |
+ QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \ |
|
1323 |
+ \ |
|
1324 |
+ QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \ |
|
1325 |
+ "add %4, %1 \n\t" \ |
|
1326 |
+ QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \ |
|
1327 |
+ QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \ |
|
1328 |
+ \ |
|
1329 |
+ "add $72, %0 \n\t" \ |
|
1330 |
+ "add %6, %1 \n\t" \ |
|
1331 |
+ "decl %2 \n\t" \ |
|
1332 |
+ "jnz 1b \n\t" \ |
|
1333 |
+ \ |
|
1334 |
+ : "+r"(temp_ptr), "+r"(dst), "+g"(count) \ |
|
1335 |
+ : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \ |
|
1336 |
+ /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \ |
|
1337 |
+ "g"(4 - 6 * (x86_reg)dstStride) \ |
|
1338 |
+ : "memory" \ |
|
1339 |
+ ); \ |
|
1340 |
+} \ |
|
1341 |
+ \ |
|
1342 |
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ |
|
1343 |
+ int stride) \ |
|
1344 |
+{ \ |
|
1345 |
+ OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \ |
|
1346 |
+} \ |
|
1347 |
+ \ |
|
1348 |
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1349 |
+ int stride) \ |
|
1350 |
+{ \ |
|
1351 |
+ uint64_t temp[8]; \ |
|
1352 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1353 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ |
|
1354 |
+ stride, 8); \ |
|
1355 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \ |
|
1356 |
+} \ |
|
1357 |
+ \ |
|
1358 |
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1359 |
+ int stride) \ |
|
1360 |
+{ \ |
|
1361 |
+ OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \ |
|
1362 |
+ stride, 8); \ |
|
1363 |
+} \ |
|
1364 |
+ \ |
|
1365 |
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1366 |
+ int stride) \ |
|
1367 |
+{ \ |
|
1368 |
+ uint64_t temp[8]; \ |
|
1369 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1370 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ |
|
1371 |
+ stride, 8); \ |
|
1372 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ |
|
1373 |
+ stride, 8); \ |
|
1374 |
+} \ |
|
1375 |
+ \ |
|
1376 |
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1377 |
+ int stride) \ |
|
1378 |
+{ \ |
|
1379 |
+ uint64_t temp[8]; \ |
|
1380 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1381 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \ |
|
1382 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \ |
|
1383 |
+} \ |
|
1384 |
+ \ |
|
1385 |
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1386 |
+ int stride) \ |
|
1387 |
+{ \ |
|
1388 |
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \ |
|
1389 |
+} \ |
|
1390 |
+ \ |
|
1391 |
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1392 |
+ int stride) \ |
|
1393 |
+{ \ |
|
1394 |
+ uint64_t temp[8]; \ |
|
1395 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1396 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \ |
|
1397 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \ |
|
1398 |
+ stride, 8); \ |
|
1399 |
+} \ |
|
1400 |
+ \ |
|
1401 |
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1402 |
+ int stride) \ |
|
1403 |
+{ \ |
|
1404 |
+ uint64_t half[8 + 9]; \ |
|
1405 |
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
|
1406 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1407 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1408 |
+ stride, 9); \ |
|
1409 |
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ |
|
1410 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ |
|
1411 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \ |
|
1412 |
+} \ |
|
1413 |
+ \ |
|
1414 |
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1415 |
+ int stride) \ |
|
1416 |
+{ \ |
|
1417 |
+ uint64_t half[8 + 9]; \ |
|
1418 |
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
|
1419 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1420 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1421 |
+ stride, 9); \ |
|
1422 |
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ |
|
1423 |
+ stride, 9); \ |
|
1424 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ |
|
1425 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \ |
|
1426 |
+} \ |
|
1427 |
+ \ |
|
1428 |
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1429 |
+ int stride) \ |
|
1430 |
+{ \ |
|
1431 |
+ uint64_t half[8 + 9]; \ |
|
1432 |
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
|
1433 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1434 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1435 |
+ stride, 9); \ |
|
1436 |
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ |
|
1437 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ |
|
1438 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \ |
|
1439 |
+} \ |
|
1440 |
+ \ |
|
1441 |
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1442 |
+ int stride) \ |
|
1443 |
+{ \ |
|
1444 |
+ uint64_t half[8 + 9]; \ |
|
1445 |
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
|
1446 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1447 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1448 |
+ stride, 9); \ |
|
1449 |
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ |
|
1450 |
+ stride, 9); \ |
|
1451 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ |
|
1452 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \ |
|
1453 |
+} \ |
|
1454 |
+ \ |
|
1455 |
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1456 |
+ int stride) \ |
|
1457 |
+{ \ |
|
1458 |
+ uint64_t half[8 + 9]; \ |
|
1459 |
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
|
1460 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1461 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1462 |
+ stride, 9); \ |
|
1463 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ |
|
1464 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \ |
|
1465 |
+} \ |
|
1466 |
+ \ |
|
1467 |
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1468 |
+ int stride) \ |
|
1469 |
+{ \ |
|
1470 |
+ uint64_t half[8 + 9]; \ |
|
1471 |
+ uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
|
1472 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1473 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1474 |
+ stride, 9); \ |
|
1475 |
+ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \ |
|
1476 |
+ OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \ |
|
1477 |
+} \ |
|
1478 |
+ \ |
|
1479 |
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1480 |
+ int stride) \ |
|
1481 |
+{ \ |
|
1482 |
+ uint64_t half[8 + 9]; \ |
|
1483 |
+ uint8_t * const halfH = ((uint8_t*)half); \ |
|
1484 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1485 |
+ stride, 9); \ |
|
1486 |
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \ |
|
1487 |
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ |
|
1488 |
+} \ |
|
1489 |
+ \ |
|
1490 |
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1491 |
+ int stride) \ |
|
1492 |
+{ \ |
|
1493 |
+ uint64_t half[8 + 9]; \ |
|
1494 |
+ uint8_t * const halfH = ((uint8_t*)half); \ |
|
1495 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1496 |
+ stride, 9); \ |
|
1497 |
+ put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ |
|
1498 |
+ stride, 9); \ |
|
1499 |
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ |
|
1500 |
+} \ |
|
1501 |
+ \ |
|
1502 |
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1503 |
+ int stride) \ |
|
1504 |
+{ \ |
|
1505 |
+ uint64_t half[9]; \ |
|
1506 |
+ uint8_t * const halfH = ((uint8_t*)half); \ |
|
1507 |
+ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
|
1508 |
+ stride, 9); \ |
|
1509 |
+ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \ |
|
1510 |
+} \ |
|
1511 |
+ \ |
|
1512 |
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ |
|
1513 |
+ int stride) \ |
|
1514 |
+{ \ |
|
1515 |
+ OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \ |
|
1516 |
+} \ |
|
1517 |
+ \ |
|
1518 |
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1519 |
+ int stride) \ |
|
1520 |
+{ \ |
|
1521 |
+ uint64_t temp[32]; \ |
|
1522 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1523 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ |
|
1524 |
+ stride, 16); \ |
|
1525 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \ |
|
1526 |
+} \ |
|
1527 |
+ \ |
|
1528 |
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1529 |
+ int stride) \ |
|
1530 |
+{ \ |
|
1531 |
+ OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \ |
|
1532 |
+ stride, stride, 16); \ |
|
1533 |
+} \ |
|
1534 |
+ \ |
|
1535 |
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1536 |
+ int stride) \ |
|
1537 |
+{ \ |
|
1538 |
+ uint64_t temp[32]; \ |
|
1539 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1540 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ |
|
1541 |
+ stride, 16); \ |
|
1542 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ |
|
1543 |
+ stride, stride, 16); \ |
|
1544 |
+} \ |
|
1545 |
+ \ |
|
1546 |
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1547 |
+ int stride) \ |
|
1548 |
+{ \ |
|
1549 |
+ uint64_t temp[32]; \ |
|
1550 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1551 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ |
|
1552 |
+ stride); \ |
|
1553 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \ |
|
1554 |
+} \ |
|
1555 |
+ \ |
|
1556 |
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1557 |
+ int stride) \ |
|
1558 |
+{ \ |
|
1559 |
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \ |
|
1560 |
+} \ |
|
1561 |
+ \ |
|
1562 |
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1563 |
+ int stride) \ |
|
1564 |
+{ \ |
|
1565 |
+ uint64_t temp[32]; \ |
|
1566 |
+ uint8_t * const half = (uint8_t*)temp; \ |
|
1567 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ |
|
1568 |
+ stride); \ |
|
1569 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \ |
|
1570 |
+ stride, stride, 16); \ |
|
1571 |
+} \ |
|
1572 |
+ \ |
|
1573 |
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1574 |
+ int stride) \ |
|
1575 |
+{ \ |
|
1576 |
+ uint64_t half[16 * 2 + 17 * 2]; \ |
|
1577 |
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
|
1578 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1579 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1580 |
+ stride, 17); \ |
|
1581 |
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ |
|
1582 |
+ stride, 17); \ |
|
1583 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
|
1584 |
+ 16, 16); \ |
|
1585 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \ |
|
1586 |
+} \ |
|
1587 |
+ \ |
|
1588 |
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1589 |
+ int stride) \ |
|
1590 |
+{ \ |
|
1591 |
+ uint64_t half[16 * 2 + 17 * 2]; \ |
|
1592 |
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
|
1593 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1594 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1595 |
+ stride, 17); \ |
|
1596 |
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ |
|
1597 |
+ stride, 17); \ |
|
1598 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
|
1599 |
+ 16, 16); \ |
|
1600 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \ |
|
1601 |
+} \ |
|
1602 |
+ \ |
|
1603 |
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1604 |
+ int stride) \ |
|
1605 |
+{ \ |
|
1606 |
+ uint64_t half[16 * 2 + 17 * 2]; \ |
|
1607 |
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
|
1608 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1609 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1610 |
+ stride, 17); \ |
|
1611 |
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ |
|
1612 |
+ stride, 17); \ |
|
1613 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
|
1614 |
+ 16, 16); \ |
|
1615 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \ |
|
1616 |
+ 16, 16); \ |
|
1617 |
+} \ |
|
1618 |
+ \ |
|
1619 |
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1620 |
+ int stride) \ |
|
1621 |
+{ \ |
|
1622 |
+ uint64_t half[16 * 2 + 17 * 2]; \ |
|
1623 |
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
|
1624 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1625 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1626 |
+ stride, 17); \ |
|
1627 |
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ |
|
1628 |
+ stride, 17); \ |
|
1629 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
|
1630 |
+ 16, 16); \ |
|
1631 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \ |
|
1632 |
+ 16, 16); \ |
|
1633 |
+} \ |
|
1634 |
+ \ |
|
1635 |
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1636 |
+ int stride) \ |
|
1637 |
+{ \ |
|
1638 |
+ uint64_t half[16 * 2 + 17 * 2]; \ |
|
1639 |
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
|
1640 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1641 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1642 |
+ stride, 17); \ |
|
1643 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
|
1644 |
+ 16, 16); \ |
|
1645 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \ |
|
1646 |
+} \ |
|
1647 |
+ \ |
|
1648 |
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1649 |
+ int stride) \ |
|
1650 |
+{ \ |
|
1651 |
+ uint64_t half[16 * 2 + 17 * 2]; \ |
|
1652 |
+ uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
|
1653 |
+ uint8_t * const halfHV = ((uint8_t*)half); \ |
|
1654 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1655 |
+ stride, 17); \ |
|
1656 |
+ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
|
1657 |
+ 16, 16); \ |
|
1658 |
+ OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \ |
|
1659 |
+ 16, 16); \ |
|
1660 |
+} \ |
|
1661 |
+ \ |
|
1662 |
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1663 |
+ int stride) \ |
|
1664 |
+{ \ |
|
1665 |
+ uint64_t half[17 * 2]; \ |
|
1666 |
+ uint8_t * const halfH = ((uint8_t*)half); \ |
|
1667 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1668 |
+ stride, 17); \ |
|
1669 |
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ |
|
1670 |
+ stride, 17); \ |
|
1671 |
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ |
|
1672 |
+} \ |
|
1673 |
+ \ |
|
1674 |
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1675 |
+ int stride) \ |
|
1676 |
+{ \ |
|
1677 |
+ uint64_t half[17 * 2]; \ |
|
1678 |
+ uint8_t * const halfH = ((uint8_t*)half); \ |
|
1679 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1680 |
+ stride, 17); \ |
|
1681 |
+ put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ |
|
1682 |
+ stride, 17); \ |
|
1683 |
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ |
|
1684 |
+} \ |
|
1685 |
+ \ |
|
1686 |
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ |
|
1687 |
+ int stride) \ |
|
1688 |
+{ \ |
|
1689 |
+ uint64_t half[17 * 2]; \ |
|
1690 |
+ uint8_t * const halfH = ((uint8_t*)half); \ |
|
1691 |
+ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
|
1692 |
+ stride, 17); \ |
|
1693 |
+ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \ |
|
1694 |
+} |
|
1695 |
+ |
|
1696 |
+#define PUT_OP(a, b, temp, size) \ |
|
1697 |
+ "mov"#size" "#a", "#b" \n\t" |
|
1698 |
+ |
|
1699 |
+#define AVG_3DNOW_OP(a, b, temp, size) \ |
|
1700 |
+ "mov"#size" "#b", "#temp" \n\t" \ |
|
1701 |
+ "pavgusb "#temp", "#a" \n\t" \ |
|
1702 |
+ "mov"#size" "#a", "#b" \n\t" |
|
1703 |
+ |
|
1704 |
+#define AVG_MMX2_OP(a, b, temp, size) \ |
|
1705 |
+ "mov"#size" "#b", "#temp" \n\t" \ |
|
1706 |
+ "pavgb "#temp", "#a" \n\t" \ |
|
1707 |
+ "mov"#size" "#a", "#b" \n\t" |
|
1708 |
+ |
|
1709 |
+QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP) |
|
1710 |
+QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP) |
|
1711 |
+QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP) |
|
1712 |
+QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow) |
|
1713 |
+QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow) |
|
1714 |
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow) |
|
1715 |
+QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2) |
|
1716 |
+QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2) |
|
1717 |
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) |
|
1527 | 1718 |
|
1528 | 1719 |
/***********************************/ |
1529 | 1720 |
/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */ |
1530 | 1721 |
|
1531 |
-#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ |
|
1532 |
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1533 |
- OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ |
|
1534 |
-} |
|
1535 |
-#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ |
|
1536 |
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1537 |
- OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ |
|
1538 |
-} |
|
1539 |
- |
|
1540 |
-#define QPEL_2TAP(OPNAME, SIZE, MMX)\ |
|
1541 |
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ |
|
1542 |
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ |
|
1543 |
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ |
|
1544 |
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ |
|
1545 |
- OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ |
|
1546 |
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ |
|
1547 |
- OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ |
|
1548 |
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ |
|
1549 |
- OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ |
|
1550 |
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1551 |
- OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ |
|
1552 |
-}\ |
|
1553 |
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ |
|
1554 |
- OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ |
|
1555 |
-}\ |
|
1556 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ |
|
1557 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ |
|
1558 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ |
|
1559 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ |
|
1560 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ |
|
1561 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ |
|
1562 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ |
|
1563 |
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ |
|
1722 |
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \ |
|
1723 |
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \ |
|
1724 |
+ uint8_t *src, \ |
|
1725 |
+ int stride) \ |
|
1726 |
+{ \ |
|
1727 |
+ OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \ |
|
1728 |
+} |
|
1729 |
+ |
|
1730 |
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \ |
|
1731 |
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \ |
|
1732 |
+ uint8_t *src, \ |
|
1733 |
+ int stride) \ |
|
1734 |
+{ \ |
|
1735 |
+ OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \ |
|
1736 |
+ S1, S2); \ |
|
1737 |
+} |
|
1738 |
+ |
|
1739 |
+#define QPEL_2TAP(OPNAME, SIZE, MMX) \ |
|
1740 |
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \ |
|
1741 |
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \ |
|
1742 |
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \ |
|
1743 |
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \ |
|
1744 |
+ OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \ |
|
1745 |
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \ |
|
1746 |
+ OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \ |
|
1747 |
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \ |
|
1748 |
+ OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \ |
|
1749 |
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \ |
|
1750 |
+ uint8_t *src, \ |
|
1751 |
+ int stride) \ |
|
1752 |
+{ \ |
|
1753 |
+ OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \ |
|
1754 |
+} \ |
|
1755 |
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \ |
|
1756 |
+ uint8_t *src, \ |
|
1757 |
+ int stride) \ |
|
1758 |
+{ \ |
|
1759 |
+ OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \ |
|
1760 |
+ stride, SIZE); \ |
|
1761 |
+} \ |
|
1762 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \ |
|
1763 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \ |
|
1764 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \ |
|
1765 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \ |
|
1766 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \ |
|
1767 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \ |
|
1768 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \ |
|
1769 |
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \ |
|
1564 | 1770 |
|
1565 | 1771 |
QPEL_2TAP(put_, 16, mmx2) |
1566 | 1772 |
QPEL_2TAP(avg_, 16, mmx2) |
... | ... |
@@ -1573,265 +1794,276 @@ QPEL_2TAP(avg_, 8, 3dnow) |
1573 | 1573 |
|
1574 | 1574 |
|
1575 | 1575 |
#if HAVE_YASM |
1576 |
-typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src, |
|
1577 |
- x86_reg linesize, x86_reg start_y, |
|
1578 |
- x86_reg end_y, x86_reg block_h, |
|
1579 |
- x86_reg start_x, x86_reg end_x, |
|
1580 |
- x86_reg block_w); |
|
1576 |
+typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, |
|
1577 |
+ x86_reg linesize, x86_reg start_y, |
|
1578 |
+ x86_reg end_y, x86_reg block_h, |
|
1579 |
+ x86_reg start_x, x86_reg end_x, |
|
1580 |
+ x86_reg block_w); |
|
1581 | 1581 |
extern emu_edge_core_func ff_emu_edge_core_mmx; |
1582 | 1582 |
extern emu_edge_core_func ff_emu_edge_core_sse; |
1583 | 1583 |
|
1584 |
-static av_always_inline |
|
1585 |
-void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, |
|
1586 |
- int block_w, int block_h, |
|
1587 |
- int src_x, int src_y, int w, int h, |
|
1588 |
- emu_edge_core_func *core_fn) |
|
1584 |
+static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src, |
|
1585 |
+ int linesize, |
|
1586 |
+ int block_w, int block_h, |
|
1587 |
+ int src_x, int src_y, |
|
1588 |
+ int w, int h, |
|
1589 |
+ emu_edge_core_func *core_fn) |
|
1589 | 1590 |
{ |
1590 |
- int start_y, start_x, end_y, end_x, src_y_add=0; |
|
1591 |
- |
|
1592 |
- if(src_y>= h){ |
|
1593 |
- src_y_add = h-1-src_y; |
|
1594 |
- src_y=h-1; |
|
1595 |
- }else if(src_y<=-block_h){ |
|
1596 |
- src_y_add = 1-block_h-src_y; |
|
1597 |
- src_y=1-block_h; |
|
1591 |
+ int start_y, start_x, end_y, end_x, src_y_add = 0; |
|
1592 |
+ |
|
1593 |
+ if (src_y >= h) { |
|
1594 |
+ src_y_add = h - 1 - src_y; |
|
1595 |
+ src_y = h - 1; |
|
1596 |
+ } else if (src_y <= -block_h) { |
|
1597 |
+ src_y_add = 1 - block_h - src_y; |
|
1598 |
+ src_y = 1 - block_h; |
|
1598 | 1599 |
} |
1599 |
- if(src_x>= w){ |
|
1600 |
- src+= (w-1-src_x); |
|
1601 |
- src_x=w-1; |
|
1602 |
- }else if(src_x<=-block_w){ |
|
1603 |
- src+= (1-block_w-src_x); |
|
1604 |
- src_x=1-block_w; |
|
1600 |
+ if (src_x >= w) { |
|
1601 |
+ src += w - 1 - src_x; |
|
1602 |
+ src_x = w - 1; |
|
1603 |
+ } else if (src_x <= -block_w) { |
|
1604 |
+ src += 1 - block_w - src_x; |
|
1605 |
+ src_x = 1 - block_w; |
|
1605 | 1606 |
} |
1606 | 1607 |
|
1607 |
- start_y= FFMAX(0, -src_y); |
|
1608 |
- start_x= FFMAX(0, -src_x); |
|
1609 |
- end_y= FFMIN(block_h, h-src_y); |
|
1610 |
- end_x= FFMIN(block_w, w-src_x); |
|
1608 |
+ start_y = FFMAX(0, -src_y); |
|
1609 |
+ start_x = FFMAX(0, -src_x); |
|
1610 |
+ end_y = FFMIN(block_h, h-src_y); |
|
1611 |
+ end_x = FFMIN(block_w, w-src_x); |
|
1611 | 1612 |
assert(start_x < end_x && block_w > 0); |
1612 | 1613 |
assert(start_y < end_y && block_h > 0); |
1613 | 1614 |
|
1614 | 1615 |
// fill in the to-be-copied part plus all above/below |
1615 |
- src += (src_y_add+start_y)*linesize + start_x; |
|
1616 |
+ src += (src_y_add + start_y) * linesize + start_x; |
|
1616 | 1617 |
buf += start_x; |
1617 |
- core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w); |
|
1618 |
+ core_fn(buf, src, linesize, start_y, end_y, |
|
1619 |
+ block_h, start_x, end_x, block_w); |
|
1618 | 1620 |
} |
1619 | 1621 |
|
1620 | 1622 |
#if ARCH_X86_32 |
1621 |
-static av_noinline |
|
1622 |
-void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize, |
|
1623 |
- int block_w, int block_h, |
|
1624 |
- int src_x, int src_y, int w, int h) |
|
1623 |
+static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, |
|
1624 |
+ int linesize, |
|
1625 |
+ int block_w, int block_h, |
|
1626 |
+ int src_x, int src_y, int w, int h) |
|
1625 | 1627 |
{ |
1626 | 1628 |
emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, |
1627 | 1629 |
w, h, &ff_emu_edge_core_mmx); |
1628 | 1630 |
} |
1629 | 1631 |
#endif |
1630 |
-static av_noinline |
|
1631 |
-void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize, |
|
1632 |
- int block_w, int block_h, |
|
1633 |
- int src_x, int src_y, int w, int h) |
|
1632 |
+ |
|
1633 |
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, |
|
1634 |
+ int linesize, |
|
1635 |
+ int block_w, int block_h, |
|
1636 |
+ int src_x, int src_y, int w, int h) |
|
1634 | 1637 |
{ |
1635 | 1638 |
emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, |
1636 | 1639 |
w, h, &ff_emu_edge_core_sse); |
1637 | 1640 |
} |
1638 | 1641 |
#endif /* HAVE_YASM */ |
1639 | 1642 |
|
1640 |
-typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src, |
|
1641 |
- int linesize, int block_w, int block_h, |
|
1642 |
- int src_x, int src_y, int w, int h); |
|
1643 |
- |
|
1644 |
-static av_always_inline |
|
1645 |
-void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
|
1646 |
- int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height, |
|
1647 |
- emulated_edge_mc_func *emu_edge_fn) |
|
1648 |
-{ |
|
1649 |
- const int w = 8; |
|
1650 |
- const int ix = ox>>(16+shift); |
|
1651 |
- const int iy = oy>>(16+shift); |
|
1652 |
- const int oxs = ox>>4; |
|
1653 |
- const int oys = oy>>4; |
|
1654 |
- const int dxxs = dxx>>4; |
|
1655 |
- const int dxys = dxy>>4; |
|
1656 |
- const int dyxs = dyx>>4; |
|
1657 |
- const int dyys = dyy>>4; |
|
1658 |
- const uint16_t r4[4] = {r,r,r,r}; |
|
1659 |
- const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; |
|
1660 |
- const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; |
|
1661 |
- const uint64_t shift2 = 2*shift; |
|
1662 |
- uint8_t edge_buf[(h+1)*stride]; |
|
1643 |
+typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, |
|
1644 |
+ int linesize, int block_w, int block_h, |
|
1645 |
+ int src_x, int src_y, int w, int h); |
|
1646 |
+ |
|
1647 |
+static av_always_inline void gmc(uint8_t *dst, uint8_t *src, |
|
1648 |
+ int stride, int h, int ox, int oy, |
|
1649 |
+ int dxx, int dxy, int dyx, int dyy, |
|
1650 |
+ int shift, int r, int width, int height, |
|
1651 |
+ emulated_edge_mc_func *emu_edge_fn) |
|
1652 |
+{ |
|
1653 |
+ const int w = 8; |
|
1654 |
+ const int ix = ox >> (16 + shift); |
|
1655 |
+ const int iy = oy >> (16 + shift); |
|
1656 |
+ const int oxs = ox >> 4; |
|
1657 |
+ const int oys = oy >> 4; |
|
1658 |
+ const int dxxs = dxx >> 4; |
|
1659 |
+ const int dxys = dxy >> 4; |
|
1660 |
+ const int dyxs = dyx >> 4; |
|
1661 |
+ const int dyys = dyy >> 4; |
|
1662 |
+ const uint16_t r4[4] = { r, r, r, r }; |
|
1663 |
+ const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; |
|
1664 |
+ const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; |
|
1665 |
+ const uint64_t shift2 = 2 * shift; |
|
1666 |
+ uint8_t edge_buf[(h + 1) * stride]; |
|
1663 | 1667 |
int x, y; |
1664 | 1668 |
|
1665 |
- const int dxw = (dxx-(1<<(16+shift)))*(w-1); |
|
1666 |
- const int dyh = (dyy-(1<<(16+shift)))*(h-1); |
|
1667 |
- const int dxh = dxy*(h-1); |
|
1668 |
- const int dyw = dyx*(w-1); |
|
1669 |
- if( // non-constant fullpel offset (3% of blocks) |
|
1670 |
- ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) | |
|
1671 |
- (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift) |
|
1669 |
+ const int dxw = (dxx - (1 << (16 + shift))) * (w - 1); |
|
1670 |
+ const int dyh = (dyy - (1 << (16 + shift))) * (h - 1); |
|
1671 |
+ const int dxh = dxy * (h - 1); |
|
1672 |
+ const int dyw = dyx * (w - 1); |
|
1673 |
+ if ( // non-constant fullpel offset (3% of blocks) |
|
1674 |
+ ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) | |
|
1675 |
+ (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) |
|
1672 | 1676 |
// uses more than 16 bits of subpel mv (only at huge resolution) |
1673 |
- || (dxx|dxy|dyx|dyy)&15 ) |
|
1674 |
- { |
|
1675 |
- //FIXME could still use mmx for some of the rows |
|
1676 |
- ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); |
|
1677 |
+ || (dxx | dxy | dyx | dyy) & 15) { |
|
1678 |
+ // FIXME could still use mmx for some of the rows |
|
1679 |
+ ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, |
|
1680 |
+ shift, r, width, height); |
|
1677 | 1681 |
return; |
1678 | 1682 |
} |
1679 | 1683 |
|
1680 |
- src += ix + iy*stride; |
|
1681 |
- if( (unsigned)ix >= width-w || |
|
1682 |
- (unsigned)iy >= height-h ) |
|
1683 |
- { |
|
1684 |
- emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); |
|
1684 |
+ src += ix + iy * stride; |
|
1685 |
+ if ((unsigned)ix >= width - w || |
|
1686 |
+ (unsigned)iy >= height - h) { |
|
1687 |
+ emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height); |
|
1685 | 1688 |
src = edge_buf; |
1686 | 1689 |
} |
1687 | 1690 |
|
1688 |
- __asm__ volatile( |
|
1689 |
- "movd %0, %%mm6 \n\t" |
|
1690 |
- "pxor %%mm7, %%mm7 \n\t" |
|
1691 |
- "punpcklwd %%mm6, %%mm6 \n\t" |
|
1692 |
- "punpcklwd %%mm6, %%mm6 \n\t" |
|
1691 |
+ __asm__ volatile ( |
|
1692 |
+ "movd %0, %%mm6 \n\t" |
|
1693 |
+ "pxor %%mm7, %%mm7 \n\t" |
|
1694 |
+ "punpcklwd %%mm6, %%mm6 \n\t" |
|
1695 |
+ "punpcklwd %%mm6, %%mm6 \n\t" |
|
1693 | 1696 |
:: "r"(1<<shift) |
1694 | 1697 |
); |
1695 | 1698 |
|
1696 |
- for(x=0; x<w; x+=4){ |
|
1697 |
- uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0), |
|
1698 |
- oxs - dxys + dxxs*(x+1), |
|
1699 |
- oxs - dxys + dxxs*(x+2), |
|
1700 |
- oxs - dxys + dxxs*(x+3) }; |
|
1701 |
- uint16_t dy4[4] = { oys - dyys + dyxs*(x+0), |
|
1702 |
- oys - dyys + dyxs*(x+1), |
|
1703 |
- oys - dyys + dyxs*(x+2), |
|
1704 |
- oys - dyys + dyxs*(x+3) }; |
|
1705 |
- |
|
1706 |
- for(y=0; y<h; y++){ |
|
1707 |
- __asm__ volatile( |
|
1708 |
- "movq %0, %%mm4 \n\t" |
|
1709 |
- "movq %1, %%mm5 \n\t" |
|
1710 |
- "paddw %2, %%mm4 \n\t" |
|
1711 |
- "paddw %3, %%mm5 \n\t" |
|
1712 |
- "movq %%mm4, %0 \n\t" |
|
1713 |
- "movq %%mm5, %1 \n\t" |
|
1714 |
- "psrlw $12, %%mm4 \n\t" |
|
1715 |
- "psrlw $12, %%mm5 \n\t" |
|
1699 |
+ for (x = 0; x < w; x += 4) { |
|
1700 |
+ uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0), |
|
1701 |
+ oxs - dxys + dxxs * (x + 1), |
|
1702 |
+ oxs - dxys + dxxs * (x + 2), |
|
1703 |
+ oxs - dxys + dxxs * (x + 3) }; |
|
1704 |
+ uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0), |
|
1705 |
+ oys - dyys + dyxs * (x + 1), |
|
1706 |
+ oys - dyys + dyxs * (x + 2), |
|
1707 |
+ oys - dyys + dyxs * (x + 3) }; |
|
1708 |
+ |
|
1709 |
+ for (y = 0; y < h; y++) { |
|
1710 |
+ __asm__ volatile ( |
|
1711 |
+ "movq %0, %%mm4 \n\t" |
|
1712 |
+ "movq %1, %%mm5 \n\t" |
|
1713 |
+ "paddw %2, %%mm4 \n\t" |
|
1714 |
+ "paddw %3, %%mm5 \n\t" |
|
1715 |
+ "movq %%mm4, %0 \n\t" |
|
1716 |
+ "movq %%mm5, %1 \n\t" |
|
1717 |
+ "psrlw $12, %%mm4 \n\t" |
|
1718 |
+ "psrlw $12, %%mm5 \n\t" |
|
1716 | 1719 |
: "+m"(*dx4), "+m"(*dy4) |
1717 | 1720 |
: "m"(*dxy4), "m"(*dyy4) |
1718 | 1721 |
); |
1719 | 1722 |
|
1720 |
- __asm__ volatile( |
|
1721 |
- "movq %%mm6, %%mm2 \n\t" |
|
1722 |
- "movq %%mm6, %%mm1 \n\t" |
|
1723 |
- "psubw %%mm4, %%mm2 \n\t" |
|
1724 |
- "psubw %%mm5, %%mm1 \n\t" |
|
1725 |
- "movq %%mm2, %%mm0 \n\t" |
|
1726 |
- "movq %%mm4, %%mm3 \n\t" |
|
1727 |
- "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy) |
|
1728 |
- "pmullw %%mm5, %%mm3 \n\t" // dx*dy |
|
1729 |
- "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy |
|
1730 |
- "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy) |
|
1731 |
- |
|
1732 |
- "movd %4, %%mm5 \n\t" |
|
1733 |
- "movd %3, %%mm4 \n\t" |
|
1723 |
+ __asm__ volatile ( |
|
1724 |
+ "movq %%mm6, %%mm2 \n\t" |
|
1725 |
+ "movq %%mm6, %%mm1 \n\t" |
|
1726 |
+ "psubw %%mm4, %%mm2 \n\t" |
|
1727 |
+ "psubw %%mm5, %%mm1 \n\t" |
|
1728 |
+ "movq %%mm2, %%mm0 \n\t" |
|
1729 |
+ "movq %%mm4, %%mm3 \n\t" |
|
1730 |
+ "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy) |
|
1731 |
+ "pmullw %%mm5, %%mm3 \n\t" // dx * dy |
|
1732 |
+ "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy |
|
1733 |
+ "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy) |
|
1734 |
+ |
|
1735 |
+ "movd %4, %%mm5 \n\t" |
|
1736 |
+ "movd %3, %%mm4 \n\t" |
|
1734 | 1737 |
"punpcklbw %%mm7, %%mm5 \n\t" |
1735 | 1738 |
"punpcklbw %%mm7, %%mm4 \n\t" |
1736 |
- "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy |
|
1737 |
- "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy |
|
1739 |
+ "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy |
|
1740 |
+ "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy |
|
1738 | 1741 |
|
1739 |
- "movd %2, %%mm5 \n\t" |
|
1740 |
- "movd %1, %%mm4 \n\t" |
|
1742 |
+ "movd %2, %%mm5 \n\t" |
|
1743 |
+ "movd %1, %%mm4 \n\t" |
|
1741 | 1744 |
"punpcklbw %%mm7, %%mm5 \n\t" |
1742 | 1745 |
"punpcklbw %%mm7, %%mm4 \n\t" |
1743 |
- "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy) |
|
1744 |
- "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy) |
|
1745 |
- "paddw %5, %%mm1 \n\t" |
|
1746 |
- "paddw %%mm3, %%mm2 \n\t" |
|
1747 |
- "paddw %%mm1, %%mm0 \n\t" |
|
1748 |
- "paddw %%mm2, %%mm0 \n\t" |
|
1749 |
- |
|
1750 |
- "psrlw %6, %%mm0 \n\t" |
|
1751 |
- "packuswb %%mm0, %%mm0 \n\t" |
|
1752 |
- "movd %%mm0, %0 \n\t" |
|
1753 |
- |
|
1754 |
- : "=m"(dst[x+y*stride]) |
|
1746 |
+ "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy) |
|
1747 |
+ "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy) |
|
1748 |
+ "paddw %5, %%mm1 \n\t" |
|
1749 |
+ "paddw %%mm3, %%mm2 \n\t" |
|
1750 |
+ "paddw %%mm1, %%mm0 \n\t" |
|
1751 |
+ "paddw %%mm2, %%mm0 \n\t" |
|
1752 |
+ |
|
1753 |
+ "psrlw %6, %%mm0 \n\t" |
|
1754 |
+ "packuswb %%mm0, %%mm0 \n\t" |
|
1755 |
+ "movd %%mm0, %0 \n\t" |
|
1756 |
+ |
|
1757 |
+ : "=m"(dst[x + y * stride]) |
|
1755 | 1758 |
: "m"(src[0]), "m"(src[1]), |
1756 |
- "m"(src[stride]), "m"(src[stride+1]), |
|
1759 |
+ "m"(src[stride]), "m"(src[stride + 1]), |
|
1757 | 1760 |
"m"(*r4), "m"(shift2) |
1758 | 1761 |
); |
1759 | 1762 |
src += stride; |
1760 | 1763 |
} |
1761 |
- src += 4-h*stride; |
|
1764 |
+ src += 4 - h * stride; |
|
1762 | 1765 |
} |
1763 | 1766 |
} |
1764 | 1767 |
|
1765 | 1768 |
#if HAVE_YASM |
1766 | 1769 |
#if ARCH_X86_32 |
1767 |
-static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
|
1768 |
- int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) |
|
1770 |
+static void gmc_mmx(uint8_t *dst, uint8_t *src, |
|
1771 |
+ int stride, int h, int ox, int oy, |
|
1772 |
+ int dxx, int dxy, int dyx, int dyy, |
|
1773 |
+ int shift, int r, int width, int height) |
|
1769 | 1774 |
{ |
1770 | 1775 |
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, |
1771 | 1776 |
width, height, &emulated_edge_mc_mmx); |
1772 | 1777 |
} |
1773 | 1778 |
#endif |
1774 |
-static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
|
1775 |
- int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) |
|
1779 |
+static void gmc_sse(uint8_t *dst, uint8_t *src, |
|
1780 |
+ int stride, int h, int ox, int oy, |
|
1781 |
+ int dxx, int dxy, int dyx, int dyy, |
|
1782 |
+ int shift, int r, int width, int height) |
|
1776 | 1783 |
{ |
1777 | 1784 |
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, |
1778 | 1785 |
width, height, &emulated_edge_mc_sse); |
1779 | 1786 |
} |
1780 | 1787 |
#else |
1781 |
-static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, |
|
1782 |
- int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) |
|
1788 |
+static void gmc_mmx(uint8_t *dst, uint8_t *src, |
|
1789 |
+ int stride, int h, int ox, int oy, |
|
1790 |
+ int dxx, int dxy, int dyx, int dyy, |
|
1791 |
+ int shift, int r, int width, int height) |
|
1783 | 1792 |
{ |
1784 | 1793 |
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, |
1785 | 1794 |
width, height, &ff_emulated_edge_mc_8); |
1786 | 1795 |
} |
1787 | 1796 |
#endif |
1788 | 1797 |
|
1789 |
-#define PREFETCH(name, op) \ |
|
1790 |
-static void name(void *mem, int stride, int h){\ |
|
1791 |
- const uint8_t *p= mem;\ |
|
1792 |
- do{\ |
|
1793 |
- __asm__ volatile(#op" %0" :: "m"(*p));\ |
|
1794 |
- p+= stride;\ |
|
1795 |
- }while(--h);\ |
|
1798 |
+#define PREFETCH(name, op) \ |
|
1799 |
+static void name(void *mem, int stride, int h) \ |
|
1800 |
+{ \ |
|
1801 |
+ const uint8_t *p = mem; \ |
|
1802 |
+ do { \ |
|
1803 |
+ __asm__ volatile (#op" %0" :: "m"(*p)); \ |
|
1804 |
+ p += stride; \ |
|
1805 |
+ } while (--h); \ |
|
1796 | 1806 |
} |
1807 |
+ |
|
1797 | 1808 |
PREFETCH(prefetch_mmx2, prefetcht0) |
1798 | 1809 |
PREFETCH(prefetch_3dnow, prefetch) |
1799 | 1810 |
#undef PREFETCH |
1800 | 1811 |
|
1801 | 1812 |
#include "h264_qpel_mmx.c" |
1802 | 1813 |
|
1803 |
-void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, |
|
1804 |
- int stride, int h, int x, int y); |
|
1805 |
-void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src, |
|
1806 |
- int stride, int h, int x, int y); |
|
1807 |
-void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src, |
|
1808 |
- int stride, int h, int x, int y); |
|
1809 |
- |
|
1810 |
-void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, |
|
1811 |
- int stride, int h, int x, int y); |
|
1812 |
-void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, |
|
1813 |
- int stride, int h, int x, int y); |
|
1814 |
-void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, |
|
1815 |
- int stride, int h, int x, int y); |
|
1816 |
- |
|
1817 |
-void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, |
|
1818 |
- int stride, int h, int x, int y); |
|
1819 |
-void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, |
|
1820 |
- int stride, int h, int x, int y); |
|
1821 |
- |
|
1822 |
-void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, |
|
1823 |
- int stride, int h, int x, int y); |
|
1824 |
-void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
|
1825 |
- int stride, int h, int x, int y); |
|
1826 |
- |
|
1827 |
-void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, |
|
1828 |
- int stride, int h, int x, int y); |
|
1829 |
-void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
|
1830 |
- int stride, int h, int x, int y); |
|
1831 |
- |
|
1832 |
-#define CHROMA_MC(OP, NUM, DEPTH, OPT) \ |
|
1833 |
-void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ |
|
1834 |
- (uint8_t *dst, uint8_t *src,\ |
|
1814 |
+void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, |
|
1815 |
+ int stride, int h, int x, int y); |
|
1816 |
+void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src, |
|
1817 |
+ int stride, int h, int x, int y); |
|
1818 |
+void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src, |
|
1819 |
+ int stride, int h, int x, int y); |
|
1820 |
+ |
|
1821 |
+void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, |
|
1822 |
+ int stride, int h, int x, int y); |
|
1823 |
+void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src, |
|
1824 |
+ int stride, int h, int x, int y); |
|
1825 |
+void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, |
|
1826 |
+ int stride, int h, int x, int y); |
|
1827 |
+ |
|
1828 |
+void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, |
|
1829 |
+ int stride, int h, int x, int y); |
|
1830 |
+void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src, |
|
1831 |
+ int stride, int h, int x, int y); |
|
1832 |
+ |
|
1833 |
+void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src, |
|
1834 |
+ int stride, int h, int x, int y); |
|
1835 |
+void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
|
1836 |
+ int stride, int h, int x, int y); |
|
1837 |
+ |
|
1838 |
+void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src, |
|
1839 |
+ int stride, int h, int x, int y); |
|
1840 |
+void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
|
1841 |
+ int stride, int h, int x, int y); |
|
1842 |
+ |
|
1843 |
+#define CHROMA_MC(OP, NUM, DEPTH, OPT) \ |
|
1844 |
+void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ |
|
1845 |
+ (uint8_t *dst, uint8_t *src, \ |
|
1835 | 1846 |
int stride, int h, int x, int y); |
1836 | 1847 |
|
1837 | 1848 |
CHROMA_MC(put, 2, 10, mmxext) |
... | ... |
@@ -1843,25 +2075,37 @@ CHROMA_MC(avg, 8, 10, sse2) |
1843 | 1843 |
CHROMA_MC(put, 8, 10, avx) |
1844 | 1844 |
CHROMA_MC(avg, 8, 10, avx) |
1845 | 1845 |
|
1846 |
-/* CAVS specific */ |
|
1847 |
-void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { |
|
1846 |
+/* CAVS-specific */ |
|
1847 |
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) |
|
1848 |
+{ |
|
1848 | 1849 |
put_pixels8_mmx(dst, src, stride, 8); |
1849 | 1850 |
} |
1850 |
-void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { |
|
1851 |
+ |
|
1852 |
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) |
|
1853 |
+{ |
|
1851 | 1854 |
avg_pixels8_mmx(dst, src, stride, 8); |
1852 | 1855 |
} |
1853 |
-void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { |
|
1856 |
+ |
|
1857 |
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) |
|
1858 |
+{ |
|
1854 | 1859 |
put_pixels16_mmx(dst, src, stride, 16); |
1855 | 1860 |
} |
1856 |
-void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { |
|
1861 |
+ |
|
1862 |
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) |
|
1863 |
+{ |
|
1857 | 1864 |
avg_pixels16_mmx(dst, src, stride, 16); |
1858 | 1865 |
} |
1859 | 1866 |
|
1860 |
-/* VC1 specific */ |
|
1861 |
-void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { |
|
1867 |
+/* VC-1-specific */ |
|
1868 |
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, |
|
1869 |
+ int stride, int rnd) |
|
1870 |
+{ |
|
1862 | 1871 |
put_pixels8_mmx(dst, src, stride, 8); |
1863 | 1872 |
} |
1864 |
-void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { |
|
1873 |
+ |
|
1874 |
+void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, |
|
1875 |
+ int stride, int rnd) |
|
1876 |
+{ |
|
1865 | 1877 |
avg_pixels8_mmx2(dst, src, stride, 8); |
1866 | 1878 |
} |
1867 | 1879 |
|
... | ... |
@@ -1943,90 +2187,102 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, |
1943 | 1943 |
} |
1944 | 1944 |
#endif |
1945 | 1945 |
|
1946 |
-/* XXX: those functions should be suppressed ASAP when all IDCTs are |
|
1947 |
- converted */ |
|
1946 |
+/* XXX: Those functions should be suppressed ASAP when all IDCTs are |
|
1947 |
+ * converted. */ |
|
1948 | 1948 |
#if CONFIG_GPL |
1949 |
-static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
|
1949 |
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, |
|
1950 |
+ DCTELEM *block) |
|
1950 | 1951 |
{ |
1951 |
- ff_mmx_idct (block); |
|
1952 |
+ ff_mmx_idct(block); |
|
1952 | 1953 |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
1953 | 1954 |
} |
1954 |
-static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) |
|
1955 |
+ |
|
1956 |
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, |
|
1957 |
+ DCTELEM *block) |
|
1955 | 1958 |
{ |
1956 |
- ff_mmx_idct (block); |
|
1959 |
+ ff_mmx_idct(block); |
|
1957 | 1960 |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
1958 | 1961 |
} |
1959 |
-static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) |
|
1962 |
+ |
|
1963 |
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, |
|
1964 |
+ DCTELEM *block) |
|
1960 | 1965 |
{ |
1961 |
- ff_mmxext_idct (block); |
|
1966 |
+ ff_mmxext_idct(block); |
|
1962 | 1967 |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
1963 | 1968 |
} |
1964 |
-static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) |
|
1969 |
+ |
|
1970 |
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, |
|
1971 |
+ DCTELEM *block) |
|
1965 | 1972 |
{ |
1966 |
- ff_mmxext_idct (block); |
|
1973 |
+ ff_mmxext_idct(block); |
|
1967 | 1974 |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
1968 | 1975 |
} |
1969 | 1976 |
#endif |
1977 |
+ |
|
1970 | 1978 |
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) |
1971 | 1979 |
{ |
1972 |
- ff_idct_xvid_mmx (block); |
|
1980 |
+ ff_idct_xvid_mmx(block); |
|
1973 | 1981 |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
1974 | 1982 |
} |
1983 |
+ |
|
1975 | 1984 |
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block) |
1976 | 1985 |
{ |
1977 |
- ff_idct_xvid_mmx (block); |
|
1986 |
+ ff_idct_xvid_mmx(block); |
|
1978 | 1987 |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
1979 | 1988 |
} |
1989 |
+ |
|
1980 | 1990 |
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block) |
1981 | 1991 |
{ |
1982 |
- ff_idct_xvid_mmx2 (block); |
|
1992 |
+ ff_idct_xvid_mmx2(block); |
|
1983 | 1993 |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
1984 | 1994 |
} |
1995 |
+ |
|
1985 | 1996 |
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) |
1986 | 1997 |
{ |
1987 |
- ff_idct_xvid_mmx2 (block); |
|
1998 |
+ ff_idct_xvid_mmx2(block); |
|
1988 | 1999 |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
1989 | 2000 |
} |
1990 | 2001 |
|
1991 | 2002 |
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) |
1992 | 2003 |
{ |
1993 | 2004 |
int i; |
1994 |
- __asm__ volatile("pxor %%mm7, %%mm7":); |
|
1995 |
- for(i=0; i<blocksize; i+=2) { |
|
1996 |
- __asm__ volatile( |
|
1997 |
- "movq %0, %%mm0 \n\t" |
|
1998 |
- "movq %1, %%mm1 \n\t" |
|
1999 |
- "movq %%mm0, %%mm2 \n\t" |
|
2000 |
- "movq %%mm1, %%mm3 \n\t" |
|
2001 |
- "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 |
|
2002 |
- "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 |
|
2003 |
- "pslld $31, %%mm2 \n\t" // keep only the sign bit |
|
2004 |
- "pxor %%mm2, %%mm1 \n\t" |
|
2005 |
- "movq %%mm3, %%mm4 \n\t" |
|
2006 |
- "pand %%mm1, %%mm3 \n\t" |
|
2007 |
- "pandn %%mm1, %%mm4 \n\t" |
|
2008 |
- "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) |
|
2009 |
- "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) |
|
2010 |
- "movq %%mm3, %1 \n\t" |
|
2011 |
- "movq %%mm0, %0 \n\t" |
|
2012 |
- :"+m"(mag[i]), "+m"(ang[i]) |
|
2013 |
- ::"memory" |
|
2005 |
+ __asm__ volatile ("pxor %%mm7, %%mm7":); |
|
2006 |
+ for (i = 0; i < blocksize; i += 2) { |
|
2007 |
+ __asm__ volatile ( |
|
2008 |
+ "movq %0, %%mm0 \n\t" |
|
2009 |
+ "movq %1, %%mm1 \n\t" |
|
2010 |
+ "movq %%mm0, %%mm2 \n\t" |
|
2011 |
+ "movq %%mm1, %%mm3 \n\t" |
|
2012 |
+ "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 |
|
2013 |
+ "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 |
|
2014 |
+ "pslld $31, %%mm2 \n\t" // keep only the sign bit |
|
2015 |
+ "pxor %%mm2, %%mm1 \n\t" |
|
2016 |
+ "movq %%mm3, %%mm4 \n\t" |
|
2017 |
+ "pand %%mm1, %%mm3 \n\t" |
|
2018 |
+ "pandn %%mm1, %%mm4 \n\t" |
|
2019 |
+ "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m))) |
|
2020 |
+ "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m))) |
|
2021 |
+ "movq %%mm3, %1 \n\t" |
|
2022 |
+ "movq %%mm0, %0 \n\t" |
|
2023 |
+ : "+m"(mag[i]), "+m"(ang[i]) |
|
2024 |
+ :: "memory" |
|
2014 | 2025 |
); |
2015 | 2026 |
} |
2016 |
- __asm__ volatile("femms"); |
|
2027 |
+ __asm__ volatile ("femms"); |
|
2017 | 2028 |
} |
2029 |
+ |
|
2018 | 2030 |
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) |
2019 | 2031 |
{ |
2020 | 2032 |
int i; |
2021 | 2033 |
|
2022 |
- __asm__ volatile( |
|
2023 |
- "movaps %0, %%xmm5 \n\t" |
|
2024 |
- ::"m"(ff_pdw_80000000[0]) |
|
2034 |
+ __asm__ volatile ( |
|
2035 |
+ "movaps %0, %%xmm5 \n\t" |
|
2036 |
+ :: "m"(ff_pdw_80000000[0]) |
|
2025 | 2037 |
); |
2026 |
- for(i=0; i<blocksize; i+=4) { |
|
2027 |
- __asm__ volatile( |
|
2028 |
- "movaps %0, %%xmm0 \n\t" |
|
2029 |
- "movaps %1, %%xmm1 \n\t" |
|
2038 |
+ for (i = 0; i < blocksize; i += 4) { |
|
2039 |
+ __asm__ volatile ( |
|
2040 |
+ "movaps %0, %%xmm0 \n\t" |
|
2041 |
+ "movaps %1, %%xmm1 \n\t" |
|
2030 | 2042 |
"xorps %%xmm2, %%xmm2 \n\t" |
2031 | 2043 |
"xorps %%xmm3, %%xmm3 \n\t" |
2032 | 2044 |
"cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 |
... | ... |
@@ -2036,12 +2292,12 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) |
2036 | 2036 |
"movaps %%xmm3, %%xmm4 \n\t" |
2037 | 2037 |
"andps %%xmm1, %%xmm3 \n\t" |
2038 | 2038 |
"andnps %%xmm1, %%xmm4 \n\t" |
2039 |
- "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) |
|
2040 |
- "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) |
|
2039 |
+ "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m))) |
|
2040 |
+ "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m))) |
|
2041 | 2041 |
"movaps %%xmm3, %1 \n\t" |
2042 | 2042 |
"movaps %%xmm0, %0 \n\t" |
2043 |
- :"+m"(mag[i]), "+m"(ang[i]) |
|
2044 |
- ::"memory" |
|
2043 |
+ : "+m"(mag[i]), "+m"(ang[i]) |
|
2044 |
+ :: "memory" |
|
2045 | 2045 |
); |
2046 | 2046 |
} |
2047 | 2047 |
} |
... | ... |
@@ -2049,97 +2305,105 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) |
2049 | 2049 |
#define IF1(x) x |
2050 | 2050 |
#define IF0(x) |
2051 | 2051 |
|
2052 |
-#define MIX5(mono,stereo)\ |
|
2053 |
- __asm__ volatile(\ |
|
2054 |
- "movss 0(%2), %%xmm5 \n"\ |
|
2055 |
- "movss 8(%2), %%xmm6 \n"\ |
|
2056 |
- "movss 24(%2), %%xmm7 \n"\ |
|
2057 |
- "shufps $0, %%xmm5, %%xmm5 \n"\ |
|
2058 |
- "shufps $0, %%xmm6, %%xmm6 \n"\ |
|
2059 |
- "shufps $0, %%xmm7, %%xmm7 \n"\ |
|
2060 |
- "1: \n"\ |
|
2061 |
- "movaps (%0,%1), %%xmm0 \n"\ |
|
2062 |
- "movaps 0x400(%0,%1), %%xmm1 \n"\ |
|
2063 |
- "movaps 0x800(%0,%1), %%xmm2 \n"\ |
|
2064 |
- "movaps 0xc00(%0,%1), %%xmm3 \n"\ |
|
2065 |
- "movaps 0x1000(%0,%1), %%xmm4 \n"\ |
|
2066 |
- "mulps %%xmm5, %%xmm0 \n"\ |
|
2067 |
- "mulps %%xmm6, %%xmm1 \n"\ |
|
2068 |
- "mulps %%xmm5, %%xmm2 \n"\ |
|
2069 |
- "mulps %%xmm7, %%xmm3 \n"\ |
|
2070 |
- "mulps %%xmm7, %%xmm4 \n"\ |
|
2071 |
- stereo("addps %%xmm1, %%xmm0 \n")\ |
|
2072 |
- "addps %%xmm1, %%xmm2 \n"\ |
|
2073 |
- "addps %%xmm3, %%xmm0 \n"\ |
|
2074 |
- "addps %%xmm4, %%xmm2 \n"\ |
|
2075 |
- mono("addps %%xmm2, %%xmm0 \n")\ |
|
2076 |
- "movaps %%xmm0, (%0,%1) \n"\ |
|
2077 |
- stereo("movaps %%xmm2, 0x400(%0,%1) \n")\ |
|
2078 |
- "add $16, %0 \n"\ |
|
2079 |
- "jl 1b \n"\ |
|
2080 |
- :"+&r"(i)\ |
|
2081 |
- :"r"(samples[0]+len), "r"(matrix)\ |
|
2082 |
- :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
|
2083 |
- "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ |
|
2084 |
- "memory"\ |
|
2052 |
+#define MIX5(mono, stereo) \ |
|
2053 |
+ __asm__ volatile ( \ |
|
2054 |
+ "movss 0(%2), %%xmm5 \n" \ |
|
2055 |
+ "movss 8(%2), %%xmm6 \n" \ |
|
2056 |
+ "movss 24(%2), %%xmm7 \n" \ |
|
2057 |
+ "shufps $0, %%xmm5, %%xmm5 \n" \ |
|
2058 |
+ "shufps $0, %%xmm6, %%xmm6 \n" \ |
|
2059 |
+ "shufps $0, %%xmm7, %%xmm7 \n" \ |
|
2060 |
+ "1: \n" \ |
|
2061 |
+ "movaps (%0, %1), %%xmm0 \n" \ |
|
2062 |
+ "movaps 0x400(%0, %1), %%xmm1 \n" \ |
|
2063 |
+ "movaps 0x800(%0, %1), %%xmm2 \n" \ |
|
2064 |
+ "movaps 0xc00(%0, %1), %%xmm3 \n" \ |
|
2065 |
+ "movaps 0x1000(%0, %1), %%xmm4 \n" \ |
|
2066 |
+ "mulps %%xmm5, %%xmm0 \n" \ |
|
2067 |
+ "mulps %%xmm6, %%xmm1 \n" \ |
|
2068 |
+ "mulps %%xmm5, %%xmm2 \n" \ |
|
2069 |
+ "mulps %%xmm7, %%xmm3 \n" \ |
|
2070 |
+ "mulps %%xmm7, %%xmm4 \n" \ |
|
2071 |
+ stereo("addps %%xmm1, %%xmm0 \n") \ |
|
2072 |
+ "addps %%xmm1, %%xmm2 \n" \ |
|
2073 |
+ "addps %%xmm3, %%xmm0 \n" \ |
|
2074 |
+ "addps %%xmm4, %%xmm2 \n" \ |
|
2075 |
+ mono("addps %%xmm2, %%xmm0 \n") \ |
|
2076 |
+ "movaps %%xmm0, (%0, %1) \n" \ |
|
2077 |
+ stereo("movaps %%xmm2, 0x400(%0, %1) \n") \ |
|
2078 |
+ "add $16, %0 \n" \ |
|
2079 |
+ "jl 1b \n" \ |
|
2080 |
+ : "+&r"(i) \ |
|
2081 |
+ : "r"(samples[0] + len), "r"(matrix) \ |
|
2082 |
+ : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
|
2083 |
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \ |
|
2084 |
+ "memory" \ |
|
2085 | 2085 |
); |
2086 | 2086 |
|
2087 |
-#define MIX_MISC(stereo)\ |
|
2088 |
- __asm__ volatile(\ |
|
2089 |
- "1: \n"\ |
|
2090 |
- "movaps (%3,%0), %%xmm0 \n"\ |
|
2091 |
- stereo("movaps %%xmm0, %%xmm1 \n")\ |
|
2092 |
- "mulps %%xmm4, %%xmm0 \n"\ |
|
2093 |
- stereo("mulps %%xmm5, %%xmm1 \n")\ |
|
2094 |
- "lea 1024(%3,%0), %1 \n"\ |
|
2095 |
- "mov %5, %2 \n"\ |
|
2096 |
- "2: \n"\ |
|
2097 |
- "movaps (%1), %%xmm2 \n"\ |
|
2098 |
- stereo("movaps %%xmm2, %%xmm3 \n")\ |
|
2099 |
- "mulps (%4,%2), %%xmm2 \n"\ |
|
2100 |
- stereo("mulps 16(%4,%2), %%xmm3 \n")\ |
|
2101 |
- "addps %%xmm2, %%xmm0 \n"\ |
|
2102 |
- stereo("addps %%xmm3, %%xmm1 \n")\ |
|
2103 |
- "add $1024, %1 \n"\ |
|
2104 |
- "add $32, %2 \n"\ |
|
2105 |
- "jl 2b \n"\ |
|
2106 |
- "movaps %%xmm0, (%3,%0) \n"\ |
|
2107 |
- stereo("movaps %%xmm1, 1024(%3,%0) \n")\ |
|
2108 |
- "add $16, %0 \n"\ |
|
2109 |
- "jl 1b \n"\ |
|
2110 |
- :"+&r"(i), "=&r"(j), "=&r"(k)\ |
|
2111 |
- :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\ |
|
2112 |
- :"memory"\ |
|
2087 |
+#define MIX_MISC(stereo) \ |
|
2088 |
+ __asm__ volatile ( \ |
|
2089 |
+ "1: \n" \ |
|
2090 |
+ "movaps (%3, %0), %%xmm0 \n" \ |
|
2091 |
+ stereo("movaps %%xmm0, %%xmm1 \n") \ |
|
2092 |
+ "mulps %%xmm4, %%xmm0 \n" \ |
|
2093 |
+ stereo("mulps %%xmm5, %%xmm1 \n") \ |
|
2094 |
+ "lea 1024(%3, %0), %1 \n" \ |
|
2095 |
+ "mov %5, %2 \n" \ |
|
2096 |
+ "2: \n" \ |
|
2097 |
+ "movaps (%1), %%xmm2 \n" \ |
|
2098 |
+ stereo("movaps %%xmm2, %%xmm3 \n") \ |
|
2099 |
+ "mulps (%4, %2), %%xmm2 \n" \ |
|
2100 |
+ stereo("mulps 16(%4, %2), %%xmm3 \n") \ |
|
2101 |
+ "addps %%xmm2, %%xmm0 \n" \ |
|
2102 |
+ stereo("addps %%xmm3, %%xmm1 \n") \ |
|
2103 |
+ "add $1024, %1 \n" \ |
|
2104 |
+ "add $32, %2 \n" \ |
|
2105 |
+ "jl 2b \n" \ |
|
2106 |
+ "movaps %%xmm0, (%3, %0) \n" \ |
|
2107 |
+ stereo("movaps %%xmm1, 1024(%3, %0) \n") \ |
|
2108 |
+ "add $16, %0 \n" \ |
|
2109 |
+ "jl 1b \n" \ |
|
2110 |
+ : "+&r"(i), "=&r"(j), "=&r"(k) \ |
|
2111 |
+ : "r"(samples[0] + len), "r"(matrix_simd + in_ch), \ |
|
2112 |
+ "g"((intptr_t) - 32 * (in_ch - 1)) \ |
|
2113 |
+ : "memory" \ |
|
2113 | 2114 |
); |
2114 | 2115 |
|
2115 |
-static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len) |
|
2116 |
+static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], |
|
2117 |
+ int out_ch, int in_ch, int len) |
|
2116 | 2118 |
{ |
2117 | 2119 |
int (*matrix_cmp)[2] = (int(*)[2])matrix; |
2118 |
- intptr_t i,j,k; |
|
2119 |
- |
|
2120 |
- i = -len*sizeof(float); |
|
2121 |
- if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) { |
|
2122 |
- MIX5(IF0,IF1); |
|
2123 |
- } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) { |
|
2124 |
- MIX5(IF1,IF0); |
|
2120 |
+ intptr_t i, j, k; |
|
2121 |
+ |
|
2122 |
+ i = -len * sizeof(float); |
|
2123 |
+ if (in_ch == 5 && out_ch == 2 && |
|
2124 |
+ !(matrix_cmp[0][1] | matrix_cmp[2][0] | |
|
2125 |
+ matrix_cmp[3][1] | matrix_cmp[4][0] | |
|
2126 |
+ (matrix_cmp[1][0] ^ matrix_cmp[1][1]) | |
|
2127 |
+ (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) { |
|
2128 |
+ MIX5(IF0, IF1); |
|
2129 |
+ } else if (in_ch == 5 && out_ch == 1 && |
|
2130 |
+ matrix_cmp[0][0] == matrix_cmp[2][0] && |
|
2131 |
+ matrix_cmp[3][0] == matrix_cmp[4][0]) { |
|
2132 |
+ MIX5(IF1, IF0); |
|
2125 | 2133 |
} else { |
2126 | 2134 |
DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; |
2127 |
- j = 2*in_ch*sizeof(float); |
|
2128 |
- __asm__ volatile( |
|
2129 |
- "1: \n" |
|
2130 |
- "sub $8, %0 \n" |
|
2131 |
- "movss (%2,%0), %%xmm4 \n" |
|
2132 |
- "movss 4(%2,%0), %%xmm5 \n" |
|
2133 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
2134 |
- "shufps $0, %%xmm5, %%xmm5 \n" |
|
2135 |
- "movaps %%xmm4, (%1,%0,4) \n" |
|
2136 |
- "movaps %%xmm5, 16(%1,%0,4) \n" |
|
2137 |
- "jg 1b \n" |
|
2138 |
- :"+&r"(j) |
|
2139 |
- :"r"(matrix_simd), "r"(matrix) |
|
2140 |
- :"memory" |
|
2135 |
+ j = 2 * in_ch * sizeof(float); |
|
2136 |
+ __asm__ volatile ( |
|
2137 |
+ "1: \n" |
|
2138 |
+ "sub $8, %0 \n" |
|
2139 |
+ "movss (%2, %0), %%xmm4 \n" |
|
2140 |
+ "movss 4(%2, %0), %%xmm5 \n" |
|
2141 |
+ "shufps $0, %%xmm4, %%xmm4 \n" |
|
2142 |
+ "shufps $0, %%xmm5, %%xmm5 \n" |
|
2143 |
+ "movaps %%xmm4, (%1, %0, 4) \n" |
|
2144 |
+ "movaps %%xmm5, 16(%1, %0, 4) \n" |
|
2145 |
+ "jg 1b \n" |
|
2146 |
+ : "+&r"(j) |
|
2147 |
+ : "r"(matrix_simd), "r"(matrix) |
|
2148 |
+ : "memory" |
|
2141 | 2149 |
); |
2142 |
- if(out_ch == 2) { |
|
2150 |
+ if (out_ch == 2) { |
|
2143 | 2151 |
MIX_MISC(IF1); |
2144 | 2152 |
} else { |
2145 | 2153 |
MIX_MISC(IF0); |
... | ... |
@@ -2147,216 +2411,232 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_c |
2147 | 2147 |
} |
2148 | 2148 |
} |
2149 | 2149 |
|
2150 |
-static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){ |
|
2151 |
- x86_reg i = (len-4)*4; |
|
2152 |
- __asm__ volatile( |
|
2153 |
- "1: \n\t" |
|
2154 |
- "movq (%2,%0), %%mm0 \n\t" |
|
2155 |
- "movq 8(%2,%0), %%mm1 \n\t" |
|
2156 |
- "pfmul (%3,%0), %%mm0 \n\t" |
|
2157 |
- "pfmul 8(%3,%0), %%mm1 \n\t" |
|
2158 |
- "movq %%mm0, (%1,%0) \n\t" |
|
2159 |
- "movq %%mm1, 8(%1,%0) \n\t" |
|
2160 |
- "sub $16, %0 \n\t" |
|
2161 |
- "jge 1b \n\t" |
|
2162 |
- "femms \n\t" |
|
2163 |
- :"+r"(i) |
|
2164 |
- :"r"(dst), "r"(src0), "r"(src1) |
|
2165 |
- :"memory" |
|
2150 |
+static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, |
|
2151 |
+ int len) |
|
2152 |
+{ |
|
2153 |
+ x86_reg i = (len - 4) * 4; |
|
2154 |
+ __asm__ volatile ( |
|
2155 |
+ "1: \n\t" |
|
2156 |
+ "movq (%2, %0), %%mm0 \n\t" |
|
2157 |
+ "movq 8(%2, %0), %%mm1 \n\t" |
|
2158 |
+ "pfmul (%3, %0), %%mm0 \n\t" |
|
2159 |
+ "pfmul 8(%3, %0), %%mm1 \n\t" |
|
2160 |
+ "movq %%mm0, (%1, %0) \n\t" |
|
2161 |
+ "movq %%mm1, 8(%1, %0) \n\t" |
|
2162 |
+ "sub $16, %0 \n\t" |
|
2163 |
+ "jge 1b \n\t" |
|
2164 |
+ "femms \n\t" |
|
2165 |
+ : "+r"(i) |
|
2166 |
+ : "r"(dst), "r"(src0), "r"(src1) |
|
2167 |
+ : "memory" |
|
2166 | 2168 |
); |
2167 | 2169 |
} |
2168 |
-static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){ |
|
2169 |
- x86_reg i = (len-8)*4; |
|
2170 |
- __asm__ volatile( |
|
2171 |
- "1: \n\t" |
|
2172 |
- "movaps (%2,%0), %%xmm0 \n\t" |
|
2173 |
- "movaps 16(%2,%0), %%xmm1 \n\t" |
|
2174 |
- "mulps (%3,%0), %%xmm0 \n\t" |
|
2175 |
- "mulps 16(%3,%0), %%xmm1 \n\t" |
|
2176 |
- "movaps %%xmm0, (%1,%0) \n\t" |
|
2177 |
- "movaps %%xmm1, 16(%1,%0) \n\t" |
|
2178 |
- "sub $32, %0 \n\t" |
|
2179 |
- "jge 1b \n\t" |
|
2180 |
- :"+r"(i) |
|
2181 |
- :"r"(dst), "r"(src0), "r"(src1) |
|
2182 |
- :"memory" |
|
2170 |
+ |
|
2171 |
+static void vector_fmul_sse(float *dst, const float *src0, const float *src1, |
|
2172 |
+ int len) |
|
2173 |
+{ |
|
2174 |
+ x86_reg i = (len - 8) * 4; |
|
2175 |
+ __asm__ volatile ( |
|
2176 |
+ "1: \n\t" |
|
2177 |
+ "movaps (%2, %0), %%xmm0 \n\t" |
|
2178 |
+ "movaps 16(%2, %0), %%xmm1 \n\t" |
|
2179 |
+ "mulps (%3, %0), %%xmm0 \n\t" |
|
2180 |
+ "mulps 16(%3, %0), %%xmm1 \n\t" |
|
2181 |
+ "movaps %%xmm0, (%1, %0) \n\t" |
|
2182 |
+ "movaps %%xmm1, 16(%1, %0) \n\t" |
|
2183 |
+ "sub $32, %0 \n\t" |
|
2184 |
+ "jge 1b \n\t" |
|
2185 |
+ : "+r"(i) |
|
2186 |
+ : "r"(dst), "r"(src0), "r"(src1) |
|
2187 |
+ : "memory" |
|
2183 | 2188 |
); |
2184 | 2189 |
} |
2185 | 2190 |
|
2186 |
-static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ |
|
2187 |
- x86_reg i = len*4-16; |
|
2188 |
- __asm__ volatile( |
|
2189 |
- "1: \n\t" |
|
2190 |
- "pswapd 8(%1), %%mm0 \n\t" |
|
2191 |
- "pswapd (%1), %%mm1 \n\t" |
|
2192 |
- "pfmul (%3,%0), %%mm0 \n\t" |
|
2193 |
- "pfmul 8(%3,%0), %%mm1 \n\t" |
|
2194 |
- "movq %%mm0, (%2,%0) \n\t" |
|
2195 |
- "movq %%mm1, 8(%2,%0) \n\t" |
|
2196 |
- "add $16, %1 \n\t" |
|
2197 |
- "sub $16, %0 \n\t" |
|
2198 |
- "jge 1b \n\t" |
|
2199 |
- :"+r"(i), "+r"(src1) |
|
2200 |
- :"r"(dst), "r"(src0) |
|
2191 |
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, |
|
2192 |
+ const float *src1, int len) |
|
2193 |
+{ |
|
2194 |
+ x86_reg i = len * 4 - 16; |
|
2195 |
+ __asm__ volatile ( |
|
2196 |
+ "1: \n\t" |
|
2197 |
+ "pswapd 8(%1), %%mm0 \n\t" |
|
2198 |
+ "pswapd (%1), %%mm1 \n\t" |
|
2199 |
+ "pfmul (%3, %0), %%mm0 \n\t" |
|
2200 |
+ "pfmul 8(%3, %0), %%mm1 \n\t" |
|
2201 |
+ "movq %%mm0, (%2, %0) \n\t" |
|
2202 |
+ "movq %%mm1, 8(%2, %0) \n\t" |
|
2203 |
+ "add $16, %1 \n\t" |
|
2204 |
+ "sub $16, %0 \n\t" |
|
2205 |
+ "jge 1b \n\t" |
|
2206 |
+ : "+r"(i), "+r"(src1) |
|
2207 |
+ : "r"(dst), "r"(src0) |
|
2201 | 2208 |
); |
2202 |
- __asm__ volatile("femms"); |
|
2209 |
+ __asm__ volatile ("femms"); |
|
2203 | 2210 |
} |
2204 |
-static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ |
|
2205 |
- x86_reg i = len*4-32; |
|
2206 |
- __asm__ volatile( |
|
2207 |
- "1: \n\t" |
|
2208 |
- "movaps 16(%1), %%xmm0 \n\t" |
|
2209 |
- "movaps (%1), %%xmm1 \n\t" |
|
2210 |
- "shufps $0x1b, %%xmm0, %%xmm0 \n\t" |
|
2211 |
- "shufps $0x1b, %%xmm1, %%xmm1 \n\t" |
|
2212 |
- "mulps (%3,%0), %%xmm0 \n\t" |
|
2213 |
- "mulps 16(%3,%0), %%xmm1 \n\t" |
|
2214 |
- "movaps %%xmm0, (%2,%0) \n\t" |
|
2215 |
- "movaps %%xmm1, 16(%2,%0) \n\t" |
|
2216 |
- "add $32, %1 \n\t" |
|
2217 |
- "sub $32, %0 \n\t" |
|
2218 |
- "jge 1b \n\t" |
|
2219 |
- :"+r"(i), "+r"(src1) |
|
2220 |
- :"r"(dst), "r"(src0) |
|
2211 |
+ |
|
2212 |
+static void vector_fmul_reverse_sse(float *dst, const float *src0, |
|
2213 |
+ const float *src1, int len) |
|
2214 |
+{ |
|
2215 |
+ x86_reg i = len * 4 - 32; |
|
2216 |
+ __asm__ volatile ( |
|
2217 |
+ "1: \n\t" |
|
2218 |
+ "movaps 16(%1), %%xmm0 \n\t" |
|
2219 |
+ "movaps (%1), %%xmm1 \n\t" |
|
2220 |
+ "shufps $0x1b, %%xmm0, %%xmm0 \n\t" |
|
2221 |
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t" |
|
2222 |
+ "mulps (%3, %0), %%xmm0 \n\t" |
|
2223 |
+ "mulps 16(%3, %0), %%xmm1 \n\t" |
|
2224 |
+ "movaps %%xmm0, (%2, %0) \n\t" |
|
2225 |
+ "movaps %%xmm1, 16(%2, %0) \n\t" |
|
2226 |
+ "add $32, %1 \n\t" |
|
2227 |
+ "sub $32, %0 \n\t" |
|
2228 |
+ "jge 1b \n\t" |
|
2229 |
+ : "+r"(i), "+r"(src1) |
|
2230 |
+ : "r"(dst), "r"(src0) |
|
2221 | 2231 |
); |
2222 | 2232 |
} |
2223 | 2233 |
|
2224 |
-static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1, |
|
2225 |
- const float *src2, int len){ |
|
2226 |
- x86_reg i = (len-4)*4; |
|
2227 |
- __asm__ volatile( |
|
2228 |
- "1: \n\t" |
|
2229 |
- "movq (%2,%0), %%mm0 \n\t" |
|
2230 |
- "movq 8(%2,%0), %%mm1 \n\t" |
|
2231 |
- "pfmul (%3,%0), %%mm0 \n\t" |
|
2232 |
- "pfmul 8(%3,%0), %%mm1 \n\t" |
|
2233 |
- "pfadd (%4,%0), %%mm0 \n\t" |
|
2234 |
- "pfadd 8(%4,%0), %%mm1 \n\t" |
|
2235 |
- "movq %%mm0, (%1,%0) \n\t" |
|
2236 |
- "movq %%mm1, 8(%1,%0) \n\t" |
|
2237 |
- "sub $16, %0 \n\t" |
|
2238 |
- "jge 1b \n\t" |
|
2239 |
- :"+r"(i) |
|
2240 |
- :"r"(dst), "r"(src0), "r"(src1), "r"(src2) |
|
2241 |
- :"memory" |
|
2234 |
+static void vector_fmul_add_3dnow(float *dst, const float *src0, |
|
2235 |
+ const float *src1, const float *src2, int len) |
|
2236 |
+{ |
|
2237 |
+ x86_reg i = (len - 4) * 4; |
|
2238 |
+ __asm__ volatile ( |
|
2239 |
+ "1: \n\t" |
|
2240 |
+ "movq (%2, %0), %%mm0 \n\t" |
|
2241 |
+ "movq 8(%2, %0), %%mm1 \n\t" |
|
2242 |
+ "pfmul (%3, %0), %%mm0 \n\t" |
|
2243 |
+ "pfmul 8(%3, %0), %%mm1 \n\t" |
|
2244 |
+ "pfadd (%4, %0), %%mm0 \n\t" |
|
2245 |
+ "pfadd 8(%4, %0), %%mm1 \n\t" |
|
2246 |
+ "movq %%mm0, (%1, %0) \n\t" |
|
2247 |
+ "movq %%mm1, 8(%1, %0) \n\t" |
|
2248 |
+ "sub $16, %0 \n\t" |
|
2249 |
+ "jge 1b \n\t" |
|
2250 |
+ : "+r"(i) |
|
2251 |
+ : "r"(dst), "r"(src0), "r"(src1), "r"(src2) |
|
2252 |
+ : "memory" |
|
2242 | 2253 |
); |
2243 |
- __asm__ volatile("femms"); |
|
2254 |
+ __asm__ volatile ("femms"); |
|
2244 | 2255 |
} |
2245 |
-static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1, |
|
2246 |
- const float *src2, int len){ |
|
2247 |
- x86_reg i = (len-8)*4; |
|
2248 |
- __asm__ volatile( |
|
2249 |
- "1: \n\t" |
|
2250 |
- "movaps (%2,%0), %%xmm0 \n\t" |
|
2251 |
- "movaps 16(%2,%0), %%xmm1 \n\t" |
|
2252 |
- "mulps (%3,%0), %%xmm0 \n\t" |
|
2253 |
- "mulps 16(%3,%0), %%xmm1 \n\t" |
|
2254 |
- "addps (%4,%0), %%xmm0 \n\t" |
|
2255 |
- "addps 16(%4,%0), %%xmm1 \n\t" |
|
2256 |
- "movaps %%xmm0, (%1,%0) \n\t" |
|
2257 |
- "movaps %%xmm1, 16(%1,%0) \n\t" |
|
2258 |
- "sub $32, %0 \n\t" |
|
2259 |
- "jge 1b \n\t" |
|
2260 |
- :"+r"(i) |
|
2261 |
- :"r"(dst), "r"(src0), "r"(src1), "r"(src2) |
|
2262 |
- :"memory" |
|
2256 |
+ |
|
2257 |
+static void vector_fmul_add_sse(float *dst, const float *src0, |
|
2258 |
+ const float *src1, const float *src2, int len) |
|
2259 |
+{ |
|
2260 |
+ x86_reg i = (len - 8) * 4; |
|
2261 |
+ __asm__ volatile ( |
|
2262 |
+ "1: \n\t" |
|
2263 |
+ "movaps (%2, %0), %%xmm0 \n\t" |
|
2264 |
+ "movaps 16(%2, %0), %%xmm1 \n\t" |
|
2265 |
+ "mulps (%3, %0), %%xmm0 \n\t" |
|
2266 |
+ "mulps 16(%3, %0), %%xmm1 \n\t" |
|
2267 |
+ "addps (%4, %0), %%xmm0 \n\t" |
|
2268 |
+ "addps 16(%4, %0), %%xmm1 \n\t" |
|
2269 |
+ "movaps %%xmm0, (%1, %0) \n\t" |
|
2270 |
+ "movaps %%xmm1, 16(%1, %0) \n\t" |
|
2271 |
+ "sub $32, %0 \n\t" |
|
2272 |
+ "jge 1b \n\t" |
|
2273 |
+ : "+r"(i) |
|
2274 |
+ : "r"(dst), "r"(src0), "r"(src1), "r"(src2) |
|
2275 |
+ : "memory" |
|
2263 | 2276 |
); |
2264 | 2277 |
} |
2265 | 2278 |
|
2266 | 2279 |
#if HAVE_6REGS |
2267 |
-static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, |
|
2268 |
- const float *win, int len){ |
|
2269 |
- x86_reg i = -len*4; |
|
2270 |
- x86_reg j = len*4-8; |
|
2271 |
- __asm__ volatile( |
|
2272 |
- "1: \n" |
|
2273 |
- "pswapd (%5,%1), %%mm1 \n" |
|
2274 |
- "movq (%5,%0), %%mm0 \n" |
|
2275 |
- "pswapd (%4,%1), %%mm5 \n" |
|
2276 |
- "movq (%3,%0), %%mm4 \n" |
|
2277 |
- "movq %%mm0, %%mm2 \n" |
|
2278 |
- "movq %%mm1, %%mm3 \n" |
|
2279 |
- "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] |
|
2280 |
- "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] |
|
2281 |
- "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] |
|
2282 |
- "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] |
|
2283 |
- "pfadd %%mm3, %%mm2 \n" |
|
2284 |
- "pfsub %%mm0, %%mm1 \n" |
|
2285 |
- "pswapd %%mm2, %%mm2 \n" |
|
2286 |
- "movq %%mm1, (%2,%0) \n" |
|
2287 |
- "movq %%mm2, (%2,%1) \n" |
|
2288 |
- "sub $8, %1 \n" |
|
2289 |
- "add $8, %0 \n" |
|
2290 |
- "jl 1b \n" |
|
2291 |
- "femms \n" |
|
2292 |
- :"+r"(i), "+r"(j) |
|
2293 |
- :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) |
|
2280 |
+static void vector_fmul_window_3dnow2(float *dst, const float *src0, |
|
2281 |
+ const float *src1, const float *win, |
|
2282 |
+ int len) |
|
2283 |
+{ |
|
2284 |
+ x86_reg i = -len * 4; |
|
2285 |
+ x86_reg j = len * 4 - 8; |
|
2286 |
+ __asm__ volatile ( |
|
2287 |
+ "1: \n" |
|
2288 |
+ "pswapd (%5, %1), %%mm1 \n" |
|
2289 |
+ "movq (%5, %0), %%mm0 \n" |
|
2290 |
+ "pswapd (%4, %1), %%mm5 \n" |
|
2291 |
+ "movq (%3, %0), %%mm4 \n" |
|
2292 |
+ "movq %%mm0, %%mm2 \n" |
|
2293 |
+ "movq %%mm1, %%mm3 \n" |
|
2294 |
+ "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i] |
|
2295 |
+ "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j] |
|
2296 |
+ "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j] |
|
2297 |
+ "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i] |
|
2298 |
+ "pfadd %%mm3, %%mm2 \n" |
|
2299 |
+ "pfsub %%mm0, %%mm1 \n" |
|
2300 |
+ "pswapd %%mm2, %%mm2 \n" |
|
2301 |
+ "movq %%mm1, (%2, %0) \n" |
|
2302 |
+ "movq %%mm2, (%2, %1) \n" |
|
2303 |
+ "sub $8, %1 \n" |
|
2304 |
+ "add $8, %0 \n" |
|
2305 |
+ "jl 1b \n" |
|
2306 |
+ "femms \n" |
|
2307 |
+ : "+r"(i), "+r"(j) |
|
2308 |
+ : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) |
|
2294 | 2309 |
); |
2295 | 2310 |
} |
2296 | 2311 |
|
2297 |
-static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, |
|
2298 |
- const float *win, int len){ |
|
2299 |
- x86_reg i = -len*4; |
|
2300 |
- x86_reg j = len*4-16; |
|
2301 |
- __asm__ volatile( |
|
2302 |
- "1: \n" |
|
2303 |
- "movaps (%5,%1), %%xmm1 \n" |
|
2304 |
- "movaps (%5,%0), %%xmm0 \n" |
|
2305 |
- "movaps (%4,%1), %%xmm5 \n" |
|
2306 |
- "movaps (%3,%0), %%xmm4 \n" |
|
2307 |
- "shufps $0x1b, %%xmm1, %%xmm1 \n" |
|
2308 |
- "shufps $0x1b, %%xmm5, %%xmm5 \n" |
|
2309 |
- "movaps %%xmm0, %%xmm2 \n" |
|
2310 |
- "movaps %%xmm1, %%xmm3 \n" |
|
2311 |
- "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] |
|
2312 |
- "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] |
|
2313 |
- "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] |
|
2314 |
- "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] |
|
2315 |
- "addps %%xmm3, %%xmm2 \n" |
|
2316 |
- "subps %%xmm0, %%xmm1 \n" |
|
2317 |
- "shufps $0x1b, %%xmm2, %%xmm2 \n" |
|
2318 |
- "movaps %%xmm1, (%2,%0) \n" |
|
2319 |
- "movaps %%xmm2, (%2,%1) \n" |
|
2320 |
- "sub $16, %1 \n" |
|
2321 |
- "add $16, %0 \n" |
|
2322 |
- "jl 1b \n" |
|
2323 |
- :"+r"(i), "+r"(j) |
|
2324 |
- :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) |
|
2312 |
+static void vector_fmul_window_sse(float *dst, const float *src0, |
|
2313 |
+ const float *src1, const float *win, int len) |
|
2314 |
+{ |
|
2315 |
+ x86_reg i = -len * 4; |
|
2316 |
+ x86_reg j = len * 4 - 16; |
|
2317 |
+ __asm__ volatile ( |
|
2318 |
+ "1: \n" |
|
2319 |
+ "movaps (%5, %1), %%xmm1 \n" |
|
2320 |
+ "movaps (%5, %0), %%xmm0 \n" |
|
2321 |
+ "movaps (%4, %1), %%xmm5 \n" |
|
2322 |
+ "movaps (%3, %0), %%xmm4 \n" |
|
2323 |
+ "shufps $0x1b, %%xmm1, %%xmm1 \n" |
|
2324 |
+ "shufps $0x1b, %%xmm5, %%xmm5 \n" |
|
2325 |
+ "movaps %%xmm0, %%xmm2 \n" |
|
2326 |
+ "movaps %%xmm1, %%xmm3 \n" |
|
2327 |
+ "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i] |
|
2328 |
+ "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j] |
|
2329 |
+ "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j] |
|
2330 |
+ "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i] |
|
2331 |
+ "addps %%xmm3, %%xmm2 \n" |
|
2332 |
+ "subps %%xmm0, %%xmm1 \n" |
|
2333 |
+ "shufps $0x1b, %%xmm2, %%xmm2 \n" |
|
2334 |
+ "movaps %%xmm1, (%2, %0) \n" |
|
2335 |
+ "movaps %%xmm2, (%2, %1) \n" |
|
2336 |
+ "sub $16, %1 \n" |
|
2337 |
+ "add $16, %0 \n" |
|
2338 |
+ "jl 1b \n" |
|
2339 |
+ : "+r"(i), "+r"(j) |
|
2340 |
+ : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len) |
|
2325 | 2341 |
); |
2326 | 2342 |
} |
2327 | 2343 |
#endif /* HAVE_6REGS */ |
2328 | 2344 |
|
2329 |
-static void vector_clipf_sse(float *dst, const float *src, float min, float max, |
|
2330 |
- int len) |
|
2345 |
+static void vector_clipf_sse(float *dst, const float *src, |
|
2346 |
+ float min, float max, int len) |
|
2331 | 2347 |
{ |
2332 |
- x86_reg i = (len-16)*4; |
|
2333 |
- __asm__ volatile( |
|
2334 |
- "movss %3, %%xmm4 \n" |
|
2335 |
- "movss %4, %%xmm5 \n" |
|
2336 |
- "shufps $0, %%xmm4, %%xmm4 \n" |
|
2337 |
- "shufps $0, %%xmm5, %%xmm5 \n" |
|
2338 |
- "1: \n\t" |
|
2339 |
- "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel |
|
2340 |
- "movaps 16(%2,%0), %%xmm1 \n\t" |
|
2341 |
- "movaps 32(%2,%0), %%xmm2 \n\t" |
|
2342 |
- "movaps 48(%2,%0), %%xmm3 \n\t" |
|
2343 |
- "maxps %%xmm4, %%xmm0 \n\t" |
|
2344 |
- "maxps %%xmm4, %%xmm1 \n\t" |
|
2345 |
- "maxps %%xmm4, %%xmm2 \n\t" |
|
2346 |
- "maxps %%xmm4, %%xmm3 \n\t" |
|
2347 |
- "minps %%xmm5, %%xmm0 \n\t" |
|
2348 |
- "minps %%xmm5, %%xmm1 \n\t" |
|
2349 |
- "minps %%xmm5, %%xmm2 \n\t" |
|
2350 |
- "minps %%xmm5, %%xmm3 \n\t" |
|
2351 |
- "movaps %%xmm0, (%1,%0) \n\t" |
|
2352 |
- "movaps %%xmm1, 16(%1,%0) \n\t" |
|
2353 |
- "movaps %%xmm2, 32(%1,%0) \n\t" |
|
2354 |
- "movaps %%xmm3, 48(%1,%0) \n\t" |
|
2355 |
- "sub $64, %0 \n\t" |
|
2356 |
- "jge 1b \n\t" |
|
2357 |
- :"+&r"(i) |
|
2358 |
- :"r"(dst), "r"(src), "m"(min), "m"(max) |
|
2359 |
- :"memory" |
|
2348 |
+ x86_reg i = (len - 16) * 4; |
|
2349 |
+ __asm__ volatile ( |
|
2350 |
+ "movss %3, %%xmm4 \n\t" |
|
2351 |
+ "movss %4, %%xmm5 \n\t" |
|
2352 |
+ "shufps $0, %%xmm4, %%xmm4 \n\t" |
|
2353 |
+ "shufps $0, %%xmm5, %%xmm5 \n\t" |
|
2354 |
+ "1: \n\t" |
|
2355 |
+ "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel |
|
2356 |
+ "movaps 16(%2, %0), %%xmm1 \n\t" |
|
2357 |
+ "movaps 32(%2, %0), %%xmm2 \n\t" |
|
2358 |
+ "movaps 48(%2, %0), %%xmm3 \n\t" |
|
2359 |
+ "maxps %%xmm4, %%xmm0 \n\t" |
|
2360 |
+ "maxps %%xmm4, %%xmm1 \n\t" |
|
2361 |
+ "maxps %%xmm4, %%xmm2 \n\t" |
|
2362 |
+ "maxps %%xmm4, %%xmm3 \n\t" |
|
2363 |
+ "minps %%xmm5, %%xmm0 \n\t" |
|
2364 |
+ "minps %%xmm5, %%xmm1 \n\t" |
|
2365 |
+ "minps %%xmm5, %%xmm2 \n\t" |
|
2366 |
+ "minps %%xmm5, %%xmm3 \n\t" |
|
2367 |
+ "movaps %%xmm0, (%1, %0) \n\t" |
|
2368 |
+ "movaps %%xmm1, 16(%1, %0) \n\t" |
|
2369 |
+ "movaps %%xmm2, 32(%1, %0) \n\t" |
|
2370 |
+ "movaps %%xmm3, 48(%1, %0) \n\t" |
|
2371 |
+ "sub $64, %0 \n\t" |
|
2372 |
+ "jge 1b \n\t" |
|
2373 |
+ : "+&r"(i) |
|
2374 |
+ : "r"(dst), "r"(src), "m"(min), "m"(max) |
|
2375 |
+ : "memory" |
|
2360 | 2376 |
); |
2361 | 2377 |
} |
2362 | 2378 |
|
... | ... |
@@ -2364,7 +2644,8 @@ void ff_vp3_idct_mmx(int16_t *input_data); |
2364 | 2364 |
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block); |
2365 | 2365 |
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block); |
2366 | 2366 |
|
2367 |
-void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block); |
|
2367 |
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, |
|
2368 |
+ const DCTELEM *block); |
|
2368 | 2369 |
|
2369 | 2370 |
void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); |
2370 | 2371 |
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values); |
... | ... |
@@ -2373,11 +2654,19 @@ void ff_vp3_idct_sse2(int16_t *input_data); |
2373 | 2373 |
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block); |
2374 | 2374 |
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block); |
2375 | 2375 |
|
2376 |
-int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift); |
|
2377 |
-int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift); |
|
2378 |
-int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); |
|
2379 |
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); |
|
2380 |
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul); |
|
2376 |
+int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, |
|
2377 |
+ int order, int shift); |
|
2378 |
+int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, |
|
2379 |
+ int order, int shift); |
|
2380 |
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, |
|
2381 |
+ const int16_t *v3, |
|
2382 |
+ int order, int mul); |
|
2383 |
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, |
|
2384 |
+ const int16_t *v3, |
|
2385 |
+ int order, int mul); |
|
2386 |
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, |
|
2387 |
+ const int16_t *v3, |
|
2388 |
+ int order, int mul); |
|
2381 | 2389 |
|
2382 | 2390 |
void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input, |
2383 | 2391 |
const int16_t *window, unsigned int len); |
... | ... |
@@ -2395,27 +2684,32 @@ void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, |
2395 | 2395 |
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); |
2396 | 2396 |
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); |
2397 | 2397 |
|
2398 |
-void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); |
|
2399 |
-int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); |
|
2400 |
-int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); |
|
2398 |
+void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, |
|
2399 |
+ const uint8_t *diff, int w, |
|
2400 |
+ int *left, int *left_top); |
|
2401 |
+int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, |
|
2402 |
+ int w, int left); |
|
2403 |
+int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, |
|
2404 |
+ int w, int left); |
|
2401 | 2405 |
|
2402 | 2406 |
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order); |
2403 | 2407 |
|
2404 |
-void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min, |
|
2405 |
- int32_t max, unsigned int len); |
|
2406 |
-void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min, |
|
2407 |
- int32_t max, unsigned int len); |
|
2408 |
-void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min, |
|
2409 |
- int32_t max, unsigned int len); |
|
2410 |
-void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min, |
|
2411 |
- int32_t max, unsigned int len); |
|
2408 |
+void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, |
|
2409 |
+ int32_t min, int32_t max, unsigned int len); |
|
2410 |
+void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, |
|
2411 |
+ int32_t min, int32_t max, unsigned int len); |
|
2412 |
+void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, |
|
2413 |
+ int32_t min, int32_t max, unsigned int len); |
|
2414 |
+void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, |
|
2415 |
+ int32_t min, int32_t max, unsigned int len); |
|
2412 | 2416 |
|
2413 | 2417 |
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0, |
2414 | 2418 |
const float *src1, int len); |
2415 | 2419 |
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0, |
2416 | 2420 |
const float *src1, int len); |
2417 | 2421 |
|
2418 |
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ |
|
2422 |
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ |
|
2423 |
+ do { \ |
|
2419 | 2424 |
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ |
2420 | 2425 |
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ |
2421 | 2426 |
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ |
... | ... |
@@ -2431,25 +2725,32 @@ extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0, |
2431 | 2431 |
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ |
2432 | 2432 |
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ |
2433 | 2433 |
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ |
2434 |
- c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU |
|
2435 |
- |
|
2436 |
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
|
2437 |
- c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ |
|
2438 |
- c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ |
|
2439 |
- c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ |
|
2440 |
- c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU |
|
2441 |
- |
|
2442 |
-#define H264_QPEL_FUNCS(x, y, CPU) \ |
|
2443 |
- c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU; \ |
|
2444 |
- c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU; \ |
|
2445 |
- c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU; \ |
|
2446 |
- c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU |
|
2447 |
- |
|
2448 |
-#define H264_QPEL_FUNCS_10(x, y, CPU) \ |
|
2449 |
- c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU; \ |
|
2450 |
- c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU; \ |
|
2451 |
- c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU; \ |
|
2452 |
- c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU; |
|
2434 |
+ c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ |
|
2435 |
+ } while (0) |
|
2436 |
+ |
|
2437 |
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
|
2438 |
+ do { \ |
|
2439 |
+ c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ |
|
2440 |
+ c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ |
|
2441 |
+ c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ |
|
2442 |
+ c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ |
|
2443 |
+ } while (0) |
|
2444 |
+ |
|
2445 |
+#define H264_QPEL_FUNCS(x, y, CPU) \ |
|
2446 |
+ do { \ |
|
2447 |
+ c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \ |
|
2448 |
+ c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \ |
|
2449 |
+ c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \ |
|
2450 |
+ c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \ |
|
2451 |
+ } while (0) |
|
2452 |
+ |
|
2453 |
+#define H264_QPEL_FUNCS_10(x, y, CPU) \ |
|
2454 |
+ do { \ |
|
2455 |
+ c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ |
|
2456 |
+ c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ |
|
2457 |
+ c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ |
|
2458 |
+ c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ |
|
2459 |
+ } while (0) |
|
2453 | 2460 |
|
2454 | 2461 |
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) |
2455 | 2462 |
{ |
... | ... |
@@ -2464,18 +2765,18 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) |
2464 | 2464 |
c->clear_blocks = clear_blocks_mmx; |
2465 | 2465 |
c->draw_edges = draw_edges_mmx; |
2466 | 2466 |
|
2467 |
- SET_HPEL_FUNCS(put, 0, 16, mmx); |
|
2467 |
+ SET_HPEL_FUNCS(put, 0, 16, mmx); |
|
2468 | 2468 |
SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx); |
2469 |
- SET_HPEL_FUNCS(avg, 0, 16, mmx); |
|
2469 |
+ SET_HPEL_FUNCS(avg, 0, 16, mmx); |
|
2470 | 2470 |
SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx); |
2471 |
- SET_HPEL_FUNCS(put, 1, 8, mmx); |
|
2472 |
- SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); |
|
2473 |
- SET_HPEL_FUNCS(avg, 1, 8, mmx); |
|
2474 |
- SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); |
|
2471 |
+ SET_HPEL_FUNCS(put, 1, 8, mmx); |
|
2472 |
+ SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx); |
|
2473 |
+ SET_HPEL_FUNCS(avg, 1, 8, mmx); |
|
2474 |
+ SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx); |
|
2475 | 2475 |
} |
2476 | 2476 |
|
2477 | 2477 |
#if ARCH_X86_32 || !HAVE_YASM |
2478 |
- c->gmc= gmc_mmx; |
|
2478 |
+ c->gmc = gmc_mmx; |
|
2479 | 2479 |
#endif |
2480 | 2480 |
#if ARCH_X86_32 && HAVE_YASM |
2481 | 2481 |
if (!high_bit_depth) |
... | ... |
@@ -2543,47 +2844,48 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, |
2543 | 2543 |
c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2; |
2544 | 2544 |
} |
2545 | 2545 |
} |
2546 |
- if (CONFIG_VP3_DECODER && HAVE_YASM) { |
|
2546 |
+ if (CONFIG_VP3_DECODER && HAVE_YASM) |
|
2547 | 2547 |
c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2; |
2548 |
- } |
|
2549 | 2548 |
|
2550 |
- if (CONFIG_VP3_DECODER |
|
2551 |
- && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) { |
|
2549 |
+ if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 || |
|
2550 |
+ avctx->codec_id == CODEC_ID_THEORA)) { |
|
2552 | 2551 |
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2; |
2553 | 2552 |
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; |
2554 | 2553 |
} |
2555 | 2554 |
|
2556 |
- SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ); |
|
2557 |
- SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, ); |
|
2558 |
- SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); |
|
2559 |
- SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, ); |
|
2560 |
- SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, ); |
|
2561 |
- SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, ); |
|
2555 |
+ if (CONFIG_H264QPEL) { |
|
2556 |
+ SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ); |
|
2557 |
+ SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, ); |
|
2558 |
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); |
|
2559 |
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, ); |
|
2560 |
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, ); |
|
2561 |
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, ); |
|
2562 | 2562 |
|
2563 |
- if (!high_bit_depth) { |
|
2564 |
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, ); |
|
2565 |
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, ); |
|
2566 |
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, ); |
|
2567 |
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, ); |
|
2568 |
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, ); |
|
2569 |
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, ); |
|
2570 |
- } else if (bit_depth == 10) { |
|
2563 |
+ if (!high_bit_depth) { |
|
2564 |
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, ); |
|
2565 |
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, ); |
|
2566 |
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, ); |
|
2567 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, ); |
|
2568 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, ); |
|
2569 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, ); |
|
2570 |
+ } else if (bit_depth == 10) { |
|
2571 | 2571 |
#if HAVE_YASM |
2572 | 2572 |
#if !ARCH_X86_64 |
2573 |
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); |
|
2574 |
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); |
|
2575 |
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); |
|
2576 |
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); |
|
2573 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); |
|
2574 |
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); |
|
2575 |
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); |
|
2576 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); |
|
2577 | 2577 |
#endif |
2578 |
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); |
|
2579 |
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); |
|
2578 |
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); |
|
2579 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); |
|
2580 | 2580 |
#endif |
2581 |
- } |
|
2581 |
+ } |
|
2582 | 2582 |
|
2583 |
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, ); |
|
2584 |
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, ); |
|
2585 |
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, ); |
|
2586 |
- SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, ); |
|
2583 |
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, ); |
|
2584 |
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, ); |
|
2585 |
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, ); |
|
2586 |
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, ); |
|
2587 |
+ } |
|
2587 | 2588 |
|
2588 | 2589 |
#if HAVE_YASM |
2589 | 2590 |
if (!high_bit_depth && CONFIG_H264CHROMA) { |
... | ... |
@@ -2599,7 +2901,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, |
2599 | 2599 |
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; |
2600 | 2600 |
} |
2601 | 2601 |
|
2602 |
- c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; |
|
2602 |
+ c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; |
|
2603 | 2603 |
|
2604 | 2604 |
c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; |
2605 | 2605 |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; |
... | ... |
@@ -2645,32 +2947,34 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx, |
2645 | 2645 |
} |
2646 | 2646 |
} |
2647 | 2647 |
|
2648 |
- if (CONFIG_VP3_DECODER |
|
2649 |
- && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) { |
|
2648 |
+ if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 || |
|
2649 |
+ avctx->codec_id == CODEC_ID_THEORA)) { |
|
2650 | 2650 |
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow; |
2651 | 2651 |
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow; |
2652 | 2652 |
} |
2653 | 2653 |
|
2654 |
- SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, ); |
|
2655 |
- SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, ); |
|
2656 |
- SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, ); |
|
2657 |
- SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, ); |
|
2658 |
- SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, ); |
|
2659 |
- SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, ); |
|
2654 |
+ if (CONFIG_H264QPEL) { |
|
2655 |
+ SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, ); |
|
2656 |
+ SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, ); |
|
2657 |
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, ); |
|
2658 |
+ SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, ); |
|
2659 |
+ SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, ); |
|
2660 |
+ SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, ); |
|
2660 | 2661 |
|
2661 |
- if (!high_bit_depth) { |
|
2662 |
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, ); |
|
2663 |
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, ); |
|
2664 |
- SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, ); |
|
2665 |
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, ); |
|
2666 |
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, ); |
|
2667 |
- SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, ); |
|
2668 |
- } |
|
2662 |
+ if (!high_bit_depth) { |
|
2663 |
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, ); |
|
2664 |
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, ); |
|
2665 |
+ SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, ); |
|
2666 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, ); |
|
2667 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, ); |
|
2668 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, ); |
|
2669 |
+ } |
|
2669 | 2670 |
|
2670 |
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, ); |
|
2671 |
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); |
|
2672 |
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); |
|
2673 |
- SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, ); |
|
2671 |
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, ); |
|
2672 |
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, ); |
|
2673 |
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, ); |
|
2674 |
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, ); |
|
2675 |
+ } |
|
2674 | 2676 |
|
2675 | 2677 |
#if HAVE_YASM |
2676 | 2678 |
if (!high_bit_depth && CONFIG_H264CHROMA) { |
... | ... |
@@ -2702,7 +3006,7 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags) |
2702 | 2702 |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
2703 | 2703 |
|
2704 | 2704 |
if (!high_bit_depth) { |
2705 |
- if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){ |
|
2705 |
+ if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) { |
|
2706 | 2706 |
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ |
2707 | 2707 |
c->clear_block = clear_block_sse; |
2708 | 2708 |
c->clear_blocks = clear_blocks_sse; |
... | ... |
@@ -2745,11 +3049,12 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, |
2745 | 2745 |
c->put_pixels_tab[0][0] = put_pixels16_sse2; |
2746 | 2746 |
c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2; |
2747 | 2747 |
c->avg_pixels_tab[0][0] = avg_pixels16_sse2; |
2748 |
- H264_QPEL_FUNCS(0, 0, sse2); |
|
2748 |
+ if (CONFIG_H264QPEL) |
|
2749 |
+ H264_QPEL_FUNCS(0, 0, sse2); |
|
2749 | 2750 |
} |
2750 | 2751 |
} |
2751 | 2752 |
|
2752 |
- if (!high_bit_depth) { |
|
2753 |
+ if (!high_bit_depth && CONFIG_H264QPEL) { |
|
2753 | 2754 |
H264_QPEL_FUNCS(0, 1, sse2); |
2754 | 2755 |
H264_QPEL_FUNCS(0, 2, sse2); |
2755 | 2756 |
H264_QPEL_FUNCS(0, 3, sse2); |
... | ... |
@@ -2766,14 +3071,15 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, |
2766 | 2766 |
|
2767 | 2767 |
#if HAVE_YASM |
2768 | 2768 |
if (bit_depth == 10) { |
2769 |
- SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); |
|
2770 |
- SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); |
|
2771 |
- SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); |
|
2772 |
- SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); |
|
2773 |
- H264_QPEL_FUNCS_10(1, 0, sse2_cache64); |
|
2774 |
- H264_QPEL_FUNCS_10(2, 0, sse2_cache64); |
|
2775 |
- H264_QPEL_FUNCS_10(3, 0, sse2_cache64); |
|
2776 |
- |
|
2769 |
+ if (CONFIG_H264QPEL) { |
|
2770 |
+ SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); |
|
2771 |
+ SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); |
|
2772 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); |
|
2773 |
+ SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); |
|
2774 |
+ H264_QPEL_FUNCS_10(1, 0, sse2_cache64); |
|
2775 |
+ H264_QPEL_FUNCS_10(2, 0, sse2_cache64); |
|
2776 |
+ H264_QPEL_FUNCS_10(3, 0, sse2_cache64); |
|
2777 |
+ } |
|
2777 | 2778 |
if (CONFIG_H264CHROMA) { |
2778 | 2779 |
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2; |
2779 | 2780 |
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2; |
... | ... |
@@ -2789,7 +3095,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, |
2789 | 2789 |
} |
2790 | 2790 |
if (avctx->flags & CODEC_FLAG_BITEXACT) { |
2791 | 2791 |
c->apply_window_int16 = ff_apply_window_int16_sse2_ba; |
2792 |
- } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { |
|
2792 |
+ } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { |
|
2793 | 2793 |
c->apply_window_int16 = ff_apply_window_int16_sse2; |
2794 | 2794 |
} |
2795 | 2795 |
c->bswap_buf = ff_bswap32_buf_sse2; |
... | ... |
@@ -2803,7 +3109,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, |
2803 | 2803 |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
2804 | 2804 |
const int bit_depth = avctx->bits_per_raw_sample; |
2805 | 2805 |
|
2806 |
- if (!high_bit_depth) { |
|
2806 |
+ if (!high_bit_depth && CONFIG_H264QPEL) { |
|
2807 | 2807 |
H264_QPEL_FUNCS(1, 0, ssse3); |
2808 | 2808 |
H264_QPEL_FUNCS(1, 1, ssse3); |
2809 | 2809 |
H264_QPEL_FUNCS(1, 2, ssse3); |
... | ... |
@@ -2818,7 +3124,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, |
2818 | 2818 |
H264_QPEL_FUNCS(3, 3, ssse3); |
2819 | 2819 |
} |
2820 | 2820 |
#if HAVE_YASM |
2821 |
- else if (bit_depth == 10) { |
|
2821 |
+ else if (bit_depth == 10 && CONFIG_H264QPEL) { |
|
2822 | 2822 |
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); |
2823 | 2823 |
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); |
2824 | 2824 |
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); |
... | ... |
@@ -2833,14 +3139,12 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, |
2833 | 2833 |
if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe |
2834 | 2834 |
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; |
2835 | 2835 |
|
2836 |
- if (mm_flags & AV_CPU_FLAG_ATOM) { |
|
2836 |
+ if (mm_flags & AV_CPU_FLAG_ATOM) |
|
2837 | 2837 |
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; |
2838 |
- } else { |
|
2838 |
+ else |
|
2839 | 2839 |
c->apply_window_int16 = ff_apply_window_int16_ssse3; |
2840 |
- } |
|
2841 |
- if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit |
|
2840 |
+ if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit |
|
2842 | 2841 |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; |
2843 |
- } |
|
2844 | 2842 |
c->bswap_buf = ff_bswap32_buf_ssse3; |
2845 | 2843 |
#endif |
2846 | 2844 |
#endif |
... | ... |
@@ -2862,9 +3166,11 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags) |
2862 | 2862 |
if (bit_depth == 10) { |
2863 | 2863 |
// AVX implies !cache64. |
2864 | 2864 |
// TODO: Port cache(32|64) detection from x264. |
2865 |
- H264_QPEL_FUNCS_10(1, 0, sse2); |
|
2866 |
- H264_QPEL_FUNCS_10(2, 0, sse2); |
|
2867 |
- H264_QPEL_FUNCS_10(3, 0, sse2); |
|
2865 |
+ if (CONFIG_H264QPEL) { |
|
2866 |
+ H264_QPEL_FUNCS_10(1, 0, sse2); |
|
2867 |
+ H264_QPEL_FUNCS_10(2, 0, sse2); |
|
2868 |
+ H264_QPEL_FUNCS_10(3, 0, sse2); |
|
2869 |
+ } |
|
2868 | 2870 |
|
2869 | 2871 |
if (CONFIG_H264CHROMA) { |
2870 | 2872 |
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx; |
... | ... |
@@ -2875,13 +3181,13 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags) |
2875 | 2875 |
#endif |
2876 | 2876 |
} |
2877 | 2877 |
|
2878 |
-void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
|
2878 |
+void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx) |
|
2879 | 2879 |
{ |
2880 | 2880 |
int mm_flags = av_get_cpu_flags(); |
2881 | 2881 |
|
2882 | 2882 |
if (avctx->dsp_mask) { |
2883 | 2883 |
if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) |
2884 |
- mm_flags |= (avctx->dsp_mask & 0xffff); |
|
2884 |
+ mm_flags |= avctx->dsp_mask & 0xffff; |
|
2885 | 2885 |
else |
2886 | 2886 |
mm_flags &= ~(avctx->dsp_mask & 0xffff); |
2887 | 2887 |
} |
... | ... |
@@ -2902,56 +3208,57 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
2902 | 2902 |
#endif |
2903 | 2903 |
|
2904 | 2904 |
if (mm_flags & AV_CPU_FLAG_MMX) { |
2905 |
- const int idct_algo= avctx->idct_algo; |
|
2905 |
+ const int idct_algo = avctx->idct_algo; |
|
2906 | 2906 |
|
2907 | 2907 |
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { |
2908 |
- if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ |
|
2909 |
- c->idct_put= ff_simple_idct_put_mmx; |
|
2910 |
- c->idct_add= ff_simple_idct_add_mmx; |
|
2911 |
- c->idct = ff_simple_idct_mmx; |
|
2912 |
- c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; |
|
2908 |
+ if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { |
|
2909 |
+ c->idct_put = ff_simple_idct_put_mmx; |
|
2910 |
+ c->idct_add = ff_simple_idct_add_mmx; |
|
2911 |
+ c->idct = ff_simple_idct_mmx; |
|
2912 |
+ c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; |
|
2913 | 2913 |
#if CONFIG_GPL |
2914 |
- }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ |
|
2915 |
- if(mm_flags & AV_CPU_FLAG_MMX2){ |
|
2916 |
- c->idct_put= ff_libmpeg2mmx2_idct_put; |
|
2917 |
- c->idct_add= ff_libmpeg2mmx2_idct_add; |
|
2918 |
- c->idct = ff_mmxext_idct; |
|
2919 |
- }else{ |
|
2920 |
- c->idct_put= ff_libmpeg2mmx_idct_put; |
|
2921 |
- c->idct_add= ff_libmpeg2mmx_idct_add; |
|
2922 |
- c->idct = ff_mmx_idct; |
|
2914 |
+ } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) { |
|
2915 |
+ if (mm_flags & AV_CPU_FLAG_MMX2) { |
|
2916 |
+ c->idct_put = ff_libmpeg2mmx2_idct_put; |
|
2917 |
+ c->idct_add = ff_libmpeg2mmx2_idct_add; |
|
2918 |
+ c->idct = ff_mmxext_idct; |
|
2919 |
+ } else { |
|
2920 |
+ c->idct_put = ff_libmpeg2mmx_idct_put; |
|
2921 |
+ c->idct_add = ff_libmpeg2mmx_idct_add; |
|
2922 |
+ c->idct = ff_mmx_idct; |
|
2923 | 2923 |
} |
2924 |
- c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; |
|
2924 |
+ c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; |
|
2925 | 2925 |
#endif |
2926 |
- }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) && |
|
2927 |
- idct_algo==FF_IDCT_VP3 && HAVE_YASM){ |
|
2928 |
- if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
2929 |
- c->idct_put= ff_vp3_idct_put_sse2; |
|
2930 |
- c->idct_add= ff_vp3_idct_add_sse2; |
|
2931 |
- c->idct = ff_vp3_idct_sse2; |
|
2932 |
- c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; |
|
2933 |
- }else{ |
|
2934 |
- c->idct_put= ff_vp3_idct_put_mmx; |
|
2935 |
- c->idct_add= ff_vp3_idct_add_mmx; |
|
2936 |
- c->idct = ff_vp3_idct_mmx; |
|
2937 |
- c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; |
|
2926 |
+ } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || |
|
2927 |
+ CONFIG_VP6_DECODER) && |
|
2928 |
+ idct_algo == FF_IDCT_VP3 && HAVE_YASM) { |
|
2929 |
+ if (mm_flags & AV_CPU_FLAG_SSE2) { |
|
2930 |
+ c->idct_put = ff_vp3_idct_put_sse2; |
|
2931 |
+ c->idct_add = ff_vp3_idct_add_sse2; |
|
2932 |
+ c->idct = ff_vp3_idct_sse2; |
|
2933 |
+ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; |
|
2934 |
+ } else { |
|
2935 |
+ c->idct_put = ff_vp3_idct_put_mmx; |
|
2936 |
+ c->idct_add = ff_vp3_idct_add_mmx; |
|
2937 |
+ c->idct = ff_vp3_idct_mmx; |
|
2938 |
+ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; |
|
2938 | 2939 |
} |
2939 |
- }else if(idct_algo==FF_IDCT_CAVS){ |
|
2940 |
- c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; |
|
2941 |
- }else if(idct_algo==FF_IDCT_XVIDMMX){ |
|
2942 |
- if(mm_flags & AV_CPU_FLAG_SSE2){ |
|
2943 |
- c->idct_put= ff_idct_xvid_sse2_put; |
|
2944 |
- c->idct_add= ff_idct_xvid_sse2_add; |
|
2945 |
- c->idct = ff_idct_xvid_sse2; |
|
2946 |
- c->idct_permutation_type= FF_SSE2_IDCT_PERM; |
|
2947 |
- }else if(mm_flags & AV_CPU_FLAG_MMX2){ |
|
2948 |
- c->idct_put= ff_idct_xvid_mmx2_put; |
|
2949 |
- c->idct_add= ff_idct_xvid_mmx2_add; |
|
2950 |
- c->idct = ff_idct_xvid_mmx2; |
|
2951 |
- }else{ |
|
2952 |
- c->idct_put= ff_idct_xvid_mmx_put; |
|
2953 |
- c->idct_add= ff_idct_xvid_mmx_add; |
|
2954 |
- c->idct = ff_idct_xvid_mmx; |
|
2940 |
+ } else if (idct_algo == FF_IDCT_CAVS) { |
|
2941 |
+ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; |
|
2942 |
+ } else if (idct_algo == FF_IDCT_XVIDMMX) { |
|
2943 |
+ if (mm_flags & AV_CPU_FLAG_SSE2) { |
|
2944 |
+ c->idct_put = ff_idct_xvid_sse2_put; |
|
2945 |
+ c->idct_add = ff_idct_xvid_sse2_add; |
|
2946 |
+ c->idct = ff_idct_xvid_sse2; |
|
2947 |
+ c->idct_permutation_type = FF_SSE2_IDCT_PERM; |
|
2948 |
+ } else if (mm_flags & AV_CPU_FLAG_MMX2) { |
|
2949 |
+ c->idct_put = ff_idct_xvid_mmx2_put; |
|
2950 |
+ c->idct_add = ff_idct_xvid_mmx2_add; |
|
2951 |
+ c->idct = ff_idct_xvid_mmx2; |
|
2952 |
+ } else { |
|
2953 |
+ c->idct_put = ff_idct_xvid_mmx_put; |
|
2954 |
+ c->idct_add = ff_idct_xvid_mmx_add; |
|
2955 |
+ c->idct = ff_idct_xvid_mmx; |
|
2955 | 2956 |
} |
2956 | 2957 |
} |
2957 | 2958 |
} |
... | ... |
@@ -2962,13 +3269,13 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
2962 | 2962 |
if (mm_flags & AV_CPU_FLAG_MMX2) |
2963 | 2963 |
dsputil_init_mmx2(c, avctx, mm_flags); |
2964 | 2964 |
|
2965 |
- if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) |
|
2965 |
+ if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) |
|
2966 | 2966 |
dsputil_init_3dnow(c, avctx, mm_flags); |
2967 | 2967 |
|
2968 |
- if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) |
|
2968 |
+ if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) |
|
2969 | 2969 |
dsputil_init_3dnow2(c, avctx, mm_flags); |
2970 | 2970 |
|
2971 |
- if (HAVE_SSE && (mm_flags & AV_CPU_FLAG_SSE)) |
|
2971 |
+ if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) |
|
2972 | 2972 |
dsputil_init_sse(c, avctx, mm_flags); |
2973 | 2973 |
|
2974 | 2974 |
if (mm_flags & AV_CPU_FLAG_SSE2) |
... | ... |
@@ -2209,14 +2209,11 @@ static int mov_write_isml_manifest(AVIOContext *pb, MOVMuxContext *mov) |
2209 | 2209 |
size); |
2210 | 2210 |
av_free(ptr); |
2211 | 2211 |
} |
2212 |
- } else { |
|
2213 |
- param_write_hex(pb, "CodecPrivateData", track->enc->extradata, |
|
2214 |
- track->enc->extradata_size); |
|
2215 |
- } |
|
2216 |
- if (track->enc->codec_id == CODEC_ID_H264) { |
|
2217 | 2212 |
param_write_string(pb, "FourCC", "H264"); |
2218 | 2213 |
} else if (track->enc->codec_id == CODEC_ID_VC1) { |
2219 | 2214 |
param_write_string(pb, "FourCC", "WVC1"); |
2215 |
+ param_write_hex(pb, "CodecPrivateData", track->enc->extradata, |
|
2216 |
+ track->enc->extradata_size); |
|
2220 | 2217 |
} |
2221 | 2218 |
param_write_int(pb, "MaxWidth", track->enc->width); |
2222 | 2219 |
param_write_int(pb, "MaxHeight", track->enc->height); |