
Merge remote-tracking branch 'qatar/master'

* qatar/master:
  x86: dsputil: prettyprint gcc inline asm
  x86: K&R prettyprinting cosmetics for dsputil_mmx.c
  x86: conditionally compile H.264 QPEL optimizations
  dsputil_mmx: Surround QPEL macros by "do { } while (0);" blocks.
  Ignore generated files below doc/.
  dpcm: convert to bytestream2.
  interplayvideo: convert to bytestream2.
  movenc: Merge if statements
  h264: fix memleak in error path.
  pthread: Immediately release all frames in ff_thread_flush()
  h264: Add check for invalid chroma_format_idc
  utvideo: port header reading to bytestream2.

Conflicts:
	.gitignore
	configure
	libavcodec/h264_ps.c
	libavcodec/interplayvideo.c
	libavcodec/pthread.c
	libavcodec/x86/dsputil_mmx.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2012/03/26 06:10:30
Showing 10 changed files
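
Several of the merged commits (dpcm, interplayvideo, utvideo) port decoders from raw buf/buf_end pointer walking to the bytestream2 reader API, which keeps the position and end inside a GetByteContext. A minimal sketch of the pattern those diffs converge on, using the real libavcodec API; the header fields read here are made up purely for illustration:

    #include "libavcodec/bytestream.h"

    /* Sketch of the bytestream2 idiom used by the converted decoders below. */
    static int parse_header(const uint8_t *buf, int buf_size)
    {
        GetByteContext gb;
        bytestream2_init(&gb, buf, buf_size);

        int version = bytestream2_get_byte(&gb);  /* bounds-checked single byte */
        int flags   = bytestream2_get_le16(&gb);  /* bounds-checked LE 16-bit read */
        bytestream2_skip(&gb, 4);                 /* skip is clamped at the buffer end */

        if (bytestream2_get_bytes_left(&gb) < 8)  /* explicit size check, required
                                                     before using unchecked *_u getters */
            return -1;
        return (flags << 8) | version;
    }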
... ...
@@ -22,6 +22,9 @@ ffplay
 ffprobe
 ffserver
 avconv
+doc/avoptions_codec.texi
+doc/avoptions_format.texi
+doc/print_options
 libavcodec/*_tablegen
 libavcodec/*_tables.c
 libavcodec/*_tables.h
... ...
@@ -1266,6 +1266,7 @@ CONFIG_EXTRA="
     h264chroma
     h264dsp
     h264pred
+    h264qpel
     huffman
     lgplv3
     lpc
... ...
@@ -1424,7 +1425,7 @@ h263_vaapi_hwaccel_select="vaapi h263_decoder"
 h263i_decoder_select="h263_decoder"
 h263p_encoder_select="h263_encoder"
 h264_crystalhd_decoder_select="crystalhd h264_mp4toannexb_bsf h264_parser"
-h264_decoder_select="golomb h264chroma h264dsp h264pred"
+h264_decoder_select="golomb h264chroma h264dsp h264pred h264qpel"
 h264_dxva2_hwaccel_deps="dxva2api_h"
 h264_dxva2_hwaccel_select="dxva2 h264_decoder"
 h264_vaapi_hwaccel_select="vaapi h264_decoder"
... ...
@@ -1485,8 +1486,8 @@ rv10_decoder_select="h263_decoder"
 rv10_encoder_select="h263_encoder"
 rv20_decoder_select="h263_decoder"
 rv20_encoder_select="h263_encoder"
-rv30_decoder_select="golomb h264chroma h264pred"
-rv40_decoder_select="golomb h264chroma h264pred"
+rv30_decoder_select="golomb h264chroma h264pred h264qpel"
+rv40_decoder_select="golomb h264chroma h264pred h264qpel"
 shorten_decoder_select="golomb"
 sipr_decoder_select="lsp"
 snow_decoder_select="dwt"
... ...
@@ -1495,7 +1496,7 @@ sonic_decoder_select="golomb"
 sonic_encoder_select="golomb"
 sonic_ls_encoder_select="golomb"
 svq1_encoder_select="aandct"
-svq3_decoder_select="golomb h264chroma h264dsp h264pred"
+svq3_decoder_select="golomb h264chroma h264dsp h264pred h264qpel"
 svq3_decoder_suggest="zlib"
 theora_decoder_select="vp3_decoder"
 tiff_decoder_suggest="zlib"
... ...
@@ -1504,7 +1505,7 @@ truehd_decoder_select="mlp_decoder"
 tscc_decoder_select="zlib"
 twinvq_decoder_select="mdct lsp sinewin"
 vc1_crystalhd_decoder_select="crystalhd"
-vc1_decoder_select="h263_decoder h264chroma"
+vc1_decoder_select="h263_decoder h264chroma h264qpel"
 vc1_dxva2_hwaccel_deps="dxva2api_h"
 vc1_dxva2_hwaccel_select="dxva2 vc1_decoder"
 vc1_vaapi_hwaccel_select="vaapi vc1_decoder"
... ...
@@ -1515,7 +1516,7 @@ vorbis_encoder_select="mdct"
 vp6_decoder_select="huffman"
 vp6a_decoder_select="vp6_decoder"
 vp6f_decoder_select="vp6_decoder"
-vp8_decoder_select="h264pred"
+vp8_decoder_select="h264pred h264qpel"
 wmapro_decoder_select="mdct sinewin"
 wmav1_decoder_select="mdct sinewin"
 wmav1_encoder_select="mdct sinewin"
... ...
@@ -1544,7 +1545,7 @@ vda_deps="VideoDecodeAcceleration_VDADecoder_h pthreads"
 vdpau_deps="vdpau_vdpau_h vdpau_vdpau_x11_h"

 # parsers
-h264_parser_select="golomb h264chroma h264dsp h264pred"
+h264_parser_select="golomb h264chroma h264dsp h264pred h264qpel"

 # external libraries
 libaacplus_encoder_deps="libaacplus"
... ...
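The h264qpel component added to CONFIG_EXTRA above becomes a CONFIG_H264QPEL symbol that configure writes into config.h and config.mak, which is how the libavcodec/x86/Makefile hunk further below can select x86/h264_qpel_10bit.o only when an enabled decoder pulls the component in. A self-contained sketch of the C side of that gating; the symbol is hard-coded here only so the snippet compiles on its own:

    #include <stdio.h>

    /* configure emits symbols like this into config.h: 1 when a component
     * listed in CONFIG_EXTRA (here: h264qpel) is selected, 0 otherwise. */
    #define CONFIG_H264QPEL 1

    int main(void)
    {
    #if CONFIG_H264QPEL
        puts("h264qpel enabled: QPEL optimizations are compiled in");
    #else
        puts("h264qpel disabled: QPEL code paths are compiled out");
    #endif
        return 0;
    }
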
@@ -40,6 +40,7 @@
 #include "libavutil/intreadwrite.h"
 #include "avcodec.h"
 #include "bytestream.h"
+#include "mathops.h"

 typedef struct DPCMContext {
     AVFrame frame;
... ...
@@ -173,20 +174,18 @@ static av_cold int dpcm_decode_init(AVCodecContext *avctx)
 static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
                              int *got_frame_ptr, AVPacket *avpkt)
 {
-    const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
-    const uint8_t *buf_end = buf + buf_size;
     DPCMContext *s = avctx->priv_data;
     int out = 0, ret;
     int predictor[2];
     int ch = 0;
     int stereo = s->channels - 1;
-    int16_t *output_samples;
+    int16_t *output_samples, *samples_end;
+    GetByteContext gb;

-    if (stereo && (buf_size & 1)) {
+    if (stereo && (buf_size & 1))
         buf_size--;
-        buf_end--;
-    }
+    bytestream2_init(&gb, avpkt->data, buf_size);

     /* calculate output size */
     switch(avctx->codec->id) {
... ...
@@ -221,22 +220,23 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
         return ret;
     }
     output_samples = (int16_t *)s->frame.data[0];
+    samples_end = output_samples + out;

     switch(avctx->codec->id) {

     case CODEC_ID_ROQ_DPCM:
-        buf += 6;
+        bytestream2_skipu(&gb, 6);

         if (stereo) {
-            predictor[1] = (int16_t)(bytestream_get_byte(&buf) << 8);
-            predictor[0] = (int16_t)(bytestream_get_byte(&buf) << 8);
+            predictor[1] = sign_extend(bytestream2_get_byteu(&gb) << 8, 16);
+            predictor[0] = sign_extend(bytestream2_get_byteu(&gb) << 8, 16);
         } else {
-            predictor[0] = (int16_t)bytestream_get_le16(&buf);
+            predictor[0] = sign_extend(bytestream2_get_le16u(&gb), 16);
         }

         /* decode the samples */
-        while (buf < buf_end) {
-            predictor[ch] += s->roq_square_array[*buf++];
+        while (output_samples < samples_end) {
+            predictor[ch] += s->roq_square_array[bytestream2_get_byteu(&gb)];
             predictor[ch]  = av_clip_int16(predictor[ch]);
             *output_samples++ = predictor[ch];

... ...
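The dpcm.c hunks above replace (int16_t) casts with sign_extend() (hence the new mathops.h include), making the 16-bit sign extension explicit instead of relying on an implementation-defined narrowing cast. A standalone sketch of the equivalence; the helper below mirrors the shift-based definition FFmpeg's headers use:

    #include <stdio.h>
    #include <stdint.h>

    /* Interpret the low 'bits' bits of 'val' as a signed two's-complement
     * number, without an implementation-defined narrowing cast. */
    static inline int sign_extend(int val, unsigned bits)
    {
        unsigned shift = 8 * sizeof(int) - bits;
        union { unsigned u; int s; } v = { (unsigned)val << shift };
        return v.s >> shift;
    }

    int main(void)
    {
        int raw = 0xFE;                             /* high byte of a predictor */
        printf("%d\n", (int16_t)(raw << 8));        /* old style: cast, prints -512 */
        printf("%d\n", sign_extend(raw << 8, 16));  /* new style: explicit, prints -512 */
        return 0;
    }
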
@@ -246,16 +246,16 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
         break;

     case CODEC_ID_INTERPLAY_DPCM:
-        buf += 6;  /* skip over the stream mask and stream length */
+        bytestream2_skipu(&gb, 6);  /* skip over the stream mask and stream length */

         for (ch = 0; ch < s->channels; ch++) {
-            predictor[ch] = (int16_t)bytestream_get_le16(&buf);
+            predictor[ch] = sign_extend(bytestream2_get_le16u(&gb), 16);
             *output_samples++ = predictor[ch];
         }

         ch = 0;
-        while (buf < buf_end) {
-            predictor[ch] += interplay_delta_table[*buf++];
+        while (output_samples < samples_end) {
+            predictor[ch] += interplay_delta_table[bytestream2_get_byteu(&gb)];
             predictor[ch]  = av_clip_int16(predictor[ch]);
             *output_samples++ = predictor[ch];

... ...
@@ -269,16 +269,19 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
         int shift[2] = { 4, 4 };

         for (ch = 0; ch < s->channels; ch++)
-            predictor[ch] = (int16_t)bytestream_get_le16(&buf);
+            predictor[ch] = sign_extend(bytestream2_get_le16u(&gb), 16);

         ch = 0;
-        while (buf < buf_end) {
-            uint8_t n = *buf++;
-            int16_t diff = (n & 0xFC) << 8;
-            if ((n & 0x03) == 3)
+        while (output_samples < samples_end) {
+            int diff = bytestream2_get_byteu(&gb);
+            int n    = diff & 3;
+
+            if (n == 3)
                 shift[ch]++;
             else
-                shift[ch] -= (2 * (n & 3));
+                shift[ch] -= (2 * n);
+            diff = sign_extend((diff &~ 3) << 8, 16);
+
             /* saturate the shifter to a lower limit of 0 */
             if (shift[ch] < 0)
                 shift[ch] = 0;
... ...
@@ -296,9 +299,10 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
     }
     case CODEC_ID_SOL_DPCM:
         if (avctx->codec_tag != 3) {
-            uint8_t *output_samples_u8 = s->frame.data[0];
-            while (buf < buf_end) {
-                uint8_t n = *buf++;
+            uint8_t *output_samples_u8 = s->frame.data[0],
+                    *samples_end_u8 = output_samples_u8 + out;
+            while (output_samples_u8 < samples_end_u8) {
+                int n = bytestream2_get_byteu(&gb);

                 s->sample[0] += s->sol_table[n >> 4];
                 s->sample[0]  = av_clip_uint8(s->sample[0]);
... ...
@@ -309,8 +313,8 @@ static int dpcm_decode_frame(AVCodecContext *avctx, void *data,
                 *output_samples_u8++ = s->sample[stereo];
             }
         } else {
-            while (buf < buf_end) {
-                uint8_t n = *buf++;
+            while (output_samples < samples_end) {
+                int n = bytestream2_get_byteu(&gb);
                 if (n & 0x80) s->sample[ch] -= sol_table_16[n & 0x7F];
                 else          s->sample[ch] += sol_table_16[n & 0x7F];
                 s->sample[ch] = av_clip_int16(s->sample[ch]);
... ...
@@ -352,9 +352,9 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
         if (sps->chroma_format_idc > 3U) {
             av_log(h->s.avctx, AV_LOG_ERROR, "chroma_format_idc %d is illegal\n", sps->chroma_format_idc);
             goto fail;
-        }
-        if(sps->chroma_format_idc == 3)
+        } else if(sps->chroma_format_idc == 3) {
             sps->residual_color_transform_flag = get_bits1(&s->gb);
+        }
         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
         if (sps->bit_depth_luma > 12U || sps->bit_depth_chroma > 12U) {
... ...
@@ -56,14 +56,8 @@ typedef struct IpvideoContext {
     const unsigned char *decoding_map;
     int decoding_map_size;

-    const unsigned char *buf;
-    int size;
-
     int is_16bpp;
-    const unsigned char *stream_ptr;
-    const unsigned char *stream_end;
-    const uint8_t *mv_ptr;
-    const uint8_t *mv_end;
+    GetByteContext stream_ptr, mv_ptr;
     unsigned char *pixel_ptr;
     int line_inc;
     int stride;
... ...
@@ -72,13 +66,6 @@ typedef struct IpvideoContext {
     uint32_t pal[256];
 } IpvideoContext;

-#define CHECK_STREAM_PTR(stream_ptr, stream_end, n) \
-    if (stream_end - stream_ptr < n) { \
-        av_log(s->avctx, AV_LOG_ERROR, "stream_ptr out of bounds (%p >= %p)\n", \
-               stream_ptr + n, stream_end); \
-        return -1; \
-    }
-
 static int copy_from(IpvideoContext *s, AVFrame *src, int delta_x, int delta_y)
 {
     int current_offset = s->pixel_ptr - s->current_frame.data[0];
... ...
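The per-read CHECK_STREAM_PTR macro deleted above becomes unnecessary because the checked bytestream2 getters validate the remaining length themselves: on overread they return 0 and the position stays clamped at the buffer end, so a malformed stream degrades into zero-reads instead of out-of-bounds accesses. A minimal sketch of that behavior, using the real libavcodec API:

    #include "libavcodec/bytestream.h"

    /* The checked getters never read past the end; the *_u variants skip
     * the check and are only safe after an explicit
     * bytestream2_get_bytes_left() test. */
    static void drain(GetByteContext *gb)
    {
        while (bytestream2_get_bytes_left(gb) > 0) {
            int b = bytestream2_get_byte(gb);  /* bounds-checked */
            (void)b;
        }
        /* further reads are harmless: they return 0 without advancing */
    }
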
@@ -118,11 +105,9 @@ static int ipvideo_decode_block_opcode_0x2(IpvideoContext *s)

     /* copy block from 2 frames ago using a motion vector; need 1 more byte */
     if (!s->is_16bpp) {
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1);
-        B = *s->stream_ptr++;
+        B = bytestream2_get_byte(&s->stream_ptr);
     } else {
-        CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1);
-        B = *s->mv_ptr++;
+        B = bytestream2_get_byte(&s->mv_ptr);
     }

     if (B < 56) {
... ...
@@ -146,11 +131,9 @@ static int ipvideo_decode_block_opcode_0x3(IpvideoContext *s)

     /* need 1 more byte for motion */
     if (!s->is_16bpp) {
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1);
-        B = *s->stream_ptr++;
+        B = bytestream2_get_byte(&s->stream_ptr);
     } else {
-        CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1);
-        B = *s->mv_ptr++;
+        B = bytestream2_get_byte(&s->mv_ptr);
     }

     if (B < 56) {
... ...
@@ -172,11 +155,9 @@ static int ipvideo_decode_block_opcode_0x4(IpvideoContext *s)

     /* copy a block from the previous frame; need 1 more byte */
     if (!s->is_16bpp) {
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1);
-        B = *s->stream_ptr++;
+        B = bytestream2_get_byte(&s->stream_ptr);
     } else {
-        CHECK_STREAM_PTR(s->mv_ptr, s->mv_end, 1);
-        B = *s->mv_ptr++;
+        B = bytestream2_get_byte(&s->mv_ptr);
     }

     BL = B & 0x0F;
... ...
@@ -194,10 +175,8 @@ static int ipvideo_decode_block_opcode_0x5(IpvideoContext *s)

     /* copy a block from the previous frame using an expanded range;
      * need 2 more bytes */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-
-    x = *s->stream_ptr++;
-    y = *s->stream_ptr++;
+    x = bytestream2_get_byte(&s->stream_ptr);
+    y = bytestream2_get_byte(&s->stream_ptr);

     av_dlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, &s->last_frame, x, y);
... ...
@@ -219,18 +198,14 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s)
     unsigned int flags;

     /* 2-color encoding */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-
-    P[0] = *s->stream_ptr++;
-    P[1] = *s->stream_ptr++;
+    P[0] = bytestream2_get_byte(&s->stream_ptr);
+    P[1] = bytestream2_get_byte(&s->stream_ptr);

     if (P[0] <= P[1]) {

         /* need 8 more bytes from the stream */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8);
-
         for (y = 0; y < 8; y++) {
-            flags = *s->stream_ptr++ | 0x100;
+            flags = bytestream2_get_byte(&s->stream_ptr) | 0x100;
             for (; flags != 1; flags >>= 1)
                 *s->pixel_ptr++ = P[flags & 1];
             s->pixel_ptr += s->line_inc;
... ...
@@ -239,9 +214,7 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s)
     } else {

         /* need 2 more bytes from the stream */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-
-        flags = bytestream_get_le16(&s->stream_ptr);
+        flags = bytestream2_get_le16(&s->stream_ptr);
        for (y = 0; y < 8; y += 2) {
             for (x = 0; x < 8; x += 2, flags >>= 1) {
                 s->pixel_ptr[x                ] =
... ...
@@ -260,26 +233,23 @@ static int ipvideo_decode_block_opcode_0x7(IpvideoContext *s)
 static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s)
 {
     int x, y;
-    unsigned char P[2];
+    unsigned char P[4];
     unsigned int flags = 0;

     /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on
      * either top and bottom or left and right halves */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-
-    P[0] = *s->stream_ptr++;
-    P[1] = *s->stream_ptr++;
+    P[0] = bytestream2_get_byte(&s->stream_ptr);
+    P[1] = bytestream2_get_byte(&s->stream_ptr);

     if (P[0] <= P[1]) {
-
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 14);
-        s->stream_ptr -= 2;
-
         for (y = 0; y < 16; y++) {
             // new values for each 4x4 block
             if (!(y & 3)) {
-                P[0] = *s->stream_ptr++; P[1] = *s->stream_ptr++;
-                flags = bytestream_get_le16(&s->stream_ptr);
+                if (y) {
+                    P[0]  = bytestream2_get_byte(&s->stream_ptr);
+                    P[1]  = bytestream2_get_byte(&s->stream_ptr);
+                }
+                flags = bytestream2_get_le16(&s->stream_ptr);
             }

             for (x = 0; x < 4; x++, flags >>= 1)
... ...
@@ -290,13 +260,11 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s)
         }

     } else {
+        flags = bytestream2_get_le32(&s->stream_ptr);
+        P[2] = bytestream2_get_byte(&s->stream_ptr);
+        P[3] = bytestream2_get_byte(&s->stream_ptr);

-        /* need 10 more bytes */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 10);
-
-        if (s->stream_ptr[4] <= s->stream_ptr[5]) {
-
-            flags = bytestream_get_le32(&s->stream_ptr);
+        if (P[2] <= P[3]) {

             /* vertical split; left & right halves are 2-color encoded */

... ...
@@ -307,8 +275,9 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s)
                 // switch to right half
                 if (y == 7) {
                     s->pixel_ptr -= 8 * s->stride - 4;
-                    P[0] = *s->stream_ptr++; P[1] = *s->stream_ptr++;
-                    flags = bytestream_get_le32(&s->stream_ptr);
+                    P[0]  = P[2];
+                    P[1]  = P[3];
+                    flags = bytestream2_get_le32(&s->stream_ptr);
                 }
             }

... ...
@@ -318,12 +287,12 @@ static int ipvideo_decode_block_opcode_0x8(IpvideoContext *s)

             for (y = 0; y < 8; y++) {
                 if (y == 4) {
-                    P[0] = *s->stream_ptr++;
-                    P[1] = *s->stream_ptr++;
+                    P[0]  = P[2];
+                    P[1]  = P[3];
+                    flags = bytestream2_get_le32(&s->stream_ptr);
                 }
-                flags = *s->stream_ptr++ | 0x100;

-                for (; flags != 1; flags >>= 1)
+                for (x = 0; x < 8; x++, flags >>= 1)
                     *s->pixel_ptr++ = P[flags & 1];
                 s->pixel_ptr += s->line_inc;
             }
... ...
@@ -340,20 +309,15 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s)
     unsigned char P[4];

     /* 4-color encoding */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4);
-
-    memcpy(P, s->stream_ptr, 4);
-    s->stream_ptr += 4;
+    bytestream2_get_buffer(&s->stream_ptr, P, 4);

     if (P[0] <= P[1]) {
         if (P[2] <= P[3]) {

             /* 1 of 4 colors for each pixel, need 16 more bytes */
-            CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16);
-
             for (y = 0; y < 8; y++) {
                 /* get the next set of 8 2-bit flags */
-                int flags = bytestream_get_le16(&s->stream_ptr);
+                int flags = bytestream2_get_le16(&s->stream_ptr);
                 for (x = 0; x < 8; x++, flags >>= 2)
                     *s->pixel_ptr++ = P[flags & 0x03];
                 s->pixel_ptr += s->line_inc;
... ...
@@ -363,9 +327,7 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s)
             uint32_t flags;

             /* 1 of 4 colors for each 2x2 block, need 4 more bytes */
-            CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4);
-
-            flags = bytestream_get_le32(&s->stream_ptr);
+            flags = bytestream2_get_le32(&s->stream_ptr);

             for (y = 0; y < 8; y += 2) {
                 for (x = 0; x < 8; x += 2, flags >>= 2) {
... ...
@@ -382,9 +344,7 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s)
         uint64_t flags;

         /* 1 of 4 colors for each 2x1 or 1x2 block, need 8 more bytes */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8);
-
-        flags = bytestream_get_le64(&s->stream_ptr);
+        flags = bytestream2_get_le64(&s->stream_ptr);
         if (P[2] <= P[3]) {
             for (y = 0; y < 8; y++) {
                 for (x = 0; x < 8; x += 2, flags >>= 2) {
... ...
@@ -411,24 +371,21 @@ static int ipvideo_decode_block_opcode_0x9(IpvideoContext *s)
 static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s)
 {
     int x, y;
-    unsigned char P[4];
+    unsigned char P[8];
     int flags = 0;

+    bytestream2_get_buffer(&s->stream_ptr, P, 4);
+
     /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on
      * either top and bottom or left and right halves */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24);
-
-    if (s->stream_ptr[0] <= s->stream_ptr[1]) {
+    if (P[0] <= P[1]) {

         /* 4-color encoding for each quadrant; need 32 bytes */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 32);
-
         for (y = 0; y < 16; y++) {
             // new values for each 4x4 block
             if (!(y & 3)) {
-                memcpy(P, s->stream_ptr, 4);
-                s->stream_ptr += 4;
-                flags = bytestream_get_le32(&s->stream_ptr);
+                if (y) bytestream2_get_buffer(&s->stream_ptr, P, 4);
+                flags = bytestream2_get_le32(&s->stream_ptr);
             }

             for (x = 0; x < 4; x++, flags >>= 2)
... ...
@@ -441,20 +398,16 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s)

     } else {
         // vertical split?
-        int vert = s->stream_ptr[12] <= s->stream_ptr[13];
-        uint64_t flags = 0;
+        int vert;
+        uint64_t flags = bytestream2_get_le64(&s->stream_ptr);
+
+        bytestream2_get_buffer(&s->stream_ptr, P + 4, 4);
+        vert = P[4] <= P[5];

         /* 4-color encoding for either left and right or top and bottom
          * halves */

         for (y = 0; y < 16; y++) {
-            // load values for each half
-            if (!(y & 7)) {
-                memcpy(P, s->stream_ptr, 4);
-                s->stream_ptr += 4;
-                flags = bytestream_get_le64(&s->stream_ptr);
-            }
-
             for (x = 0; x < 4; x++, flags >>= 2)
                 *s->pixel_ptr++ = P[flags & 0x03];

... ...
@@ -463,6 +416,12 @@ static int ipvideo_decode_block_opcode_0xA(IpvideoContext *s)
                 // switch to right half
                 if (y == 7) s->pixel_ptr -= 8 * s->stride - 4;
             } else if (y & 1) s->pixel_ptr += s->line_inc;
+
+            // load values for second half
+            if (y == 7) {
+                memcpy(P, P + 4, 4);
+                flags = bytestream2_get_le64(&s->stream_ptr);
+            }
         }
     }

... ...
@@ -475,11 +434,8 @@ static int ipvideo_decode_block_opcode_0xB(IpvideoContext *s)
     int y;

     /* 64-color encoding (each pixel in block is a different color) */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 64);
-
     for (y = 0; y < 8; y++) {
-        memcpy(s->pixel_ptr, s->stream_ptr, 8);
-        s->stream_ptr += 8;
+        bytestream2_get_buffer(&s->stream_ptr, s->pixel_ptr, 8);
         s->pixel_ptr  += s->stride;
     }

... ...
@@ -492,14 +448,12 @@ static int ipvideo_decode_block_opcode_0xC(IpvideoContext *s)
     int x, y;

     /* 16-color block encoding: each 2x2 block is a different color */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16);
-
     for (y = 0; y < 8; y += 2) {
         for (x = 0; x < 8; x += 2) {
             s->pixel_ptr[x                ] =
             s->pixel_ptr[x + 1            ] =
             s->pixel_ptr[x +     s->stride] =
-            s->pixel_ptr[x + 1 + s->stride] = *s->stream_ptr++;
+            s->pixel_ptr[x + 1 + s->stride] = bytestream2_get_byte(&s->stream_ptr);
         }
         s->pixel_ptr += s->stride * 2;
     }
... ...
@@ -514,12 +468,10 @@ static int ipvideo_decode_block_opcode_0xD(IpvideoContext *s)
     unsigned char P[2];

     /* 4-color block encoding: each 4x4 block is a different color */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4);
-
     for (y = 0; y < 8; y++) {
         if (!(y & 3)) {
-            P[0] = *s->stream_ptr++;
-            P[1] = *s->stream_ptr++;
+            P[0] = bytestream2_get_byte(&s->stream_ptr);
+            P[1] = bytestream2_get_byte(&s->stream_ptr);
         }
         memset(s->pixel_ptr,     P[0], 4);
         memset(s->pixel_ptr + 4, P[1], 4);
... ...
@@ -536,8 +488,7 @@ static int ipvideo_decode_block_opcode_0xE(IpvideoContext *s)
     unsigned char pix;

     /* 1-color encoding: the whole block is 1 solid color */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 1);
-    pix = *s->stream_ptr++;
+    pix = bytestream2_get_byte(&s->stream_ptr);

     for (y = 0; y < 8; y++) {
         memset(s->pixel_ptr, pix, 8);
... ...
@@ -554,9 +505,8 @@ static int ipvideo_decode_block_opcode_0xF(IpvideoContext *s)
     unsigned char sample[2];

     /* dithered encoding */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-    sample[0] = *s->stream_ptr++;
-    sample[1] = *s->stream_ptr++;
+    sample[0] = bytestream2_get_byte(&s->stream_ptr);
+    sample[1] = bytestream2_get_byte(&s->stream_ptr);

     for (y = 0; y < 8; y++) {
         for (x = 0; x < 8; x += 2) {
... ...
@@ -575,10 +525,8 @@ static int ipvideo_decode_block_opcode_0x6_16(IpvideoContext *s)
     signed char x, y;

     /* copy a block from the second last frame using an expanded range */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-
-    x = *s->stream_ptr++;
-    y = *s->stream_ptr++;
+    x = bytestream2_get_byte(&s->stream_ptr);
+    y = bytestream2_get_byte(&s->stream_ptr);

     av_dlog(s->avctx, "motion bytes = %d, %d\n", x, y);
     return copy_from(s, &s->second_last_frame, x, y);
... ...
@@ -592,17 +540,13 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s)
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 2-color encoding */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4);
-
-    P[0] = bytestream_get_le16(&s->stream_ptr);
-    P[1] = bytestream_get_le16(&s->stream_ptr);
+    P[0] = bytestream2_get_le16(&s->stream_ptr);
+    P[1] = bytestream2_get_le16(&s->stream_ptr);

     if (!(P[0] & 0x8000)) {

-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8);
-
         for (y = 0; y < 8; y++) {
-            flags = *s->stream_ptr++ | 0x100;
+            flags = bytestream2_get_byte(&s->stream_ptr) | 0x100;
             for (; flags != 1; flags >>= 1)
                 *pixel_ptr++ = P[flags & 1];
             pixel_ptr += s->line_inc;
... ...
@@ -610,9 +554,7 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s)

     } else {

-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-
-        flags = bytestream_get_le16(&s->stream_ptr);
+        flags = bytestream2_get_le16(&s->stream_ptr);
         for (y = 0; y < 8; y += 2) {
             for (x = 0; x < 8; x += 2, flags >>= 1) {
                 pixel_ptr[x                ] =
... ...
@@ -630,28 +572,25 @@ static int ipvideo_decode_block_opcode_0x7_16(IpvideoContext *s)
 static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s)
 {
     int x, y;
-    uint16_t P[2];
+    uint16_t P[4];
     unsigned int flags = 0;
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 2-color encoding for each 4x4 quadrant, or 2-color encoding on
      * either top and bottom or left and right halves */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4);
-
-    P[0] = bytestream_get_le16(&s->stream_ptr);
-    P[1] = bytestream_get_le16(&s->stream_ptr);
+    P[0] = bytestream2_get_le16(&s->stream_ptr);
+    P[1] = bytestream2_get_le16(&s->stream_ptr);

     if (!(P[0] & 0x8000)) {

-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24);
-        s->stream_ptr -= 4;
-
         for (y = 0; y < 16; y++) {
             // new values for each 4x4 block
             if (!(y & 3)) {
-                P[0] = bytestream_get_le16(&s->stream_ptr);
-                P[1] = bytestream_get_le16(&s->stream_ptr);
-                flags = bytestream_get_le16(&s->stream_ptr);
+                if (y) {
+                    P[0] = bytestream2_get_le16(&s->stream_ptr);
+                    P[1] = bytestream2_get_le16(&s->stream_ptr);
+                }
+                flags = bytestream2_get_le16(&s->stream_ptr);
             }

             for (x = 0; x < 4; x++, flags >>= 1)
... ...
@@ -663,11 +602,11 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s)

     } else {

-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 12);
-
-        if (!(AV_RL16(s->stream_ptr + 4) & 0x8000)) {
+        flags = bytestream2_get_le32(&s->stream_ptr);
+        P[2]  = bytestream2_get_le16(&s->stream_ptr);
+        P[3]  = bytestream2_get_le16(&s->stream_ptr);

-            flags = bytestream_get_le32(&s->stream_ptr);
+        if (!(P[2] & 0x8000)) {

             /* vertical split; left & right halves are 2-color encoded */

... ...
@@ -678,9 +617,9 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s)
                 // switch to right half
                 if (y == 7) {
                     pixel_ptr -= 8 * s->stride - 4;
-                    P[0] = bytestream_get_le16(&s->stream_ptr);
-                    P[1] = bytestream_get_le16(&s->stream_ptr);
-                    flags = bytestream_get_le32(&s->stream_ptr);
+                    P[0]  = P[2];
+                    P[1]  = P[3];
+                    flags = bytestream2_get_le32(&s->stream_ptr);
                 }
             }

... ...
@@ -690,12 +629,12 @@ static int ipvideo_decode_block_opcode_0x8_16(IpvideoContext *s)

             for (y = 0; y < 8; y++) {
                 if (y == 4) {
-                    P[0] = bytestream_get_le16(&s->stream_ptr);
-                    P[1] = bytestream_get_le16(&s->stream_ptr);
+                    P[0]  = P[2];
+                    P[1]  = P[3];
+                    flags = bytestream2_get_le32(&s->stream_ptr);
                 }
-                flags = *s->stream_ptr++ | 0x100;

-                for (; flags != 1; flags >>= 1)
+                for (x = 0; x < 8; x++, flags >>= 1)
                     *pixel_ptr++ = P[flags & 1];
                 pixel_ptr += s->line_inc;
             }
... ...
@@ -713,20 +652,16 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s)
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 4-color encoding */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8);
-
     for (x = 0; x < 4; x++)
-        P[x] = bytestream_get_le16(&s->stream_ptr);
+        P[x] = bytestream2_get_le16(&s->stream_ptr);

     if (!(P[0] & 0x8000)) {
         if (!(P[2] & 0x8000)) {

             /* 1 of 4 colors for each pixel */
-            CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 16);
-
             for (y = 0; y < 8; y++) {
                 /* get the next set of 8 2-bit flags */
-                int flags = bytestream_get_le16(&s->stream_ptr);
+                int flags = bytestream2_get_le16(&s->stream_ptr);
                 for (x = 0; x < 8; x++, flags >>= 2)
                     *pixel_ptr++ = P[flags & 0x03];
                 pixel_ptr += s->line_inc;
... ...
@@ -736,9 +671,7 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s)
             uint32_t flags;

             /* 1 of 4 colors for each 2x2 block */
-            CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 4);
-
-            flags = bytestream_get_le32(&s->stream_ptr);
+            flags = bytestream2_get_le32(&s->stream_ptr);

             for (y = 0; y < 8; y += 2) {
                 for (x = 0; x < 8; x += 2, flags >>= 2) {
... ...
@@ -755,9 +688,7 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s)
         uint64_t flags;

         /* 1 of 4 colors for each 2x1 or 1x2 block */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8);
-
-        flags = bytestream_get_le64(&s->stream_ptr);
+        flags = bytestream2_get_le64(&s->stream_ptr);
         if (!(P[2] & 0x8000)) {
             for (y = 0; y < 8; y++) {
                 for (x = 0; x < 8; x += 2, flags >>= 2) {
... ...
@@ -784,25 +715,25 @@ static int ipvideo_decode_block_opcode_0x9_16(IpvideoContext *s)
 static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s)
 {
     int x, y;
-    uint16_t P[4];
+    uint16_t P[8];
     int flags = 0;
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

+    for (x = 0; x < 4; x++)
+        P[x] = bytestream2_get_le16(&s->stream_ptr);
+
     /* 4-color encoding for each 4x4 quadrant, or 4-color encoding on
      * either top and bottom or left and right halves */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 24);
-
-    if (!(AV_RL16(s->stream_ptr) & 0x8000)) {
+    if (!(P[0] & 0x8000)) {

         /* 4-color encoding for each quadrant */
-        CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 48);
-
         for (y = 0; y < 16; y++) {
             // new values for each 4x4 block
             if (!(y & 3)) {
-                for (x = 0; x < 4; x++)
-                    P[x] = bytestream_get_le16(&s->stream_ptr);
-                flags = bytestream_get_le32(&s->stream_ptr);
+                if (y)
+                    for (x = 0; x < 4; x++)
+                        P[x] = bytestream2_get_le16(&s->stream_ptr);
+                flags = bytestream2_get_le32(&s->stream_ptr);
             }

             for (x = 0; x < 4; x++, flags >>= 2)
... ...
@@ -815,20 +746,17 @@ static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s)

     } else {
         // vertical split?
-        int vert = !(AV_RL16(s->stream_ptr + 16) & 0x8000);
-        uint64_t flags = 0;
+        int vert;
+        uint64_t flags = bytestream2_get_le64(&s->stream_ptr);
+
+        for (x = 4; x < 8; x++)
+            P[x] = bytestream2_get_le16(&s->stream_ptr);
+        vert = !(P[4] & 0x8000);

         /* 4-color encoding for either left and right or top and bottom
          * halves */

         for (y = 0; y < 16; y++) {
-            // load values for each half
-            if (!(y & 7)) {
-                for (x = 0; x < 4; x++)
-                    P[x] = bytestream_get_le16(&s->stream_ptr);
-                flags = bytestream_get_le64(&s->stream_ptr);
-            }
-
             for (x = 0; x < 4; x++, flags >>= 2)
                 *pixel_ptr++ = P[flags & 0x03];

... ...
@@ -837,6 +765,12 @@ static int ipvideo_decode_block_opcode_0xA_16(IpvideoContext *s)
                 // switch to right half
                 if (y == 7) pixel_ptr -= 8 * s->stride - 4;
             } else if (y & 1) pixel_ptr += s->line_inc;
+
+            // load values for second half
+            if (y == 7) {
+                memcpy(P, P + 4, 8);
+                flags = bytestream2_get_le64(&s->stream_ptr);
+            }
         }
     }

... ...
@@ -850,11 +784,9 @@ static int ipvideo_decode_block_opcode_0xB_16(IpvideoContext *s)
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 64-color encoding (each pixel in block is a different color) */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 128);
-
     for (y = 0; y < 8; y++) {
         for (x = 0; x < 8; x++)
-            pixel_ptr[x] = bytestream_get_le16(&s->stream_ptr);
+            pixel_ptr[x] = bytestream2_get_le16(&s->stream_ptr);
         pixel_ptr  += s->stride;
     }

... ...
@@ -868,14 +800,12 @@ static int ipvideo_decode_block_opcode_0xC_16(IpvideoContext *s)
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 16-color block encoding: each 2x2 block is a different color */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 32);
-
     for (y = 0; y < 8; y += 2) {
         for (x = 0; x < 8; x += 2) {
             pixel_ptr[x                ] =
             pixel_ptr[x + 1            ] =
             pixel_ptr[x +     s->stride] =
-            pixel_ptr[x + 1 + s->stride] = bytestream_get_le16(&s->stream_ptr);
+            pixel_ptr[x + 1 + s->stride] = bytestream2_get_le16(&s->stream_ptr);
         }
         pixel_ptr += s->stride * 2;
     }
... ...
@@ -891,12 +821,10 @@ static int ipvideo_decode_block_opcode_0xD_16(IpvideoContext *s)
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 4-color block encoding: each 4x4 block is a different color */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 8);
-
     for (y = 0; y < 8; y++) {
         if (!(y & 3)) {
-            P[0] = bytestream_get_le16(&s->stream_ptr);
-            P[1] = bytestream_get_le16(&s->stream_ptr);
+            P[0] = bytestream2_get_le16(&s->stream_ptr);
+            P[1] = bytestream2_get_le16(&s->stream_ptr);
         }
         for (x = 0; x < 8; x++)
             pixel_ptr[x] = P[x >> 2];
... ...
@@ -914,8 +842,7 @@ static int ipvideo_decode_block_opcode_0xE_16(IpvideoContext *s)
     uint16_t *pixel_ptr = (uint16_t*)s->pixel_ptr;

     /* 1-color encoding: the whole block is 1 solid color */
-    CHECK_STREAM_PTR(s->stream_ptr, s->stream_end, 2);
-    pix = bytestream_get_le16(&s->stream_ptr);
+    pix = bytestream2_get_le16(&s->stream_ptr);

     for (y = 0; y < 8; y++) {
         for (x = 0; x < 8; x++)
... ...
@@ -960,19 +887,16 @@ static void ipvideo_decode_opcodes(IpvideoContext *s)
     av_dlog(s->avctx, "frame %d\n", frame);
     frame++;

+    bytestream2_skip(&s->stream_ptr, 14); /* data starts 14 bytes in */
     if (!s->is_16bpp) {
         /* this is PAL8, so make the palette available */
         memcpy(s->current_frame.data[1], s->pal, AVPALETTE_SIZE);

         s->stride = s->current_frame.linesize[0];
-        s->stream_ptr = s->buf + 14;  /* data starts 14 bytes in */
-        s->stream_end = s->buf + s->size;
     } else {
         s->stride = s->current_frame.linesize[0] >> 1;
-        s->stream_ptr = s->buf + 16;
-        s->stream_end =
-        s->mv_ptr = s->buf + 14 + AV_RL16(s->buf+14);
-        s->mv_end = s->buf + s->size;
+        s->mv_ptr = s->stream_ptr;
+        bytestream2_skip(&s->mv_ptr, bytestream2_get_le16(&s->stream_ptr));
     }
     s->line_inc = s->stride - 8;
     s->upper_motion_limit_offset = (s->avctx->height - 8) * s->current_frame.linesize[0]
... ...
@@ -1002,9 +926,10 @@ static void ipvideo_decode_opcodes(IpvideoContext *s)
             }
         }
     }
-    if (s->stream_end - s->stream_ptr > 1) {
-        av_log(s->avctx, AV_LOG_ERROR, "decode finished with %td bytes left over\n",
-               s->stream_end - s->stream_ptr);
+    if (bytestream2_get_bytes_left(&s->stream_ptr) > 1) {
+        av_log(s->avctx, AV_LOG_ERROR,
+               "decode finished with %d bytes left over\n",
+               bytestream2_get_bytes_left(&s->stream_ptr));
     }
 }

... ...
@@ -1046,8 +971,8 @@ static int ipvideo_decode_frame(AVCodecContext *avctx,
         return buf_size;

     s->decoding_map = buf;
-    s->buf = buf + s->decoding_map_size;
-    s->size = buf_size - s->decoding_map_size;
+    bytestream2_init(&s->stream_ptr, buf + s->decoding_map_size,
+                     buf_size - s->decoding_map_size);

     s->current_frame.reference = 3;
     if (avctx->get_buffer(avctx, &s->current_frame)) {
... ...
@@ -907,9 +907,13 @@ void ff_thread_flush(AVCodecContext *avctx)
     fctx->next_decoding = fctx->next_finished = 0;
     fctx->delaying = 1;
     fctx->prev_thread = NULL;
-    // Make sure decode flush calls with size=0 won't return old frames
-    for (i = 0; i < avctx->thread_count; i++)
-        fctx->threads[i].got_frame = 0;
+    for (i = 0; i < avctx->thread_count; i++) {
+        PerThreadContext *p = &fctx->threads[i];
+        // Make sure decode flush calls with size=0 won't return old frames
+        p->got_frame = 0;
+
+        release_delayed_buffers(p);
+    }
 }

 static int *allocate_progress(PerThreadContext *p)
... ...
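The ff_thread_flush() hunk above makes each worker release its delayed buffers in addition to clearing got_frame, so a flush cannot hand back stale frames or keep references alive. A hedged sketch of the caller-side path that reaches this code, typically after a seek:

    #include "libavformat/avformat.h"

    /* Sketch only: avcodec_flush_buffers() dispatches to ff_thread_flush()
     * when frame threading is active, which is where the loop added above
     * drops every delayed frame held by the worker threads. */
    static int seek_and_flush(AVFormatContext *fmt, AVCodecContext *dec,
                              int stream_index, int64_t ts)
    {
        int ret = av_seek_frame(fmt, stream_index, ts, AVSEEK_FLAG_BACKWARD);
        if (ret < 0)
            return ret;
        avcodec_flush_buffers(dec);
        return 0;
    }
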
@@ -358,13 +358,12 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
-    const uint8_t *buf_end = buf + buf_size;
     UtvideoContext *c = avctx->priv_data;
-    const uint8_t *ptr;
     int i, j;
     const uint8_t *plane_start[5];
     int plane_size, max_slice_size = 0, slice_start, slice_end, slice_size;
     int ret;
+    GetByteContext gb;

     if (c->pic.data[0])
         ff_thread_release_buffer(avctx, &c->pic);
... ...
@@ -377,20 +376,21 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac
     }

     /* parse plane structure to retrieve frame flags and validate slice offsets */
-    ptr = buf;
+    bytestream2_init(&gb, buf, buf_size);
     for (i = 0; i < c->planes; i++) {
-        plane_start[i] = ptr;
-        if (buf_end - ptr < 256 + 4 * c->slices) {
+        plane_start[i] = gb.buffer;
+        if (bytestream2_get_bytes_left(&gb) < 256 + 4 * c->slices) {
             av_log(avctx, AV_LOG_ERROR, "Insufficient data for a plane\n");
             return AVERROR_INVALIDDATA;
         }
-        ptr += 256;
+        bytestream2_skipu(&gb, 256);
         slice_start = 0;
         slice_end   = 0;
         for (j = 0; j < c->slices; j++) {
-            slice_end   = bytestream_get_le32(&ptr);
+            slice_end   = bytestream2_get_le32u(&gb);
             slice_size  = slice_end - slice_start;
-            if (slice_size < 0) {
+            if (slice_end <= 0 || slice_size <= 0 ||
+                bytestream2_get_bytes_left(&gb) < slice_end) {
                 av_log(avctx, AV_LOG_ERROR, "Incorrect slice size\n");
                 return AVERROR_INVALIDDATA;
             }
... ...
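The tightened check above rejects non-positive, non-increasing, or out-of-range slice offsets before any slice is decoded, where the old test only caught a negative size. A standalone sketch of the invariant being enforced; the function name and signature are hypothetical:

    #include <stdint.h>

    /* Slice offsets must be strictly increasing and every slice must end
     * inside the remaining buffer, so a crafted offset table can neither
     * wrap around nor point past the packet. */
    static int validate_slices(const uint32_t *slice_end_table, int slices,
                               int bytes_left)
    {
        int slice_start = 0;
        for (int j = 0; j < slices; j++) {
            int slice_end  = (int)slice_end_table[j];
            int slice_size = slice_end - slice_start;
            if (slice_end <= 0 || slice_size <= 0 || bytes_left < slice_end)
                return -1;  /* "Incorrect slice size" */
            slice_start = slice_end;
        }
        return 0;
    }
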
@@ -398,18 +398,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, AVPac
             max_slice_size = FFMAX(max_slice_size, slice_size);
         }
         plane_size = slice_end;
-        if (buf_end - ptr < plane_size) {
-            av_log(avctx, AV_LOG_ERROR, "Plane size is bigger than available data\n");
-            return AVERROR_INVALIDDATA;
-        }
-        ptr += plane_size;
+        bytestream2_skipu(&gb, plane_size);
     }
-    plane_start[c->planes] = ptr;
-    if (buf_end - ptr < c->frame_info_size) {
+    plane_start[c->planes] = gb.buffer;
+    if (bytestream2_get_bytes_left(&gb) < c->frame_info_size) {
         av_log(avctx, AV_LOG_ERROR, "Not enough data for frame information\n");
         return AVERROR_INVALIDDATA;
     }
-    c->frame_info = AV_RL32(ptr);
+    c->frame_info = bytestream2_get_le32u(&gb);
     av_log(avctx, AV_LOG_DEBUG, "frame information flags %X\n", c->frame_info);

     c->frame_pred = (c->frame_info >> 8) & 3;
... ...
@@ -25,6 +25,7 @@ YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
 YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o          \
                                           x86/h264_intrapred_10bit.o
 MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o
+YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_10bit.o

 MMX-OBJS-$(CONFIG_RV30_DECODER)        += x86/rv34dsp_init.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
... ...
@@ -71,7 +72,6 @@ MMX-OBJS-$(CONFIG_VP8_DECODER)         += x86/vp8dsp-init.o
 MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
                                           x86/deinterlace.o             \
                                           x86/fmtconvert.o              \
-                                          x86/h264_qpel_10bit.o         \
                                           $(YASM-OBJS-yes)

 MMX-OBJS-$(CONFIG_FFT)                 += x86/fft.o
... ...
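The "Surround QPEL macros by do { } while (0);" commit in the list above applies a standard C macro-hygiene idiom to dsputil_mmx.c; the hunk below is the separate prettyprinting pass on the same file. A minimal illustration of why the idiom matters, with made-up macro names:

    #include <stdio.h>

    /* do { } while (0) turns a multi-statement macro into a single
     * statement, so it nests safely under an unbraced if/else. */
    #define SWAP_BAD(a, b)  int t = (a); (a) = (b); (b) = t
    #define SWAP_GOOD(a, b) do { int t = (a); (a) = (b); (b) = t; } while (0)

    int main(void)
    {
        int x = 1, y = 2;
        if (x < y)
            SWAP_GOOD(x, y);   /* fine; SWAP_BAD here would not even compile */
        else
            printf("already ordered\n");
        printf("%d %d\n", x, y);
        return 0;
    }
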
@@ -41,129 +41,129 @@ DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
41 41
 DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 42
 
43 43
 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
44
-{0x8000000080000000ULL, 0x8000000080000000ULL};
45
-
46
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1  ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
47
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
48
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
49
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
50
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
51
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
52
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9  ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
53
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
54
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
55
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
56
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
57
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
58
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
59
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
60
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
61
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
62
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
63
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
64
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
65
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
66
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
67
-DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
68
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
69
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};
70
-
71
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
72
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
73
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
74
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
75
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
76
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
77
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
78
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
79
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
80
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
81
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
82
-DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
83
-DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
44
+    { 0x8000000080000000ULL, 0x8000000080000000ULL };
45
+
46
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
47
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
48
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
49
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
50
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
51
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
52
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
53
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
54
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
55
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
56
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
57
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
58
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
59
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
60
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
61
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
62
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
63
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
64
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
65
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
66
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
67
+DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
68
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
69
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
70
+
71
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
72
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
73
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
74
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
75
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
76
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
77
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
78
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
79
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
80
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
81
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
82
+DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
83
+DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
84 84
 
85 85
 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
86 86
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
87 87
 
88
-#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
89
-#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
88
+#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
89
+#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
90 90
 
91
-#define MOVQ_BFE(regd) \
92
-    __asm__ volatile ( \
93
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
94
-    "paddb %%" #regd ", %%" #regd " \n\t" ::)
91
+#define MOVQ_BFE(regd)                                  \
92
+    __asm__ volatile (                                  \
93
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
94
+        "paddb   %%"#regd", %%"#regd"   \n\t" ::)
95 95
 
96 96
 #ifndef PIC
97
-#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
98
-#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
97
+#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
98
+#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
99 99
 #else
100 100
 // for shared library it's better to use this way for accessing constants
101 101
 // pcmpeqd -> -1
102
-#define MOVQ_BONE(regd) \
103
-    __asm__ volatile ( \
104
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
105
-    "psrlw $15, %%" #regd " \n\t" \
106
-    "packuswb %%" #regd ", %%" #regd " \n\t" ::)
107
-
108
-#define MOVQ_WTWO(regd) \
109
-    __asm__ volatile ( \
110
-    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
111
-    "psrlw $15, %%" #regd " \n\t" \
112
-    "psllw $1, %%" #regd " \n\t"::)
102
+#define MOVQ_BONE(regd)                                 \
103
+    __asm__ volatile (                                  \
104
+        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
105
+        "psrlw          $15, %%"#regd"  \n\t"           \
106
+        "packuswb %%"#regd", %%"#regd"  \n\t" ::)
107
+
108
+#define MOVQ_WTWO(regd)                                 \
109
+    __asm__ volatile (                                  \
110
+        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
111
+        "psrlw         $15, %%"#regd"   \n\t"           \
112
+        "psllw          $1, %%"#regd"   \n\t"::)
113 113
 
114 114
 #endif
115 115
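
The PIC variants above synthesize their constants entirely in registers, so no relocatable memory reference is needed. A rough scalar trace of the same steps (illustrative only, assuming 16-bit lanes in a 64-bit register):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t r = ~(uint64_t)0;   /* pcmpeqd: all bits set */
        uint64_t w = 0;
        int i;

        for (i = 0; i < 4; i++)      /* psrlw $15: each word becomes 1 */
            w |= (((r >> (16 * i)) & 0xFFFF) >> 15) << (16 * i);

        printf("psrlw $15 -> %016" PRIX64 "\n", w);      /* 0001000100010001 */
        printf("psllw  $1 -> %016" PRIX64 "\n", w << 1); /* ff_wtwo          */
        /* packuswb instead narrows the 0x0001 words to 0x01 bytes,
         * giving ff_bone = 0x0101010101010101. */
        return 0;
    }
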
 
116 116
 // using regr as a temporary and for the output result
117 117
 // first argument is unmodified and second is trashed
118 118
 // regfe is supposed to contain 0xfefefefefefefefe
119
-#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
120
-    "movq " #rega ", " #regr "  \n\t"\
121
-    "pand " #regb ", " #regr "  \n\t"\
122
-    "pxor " #rega ", " #regb "  \n\t"\
123
-    "pand " #regfe "," #regb "  \n\t"\
124
-    "psrlq $1, " #regb "        \n\t"\
125
-    "paddb " #regb ", " #regr " \n\t"
126
-
127
-#define PAVGB_MMX(rega, regb, regr, regfe) \
128
-    "movq " #rega ", " #regr "  \n\t"\
129
-    "por  " #regb ", " #regr "  \n\t"\
130
-    "pxor " #rega ", " #regb "  \n\t"\
131
-    "pand " #regfe "," #regb "  \n\t"\
132
-    "psrlq $1, " #regb "        \n\t"\
133
-    "psubb " #regb ", " #regr " \n\t"
119
+#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
120
+    "movq   "#rega", "#regr"            \n\t"                    \
121
+    "pand   "#regb", "#regr"            \n\t"                    \
122
+    "pxor   "#rega", "#regb"            \n\t"                    \
123
+    "pand  "#regfe", "#regb"            \n\t"                    \
124
+    "psrlq       $1, "#regb"            \n\t"                    \
125
+    "paddb  "#regb", "#regr"            \n\t"
126
+
127
+#define PAVGB_MMX(rega, regb, regr, regfe)                       \
128
+    "movq   "#rega", "#regr"            \n\t"                    \
129
+    "por    "#regb", "#regr"            \n\t"                    \
130
+    "pxor   "#rega", "#regb"            \n\t"                    \
131
+    "pand  "#regfe", "#regb"            \n\t"                    \
132
+    "psrlq       $1, "#regb"            \n\t"                    \
133
+    "psubb  "#regb", "#regr"            \n\t"
134 134
 
135 135
 // mm6 is supposed to contain 0xfefefefefefefefe
136
-#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
137
-    "movq " #rega ", " #regr "  \n\t"\
138
-    "movq " #regc ", " #regp "  \n\t"\
139
-    "pand " #regb ", " #regr "  \n\t"\
140
-    "pand " #regd ", " #regp "  \n\t"\
141
-    "pxor " #rega ", " #regb "  \n\t"\
142
-    "pxor " #regc ", " #regd "  \n\t"\
143
-    "pand %%mm6, " #regb "      \n\t"\
144
-    "pand %%mm6, " #regd "      \n\t"\
145
-    "psrlq $1, " #regb "        \n\t"\
146
-    "psrlq $1, " #regd "        \n\t"\
147
-    "paddb " #regb ", " #regr " \n\t"\
148
-    "paddb " #regd ", " #regp " \n\t"
149
-
150
-#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
151
-    "movq " #rega ", " #regr "  \n\t"\
152
-    "movq " #regc ", " #regp "  \n\t"\
153
-    "por  " #regb ", " #regr "  \n\t"\
154
-    "por  " #regd ", " #regp "  \n\t"\
155
-    "pxor " #rega ", " #regb "  \n\t"\
156
-    "pxor " #regc ", " #regd "  \n\t"\
157
-    "pand %%mm6, " #regb "      \n\t"\
158
-    "pand %%mm6, " #regd "      \n\t"\
159
-    "psrlq $1, " #regd "        \n\t"\
160
-    "psrlq $1, " #regb "        \n\t"\
161
-    "psubb " #regb ", " #regr " \n\t"\
162
-    "psubb " #regd ", " #regp " \n\t"
136
+#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
137
+    "movq  "#rega", "#regr"             \n\t"                    \
138
+    "movq  "#regc", "#regp"             \n\t"                    \
139
+    "pand  "#regb", "#regr"             \n\t"                    \
140
+    "pand  "#regd", "#regp"             \n\t"                    \
141
+    "pxor  "#rega", "#regb"             \n\t"                    \
142
+    "pxor  "#regc", "#regd"             \n\t"                    \
143
+    "pand    %%mm6, "#regb"             \n\t"                    \
144
+    "pand    %%mm6, "#regd"             \n\t"                    \
145
+    "psrlq      $1, "#regb"             \n\t"                    \
146
+    "psrlq      $1, "#regd"             \n\t"                    \
147
+    "paddb "#regb", "#regr"             \n\t"                    \
148
+    "paddb "#regd", "#regp"             \n\t"
149
+
150
+#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
151
+    "movq  "#rega", "#regr"             \n\t"                    \
152
+    "movq  "#regc", "#regp"             \n\t"                    \
153
+    "por   "#regb", "#regr"             \n\t"                    \
154
+    "por   "#regd", "#regp"             \n\t"                    \
155
+    "pxor  "#rega", "#regb"             \n\t"                    \
156
+    "pxor  "#regc", "#regd"             \n\t"                    \
157
+    "pand    %%mm6, "#regb"             \n\t"                    \
158
+    "pand    %%mm6, "#regd"             \n\t"                    \
159
+    "psrlq      $1, "#regd"             \n\t"                    \
160
+    "psrlq      $1, "#regb"             \n\t"                    \
161
+    "psubb "#regb", "#regr"             \n\t"                    \
162
+    "psubb "#regd", "#regp"             \n\t"
163 163
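
These macros are the classic carry-free byte-average trick: regfe (or mm6) holds 0xFE in every byte, so masking before the 64-bit shift keeps bits from leaking between lanes. A scalar sketch of both rounding flavours (illustrative, not part of the patch):

    #include <stdint.h>

    static uint8_t avg_no_rnd(uint8_t a, uint8_t b)  /* PAVGB_MMX_NO_RND */
    {
        return (a & b) + (((a ^ b) & 0xFE) >> 1);    /* (a + b) >> 1     */
    }

    static uint8_t avg_rnd(uint8_t a, uint8_t b)     /* PAVGB_MMX        */
    {
        return (a | b) - (((a ^ b) & 0xFE) >> 1);    /* (a + b + 1) >> 1 */
    }
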
 
164 164
 /***********************************/
165 165
 /* MMX no rounding */
166
-#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
166
+#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
167 167
 #define SET_RND  MOVQ_WONE
168 168
 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
169 169
 #define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
... ...
@@ -178,7 +178,7 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
178 178
 /***********************************/
179 179
 /* MMX rounding */
180 180
 
181
-#define DEF(x, y) x ## _ ## y ##_mmx
181
+#define DEF(x, y) x ## _ ## y ## _mmx
182 182
 #define SET_RND  MOVQ_WTWO
183 183
 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
184 184
 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
... ...
@@ -235,537 +235,552 @@ DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
235 235
 /***********************************/
236 236
 /* standard MMX */
237 237
 
238
-void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
238
+void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
239
+                               int line_size)
239 240
 {
240 241
     const DCTELEM *p;
241 242
     uint8_t *pix;
242 243
 
243 244
     /* read the pixels */
244
-    p = block;
245
+    p   = block;
245 246
     pix = pixels;
246 247
     /* unrolled loop */
247
-        __asm__ volatile(
248
-                "movq   %3, %%mm0               \n\t"
249
-                "movq   8%3, %%mm1              \n\t"
250
-                "movq   16%3, %%mm2             \n\t"
251
-                "movq   24%3, %%mm3             \n\t"
252
-                "movq   32%3, %%mm4             \n\t"
253
-                "movq   40%3, %%mm5             \n\t"
254
-                "movq   48%3, %%mm6             \n\t"
255
-                "movq   56%3, %%mm7             \n\t"
256
-                "packuswb %%mm1, %%mm0          \n\t"
257
-                "packuswb %%mm3, %%mm2          \n\t"
258
-                "packuswb %%mm5, %%mm4          \n\t"
259
-                "packuswb %%mm7, %%mm6          \n\t"
260
-                "movq   %%mm0, (%0)             \n\t"
261
-                "movq   %%mm2, (%0, %1)         \n\t"
262
-                "movq   %%mm4, (%0, %1, 2)      \n\t"
263
-                "movq   %%mm6, (%0, %2)         \n\t"
264
-                ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
265
-                :"memory");
266
-        pix += line_size*4;
267
-        p += 32;
248
+    __asm__ volatile (
249
+        "movq        %3, %%mm0          \n\t"
250
+        "movq       8%3, %%mm1          \n\t"
251
+        "movq      16%3, %%mm2          \n\t"
252
+        "movq      24%3, %%mm3          \n\t"
253
+        "movq      32%3, %%mm4          \n\t"
254
+        "movq      40%3, %%mm5          \n\t"
255
+        "movq      48%3, %%mm6          \n\t"
256
+        "movq      56%3, %%mm7          \n\t"
257
+        "packuswb %%mm1, %%mm0          \n\t"
258
+        "packuswb %%mm3, %%mm2          \n\t"
259
+        "packuswb %%mm5, %%mm4          \n\t"
260
+        "packuswb %%mm7, %%mm6          \n\t"
261
+        "movq     %%mm0, (%0)           \n\t"
262
+        "movq     %%mm2, (%0, %1)       \n\t"
263
+        "movq     %%mm4, (%0, %1, 2)    \n\t"
264
+        "movq     %%mm6, (%0, %2)       \n\t"
265
+        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
266
+           "m"(*p)
267
+        : "memory");
268
+    pix += line_size * 4;
269
+    p   += 32;
268 270
 
269 271
     // if this were an exact copy of the code above, the compiler
270 272
     // would generate some very strange code
271 273
     // hence the "r" constraint is used instead
272
-    __asm__ volatile(
273
-            "movq       (%3), %%mm0             \n\t"
274
-            "movq       8(%3), %%mm1            \n\t"
275
-            "movq       16(%3), %%mm2           \n\t"
276
-            "movq       24(%3), %%mm3           \n\t"
277
-            "movq       32(%3), %%mm4           \n\t"
278
-            "movq       40(%3), %%mm5           \n\t"
279
-            "movq       48(%3), %%mm6           \n\t"
280
-            "movq       56(%3), %%mm7           \n\t"
281
-            "packuswb %%mm1, %%mm0              \n\t"
282
-            "packuswb %%mm3, %%mm2              \n\t"
283
-            "packuswb %%mm5, %%mm4              \n\t"
284
-            "packuswb %%mm7, %%mm6              \n\t"
285
-            "movq       %%mm0, (%0)             \n\t"
286
-            "movq       %%mm2, (%0, %1)         \n\t"
287
-            "movq       %%mm4, (%0, %1, 2)      \n\t"
288
-            "movq       %%mm6, (%0, %2)         \n\t"
289
-            ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
290
-            :"memory");
291
-}
292
-
293
-#define put_signed_pixels_clamped_mmx_half(off) \
294
-            "movq    "#off"(%2), %%mm1          \n\t"\
295
-            "movq 16+"#off"(%2), %%mm2          \n\t"\
296
-            "movq 32+"#off"(%2), %%mm3          \n\t"\
297
-            "movq 48+"#off"(%2), %%mm4          \n\t"\
298
-            "packsswb  8+"#off"(%2), %%mm1      \n\t"\
299
-            "packsswb 24+"#off"(%2), %%mm2      \n\t"\
300
-            "packsswb 40+"#off"(%2), %%mm3      \n\t"\
301
-            "packsswb 56+"#off"(%2), %%mm4      \n\t"\
302
-            "paddb %%mm0, %%mm1                 \n\t"\
303
-            "paddb %%mm0, %%mm2                 \n\t"\
304
-            "paddb %%mm0, %%mm3                 \n\t"\
305
-            "paddb %%mm0, %%mm4                 \n\t"\
306
-            "movq %%mm1, (%0)                   \n\t"\
307
-            "movq %%mm2, (%0, %3)               \n\t"\
308
-            "movq %%mm3, (%0, %3, 2)            \n\t"\
309
-            "movq %%mm4, (%0, %1)               \n\t"
310
-
311
-void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
274
+    __asm__ volatile (
275
+        "movq       (%3), %%mm0         \n\t"
276
+        "movq      8(%3), %%mm1         \n\t"
277
+        "movq     16(%3), %%mm2         \n\t"
278
+        "movq     24(%3), %%mm3         \n\t"
279
+        "movq     32(%3), %%mm4         \n\t"
280
+        "movq     40(%3), %%mm5         \n\t"
281
+        "movq     48(%3), %%mm6         \n\t"
282
+        "movq     56(%3), %%mm7         \n\t"
283
+        "packuswb  %%mm1, %%mm0         \n\t"
284
+        "packuswb  %%mm3, %%mm2         \n\t"
285
+        "packuswb  %%mm5, %%mm4         \n\t"
286
+        "packuswb  %%mm7, %%mm6         \n\t"
287
+        "movq      %%mm0, (%0)          \n\t"
288
+        "movq      %%mm2, (%0, %1)      \n\t"
289
+        "movq      %%mm4, (%0, %1, 2)   \n\t"
290
+        "movq      %%mm6, (%0, %2)      \n\t"
291
+        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
292
+        : "memory");
293
+}
294
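
For reference, the two unrolled asm blocks above compute the following, shown here as a scalar sketch in the spirit of the C fallback in dsputil.c (the _ref name is illustrative):

    #include <stdint.h>

    /* Clamp 64 DCT coefficients to 0..255 and store them as an 8x8
     * byte block -- the job packuswb does eight pixels at a time. */
    static void put_pixels_clamped_ref(const int16_t *block,
                                       uint8_t *pixels, int line_size)
    {
        int i, j;

        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++) {
                int v = block[i * 8 + j];
                pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
            pixels += line_size;
        }
    }
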
+
295
+#define put_signed_pixels_clamped_mmx_half(off)             \
296
+    "movq          "#off"(%2), %%mm1        \n\t"           \
297
+    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
298
+    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
299
+    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
300
+    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
301
+    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
302
+    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
303
+    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
304
+    "paddb              %%mm0, %%mm1        \n\t"           \
305
+    "paddb              %%mm0, %%mm2        \n\t"           \
306
+    "paddb              %%mm0, %%mm3        \n\t"           \
307
+    "paddb              %%mm0, %%mm4        \n\t"           \
308
+    "movq               %%mm1, (%0)         \n\t"           \
309
+    "movq               %%mm2, (%0, %3)     \n\t"           \
310
+    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
311
+    "movq               %%mm4, (%0, %1)     \n\t"
312
+
313
+void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
314
+                                      int line_size)
312 315
 {
313 316
     x86_reg line_skip = line_size;
314 317
     x86_reg line_skip3;
315 318
 
316 319
     __asm__ volatile (
317
-            "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
318
-            "lea (%3, %3, 2), %1                \n\t"
319
-            put_signed_pixels_clamped_mmx_half(0)
320
-            "lea (%0, %3, 4), %0                \n\t"
321
-            put_signed_pixels_clamped_mmx_half(64)
322
-            :"+&r" (pixels), "=&r" (line_skip3)
323
-            :"r" (block), "r"(line_skip)
324
-            :"memory");
325
-}
326
-
327
-void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
320
+        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
321
+        "lea         (%3, %3, 2), %1        \n\t"
322
+        put_signed_pixels_clamped_mmx_half(0)
323
+        "lea         (%0, %3, 4), %0        \n\t"
324
+        put_signed_pixels_clamped_mmx_half(64)
325
+        : "+&r"(pixels), "=&r"(line_skip3)
326
+        : "r"(block), "r"(line_skip)
327
+        : "memory");
328
+}
329
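
The signed variant differs only in saturating to -128..127 (packsswb) and re-biasing by 128, which is what the paddb with ff_pb_80 achieves. A scalar sketch (illustrative):

    #include <stdint.h>

    static void put_signed_pixels_clamped_ref(const int16_t *block,
                                              uint8_t *pixels, int line_size)
    {
        int i, j;

        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++) {
                int v = block[i * 8 + j];
                v = v < -128 ? -128 : v > 127 ? 127 : v;  /* packsswb   */
                pixels[j] = v + 128;                      /* paddb 0x80 */
            }
            pixels += line_size;
        }
    }
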
+
330
+void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
331
+                               int line_size)
328 332
 {
329 333
     const DCTELEM *p;
330 334
     uint8_t *pix;
331 335
     int i;
332 336
 
333 337
     /* read the pixels */
334
-    p = block;
338
+    p   = block;
335 339
     pix = pixels;
336 340
     MOVQ_ZERO(mm7);
337 341
     i = 4;
338 342
     do {
339
-        __asm__ volatile(
340
-                "movq   (%2), %%mm0     \n\t"
341
-                "movq   8(%2), %%mm1    \n\t"
342
-                "movq   16(%2), %%mm2   \n\t"
343
-                "movq   24(%2), %%mm3   \n\t"
344
-                "movq   %0, %%mm4       \n\t"
345
-                "movq   %1, %%mm6       \n\t"
346
-                "movq   %%mm4, %%mm5    \n\t"
347
-                "punpcklbw %%mm7, %%mm4 \n\t"
348
-                "punpckhbw %%mm7, %%mm5 \n\t"
349
-                "paddsw %%mm4, %%mm0    \n\t"
350
-                "paddsw %%mm5, %%mm1    \n\t"
351
-                "movq   %%mm6, %%mm5    \n\t"
352
-                "punpcklbw %%mm7, %%mm6 \n\t"
353
-                "punpckhbw %%mm7, %%mm5 \n\t"
354
-                "paddsw %%mm6, %%mm2    \n\t"
355
-                "paddsw %%mm5, %%mm3    \n\t"
356
-                "packuswb %%mm1, %%mm0  \n\t"
357
-                "packuswb %%mm3, %%mm2  \n\t"
358
-                "movq   %%mm0, %0       \n\t"
359
-                "movq   %%mm2, %1       \n\t"
360
-                :"+m"(*pix), "+m"(*(pix+line_size))
361
-                :"r"(p)
362
-                :"memory");
363
-        pix += line_size*2;
364
-        p += 16;
343
+        __asm__ volatile (
344
+            "movq        (%2), %%mm0    \n\t"
345
+            "movq       8(%2), %%mm1    \n\t"
346
+            "movq      16(%2), %%mm2    \n\t"
347
+            "movq      24(%2), %%mm3    \n\t"
348
+            "movq          %0, %%mm4    \n\t"
349
+            "movq          %1, %%mm6    \n\t"
350
+            "movq       %%mm4, %%mm5    \n\t"
351
+            "punpcklbw  %%mm7, %%mm4    \n\t"
352
+            "punpckhbw  %%mm7, %%mm5    \n\t"
353
+            "paddsw     %%mm4, %%mm0    \n\t"
354
+            "paddsw     %%mm5, %%mm1    \n\t"
355
+            "movq       %%mm6, %%mm5    \n\t"
356
+            "punpcklbw  %%mm7, %%mm6    \n\t"
357
+            "punpckhbw  %%mm7, %%mm5    \n\t"
358
+            "paddsw     %%mm6, %%mm2    \n\t"
359
+            "paddsw     %%mm5, %%mm3    \n\t"
360
+            "packuswb   %%mm1, %%mm0    \n\t"
361
+            "packuswb   %%mm3, %%mm2    \n\t"
362
+            "movq       %%mm0, %0       \n\t"
363
+            "movq       %%mm2, %1       \n\t"
364
+            : "+m"(*pix), "+m"(*(pix + line_size))
365
+            : "r"(p)
366
+            : "memory");
367
+        pix += line_size * 2;
368
+        p   += 16;
365 369
     } while (--i);
366 370
 }
367 371
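
The add variant widens each byte (punpcklbw/punpckhbw against the zero in mm7), adds the coefficients with signed saturation, and repacks unsigned. A scalar sketch (illustrative):

    #include <stdint.h>

    static void add_pixels_clamped_ref(const int16_t *block,
                                       uint8_t *pixels, int line_size)
    {
        int i, j;

        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++) {
                int v = pixels[j] + block[i * 8 + j];      /* paddsw   */
                pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v; /* packuswb */
            }
            pixels += line_size;
        }
    }
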
 
368
-static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
372
+static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
373
+                            int line_size, int h)
369 374
 {
370
-    __asm__ volatile(
371
-         "lea (%3, %3), %%"REG_a"       \n\t"
372
-         ".p2align 3                    \n\t"
373
-         "1:                            \n\t"
374
-         "movd (%1), %%mm0              \n\t"
375
-         "movd (%1, %3), %%mm1          \n\t"
376
-         "movd %%mm0, (%2)              \n\t"
377
-         "movd %%mm1, (%2, %3)          \n\t"
378
-         "add %%"REG_a", %1             \n\t"
379
-         "add %%"REG_a", %2             \n\t"
380
-         "movd (%1), %%mm0              \n\t"
381
-         "movd (%1, %3), %%mm1          \n\t"
382
-         "movd %%mm0, (%2)              \n\t"
383
-         "movd %%mm1, (%2, %3)          \n\t"
384
-         "add %%"REG_a", %1             \n\t"
385
-         "add %%"REG_a", %2             \n\t"
386
-         "subl $4, %0                   \n\t"
387
-         "jnz 1b                        \n\t"
388
-         : "+g"(h), "+r" (pixels),  "+r" (block)
389
-         : "r"((x86_reg)line_size)
390
-         : "%"REG_a, "memory"
375
+    __asm__ volatile (
376
+        "lea   (%3, %3), %%"REG_a"      \n\t"
377
+        ".p2align     3                 \n\t"
378
+        "1:                             \n\t"
379
+        "movd  (%1    ), %%mm0          \n\t"
380
+        "movd  (%1, %3), %%mm1          \n\t"
381
+        "movd     %%mm0, (%2)           \n\t"
382
+        "movd     %%mm1, (%2, %3)       \n\t"
383
+        "add  %%"REG_a", %1             \n\t"
384
+        "add  %%"REG_a", %2             \n\t"
385
+        "movd  (%1    ), %%mm0          \n\t"
386
+        "movd  (%1, %3), %%mm1          \n\t"
387
+        "movd     %%mm0, (%2)           \n\t"
388
+        "movd     %%mm1, (%2, %3)       \n\t"
389
+        "add  %%"REG_a", %1             \n\t"
390
+        "add  %%"REG_a", %2             \n\t"
391
+        "subl        $4, %0             \n\t"
392
+        "jnz         1b                 \n\t"
393
+        : "+g"(h), "+r"(pixels),  "+r"(block)
394
+        : "r"((x86_reg)line_size)
395
+        : "%"REG_a, "memory"
391 396
         );
392 397
 }
393 398
 
394
-static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
399
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
400
+                            int line_size, int h)
395 401
 {
396
-    __asm__ volatile(
397
-         "lea (%3, %3), %%"REG_a"       \n\t"
398
-         ".p2align 3                    \n\t"
399
-         "1:                            \n\t"
400
-         "movq (%1), %%mm0              \n\t"
401
-         "movq (%1, %3), %%mm1          \n\t"
402
-         "movq %%mm0, (%2)              \n\t"
403
-         "movq %%mm1, (%2, %3)          \n\t"
404
-         "add %%"REG_a", %1             \n\t"
405
-         "add %%"REG_a", %2             \n\t"
406
-         "movq (%1), %%mm0              \n\t"
407
-         "movq (%1, %3), %%mm1          \n\t"
408
-         "movq %%mm0, (%2)              \n\t"
409
-         "movq %%mm1, (%2, %3)          \n\t"
410
-         "add %%"REG_a", %1             \n\t"
411
-         "add %%"REG_a", %2             \n\t"
412
-         "subl $4, %0                   \n\t"
413
-         "jnz 1b                        \n\t"
414
-         : "+g"(h), "+r" (pixels),  "+r" (block)
415
-         : "r"((x86_reg)line_size)
416
-         : "%"REG_a, "memory"
402
+    __asm__ volatile (
403
+        "lea   (%3, %3), %%"REG_a"      \n\t"
404
+        ".p2align     3                 \n\t"
405
+        "1:                             \n\t"
406
+        "movq  (%1    ), %%mm0          \n\t"
407
+        "movq  (%1, %3), %%mm1          \n\t"
408
+        "movq     %%mm0, (%2)           \n\t"
409
+        "movq     %%mm1, (%2, %3)       \n\t"
410
+        "add  %%"REG_a", %1             \n\t"
411
+        "add  %%"REG_a", %2             \n\t"
412
+        "movq  (%1    ), %%mm0          \n\t"
413
+        "movq  (%1, %3), %%mm1          \n\t"
414
+        "movq     %%mm0, (%2)           \n\t"
415
+        "movq     %%mm1, (%2, %3)       \n\t"
416
+        "add  %%"REG_a", %1             \n\t"
417
+        "add  %%"REG_a", %2             \n\t"
418
+        "subl        $4, %0             \n\t"
419
+        "jnz         1b                 \n\t"
420
+        : "+g"(h), "+r"(pixels),  "+r"(block)
421
+        : "r"((x86_reg)line_size)
422
+        : "%"REG_a, "memory"
417 423
         );
418 424
 }
419 425
 
420
-static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
426
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
427
+                             int line_size, int h)
421 428
 {
422
-    __asm__ volatile(
423
-         "lea (%3, %3), %%"REG_a"       \n\t"
424
-         ".p2align 3                    \n\t"
425
-         "1:                            \n\t"
426
-         "movq (%1), %%mm0              \n\t"
427
-         "movq 8(%1), %%mm4             \n\t"
428
-         "movq (%1, %3), %%mm1          \n\t"
429
-         "movq 8(%1, %3), %%mm5         \n\t"
430
-         "movq %%mm0, (%2)              \n\t"
431
-         "movq %%mm4, 8(%2)             \n\t"
432
-         "movq %%mm1, (%2, %3)          \n\t"
433
-         "movq %%mm5, 8(%2, %3)         \n\t"
434
-         "add %%"REG_a", %1             \n\t"
435
-         "add %%"REG_a", %2             \n\t"
436
-         "movq (%1), %%mm0              \n\t"
437
-         "movq 8(%1), %%mm4             \n\t"
438
-         "movq (%1, %3), %%mm1          \n\t"
439
-         "movq 8(%1, %3), %%mm5         \n\t"
440
-         "movq %%mm0, (%2)              \n\t"
441
-         "movq %%mm4, 8(%2)             \n\t"
442
-         "movq %%mm1, (%2, %3)          \n\t"
443
-         "movq %%mm5, 8(%2, %3)         \n\t"
444
-         "add %%"REG_a", %1             \n\t"
445
-         "add %%"REG_a", %2             \n\t"
446
-         "subl $4, %0                   \n\t"
447
-         "jnz 1b                        \n\t"
448
-         : "+g"(h), "+r" (pixels),  "+r" (block)
449
-         : "r"((x86_reg)line_size)
450
-         : "%"REG_a, "memory"
429
+    __asm__ volatile (
430
+        "lea   (%3, %3), %%"REG_a"      \n\t"
431
+        ".p2align     3                 \n\t"
432
+        "1:                             \n\t"
433
+        "movq  (%1    ), %%mm0          \n\t"
434
+        "movq 8(%1    ), %%mm4          \n\t"
435
+        "movq  (%1, %3), %%mm1          \n\t"
436
+        "movq 8(%1, %3), %%mm5          \n\t"
437
+        "movq     %%mm0,  (%2)          \n\t"
438
+        "movq     %%mm4, 8(%2)          \n\t"
439
+        "movq     %%mm1,  (%2, %3)      \n\t"
440
+        "movq     %%mm5, 8(%2, %3)      \n\t"
441
+        "add  %%"REG_a", %1             \n\t"
442
+        "add  %%"REG_a", %2             \n\t"
443
+        "movq  (%1    ), %%mm0          \n\t"
444
+        "movq 8(%1    ), %%mm4          \n\t"
445
+        "movq  (%1, %3), %%mm1          \n\t"
446
+        "movq 8(%1, %3), %%mm5          \n\t"
447
+        "movq     %%mm0,  (%2)          \n\t"
448
+        "movq     %%mm4, 8(%2)          \n\t"
449
+        "movq     %%mm1,  (%2, %3)      \n\t"
450
+        "movq     %%mm5, 8(%2, %3)      \n\t"
451
+        "add  %%"REG_a", %1             \n\t"
452
+        "add  %%"REG_a", %2             \n\t"
453
+        "subl        $4, %0             \n\t"
454
+        "jnz         1b                 \n\t"
455
+        : "+g"(h), "+r"(pixels),  "+r"(block)
456
+        : "r"((x86_reg)line_size)
457
+        : "%"REG_a, "memory"
451 458
         );
452 459
 }
453 460
 
454
-static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
461
+static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
462
+                              int line_size, int h)
455 463
 {
456
-    __asm__ volatile(
457
-         "1:                            \n\t"
458
-         "movdqu (%1), %%xmm0           \n\t"
459
-         "movdqu (%1,%3), %%xmm1        \n\t"
460
-         "movdqu (%1,%3,2), %%xmm2      \n\t"
461
-         "movdqu (%1,%4), %%xmm3        \n\t"
462
-         "lea (%1,%3,4), %1             \n\t"
463
-         "movdqa %%xmm0, (%2)           \n\t"
464
-         "movdqa %%xmm1, (%2,%3)        \n\t"
465
-         "movdqa %%xmm2, (%2,%3,2)      \n\t"
466
-         "movdqa %%xmm3, (%2,%4)        \n\t"
467
-         "subl $4, %0                   \n\t"
468
-         "lea (%2,%3,4), %2             \n\t"
469
-         "jnz 1b                        \n\t"
470
-         : "+g"(h), "+r" (pixels),  "+r" (block)
471
-         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
472
-         : "memory"
464
+    __asm__ volatile (
465
+        "1:                              \n\t"
466
+        "movdqu (%1       ), %%xmm0      \n\t"
467
+        "movdqu (%1, %3   ), %%xmm1      \n\t"
468
+        "movdqu (%1, %3, 2), %%xmm2      \n\t"
469
+        "movdqu (%1, %4   ), %%xmm3      \n\t"
470
+        "lea    (%1, %3, 4), %1          \n\t"
471
+        "movdqa      %%xmm0, (%2)        \n\t"
472
+        "movdqa      %%xmm1, (%2, %3)    \n\t"
473
+        "movdqa      %%xmm2, (%2, %3, 2) \n\t"
474
+        "movdqa      %%xmm3, (%2, %4)    \n\t"
475
+        "subl            $4, %0          \n\t"
476
+        "lea    (%2, %3, 4), %2          \n\t"
477
+        "jnz             1b              \n\t"
478
+        : "+g"(h), "+r"(pixels),  "+r"(block)
479
+        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
480
+        : "memory"
473 481
         );
474 482
 }
475 483
 
476
-static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
484
+static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
485
+                              int line_size, int h)
477 486
 {
478
-    __asm__ volatile(
479
-         "1:                            \n\t"
480
-         "movdqu (%1), %%xmm0           \n\t"
481
-         "movdqu (%1,%3), %%xmm1        \n\t"
482
-         "movdqu (%1,%3,2), %%xmm2      \n\t"
483
-         "movdqu (%1,%4), %%xmm3        \n\t"
484
-         "lea (%1,%3,4), %1             \n\t"
485
-         "pavgb  (%2), %%xmm0           \n\t"
486
-         "pavgb  (%2,%3), %%xmm1        \n\t"
487
-         "pavgb  (%2,%3,2), %%xmm2      \n\t"
488
-         "pavgb  (%2,%4), %%xmm3        \n\t"
489
-         "movdqa %%xmm0, (%2)           \n\t"
490
-         "movdqa %%xmm1, (%2,%3)        \n\t"
491
-         "movdqa %%xmm2, (%2,%3,2)      \n\t"
492
-         "movdqa %%xmm3, (%2,%4)        \n\t"
493
-         "subl $4, %0                   \n\t"
494
-         "lea (%2,%3,4), %2             \n\t"
495
-         "jnz 1b                        \n\t"
496
-         : "+g"(h), "+r" (pixels),  "+r" (block)
497
-         : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
498
-         : "memory"
487
+    __asm__ volatile (
488
+        "1:                                 \n\t"
489
+        "movdqu (%1       ), %%xmm0         \n\t"
490
+        "movdqu (%1, %3   ), %%xmm1         \n\t"
491
+        "movdqu (%1, %3, 2), %%xmm2         \n\t"
492
+        "movdqu (%1, %4   ), %%xmm3         \n\t"
493
+        "lea    (%1, %3, 4), %1             \n\t"
494
+        "pavgb  (%2       ), %%xmm0         \n\t"
495
+        "pavgb  (%2, %3   ), %%xmm1         \n\t"
496
+        "pavgb  (%2, %3, 2), %%xmm2         \n\t"
497
+        "pavgb     (%2, %4), %%xmm3         \n\t"
498
+        "movdqa      %%xmm0, (%2)           \n\t"
499
+        "movdqa      %%xmm1, (%2, %3)       \n\t"
500
+        "movdqa      %%xmm2, (%2, %3, 2)    \n\t"
501
+        "movdqa      %%xmm3, (%2, %4)       \n\t"
502
+        "subl            $4, %0             \n\t"
503
+        "lea    (%2, %3, 4), %2             \n\t"
504
+        "jnz             1b                 \n\t"
505
+        : "+g"(h), "+r"(pixels),  "+r"(block)
506
+        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
507
+        : "memory"
499 508
         );
500 509
 }
501 510
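
All of the put/avg_pixels variants above boil down to a strided copy or a rounding average into the destination; only the block width (4, 8 or 16) and the load/store width differ. A generic scalar sketch (illustrative):

    #include <stdint.h>

    static void put_pixels_ref(uint8_t *block, const uint8_t *pixels,
                               int line_size, int w, int h)
    {
        int x, y;

        for (y = 0; y < h; y++) {
            for (x = 0; x < w; x++)
                block[x] = pixels[x];
            block  += line_size;
            pixels += line_size;
        }
    }

    static void avg_pixels_ref(uint8_t *block, const uint8_t *pixels,
                               int line_size, int w, int h)
    {
        int x, y;

        for (y = 0; y < h; y++) {
            for (x = 0; x < w; x++)            /* pavgb rounds upward */
                block[x] = (block[x] + pixels[x] + 1) >> 1;
            block  += line_size;
            pixels += line_size;
        }
    }
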
 
502
-#define CLEAR_BLOCKS(name,n) \
503
-static void name(DCTELEM *blocks)\
504
-{\
505
-    __asm__ volatile(\
506
-                "pxor %%mm7, %%mm7              \n\t"\
507
-                "mov     %1, %%"REG_a"          \n\t"\
508
-                "1:                             \n\t"\
509
-                "movq %%mm7, (%0, %%"REG_a")    \n\t"\
510
-                "movq %%mm7, 8(%0, %%"REG_a")   \n\t"\
511
-                "movq %%mm7, 16(%0, %%"REG_a")  \n\t"\
512
-                "movq %%mm7, 24(%0, %%"REG_a")  \n\t"\
513
-                "add $32, %%"REG_a"             \n\t"\
514
-                " js 1b                         \n\t"\
515
-                : : "r" (((uint8_t *)blocks)+128*n),\
516
-                    "i" (-128*n)\
517
-                : "%"REG_a\
518
-        );\
511
+#define CLEAR_BLOCKS(name, n)                           \
512
+static void name(DCTELEM *blocks)                       \
513
+{                                                       \
514
+    __asm__ volatile (                                  \
515
+        "pxor %%mm7, %%mm7              \n\t"           \
516
+        "mov     %1,        %%"REG_a"   \n\t"           \
517
+        "1:                             \n\t"           \
518
+        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
519
+        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
520
+        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
521
+        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
522
+        "add    $32, %%"REG_a"          \n\t"           \
523
+        "js      1b                     \n\t"           \
524
+        :: "r"(((uint8_t *)blocks) + 128 * n),          \
525
+           "i"(-128 * n)                                \
526
+        : "%"REG_a                                      \
527
+        );                                              \
519 528
 }
520 529
 CLEAR_BLOCKS(clear_blocks_mmx, 6)
521 530
 CLEAR_BLOCKS(clear_block_mmx, 1)
522 531
 
523 532
 static void clear_block_sse(DCTELEM *block)
524 533
 {
525
-    __asm__ volatile(
526
-        "xorps  %%xmm0, %%xmm0  \n"
527
-        "movaps %%xmm0,    (%0) \n"
528
-        "movaps %%xmm0,  16(%0) \n"
529
-        "movaps %%xmm0,  32(%0) \n"
530
-        "movaps %%xmm0,  48(%0) \n"
531
-        "movaps %%xmm0,  64(%0) \n"
532
-        "movaps %%xmm0,  80(%0) \n"
533
-        "movaps %%xmm0,  96(%0) \n"
534
-        "movaps %%xmm0, 112(%0) \n"
534
+    __asm__ volatile (
535
+        "xorps  %%xmm0, %%xmm0          \n"
536
+        "movaps %%xmm0,    (%0)         \n"
537
+        "movaps %%xmm0,  16(%0)         \n"
538
+        "movaps %%xmm0,  32(%0)         \n"
539
+        "movaps %%xmm0,  48(%0)         \n"
540
+        "movaps %%xmm0,  64(%0)         \n"
541
+        "movaps %%xmm0,  80(%0)         \n"
542
+        "movaps %%xmm0,  96(%0)         \n"
543
+        "movaps %%xmm0, 112(%0)         \n"
535 544
         :: "r"(block)
536 545
         : "memory"
537 546
     );
538 547
 }
539 548
 
540 549
 static void clear_blocks_sse(DCTELEM *blocks)
541
-{\
542
-    __asm__ volatile(
543
-        "xorps  %%xmm0, %%xmm0  \n"
544
-        "mov     %1, %%"REG_a"  \n"
545
-        "1:                     \n"
546
-        "movaps %%xmm0,    (%0, %%"REG_a") \n"
547
-        "movaps %%xmm0,  16(%0, %%"REG_a") \n"
548
-        "movaps %%xmm0,  32(%0, %%"REG_a") \n"
549
-        "movaps %%xmm0,  48(%0, %%"REG_a") \n"
550
-        "movaps %%xmm0,  64(%0, %%"REG_a") \n"
551
-        "movaps %%xmm0,  80(%0, %%"REG_a") \n"
552
-        "movaps %%xmm0,  96(%0, %%"REG_a") \n"
553
-        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
554
-        "add $128, %%"REG_a"    \n"
555
-        " js 1b                 \n"
556
-        : : "r" (((uint8_t *)blocks)+128*6),
557
-            "i" (-128*6)
550
+{
551
+    __asm__ volatile (
552
+        "xorps  %%xmm0, %%xmm0              \n"
553
+        "mov        %1,         %%"REG_a"   \n"
554
+        "1:                                 \n"
555
+        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
556
+        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
557
+        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
558
+        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
559
+        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
560
+        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
561
+        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
562
+        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
563
+        "add      $128,         %%"REG_a"   \n"
564
+        "js         1b                      \n"
565
+        :: "r"(((uint8_t *)blocks) + 128 * 6),
566
+           "i"(-128 * 6)
558 567
         : "%"REG_a
559 568
     );
560 569
 }
561 570
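
Functionally, the four clear routines are plain memset over 16-bit coefficients; the asm exists to use wider aligned stores. Equivalent scalar code (a sketch; DCTELEM is a 16-bit type here):

    #include <string.h>
    #include <stdint.h>

    static void clear_block_ref(int16_t *block)    /* one 8x8 block  */
    {
        memset(block, 0, 64 * sizeof(*block));
    }

    static void clear_blocks_ref(int16_t *blocks)  /* six 8x8 blocks */
    {
        memset(blocks, 0, 6 * 64 * sizeof(*blocks));
    }
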
 
562
-static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
563
-    x86_reg i=0;
564
-    __asm__ volatile(
565
-        "jmp 2f                         \n\t"
571
+static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
572
+{
573
+    x86_reg i = 0;
574
+    __asm__ volatile (
575
+        "jmp          2f                \n\t"
566 576
         "1:                             \n\t"
567
-        "movq  (%1, %0), %%mm0          \n\t"
568
-        "movq  (%2, %0), %%mm1          \n\t"
569
-        "paddb %%mm0, %%mm1             \n\t"
570
-        "movq %%mm1, (%2, %0)           \n\t"
571
-        "movq 8(%1, %0), %%mm0          \n\t"
572
-        "movq 8(%2, %0), %%mm1          \n\t"
573
-        "paddb %%mm0, %%mm1             \n\t"
574
-        "movq %%mm1, 8(%2, %0)          \n\t"
575
-        "add $16, %0                    \n\t"
577
+        "movq   (%1, %0), %%mm0         \n\t"
578
+        "movq   (%2, %0), %%mm1         \n\t"
579
+        "paddb     %%mm0, %%mm1         \n\t"
580
+        "movq      %%mm1, (%2, %0)      \n\t"
581
+        "movq  8(%1, %0), %%mm0         \n\t"
582
+        "movq  8(%2, %0), %%mm1         \n\t"
583
+        "paddb     %%mm0, %%mm1         \n\t"
584
+        "movq      %%mm1, 8(%2, %0)     \n\t"
585
+        "add         $16, %0            \n\t"
576 586
         "2:                             \n\t"
577
-        "cmp %3, %0                     \n\t"
578
-        " js 1b                         \n\t"
579
-        : "+r" (i)
580
-        : "r"(src), "r"(dst), "r"((x86_reg)w-15)
587
+        "cmp          %3, %0            \n\t"
588
+        "js           1b                \n\t"
589
+        : "+r"(i)
590
+        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
581 591
     );
582
-    for(; i<w; i++)
583
-        dst[i+0] += src[i+0];
592
+    for ( ; i < w; i++)
593
+        dst[i + 0] += src[i + 0];
584 594
 }
585 595
 
586 596
 #if HAVE_7REGS
587
-static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
597
+static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
598
+                                            const uint8_t *diff, int w,
599
+                                            int *left, int *left_top)
600
+{
588 601
     x86_reg w2 = -w;
589 602
     x86_reg x;
590
-    int l = *left & 0xff;
603
+    int l  = *left     & 0xff;
591 604
     int tl = *left_top & 0xff;
592 605
     int t;
593
-    __asm__ volatile(
594
-        "mov    %7, %3 \n"
595
-        "1: \n"
596
-        "movzbl (%3,%4), %2 \n"
597
-        "mov    %2, %k3 \n"
598
-        "sub   %b1, %b3 \n"
599
-        "add   %b0, %b3 \n"
600
-        "mov    %2, %1 \n"
601
-        "cmp    %0, %2 \n"
602
-        "cmovg  %0, %2 \n"
603
-        "cmovg  %1, %0 \n"
604
-        "cmp   %k3, %0 \n"
605
-        "cmovg %k3, %0 \n"
606
-        "mov    %7, %3 \n"
607
-        "cmp    %2, %0 \n"
608
-        "cmovl  %2, %0 \n"
609
-        "add (%6,%4), %b0 \n"
610
-        "mov   %b0, (%5,%4) \n"
611
-        "inc    %4 \n"
612
-        "jl 1b \n"
613
-        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
614
-        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
606
+    __asm__ volatile (
607
+        "mov          %7, %3            \n"
608
+        "1:                             \n"
609
+        "movzbl (%3, %4), %2            \n"
610
+        "mov          %2, %k3           \n"
611
+        "sub         %b1, %b3           \n"
612
+        "add         %b0, %b3           \n"
613
+        "mov          %2, %1            \n"
614
+        "cmp          %0, %2            \n"
615
+        "cmovg        %0, %2            \n"
616
+        "cmovg        %1, %0            \n"
617
+        "cmp         %k3, %0            \n"
618
+        "cmovg       %k3, %0            \n"
619
+        "mov          %7, %3            \n"
620
+        "cmp          %2, %0            \n"
621
+        "cmovl        %2, %0            \n"
622
+        "add    (%6, %4), %b0           \n"
623
+        "mov         %b0, (%5, %4)      \n"
624
+        "inc          %4                \n"
625
+        "jl           1b                \n"
626
+        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
627
+        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
615 628
     );
616
-    *left = l;
629
+    *left     = l;
617 630
     *left_top = tl;
618 631
 }
619 632
 #endif
620 633
 
621
-#define H263_LOOP_FILTER \
622
-        "pxor %%mm7, %%mm7              \n\t"\
623
-        "movq  %0, %%mm0                \n\t"\
624
-        "movq  %0, %%mm1                \n\t"\
625
-        "movq  %3, %%mm2                \n\t"\
626
-        "movq  %3, %%mm3                \n\t"\
627
-        "punpcklbw %%mm7, %%mm0         \n\t"\
628
-        "punpckhbw %%mm7, %%mm1         \n\t"\
629
-        "punpcklbw %%mm7, %%mm2         \n\t"\
630
-        "punpckhbw %%mm7, %%mm3         \n\t"\
631
-        "psubw %%mm2, %%mm0             \n\t"\
632
-        "psubw %%mm3, %%mm1             \n\t"\
633
-        "movq  %1, %%mm2                \n\t"\
634
-        "movq  %1, %%mm3                \n\t"\
635
-        "movq  %2, %%mm4                \n\t"\
636
-        "movq  %2, %%mm5                \n\t"\
637
-        "punpcklbw %%mm7, %%mm2         \n\t"\
638
-        "punpckhbw %%mm7, %%mm3         \n\t"\
639
-        "punpcklbw %%mm7, %%mm4         \n\t"\
640
-        "punpckhbw %%mm7, %%mm5         \n\t"\
641
-        "psubw %%mm2, %%mm4             \n\t"\
642
-        "psubw %%mm3, %%mm5             \n\t"\
643
-        "psllw $2, %%mm4                \n\t"\
644
-        "psllw $2, %%mm5                \n\t"\
645
-        "paddw %%mm0, %%mm4             \n\t"\
646
-        "paddw %%mm1, %%mm5             \n\t"\
647
-        "pxor %%mm6, %%mm6              \n\t"\
648
-        "pcmpgtw %%mm4, %%mm6           \n\t"\
649
-        "pcmpgtw %%mm5, %%mm7           \n\t"\
650
-        "pxor %%mm6, %%mm4              \n\t"\
651
-        "pxor %%mm7, %%mm5              \n\t"\
652
-        "psubw %%mm6, %%mm4             \n\t"\
653
-        "psubw %%mm7, %%mm5             \n\t"\
654
-        "psrlw $3, %%mm4                \n\t"\
655
-        "psrlw $3, %%mm5                \n\t"\
656
-        "packuswb %%mm5, %%mm4          \n\t"\
657
-        "packsswb %%mm7, %%mm6          \n\t"\
658
-        "pxor %%mm7, %%mm7              \n\t"\
659
-        "movd %4, %%mm2                 \n\t"\
660
-        "punpcklbw %%mm2, %%mm2         \n\t"\
661
-        "punpcklbw %%mm2, %%mm2         \n\t"\
662
-        "punpcklbw %%mm2, %%mm2         \n\t"\
663
-        "psubusb %%mm4, %%mm2           \n\t"\
664
-        "movq %%mm2, %%mm3              \n\t"\
665
-        "psubusb %%mm4, %%mm3           \n\t"\
666
-        "psubb %%mm3, %%mm2             \n\t"\
667
-        "movq %1, %%mm3                 \n\t"\
668
-        "movq %2, %%mm4                 \n\t"\
669
-        "pxor %%mm6, %%mm3              \n\t"\
670
-        "pxor %%mm6, %%mm4              \n\t"\
671
-        "paddusb %%mm2, %%mm3           \n\t"\
672
-        "psubusb %%mm2, %%mm4           \n\t"\
673
-        "pxor %%mm6, %%mm3              \n\t"\
674
-        "pxor %%mm6, %%mm4              \n\t"\
675
-        "paddusb %%mm2, %%mm2           \n\t"\
676
-        "packsswb %%mm1, %%mm0          \n\t"\
677
-        "pcmpgtb %%mm0, %%mm7           \n\t"\
678
-        "pxor %%mm7, %%mm0              \n\t"\
679
-        "psubb %%mm7, %%mm0             \n\t"\
680
-        "movq %%mm0, %%mm1              \n\t"\
681
-        "psubusb %%mm2, %%mm0           \n\t"\
682
-        "psubb %%mm0, %%mm1             \n\t"\
683
-        "pand %5, %%mm1                 \n\t"\
684
-        "psrlw $2, %%mm1                \n\t"\
685
-        "pxor %%mm7, %%mm1              \n\t"\
686
-        "psubb %%mm7, %%mm1             \n\t"\
687
-        "movq %0, %%mm5                 \n\t"\
688
-        "movq %3, %%mm6                 \n\t"\
689
-        "psubb %%mm1, %%mm5             \n\t"\
690
-        "paddb %%mm1, %%mm6             \n\t"
691
-
692
-static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
693
-    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
694
-    const int strength= ff_h263_loop_filter_strength[qscale];
695
-
696
-    __asm__ volatile(
697
-
698
-        H263_LOOP_FILTER
699
-
700
-        "movq %%mm3, %1                 \n\t"
701
-        "movq %%mm4, %2                 \n\t"
702
-        "movq %%mm5, %0                 \n\t"
703
-        "movq %%mm6, %3                 \n\t"
704
-        : "+m" (*(uint64_t*)(src - 2*stride)),
705
-          "+m" (*(uint64_t*)(src - 1*stride)),
706
-          "+m" (*(uint64_t*)(src + 0*stride)),
707
-          "+m" (*(uint64_t*)(src + 1*stride))
708
-        : "g" (2*strength), "m"(ff_pb_FC)
709
-    );
634
+#define H263_LOOP_FILTER                        \
635
+    "pxor      %%mm7, %%mm7             \n\t"   \
636
+    "movq         %0, %%mm0             \n\t"   \
637
+    "movq         %0, %%mm1             \n\t"   \
638
+    "movq         %3, %%mm2             \n\t"   \
639
+    "movq         %3, %%mm3             \n\t"   \
640
+    "punpcklbw %%mm7, %%mm0             \n\t"   \
641
+    "punpckhbw %%mm7, %%mm1             \n\t"   \
642
+    "punpcklbw %%mm7, %%mm2             \n\t"   \
643
+    "punpckhbw %%mm7, %%mm3             \n\t"   \
644
+    "psubw     %%mm2, %%mm0             \n\t"   \
645
+    "psubw     %%mm3, %%mm1             \n\t"   \
646
+    "movq         %1, %%mm2             \n\t"   \
647
+    "movq         %1, %%mm3             \n\t"   \
648
+    "movq         %2, %%mm4             \n\t"   \
649
+    "movq         %2, %%mm5             \n\t"   \
650
+    "punpcklbw %%mm7, %%mm2             \n\t"   \
651
+    "punpckhbw %%mm7, %%mm3             \n\t"   \
652
+    "punpcklbw %%mm7, %%mm4             \n\t"   \
653
+    "punpckhbw %%mm7, %%mm5             \n\t"   \
654
+    "psubw     %%mm2, %%mm4             \n\t"   \
655
+    "psubw     %%mm3, %%mm5             \n\t"   \
656
+    "psllw        $2, %%mm4             \n\t"   \
657
+    "psllw        $2, %%mm5             \n\t"   \
658
+    "paddw     %%mm0, %%mm4             \n\t"   \
659
+    "paddw     %%mm1, %%mm5             \n\t"   \
660
+    "pxor      %%mm6, %%mm6             \n\t"   \
661
+    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
662
+    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
663
+    "pxor      %%mm6, %%mm4             \n\t"   \
664
+    "pxor      %%mm7, %%mm5             \n\t"   \
665
+    "psubw     %%mm6, %%mm4             \n\t"   \
666
+    "psubw     %%mm7, %%mm5             \n\t"   \
667
+    "psrlw        $3, %%mm4             \n\t"   \
668
+    "psrlw        $3, %%mm5             \n\t"   \
669
+    "packuswb  %%mm5, %%mm4             \n\t"   \
670
+    "packsswb  %%mm7, %%mm6             \n\t"   \
671
+    "pxor      %%mm7, %%mm7             \n\t"   \
672
+    "movd         %4, %%mm2             \n\t"   \
673
+    "punpcklbw %%mm2, %%mm2             \n\t"   \
674
+    "punpcklbw %%mm2, %%mm2             \n\t"   \
675
+    "punpcklbw %%mm2, %%mm2             \n\t"   \
676
+    "psubusb   %%mm4, %%mm2             \n\t"   \
677
+    "movq      %%mm2, %%mm3             \n\t"   \
678
+    "psubusb   %%mm4, %%mm3             \n\t"   \
679
+    "psubb     %%mm3, %%mm2             \n\t"   \
680
+    "movq         %1, %%mm3             \n\t"   \
681
+    "movq         %2, %%mm4             \n\t"   \
682
+    "pxor      %%mm6, %%mm3             \n\t"   \
683
+    "pxor      %%mm6, %%mm4             \n\t"   \
684
+    "paddusb   %%mm2, %%mm3             \n\t"   \
685
+    "psubusb   %%mm2, %%mm4             \n\t"   \
686
+    "pxor      %%mm6, %%mm3             \n\t"   \
687
+    "pxor      %%mm6, %%mm4             \n\t"   \
688
+    "paddusb   %%mm2, %%mm2             \n\t"   \
689
+    "packsswb  %%mm1, %%mm0             \n\t"   \
690
+    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
691
+    "pxor      %%mm7, %%mm0             \n\t"   \
692
+    "psubb     %%mm7, %%mm0             \n\t"   \
693
+    "movq      %%mm0, %%mm1             \n\t"   \
694
+    "psubusb   %%mm2, %%mm0             \n\t"   \
695
+    "psubb     %%mm0, %%mm1             \n\t"   \
696
+    "pand         %5, %%mm1             \n\t"   \
697
+    "psrlw        $2, %%mm1             \n\t"   \
698
+    "pxor      %%mm7, %%mm1             \n\t"   \
699
+    "psubb     %%mm7, %%mm1             \n\t"   \
700
+    "movq         %0, %%mm5             \n\t"   \
701
+    "movq         %3, %%mm6             \n\t"   \
702
+    "psubb     %%mm1, %%mm5             \n\t"   \
703
+    "paddb     %%mm1, %%mm6             \n\t"
704
+
705
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
706
+{
707
+    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
708
+        const int strength = ff_h263_loop_filter_strength[qscale];
709
+
710
+        __asm__ volatile (
711
+            H263_LOOP_FILTER
712
+
713
+            "movq %%mm3, %1             \n\t"
714
+            "movq %%mm4, %2             \n\t"
715
+            "movq %%mm5, %0             \n\t"
716
+            "movq %%mm6, %3             \n\t"
717
+            : "+m"(*(uint64_t*)(src - 2 * stride)),
718
+              "+m"(*(uint64_t*)(src - 1 * stride)),
719
+              "+m"(*(uint64_t*)(src + 0 * stride)),
720
+              "+m"(*(uint64_t*)(src + 1 * stride))
721
+            : "g"(2 * strength), "m"(ff_pb_FC)
722
+            );
710 723
     }
711 724
 }
712 725
 
713
-static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
714
-    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
715
-    const int strength= ff_h263_loop_filter_strength[qscale];
716
-    DECLARE_ALIGNED(8, uint64_t, temp)[4];
717
-    uint8_t *btemp= (uint8_t*)temp;
718
-
719
-    src -= 2;
720
-
721
-    transpose4x4(btemp  , src           , 8, stride);
722
-    transpose4x4(btemp+4, src + 4*stride, 8, stride);
723
-    __asm__ volatile(
724
-        H263_LOOP_FILTER // 5 3 4 6
725
-
726
-        : "+m" (temp[0]),
727
-          "+m" (temp[1]),
728
-          "+m" (temp[2]),
729
-          "+m" (temp[3])
730
-        : "g" (2*strength), "m"(ff_pb_FC)
731
-    );
726
+static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
727
+{
728
+    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
729
+        const int strength = ff_h263_loop_filter_strength[qscale];
730
+        DECLARE_ALIGNED(8, uint64_t, temp)[4];
731
+        uint8_t *btemp = (uint8_t*)temp;
732
+
733
+        src -= 2;
734
+
735
+        transpose4x4(btemp,     src,              8, stride);
736
+        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
737
+        __asm__ volatile (
738
+            H263_LOOP_FILTER // 5 3 4 6
739
+
740
+            : "+m"(temp[0]),
741
+              "+m"(temp[1]),
742
+              "+m"(temp[2]),
743
+              "+m"(temp[3])
744
+            : "g"(2 * strength), "m"(ff_pb_FC)
745
+            );
732 746
 
733
-    __asm__ volatile(
734
-        "movq %%mm5, %%mm1              \n\t"
735
-        "movq %%mm4, %%mm0              \n\t"
736
-        "punpcklbw %%mm3, %%mm5         \n\t"
737
-        "punpcklbw %%mm6, %%mm4         \n\t"
738
-        "punpckhbw %%mm3, %%mm1         \n\t"
739
-        "punpckhbw %%mm6, %%mm0         \n\t"
740
-        "movq %%mm5, %%mm3              \n\t"
741
-        "movq %%mm1, %%mm6              \n\t"
742
-        "punpcklwd %%mm4, %%mm5         \n\t"
743
-        "punpcklwd %%mm0, %%mm1         \n\t"
744
-        "punpckhwd %%mm4, %%mm3         \n\t"
745
-        "punpckhwd %%mm0, %%mm6         \n\t"
746
-        "movd %%mm5, (%0)               \n\t"
747
-        "punpckhdq %%mm5, %%mm5         \n\t"
748
-        "movd %%mm5, (%0,%2)            \n\t"
749
-        "movd %%mm3, (%0,%2,2)          \n\t"
750
-        "punpckhdq %%mm3, %%mm3         \n\t"
751
-        "movd %%mm3, (%0,%3)            \n\t"
752
-        "movd %%mm1, (%1)               \n\t"
753
-        "punpckhdq %%mm1, %%mm1         \n\t"
754
-        "movd %%mm1, (%1,%2)            \n\t"
755
-        "movd %%mm6, (%1,%2,2)          \n\t"
756
-        "punpckhdq %%mm6, %%mm6         \n\t"
757
-        "movd %%mm6, (%1,%3)            \n\t"
758
-        :: "r" (src),
759
-           "r" (src + 4*stride),
760
-           "r" ((x86_reg)   stride ),
761
-           "r" ((x86_reg)(3*stride))
762
-    );
747
+        __asm__ volatile (
748
+            "movq      %%mm5, %%mm1         \n\t"
749
+            "movq      %%mm4, %%mm0         \n\t"
750
+            "punpcklbw %%mm3, %%mm5         \n\t"
751
+            "punpcklbw %%mm6, %%mm4         \n\t"
752
+            "punpckhbw %%mm3, %%mm1         \n\t"
753
+            "punpckhbw %%mm6, %%mm0         \n\t"
754
+            "movq      %%mm5, %%mm3         \n\t"
755
+            "movq      %%mm1, %%mm6         \n\t"
756
+            "punpcklwd %%mm4, %%mm5         \n\t"
757
+            "punpcklwd %%mm0, %%mm1         \n\t"
758
+            "punpckhwd %%mm4, %%mm3         \n\t"
759
+            "punpckhwd %%mm0, %%mm6         \n\t"
760
+            "movd      %%mm5, (%0)          \n\t"
761
+            "punpckhdq %%mm5, %%mm5         \n\t"
762
+            "movd      %%mm5, (%0, %2)      \n\t"
763
+            "movd      %%mm3, (%0, %2, 2)   \n\t"
764
+            "punpckhdq %%mm3, %%mm3         \n\t"
765
+            "movd      %%mm3, (%0, %3)      \n\t"
766
+            "movd      %%mm1, (%1)          \n\t"
767
+            "punpckhdq %%mm1, %%mm1         \n\t"
768
+            "movd      %%mm1, (%1, %2)      \n\t"
769
+            "movd      %%mm6, (%1, %2, 2)   \n\t"
770
+            "punpckhdq %%mm6, %%mm6         \n\t"
771
+            "movd      %%mm6, (%1, %3)      \n\t"
772
+            :: "r"(src),
773
+               "r"(src + 4 * stride),
774
+               "r"((x86_reg)stride),
775
+               "r"((x86_reg)(3 * stride))
776
+            );
763 777
     }
764 778
 }
765 779
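
The horizontal filter reuses the vertical core by transposing: two transpose4x4 calls gather the 8x4 strip around the edge into temp, H263_LOOP_FILTER runs on it as if it were vertical, and the punpck*/movd sequence above writes the filtered pixels back transposed. A scalar view of the transpose step (a sketch; the real transpose4x4 is an MMX helper defined elsewhere in this file):

    #include <stdint.h>

    static void transpose4x4_ref(uint8_t *dst, const uint8_t *src,
                                 int dst_stride, int src_stride)
    {
        int i, j;

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                dst[j * dst_stride + i] = src[i * src_stride + j];
    }
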
 
766
-/* draw the edges of width 'w' of an image of size width, height
767
-   this mmx version can only handle w==8 || w==16 */
768
-static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
780
+/* Draw the edges of width 'w' of an image of size width, height
781
+ * this MMX version can only handle w == 8 || w == 16. */
782
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
783
+                           int w, int h, int sides)
769 784
 {
770 785
     uint8_t *ptr, *last_line;
771 786
     int i;
... ...
@@ -773,794 +788,1000 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w,
773 773
     last_line = buf + (height - 1) * wrap;
774 774
     /* left and right */
775 775
     ptr = buf;
776
-    if(w==8)
777
-    {
778
-        __asm__ volatile(
779
-                "1:                             \n\t"
780
-                "movd (%0), %%mm0               \n\t"
781
-                "punpcklbw %%mm0, %%mm0         \n\t"
782
-                "punpcklwd %%mm0, %%mm0         \n\t"
783
-                "punpckldq %%mm0, %%mm0         \n\t"
784
-                "movq %%mm0, -8(%0)             \n\t"
785
-                "movq -8(%0, %2), %%mm1         \n\t"
786
-                "punpckhbw %%mm1, %%mm1         \n\t"
787
-                "punpckhwd %%mm1, %%mm1         \n\t"
788
-                "punpckhdq %%mm1, %%mm1         \n\t"
789
-                "movq %%mm1, (%0, %2)           \n\t"
790
-                "add %1, %0                     \n\t"
791
-                "cmp %3, %0                     \n\t"
792
-                " jb 1b                         \n\t"
793
-                : "+r" (ptr)
794
-                : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
795
-        );
796
-    }
797
-    else
798
-    {
799
-        __asm__ volatile(
800
-                "1:                             \n\t"
801
-                "movd (%0), %%mm0               \n\t"
802
-                "punpcklbw %%mm0, %%mm0         \n\t"
803
-                "punpcklwd %%mm0, %%mm0         \n\t"
804
-                "punpckldq %%mm0, %%mm0         \n\t"
805
-                "movq %%mm0, -8(%0)             \n\t"
806
-                "movq %%mm0, -16(%0)            \n\t"
807
-                "movq -8(%0, %2), %%mm1         \n\t"
808
-                "punpckhbw %%mm1, %%mm1         \n\t"
809
-                "punpckhwd %%mm1, %%mm1         \n\t"
810
-                "punpckhdq %%mm1, %%mm1         \n\t"
811
-                "movq %%mm1, (%0, %2)           \n\t"
812
-                "movq %%mm1, 8(%0, %2)          \n\t"
813
-                "add %1, %0                     \n\t"
814
-                "cmp %3, %0                     \n\t"
815
-                " jb 1b                         \n\t"
816
-                : "+r" (ptr)
817
-                : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
818
-        );
776
+    if (w == 8) {
777
+        __asm__ volatile (
778
+            "1:                             \n\t"
779
+            "movd            (%0), %%mm0    \n\t"
780
+            "punpcklbw      %%mm0, %%mm0    \n\t"
781
+            "punpcklwd      %%mm0, %%mm0    \n\t"
782
+            "punpckldq      %%mm0, %%mm0    \n\t"
783
+            "movq           %%mm0, -8(%0)   \n\t"
784
+            "movq      -8(%0, %2), %%mm1    \n\t"
785
+            "punpckhbw      %%mm1, %%mm1    \n\t"
786
+            "punpckhwd      %%mm1, %%mm1    \n\t"
787
+            "punpckhdq      %%mm1, %%mm1    \n\t"
788
+            "movq           %%mm1, (%0, %2) \n\t"
789
+            "add               %1, %0       \n\t"
790
+            "cmp               %3, %0       \n\t"
791
+            "jb                1b           \n\t"
792
+            : "+r"(ptr)
793
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
794
+            );
795
+    } else {
796
+        __asm__ volatile (
797
+            "1:                                 \n\t"
798
+            "movd            (%0), %%mm0        \n\t"
799
+            "punpcklbw      %%mm0, %%mm0        \n\t"
800
+            "punpcklwd      %%mm0, %%mm0        \n\t"
801
+            "punpckldq      %%mm0, %%mm0        \n\t"
802
+            "movq           %%mm0, -8(%0)       \n\t"
803
+            "movq           %%mm0, -16(%0)      \n\t"
804
+            "movq      -8(%0, %2), %%mm1        \n\t"
805
+            "punpckhbw      %%mm1, %%mm1        \n\t"
806
+            "punpckhwd      %%mm1, %%mm1        \n\t"
807
+            "punpckhdq      %%mm1, %%mm1        \n\t"
808
+            "movq           %%mm1,  (%0, %2)    \n\t"
809
+            "movq           %%mm1, 8(%0, %2)    \n\t"
810
+            "add               %1, %0           \n\t"
811
+            "cmp               %3, %0           \n\t"
812
+            "jb                1b               \n\t"
813
+            : "+r"(ptr)
814
+            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
815
+            );
819 816
     }
820 817
 
821 818
     /* top and bottom (and hopefully also the corners) */
822
-    if (sides&EDGE_TOP) {
823
-        for(i = 0; i < h; i += 4) {
824
-            ptr= buf - (i + 1) * wrap - w;
825
-            __asm__ volatile(
826
-                    "1:                             \n\t"
827
-                    "movq (%1, %0), %%mm0           \n\t"
828
-                    "movq %%mm0, (%0)               \n\t"
829
-                    "movq %%mm0, (%0, %2)           \n\t"
830
-                    "movq %%mm0, (%0, %2, 2)        \n\t"
831
-                    "movq %%mm0, (%0, %3)           \n\t"
832
-                    "add $8, %0                     \n\t"
833
-                    "cmp %4, %0                     \n\t"
834
-                    " jb 1b                         \n\t"
835
-                    : "+r" (ptr)
836
-                    : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
837
-            );
819
+    if (sides & EDGE_TOP) {
820
+        for (i = 0; i < h; i += 4) {
821
+            ptr = buf - (i + 1) * wrap - w;
822
+            __asm__ volatile (
823
+                "1:                             \n\t"
824
+                "movq (%1, %0), %%mm0           \n\t"
825
+                "movq    %%mm0, (%0)            \n\t"
826
+                "movq    %%mm0, (%0, %2)        \n\t"
827
+                "movq    %%mm0, (%0, %2, 2)     \n\t"
828
+                "movq    %%mm0, (%0, %3)        \n\t"
829
+                "add        $8, %0              \n\t"
830
+                "cmp        %4, %0              \n\t"
831
+                "jb         1b                  \n\t"
832
+                : "+r"(ptr)
833
+                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
834
+                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
835
+                );
838 836
         }
839 837
     }
840 838
 
841
-    if (sides&EDGE_BOTTOM) {
842
-        for(i = 0; i < h; i += 4) {
843
-            ptr= last_line + (i + 1) * wrap - w;
844
-            __asm__ volatile(
845
-                    "1:                             \n\t"
846
-                    "movq (%1, %0), %%mm0           \n\t"
847
-                    "movq %%mm0, (%0)               \n\t"
848
-                    "movq %%mm0, (%0, %2)           \n\t"
849
-                    "movq %%mm0, (%0, %2, 2)        \n\t"
850
-                    "movq %%mm0, (%0, %3)           \n\t"
851
-                    "add $8, %0                     \n\t"
852
-                    "cmp %4, %0                     \n\t"
853
-                    " jb 1b                         \n\t"
854
-                    : "+r" (ptr)
855
-                    : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
856
-            );
839
+    if (sides & EDGE_BOTTOM) {
840
+        for (i = 0; i < h; i += 4) {
841
+            ptr = last_line + (i + 1) * wrap - w;
842
+            __asm__ volatile (
843
+                "1:                             \n\t"
844
+                "movq (%1, %0), %%mm0           \n\t"
845
+                "movq    %%mm0, (%0)            \n\t"
846
+                "movq    %%mm0, (%0, %2)        \n\t"
847
+                "movq    %%mm0, (%0, %2, 2)     \n\t"
848
+                "movq    %%mm0, (%0, %3)        \n\t"
849
+                "add        $8, %0              \n\t"
850
+                "cmp        %4, %0              \n\t"
851
+                "jb         1b                  \n\t"
852
+                : "+r"(ptr)
853
+                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
854
+                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
855
+                  "r"(ptr + width + 2 * w)
856
+                );
857 857
         }
858 858
     }
859 859
 }
860 860
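
For reference, the two row loops and the EDGE_TOP/EDGE_BOTTOM blocks above reduce to plain pixel replication. A minimal scalar sketch, assuming 8-bit pixels; the helper name and the memset/memcpy formulation are illustrative, only EDGE_TOP/EDGE_BOTTOM and the argument list come from the function itself:

#include <stdint.h>
#include <string.h>

#ifndef EDGE_TOP                /* flag values as defined in dsputil.h */
#define EDGE_TOP    1
#define EDGE_BOTTOM 2
#endif

static void draw_edges_sketch(uint8_t *buf, int wrap, int width, int height,
                              int w, int h, int sides)
{
    uint8_t *last_line = buf + (height - 1) * wrap;
    int i;

    /* left and right: replicate the first/last pixel of each row w times */
    for (i = 0; i < height; i++) {
        memset(buf + i * wrap - w,     buf[i * wrap],             w);
        memset(buf + i * wrap + width, buf[i * wrap + width - 1], w);
    }
    /* top and bottom: copy the (already widened) first/last row h times,
     * which also fills the corners */
    if (sides & EDGE_TOP)
        for (i = 1; i <= h; i++)
            memcpy(buf - i * wrap - w, buf - w, width + 2 * w);
    if (sides & EDGE_BOTTOM)
        for (i = 1; i <= h; i++)
            memcpy(last_line + i * wrap - w, last_line - w, width + 2 * w);
}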
 
861
-#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
862
-        "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
863
-        "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
864
-        "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
865
-        "movq "#in7", " #m3 "             \n\t" /* d */\
866
-        "movq "#in0", %%mm5               \n\t" /* D */\
867
-        "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
868
-        "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
869
-        "movq "#in1", %%mm5               \n\t" /* C */\
870
-        "movq "#in2", %%mm6               \n\t" /* B */\
871
-        "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
872
-        "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
873
-        "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
874
-        "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
875
-        "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
876
-        "paddw " #rnd ", %%mm4            \n\t" /* x2 */\
877
-        "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
878
-        "psraw $5, %%mm5                  \n\t"\
879
-        "packuswb %%mm5, %%mm5            \n\t"\
880
-        OP(%%mm5, out, %%mm7, d)
881
-
882
-#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
883
-static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
884
-    uint64_t temp;\
885
-\
886
-    __asm__ volatile(\
887
-        "pxor %%mm7, %%mm7                \n\t"\
888
-        "1:                               \n\t"\
889
-        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
890
-        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
891
-        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
892
-        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
893
-        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
894
-        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
895
-        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
896
-        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
897
-        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
898
-        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
899
-        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
900
-        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
901
-        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
902
-        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
903
-        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
904
-        "paddw %%mm3, %%mm5               \n\t" /* b */\
905
-        "paddw %%mm2, %%mm6               \n\t" /* c */\
906
-        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
907
-        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
908
-        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
909
-        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
910
-        "paddw %%mm4, %%mm0               \n\t" /* a */\
911
-        "paddw %%mm1, %%mm5               \n\t" /* d */\
912
-        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
913
-        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
914
-        "paddw %6, %%mm6                  \n\t"\
915
-        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
916
-        "psraw $5, %%mm0                  \n\t"\
917
-        "movq %%mm0, %5                   \n\t"\
918
-        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
919
-        \
920
-        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
921
-        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
922
-        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
923
-        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
924
-        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
925
-        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
926
-        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
927
-        "paddw %%mm0, %%mm2               \n\t" /* b */\
928
-        "paddw %%mm5, %%mm3               \n\t" /* c */\
929
-        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
930
-        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
931
-        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
932
-        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
933
-        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
934
-        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
935
-        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
936
-        "paddw %%mm2, %%mm1               \n\t" /* a */\
937
-        "paddw %%mm6, %%mm4               \n\t" /* d */\
938
-        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
939
-        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
940
-        "paddw %6, %%mm1                  \n\t"\
941
-        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
942
-        "psraw $5, %%mm3                  \n\t"\
943
-        "movq %5, %%mm1                   \n\t"\
944
-        "packuswb %%mm3, %%mm1            \n\t"\
945
-        OP_MMX2(%%mm1, (%1),%%mm4, q)\
946
-        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
947
-        \
948
-        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
949
-        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
950
-        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
951
-        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
952
-        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
953
-        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
954
-        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
955
-        "paddw %%mm1, %%mm5               \n\t" /* b */\
956
-        "paddw %%mm4, %%mm0               \n\t" /* c */\
957
-        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
958
-        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
959
-        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
960
-        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
961
-        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
962
-        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
963
-        "paddw %%mm3, %%mm2               \n\t" /* d */\
964
-        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
965
-        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
966
-        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
967
-        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
968
-        "paddw %%mm2, %%mm6               \n\t" /* a */\
969
-        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
970
-        "paddw %6, %%mm0                  \n\t"\
971
-        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
972
-        "psraw $5, %%mm0                  \n\t"\
973
-        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
974
-        \
975
-        "paddw %%mm5, %%mm3               \n\t" /* a */\
976
-        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
977
-        "paddw %%mm4, %%mm6               \n\t" /* b */\
978
-        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
979
-        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
980
-        "paddw %%mm1, %%mm4               \n\t" /* c */\
981
-        "paddw %%mm2, %%mm5               \n\t" /* d */\
982
-        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
983
-        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
984
-        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
985
-        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
986
-        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
987
-        "paddw %6, %%mm4                  \n\t"\
988
-        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
989
-        "psraw $5, %%mm4                  \n\t"\
990
-        "packuswb %%mm4, %%mm0            \n\t"\
991
-        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
992
-        \
993
-        "add %3, %0                       \n\t"\
994
-        "add %4, %1                       \n\t"\
995
-        "decl %2                          \n\t"\
996
-        " jnz 1b                          \n\t"\
997
-        : "+a"(src), "+c"(dst), "+D"(h)\
998
-        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
999
-        : "memory"\
1000
-    );\
1001
-}\
1002
-\
1003
-static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1004
-    int i;\
1005
-    int16_t temp[16];\
1006
-    /* quick HACK, XXX FIXME MUST be optimized */\
1007
-    for(i=0; i<h; i++)\
1008
-    {\
1009
-        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1010
-        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1011
-        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1012
-        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1013
-        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1014
-        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1015
-        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1016
-        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1017
-        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1018
-        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1019
-        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1020
-        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1021
-        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1022
-        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1023
-        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1024
-        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1025
-        __asm__ volatile(\
1026
-            "movq (%0), %%mm0               \n\t"\
1027
-            "movq 8(%0), %%mm1              \n\t"\
1028
-            "paddw %2, %%mm0                \n\t"\
1029
-            "paddw %2, %%mm1                \n\t"\
1030
-            "psraw $5, %%mm0                \n\t"\
1031
-            "psraw $5, %%mm1                \n\t"\
1032
-            "packuswb %%mm1, %%mm0          \n\t"\
1033
-            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1034
-            "movq 16(%0), %%mm0             \n\t"\
1035
-            "movq 24(%0), %%mm1             \n\t"\
1036
-            "paddw %2, %%mm0                \n\t"\
1037
-            "paddw %2, %%mm1                \n\t"\
1038
-            "psraw $5, %%mm0                \n\t"\
1039
-            "psraw $5, %%mm1                \n\t"\
1040
-            "packuswb %%mm1, %%mm0          \n\t"\
1041
-            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1042
-            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1043
-            : "memory"\
1044
-        );\
1045
-        dst+=dstStride;\
1046
-        src+=srcStride;\
1047
-    }\
1048
-}\
1049
-\
1050
-static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1051
-    __asm__ volatile(\
1052
-        "pxor %%mm7, %%mm7                \n\t"\
1053
-        "1:                               \n\t"\
1054
-        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
1055
-        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
1056
-        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
1057
-        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
1058
-        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
1059
-        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
1060
-        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
1061
-        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
1062
-        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
1063
-        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
1064
-        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
1065
-        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
1066
-        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
1067
-        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
1068
-        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
1069
-        "paddw %%mm3, %%mm5               \n\t" /* b */\
1070
-        "paddw %%mm2, %%mm6               \n\t" /* c */\
1071
-        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
1072
-        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
1073
-        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
1074
-        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
1075
-        "paddw %%mm4, %%mm0               \n\t" /* a */\
1076
-        "paddw %%mm1, %%mm5               \n\t" /* d */\
1077
-        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1078
-        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
1079
-        "paddw %5, %%mm6                  \n\t"\
1080
-        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
1081
-        "psraw $5, %%mm0                  \n\t"\
1082
-        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1083
-        \
1084
-        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
1085
-        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
1086
-        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
1087
-        "paddw %%mm5, %%mm1               \n\t" /* a */\
1088
-        "paddw %%mm6, %%mm2               \n\t" /* b */\
1089
-        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
1090
-        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
1091
-        "paddw %%mm6, %%mm3               \n\t" /* c */\
1092
-        "paddw %%mm5, %%mm4               \n\t" /* d */\
1093
-        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
1094
-        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
1095
-        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1096
-        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
1097
-        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
1098
-        "paddw %5, %%mm1                  \n\t"\
1099
-        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
1100
-        "psraw $5, %%mm3                  \n\t"\
1101
-        "packuswb %%mm3, %%mm0            \n\t"\
1102
-        OP_MMX2(%%mm0, (%1), %%mm4, q)\
1103
-        \
1104
-        "add %3, %0                       \n\t"\
1105
-        "add %4, %1                       \n\t"\
1106
-        "decl %2                          \n\t"\
1107
-        " jnz 1b                          \n\t"\
1108
-        : "+a"(src), "+c"(dst), "+d"(h)\
1109
-        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1110
-        : "memory"\
1111
-    );\
1112
-}\
1113
-\
1114
-static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1115
-    int i;\
1116
-    int16_t temp[8];\
1117
-    /* quick HACK, XXX FIXME MUST be optimized */\
1118
-    for(i=0; i<h; i++)\
1119
-    {\
1120
-        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1121
-        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1122
-        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1123
-        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1124
-        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1125
-        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1126
-        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1127
-        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1128
-        __asm__ volatile(\
1129
-            "movq (%0), %%mm0           \n\t"\
1130
-            "movq 8(%0), %%mm1          \n\t"\
1131
-            "paddw %2, %%mm0            \n\t"\
1132
-            "paddw %2, %%mm1            \n\t"\
1133
-            "psraw $5, %%mm0            \n\t"\
1134
-            "psraw $5, %%mm1            \n\t"\
1135
-            "packuswb %%mm1, %%mm0      \n\t"\
1136
-            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1137
-            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1138
-            :"memory"\
1139
-        );\
1140
-        dst+=dstStride;\
1141
-        src+=srcStride;\
1142
-    }\
1143
-}
1144
-
1145
-#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1146
-\
1147
-static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1148
-    uint64_t temp[17*4];\
1149
-    uint64_t *temp_ptr= temp;\
1150
-    int count= 17;\
1151
-\
1152
-    /*FIXME unroll */\
1153
-    __asm__ volatile(\
1154
-        "pxor %%mm7, %%mm7              \n\t"\
1155
-        "1:                             \n\t"\
1156
-        "movq (%0), %%mm0               \n\t"\
1157
-        "movq (%0), %%mm1               \n\t"\
1158
-        "movq 8(%0), %%mm2              \n\t"\
1159
-        "movq 8(%0), %%mm3              \n\t"\
1160
-        "punpcklbw %%mm7, %%mm0         \n\t"\
1161
-        "punpckhbw %%mm7, %%mm1         \n\t"\
1162
-        "punpcklbw %%mm7, %%mm2         \n\t"\
1163
-        "punpckhbw %%mm7, %%mm3         \n\t"\
1164
-        "movq %%mm0, (%1)               \n\t"\
1165
-        "movq %%mm1, 17*8(%1)           \n\t"\
1166
-        "movq %%mm2, 2*17*8(%1)         \n\t"\
1167
-        "movq %%mm3, 3*17*8(%1)         \n\t"\
1168
-        "add $8, %1                     \n\t"\
1169
-        "add %3, %0                     \n\t"\
1170
-        "decl %2                        \n\t"\
1171
-        " jnz 1b                        \n\t"\
1172
-        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1173
-        : "r" ((x86_reg)srcStride)\
1174
-        : "memory"\
1175
-    );\
1176
-    \
1177
-    temp_ptr= temp;\
1178
-    count=4;\
1179
-    \
1180
-/*FIXME reorder for speed */\
1181
-    __asm__ volatile(\
1182
-        /*"pxor %%mm7, %%mm7              \n\t"*/\
1183
-        "1:                             \n\t"\
1184
-        "movq (%0), %%mm0               \n\t"\
1185
-        "movq 8(%0), %%mm1              \n\t"\
1186
-        "movq 16(%0), %%mm2             \n\t"\
1187
-        "movq 24(%0), %%mm3             \n\t"\
1188
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
1189
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
1190
-        "add %4, %1                     \n\t"\
1191
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
1192
-        \
1193
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1194
-        "add %4, %1                     \n\t"\
1195
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1196
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1197
-        "add %4, %1                     \n\t"\
1198
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1199
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1200
-        "add %4, %1                     \n\t"\
1201
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1202
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1203
-        "add %4, %1                     \n\t"\
1204
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1205
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1206
-        "add %4, %1                     \n\t"\
1207
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1208
-        \
1209
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1210
-        "add %4, %1                     \n\t"  \
1211
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1212
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1213
-        \
1214
-        "add $136, %0                   \n\t"\
1215
-        "add %6, %1                     \n\t"\
1216
-        "decl %2                        \n\t"\
1217
-        " jnz 1b                        \n\t"\
1218
-        \
1219
-        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1220
-        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
1221
-        :"memory"\
1222
-    );\
1223
-}\
1224
-\
1225
-static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1226
-    uint64_t temp[9*2];\
1227
-    uint64_t *temp_ptr= temp;\
1228
-    int count= 9;\
1229
-\
1230
-    /*FIXME unroll */\
1231
-    __asm__ volatile(\
1232
-        "pxor %%mm7, %%mm7              \n\t"\
1233
-        "1:                             \n\t"\
1234
-        "movq (%0), %%mm0               \n\t"\
1235
-        "movq (%0), %%mm1               \n\t"\
1236
-        "punpcklbw %%mm7, %%mm0         \n\t"\
1237
-        "punpckhbw %%mm7, %%mm1         \n\t"\
1238
-        "movq %%mm0, (%1)               \n\t"\
1239
-        "movq %%mm1, 9*8(%1)            \n\t"\
1240
-        "add $8, %1                     \n\t"\
1241
-        "add %3, %0                     \n\t"\
1242
-        "decl %2                        \n\t"\
1243
-        " jnz 1b                        \n\t"\
1244
-        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1245
-        : "r" ((x86_reg)srcStride)\
1246
-        : "memory"\
1247
-    );\
1248
-    \
1249
-    temp_ptr= temp;\
1250
-    count=2;\
1251
-    \
1252
-/*FIXME reorder for speed */\
1253
-    __asm__ volatile(\
1254
-        /*"pxor %%mm7, %%mm7              \n\t"*/\
1255
-        "1:                             \n\t"\
1256
-        "movq (%0), %%mm0               \n\t"\
1257
-        "movq 8(%0), %%mm1              \n\t"\
1258
-        "movq 16(%0), %%mm2             \n\t"\
1259
-        "movq 24(%0), %%mm3             \n\t"\
1260
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
1261
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
1262
-        "add %4, %1                     \n\t"\
1263
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
1264
-        \
1265
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1266
-        "add %4, %1                     \n\t"\
1267
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1268
-        \
1269
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1270
-        "add %4, %1                     \n\t"\
1271
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1272
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1273
-                \
1274
-        "add $72, %0                    \n\t"\
1275
-        "add %6, %1                     \n\t"\
1276
-        "decl %2                        \n\t"\
1277
-        " jnz 1b                        \n\t"\
1278
-         \
1279
-        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1280
-        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
1281
-        : "memory"\
1282
-   );\
1283
-}\
1284
-\
1285
-static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1286
-    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1287
-}\
1288
-\
1289
-static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1290
-    uint64_t temp[8];\
1291
-    uint8_t * const half= (uint8_t*)temp;\
1292
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1293
-    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1294
-}\
1295
-\
1296
-static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1297
-    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1298
-}\
1299
-\
1300
-static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1301
-    uint64_t temp[8];\
1302
-    uint8_t * const half= (uint8_t*)temp;\
1303
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1304
-    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1305
-}\
1306
-\
1307
-static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1308
-    uint64_t temp[8];\
1309
-    uint8_t * const half= (uint8_t*)temp;\
1310
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1311
-    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1312
-}\
1313
-\
1314
-static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1315
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1316
-}\
1317
-\
1318
-static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1319
-    uint64_t temp[8];\
1320
-    uint8_t * const half= (uint8_t*)temp;\
1321
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1322
-    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
1323
-}\
1324
-static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1325
-    uint64_t half[8 + 9];\
1326
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1327
-    uint8_t * const halfHV= ((uint8_t*)half);\
1328
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1329
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1330
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1331
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1332
-}\
1333
-static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1334
-    uint64_t half[8 + 9];\
1335
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1336
-    uint8_t * const halfHV= ((uint8_t*)half);\
1337
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1338
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1339
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1340
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1341
-}\
1342
-static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1343
-    uint64_t half[8 + 9];\
1344
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1345
-    uint8_t * const halfHV= ((uint8_t*)half);\
1346
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1347
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1348
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1349
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1350
-}\
1351
-static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1352
-    uint64_t half[8 + 9];\
1353
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1354
-    uint8_t * const halfHV= ((uint8_t*)half);\
1355
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1356
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1357
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1358
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1359
-}\
1360
-static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1361
-    uint64_t half[8 + 9];\
1362
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1363
-    uint8_t * const halfHV= ((uint8_t*)half);\
1364
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1365
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1366
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1367
-}\
1368
-static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1369
-    uint64_t half[8 + 9];\
1370
-    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1371
-    uint8_t * const halfHV= ((uint8_t*)half);\
1372
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1373
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1374
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1375
-}\
1376
-static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1377
-    uint64_t half[8 + 9];\
1378
-    uint8_t * const halfH= ((uint8_t*)half);\
1379
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1380
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1381
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1382
-}\
1383
-static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1384
-    uint64_t half[8 + 9];\
1385
-    uint8_t * const halfH= ((uint8_t*)half);\
1386
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1387
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1388
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1389
-}\
1390
-static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1391
-    uint64_t half[9];\
1392
-    uint8_t * const halfH= ((uint8_t*)half);\
1393
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1394
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1395
-}\
1396
-static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1397
-    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1398
-}\
1399
-\
1400
-static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1401
-    uint64_t temp[32];\
1402
-    uint8_t * const half= (uint8_t*)temp;\
1403
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1404
-    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1405
-}\
1406
-\
1407
-static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1408
-    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1409
-}\
1410
-\
1411
-static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1412
-    uint64_t temp[32];\
1413
-    uint8_t * const half= (uint8_t*)temp;\
1414
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1415
-    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1416
-}\
1417
-\
1418
-static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1419
-    uint64_t temp[32];\
1420
-    uint8_t * const half= (uint8_t*)temp;\
1421
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1422
-    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1423
-}\
1424
-\
1425
-static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1426
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1427
-}\
1428
-\
1429
-static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1430
-    uint64_t temp[32];\
1431
-    uint8_t * const half= (uint8_t*)temp;\
1432
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1433
-    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1434
-}\
1435
-static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1436
-    uint64_t half[16*2 + 17*2];\
1437
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1438
-    uint8_t * const halfHV= ((uint8_t*)half);\
1439
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1440
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1441
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1442
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1443
-}\
1444
-static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1445
-    uint64_t half[16*2 + 17*2];\
1446
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1447
-    uint8_t * const halfHV= ((uint8_t*)half);\
1448
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1449
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1450
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1451
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1452
-}\
1453
-static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1454
-    uint64_t half[16*2 + 17*2];\
1455
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1456
-    uint8_t * const halfHV= ((uint8_t*)half);\
1457
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1458
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1459
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1460
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1461
-}\
1462
-static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1463
-    uint64_t half[16*2 + 17*2];\
1464
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1465
-    uint8_t * const halfHV= ((uint8_t*)half);\
1466
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1467
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1468
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1469
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1470
-}\
1471
-static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1472
-    uint64_t half[16*2 + 17*2];\
1473
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1474
-    uint8_t * const halfHV= ((uint8_t*)half);\
1475
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1476
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1477
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1478
-}\
1479
-static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1480
-    uint64_t half[16*2 + 17*2];\
1481
-    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1482
-    uint8_t * const halfHV= ((uint8_t*)half);\
1483
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1484
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1485
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1486
-}\
1487
-static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1488
-    uint64_t half[17*2];\
1489
-    uint8_t * const halfH= ((uint8_t*)half);\
1490
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1491
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1492
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1493
-}\
1494
-static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1495
-    uint64_t half[17*2];\
1496
-    uint8_t * const halfH= ((uint8_t*)half);\
1497
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1498
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1499
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1500
-}\
1501
-static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1502
-    uint64_t half[17*2];\
1503
-    uint8_t * const halfH= ((uint8_t*)half);\
1504
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1505
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1506
-}
1507
-
1508
-#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "        \n\t"
1509
-#define AVG_3DNOW_OP(a,b,temp, size) \
1510
-"mov" #size " " #b ", " #temp "   \n\t"\
1511
-"pavgusb " #temp ", " #a "        \n\t"\
1512
-"mov" #size " " #a ", " #b "      \n\t"
1513
-#define AVG_MMX2_OP(a,b,temp, size) \
1514
-"mov" #size " " #b ", " #temp "   \n\t"\
1515
-"pavgb " #temp ", " #a "          \n\t"\
1516
-"mov" #size " " #a ", " #b "      \n\t"
1517
-
1518
-QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
1519
-QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
1520
-QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1521
-QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
1522
-QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
1523
-QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1524
-QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
1525
-QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
1526
-QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
861
+#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
862
+                   in0, in1, in2, in7, out, OP)                           \
863
+    "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
864
+    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
865
+    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
866
+    "movq               "#in7", "#m3"   \n\t" /* d */                     \
867
+    "movq               "#in0", %%mm5   \n\t" /* D */                     \
868
+    "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
869
+    "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
870
+    "movq               "#in1", %%mm5   \n\t" /* C */                     \
871
+    "movq               "#in2", %%mm6   \n\t" /* B */                     \
872
+    "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
873
+    "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
874
+    "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
875
+    "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
876
+    "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
877
+    "paddw              "#rnd", %%mm4   \n\t" /* x2 */                    \
878
+    "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
879
+    "psraw                  $5, %%mm5   \n\t"                             \
880
+    "packuswb            %%mm5, %%mm5   \n\t"                             \
881
+    OP(%%mm5, out, %%mm7, d)
882
+
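
The macro above is the inner step of the vertical lowpass filter; per output pixel it evaluates the same 20/-6/3/-1 kernel that the scalar _3dnow fallbacks below spell out. A minimal sketch of one tap, assuming x1..x4 are the symmetric pair sums named in the asm comments (x1 innermost, x4 outermost) and rnd is ff_pw_16 for the rounding variants or ff_pw_15 for the no_rnd ones; the helper name and the av_clip_uint8() call are illustrative:

#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

static inline uint8_t qpel_lowpass_tap(int x1, int x2, int x3, int x4, int rnd)
{
    /* 20*x1 - 6*x2 + 3*x3 - x4, rounded, shifted down by 5 and
     * saturated to 0..255 just as packuswb does */
    return av_clip_uint8((20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5);
}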
883
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
884
+static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
885
+                                                  uint8_t *src,           \
886
+                                                  int dstStride,          \
887
+                                                  int srcStride,          \
888
+                                                  int h)                  \
889
+{                                                                         \
890
+    uint64_t temp;                                                        \
891
+                                                                          \
892
+    __asm__ volatile (                                                    \
893
+        "pxor      %%mm7, %%mm7             \n\t"                         \
894
+        "1:                                 \n\t"                         \
895
+        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
896
+        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
897
+        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
898
+        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
899
+        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
900
+        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
901
+        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
902
+        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
903
+        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
904
+        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
905
+        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
906
+        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
907
+        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
908
+        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
909
+        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
910
+        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
911
+        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
912
+        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
913
+        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
914
+        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
915
+        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
916
+        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
917
+        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
918
+        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
919
+        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
920
+        "paddw        %6, %%mm6             \n\t"                         \
921
+        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
922
+        "psraw        $5, %%mm0             \n\t"                         \
923
+        "movq      %%mm0, %5                \n\t"                         \
924
+        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
925
+                                                                          \
926
+        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
927
+        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
928
+        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
929
+        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
930
+        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
931
+        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
932
+        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
933
+        "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
934
+        "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
935
+        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
936
+        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
937
+        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
938
+        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
939
+        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
940
+        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
941
+        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
942
+        "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
943
+        "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
944
+        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
945
+        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
946
+        "paddw        %6, %%mm1             \n\t"                         \
947
+        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
948
+        "psraw        $5, %%mm3             \n\t"                         \
949
+        "movq         %5, %%mm1             \n\t"                         \
950
+        "packuswb  %%mm3, %%mm1             \n\t"                         \
951
+        OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
952
+        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
953
+                                                                          \
954
+        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
955
+        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
956
+        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
957
+        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
958
+        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
959
+        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
960
+        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
961
+        "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
962
+        "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
963
+        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
964
+        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
965
+        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
966
+        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
967
+        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
968
+        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
969
+        "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
970
+        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
971
+        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
972
+        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
973
+        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
974
+        "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
975
+        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
976
+        "paddw        %6, %%mm0             \n\t"                         \
977
+        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
978
+        "psraw        $5, %%mm0             \n\t"                         \
979
+        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
980
+        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
981
+                                                                          \
982
+        "paddw    %%mm5, %%mm3              \n\t" /* a */                 \
983
+        "pshufw   $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */          \
984
+        "paddw    %%mm4, %%mm6              \n\t" /* b */                 \
985
+        "pshufw   $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */          \
986
+        "pshufw   $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */          \
987
+        "paddw    %%mm1, %%mm4              \n\t" /* c */                 \
988
+        "paddw    %%mm2, %%mm5              \n\t" /* d */                 \
989
+        "paddw    %%mm6, %%mm6              \n\t" /* 2b */                \
990
+        "psubw    %%mm6, %%mm4              \n\t" /* c - 2b */            \
991
+        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
992
+        "pmullw  "MANGLE(ff_pw_3)", %%mm4   \n\t" /* 3c - 6b */           \
993
+        "psubw    %%mm5, %%mm3              \n\t" /* -6b + 3c - d */      \
994
+        "paddw       %6, %%mm4              \n\t"                         \
995
+        "paddw    %%mm3, %%mm4              \n\t" /* 20a - 6b + 3c - d */ \
996
+        "psraw       $5, %%mm4              \n\t"                         \
997
+        "packuswb %%mm4, %%mm0              \n\t"                         \
998
+        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
999
+                                                                          \
1000
+        "add         %3, %0                 \n\t"                         \
1001
+        "add         %4, %1                 \n\t"                         \
1002
+        "decl        %2                     \n\t"                         \
1003
+        "jnz         1b                     \n\t"                         \
1004
+        : "+a"(src), "+c"(dst), "+D"(h)                                   \
1005
+        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
1006
+          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
1007
+        : "memory"                                                        \
1008
+        );                                                                \
1009
+}                                                                         \
1010
+                                                                          \
1011
+static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,          \
1012
+                                                   uint8_t *src,          \
1013
+                                                   int dstStride,         \
1014
+                                                   int srcStride,         \
1015
+                                                   int h)                 \
1016
+{                                                                         \
1017
+    int i;                                                                \
1018
+    int16_t temp[16];                                                     \
1019
+    /* quick HACK, XXX FIXME MUST be optimized */                         \
1020
+    for (i = 0; i < h; i++) {                                             \
1021
+        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
1022
+                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);        \
1023
+        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
1024
+                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);        \
1025
+        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
1026
+                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);        \
1027
+        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
1028
+                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);        \
1029
+        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
1030
+                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);        \
1031
+        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
1032
+                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);        \
1033
+        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
1034
+                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);        \
1035
+        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 +   \
1036
+                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);        \
1037
+        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 +   \
1038
+                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);        \
1039
+        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 +   \
1040
+                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);        \
1041
+        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 +   \
1042
+                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);        \
1043
+        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 +   \
1044
+                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);        \
1045
+        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 +   \
1046
+                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);        \
1047
+        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 +   \
1048
+                   (src[11] + src[16]) *  3 - (src[10] + src[16]);        \
1049
+        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 +   \
1050
+                   (src[12] + src[16]) *  3 - (src[11] + src[15]);        \
1051
+        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 +   \
1052
+                   (src[13] + src[15]) *  3 - (src[12] + src[14]);        \
1053
+        __asm__ volatile (                                                \
1054
+            "movq      (%0), %%mm0          \n\t"                         \
1055
+            "movq     8(%0), %%mm1          \n\t"                         \
1056
+            "paddw       %2, %%mm0          \n\t"                         \
1057
+            "paddw       %2, %%mm1          \n\t"                         \
1058
+            "psraw       $5, %%mm0          \n\t"                         \
1059
+            "psraw       $5, %%mm1          \n\t"                         \
1060
+            "packuswb %%mm1, %%mm0          \n\t"                         \
1061
+            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
1062
+            "movq    16(%0), %%mm0          \n\t"                         \
1063
+            "movq    24(%0), %%mm1          \n\t"                         \
1064
+            "paddw       %2, %%mm0          \n\t"                         \
1065
+            "paddw       %2, %%mm1          \n\t"                         \
1066
+            "psraw       $5, %%mm0          \n\t"                         \
1067
+            "psraw       $5, %%mm1          \n\t"                         \
1068
+            "packuswb %%mm1, %%mm0          \n\t"                         \
1069
+            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                              \
1070
+            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
1071
+            : "memory"                                                    \
1072
+            );                                                            \
1073
+        dst += dstStride;                                                 \
1074
+        src += srcStride;                                                 \
1075
+    }                                                                     \
1076
+}                                                                         \
1077
+                                                                          \
1078
+static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
1079
+                                                 uint8_t *src,            \
1080
+                                                 int dstStride,           \
1081
+                                                 int srcStride,           \
1082
+                                                 int h)                   \
1083
+{                                                                         \
1084
+    __asm__ volatile (                                                    \
1085
+        "pxor      %%mm7, %%mm7             \n\t"                         \
1086
+        "1:                                 \n\t"                         \
1087
+        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
1088
+        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
1089
+        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
1090
+        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
1091
+        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
1092
+        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
1093
+        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
1094
+        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
1095
+        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
1096
+        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
1097
+        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
1098
+        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
1099
+        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
1100
+        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
1101
+        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
1102
+        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
1103
+        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
1104
+        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
1105
+        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
1106
+        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
1107
+        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
1108
+        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
1109
+        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
1110
+        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
1111
+        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
1112
+        "paddw        %5, %%mm6             \n\t"                         \
1113
+        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
1114
+        "psraw        $5, %%mm0             \n\t"                         \
1115
+        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
1116
+                                                                          \
1117
+        "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
1118
+        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
1119
+        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
1120
+        "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
1121
+        "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
1122
+        "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
1123
+        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
1124
+        "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
1125
+        "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
1126
+        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
1127
+        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
1128
+        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
1129
+        "pmullw  "MANGLE(ff_pw_3)", %%mm3   \n\t" /* 3c - 6b */           \
1130
+        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
1131
+        "paddw        %5, %%mm1             \n\t"                         \
1132
+        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
1133
+        "psraw        $5, %%mm3             \n\t"                         \
1134
+        "packuswb  %%mm3, %%mm0             \n\t"                         \
1135
+        OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
1136
+                                                                          \
1137
+        "add          %3, %0                \n\t"                         \
1138
+        "add          %4, %1                \n\t"                         \
1139
+        "decl         %2                    \n\t"                         \
1140
+        "jnz          1b                    \n\t"                         \
1141
+        : "+a"(src), "+c"(dst), "+d"(h)                                   \
1142
+        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
1143
+          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
1144
+        : "memory"                                                        \
1145
+        );                                                                \
1146
+}                                                                         \
1147
+                                                                          \
1148
+static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,           \
1149
+                                                  uint8_t *src,           \
1150
+                                                  int dstStride,          \
1151
+                                                  int srcStride,          \
1152
+                                                  int h)                  \
1153
+{                                                                         \
1154
+    int i;                                                                \
1155
+    int16_t temp[8];                                                      \
1156
+    /* quick HACK, XXX FIXME MUST be optimized */                         \
1157
+    for (i = 0; i < h; i++) {                                             \
1158
+        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +        \
1159
+                  (src[1] + src[3]) *  3 - (src[2] + src[4]);             \
1160
+        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +        \
1161
+                  (src[0] + src[4]) *  3 - (src[1] + src[5]);             \
1162
+        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +        \
1163
+                  (src[0] + src[5]) *  3 - (src[0] + src[6]);             \
1164
+        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +        \
1165
+                  (src[1] + src[6]) *  3 - (src[0] + src[7]);             \
1166
+        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +        \
1167
+                  (src[2] + src[7]) *  3 - (src[1] + src[8]);             \
1168
+        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +        \
1169
+                  (src[3] + src[8]) *  3 - (src[2] + src[8]);             \
1170
+        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +        \
1171
+                  (src[4] + src[8]) *  3 - (src[3] + src[7]);             \
1172
+        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +        \
1173
+                  (src[5] + src[7]) *  3 - (src[4] + src[6]);             \
1174
+        __asm__ volatile (                                                \
1175
+            "movq      (%0), %%mm0      \n\t"                             \
1176
+            "movq     8(%0), %%mm1      \n\t"                             \
1177
+            "paddw       %2, %%mm0      \n\t"                             \
1178
+            "paddw       %2, %%mm1      \n\t"                             \
1179
+            "psraw       $5, %%mm0      \n\t"                             \
1180
+            "psraw       $5, %%mm1      \n\t"                             \
1181
+            "packuswb %%mm1, %%mm0      \n\t"                             \
1182
+            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
1183
+            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
1184
+            : "memory"                                                    \
1185
+            );                                                            \
1186
+        dst += dstStride;                                                 \
1187
+        src += srcStride;                                                 \
1188
+    }                                                                     \
1189
+}
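
The unrolled temp[] tables above are the scalar half of the 3DNow! path: the C loop evaluates the MPEG-4 quarter-pel lowpass (20, -6, 3, -1)/32 with mirrored edges, and the inline asm only performs the rounding, shift and saturating pack. A minimal scalar sketch of the same filter (the function name and the reflect helper are assumptions for illustration, not part of this file):

    /* Mirror an index into the valid 0..8 range the way the unrolled
     * table does: src[-k] -> src[k-1], src[8+k] -> src[9-k]. */
    static inline int reflect8(int x)
    {
        if (x < 0) return -x - 1;
        if (x > 8) return 17 - x;
        return x;
    }

    /* Scalar equivalent of the qpel8 horizontal lowpass; 'rounder' is
     * 16 for the rounding variant and 15 for the no_rnd one. */
    static void mpeg4_qpel8_h_lowpass_sketch(uint8_t *dst, const uint8_t *src,
                                             int dstStride, int srcStride,
                                             int h, int rounder)
    {
        int i, x;
        for (i = 0; i < h; i++) {
            for (x = 0; x < 8; x++) {
                int a = src[reflect8(x)]     + src[reflect8(x + 1)];
                int b = src[reflect8(x - 1)] + src[reflect8(x + 2)];
                int c = src[reflect8(x - 2)] + src[reflect8(x + 3)];
                int d = src[reflect8(x - 3)] + src[reflect8(x + 4)];
                int v = (20 * a - 6 * b + 3 * c - d + rounder) >> 5;
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; /* packuswb */
            }
            dst += dstStride;
            src += srcStride;
        }
    }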
1190
+
1191
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
1192
+static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
1193
+                                                     uint8_t *src,      \
1194
+                                                     int dstStride,     \
1195
+                                                     int srcStride)     \
1196
+{                                                                       \
1197
+    uint64_t temp[17 * 4];                                              \
1198
+    uint64_t *temp_ptr = temp;                                          \
1199
+    int count = 17;                                                     \
1200
+                                                                        \
1201
+    /* FIXME unroll */                                                  \
1202
+    __asm__ volatile (                                                  \
1203
+        "pxor      %%mm7, %%mm7             \n\t"                       \
1204
+        "1:                                 \n\t"                       \
1205
+        "movq       (%0), %%mm0             \n\t"                       \
1206
+        "movq       (%0), %%mm1             \n\t"                       \
1207
+        "movq      8(%0), %%mm2             \n\t"                       \
1208
+        "movq      8(%0), %%mm3             \n\t"                       \
1209
+        "punpcklbw %%mm7, %%mm0             \n\t"                       \
1210
+        "punpckhbw %%mm7, %%mm1             \n\t"                       \
1211
+        "punpcklbw %%mm7, %%mm2             \n\t"                       \
1212
+        "punpckhbw %%mm7, %%mm3             \n\t"                       \
1213
+        "movq      %%mm0, (%1)              \n\t"                       \
1214
+        "movq      %%mm1, 17 * 8(%1)        \n\t"                       \
1215
+        "movq      %%mm2, 2 * 17 * 8(%1)    \n\t"                       \
1216
+        "movq      %%mm3, 3 * 17 * 8(%1)    \n\t"                       \
1217
+        "add          $8, %1                \n\t"                       \
1218
+        "add          %3, %0                \n\t"                       \
1219
+        "decl         %2                    \n\t"                       \
1220
+        "jnz          1b                    \n\t"                       \
1221
+        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
1222
+        : "r"((x86_reg)srcStride)                                       \
1223
+        : "memory"                                                      \
1224
+        );                                                              \
1225
+                                                                        \
1226
+    temp_ptr = temp;                                                    \
1227
+    count    = 4;                                                       \
1228
+                                                                        \
1229
+    /* FIXME reorder for speed */                                       \
1230
+    __asm__ volatile (                                                  \
1231
+        /* "pxor  %%mm7, %%mm7            \n\t" */                      \
1232
+        "1:                             \n\t"                           \
1233
+        "movq    (%0), %%mm0            \n\t"                           \
1234
+        "movq   8(%0), %%mm1            \n\t"                           \
1235
+        "movq  16(%0), %%mm2            \n\t"                           \
1236
+        "movq  24(%0), %%mm3            \n\t"                           \
1237
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1),     OP) \
1238
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
1239
+        "add       %4, %1               \n\t"                           \
1240
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1),     OP) \
1241
+                                                                        \
1242
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
1243
+        "add       %4, %1               \n\t"                           \
1244
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1),     OP) \
1245
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
1246
+        "add       %4, %1               \n\t"                           \
1247
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1),     OP) \
1248
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
1249
+        "add       %4, %1               \n\t"                           \
1250
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1),     OP) \
1251
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
1252
+        "add       %4, %1               \n\t"                           \
1253
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1),     OP) \
1254
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
1255
+        "add       %4, %1               \n\t"                           \
1256
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1),     OP) \
1257
+                                                                        \
1258
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
1259
+        "add       %4, %1               \n\t"                           \
1260
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1),     OP) \
1261
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1262
+                                                                        \
1263
+        "add     $136, %0               \n\t"                           \
1264
+        "add       %6, %1               \n\t"                           \
1265
+        "decl      %2                   \n\t"                           \
1266
+        "jnz       1b                   \n\t"                           \
1267
+                                                                        \
1268
+        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
1269
+        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
1270
+          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
1271
+          "g"(4 - 14 * (x86_reg)dstStride)                              \
1272
+        : "memory"                                                      \
1273
+        );                                                              \
1274
+}                                                                       \
1275
+                                                                        \
1276
+static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
1277
+                                                    uint8_t *src,       \
1278
+                                                    int dstStride,      \
1279
+                                                    int srcStride)      \
1280
+{                                                                       \
1281
+    uint64_t temp[9 * 2];                                               \
1282
+    uint64_t *temp_ptr = temp;                                          \
1283
+    int count = 9;                                                      \
1284
+                                                                        \
1285
+    /* FIXME unroll */                                                  \
1286
+    __asm__ volatile (                                                  \
1287
+        "pxor      %%mm7, %%mm7         \n\t"                           \
1288
+        "1:                             \n\t"                           \
1289
+        "movq       (%0), %%mm0         \n\t"                           \
1290
+        "movq       (%0), %%mm1         \n\t"                           \
1291
+        "punpcklbw %%mm7, %%mm0         \n\t"                           \
1292
+        "punpckhbw %%mm7, %%mm1         \n\t"                           \
1293
+        "movq      %%mm0, (%1)          \n\t"                           \
1294
+        "movq      %%mm1, 9*8(%1)       \n\t"                           \
1295
+        "add          $8, %1            \n\t"                           \
1296
+        "add          %3, %0            \n\t"                           \
1297
+        "decl         %2                \n\t"                           \
1298
+        "jnz          1b                \n\t"                           \
1299
+        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
1300
+        : "r"((x86_reg)srcStride)                                       \
1301
+        : "memory"                                                      \
1302
+        );                                                              \
1303
+                                                                        \
1304
+    temp_ptr = temp;                                                    \
1305
+    count    = 2;                                                       \
1306
+                                                                        \
1307
+    /* FIXME reorder for speed */                                       \
1308
+    __asm__ volatile (                                                  \
1309
+        /* "pxor  %%mm7, %%mm7            \n\t" */                      \
1310
+        "1:                             \n\t"                           \
1311
+        "movq    (%0), %%mm0            \n\t"                           \
1312
+        "movq   8(%0), %%mm1            \n\t"                           \
1313
+        "movq  16(%0), %%mm2            \n\t"                           \
1314
+        "movq  24(%0), %%mm3            \n\t"                           \
1315
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
1316
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
1317
+        "add       %4, %1               \n\t"                           \
1318
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
1319
+                                                                        \
1320
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1321
+        "add       %4, %1               \n\t"                           \
1322
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
1323
+                                                                        \
1324
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1325
+        "add       %4, %1               \n\t"                           \
1326
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
1327
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1328
+                                                                        \
1329
+        "add      $72, %0               \n\t"                           \
1330
+        "add       %6, %1               \n\t"                           \
1331
+        "decl      %2                   \n\t"                           \
1332
+        "jnz       1b                   \n\t"                           \
1333
+                                                                        \
1334
+        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
1335
+        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
1336
+          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
1337
+          "g"(4 - 6 * (x86_reg)dstStride)                               \
1338
+        : "memory"                                                      \
1339
+        );                                                              \
1340
+}                                                                       \
1341
+                                                                        \
1342
+static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, uint8_t *src,    \
1343
+                                          int stride)                   \
1344
+{                                                                       \
1345
+    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
1346
+}                                                                       \
1347
+                                                                        \
1348
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
1349
+                                         int stride)                    \
1350
+{                                                                       \
1351
+    uint64_t temp[8];                                                   \
1352
+    uint8_t * const half = (uint8_t*)temp;                              \
1353
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
1354
+                                                stride, 8);             \
1355
+    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
1356
+}                                                                       \
1357
+                                                                        \
1358
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
1359
+                                         int stride)                    \
1360
+{                                                                       \
1361
+    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
1362
+                                            stride, 8);                 \
1363
+}                                                                       \
1364
+                                                                        \
1365
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
1366
+                                         int stride)                    \
1367
+{                                                                       \
1368
+    uint64_t temp[8];                                                   \
1369
+    uint8_t * const half = (uint8_t*)temp;                              \
1370
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
1371
+                                                stride, 8);             \
1372
+    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
1373
+                                 stride, 8);                            \
1374
+}                                                                       \
1375
+                                                                        \
1376
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
1377
+                                         int stride)                    \
1378
+{                                                                       \
1379
+    uint64_t temp[8];                                                   \
1380
+    uint8_t * const half = (uint8_t*)temp;                              \
1381
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
1382
+    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
1383
+}                                                                       \
1384
+                                                                        \
1385
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
1386
+                                         int stride)                    \
1387
+{                                                                       \
1388
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
1389
+}                                                                       \
1390
+                                                                        \
1391
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
1392
+                                         int stride)                    \
1393
+{                                                                       \
1394
+    uint64_t temp[8];                                                   \
1395
+    uint8_t * const half = (uint8_t*)temp;                              \
1396
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
1397
+    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
1398
+                                 stride, 8);                            \
1399
+}                                                                       \
1400
+                                                                        \
1401
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
1402
+                                         int stride)                    \
1403
+{                                                                       \
1404
+    uint64_t half[8 + 9];                                               \
1405
+    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1406
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1407
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1408
+                                                stride, 9);             \
1409
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1410
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1411
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1412
+}                                                                       \
1413
+                                                                        \
1414
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
1415
+                                         int stride)                    \
1416
+{                                                                       \
1417
+    uint64_t half[8 + 9];                                               \
1418
+    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1419
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1420
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1421
+                                                stride, 9);             \
1422
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1423
+                                     stride, 9);                        \
1424
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1425
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1426
+}                                                                       \
1427
+                                                                        \
1428
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
1429
+                                         int stride)                    \
1430
+{                                                                       \
1431
+    uint64_t half[8 + 9];                                               \
1432
+    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1433
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1434
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1435
+                                                stride, 9);             \
1436
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1437
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1438
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1439
+}                                                                       \
1440
+                                                                        \
1441
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
1442
+                                         int stride)                    \
1443
+{                                                                       \
1444
+    uint64_t half[8 + 9];                                               \
1445
+    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1446
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1447
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1448
+                                                stride, 9);             \
1449
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1450
+                                     stride, 9);                        \
1451
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1452
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1453
+}                                                                       \
1454
+                                                                        \
1455
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
1456
+                                         int stride)                    \
1457
+{                                                                       \
1458
+    uint64_t half[8 + 9];                                               \
1459
+    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1460
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1461
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1462
+                                                stride, 9);             \
1463
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1464
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1465
+}                                                                       \
1466
+                                                                        \
1467
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
1468
+                                         int stride)                    \
1469
+{                                                                       \
1470
+    uint64_t half[8 + 9];                                               \
1471
+    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1472
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1473
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1474
+                                                stride, 9);             \
1475
+    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1476
+    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1477
+}                                                                       \
1478
+                                                                        \
1479
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
1480
+                                         int stride)                    \
1481
+{                                                                       \
1482
+    uint64_t half[8 + 9];                                               \
1483
+    uint8_t * const halfH = ((uint8_t*)half);                           \
1484
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1485
+                                                stride, 9);             \
1486
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1487
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1488
+}                                                                       \
1489
+                                                                        \
1490
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
1491
+                                         int stride)                    \
1492
+{                                                                       \
1493
+    uint64_t half[8 + 9];                                               \
1494
+    uint8_t * const halfH = ((uint8_t*)half);                           \
1495
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1496
+                                                stride, 9);             \
1497
+    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1498
+                                     stride, 9);                        \
1499
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1500
+}                                                                       \
1501
+                                                                        \
1502
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
1503
+                                         int stride)                    \
1504
+{                                                                       \
1505
+    uint64_t half[9];                                                   \
1506
+    uint8_t * const halfH = ((uint8_t*)half);                           \
1507
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1508
+                                                stride, 9);             \
1509
+    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1510
+}                                                                       \
1511
+                                                                        \
1512
+static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, uint8_t *src,   \
1513
+                                           int stride)                  \
1514
+{                                                                       \
1515
+    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
1516
+}                                                                       \
1517
+                                                                        \
1518
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
1519
+                                          int stride)                   \
1520
+{                                                                       \
1521
+    uint64_t temp[32];                                                  \
1522
+    uint8_t * const half = (uint8_t*)temp;                              \
1523
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
1524
+                                                 stride, 16);           \
1525
+    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
1526
+}                                                                       \
1527
+                                                                        \
1528
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
1529
+                                          int stride)                   \
1530
+{                                                                       \
1531
+    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
1532
+                                             stride, stride, 16);       \
1533
+}                                                                       \
1534
+                                                                        \
1535
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
1536
+                                          int stride)                   \
1537
+{                                                                       \
1538
+    uint64_t temp[32];                                                  \
1539
+    uint8_t * const half = (uint8_t*)temp;                              \
1540
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
1541
+                                                 stride, 16);           \
1542
+    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
1543
+                                  stride, stride, 16);                  \
1544
+}                                                                       \
1545
+                                                                        \
1546
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
1547
+                                          int stride)                   \
1548
+{                                                                       \
1549
+    uint64_t temp[32];                                                  \
1550
+    uint8_t * const half = (uint8_t*)temp;                              \
1551
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
1552
+                                                 stride);               \
1553
+    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
1554
+}                                                                       \
1555
+                                                                        \
1556
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
1557
+                                          int stride)                   \
1558
+{                                                                       \
1559
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1560
+}                                                                       \
1561
+                                                                        \
1562
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
1563
+                                          int stride)                   \
1564
+{                                                                       \
1565
+    uint64_t temp[32];                                                  \
1566
+    uint8_t * const half = (uint8_t*)temp;                              \
1567
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
1568
+                                                 stride);               \
1569
+    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,              \
1570
+                                  stride, stride, 16);                  \
1571
+}                                                                       \
1572
+                                                                        \
1573
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
1574
+                                          int stride)                   \
1575
+{                                                                       \
1576
+    uint64_t half[16 * 2 + 17 * 2];                                     \
1577
+    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1578
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1579
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1580
+                                                 stride, 17);           \
1581
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1582
+                                      stride, 17);                      \
1583
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1584
+                                                 16, 16);               \
1585
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1586
+}                                                                       \
1587
+                                                                        \
1588
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
1589
+                                          int stride)                   \
1590
+{                                                                       \
1591
+    uint64_t half[16 * 2 + 17 * 2];                                     \
1592
+    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1593
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1594
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1595
+                                                 stride, 17);           \
1596
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1597
+                                      stride, 17);                      \
1598
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1599
+                                                 16, 16);               \
1600
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1601
+}                                                                       \
1602
+                                                                        \
1603
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
1604
+                                          int stride)                   \
1605
+{                                                                       \
1606
+    uint64_t half[16 * 2 + 17 * 2];                                     \
1607
+    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1608
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1609
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1610
+                                                 stride, 17);           \
1611
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1612
+                                      stride, 17);                      \
1613
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1614
+                                                 16, 16);               \
1615
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1616
+                                  16, 16);                              \
1617
+}                                                                       \
1618
+                                                                        \
1619
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
1620
+                                          int stride)                   \
1621
+{                                                                       \
1622
+    uint64_t half[16 * 2 + 17 * 2];                                     \
1623
+    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1624
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1625
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1626
+                                                 stride, 17);           \
1627
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1628
+                                      stride, 17);                      \
1629
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1630
+                                                 16, 16);               \
1631
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1632
+                                  16, 16);                              \
1633
+}                                                                       \
1634
+                                                                        \
1635
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
1636
+                                          int stride)                   \
1637
+{                                                                       \
1638
+    uint64_t half[16 * 2 + 17 * 2];                                     \
1639
+    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1640
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1641
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1642
+                                                 stride, 17);           \
1643
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1644
+                                                 16, 16);               \
1645
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1646
+}                                                                       \
1647
+                                                                        \
1648
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
1649
+                                          int stride)                   \
1650
+{                                                                       \
1651
+    uint64_t half[16 * 2 + 17 * 2];                                     \
1652
+    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1653
+    uint8_t * const halfHV = ((uint8_t*)half);                          \
1654
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1655
+                                                 stride, 17);           \
1656
+    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1657
+                                                 16, 16);               \
1658
+    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1659
+                                  16, 16);                              \
1660
+}                                                                       \
1661
+                                                                        \
1662
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
1663
+                                          int stride)                   \
1664
+{                                                                       \
1665
+    uint64_t half[17 * 2];                                              \
1666
+    uint8_t * const halfH = ((uint8_t*)half);                           \
1667
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1668
+                                                 stride, 17);           \
1669
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1670
+                                      stride, 17);                      \
1671
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1672
+}                                                                       \
1673
+                                                                        \
1674
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
1675
+                                          int stride)                   \
1676
+{                                                                       \
1677
+    uint64_t half[17 * 2];                                              \
1678
+    uint8_t * const halfH = ((uint8_t*)half);                           \
1679
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1680
+                                                 stride, 17);           \
1681
+    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1682
+                                      stride, 17);                      \
1683
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1684
+}                                                                       \
1685
+                                                                        \
1686
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
1687
+                                          int stride)                   \
1688
+{                                                                       \
1689
+    uint64_t half[17 * 2];                                              \
1690
+    uint8_t * const halfH = ((uint8_t*)half);                           \
1691
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1692
+                                                 stride, 17);           \
1693
+    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1694
+}
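
In the mc## names generated above, the first digit is the horizontal and the second the vertical quarter-pel phase; each position is assembled from the h/v lowpass primitives plus an l2 merge with a suitably offset source or intermediate. The pixels*_l2 helpers live elsewhere in this file; their assumed per-byte behaviour is a rounded average (the no_rnd variants drop the +1), roughly:

    /* Sketch of put_pixels8_l2 as used by the mc## functions above
     * (assumed semantics; the second source is a packed temporary
     * with a fixed 8-byte stride). */
    static void put_pixels8_l2_sketch(uint8_t *dst, const uint8_t *src1,
                                      const uint8_t *src2, int dstStride,
                                      int src1Stride, int h)
    {
        int i, x;
        for (i = 0; i < h; i++) {
            for (x = 0; x < 8; x++)
                dst[x] = (src1[x] + src2[x] + 1) >> 1; /* pavgb rounding */
            dst  += dstStride;
            src1 += src1Stride;
            src2 += 8;
        }
    }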
1695
+
1696
+#define PUT_OP(a, b, temp, size)                \
1697
+    "mov"#size"        "#a", "#b"       \n\t"
1698
+
1699
+#define AVG_3DNOW_OP(a, b, temp, size)          \
1700
+    "mov"#size"        "#b", "#temp"    \n\t"   \
1701
+    "pavgusb        "#temp", "#a"       \n\t"   \
1702
+    "mov"#size"        "#a", "#b"       \n\t"
1703
+
1704
+#define AVG_MMX2_OP(a, b, temp, size)           \
1705
+    "mov"#size"        "#b", "#temp"    \n\t"   \
1706
+    "pavgb          "#temp", "#a"       \n\t"   \
1707
+    "mov"#size"        "#a", "#b"       \n\t"
1708
+
1709
+QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,       PUT_OP)
1710
+QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP,  AVG_3DNOW_OP)
1711
+QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,       PUT_OP)
1712
+QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
1713
+QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
1714
+QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
1715
+QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
1716
+QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
1717
+QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
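
The nine instantiations differ only in the destination op and the rounding constant fed to the final >> 5: ff_pw_16 gives round-to-nearest, while the put_no_rnd_ variants use ff_pw_15 so truncation is biased downward, as required by the no-rounding motion-compensation mode:

    /* Normalization applied after the lowpass sum (sketch):
     *   rounding:  dst = (20*a - 6*b + 3*c - d + 16) >> 5
     *   no_rnd:    dst = (20*a - 6*b + 3*c - d + 15) >> 5 */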
1527 1718
 
1528 1719
 /***********************************/
1529 1720
 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
1530 1721
 
1531
-#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1532
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1533
-    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1534
-}
1535
-#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1536
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1537
-    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
1538
-}
1539
-
1540
-#define QPEL_2TAP(OPNAME, SIZE, MMX)\
1541
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1542
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1543
-QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1544
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1545
-                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1546
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1547
-                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1548
-static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1549
-                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1550
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1551
-    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1552
-}\
1553
-static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1554
-    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1555
-}\
1556
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
1557
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
1558
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
1559
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
1560
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
1561
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
1562
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
1563
-QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
1722
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
1723
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1724
+                                                                 uint8_t *src, \
1725
+                                                                 int stride)   \
1726
+{                                                                              \
1727
+    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
1728
+}
1729
+
1730
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
1731
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1732
+                                                                 uint8_t *src, \
1733
+                                                                 int stride)   \
1734
+{                                                                              \
1735
+    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
1736
+                                               S1, S2);                        \
1737
+}
1738
+
1739
+#define QPEL_2TAP(OPNAME, SIZE, MMX)                                        \
1740
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                            \
1741
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                            \
1742
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                               \
1743
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =    \
1744
+    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                \
1745
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =    \
1746
+    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                           \
1747
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =    \
1748
+    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                           \
1749
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,      \
1750
+                                                         uint8_t *src,      \
1751
+                                                         int stride)        \
1752
+{                                                                           \
1753
+    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);    \
1754
+}                                                                           \
1755
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,      \
1756
+                                                         uint8_t *src,      \
1757
+                                                         int stride)        \
1758
+{                                                                           \
1759
+    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,              \
1760
+                                            stride, SIZE);                  \
1761
+}                                                                           \
1762
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)                \
1763
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)                \
1764
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)                \
1765
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)                \
1766
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)                \
1767
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)                \
1768
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)                \
1769
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                \
1564 1770
 
1565 1771
 QPEL_2TAP(put_, 16, mmx2)
1566 1772
 QPEL_2TAP(avg_, 16, mmx2)
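[Note: these 2-tap functions trade compliance for speed, approximating each quarter-pel position with one or two simple averages instead of the codec's full interpolation filter. The mc20 case, for instance, reduces to the rounded horizontal half-pel average; a scalar sketch (hypothetical helper name, with the size passed explicitly for brevity):

    /* scalar model of the _x2_ half-pel average behind the mc20 case;
     * the _no_rnd_ flavours drop the +1 */
    static void put_pixels_x2_sketch(uint8_t *dst, const uint8_t *src,
                                     int stride, int h, int size)
    {
        int x, y;
        for (y = 0; y < h; y++) {
            for (x = 0; x < size; x++)
                dst[x] = (src[x] + src[x + 1] + 1) >> 1;
            dst += stride;
            src += stride;
        }
    }
]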
... ...
@@ -1573,265 +1794,276 @@ QPEL_2TAP(avg_,  8, 3dnow)
1573 1573
 
1574 1574
 
1575 1575
 #if HAVE_YASM
1576
-typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1577
-                                 x86_reg linesize, x86_reg start_y,
1578
-                                 x86_reg end_y, x86_reg block_h,
1579
-                                 x86_reg start_x, x86_reg end_x,
1580
-                                 x86_reg block_w);
1576
+typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
1577
+                                x86_reg linesize, x86_reg start_y,
1578
+                                x86_reg end_y, x86_reg block_h,
1579
+                                x86_reg start_x, x86_reg end_x,
1580
+                                x86_reg block_w);
1581 1581
 extern emu_edge_core_func ff_emu_edge_core_mmx;
1582 1582
 extern emu_edge_core_func ff_emu_edge_core_sse;
1583 1583
 
1584
-static av_always_inline
1585
-void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1586
-                      int block_w, int block_h,
1587
-                      int src_x, int src_y, int w, int h,
1588
-                      emu_edge_core_func *core_fn)
1584
+static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
1585
+                                              int linesize,
1586
+                                              int block_w, int block_h,
1587
+                                              int src_x, int src_y,
1588
+                                              int w, int h,
1589
+                                              emu_edge_core_func *core_fn)
1589 1590
 {
1590
-    int start_y, start_x, end_y, end_x, src_y_add=0;
1591
-
1592
-    if(src_y>= h){
1593
-        src_y_add = h-1-src_y;
1594
-        src_y=h-1;
1595
-    }else if(src_y<=-block_h){
1596
-        src_y_add = 1-block_h-src_y;
1597
-        src_y=1-block_h;
1591
+    int start_y, start_x, end_y, end_x, src_y_add = 0;
1592
+
1593
+    if (src_y >= h) {
1594
+        src_y_add = h - 1 - src_y;
1595
+        src_y     = h - 1;
1596
+    } else if (src_y <= -block_h) {
1597
+        src_y_add = 1 - block_h - src_y;
1598
+        src_y     = 1 - block_h;
1598 1599
     }
1599
-    if(src_x>= w){
1600
-        src+= (w-1-src_x);
1601
-        src_x=w-1;
1602
-    }else if(src_x<=-block_w){
1603
-        src+= (1-block_w-src_x);
1604
-        src_x=1-block_w;
1600
+    if (src_x >= w) {
1601
+        src   += w - 1 - src_x;
1602
+        src_x  = w - 1;
1603
+    } else if (src_x <= -block_w) {
1604
+        src   += 1 - block_w - src_x;
1605
+        src_x  = 1 - block_w;
1605 1606
     }
1606 1607
 
1607
-    start_y= FFMAX(0, -src_y);
1608
-    start_x= FFMAX(0, -src_x);
1609
-    end_y= FFMIN(block_h, h-src_y);
1610
-    end_x= FFMIN(block_w, w-src_x);
1608
+    start_y = FFMAX(0, -src_y);
1609
+    start_x = FFMAX(0, -src_x);
1610
+    end_y   = FFMIN(block_h, h-src_y);
1611
+    end_x   = FFMIN(block_w, w-src_x);
1611 1612
     assert(start_x < end_x && block_w > 0);
1612 1613
     assert(start_y < end_y && block_h > 0);
1613 1614
 
1614 1615
     // fill in the to-be-copied part plus all above/below
1615
-    src += (src_y_add+start_y)*linesize + start_x;
1616
+    src += (src_y_add + start_y) * linesize + start_x;
1616 1617
     buf += start_x;
1617
-    core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
1618
+    core_fn(buf, src, linesize, start_y, end_y,
1619
+            block_h, start_x, end_x, block_w);
1618 1620
 }
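[Note: emulated_edge_mc services motion vectors that reach outside the picture. The C wrapper clamps the source position and computes which rows/columns are actually available; the yasm core then copies that part and replicates the border outward. Conceptually (a much-simplified scalar model with src taken as the picture origin; the real routine works incrementally from the precomputed start/end bounds):

    /* every out-of-picture access degenerates to the nearest edge pixel */
    int x, y;
    for (y = 0; y < block_h; y++)
        for (x = 0; x < block_w; x++)
            buf[y * linesize + x] =
                src[av_clip(src_y + y, 0, h - 1) * linesize +
                    av_clip(src_x + x, 0, w - 1)];
]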
1619 1621
 
1620 1622
 #if ARCH_X86_32
1621
-static av_noinline
1622
-void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1623
-                          int block_w, int block_h,
1624
-                          int src_x, int src_y, int w, int h)
1623
+static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1624
+                                             int linesize,
1625
+                                             int block_w, int block_h,
1626
+                                             int src_x, int src_y, int w, int h)
1625 1627
 {
1626 1628
     emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1627 1629
                      w, h, &ff_emu_edge_core_mmx);
1628 1630
 }
1629 1631
 #endif
1630
-static av_noinline
1631
-void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1632
-                          int block_w, int block_h,
1633
-                          int src_x, int src_y, int w, int h)
1632
+
1633
+static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1634
+                                             int linesize,
1635
+                                             int block_w, int block_h,
1636
+                                             int src_x, int src_y, int w, int h)
1634 1637
 {
1635 1638
     emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1636 1639
                      w, h, &ff_emu_edge_core_sse);
1637 1640
 }
1638 1641
 #endif /* HAVE_YASM */
1639 1642
 
1640
-typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1641
-                                    int linesize, int block_w, int block_h,
1642
-                                    int src_x, int src_y, int w, int h);
1643
-
1644
-static av_always_inline
1645
-void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1646
-         int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1647
-         emulated_edge_mc_func *emu_edge_fn)
1648
-{
1649
-    const int w = 8;
1650
-    const int ix = ox>>(16+shift);
1651
-    const int iy = oy>>(16+shift);
1652
-    const int oxs = ox>>4;
1653
-    const int oys = oy>>4;
1654
-    const int dxxs = dxx>>4;
1655
-    const int dxys = dxy>>4;
1656
-    const int dyxs = dyx>>4;
1657
-    const int dyys = dyy>>4;
1658
-    const uint16_t r4[4] = {r,r,r,r};
1659
-    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1660
-    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1661
-    const uint64_t shift2 = 2*shift;
1662
-    uint8_t edge_buf[(h+1)*stride];
1643
+typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1644
+                                   int linesize, int block_w, int block_h,
1645
+                                   int src_x, int src_y, int w, int h);
1646
+
1647
+static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1648
+                                 int stride, int h, int ox, int oy,
1649
+                                 int dxx, int dxy, int dyx, int dyy,
1650
+                                 int shift, int r, int width, int height,
1651
+                                 emulated_edge_mc_func *emu_edge_fn)
1652
+{
1653
+    const int w    = 8;
1654
+    const int ix   = ox  >> (16 + shift);
1655
+    const int iy   = oy  >> (16 + shift);
1656
+    const int oxs  = ox  >> 4;
1657
+    const int oys  = oy  >> 4;
1658
+    const int dxxs = dxx >> 4;
1659
+    const int dxys = dxy >> 4;
1660
+    const int dyxs = dyx >> 4;
1661
+    const int dyys = dyy >> 4;
1662
+    const uint16_t r4[4]   = { r, r, r, r };
1663
+    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1664
+    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1665
+    const uint64_t shift2 = 2 * shift;
1666
+    uint8_t edge_buf[(h + 1) * stride];
1663 1667
     int x, y;
1664 1668
 
1665
-    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1666
-    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1667
-    const int dxh = dxy*(h-1);
1668
-    const int dyw = dyx*(w-1);
1669
-    if( // non-constant fullpel offset (3% of blocks)
1670
-        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1671
-         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1669
+    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1670
+    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1671
+    const int dxh = dxy * (h - 1);
1672
+    const int dyw = dyx * (w - 1);
1673
+    if ( // non-constant fullpel offset (3% of blocks)
1674
+        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1675
+         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1672 1676
         // uses more than 16 bits of subpel mv (only at huge resolution)
1673
-        || (dxx|dxy|dyx|dyy)&15 )
1674
-    {
1675
-        //FIXME could still use mmx for some of the rows
1676
-        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1677
+        || (dxx | dxy | dyx | dyy) & 15) {
1678
+        // FIXME could still use mmx for some of the rows
1679
+        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1680
+                 shift, r, width, height);
1677 1681
         return;
1678 1682
     }
1679 1683
 
1680
-    src += ix + iy*stride;
1681
-    if( (unsigned)ix >= width-w ||
1682
-        (unsigned)iy >= height-h )
1683
-    {
1684
-        emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
1684
+    src += ix + iy * stride;
1685
+    if ((unsigned)ix >= width  - w ||
1686
+        (unsigned)iy >= height - h) {
1687
+        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1685 1688
         src = edge_buf;
1686 1689
     }
1687 1690
 
1688
-    __asm__ volatile(
1689
-        "movd         %0, %%mm6 \n\t"
1690
-        "pxor      %%mm7, %%mm7 \n\t"
1691
-        "punpcklwd %%mm6, %%mm6 \n\t"
1692
-        "punpcklwd %%mm6, %%mm6 \n\t"
1691
+    __asm__ volatile (
1692
+        "movd         %0, %%mm6         \n\t"
1693
+        "pxor      %%mm7, %%mm7         \n\t"
1694
+        "punpcklwd %%mm6, %%mm6         \n\t"
1695
+        "punpcklwd %%mm6, %%mm6         \n\t"
1693 1696
         :: "r"(1<<shift)
1694 1697
     );
1695 1698
 
1696
-    for(x=0; x<w; x+=4){
1697
-        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1698
-                            oxs - dxys + dxxs*(x+1),
1699
-                            oxs - dxys + dxxs*(x+2),
1700
-                            oxs - dxys + dxxs*(x+3) };
1701
-        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1702
-                            oys - dyys + dyxs*(x+1),
1703
-                            oys - dyys + dyxs*(x+2),
1704
-                            oys - dyys + dyxs*(x+3) };
1705
-
1706
-        for(y=0; y<h; y++){
1707
-            __asm__ volatile(
1708
-                "movq   %0,  %%mm4 \n\t"
1709
-                "movq   %1,  %%mm5 \n\t"
1710
-                "paddw  %2,  %%mm4 \n\t"
1711
-                "paddw  %3,  %%mm5 \n\t"
1712
-                "movq   %%mm4, %0  \n\t"
1713
-                "movq   %%mm5, %1  \n\t"
1714
-                "psrlw  $12, %%mm4 \n\t"
1715
-                "psrlw  $12, %%mm5 \n\t"
1699
+    for (x = 0; x < w; x += 4) {
1700
+        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1701
+                            oxs - dxys + dxxs * (x + 1),
1702
+                            oxs - dxys + dxxs * (x + 2),
1703
+                            oxs - dxys + dxxs * (x + 3) };
1704
+        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1705
+                            oys - dyys + dyxs * (x + 1),
1706
+                            oys - dyys + dyxs * (x + 2),
1707
+                            oys - dyys + dyxs * (x + 3) };
1708
+
1709
+        for (y = 0; y < h; y++) {
1710
+            __asm__ volatile (
1711
+                "movq      %0, %%mm4    \n\t"
1712
+                "movq      %1, %%mm5    \n\t"
1713
+                "paddw     %2, %%mm4    \n\t"
1714
+                "paddw     %3, %%mm5    \n\t"
1715
+                "movq   %%mm4, %0       \n\t"
1716
+                "movq   %%mm5, %1       \n\t"
1717
+                "psrlw    $12, %%mm4    \n\t"
1718
+                "psrlw    $12, %%mm5    \n\t"
1716 1719
                 : "+m"(*dx4), "+m"(*dy4)
1717 1720
                 : "m"(*dxy4), "m"(*dyy4)
1718 1721
             );
1719 1722
 
1720
-            __asm__ volatile(
1721
-                "movq   %%mm6, %%mm2 \n\t"
1722
-                "movq   %%mm6, %%mm1 \n\t"
1723
-                "psubw  %%mm4, %%mm2 \n\t"
1724
-                "psubw  %%mm5, %%mm1 \n\t"
1725
-                "movq   %%mm2, %%mm0 \n\t"
1726
-                "movq   %%mm4, %%mm3 \n\t"
1727
-                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1728
-                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1729
-                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1730
-                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1731
-
1732
-                "movd   %4,    %%mm5 \n\t"
1733
-                "movd   %3,    %%mm4 \n\t"
1723
+            __asm__ volatile (
1724
+                "movq      %%mm6, %%mm2 \n\t"
1725
+                "movq      %%mm6, %%mm1 \n\t"
1726
+                "psubw     %%mm4, %%mm2 \n\t"
1727
+                "psubw     %%mm5, %%mm1 \n\t"
1728
+                "movq      %%mm2, %%mm0 \n\t"
1729
+                "movq      %%mm4, %%mm3 \n\t"
1730
+                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1731
+                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
1732
+                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
1733
+                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)
1734
+
1735
+                "movd         %4, %%mm5 \n\t"
1736
+                "movd         %3, %%mm4 \n\t"
1734 1737
                 "punpcklbw %%mm7, %%mm5 \n\t"
1735 1738
                 "punpcklbw %%mm7, %%mm4 \n\t"
1736
-                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1737
-                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1739
+                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1740
+                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1738 1741
 
1739
-                "movd   %2,    %%mm5 \n\t"
1740
-                "movd   %1,    %%mm4 \n\t"
1742
+                "movd         %2, %%mm5 \n\t"
1743
+                "movd         %1, %%mm4 \n\t"
1741 1744
                 "punpcklbw %%mm7, %%mm5 \n\t"
1742 1745
                 "punpcklbw %%mm7, %%mm4 \n\t"
1743
-                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1744
-                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1745
-                "paddw  %5,    %%mm1 \n\t"
1746
-                "paddw  %%mm3, %%mm2 \n\t"
1747
-                "paddw  %%mm1, %%mm0 \n\t"
1748
-                "paddw  %%mm2, %%mm0 \n\t"
1749
-
1750
-                "psrlw    %6,    %%mm0 \n\t"
1751
-                "packuswb %%mm0, %%mm0 \n\t"
1752
-                "movd     %%mm0, %0    \n\t"
1753
-
1754
-                : "=m"(dst[x+y*stride])
1746
+                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1747
+                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1748
+                "paddw        %5, %%mm1 \n\t"
1749
+                "paddw     %%mm3, %%mm2 \n\t"
1750
+                "paddw     %%mm1, %%mm0 \n\t"
1751
+                "paddw     %%mm2, %%mm0 \n\t"
1752
+
1753
+                "psrlw        %6, %%mm0 \n\t"
1754
+                "packuswb  %%mm0, %%mm0 \n\t"
1755
+                "movd      %%mm0, %0    \n\t"
1756
+
1757
+                : "=m"(dst[x + y * stride])
1755 1758
                 : "m"(src[0]), "m"(src[1]),
1756
-                  "m"(src[stride]), "m"(src[stride+1]),
1759
+                  "m"(src[stride]), "m"(src[stride + 1]),
1757 1760
                   "m"(*r4), "m"(shift2)
1758 1761
             );
1759 1762
             src += stride;
1760 1763
         }
1761
-        src += 4-h*stride;
1764
+        src += 4 - h * stride;
1762 1765
     }
1763 1766
 }
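[Note: per output pixel, the two inner asm blocks implement the bilinear blend spelled out in the comments. With s = 1 << shift and dx, dy the per-pixel fractional offsets maintained in dx4[]/dy4[], one pixel works out to (a scalar sketch following the asm comments, not code from this file):

    int pix = (src[0]          * (s - dx) * (s - dy) +  /* mm0 */
               src[1]          * dx       * (s - dy) +  /* mm1 */
               src[stride]     * (s - dx) * dy       +  /* mm2 */
               src[stride + 1] * dx       * dy       +  /* mm3 */
               r) >> (2 * shift);          /* paddw %5, psrlw %6 */
    dst[x + y * stride] = av_clip_uint8(pix);  /* packuswb saturates */
]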
1764 1767
 
1765 1768
 #if HAVE_YASM
1766 1769
 #if ARCH_X86_32
1767
-static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1768
-                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1770
+static void gmc_mmx(uint8_t *dst, uint8_t *src,
1771
+                    int stride, int h, int ox, int oy,
1772
+                    int dxx, int dxy, int dyx, int dyy,
1773
+                    int shift, int r, int width, int height)
1769 1774
 {
1770 1775
     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1771 1776
         width, height, &emulated_edge_mc_mmx);
1772 1777
 }
1773 1778
 #endif
1774
-static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1775
-                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1779
+static void gmc_sse(uint8_t *dst, uint8_t *src,
1780
+                    int stride, int h, int ox, int oy,
1781
+                    int dxx, int dxy, int dyx, int dyy,
1782
+                    int shift, int r, int width, int height)
1776 1783
 {
1777 1784
     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1778 1785
         width, height, &emulated_edge_mc_sse);
1779 1786
 }
1780 1787
 #else
1781
-static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1782
-                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1788
+static void gmc_mmx(uint8_t *dst, uint8_t *src,
1789
+                    int stride, int h, int ox, int oy,
1790
+                    int dxx, int dxy, int dyx, int dyy,
1791
+                    int shift, int r, int width, int height)
1783 1792
 {
1784 1793
     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1785 1794
         width, height, &ff_emulated_edge_mc_8);
1786 1795
 }
1787 1796
 #endif
1788 1797
 
1789
-#define PREFETCH(name, op) \
1790
-static void name(void *mem, int stride, int h){\
1791
-    const uint8_t *p= mem;\
1792
-    do{\
1793
-        __asm__ volatile(#op" %0" :: "m"(*p));\
1794
-        p+= stride;\
1795
-    }while(--h);\
1798
+#define PREFETCH(name, op)                      \
1799
+static void name(void *mem, int stride, int h)  \
1800
+{                                               \
1801
+    const uint8_t *p = mem;                     \
1802
+    do {                                        \
1803
+        __asm__ volatile (#op" %0" :: "m"(*p)); \
1804
+        p += stride;                            \
1805
+    } while (--h);                              \
1796 1806
 }
1807
+
1797 1808
 PREFETCH(prefetch_mmx2,  prefetcht0)
1798 1809
 PREFETCH(prefetch_3dnow, prefetch)
1799 1810
 #undef PREFETCH
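[Note: for reference, PREFETCH(prefetch_mmx2, prefetcht0) expands to:

    static void prefetch_mmx2(void *mem, int stride, int h)
    {
        const uint8_t *p = mem;
        do {
            __asm__ volatile ("prefetcht0 %0" :: "m"(*p));
            p += stride;
        } while (--h);
    }

i.e. one cache-prefetch hint per row of the block about to be read.]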
1800 1811
 
1801 1812
 #include "h264_qpel_mmx.c"
1802 1813
 
1803
-void ff_put_h264_chroma_mc8_mmx_rnd   (uint8_t *dst, uint8_t *src,
1804
-                                       int stride, int h, int x, int y);
1805
-void ff_avg_h264_chroma_mc8_mmx2_rnd  (uint8_t *dst, uint8_t *src,
1806
-                                       int stride, int h, int x, int y);
1807
-void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1808
-                                       int stride, int h, int x, int y);
1809
-
1810
-void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
1811
-                                       int stride, int h, int x, int y);
1812
-void ff_avg_h264_chroma_mc4_mmx2      (uint8_t *dst, uint8_t *src,
1813
-                                       int stride, int h, int x, int y);
1814
-void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
1815
-                                       int stride, int h, int x, int y);
1816
-
1817
-void ff_put_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
1818
-                                       int stride, int h, int x, int y);
1819
-void ff_avg_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
1820
-                                       int stride, int h, int x, int y);
1821
-
1822
-void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1823
-                                       int stride, int h, int x, int y);
1824
-void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
1825
-                                       int stride, int h, int x, int y);
1826
-
1827
-void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1828
-                                       int stride, int h, int x, int y);
1829
-void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
1830
-                                       int stride, int h, int x, int y);
1831
-
1832
-#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1833
-void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1834
-                                      (uint8_t *dst, uint8_t *src,\
1814
+void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
1815
+                                      int stride, int h, int x, int y);
1816
+void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1817
+                                      int stride, int h, int x, int y);
1818
+void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
1819
+                                      int stride, int h, int x, int y);
1820
+
1821
+void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
1822
+                                      int stride, int h, int x, int y);
1823
+void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
1824
+                                      int stride, int h, int x, int y);
1825
+void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
1826
+                                      int stride, int h, int x, int y);
1827
+
1828
+void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
1829
+                                      int stride, int h, int x, int y);
1830
+void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
1831
+                                      int stride, int h, int x, int y);
1832
+
1833
+void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
1834
+                                      int stride, int h, int x, int y);
1835
+void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
1836
+                                      int stride, int h, int x, int y);
1837
+
1838
+void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
1839
+                                      int stride, int h, int x, int y);
1840
+void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
1841
+                                      int stride, int h, int x, int y);
1842
+
1843
+#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
1844
+void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
1845
+                                      (uint8_t *dst, uint8_t *src,      \
1835 1846
                                        int stride, int h, int x, int y);
1836 1847
 
1837 1848
 CHROMA_MC(put, 2, 10, mmxext)
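[Note: CHROMA_MC only generates prototypes for the yasm implementations; for example, CHROMA_MC(put, 2, 10, mmxext) pastes together:

    void ff_put_h264_chroma_mc2_10_mmxext(uint8_t *dst, uint8_t *src,
                                          int stride, int h, int x, int y);
]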
... ...
@@ -1843,25 +2075,37 @@ CHROMA_MC(avg, 8, 10, sse2)
1843 1843
 CHROMA_MC(put, 8, 10, avx)
1844 1844
 CHROMA_MC(avg, 8, 10, avx)
1845 1845
 
1846
-/* CAVS specific */
1847
-void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1846
+/* CAVS-specific */
1847
+void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
1848
+{
1848 1849
     put_pixels8_mmx(dst, src, stride, 8);
1849 1850
 }
1850
-void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1851
+
1852
+void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
1853
+{
1851 1854
     avg_pixels8_mmx(dst, src, stride, 8);
1852 1855
 }
1853
-void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1856
+
1857
+void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
1858
+{
1854 1859
     put_pixels16_mmx(dst, src, stride, 16);
1855 1860
 }
1856
-void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1861
+
1862
+void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
1863
+{
1857 1864
     avg_pixels16_mmx(dst, src, stride, 16);
1858 1865
 }
1859 1866
 
1860
-/* VC1 specific */
1861
-void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1867
+/* VC-1-specific */
1868
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1869
+                               int stride, int rnd)
1870
+{
1862 1871
     put_pixels8_mmx(dst, src, stride, 8);
1863 1872
 }
1864
-void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1873
+
1874
+void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
1875
+                                int stride, int rnd)
1876
+{
1865 1877
     avg_pixels8_mmx2(dst, src, stride, 8);
1866 1878
 }
1867 1879
 
... ...
@@ -1943,90 +2187,102 @@ void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride,
1943 1943
 }
1944 1944
 #endif
1945 1945
 
1946
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
1947
-   converted */
1946
+/* XXX: Those functions should be suppressed ASAP when all IDCTs are
1947
+ * converted. */
1948 1948
 #if CONFIG_GPL
1949
-static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1949
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1950
+                                    DCTELEM *block)
1950 1951
 {
1951
-    ff_mmx_idct (block);
1952
+    ff_mmx_idct(block);
1952 1953
     ff_put_pixels_clamped_mmx(block, dest, line_size);
1953 1954
 }
1954
-static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1955
+
1956
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1957
+                                    DCTELEM *block)
1955 1958
 {
1956
-    ff_mmx_idct (block);
1959
+    ff_mmx_idct(block);
1957 1960
     ff_add_pixels_clamped_mmx(block, dest, line_size);
1958 1961
 }
1959
-static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1962
+
1963
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1964
+                                     DCTELEM *block)
1960 1965
 {
1961
-    ff_mmxext_idct (block);
1966
+    ff_mmxext_idct(block);
1962 1967
     ff_put_pixels_clamped_mmx(block, dest, line_size);
1963 1968
 }
1964
-static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1969
+
1970
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1971
+                                     DCTELEM *block)
1965 1972
 {
1966
-    ff_mmxext_idct (block);
1973
+    ff_mmxext_idct(block);
1967 1974
     ff_add_pixels_clamped_mmx(block, dest, line_size);
1968 1975
 }
1969 1976
 #endif
1977
+
1970 1978
 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1971 1979
 {
1972
-    ff_idct_xvid_mmx (block);
1980
+    ff_idct_xvid_mmx(block);
1973 1981
     ff_put_pixels_clamped_mmx(block, dest, line_size);
1974 1982
 }
1983
+
1975 1984
 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1976 1985
 {
1977
-    ff_idct_xvid_mmx (block);
1986
+    ff_idct_xvid_mmx(block);
1978 1987
     ff_add_pixels_clamped_mmx(block, dest, line_size);
1979 1988
 }
1989
+
1980 1990
 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1981 1991
 {
1982
-    ff_idct_xvid_mmx2 (block);
1992
+    ff_idct_xvid_mmx2(block);
1983 1993
     ff_put_pixels_clamped_mmx(block, dest, line_size);
1984 1994
 }
1995
+
1985 1996
 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
1986 1997
 {
1987
-    ff_idct_xvid_mmx2 (block);
1998
+    ff_idct_xvid_mmx2(block);
1988 1999
     ff_add_pixels_clamped_mmx(block, dest, line_size);
1989 2000
 }
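[Note: all of these wrappers share one pattern: run the IDCT in place on the coefficient block, then either store the clamped result (the _put variants, used for intra blocks) or add it to the existing prediction (the _add variants, for inter blocks). In scalar terms, the two store halves are:

    int i, j;
    /* _put: overwrite the destination with the clamped IDCT output */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            dest[i * line_size + j] = av_clip_uint8(block[i * 8 + j]);

    /* _add: treat the IDCT output as a residual on top of the prediction */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            dest[i * line_size + j] =
                av_clip_uint8(dest[i * line_size + j] + block[i * 8 + j]);
]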
1990 2001
 
1991 2002
 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
1992 2003
 {
1993 2004
     int i;
1994
-    __asm__ volatile("pxor %%mm7, %%mm7":);
1995
-    for(i=0; i<blocksize; i+=2) {
1996
-        __asm__ volatile(
1997
-            "movq    %0,    %%mm0 \n\t"
1998
-            "movq    %1,    %%mm1 \n\t"
1999
-            "movq    %%mm0, %%mm2 \n\t"
2000
-            "movq    %%mm1, %%mm3 \n\t"
2001
-            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2002
-            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2003
-            "pslld   $31,   %%mm2 \n\t" // keep only the sign bit
2004
-            "pxor    %%mm2, %%mm1 \n\t"
2005
-            "movq    %%mm3, %%mm4 \n\t"
2006
-            "pand    %%mm1, %%mm3 \n\t"
2007
-            "pandn   %%mm1, %%mm4 \n\t"
2008
-            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2009
-            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2010
-            "movq    %%mm3, %1    \n\t"
2011
-            "movq    %%mm0, %0    \n\t"
2012
-            :"+m"(mag[i]), "+m"(ang[i])
2013
-            ::"memory"
2005
+    __asm__ volatile ("pxor %%mm7, %%mm7":);
2006
+    for (i = 0; i < blocksize; i += 2) {
2007
+        __asm__ volatile (
2008
+            "movq       %0, %%mm0   \n\t"
2009
+            "movq       %1, %%mm1   \n\t"
2010
+            "movq    %%mm0, %%mm2   \n\t"
2011
+            "movq    %%mm1, %%mm3   \n\t"
2012
+            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
2013
+            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
2014
+            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
2015
+            "pxor    %%mm2, %%mm1   \n\t"
2016
+            "movq    %%mm3, %%mm4   \n\t"
2017
+            "pand    %%mm1, %%mm3   \n\t"
2018
+            "pandn   %%mm1, %%mm4   \n\t"
2019
+            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2020
+            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2021
+            "movq    %%mm3, %1      \n\t"
2022
+            "movq    %%mm0, %0      \n\t"
2023
+            : "+m"(mag[i]), "+m"(ang[i])
2024
+            :: "memory"
2014 2025
         );
2015 2026
     }
2016
-    __asm__ volatile("femms");
2027
+    __asm__ volatile ("femms");
2017 2028
 }
2029
+
2018 2030
 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2019 2031
 {
2020 2032
     int i;
2021 2033
 
2022
-    __asm__ volatile(
2023
-            "movaps  %0,     %%xmm5 \n\t"
2024
-        ::"m"(ff_pdw_80000000[0])
2034
+    __asm__ volatile (
2035
+        "movaps  %0, %%xmm5 \n\t"
2036
+        :: "m"(ff_pdw_80000000[0])
2025 2037
     );
2026
-    for(i=0; i<blocksize; i+=4) {
2027
-        __asm__ volatile(
2028
-            "movaps  %0,     %%xmm0 \n\t"
2029
-            "movaps  %1,     %%xmm1 \n\t"
2038
+    for (i = 0; i < blocksize; i += 4) {
2039
+        __asm__ volatile (
2040
+            "movaps      %0, %%xmm0 \n\t"
2041
+            "movaps      %1, %%xmm1 \n\t"
2030 2042
             "xorps   %%xmm2, %%xmm2 \n\t"
2031 2043
             "xorps   %%xmm3, %%xmm3 \n\t"
2032 2044
             "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
... ...
@@ -2036,12 +2292,12 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2036 2036
             "movaps  %%xmm3, %%xmm4 \n\t"
2037 2037
             "andps   %%xmm1, %%xmm3 \n\t"
2038 2038
             "andnps  %%xmm1, %%xmm4 \n\t"
2039
-            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2040
-            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2039
+            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2040
+            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2041 2041
             "movaps  %%xmm3, %1     \n\t"
2042 2042
             "movaps  %%xmm0, %0     \n\t"
2043
-            :"+m"(mag[i]), "+m"(ang[i])
2044
-            ::"memory"
2043
+            : "+m"(mag[i]), "+m"(ang[i])
2044
+            :: "memory"
2045 2045
         );
2046 2046
     }
2047 2047
 }
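[Note: both routines are branch-free renderings of Vorbis magnitude/angle decoupling; the sign games in the comments replace the four-way branch of the scalar form, which looks roughly like this:

    static void vorbis_inverse_coupling_c(float *mag, float *ang,
                                          int blocksize)
    {
        int i;
        for (i = 0; i < blocksize; i++) {
            if (mag[i] > 0.0) {
                if (ang[i] > 0.0) {
                    ang[i] = mag[i] - ang[i];
                } else {
                    float temp = ang[i];
                    ang[i]     = mag[i];
                    mag[i]    += temp;
                }
            } else {
                if (ang[i] > 0.0) {
                    ang[i] += mag[i];
                } else {
                    float temp = ang[i];
                    ang[i]     = mag[i];
                    mag[i]    -= temp;
                }
            }
        }
    }
]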
... ...
@@ -2049,97 +2305,105 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2049 2049
 #define IF1(x) x
2050 2050
 #define IF0(x)
2051 2051
 
2052
-#define MIX5(mono,stereo)\
2053
-    __asm__ volatile(\
2054
-        "movss          0(%2), %%xmm5 \n"\
2055
-        "movss          8(%2), %%xmm6 \n"\
2056
-        "movss         24(%2), %%xmm7 \n"\
2057
-        "shufps    $0, %%xmm5, %%xmm5 \n"\
2058
-        "shufps    $0, %%xmm6, %%xmm6 \n"\
2059
-        "shufps    $0, %%xmm7, %%xmm7 \n"\
2060
-        "1: \n"\
2061
-        "movaps       (%0,%1), %%xmm0 \n"\
2062
-        "movaps  0x400(%0,%1), %%xmm1 \n"\
2063
-        "movaps  0x800(%0,%1), %%xmm2 \n"\
2064
-        "movaps  0xc00(%0,%1), %%xmm3 \n"\
2065
-        "movaps 0x1000(%0,%1), %%xmm4 \n"\
2066
-        "mulps         %%xmm5, %%xmm0 \n"\
2067
-        "mulps         %%xmm6, %%xmm1 \n"\
2068
-        "mulps         %%xmm5, %%xmm2 \n"\
2069
-        "mulps         %%xmm7, %%xmm3 \n"\
2070
-        "mulps         %%xmm7, %%xmm4 \n"\
2071
- stereo("addps         %%xmm1, %%xmm0 \n")\
2072
-        "addps         %%xmm1, %%xmm2 \n"\
2073
-        "addps         %%xmm3, %%xmm0 \n"\
2074
-        "addps         %%xmm4, %%xmm2 \n"\
2075
-   mono("addps         %%xmm2, %%xmm0 \n")\
2076
-        "movaps  %%xmm0,      (%0,%1) \n"\
2077
- stereo("movaps  %%xmm2, 0x400(%0,%1) \n")\
2078
-        "add $16, %0 \n"\
2079
-        "jl 1b \n"\
2080
-        :"+&r"(i)\
2081
-        :"r"(samples[0]+len), "r"(matrix)\
2082
-        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2083
-                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
2084
-         "memory"\
2052
+#define MIX5(mono, stereo)                                      \
2053
+    __asm__ volatile (                                          \
2054
+        "movss           0(%2), %%xmm5          \n"             \
2055
+        "movss           8(%2), %%xmm6          \n"             \
2056
+        "movss          24(%2), %%xmm7          \n"             \
2057
+        "shufps     $0, %%xmm5, %%xmm5          \n"             \
2058
+        "shufps     $0, %%xmm6, %%xmm6          \n"             \
2059
+        "shufps     $0, %%xmm7, %%xmm7          \n"             \
2060
+        "1:                                     \n"             \
2061
+        "movaps       (%0, %1), %%xmm0          \n"             \
2062
+        "movaps  0x400(%0, %1), %%xmm1          \n"             \
2063
+        "movaps  0x800(%0, %1), %%xmm2          \n"             \
2064
+        "movaps  0xc00(%0, %1), %%xmm3          \n"             \
2065
+        "movaps 0x1000(%0, %1), %%xmm4          \n"             \
2066
+        "mulps          %%xmm5, %%xmm0          \n"             \
2067
+        "mulps          %%xmm6, %%xmm1          \n"             \
2068
+        "mulps          %%xmm5, %%xmm2          \n"             \
2069
+        "mulps          %%xmm7, %%xmm3          \n"             \
2070
+        "mulps          %%xmm7, %%xmm4          \n"             \
2071
+ stereo("addps          %%xmm1, %%xmm0          \n")            \
2072
+        "addps          %%xmm1, %%xmm2          \n"             \
2073
+        "addps          %%xmm3, %%xmm0          \n"             \
2074
+        "addps          %%xmm4, %%xmm2          \n"             \
2075
+   mono("addps          %%xmm2, %%xmm0          \n")            \
2076
+        "movaps         %%xmm0, (%0, %1)        \n"             \
2077
+ stereo("movaps         %%xmm2, 0x400(%0, %1)   \n")            \
2078
+        "add               $16, %0              \n"             \
2079
+        "jl                 1b                  \n"             \
2080
+        : "+&r"(i)                                              \
2081
+        : "r"(samples[0] + len), "r"(matrix)                    \
2082
+        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
2083
+                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)      \
2084
+         "memory"                                               \
2085 2085
     );
2086 2086
 
2087
-#define MIX_MISC(stereo)\
2088
-    __asm__ volatile(\
2089
-        "1: \n"\
2090
-        "movaps  (%3,%0), %%xmm0 \n"\
2091
- stereo("movaps   %%xmm0, %%xmm1 \n")\
2092
-        "mulps    %%xmm4, %%xmm0 \n"\
2093
- stereo("mulps    %%xmm5, %%xmm1 \n")\
2094
-        "lea 1024(%3,%0), %1 \n"\
2095
-        "mov %5, %2 \n"\
2096
-        "2: \n"\
2097
-        "movaps   (%1),   %%xmm2 \n"\
2098
- stereo("movaps   %%xmm2, %%xmm3 \n")\
2099
-        "mulps   (%4,%2), %%xmm2 \n"\
2100
- stereo("mulps 16(%4,%2), %%xmm3 \n")\
2101
-        "addps    %%xmm2, %%xmm0 \n"\
2102
- stereo("addps    %%xmm3, %%xmm1 \n")\
2103
-        "add $1024, %1 \n"\
2104
-        "add $32, %2 \n"\
2105
-        "jl 2b \n"\
2106
-        "movaps   %%xmm0,     (%3,%0) \n"\
2107
- stereo("movaps   %%xmm1, 1024(%3,%0) \n")\
2108
-        "add $16, %0 \n"\
2109
-        "jl 1b \n"\
2110
-        :"+&r"(i), "=&r"(j), "=&r"(k)\
2111
-        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
2112
-        :"memory"\
2087
+#define MIX_MISC(stereo)                                        \
2088
+    __asm__ volatile (                                          \
2089
+        "1:                                 \n"                 \
2090
+        "movaps     (%3, %0), %%xmm0        \n"                 \
2091
+ stereo("movaps       %%xmm0, %%xmm1        \n")                \
2092
+        "mulps        %%xmm4, %%xmm0        \n"                 \
2093
+ stereo("mulps        %%xmm5, %%xmm1        \n")                \
2094
+        "lea    1024(%3, %0), %1            \n"                 \
2095
+        "mov              %5, %2            \n"                 \
2096
+        "2:                                 \n"                 \
2097
+        "movaps         (%1), %%xmm2        \n"                 \
2098
+ stereo("movaps       %%xmm2, %%xmm3        \n")                \
2099
+        "mulps      (%4, %2), %%xmm2        \n"                 \
2100
+ stereo("mulps    16(%4, %2), %%xmm3        \n")                \
2101
+        "addps        %%xmm2, %%xmm0        \n"                 \
2102
+ stereo("addps        %%xmm3, %%xmm1        \n")                \
2103
+        "add           $1024, %1            \n"                 \
2104
+        "add             $32, %2            \n"                 \
2105
+        "jl               2b                \n"                 \
2106
+        "movaps       %%xmm0,     (%3, %0)  \n"                 \
2107
+ stereo("movaps       %%xmm1, 1024(%3, %0)  \n")                \
2108
+        "add             $16, %0            \n"                 \
2109
+        "jl               1b                \n"                 \
2110
+        : "+&r"(i), "=&r"(j), "=&r"(k)                          \
2111
+        : "r"(samples[0] + len), "r"(matrix_simd + in_ch),      \
2112
+          "g"((intptr_t) - 32 * (in_ch - 1))                    \
2113
+        : "memory"                                              \
2113 2114
     );
2114 2115
 
2115
-static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2116
+static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
2117
+                            int out_ch, int in_ch, int len)
2116 2118
 {
2117 2119
     int (*matrix_cmp)[2] = (int(*)[2])matrix;
2118
-    intptr_t i,j,k;
2119
-
2120
-    i = -len*sizeof(float);
2121
-    if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2122
-        MIX5(IF0,IF1);
2123
-    } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2124
-        MIX5(IF1,IF0);
2120
+    intptr_t i, j, k;
2121
+
2122
+    i = -len * sizeof(float);
2123
+    if (in_ch == 5 && out_ch == 2 &&
2124
+        !(matrix_cmp[0][1] | matrix_cmp[2][0]   |
2125
+          matrix_cmp[3][1] | matrix_cmp[4][0]   |
2126
+          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
2127
+          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
2128
+        MIX5(IF0, IF1);
2129
+    } else if (in_ch == 5 && out_ch == 1 &&
2130
+               matrix_cmp[0][0] == matrix_cmp[2][0] &&
2131
+               matrix_cmp[3][0] == matrix_cmp[4][0]) {
2132
+        MIX5(IF1, IF0);
2125 2133
     } else {
2126 2134
         DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2127
-        j = 2*in_ch*sizeof(float);
2128
-        __asm__ volatile(
2129
-            "1: \n"
2130
-            "sub $8, %0 \n"
2131
-            "movss     (%2,%0), %%xmm4 \n"
2132
-            "movss    4(%2,%0), %%xmm5 \n"
2133
-            "shufps $0, %%xmm4, %%xmm4 \n"
2134
-            "shufps $0, %%xmm5, %%xmm5 \n"
2135
-            "movaps %%xmm4,   (%1,%0,4) \n"
2136
-            "movaps %%xmm5, 16(%1,%0,4) \n"
2137
-            "jg 1b \n"
2138
-            :"+&r"(j)
2139
-            :"r"(matrix_simd), "r"(matrix)
2140
-            :"memory"
2135
+        j = 2 * in_ch * sizeof(float);
2136
+        __asm__ volatile (
2137
+            "1:                                 \n"
2138
+            "sub             $8, %0             \n"
2139
+            "movss     (%2, %0), %%xmm4         \n"
2140
+            "movss    4(%2, %0), %%xmm5         \n"
2141
+            "shufps          $0, %%xmm4, %%xmm4 \n"
2142
+            "shufps          $0, %%xmm5, %%xmm5 \n"
2143
+            "movaps      %%xmm4,   (%1, %0, 4)  \n"
2144
+            "movaps      %%xmm5, 16(%1, %0, 4)  \n"
2145
+            "jg              1b                 \n"
2146
+            : "+&r"(j)
2147
+            : "r"(matrix_simd), "r"(matrix)
2148
+            : "memory"
2141 2149
         );
2142
-        if(out_ch == 2) {
2150
+        if (out_ch == 2) {
2143 2151
             MIX_MISC(IF1);
2144 2152
         } else {
2145 2153
             MIX_MISC(IF0);
... ...
@@ -2147,216 +2411,232 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_c
2147 2147
     }
2148 2148
 }
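[Note: MIX5 covers the two common 5-channel layouts whose matrices let coefficients be shared across channels; MIX_MISC is the general case, a per-sample matrix multiply written back over samples[0] (and samples[1] for stereo output). A hedged scalar model of the generic path:

    int i, c;
    for (i = 0; i < len; i++) {
        float v0 = 0, v1 = 0;
        for (c = 0; c < in_ch; c++) {
            v0 += samples[c][i] * matrix[c][0];
            v1 += samples[c][i] * matrix[c][1];
        }
        samples[0][i] = v0;
        if (out_ch == 2)
            samples[1][i] = v1;
    }
]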
2149 2149
 
2150
-static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2151
-    x86_reg i = (len-4)*4;
2152
-    __asm__ volatile(
2153
-        "1: \n\t"
2154
-        "movq    (%2,%0), %%mm0 \n\t"
2155
-        "movq   8(%2,%0), %%mm1 \n\t"
2156
-        "pfmul   (%3,%0), %%mm0 \n\t"
2157
-        "pfmul  8(%3,%0), %%mm1 \n\t"
2158
-        "movq   %%mm0,  (%1,%0) \n\t"
2159
-        "movq   %%mm1, 8(%1,%0) \n\t"
2160
-        "sub  $16, %0 \n\t"
2161
-        "jge 1b \n\t"
2162
-        "femms  \n\t"
2163
-        :"+r"(i)
2164
-        :"r"(dst), "r"(src0), "r"(src1)
2165
-        :"memory"
2150
+static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1,
2151
+                              int len)
2152
+{
2153
+    x86_reg i = (len - 4) * 4;
2154
+    __asm__ volatile (
2155
+        "1:                             \n\t"
2156
+        "movq    (%2, %0), %%mm0        \n\t"
2157
+        "movq   8(%2, %0), %%mm1        \n\t"
2158
+        "pfmul   (%3, %0), %%mm0        \n\t"
2159
+        "pfmul  8(%3, %0), %%mm1        \n\t"
2160
+        "movq       %%mm0,  (%1, %0)    \n\t"
2161
+        "movq       %%mm1, 8(%1, %0)    \n\t"
2162
+        "sub          $16, %0           \n\t"
2163
+        "jge           1b               \n\t"
2164
+        "femms                          \n\t"
2165
+        : "+r"(i)
2166
+        : "r"(dst), "r"(src0), "r"(src1)
2167
+        : "memory"
2166 2168
     );
2167 2169
 }
2168
-static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2169
-    x86_reg i = (len-8)*4;
2170
-    __asm__ volatile(
2171
-        "1: \n\t"
2172
-        "movaps    (%2,%0), %%xmm0 \n\t"
2173
-        "movaps  16(%2,%0), %%xmm1 \n\t"
2174
-        "mulps     (%3,%0), %%xmm0 \n\t"
2175
-        "mulps   16(%3,%0), %%xmm1 \n\t"
2176
-        "movaps  %%xmm0,   (%1,%0) \n\t"
2177
-        "movaps  %%xmm1, 16(%1,%0) \n\t"
2178
-        "sub  $32, %0 \n\t"
2179
-        "jge 1b \n\t"
2180
-        :"+r"(i)
2181
-        :"r"(dst), "r"(src0), "r"(src1)
2182
-        :"memory"
2170
+
2171
+static void vector_fmul_sse(float *dst, const float *src0, const float *src1,
2172
+                            int len)
2173
+{
2174
+    x86_reg i = (len - 8) * 4;
2175
+    __asm__ volatile (
2176
+        "1:                             \n\t"
2177
+        "movaps    (%2, %0), %%xmm0     \n\t"
2178
+        "movaps  16(%2, %0), %%xmm1     \n\t"
2179
+        "mulps     (%3, %0), %%xmm0     \n\t"
2180
+        "mulps   16(%3, %0), %%xmm1     \n\t"
2181
+        "movaps      %%xmm0,   (%1, %0) \n\t"
2182
+        "movaps      %%xmm1, 16(%1, %0) \n\t"
2183
+        "sub            $32, %0         \n\t"
2184
+        "jge             1b             \n\t"
2185
+        : "+r"(i)
2186
+        : "r"(dst), "r"(src0), "r"(src1)
2187
+        : "memory"
2183 2188
     );
2184 2189
 }
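[Note: both functions walk the arrays backwards in register-width steps, but the net effect is just an element-wise product (len being a multiple of the vector width):

    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i];
]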
2185 2190
 
2186
-static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2187
-    x86_reg i = len*4-16;
2188
-    __asm__ volatile(
2189
-        "1: \n\t"
2190
-        "pswapd   8(%1), %%mm0 \n\t"
2191
-        "pswapd    (%1), %%mm1 \n\t"
2192
-        "pfmul  (%3,%0), %%mm0 \n\t"
2193
-        "pfmul 8(%3,%0), %%mm1 \n\t"
2194
-        "movq  %%mm0,  (%2,%0) \n\t"
2195
-        "movq  %%mm1, 8(%2,%0) \n\t"
2196
-        "add   $16, %1 \n\t"
2197
-        "sub   $16, %0 \n\t"
2198
-        "jge   1b \n\t"
2199
-        :"+r"(i), "+r"(src1)
2200
-        :"r"(dst), "r"(src0)
2191
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0,
2192
+                                       const float *src1, int len)
2193
+{
2194
+    x86_reg i = len * 4 - 16;
2195
+    __asm__ volatile (
2196
+        "1:                             \n\t"
2197
+        "pswapd     8(%1), %%mm0        \n\t"
2198
+        "pswapd      (%1), %%mm1        \n\t"
2199
+        "pfmul   (%3, %0), %%mm0        \n\t"
2200
+        "pfmul  8(%3, %0), %%mm1        \n\t"
2201
+        "movq       %%mm0,  (%2, %0)    \n\t"
2202
+        "movq       %%mm1, 8(%2, %0)    \n\t"
2203
+        "add          $16, %1           \n\t"
2204
+        "sub          $16, %0           \n\t"
2205
+        "jge           1b               \n\t"
2206
+        : "+r"(i), "+r"(src1)
2207
+        : "r"(dst), "r"(src0)
2201 2208
     );
2202
-    __asm__ volatile("femms");
2209
+    __asm__ volatile ("femms");
2203 2210
 }
2204
-static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2205
-    x86_reg i = len*4-32;
2206
-    __asm__ volatile(
2207
-        "1: \n\t"
2208
-        "movaps        16(%1), %%xmm0 \n\t"
2209
-        "movaps          (%1), %%xmm1 \n\t"
2210
-        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2211
-        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2212
-        "mulps        (%3,%0), %%xmm0 \n\t"
2213
-        "mulps      16(%3,%0), %%xmm1 \n\t"
2214
-        "movaps     %%xmm0,   (%2,%0) \n\t"
2215
-        "movaps     %%xmm1, 16(%2,%0) \n\t"
2216
-        "add    $32, %1 \n\t"
2217
-        "sub    $32, %0 \n\t"
2218
-        "jge    1b \n\t"
2219
-        :"+r"(i), "+r"(src1)
2220
-        :"r"(dst), "r"(src0)
2211
+
2212
+static void vector_fmul_reverse_sse(float *dst, const float *src0,
2213
+                                    const float *src1, int len)
2214
+{
2215
+    x86_reg i = len * 4 - 32;
2216
+    __asm__ volatile (
2217
+        "1:                                 \n\t"
2218
+        "movaps         16(%1), %%xmm0      \n\t"
2219
+        "movaps           (%1), %%xmm1      \n\t"
2220
+        "shufps  $0x1b, %%xmm0, %%xmm0      \n\t"
2221
+        "shufps  $0x1b, %%xmm1, %%xmm1      \n\t"
2222
+        "mulps        (%3, %0), %%xmm0      \n\t"
2223
+        "mulps      16(%3, %0), %%xmm1      \n\t"
2224
+        "movaps         %%xmm0,   (%2, %0)  \n\t"
2225
+        "movaps         %%xmm1, 16(%2, %0)  \n\t"
2226
+        "add               $32, %1          \n\t"
2227
+        "sub               $32, %0          \n\t"
2228
+        "jge                1b              \n\t"
2229
+        : "+r"(i), "+r"(src1)
2230
+        : "r"(dst), "r"(src0)
2221 2231
     );
2222 2232
 }
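[Note: the _reverse variants read src1 back-to-front; pswapd (3DNow!) and shufps $0x1b (SSE) perform the in-register reversal. Scalar equivalent:

    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
]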
2223 2233
 
2224
-static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2225
-                                  const float *src2, int len){
2226
-    x86_reg i = (len-4)*4;
2227
-    __asm__ volatile(
2228
-        "1: \n\t"
2229
-        "movq    (%2,%0), %%mm0 \n\t"
2230
-        "movq   8(%2,%0), %%mm1 \n\t"
2231
-        "pfmul   (%3,%0), %%mm0 \n\t"
2232
-        "pfmul  8(%3,%0), %%mm1 \n\t"
2233
-        "pfadd   (%4,%0), %%mm0 \n\t"
2234
-        "pfadd  8(%4,%0), %%mm1 \n\t"
2235
-        "movq  %%mm0,   (%1,%0) \n\t"
2236
-        "movq  %%mm1,  8(%1,%0) \n\t"
2237
-        "sub  $16, %0 \n\t"
2238
-        "jge  1b \n\t"
2239
-        :"+r"(i)
2240
-        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2241
-        :"memory"
2234
+static void vector_fmul_add_3dnow(float *dst, const float *src0,
2235
+                                  const float *src1, const float *src2, int len)
2236
+{
2237
+    x86_reg i = (len - 4) * 4;
2238
+    __asm__ volatile (
2239
+        "1:                             \n\t"
2240
+        "movq   (%2, %0), %%mm0         \n\t"
2241
+        "movq  8(%2, %0), %%mm1         \n\t"
2242
+        "pfmul  (%3, %0), %%mm0         \n\t"
2243
+        "pfmul 8(%3, %0), %%mm1         \n\t"
2244
+        "pfadd  (%4, %0), %%mm0         \n\t"
2245
+        "pfadd 8(%4, %0), %%mm1         \n\t"
2246
+        "movq      %%mm0,  (%1, %0)     \n\t"
2247
+        "movq      %%mm1, 8(%1, %0)     \n\t"
2248
+        "sub         $16, %0            \n\t"
2249
+        "jge          1b                \n\t"
2250
+        : "+r"(i)
2251
+        : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2252
+        : "memory"
2242 2253
     );
2243
-    __asm__ volatile("femms");
2254
+    __asm__ volatile ("femms");
2244 2255
 }
2245
-static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2246
-                                const float *src2, int len){
2247
-    x86_reg i = (len-8)*4;
2248
-    __asm__ volatile(
2249
-        "1: \n\t"
2250
-        "movaps   (%2,%0), %%xmm0 \n\t"
2251
-        "movaps 16(%2,%0), %%xmm1 \n\t"
2252
-        "mulps    (%3,%0), %%xmm0 \n\t"
2253
-        "mulps  16(%3,%0), %%xmm1 \n\t"
2254
-        "addps    (%4,%0), %%xmm0 \n\t"
2255
-        "addps  16(%4,%0), %%xmm1 \n\t"
2256
-        "movaps %%xmm0,   (%1,%0) \n\t"
2257
-        "movaps %%xmm1, 16(%1,%0) \n\t"
2258
-        "sub  $32, %0 \n\t"
2259
-        "jge  1b \n\t"
2260
-        :"+r"(i)
2261
-        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2262
-        :"memory"
2256
+
2257
+static void vector_fmul_add_sse(float *dst, const float *src0,
2258
+                                const float *src1, const float *src2, int len)
2259
+{
2260
+    x86_reg i = (len - 8) * 4;
2261
+    __asm__ volatile (
2262
+        "1:                             \n\t"
2263
+        "movaps   (%2, %0), %%xmm0      \n\t"
2264
+        "movaps 16(%2, %0), %%xmm1      \n\t"
2265
+        "mulps    (%3, %0), %%xmm0      \n\t"
2266
+        "mulps  16(%3, %0), %%xmm1      \n\t"
2267
+        "addps    (%4, %0), %%xmm0      \n\t"
2268
+        "addps  16(%4, %0), %%xmm1      \n\t"
2269
+        "movaps     %%xmm0,   (%1, %0)  \n\t"
2270
+        "movaps     %%xmm1, 16(%1, %0)  \n\t"
2271
+        "sub           $32, %0          \n\t"
2272
+        "jge            1b              \n\t"
2273
+        : "+r"(i)
2274
+        : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2275
+        : "memory"
2263 2276
     );
2264 2277
 }
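[Note: the _add variants fuse the element-wise multiply with an accumulate from a third array:

    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
]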
 
 #if HAVE_6REGS
-static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
-                                      const float *win, int len){
-    x86_reg i = -len*4;
-    x86_reg j = len*4-8;
-    __asm__ volatile(
-        "1: \n"
-        "pswapd  (%5,%1), %%mm1 \n"
-        "movq    (%5,%0), %%mm0 \n"
-        "pswapd  (%4,%1), %%mm5 \n"
-        "movq    (%3,%0), %%mm4 \n"
-        "movq      %%mm0, %%mm2 \n"
-        "movq      %%mm1, %%mm3 \n"
-        "pfmul     %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
-        "pfmul     %%mm5, %%mm3 \n" // src1[    j]*win[len+j]
-        "pfmul     %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
-        "pfmul     %%mm5, %%mm0 \n" // src1[    j]*win[len+i]
-        "pfadd     %%mm3, %%mm2 \n"
-        "pfsub     %%mm0, %%mm1 \n"
-        "pswapd    %%mm2, %%mm2 \n"
-        "movq      %%mm1, (%2,%0) \n"
-        "movq      %%mm2, (%2,%1) \n"
-        "sub $8, %1 \n"
-        "add $8, %0 \n"
-        "jl 1b \n"
-        "femms \n"
-        :"+r"(i), "+r"(j)
-        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+static void vector_fmul_window_3dnow2(float *dst, const float *src0,
+                                      const float *src1, const float *win,
+                                      int len)
+{
+    x86_reg i = -len * 4;
+    x86_reg j =  len * 4 - 8;
+    __asm__ volatile (
+        "1:                             \n"
+        "pswapd (%5, %1), %%mm1         \n"
+        "movq   (%5, %0), %%mm0         \n"
+        "pswapd (%4, %1), %%mm5         \n"
+        "movq   (%3, %0), %%mm4         \n"
+        "movq      %%mm0, %%mm2         \n"
+        "movq      %%mm1, %%mm3         \n"
+        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
+        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
+        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
+        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
+        "pfadd     %%mm3, %%mm2         \n"
+        "pfsub     %%mm0, %%mm1         \n"
+        "pswapd    %%mm2, %%mm2         \n"
+        "movq      %%mm1, (%2, %0)      \n"
+        "movq      %%mm2, (%2, %1)      \n"
+        "sub          $8, %1            \n"
+        "add          $8, %0            \n"
+        "jl           1b                \n"
+        "femms                          \n"
+        : "+r"(i), "+r"(j)
+        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
     );
 }
 
-static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
-                                   const float *win, int len){
-    x86_reg i = -len*4;
-    x86_reg j = len*4-16;
-    __asm__ volatile(
-        "1: \n"
-        "movaps       (%5,%1), %%xmm1 \n"
-        "movaps       (%5,%0), %%xmm0 \n"
-        "movaps       (%4,%1), %%xmm5 \n"
-        "movaps       (%3,%0), %%xmm4 \n"
-        "shufps $0x1b, %%xmm1, %%xmm1 \n"
-        "shufps $0x1b, %%xmm5, %%xmm5 \n"
-        "movaps        %%xmm0, %%xmm2 \n"
-        "movaps        %%xmm1, %%xmm3 \n"
-        "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
-        "mulps         %%xmm5, %%xmm3 \n" // src1[    j]*win[len+j]
-        "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
-        "mulps         %%xmm5, %%xmm0 \n" // src1[    j]*win[len+i]
-        "addps         %%xmm3, %%xmm2 \n"
-        "subps         %%xmm0, %%xmm1 \n"
-        "shufps $0x1b, %%xmm2, %%xmm2 \n"
-        "movaps        %%xmm1, (%2,%0) \n"
-        "movaps        %%xmm2, (%2,%1) \n"
-        "sub $16, %1 \n"
-        "add $16, %0 \n"
-        "jl 1b \n"
-        :"+r"(i), "+r"(j)
-        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
+static void vector_fmul_window_sse(float *dst, const float *src0,
+                                   const float *src1, const float *win, int len)
+{
+    x86_reg i = -len * 4;
+    x86_reg j =  len * 4 - 16;
+    __asm__ volatile (
+        "1:                             \n"
+        "movaps      (%5, %1), %%xmm1   \n"
+        "movaps      (%5, %0), %%xmm0   \n"
+        "movaps      (%4, %1), %%xmm5   \n"
+        "movaps      (%3, %0), %%xmm4   \n"
+        "shufps $0x1b, %%xmm1, %%xmm1   \n"
+        "shufps $0x1b, %%xmm5, %%xmm5   \n"
+        "movaps        %%xmm0, %%xmm2   \n"
+        "movaps        %%xmm1, %%xmm3   \n"
+        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
+        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
+        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
+        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
+        "addps         %%xmm3, %%xmm2   \n"
+        "subps         %%xmm0, %%xmm1   \n"
+        "shufps $0x1b, %%xmm2, %%xmm2   \n"
+        "movaps        %%xmm1, (%2, %0) \n"
+        "movaps        %%xmm2, (%2, %1) \n"
+        "sub              $16, %1       \n"
+        "add              $16, %0       \n"
+        "jl                1b           \n"
+        : "+r"(i), "+r"(j)
+        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
     );
 }
 #endif /* HAVE_6REGS */
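
The two vector_fmul_window routines implement the windowed overlap used around the (i)MDCT: index i walks upward while j walks downward, which is why the j-side vectors need the pswapd/shufps reversals. A scalar sketch reconstructed from the in-line comments (paraphrasing the C reference from memory; treat the exact index convention as an assumption):

static void vector_fmul_window_scalar(float *dst, const float *src0,
                                      const float *src1, const float *win,
                                      int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];             /* src0[len + i] in the asm comments */
        float s1 = src1[j];             /* src1[j]                           */
        float wi = win[i];              /* win[len + i]                      */
        float wj = win[j];              /* win[len + j]                      */
        dst[i] = s0 * wj - s1 * wi;     /* the pfsub/subps result            */
        dst[j] = s0 * wi + s1 * wj;     /* the pfadd/addps result, stored
                                         * through the reversed register     */
    }
}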
 
-static void vector_clipf_sse(float *dst, const float *src, float min, float max,
-                             int len)
+static void vector_clipf_sse(float *dst, const float *src,
+                             float min, float max, int len)
 {
-    x86_reg i = (len-16)*4;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "movss  %4, %%xmm5 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "shufps $0, %%xmm5, %%xmm5 \n"
-        "1: \n\t"
-        "movaps    (%2,%0), %%xmm0 \n\t" // 3/1 on intel
-        "movaps  16(%2,%0), %%xmm1 \n\t"
-        "movaps  32(%2,%0), %%xmm2 \n\t"
-        "movaps  48(%2,%0), %%xmm3 \n\t"
-        "maxps      %%xmm4, %%xmm0 \n\t"
-        "maxps      %%xmm4, %%xmm1 \n\t"
-        "maxps      %%xmm4, %%xmm2 \n\t"
-        "maxps      %%xmm4, %%xmm3 \n\t"
-        "minps      %%xmm5, %%xmm0 \n\t"
-        "minps      %%xmm5, %%xmm1 \n\t"
-        "minps      %%xmm5, %%xmm2 \n\t"
-        "minps      %%xmm5, %%xmm3 \n\t"
-        "movaps  %%xmm0,   (%1,%0) \n\t"
-        "movaps  %%xmm1, 16(%1,%0) \n\t"
-        "movaps  %%xmm2, 32(%1,%0) \n\t"
-        "movaps  %%xmm3, 48(%1,%0) \n\t"
-        "sub  $64, %0 \n\t"
-        "jge 1b \n\t"
-        :"+&r"(i)
-        :"r"(dst), "r"(src), "m"(min), "m"(max)
-        :"memory"
+    x86_reg i = (len - 16) * 4;
+    __asm__ volatile (
+        "movss          %3, %%xmm4      \n\t"
+        "movss          %4, %%xmm5      \n\t"
+        "shufps $0, %%xmm4, %%xmm4      \n\t"
+        "shufps $0, %%xmm5, %%xmm5      \n\t"
+        "1:                             \n\t"
+        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
+        "movaps 16(%2, %0), %%xmm1      \n\t"
+        "movaps 32(%2, %0), %%xmm2      \n\t"
+        "movaps 48(%2, %0), %%xmm3      \n\t"
+        "maxps      %%xmm4, %%xmm0      \n\t"
+        "maxps      %%xmm4, %%xmm1      \n\t"
+        "maxps      %%xmm4, %%xmm2      \n\t"
+        "maxps      %%xmm4, %%xmm3      \n\t"
+        "minps      %%xmm5, %%xmm0      \n\t"
+        "minps      %%xmm5, %%xmm1      \n\t"
+        "minps      %%xmm5, %%xmm2      \n\t"
+        "minps      %%xmm5, %%xmm3      \n\t"
+        "movaps     %%xmm0,   (%1, %0)  \n\t"
+        "movaps     %%xmm1, 16(%1, %0)  \n\t"
+        "movaps     %%xmm2, 32(%1, %0)  \n\t"
+        "movaps     %%xmm3, 48(%1, %0)  \n\t"
+        "sub           $64, %0          \n\t"
+        "jge            1b              \n\t"
+        : "+&r"(i)
+        : "r"(dst), "r"(src), "m"(min), "m"(max)
+        : "memory"
     );
 }
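
vector_clipf_sse clamps 16 floats per iteration: shufps $0 broadcasts min and max across all four lanes of xmm4/xmm5, then maxps/minps clamp from below and above. The scalar equivalent (a sketch; the SSE version additionally assumes len is a multiple of 16 and 16-byte-aligned buffers):

static void vector_clipf_scalar(float *dst, const float *src,
                                float min, float max, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        /* max(src[i], min) first, then min(..., max), matching the
         * maxps-before-minps order in the asm above */
        float v = src[i] < min ? min : src[i];
        dst[i]  = v > max ? max : v;
    }
}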
2362 2378
 
... ...
@@ -2364,7 +2644,8 @@ void ff_vp3_idct_mmx(int16_t *input_data);
2364 2364
 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2365 2365
 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2366 2366
 
2367
-void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2367
+void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
2368
+                             const DCTELEM *block);
2368 2369
 
2369 2370
 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2370 2371
 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
... ...
@@ -2373,11 +2654,19 @@ void ff_vp3_idct_sse2(int16_t *input_data);
2373 2373
 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2374 2374
 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2375 2375
 
2376
-int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2377
-int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2378
-int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2379
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2380
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2376
+int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
2377
+                                    int order, int shift);
2378
+int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2379
+                                    int order, int shift);
2380
+int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
2381
+                                             const int16_t *v3,
2382
+                                             int order, int mul);
2383
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2384
+                                             const int16_t *v3,
2385
+                                             int order, int mul);
2386
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2387
+                                              const int16_t *v3,
2388
+                                              int order, int mul);
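
For context, the scalarproduct_and_madd prototypes above share one contract: return the dot product of v1 and v2 while updating v1 in place from v3. A scalar sketch (paraphrasing the C fallback from memory; not part of this commit):

static int32_t scalarproduct_and_madd_int16_sketch(int16_t *v1,
                                                   const int16_t *v2,
                                                   const int16_t *v3,
                                                   int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;           /* accumulate v1 . v2        */
        *v1++ += mul * *v3++;           /* then v1[i] += mul * v3[i] */
    }
    return res;
}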
 
 void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
@@ -2395,27 +2684,32 @@ void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
 
-void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
-int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
-int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
+void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
+                                        const uint8_t *diff, int w,
+                                        int *left, int *left_top);
+int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
+                                       int w, int left);
+int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
+                                      int w, int left);
 
 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 
-void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
-                                   int32_t max, unsigned int len);
-void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
-                                   int32_t max, unsigned int len);
-void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
-                                   int32_t max, unsigned int len);
-void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
-                                   int32_t max, unsigned int len);
+void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
+                                   int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
+                                   int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
+                                   int32_t min, int32_t max, unsigned int len);
+void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
+                                   int32_t min, int32_t max, unsigned int len);
 
 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                 const float *src1, int len);
 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                 const float *src1, int len);
 
-#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
+#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
+    do {                                                                     \
     c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
@@ -2431,25 +2725,32 @@ extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
     c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
-
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
-    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;    \
-    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
-    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
-
-#define H264_QPEL_FUNCS(x, y, CPU) \
-    c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU; \
-    c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;  \
-    c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU; \
-    c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU
-
-#define H264_QPEL_FUNCS_10(x, y, CPU) \
-    c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU; \
-    c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;  \
-    c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU; \
-    c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
+    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
+    } while (0)
+
+#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
+    do {                                                                        \
+        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
+        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
+        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
+        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
+    } while (0)
+
+#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
+    do {                                                                                      \
+        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
+        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
+    } while (0)
+
+#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
+    do {                                                                                            \
+        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
+        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
+        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
+    } while (0)
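
The point of wrapping these multi-statement macros in do { } while (0) is that each expansion becomes exactly one statement, so an invocation followed by a semicolon nests correctly under an unbraced if/else. A reduced illustration (hypothetical stmt_a/stmt_b/other helpers, not from this file):

void stmt_a(int x);
void stmt_b(int x);
void other(void);

#define BAD(x)  stmt_a(x); stmt_b(x)                    /* expands to two statements */
#define GOOD(x) do { stmt_a(x); stmt_b(x); } while (0)  /* expands to one statement  */

void demo(int cond)
{
    /* With BAD(1); here, only stmt_a would be guarded by the if, and the
     * else below would not even parse. GOOD keeps both calls guarded and
     * consumes the trailing semicolon cleanly. */
    if (cond)
        GOOD(1);
    else
        other();
}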
 
 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 {
@@ -2464,18 +2765,18 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
         c->clear_blocks = clear_blocks_mmx;
         c->draw_edges   = draw_edges_mmx;
 
-        SET_HPEL_FUNCS(put, 0, 16, mmx);
+        SET_HPEL_FUNCS(put,        0, 16, mmx);
         SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
-        SET_HPEL_FUNCS(avg, 0, 16, mmx);
+        SET_HPEL_FUNCS(avg,        0, 16, mmx);
         SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
-        SET_HPEL_FUNCS(put, 1, 8, mmx);
-        SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
-        SET_HPEL_FUNCS(avg, 1, 8, mmx);
-        SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
+        SET_HPEL_FUNCS(put,        1,  8, mmx);
+        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
+        SET_HPEL_FUNCS(avg,        1,  8, mmx);
+        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
     }
 
 #if ARCH_X86_32 || !HAVE_YASM
-    c->gmc= gmc_mmx;
+    c->gmc = gmc_mmx;
 #endif
 #if ARCH_X86_32 && HAVE_YASM
     if (!high_bit_depth)
@@ -2543,47 +2844,48 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
             c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
         }
     }
-    if (CONFIG_VP3_DECODER && HAVE_YASM) {
+    if (CONFIG_VP3_DECODER && HAVE_YASM)
        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
-    }
 
-    if (CONFIG_VP3_DECODER
-        && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
+    if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
+                               avctx->codec_id == CODEC_ID_THEORA)) {
         c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
     }
 
-    SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
-    SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
-    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
-    SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
-    SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
-    SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
+    if (CONFIG_H264QPEL) {
+        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
+        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
+        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
+        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
+        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
+        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );
 
-    if (!high_bit_depth) {
-        SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
-        SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
-        SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
-        SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
-        SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
-        SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
-    } else if (bit_depth == 10) {
+        if (!high_bit_depth) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
+            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
+        } else if (bit_depth == 10) {
 #if HAVE_YASM
 #if !ARCH_X86_64
-        SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
-        SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
-        SET_QPEL_FUNCS(put_h264_qpel, 1, 8,  10_mmxext, ff_);
-        SET_QPEL_FUNCS(avg_h264_qpel, 1, 8,  10_mmxext, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
-        SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
-        SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
 #endif
-    }
+        }
 
-    SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
-    SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
-    SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
-    SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
+        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
+        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
+        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
+        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
+    }
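
The new if (CONFIG_H264QPEL) guards work because the configure-generated CONFIG_* macros are always defined to 0 or 1: a constant-false branch is still syntax-checked, but dead-code elimination removes it, so the qpel table setup (and the symbols it references) drops out of builds configured without H.264 QPEL support. A reduced sketch of the pattern (hypothetical CONFIG_FEATURE and init_feature_tables, for illustration only):

#define CONFIG_FEATURE 0          /* stand-in for a configure-generated 0/1 macro */

void init_feature_tables(void);   /* need not be compiled in when disabled */

static void init(void)
{
    /* Unlike #if, a plain C 'if' on a 0/1 constant keeps the code visible
     * to the compiler; the build relies on the compiler eliminating the
     * dead branch so the unresolved call never reaches the linker. */
    if (CONFIG_FEATURE)
        init_feature_tables();
}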
 
 #if HAVE_YASM
     if (!high_bit_depth && CONFIG_H264CHROMA) {
@@ -2599,7 +2901,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
     }
 
-    c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
+    c->add_hfyu_median_prediction   = ff_add_hfyu_median_prediction_mmx2;
 
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
@@ -2645,32 +2947,34 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
         }
     }
 
-    if (CONFIG_VP3_DECODER
-        && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
+    if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
+                               avctx->codec_id == CODEC_ID_THEORA)) {
         c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
     }
 
-    SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
-    SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
-    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
-    SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
-    SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
-    SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
+    if (CONFIG_H264QPEL) {
+        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
+        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
+        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
+        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
+        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
+        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );
 
-    if (!high_bit_depth) {
-        SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
-        SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
-        SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
-        SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
-        SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
-        SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
-    }
+        if (!high_bit_depth) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
+            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
+            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
+        }
 
-    SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
-    SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
-    SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
-    SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
+        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
+        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
+        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
+        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
+    }
 
 #if HAVE_YASM
     if (!high_bit_depth && CONFIG_H264CHROMA) {
@@ -2702,7 +3006,7 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
     if (!high_bit_depth) {
-        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
+        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
             /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
             c->clear_block  = clear_block_sse;
             c->clear_blocks = clear_blocks_sse;
@@ -2745,11 +3049,12 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
             c->put_pixels_tab[0][0]        = put_pixels16_sse2;
             c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
             c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
-            H264_QPEL_FUNCS(0, 0, sse2);
+            if (CONFIG_H264QPEL)
+                H264_QPEL_FUNCS(0, 0, sse2);
         }
     }
 
-    if (!high_bit_depth) {
+    if (!high_bit_depth && CONFIG_H264QPEL) {
         H264_QPEL_FUNCS(0, 1, sse2);
         H264_QPEL_FUNCS(0, 2, sse2);
         H264_QPEL_FUNCS(0, 3, sse2);
@@ -2766,14 +3071,15 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 
 #if HAVE_YASM
     if (bit_depth == 10) {
-        SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
-        SET_QPEL_FUNCS(put_h264_qpel, 1, 8,  10_sse2, ff_);
-        SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
-        SET_QPEL_FUNCS(avg_h264_qpel, 1, 8,  10_sse2, ff_);
-        H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
-        H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
-        H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
-
+        if (CONFIG_H264QPEL) {
+            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
+            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
+            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
+            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
+            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
+            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
+        }
         if (CONFIG_H264CHROMA) {
             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
@@ -2789,7 +3095,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
     }
     if (avctx->flags & CODEC_FLAG_BITEXACT) {
         c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
-    } else  if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
         c->apply_window_int16 = ff_apply_window_int16_sse2;
     }
     c->bswap_buf = ff_bswap32_buf_sse2;
@@ -2803,7 +3109,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
     const int bit_depth      = avctx->bits_per_raw_sample;
 
-    if (!high_bit_depth) {
+    if (!high_bit_depth && CONFIG_H264QPEL) {
         H264_QPEL_FUNCS(1, 0, ssse3);
         H264_QPEL_FUNCS(1, 1, ssse3);
         H264_QPEL_FUNCS(1, 2, ssse3);
@@ -2818,7 +3124,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
         H264_QPEL_FUNCS(3, 3, ssse3);
     }
 #if HAVE_YASM
-    else if (bit_depth == 10) {
+    else if (bit_depth == 10 && CONFIG_H264QPEL) {
         H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
@@ -2833,14 +3139,12 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
     if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
 
-    if (mm_flags & AV_CPU_FLAG_ATOM) {
+    if (mm_flags & AV_CPU_FLAG_ATOM)
         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
-    } else {
+    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
-    }
-    if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
+    if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
-    }
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif
 #endif
@@ -2862,9 +3166,11 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
     if (bit_depth == 10) {
         // AVX implies !cache64.
         // TODO: Port cache(32|64) detection from x264.
-        H264_QPEL_FUNCS_10(1, 0, sse2);
-        H264_QPEL_FUNCS_10(2, 0, sse2);
-        H264_QPEL_FUNCS_10(3, 0, sse2);
+        if (CONFIG_H264QPEL) {
+            H264_QPEL_FUNCS_10(1, 0, sse2);
+            H264_QPEL_FUNCS_10(2, 0, sse2);
+            H264_QPEL_FUNCS_10(3, 0, sse2);
+        }
 
         if (CONFIG_H264CHROMA) {
             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
@@ -2875,13 +3181,13 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 #endif
 }
 
-void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
+void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
 
     if (avctx->dsp_mask) {
         if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
-            mm_flags |= (avctx->dsp_mask & 0xffff);
+            mm_flags |=   avctx->dsp_mask & 0xffff;
         else
             mm_flags &= ~(avctx->dsp_mask & 0xffff);
     }
@@ -2902,56 +3208,57 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #endif
 
     if (mm_flags & AV_CPU_FLAG_MMX) {
-        const int idct_algo= avctx->idct_algo;
+        const int idct_algo = avctx->idct_algo;
 
         if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
-            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
-                c->idct_put= ff_simple_idct_put_mmx;
-                c->idct_add= ff_simple_idct_add_mmx;
-                c->idct    = ff_simple_idct_mmx;
-                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
+                c->idct_put              = ff_simple_idct_put_mmx;
+                c->idct_add              = ff_simple_idct_add_mmx;
+                c->idct                  = ff_simple_idct_mmx;
+                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
 #if CONFIG_GPL
-            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
-                if(mm_flags & AV_CPU_FLAG_MMX2){
-                    c->idct_put= ff_libmpeg2mmx2_idct_put;
-                    c->idct_add= ff_libmpeg2mmx2_idct_add;
-                    c->idct    = ff_mmxext_idct;
-                }else{
-                    c->idct_put= ff_libmpeg2mmx_idct_put;
-                    c->idct_add= ff_libmpeg2mmx_idct_add;
-                    c->idct    = ff_mmx_idct;
+            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
+                if (mm_flags & AV_CPU_FLAG_MMX2) {
+                    c->idct_put = ff_libmpeg2mmx2_idct_put;
+                    c->idct_add = ff_libmpeg2mmx2_idct_add;
+                    c->idct     = ff_mmxext_idct;
+                } else {
+                    c->idct_put = ff_libmpeg2mmx_idct_put;
+                    c->idct_add = ff_libmpeg2mmx_idct_add;
+                    c->idct     = ff_mmx_idct;
                }
-                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
 #endif
-            }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
-                     idct_algo==FF_IDCT_VP3 && HAVE_YASM){
-                if(mm_flags & AV_CPU_FLAG_SSE2){
-                    c->idct_put= ff_vp3_idct_put_sse2;
-                    c->idct_add= ff_vp3_idct_add_sse2;
-                    c->idct    = ff_vp3_idct_sse2;
-                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
-                }else{
-                    c->idct_put= ff_vp3_idct_put_mmx;
-                    c->idct_add= ff_vp3_idct_add_mmx;
-                    c->idct    = ff_vp3_idct_mmx;
-                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
+            } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
+                        CONFIG_VP6_DECODER) &&
+                       idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->idct_put              = ff_vp3_idct_put_sse2;
+                    c->idct_add              = ff_vp3_idct_add_sse2;
+                    c->idct                  = ff_vp3_idct_sse2;
+                    c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+                } else {
+                    c->idct_put              = ff_vp3_idct_put_mmx;
+                    c->idct_add              = ff_vp3_idct_add_mmx;
+                    c->idct                  = ff_vp3_idct_mmx;
+                    c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
                }
-            }else if(idct_algo==FF_IDCT_CAVS){
-                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
-            }else if(idct_algo==FF_IDCT_XVIDMMX){
-                if(mm_flags & AV_CPU_FLAG_SSE2){
-                    c->idct_put= ff_idct_xvid_sse2_put;
-                    c->idct_add= ff_idct_xvid_sse2_add;
-                    c->idct    = ff_idct_xvid_sse2;
-                    c->idct_permutation_type= FF_SSE2_IDCT_PERM;
-                }else if(mm_flags & AV_CPU_FLAG_MMX2){
-                    c->idct_put= ff_idct_xvid_mmx2_put;
-                    c->idct_add= ff_idct_xvid_mmx2_add;
-                    c->idct    = ff_idct_xvid_mmx2;
-                }else{
-                    c->idct_put= ff_idct_xvid_mmx_put;
-                    c->idct_add= ff_idct_xvid_mmx_add;
-                    c->idct    = ff_idct_xvid_mmx;
+            } else if (idct_algo == FF_IDCT_CAVS) {
+                    c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+            } else if (idct_algo == FF_IDCT_XVIDMMX) {
+                if (mm_flags & AV_CPU_FLAG_SSE2) {
+                    c->idct_put              = ff_idct_xvid_sse2_put;
+                    c->idct_add              = ff_idct_xvid_sse2_add;
+                    c->idct                  = ff_idct_xvid_sse2;
+                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
+                } else if (mm_flags & AV_CPU_FLAG_MMX2) {
+                    c->idct_put              = ff_idct_xvid_mmx2_put;
+                    c->idct_add              = ff_idct_xvid_mmx2_add;
+                    c->idct                  = ff_idct_xvid_mmx2;
+                } else {
+                    c->idct_put              = ff_idct_xvid_mmx_put;
+                    c->idct_add              = ff_idct_xvid_mmx_add;
+                    c->idct                  = ff_idct_xvid_mmx;
                }
            }
        }
@@ -2962,13 +3269,13 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
     if (mm_flags & AV_CPU_FLAG_MMX2)
         dsputil_init_mmx2(c, avctx, mm_flags);
 
-    if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
+    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
         dsputil_init_3dnow(c, avctx, mm_flags);
 
-    if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT))
+    if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
        dsputil_init_3dnow2(c, avctx, mm_flags);
 
-    if (HAVE_SSE && (mm_flags & AV_CPU_FLAG_SSE))
+    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
        dsputil_init_sse(c, avctx, mm_flags);
 
     if (mm_flags & AV_CPU_FLAG_SSE2)
@@ -2209,14 +2209,11 @@ static int mov_write_isml_manifest(AVIOContext *pb, MOVMuxContext *mov)
                                    size);
                av_free(ptr);
            }
-            } else {
-                param_write_hex(pb, "CodecPrivateData", track->enc->extradata,
-                                track->enc->extradata_size);
-            }
-            if (track->enc->codec_id == CODEC_ID_H264) {
                param_write_string(pb, "FourCC", "H264");
            } else if (track->enc->codec_id == CODEC_ID_VC1) {
                param_write_string(pb, "FourCC", "WVC1");
+                param_write_hex(pb, "CodecPrivateData", track->enc->extradata,
+                                track->enc->extradata_size);
            }
            param_write_int(pb, "MaxWidth", track->enc->width);
            param_write_int(pb, "MaxHeight", track->enc->height);