Browse code

Merge remote-tracking branch 'qatar/master'

* qatar/master: (22 commits)
configure: enable memalign_hack automatically when needed
swscale: unbreak the build on non-x86 systems.
swscale: remove if(bitexact) branch from functions.
swscale: remove if(canMMX2BeUsed) conditional.
swscale: remove swScale_{c,MMX,MMX2} duplication.
swscale: use emms_c().
Move emms_c() from libavcodec to libavutil.
tiff: set palette in the context when specified in TIFF_PAL tag
rtsp: use strtoul to parse rtptime and seq values.
pgssubdec: fix incorrect colors.
dvdsubdec: fix incorrect colors.
ape: Allow demuxing of files with metadata tags.
swscale: remove dead macro WRITEBGR24OLD.
swscale: remove AMD3DNOW "optimizations".
swscale: remove duplicate code in ppc/ subdirectory.
swscale: remove duplicated x86/ functions.
swscale: force --enable-runtime-cpudetect and remove SWS_CPU_CAPS_*.
vsrc_buffer.h: add file doxy
vsrc_buffer: tweak error message in init()
msmpeg4: reindent.
...

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2011/05/25 13:32:23
Showing 40 changed files
... ...
@@ -2859,11 +2859,6 @@ check_header X11/extensions/XvMClib.h
2859 2859
 
2860 2860
 check_struct dxva2api.h DXVA_PictureParameters wDecodedPictureIndex
2861 2861
 
2862
-if ! enabled_any memalign memalign_hack posix_memalign malloc_aligned &&
2863
-     enabled_any $need_memalign ; then
2864
-    die "Error, no aligned memory allocator but SSE enabled, disable it or use --enable-memalign-hack."
2865
-fi
2866
-
2867 2862
 disabled  zlib || check_lib   zlib.h      zlibVersion -lz   || disable  zlib
2868 2863
 disabled bzlib || check_lib2 bzlib.h BZ2_bzlibVersion -lbz2 || disable bzlib
2869 2864
 
... ...
@@ -3156,6 +3151,9 @@ check_deps $CONFIG_LIST       \
3156 3156
 
3157 3157
 enabled asm || { arch=c; disable $ARCH_LIST $ARCH_EXT_LIST; }
3158 3158
 
3159
+! enabled_any memalign posix_memalign malloc_aligned &&
3160
+    enabled_any $need_memalign && enable memalign_hack
3161
+
3159 3162
 echo "install prefix            $prefix"
3160 3163
 echo "source path               $source_path"
3161 3164
 echo "C compiler                $cc"
... ...
@@ -433,3 +433,49 @@ For more information about libx264 and the supported options see:
433 433
 @url{http://www.videolan.org/developers/x264.html}
434 434
 
435 435
 @c man end VIDEO ENCODERS
436
+
437
+@subheading Floating-Point-Only AC-3 Encoding Options
438
+
439
+These options are only valid for the floating-point encoder and do not exist
440
+for the fixed-point encoder due to the corresponding features not being
441
+implemented in fixed-point.
442
+
443
+@table @option
444
+
445
+@item -channel_coupling @var{boolean}
446
+Enables/Disables use of channel coupling, which is an optional AC-3 feature
447
+that increases quality by combining high frequency information from multiple
448
+channels into a single channel. The per-channel high frequency information is
449
+sent with less accuracy in both the frequency and time domains. This allows
450
+more bits to be used for lower frequencies while preserving enough information
451
+to reconstruct the high frequencies. This option is enabled by default for the
452
+floating-point encoder and should generally be left as enabled except for
453
+testing purposes or to increase encoding speed.
454
+@table @option
455
+@item -1
456
+@itemx auto
457
+Selected by Encoder (default)
458
+@item 0
459
+@itemx off
460
+Disable Channel Coupling
461
+@item 1
462
+@itemx on
463
+Enable Channel Coupling
464
+@end table
465
+
466
+@item -cpl_start_band @var{number}
467
+Coupling Start Band. Sets the channel coupling start band, from 1 to 15. If a
468
+value higher than the bandwidth is used, it will be reduced to 1 less than the
469
+coupling end band. If @var{auto} is used, the start band will be determined by
470
+the encoder based on the bit rate, sample rate, and channel layout. This option
471
+has no effect if channel coupling is disabled.
472
+@table @option
473
+@item -1
474
+@itemx auto
475
+Selected by Encoder (default)
476
+@end table
477
+
478
+@end table
479
+
480
+@c man end ENCODERS
481
+
... ...
@@ -269,8 +269,6 @@ OBJS-$(CONFIG_MPEG2VIDEO_ENCODER)      += mpeg12enc.o mpegvideo_enc.o \
269 269
                                           mpegvideo.o error_resilience.o
270 270
 OBJS-$(CONFIG_MPEG4_VAAPI_HWACCEL)     += vaapi_mpeg4.o
271 271
 OBJS-$(CONFIG_MSMPEG4V1_DECODER)       += msmpeg4.o msmpeg4data.o
272
-OBJS-$(CONFIG_MSMPEG4V1_ENCODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
273
-                                          h263.o ituh263dec.o mpeg4videodec.o
274 272
 OBJS-$(CONFIG_MSMPEG4V2_DECODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
275 273
                                           h263.o ituh263dec.o mpeg4videodec.o
276 274
 OBJS-$(CONFIG_MSMPEG4V2_ENCODER)       += msmpeg4.o msmpeg4data.o h263dec.o \
... ...
@@ -28,7 +28,8 @@
28 28
 #define AVCODEC_AC3_H
29 29
 
30 30
 #define AC3_MAX_CODED_FRAME_SIZE 3840 /* in bytes */
31
-#define AC3_MAX_CHANNELS 6 /* including LFE channel */
31
+#define AC3_MAX_CHANNELS 7            /**< maximum number of channels, including coupling channel */
32
+#define CPL_CH 0                      /**< coupling channel index */
32 33
 
33 34
 #define AC3_MAX_COEFS   256
34 35
 #define AC3_BLOCK_SIZE  256
... ...
@@ -158,7 +159,9 @@ typedef struct AC3EncOptions {
158 158
 
159 159
     /* other encoding options */
160 160
     int allow_per_frame_metadata;
161
-    int stereo_rematrixing;    
161
+    int stereo_rematrixing;
162
+    int channel_coupling;
163
+    int cpl_start;    
162 164
 } AC3EncOptions;
163 165
 
164 166
 
... ...
@@ -58,11 +58,6 @@
58 58
 #include "fft.h"
59 59
 #include "fmtconvert.h"
60 60
 
61
-/* override ac3.h to include coupling channel */
62
-#undef AC3_MAX_CHANNELS
63
-#define AC3_MAX_CHANNELS 7
64
-#define CPL_CH 0
65
-
66 61
 #define AC3_OUTPUT_LFEON  8
67 62
 
68 63
 #define SPX_MAX_BANDS    17
... ...
@@ -54,12 +54,6 @@ const uint8_t ff_eac3_hebap_tab[64] = {
54 54
 };
55 55
 
56 56
 /**
57
- * Table E2.16 Default Coupling Banding Structure
58
- */
59
-const uint8_t ff_eac3_default_cpl_band_struct[18] =
60
-{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1 };
61
-
62
-/**
63 57
  * Table E2.15 Default Spectral Extension Banding Structure
64 58
  */
65 59
 const uint8_t ff_eac3_default_spx_band_struct[17] =
... ...
@@ -27,7 +27,6 @@
27 27
 extern const uint8_t ff_ac3_ungroup_3_in_5_bits_tab[32][3];
28 28
 
29 29
 extern const uint8_t ff_eac3_hebap_tab[64];
30
-extern const uint8_t ff_eac3_default_cpl_band_struct[18];
31 30
 extern const uint8_t ff_eac3_default_spx_band_struct[17];
32 31
 
33 32
 #endif /* AVCODEC_AC3DEC_DATA_H */
... ...
@@ -70,6 +70,7 @@ typedef struct AC3MDCTContext {
70 70
     FFTContext fft;                     ///< FFT context for MDCT calculation
71 71
 } AC3MDCTContext;
72 72
 
73
+
73 74
 /**
74 75
  * Data for a single audio block.
75 76
  */
... ...
@@ -83,10 +84,22 @@ typedef struct AC3Block {
83 83
     int16_t  **band_psd;                        ///< psd per critical band
84 84
     int16_t  **mask;                            ///< masking curve
85 85
     uint16_t **qmant;                           ///< quantized mantissas
86
+    uint8_t  **cpl_coord_exp;                   ///< coupling coord exponents           (cplcoexp)
87
+    uint8_t  **cpl_coord_mant;                  ///< coupling coord mantissas           (cplcomant)
86 88
     uint8_t  coeff_shift[AC3_MAX_CHANNELS];     ///< fixed-point coefficient shift values
87 89
     uint8_t  new_rematrixing_strategy;          ///< send new rematrixing flags in this block
90
+    int      num_rematrixing_bands;             ///< number of rematrixing bands
88 91
     uint8_t  rematrixing_flags[4];              ///< rematrixing flags
89 92
     struct AC3Block *exp_ref_block[AC3_MAX_CHANNELS]; ///< reference blocks for EXP_REUSE
93
+    int      new_cpl_strategy;                  ///< send new coupling strategy
94
+    int      cpl_in_use;                        ///< coupling in use for this block     (cplinu)
95
+    uint8_t  channel_in_cpl[AC3_MAX_CHANNELS];  ///< channel in coupling                (chincpl)
96
+    int      num_cpl_channels;                  ///< number of channels in coupling
97
+    uint8_t  new_cpl_coords;                    ///< send new coupling coordinates      (cplcoe)
98
+    uint8_t  cpl_master_exp[AC3_MAX_CHANNELS];  ///< coupling coord master exponents    (mstrcplco)
99
+    int      new_snr_offsets;                   ///< send new SNR offsets
100
+    int      new_cpl_leak;                      ///< send new coupling leak info
101
+    int      end_freq[AC3_MAX_CHANNELS];        ///< end frequency bin                  (endmant)
90 102
 } AC3Block;
91 103
 
92 104
 /**
... ...
@@ -133,10 +146,16 @@ typedef struct AC3EncodeContext {
133 133
 
134 134
     int cutoff;                             ///< user-specified cutoff frequency, in Hz
135 135
     int bandwidth_code;                     ///< bandwidth code (0 to 60)               (chbwcod)
136
-    int nb_coefs[AC3_MAX_CHANNELS];
136
+    int start_freq[AC3_MAX_CHANNELS];       ///< start frequency bin                    (strtmant)
137
+    int cpl_end_freq;                       ///< coupling channel end frequency bin
138
+
139
+    int cpl_on;                             ///< coupling turned on for this frame
140
+    int cpl_enabled;                        ///< coupling enabled for all frames
141
+    int num_cpl_subbands;                   ///< number of coupling subbands            (ncplsubnd)
142
+    int num_cpl_bands;                      ///< number of coupling bands               (ncplbnd)
143
+    uint8_t cpl_band_sizes[AC3_MAX_CPL_BANDS];  ///< number of coeffs in each coupling band
137 144
 
138 145
     int rematrixing_enabled;                ///< stereo rematrixing enabled
139
-    int num_rematrixing_bands;              ///< number of rematrixing bands
140 146
 
141 147
     /* bitrate allocation control */
142 148
     int slow_gain_code;                     ///< slow gain code                         (sgaincod)
... ...
@@ -163,6 +182,8 @@ typedef struct AC3EncodeContext {
163 163
     int16_t *band_psd_buffer;
164 164
     int16_t *mask_buffer;
165 165
     uint16_t *qmant_buffer;
166
+    uint8_t *cpl_coord_exp_buffer;
167
+    uint8_t *cpl_coord_mant_buffer;
166 168
 
167 169
     uint8_t exp_strategy[AC3_MAX_CHANNELS][AC3_MAX_BLOCKS]; ///< exponent strategies
168 170
 
... ...
@@ -237,6 +258,12 @@ const AVOption ff_ac3_options[] = {
237 237
     {"hdcd",     "HDCD",               0, FF_OPT_TYPE_CONST, {.dbl = 1 }, INT_MIN, INT_MAX, AC3ENC_PARAM, "ad_conv_type"},
238 238
 /* Other Encoding Options */
239 239
 {"stereo_rematrixing", "Stereo Rematrixing", OFFSET(stereo_rematrixing), FF_OPT_TYPE_INT, {.dbl = 1 }, 0, 1, AC3ENC_PARAM},
240
+#if CONFIG_AC3ENC_FLOAT
241
+{"channel_coupling",   "Channel Coupling",   OFFSET(channel_coupling),   FF_OPT_TYPE_INT, {.dbl = 1 }, 0, 1, AC3ENC_PARAM, "channel_coupling"},
242
+    {"auto", "Selected by the Encoder", 0, FF_OPT_TYPE_CONST, {.dbl = -1 }, INT_MIN, INT_MAX, AC3ENC_PARAM, "channel_coupling"},
243
+{"cpl_start_band", "Coupling Start Band", OFFSET(cpl_start), FF_OPT_TYPE_INT, {.dbl = -1 }, -1, 15, AC3ENC_PARAM, "cpl_start_band"},
244
+    {"auto", "Selected by the Encoder", 0, FF_OPT_TYPE_CONST, {.dbl = -1 }, INT_MIN, INT_MAX, AC3ENC_PARAM, "cpl_start_band"},
245
+#endif
240 246
 {NULL}
241 247
 };
242 248
 #endif
... ...
@@ -267,9 +294,9 @@ static void scale_coefficients(AC3EncodeContext *s);
267 267
 
268 268
 /**
269 269
  * LUT for number of exponent groups.
270
- * exponent_group_tab[exponent strategy-1][number of coefficients]
270
+ * exponent_group_tab[coupling][exponent strategy-1][number of coefficients]
271 271
  */
272
-static uint8_t exponent_group_tab[3][256];
272
+static uint8_t exponent_group_tab[2][3][256];
273 273
 
274 274
 
275 275
 /**
... ...
@@ -331,6 +358,49 @@ static const uint8_t ac3_bandwidth_tab[5][3][19] = {
331 331
 
332 332
 
333 333
 /**
334
+ * LUT to select the coupling start band based on the bit rate, sample rate, and
335
+ * number of full-bandwidth channels. -1 = coupling off
336
+ * ac3_coupling_start_tab[channel_mode-2][sample rate code][bit rate code]
337
+ *
338
+ * TODO: more testing for optimal parameters.
339
+ *       multi-channel tests at 44.1kHz and 32kHz.
340
+ */
341
+static const int8_t ac3_coupling_start_tab[6][3][19] = {
342
+//      32  40  48  56  64  80  96 112 128 160 192 224 256 320 384 448 512 576 640
343
+
344
+    // 2/0
345
+    { {  0,  0,  0,  0,  0,  0,  0,  1,  1,  7,  8, 11, 12, -1, -1, -1, -1, -1, -1 },
346
+      {  0,  0,  0,  0,  0,  0,  1,  3,  5,  7, 10, 12, 13, -1, -1, -1, -1, -1, -1 },
347
+      {  0,  0,  0,  0,  1,  2,  2,  9, 13, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
348
+
349
+    // 3/0
350
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
351
+      {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
352
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
353
+
354
+    // 2/1 - untested
355
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
356
+      {  0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  6,  9, 11, 12, 13, -1, -1, -1, -1 },
357
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
358
+
359
+    // 3/1
360
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
361
+      {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
362
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
363
+
364
+    // 2/2 - untested
365
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
366
+      {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  2, 10, 11, 11, 12, 12, 14, -1 },
367
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
368
+
369
+    // 3/2
370
+    { {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  6,  8, 11, 12, 12, -1, -1 },
371
+      {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  6,  8, 11, 12, 12, -1, -1 },
372
+      { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 } },
373
+};
374
+
375
+
376
+/**
334 377
  * Adjust the frame size to make the average bit rate match the target bit rate.
335 378
  * This is only needed for 11025, 22050, and 44100 sample rates.
336 379
  */
... ...
@@ -392,15 +462,297 @@ static void apply_mdct(AC3EncodeContext *s)
392 392
 
393 393
             apply_window(&s->dsp, s->windowed_samples, input_samples, s->mdct.window, AC3_WINDOW_SIZE);
394 394
 
395
-            block->coeff_shift[ch] = normalize_samples(s);
395
+            block->coeff_shift[ch+1] = normalize_samples(s);
396 396
 
397
-            s->mdct.fft.mdct_calcw(&s->mdct.fft, block->mdct_coef[ch],
397
+            s->mdct.fft.mdct_calcw(&s->mdct.fft, block->mdct_coef[ch+1],
398 398
                                    s->windowed_samples);
399 399
         }
400 400
     }
401 401
 }
402 402
 
403 403
 
404
+static void compute_coupling_strategy(AC3EncodeContext *s)
405
+{
406
+    int blk, ch;
407
+    int got_cpl_snr;
408
+
409
+    /* set coupling use flags for each block/channel */
410
+    /* TODO: turn coupling on/off and adjust start band based on bit usage */
411
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
412
+        AC3Block *block = &s->blocks[blk];
413
+        for (ch = 1; ch <= s->fbw_channels; ch++)
414
+            block->channel_in_cpl[ch] = s->cpl_on;
415
+    }
416
+
417
+    /* enable coupling for each block if at least 2 channels have coupling
418
+       enabled for that block */
419
+    got_cpl_snr = 0;
420
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
421
+        AC3Block *block = &s->blocks[blk];
422
+        block->num_cpl_channels = 0;
423
+        for (ch = 1; ch <= s->fbw_channels; ch++)
424
+            block->num_cpl_channels += block->channel_in_cpl[ch];
425
+        block->cpl_in_use = block->num_cpl_channels > 1;
426
+        if (!block->cpl_in_use) {
427
+            block->num_cpl_channels = 0;
428
+            for (ch = 1; ch <= s->fbw_channels; ch++)
429
+                block->channel_in_cpl[ch] = 0;
430
+        }
431
+
432
+        block->new_cpl_strategy = !blk;
433
+        if (blk) {
434
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
435
+                if (block->channel_in_cpl[ch] != s->blocks[blk-1].channel_in_cpl[ch]) {
436
+                    block->new_cpl_strategy = 1;
437
+                    break;
438
+                }
439
+            }
440
+        }
441
+        block->new_cpl_leak = block->new_cpl_strategy;
442
+
443
+        if (!blk || (block->cpl_in_use && !got_cpl_snr)) {
444
+            block->new_snr_offsets = 1;
445
+            if (block->cpl_in_use)
446
+                got_cpl_snr = 1;
447
+        } else {
448
+            block->new_snr_offsets = 0;
449
+        }
450
+    }
451
+
452
+    /* set bandwidth for each channel */
453
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
454
+        AC3Block *block = &s->blocks[blk];
455
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
456
+            if (block->channel_in_cpl[ch])
457
+                block->end_freq[ch] = s->start_freq[CPL_CH];
458
+            else
459
+                block->end_freq[ch] = s->bandwidth_code * 3 + 73;
460
+        }
461
+    }
462
+}
463
+
464
+
465
+/**
466
+ * Calculate a single coupling coordinate.
467
+ */
468
+static inline float calc_cpl_coord(float energy_ch, float energy_cpl)
469
+{
470
+    float coord = 0.125;
471
+    if (energy_cpl > 0)
472
+        coord *= sqrtf(energy_ch / energy_cpl);
473
+    return coord;
474
+}
475
+
476
+
477
+/**
478
+ * Calculate coupling channel and coupling coordinates.
479
+ * TODO: Currently this is only used for the floating-point encoder. I was
480
+ *       able to make it work for the fixed-point encoder, but quality was
481
+ *       generally lower in most cases than not using coupling. If a more
482
+ *       adaptive coupling strategy were to be implemented it might be useful
483
+ *       at that time to use coupling for the fixed-point encoder as well.
484
+ */
485
+static void apply_channel_coupling(AC3EncodeContext *s)
486
+{
487
+#if CONFIG_AC3ENC_FLOAT
488
+    DECLARE_ALIGNED(16, float,   cpl_coords)      [AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16];
489
+    DECLARE_ALIGNED(16, int32_t, fixed_cpl_coords)[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16];
490
+    int blk, ch, bnd, i, j;
491
+    CoefSumType energy[AC3_MAX_BLOCKS][AC3_MAX_CHANNELS][16] = {{{0}}};
492
+    int num_cpl_coefs = s->num_cpl_subbands * 12;
493
+
494
+    /* calculate coupling channel from fbw channels */
495
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
496
+        AC3Block *block = &s->blocks[blk];
497
+        CoefType *cpl_coef = &block->mdct_coef[CPL_CH][s->start_freq[CPL_CH]];
498
+        if (!block->cpl_in_use)
499
+            continue;
500
+        memset(cpl_coef-1, 0, (num_cpl_coefs+4) * sizeof(*cpl_coef));
501
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
502
+            CoefType *ch_coef = &block->mdct_coef[ch][s->start_freq[CPL_CH]];
503
+            if (!block->channel_in_cpl[ch])
504
+                continue;
505
+            for (i = 0; i < num_cpl_coefs; i++)
506
+                cpl_coef[i] += ch_coef[i];
507
+        }
508
+        /* note: coupling start bin % 4 will always be 1 and num_cpl_coefs
509
+                 will always be a multiple of 12, so we need to subtract 1 from
510
+                 the start and add 4 to the length when using optimized
511
+                 functions which require 16-byte alignment. */
512
+
513
+        /* coefficients must be clipped to +/- 1.0 in order to be encoded */
514
+        s->dsp.vector_clipf(cpl_coef-1, cpl_coef-1, -1.0f, 1.0f, num_cpl_coefs+4);
515
+
516
+        /* scale coupling coefficients from float to 24-bit fixed-point */
517
+        s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][s->start_freq[CPL_CH]-1],
518
+                                   cpl_coef-1, num_cpl_coefs+4);
519
+    }
520
+
521
+    /* calculate energy in each band in coupling channel and each fbw channel */
522
+    /* TODO: possibly use SIMD to speed up energy calculation */
523
+    bnd = 0;
524
+    i = s->start_freq[CPL_CH];
525
+    while (i < s->cpl_end_freq) {
526
+        int band_size = s->cpl_band_sizes[bnd];
527
+        for (ch = CPL_CH; ch <= s->fbw_channels; ch++) {
528
+            for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
529
+                AC3Block *block = &s->blocks[blk];
530
+                if (!block->cpl_in_use || (ch > CPL_CH && !block->channel_in_cpl[ch]))
531
+                    continue;
532
+                for (j = 0; j < band_size; j++) {
533
+                    CoefType v = block->mdct_coef[ch][i+j];
534
+                    MAC_COEF(energy[blk][ch][bnd], v, v);
535
+                }
536
+            }
537
+        }
538
+        i += band_size;
539
+        bnd++;
540
+    }
541
+
542
+    /* determine which blocks to send new coupling coordinates for */
543
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
544
+        AC3Block *block  = &s->blocks[blk];
545
+        AC3Block *block0 = blk ? &s->blocks[blk-1] : NULL;
546
+        int new_coords = 0;
547
+        CoefSumType coord_diff[AC3_MAX_CHANNELS] = {0,};
548
+
549
+        if (block->cpl_in_use) {
550
+            /* calculate coupling coordinates for all blocks and calculate the
551
+               average difference between coordinates in successive blocks */
552
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
553
+                if (!block->channel_in_cpl[ch])
554
+                    continue;
555
+
556
+                for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
557
+                    cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy[blk][ch][bnd],
558
+                                                              energy[blk][CPL_CH][bnd]);
559
+                    if (blk > 0 && block0->cpl_in_use &&
560
+                        block0->channel_in_cpl[ch]) {
561
+                        coord_diff[ch] += fabs(cpl_coords[blk-1][ch][bnd] -
562
+                                               cpl_coords[blk  ][ch][bnd]);
563
+                    }
564
+                }
565
+                coord_diff[ch] /= s->num_cpl_bands;
566
+            }
567
+
568
+            /* send new coordinates if this is the first block, if previous
569
+             * block did not use coupling but this block does, the channels
570
+             * using coupling have changed from the previous block, or the
571
+             * coordinate difference from the last block for any channel is
572
+             * greater than a threshold value. */
573
+            if (blk == 0) {
574
+                new_coords = 1;
575
+            } else if (!block0->cpl_in_use) {
576
+                new_coords = 1;
577
+            } else {
578
+                for (ch = 1; ch <= s->fbw_channels; ch++) {
579
+                    if (block->channel_in_cpl[ch] && !block0->channel_in_cpl[ch]) {
580
+                        new_coords = 1;
581
+                        break;
582
+                    }
583
+                }
584
+                if (!new_coords) {
585
+                    for (ch = 1; ch <= s->fbw_channels; ch++) {
586
+                        if (block->channel_in_cpl[ch] && coord_diff[ch] > 0.04) {
587
+                            new_coords = 1;
588
+                            break;
589
+                        }
590
+                    }
591
+                }
592
+            }
593
+        }
594
+        block->new_cpl_coords = new_coords;
595
+    }
596
+
597
+    /* calculate final coupling coordinates, taking into account reusing of
598
+       coordinates in successive blocks */
599
+    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
600
+        blk = 0;
601
+        while (blk < AC3_MAX_BLOCKS) {
602
+            int blk1;
603
+            CoefSumType energy_cpl;
604
+            AC3Block *block  = &s->blocks[blk];
605
+
606
+            if (!block->cpl_in_use) {
607
+                blk++;
608
+                continue;
609
+            }
610
+
611
+            energy_cpl = energy[blk][CPL_CH][bnd];
612
+            blk1 = blk+1;
613
+            while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
614
+                if (s->blocks[blk1].cpl_in_use)
615
+                    energy_cpl += energy[blk1][CPL_CH][bnd];
616
+                blk1++;
617
+            }
618
+
619
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
620
+                CoefType energy_ch;
621
+                if (!block->channel_in_cpl[ch])
622
+                    continue;
623
+                energy_ch = energy[blk][ch][bnd];
624
+                blk1 = blk+1;
625
+                while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
626
+                    if (s->blocks[blk1].cpl_in_use)
627
+                        energy_ch += energy[blk1][ch][bnd];
628
+                    blk1++;
629
+                }
630
+                cpl_coords[blk][ch][bnd] = calc_cpl_coord(energy_ch, energy_cpl);
631
+            }
632
+            blk = blk1;
633
+        }
634
+    }
635
+
636
+    /* calculate exponents/mantissas for coupling coordinates */
637
+    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
638
+        AC3Block *block = &s->blocks[blk];
639
+        if (!block->cpl_in_use || !block->new_cpl_coords)
640
+            continue;
641
+
642
+        s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
643
+                                   cpl_coords[blk][1],
644
+                                   s->fbw_channels * 16);
645
+        s->ac3dsp.extract_exponents(block->cpl_coord_exp[1],
646
+                                    fixed_cpl_coords[blk][1],
647
+                                    s->fbw_channels * 16);
648
+
649
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
650
+            int bnd, min_exp, max_exp, master_exp;
651
+
652
+            /* determine master exponent */
653
+            min_exp = max_exp = block->cpl_coord_exp[ch][0];
654
+            for (bnd = 1; bnd < s->num_cpl_bands; bnd++) {
655
+                int exp = block->cpl_coord_exp[ch][bnd];
656
+                min_exp = FFMIN(exp, min_exp);
657
+                max_exp = FFMAX(exp, max_exp);
658
+            }
659
+            master_exp = ((max_exp - 15) + 2) / 3;
660
+            master_exp = FFMAX(master_exp, 0);
661
+            while (min_exp < master_exp * 3)
662
+                master_exp--;
663
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
664
+                block->cpl_coord_exp[ch][bnd] = av_clip(block->cpl_coord_exp[ch][bnd] -
665
+                                                        master_exp * 3, 0, 15);
666
+            }
667
+            block->cpl_master_exp[ch] = master_exp;
668
+
669
+            /* quantize mantissas */
670
+            for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
671
+                int cpl_exp  = block->cpl_coord_exp[ch][bnd];
672
+                int cpl_mant = (fixed_cpl_coords[blk][ch][bnd] << (5 + cpl_exp + master_exp * 3)) >> 24;
673
+                if (cpl_exp == 15)
674
+                    cpl_mant >>= 1;
675
+                else
676
+                    cpl_mant -= 16;
677
+
678
+                block->cpl_coord_mant[ch][bnd] = cpl_mant;
679
+            }
680
+        }
681
+    }
682
+#endif /* CONFIG_AC3ENC_FLOAT */
683
+}
684
+
685
+
404 686
 /**
405 687
  * Determine rematrixing flags for each block and band.
406 688
  */
... ...
@@ -413,23 +765,32 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
413 413
     if (s->channel_mode != AC3_CHMODE_STEREO)
414 414
         return;
415 415
 
416
-    s->num_rematrixing_bands = 4;
417
-
418
-    nb_coefs = FFMIN(s->nb_coefs[0], s->nb_coefs[1]);
419
-
420 416
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
421 417
         block = &s->blocks[blk];
422 418
         block->new_rematrixing_strategy = !blk;
423
-        if (!s->rematrixing_enabled)
419
+
420
+        if (!s->rematrixing_enabled) {
421
+            block0 = block;
424 422
             continue;
425
-        for (bnd = 0; bnd < s->num_rematrixing_bands; bnd++) {
423
+        }
424
+
425
+        block->num_rematrixing_bands = 4;
426
+        if (block->cpl_in_use) {
427
+            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] <= 61);
428
+            block->num_rematrixing_bands -= (s->start_freq[CPL_CH] == 37);
429
+            if (blk && block->num_rematrixing_bands != block0->num_rematrixing_bands)
430
+                block->new_rematrixing_strategy = 1;
431
+        }
432
+        nb_coefs = FFMIN(block->end_freq[1], block->end_freq[2]);
433
+
434
+        for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
426 435
             /* calculate sum of squared coeffs for one band in one block */
427 436
             int start = ff_ac3_rematrix_band_tab[bnd];
428 437
             int end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
429 438
             CoefSumType sum[4] = {0,};
430 439
             for (i = start; i < end; i++) {
431
-                CoefType lt = block->mdct_coef[0][i];
432
-                CoefType rt = block->mdct_coef[1][i];
440
+                CoefType lt = block->mdct_coef[1][i];
441
+                CoefType rt = block->mdct_coef[2][i];
433 442
                 CoefType md = lt + rt;
434 443
                 CoefType sd = lt - rt;
435 444
                 MAC_COEF(sum[0], lt, lt);
... ...
@@ -468,21 +829,20 @@ static void apply_rematrixing(AC3EncodeContext *s)
468 468
     if (!s->rematrixing_enabled)
469 469
         return;
470 470
 
471
-    nb_coefs = FFMIN(s->nb_coefs[0], s->nb_coefs[1]);
472
-
473 471
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
474 472
         AC3Block *block = &s->blocks[blk];
475 473
         if (block->new_rematrixing_strategy)
476 474
             flags = block->rematrixing_flags;
477
-        for (bnd = 0; bnd < s->num_rematrixing_bands; bnd++) {
475
+        nb_coefs = FFMIN(block->end_freq[1], block->end_freq[2]);
476
+        for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++) {
478 477
             if (flags[bnd]) {
479 478
                 start = ff_ac3_rematrix_band_tab[bnd];
480 479
                 end   = FFMIN(nb_coefs, ff_ac3_rematrix_band_tab[bnd+1]);
481 480
                 for (i = start; i < end; i++) {
482
-                    int32_t lt = block->fixed_coef[0][i];
483
-                    int32_t rt = block->fixed_coef[1][i];
484
-                    block->fixed_coef[0][i] = (lt + rt) >> 1;
485
-                    block->fixed_coef[1][i] = (lt - rt) >> 1;
481
+                    int32_t lt = block->fixed_coef[1][i];
482
+                    int32_t rt = block->fixed_coef[2][i];
483
+                    block->fixed_coef[1][i] = (lt + rt) >> 1;
484
+                    block->fixed_coef[2][i] = (lt - rt) >> 1;
486 485
                 }
487 486
             }
488 487
         }
... ...
@@ -499,12 +859,13 @@ static av_cold void exponent_init(AC3EncodeContext *s)
499 499
 
500 500
     for (expstr = EXP_D15-1; expstr <= EXP_D45-1; expstr++) {
501 501
         grpsize = 3 << expstr;
502
-        for (i = 73; i < 256; i++) {
503
-            exponent_group_tab[expstr][i] = (i + grpsize - 4) / grpsize;
502
+        for (i = 12; i < 256; i++) {
503
+            exponent_group_tab[0][expstr][i] = (i + grpsize - 4) / grpsize;
504
+            exponent_group_tab[1][expstr][i] = (i              ) / grpsize;
504 505
         }
505 506
     }
506 507
     /* LFE */
507
-    exponent_group_tab[0][7] = 2;
508
+    exponent_group_tab[0][0][7] = 2;
508 509
 }
509 510
 
510 511
 
... ...
@@ -517,7 +878,7 @@ static void extract_exponents(AC3EncodeContext *s)
517 517
 {
518 518
     int blk, ch;
519 519
 
520
-    for (ch = 0; ch < s->channels; ch++) {
520
+    for (ch = !s->cpl_on; ch <= s->channels; ch++) {
521 521
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
522 522
             AC3Block *block = &s->blocks[blk];
523 523
             s->ac3dsp.extract_exponents(block->exp[ch], block->fixed_coef[ch],
... ...
@@ -542,7 +903,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
542 542
 {
543 543
     int ch, blk, blk1;
544 544
 
545
-    for (ch = 0; ch < s->fbw_channels; ch++) {
545
+    for (ch = !s->cpl_on; ch <= s->fbw_channels; ch++) {
546 546
         uint8_t *exp_strategy = s->exp_strategy[ch];
547 547
         uint8_t *exp          = s->blocks[0].exp[ch];
548 548
         int exp_diff;
... ...
@@ -551,13 +912,18 @@ static void compute_exp_strategy(AC3EncodeContext *s)
551 551
            reused in the next frame */
552 552
         exp_strategy[0] = EXP_NEW;
553 553
         exp += AC3_MAX_COEFS;
554
-        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++) {
554
+        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++, exp += AC3_MAX_COEFS) {
555
+            if ((ch == CPL_CH && (!s->blocks[blk].cpl_in_use || !s->blocks[blk-1].cpl_in_use)) ||
556
+                (ch  > CPL_CH && (s->blocks[blk].channel_in_cpl[ch] != s->blocks[blk-1].channel_in_cpl[ch]))) {
557
+                exp_strategy[blk] = EXP_NEW;
558
+                continue;
559
+            }
555 560
             exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16);
556
-            if (exp_diff > EXP_DIFF_THRESHOLD)
561
+            exp_strategy[blk] = EXP_REUSE;
562
+            if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS))
563
+                exp_strategy[blk] = EXP_NEW;
564
+            else if (ch > CPL_CH && exp_diff > EXP_DIFF_THRESHOLD)
557 565
                 exp_strategy[blk] = EXP_NEW;
558
-            else
559
-                exp_strategy[blk] = EXP_REUSE;
560
-            exp += AC3_MAX_COEFS;
561 566
         }
562 567
 
563 568
         /* now select the encoding strategy type : if exponents are often
... ...
@@ -588,25 +954,26 @@ static void compute_exp_strategy(AC3EncodeContext *s)
588 588
 /**
589 589
  * Update the exponents so that they are the ones the decoder will decode.
590 590
  */
591
-static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
591
+static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy,
592
+                                    int cpl)
592 593
 {
593 594
     int nb_groups, i, k;
594 595
 
595
-    nb_groups = exponent_group_tab[exp_strategy-1][nb_exps] * 3;
596
+    nb_groups = exponent_group_tab[cpl][exp_strategy-1][nb_exps] * 3;
596 597
 
597 598
     /* for each group, compute the minimum exponent */
598 599
     switch(exp_strategy) {
599 600
     case EXP_D25:
600
-        for (i = 1, k = 1; i <= nb_groups; i++) {
601
+        for (i = 1, k = 1-cpl; i <= nb_groups; i++) {
601 602
             uint8_t exp_min = exp[k];
602 603
             if (exp[k+1] < exp_min)
603 604
                 exp_min = exp[k+1];
604
-            exp[i] = exp_min;
605
+            exp[i-cpl] = exp_min;
605 606
             k += 2;
606 607
         }
607 608
         break;
608 609
     case EXP_D45:
609
-        for (i = 1, k = 1; i <= nb_groups; i++) {
610
+        for (i = 1, k = 1-cpl; i <= nb_groups; i++) {
610 611
             uint8_t exp_min = exp[k];
611 612
             if (exp[k+1] < exp_min)
612 613
                 exp_min = exp[k+1];
... ...
@@ -614,14 +981,14 @@ static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
614 614
                 exp_min = exp[k+2];
615 615
             if (exp[k+3] < exp_min)
616 616
                 exp_min = exp[k+3];
617
-            exp[i] = exp_min;
617
+            exp[i-cpl] = exp_min;
618 618
             k += 4;
619 619
         }
620 620
         break;
621 621
     }
622 622
 
623 623
     /* constraint for DC exponent */
624
-    if (exp[0] > 15)
624
+    if (!cpl && exp[0] > 15)
625 625
         exp[0] = 15;
626 626
 
627 627
     /* decrease the delta between each groups to within 2 so that they can be
... ...
@@ -632,18 +999,21 @@ static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
632 632
     while (--i >= 0)
633 633
         exp[i] = FFMIN(exp[i], exp[i+1] + 2);
634 634
 
635
+    if (cpl)
636
+        exp[-1] = exp[0] & ~1;
637
+
635 638
     /* now we have the exponent values the decoder will see */
636 639
     switch (exp_strategy) {
637 640
     case EXP_D25:
638
-        for (i = nb_groups, k = nb_groups * 2; i > 0; i--) {
639
-            uint8_t exp1 = exp[i];
641
+        for (i = nb_groups, k = (nb_groups * 2)-cpl; i > 0; i--) {
642
+            uint8_t exp1 = exp[i-cpl];
640 643
             exp[k--] = exp1;
641 644
             exp[k--] = exp1;
642 645
         }
643 646
         break;
644 647
     case EXP_D45:
645
-        for (i = nb_groups, k = nb_groups * 4; i > 0; i--) {
646
-            exp[k] = exp[k-1] = exp[k-2] = exp[k-3] = exp[i];
648
+        for (i = nb_groups, k = (nb_groups * 4)-cpl; i > 0; i--) {
649
+            exp[k] = exp[k-1] = exp[k-2] = exp[k-3] = exp[i-cpl];
647 650
             k -= 4;
648 651
         }
649 652
         break;
... ...
@@ -659,32 +1029,40 @@ static void encode_exponents_blk_ch(uint8_t *exp, int nb_exps, int exp_strategy)
659 659
  */
660 660
 static void encode_exponents(AC3EncodeContext *s)
661 661
 {
662
-    int blk, blk1, ch;
662
+    int blk, blk1, ch, cpl;
663 663
     uint8_t *exp, *exp_strategy;
664 664
     int nb_coefs, num_reuse_blocks;
665 665
 
666
-    for (ch = 0; ch < s->channels; ch++) {
667
-        exp          = s->blocks[0].exp[ch];
666
+    for (ch = !s->cpl_on; ch <= s->channels; ch++) {
667
+        exp          = s->blocks[0].exp[ch] + s->start_freq[ch];
668 668
         exp_strategy = s->exp_strategy[ch];
669
-        nb_coefs     = s->nb_coefs[ch];
670 669
 
670
+        cpl = (ch == CPL_CH);
671 671
         blk = 0;
672 672
         while (blk < AC3_MAX_BLOCKS) {
673
+            AC3Block *block = &s->blocks[blk];
674
+            if (cpl && !block->cpl_in_use) {
675
+                exp += AC3_MAX_COEFS;
676
+                blk++;
677
+                continue;
678
+            }
679
+            nb_coefs = block->end_freq[ch] - s->start_freq[ch];
673 680
             blk1 = blk + 1;
674 681
 
675 682
             /* count the number of EXP_REUSE blocks after the current block
676 683
                and set exponent reference block pointers */
677
-            s->blocks[blk].exp_ref_block[ch] = &s->blocks[blk];
684
+            block->exp_ref_block[ch] = block;
678 685
             while (blk1 < AC3_MAX_BLOCKS && exp_strategy[blk1] == EXP_REUSE) {
679
-                s->blocks[blk1].exp_ref_block[ch] = &s->blocks[blk];
686
+                s->blocks[blk1].exp_ref_block[ch] = block;
680 687
                 blk1++;
681 688
             }
682 689
             num_reuse_blocks = blk1 - blk - 1;
683 690
 
684 691
             /* for the EXP_REUSE case we select the min of the exponents */
685
-            s->ac3dsp.ac3_exponent_min(exp, num_reuse_blocks, nb_coefs);
692
+            s->ac3dsp.ac3_exponent_min(exp-s->start_freq[ch], num_reuse_blocks,
693
+                                       AC3_MAX_COEFS);
686 694
 
687
-            encode_exponents_blk_ch(exp, nb_coefs, exp_strategy[blk]);
695
+            encode_exponents_blk_ch(exp, nb_coefs, exp_strategy[blk], cpl);
688 696
 
689 697
             exp += AC3_MAX_COEFS * (num_reuse_blocks + 1);
690 698
             blk = blk1;
... ...
@@ -700,7 +1078,7 @@ static void encode_exponents(AC3EncodeContext *s)
700 700
  */
701 701
 static void group_exponents(AC3EncodeContext *s)
702 702
 {
703
-    int blk, ch, i;
703
+    int blk, ch, i, cpl;
704 704
     int group_size, nb_groups, bit_count;
705 705
     uint8_t *p;
706 706
     int delta0, delta1, delta2;
... ...
@@ -709,14 +1087,15 @@ static void group_exponents(AC3EncodeContext *s)
709 709
     bit_count = 0;
710 710
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
711 711
         AC3Block *block = &s->blocks[blk];
712
-        for (ch = 0; ch < s->channels; ch++) {
712
+        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
713 713
             int exp_strategy = s->exp_strategy[ch][blk];
714 714
             if (exp_strategy == EXP_REUSE)
715 715
                 continue;
716
+            cpl = (ch == CPL_CH);
716 717
             group_size = exp_strategy + (exp_strategy == EXP_D45);
717
-            nb_groups = exponent_group_tab[exp_strategy-1][s->nb_coefs[ch]];
718
+            nb_groups = exponent_group_tab[cpl][exp_strategy-1][block->end_freq[ch]-s->start_freq[ch]];
718 719
             bit_count += 4 + (nb_groups * 7);
719
-            p = block->exp[ch];
720
+            p = block->exp[ch] + s->start_freq[ch] - cpl;
720 721
 
721 722
             /* DC exponent */
722 723
             exp1 = *p++;
... ...
@@ -783,9 +1162,7 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
783 783
 
784 784
     /* assumptions:
785 785
      *   no dynamic range codes
786
-     *   no channel coupling
787 786
      *   bit allocation parameters do not change between blocks
788
-     *   SNR offsets do not change between blocks
789 787
      *   no delta bit allocation
790 788
      *   no skipped data
791 789
     *   no auxiliary data
... ...
@@ -806,11 +1183,6 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
806 806
         /* dynamic range */
807 807
         frame_bits++;
808 808
 
809
-        /* coupling strategy */
810
-        frame_bits++;
811
-        if (!blk)
812
-            frame_bits++;
813
-
814 809
         /* exponent strategy */
815 810
         frame_bits += 2 * s->fbw_channels;
816 811
         if (s->lfe_on)
... ...
@@ -821,11 +1193,6 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
821 821
         if (!blk)
822 822
             frame_bits += 2 + 2 + 2 + 2 + 3;
823 823
 
824
-        /* snr offsets and fast gain codes */
825
-        frame_bits++;
826
-        if (!blk)
827
-            frame_bits += 6 + s->channels * (4 + 3);
828
-
829 824
         /* delta bit allocation */
830 825
         frame_bits++;
831 826
 
... ...
@@ -857,7 +1224,7 @@ static void bit_alloc_init(AC3EncodeContext *s)
857 857
     s->slow_gain_code  = 1;
858 858
     s->db_per_bit_code = 3;
859 859
     s->floor_code      = 7;
860
-    for (ch = 0; ch < s->channels; ch++)
860
+    for (ch = 0; ch <= s->channels; ch++)
861 861
         s->fast_gain_code[ch] = 4;
862 862
 
863 863
     /* initial snr offset */
... ...
@@ -871,6 +1238,8 @@ static void bit_alloc_init(AC3EncodeContext *s)
871 871
     s->bit_alloc.slow_gain  = ff_ac3_slow_gain_tab[s->slow_gain_code];
872 872
     s->bit_alloc.db_per_bit = ff_ac3_db_per_bit_tab[s->db_per_bit_code];
873 873
     s->bit_alloc.floor      = ff_ac3_floor_tab[s->floor_code];
874
+    s->bit_alloc.cpl_fast_leak = 0;
875
+    s->bit_alloc.cpl_slow_leak = 0;
874 876
 
875 877
     count_frame_bits_fixed(s);
876 878
 }
... ...
@@ -899,17 +1268,64 @@ static void count_frame_bits(AC3EncodeContext *s)
899 899
 
900 900
     /* audio blocks */
901 901
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
902
+        AC3Block *block = &s->blocks[blk];
903
+
904
+        /* coupling strategy */
905
+        frame_bits++;
906
+        if (block->new_cpl_strategy) {
907
+            frame_bits++;
908
+            if (block->cpl_in_use) {
909
+                frame_bits += s->fbw_channels;
910
+                if (s->channel_mode == AC3_CHMODE_STEREO)
911
+                    frame_bits++;
912
+                frame_bits += 4 + 4;
913
+                frame_bits += s->num_cpl_subbands - 1;
914
+            }
915
+        }
916
+
917
+        /* coupling coordinates */
918
+        if (block->cpl_in_use) {
919
+            for (ch = 1; ch <= s->fbw_channels; ch++) {
920
+                if (block->channel_in_cpl[ch]) {
921
+                    frame_bits++;
922
+                    if (block->new_cpl_coords) {
923
+                        frame_bits += 2;
924
+                        frame_bits += (4 + 4) * s->num_cpl_bands;
925
+                    }
926
+                }
927
+            }
928
+        }
929
+
902 930
         /* stereo rematrixing */
903 931
         if (s->channel_mode == AC3_CHMODE_STEREO) {
904 932
             frame_bits++;
905 933
             if (s->blocks[blk].new_rematrixing_strategy)
906
-                frame_bits += s->num_rematrixing_bands;
934
+                frame_bits += block->num_rematrixing_bands;
907 935
         }
908 936
 
909 937
         /* bandwidth codes & gain range */
910
-        for (ch = 0; ch < s->fbw_channels; ch++) {
911
-            if (s->exp_strategy[ch][blk] != EXP_REUSE)
912
-                frame_bits += 6 + 2;
938
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
939
+            if (s->exp_strategy[ch][blk] != EXP_REUSE) {
940
+                if (!block->channel_in_cpl[ch])
941
+                    frame_bits += 6;
942
+                frame_bits += 2;
943
+            }
944
+        }
945
+
946
+        /* coupling exponent strategy */
947
+        if (block->cpl_in_use)
948
+            frame_bits += 2;
949
+
950
+        /* snr offsets and fast gain codes */
951
+        frame_bits++;
952
+        if (block->new_snr_offsets)
953
+            frame_bits += 6 + (s->channels + block->cpl_in_use) * (4 + 3);
954
+
955
+        /* coupling leak info */
956
+        if (block->cpl_in_use) {
957
+            frame_bits++;
958
+            if (block->new_cpl_leak)
959
+                frame_bits += 3 + 3;
913 960
         }
914 961
     }
915 962
 
... ...
@@ -943,16 +1359,16 @@ static void bit_alloc_masking(AC3EncodeContext *s)
943 943
 
944 944
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
945 945
         AC3Block *block = &s->blocks[blk];
946
-        for (ch = 0; ch < s->channels; ch++) {
946
+        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
947 947
             /* We only need psd and mask for calculating bap.
948 948
                Since we currently do not calculate bap when exponent
949 949
                strategy is EXP_REUSE we do not need to calculate psd or mask. */
950 950
             if (s->exp_strategy[ch][blk] != EXP_REUSE) {
951
-                ff_ac3_bit_alloc_calc_psd(block->exp[ch], 0,
952
-                                          s->nb_coefs[ch],
953
-                                          block->psd[ch], block->band_psd[ch]);
951
+                ff_ac3_bit_alloc_calc_psd(block->exp[ch], s->start_freq[ch],
952
+                                          block->end_freq[ch], block->psd[ch],
953
+                                          block->band_psd[ch]);
954 954
                 ff_ac3_bit_alloc_calc_mask(&s->bit_alloc, block->band_psd[ch],
955
-                                           0, s->nb_coefs[ch],
955
+                                           s->start_freq[ch], block->end_freq[ch],
956 956
                                            ff_ac3_fast_gain_tab[s->fast_gain_code[ch]],
957 957
                                            ch == s->lfe_channel,
958 958
                                            DBA_NONE, 0, NULL, NULL, NULL,
... ...
@@ -970,11 +1386,12 @@ static void bit_alloc_masking(AC3EncodeContext *s)
970 970
 static void reset_block_bap(AC3EncodeContext *s)
971 971
 {
972 972
     int blk, ch;
973
+    int channels = s->channels + 1;
973 974
     if (s->blocks[0].bap[0] == s->bap_buffer)
974 975
         return;
975 976
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
976
-        for (ch = 0; ch < s->channels; ch++) {
977
-            s->blocks[blk].bap[ch] = &s->bap_buffer[AC3_MAX_COEFS * (blk * s->channels + ch)];
977
+        for (ch = 0; ch < channels; ch++) {
978
+            s->blocks[blk].bap[ch] = &s->bap_buffer[AC3_MAX_COEFS * (blk * channels + ch)];
978 979
         }
979 980
     }
980 981
 }
... ...
@@ -1000,28 +1417,37 @@ static int bit_alloc(AC3EncodeContext *s, int snr_offset)
1000 1000
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
1001 1001
         AC3Block *block = &s->blocks[blk];
1002 1002
         AC3Block *ref_block;
1003
+        int av_uninit(ch0);
1004
+        int got_cpl = !block->cpl_in_use;
1003 1005
         // initialize grouped mantissa counts. these are set so that they are
1004 1006
         // padded to the next whole group size when bits are counted in
1005 1007
         // compute_mantissa_size_final
1006 1008
         mant_cnt[0] = mant_cnt[3] = 0;
1007 1009
         mant_cnt[1] = mant_cnt[2] = 2;
1008 1010
         mant_cnt[4] = 1;
1009
-        for (ch = 0; ch < s->channels; ch++) {
1011
+        for (ch = 1; ch <= s->channels; ch++) {
1012
+            if (!got_cpl && ch > 1 && block->channel_in_cpl[ch-1]) {
1013
+                ch0     = ch - 1;
1014
+                ch      = CPL_CH;
1015
+                got_cpl = 1;
1016
+            }
1017
+
1010 1018
             /* Currently the only bit allocation parameters which vary across
1011 1019
                blocks within a frame are the exponent values.  We can take
1012 1020
                advantage of that by reusing the bit allocation pointers
1013 1021
                whenever we reuse exponents. */
1014 1022
             ref_block = block->exp_ref_block[ch];
1015 1023
             if (s->exp_strategy[ch][blk] != EXP_REUSE) {
1016
-                s->ac3dsp.bit_alloc_calc_bap(ref_block->mask[ch],
1017
-                                             ref_block->psd[ch], 0,
1018
-                                             s->nb_coefs[ch], snr_offset,
1019
-                                             s->bit_alloc.floor, ff_ac3_bap_tab,
1020
-                                             ref_block->bap[ch]);
1024
+                s->ac3dsp.bit_alloc_calc_bap(ref_block->mask[ch], ref_block->psd[ch],
1025
+                                             s->start_freq[ch], block->end_freq[ch],
1026
+                                             snr_offset, s->bit_alloc.floor,
1027
+                                             ff_ac3_bap_tab, ref_block->bap[ch]);
1021 1028
             }
1022 1029
             mantissa_bits += s->ac3dsp.compute_mantissa_size(mant_cnt,
1023
-                                                             ref_block->bap[ch],
1024
-                                                             s->nb_coefs[ch]);
1030
+                                                             ref_block->bap[ch]+s->start_freq[ch],
1031
+                                                             block->end_freq[ch]-s->start_freq[ch]);
1032
+            if (ch == CPL_CH)
1033
+                ch = ch0;
1025 1034
         }
1026 1035
         mantissa_bits += compute_mantissa_size_final(mant_cnt);
1027 1036
     }
... ...
@@ -1047,7 +1473,7 @@ static int cbr_bit_allocation(AC3EncodeContext *s)
1047 1047
 
1048 1048
     /* if previous frame SNR offset was 1023, check if current frame can also
1049 1049
        use SNR offset of 1023. if so, skip the search. */
1050
-    if ((snr_offset | s->fine_snr_offset[0]) == 1023) {
1050
+    if ((snr_offset | s->fine_snr_offset[1]) == 1023) {
1051 1051
         if (bit_alloc(s, 1023) <= bits_left)
1052 1052
             return 0;
1053 1053
     }
... ...
@@ -1071,7 +1497,7 @@ static int cbr_bit_allocation(AC3EncodeContext *s)
1071 1071
     reset_block_bap(s);
1072 1072
 
1073 1073
     s->coarse_snr_offset = snr_offset >> 4;
1074
-    for (ch = 0; ch < s->channels; ch++)
1074
+    for (ch = !s->cpl_on; ch <= s->channels; ch++)
1075 1075
         s->fine_snr_offset[ch] = snr_offset & 0xF;
1076 1076
 
1077 1077
     return 0;
... ...
@@ -1089,26 +1515,26 @@ static int downgrade_exponents(AC3EncodeContext *s)
1089 1089
 {
1090 1090
     int ch, blk;
1091 1091
 
1092
-    for (ch = 0; ch < s->fbw_channels; ch++) {
1093
-        for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
1092
+    for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
1093
+        for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++) {
1094 1094
             if (s->exp_strategy[ch][blk] == EXP_D15) {
1095 1095
                 s->exp_strategy[ch][blk] = EXP_D25;
1096 1096
                 return 0;
1097 1097
             }
1098 1098
         }
1099 1099
     }
1100
-    for (ch = 0; ch < s->fbw_channels; ch++) {
1101
-        for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
1100
+    for (blk = AC3_MAX_BLOCKS-1; blk >= 0; blk--) {
1101
+        for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++) {
1102 1102
             if (s->exp_strategy[ch][blk] == EXP_D25) {
1103 1103
                 s->exp_strategy[ch][blk] = EXP_D45;
1104 1104
                 return 0;
1105 1105
             }
1106 1106
         }
1107 1107
     }
1108
-    for (ch = 0; ch < s->fbw_channels; ch++) {
1109
-        /* block 0 cannot reuse exponents, so only downgrade D45 to REUSE if
1110
-           the block number > 0 */
1111
-        for (blk = AC3_MAX_BLOCKS-1; blk > 0; blk--) {
1108
+    /* block 0 cannot reuse exponents, so only downgrade D45 to REUSE if
1109
+       the block number > 0 */
1110
+    for (blk = AC3_MAX_BLOCKS-1; blk > 0; blk--) {
1111
+        for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++) {
1112 1112
             if (s->exp_strategy[ch][blk] > EXP_REUSE) {
1113 1113
                 s->exp_strategy[ch][blk] = EXP_REUSE;
1114 1114
                 return 0;
... ...
@@ -1135,7 +1561,18 @@ static int compute_bit_allocation(AC3EncodeContext *s)
1135 1135
 
1136 1136
     ret = cbr_bit_allocation(s);
1137 1137
     while (ret) {
1138
-        /* fallback 1: downgrade exponents */
1138
+        /* fallback 1: disable channel coupling */
1139
+        if (s->cpl_on) {
1140
+            s->cpl_on = 0;
1141
+            compute_coupling_strategy(s);
1142
+            compute_rematrixing_strategy(s);
1143
+            apply_rematrixing(s);
1144
+            process_exponents(s);
1145
+            ret = compute_bit_allocation(s);
1146
+            continue;
1147
+        }
1148
+
1149
+        /* fallback 2: downgrade exponents */
1139 1150
         if (!downgrade_exponents(s)) {
1140 1151
             extract_exponents(s);
1141 1152
             encode_exponents(s);
... ...
@@ -1189,12 +1626,13 @@ static inline int asym_quant(int c, int e, int qbits)
1189 1189
  * Quantize a set of mantissas for a single channel in a single block.
1190 1190
  */
1191 1191
 static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
1192
-                                      uint8_t *exp,
1193
-                                      uint8_t *bap, uint16_t *qmant, int n)
1192
+                                      uint8_t *exp, uint8_t *bap,
1193
+                                      uint16_t *qmant, int start_freq,
1194
+                                      int end_freq)
1194 1195
 {
1195 1196
     int i;
1196 1197
 
1197
-    for (i = 0; i < n; i++) {
1198
+    for (i = start_freq; i < end_freq; i++) {
1198 1199
         int v;
1199 1200
         int c = fixed_coef[i];
1200 1201
         int e = exp[i];
... ...
@@ -1284,19 +1722,27 @@ static void quantize_mantissas_blk_ch(AC3Mant *s, int32_t *fixed_coef,
1284 1284
  */
1285 1285
 static void quantize_mantissas(AC3EncodeContext *s)
1286 1286
 {
1287
-    int blk, ch;
1288
-
1287
+    int blk, ch, ch0=0, got_cpl;
1289 1288
 
1290 1289
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
1291 1290
         AC3Block *block = &s->blocks[blk];
1292 1291
         AC3Block *ref_block;
1293 1292
         AC3Mant m = { 0 };
1294 1293
 
1295
-        for (ch = 0; ch < s->channels; ch++) {
1294
+        got_cpl = !block->cpl_in_use;
1295
+        for (ch = 1; ch <= s->channels; ch++) {
1296
+            if (!got_cpl && ch > 1 && block->channel_in_cpl[ch-1]) {
1297
+                ch0     = ch - 1;
1298
+                ch      = CPL_CH;
1299
+                got_cpl = 1;
1300
+            }
1296 1301
             ref_block = block->exp_ref_block[ch];
1297 1302
             quantize_mantissas_blk_ch(&m, block->fixed_coef[ch],
1298
-                                      ref_block->exp[ch], ref_block->bap[ch],
1299
-                                      block->qmant[ch], s->nb_coefs[ch]);
1303
+                                      ref_block->exp[ch],
1304
+                                      ref_block->bap[ch], block->qmant[ch],
1305
+                                      s->start_freq[ch], block->end_freq[ch]);
1306
+            if (ch == CPL_CH)
1307
+                ch = ch0;
1300 1308
         }
1301 1309
     }
1302 1310
 }
... ...
@@ -1363,7 +1809,8 @@ static void output_frame_header(AC3EncodeContext *s)
1363 1363
  */
1364 1364
 static void output_audio_block(AC3EncodeContext *s, int blk)
1365 1365
 {
1366
-    int ch, i, baie, rbnd;
1366
+    int ch, i, baie, bnd, got_cpl;
1367
+    int av_uninit(ch0);
1367 1368
     AC3Block *block = &s->blocks[blk];
1368 1369
 
1369 1370
     /* block switching */
... ...
@@ -1378,11 +1825,38 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
1378 1378
     put_bits(&s->pb, 1, 0);
1379 1379
 
1380 1380
     /* channel coupling */
1381
-    if (!blk) {
1382
-        put_bits(&s->pb, 1, 1); /* coupling strategy present */
1383
-        put_bits(&s->pb, 1, 0); /* no coupling strategy */
1384
-    } else {
1385
-        put_bits(&s->pb, 1, 0); /* no new coupling strategy */
1381
+    put_bits(&s->pb, 1, block->new_cpl_strategy);
1382
+    if (block->new_cpl_strategy) {
1383
+        put_bits(&s->pb, 1, block->cpl_in_use);
1384
+        if (block->cpl_in_use) {
1385
+            int start_sub, end_sub;
1386
+            for (ch = 1; ch <= s->fbw_channels; ch++)
1387
+                put_bits(&s->pb, 1, block->channel_in_cpl[ch]);
1388
+            if (s->channel_mode == AC3_CHMODE_STEREO)
1389
+                put_bits(&s->pb, 1, 0); /* phase flags in use */
1390
+            start_sub = (s->start_freq[CPL_CH] - 37) / 12;
1391
+            end_sub   = (s->cpl_end_freq       - 37) / 12;
1392
+            put_bits(&s->pb, 4, start_sub);
1393
+            put_bits(&s->pb, 4, end_sub - 3);
1394
+            for (bnd = start_sub+1; bnd < end_sub; bnd++)
1395
+                put_bits(&s->pb, 1, ff_eac3_default_cpl_band_struct[bnd]);
1396
+        }
1397
+    }
1398
+
1399
+    /* coupling coordinates */
1400
+    if (block->cpl_in_use) {
1401
+        for (ch = 1; ch <= s->fbw_channels; ch++) {
1402
+            if (block->channel_in_cpl[ch]) {
1403
+                put_bits(&s->pb, 1, block->new_cpl_coords);
1404
+                if (block->new_cpl_coords) {
1405
+                    put_bits(&s->pb, 2, block->cpl_master_exp[ch]);
1406
+                    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
1407
+                        put_bits(&s->pb, 4, block->cpl_coord_exp [ch][bnd]);
1408
+                        put_bits(&s->pb, 4, block->cpl_coord_mant[ch][bnd]);
1409
+                    }
1410
+                }
1411
+            }
1412
+        }
1386 1413
     }
1387 1414
 
1388 1415
     /* stereo rematrixing */
... ...
@@ -1390,40 +1864,41 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
1390 1390
         put_bits(&s->pb, 1, block->new_rematrixing_strategy);
1391 1391
         if (block->new_rematrixing_strategy) {
1392 1392
             /* rematrixing flags */
1393
-            for (rbnd = 0; rbnd < s->num_rematrixing_bands; rbnd++)
1394
-                put_bits(&s->pb, 1, block->rematrixing_flags[rbnd]);
1393
+            for (bnd = 0; bnd < block->num_rematrixing_bands; bnd++)
1394
+                put_bits(&s->pb, 1, block->rematrixing_flags[bnd]);
1395 1395
         }
1396 1396
     }
1397 1397
 
1398 1398
     /* exponent strategy */
1399
-    for (ch = 0; ch < s->fbw_channels; ch++)
1399
+    for (ch = !block->cpl_in_use; ch <= s->fbw_channels; ch++)
1400 1400
         put_bits(&s->pb, 2, s->exp_strategy[ch][blk]);
1401 1401
     if (s->lfe_on)
1402 1402
         put_bits(&s->pb, 1, s->exp_strategy[s->lfe_channel][blk]);
1403 1403
 
1404 1404
     /* bandwidth */
1405
-    for (ch = 0; ch < s->fbw_channels; ch++) {
1406
-        if (s->exp_strategy[ch][blk] != EXP_REUSE)
1405
+    for (ch = 1; ch <= s->fbw_channels; ch++) {
1406
+        if (s->exp_strategy[ch][blk] != EXP_REUSE && !block->channel_in_cpl[ch])
1407 1407
             put_bits(&s->pb, 6, s->bandwidth_code);
1408 1408
     }
1409 1409
 
1410 1410
     /* exponents */
1411
-    for (ch = 0; ch < s->channels; ch++) {
1411
+    for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
1412 1412
         int nb_groups;
1413
+        int cpl = (ch == CPL_CH);
1413 1414
 
1414 1415
         if (s->exp_strategy[ch][blk] == EXP_REUSE)
1415 1416
             continue;
1416 1417
 
1417 1418
         /* DC exponent */
1418
-        put_bits(&s->pb, 4, block->grouped_exp[ch][0]);
1419
+        put_bits(&s->pb, 4, block->grouped_exp[ch][0] >> cpl);
1419 1420
 
1420 1421
         /* exponent groups */
1421
-        nb_groups = exponent_group_tab[s->exp_strategy[ch][blk]-1][s->nb_coefs[ch]];
1422
+        nb_groups = exponent_group_tab[cpl][s->exp_strategy[ch][blk]-1][block->end_freq[ch]-s->start_freq[ch]];
1422 1423
         for (i = 1; i <= nb_groups; i++)
1423 1424
             put_bits(&s->pb, 7, block->grouped_exp[ch][i]);
1424 1425
 
1425 1426
         /* gain range info */
1426
-        if (ch != s->lfe_channel)
1427
+        if (ch != s->lfe_channel && !cpl)
1427 1428
             put_bits(&s->pb, 2, 0);
1428 1429
     }
1429 1430
 
... ...
@@ -1439,23 +1914,40 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
1439 1439
     }
1440 1440
 
1441 1441
     /* snr offset */
1442
-    put_bits(&s->pb, 1, baie);
1443
-    if (baie) {
1442
+    put_bits(&s->pb, 1, block->new_snr_offsets);
1443
+    if (block->new_snr_offsets) {
1444 1444
         put_bits(&s->pb, 6, s->coarse_snr_offset);
1445
-        for (ch = 0; ch < s->channels; ch++) {
1445
+        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
1446 1446
             put_bits(&s->pb, 4, s->fine_snr_offset[ch]);
1447 1447
             put_bits(&s->pb, 3, s->fast_gain_code[ch]);
1448 1448
         }
1449 1449
     }
1450 1450
 
1451
+    /* coupling leak */
1452
+    if (block->cpl_in_use) {
1453
+        put_bits(&s->pb, 1, block->new_cpl_leak);
1454
+        if (block->new_cpl_leak) {
1455
+            put_bits(&s->pb, 3, s->bit_alloc.cpl_fast_leak);
1456
+            put_bits(&s->pb, 3, s->bit_alloc.cpl_slow_leak);
1457
+        }
1458
+    }
1459
+
1451 1460
     put_bits(&s->pb, 1, 0); /* no delta bit allocation */
1452 1461
     put_bits(&s->pb, 1, 0); /* no data to skip */
1453 1462
 
1454 1463
     /* mantissas */
1455
-    for (ch = 0; ch < s->channels; ch++) {
1464
+    got_cpl = !block->cpl_in_use;
1465
+    for (ch = 1; ch <= s->channels; ch++) {
1456 1466
         int b, q;
1457
-        AC3Block *ref_block = block->exp_ref_block[ch];
1458
-        for (i = 0; i < s->nb_coefs[ch]; i++) {
1467
+        AC3Block *ref_block;
1468
+
1469
+        if (!got_cpl && ch > 1 && block->channel_in_cpl[ch-1]) {
1470
+            ch0     = ch - 1;
1471
+            ch      = CPL_CH;
1472
+            got_cpl = 1;
1473
+        }
1474
+        ref_block = block->exp_ref_block[ch];
1475
+        for (i = s->start_freq[ch]; i < block->end_freq[ch]; i++) {
1459 1476
             q = block->qmant[ch][i];
1460 1477
             b = ref_block->bap[ch][i];
1461 1478
             switch (b) {
... ...
@@ -1469,6 +1961,8 @@ static void output_audio_block(AC3EncodeContext *s, int blk)
1469 1469
             default:              put_bits(&s->pb, b-1, q); break;
1470 1470
             }
1471 1471
         }
1472
+        if (ch == CPL_CH)
1473
+            ch = ch0;
1472 1474
     }
1473 1475
 }
1474 1476
 
... ...
@@ -1854,6 +2348,12 @@ static int ac3_encode_frame(AVCodecContext *avctx, unsigned char *frame,
1854 1854
 
1855 1855
     scale_coefficients(s);
1856 1856
 
1857
+    s->cpl_on = s->cpl_enabled;
1858
+    compute_coupling_strategy(s);
1859
+
1860
+    if (s->cpl_on)
1861
+        apply_channel_coupling(s);
1862
+
1857 1863
     compute_rematrixing_strategy(s);
1858 1864
 
1859 1865
     apply_rematrixing(s);
... ...
@@ -1934,7 +2434,7 @@ static av_cold int set_channel_info(AC3EncodeContext *s, int channels,
1934 1934
     s->lfe_on       = !!(ch_layout & AV_CH_LOW_FREQUENCY);
1935 1935
     s->channels     = channels;
1936 1936
     s->fbw_channels = channels - s->lfe_on;
1937
-    s->lfe_channel  = s->lfe_on ? s->fbw_channels : -1;
1937
+    s->lfe_channel  = s->lfe_on ? s->fbw_channels + 1 : -1;
1938 1938
     if (s->lfe_on)
1939 1939
         ch_layout -= AV_CH_LOW_FREQUENCY;
1940 1940
 
... ...
@@ -2033,6 +2533,10 @@ static av_cold int validate_options(AVCodecContext *avctx, AC3EncodeContext *s)
2033 2033
     s->rematrixing_enabled = s->options.stereo_rematrixing &&
2034 2034
                              (s->channel_mode == AC3_CHMODE_STEREO);
2035 2035
 
2036
+    s->cpl_enabled = s->options.channel_coupling &&
2037
+                     s->channel_mode >= AC3_CHMODE_STEREO &&
2038
+                     CONFIG_AC3ENC_FLOAT;
2039
+
2036 2040
     return 0;
2037 2041
 }
2038 2042
 
... ...
@@ -2044,7 +2548,8 @@ static av_cold int validate_options(AVCodecContext *avctx, AC3EncodeContext *s)
2044 2044
  */
2045 2045
 static av_cold void set_bandwidth(AC3EncodeContext *s)
2046 2046
 {
2047
-    int ch;
2047
+    int blk, ch;
2048
+    int av_uninit(cpl_start);
2048 2049
 
2049 2050
     if (s->cutoff) {
2050 2051
         /* calculate bandwidth based on user-specified cutoff frequency */
... ...
@@ -2057,11 +2562,54 @@ static av_cold void set_bandwidth(AC3EncodeContext *s)
2057 2057
     }
2058 2058
 
2059 2059
     /* set number of coefficients for each channel */
2060
-    for (ch = 0; ch < s->fbw_channels; ch++) {
2061
-        s->nb_coefs[ch] = s->bandwidth_code * 3 + 73;
2060
+    for (ch = 1; ch <= s->fbw_channels; ch++) {
2061
+        s->start_freq[ch] = 0;
2062
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
2063
+            s->blocks[blk].end_freq[ch] = s->bandwidth_code * 3 + 73;
2064
+    }
2065
+    /* LFE channel always has 7 coefs */
2066
+    if (s->lfe_on) {
2067
+        s->start_freq[s->lfe_channel] = 0;
2068
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
2069
+            s->blocks[blk].end_freq[ch] = 7;
2070
+    }
2071
+
2072
+    /* initialize coupling strategy */
2073
+    if (s->cpl_enabled) {
2074
+        if (s->options.cpl_start >= 0) {
2075
+            cpl_start = s->options.cpl_start;
2076
+        } else {
2077
+            cpl_start = ac3_coupling_start_tab[s->channel_mode-2][s->bit_alloc.sr_code][s->frame_size_code/2];
2078
+            if (cpl_start < 0)
2079
+                s->cpl_enabled = 0;
2080
+        }
2081
+    }
2082
+    if (s->cpl_enabled) {
2083
+        int i, cpl_start_band, cpl_end_band;
2084
+        uint8_t *cpl_band_sizes = s->cpl_band_sizes;
2085
+
2086
+        cpl_end_band   = s->bandwidth_code / 4 + 3;
2087
+        cpl_start_band = av_clip(cpl_start, 0, FFMIN(cpl_end_band-1, 15));
2088
+
2089
+        s->num_cpl_subbands = cpl_end_band - cpl_start_band;
2090
+
2091
+        s->num_cpl_bands = 1;
2092
+        *cpl_band_sizes  = 12;
2093
+        for (i = cpl_start_band + 1; i < cpl_end_band; i++) {
2094
+            if (ff_eac3_default_cpl_band_struct[i]) {
2095
+                *cpl_band_sizes += 12;
2096
+            } else {
2097
+                s->num_cpl_bands++;
2098
+                cpl_band_sizes++;
2099
+                *cpl_band_sizes = 12;
2100
+            }
2101
+        }
2102
+
2103
+        s->start_freq[CPL_CH] = cpl_start_band * 12 + 37;
2104
+        s->cpl_end_freq       = cpl_end_band   * 12 + 37;
2105
+        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
2106
+            s->blocks[blk].end_freq[CPL_CH] = s->cpl_end_freq;
2062 2107
     }
2063
-    if (s->lfe_on)
2064
-        s->nb_coefs[s->lfe_channel] = 7; /* LFE channel always has 7 coefs */
2065 2108
 }
2066 2109
 
2067 2110
 
... ...
@@ -2069,6 +2617,7 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
2069 2069
 {
2070 2070
     int blk, ch;
2071 2071
     AC3EncodeContext *s = avctx->priv_data;
2072
+    int channels = s->channels + 1; /* includes coupling channel */
2072 2073
 
2073 2074
     FF_ALLOC_OR_GOTO(avctx, s->planar_samples, s->channels * sizeof(*s->planar_samples),
2074 2075
                      alloc_fail);
... ...
@@ -2077,74 +2626,90 @@ static av_cold int allocate_buffers(AVCodecContext *avctx)
2077 2077
                           (AC3_FRAME_SIZE+AC3_BLOCK_SIZE) * sizeof(**s->planar_samples),
2078 2078
                           alloc_fail);
2079 2079
     }
2080
-    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer,  AC3_MAX_BLOCKS * s->channels *
2080
+    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer,  AC3_MAX_BLOCKS * channels *
2081 2081
                      AC3_MAX_COEFS * sizeof(*s->bap_buffer),  alloc_fail);
2082
-    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, AC3_MAX_BLOCKS * s->channels *
2082
+    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, AC3_MAX_BLOCKS * channels *
2083 2083
                      AC3_MAX_COEFS * sizeof(*s->bap1_buffer), alloc_fail);
2084
-    FF_ALLOC_OR_GOTO(avctx, s->mdct_coef_buffer, AC3_MAX_BLOCKS * s->channels *
2084
+    FF_ALLOC_OR_GOTO(avctx, s->mdct_coef_buffer, AC3_MAX_BLOCKS * channels *
2085 2085
                      AC3_MAX_COEFS * sizeof(*s->mdct_coef_buffer), alloc_fail);
2086
-    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, AC3_MAX_BLOCKS * s->channels *
2086
+    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, AC3_MAX_BLOCKS * channels *
2087 2087
                      AC3_MAX_COEFS * sizeof(*s->exp_buffer), alloc_fail);
2088
-    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, AC3_MAX_BLOCKS * s->channels *
2088
+    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, AC3_MAX_BLOCKS * channels *
2089 2089
                      128 * sizeof(*s->grouped_exp_buffer), alloc_fail);
2090
-    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, AC3_MAX_BLOCKS * s->channels *
2090
+    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, AC3_MAX_BLOCKS * channels *
2091 2091
                      AC3_MAX_COEFS * sizeof(*s->psd_buffer), alloc_fail);
2092
-    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, AC3_MAX_BLOCKS * s->channels *
2092
+    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, AC3_MAX_BLOCKS * channels *
2093 2093
                      64 * sizeof(*s->band_psd_buffer), alloc_fail);
2094
-    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, AC3_MAX_BLOCKS * s->channels *
2094
+    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, AC3_MAX_BLOCKS * channels *
2095 2095
                      64 * sizeof(*s->mask_buffer), alloc_fail);
2096
-    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, AC3_MAX_BLOCKS * s->channels *
2096
+    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, AC3_MAX_BLOCKS * channels *
2097 2097
                      AC3_MAX_COEFS * sizeof(*s->qmant_buffer), alloc_fail);
2098
+    if (s->cpl_enabled) {
2099
+        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, AC3_MAX_BLOCKS * channels *
2100
+                         16 * sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
2101
+        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, AC3_MAX_BLOCKS * channels *
2102
+                         16 * sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
2103
+    }
2098 2104
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
2099 2105
         AC3Block *block = &s->blocks[blk];
2100
-        FF_ALLOC_OR_GOTO(avctx, block->bap, s->channels * sizeof(*block->bap),
2106
+        FF_ALLOC_OR_GOTO(avctx, block->bap, channels * sizeof(*block->bap),
2101 2107
                          alloc_fail);
2102
-        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, s->channels * sizeof(*block->mdct_coef),
2108
+        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, channels * sizeof(*block->mdct_coef),
2103 2109
                           alloc_fail);
2104
-        FF_ALLOCZ_OR_GOTO(avctx, block->exp, s->channels * sizeof(*block->exp),
2110
+        FF_ALLOCZ_OR_GOTO(avctx, block->exp, channels * sizeof(*block->exp),
2105 2111
                           alloc_fail);
2106
-        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, s->channels * sizeof(*block->grouped_exp),
2112
+        FF_ALLOCZ_OR_GOTO(avctx, block->grouped_exp, channels * sizeof(*block->grouped_exp),
2107 2113
                           alloc_fail);
2108
-        FF_ALLOCZ_OR_GOTO(avctx, block->psd, s->channels * sizeof(*block->psd),
2114
+        FF_ALLOCZ_OR_GOTO(avctx, block->psd, channels * sizeof(*block->psd),
2109 2115
                           alloc_fail);
2110
-        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, s->channels * sizeof(*block->band_psd),
2116
+        FF_ALLOCZ_OR_GOTO(avctx, block->band_psd, channels * sizeof(*block->band_psd),
2111 2117
                           alloc_fail);
2112
-        FF_ALLOCZ_OR_GOTO(avctx, block->mask, s->channels * sizeof(*block->mask),
2118
+        FF_ALLOCZ_OR_GOTO(avctx, block->mask, channels * sizeof(*block->mask),
2113 2119
                           alloc_fail);
2114
-        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, s->channels * sizeof(*block->qmant),
2120
+        FF_ALLOCZ_OR_GOTO(avctx, block->qmant, channels * sizeof(*block->qmant),
2115 2121
                           alloc_fail);
2122
+        if (s->cpl_enabled) {
2123
+            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_exp, channels * sizeof(*block->cpl_coord_exp),
2124
+                              alloc_fail);
2125
+            FF_ALLOCZ_OR_GOTO(avctx, block->cpl_coord_mant, channels * sizeof(*block->cpl_coord_mant),
2126
+                              alloc_fail);
2127
+        }
2116 2128
 
2117
-        for (ch = 0; ch < s->channels; ch++) {
2129
+        for (ch = 0; ch < channels; ch++) {
2118 2130
             /* arrangement: block, channel, coeff */
2119
-            block->bap[ch]         = &s->bap_buffer        [AC3_MAX_COEFS * (blk * s->channels + ch)];
2120
-            block->mdct_coef[ch]   = &s->mdct_coef_buffer  [AC3_MAX_COEFS * (blk * s->channels + ch)];
2121
-            block->grouped_exp[ch] = &s->grouped_exp_buffer[128           * (blk * s->channels + ch)];
2122
-            block->psd[ch]         = &s->psd_buffer        [AC3_MAX_COEFS * (blk * s->channels + ch)];
2123
-            block->band_psd[ch]    = &s->band_psd_buffer   [64            * (blk * s->channels + ch)];
2124
-            block->mask[ch]        = &s->mask_buffer       [64            * (blk * s->channels + ch)];
2125
-            block->qmant[ch]       = &s->qmant_buffer      [AC3_MAX_COEFS * (blk * s->channels + ch)];
2131
+            block->bap[ch]         = &s->bap_buffer        [AC3_MAX_COEFS * (blk * channels + ch)];
2132
+            block->grouped_exp[ch] = &s->grouped_exp_buffer[128           * (blk * channels + ch)];
2133
+            block->psd[ch]         = &s->psd_buffer        [AC3_MAX_COEFS * (blk * channels + ch)];
2134
+            block->band_psd[ch]    = &s->band_psd_buffer   [64            * (blk * channels + ch)];
2135
+            block->mask[ch]        = &s->mask_buffer       [64            * (blk * channels + ch)];
2136
+            block->qmant[ch]       = &s->qmant_buffer      [AC3_MAX_COEFS * (blk * channels + ch)];
2137
+            if (s->cpl_enabled) {
2138
+                block->cpl_coord_exp[ch]  = &s->cpl_coord_exp_buffer [16  * (blk * channels + ch)];
2139
+                block->cpl_coord_mant[ch] = &s->cpl_coord_mant_buffer[16  * (blk * channels + ch)];
2140
+            }
2126 2141
 
2127 2142
             /* arrangement: channel, block, coeff */
2128 2143
             block->exp[ch]         = &s->exp_buffer        [AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
2144
+            block->mdct_coef[ch]   = &s->mdct_coef_buffer  [AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
2129 2145
         }
2130 2146
     }
2131 2147
 
2132 2148
     if (CONFIG_AC3ENC_FLOAT) {
2133
-        FF_ALLOC_OR_GOTO(avctx, s->fixed_coef_buffer, AC3_MAX_BLOCKS * s->channels *
2149
+        FF_ALLOC_OR_GOTO(avctx, s->fixed_coef_buffer, AC3_MAX_BLOCKS * channels *
2134 2150
                          AC3_MAX_COEFS * sizeof(*s->fixed_coef_buffer), alloc_fail);
2135 2151
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
2136 2152
             AC3Block *block = &s->blocks[blk];
2137
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, s->channels *
2153
+            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
2138 2154
                               sizeof(*block->fixed_coef), alloc_fail);
2139
-            for (ch = 0; ch < s->channels; ch++)
2140
-                block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (blk * s->channels + ch)];
2155
+            for (ch = 0; ch < channels; ch++)
2156
+                block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
2141 2157
         }
2142 2158
     } else {
2143 2159
         for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
2144 2160
             AC3Block *block = &s->blocks[blk];
2145
-            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, s->channels *
2161
+            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
2146 2162
                               sizeof(*block->fixed_coef), alloc_fail);
2147
-            for (ch = 0; ch < s->channels; ch++)
2163
+            for (ch = 0; ch < channels; ch++)
2148 2164
                 block->fixed_coef[ch] = (int32_t *)block->mdct_coef[ch];
2149 2165
         }
2150 2166
     }
... ...
@@ -101,7 +101,7 @@ static void scale_coefficients(AC3EncodeContext *s)
101 101
 
102 102
     for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
103 103
         AC3Block *block = &s->blocks[blk];
104
-        for (ch = 0; ch < s->channels; ch++) {
104
+        for (ch = 1; ch <= s->channels; ch++) {
105 105
             s->ac3dsp.ac3_rshift_int32(block->mdct_coef[ch], AC3_MAX_COEFS,
106 106
                                        block->coeff_shift[ch]);
107 107
         }
... ...
@@ -93,8 +93,10 @@ static int normalize_samples(AC3EncodeContext *s)
93 93
  */
94 94
 static void scale_coefficients(AC3EncodeContext *s)
95 95
 {
96
-    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
97
-                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
96
+    int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS;
97
+    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer + chan_size,
98
+                               s->mdct_coef_buffer  + chan_size,
99
+                               chan_size * s->channels);
98 100
 }
99 101
 
100 102
 
... ...
@@ -138,6 +138,13 @@ const uint16_t ff_ac3_bitrate_tab[19] = {
138 138
  */
139 139
 const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };
140 140
 
141
+/**
142
+ * Table E2.16 Default Coupling Banding Structure
143
+ */
144
+const uint8_t ff_eac3_default_cpl_band_struct[18] = {
145
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1
146
+};
147
+
141 148
 /* AC-3 MDCT window */
142 149
 
143 150
 /* MDCT window */
... ...
@@ -39,6 +39,7 @@ extern const uint8_t  ff_ac3_dec_channel_map[8][2][6];
39 39
 extern const uint16_t ff_ac3_sample_rate_tab[3];
40 40
 extern const uint16_t ff_ac3_bitrate_tab[19];
41 41
 extern const uint8_t  ff_ac3_rematrix_band_tab[5];
42
+extern const uint8_t  ff_eac3_default_cpl_band_struct[18];
42 43
 extern const int16_t  ff_ac3_window[AC3_WINDOW_SIZE/2];
43 44
 extern const uint8_t  ff_ac3_log_add_tab[260];
44 45
 extern const uint16_t ff_ac3_hearing_threshold_tab[AC3_CRITICAL_BANDS][3];
... ...
@@ -156,7 +156,7 @@ void avcodec_register_all(void)
156 156
     REGISTER_DECODER (MPEG1_VDPAU, mpeg1_vdpau);
157 157
     REGISTER_DECODER (MPEG2_CRYSTALHD, mpeg2_crystalhd);
158 158
     REGISTER_DECODER (MSMPEG4_CRYSTALHD, msmpeg4_crystalhd);
159
-    REGISTER_ENCDEC  (MSMPEG4V1, msmpeg4v1);
159
+    REGISTER_DECODER (MSMPEG4V1, msmpeg4v1);
160 160
     REGISTER_ENCDEC  (MSMPEG4V2, msmpeg4v2);
161 161
     REGISTER_ENCDEC  (MSMPEG4V3, msmpeg4v3);
162 162
     REGISTER_DECODER (MSRLE, msrle);
... ...
@@ -628,13 +628,6 @@ static inline int get_penalty_factor(int lambda, int lambda2, int type){
628 628
     }
629 629
 }
630 630
 
631
-/**
632
- * Empty mmx state.
633
- * this must be called between any dsp function and float/double code.
634
- * for example sin(); dsp->idct_put(); emms_c(); cos()
635
- */
636
-#define emms_c()
637
-
638 631
 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
639 632
 void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
640 633
 void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
... ...
@@ -652,22 +645,9 @@ void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
652 652
 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
653 653
 void ff_mlp_init_x86(DSPContext* c, AVCodecContext *avctx);
654 654
 
655
-#if HAVE_MMX
656
-
657
-#undef emms_c
658 655
 
659
-static inline void emms(void)
660
-{
661
-    __asm__ volatile ("emms;":::"memory");
662
-}
663
-
664
-#define emms_c() \
665
-{\
666
-    if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)\
667
-        emms();\
668
-}
656
+#if ARCH_ARM
669 657
 
670
-#elif ARCH_ARM
671 658
 
672 659
 #if HAVE_NEON
673 660
 #   define STRIDE_ALIGN 16
... ...
@@ -638,15 +638,6 @@ av_cold int MPV_encode_init(AVCodecContext *avctx)
638 638
         s->low_delay= s->max_b_frames ? 0 : 1;
639 639
         avctx->delay= s->low_delay ? 0 : (s->max_b_frames + 1);
640 640
         break;
641
-    case CODEC_ID_MSMPEG4V1:
642
-        s->out_format = FMT_H263;
643
-        s->h263_msmpeg4 = 1;
644
-        s->h263_pred = 1;
645
-        s->unrestricted_mv = 1;
646
-        s->msmpeg4_version= 1;
647
-        avctx->delay=0;
648
-        s->low_delay=1;
649
-        break;
650 641
     case CODEC_ID_MSMPEG4V2:
651 642
         s->out_format = FMT_H263;
652 643
         s->h263_msmpeg4 = 1;
... ...
@@ -3807,18 +3798,6 @@ AVCodec ff_h263p_encoder = {
3807 3807
     .long_name= NULL_IF_CONFIG_SMALL("H.263+ / H.263-1998 / H.263 version 2"),
3808 3808
 };
3809 3809
 
3810
-AVCodec ff_msmpeg4v1_encoder = {
3811
-    "msmpeg4v1",
3812
-    AVMEDIA_TYPE_VIDEO,
3813
-    CODEC_ID_MSMPEG4V1,
3814
-    sizeof(MpegEncContext),
3815
-    MPV_encode_init,
3816
-    MPV_encode_picture,
3817
-    MPV_encode_end,
3818
-    .pix_fmts= (const enum PixelFormat[]){PIX_FMT_YUV420P, PIX_FMT_NONE},
3819
-    .long_name= NULL_IF_CONFIG_SMALL("MPEG-4 part 2 Microsoft variant version 1"),
3820
-};
3821
-
3822 3810
 AVCodec ff_msmpeg4v2_encoder = {
3823 3811
     "msmpeg4v2",
3824 3812
     AVMEDIA_TYPE_VIDEO,
... ...
@@ -846,22 +846,14 @@ static void msmpeg4_encode_dc(MpegEncContext * s, int level, int n, int *dir_ptr
846 846
     int pred, extquant;
847 847
     int extrabits = 0;
848 848
 
849
-    if(s->msmpeg4_version==1){
850
-        int32_t *dc_val;
851
-        pred = msmpeg4v1_pred_dc(s, n, &dc_val);
852
-
853
-        /* update predictor */
854
-        *dc_val= level;
855
-    }else{
856
-        int16_t *dc_val;
857
-        pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
849
+    int16_t *dc_val;
850
+    pred = msmpeg4_pred_dc(s, n, &dc_val, dir_ptr);
858 851
 
859
-        /* update predictor */
860
-        if (n < 4) {
861
-            *dc_val = level * s->y_dc_scale;
862
-        } else {
863
-            *dc_val = level * s->c_dc_scale;
864
-        }
852
+    /* update predictor */
853
+    if (n < 4) {
854
+        *dc_val = level * s->y_dc_scale;
855
+    } else {
856
+        *dc_val = level * s->c_dc_scale;
865 857
     }
866 858
 
867 859
     /* do the prediction */
... ...
@@ -54,8 +54,7 @@ int ff_wmv2_decode_mb(MpegEncContext *s, DCTELEM block[6][64]);
54 54
                                 CONFIG_MSMPEG4V3_DECODER || \
55 55
                                 CONFIG_WMV2_DECODER      || \
56 56
                                 CONFIG_VC1_DECODER)
57
-#define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V1_ENCODER || \
58
-                                CONFIG_MSMPEG4V2_ENCODER || \
57
+#define CONFIG_MSMPEG4_ENCODER (CONFIG_MSMPEG4V2_ENCODER || \
59 58
                                 CONFIG_MSMPEG4V3_ENCODER || \
60 59
                                 CONFIG_WMV2_ENCODER)
61 60
 
... ...
@@ -39,6 +39,8 @@ typedef struct TiffContext {
39 39
 
40 40
     int width, height;
41 41
     unsigned int bpp, bppcount;
42
+    uint32_t palette[256];
43
+    int palette_is_set;
42 44
     int le;
43 45
     enum TiffCompr compr;
44 46
     int invert;
... ...
@@ -255,11 +257,15 @@ static int init_image(TiffContext *s)
255 255
         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
256 256
         return ret;
257 257
     }
258
-    if (s->bpp == 8 && s->picture.data[1]){
259
-        /* make default grayscale pal */
260
-        pal = (uint32_t *) s->picture.data[1];
261
-        for (i = 0; i < 256; i++)
262
-            pal[i] = i * 0x010101;
258
+    if (s->avctx->pix_fmt == PIX_FMT_PAL8) {
259
+        if (s->palette_is_set) {
260
+            memcpy(s->picture.data[1], s->palette, sizeof(s->palette));
261
+        } else {
262
+            /* make default grayscale pal */
263
+            pal = (uint32_t *) s->picture.data[1];
264
+            for (i = 0; i < 256; i++)
265
+                pal[i] = i * 0x010101;
266
+        }
263 267
     }
264 268
     return 0;
265 269
 }
... ...
@@ -442,11 +448,7 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
442 442
         s->fill_order = value - 1;
443 443
         break;
444 444
     case TIFF_PAL:
445
-        if(s->avctx->pix_fmt != PIX_FMT_PAL8){
446
-            av_log(s->avctx, AV_LOG_ERROR, "Palette met but this is not palettized format\n");
447
-            return -1;
448
-        }
449
-        pal = (uint32_t *) s->picture.data[1];
445
+        pal = (uint32_t *) s->palette;
450 446
         off = type_sizes[type];
451 447
         rp = buf;
452 448
         gp = buf + count / 3 * off;
... ...
@@ -459,6 +461,7 @@ static int tiff_decode_tag(TiffContext *s, const uint8_t *start, const uint8_t *
459 459
             j |= tget(&bp, type, s->le) >> off;
460 460
             pal[i] = j;
461 461
         }
462
+        s->palette_is_set = 1;
462 463
         break;
463 464
     case TIFF_PLANAR:
464 465
         if(value == 2){
... ...
@@ -287,13 +287,8 @@ zrmjpeg
287 287
 CpuCaps gCpuCaps; //FIXME initialize this so optims work
288 288
 
289 289
 
290
-//exact copy from vf_scale.c
291 290
 int get_sws_cpuflags(void){
292
-    return
293
-          (gCpuCaps.hasMMX   ? SWS_CPU_CAPS_MMX   : 0)
294
-        | (gCpuCaps.hasMMX2  ? SWS_CPU_CAPS_MMX2  : 0)
295
-        | (gCpuCaps.has3DNow ? SWS_CPU_CAPS_3DNOW : 0)
296
-        | (gCpuCaps.hasAltiVec ? SWS_CPU_CAPS_ALTIVEC : 0);
291
+    return 0;
297 292
 }
298 293
 
299 294
 static void sws_getFlagsAndFilterFromCmdLine(int *flags, SwsFilter **srcFilterParam, SwsFilter **dstFilterParam)
... ...
@@ -348,7 +343,7 @@ struct SwsContext *sws_getContextFromCmdLine(int srcW, int srcH, int srcFormat,
348 348
         if (srcFormat == IMGFMT_RGB8 || srcFormat == IMGFMT_BGR8) sfmt = PIX_FMT_PAL8;
349 349
         sws_getFlagsAndFilterFromCmdLine(&flags, &srcFilterParam, &dstFilterParam);
350 350
 
351
-        return sws_getContext(srcW, srcH, sfmt, dstW, dstH, dfmt, flags | get_sws_cpuflags(), srcFilterParam, dstFilterParam, NULL);
351
+        return sws_getContext(srcW, srcH, sfmt, dstW, dstH, dfmt, flags , srcFilterParam, dstFilterParam, NULL);
352 352
 }
353 353
 
354 354
 typedef struct {
... ...
@@ -27,6 +27,7 @@
27 27
  * memory buffer source API for video
28 28
  */
29 29
 
30
+
30 31
 #include "avfilter.h"
31 32
 
32 33
 /**
... ...
@@ -276,7 +276,7 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
276 276
     ape->frames[0].nblocks = ape->blocksperframe;
277 277
     ape->frames[0].skip    = 0;
278 278
     for (i = 1; i < ape->totalframes; i++) {
279
-        ape->frames[i].pos      = ape->seektable[i] + ape->junklength; //ape->frames[i-1].pos + ape->blocksperframe;
279
+        ape->frames[i].pos      = ape->seektable[i] + ape->junklength;
280 280
         ape->frames[i].nblocks  = ape->blocksperframe;
281 281
         ape->frames[i - 1].size = ape->frames[i].pos - ape->frames[i - 1].pos;
282 282
         ape->frames[i].skip     = (ape->frames[i].pos - ape->frames[0].pos) & 3;
... ...
@@ -37,6 +37,7 @@
37 37
 #include "config.h"
38 38
 #include "attributes.h"
39 39
 #include "timer.h"
40
+#include "cpu.h"
40 41
 
41 42
 #ifndef attribute_align_arg
42 43
 #if ARCH_X86_32 && AV_GCC_VERSION_AT_LEAST(4,2)
... ...
@@ -222,4 +223,19 @@
222 222
 #   define ONLY_IF_THREADS_ENABLED(x) NULL
223 223
 #endif
224 224
 
225
+#if HAVE_MMX
226
+/**
227
+ * Empty mmx state.
228
+ * this must be called between any dsp function and float/double code.
229
+ * for example sin(); dsp->idct_put(); emms_c(); cos()
230
+ */
231
+static av_always_inline void emms_c(void)
232
+{
233
+    if(av_get_cpu_flags() & AV_CPU_FLAG_MMX)
234
+        __asm__ volatile ("emms" ::: "memory");
235
+}
236
+#else /* HAVE_MMX */
237
+#define emms_c()
238
+#endif /* HAVE_MMX */
239
+
225 240
 #endif /* AVUTIL_INTERNAL_H */
... ...
@@ -79,15 +79,13 @@ static int yuyvtoyv12_unscaled(SwsContext *c, uint8_t* src[], int srcStride[], i
79 79
 void ff_bfin_get_unscaled_swscale(SwsContext *c)
80 80
 {
81 81
     SwsFunc swScale = c->swScale;
82
-    if (c->flags & SWS_CPU_CAPS_BFIN)
83
-        if (c->dstFormat == PIX_FMT_YUV420P)
84
-            if (c->srcFormat == PIX_FMT_UYVY422) {
85
-                av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
86
-                c->swScale = uyvytoyv12_unscaled;
87
-            }
88
-        if (c->dstFormat == PIX_FMT_YUV420P)
89
-            if (c->srcFormat == PIX_FMT_YUYV422) {
90
-                av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
91
-                c->swScale = yuyvtoyv12_unscaled;
92
-            }
82
+
83
+    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_UYVY422) {
84
+        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized uyvytoyv12_unscaled\n");
85
+        c->swScale = uyvytoyv12_unscaled;
86
+    }
87
+    if (c->dstFormat == PIX_FMT_YUV420P && c->srcFormat == PIX_FMT_YUYV422) {
88
+        av_log (NULL, AV_LOG_VERBOSE, "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
89
+        c->swScale = yuyvtoyv12_unscaled;
90
+    }
93 91
 }
... ...
@@ -33,31 +33,6 @@
33 33
 
34 34
 #define FUNC(s,d,n) {s,d,#n,n}
35 35
 
36
-static int cpu_caps;
37
-
38
-static char *args_parse(int argc, char *argv[])
39
-{
40
-    int o;
41
-
42
-    while ((o = getopt(argc, argv, "m23")) != -1) {
43
-        switch (o) {
44
-        case 'm':
45
-            cpu_caps |= SWS_CPU_CAPS_MMX;
46
-            break;
47
-        case '2':
48
-            cpu_caps |= SWS_CPU_CAPS_MMX2;
49
-            break;
50
-        case '3':
51
-            cpu_caps |= SWS_CPU_CAPS_3DNOW;
52
-            break;
53
-        default:
54
-            av_log(NULL, AV_LOG_ERROR, "Unknown option %c\n", o);
55
-        }
56
-    }
57
-
58
-    return argv[optind];
59
-}
60
-
61 36
 int main(int argc, char **argv)
62 37
 {
63 38
     int i, funcNum;
... ...
@@ -70,9 +45,7 @@ int main(int argc, char **argv)
70 70
         return -1;
71 71
 
72 72
     av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
73
-    args_parse(argc, argv);
74
-    av_log(NULL, AV_LOG_INFO, "CPU capabilities forced to %x\n", cpu_caps);
75
-    sws_rgb2rgb_init(cpu_caps);
73
+    sws_rgb2rgb_init();
76 74
 
77 75
     for(funcNum=0; ; funcNum++) {
78 76
         struct func_info_s {
... ...
@@ -48,12 +48,6 @@ static const AVOption options[] = {
48 48
     { "spline", "natural bicubic spline", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
49 49
     { "print_info", "print info", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
50 50
     { "accurate_rnd", "accurate rounding", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
51
-    { "mmx", "MMX SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_MMX }, INT_MIN, INT_MAX, VE, "sws_flags" },
52
-    { "mmx2", "MMX2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_MMX2 }, INT_MIN, INT_MAX, VE, "sws_flags" },
53
-    { "sse2", "SSE2 SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_SSE2 }, INT_MIN, INT_MAX, VE, "sws_flags" },
54
-    { "3dnow", "3DNOW SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_3DNOW }, INT_MIN, INT_MAX, VE, "sws_flags" },
55
-    { "altivec", "AltiVec SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_ALTIVEC }, INT_MIN, INT_MAX, VE, "sws_flags" },
56
-    { "bfin", "Blackfin SIMD acceleration", 0, FF_OPT_TYPE_CONST, {.dbl = SWS_CPU_CAPS_BFIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
57 51
     { "full_chroma_int", "full chroma interpolation", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
58 52
     { "full_chroma_inp", "full chroma input", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
59 53
     { "bitexact", "", 0 , FF_OPT_TYPE_CONST, {.dbl = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
... ...
@@ -23,69 +23,16 @@
23 23
 #include "swscale_altivec_template.c"
24 24
 #endif
25 25
 
26
+#if COMPILE_TEMPLATE_ALTIVEC
26 27
 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
27 28
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
28 29
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
29 30
 {
30
-#if COMPILE_TEMPLATE_ALTIVEC
31 31
     yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
32 32
                           chrFilter, chrSrc, chrFilterSize,
33 33
                           dest, uDest, vDest, dstW, chrDstW);
34
-#else //COMPILE_TEMPLATE_ALTIVEC
35
-    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
36
-                chrFilter, chrSrc, chrFilterSize,
37
-                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
38
-#endif //!COMPILE_TEMPLATE_ALTIVEC
39
-}
40
-
41
-static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
42
-                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
43
-                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
44
-{
45
-    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
46
-                 chrFilter, chrSrc, chrFilterSize,
47
-                 dest, uDest, dstW, chrDstW, dstFormat);
48 34
 }
49 35
 
50
-static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
51
-                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
52
-{
53
-    int i;
54
-    for (i=0; i<dstW; i++) {
55
-        int val= (lumSrc[i]+64)>>7;
56
-
57
-        if (val&256) {
58
-            if (val<0) val=0;
59
-            else       val=255;
60
-        }
61
-
62
-        dest[i]= val;
63
-    }
64
-
65
-    if (uDest)
66
-        for (i=0; i<chrDstW; i++) {
67
-            int u=(chrSrc[i       ]+64)>>7;
68
-            int v=(chrSrc[i + VOFW]+64)>>7;
69
-
70
-            if ((u|v)&256) {
71
-                if (u<0)        u=0;
72
-                else if (u>255) u=255;
73
-                if (v<0)        v=0;
74
-                else if (v>255) v=255;
75
-            }
76
-
77
-            uDest[i]= u;
78
-            vDest[i]= v;
79
-        }
80
-
81
-    if (CONFIG_SWSCALE_ALPHA && aDest)
82
-        for (i=0; i<dstW; i++) {
83
-            int val= (alpSrc[i]+64)>>7;
84
-            aDest[i]= av_clip_uint8(val);
85
-        }
86
-}
87
-
88
-
89 36
 /**
90 37
  * vertical scale YV12 to RGB
91 38
  */
... ...
@@ -93,7 +40,6 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
93 93
                                        const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
94 94
                                        const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
95 95
 {
96
-#if COMPILE_TEMPLATE_ALTIVEC
97 96
     /* The following list of supported dstFormat values should
98 97
        match what's found in the body of ff_yuv2packedX_altivec() */
99 98
     if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
... ...
@@ -104,815 +50,17 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
104 104
                                    chrFilter, chrSrc, chrFilterSize,
105 105
                                    dest, dstW, dstY);
106 106
     else
107
-#endif
108 107
         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
109 108
                        chrFilter, chrSrc, chrFilterSize,
110 109
                        alpSrc, dest, dstW, dstY);
111 110
 }
111
+#endif
112 112
 
113
-/**
114
- * vertical bilinear scale YV12 to RGB
115
- */
116
-static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
117
-                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
118
-{
119
-    int  yalpha1=4095- yalpha;
120
-    int uvalpha1=4095-uvalpha;
121
-    int i;
122
-
123
-    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
124
-}
125
-
126
-/**
127
- * YV12 to RGB without scaling or interpolating
128
- */
129
-static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
130
-                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
131
-{
132
-    const int yalpha1=0;
133
-    int i;
134
-
135
-    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
136
-    const int yalpha= 4096; //FIXME ...
137
-
138
-    if (flags&SWS_FULL_CHR_H_INT) {
139
-        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
140
-        return;
141
-    }
142
-
143
-    if (uvalpha < 2048) {
144
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
145
-    } else {
146
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
147
-    }
148
-}
149
-
150
-//FIXME yuy2* can read up to 7 samples too much
151
-
152
-static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
153
-{
154
-    int i;
155
-    for (i=0; i<width; i++)
156
-        dst[i]= src[2*i];
157
-}
158
-
159
-static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
160
-{
161
-    int i;
162
-    for (i=0; i<width; i++) {
163
-        dstU[i]= src1[4*i + 1];
164
-        dstV[i]= src1[4*i + 3];
165
-    }
166
-    assert(src1 == src2);
167
-}
168
-
169
-static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
170
-{
171
-    int i;
172
-    for (i=0; i<width; i++) {
173
-        dstU[i]= src1[2*i + 1];
174
-        dstV[i]= src2[2*i + 1];
175
-    }
176
-}
177
-
178
-/* This is almost identical to the previous, end exists only because
179
- * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
180
-static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
181
-{
182
-    int i;
183
-    for (i=0; i<width; i++)
184
-        dst[i]= src[2*i+1];
185
-}
186
-
187
-static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
188
-{
189
-    int i;
190
-    for (i=0; i<width; i++) {
191
-        dstU[i]= src1[4*i + 0];
192
-        dstV[i]= src1[4*i + 2];
193
-    }
194
-    assert(src1 == src2);
195
-}
196
-
197
-static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
198
-{
199
-    int i;
200
-    for (i=0; i<width; i++) {
201
-        dstU[i]= src1[2*i];
202
-        dstV[i]= src2[2*i];
203
-    }
204
-}
205
-
206
-static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
207
-                                    const uint8_t *src, long width)
208
-{
209
-    int i;
210
-    for (i = 0; i < width; i++) {
211
-        dst1[i] = src[2*i+0];
212
-        dst2[i] = src[2*i+1];
213
-    }
214
-}
215
-
216
-static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
217
-                                    const uint8_t *src1, const uint8_t *src2,
218
-                                    long width, uint32_t *unused)
219
-{
220
-    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
221
-}
222
-
223
-static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
224
-                                    const uint8_t *src1, const uint8_t *src2,
225
-                                    long width, uint32_t *unused)
226
-{
227
-    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
228
-}
229
-
230
-
231
-static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
232
-{
233
-    int i;
234
-    for (i=0; i<width; i++) {
235
-        int b= src[i*3+0];
236
-        int g= src[i*3+1];
237
-        int r= src[i*3+2];
238
-
239
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
240
-    }
241
-}
242
-
243
-static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
244
-{
245
-    int i;
246
-    for (i=0; i<width; i++) {
247
-        int b= src1[3*i + 0];
248
-        int g= src1[3*i + 1];
249
-        int r= src1[3*i + 2];
250
-
251
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
252
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
253
-    }
254
-    assert(src1 == src2);
255
-}
256
-
257
-static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
258
-{
259
-    int i;
260
-    for (i=0; i<width; i++) {
261
-        int b= src1[6*i + 0] + src1[6*i + 3];
262
-        int g= src1[6*i + 1] + src1[6*i + 4];
263
-        int r= src1[6*i + 2] + src1[6*i + 5];
264
-
265
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
266
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
267
-    }
268
-    assert(src1 == src2);
269
-}
270
-
271
-static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
272
-{
273
-    int i;
274
-    for (i=0; i<width; i++) {
275
-        int r= src[i*3+0];
276
-        int g= src[i*3+1];
277
-        int b= src[i*3+2];
278
-
279
-        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
280
-    }
281
-}
282
-
283
-static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
284
-{
285
-    int i;
286
-    assert(src1==src2);
287
-    for (i=0; i<width; i++) {
288
-        int r= src1[3*i + 0];
289
-        int g= src1[3*i + 1];
290
-        int b= src1[3*i + 2];
291
-
292
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
293
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
294
-    }
295
-}
296
-
297
-static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
298
-{
299
-    int i;
300
-    assert(src1==src2);
301
-    for (i=0; i<width; i++) {
302
-        int r= src1[6*i + 0] + src1[6*i + 3];
303
-        int g= src1[6*i + 1] + src1[6*i + 4];
304
-        int b= src1[6*i + 2] + src1[6*i + 5];
305
-
306
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
307
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
308
-    }
309
-}
310
-
311
-
312
-// bilinear / bicubic scaling
313
-static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
314
-                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
315
-{
316
-#if COMPILE_TEMPLATE_ALTIVEC
317
-    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
318
-#else
319
-    int i;
320
-    for (i=0; i<dstW; i++) {
321
-        int j;
322
-        int srcPos= filterPos[i];
323
-        int val=0;
324
-        //printf("filterPos: %d\n", filterPos[i]);
325
-        for (j=0; j<filterSize; j++) {
326
-            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
327
-            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
328
-        }
329
-        //filter += hFilterSize;
330
-        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
331
-        //dst[i] = val>>7;
332
-    }
333
-#endif /* COMPILE_TEMPLATE_ALTIVEC */
334
-}
335
-
336
-//FIXME all pal and rgb srcFormats could do this convertion as well
337
-//FIXME all scalers more complex than bilinear could do half of this transform
338
-static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
339
-{
340
-    int i;
341
-    for (i = 0; i < width; i++) {
342
-        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
343
-        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
344
-    }
345
-}
346
-static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
347
-{
348
-    int i;
349
-    for (i = 0; i < width; i++) {
350
-        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
351
-        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
352
-    }
353
-}
354
-static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
355
-{
356
-    int i;
357
-    for (i = 0; i < width; i++)
358
-        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
359
-}
360
-static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
361
-{
362
-    int i;
363
-    for (i = 0; i < width; i++)
364
-        dst[i] = (dst[i]*14071 + 33561947)>>14;
365
-}
366
-
367
-static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
368
-                                        long dstWidth, const uint8_t *src, int srcW,
369
-                                        int xInc)
370
-{
371
-    int i;
372
-    unsigned int xpos=0;
373
-    for (i=0;i<dstWidth;i++) {
374
-        register unsigned int xx=xpos>>16;
375
-        register unsigned int xalpha=(xpos&0xFFFF)>>9;
376
-        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
377
-        xpos+=xInc;
378
-    }
379
-}
380
-
381
-      // *** horizontal scale Y line to temp buffer
382
-static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
383
-                                   const int16_t *hLumFilter,
384
-                                   const int16_t *hLumFilterPos, int hLumFilterSize,
385
-                                   uint8_t *formatConvBuffer,
386
-                                   uint32_t *pal, int isAlpha)
387
-{
388
-    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
389
-    void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
390
-
391
-    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
392
-
393
-    if (toYV12) {
394
-        toYV12(formatConvBuffer, src, srcW, pal);
395
-        src= formatConvBuffer;
396
-    }
397
-
398
-    if (c->hScale16) {
399
-        c->hScale16(dst, dstWidth, (uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
400
-    } else if (!c->hyscale_fast) {
401
-        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
402
-    } else { // fast bilinear upscale / crap downscale
403
-        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
404
-    }
405
-
406
-    if (convertRange)
407
-        convertRange(dst, dstWidth);
408
-}
409
-
410
-static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
411
-                                        long dstWidth, const uint8_t *src1,
412
-                                        const uint8_t *src2, int srcW, int xInc)
413
-{
414
-    int i;
415
-    unsigned int xpos=0;
416
-    for (i=0;i<dstWidth;i++) {
417
-        register unsigned int xx=xpos>>16;
418
-        register unsigned int xalpha=(xpos&0xFFFF)>>9;
419
-        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
420
-        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
421
-        /* slower
422
-        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
423
-        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
424
-        */
425
-        xpos+=xInc;
426
-    }
427
-}
428
-
429
-inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
430
-                                   int srcW, int xInc, const int16_t *hChrFilter,
431
-                                   const int16_t *hChrFilterPos, int hChrFilterSize,
432
-                                   uint8_t *formatConvBuffer,
433
-                                   uint32_t *pal)
434
-{
435
-
436
-    src1 += c->chrSrcOffset;
437
-    src2 += c->chrSrcOffset;
438
-
439
-    if (c->chrToYV12) {
440
-        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
441
-        src1= formatConvBuffer;
442
-        src2= formatConvBuffer+VOFW;
443
-    }
444
-
445
-    if (c->hScale16) {
446
-        c->hScale16(dst     , dstWidth, (uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
447
-        c->hScale16(dst+VOFW, dstWidth, (uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
448
-    } else if (!c->hcscale_fast) {
449
-        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
450
-        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
451
-    } else { // fast bilinear upscale / crap downscale
452
-        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
453
-    }
454
-
455
-    if (c->chrConvertRange)
456
-        c->chrConvertRange(dst, dstWidth);
457
-}
458
-
459
-#define DEBUG_SWSCALE_BUFFERS 0
460
-#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
461
-
462
-static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
463
-                           int srcSliceH, uint8_t* dst[], int dstStride[])
464
-{
465
-    /* load a few things into local vars to make the code more readable? and faster */
466
-    const int srcW= c->srcW;
467
-    const int dstW= c->dstW;
468
-    const int dstH= c->dstH;
469
-    const int chrDstW= c->chrDstW;
470
-    const int chrSrcW= c->chrSrcW;
471
-    const int lumXInc= c->lumXInc;
472
-    const int chrXInc= c->chrXInc;
473
-    const enum PixelFormat dstFormat= c->dstFormat;
474
-    const int flags= c->flags;
475
-    int16_t *vLumFilterPos= c->vLumFilterPos;
476
-    int16_t *vChrFilterPos= c->vChrFilterPos;
477
-    int16_t *hLumFilterPos= c->hLumFilterPos;
478
-    int16_t *hChrFilterPos= c->hChrFilterPos;
479
-    int16_t *vLumFilter= c->vLumFilter;
480
-    int16_t *vChrFilter= c->vChrFilter;
481
-    int16_t *hLumFilter= c->hLumFilter;
482
-    int16_t *hChrFilter= c->hChrFilter;
483
-    int32_t *lumMmxFilter= c->lumMmxFilter;
484
-    int32_t *chrMmxFilter= c->chrMmxFilter;
485
-    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
486
-    const int vLumFilterSize= c->vLumFilterSize;
487
-    const int vChrFilterSize= c->vChrFilterSize;
488
-    const int hLumFilterSize= c->hLumFilterSize;
489
-    const int hChrFilterSize= c->hChrFilterSize;
490
-    int16_t **lumPixBuf= c->lumPixBuf;
491
-    int16_t **chrPixBuf= c->chrPixBuf;
492
-    int16_t **alpPixBuf= c->alpPixBuf;
493
-    const int vLumBufSize= c->vLumBufSize;
494
-    const int vChrBufSize= c->vChrBufSize;
495
-    uint8_t *formatConvBuffer= c->formatConvBuffer;
496
-    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
497
-    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
498
-    int lastDstY;
499
-    uint32_t *pal=c->pal_yuv;
500
-
501
-    /* vars which will change and which we need to store back in the context */
502
-    int dstY= c->dstY;
503
-    int lumBufIndex= c->lumBufIndex;
504
-    int chrBufIndex= c->chrBufIndex;
505
-    int lastInLumBuf= c->lastInLumBuf;
506
-    int lastInChrBuf= c->lastInChrBuf;
507
-
508
-    if (isPacked(c->srcFormat)) {
509
-        src[0]=
510
-        src[1]=
511
-        src[2]=
512
-        src[3]= src[0];
513
-        srcStride[0]=
514
-        srcStride[1]=
515
-        srcStride[2]=
516
-        srcStride[3]= srcStride[0];
517
-    }
518
-    srcStride[1]<<= c->vChrDrop;
519
-    srcStride[2]<<= c->vChrDrop;
520
-
521
-    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
522
-                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
523
-                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
524
-    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
525
-                   srcSliceY,    srcSliceH,    dstY,    dstH);
526
-    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
527
-                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
528
-
529
-    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
530
-        static int warnedAlready=0; //FIXME move this into the context perhaps
531
-        if (flags & SWS_PRINT_INFO && !warnedAlready) {
532
-            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
533
-                   "         ->cannot do aligned memory accesses anymore\n");
534
-            warnedAlready=1;
535
-        }
536
-    }
537
-
538
-    /* Note the user might start scaling the picture in the middle so this
539
-       will not get executed. This is not really intended but works
540
-       currently, so people might do it. */
541
-    if (srcSliceY ==0) {
542
-        lumBufIndex=-1;
543
-        chrBufIndex=-1;
544
-        dstY=0;
545
-        lastInLumBuf= -1;
546
-        lastInChrBuf= -1;
547
-    }
548
-
549
-    lastDstY= dstY;
550
-
551
-    for (;dstY < dstH; dstY++) {
552
-        unsigned char *dest =dst[0]+dstStride[0]*dstY;
553
-        const int chrDstY= dstY>>c->chrDstVSubSample;
554
-        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
555
-        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
556
-        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
557
-
558
-        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
559
-        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
560
-        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
561
-        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
562
-        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
563
-        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
564
-        int enough_lines;
565
-
566
-        //handle holes (FAST_BILINEAR & weird filters)
567
-        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
568
-        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
569
-        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
570
-        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
571
-
572
-        DEBUG_BUFFERS("dstY: %d\n", dstY);
573
-        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
574
-                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
575
-        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
576
-                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
577
-
578
-        // Do we have enough lines in this slice to output the dstY line
579
-        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
580
-
581
-        if (!enough_lines) {
582
-            lastLumSrcY = srcSliceY + srcSliceH - 1;
583
-            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
584
-            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
585
-                                            lastLumSrcY, lastChrSrcY);
586
-        }
587
-
588
-        //Do horizontal scaling
589
-        while(lastInLumBuf < lastLumSrcY) {
590
-            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
591
-            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
592
-            lumBufIndex++;
593
-            assert(lumBufIndex < 2*vLumBufSize);
594
-            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
595
-            assert(lastInLumBuf + 1 - srcSliceY >= 0);
596
-            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
597
-                            hLumFilter, hLumFilterPos, hLumFilterSize,
598
-                            formatConvBuffer,
599
-                            pal, 0);
600
-            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
601
-                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
602
-                                hLumFilter, hLumFilterPos, hLumFilterSize,
603
-                                formatConvBuffer,
604
-                                pal, 1);
605
-            lastInLumBuf++;
606
-            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
607
-                               lumBufIndex,    lastInLumBuf);
608
-        }
609
-        while(lastInChrBuf < lastChrSrcY) {
610
-            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
611
-            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
612
-            chrBufIndex++;
613
-            assert(chrBufIndex < 2*vChrBufSize);
614
-            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
615
-            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
616
-            //FIXME replace parameters through context struct (some at least)
617
-
618
-            if (c->needs_hcscale)
619
-                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
620
-                                hChrFilter, hChrFilterPos, hChrFilterSize,
621
-                                formatConvBuffer,
622
-                                pal);
623
-            lastInChrBuf++;
624
-            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
625
-                               chrBufIndex,    lastInChrBuf);
626
-        }
627
-        //wrap buf index around to stay inside the ring buffer
628
-        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
629
-        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
630
-        if (!enough_lines)
631
-            break; //we can't output a dstY line so let's try with the next slice
632
-
633
-        if (dstY < dstH-2) {
634
-            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
635
-            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
636
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
637
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
638
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
639
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
640
-                c->yuv2nv12X(c,
641
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
642
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
643
-                             dest, uDest, dstW, chrDstW, dstFormat);
644
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
645
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
646
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
647
-                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
648
-                    yuv2yuvX16inC(
649
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
650
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
651
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
652
-                                  dstFormat);
653
-                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
654
-                    const int16_t *lumBuf = lumSrcPtr[0];
655
-                    const int16_t *chrBuf= chrSrcPtr[0];
656
-                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
657
-                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
658
-                } else { //General YV12
659
-                    c->yuv2yuvX(c,
660
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
661
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
662
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
663
-                }
664
-            } else {
665
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
666
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
667
-                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
668
-                    int chrAlpha= vChrFilter[2*dstY+1];
669
-                    if(flags & SWS_FULL_CHR_H_INT) {
670
-                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
671
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
672
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
673
-                                         alpSrcPtr, dest, dstW, dstY);
674
-                    } else {
675
-                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
676
-                                       alpPixBuf ? *alpSrcPtr : NULL,
677
-                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
678
-                    }
679
-                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
680
-                    int lumAlpha= vLumFilter[2*dstY+1];
681
-                    int chrAlpha= vChrFilter[2*dstY+1];
682
-                    lumMmxFilter[2]=
683
-                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
684
-                    chrMmxFilter[2]=
685
-                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
686
-                    if(flags & SWS_FULL_CHR_H_INT) {
687
-                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
688
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
689
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
690
-                                         alpSrcPtr, dest, dstW, dstY);
691
-                    } else {
692
-                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
693
-                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
694
-                                       dest, dstW, lumAlpha, chrAlpha, dstY);
695
-                    }
696
-                } else { //general RGB
697
-                    if(flags & SWS_FULL_CHR_H_INT) {
698
-                        yuv2rgbXinC_full(c,
699
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
700
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
701
-                                         alpSrcPtr, dest, dstW, dstY);
702
-                    } else {
703
-                        c->yuv2packedX(c,
704
-                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
705
-                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
706
-                                       alpSrcPtr, dest, dstW, dstY);
707
-                    }
708
-                }
709
-            }
710
-        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
711
-            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
712
-            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
713
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
714
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
715
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
716
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
717
-                yuv2nv12XinC(
718
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
719
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
720
-                             dest, uDest, dstW, chrDstW, dstFormat);
721
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
722
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
723
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
724
-                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
725
-                    yuv2yuvX16inC(
726
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
727
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
728
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
729
-                                  dstFormat);
730
-                } else {
731
-                    yuv2yuvXinC(
732
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
733
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
734
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
735
-                }
736
-            } else {
737
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
738
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
739
-                if(flags & SWS_FULL_CHR_H_INT) {
740
-                    yuv2rgbXinC_full(c,
741
-                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
742
-                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
743
-                                     alpSrcPtr, dest, dstW, dstY);
744
-                } else {
745
-                    yuv2packedXinC(c,
746
-                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
747
-                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
748
-                                   alpSrcPtr, dest, dstW, dstY);
749
-                }
750
-            }
751
-        }
752
-    }
753
-
754
-    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
755
-        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
756
-
757
-    /* store changed local vars back in the context */
758
-    c->dstY= dstY;
759
-    c->lumBufIndex= lumBufIndex;
760
-    c->chrBufIndex= chrBufIndex;
761
-    c->lastInLumBuf= lastInLumBuf;
762
-    c->lastInChrBuf= lastInChrBuf;
763
-
764
-    return dstY - lastDstY;
765
-}
766 113
 
767 114
 static void RENAME(sws_init_swScale)(SwsContext *c)
768 115
 {
769 116
     enum PixelFormat srcFormat = c->srcFormat;
770 117
 
771
-    c->yuv2nv12X    = RENAME(yuv2nv12X   );
772
-    c->yuv2yuv1     = RENAME(yuv2yuv1    );
773 118
     c->yuv2yuvX     = RENAME(yuv2yuvX    );
774
-    c->yuv2packed1  = RENAME(yuv2packed1 );
775
-    c->yuv2packed2  = RENAME(yuv2packed2 );
776 119
     c->yuv2packedX  = RENAME(yuv2packedX );
777
-
778
-    c->hScale       = RENAME(hScale      );
779
-
780
-    if (c->flags & SWS_FAST_BILINEAR)
781
-    {
782
-        c->hyscale_fast = RENAME(hyscale_fast);
783
-        c->hcscale_fast = RENAME(hcscale_fast);
784
-    }
785
-
786
-    c->chrToYV12 = NULL;
787
-    switch(srcFormat) {
788
-        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
789
-        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
790
-        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
791
-        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
792
-        case PIX_FMT_RGB8     :
793
-        case PIX_FMT_BGR8     :
794
-        case PIX_FMT_PAL8     :
795
-        case PIX_FMT_BGR4_BYTE:
796
-        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
797
-        case PIX_FMT_GRAY16BE :
798
-        case PIX_FMT_YUV420P9BE:
799
-        case PIX_FMT_YUV422P10BE:
800
-        case PIX_FMT_YUV420P10BE:
801
-        case PIX_FMT_YUV420P16BE:
802
-        case PIX_FMT_YUV422P16BE:
803
-        case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16) : RENAME(hScale16X); break;
804
-        case PIX_FMT_GRAY16LE :
805
-        case PIX_FMT_YUV420P9LE:
806
-        case PIX_FMT_YUV422P10LE:
807
-        case PIX_FMT_YUV420P10LE:
808
-        case PIX_FMT_YUV420P16LE:
809
-        case PIX_FMT_YUV422P16LE:
810
-        case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16X) : RENAME(hScale16); break;
811
-    }
812
-    if (c->chrSrcHSubSample) {
813
-        switch(srcFormat) {
814
-        case PIX_FMT_RGB48BE:
815
-        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
816
-        case PIX_FMT_BGR48BE:
817
-        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half; break;
818
-        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half;  break;
819
-        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
820
-        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
821
-        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
822
-        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
823
-        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half;  break;
824
-        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
825
-        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
826
-        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
827
-        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
828
-        }
829
-    } else {
830
-        switch(srcFormat) {
831
-        case PIX_FMT_RGB48BE:
832
-        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
833
-        case PIX_FMT_BGR48BE:
834
-        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV; break;
835
-        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV;  break;
836
-        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
837
-        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
838
-        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
839
-        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
840
-        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV;  break;
841
-        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
842
-        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
843
-        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
844
-        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
845
-        }
846
-    }
847
-
848
-    c->lumToYV12 = NULL;
849
-    c->alpToYV12 = NULL;
850
-    switch (srcFormat) {
851
-    case PIX_FMT_YUYV422  :
852
-    case PIX_FMT_GRAY8A   :
853
-                            c->lumToYV12 = RENAME(yuy2ToY); break;
854
-    case PIX_FMT_UYVY422  :
855
-                            c->lumToYV12 = RENAME(uyvyToY); break;
856
-    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
857
-    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
858
-    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
859
-    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
860
-    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
861
-    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
862
-    case PIX_FMT_RGB8     :
863
-    case PIX_FMT_BGR8     :
864
-    case PIX_FMT_PAL8     :
865
-    case PIX_FMT_BGR4_BYTE:
866
-    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
867
-    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
868
-    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
869
-    case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY;  break;
870
-    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
871
-    case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY;  break;
872
-    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
873
-    case PIX_FMT_RGB48BE:
874
-    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
875
-    case PIX_FMT_BGR48BE:
876
-    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY; break;
877
-    }
878
-    if (c->alpPixBuf) {
879
-        switch (srcFormat) {
880
-        case PIX_FMT_RGB32  :
881
-        case PIX_FMT_RGB32_1:
882
-        case PIX_FMT_BGR32  :
883
-        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
884
-        case PIX_FMT_GRAY8A : c->alpToYV12 = RENAME(yuy2ToY); break;
885
-        case PIX_FMT_PAL8   : c->alpToYV12 = palToA; break;
886
-        }
887
-    }
888
-
889
-    switch (srcFormat) {
890
-    case PIX_FMT_GRAY8A :
891
-        c->alpSrcOffset = 1;
892
-        break;
893
-    case PIX_FMT_RGB32  :
894
-    case PIX_FMT_BGR32  :
895
-        c->alpSrcOffset = 3;
896
-        break;
897
-    case PIX_FMT_RGB48LE:
898
-    case PIX_FMT_BGR48LE:
899
-        c->lumSrcOffset = 1;
900
-        c->chrSrcOffset = 1;
901
-        c->alpSrcOffset = 1;
902
-        break;
903
-    }
904
-
905
-    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
906
-        if (c->srcRange) {
907
-            c->lumConvertRange = RENAME(lumRangeFromJpeg);
908
-            c->chrConvertRange = RENAME(chrRangeFromJpeg);
909
-        } else {
910
-            c->lumConvertRange = RENAME(lumRangeToJpeg);
911
-            c->chrConvertRange = RENAME(chrRangeToJpeg);
912
-        }
913
-    }
914
-
915
-    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
916
-          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
917
-        c->needs_hcscale = 1;
918 120
 }
... ...
@@ -94,6 +94,7 @@ adjustment.
94 94
 #include "libswscale/rgb2rgb.h"
95 95
 #include "libswscale/swscale.h"
96 96
 #include "libswscale/swscale_internal.h"
97
+#include "libavutil/cpu.h"
97 98
 
98 99
 #undef PROFILE_THE_BEAST
99 100
 #undef INC_SCALING
... ...
@@ -692,7 +693,7 @@ static int altivec_uyvy_rgb32 (SwsContext *c,
692 692
 */
693 693
 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
694 694
 {
695
-    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
695
+    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
696 696
         return NULL;
697 697
 
698 698
     /*
... ...
@@ -116,12 +116,11 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t
116 116
  32-bit C version, and and&add trick by Michael Niedermayer
117 117
 */
118 118
 
119
-void sws_rgb2rgb_init(int flags)
119
+void sws_rgb2rgb_init(void)
120 120
 {
121 121
     rgb2rgb_init_c();
122
-#if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
123
-    rgb2rgb_init_x86(flags);
124
-#endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
122
+    if (HAVE_MMX)
123
+        rgb2rgb_init_x86();
125 124
 }
126 125
 
127 126
 #if LIBSWSCALE_VERSION_MAJOR < 1
... ...
@@ -166,8 +166,8 @@ extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const u
166 166
                             long width, long height,
167 167
                             long lumStride, long chromStride, long srcStride);
168 168
 
169
-void sws_rgb2rgb_init(int flags);
169
+void sws_rgb2rgb_init(void);
170 170
 
171
-void rgb2rgb_init_x86(int flags);
171
+void rgb2rgb_init_x86(void);
172 172
 
173 173
 #endif /* SWSCALE_RGB2RGB_H */
... ...
@@ -278,25 +278,6 @@ static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst, long src_siz
278 278
     }
279 279
 }
280 280
 
281
-/*
282
- * mm0 = 00 B3 00 B2 00 B1 00 B0
283
- * mm1 = 00 G3 00 G2 00 G1 00 G0
284
- * mm2 = 00 R3 00 R2 00 R1 00 R0
285
- * mm6 = FF FF FF FF FF FF FF FF
286
- * mm7 = 00 00 00 00 00 00 00 00
287
- */
288
-#define PACK_RGB32 \
289
-    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
290
-    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
291
-    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
292
-    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
293
-    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
294
-    "movq       %%mm0, %%mm3    \n\t"                               \
295
-    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
296
-    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
297
-    MOVNTQ"     %%mm0,  %0      \n\t"                               \
298
-    MOVNTQ"     %%mm3, 8%0      \n\t"                               \
299
-
300 281
 static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, long src_size)
301 282
 {
302 283
     const uint16_t *end;
... ...
@@ -63,6 +63,7 @@ untested special converters
63 63
 #include "libavutil/avassert.h"
64 64
 #include "libavutil/intreadwrite.h"
65 65
 #include "libavutil/x86_cpu.h"
66
+#include "libavutil/cpu.h"
66 67
 #include "libavutil/avutil.h"
67 68
 #include "libavutil/mathematics.h"
68 69
 #include "libavutil/bswap.h"
... ...
@@ -71,10 +72,6 @@ untested special converters
71 71
 #undef MOVNTQ
72 72
 #undef PAVGB
73 73
 
74
-//#undef HAVE_MMX2
75
-//#define HAVE_AMD3DNOW
76
-//#undef HAVE_MMX
77
-//#undef ARCH_X86
78 74
 #define DITHER1XBPP
79 75
 
80 76
 #define isPacked(x)         (       \
... ...
@@ -1262,57 +1259,13 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
1262 1262
 
1263 1263
 //Note: we have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW+MMX2 one
1264 1264
 //Plain C versions
1265
-#if CONFIG_RUNTIME_CPUDETECT
1266
-#  define COMPILE_C 1
1267
-#  if   ARCH_X86
1268
-#    define COMPILE_MMX     1
1269
-#    define COMPILE_MMX2    1
1270
-#    define COMPILE_3DNOW   1
1271
-#  elif ARCH_PPC
1272
-#    define COMPILE_ALTIVEC HAVE_ALTIVEC
1273
-#  endif
1274
-#else /* CONFIG_RUNTIME_CPUDETECT */
1275
-#  if   ARCH_X86
1276
-#    if   HAVE_MMX2
1277
-#      define COMPILE_MMX2  1
1278
-#    elif HAVE_AMD3DNOW
1279
-#      define COMPILE_3DNOW 1
1280
-#    elif HAVE_MMX
1281
-#      define COMPILE_MMX   1
1282
-#    else
1283
-#      define COMPILE_C     1
1284
-#    endif
1285
-#  elif ARCH_PPC && HAVE_ALTIVEC
1286
-#    define COMPILE_ALTIVEC 1
1287
-#  else
1288
-#    define COMPILE_C       1
1289
-#  endif
1290
-#endif
1291
-
1292
-#ifndef COMPILE_C
1293
-#  define COMPILE_C 0
1294
-#endif
1295
-#ifndef COMPILE_MMX
1296
-#  define COMPILE_MMX 0
1297
-#endif
1298
-#ifndef COMPILE_MMX2
1299
-#  define COMPILE_MMX2 0
1300
-#endif
1301
-#ifndef COMPILE_3DNOW
1302
-#  define COMPILE_3DNOW 0
1303
-#endif
1304
-#ifndef COMPILE_ALTIVEC
1305
-#  define COMPILE_ALTIVEC 0
1306
-#endif
1307 1265
 
1308
-#define COMPILE_TEMPLATE_MMX 0
1309 1266
 #define COMPILE_TEMPLATE_MMX2 0
1310
-#define COMPILE_TEMPLATE_AMD3DNOW 0
1311 1267
 #define COMPILE_TEMPLATE_ALTIVEC 0
1312 1268
 
1313 1269
 #include "swscale_template.c"
1314 1270
 
1315
-#if COMPILE_ALTIVEC
1271
+#if HAVE_ALTIVEC
1316 1272
 #undef RENAME
1317 1273
 #undef COMPILE_TEMPLATE_ALTIVEC
1318 1274
 #define COMPILE_TEMPLATE_ALTIVEC 1
... ...
@@ -1320,90 +1273,42 @@ static inline void monoblack2Y(uint8_t *dst, const uint8_t *src, long width, uin
1320 1320
 #include "ppc/swscale_template.c"
1321 1321
 #endif
1322 1322
 
1323
-#if ARCH_X86
1324
-
1325 1323
 //MMX versions
1326
-#if COMPILE_MMX
1324
+#if HAVE_MMX
1327 1325
 #undef RENAME
1328
-#undef COMPILE_TEMPLATE_MMX
1329 1326
 #undef COMPILE_TEMPLATE_MMX2
1330
-#undef COMPILE_TEMPLATE_AMD3DNOW
1331
-#define COMPILE_TEMPLATE_MMX 1
1332 1327
 #define COMPILE_TEMPLATE_MMX2 0
1333
-#define COMPILE_TEMPLATE_AMD3DNOW 0
1334 1328
 #define RENAME(a) a ## _MMX
1335 1329
 #include "x86/swscale_template.c"
1336 1330
 #endif
1337 1331
 
1338 1332
 //MMX2 versions
1339
-#if COMPILE_MMX2
1333
+#if HAVE_MMX2
1340 1334
 #undef RENAME
1341
-#undef COMPILE_TEMPLATE_MMX
1342 1335
 #undef COMPILE_TEMPLATE_MMX2
1343
-#undef COMPILE_TEMPLATE_AMD3DNOW
1344
-#define COMPILE_TEMPLATE_MMX 1
1345 1336
 #define COMPILE_TEMPLATE_MMX2 1
1346
-#define COMPILE_TEMPLATE_AMD3DNOW 0
1347 1337
 #define RENAME(a) a ## _MMX2
1348 1338
 #include "x86/swscale_template.c"
1349 1339
 #endif
1350 1340
 
1351
-//3DNOW versions
1352
-#if COMPILE_3DNOW
1353
-#undef RENAME
1354
-#undef COMPILE_TEMPLATE_MMX
1355
-#undef COMPILE_TEMPLATE_MMX2
1356
-#undef COMPILE_TEMPLATE_AMD3DNOW
1357
-#define COMPILE_TEMPLATE_MMX 1
1358
-#define COMPILE_TEMPLATE_MMX2 0
1359
-#define COMPILE_TEMPLATE_AMD3DNOW 1
1360
-#define RENAME(a) a ## _3DNow
1361
-#include "x86/swscale_template.c"
1362
-#endif
1363
-
1364
-#endif //ARCH_X86
1365
-
1366 1341
 SwsFunc ff_getSwsFunc(SwsContext *c)
1367 1342
 {
1343
+    int cpu_flags = av_get_cpu_flags();
1344
+
1368 1345
     sws_init_swScale_c(c);
1369 1346
 
1370
-#if CONFIG_RUNTIME_CPUDETECT
1371
-#if ARCH_X86
1372
-    // ordered per speed fastest first
1373
-    if (c->flags & SWS_CPU_CAPS_MMX2) {
1374
-        sws_init_swScale_MMX2(c);
1375
-        return swScale_MMX2;
1376
-    } else if (c->flags & SWS_CPU_CAPS_3DNOW) {
1377
-        sws_init_swScale_3DNow(c);
1378
-        return swScale_3DNow;
1379
-    } else if (c->flags & SWS_CPU_CAPS_MMX) {
1347
+#if HAVE_MMX
1348
+    if (cpu_flags & AV_CPU_FLAG_MMX)
1380 1349
         sws_init_swScale_MMX(c);
1381
-        return swScale_MMX;
1382
-    }
1383
-
1384
-#else
1385
-#if COMPILE_ALTIVEC
1386
-    if (c->flags & SWS_CPU_CAPS_ALTIVEC) {
1387
-        sws_init_swScale_altivec(c);
1388
-        return swScale_altivec;
1389
-    }
1390 1350
 #endif
1391
-#endif /* ARCH_X86 */
1392
-#else //CONFIG_RUNTIME_CPUDETECT
1393
-#if   COMPILE_TEMPLATE_MMX2
1394
-    sws_init_swScale_MMX2(c);
1395
-    return swScale_MMX2;
1396
-#elif COMPILE_TEMPLATE_AMD3DNOW
1397
-    sws_init_swScale_3DNow(c);
1398
-    return swScale_3DNow;
1399
-#elif COMPILE_TEMPLATE_MMX
1400
-    sws_init_swScale_MMX(c);
1401
-    return swScale_MMX;
1402
-#elif COMPILE_TEMPLATE_ALTIVEC
1403
-    sws_init_swScale_altivec(c);
1404
-    return swScale_altivec;
1351
+#if HAVE_MMX2
1352
+    if (cpu_flags & AV_CPU_FLAG_MMX2)
1353
+        sws_init_swScale_MMX2(c);
1354
+#endif
1355
+#if HAVE_ALTIVEC
1356
+    if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
1357
+        sws_init_swScale_altivec(c);
1405 1358
 #endif
1406
-#endif //!CONFIG_RUNTIME_CPUDETECT
1407 1359
 
1408 1360
     return swScale_c;
1409 1361
 }
... ...
@@ -1900,23 +1805,6 @@ static int planarCopyWrapper(SwsContext *c, const uint8_t* src[], int srcStride[
1900 1900
     return srcSliceH;
1901 1901
 }
1902 1902
 
1903
-int ff_hardcodedcpuflags(void)
1904
-{
1905
-    int flags = 0;
1906
-#if   COMPILE_TEMPLATE_MMX2
1907
-    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_MMX2;
1908
-#elif COMPILE_TEMPLATE_AMD3DNOW
1909
-    flags |= SWS_CPU_CAPS_MMX|SWS_CPU_CAPS_3DNOW;
1910
-#elif COMPILE_TEMPLATE_MMX
1911
-    flags |= SWS_CPU_CAPS_MMX;
1912
-#elif COMPILE_TEMPLATE_ALTIVEC
1913
-    flags |= SWS_CPU_CAPS_ALTIVEC;
1914
-#elif ARCH_BFIN
1915
-    flags |= SWS_CPU_CAPS_BFIN;
1916
-#endif
1917
-    return flags;
1918
-}
1919
-
1920 1903
 void ff_get_unscaled_swscale(SwsContext *c)
1921 1904
 {
1922 1905
     const enum PixelFormat srcFormat = c->srcFormat;
... ...
@@ -2000,8 +1888,8 @@ void ff_get_unscaled_swscale(SwsContext *c)
2000 2000
     if(srcFormat == PIX_FMT_UYVY422 && dstFormat == PIX_FMT_YUV422P)
2001 2001
         c->swScale= uyvyToYuv422Wrapper;
2002 2002
 
2003
-#if COMPILE_ALTIVEC
2004
-    if ((c->flags & SWS_CPU_CAPS_ALTIVEC) &&
2003
+#if HAVE_ALTIVEC
2004
+    if ((av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) &&
2005 2005
         !(c->flags & SWS_BITEXACT) &&
2006 2006
         srcFormat == PIX_FMT_YUV420P) {
2007 2007
         // unscaled YV12 -> packed YUV, we want speed
... ...
@@ -2031,8 +1919,7 @@ void ff_get_unscaled_swscale(SwsContext *c)
2031 2031
             c->swScale= planarCopyWrapper;
2032 2032
     }
2033 2033
 #if ARCH_BFIN
2034
-    if (flags & SWS_CPU_CAPS_BFIN)
2035
-        ff_bfin_get_unscaled_swscale (c);
2034
+    ff_bfin_get_unscaled_swscale (c);
2036 2035
 #endif
2037 2036
 }
2038 2037
 
... ...
@@ -95,13 +95,6 @@ const char *swscale_license(void);
95 95
 #define SWS_ACCURATE_RND      0x40000
96 96
 #define SWS_BITEXACT          0x80000
97 97
 
98
-#define SWS_CPU_CAPS_MMX      0x80000000
99
-#define SWS_CPU_CAPS_MMX2     0x20000000
100
-#define SWS_CPU_CAPS_3DNOW    0x40000000
101
-#define SWS_CPU_CAPS_ALTIVEC  0x10000000
102
-#define SWS_CPU_CAPS_BFIN     0x01000000
103
-#define SWS_CPU_CAPS_SSE2     0x02000000
104
-
105 98
 #define SWS_MAX_REDUCE_CUTOFF 0.002
106 99
 
107 100
 #define SWS_CS_ITU709         1
... ...
@@ -482,11 +482,6 @@ extern const AVClass sws_context_class;
482 482
 void ff_get_unscaled_swscale(SwsContext *c);
483 483
 
484 484
 /**
485
- * Returns the SWS_CPU_CAPS for the optimized code compiled into swscale.
486
- */
487
-int ff_hardcodedcpuflags(void);
488
-
489
-/**
490 485
  * Returns function pointer to fastest main scaler path function depending
491 486
  * on architecture and available optimizations.
492 487
  */
... ...
@@ -363,153 +363,11 @@ static inline void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
363 363
     }
364 364
 }
365 365
 
366
-static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
366
+static inline void hScale16_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
367 367
                                     const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
368 368
 {
369 369
     int i, j;
370
-#if COMPILE_TEMPLATE_MMX
371
-    assert(filterSize % 4 == 0 && filterSize>0);
372
-    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
373
-        x86_reg counter= -2*dstW;
374
-        filter-= counter*2;
375
-        filterPos-= counter/2;
376
-        dst-= counter/2;
377
-        __asm__ volatile(
378
-            "movd                   %5, %%mm7       \n\t"
379
-#if defined(PIC)
380
-            "push            %%"REG_b"              \n\t"
381
-#endif
382
-            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
383
-            "mov             %%"REG_a", %%"REG_BP"  \n\t"
384
-            ".p2align                4              \n\t"
385
-            "1:                                     \n\t"
386
-            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
387
-            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
388
-            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
389
-            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
390
-            "movq      (%3, %%"REG_a", 2), %%mm0    \n\t"
391
-            "movq      (%3, %%"REG_b", 2), %%mm2    \n\t"
392
-            "pmaddwd             %%mm1, %%mm0       \n\t"
393
-            "pmaddwd             %%mm2, %%mm3       \n\t"
394
-            "movq                %%mm0, %%mm4       \n\t"
395
-            "punpckldq           %%mm3, %%mm0       \n\t"
396
-            "punpckhdq           %%mm3, %%mm4       \n\t"
397
-            "paddd               %%mm4, %%mm0       \n\t"
398
-            "psrad               %%mm7, %%mm0       \n\t"
399
-            "packssdw            %%mm0, %%mm0       \n\t"
400
-            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
401
-            "add                    $4, %%"REG_BP"  \n\t"
402
-            " jnc                   1b              \n\t"
403
-
404
-            "pop            %%"REG_BP"              \n\t"
405
-#if defined(PIC)
406
-            "pop             %%"REG_b"              \n\t"
407
-#endif
408
-            : "+a" (counter)
409
-            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
410
-#if !defined(PIC)
411
-            : "%"REG_b
412
-#endif
413
-        );
414
-    } else if (filterSize==8 && shift<15) {
415
-        x86_reg counter= -2*dstW;
416
-        filter-= counter*4;
417
-        filterPos-= counter/2;
418
-        dst-= counter/2;
419
-        __asm__ volatile(
420
-            "movd                   %5, %%mm7       \n\t"
421
-#if defined(PIC)
422
-            "push            %%"REG_b"              \n\t"
423
-#endif
424
-            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
425
-            "mov              %%"REG_a", %%"REG_BP" \n\t"
426
-            ".p2align                 4             \n\t"
427
-            "1:                                     \n\t"
428
-            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
429
-            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
430
-            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
431
-            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
432
-            "movq       (%3, %%"REG_a", 2), %%mm0   \n\t"
433
-            "movq       (%3, %%"REG_b", 2), %%mm2   \n\t"
434
-            "pmaddwd              %%mm1, %%mm0      \n\t"
435
-            "pmaddwd              %%mm2, %%mm3      \n\t"
436
-
437
-            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
438
-            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
439
-            "movq      8(%3, %%"REG_a", 2), %%mm4   \n\t"
440
-            "movq      8(%3, %%"REG_b", 2), %%mm2   \n\t"
441
-            "pmaddwd              %%mm1, %%mm4      \n\t"
442
-            "pmaddwd              %%mm2, %%mm5      \n\t"
443
-            "paddd                %%mm4, %%mm0      \n\t"
444
-            "paddd                %%mm5, %%mm3      \n\t"
445
-            "movq                 %%mm0, %%mm4      \n\t"
446
-            "punpckldq            %%mm3, %%mm0      \n\t"
447
-            "punpckhdq            %%mm3, %%mm4      \n\t"
448
-            "paddd                %%mm4, %%mm0      \n\t"
449
-            "psrad                %%mm7, %%mm0      \n\t"
450
-            "packssdw             %%mm0, %%mm0      \n\t"
451
-            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
452
-            "add                     $4, %%"REG_BP" \n\t"
453
-            " jnc                    1b             \n\t"
454
-
455
-            "pop             %%"REG_BP"             \n\t"
456
-#if defined(PIC)
457
-            "pop             %%"REG_b"              \n\t"
458
-#endif
459
-            : "+a" (counter)
460
-            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
461
-#if !defined(PIC)
462
-            : "%"REG_b
463
-#endif
464
-        );
465
-    } else if (shift<15){
466
-        const uint16_t *offset = src+filterSize;
467
-        x86_reg counter= -2*dstW;
468
-        //filter-= counter*filterSize/2;
469
-        filterPos-= counter/2;
470
-        dst-= counter/2;
471
-        __asm__ volatile(
472
-            "movd                   %7, %%mm7       \n\t"
473
-            ".p2align                  4            \n\t"
474
-            "1:                                     \n\t"
475
-            "mov                      %2, %%"REG_c" \n\t"
476
-            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
477
-            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
478
-            "mov                      %5, %%"REG_c" \n\t"
479
-            "pxor                  %%mm4, %%mm4     \n\t"
480
-            "pxor                  %%mm5, %%mm5     \n\t"
481
-            "2:                                     \n\t"
482
-            "movq                   (%1), %%mm1     \n\t"
483
-            "movq               (%1, %6), %%mm3     \n\t"
484
-            "movq (%%"REG_c", %%"REG_a", 2), %%mm0     \n\t"
485
-            "movq (%%"REG_c", %%"REG_d", 2), %%mm2     \n\t"
486
-            "pmaddwd               %%mm1, %%mm0     \n\t"
487
-            "pmaddwd               %%mm2, %%mm3     \n\t"
488
-            "paddd                 %%mm3, %%mm5     \n\t"
489
-            "paddd                 %%mm0, %%mm4     \n\t"
490
-            "add                      $8, %1        \n\t"
491
-            "add                      $8, %%"REG_c" \n\t"
492
-            "cmp                      %4, %%"REG_c" \n\t"
493
-            " jb                      2b            \n\t"
494
-            "add                      %6, %1        \n\t"
495
-            "movq                  %%mm4, %%mm0     \n\t"
496
-            "punpckldq             %%mm5, %%mm4     \n\t"
497
-            "punpckhdq             %%mm5, %%mm0     \n\t"
498
-            "paddd                 %%mm0, %%mm4     \n\t"
499
-            "psrad                 %%mm7, %%mm4     \n\t"
500
-            "packssdw              %%mm4, %%mm4     \n\t"
501
-            "mov                      %3, %%"REG_a" \n\t"
502
-            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
503
-            "add                      $4, %0        \n\t"
504
-            " jnc                     1b            \n\t"
505
-
506
-            : "+r" (counter), "+r" (filter)
507
-            : "m" (filterPos), "m" (dst), "m"(offset),
508
-            "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
509
-            : "%"REG_a, "%"REG_c, "%"REG_d
510
-        );
511
-    } else
512
-#endif
370
+
513 371
     for (i=0; i<dstW; i++) {
514 372
         int srcPos= filterPos[i];
515 373
         int val=0;
... ...
@@ -520,7 +378,7 @@ static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src,
520 520
     }
521 521
 }
522 522
 
523
-static inline void RENAME(hScale16X)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
523
+static inline void hScale16X_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
524 524
                                     const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
525 525
 {
526 526
     int i, j;
... ...
@@ -660,6 +518,11 @@ inline static void hcscale_c(SwsContext *c, uint16_t *dst, long dstWidth,
660 660
 #define DEBUG_SWSCALE_BUFFERS 0
661 661
 #define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
662 662
 
663
+#if HAVE_MMX
664
+static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
665
+                                  int lastInLumBuf, int lastInChrBuf);
666
+#endif
667
+
663 668
 static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
664 669
                      int srcSliceY, int srcSliceH, uint8_t* dst[], int dstStride[])
665 670
 {
... ...
@@ -831,6 +694,9 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
831 831
         if (!enough_lines)
832 832
             break; //we can't output a dstY line so let's try with the next slice
833 833
 
834
+#if HAVE_MMX
835
+        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
836
+#endif
834 837
         if (dstY < dstH-2) {
835 838
             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
836 839
             const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
... ...
@@ -955,6 +821,12 @@ static int swScale_c(SwsContext *c, const uint8_t* src[], int srcStride[],
955 955
     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
956 956
         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
957 957
 
958
+#if HAVE_MMX2
959
+    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
960
+        __asm__ volatile("sfence":::"memory");
961
+#endif
962
+    emms_c();
963
+
958 964
     /* store changed local vars back in the context */
959 965
     c->dstY= dstY;
960 966
     c->lumBufIndex= lumBufIndex;
... ...
@@ -1001,14 +873,14 @@ static void sws_init_swScale_c(SwsContext *c)
1001 1001
         case PIX_FMT_YUV420P10BE:
1002 1002
         case PIX_FMT_YUV420P16BE:
1003 1003
         case PIX_FMT_YUV422P16BE:
1004
-        case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16) : RENAME(hScale16X); break;
1004
+        case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? hScale16_c : hScale16X_c; break;
1005 1005
         case PIX_FMT_GRAY16LE :
1006 1006
         case PIX_FMT_YUV420P9LE:
1007 1007
         case PIX_FMT_YUV422P10LE:
1008 1008
         case PIX_FMT_YUV420P10LE:
1009 1009
         case PIX_FMT_YUV420P16LE:
1010 1010
         case PIX_FMT_YUV422P16LE:
1011
-        case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? RENAME(hScale16X) : RENAME(hScale16); break;
1011
+        case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? hScale16X_c : hScale16_c; break;
1012 1012
     }
1013 1013
     if (c->chrSrcHSubSample) {
1014 1014
         switch(srcFormat) {
... ...
@@ -185,7 +185,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
185 185
 }
186 186
 
187 187
 static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc,
188
-                      int srcW, int dstW, int filterAlign, int one, int flags,
188
+                      int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags,
189 189
                       SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
190 190
 {
191 191
     int i;
... ...
@@ -196,10 +196,8 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
196 196
     int64_t *filter2=NULL;
197 197
     const int64_t fone= 1LL<<54;
198 198
     int ret= -1;
199
-#if ARCH_X86
200
-    if (flags & SWS_CPU_CAPS_MMX)
201
-        __asm__ volatile("emms\n\t"::: "memory"); //FIXME this should not be required but it IS (even for non-MMX versions)
202
-#endif
199
+
200
+    emms_c(); //FIXME this should not be required but it IS (even for non-MMX versions)
203 201
 
204 202
     // NOTE: the +1 is for the MMX scaler which reads over the end
205 203
     FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+1)*sizeof(int16_t), fail);
... ...
@@ -416,7 +414,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
416 416
         if (min>minFilterSize) minFilterSize= min;
417 417
     }
418 418
 
419
-    if (flags & SWS_CPU_CAPS_ALTIVEC) {
419
+    if (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) {
420 420
         // we can handle the special case 4,
421 421
         // so we don't want to go to the full 8
422 422
         if (minFilterSize < 5)
... ...
@@ -431,7 +429,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
431 431
             filterAlign = 1;
432 432
     }
433 433
 
434
-    if (flags & SWS_CPU_CAPS_MMX) {
434
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
435 435
         // special case for unscaled vertical filtering
436 436
         if (minFilterSize == 1 && filterAlign == 2)
437 437
             filterAlign= 1;
... ...
@@ -521,7 +519,7 @@ fail:
521 521
     return ret;
522 522
 }
523 523
 
524
-#if ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT)
524
+#if HAVE_MMX2
525 525
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits)
526 526
 {
527 527
     uint8_t *fragmentA;
... ...
@@ -679,7 +677,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *fil
679 679
 
680 680
     return fragmentPos + 1;
681 681
 }
682
-#endif /* ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) */
682
+#endif /* HAVE_MMX2 */
683 683
 
684 684
 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
685 685
 {
... ...
@@ -687,8 +685,6 @@ static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
687 687
     *v = av_pix_fmt_descriptors[format].log2_chroma_h;
688 688
 }
689 689
 
690
-static int update_flags_cpu(int flags);
691
-
692 690
 int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange, const int table[4], int dstRange, int brightness, int contrast, int saturation)
693 691
 {
694 692
     memcpy(c->srcColorspaceTable, inv_table, sizeof(int)*4);
... ...
@@ -703,15 +699,12 @@ int sws_setColorspaceDetails(SwsContext *c, const int inv_table[4], int srcRange
703 703
 
704 704
     c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->dstFormat]);
705 705
     c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[c->srcFormat]);
706
-    c->flags = update_flags_cpu(c->flags);
707 706
 
708 707
     ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness, contrast, saturation);
709 708
     //FIXME factorize
710 709
 
711
-#if HAVE_ALTIVEC
712
-    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
710
+    if (HAVE_ALTIVEC && av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC)
713 711
         ff_yuv2rgb_init_tables_altivec(c, inv_table, brightness, contrast, saturation);
714
-#endif
715 712
     return 0;
716 713
 }
717 714
 
... ...
@@ -741,27 +734,6 @@ static int handle_jpeg(enum PixelFormat *format)
741 741
     }
742 742
 }
743 743
 
744
-static int update_flags_cpu(int flags)
745
-{
746
-#if !CONFIG_RUNTIME_CPUDETECT //ensure that the flags match the compiled variant if cpudetect is off
747
-    flags &= ~( SWS_CPU_CAPS_MMX
748
-               |SWS_CPU_CAPS_MMX2
749
-               |SWS_CPU_CAPS_3DNOW
750
-               |SWS_CPU_CAPS_SSE2
751
-               |SWS_CPU_CAPS_ALTIVEC
752
-               |SWS_CPU_CAPS_BFIN);
753
-    flags |= ff_hardcodedcpuflags();
754
-#else /* !CONFIG_RUNTIME_CPUDETECT */
755
-    int cpuflags = av_get_cpu_flags();
756
-
757
-    flags |= (cpuflags & AV_CPU_FLAG_SSE2 ? SWS_CPU_CAPS_SSE2 : 0);
758
-    flags |= (cpuflags & AV_CPU_FLAG_MMX ? SWS_CPU_CAPS_MMX : 0);
759
-    flags |= (cpuflags & AV_CPU_FLAG_MMX2 ? SWS_CPU_CAPS_MMX2 : 0);
760
-    flags |= (cpuflags & AV_CPU_FLAG_3DNOW ? SWS_CPU_CAPS_3DNOW : 0);
761
-#endif /* CONFIG_RUNTIME_CPUDETECT */
762
-    return flags;
763
-}
764
-
765 744
 SwsContext *sws_alloc_context(void)
766 745
 {
767 746
     SwsContext *c= av_mallocz(sizeof(SwsContext));
... ...
@@ -782,16 +754,14 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
782 782
     int srcH= c->srcH;
783 783
     int dstW= c->dstW;
784 784
     int dstH= c->dstH;
785
-    int flags;
785
+    int flags, cpu_flags;
786 786
     enum PixelFormat srcFormat= c->srcFormat;
787 787
     enum PixelFormat dstFormat= c->dstFormat;
788 788
 
789
-    flags= c->flags = update_flags_cpu(c->flags);
790
-#if ARCH_X86
791
-    if (flags & SWS_CPU_CAPS_MMX)
792
-        __asm__ volatile("emms\n\t"::: "memory");
793
-#endif
794
-    if (!rgb15to16) sws_rgb2rgb_init(flags);
789
+    cpu_flags = av_get_cpu_flags();
790
+    flags     = c->flags;
791
+    emms_c();
792
+    if (!rgb15to16) sws_rgb2rgb_init();
795 793
 
796 794
     unscaled = (srcW == dstW && srcH == dstH);
797 795
 
... ...
@@ -884,7 +854,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
884 884
         }
885 885
     }
886 886
 
887
-    if (flags & SWS_CPU_CAPS_MMX2) {
887
+    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
888 888
         c->canMMX2BeUsed= (dstW >=srcW && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
889 889
         if (!c->canMMX2BeUsed && dstW >=srcW && (srcW&15)==0 && (flags&SWS_FAST_BILINEAR)) {
890 890
             if (flags&SWS_PRINT_INFO)
... ...
@@ -910,7 +880,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
910 910
             c->chrXInc+= 20;
911 911
         }
912 912
         //we don't use the x86 asm scaler if MMX is available
913
-        else if (flags & SWS_CPU_CAPS_MMX) {
913
+        else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
914 914
             c->lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
915 915
             c->chrXInc = ((c->chrSrcW-2)<<16)/(c->chrDstW-2) - 20;
916 916
         }
... ...
@@ -918,7 +888,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
918 918
 
919 919
     /* precalculate horizontal scaler filter coefficients */
920 920
     {
921
-#if ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT)
921
+#if HAVE_MMX2
922 922
 // can't downscale !!!
923 923
         if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
924 924
             c->lumMmx2FilterCodeSize = initMMX2HScaler(      dstW, c->lumXInc, NULL, NULL, NULL, 8);
... ...
@@ -954,21 +924,21 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
954 954
             mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ);
955 955
 #endif
956 956
         } else
957
-#endif /* ARCH_X86 && (HAVE_MMX2 || CONFIG_RUNTIME_CPUDETECT) */
957
+#endif /* HAVE_MMX2 */
958 958
         {
959 959
             const int filterAlign=
960
-                (flags & SWS_CPU_CAPS_MMX) ? 4 :
961
-                (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
960
+                (HAVE_MMX     && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
961
+                (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
962 962
                 1;
963 963
 
964 964
             if (initFilter(&c->hLumFilter, &c->hLumFilterPos, &c->hLumFilterSize, c->lumXInc,
965 965
                            srcW      ,       dstW, filterAlign, 1<<14,
966
-                           (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
966
+                           (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
967 967
                            srcFilter->lumH, dstFilter->lumH, c->param) < 0)
968 968
                 goto fail;
969 969
             if (initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc,
970 970
                            c->chrSrcW, c->chrDstW, filterAlign, 1<<14,
971
-                           (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
971
+                           (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
972 972
                            srcFilter->chrH, dstFilter->chrH, c->param) < 0)
973 973
                 goto fail;
974 974
         }
... ...
@@ -977,18 +947,18 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
977 977
     /* precalculate vertical scaler filter coefficients */
978 978
     {
979 979
         const int filterAlign=
980
-            (flags & SWS_CPU_CAPS_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
981
-            (flags & SWS_CPU_CAPS_ALTIVEC) ? 8 :
980
+            (HAVE_MMX     && cpu_flags & AV_CPU_FLAG_MMX) && (flags & SWS_ACCURATE_RND) ? 2 :
981
+            (HAVE_ALTIVEC && cpu_flags & AV_CPU_FLAG_ALTIVEC) ? 8 :
982 982
             1;
983 983
 
984 984
         if (initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize, c->lumYInc,
985 985
                        srcH      ,        dstH, filterAlign, (1<<12),
986
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags,
986
+                       (flags&SWS_BICUBLIN) ? (flags|SWS_BICUBIC)  : flags, cpu_flags,
987 987
                        srcFilter->lumV, dstFilter->lumV, c->param) < 0)
988 988
             goto fail;
989 989
         if (initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize, c->chrYInc,
990 990
                        c->chrSrcH, c->chrDstH, filterAlign, (1<<12),
991
-                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags,
991
+                       (flags&SWS_BICUBLIN) ? (flags|SWS_BILINEAR) : flags, cpu_flags,
992 992
                        srcFilter->chrV, dstFilter->chrV, c->param) < 0)
993 993
             goto fail;
994 994
 
... ...
@@ -1082,13 +1052,13 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
1082 1082
 #endif
1083 1083
                sws_format_name(dstFormat));
1084 1084
 
1085
-        if      (flags & SWS_CPU_CAPS_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
1086
-        else if (flags & SWS_CPU_CAPS_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
1087
-        else if (flags & SWS_CPU_CAPS_MMX)     av_log(c, AV_LOG_INFO, "using MMX\n");
1088
-        else if (flags & SWS_CPU_CAPS_ALTIVEC) av_log(c, AV_LOG_INFO, "using AltiVec\n");
1085
+        if      (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)    av_log(c, AV_LOG_INFO, "using MMX2\n");
1086
+        else if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)   av_log(c, AV_LOG_INFO, "using 3DNOW\n");
1087
+        else if (HAVE_MMX      && cpu_flags & AV_CPU_FLAG_MMX)     av_log(c, AV_LOG_INFO, "using MMX\n");
1088
+        else if (HAVE_ALTIVEC  && cpu_flags & AV_CPU_FLAG_ALTIVEC) av_log(c, AV_LOG_INFO, "using AltiVec\n");
1089 1089
         else                                   av_log(c, AV_LOG_INFO, "using C\n");
1090 1090
 
1091
-        if (flags & SWS_CPU_CAPS_MMX) {
1091
+        if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
1092 1092
             if (c->canMMX2BeUsed && (flags&SWS_FAST_BILINEAR))
1093 1093
                 av_log(c, AV_LOG_VERBOSE, "using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
1094 1094
             else {
... ...
@@ -1107,7 +1077,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
1107 1107
                     av_log(c, AV_LOG_VERBOSE, "using n-tap MMX scaler for horizontal chrominance scaling\n");
1108 1108
             }
1109 1109
         } else {
1110
-#if ARCH_X86
1110
+#if HAVE_MMX
1111 1111
             av_log(c, AV_LOG_VERBOSE, "using x86 asm scaler for horizontal scaling\n");
1112 1112
 #else
1113 1113
             if (flags & SWS_FAST_BILINEAR)
... ...
@@ -1118,31 +1088,41 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter)
1118 1118
         }
1119 1119
         if (isPlanarYUV(dstFormat)) {
1120 1120
             if (c->vLumFilterSize==1)
1121
-                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1121
+                av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical scaling (YV12 like)\n",
1122
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1122 1123
             else
1123
-                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1124
+                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (YV12 like)\n",
1125
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1124 1126
         } else {
1125 1127
             if (c->vLumFilterSize==1 && c->vChrFilterSize==2)
1126 1128
                 av_log(c, AV_LOG_VERBOSE, "using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
1127
-                       "      2-tap scaler for vertical chrominance scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1129
+                       "      2-tap scaler for vertical chrominance scaling (BGR)\n",
1130
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1128 1131
             else if (c->vLumFilterSize==2 && c->vChrFilterSize==2)
1129
-                av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1132
+                av_log(c, AV_LOG_VERBOSE, "using 2-tap linear %s scaler for vertical scaling (BGR)\n",
1133
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1130 1134
             else
1131
-                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1135
+                av_log(c, AV_LOG_VERBOSE, "using n-tap %s scaler for vertical scaling (BGR)\n",
1136
+                       (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1132 1137
         }
1133 1138
 
1134 1139
         if (dstFormat==PIX_FMT_BGR24)
1135 1140
             av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR24 converter\n",
1136
-                   (flags & SWS_CPU_CAPS_MMX2) ? "MMX2" : ((flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C"));
1141
+                   (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) ? "MMX2" :
1142
+                   ((HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C"));
1137 1143
         else if (dstFormat==PIX_FMT_RGB32)
1138
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1144
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR32 converter\n",
1145
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1139 1146
         else if (dstFormat==PIX_FMT_BGR565)
1140
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1147
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR16 converter\n",
1148
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1141 1149
         else if (dstFormat==PIX_FMT_BGR555)
1142
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1150
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR15 converter\n",
1151
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1143 1152
         else if (dstFormat == PIX_FMT_RGB444BE || dstFormat == PIX_FMT_RGB444LE ||
1144 1153
                  dstFormat == PIX_FMT_BGR444BE || dstFormat == PIX_FMT_BGR444LE)
1145
-            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR12 converter\n", (flags & SWS_CPU_CAPS_MMX) ? "MMX" : "C");
1154
+            av_log(c, AV_LOG_VERBOSE, "using %s YV12->BGR12 converter\n",
1155
+                   (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? "MMX" : "C");
1146 1156
 
1147 1157
         av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
1148 1158
         av_log(c, AV_LOG_DEBUG, "lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
... ...
@@ -1527,7 +1507,7 @@ void sws_freeContext(SwsContext *c)
1527 1527
     av_freep(&c->hLumFilterPos);
1528 1528
     av_freep(&c->hChrFilterPos);
1529 1529
 
1530
-#if ARCH_X86
1530
+#if HAVE_MMX
1531 1531
 #ifdef MAP_ANONYMOUS
1532 1532
     if (c->lumMmx2FilterCode) munmap(c->lumMmx2FilterCode, c->lumMmx2FilterCodeSize);
1533 1533
     if (c->chrMmx2FilterCode) munmap(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize);
... ...
@@ -1540,7 +1520,7 @@ void sws_freeContext(SwsContext *c)
1540 1540
 #endif
1541 1541
     c->lumMmx2FilterCode=NULL;
1542 1542
     c->chrMmx2FilterCode=NULL;
1543
-#endif /* ARCH_X86 */
1543
+#endif /* HAVE_MMX */
1544 1544
 
1545 1545
     av_freep(&c->yuvTable);
1546 1546
 
... ...
@@ -1557,8 +1537,6 @@ struct SwsContext *sws_getCachedContext(struct SwsContext *context,
1557 1557
     if (!param)
1558 1558
         param = default_param;
1559 1559
 
1560
-    flags = update_flags_cpu(flags);
1561
-
1562 1560
     if (context &&
1563 1561
         (context->srcW      != srcW      ||
1564 1562
          context->srcH      != srcH      ||
... ...
@@ -27,6 +27,7 @@
27 27
 
28 28
 #include "config.h"
29 29
 #include "libavutil/x86_cpu.h"
30
+#include "libavutil/cpu.h"
30 31
 #include "libavutil/bswap.h"
31 32
 #include "libswscale/rgb2rgb.h"
32 33
 #include "libswscale/swscale.h"
... ...
@@ -122,16 +123,16 @@ DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
122 122
  32-bit C version, and and&add trick by Michael Niedermayer
123 123
 */
124 124
 
125
-void rgb2rgb_init_x86(int flags)
125
+void rgb2rgb_init_x86(void)
126 126
 {
127
-#if HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX
128
-    if (flags & SWS_CPU_CAPS_SSE2)
129
-        rgb2rgb_init_SSE2();
130
-    else if (flags & SWS_CPU_CAPS_MMX2)
131
-        rgb2rgb_init_MMX2();
132
-    else if (flags & SWS_CPU_CAPS_3DNOW)
133
-        rgb2rgb_init_3DNOW();
134
-    else if (flags & SWS_CPU_CAPS_MMX)
127
+    int cpu_flags = av_get_cpu_flags();
128
+
129
+    if (HAVE_MMX      && cpu_flags & AV_CPU_FLAG_MMX)
135 130
         rgb2rgb_init_MMX();
136
-#endif /* HAVE_MMX2 || HAVE_AMD3DNOW || HAVE_MMX */
131
+    if (HAVE_AMD3DNOW && cpu_flags & AV_CPU_FLAG_3DNOW)
132
+        rgb2rgb_init_3DNOW();
133
+    if (HAVE_MMX2     && cpu_flags & AV_CPU_FLAG_MMX2)
134
+        rgb2rgb_init_MMX2();
135
+    if (HAVE_SSE      && cpu_flags & AV_CPU_FLAG_SSE2)
136
+        rgb2rgb_init_SSE2();
137 137
 }
... ...
@@ -22,24 +22,15 @@
22 22
 
23 23
 #undef REAL_MOVNTQ
24 24
 #undef MOVNTQ
25
-#undef PAVGB
26 25
 #undef PREFETCH
27 26
 
28
-#if COMPILE_TEMPLATE_AMD3DNOW
29
-#define PREFETCH  "prefetch"
30
-#elif COMPILE_TEMPLATE_MMX2
27
+#if COMPILE_TEMPLATE_MMX2
31 28
 #define PREFETCH "prefetchnta"
32 29
 #else
33 30
 #define PREFETCH  " # nop"
34 31
 #endif
35 32
 
36 33
 #if COMPILE_TEMPLATE_MMX2
37
-#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
38
-#elif COMPILE_TEMPLATE_AMD3DNOW
39
-#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
40
-#endif
41
-
42
-#if COMPILE_TEMPLATE_MMX2
43 34
 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
44 35
 #else
45 36
 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
... ...
@@ -709,62 +700,6 @@
709 709
     " jb             1b             \n\t"
710 710
 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
711 711
 
712
-#define WRITEBGR24OLD(dst, dstw, index) \
713
-    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
714
-    "movq      %%mm2, %%mm1             \n\t" /* B */\
715
-    "movq      %%mm5, %%mm6             \n\t" /* R */\
716
-    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
717
-    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
718
-    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
719
-    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
720
-    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
721
-    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
722
-    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
723
-    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
724
-    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
725
-    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
726
-\
727
-    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
728
-    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
729
-    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
730
-    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
731
-    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
732
-    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
733
-    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
734
-    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
735
-\
736
-    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
737
-    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
738
-    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
739
-    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
740
-    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
741
-    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
742
-    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
743
-    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
744
-    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
745
-    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
746
-    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
747
-    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
748
-    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
749
-\
750
-    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
751
-    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
752
-    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
753
-    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
754
-    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
755
-    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
756
-    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
757
-    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
758
-\
759
-    MOVNTQ(%%mm0,   (dst))\
760
-    MOVNTQ(%%mm2,  8(dst))\
761
-    MOVNTQ(%%mm3, 16(dst))\
762
-    "add         $24, "#dst"            \n\t"\
763
-\
764
-    "add          $8, "#index"          \n\t"\
765
-    "cmp     "#dstw", "#index"          \n\t"\
766
-    " jb          1b                    \n\t"
767
-
768 712
 #define WRITEBGR24MMX(dst, dstw, index) \
769 713
     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
770 714
     "movq      %%mm2, %%mm1     \n\t" /* B */\
... ...
@@ -896,7 +831,6 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con
896 896
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
897 897
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
898 898
 {
899
-    if(!(c->flags & SWS_BITEXACT)) {
900 899
         if (c->flags & SWS_ACCURATE_RND) {
901 900
             if (uDest) {
902 901
                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
... ...
@@ -918,27 +852,11 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con
918 918
 
919 919
             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
920 920
         }
921
-        return;
922
-    }
923
-    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
924
-                chrFilter, chrSrc, chrFilterSize,
925
-                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
926
-}
927
-
928
-static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
929
-                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
930
-                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
931
-{
932
-    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
933
-                 chrFilter, chrSrc, chrFilterSize,
934
-                 dest, uDest, dstW, chrDstW, dstFormat);
935 921
 }
936 922
 
937 923
 static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
938 924
                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
939 925
 {
940
-    int i;
941
-    if(!(c->flags & SWS_BITEXACT)) {
942 926
         long p= 4;
943 927
         const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
944 928
         uint8_t *dst[4]= {aDest, dest, uDest, vDest};
... ...
@@ -967,40 +885,6 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const
967 967
                 }
968 968
             }
969 969
         }
970
-        return;
971
-    }
972
-    for (i=0; i<dstW; i++) {
973
-        int val= (lumSrc[i]+64)>>7;
974
-
975
-        if (val&256) {
976
-            if (val<0) val=0;
977
-            else       val=255;
978
-        }
979
-
980
-        dest[i]= val;
981
-    }
982
-
983
-    if (uDest)
984
-        for (i=0; i<chrDstW; i++) {
985
-            int u=(chrSrc[i       ]+64)>>7;
986
-            int v=(chrSrc[i + VOFW]+64)>>7;
987
-
988
-            if ((u|v)&256) {
989
-                if (u<0)        u=0;
990
-                else if (u>255) u=255;
991
-                if (v<0)        v=0;
992
-                else if (v>255) v=255;
993
-            }
994
-
995
-            uDest[i]= u;
996
-            vDest[i]= v;
997
-        }
998
-
999
-    if (CONFIG_SWSCALE_ALPHA && aDest)
1000
-        for (i=0; i<dstW; i++) {
1001
-            int val= (alpSrc[i]+64)>>7;
1002
-            aDest[i]= av_clip_uint8(val);
1003
-        }
1004 970
 }
1005 971
 
1006 972
 
... ...
@@ -1013,7 +897,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
1013 1013
 {
1014 1014
     x86_reg dummy=0;
1015 1015
     x86_reg dstW_reg = dstW;
1016
-    if(!(c->flags & SWS_BITEXACT)) {
1016
+
1017 1017
         if (c->flags & SWS_ACCURATE_RND) {
1018 1018
             switch(c->dstFormat) {
1019 1019
             case PIX_FMT_RGB32:
... ...
@@ -1170,7 +1054,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
1170 1170
                 return;
1171 1171
             }
1172 1172
         }
1173
-    }
1173
+
1174 1174
     yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1175 1175
                    chrFilter, chrSrc, chrFilterSize,
1176 1176
                    alpSrc, dest, dstW, dstY);
... ...
@@ -1182,11 +1066,6 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter,
1182 1182
 static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1183 1183
                           const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1184 1184
 {
1185
-    int  yalpha1=4095- yalpha;
1186
-    int uvalpha1=4095-uvalpha;
1187
-    int i;
1188
-
1189
-    if(!(c->flags & SWS_BITEXACT)) {
1190 1185
         switch(c->dstFormat) {
1191 1186
         //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1192 1187
         case PIX_FMT_RGB32:
... ...
@@ -1317,10 +1196,10 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
1317 1317
                 "a" (&c->redDither)
1318 1318
             );
1319 1319
             return;
1320
-        default: break;
1321 1320
         }
1322
-    }
1323
-    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1321
+
1322
+    yuv2packed2_c(c, buf0, buf1, uvbuf0, uvbuf1, abuf0, abuf1,
1323
+                  dest, dstW, yalpha, uvalpha, y);
1324 1324
 }
1325 1325
 
1326 1326
 /**
... ...
@@ -1329,18 +1208,13 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons
1329 1329
 static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1330 1330
                           const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1331 1331
 {
1332
-    const int yalpha1=0;
1333
-    int i;
1334
-
1335
-    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1336
-    const int yalpha= 4096; //FIXME ...
1332
+        const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1337 1333
 
1338
-    if (flags&SWS_FULL_CHR_H_INT) {
1339
-        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1340
-        return;
1341
-    }
1334
+        if (flags&SWS_FULL_CHR_H_INT) {
1335
+            c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1336
+            return;
1337
+        }
1342 1338
 
1343
-    if(!(flags & SWS_BITEXACT)) {
1344 1339
         if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1345 1340
             switch(dstFormat) {
1346 1341
             case PIX_FMT_RGB32:
... ...
@@ -1554,12 +1428,9 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons
1554 1554
                 return;
1555 1555
             }
1556 1556
         }
1557
-    }
1558
-    if (uvalpha < 2048) {
1559
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1560
-    } else {
1561
-        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1562
-    }
1557
+
1558
+    yuv2packed1_c(c, buf0, uvbuf0, uvbuf1, abuf0, dest,
1559
+                  dstW, uvalpha, dstFormat, flags, y);
1563 1560
 }
1564 1561
 
1565 1562
 //FIXME yuy2* can read up to 7 samples too much
... ...
@@ -1866,20 +1737,6 @@ static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
1866 1866
     assert(src1 == src2);
1867 1867
 }
1868 1868
 
1869
-static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1870
-{
1871
-    int i;
1872
-    for (i=0; i<width; i++) {
1873
-        int b= src1[6*i + 0] + src1[6*i + 3];
1874
-        int g= src1[6*i + 1] + src1[6*i + 4];
1875
-        int r= src1[6*i + 2] + src1[6*i + 5];
1876
-
1877
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1878
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1879
-    }
1880
-    assert(src1 == src2);
1881
-}
1882
-
1883 1869
 static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1884 1870
 {
1885 1871
     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
... ...
@@ -1891,20 +1748,6 @@ static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t
1891 1891
     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1892 1892
 }
1893 1893
 
1894
-static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1895
-{
1896
-    int i;
1897
-    assert(src1==src2);
1898
-    for (i=0; i<width; i++) {
1899
-        int r= src1[6*i + 0] + src1[6*i + 3];
1900
-        int g= src1[6*i + 1] + src1[6*i + 4];
1901
-        int b= src1[6*i + 2] + src1[6*i + 5];
1902
-
1903
-        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1904
-        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1905
-    }
1906
-}
1907
-
1908 1894
 
1909 1895
 // bilinear / bicubic scaling
1910 1896
 static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
... ...
@@ -2061,50 +1904,168 @@ static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, in
2061 2061
     }
2062 2062
 }
2063 2063
 
2064
-//FIXME all pal and rgb srcFormats could do this convertion as well
2065
-//FIXME all scalers more complex than bilinear could do half of this transform
2066
-static void RENAME(chrRangeToJpeg)(int16_t *dst, int width)
2064
+static inline void RENAME(hScale16)(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
2065
+                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
2067 2066
 {
2068
-    int i;
2069
-    for (i = 0; i < width; i++) {
2070
-        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2071
-        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2072
-    }
2073
-}
2074
-static void RENAME(chrRangeFromJpeg)(int16_t *dst, int width)
2075
-{
2076
-    int i;
2077
-    for (i = 0; i < width; i++) {
2078
-        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2079
-        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2067
+    int i, j;
2068
+
2069
+    assert(filterSize % 4 == 0 && filterSize>0);
2070
+    if (filterSize==4 && shift<15) { // Always true for upscaling, sometimes for down, too.
2071
+        x86_reg counter= -2*dstW;
2072
+        filter-= counter*2;
2073
+        filterPos-= counter/2;
2074
+        dst-= counter/2;
2075
+        __asm__ volatile(
2076
+            "movd                   %5, %%mm7       \n\t"
2077
+#if defined(PIC)
2078
+            "push            %%"REG_b"              \n\t"
2079
+#endif
2080
+            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2081
+            "mov             %%"REG_a", %%"REG_BP"  \n\t"
2082
+            ".p2align                4              \n\t"
2083
+            "1:                                     \n\t"
2084
+            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2085
+            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2086
+            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2087
+            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2088
+            "movq      (%3, %%"REG_a", 2), %%mm0    \n\t"
2089
+            "movq      (%3, %%"REG_b", 2), %%mm2    \n\t"
2090
+            "pmaddwd             %%mm1, %%mm0       \n\t"
2091
+            "pmaddwd             %%mm2, %%mm3       \n\t"
2092
+            "movq                %%mm0, %%mm4       \n\t"
2093
+            "punpckldq           %%mm3, %%mm0       \n\t"
2094
+            "punpckhdq           %%mm3, %%mm4       \n\t"
2095
+            "paddd               %%mm4, %%mm0       \n\t"
2096
+            "psrad               %%mm7, %%mm0       \n\t"
2097
+            "packssdw            %%mm0, %%mm0       \n\t"
2098
+            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2099
+            "add                    $4, %%"REG_BP"  \n\t"
2100
+            " jnc                   1b              \n\t"
2101
+
2102
+            "pop            %%"REG_BP"              \n\t"
2103
+#if defined(PIC)
2104
+            "pop             %%"REG_b"              \n\t"
2105
+#endif
2106
+            : "+a" (counter)
2107
+            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
2108
+#if !defined(PIC)
2109
+            : "%"REG_b
2110
+#endif
2111
+        );
2112
+    } else if (filterSize==8 && shift<15) {
2113
+        x86_reg counter= -2*dstW;
2114
+        filter-= counter*4;
2115
+        filterPos-= counter/2;
2116
+        dst-= counter/2;
2117
+        __asm__ volatile(
2118
+            "movd                   %5, %%mm7       \n\t"
2119
+#if defined(PIC)
2120
+            "push            %%"REG_b"              \n\t"
2121
+#endif
2122
+            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2123
+            "mov              %%"REG_a", %%"REG_BP" \n\t"
2124
+            ".p2align                 4             \n\t"
2125
+            "1:                                     \n\t"
2126
+            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2127
+            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2128
+            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2129
+            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2130
+            "movq       (%3, %%"REG_a", 2), %%mm0   \n\t"
2131
+            "movq       (%3, %%"REG_b", 2), %%mm2   \n\t"
2132
+            "pmaddwd              %%mm1, %%mm0      \n\t"
2133
+            "pmaddwd              %%mm2, %%mm3      \n\t"
2134
+
2135
+            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2136
+            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2137
+            "movq      8(%3, %%"REG_a", 2), %%mm4   \n\t"
2138
+            "movq      8(%3, %%"REG_b", 2), %%mm2   \n\t"
2139
+            "pmaddwd              %%mm1, %%mm4      \n\t"
2140
+            "pmaddwd              %%mm2, %%mm5      \n\t"
2141
+            "paddd                %%mm4, %%mm0      \n\t"
2142
+            "paddd                %%mm5, %%mm3      \n\t"
2143
+            "movq                 %%mm0, %%mm4      \n\t"
2144
+            "punpckldq            %%mm3, %%mm0      \n\t"
2145
+            "punpckhdq            %%mm3, %%mm4      \n\t"
2146
+            "paddd                %%mm4, %%mm0      \n\t"
2147
+            "psrad                %%mm7, %%mm0      \n\t"
2148
+            "packssdw             %%mm0, %%mm0      \n\t"
2149
+            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2150
+            "add                     $4, %%"REG_BP" \n\t"
2151
+            " jnc                    1b             \n\t"
2152
+
2153
+            "pop             %%"REG_BP"             \n\t"
2154
+#if defined(PIC)
2155
+            "pop             %%"REG_b"              \n\t"
2156
+#endif
2157
+            : "+a" (counter)
2158
+            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst), "m"(shift)
2159
+#if !defined(PIC)
2160
+            : "%"REG_b
2161
+#endif
2162
+        );
2163
+    } else if (shift<15){
2164
+        const uint16_t *offset = src+filterSize;
2165
+        x86_reg counter= -2*dstW;
2166
+        //filter-= counter*filterSize/2;
2167
+        filterPos-= counter/2;
2168
+        dst-= counter/2;
2169
+        __asm__ volatile(
2170
+            "movd                   %7, %%mm7       \n\t"
2171
+            ".p2align                  4            \n\t"
2172
+            "1:                                     \n\t"
2173
+            "mov                      %2, %%"REG_c" \n\t"
2174
+            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2175
+            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2176
+            "mov                      %5, %%"REG_c" \n\t"
2177
+            "pxor                  %%mm4, %%mm4     \n\t"
2178
+            "pxor                  %%mm5, %%mm5     \n\t"
2179
+            "2:                                     \n\t"
2180
+            "movq                   (%1), %%mm1     \n\t"
2181
+            "movq               (%1, %6), %%mm3     \n\t"
2182
+            "movq (%%"REG_c", %%"REG_a", 2), %%mm0     \n\t"
2183
+            "movq (%%"REG_c", %%"REG_d", 2), %%mm2     \n\t"
2184
+            "pmaddwd               %%mm1, %%mm0     \n\t"
2185
+            "pmaddwd               %%mm2, %%mm3     \n\t"
2186
+            "paddd                 %%mm3, %%mm5     \n\t"
2187
+            "paddd                 %%mm0, %%mm4     \n\t"
2188
+            "add                      $8, %1        \n\t"
2189
+            "add                      $8, %%"REG_c" \n\t"
2190
+            "cmp                      %4, %%"REG_c" \n\t"
2191
+            " jb                      2b            \n\t"
2192
+            "add                      %6, %1        \n\t"
2193
+            "movq                  %%mm4, %%mm0     \n\t"
2194
+            "punpckldq             %%mm5, %%mm4     \n\t"
2195
+            "punpckhdq             %%mm5, %%mm0     \n\t"
2196
+            "paddd                 %%mm0, %%mm4     \n\t"
2197
+            "psrad                 %%mm7, %%mm4     \n\t"
2198
+            "packssdw              %%mm4, %%mm4     \n\t"
2199
+            "mov                      %3, %%"REG_a" \n\t"
2200
+            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2201
+            "add                      $4, %0        \n\t"
2202
+            " jnc                     1b            \n\t"
2203
+
2204
+            : "+r" (counter), "+r" (filter)
2205
+            : "m" (filterPos), "m" (dst), "m"(offset),
2206
+            "m" (src), "r" ((x86_reg)filterSize*2), "m"(shift)
2207
+            : "%"REG_a, "%"REG_c, "%"REG_d
2208
+        );
2209
+    } else
2210
+    for (i=0; i<dstW; i++) {
2211
+        int srcPos= filterPos[i];
2212
+        int val=0;
2213
+        for (j=0; j<filterSize; j++) {
2214
+            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2215
+        }
2216
+        dst[i] = FFMIN(val>>shift, (1<<15)-1); // the cubic equation does overflow ...
2080 2217
     }
2081 2218
 }
2082
-static void RENAME(lumRangeToJpeg)(int16_t *dst, int width)
2083
-{
2084
-    int i;
2085
-    for (i = 0; i < width; i++)
2086
-        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2087
-}
2088
-static void RENAME(lumRangeFromJpeg)(int16_t *dst, int width)
2089
-{
2090
-    int i;
2091
-    for (i = 0; i < width; i++)
2092
-        dst[i] = (dst[i]*14071 + 33561947)>>14;
2093
-}
2094 2219
 
2095
-#define FAST_BILINEAR_X86 \
2096
-    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
2097
-    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
2098
-    "shll      $16, %%edi    \n\t"                                              \
2099
-    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
2100
-    "mov        %1, %%"REG_D"\n\t"                                              \
2101
-    "shrl       $9, %%esi    \n\t"                                              \
2102 2220
 
2221
+#if COMPILE_TEMPLATE_MMX2
2103 2222
 static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2104 2223
                                         long dstWidth, const uint8_t *src, int srcW,
2105 2224
                                         int xInc)
2106 2225
 {
2107
-#if COMPILE_TEMPLATE_MMX2
2108 2226
     int32_t *filterPos = c->hLumFilterPos;
2109 2227
     int16_t *filter    = c->hLumFilter;
2110 2228
     int     canMMX2BeUsed  = c->canMMX2BeUsed;
... ...
@@ -2113,7 +2074,7 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2113 2113
 #if defined(PIC)
2114 2114
     DECLARE_ALIGNED(8, uint64_t, ebxsave);
2115 2115
 #endif
2116
-    if (canMMX2BeUsed) {
2116
+
2117 2117
         __asm__ volatile(
2118 2118
 #if defined(PIC)
2119 2119
             "mov               %%"REG_b", %5        \n\t"
... ...
@@ -2172,80 +2133,12 @@ static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
2172 2172
 #endif
2173 2173
         );
2174 2174
         for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2175
-    } else {
2176
-#endif /* COMPILE_TEMPLATE_MMX2 */
2177
-    x86_reg xInc_shr16 = xInc >> 16;
2178
-    uint16_t xInc_mask = xInc & 0xffff;
2179
-    x86_reg dstWidth_reg = dstWidth;
2180
-    //NO MMX just normal asm ...
2181
-    __asm__ volatile(
2182
-        "xor %%"REG_a", %%"REG_a"            \n\t" // i
2183
-        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
2184
-        "xorl    %%ecx, %%ecx                \n\t" // xalpha
2185
-        ".p2align                4           \n\t"
2186
-        "1:                                  \n\t"
2187
-        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2188
-        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2189
-        FAST_BILINEAR_X86
2190
-        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2191
-        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
2192
-        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry
2193
-
2194
-        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2195
-        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2196
-        FAST_BILINEAR_X86
2197
-        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
2198
-        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
2199
-        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry
2200
-
2201
-
2202
-        "add        $2, %%"REG_a"            \n\t"
2203
-        "cmp        %2, %%"REG_a"            \n\t"
2204
-        " jb        1b                       \n\t"
2205
-
2206
-
2207
-        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
2208
-        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2209
-    );
2210
-#if COMPILE_TEMPLATE_MMX2
2211
-    } //if MMX2 can't be used
2212
-#endif
2213
-}
2214
-
2215
-      // *** horizontal scale Y line to temp buffer
2216
-static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2217
-                                   const int16_t *hLumFilter,
2218
-                                   const int16_t *hLumFilterPos, int hLumFilterSize,
2219
-                                   uint8_t *formatConvBuffer,
2220
-                                   uint32_t *pal, int isAlpha)
2221
-{
2222
-    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2223
-    void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2224
-
2225
-    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2226
-
2227
-    if (toYV12) {
2228
-        toYV12(formatConvBuffer, src, srcW, pal);
2229
-        src= formatConvBuffer;
2230
-    }
2231
-
2232
-    if (c->hScale16) {
2233
-        c->hScale16(dst, dstWidth, (uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
2234
-    } else if (!c->hyscale_fast) {
2235
-        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2236
-    } else { // fast bilinear upscale / crap downscale
2237
-        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2238
-    }
2239
-
2240
-    if (convertRange)
2241
-        convertRange(dst, dstWidth);
2242 2175
 }
2243 2176
 
2244 2177
 static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2245 2178
                                         long dstWidth, const uint8_t *src1,
2246 2179
                                         const uint8_t *src2, int srcW, int xInc)
2247 2180
 {
2248
-#if COMPILE_TEMPLATE_MMX2
2249 2181
     int32_t *filterPos = c->hChrFilterPos;
2250 2182
     int16_t *filter    = c->hChrFilter;
2251 2183
     int     canMMX2BeUsed  = c->canMMX2BeUsed;
... ...
@@ -2254,7 +2147,7 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2254 2254
 #if defined(PIC)
2255 2255
     DECLARE_ALIGNED(8, uint64_t, ebxsave);
2256 2256
 #endif
2257
-    if (canMMX2BeUsed) {
2257
+
2258 2258
         __asm__ volatile(
2259 2259
 #if defined(PIC)
2260 2260
             "mov          %%"REG_b", %6         \n\t"
... ...
@@ -2304,252 +2197,32 @@ static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
2304 2304
             dst[i] = src1[srcW-1]*128;
2305 2305
             dst[i+VOFW] = src2[srcW-1]*128;
2306 2306
         }
2307
-    } else {
2308
-#endif /* COMPILE_TEMPLATE_MMX2 */
2309
-        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2310
-        uint16_t xInc_mask = xInc & 0xffff;
2311
-        x86_reg dstWidth_reg = dstWidth;
2312
-        __asm__ volatile(
2313
-            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2314
-            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2315
-            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
2316
-            ".p2align    4                          \n\t"
2317
-            "1:                                     \n\t"
2318
-            "mov        %0, %%"REG_S"               \n\t"
2319
-            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2320
-            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2321
-            FAST_BILINEAR_X86
2322
-            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2323
-
2324
-            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2325
-            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2326
-            FAST_BILINEAR_X86
2327
-            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2328
-
2329
-            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
2330
-            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
2331
-            "add        $1, %%"REG_a"               \n\t"
2332
-            "cmp        %2, %%"REG_a"               \n\t"
2333
-            " jb        1b                          \n\t"
2334
-
2335
-/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2336
-which is needed to support GCC 4.0. */
2337
-#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2338
-            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2339
-#else
2340
-            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
2341
-#endif
2342
-            "r" (src2)
2343
-            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2344
-        );
2345
-#if COMPILE_TEMPLATE_MMX2
2346
-    } //if MMX2 can't be used
2347
-#endif
2348
-}
2349
-
2350
-inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2351
-                                   int srcW, int xInc, const int16_t *hChrFilter,
2352
-                                   const int16_t *hChrFilterPos, int hChrFilterSize,
2353
-                                   uint8_t *formatConvBuffer,
2354
-                                   uint32_t *pal)
2355
-{
2356
-
2357
-    src1 += c->chrSrcOffset;
2358
-    src2 += c->chrSrcOffset;
2359
-
2360
-    if (c->chrToYV12) {
2361
-        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2362
-        src1= formatConvBuffer;
2363
-        src2= formatConvBuffer+VOFW;
2364
-    }
2365
-
2366
-    if (c->hScale16) {
2367
-        c->hScale16(dst     , dstWidth, (uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
2368
-        c->hScale16(dst+VOFW, dstWidth, (uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1);
2369
-    } else if (!c->hcscale_fast) {
2370
-        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2371
-        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2372
-    } else { // fast bilinear upscale / crap downscale
2373
-        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2374
-    }
2375
-
2376
-    if (c->chrConvertRange)
2377
-        c->chrConvertRange(dst, dstWidth);
2378 2307
 }
2308
+#endif /* COMPILE_TEMPLATE_MMX2 */
2379 2309
 
2380
-#define DEBUG_SWSCALE_BUFFERS 0
2381
-#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2382
-
2383
-static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2384
-                           int srcSliceH, uint8_t* dst[], int dstStride[])
2310
+#if !COMPILE_TEMPLATE_MMX2
2311
+static void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
2312
+                                  int lastInLumBuf, int lastInChrBuf)
2385 2313
 {
2386
-    /* load a few things into local vars to make the code more readable? and faster */
2387
-    const int srcW= c->srcW;
2388
-    const int dstW= c->dstW;
2389 2314
     const int dstH= c->dstH;
2390
-    const int chrDstW= c->chrDstW;
2391
-    const int chrSrcW= c->chrSrcW;
2392
-    const int lumXInc= c->lumXInc;
2393
-    const int chrXInc= c->chrXInc;
2394
-    const enum PixelFormat dstFormat= c->dstFormat;
2395 2315
     const int flags= c->flags;
2316
+    int16_t **lumPixBuf= c->lumPixBuf;
2317
+    int16_t **chrPixBuf= c->chrPixBuf;
2318
+    int16_t **alpPixBuf= c->alpPixBuf;
2319
+    const int vLumBufSize= c->vLumBufSize;
2320
+    const int vChrBufSize= c->vChrBufSize;
2396 2321
     int16_t *vLumFilterPos= c->vLumFilterPos;
2397 2322
     int16_t *vChrFilterPos= c->vChrFilterPos;
2398
-    int16_t *hLumFilterPos= c->hLumFilterPos;
2399
-    int16_t *hChrFilterPos= c->hChrFilterPos;
2400 2323
     int16_t *vLumFilter= c->vLumFilter;
2401 2324
     int16_t *vChrFilter= c->vChrFilter;
2402
-    int16_t *hLumFilter= c->hLumFilter;
2403
-    int16_t *hChrFilter= c->hChrFilter;
2404 2325
     int32_t *lumMmxFilter= c->lumMmxFilter;
2405 2326
     int32_t *chrMmxFilter= c->chrMmxFilter;
2406 2327
     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2407 2328
     const int vLumFilterSize= c->vLumFilterSize;
2408 2329
     const int vChrFilterSize= c->vChrFilterSize;
2409
-    const int hLumFilterSize= c->hLumFilterSize;
2410
-    const int hChrFilterSize= c->hChrFilterSize;
2411
-    int16_t **lumPixBuf= c->lumPixBuf;
2412
-    int16_t **chrPixBuf= c->chrPixBuf;
2413
-    int16_t **alpPixBuf= c->alpPixBuf;
2414
-    const int vLumBufSize= c->vLumBufSize;
2415
-    const int vChrBufSize= c->vChrBufSize;
2416
-    uint8_t *formatConvBuffer= c->formatConvBuffer;
2417
-    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2418
-    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2419
-    int lastDstY;
2420
-    uint32_t *pal=c->pal_yuv;
2421
-
2422
-    /* vars which will change and which we need to store back in the context */
2423
-    int dstY= c->dstY;
2424
-    int lumBufIndex= c->lumBufIndex;
2425
-    int chrBufIndex= c->chrBufIndex;
2426
-    int lastInLumBuf= c->lastInLumBuf;
2427
-    int lastInChrBuf= c->lastInChrBuf;
2428
-
2429
-    if (isPacked(c->srcFormat)) {
2430
-        src[0]=
2431
-        src[1]=
2432
-        src[2]=
2433
-        src[3]= src[0];
2434
-        srcStride[0]=
2435
-        srcStride[1]=
2436
-        srcStride[2]=
2437
-        srcStride[3]= srcStride[0];
2438
-    }
2439
-    srcStride[1]<<= c->vChrDrop;
2440
-    srcStride[2]<<= c->vChrDrop;
2441
-
2442
-    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2443
-                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2444
-                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2445
-    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2446
-                   srcSliceY,    srcSliceH,    dstY,    dstH);
2447
-    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2448
-                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2449
-
2450
-    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2451
-        static int warnedAlready=0; //FIXME move this into the context perhaps
2452
-        if (flags & SWS_PRINT_INFO && !warnedAlready) {
2453
-            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2454
-                   "         ->cannot do aligned memory accesses anymore\n");
2455
-            warnedAlready=1;
2456
-        }
2457
-    }
2458
-
2459
-    /* Note the user might start scaling the picture in the middle so this
2460
-       will not get executed. This is not really intended but works
2461
-       currently, so people might do it. */
2462
-    if (srcSliceY ==0) {
2463
-        lumBufIndex=-1;
2464
-        chrBufIndex=-1;
2465
-        dstY=0;
2466
-        lastInLumBuf= -1;
2467
-        lastInChrBuf= -1;
2468
-    }
2469
-
2470
-    lastDstY= dstY;
2471
-
2472
-    for (;dstY < dstH; dstY++) {
2473
-        unsigned char *dest =dst[0]+dstStride[0]*dstY;
2474
-        const int chrDstY= dstY>>c->chrDstVSubSample;
2475
-        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2476
-        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2477
-        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2478
-
2479
-        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2480
-        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2481
-        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2482
-        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2483
-        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2484
-        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2485
-        int enough_lines;
2486
-
2487
-        //handle holes (FAST_BILINEAR & weird filters)
2488
-        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2489
-        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2490
-        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2491
-        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2492
-
2493
-        DEBUG_BUFFERS("dstY: %d\n", dstY);
2494
-        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2495
-                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2496
-        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2497
-                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2498
-
2499
-        // Do we have enough lines in this slice to output the dstY line
2500
-        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2501
-
2502
-        if (!enough_lines) {
2503
-            lastLumSrcY = srcSliceY + srcSliceH - 1;
2504
-            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2505
-            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2506
-                                            lastLumSrcY, lastChrSrcY);
2507
-        }
2508
-
2509
-        //Do horizontal scaling
2510
-        while(lastInLumBuf < lastLumSrcY) {
2511
-            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2512
-            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2513
-            lumBufIndex++;
2514
-            assert(lumBufIndex < 2*vLumBufSize);
2515
-            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2516
-            assert(lastInLumBuf + 1 - srcSliceY >= 0);
2517
-            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2518
-                            hLumFilter, hLumFilterPos, hLumFilterSize,
2519
-                            formatConvBuffer,
2520
-                            pal, 0);
2521
-            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2522
-                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2523
-                                hLumFilter, hLumFilterPos, hLumFilterSize,
2524
-                                formatConvBuffer,
2525
-                                pal, 1);
2526
-            lastInLumBuf++;
2527
-            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2528
-                               lumBufIndex,    lastInLumBuf);
2529
-        }
2530
-        while(lastInChrBuf < lastChrSrcY) {
2531
-            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2532
-            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2533
-            chrBufIndex++;
2534
-            assert(chrBufIndex < 2*vChrBufSize);
2535
-            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2536
-            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2537
-            //FIXME replace parameters through context struct (some at least)
2538
-
2539
-            if (c->needs_hcscale)
2540
-                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2541
-                                hChrFilter, hChrFilterPos, hChrFilterSize,
2542
-                                formatConvBuffer,
2543
-                                pal);
2544
-            lastInChrBuf++;
2545
-            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2546
-                               chrBufIndex,    lastInChrBuf);
2547
-        }
2548
-        //wrap buf index around to stay inside the ring buffer
2549
-        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2550
-        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2551
-        if (!enough_lines)
2552
-            break; //we can't output a dstY line so let's try with the next slice
2330
+    const int chrDstY= dstY>>c->chrDstVSubSample;
2331
+    const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2332
+    const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2553 2333
 
2554 2334
         c->blueDither= ff_dither8[dstY&1];
2555 2335
         if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
... ...
@@ -2557,7 +2230,7 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
2557 2557
         else
2558 2558
             c->greenDither= ff_dither4[dstY&1];
2559 2559
         c->redDither= ff_dither8[(dstY+1)&1];
2560
-        if (dstY < dstH-2) {
2560
+        if (dstY < dstH - 2) {
2561 2561
             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2562 2562
             const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2563 2563
             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
... ...
@@ -2606,183 +2279,52 @@ static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[],
2606 2606
                         ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2607 2607
                 }
2608 2608
             }
2609
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2610
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2611
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2612
-                c->yuv2nv12X(c,
2613
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2614
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2615
-                             dest, uDest, dstW, chrDstW, dstFormat);
2616
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2617
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2618
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2619
-                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
2620
-                    yuv2yuvX16inC(
2621
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2622
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2623
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2624
-                                  dstFormat);
2625
-                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2626
-                    const int16_t *lumBuf = lumSrcPtr[0];
2627
-                    const int16_t *chrBuf= chrSrcPtr[0];
2628
-                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2629
-                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2630
-                } else { //General YV12
2631
-                    c->yuv2yuvX(c,
2632
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2633
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2634
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2635
-                }
2636
-            } else {
2637
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2638
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2639
-                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2640
-                    int chrAlpha= vChrFilter[2*dstY+1];
2641
-                    if(flags & SWS_FULL_CHR_H_INT) {
2642
-                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2643
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2644
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2645
-                                         alpSrcPtr, dest, dstW, dstY);
2646
-                    } else {
2647
-                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2648
-                                       alpPixBuf ? *alpSrcPtr : NULL,
2649
-                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
2650
-                    }
2651
-                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2652
-                    int lumAlpha= vLumFilter[2*dstY+1];
2653
-                    int chrAlpha= vChrFilter[2*dstY+1];
2654
-                    lumMmxFilter[2]=
2655
-                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
2656
-                    chrMmxFilter[2]=
2657
-                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2658
-                    if(flags & SWS_FULL_CHR_H_INT) {
2659
-                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2660
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2661
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2662
-                                         alpSrcPtr, dest, dstW, dstY);
2663
-                    } else {
2664
-                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2665
-                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2666
-                                       dest, dstW, lumAlpha, chrAlpha, dstY);
2667
-                    }
2668
-                } else { //general RGB
2669
-                    if(flags & SWS_FULL_CHR_H_INT) {
2670
-                        yuv2rgbXinC_full(c,
2671
-                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2672
-                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2673
-                                         alpSrcPtr, dest, dstW, dstY);
2674
-                    } else {
2675
-                        c->yuv2packedX(c,
2676
-                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2677
-                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2678
-                                       alpSrcPtr, dest, dstW, dstY);
2679
-                    }
2680
-                }
2681
-            }
2682
-        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2683
-            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2684
-            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2685
-            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2686
-            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2687
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2688
-                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2689
-                yuv2nv12XinC(
2690
-                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2691
-                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2692
-                             dest, uDest, dstW, chrDstW, dstFormat);
2693
-            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2694
-                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2695
-                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2696
-                if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
2697
-                    yuv2yuvX16inC(
2698
-                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2699
-                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2700
-                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2701
-                                  dstFormat);
2702
-                } else {
2703
-                    yuv2yuvXinC(
2704
-                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2705
-                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2706
-                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2707
-                }
2708
-            } else {
2709
-                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2710
-                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2711
-                if(flags & SWS_FULL_CHR_H_INT) {
2712
-                    yuv2rgbXinC_full(c,
2713
-                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2714
-                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2715
-                                     alpSrcPtr, dest, dstW, dstY);
2716
-                } else {
2717
-                    yuv2packedXinC(c,
2718
-                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2719
-                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2720
-                                   alpSrcPtr, dest, dstW, dstY);
2721
-                }
2722
-            }
2723 2609
         }
2724
-    }
2725
-
2726
-    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2727
-        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2728
-
2729
-    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
2730
-    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2731
-    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
2732
-    else                             __asm__ volatile("emms"  :::"memory");
2733
-    /* store changed local vars back in the context */
2734
-    c->dstY= dstY;
2735
-    c->lumBufIndex= lumBufIndex;
2736
-    c->chrBufIndex= chrBufIndex;
2737
-    c->lastInLumBuf= lastInLumBuf;
2738
-    c->lastInChrBuf= lastInChrBuf;
2739
-
2740
-    return dstY - lastDstY;
2741 2610
 }
2611
+#endif /* !COMPILE_TEMPLATE_MMX2 */
2742 2612
 
2743 2613
 static void RENAME(sws_init_swScale)(SwsContext *c)
2744 2614
 {
2745 2615
     enum PixelFormat srcFormat = c->srcFormat;
2746 2616
 
2747
-    c->yuv2nv12X    = RENAME(yuv2nv12X   );
2748
-    c->yuv2yuv1     = RENAME(yuv2yuv1    );
2749
-    c->yuv2yuvX     = RENAME(yuv2yuvX    );
2750
-    c->yuv2packed1  = RENAME(yuv2packed1 );
2751
-    c->yuv2packed2  = RENAME(yuv2packed2 );
2752
-    c->yuv2packedX  = RENAME(yuv2packedX );
2617
+    if (!(c->flags & SWS_BITEXACT)) {
2618
+        c->yuv2yuv1     = RENAME(yuv2yuv1    );
2619
+        c->yuv2yuvX     = RENAME(yuv2yuvX    );
2620
+        c->yuv2packed1  = RENAME(yuv2packed1 );
2621
+        c->yuv2packed2  = RENAME(yuv2packed2 );
2622
+        c->yuv2packedX  = RENAME(yuv2packedX );
2623
+    }
2753 2624
 
2754 2625
     c->hScale       = RENAME(hScale      );
2755 2626
 
2756 2627
     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2628
+#if COMPILE_TEMPLATE_MMX2
2757 2629
     if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
2758 2630
     {
2759 2631
         c->hyscale_fast = RENAME(hyscale_fast);
2760 2632
         c->hcscale_fast = RENAME(hcscale_fast);
2761 2633
     } else {
2634
+#endif /* COMPILE_TEMPLATE_MMX2 */
2762 2635
         c->hyscale_fast = NULL;
2763 2636
         c->hcscale_fast = NULL;
2637
+#if COMPILE_TEMPLATE_MMX2
2764 2638
     }
2639
+#endif /* COMPILE_TEMPLATE_MMX2 */
2765 2640
 
2766
-    switch(srcFormat) {
2641
+     switch(srcFormat) {
2767 2642
         case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
2768 2643
         case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
2769 2644
         case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
2770 2645
         case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
2771
-        case PIX_FMT_YUV420P16BE:
2772
-        case PIX_FMT_YUV422P16BE:
2773
-        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
2646
+        case PIX_FMT_GRAY16LE :
2647
+        case PIX_FMT_YUV420P9LE:
2648
+        case PIX_FMT_YUV422P10LE:
2649
+        case PIX_FMT_YUV420P10LE:
2774 2650
         case PIX_FMT_YUV420P16LE:
2775 2651
         case PIX_FMT_YUV422P16LE:
2776
-        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
2777
-        default: break;
2778
-    }
2779
-    if (c->chrSrcHSubSample) {
2780
-        switch(srcFormat) {
2781
-        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
2782
-        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
2783
-        default: break;
2784
-        }
2785
-    } else {
2652
+        case PIX_FMT_YUV444P16LE: c->hScale16= RENAME(hScale16); break;
2653
+    }   
2654
+    if (!c->chrSrcHSubSample) {
2786 2655
         switch(srcFormat) {
2787 2656
         case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
2788 2657
         case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
... ...
@@ -2792,16 +2334,10 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
2792 2792
 
2793 2793
     switch (srcFormat) {
2794 2794
     case PIX_FMT_YUYV422  :
2795
-    case PIX_FMT_YUV420P16BE:
2796
-    case PIX_FMT_YUV422P16BE:
2797
-    case PIX_FMT_YUV444P16BE:
2798 2795
     case PIX_FMT_Y400A    :
2799
-    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
2796
+                            c->lumToYV12 = RENAME(yuy2ToY); break;
2800 2797
     case PIX_FMT_UYVY422  :
2801
-    case PIX_FMT_YUV420P16LE:
2802
-    case PIX_FMT_YUV422P16LE:
2803
-    case PIX_FMT_YUV444P16LE:
2804
-    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
2798
+                            c->lumToYV12 = RENAME(uyvyToY); break;
2805 2799
     case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
2806 2800
     case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
2807 2801
     default: break;
... ...
@@ -2812,14 +2348,4 @@ static void RENAME(sws_init_swScale)(SwsContext *c)
2812 2812
         default: break;
2813 2813
         }
2814 2814
     }
2815
-
2816
-    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2817
-        if (c->srcRange) {
2818
-            c->lumConvertRange = RENAME(lumRangeFromJpeg);
2819
-            c->chrConvertRange = RENAME(chrRangeFromJpeg);
2820
-        } else {
2821
-            c->lumConvertRange = RENAME(lumRangeToJpeg);
2822
-            c->chrConvertRange = RENAME(chrRangeToJpeg);
2823
-        }
2824
-    }
2825 2815
 }
... ...
@@ -34,6 +34,7 @@
34 34
 #include "libswscale/swscale.h"
35 35
 #include "libswscale/swscale_internal.h"
36 36
 #include "libavutil/x86_cpu.h"
37
+#include "libavutil/cpu.h"
37 38
 
38 39
 #define DITHER1XBPP // only for MMX
39 40
 
... ...
@@ -46,57 +47,58 @@ DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
46 46
 DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
47 47
 
48 48
 //MMX versions
49
+#if HAVE_MMX
49 50
 #undef RENAME
50
-#undef HAVE_MMX2
51
-#undef HAVE_AMD3DNOW
52
-#define HAVE_MMX2 0
53
-#define HAVE_AMD3DNOW 0
51
+#undef COMPILE_TEMPLATE_MMX2
52
+#define COMPILE_TEMPLATE_MMX2 0
54 53
 #define RENAME(a) a ## _MMX
55 54
 #include "yuv2rgb_template.c"
55
+#endif /* HAVE_MMX */
56 56
 
57 57
 //MMX2 versions
58
+#if HAVE_MMX2
58 59
 #undef RENAME
59
-#undef HAVE_MMX2
60
-#define HAVE_MMX2 1
60
+#undef COMPILE_TEMPLATE_MMX2
61
+#define COMPILE_TEMPLATE_MMX2 1
61 62
 #define RENAME(a) a ## _MMX2
62 63
 #include "yuv2rgb_template.c"
64
+#endif /* HAVE_MMX2 */
63 65
 
64 66
 SwsFunc ff_yuv2rgb_init_mmx(SwsContext *c)
65 67
 {
66
-    if (c->flags & SWS_CPU_CAPS_MMX2) {
68
+    int cpu_flags = av_get_cpu_flags();
69
+
70
+    if (c->srcFormat != PIX_FMT_YUV420P &&
71
+        c->srcFormat != PIX_FMT_YUVA420P)
72
+        return NULL;
73
+
74
+    if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2) {
67 75
         switch (c->dstFormat) {
68
-        case PIX_FMT_RGB32:
69
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
70
-                if (HAVE_7REGS) return yuva420_rgb32_MMX2;
71
-                break;
72
-            } else return yuv420_rgb32_MMX2;
73
-        case PIX_FMT_BGR32:
74
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
75
-                if (HAVE_7REGS) return yuva420_bgr32_MMX2;
76
-                break;
77
-            } else return yuv420_bgr32_MMX2;
78 76
         case PIX_FMT_RGB24:  return yuv420_rgb24_MMX2;
79 77
         case PIX_FMT_BGR24:  return yuv420_bgr24_MMX2;
80
-        case PIX_FMT_RGB565: return yuv420_rgb16_MMX2;
81
-        case PIX_FMT_RGB555: return yuv420_rgb15_MMX2;
82 78
         }
83 79
     }
84
-    if (c->flags & SWS_CPU_CAPS_MMX) {
80
+
81
+    if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
85 82
         switch (c->dstFormat) {
86
-        case PIX_FMT_RGB32:
87
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
88
-                if (HAVE_7REGS) return yuva420_rgb32_MMX;
89
-                break;
90
-            } else return yuv420_rgb32_MMX;
91
-        case PIX_FMT_BGR32:
92
-            if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
93
-                if (HAVE_7REGS) return yuva420_bgr32_MMX;
94
-                break;
95
-            } else return yuv420_bgr32_MMX;
96
-        case PIX_FMT_RGB24:  return yuv420_rgb24_MMX;
97
-        case PIX_FMT_BGR24:  return yuv420_bgr24_MMX;
98
-        case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
99
-        case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
83
+            case PIX_FMT_RGB32:
84
+                if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
85
+#if HAVE_7REGS
86
+                    return yuva420_rgb32_MMX;
87
+#endif
88
+                    break;
89
+                } else return yuv420_rgb32_MMX;
90
+            case PIX_FMT_BGR32:
91
+                if (CONFIG_SWSCALE_ALPHA && c->srcFormat == PIX_FMT_YUVA420P) {
92
+#if HAVE_7REGS
93
+                    return yuva420_bgr32_MMX;
94
+#endif
95
+                    break;
96
+                } else return yuv420_bgr32_MMX;
97
+            case PIX_FMT_RGB24:  return yuv420_rgb24_MMX;
98
+            case PIX_FMT_BGR24:  return yuv420_bgr24_MMX;
99
+            case PIX_FMT_RGB565: return yuv420_rgb16_MMX;
100
+            case PIX_FMT_RGB555: return yuv420_rgb15_MMX;
100 101
         }
101 102
     }
102 103
 
... ...
@@ -25,14 +25,7 @@
25 25
 #undef EMMS
26 26
 #undef SFENCE
27 27
 
28
-#if HAVE_AMD3DNOW
29
-/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
30
-#define EMMS   "femms"
31
-#else
32
-#define EMMS   "emms"
33
-#endif
34
-
35
-#if HAVE_MMX2
28
+#if COMPILE_TEMPLATE_MMX2
36 29
 #define MOVNTQ "movntq"
37 30
 #define SFENCE "sfence"
38 31
 #else
... ...
@@ -159,7 +152,8 @@
159 159
     }                                                             \
160 160
 
161 161
 #define YUV2RGB_ENDFUNC                          \
162
-    __asm__ volatile (SFENCE"\n\t"EMMS);         \
162
+    __asm__ volatile (SFENCE"\n\t"               \
163
+                    "emms    \n\t");             \
163 164
     return srcSliceH;                            \
164 165
 
165 166
 #define IF0(x)
... ...
@@ -188,6 +182,7 @@
188 188
     "paddusb "GREEN_DITHER"(%4), %%mm2\n\t"      \
189 189
     "paddusb "RED_DITHER"(%4),   %%mm1\n\t"      \
190 190
 
191
+#if !COMPILE_TEMPLATE_MMX2
191 192
 static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
192 193
                                        int srcStride[],
193 194
                                        int srcSliceY, int srcSliceH,
... ...
@@ -243,6 +238,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
243 243
     YUV2RGB_OPERANDS
244 244
     YUV2RGB_ENDFUNC
245 245
 }
246
+#endif /* !COMPILE_TEMPLATE_MMX2 */
246 247
 
247 248
 #define RGB_PACK24(blue, red)\
248 249
     "packuswb  %%mm3,      %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
... ...
@@ -259,7 +255,7 @@ static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
259 259
     "punpckhwd %%mm6,      %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
260 260
     RGB_PACK24_B
261 261
 
262
-#if HAVE_MMX2
262
+#if COMPILE_TEMPLATE_MMX2
263 263
 DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
264 264
 DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
265 265
 DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
... ...
@@ -366,6 +362,7 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
366 366
     MOVNTQ "   %%mm5,       16(%1)\n\t"      \
367 367
     MOVNTQ "   %%mm"alpha", 24(%1)\n\t"      \
368 368
 
369
+#if !COMPILE_TEMPLATE_MMX2
369 370
 static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
370 371
                                        int srcStride[],
371 372
                                        int srcSliceY, int srcSliceH,
... ...
@@ -386,12 +383,12 @@ static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
386 386
     YUV2RGB_ENDFUNC
387 387
 }
388 388
 
389
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
389 390
 static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
390 391
                                         int srcStride[],
391 392
                                         int srcSliceY, int srcSliceH,
392 393
                                         uint8_t *dst[], int dstStride[])
393 394
 {
394
-#if HAVE_7REGS
395 395
     int y, h_size;
396 396
 
397 397
     YUV2RGB_LOOP(4)
... ...
@@ -406,9 +403,8 @@ static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
406 406
     YUV2RGB_ENDLOOP(4)
407 407
     YUV2RGB_OPERANDS_ALPHA
408 408
     YUV2RGB_ENDFUNC
409
-#endif
410
-    return 0;
411 409
 }
410
+#endif
412 411
 
413 412
 static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
414 413
                                        int srcStride[],
... ...
@@ -430,12 +426,12 @@ static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
430 430
     YUV2RGB_ENDFUNC
431 431
 }
432 432
 
433
+#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
433 434
 static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
434 435
                                         int srcStride[],
435 436
                                         int srcSliceY, int srcSliceH,
436 437
                                         uint8_t *dst[], int dstStride[])
437 438
 {
438
-#if HAVE_7REGS
439 439
     int y, h_size;
440 440
 
441 441
     YUV2RGB_LOOP(4)
... ...
@@ -450,6 +446,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
450 450
     YUV2RGB_ENDLOOP(4)
451 451
     YUV2RGB_OPERANDS_ALPHA
452 452
     YUV2RGB_ENDFUNC
453
-#endif
454
-    return 0;
455 453
 }
454
+#endif
455
+
456
+#endif /* !COMPILE_TEMPLATE_MMX2 */
... ...
@@ -32,7 +32,7 @@
32 32
 #include "rgb2rgb.h"
33 33
 #include "swscale.h"
34 34
 #include "swscale_internal.h"
35
-#include "libavutil/x86_cpu.h"
35
+#include "libavutil/cpu.h"
36 36
 #include "libavutil/bswap.h"
37 37
 
38 38
 extern const uint8_t dither_4x4_16[4][8];
... ...
@@ -579,24 +579,18 @@ CLOSEYUV2RGBFUNC(1)
579 579
 SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
580 580
 {
581 581
     SwsFunc t = NULL;
582
-#if HAVE_MMX
583
-     t = ff_yuv2rgb_init_mmx(c);
584
-#endif
585
-#if HAVE_VIS
586
-    t = ff_yuv2rgb_init_vis(c);
587
-#endif
588
-#if CONFIG_MLIB
589
-    t = ff_yuv2rgb_init_mlib(c);
590
-#endif
591
-#if HAVE_ALTIVEC
592
-    if (c->flags & SWS_CPU_CAPS_ALTIVEC)
593
-        t = ff_yuv2rgb_init_altivec(c);
594
-#endif
595 582
 
596
-#if ARCH_BFIN
597
-    if (c->flags & SWS_CPU_CAPS_BFIN)
583
+    if (HAVE_MMX) {
584
+        t = ff_yuv2rgb_init_mmx(c);
585
+    } else if (HAVE_VIS) {
586
+        t = ff_yuv2rgb_init_vis(c);
587
+    } else if (CONFIG_MLIB) {
588
+        t = ff_yuv2rgb_init_mlib(c);
589
+    } else if (HAVE_ALTIVEC) {
590
+        t = ff_yuv2rgb_init_altivec(c);
591
+    } else if (ARCH_BFIN) {
598 592
         t = ff_yuv2rgb_get_func_ptr_bfin(c);
599
-#endif
593
+    }
600 594
 
601 595
     if (t)
602 596
         return t;