GitList

Browse code

Merge remote-tracking branch 'qatar/master'

* qatar/master:
get_bits: remove x86 inline asm in A32 bitstream reader
doc: Remove outdated information about our issue tracker
avidec: Factor out the sync functionality.
fate-aac: Expand coverage.
ac3dsp: add x86-optimized versions of ac3dsp.extract_exponents().
ac3dsp: simplify extract_exponents() now that it does not need to do clipping.
ac3enc: clip coefficients after MDCT.
ac3enc: add int32_t array clipping function to DSPUtil, including x86 versions.
swscale: for >8bit scaling, read in native bit-depth.
matroskadec: matroska_read_seek after EBML_STOP leads to failure.
doxygen: fix usage of @file directive in libavutil/{dict,file}.h
doxygen: Help doxygen parser to understand the DECLARE_ALIGNED and offsetof macros

Conflicts:
doc/issue_tracker.txt
libavformat/avidec.c
libavutil/dict.h
libswscale/swscale.c
libswscale/utils.c
tests/ref/lavfi/pixfmts_scale

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2011/07/02 10:07:06
Showing 19 changed files

Doxyfile index ad08e08..8907f69 100644
libavcodec/ac3dsp.c index 619addc..96bd123 100644
libavcodec/ac3enc.h index 5f5d2c2..be2767a 100644
libavcodec/ac3enc_fixed.c index b189609..cbe92e1 100644
libavcodec/ac3enc_float.c index 7d01b18..e21b99d 100644
libavcodec/ac3enc_template.c index 85eea54..c7243c7 100644
libavcodec/dsputil.c index 0e596b1..bfbe12e 100644
libavcodec/dsputil.h index f2054a4..401a87a 100644
libavcodec/get_bits.h index 3b09dfd..96d33b3 100644
libavcodec/x86/ac3dsp.asm index 99c5df3..8c958a1 100644
libavcodec/x86/ac3dsp_mmx.c index e853b88..3127570 100644
libavcodec/x86/dsputil_mmx.c index 78cad4c..80bb6cd 100644
libavcodec/x86/dsputil_yasm.asm index 695aba5..1f5a4f6 100644
libavformat/avidec.c index 80620da..366914a 100644
libavformat/matroskadec.c index 61fcec2..f3d7a2d 100644
libavutil/dict.h index 9890e9e..3982b0d 100644
libavutil/file.h index f94d780..f28627c 100644
libswscale/swscale.c index 3e2d2a4..3a92db5 100644
tests/fate/aac.mak index 6701e14..8c80557 100644

@@ -1160,6 +1160,7 @@ INCLUDE_FILE_PATTERNS  =
                      PREDEFINED             = __attribute__(x)="" "RENAME(x)=x ## _TMPL" "DEF(x)=x ## _TMPL" \
                                               HAVE_AV_CONFIG_H HAVE_MMX HAVE_MMX2 HAVE_AMD3DNOW \
                     +                         "DECLARE_ALIGNED(a,t,n)=t n" "offsetof(x,y)=0x42" \
                      # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
                      # this tag can be used to specify a list of macro names that should be expanded.

@@ -164,21 +164,8 @@ static void ac3_extract_exponents_c(uint8_t *exp, int32_t *coef, int nb_coefs)
                          int i;
                          for (i = 0; i < nb_coefs; i++) {
                     -        int e;
                              int v = abs(coef[i]);
                     -        if (v == 0)
                     -            e = 24;
                     -        else {
                     -            e = 23 - av_log2(v);
                     -            if (e >= 24) {
                     -                e = 24;
                     -                coef[i] = 0;
                     -            } else if (e < 0) {
                     -                e = 0;
                     -                coef[i] = av_clip(coef[i], -16777215, 16777215);
                     -            }
                     -        }
                     -        exp[i] = e;
                     +        exp[i] = v ? 23 - av_log2(v) : 24;
+                         }
+                     }

@@ -50,12 +50,16 @@
                      #if CONFIG_AC3ENC_FLOAT
                      #define AC3_NAME(x) ff_ac3_float_ ## x
                      #define MAC_COEF(d,a,b) ((d)+=(a)*(b))
                     +#define COEF_MIN (-16777215.0/16777216.0)
                     +#define COEF_MAX ( 16777215.0/16777216.0)
                      typedef float SampleType;
                      typedef float CoefType;
                      typedef float CoefSumType;
                      #else
                      #define AC3_NAME(x) ff_ac3_fixed_ ## x
                      #define MAC_COEF(d,a,b) MAC64(d,a,b)
                     +#define COEF_MIN -16777215
                     +#define COEF_MAX  16777215
                      typedef int16_t SampleType;
                      typedef int32_t CoefType;
                      typedef int64_t CoefSumType;

@@ -104,6 +104,15 @@ static void scale_coefficients(AC3EncodeContext *s)
+                     }
                     +/**
                     + * Clip MDCT coefficients to allowable range.
                     + */
                     +static void clip_coefficients(DSPContext *dsp, int32_t *coef, unsigned int len)
                     +{
                     +    dsp->vector_clip_int32(coef, coef, COEF_MIN, COEF_MAX, len);
                     +}
+                    +
+                    +
                      static av_cold int ac3_fixed_encode_init(AVCodecContext *avctx)
+                     {
                          AC3EncodeContext *s = avctx->priv_data;

@@ -111,6 +111,15 @@ static void scale_coefficients(AC3EncodeContext *s)
+                     }
                     +/**
                     + * Clip MDCT coefficients to allowable range.
                     + */
                     +static void clip_coefficients(DSPContext *dsp, float *coef, unsigned int len)
                     +{
                     +    dsp->vector_clipf(coef, coef, COEF_MIN, COEF_MAX, len);
                     +}
+                    +
+                    +
                      #if CONFIG_AC3_ENCODER
                      AVCodec ff_ac3_float_encoder = {
                          "ac3_float",

@@ -41,6 +41,8 @@ static void apply_window(DSPContext *dsp, SampleType *output,
                      static int normalize_samples(AC3EncodeContext *s);
                     +static void clip_coefficients(DSPContext *dsp, CoefType *coef, unsigned int len);
+                    +
                      int AC3_NAME(allocate_sample_buffers)(AC3EncodeContext *s)
+                     {
@@ -171,8 +173,8 @@ static void apply_channel_coupling(AC3EncodeContext *s)
                                      cpl_coef[i] += ch_coef[i];
+                             }
                     -        /* coefficients must be clipped to +/- 1.0 in order to be encoded */
                     -        s->dsp.vector_clipf(cpl_coef, cpl_coef, -1.0f, 1.0f, num_cpl_coefs);
                     +        /* coefficients must be clipped in order to be encoded */
                     +        clip_coefficients(&s->dsp, cpl_coef, num_cpl_coefs);
                              /* scale coupling coefficients from float to 24-bit fixed-point */
                              s->ac3dsp.float_to_fixed24(&block->fixed_coef[CPL_CH][cpl_start],
@@ -300,6 +302,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
                              if (!block->cpl_in_use || !block->new_cpl_coords)
                                  continue;
                     +        clip_coefficients(&s->dsp, cpl_coords[blk][1], s->fbw_channels * 16);
                              s->ac3dsp.float_to_fixed24(fixed_cpl_coords[blk][1],
                                                         cpl_coords[blk][1],
                                                         s->fbw_channels * 16);
@@ -433,7 +436,11 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, unsigned char *frame,
                          apply_mdct(s);
                     -    scale_coefficients(s);
                     +    if (s->fixed_point)
                     +        scale_coefficients(s);
+                    +
                     +    clip_coefficients(&s->dsp, s->blocks[0].mdct_coef[1],
                     +                      AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
                          s->cpl_on = s->cpl_enabled;
                          ff_ac3_compute_coupling_strategy(s);
@@ -443,6 +450,9 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, unsigned char *frame,
                          compute_rematrixing_strategy(s);
                     +    if (!s->fixed_point)
                     +        scale_coefficients(s);
+                    +
                          ff_ac3_apply_rematrixing(s);
                          ff_ac3_process_exponents(s);

@@ -2664,6 +2664,22 @@ static void apply_window_int16_c(int16_t *output, const int16_t *input,
+                         }
+                     }
                     +static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                     +                                int32_t max, unsigned int len)
                     +{
                     +    do {
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        *dst++ = av_clip(*src++, min, max);
                     +        len -= 8;
                     +    } while (len > 0);
                     +}
+                    +
                      #define W0 2048
                      #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
                      #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -3106,6 +3122,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
                          c->scalarproduct_int16 = scalarproduct_int16_c;
                          c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
                          c->apply_window_int16 = apply_window_int16_c;
                     +    c->vector_clip_int32 = vector_clip_int32_c;
                          c->scalarproduct_float = scalarproduct_float_c;
                          c->butterflies_float = butterflies_float_c;
                          c->vector_fmul_scalar = vector_fmul_scalar_c;

@@ -553,6 +553,22 @@ typedef struct DSPContext {
                          void (*apply_window_int16)(int16_t *output, const int16_t *input,
                                                     const int16_t *window, unsigned int len);
                     +    /**
                     +     * Clip each element in an array of int32_t to a given minimum and maximum value.
                     +     * @param dst  destination array
                     +     *             constraints: 16-byte aligned
                     +     * @param src  source array
                     +     *             constraints: 16-byte aligned
                     +     * @param min  minimum value
                     +     *             constraints: must be in the range [-(1<<24), 1<<24]
                     +     * @param max  maximum value
                     +     *             constraints: must be in the range [-(1<<24), 1<<24]
                     +     * @param len  number of elements in the array
                     +     *             constraints: multiple of 32 greater than zero
                     +     */
                     +    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                     +                              int32_t max, unsigned int len);
+                    +
                          /* rv30 functions */
                          qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
                          qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];

@@ -201,19 +201,11 @@ static inline void skip_bits_long(GetBitContext *s, int n){
                              }                                                               \
                          } while (0)
                     -#if ARCH_X86
                     -#   define SKIP_CACHE(name, gb, num)                            \
                     -    __asm__("shldl %2, %1, %0          \n\t"                    \
                     -            "shll  %2, %1              \n\t"                    \
                     -            : "+r" (name##_cache0), "+r" (name##_cache1)        \
                     -            : "Ic" ((uint8_t)(num)))
                     -#else
                      #   define SKIP_CACHE(name, gb, num) do {               \
                              name##_cache0 <<= (num);                        \
                              name##_cache0 |= NEG_USR32(name##_cache1,num);  \
                              name##_cache1 <<= (num);                        \
                          } while (0)
                     -#endif
                      #   define SKIP_COUNTER(name, gb, num) name##_bit_count += (num)

@@ -32,6 +32,11 @@ cextern ac3_bap_bits
                      pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
                      pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
                     +; used in ff_ac3_extract_exponents()
                     +pd_1:   times 4 dd 1
                     +pd_151: times 4 dd 151
                     +pb_shuf_4dwb: db 0, 4, 8, 12
+                    +
                      SECTION .text
                      ;-----------------------------------------------------------------------------
@@ -346,3 +351,100 @@ cglobal ac3_compute_mantissa_size_sse2, 1,2,4, mant_cnt, sum
                          movd       eax, m0
                          add        eax, sumd
                          RET
+                    +
                     +;------------------------------------------------------------------------------
                     +; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
                     +;------------------------------------------------------------------------------
+                    +
                     +%macro PABSD_MMX 2 ; src/dst, tmp
                     +    pxor     %2, %2
                     +    pcmpgtd  %2, %1
                     +    pxor     %1, %2
                     +    psubd    %1, %2
                     +%endmacro
+                    +
                     +%macro PABSD_SSSE3 1-2 ; src/dst, unused
                     +    pabsd    %1, %1
                     +%endmacro
+                    +
                     +%ifdef HAVE_AMD3DNOW
                     +INIT_MMX
                     +cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
                     +    add      expq, lenq
                     +    lea     coefq, [coefq+4*lenq]
                     +    neg      lenq
                     +    movq       m3, [pd_1]
                     +    movq       m4, [pd_151]
                     +.loop:
                     +    movq       m0, [coefq+4*lenq  ]
                     +    movq       m1, [coefq+4*lenq+8]
                     +    PABSD_MMX  m0, m2
                     +    PABSD_MMX  m1, m2
                     +    pslld      m0, 1
                     +    por        m0, m3
                     +    pi2fd      m2, m0
                     +    psrld      m2, 23
                     +    movq       m0, m4
                     +    psubd      m0, m2
                     +    pslld      m1, 1
                     +    por        m1, m3
                     +    pi2fd      m2, m1
                     +    psrld      m2, 23
                     +    movq       m1, m4
                     +    psubd      m1, m2
                     +    packssdw   m0, m0
                     +    packuswb   m0, m0
                     +    packssdw   m1, m1
                     +    packuswb   m1, m1
                     +    punpcklwd  m0, m1
                     +    movd  [expq+lenq], m0
                     +    add      lenq, 4
                     +    jl .loop
                     +    REP_RET
                     +%endif
+                    +
                     +%macro AC3_EXTRACT_EXPONENTS 1
                     +cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
                     +    add     expq, lenq
                     +    lea    coefq, [coefq+4*lenq]
                     +    neg     lenq
                     +    mova      m2, [pd_1]
                     +    mova      m3, [pd_151]
                     +%ifidn %1, ssse3 ;
                     +    movd      m4, [pb_shuf_4dwb]
                     +%endif
                     +.loop:
                     +    ; move 4 32-bit coefs to xmm0
                     +    mova      m0, [coefq+4*lenq]
                     +    ; absolute value
                     +    PABSD     m0, m1
                     +    ; convert to float and extract exponents
                     +    pslld     m0, 1
                     +    por       m0, m2
                     +    cvtdq2ps  m1, m0
                     +    psrld     m1, 23
                     +    mova      m0, m3
                     +    psubd     m0, m1
                     +    ; move the lowest byte in each of 4 dwords to the low dword
                     +%ifidn %1, ssse3
                     +    pshufb    m0, m4
                     +%else
                     +    packssdw  m0, m0
                     +    packuswb  m0, m0
                     +%endif
                     +    movd  [expq+lenq], m0
+                    +
                     +    add     lenq, 4
                     +    jl .loop
                     +    REP_RET
                     +%endmacro
+                    +
                     +%ifdef HAVE_SSE
                     +INIT_XMM
                     +%define PABSD PABSD_MMX
                     +AC3_EXTRACT_EXPONENTS sse2
                     +%ifdef HAVE_SSSE3
                     +%define PABSD PABSD_SSSE3
                     +AC3_EXTRACT_EXPONENTS ssse3
                     +%endif
                     +%endif

@@ -44,6 +44,10 @@ extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned i
                      extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
                     +extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
                     +extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
                     +extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
+                    +
                      av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
+                     {
                          int mm_flags = av_get_cpu_flags();
@@ -56,6 +60,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
                              c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
+                         }
                          if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
                     +        c->extract_exponents = ff_ac3_extract_exponents_3dnow;
                              if (!bit_exact) {
                                  c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+                             }
@@ -72,6 +77,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
                              c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
                              c->float_to_fixed24 = ff_float_to_fixed24_sse2;
                              c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
                     +        c->extract_exponents = ff_ac3_extract_exponents_sse2;
                              if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
                                  c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
                                  c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
@@ -79,6 +85,9 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
+                         }
                          if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
                              c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
                     +        if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
                     +            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
                     +        }
+                         }
                      #endif
+                     }

@@ -2333,6 +2333,15 @@ int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, i
                      float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
                     +void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
                     +                                   int32_t max, unsigned int len);
                     +void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
                     +                                   int32_t max, unsigned int len);
                     +void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
                     +                                   int32_t max, unsigned int len);
                     +void ff_vector_clip_int32_sse41   (int32_t *dst, const int32_t *src, int32_t min,
                     +                                   int32_t max, unsigned int len);
+                    +
                      void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
+                     {
                          int mm_flags = av_get_cpu_flags();
@@ -2473,6 +2482,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                              c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
                              c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
+                    +
                     +        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
                      #endif
                              if (mm_flags & AV_CPU_FLAG_MMX2) {
@@ -2756,6 +2767,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                      #if HAVE_YASM
                                  c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
                                  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
                     +            if (mm_flags & AV_CPU_FLAG_ATOM) {
                     +                c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
                     +            } else {
                     +                c->vector_clip_int32 = ff_vector_clip_int32_sse2;
                     +            }
                                  if (avctx->flags & CODEC_FLAG_BITEXACT) {
                                      c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
                                  } else {
@@ -2781,6 +2797,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
+                                 }
                      #endif
+                             }
+                    +
                     +        if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
                     +#if HAVE_YASM
                     +            c->vector_clip_int32 = ff_vector_clip_int32_sse41;
                     +#endif
                     +        }
+                    +
                      #if HAVE_AVX && HAVE_YASM
                              if (mm_flags & AV_CPU_FLAG_AVX) {
                                  if (bit_depth == 10) {

@@ -1048,3 +1048,118 @@ emu_edge sse
                      %ifdef ARCH_X86_32
                      emu_edge mmx
                      %endif
+                    +
                     +;-----------------------------------------------------------------------------
                     +; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
                     +;                           int32_t max, unsigned int len)
                     +;-----------------------------------------------------------------------------
+                    +
                     +%macro PMINSD_MMX 3 ; dst, src, tmp
                     +    mova      %3, %2
                     +    pcmpgtd   %3, %1
                     +    pxor      %1, %2
                     +    pand      %1, %3
                     +    pxor      %1, %2
                     +%endmacro
+                    +
                     +%macro PMAXSD_MMX 3 ; dst, src, tmp
                     +    mova      %3, %1
                     +    pcmpgtd   %3, %2
                     +    pand      %1, %3
                     +    pandn     %3, %2
                     +    por       %1, %3
                     +%endmacro
+                    +
                     +%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
                     +    PMINSD_MMX %1, %3, %4
                     +    PMAXSD_MMX %1, %2, %4
                     +%endmacro
+                    +
                     +%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
                     +    cvtdq2ps  %1, %1
                     +    minps     %1, %3
                     +    maxps     %1, %2
                     +    cvtps2dq  %1, %1
                     +%endmacro
+                    +
                     +%macro CLIPD_SSE41 3-4 ;  src/dst, min, max, unused
                     +    pminsd  %1, %3
                     +    pmaxsd  %1, %2
                     +%endmacro
+                    +
                     +%macro SPLATD_MMX 1
                     +    punpckldq  %1, %1
                     +%endmacro
+                    +
                     +%macro SPLATD_SSE2 1
                     +    pshufd  %1, %1, 0
                     +%endmacro
+                    +
                     +%macro VECTOR_CLIP_INT32 4
                     +cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
                     +%ifidn %1, sse2
                     +    cvtsi2ss  m4, minm
                     +    cvtsi2ss  m5, maxm
                     +%else
                     +    movd      m4, minm
                     +    movd      m5, maxm
                     +%endif
                     +    SPLATD    m4
                     +    SPLATD    m5
                     +.loop:
                     +%assign %%i 1
                     +%rep %3
                     +    mova      m0,  [srcq+mmsize*0*%%i]
                     +    mova      m1,  [srcq+mmsize*1*%%i]
                     +    mova      m2,  [srcq+mmsize*2*%%i]
                     +    mova      m3,  [srcq+mmsize*3*%%i]
                     +%if %4
                     +    mova      m7,  [srcq+mmsize*4*%%i]
                     +    mova      m8,  [srcq+mmsize*5*%%i]
                     +    mova      m9,  [srcq+mmsize*6*%%i]
                     +    mova      m10, [srcq+mmsize*7*%%i]
                     +%endif
                     +    CLIPD  m0,  m4, m5, m6
                     +    CLIPD  m1,  m4, m5, m6
                     +    CLIPD  m2,  m4, m5, m6
                     +    CLIPD  m3,  m4, m5, m6
                     +%if %4
                     +    CLIPD  m7,  m4, m5, m6
                     +    CLIPD  m8,  m4, m5, m6
                     +    CLIPD  m9,  m4, m5, m6
                     +    CLIPD  m10, m4, m5, m6
                     +%endif
                     +    mova  [dstq+mmsize*0*%%i], m0
                     +    mova  [dstq+mmsize*1*%%i], m1
                     +    mova  [dstq+mmsize*2*%%i], m2
                     +    mova  [dstq+mmsize*3*%%i], m3
                     +%if %4
                     +    mova  [dstq+mmsize*4*%%i], m7
                     +    mova  [dstq+mmsize*5*%%i], m8
                     +    mova  [dstq+mmsize*6*%%i], m9
                     +    mova  [dstq+mmsize*7*%%i], m10
                     +%endif
                     +%assign %%i %%i+1
                     +%endrep
                     +    add     srcq, mmsize*4*(%3+%4)
                     +    add     dstq, mmsize*4*(%3+%4)
                     +    sub     lend, mmsize*(%3+%4)
                     +    jg .loop
                     +    REP_RET
                     +%endmacro
+                    +
                     +INIT_MMX
                     +%define SPLATD SPLATD_MMX
                     +%define CLIPD CLIPD_MMX
                     +VECTOR_CLIP_INT32 mmx, 0, 1, 0
                     +INIT_XMM
                     +%define SPLATD SPLATD_SSE2
                     +VECTOR_CLIP_INT32 sse2_int, 6, 1, 0
                     +%define CLIPD CLIPD_SSE2
                     +VECTOR_CLIP_INT32 sse2, 6, 2, 0
                     +%define CLIPD CLIPD_SSE41
                     +%ifdef m8
                     +VECTOR_CLIP_INT32 sse41, 11, 1, 1
                     +%else
                     +VECTOR_CLIP_INT32 sse41, 6, 1, 0
                     +%endif

@@ -861,13 +861,137 @@ static int get_stream_idx(int *d){
+                         }
+                     }
                     -static int avi_read_packet(AVFormatContext *s, AVPacket *pkt)
                     /**
                      * Scan forward in the AVI byte stream for the next usable chunk header.
                      *
                      * Bytes are read one at a time from s->pb into a sliding 8-entry window
                      * d[]: d[0..3] hold the candidate chunk tag and d[4..7] its little-endian
                      * size. Index (ix##, ##ix, idx1), JUNK, stray LIST, discarded-stream and
                      * palette-change chunks are consumed here and the scan restarts after them.
                      *
                      * @param s demuxer context; s->priv_data is used as the AVIContext
                      * @return 0 once a data chunk has been accepted, with avi->stream_index
                      *         and the stream's packet_size/remaining set from its header;
                      *         AVERROR_EOF if url_feof() ends the scan first
                      */
                     +static int avi_sync(AVFormatContext *s)
+                     {
                          AVIContext *avi = s->priv_data;
                          AVIOContext *pb = s->pb;
                          int n, d[8];
                          unsigned int size;
                          int64_t i, sync;
+                    +
                     +start_sync:
                          /* Fill the window with -1 so that d[0] < 0 marks a window that does
                           * not yet hold 8 bytes read from the stream. */
                     +    memset(d, -1, sizeof(int)*8);
                          /* i advances once per byte read; sync keeps the position at which
                           * this scan (re)started. */
                     +    for(i=sync=avio_tell(pb); !url_feof(pb); i++) {
                     +        int j;
+                    +
                              /* slide the window left by one entry and append the next byte */
                     +        for(j=0; j<7; j++)
                     +            d[j]= d[j+1];
                     +        d[7]= avio_r8(pb);
+                    +
                              /* candidate chunk size: d[4..7] combined as little-endian */
                     +        size= d[4] + (d[5]<<8) + (d[6]<<16) + (d[7]<<24);
+                    +
                              /* stream index taken from the 3rd/4th tag bytes (used for ix##)
                               * NOTE(review): exact parsing is in get_stream_idx(), not shown here */
                     +        n= get_stream_idx(d+2);
                     +//av_log(s, AV_LOG_DEBUG, "%X %X %X %X %X %X %X %X %"PRId64" %d %d\n", d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], i, size, n);
                              /* reject candidates whose size would run past avi->fsize, and
                               * keep reading while the window is not completely filled */
                     +        if(i + (uint64_t)size > avi->fsize || d[0]<0)
                     +            continue;
+                    +
                     +        //parse ix##
                     +        if(  (d[0] == 'i' && d[1] == 'x' && n < s->nb_streams)
                     +        //parse JUNK
                     +           ||(d[0] == 'J' && d[1] == 'U' && d[2] == 'N' && d[3] == 'K')
                     +           ||(d[0] == 'i' && d[1] == 'd' && d[2] == 'x' && d[3] == '1')){
                                  /* none of these carry packet data: skip the payload and rescan */
                     +            avio_skip(pb, size);
                     +//av_log(s, AV_LOG_DEBUG, "SKIP\n");
                     +            goto start_sync;
                     +        }
+                    +
                     +        //parse stray LIST
                     +        if(d[0] == 'L' && d[1] == 'I' && d[2] == 'S' && d[3] == 'T'){
                                  /* skip only 4 bytes, then rescan from there */
                     +            avio_skip(pb, 4);
                     +            goto start_sync;
                     +        }
+                    +
                              /* stream index taken from the first two tag bytes (##xx form) */
                     +        n= get_stream_idx(d);
+                    +
                              /* When the distance from avi->last_pkt_pos is even and the window
                               * shifted by one byte also yields a valid stream index, skip this
                               * position and keep scanning.
                               * NOTE(review): presumably an alignment heuristic - confirm */
                     +        if(!((i-avi->last_pkt_pos)&1) && get_stream_idx(d+1) < s->nb_streams)
                     +            continue;
+                    +
                     +        //detect ##ix chunk and skip
                     +        if(d[2] == 'i' && d[3] == 'x' && n < s->nb_streams){
                     +            avio_skip(pb, size);
                     +            goto start_sync;
                     +        }
+                    +
                     +        //parse ##dc/##wb
                     +        if(n < s->nb_streams){
                     +            AVStream *st;
                     +            AVIStream *ast;
                     +            st = s->streams[n];
                     +            ast = st->priv_data;
+                    +
                     +            if(s->nb_streams>=2){
                     +                AVStream *st1  = s->streams[1];
                     +                AVIStream *ast1= st1->priv_data;
                     +                //workaround for broken small-file-bug402.avi
                                      /* a 'wb' chunk tagged for stream 0, where stream 0 is video
                                       * already using the 'dc' prefix and stream 1 is audio, is
                                       * redirected to stream 1 */
                     +                if(   d[2] == 'w' && d[3] == 'b'
                     +                   && n==0
                     +                   && st ->codec->codec_type == AVMEDIA_TYPE_VIDEO
                     +                   && st1->codec->codec_type == AVMEDIA_TYPE_AUDIO
                     +                   && ast->prefix == 'd'*256+'c'
                     +                   && (d[2]*256+d[3] == ast1->prefix || !ast1->prefix_count)
                     +                  ){
                     +                    n=1;
                     +                    st = st1;
                     +                    ast = ast1;
                     +                    av_log(s, AV_LOG_WARNING, "Invalid stream + prefix combination, assuming audio.\n");
                     +                }
                     +            }
+                    +
+                    +
                                  /* Chunk belongs to a discarded stream (or is empty on a stream
                                   * with at least default discard): account for its duration in
                                   * frame_offset, skip the payload and rescan. */
                     +            if(   (st->discard >= AVDISCARD_DEFAULT && size==0)
                     +               /*|| (st->discard >= AVDISCARD_NONKEY && !(pkt->flags & AV_PKT_FLAG_KEY))*/ //FIXME needs a little reordering
                     +               || st->discard >= AVDISCARD_ALL){
                     +                ast->frame_offset += get_duration(ast, size);
                     +                avio_skip(pb, size);
                     +                goto start_sync;
                     +            }
+                    +
                                  /* ##pc chunk: first byte is the first palette index, second
                                   * byte a count; entries k..last of ast->pal are replaced */
                     +            if (d[2] == 'p' && d[3] == 'c' && size<=4*256+4) {
                     +                int k = avio_r8(pb);
                     +                int last = (k + avio_r8(pb) - 1) & 0xFF;
+                    +
                     +                avio_rl16(pb); //flags
+                    +
                     +                for (; k <= last; k++)
                     +                    ast->pal[k] = avio_rb32(pb)>>8;// b + (g << 8) + (r << 16);
                     +                ast->has_pal= 1;
                     +                goto start_sync;
                                  /* Accept the chunk when its two-byte suffix equals the prefix
                                   * already recorded for this stream, or when both suffix bytes
                                   * are below 128 and either fewer than 5 matching prefixes have
                                   * been counted or the header sits at most one byte after the
                                   * position the scan started from (sync+9 > i). */
                     +            } else if(   ((ast->prefix_count<5 || sync+9 > i) && d[2]<128 && d[3]<128) ||
                     +                         d[2]*256+d[3] == ast->prefix /*||
                     +                         (d[2] == 'd' && d[3] == 'c') ||
                     +                         (d[2] == 'w' && d[3] == 'b')*/) {
+                    +
                     +//av_log(s, AV_LOG_DEBUG, "OK\n");
                                      /* count repeats of the same suffix; a new suffix replaces
                                       * the recorded prefix and resets the counter */
                     +                if(d[2]*256+d[3] == ast->prefix)
                     +                    ast->prefix_count++;
                     +                else{
                     +                    ast->prefix= d[2]*256+d[3];
                     +                    ast->prefix_count= 0;
                     +                }
+                    +
                                      /* record which stream the chunk belongs to and its sizes:
                                       * packet_size includes the 8 header bytes, remaining does not */
                     +                avi->stream_index= n;
                     +                ast->packet_size= size + 8;
                     +                ast->remaining= size;
+                    +
                                      /* index the chunk unless it is empty on a stream that has a
                                       * sample_size set */
                     +                if(size || !ast->sample_size){
                                          /* start of the 8-byte chunk header that was just consumed */
                     +                    uint64_t pos= avio_tell(pb) - 8;
                                          /* only append when pos lies beyond the last index entry */
                     +                    if(!st->index_entries || !st->nb_index_entries || st->index_entries[st->nb_index_entries - 1].pos < pos){
                     +                        av_add_index_entry(st, pos, ast->frame_offset, size, 0, AVINDEX_KEYFRAME);
                     +                    }
                     +                }
                     +                return 0;
                     +            }
                     +        }
                     +    }
+                    +
                          /* the loop ended because url_feof(pb) became true */
                     +    return AVERROR_EOF;
                     +}
+                    +
                     +static int avi_read_packet(AVFormatContext *s, AVPacket *pkt)
                     +{
                     +    AVIContext *avi = s->priv_data;
                     +    AVIOContext *pb = s->pb;
                     +    int err;
                          void* dstr;
                          if (CONFIG_DV_DEMUXER && avi->dv_demux) {
@@ -1041,121 +1165,9 @@ resync:
                              return size;
+                         }
                     -    memset(d, -1, sizeof(int)*8);
                     -    for(i=sync=avio_tell(pb); !url_feof(pb); i++) {
                     -        int j;
+                    -
                     -        for(j=0; j<7; j++)
                     -            d[j]= d[j+1];
                     -        d[7]= avio_r8(pb);
+                    -
                     -        size= d[4] + (d[5]<<8) + (d[6]<<16) + (d[7]<<24);
+                    -
                     -        n= get_stream_idx(d+2);
                     -//av_log(s, AV_LOG_DEBUG, "%X %X %X %X %X %X %X %X %"PRId64" %d %d\n", d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], i, size, n);
                     -        if(i + (uint64_t)size > avi->fsize || d[0]<0)
                     -            continue;
+                    -
                     -        //parse ix##
                     -        if(  (d[0] == 'i' && d[1] == 'x' && n < s->nb_streams)
                     -        //parse JUNK
                     -           ||(d[0] == 'J' && d[1] == 'U' && d[2] == 'N' && d[3] == 'K')
                     -           ||(d[0] == 'i' && d[1] == 'd' && d[2] == 'x' && d[3] == '1')){
                     -            avio_skip(pb, size);
                     -//av_log(s, AV_LOG_DEBUG, "SKIP\n");
                     -            goto resync;
                     -        }
+                    -
                     -        //parse stray LIST
                     -        if(d[0] == 'L' && d[1] == 'I' && d[2] == 'S' && d[3] == 'T'){
                     -            avio_skip(pb, 4);
                     -            goto resync;
                     -        }
+                    -
                     -        n= get_stream_idx(d);
+                    -
                     -        if(!((i-avi->last_pkt_pos)&1) && get_stream_idx(d+1) < s->nb_streams)
                     -            continue;
+                    -
                     -        //detect ##ix chunk and skip
                     -        if(d[2] == 'i' && d[3] == 'x' && n < s->nb_streams){
                     -            avio_skip(pb, size);
                     -            goto resync;
                     -        }
+                    -
                     -        //parse ##dc/##wb
                     -        if(n < s->nb_streams){
                     -            AVStream *st;
                     -            AVIStream *ast;
                     -            st = s->streams[n];
                     -            ast = st->priv_data;
+                    -
                     -            if(s->nb_streams>=2){
                     -                AVStream *st1  = s->streams[1];
                     -                AVIStream *ast1= st1->priv_data;
                     -                //workaround for broken small-file-bug402.avi
                     -                if(   d[2] == 'w' && d[3] == 'b'
                     -                   && n==0
                     -                   && st ->codec->codec_type == AVMEDIA_TYPE_VIDEO
                     -                   && st1->codec->codec_type == AVMEDIA_TYPE_AUDIO
                     -                   && ast->prefix == 'd'*256+'c'
                     -                   && (d[2]*256+d[3] == ast1->prefix || !ast1->prefix_count)
                     -                  ){
                     -                    n=1;
                     -                    st = st1;
                     -                    ast = ast1;
                     -                    av_log(s, AV_LOG_WARNING, "Invalid stream + prefix combination, assuming audio.\n");
                     -                }
                     -            }
+                    -
+                    -
                     -            if(   (st->discard >= AVDISCARD_DEFAULT && size==0)
                     -               /*|| (st->discard >= AVDISCARD_NONKEY && !(pkt->flags & AV_PKT_FLAG_KEY))*/ //FIXME needs a little reordering
                     -               || st->discard >= AVDISCARD_ALL){
                     -                ast->frame_offset += get_duration(ast, size);
                     -                avio_skip(pb, size);
                     -                goto resync;
                     -            }
+                    -
                     -            if (d[2] == 'p' && d[3] == 'c' && size<=4*256+4) {
                     -                int k = avio_r8(pb);
                     -                int last = (k + avio_r8(pb) - 1) & 0xFF;
+                    -
                     -                avio_rl16(pb); //flags
+                    -
                     -                for (; k <= last; k++)
                     -                    ast->pal[k] = avio_rb32(pb)>>8;// b + (g << 8) + (r << 16);
                     -                ast->has_pal= 1;
                     -                goto resync;
                     -            } else if(   ((ast->prefix_count<5 || sync+9 > i) && d[2]<128 && d[3]<128) ||
                     -                         d[2]*256+d[3] == ast->prefix /*||
                     -                         (d[2] == 'd' && d[3] == 'c') ||
                     -                         (d[2] == 'w' && d[3] == 'b')*/) {
+                    -
                     -//av_log(s, AV_LOG_DEBUG, "OK\n");
                     -                if(d[2]*256+d[3] == ast->prefix)
                     -                    ast->prefix_count++;
                     -                else{
                     -                    ast->prefix= d[2]*256+d[3];
                     -                    ast->prefix_count= 0;
                     -                }
+                    -
                     -                avi->stream_index= n;
                     -                ast->packet_size= size + 8;
                     -                ast->remaining= size;
+                    -
                     -                if(size || !ast->sample_size){
                     -                    uint64_t pos= avio_tell(pb) - 8;
                     -                    if(!st->index_entries || !st->nb_index_entries || st->index_entries[st->nb_index_entries - 1].pos < pos){
                     -                        av_add_index_entry(st, pos, ast->frame_offset, size, 0, AVINDEX_KEYFRAME);
                     -                    }
                     -                }
                     -                goto resync;
                     -            }
                     -        }
                     -    }
+                    -
                     -    return AVERROR_EOF;
                     +    if ((err = avi_sync(s)) < 0)
                     +        return err;
                     +    goto resync;
+                     }
                      /* XXX: We make the implicit supposition that the positions are sorted

@@ -1960,6 +1960,7 @@ static int matroska_read_seek(AVFormatContext *s, int stream_index,
                          if ((index = av_index_search_timestamp(st, timestamp, flags)) < 0) {
                              avio_seek(s->pb, st->index_entries[st->nb_index_entries-1].pos, SEEK_SET);
                     +        matroska->current_id = 0;
                              while ((index = av_index_search_timestamp(st, timestamp, flags)) < 0) {
                                  matroska_clear_queue(matroska);
                                  if (matroska_parse_cluster(matroska) < 0)
@@ -1988,6 +1989,7 @@ static int matroska_read_seek(AVFormatContext *s, int stream_index,
+                         }
                          avio_seek(s->pb, st->index_entries[index_min].pos, SEEK_SET);
                     +    matroska->current_id = 0;
                          matroska->skip_to_keyframe = !(flags & AVSEEK_FLAG_ANY);
                          matroska->skip_to_timecode = st->index_entries[index].timestamp;
                          matroska->done = 0;

@@ -18,7 +18,8 @@
                       */
                      /**
                     - * @file Public dictionary API.
                     + * @file
                     + * Public dictionary API.
                       * @deprecated
                       *  AVDictionary is provided for compatibility with libav. It is both in
                       *  implementation as well as API inefficient. It does not scale and is

@@ -22,7 +22,8 @@
                      #include "avutil.h"
                      /**
                     - * @file misc file utilities
                     + * @file
                     + * Misc file utilities.
                       */
                      /**

@@ -1783,53 +1783,6 @@ static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                      #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
                     -// FIXME Maybe dither instead.
                     -static av_always_inline void
                     -yuv9_OR_10ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
                     -                          const uint8_t *_srcU, const uint8_t *_srcV,
                     -                          int width, enum PixelFormat origin, int depth)
                     -{
                     -    int i;
                     -    const uint16_t *srcU = (const uint16_t *) _srcU;
                     -    const uint16_t *srcV = (const uint16_t *) _srcV;
+                    -
                     -    for (i = 0; i < width; i++) {
                     -        dstU[i] = input_pixel(&srcU[i]) >> (depth - 8);
                     -        dstV[i] = input_pixel(&srcV[i]) >> (depth - 8);
                     -    }
                     -}
+                    -
                     -static av_always_inline void
                     -yuv9_or_10ToY_c_template(uint8_t *dstY, const uint8_t *_srcY,
                     -                         int width, enum PixelFormat origin, int depth)
                     -{
                     -    int i;
                     -    const uint16_t *srcY = (const uint16_t*)_srcY;
+                    -
                     -    for (i = 0; i < width; i++)
                     -        dstY[i] = input_pixel(&srcY[i]) >> (depth - 8);
                     -}
+                    -
                     -#undef input_pixel
+                    -
                     -#define YUV_NBPS(depth, BE_LE, origin) \
                     -static void BE_LE ## depth ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                     -                                     const uint8_t *srcU, const uint8_t *srcV, \
                     -                                     int width, uint32_t *unused) \
                     -{ \
                     -    yuv9_OR_10ToUV_c_template(dstU, dstV, srcU, srcV, width, origin, depth); \
                     -} \
                     -static void BE_LE ## depth ## ToY_c(uint8_t *dstY, const uint8_t *srcY, \
                     -                                    int width, uint32_t *unused) \
                     -{ \
                     -    yuv9_or_10ToY_c_template(dstY, srcY, width, origin, depth); \
                     -}
+                    -
                     -YUV_NBPS( 9, LE, PIX_FMT_YUV420P9LE);
                     -YUV_NBPS( 9, BE, PIX_FMT_YUV420P9BE);
                     -YUV_NBPS(10, LE, PIX_FMT_YUV420P10LE);
                     -YUV_NBPS(10, BE, PIX_FMT_YUV420P10BE);
+                    -
                      static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
                                             int width, uint32_t *unused)
+                     {

@@ -2,10 +2,22 @@ FATE_AAC += fate-aac-al04_44
                      fate-aac-al04_44: CMD = pcm -i $(SAMPLES)/aac/al04_44.mp4
                      fate-aac-al04_44: REF = $(SAMPLES)/aac/al04_44.s16
                     +FATE_AAC += fate-aac-al05_44
                     +fate-aac-al05_44: CMD = pcm -i $(SAMPLES)/aac/al05_44.mp4
                     +fate-aac-al05_44: REF = $(SAMPLES)/aac/al05_44.s16
+                    +
                     +FATE_AAC += fate-aac-al06_44
                     +fate-aac-al06_44: CMD = pcm -i $(SAMPLES)/aac/al06_44.mp4
                     +fate-aac-al06_44: REF = $(SAMPLES)/aac/al06_44.s16
+                    +
                      FATE_AAC += fate-aac-al07_96
                      fate-aac-al07_96: CMD = pcm -i $(SAMPLES)/aac/al07_96.mp4
                      fate-aac-al07_96: REF = $(SAMPLES)/aac/al07_96.s16
                     +FATE_AAC += fate-aac-al17_44
                     +fate-aac-al17_44: CMD = pcm -i $(SAMPLES)/aac/al17_44.mp4
                     +fate-aac-al17_44: REF = $(SAMPLES)/aac/al17_44.s16
+                    +
                      FATE_AAC += fate-aac-am00_88
                      fate-aac-am00_88: CMD = pcm -i $(SAMPLES)/aac/am00_88.mp4
                      fate-aac-am00_88: REF = $(SAMPLES)/aac/am00_88.s16