Browse code

Merge remote-tracking branch 'qatar/master'

* qatar/master:
proresdsp: port x86 assembly to cpuflags.
lavr: x86: improve non-SSE4 version of S16_TO_S32_SX macro
lavfi: better channel layout negotiation
alac: check for truncated packets
alac: reverse lpc coeff order, simplify filter
lavr: add x86-optimized mixing functions
x86: add support for fmaddps fma4 instruction with abstraction to avx/sse
tscc2: fix typo in array index
build: use COMPILE template for HOSTOBJS
build: do full flag handling for all compiler-type tools
eval: fix printing of NaN in eval fate test.
build: Rename aandct component to more descriptive aandcttables
mpegaudio: bury inline asm under HAVE_INLINE_ASM.
x86inc: automatically insert vzeroupper for YMM functions.
rtmp: Check the buffer length of ping packets
rtmp: Allow having more unknown data at the end of a chunk size packet without failing
rtmp: Prevent reading outside of an allocated buffer when receiving server bandwidth packets

Conflicts:
Makefile
configure
libavcodec/x86/proresdsp.asm
libavutil/eval.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2012/07/28 06:42:19
Showing 21 changed files
... ...
@@ -11,7 +11,7 @@ ifndef V
11 11
 Q      = @
12 12
 ECHO   = printf "$(1)\t%s\n" $(2)
13 13
 BRIEF  = CC CXX AS YASM AR LD HOSTCC STRIP CP
14
-SILENT = DEPCC YASMDEP RM RANLIB
14
+SILENT = DEPCC DEPAS DEPHOSTCC YASMDEP RM RANLIB
15 15
 MSG    = $@
16 16
 M      = @$(call ECHO,$(TAG),$@);
17 17
 $(foreach VAR,$(BRIEF), \
... ...
@@ -26,15 +26,16 @@ ALLFFLIBS = avcodec avdevice avfilter avformat avresample avutil postproc swscal
26 26
 IFLAGS     := -I. -I$(SRC_PATH)/
27 27
 CPPFLAGS   := $(IFLAGS) $(CPPFLAGS)
28 28
 CFLAGS     += $(ECFLAGS)
29
-CCFLAGS     = $(CFLAGS)
29
+CCFLAGS     = $(CPPFLAGS) $(CFLAGS)
30
+ASFLAGS    := $(CPPFLAGS) $(ASFLAGS)
30 31
 CXXFLAGS   := $(CFLAGS) $(CXXFLAGS)
31 32
 YASMFLAGS  += $(IFLAGS) -I$(SRC_PATH)/libavutil/x86/ -Pconfig.asm
32
-HOSTCFLAGS += $(IFLAGS)
33
+HOSTCCFLAGS = $(IFLAGS) $(HOSTCFLAGS)
33 34
 LDFLAGS    := $(ALLFFLIBS:%=-Llib%) $(LDFLAGS)
34 35
 
35 36
 define COMPILE
36
-       $($(1)DEP)
37
-       $($(1)) $(CPPFLAGS) $($(1)FLAGS) $($(1)_DEPFLAGS) -c $($(1)_O) $<
37
+       $(call $(1)DEP,$(1))
38
+       $($(1)) $($(1)FLAGS) $($(1)_DEPFLAGS) -c $($(1)_O) $<
38 39
 endef
39 40
 
40 41
 COMPILE_C = $(call COMPILE,CC)
... ...
@@ -101,7 +102,7 @@ checkheaders: $(filter-out $(SKIPHEADERS:.h=.ho),$(ALLHEADERS:.h=.ho))
101 101
 alltools: $(TOOLS)
102 102
 
103 103
 $(HOSTOBJS): %.o: %.c
104
-	$(HOSTCC) $(HOSTCFLAGS) -c -o $@ $<
104
+	$(call COMPILE,HOSTCC)
105 105
 
106 106
 $(HOSTPROGS): %$(HOSTEXESUF): %.o
107 107
 	$(HOSTCC) $(HOSTLDFLAGS) -o $@ $< $(HOSTLIBS)
... ...
@@ -117,4 +118,4 @@ CLEANSUFFIXES     = *.d *.o *~ *.ho *.map *.ver *.gcno *.gcda
117 117
 DISTCLEANSUFFIXES = *.pc
118 118
 LIBSUFFIXES       = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a
119 119
 
120
--include $(wildcard $(OBJS:.o=.d) $(TESTOBJS:.o=.d))
120
+-include $(wildcard $(OBJS:.o=.d) $(HOSTOBJS:.o=.d) $(TESTOBJS:.o=.d))
... ...
@@ -265,6 +265,7 @@ Optimization options (experts only):
265 265
   --disable-sse            disable SSE optimizations
266 266
   --disable-ssse3          disable SSSE3 optimizations
267 267
   --disable-avx            disable AVX optimizations
268
+  --disable-fma4           disable FMA4 optimizations
268 269
   --disable-armv5te        disable armv5te optimizations
269 270
   --disable-armv6          disable armv6 optimizations
270 271
   --disable-armv6t2        disable armv6t2 optimizations
... ...
@@ -1173,6 +1174,7 @@ ARCH_EXT_LIST='
1173 1173
     armv6t2
1174 1174
     armvfp
1175 1175
     avx
1176
+    fma4
1176 1177
     mmi
1177 1178
     mmx
1178 1179
     mmx2
... ...
@@ -1336,7 +1338,7 @@ HAVE_LIST="
1336 1336
 
1337 1337
 # options emitted with CONFIG_ prefix but not available on command line
1338 1338
 CONFIG_EXTRA="
1339
-    aandct
1339
+    aandcttables
1340 1340
     avutil
1341 1341
     golomb
1342 1342
     gplv3
... ...
@@ -1450,6 +1452,7 @@ mmx2_deps="mmx"
1450 1450
 sse_deps="mmx"
1451 1451
 ssse3_deps="sse"
1452 1452
 avx_deps="ssse3"
1453
+fma4_deps="avx"
1453 1454
 
1454 1455
 aligned_stack_if_any="ppc x86"
1455 1456
 fast_64bit_if_any="alpha ia64 mips64 parisc64 ppc64 sparc64 x86_64"
... ...
@@ -1477,7 +1480,7 @@ ac3_fixed_encoder_select="mdct ac3dsp"
1477 1477
 alac_encoder_select="lpc"
1478 1478
 amrnb_decoder_select="lsp"
1479 1479
 amrwb_decoder_select="lsp"
1480
-amv_encoder_select="aandct"
1480
+amv_encoder_select="aandcttables"
1481 1481
 atrac1_decoder_select="mdct sinewin"
1482 1482
 atrac3_decoder_select="mdct"
1483 1483
 binkaudio_dct_decoder_select="mdct rdft dct sinewin"
... ...
@@ -1487,13 +1490,13 @@ cook_decoder_select="mdct sinewin"
1487 1487
 cscd_decoder_suggest="zlib"
1488 1488
 dca_decoder_select="mdct"
1489 1489
 dirac_decoder_select="dwt golomb"
1490
-dnxhd_encoder_select="aandct"
1490
+dnxhd_encoder_select="aandcttables"
1491 1491
 dxa_decoder_select="zlib"
1492 1492
 eac3_decoder_select="ac3_decoder"
1493 1493
 eac3_encoder_select="mdct ac3dsp"
1494
-eamad_decoder_select="aandct"
1495
-eatgq_decoder_select="aandct"
1496
-eatqi_decoder_select="aandct"
1494
+eamad_decoder_select="aandcttables"
1495
+eatgq_decoder_select="aandcttables"
1496
+eatqi_decoder_select="aandcttables"
1497 1497
 exr_decoder_select="zlib"
1498 1498
 ffv1_decoder_select="golomb"
1499 1499
 flac_decoder_select="golomb"
... ...
@@ -1505,9 +1508,9 @@ flashsv2_decoder_select="zlib"
1505 1505
 flv_decoder_select="h263_decoder"
1506 1506
 flv_encoder_select="h263_encoder"
1507 1507
 fraps_decoder_select="huffman"
1508
-h261_encoder_select="aandct"
1508
+h261_encoder_select="aandcttables"
1509 1509
 h263_decoder_select="h263_parser"
1510
-h263_encoder_select="aandct"
1510
+h263_encoder_select="aandcttables"
1511 1511
 h263_vaapi_hwaccel_select="vaapi h263_decoder"
1512 1512
 h263i_decoder_select="h263_decoder"
1513 1513
 h263p_encoder_select="h263_encoder"
... ...
@@ -1523,9 +1526,9 @@ iac_decoder_select="fft mdct sinewin"
1523 1523
 imc_decoder_select="fft mdct sinewin"
1524 1524
 jpegls_decoder_select="golomb"
1525 1525
 jpegls_encoder_select="golomb"
1526
-ljpeg_encoder_select="aandct"
1526
+ljpeg_encoder_select="aandcttables"
1527 1527
 loco_decoder_select="golomb"
1528
-mjpeg_encoder_select="aandct"
1528
+mjpeg_encoder_select="aandcttables"
1529 1529
 mlp_decoder_select="mlp_parser"
1530 1530
 mp1_decoder_select="mpegaudiodsp"
1531 1531
 mp1float_decoder_select="mpegaudiodsp"
... ...
@@ -1544,13 +1547,13 @@ mpeg_xvmc_decoder_deps="X11_extensions_XvMClib_h"
1544 1544
 mpeg_xvmc_decoder_select="mpegvideo_decoder"
1545 1545
 mpeg1_vdpau_decoder_select="vdpau mpeg1video_decoder"
1546 1546
 mpeg1_vdpau_hwaccel_select="vdpau mpeg1video_decoder"
1547
-mpeg1video_encoder_select="aandct"
1547
+mpeg1video_encoder_select="aandcttables"
1548 1548
 mpeg2_crystalhd_decoder_select="crystalhd"
1549 1549
 mpeg2_dxva2_hwaccel_deps="dxva2api_h"
1550 1550
 mpeg2_dxva2_hwaccel_select="dxva2 mpeg2video_decoder"
1551 1551
 mpeg2_vdpau_hwaccel_select="vdpau mpeg2video_decoder"
1552 1552
 mpeg2_vaapi_hwaccel_select="vaapi mpeg2video_decoder"
1553
-mpeg2video_encoder_select="aandct"
1553
+mpeg2video_encoder_select="aandcttables"
1554 1554
 mpeg4_crystalhd_decoder_select="crystalhd"
1555 1555
 mpeg4_decoder_select="h263_decoder mpeg4video_parser"
1556 1556
 mpeg4_encoder_select="h263_encoder"
... ...
@@ -1580,11 +1583,11 @@ rv40_decoder_select="golomb h264chroma h264pred h264qpel"
1580 1580
 shorten_decoder_select="golomb"
1581 1581
 sipr_decoder_select="lsp"
1582 1582
 snow_decoder_select="dwt"
1583
-snow_encoder_select="aandct dwt"
1583
+snow_encoder_select="aandcttables dwt"
1584 1584
 sonic_decoder_select="golomb"
1585 1585
 sonic_encoder_select="golomb"
1586 1586
 sonic_ls_encoder_select="golomb"
1587
-svq1_encoder_select="aandct"
1587
+svq1_encoder_select="aandcttables"
1588 1588
 svq3_decoder_select="golomb h264chroma h264dsp h264pred h264qpel"
1589 1589
 svq3_decoder_suggest="zlib"
1590 1590
 theora_decoder_select="vp3_decoder"
... ...
@@ -1965,6 +1968,8 @@ ldflags_filter=echo
1965 1965
 AS_O='-o $@'
1966 1966
 CC_O='-o $@'
1967 1967
 CXX_O='-o $@'
1968
+LD_O='-o $@'
1969
+HOSTCC_O='-o $@'
1968 1970
 
1969 1971
 host_cflags='-D_ISOC99_SOURCE -D_XOPEN_SOURCE=600 -O3 -g'
1970 1972
 host_libs='-lm'
... ...
@@ -1975,8 +1980,8 @@ target_path='$(CURDIR)'
1975 1975
 
1976 1976
 # since the object filename is not given with the -MM flag, the compiler
1977 1977
 # is only able to print the basename, and we must add the path ourselves
1978
-DEPEND_CMD='$(DEPCC) $(DEPFLAGS) $< | sed -e "/^\#.*/d" -e "s,^[[:space:]]*$(*F)\\.o,$(@D)/$(*F).o," > $(@:.o=.d)'
1979
-DEPFLAGS='$(CPPFLAGS) $(CFLAGS) -MM'
1978
+DEPCMD='$(DEP$(1)) $(DEP$(1)FLAGS) $($(1)DEP_FLAGS) $< | sed -e "/^\#.*/d" -e "s,^[[:space:]]*$(*F)\\.o,$(@D)/$(*F).o," > $(@:.o=.d)'
1979
+DEPFLAGS='-MM'
1980 1980
 
1981 1981
 # find source path
1982 1982
 if test -f configure; then
... ...
@@ -2319,120 +2324,150 @@ tms470_flags(){
2319 2319
     done
2320 2320
 }
2321 2321
 
2322
-if   $cc -v 2>&1 | grep -q '^gcc.*LLVM'; then
2323
-    cc_type=llvm_gcc
2324
-    gcc_extra_ver=$(expr "$($cc --version | head -n1)" : '.*\((.*)\)')
2325
-    cc_ident="llvm-gcc $($cc -dumpversion) $gcc_extra_ver"
2326
-    CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2327
-    AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2328
-    cflags_speed='-O3'
2329
-    cflags_size='-Os'
2330
-elif $cc -v 2>&1 | grep -qi ^gcc; then
2331
-    cc_type=gcc
2332
-    gcc_version=$($cc --version | head -n1)
2333
-    gcc_basever=$($cc -dumpversion)
2334
-    gcc_pkg_ver=$(expr "$gcc_version" : '[^ ]* \(([^)]*)\)')
2335
-    gcc_ext_ver=$(expr "$gcc_version" : ".*$gcc_pkg_ver $gcc_basever \\(.*\\)")
2336
-    cc_ident=$(cleanws "gcc $gcc_basever $gcc_pkg_ver $gcc_ext_ver")
2337
-    if ! $cc -dumpversion | grep -q '^2\.'; then
2338
-        CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2339
-        AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2322
+probe_cc(){
2323
+    pfx=$1
2324
+    _cc=$2
2325
+
2326
+    unset _type _ident _cc_o _flags _cflags _ldflags _depflags _DEPCMD _DEPFLAGS
2327
+    _flags_filter=echo
2328
+
2329
+    if $_cc -v 2>&1 | grep -q '^gcc.*LLVM'; then
2330
+        _type=llvm_gcc
2331
+        gcc_extra_ver=$(expr "$($_cc --version | head -n1)" : '.*\((.*)\)')
2332
+        _ident="llvm-gcc $($_cc -dumpversion) $gcc_extra_ver"
2333
+        _depflags='-MMD -MF $(@:.o=.d) -MT $@'
2334
+        _cflags_speed='-O3'
2335
+        _cflags_size='-Os'
2336
+    elif $_cc -v 2>&1 | grep -qi ^gcc; then
2337
+        _type=gcc
2338
+        gcc_version=$($_cc --version | head -n1)
2339
+        gcc_basever=$($_cc -dumpversion)
2340
+        gcc_pkg_ver=$(expr "$gcc_version" : '[^ ]* \(([^)]*)\)')
2341
+        gcc_ext_ver=$(expr "$gcc_version" : ".*$gcc_pkg_ver $gcc_basever \\(.*\\)")
2342
+        _ident=$(cleanws "gcc $gcc_basever $gcc_pkg_ver $gcc_ext_ver")
2343
+        if ! $_cc -dumpversion | grep -q '^2\.'; then
2344
+            _depflags='-MMD -MF $(@:.o=.d) -MT $@'
2345
+        fi
2346
+        _cflags_speed='-O3'
2347
+        _cflags_size='-Os'
2348
+    elif $_cc --version 2>/dev/null | grep -q Intel; then
2349
+        _type=icc
2350
+        _ident=$($_cc --version | head -n1)
2351
+        _depflags='-MMD'
2352
+        _cflags_speed='-O3'
2353
+        _cflags_size='-Os'
2354
+        _cflags_noopt='-O1'
2355
+    elif $_cc -v 2>&1 | grep -q xlc; then
2356
+        _type=xlc
2357
+        _ident=$($_cc -qversion 2>/dev/null | head -n1)
2358
+        _cflags_speed='-O5'
2359
+        _cflags_size='-O5 -qcompact'
2360
+    elif $_cc -V 2>/dev/null | grep -q Compaq; then
2361
+        _type=ccc
2362
+        _ident=$($_cc -V | head -n1 | cut -d' ' -f1-3)
2363
+        _DEPFLAGS='-M'
2364
+        debuglevel=3
2365
+        _ldflags='-Wl,-z,now' # calls to libots crash without this
2366
+        _cflags_speed='-fast'
2367
+        _cflags_size='-O1'
2368
+    elif $_cc --vsn 2>/dev/null | grep -q "ARM C/C++ Compiler"; then
2369
+        test -d "$sysroot" || die "No valid sysroot specified."
2370
+        _type=armcc
2371
+        _ident=$($_cc --vsn | head -n1)
2372
+        armcc_conf="$PWD/armcc.conf"
2373
+        $_cc --arm_linux_configure                 \
2374
+             --arm_linux_config_file="$armcc_conf" \
2375
+             --configure_sysroot="$sysroot"        \
2376
+             --configure_cpp_headers="$sysinclude" >>$logfile 2>&1 ||
2377
+             die "Error creating armcc configuration file."
2378
+        $_cc --vsn | grep -q RVCT && armcc_opt=rvct || armcc_opt=armcc
2379
+        _flags="--arm_linux_config_file=$armcc_conf --translate_gcc"
2380
+        as_default="${cross_prefix}gcc"
2381
+        _depflags='-MMD'
2382
+        _cflags_speed='-O3'
2383
+        _cflags_size='-Os'
2384
+    elif $_cc -version 2>/dev/null | grep -q TMS470; then
2385
+        _type=tms470
2386
+        _ident=$($_cc -version | head -n1 | tr -s ' ')
2387
+        _flags='--gcc --abi=eabi -me'
2388
+        _cflags='-D__gnuc_va_list=va_list -D__USER_LABEL_PREFIX__='
2389
+        _cc_o='-fe=$@'
2390
+        as_default="${cross_prefix}gcc"
2391
+        ld_default="${cross_prefix}gcc"
2392
+        _depflags='-ppa -ppd=$(@:.o=.d)'
2393
+        _cflags_speed='-O3 -mf=5'
2394
+        _cflags_size='-O3 -mf=2'
2395
+        _flags_filter=tms470_flags
2396
+    elif $_cc -v 2>&1 | grep -q clang; then
2397
+        _type=clang
2398
+        _ident=$($_cc --version | head -n1)
2399
+        _depflags='-MMD'
2400
+        _cflags_speed='-O3'
2401
+        _cflags_size='-Os'
2402
+    elif $_cc -V 2>&1 | grep -q Sun; then
2403
+        _type=suncc
2404
+        _ident=$($_cc -V 2>&1 | head -n1 | cut -d' ' -f 2-)
2405
+        _DEPCMD='$(DEP$(1)) $(DEP$(1)FLAGS) $($(1)DEP_FLAGS) $< | sed -e "1s,^.*: ,$@: ," -e "\$$!s,\$$, \\\," -e "1!s,^.*: , ," > $(@:.o=.d)'
2406
+        _DEPFLAGS='-xM1'
2407
+        _ldflags='-std=c99'
2408
+        _cflags_speed='-O5'
2409
+        _cflags_size='-O5 -xspace'
2410
+        _flags_filter=suncc_flags
2411
+    elif $_cc -v 2>&1 | grep -q 'PathScale\|Path64'; then
2412
+        _type=pathscale
2413
+        _ident=$($_cc -v 2>&1 | head -n1 | tr -d :)
2414
+        _depflags='-MMD -MF $(@:.o=.d) -MT $@'
2415
+        _cflags_speed='-O2'
2416
+        _cflags_size='-Os'
2417
+        _flags_filter='filter_out -Wdisabled-optimization'
2418
+    elif $_cc -v 2>&1 | grep -q Open64; then
2419
+        _type=open64
2420
+        _ident=$($_cc -v 2>&1 | head -n1 | tr -d :)
2421
+        _depflags='-MMD -MF $(@:.o=.d) -MT $@'
2422
+        _cflags_speed='-O2'
2423
+        _cflags_size='-Os'
2424
+        _flags_filter='filter_out -Wdisabled-optimization|-Wtype-limits|-fno-signed-zeros'
2425
+    elif $_cc -V 2>&1 | grep -q Portland; then
2426
+        _type=pgi
2427
+        _ident="PGI $($_cc -V 2>&1 | awk '/^pgcc/ { print $2; exit }')"
2428
+        opt_common='-alias=ansi -Mlre -Mpre'
2429
+        _cflags_speed="-O3 -Mautoinline -Munroll=c:4 $opt_common"
2430
+        _cflags_size="-O2 -Munroll=c:1 $opt_common"
2431
+        _cflags_noopt="-O1"
2432
+        _flags_filter=pgi_flags
2340 2433
     fi
2341
-    cflags_speed='-O3'
2342
-    cflags_size='-Os'
2343
-elif $cc --version 2>/dev/null | grep -q Intel; then
2344
-    cc_type=icc
2345
-    cc_ident=$($cc --version | head -n1)
2346
-    CC_DEPFLAGS='-MMD'
2347
-    AS_DEPFLAGS='-MMD'
2348
-    cflags_speed='-O3'
2349
-    cflags_size='-Os'
2350
-    cflags_noopt='-O1'
2351
-elif $cc -v 2>&1 | grep -q xlc; then
2352
-    cc_type=xlc
2353
-    cc_ident=$($cc -qversion 2>/dev/null | head -n1)
2354
-    cflags_speed='-O5'
2355
-    cflags_size='-O5 -qcompact'
2356
-elif $cc -V 2>/dev/null | grep -q Compaq; then
2357
-    cc_type=ccc
2358
-    cc_ident=$($cc -V | head -n1 | cut -d' ' -f1-3)
2359
-    DEPFLAGS='$(CPPFLAGS) $(CFLAGS) -M'
2360
-    debuglevel=3
2361
-    add_ldflags -Wl,-z,now # calls to libots crash without this
2362
-    cflags_speed='-fast'
2363
-    cflags_size='-O1'
2364
-elif $cc --vsn 2>/dev/null | grep -q "ARM C/C++ Compiler"; then
2365
-    test -d "$sysroot" || die "No valid sysroot specified."
2366
-    cc_type=armcc
2367
-    cc_ident=$($cc --vsn | head -n1)
2368
-    armcc_conf="$PWD/armcc.conf"
2369
-    $cc --arm_linux_configure                 \
2370
-        --arm_linux_config_file="$armcc_conf" \
2371
-        --configure_sysroot="$sysroot"        \
2372
-        --configure_cpp_headers="$sysinclude" >>$logfile 2>&1 ||
2373
-        die "Error creating armcc configuration file."
2374
-    $cc --vsn | grep -q RVCT && armcc_opt=rvct || armcc_opt=armcc
2375
-    cc="$cc --arm_linux_config_file=$armcc_conf --translate_gcc"
2376
-    as_default="${cross_prefix}gcc"
2377
-    CC_DEPFLAGS='-MMD'
2378
-    AS_DEPFLAGS='-MMD'
2379
-    cflags_speed='-O3'
2380
-    cflags_size='-Os'
2381
-    asflags_filter="filter_out -W${armcc_opt}*"
2382
-elif $cc -version 2>/dev/null | grep -q TMS470; then
2383
-    cc_type=tms470
2384
-    cc_ident=$($cc -version | head -n1 | tr -s ' ')
2385
-    cc="$cc --gcc --abi=eabi -me"
2386
-    CC_O='-fe=$@'
2387
-    as_default="${cross_prefix}gcc"
2388
-    ld_default="${cross_prefix}gcc"
2389
-    add_cflags -D__gnuc_va_list=va_list -D__USER_LABEL_PREFIX__=
2390
-    CC_DEPFLAGS='-ppa -ppd=$(@:.o=.d)'
2391
-    AS_DEPFLAGS='-MMD'
2392
-    cflags_speed='-O3 -mf=5'
2393
-    cflags_size='-O3 -mf=2'
2394
-    cflags_filter=tms470_flags
2395
-elif $cc -v 2>&1 | grep -q clang; then
2396
-    cc_type=clang
2397
-    cc_ident=$($cc --version | head -n1)
2398
-    CC_DEPFLAGS='-MMD'
2399
-    AS_DEPFLAGS='-MMD'
2400
-    cflags_speed='-O3'
2401
-    cflags_size='-Os'
2402
-elif $cc -V 2>&1 | grep -q Sun; then
2403
-    cc_type=suncc
2404
-    cc_ident=$($cc -V 2>&1 | head -n1 | cut -d' ' -f 2-)
2405
-    DEPEND_CMD='$(DEPCC) $(DEPFLAGS) $< | sed -e "1s,^.*: ,$@: ," -e "\$$!s,\$$, \\\," -e "1!s,^.*: , ," > $(@:.o=.d)'
2406
-    DEPFLAGS='$(CPPFLAGS) $(CFLAGS) -xM1'
2407
-    add_ldflags -xc99
2408
-    cflags_speed='-O5'
2409
-    cflags_size='-O5 -xspace'
2410
-    cflags_filter=suncc_flags
2411
-elif $cc -v 2>&1 | grep -q 'PathScale\|Path64'; then
2412
-    cc_type=pathscale
2413
-    cc_ident=$($cc -v 2>&1 | head -n1 | tr -d :)
2414
-    CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2415
-    AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2416
-    cflags_speed='-O2'
2417
-    cflags_size='-Os'
2418
-    cflags_filter='filter_out -Wdisabled-optimization'
2419
-elif $cc -v 2>&1 | grep -q Open64; then
2420
-    cc_type=open64
2421
-    cc_ident=$($cc -v 2>&1 | head -n1 | tr -d :)
2422
-    CC_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2423
-    AS_DEPFLAGS='-MMD -MF $(@:.o=.d) -MT $@'
2424
-    cflags_speed='-O2'
2425
-    cflags_size='-Os'
2426
-    cflags_filter='filter_out -Wdisabled-optimization|-Wtype-limits|-fno-signed-zeros'
2427
-elif $cc -V 2>&1 | grep -q Portland; then
2428
-    cc_type=pgi
2429
-    cc_ident="PGI $($cc -V 2>&1 | awk '/^pgcc/ { print $2; exit }')"
2430
-    opt_common='-alias=ansi -Mlre -Mpre'
2431
-    cflags_speed="-O3 -Mautoinline -Munroll=c:4 $opt_common"
2432
-    cflags_size="-O2 -Munroll=c:1 $opt_common"
2433
-    cflags_noopt="-O1"
2434
-    cflags_filter=pgi_flags
2435
-fi
2434
+
2435
+    eval ${pfx}_type=\$_type
2436
+    eval ${pfx}_ident=\$_ident
2437
+}
2438
+
2439
+set_ccvars(){
2440
+    eval ${1}_O=\${_cc_o-\${${1}_O}}
2441
+
2442
+    if [ -n "$_depflags" ]; then
2443
+        eval ${1}_DEPFLAGS=\$_depflags
2444
+    else
2445
+        eval ${1}DEP=\${_DEPCMD:-\$DEPCMD}
2446
+        eval ${1}DEP_FLAGS=\${_DEPFLAGS:-\$DEPFLAGS}
2447
+        eval DEP${1}FLAGS=\$_flags
2448
+    fi
2449
+}
2450
+
2451
+probe_cc cc "$cc"
2452
+cflags_filter=$_flags_filter
2453
+cflags_speed=$_cflags_speed
2454
+cflags_size=$_cflags_size
2455
+cflags_noopt=$_cflags_noopt
2456
+add_cflags $_flags $_cflags
2457
+cc_ldflags=$_ldflags
2458
+set_ccvars CC
2459
+
2460
+probe_cc hostcc "$host_cc"
2461
+host_cflags_filter=$_flags_filter
2462
+host_ldflags_filter=$_flags_filter
2463
+add_host_cflags  $_flags $_cflags
2464
+add_host_ldflags $_flags $_ldflags
2465
+set_ccvars HOSTCC
2436 2466
 
2437 2467
 test -n "$cc_type" && enable $cc_type ||
2438 2468
     warn "Unknown C compiler $cc, unable to select optimal CFLAGS"
... ...
@@ -2442,9 +2477,23 @@ test -n "$cc_type" && enable $cc_type ||
2442 2442
 : ${ld_default:=$cc}
2443 2443
 set_default ar as dep_cc ld
2444 2444
 
2445
-test -n "$CC_DEPFLAGS" || CCDEP=$DEPEND_CMD
2446
-test -n "$CXX_DEPFLAGS" || CXXDEP=$DEPEND_CMD
2447
-test -n "$AS_DEPFLAGS" || ASDEP=$DEPEND_CMD
2445
+probe_cc as "$as"
2446
+asflags_filter=$_flags_filter
2447
+add_asflags $_flags $_cflags
2448
+set_ccvars AS
2449
+
2450
+probe_cc ld "$ld"
2451
+ldflags_filter=$_flags_filter
2452
+add_ldflags $_flags $_ldflags
2453
+test "$cc_type" != "$ld_type" && add_ldflags $cc_ldflags
2454
+LD_O=${_cc_o-$LD_O}
2455
+
2456
+if [ -z "$CC_DEPFLAGS" ] && [ "$dep_cc" != "$cc" ]; then
2457
+    probe_cc depcc "$dep_cc"
2458
+    CCDEP=${_DEPCMD:-$DEPCMD}
2459
+    CCDEP_FLAGS=${_DEPFLAGS:=$DEPFLAGS}
2460
+    DEPCCFLAGS=$_flags
2461
+fi
2448 2462
 
2449 2463
 add_cflags $extra_cflags
2450 2464
 add_cxxflags $extra_cxxflags
... ...
@@ -3140,6 +3189,7 @@ EOF
3140 3140
         check_yasm "pextrd [eax], xmm0, 1" && enable yasm ||
3141 3141
             die "yasm not found, use --disable-yasm for a crippled build"
3142 3142
         check_yasm "vextractf128 xmm0, ymm0, 0" || disable avx
3143
+        check_yasm "vfmaddps ymm0, ymm1, ymm2, ymm3" || disable fma4
3143 3144
     fi
3144 3145
 
3145 3146
     case "$cpu" in
... ...
@@ -3673,6 +3723,7 @@ if enabled x86; then
3673 3673
     echo "SSE enabled               ${sse-no}"
3674 3674
     echo "SSSE3 enabled             ${ssse3-no}"
3675 3675
     echo "AVX enabled               ${avx-no}"
3676
+    echo "FMA4 enabled              ${fma4-no}"
3676 3677
     echo "CMOV enabled              ${cmov-no}"
3677 3678
     echo "CMOV is fast              ${fast_cmov-no}"
3678 3679
     echo "EBX available             ${ebx_available-no}"
... ...
@@ -3814,6 +3865,9 @@ CXX=$cxx
3814 3814
 AS=$as
3815 3815
 LD=$ld
3816 3816
 DEPCC=$dep_cc
3817
+DEPCCFLAGS=$DEPCCFLAGS \$(CPPFLAGS)
3818
+DEPAS=$as
3819
+DEPASFLAGS=$DEPASFLAGS \$(CPPFLAGS)
3817 3820
 YASM=$yasmexe
3818 3821
 YASMDEP=$yasmexe
3819 3822
 AR=$ar
... ...
@@ -3825,9 +3879,10 @@ CPPFLAGS=$CPPFLAGS
3825 3825
 CFLAGS=$CFLAGS
3826 3826
 CXXFLAGS=$CXXFLAGS
3827 3827
 ASFLAGS=$ASFLAGS
3828
-AS_O=$CC_O
3828
+AS_O=$AS_O
3829 3829
 CC_O=$CC_O
3830 3830
 CXX_O=$CXX_O
3831
+LD_O=$LD_O
3831 3832
 LDFLAGS=$LDFLAGS
3832 3833
 FFSERVERLDFLAGS=$FFSERVERLDFLAGS
3833 3834
 SHFLAGS=$SHFLAGS
... ...
@@ -3842,10 +3897,11 @@ SLIBPREF=$SLIBPREF
3842 3842
 SLIBSUF=$SLIBSUF
3843 3843
 EXESUF=$EXESUF
3844 3844
 EXTRA_VERSION=$extra_version
3845
-DEPFLAGS=$DEPFLAGS
3846 3845
 CCDEP=$CCDEP
3847 3846
 CXXDEP=$CXXDEP
3847
+CCDEP_FLAGS=$CCDEP_FLAGS
3848 3848
 ASDEP=$ASDEP
3849
+ASDEP_FLAGS=$ASDEP_FLAGS
3849 3850
 CC_DEPFLAGS=$CC_DEPFLAGS
3850 3851
 AS_DEPFLAGS=$AS_DEPFLAGS
3851 3852
 HOSTCC=$host_cc
... ...
@@ -3853,6 +3909,12 @@ HOSTCFLAGS=$host_cflags
3853 3853
 HOSTEXESUF=$HOSTEXESUF
3854 3854
 HOSTLDFLAGS=$host_ldflags
3855 3855
 HOSTLIBS=$host_libs
3856
+DEPHOSTCC=$host_cc
3857
+DEPHOSTCCFLAGS=$DEPHOSTCCFLAGS \$(HOSTCCFLAGS)
3858
+HOSTCCDEP=$HOSTCCDEP
3859
+HOSTCCDEP_FLAGS=$HOSTCCDEP_FLAGS
3860
+HOSTCC_DEPFLAGS=$HOSTCC_DEPFLAGS
3861
+HOSTCC_O=$HOSTCC_O
3856 3862
 TARGET_EXEC=$target_exec
3857 3863
 TARGET_PATH=$target_path
3858 3864
 SDL_LIBS=$sdl_libs
... ...
@@ -28,8 +28,6 @@ doc/%.txt: doc/%.texi
28 28
 	$(Q)$(TEXIDEP)
29 29
 	$(M)makeinfo --force --no-headers -o $@ $< 2>/dev/null
30 30
 
31
-doc/print_options.o: libavformat/options_table.h libavcodec/options_table.h
32
-
33 31
 GENTEXI  = format codec
34 32
 GENTEXI := $(GENTEXI:%=doc/avoptions_%.texi)
35 33
 
... ...
@@ -32,7 +32,7 @@ OBJS = allcodecs.o                                                      \
32 32
        utils.o                                                          \
33 33
 
34 34
 # parts needed for many different codecs
35
-OBJS-$(CONFIG_AANDCT)                  += aandcttab.o
35
+OBJS-$(CONFIG_AANDCTTABLES)            += aandcttab.o
36 36
 OBJS-$(CONFIG_AC3DSP)                  += ac3dsp.o
37 37
 OBJS-$(CONFIG_CRYSTALHD)               += crystalhd.o
38 38
 OBJS-$(CONFIG_ENCODERS)                += faandct.o jfdctfst.o jfdctint.o
... ...
@@ -200,6 +200,7 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
200 200
                            int lpc_order, int lpc_quant)
201 201
 {
202 202
     int i;
203
+    int32_t *pred = buffer_out;
203 204
 
204 205
     /* first sample always copies */
205 206
     *buffer_out = *error_buffer;
... ...
@@ -223,37 +224,35 @@ static void lpc_prediction(int32_t *error_buffer, int32_t *buffer_out,
223 223
     }
224 224
 
225 225
     /* read warm-up samples */
226
-    for (i = 0; i < lpc_order; i++) {
227
-        buffer_out[i + 1] = sign_extend(buffer_out[i] + error_buffer[i + 1],
228
-                                        bps);
229
-    }
226
+    for (i = 1; i <= lpc_order; i++)
227
+        buffer_out[i] = sign_extend(buffer_out[i - 1] + error_buffer[i], bps);
230 228
 
231 229
     /* NOTE: 4 and 8 are very common cases that could be optimized. */
232 230
 
233
-    for (i = lpc_order; i < nb_samples - 1; i++) {
231
+    for (; i < nb_samples; i++) {
234 232
         int j;
235 233
         int val = 0;
236
-        int error_val = error_buffer[i + 1];
234
+        int error_val = error_buffer[i];
237 235
         int error_sign;
238
-        int d = buffer_out[i - lpc_order];
236
+        int d = *pred++;
239 237
 
240 238
         /* LPC prediction */
241 239
         for (j = 0; j < lpc_order; j++)
242
-            val += (buffer_out[i - j] - d) * lpc_coefs[j];
240
+            val += (pred[j] - d) * lpc_coefs[j];
243 241
         val = (val + (1 << (lpc_quant - 1))) >> lpc_quant;
244 242
         val += d + error_val;
245
-        buffer_out[i + 1] = sign_extend(val, bps);
243
+        buffer_out[i] = sign_extend(val, bps);
246 244
 
247 245
         /* adapt LPC coefficients */
248 246
         error_sign = sign_only(error_val);
249 247
         if (error_sign) {
250
-            for (j = lpc_order - 1; j >= 0 && error_val * error_sign > 0; j--) {
248
+            for (j = 0; j < lpc_order && error_val * error_sign > 0; j++) {
251 249
                 int sign;
252
-                val  = d - buffer_out[i - j];
250
+                val  = d - pred[j];
253 251
                 sign = sign_only(val) * error_sign;
254 252
                 lpc_coefs[j] -= sign;
255 253
                 val *= sign;
256
-                error_val -= (val >> lpc_quant) * (lpc_order - j);
254
+                error_val -= (val >> lpc_quant) * (j + 1);
257 255
             }
258 256
         }
259 257
     }
... ...
@@ -356,7 +355,7 @@ static int decode_element(AVCodecContext *avctx, void *data, int ch_index,
356 356
             lpc_order[ch]         = get_bits(&alac->gb, 5);
357 357
 
358 358
             /* read the predictor table */
359
-            for (i = 0; i < lpc_order[ch]; i++)
359
+            for (i = lpc_order[ch] - 1; i >= 0; i--)
360 360
                 lpc_coefs[ch][i] = get_sbits(&alac->gb, 16);
361 361
         }
362 362
 
... ...
@@ -477,16 +476,19 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
477 477
     ALACContext *alac = avctx->priv_data;
478 478
     enum RawDataBlockType element;
479 479
     int channels;
480
-    int ch, ret;
480
+    int ch, ret, got_end;
481 481
 
482 482
     init_get_bits(&alac->gb, avpkt->data, avpkt->size * 8);
483 483
 
484
+    got_end = 0;
484 485
     alac->nb_samples = 0;
485 486
     ch = 0;
486
-    while (get_bits_left(&alac->gb)) {
487
+    while (get_bits_left(&alac->gb) >= 3) {
487 488
         element = get_bits(&alac->gb, 3);
488
-        if (element == TYPE_END)
489
+        if (element == TYPE_END) {
490
+            got_end = 1;
489 491
             break;
492
+        }
490 493
         if (element > TYPE_CPE && element != TYPE_LFE) {
491 494
             av_log(avctx, AV_LOG_ERROR, "syntax element unsupported: %d", element);
492 495
             return AVERROR_PATCHWELCOME;
... ...
@@ -501,11 +503,15 @@ static int alac_decode_frame(AVCodecContext *avctx, void *data,
501 501
         ret = decode_element(avctx, data,
502 502
                              alac_channel_layout_offsets[alac->channels - 1][ch],
503 503
                              channels);
504
-        if (ret < 0)
504
+        if (ret < 0 && get_bits_left(&alac->gb))
505 505
             return ret;
506 506
 
507 507
         ch += channels;
508 508
     }
509
+    if (!got_end) {
510
+        av_log(avctx, AV_LOG_ERROR, "no end tag found. incomplete packet.\n");
511
+        return AVERROR_INVALIDDATA;
512
+    }
509 513
 
510 514
     if (avpkt->size * 8 - get_bits_count(&alac->gb) > 8) {
511 515
         av_log(avctx, AV_LOG_ERROR, "Error : %d bits left\n",
... ...
@@ -298,8 +298,8 @@ static int tscc2_decode_frame(AVCodecContext *avctx, void *data,
298 298
         if (!size) {
299 299
             int skip_row = 1, j, off = i * c->mb_width;
300 300
             for (j = 0; j < c->mb_width; j++) {
301
-                if (c->slice_quants[off + i] == 1 ||
302
-                    c->slice_quants[off + i] == 2) {
301
+                if (c->slice_quants[off + j] == 1 ||
302
+                    c->slice_quants[off + j] == 2) {
303 303
                     skip_row = 0;
304 304
                     break;
305 305
                 }
... ...
@@ -1158,12 +1158,7 @@ ALIGN 16
1158 1158
     add     src1q, 2*mmsize
1159 1159
     sub     lenq,  2*mmsize
1160 1160
     jge     .loop
1161
-%if mmsize == 32
1162
-    vzeroupper
1163
-    RET
1164
-%else
1165 1161
     REP_RET
1166
-%endif
1167 1162
 %endmacro
1168 1163
 
1169 1164
 INIT_XMM sse
... ...
@@ -1193,12 +1188,7 @@ ALIGN 16
1193 1193
 
1194 1194
     sub     lenq,   2*mmsize
1195 1195
     jge     .loop
1196
-%if mmsize == 32
1197
-    vzeroupper
1198
-    RET
1199
-%else
1200 1196
     REP_RET
1201
-%endif
1202 1197
 %endmacro
1203 1198
 
1204 1199
 INIT_XMM sse
... ...
@@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
1243 1243
 %endif
1244 1244
     add       lenq, mmsize
1245 1245
     jl .loop
1246
-%if mmsize == 32
1247
-    vzeroupper
1248
-    RET
1249
-%endif
1250 1246
 .end:
1251 1247
     REP_RET
1252 1248
 %endmacro
... ...
@@ -750,9 +750,6 @@ section .text
750 750
 ; The others pass args in registers and don't spill anything.
751 751
 cglobal fft_dispatch%2, 2,5,8, z, nbits
752 752
     FFT_DISPATCH fullsuffix, nbits
753
-%if mmsize == 32
754
-    vzeroupper
755
-%endif
756 753
     RET
757 754
 %endmacro ; DECL_FFT
758 755
 
... ...
@@ -958,9 +955,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
958 958
 %if ARCH_X86_64 == 0
959 959
     add esp, 12
960 960
 %endif
961
-%if mmsize == 32
962
-    vzeroupper
963
-%endif
964 961
     RET
965 962
 %endmacro
966 963
 
... ...
@@ -36,6 +36,8 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
36 36
 
37 37
 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
38 38
 
39
+#if HAVE_INLINE_ASM
40
+
39 41
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
40 42
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
41 43
 
... ...
@@ -178,6 +180,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out,
178 178
     *out = sum;
179 179
 }
180 180
 
181
+#endif /* HAVE_INLINE_ASM */
181 182
 
182 183
 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
183 184
 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
... ...
@@ -241,9 +244,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s)
241 241
         }
242 242
     }
243 243
 
244
+#if HAVE_INLINE_ASM
244 245
     if (mm_flags & AV_CPU_FLAG_SSE2) {
245 246
         s->apply_window_float = apply_window_mp3;
246 247
     }
248
+#endif /* HAVE_INLINE_ASM */
247 249
 #if HAVE_YASM
248 250
     if (0) {
249 251
 #if HAVE_AVX
... ...
@@ -83,8 +83,7 @@ section .text align=16
83 83
 
84 84
 ; %1 = row or col (for rounding variable)
85 85
 ; %2 = number of bits to shift at the end
86
-; %3 = optimization
87
-%macro IDCT_1D 3
86
+%macro IDCT_1D 2
88 87
     ; a0 = (W4 * row[0]) + (1 << (15 - 1));
89 88
     ; a1 = a0;
90 89
     ; a2 = a0;
... ...
@@ -235,8 +234,8 @@ section .text align=16
235 235
 
236 236
 ; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
237 237
 ;                               DCTELEM *block, const int16_t *qmat);
238
-%macro idct_put_fn 2
239
-cglobal prores_idct_put_10_%1, 4, 4, %2
238
+%macro idct_put_fn 1
239
+cglobal prores_idct_put_10, 4, 4, %1
240 240
     movsxd      r1,  r1d
241 241
     pxor        m15, m15           ; zero
242 242
 
... ...
@@ -252,7 +251,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
252 252
     pmullw      m13,[r3+64]
253 253
     pmullw      m12,[r3+96]
254 254
 
255
-    IDCT_1D     row, 15,  %1
255
+    IDCT_1D     row, 15
256 256
 
257 257
     ; transpose for second part of IDCT
258 258
     TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
... ...
@@ -267,7 +266,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
267 267
 
268 268
     ; for (i = 0; i < 8; i++)
269 269
     ;     idctSparseColAdd(dest + i, line_size, block + i);
270
-    IDCT_1D     col, 18,  %1
270
+    IDCT_1D     col, 18
271 271
 
272 272
     ; clip/store
273 273
     mova        m3, [pw_4]
... ...
@@ -302,13 +301,27 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
302 302
     RET
303 303
 %endmacro
304 304
 
305
-INIT_XMM
306
-idct_put_fn sse2, 16
307
-INIT_XMM
308
-idct_put_fn sse4, 16
305
+%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp
306
+%if cpuflag(sse4)
307
+    movhlps     %2,  %1
308
+    pmovsxwd    %1,  %1
309
+    pmovsxwd    %2,  %2
310
+%else ; sse2
311
+    pxor        %3,  %3
312
+    pcmpgtw     %3,  %1
313
+    mova        %2,  %1
314
+    punpcklwd   %1,  %3
315
+    punpckhwd   %2,  %3
316
+%endif
317
+%endmacro
318
+
319
+INIT_XMM sse2
320
+idct_put_fn 16
321
+INIT_XMM sse4
322
+idct_put_fn 16
309 323
 %if HAVE_AVX
310
-INIT_AVX
311
-idct_put_fn avx,  16
324
+INIT_XMM avx
325
+idct_put_fn 16
312 326
 %endif
313 327
 
314 328
 %endif
... ...
@@ -578,11 +578,44 @@ static void swap_samplerates(AVFilterGraph *graph)
578 578
         swap_samplerates_on_filter(graph->filters[i]);
579 579
 }
580 580
 
581
+#define CH_CENTER_PAIR (AV_CH_FRONT_LEFT_OF_CENTER | AV_CH_FRONT_RIGHT_OF_CENTER)
582
+#define CH_FRONT_PAIR  (AV_CH_FRONT_LEFT           | AV_CH_FRONT_RIGHT)
583
+#define CH_STEREO_PAIR (AV_CH_STEREO_LEFT          | AV_CH_STEREO_RIGHT)
584
+#define CH_WIDE_PAIR   (AV_CH_WIDE_LEFT            | AV_CH_WIDE_RIGHT)
585
+#define CH_SIDE_PAIR   (AV_CH_SIDE_LEFT            | AV_CH_SIDE_RIGHT)
586
+#define CH_DIRECT_PAIR (AV_CH_SURROUND_DIRECT_LEFT | AV_CH_SURROUND_DIRECT_RIGHT)
587
+#define CH_BACK_PAIR   (AV_CH_BACK_LEFT            | AV_CH_BACK_RIGHT)
588
+
589
+/* allowable substitutions for channel pairs when comparing layouts,
590
+ * ordered by priority for both values */
591
+static const uint64_t ch_subst[][2] = {
592
+    { CH_FRONT_PAIR,      CH_CENTER_PAIR     },
593
+    { CH_FRONT_PAIR,      CH_WIDE_PAIR       },
594
+    { CH_FRONT_PAIR,      AV_CH_FRONT_CENTER },
595
+    { CH_CENTER_PAIR,     CH_FRONT_PAIR      },
596
+    { CH_CENTER_PAIR,     CH_WIDE_PAIR       },
597
+    { CH_CENTER_PAIR,     AV_CH_FRONT_CENTER },
598
+    { CH_WIDE_PAIR,       CH_FRONT_PAIR      },
599
+    { CH_WIDE_PAIR,       CH_CENTER_PAIR     },
600
+    { CH_WIDE_PAIR,       AV_CH_FRONT_CENTER },
601
+    { AV_CH_FRONT_CENTER, CH_FRONT_PAIR      },
602
+    { AV_CH_FRONT_CENTER, CH_CENTER_PAIR     },
603
+    { AV_CH_FRONT_CENTER, CH_WIDE_PAIR       },
604
+    { CH_SIDE_PAIR,       CH_DIRECT_PAIR     },
605
+    { CH_SIDE_PAIR,       CH_BACK_PAIR       },
606
+    { CH_SIDE_PAIR,       AV_CH_BACK_CENTER  },
607
+    { CH_BACK_PAIR,       CH_DIRECT_PAIR     },
608
+    { CH_BACK_PAIR,       CH_SIDE_PAIR       },
609
+    { CH_BACK_PAIR,       AV_CH_BACK_CENTER  },
610
+    { AV_CH_BACK_CENTER,  CH_BACK_PAIR       },
611
+    { AV_CH_BACK_CENTER,  CH_DIRECT_PAIR     },
612
+    { AV_CH_BACK_CENTER,  CH_SIDE_PAIR       },
613
+};
614
+
581 615
 static void swap_channel_layouts_on_filter(AVFilterContext *filter)
582 616
 {
583 617
     AVFilterLink *link = NULL;
584
-    uint64_t chlayout;
585
-    int i, j;
618
+    int i, j, k;
586 619
 
587 620
     for (i = 0; i < filter->nb_inputs; i++) {
588 621
         link = filter->inputs[i];
... ...
@@ -594,27 +627,55 @@ static void swap_channel_layouts_on_filter(AVFilterContext *filter)
594 594
     if (i == filter->nb_inputs)
595 595
         return;
596 596
 
597
-    chlayout = link->out_channel_layouts->channel_layouts[0];
598
-
599 597
     for (i = 0; i < filter->nb_outputs; i++) {
600 598
         AVFilterLink *outlink = filter->outputs[i];
601
-        int best_idx, best_score = INT_MIN;
599
+        int best_idx, best_score = INT_MIN, best_count_diff = INT_MAX;
602 600
 
603 601
         if (outlink->type != AVMEDIA_TYPE_AUDIO ||
604 602
             outlink->in_channel_layouts->nb_channel_layouts < 2)
605 603
             continue;
606 604
 
607 605
         for (j = 0; j < outlink->in_channel_layouts->nb_channel_layouts; j++) {
606
+            uint64_t  in_chlayout = link->out_channel_layouts->channel_layouts[0];
608 607
             uint64_t out_chlayout = outlink->in_channel_layouts->channel_layouts[j];
609
-            int matched_channels  = av_get_channel_layout_nb_channels(chlayout &
610
-                                                                      out_chlayout);
611
-            int extra_channels     = av_get_channel_layout_nb_channels(out_chlayout &
612
-                                                                       (~chlayout));
613
-            int score = matched_channels - extra_channels;
608
+            int  in_channels      = av_get_channel_layout_nb_channels(in_chlayout);
609
+            int out_channels      = av_get_channel_layout_nb_channels(out_chlayout);
610
+            int count_diff        = out_channels - in_channels;
611
+            int matched_channels, extra_channels;
612
+            int score = 0;
613
+
614
+            /* channel substitution */
615
+            for (k = 0; k < FF_ARRAY_ELEMS(ch_subst); k++) {
616
+                uint64_t cmp0 = ch_subst[k][0];
617
+                uint64_t cmp1 = ch_subst[k][1];
618
+                if (( in_chlayout & cmp0) && (!(out_chlayout & cmp0)) &&
619
+                    (out_chlayout & cmp1) && (!( in_chlayout & cmp1))) {
620
+                    in_chlayout  &= ~cmp0;
621
+                    out_chlayout &= ~cmp1;
622
+                    /* add score for channel match, minus a deduction for
623
+                       having to do the substitution */
624
+                    score += 10 * av_get_channel_layout_nb_channels(cmp1) - 2;
625
+                }
626
+            }
614 627
 
615
-            if (score > best_score) {
628
+            /* no penalty for LFE channel mismatch */
629
+            if ( (in_chlayout & AV_CH_LOW_FREQUENCY) &&
630
+                (out_chlayout & AV_CH_LOW_FREQUENCY))
631
+                score += 10;
632
+            in_chlayout  &= ~AV_CH_LOW_FREQUENCY;
633
+            out_chlayout &= ~AV_CH_LOW_FREQUENCY;
634
+
635
+            matched_channels = av_get_channel_layout_nb_channels(in_chlayout &
636
+                                                                 out_chlayout);
637
+            extra_channels   = av_get_channel_layout_nb_channels(out_chlayout &
638
+                                                                 (~in_chlayout));
639
+            score += 10 * matched_channels - 5 * extra_channels;
640
+
641
+            if (score > best_score ||
642
+                (count_diff < best_count_diff && score == best_score)) {
616 643
                 best_score = score;
617 644
                 best_idx   = j;
645
+                best_count_diff = count_diff;
618 646
             }
619 647
         }
620 648
         FFSWAP(uint64_t, outlink->in_channel_layouts->channel_layouts[0],
... ...
@@ -515,6 +515,12 @@ static int gen_pong(URLContext *s, RTMPContext *rt, RTMPPacket *ppkt)
515 515
     uint8_t *p;
516 516
     int ret;
517 517
 
518
+    if (ppkt->data_size < 6) {
519
+        av_log(s, AV_LOG_ERROR, "Too short ping packet (%d)\n",
520
+               ppkt->data_size);
521
+        return AVERROR_INVALIDDATA;
522
+    }
523
+
518 524
     if ((ret = ff_rtmp_packet_create(&pkt, RTMP_NETWORK_CHANNEL, RTMP_PT_PING,
519 525
                                      ppkt->timestamp + 1, 6)) < 0)
520 526
         return ret;
... ...
@@ -885,9 +891,9 @@ static int handle_chunk_size(URLContext *s, RTMPPacket *pkt)
885 885
     RTMPContext *rt = s->priv_data;
886 886
     int ret;
887 887
 
888
-    if (pkt->data_size != 4) {
888
+    if (pkt->data_size < 4) {
889 889
         av_log(s, AV_LOG_ERROR,
890
-               "Chunk size change packet is not 4 bytes long (%d)\n",
890
+               "Too short chunk size change packet (%d)\n",
891 891
                pkt->data_size);
892 892
         return AVERROR_INVALIDDATA;
893 893
     }
... ...
@@ -913,6 +919,12 @@ static int handle_ping(URLContext *s, RTMPPacket *pkt)
913 913
     RTMPContext *rt = s->priv_data;
914 914
     int t, ret;
915 915
 
916
+    if (pkt->data_size < 2) {
917
+        av_log(s, AV_LOG_ERROR, "Too short ping packet (%d)\n",
918
+               pkt->data_size);
919
+        return AVERROR_INVALIDDATA;
920
+    }
921
+
916 922
     t = AV_RB16(pkt->data);
917 923
     if (t == 6) {
918 924
         if ((ret = gen_pong(s, rt, pkt)) < 0)
... ...
@@ -950,6 +962,13 @@ static int handle_server_bw(URLContext *s, RTMPPacket *pkt)
950 950
 {
951 951
     RTMPContext *rt = s->priv_data;
952 952
 
953
+    if (pkt->data_size < 4) {
954
+        av_log(s, AV_LOG_ERROR,
955
+               "Too short server bandwidth report packet (%d)\n",
956
+               pkt->data_size);
957
+        return AVERROR_INVALIDDATA;
958
+    }
959
+
953 960
     rt->server_bw = AV_RB32(pkt->data);
954 961
     if (rt->server_bw <= 0) {
955 962
         av_log(s, AV_LOG_ERROR, "Incorrect server bandwidth %d\n",
... ...
@@ -246,9 +246,10 @@ static int handle_buffered_output(AVAudioResampleContext *avr,
246 246
     return 0;
247 247
 }
248 248
 
249
-int avresample_convert(AVAudioResampleContext *avr, void **output,
250
-                       int out_plane_size, int out_samples, void **input,
251
-                       int in_plane_size, int in_samples)
249
+int attribute_align_arg avresample_convert(AVAudioResampleContext *avr,
250
+                                           void **output, int out_plane_size,
251
+                                           int out_samples, void **input,
252
+                                           int in_plane_size, int in_samples)
252 253
 {
253 254
     AudioData input_buffer;
254 255
     AudioData output_buffer;
... ...
@@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len
145 145
     mova  [dstq+lenq+mmsize], m2
146 146
     add     lenq, mmsize*2
147 147
     jl .loop
148
-%if mmsize == 32
149
-    vzeroupper
150
-    RET
151
-%else
152 148
     REP_RET
153
-%endif
154 149
 %endmacro
155 150
 
156 151
 INIT_XMM sse2
... ...
@@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len
218 218
     mova  [dstq+lenq+3*mmsize], m3
219 219
     add     lenq, mmsize*4
220 220
     jl .loop
221
-%if mmsize == 32
222
-    vzeroupper
223
-    RET
224
-%else
225 221
     REP_RET
226
-%endif
227 222
 %endmacro
228 223
 
229 224
 INIT_XMM sse2
... ...
@@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
51 51
     add        srcq, mmsize*2
52 52
     sub        lend, mmsize*2/4
53 53
     jg .loop
54
-%if mmsize == 32
55
-    vzeroupper
56
-    RET
57
-%else
58 54
     REP_RET
59
-%endif
60 55
 %endmacro
61 56
 
62 57
 INIT_XMM sse
... ...
@@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
175 175
     add       src0q, mmsize
176 176
     sub        lend, mmsize/4
177 177
     jg .loop
178
-%if mmsize == 32
179
-    vzeroupper
180
-    RET
181
-%else
182 178
     REP_RET
183
-%endif
184 179
 %endmacro
185 180
 
186 181
 INIT_XMM sse
... ...
@@ -236,3 +226,296 @@ MIX_1_TO_2_S16P_FLT
236 236
 INIT_XMM avx
237 237
 MIX_1_TO_2_S16P_FLT
238 238
 %endif
239
+
240
+;-----------------------------------------------------------------------------
241
+; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
242
+;                                      int len, int out_ch, int in_ch);
243
+;-----------------------------------------------------------------------------
244
+
245
+%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
246
+; define some names to make the code clearer
247
+%assign  in_channels %1
248
+%assign out_channels %2
249
+%assign stereo out_channels - 1
250
+%ifidn %3, s16p
251
+    %assign is_s16 1
252
+%else
253
+    %assign is_s16 0
254
+%endif
255
+
256
+; determine how many matrix elements must go on the stack vs. mmregs
257
+%assign matrix_elements in_channels * out_channels
258
+%if is_s16
259
+    %if stereo
260
+        %assign needed_mmregs 7
261
+    %else
262
+        %assign needed_mmregs 5
263
+    %endif
264
+%else
265
+    %if stereo
266
+        %assign needed_mmregs 4
267
+    %else
268
+        %assign needed_mmregs 3
269
+    %endif
270
+%endif
271
+%assign matrix_elements_mm num_mmregs - needed_mmregs
272
+%if matrix_elements < matrix_elements_mm
273
+    %assign matrix_elements_mm matrix_elements
274
+%endif
275
+%if matrix_elements_mm < matrix_elements
276
+    %assign matrix_elements_stack matrix_elements - matrix_elements_mm
277
+%else
278
+    %assign matrix_elements_stack 0
279
+%endif
280
+
281
+cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, src0, src1, len, src2, src3, src4, src5, src6, src7
282
+
283
+; get aligned stack space if needed
284
+%if matrix_elements_stack > 0
285
+    %if mmsize == 32
286
+    %assign bkpreg %1 + 1
287
+    %define bkpq r %+ bkpreg %+ q
288
+    mov           bkpq, rsp
289
+    and           rsp, ~(mmsize-1)
290
+    sub           rsp, matrix_elements_stack * mmsize
291
+    %else
292
+    %assign pad matrix_elements_stack * mmsize + (mmsize - gprsize) - (stack_offset & (mmsize - gprsize))
293
+    SUB           rsp, pad
294
+    %endif
295
+%endif
296
+
297
+; load matrix pointers
298
+%define matrix0q r1q
299
+%define matrix1q r3q
300
+%if stereo
301
+    mov      matrix1q, [matrix0q+gprsize]
302
+%endif
303
+    mov      matrix0q, [matrix0q]
304
+
305
+; define matrix coeff names
306
+%assign %%i 0
307
+%assign %%j needed_mmregs
308
+%rep in_channels
309
+    %if %%i >= matrix_elements_mm
310
+        CAT_XDEFINE mx_stack_0_, %%i, 1
311
+        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
312
+    %else
313
+        CAT_XDEFINE mx_stack_0_, %%i, 0
314
+        CAT_XDEFINE mx_0_, %%i, m %+ %%j
315
+        %assign %%j %%j+1
316
+    %endif
317
+    %assign %%i %%i+1
318
+%endrep
319
+%if stereo
320
+%assign %%i 0
321
+%rep in_channels
322
+    %if in_channels + %%i >= matrix_elements_mm
323
+        CAT_XDEFINE mx_stack_1_, %%i, 1
324
+        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
325
+    %else
326
+        CAT_XDEFINE mx_stack_1_, %%i, 0
327
+        CAT_XDEFINE mx_1_, %%i, m %+ %%j
328
+        %assign %%j %%j+1
329
+    %endif
330
+    %assign %%i %%i+1
331
+%endrep
332
+%endif
333
+
334
+; load/splat matrix coeffs
335
+%assign %%i 0
336
+%rep in_channels
337
+    %if mx_stack_0_ %+ %%i
338
+        VBROADCASTSS m0, [matrix0q+4*%%i]
339
+        mova  mx_0_ %+ %%i, m0
340
+    %else
341
+        VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
342
+    %endif
343
+    %if stereo
344
+    %if mx_stack_1_ %+ %%i
345
+        VBROADCASTSS m0, [matrix1q+4*%%i]
346
+        mova  mx_1_ %+ %%i, m0
347
+    %else
348
+        VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
349
+    %endif
350
+    %endif
351
+    %assign %%i %%i+1
352
+%endrep
353
+
354
+; load channel pointers to registers as offsets from the first channel pointer
355
+%if ARCH_X86_64
356
+    movsxd       lenq, r2d
357
+%endif
358
+    shl          lenq, 2-is_s16
359
+%assign %%i 1
360
+%rep (in_channels - 1)
361
+    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
362
+    mov         src5q, [src0q+%%i*gprsize]
363
+    add         src5q, lenq
364
+    mov         src %+ %%i %+ m, src5q
365
+    %else
366
+    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
367
+    add         src %+ %%i %+ q, lenq
368
+    %endif
369
+    %assign %%i %%i+1
370
+%endrep
371
+    mov         src0q, [src0q]
372
+    add         src0q, lenq
373
+    neg          lenq
374
+.loop
375
+; for x86-32 with 7-8 channels we do not have enough gp registers for all src
376
+; pointers, so we have to load some of them from the stack each time
377
+%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
378
+%if is_s16
379
+    ; mix with s16p input
380
+    mova           m0, [src0q+lenq]
381
+    S16_TO_S32_SX   0, 1
382
+    cvtdq2ps       m0, m0
383
+    cvtdq2ps       m1, m1
384
+    %if stereo
385
+    mulps          m2, m0, mx_1_0
386
+    mulps          m3, m1, mx_1_0
387
+    %endif
388
+    mulps          m0, m0, mx_0_0
389
+    mulps          m1, m1, mx_0_0
390
+%assign %%i 1
391
+%rep (in_channels - 1)
392
+    %if copy_src_from_stack
393
+        %define src_ptr src5q
394
+    %else
395
+        %define src_ptr src %+ %%i %+ q
396
+    %endif
397
+    %if stereo
398
+    %if copy_src_from_stack
399
+    mov       src_ptr, src %+ %%i %+ m
400
+    %endif
401
+    mova           m4, [src_ptr+lenq]
402
+    S16_TO_S32_SX   4, 5
403
+    cvtdq2ps       m4, m4
404
+    cvtdq2ps       m5, m5
405
+    fmaddps        m2, m4, mx_1_ %+ %%i, m2, m6
406
+    fmaddps        m3, m5, mx_1_ %+ %%i, m3, m6
407
+    fmaddps        m0, m4, mx_0_ %+ %%i, m0, m4
408
+    fmaddps        m1, m5, mx_0_ %+ %%i, m1, m5
409
+    %else
410
+    %if copy_src_from_stack
411
+    mov       src_ptr, src %+ %%i %+ m
412
+    %endif
413
+    mova           m2, [src_ptr+lenq]
414
+    S16_TO_S32_SX   2, 3
415
+    cvtdq2ps       m2, m2
416
+    cvtdq2ps       m3, m3
417
+    fmaddps        m0, m2, mx_0_ %+ %%i, m0, m4
418
+    fmaddps        m1, m3, mx_0_ %+ %%i, m1, m4
419
+    %endif
420
+    %assign %%i %%i+1
421
+%endrep
422
+    %if stereo
423
+    cvtps2dq       m2, m2
424
+    cvtps2dq       m3, m3
425
+    packssdw       m2, m3
426
+    mova [src1q+lenq], m2
427
+    %endif
428
+    cvtps2dq       m0, m0
429
+    cvtps2dq       m1, m1
430
+    packssdw       m0, m1
431
+    mova [src0q+lenq], m0
432
+%else
433
+    ; mix with fltp input
434
+    %if stereo || mx_stack_0_0
435
+    mova           m0, [src0q+lenq]
436
+    %endif
437
+    %if stereo
438
+    mulps          m1, m0, mx_1_0
439
+    %endif
440
+    %if stereo || mx_stack_0_0
441
+    mulps          m0, m0, mx_0_0
442
+    %else
443
+    mulps          m0, [src0q+lenq], mx_0_0
444
+    %endif
445
+%assign %%i 1
446
+%rep (in_channels - 1)
447
+    %if copy_src_from_stack
448
+        %define src_ptr src5q
449
+        mov   src_ptr, src %+ %%i %+ m
450
+    %else
451
+        %define src_ptr src %+ %%i %+ q
452
+    %endif
453
+    ; avoid extra load for mono if matrix is in a mm register
454
+    %if stereo || mx_stack_0_ %+ %%i
455
+    mova           m2, [src_ptr+lenq]
456
+    %endif
457
+    %if stereo
458
+    fmaddps        m1, m2, mx_1_ %+ %%i, m1, m3
459
+    %endif
460
+    %if stereo || mx_stack_0_ %+ %%i
461
+    fmaddps        m0, m2, mx_0_ %+ %%i, m0, m2
462
+    %else
463
+    fmaddps        m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
464
+    %endif
465
+    %assign %%i %%i+1
466
+%endrep
467
+    mova [src0q+lenq], m0
468
+    %if stereo
469
+    mova [src1q+lenq], m1
470
+    %endif
471
+%endif
472
+
473
+    add          lenq, mmsize
474
+    jl .loop
475
+; restore stack pointer
476
+%if matrix_elements_stack > 0
477
+    %if mmsize == 32
478
+    mov           rsp, bkpq
479
+    %else
480
+    ADD           rsp, pad
481
+    %endif
482
+%endif
483
+; zero ymm high halves
484
+%if mmsize == 32
485
+    vzeroupper
486
+%endif
487
+    RET
488
+%endmacro
489
+
490
+%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
491
+%assign %%i 3
492
+%rep 6
493
+    INIT_XMM sse
494
+    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
495
+    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
496
+    INIT_XMM sse2
497
+    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
498
+    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
499
+    INIT_XMM sse4
500
+    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
501
+    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
502
+    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to stack alignment issues
503
+    %if HAVE_AVX
504
+    %if ARCH_X86_64 || %%i < 6
505
+    INIT_YMM avx
506
+    %else
507
+    INIT_XMM avx
508
+    %endif
509
+    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
510
+    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
511
+    INIT_XMM avx
512
+    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
513
+    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
514
+    %endif
515
+    %if HAVE_FMA4
516
+    %if ARCH_X86_64 || %%i < 6
517
+    INIT_YMM fma4
518
+    %else
519
+    INIT_XMM fma4
520
+    %endif
521
+    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
522
+    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
523
+    INIT_XMM fma4
524
+    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
525
+    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
526
+    %endif
527
+    %assign %%i %%i+1
528
+%endrep
529
+%endmacro
530
+
531
+MIX_3_8_TO_1_2_FLT_FUNCS
... ...
@@ -47,6 +47,129 @@ extern void ff_mix_1_to_2_s16p_flt_sse4(int16_t **src, float **matrix, int len,
47 47
 extern void ff_mix_1_to_2_s16p_flt_avx (int16_t **src, float **matrix, int len,
48 48
                                         int out_ch, int in_ch);
49 49
 
/* Prototype of one external mixing routine:
 *   ff_mix_<chan>_to_<out>_<fmt>_flt_<opt>(type **src, float **matrix,
 *                                          int len, int out_ch, int in_ch)
 * Every argument except `type` is only ever token-pasted here. */
#define DECLARE_MIX_FUNC(chan, out, fmt, type, opt)                         \
extern void ff_mix_ ## chan ## _to_ ## out ## _ ## fmt ## _flt_ ## opt(     \
    type **src, float **matrix, int len, int out_ch, int in_ch);

/* Prototypes for the 1-output and the 2-output variant of one
 * sample-format / instruction-set combination. */
#define DECLARE_MIX_PAIR(chan, fmt, type, opt)                              \
    DECLARE_MIX_FUNC(chan, 1, fmt, type, opt)                               \
    DECLARE_MIX_FUNC(chan, 2, fmt, type, opt)

/* Despite the name this only *declares* functions: it emits the extern
 * prototypes of all mixing routines taking `chan` input channels
 * (fltp: sse, avx, fma4; s16p: sse2, sse4, avx, fma4), each for 1 and 2
 * output channels. The expansion already ends in ';', so invocations are
 * written without a trailing semicolon. */
#define DEFINE_MIX_3_8_TO_1_2(chan)                                         \
    DECLARE_MIX_PAIR(chan, fltp, float,   sse)                              \
    DECLARE_MIX_PAIR(chan, s16p, int16_t, sse2)                             \
    DECLARE_MIX_PAIR(chan, s16p, int16_t, sse4)                             \
    DECLARE_MIX_PAIR(chan, fltp, float,   avx)                              \
    DECLARE_MIX_PAIR(chan, s16p, int16_t, avx)                              \
    DECLARE_MIX_PAIR(chan, fltp, float,   fma4)                             \
    DECLARE_MIX_PAIR(chan, s16p, int16_t, fma4)
99
+
100
+DEFINE_MIX_3_8_TO_1_2(3)
101
+DEFINE_MIX_3_8_TO_1_2(4)
102
+DEFINE_MIX_3_8_TO_1_2(5)
103
+DEFINE_MIX_3_8_TO_1_2(6)
104
+DEFINE_MIX_3_8_TO_1_2(7)
105
+DEFINE_MIX_3_8_TO_1_2(8)
106
+
/**
 * Register the x86 mixing functions that take `chan` input channels and
 * produce 1 or 2 output channels.
 *
 * Expands to a sequence of if-statements, so it has to be used at statement
 * level in a scope where `am` (the AudioMix being initialised) and
 * `mm_flags` (the detected CPU flags) are visible. One group is emitted per
 * instruction set (SSE, SSE2, SSE4, AVX, FMA4); each group is additionally
 * gated by the matching build-time HAVE_* switch.
 *
 * The two integers passed to ff_audio_mix_set_func() after the channel
 * counts are the pointer alignment and the sample-count alignment the
 * registered function is declared with (see ptr_align / smp_align below).
 *
 * The fltp AVX and FMA4 routines are assembled with ymm registers unless
 * the build targets x86-32 *and* there are 6 or more input channels (the
 * asm uses "%if ARCH_X86_64 || %%i < 6" to select INIT_YMM). Only in that
 * one case may the relaxed 16-byte / 4-sample values be registered; every
 * other combination needs 32 bytes / 8 samples. The s16p routines are
 * always assembled with xmm registers and keep 16 / 8.
 */
#define SET_MIX_3_8_TO_1_2(chan)                                            \
    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {                           \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, 16, 4, "SSE",                        \
                              ff_mix_ ## chan ## _to_1_fltp_flt_sse);       \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, 16, 4, "SSE",                        \
                              ff_mix_ ## chan ## _to_2_fltp_flt_sse);       \
    }                                                                       \
    if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {                          \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, 16, 8, "SSE2",                       \
                              ff_mix_ ## chan ## _to_1_s16p_flt_sse2);      \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, 16, 8, "SSE2",                       \
                              ff_mix_ ## chan ## _to_2_s16p_flt_sse2);      \
    }                                                                       \
    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {                          \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, 16, 8, "SSE4",                       \
                              ff_mix_ ## chan ## _to_1_s16p_flt_sse4);      \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, 16, 8, "SSE4",                       \
                              ff_mix_ ## chan ## _to_2_s16p_flt_sse4);      \
    }                                                                       \
    if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {                           \
        int ptr_align = 32;                                                 \
        int smp_align = 8;                                                  \
        /* xmm fltp code exists only for x86-32 with >= 6 channels */       \
        if (ARCH_X86_32 && chan >= 6) {                                     \
            ptr_align = 16;                                                 \
            smp_align = 4;                                                  \
        }                                                                   \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, ptr_align, smp_align, "AVX",         \
                              ff_mix_ ## chan ## _to_1_fltp_flt_avx);       \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, ptr_align, smp_align, "AVX",         \
                              ff_mix_ ## chan ## _to_2_fltp_flt_avx);       \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, 16, 8, "AVX",                        \
                              ff_mix_ ## chan ## _to_1_s16p_flt_avx);       \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, 16, 8, "AVX",                        \
                              ff_mix_ ## chan ## _to_2_s16p_flt_avx);       \
    }                                                                       \
    if (mm_flags & AV_CPU_FLAG_FMA4 && HAVE_FMA4) {                         \
        int ptr_align = 32;                                                 \
        int smp_align = 8;                                                  \
        /* xmm fltp code exists only for x86-32 with >= 6 channels */       \
        if (ARCH_X86_32 && chan >= 6) {                                     \
            ptr_align = 16;                                                 \
            smp_align = 4;                                                  \
        }                                                                   \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, ptr_align, smp_align, "FMA4",        \
                              ff_mix_ ## chan ## _to_1_fltp_flt_fma4);      \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, ptr_align, smp_align, "FMA4",        \
                              ff_mix_ ## chan ## _to_2_fltp_flt_fma4);      \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 1, 16, 8, "FMA4",                       \
                              ff_mix_ ## chan ## _to_1_s16p_flt_fma4);      \
        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,\
                              chan, 2, 16, 8, "FMA4",                       \
                              ff_mix_ ## chan ## _to_2_s16p_flt_fma4);      \
    }
172
+
50 173
 av_cold void ff_audio_mix_init_x86(AudioMix *am)
51 174
 {
52 175
 #if HAVE_YASM
... ...
@@ -80,5 +203,12 @@ av_cold void ff_audio_mix_init_x86(AudioMix *am)
80 80
         ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
81 81
                               1, 2, 16, 8, "AVX", ff_mix_1_to_2_s16p_flt_avx);
82 82
     }
83
+
84
+    SET_MIX_3_8_TO_1_2(3)
85
+    SET_MIX_3_8_TO_1_2(4)
86
+    SET_MIX_3_8_TO_1_2(5)
87
+    SET_MIX_3_8_TO_1_2(6)
88
+    SET_MIX_3_8_TO_1_2(7)
89
+    SET_MIX_3_8_TO_1_2(8)
83 90
 #endif
84 91
 }
... ...
@@ -26,7 +26,8 @@
26 26
     pmovsxwd     m%1, m%1
27 27
     SWAP %1, %2
28 28
 %else
29
-    punpckhwd    m%2, m%1
29
+    mova         m%2, m%1
30
+    punpckhwd    m%2, m%2
30 31
     punpcklwd    m%1, m%1
31 32
     psrad        m%2, 16
32 33
     psrad        m%1, 16
... ...
@@ -797,11 +797,10 @@ int main(int argc, char **argv)
797 797
         av_expr_parse_and_eval(&d, *expr,
798 798
                                const_names, const_values,
799 799
                                NULL, NULL, NULL, NULL, NULL, 0, NULL);
800
-        if(isnan(d)){
800
+        if (isnan(d))
801 801
             printf("'%s' -> nan\n\n", *expr);
802
-        }else{
802
+        else
803 803
             printf("'%s' -> %f\n\n", *expr, d);
804
-        }
805 804
     }
806 805
 
807 806
     av_expr_parse_and_eval(&d, "1+(5-2)^(3-1)+1/2+sin(PI)-max(-2.2,-3.1)",
... ...
@@ -42,12 +42,7 @@ ALIGN 16
42 42
 
43 43
     sub       lenq, 2*mmsize
44 44
     jge       .loop
45
-%if mmsize == 32
46
-    vzeroupper
47
-    RET
48
-%else
49 45
     REP_RET
50
-%endif
51 46
 %endmacro
52 47
 
53 48
 INIT_XMM sse
... ...
@@ -88,12 +83,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
88 88
     mova  [dstq+lenq+mmsize], m2
89 89
     sub    lenq, 2*mmsize
90 90
     jge .loop
91
-%if mmsize == 32
92
-    vzeroupper
93
-    RET
94
-%else
95 91
     REP_RET
96
-%endif
97 92
 %endmacro
98 93
 
99 94
 INIT_XMM sse
... ...
@@ -392,11 +392,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120
392 392
 %macro RET 0
393 393
     WIN64_RESTORE_XMM_INTERNAL rsp
394 394
     POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
395
+%if mmsize == 32
396
+    vzeroupper
397
+%endif
395 398
     ret
396 399
 %endmacro
397 400
 
398 401
 %macro REP_RET 0
399
-    %if regs_used > 7 || xmm_regs_used > 6
402
+    %if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
400 403
         RET
401 404
     %else
402 405
         rep ret
... ...
@@ -433,11 +436,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 72
433 433
 
434 434
 %macro RET 0
435 435
     POP_IF_USED 14, 13, 12, 11, 10, 9
436
+%if mmsize == 32
437
+    vzeroupper
438
+%endif
436 439
     ret
437 440
 %endmacro
438 441
 
439 442
 %macro REP_RET 0
440
-    %if regs_used > 9
443
+    %if regs_used > 9 || mmsize == 32
441 444
         RET
442 445
     %else
443 446
         rep ret
... ...
@@ -479,11 +485,14 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
479 479
 
480 480
 %macro RET 0
481 481
     POP_IF_USED 6, 5, 4, 3
482
+%if mmsize == 32
483
+    vzeroupper
484
+%endif
482 485
     ret
483 486
 %endmacro
484 487
 
485 488
 %macro REP_RET 0
486
-    %if regs_used > 3
489
+    %if regs_used > 3 || mmsize == 32
487 490
         RET
488 491
     %else
489 492
         rep ret
... ...
@@ -1126,16 +1135,22 @@ AVX_INSTR pfmul, 1, 0, 1
1126 1126
 %undef j
1127 1127
 
1128 1128
 %macro FMA_INSTR 3
1129
-    %macro %1 4-7 %1, %2, %3
1130
-        %if cpuflag(xop)
1131
-            v%5 %1, %2, %3, %4
1129
+    %macro %1 5-8 %1, %2, %3
1130
+        %if cpuflag(xop) || cpuflag(fma4)
1131
+            v%6 %1, %2, %3, %4
1132 1132
         %else
1133
-            %6 %1, %2, %3
1134
-            %7 %1, %4
1133
+            %ifidn %1, %4
1134
+                %7 %5, %2, %3
1135
+                %8 %1, %4, %5
1136
+            %else
1137
+                %7 %1, %2, %3
1138
+                %8 %1, %4
1139
+            %endif
1135 1140
         %endif
1136 1141
     %endmacro
1137 1142
 %endmacro
1138 1143
 
1144
+FMA_INSTR  fmaddps,   mulps, addps
1139 1145
 FMA_INSTR  pmacsdd,  pmulld, paddd
1140 1146
 FMA_INSTR  pmacsww,  pmullw, paddw
1141 1147
 FMA_INSTR pmadcswd, pmaddwd, paddd
... ...
@@ -15,9 +15,6 @@ ffservertest: ffserver$(EXESUF) tests/vsynth1/00.pgm tests/data/asynth1.sw
15 15
 
16 16
 OBJDIRS += tests/data tests/vsynth1
17 17
 
18
-# Required due to missing automatic dependency tracking for HOSTOBJS.
19
-tests/rotozoom.o tests/videogen.o: tests/utils.c
20
-
21 18
 tests/vsynth1/00.pgm: tests/videogen$(HOSTEXESUF) | tests/vsynth1
22 19
 	$(M)./$< 'tests/vsynth1/'
23 20