Browse code

Merge remote-tracking branch 'qatar/master'

* qatar/master:
ARM: ac3dsp: optimised update_bap_counts()
mpegaudiodec: Fix av_dlog() invocation.
h264/10bit: add HAVE_ALIGNED_STACK checks.
Update 8-bit H.264 IDCT function names to reflect bit-depth.
Add IDCT functions for 10-bit H.264.
mpegaudioenc: Fix broken av_dlog statement.
Employ correct printf format specifiers, mostly in debug output.
ARM: fix MUL64 inline asm for pre-armv6

Conflicts:
libavcodec/mpegaudioenc.c
libavformat/ape.c
libavformat/mxfdec.c
libavformat/r3d.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>

Michael Niedermayer authored on 2011/06/02 12:12:10
Showing 15 changed files
... ...
@@ -1,4 +1,5 @@
1 1
 OBJS-$(CONFIG_AC3DSP)                  += arm/ac3dsp_init_arm.o         \
2
+                                          arm/ac3dsp_arm.o
2 3
 
3 4
 OBJS-$(CONFIG_DCA_DECODER)             += arm/dcadsp_init_arm.o         \
4 5
 
5 6
new file mode 100644
... ...
@@ -0,0 +1,35 @@
0
+/*
1
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
2
+ *
3
+ * This file is part of Libav.
4
+ *
5
+ * Libav is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * Libav is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with Libav; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "asm.S"
21
+
22
+function ff_ac3_update_bap_counts_arm, export=1        @ mant_cnt[bap[i]]++ for len bytes; r0 = mant_cnt, r1 = bap, r2 = len (per the C prototype, assuming AAPCS argument order)
23
+        push            {lr}                           @ lr is used as a scratch register below
24
+        ldrb            lr,  [r1], #1                  @ lr = first bap value, r1 advances by one byte
25
+1:
26
+        lsl             r3,  lr,  #1                   @ byte offset of the 16-bit counter: bap * 2
27
+        ldrh            r12, [r0, r3]                  @ r12 = current counter value
28
+        subs            r2,  r2,  #1                   @ one fewer value left; sets flags for the two gt-conditionals
29
+        ldrbgt          lr,  [r1], #1                  @ only if values remain: fetch the next bap value early
30
+        add             r12, r12, #1
31
+        strh            r12, [r0, r3]                  @ write the incremented counter back
32
+        bgt             1b                             @ loop while values remain (body runs once before len is tested, so len >= 1 is presumably required)
33
+        pop             {pc}                           @ return by popping the saved lr into pc
34
+endfunc
... ...
@@ -35,10 +35,12 @@ void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
35 35
                                      int snr_offset, int floor,
36 36
                                      const uint8_t *bap_tab, uint8_t *bap);
37 37
 
38
-int ff_ac3_compute_mantissa_size_arm(int cnt[5], uint8_t *bap, int nb_coefs);
38
+void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len);
39 39
 
40 40
 av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
41 41
 {
42
+    c->update_bap_counts         = ff_ac3_update_bap_counts_arm;
43
+
42 44
     if (HAVE_ARMV6) {
43 45
         c->bit_alloc_calc_bap    = ff_ac3_bit_alloc_calc_bap_armv6;
44 46
     }
... ...
@@ -41,6 +41,8 @@ static inline av_const int MULL(int a, int b, unsigned shift)
41 41
 }
42 42
 
43 43
 #define MULH MULH
44
+#define MUL64 MUL64
45
+
44 46
 #if HAVE_ARMV6
45 47
 static inline av_const int MULH(int a, int b)
46 48
 {
... ...
@@ -48,6 +50,13 @@ static inline av_const int MULH(int a, int b)
48 48
     __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
49 49
     return r;
50 50
 }
51
+
52
+static inline av_const int64_t MUL64(int a, int b) /* ARMv6+ variant: full 64-bit signed product of a and b */
53
+{
54
+    int64_t x;
55
+    __asm__ ("smull %Q0, %R0, %1, %2" : "=r"(x) : "r"(a), "r"(b)); /* %Q0/%R0 presumably select the low/high word of the 64-bit operand x (GCC ARM operand modifiers) */
56
+    return x;
57
+}
51 58
 #else
52 59
 static inline av_const int MULH(int a, int b)
53 60
 {
... ...
@@ -55,15 +64,14 @@ static inline av_const int MULH(int a, int b)
55 55
     __asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
56 56
     return hi;
57 57
 }
58
-#endif
59 58
 
60 59
 static inline av_const int64_t MUL64(int a, int b) /* pre-ARMv6 variant: full 64-bit signed product of a and b */
61 60
 {
62 61
     int64_t x;
63
-    __asm__ ("smull %Q0, %R0, %1, %2" : "=r"(x) : "r"(a), "r"(b));
62
+    __asm__ ("smull %Q0, %R0, %1, %2" : "=&r"(x) : "r"(a), "r"(b)); /* "&" (earlyclobber) keeps the output registers distinct from the inputs; NOTE(review): presumably needed for the pre-ARMv6 SMULL register-overlap restriction - confirm against the ARM ARM */
64 63
     return x;
65 64
 }
66
-#define MUL64 MUL64
65
+#endif
67 66
 
68 67
 static inline av_const int64_t MAC64(int64_t d, int a, int b)
69 68
 {
... ...
@@ -66,7 +66,6 @@ typedef struct H264DSPContext{
66 66
     void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
67 67
     void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
68 68
 
69
-    void (*h264_dct)(DCTELEM block[4][4]);
70 69
     void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
71 70
     void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
72 71
     void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
... ...
@@ -406,8 +406,9 @@ static av_cold int decode_init(AVCodecContext * avctx)
406 406
                 k = i & 1;
407 407
                 is_table_lsf[j][k ^ 1][i] = FIXR(f);
408 408
                 is_table_lsf[j][k][i] = FIXR(1.0);
409
-                av_dlog(avctx, "is_table_lsf %d %d: %x %x\n",
410
-                        i, j, is_table_lsf[j][0][i], is_table_lsf[j][1][i]);
409
+                av_dlog(avctx, "is_table_lsf %d %d: %f %f\n",
410
+                        i, j, (float) is_table_lsf[j][0][i],
411
+                        (float) is_table_lsf[j][1][i]);
411 412
             }
412 413
         }
413 414
 
... ...
@@ -548,13 +548,11 @@ static void compute_bit_allocation(MpegAudioContext *s,
548 548
                 }
549 549
             }
550 550
         }
551
-#if 0
552
-        printf("current=%d max=%d max_sb=%d alloc=%d\n",
553
-               current_frame_size, max_frame_size, max_sb,
554
-               bit_alloc[max_sb]);
555
-#endif
556 551
         if (max_sb < 0)
557 552
             break;
553
+        av_dlog(NULL, "current=%d max=%d max_sb=%d max_ch=%d alloc=%d\n",
554
+                current_frame_size, max_frame_size, max_sb, max_ch,
555
+                bit_alloc[max_ch][max_sb]);
558 556
 
559 557
         /* find alloc table entry (XXX: not optimal, should use
560 558
            pointer table) */
... ...
@@ -12,8 +12,9 @@ YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                 \
12 12
 MMX-OBJS-$(CONFIG_H264DSP)             += x86/h264dsp_mmx.o
13 13
 YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o            \
14 14
                                           x86/h264_deblock_10bit.o      \
15
-                                          x86/h264_weight.o             \
16 15
                                           x86/h264_idct.o               \
16
+                                          x86/h264_idct_10bit.o         \
17
+                                          x86/h264_weight.o             \
17 18
 
18 19
 YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o
19 20
 MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o
... ...
@@ -73,7 +73,7 @@ SECTION .text
73 73
 
74 74
 INIT_MMX
75 75
 ; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
76
-cglobal h264_idct_add_mmx, 3, 3, 0
76
+cglobal h264_idct_add_8_mmx, 3, 3, 0
77 77
     IDCT4_ADD    r0, r1, r2
78 78
     RET
79 79
 
... ...
@@ -125,7 +125,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0
125 125
     SUMSUB_BA    w, 0, 4
126 126
     SUMSUB_BA    w, 3, 2
127 127
     SUMSUB_BA    w, 1, 5
128
-    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
128
+    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
129 129
 %endmacro
130 130
 
131 131
 %macro IDCT8_1D_FULL 1
... ...
@@ -177,7 +177,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0
177 177
 
178 178
 INIT_MMX
179 179
 ; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
180
-cglobal h264_idct8_add_mmx, 3, 4, 0
180
+cglobal h264_idct8_add_8_mmx, 3, 4, 0
181 181
     %assign pad 128+4-(stack_offset&7)
182 182
     SUB         rsp, pad
183 183
 
... ...
@@ -237,7 +237,7 @@ cglobal h264_idct8_add_mmx, 3, 4, 0
237 237
 
238 238
 INIT_XMM
239 239
 ; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
240
-cglobal h264_idct8_add_sse2, 3, 4, 10
240
+cglobal h264_idct8_add_8_sse2, 3, 4, 10
241 241
     IDCT8_ADD_SSE r0, r1, r2, r3
242 242
     RET
243 243
 
... ...
@@ -261,7 +261,7 @@ cglobal h264_idct8_add_sse2, 3, 4, 10
261 261
     packuswb     m1, m1
262 262
 %endmacro
263 263
 
264
-%macro DC_ADD_MMX2_OP 3-4
264
+%macro DC_ADD_MMX2_OP 4
265 265
     %1           m2, [%2     ]
266 266
     %1           m3, [%2+%3  ]
267 267
     %1           m4, [%2+%3*2]
... ...
@@ -282,13 +282,13 @@ cglobal h264_idct8_add_sse2, 3, 4, 10
282 282
 
283 283
 INIT_MMX
284 284
 ; ff_h264_idct_dc_add_8_mmx2(uint8_t *dst, int16_t *block, int stride)
285
-cglobal h264_idct_dc_add_mmx2, 3, 3, 0
285
+cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
286 286
     DC_ADD_MMX2_INIT r1, r2
287 287
     DC_ADD_MMX2_OP movh, r0, r2, r1
288 288
     RET
289 289
 
290 290
 ; ff_h264_idct8_dc_add_8_mmx2(uint8_t *dst, int16_t *block, int stride)
291
-cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
291
+cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
292 292
     DC_ADD_MMX2_INIT r1, r2
293 293
     DC_ADD_MMX2_OP mova, r0, r2, r1
294 294
     lea          r0, [r0+r2*4]
... ...
@@ -297,7 +297,7 @@ cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
297 297
 
298 298
 ; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
299 299
 ;                          DCTELEM *block, int stride, const uint8_t nnzc[6*8])
300
-cglobal h264_idct_add16_mmx, 5, 7, 0
300
+cglobal h264_idct_add16_8_mmx, 5, 7, 0
301 301
     xor          r5, r5
302 302
 %ifdef PIC
303 303
     lea         r11, [scan8_mem]
... ...
@@ -319,7 +319,7 @@ cglobal h264_idct_add16_mmx, 5, 7, 0
319 319
 
320 320
 ; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
321 321
 ;                          DCTELEM *block, int stride, const uint8_t nnzc[6*8])
322
-cglobal h264_idct8_add4_mmx, 5, 7, 0
322
+cglobal h264_idct8_add4_8_mmx, 5, 7, 0
323 323
     %assign pad 128+4-(stack_offset&7)
324 324
     SUB         rsp, pad
325 325
 
... ...
@@ -351,7 +351,7 @@ cglobal h264_idct8_add4_mmx, 5, 7, 0
351 351
 
352 352
 ; ff_h264_idct_add16_8_mmx2(uint8_t *dst, const int *block_offset,
353 353
 ;                           DCTELEM *block, int stride, const uint8_t nnzc[6*8])
354
-cglobal h264_idct_add16_mmx2, 5, 7, 0
354
+cglobal h264_idct_add16_8_mmx2, 5, 7, 0
355 355
     xor          r5, r5
356 356
 %ifdef PIC
357 357
     lea         r11, [scan8_mem]
... ...
@@ -398,7 +398,7 @@ cglobal h264_idct_add16_mmx2, 5, 7, 0
398 398
 
399 399
 ; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
400 400
 ;                               DCTELEM *block, int stride, const uint8_t nnzc[6*8])
401
-cglobal h264_idct_add16intra_mmx, 5, 7, 0
401
+cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
402 402
     xor          r5, r5
403 403
 %ifdef PIC
404 404
     lea         r11, [scan8_mem]
... ...
@@ -421,7 +421,7 @@ cglobal h264_idct_add16intra_mmx, 5, 7, 0
421 421
 
422 422
 ; ff_h264_idct_add16intra_8_mmx2(uint8_t *dst, const int *block_offset,
423 423
 ;                                DCTELEM *block, int stride, const uint8_t nnzc[6*8])
424
-cglobal h264_idct_add16intra_mmx2, 5, 7, 0
424
+cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
425 425
     xor          r5, r5
426 426
 %ifdef PIC
427 427
     lea         r11, [scan8_mem]
... ...
@@ -466,7 +466,7 @@ cglobal h264_idct_add16intra_mmx2, 5, 7, 0
466 466
 
467 467
 ; ff_h264_idct8_add4_8_mmx2(uint8_t *dst, const int *block_offset,
468 468
 ;                           DCTELEM *block, int stride, const uint8_t nnzc[6*8])
469
-cglobal h264_idct8_add4_mmx2, 5, 7, 0
469
+cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
470 470
     %assign pad 128+4-(stack_offset&7)
471 471
     SUB         rsp, pad
472 472
 
... ...
@@ -529,7 +529,7 @@ cglobal h264_idct8_add4_mmx2, 5, 7, 0
529 529
 INIT_XMM
530 530
 ; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
531 531
 ;                           DCTELEM *block, int stride, const uint8_t nnzc[6*8])
532
-cglobal h264_idct8_add4_sse2, 5, 7, 10
532
+cglobal h264_idct8_add4_8_sse2, 5, 7, 10
533 533
     xor          r5, r5
534 534
 %ifdef PIC
535 535
     lea         r11, [scan8_mem]
... ...
@@ -607,7 +607,7 @@ h264_idct_add8_mmx_plane:
607 607
 
608 608
 ; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
609 609
 ;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
610
-cglobal h264_idct_add8_mmx, 5, 7, 0
610
+cglobal h264_idct_add8_8_mmx, 5, 7, 0
611 611
     mov          r5, 16
612 612
     add          r2, 512
613 613
 %ifdef PIC
... ...
@@ -668,7 +668,7 @@ h264_idct_add8_mmx2_plane
668 668
 
669 669
 ; ff_h264_idct_add8_8_mmx2(uint8_t **dest, const int *block_offset,
670 670
 ;                          DCTELEM *block, int stride, const uint8_t nnzc[6*8])
671
-cglobal h264_idct_add8_mmx2, 5, 7, 0
671
+cglobal h264_idct_add8_8_mmx2, 5, 7, 0
672 672
     mov          r5, 16
673 673
     add          r2, 512
674 674
 %ifdef ARCH_X86_64
... ...
@@ -744,7 +744,7 @@ x264_add8x4_idct_sse2:
744 744
 
745 745
 ; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
746 746
 ;                           DCTELEM *block, int stride, const uint8_t nnzc[6*8])
747
-cglobal h264_idct_add16_sse2, 5, 5, 8
747
+cglobal h264_idct_add16_8_sse2, 5, 5, 8
748 748
 %ifdef ARCH_X86_64
749 749
     mov        r10, r0
750 750
 %endif
... ...
@@ -791,7 +791,7 @@ cglobal h264_idct_add16_sse2, 5, 5, 8
791 791
 
792 792
 ; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
793 793
 ;                                DCTELEM *block, int stride, const uint8_t nnzc[6*8])
794
-cglobal h264_idct_add16intra_sse2, 5, 7, 8
794
+cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
795 795
 %ifdef ARCH_X86_64
796 796
     mov        r10, r0
797 797
 %endif
... ...
@@ -840,7 +840,7 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8
840 840
 
841 841
 ; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
842 842
 ;                          DCTELEM *block, int stride, const uint8_t nnzc[6*8])
843
-cglobal h264_idct_add8_sse2, 5, 7, 8
843
+cglobal h264_idct_add8_8_sse2, 5, 7, 8
844 844
     add          r2, 512
845 845
 %ifdef ARCH_X86_64
846 846
     mov         r10, r0
847 847
new file mode 100644
... ...
@@ -0,0 +1,570 @@
0
+;*****************************************************************************
1
+;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
2
+;*****************************************************************************
3
+;* Copyright (C) 2005-2011 x264 project
4
+;*
5
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
6
+;*
7
+;* This file is part of Libav.
8
+;*
9
+;* Libav is free software; you can redistribute it and/or
10
+;* modify it under the terms of the GNU Lesser General Public
11
+;* License as published by the Free Software Foundation; either
12
+;* version 2.1 of the License, or (at your option) any later version.
13
+;*
14
+;* Libav is distributed in the hope that it will be useful,
15
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+;* Lesser General Public License for more details.
18
+;*
19
+;* You should have received a copy of the GNU Lesser General Public
20
+;* License along with Libav; if not, write to the Free Software
21
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+;******************************************************************************
23
+
24
+%include "x86inc.asm"
25
+%include "x86util.asm"
26
+
27
+SECTION_RODATA
28
+
29
+pw_pixel_max: times 8 dw ((1 << 10)-1)
30
+pd_32:        times 4 dd 32
31
+scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
32
+           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
33
+           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
34
+           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
35
+           db 1+1*8, 2+1*8
36
+           db 1+2*8, 2+2*8
37
+           db 1+4*8, 2+4*8
38
+           db 1+5*8, 2+5*8
39
+
40
+%ifdef PIC
41
+%define scan8 r11
42
+%else
43
+%define scan8 scan8_mem
44
+%endif
45
+
46
+SECTION .text
47
+
48
+;-----------------------------------------------------------------------------
49
+; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
50
+;-----------------------------------------------------------------------------
51
+%macro STORE_DIFFx2 6 ; add two rows of dword residuals to dst; %1,%2 = residual rows, %3 = temp, %4 = lower clip bound, %5 = dst, %6 = stride
52
+    psrad       %1, 6         ; final >>6 of the transform output
53
+    psrad       %2, 6
54
+    packssdw    %1, %2        ; both rows packed to signed words in %1
55
+    movq        %3, [%5]      ; low half  = 8 bytes of pixels from row 0
56
+    movhps      %3, [%5+%6]   ; high half = 8 bytes of pixels from row 1
57
+    paddsw      %1, %3        ; residual + prediction, saturating
58
+    CLIPW       %1, %4, [pw_pixel_max] ; presumably clamps words to [%4, pw_pixel_max]; CLIPW is defined outside this file
59
+    movq      [%5], %1
60
+    movhps [%5+%6], %1
61
+%endmacro
62
+
63
+%macro STORE_DIFF16 5 ; add one full-register row of residuals to dst; %1,%2 = dword halves of the row, %3 = lower clip bound, %4 = upper clip bound, %5 = dst address
64
+    psrad       %1, 6         ; final >>6 of the transform output
65
+    psrad       %2, 6
66
+    packssdw    %1, %2        ; whole row packed to signed words in %1
67
+    paddsw      %1, [%5]      ; residual + prediction, saturating
68
+    CLIPW       %1, %3, %4    ; presumably clamps words to [%3, %4]; CLIPW is defined outside this file
69
+    mova      [%5], %1
70
+%endmacro
71
+
72
+;dst, in, stride
73
+%macro IDCT4_ADD_10 3 ; 4x4 inverse transform of dword coefficients at %2, added to the pixels at %1 with stride %3; advances %1 by two rows
74
+    mova  m0, [%2+ 0]         ; four rows of four dword coefficients
75
+    mova  m1, [%2+16]
76
+    mova  m2, [%2+32]
77
+    mova  m3, [%2+48]
78
+    IDCT4_1D d,0,1,2,3,4,5    ; first 1-D pass (macro defined outside this file)
79
+    TRANSPOSE4x4D 0,1,2,3,4
80
+    paddd m0, [pd_32]         ; rounding bias for the >>6 done in STORE_DIFFx2
81
+    IDCT4_1D d,0,1,2,3,4,5    ; second 1-D pass
82
+    pxor  m5, m5              ; zero = lower clip bound
83
+    STORE_DIFFx2 m0, m1, m4, m5, %1, %3 ; rows 0 and 1
84
+    lea   %1, [%1+%3*2]
85
+    STORE_DIFFx2 m2, m3, m4, m5, %1, %3 ; rows 2 and 3
86
+%endmacro
87
+
88
+%macro IDCT_ADD_10 1 ; emits h264_idct_add_10_<%1>; %1 = instruction-set suffix used in the symbol name
89
+cglobal h264_idct_add_10_%1, 3,3
90
+    IDCT4_ADD_10 r0, r1, r2   ; r0 = dst, r1 = block, r2 = stride (argument order of the prototype comment above)
91
+    RET
92
+%endmacro
93
+
94
+INIT_XMM
95
+IDCT_ADD_10 sse2
96
+%ifdef HAVE_AVX
97
+INIT_AVX
98
+IDCT_ADD_10 avx
99
+%endif
100
+
101
+;-----------------------------------------------------------------------------
102
+; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
103
+;-----------------------------------------------------------------------------
104
+;;;;;;; NO FATE SAMPLES TRIGGER THIS
105
+%macro ADD4x4IDCT 1 ; emits the local subroutine add4x4_idct_<%1>: in r0 = dst base, r5 = block offset, r2 = coefficients, r3 = stride; r5 is overwritten
106
+add4x4_idct_%1:
107
+    add   r5, r0              ; r5 = address of this 4x4 block in dst
108
+    mova  m0, [r2+ 0]         ; four rows of four dword coefficients
109
+    mova  m1, [r2+16]
110
+    mova  m2, [r2+32]
111
+    mova  m3, [r2+48]
112
+    IDCT4_1D d,0,1,2,3,4,5    ; first 1-D pass (macro defined outside this file)
113
+    TRANSPOSE4x4D 0,1,2,3,4
114
+    paddd m0, [pd_32]         ; rounding bias for the >>6 done in STORE_DIFFx2
115
+    IDCT4_1D d,0,1,2,3,4,5    ; second 1-D pass
116
+    pxor  m5, m5              ; zero = lower clip bound
117
+    STORE_DIFFx2 m0, m1, m4, m5, r5, r3 ; rows 0 and 1
118
+    lea   r5, [r5+r3*2]
119
+    STORE_DIFFx2 m2, m3, m4, m5, r5, r3 ; rows 2 and 3
120
+    ret                       ; plain ret: reached via call from the add16 entry points
121
+%endmacro
122
+
123
+INIT_XMM
124
+ALIGN 16
125
+ADD4x4IDCT sse2
126
+%ifdef HAVE_AVX
127
+INIT_AVX
128
+ALIGN 16
129
+ADD4x4IDCT avx
130
+%endif
131
+
132
+%macro ADD16_OP 3 ; one block of h264_idct_add16: %1 = instruction-set suffix, %2 = block index, %3 = offset of this block in nnzc (r4)
133
+    cmp          byte [r4+%3], 0
134
+    jz .skipblock%2           ; nnzc entry is zero: nothing to add for this block
135
+    mov         r5d, dword [r1+%2*4] ; r5 = block_offset[%2]
136
+    call add4x4_idct_%1
137
+.skipblock%2:
138
+%if %2<15
139
+    add          r2, 64       ; next block: 16 coefficients * 4 bytes (skipped after the last block)
140
+%endif
141
+%endmacro
142
+
143
+%macro IDCT_ADD16_10 1 ; emits h264_idct_add16_10_<%1>: unrolled over 16 blocks; the third ADD16_OP argument matches the first 16 entries of scan8_mem
144
+cglobal h264_idct_add16_10_%1, 5,6
145
+    ADD16_OP %1, 0, 4+1*8
146
+    ADD16_OP %1, 1, 5+1*8
147
+    ADD16_OP %1, 2, 4+2*8
148
+    ADD16_OP %1, 3, 5+2*8
149
+    ADD16_OP %1, 4, 6+1*8
150
+    ADD16_OP %1, 5, 7+1*8
151
+    ADD16_OP %1, 6, 6+2*8
152
+    ADD16_OP %1, 7, 7+2*8
153
+    ADD16_OP %1, 8, 4+3*8
154
+    ADD16_OP %1, 9, 5+3*8
155
+    ADD16_OP %1, 10, 4+4*8
156
+    ADD16_OP %1, 11, 5+4*8
157
+    ADD16_OP %1, 12, 6+3*8
158
+    ADD16_OP %1, 13, 7+3*8
159
+    ADD16_OP %1, 14, 6+4*8
160
+    ADD16_OP %1, 15, 7+4*8
161
+    RET
162
+%endmacro
163
+
164
+INIT_XMM
165
+IDCT_ADD16_10 sse2
166
+%ifdef HAVE_AVX
167
+INIT_AVX
168
+IDCT_ADD16_10 avx
169
+%endif
170
+
171
+;-----------------------------------------------------------------------------
172
+; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
173
+;-----------------------------------------------------------------------------
174
+%macro IDCT_DC_ADD_OP_10 3 ; add m0 to four rows of pixels and clip to [0, m6]; %1 = dst, %2 = stride, %3 = 3*stride as set up by the callers; clobbers m1-m5
175
+    pxor      m5, m5          ; zero = lower clip bound
176
+%if avx_enabled
177
+    paddw     m1, m0, [%1+0   ] ; three-operand form: load and add in one instruction
178
+    paddw     m2, m0, [%1+%2  ]
179
+    paddw     m3, m0, [%1+%2*2]
180
+    paddw     m4, m0, [%1+%3  ]
181
+%else
182
+    mova      m1, [%1+0   ]   ; rows 0..3
183
+    mova      m2, [%1+%2  ]
184
+    mova      m3, [%1+%2*2]
185
+    mova      m4, [%1+%3  ]
186
+    paddw     m1, m0
187
+    paddw     m2, m0
188
+    paddw     m3, m0
189
+    paddw     m4, m0
190
+%endif
191
+    CLIPW     m1, m5, m6      ; presumably clamps words to [m5, m6]; CLIPW is defined outside this file
192
+    CLIPW     m2, m5, m6
193
+    CLIPW     m3, m5, m6
194
+    CLIPW     m4, m5, m6
195
+    mova [%1+0   ], m1
196
+    mova [%1+%2  ], m2
197
+    mova [%1+%2*2], m3
198
+    mova [%1+%3  ], m4
199
+%endmacro
200
+
201
+INIT_MMX
202
+cglobal h264_idct_dc_add_10_mmx2,3,3 ; DC-only 4x4 add: r0 = dst, r1 = block, r2 = stride
203
+    movd      m0, dword [r1]  ; 32-bit DC coefficient
204
+    paddd     m0, [pd_32]     ; rounding bias
205
+    psrad     m0, 6           ; (dc + 32) >> 6
206
+    lea       r1, [r2*3]      ; r1 is reused as 3*stride; the block pointer is not needed again
207
+    pshufw    m0, m0, 0       ; broadcast the low word to all four words
208
+    mova      m6, [pw_pixel_max] ; upper clip bound used by IDCT_DC_ADD_OP_10
209
+    IDCT_DC_ADD_OP_10 r0, r2, r1
210
+    RET
211
+
212
+;-----------------------------------------------------------------------------
213
+; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
214
+;-----------------------------------------------------------------------------
215
+%macro IDCT8_DC_ADD 1 ; emits h264_idct8_dc_add_10_<%1>: DC-only add over eight rows, done as two groups of four
216
+cglobal h264_idct8_dc_add_10_%1,3,3,7
217
+    mov      r1d, dword [r1]  ; 32-bit DC coefficient
218
+    add       r1, 32          ; rounding bias
219
+    sar       r1, 6           ; only the low word of the result is used by the splat below
220
+    movd      m0, r1d
221
+    lea       r1, [r2*3]      ; r1 is reused as 3*stride
222
+    SPLATW    m0, m0, 0       ; presumably broadcasts word 0 to every word; SPLATW is defined outside this file
223
+    mova      m6, [pw_pixel_max] ; upper clip bound used by IDCT_DC_ADD_OP_10
224
+    IDCT_DC_ADD_OP_10 r0, r2, r1 ; rows 0..3
225
+    lea       r0, [r0+r2*4]
226
+    IDCT_DC_ADD_OP_10 r0, r2, r1 ; rows 4..7
227
+    RET
228
+%endmacro
229
+
230
+INIT_XMM
231
+IDCT8_DC_ADD sse2
232
+%ifdef HAVE_AVX
233
+INIT_AVX
234
+IDCT8_DC_ADD avx
235
+%endif
236
+
237
+;-----------------------------------------------------------------------------
238
+; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
239
+;-----------------------------------------------------------------------------
240
+%macro AC 2 ; out-of-line handler .ac<%2>: full 4x4 IDCT for blocks %2 and %2+1, then rejoin at .skipadd<%2>; %1 = instruction-set suffix
241
+.ac%2
242
+    mov  r5d, dword [r1+(%2+0)*4] ; block_offset[%2]
243
+    call add4x4_idct_%1
244
+    mov  r5d, dword [r1+(%2+1)*4] ; block_offset[%2+1]
245
+    add  r2, 64               ; coefficients of the second block
246
+    call add4x4_idct_%1
247
+    add  r2, 64               ; r2 is now past both blocks, matching the add of 128 on the other path
248
+    jmp .skipadd%2
249
+%endmacro
250
+
251
+%macro ADD16_OP_INTRA 3 ; handles blocks %2 and %2+1 together: %1 = instruction-set suffix, %2 = even block index, %3 = offset of the pair in nnzc (r4)
252
+    cmp         word [r4+%3], 0 ; tests two adjacent nnzc bytes at once
253
+    jnz .ac%2                 ; either block has coefficients flagged: take the full-IDCT path emitted by AC
254
+    mov         r6d, dword [r2+ 0] ; DC of the first block
255
+    or          r6d, dword [r2+64] ; DC of the second block
256
+    jz .skipblock%2           ; both DCs are zero: nothing to add
257
+    mov  r5d, dword [r1+(%2+0)*4] ; block_offset[%2]
258
+    call idct_dc_add_%1       ; DC-only add; it reads the DCs at [r2] and [r2+64]
259
+.skipblock%2:
260
+%if %2<15
261
+    add          r2, 128      ; advance past both blocks
262
+%endif
263
+.skipadd%2:
264
+%endmacro
265
+
266
+%macro IDCT_ADD16INTRA_10 1 ; emits the local helper idct_dc_add_<%1> and the entry point h264_idct_add16intra_10_<%1>
267
+idct_dc_add_%1:
268
+    add       r5, r0          ; r5 = dst + block offset
269
+    movq      m0, [r2+ 0]     ; low half:  8 bytes starting at the first block DC
270
+    movhps    m0, [r2+64]     ; high half: 8 bytes starting at the second block DC
271
+    paddd     m0, [pd_32]     ; rounding bias
272
+    psrad     m0, 6
273
+    pshufhw   m0, m0, 0       ; broadcast the second DC across the high four words
274
+    pshuflw   m0, m0, 0       ; broadcast the first DC across the low four words
275
+    lea       r6, [r3*3]      ; 3*stride for IDCT_DC_ADD_OP_10
276
+    mova      m6, [pw_pixel_max] ; upper clip bound used by IDCT_DC_ADD_OP_10
277
+    IDCT_DC_ADD_OP_10 r5, r3, r6
278
+    ret
279
+
280
+cglobal h264_idct_add16intra_10_%1,5,7,8
281
+    ADD16_OP_INTRA %1, 0, 4+1*8
282
+    ADD16_OP_INTRA %1, 2, 4+2*8
283
+    ADD16_OP_INTRA %1, 4, 6+1*8
284
+    ADD16_OP_INTRA %1, 6, 6+2*8
285
+    ADD16_OP_INTRA %1, 8, 4+3*8
286
+    ADD16_OP_INTRA %1, 10, 4+4*8
287
+    ADD16_OP_INTRA %1, 12, 6+3*8
288
+    ADD16_OP_INTRA %1, 14, 6+4*8
289
+    RET
290
+%assign i 14
291
+%rep 8
292
+    AC %1, i                  ; the .ac handlers are placed after RET, one per block pair (i = 14, 12, ..., 0)
293
+%assign i i-2
294
+%endrep
295
+%endmacro
296
+
297
+INIT_XMM
298
+IDCT_ADD16INTRA_10 sse2
299
+%ifdef HAVE_AVX
300
+INIT_AVX
301
+IDCT_ADD16INTRA_10 avx
302
+%endif
303
+
304
+;-----------------------------------------------------------------------------
305
+; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
306
+;-----------------------------------------------------------------------------
307
+%macro IDCT_ADD8 1 ; emits h264_idct_add8_10_<%1>: loops over block indices 16..23 in two groups of four, one dst pointer per group
308
+cglobal h264_idct_add8_10_%1,5,7
309
+    mov          r5, 16       ; first block index handled here
310
+    add          r2, 1024     ; skip 16 blocks of 64 bytes of coefficients
311
+%ifdef PIC
312
+    lea         r11, [scan8_mem] ; scan8 is defined as r11 in PIC builds
313
+%endif
314
+%ifdef ARCH_X86_64
315
+    mov         r10, r0       ; r10 walks the array of dst pointers; r0 is reused per block
316
+%endif
317
+.nextblock:
318
+    movzx        r6, byte [scan8+r5] ; position of this block in nnzc
319
+    movzx        r6, byte [r4+r6]    ; nnzc value for this block
320
+    or          r6d, dword [r2]      ; combined with the DC coefficient
321
+    test         r6, r6
322
+    jz .skipblock             ; nnzc entry and DC are both zero
323
+%ifdef ARCH_X86_64
324
+    mov         r0d, dword [r1+r5*4] ; block_offset[r5]
325
+    add          r0, [r10]           ; plus the current dst pointer
326
+%else
327
+    mov          r0, r0m      ; reload the dst pointer array from the stack argument
328
+    mov          r0, [r0]
329
+    add          r0, dword [r1+r5*4]
330
+%endif
331
+    IDCT4_ADD_10 r0, r2, r3
332
+.skipblock:
333
+    inc          r5
334
+    add          r2, 64       ; next block of coefficients
335
+    test         r5, 3
336
+    jnz .nextblock            ; still inside the current group of four
337
+%ifdef ARCH_X86_64
338
+    add         r10, gprsize  ; next dst pointer
339
+%else
340
+    add        r0mp, gprsize  ; next dst pointer, kept in the stack argument slot
341
+%endif
342
+    test         r5, 4
343
+    jnz .nextblock            ; r5 == 20: second group still to do; r5 == 24: done
344
+    REP_RET
345
+%endmacro ; IDCT_ADD8
346
+
347
+INIT_XMM
348
+IDCT_ADD8 sse2
349
+%ifdef HAVE_AVX
350
+INIT_AVX
351
+IDCT_ADD8 avx
352
+%endif
353
+
354
+;-----------------------------------------------------------------------------
355
+; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
356
+;-----------------------------------------------------------------------------
357
+%macro IDCT8_1D 2 ; one 1-D pass of the 8-point transform on dwords; %1 and %2 are memory operands loaded late into m2 and m5, the other six inputs arrive in m1,m2,m3,m5,m6,m7
358
+    SWAP         0, 1         ; SWAP renames registers at assembly time (macro defined outside this file)
359
+    psrad        m4, m5, 1
360
+    psrad        m1, m0, 1
361
+    paddd        m4, m5
362
+    paddd        m1, m0
363
+    paddd        m4, m7
364
+    paddd        m1, m5
365
+    psubd        m4, m0
366
+    paddd        m1, m3
367
+
368
+    psubd        m0, m3
369
+    psubd        m5, m3
370
+    paddd        m0, m7
371
+    psubd        m5, m7
372
+    psrad        m3, 1
373
+    psrad        m7, 1
374
+    psubd        m0, m3
375
+    psubd        m5, m7
376
+
377
+    SWAP         1, 7
378
+    psrad        m1, m7, 2
379
+    psrad        m3, m4, 2
380
+    paddd        m3, m0
381
+    psrad        m0, 2
382
+    paddd        m1, m5
383
+    psrad        m5, 2
384
+    psubd        m0, m4
385
+    psubd        m7, m5
386
+
387
+    SWAP         5, 6
388
+    psrad        m4, m2, 1
389
+    psrad        m6, m5, 1
390
+    psubd        m4, m5
391
+    paddd        m6, m2
392
+
393
+    mova         m2, %1       ; the two memory inputs are only loaded here, once registers are free
394
+    mova         m5, %2
395
+    SUMSUB_BA    d, 5, 2      ; butterfly stage; SUMSUB_BA is defined outside this file
396
+    SUMSUB_BA    d, 6, 5
397
+    SUMSUB_BA    d, 4, 2
398
+    SUMSUB_BA    d, 7, 6
399
+    SUMSUB_BA    d, 0, 4
400
+    SUMSUB_BA    d, 3, 2
401
+    SUMSUB_BA    d, 1, 5
402
+    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
403
+%endmacro
404
+
405
+%macro IDCT8_1D_FULL 1 ; load the inputs from memory at %1 in steps of 32 bytes and run one IDCT8_1D pass
406
+    mova         m7, [%1+112*2]
407
+    mova         m6, [%1+ 96*2]
408
+    mova         m5, [%1+ 80*2]
409
+    mova         m3, [%1+ 48*2]
410
+    mova         m2, [%1+ 32*2]
411
+    mova         m1, [%1+ 16*2]
412
+    IDCT8_1D   [%1], [%1+ 64*2] ; the inputs at offsets 0 and 128 are passed as memory operands
413
+%endmacro
414
+
415
+; %1=dctcoef *block, %2=dctcoef *dstblock (32-bit coefficients)
416
+%macro IDCT8_ADD_SSE_START 2 ; first 1-D pass over the data at %1, followed by two 4x4 dword transposes; results go to scratch at %2
417
+    IDCT8_1D_FULL %1
418
+%ifdef ARCH_X86_64
419
+    TRANSPOSE4x4D  0,1,2,3,8  ; m8 is the temporary
420
+    mova    [%2    ], m0      ; only m0 and m4 are written to scratch here; the rest stay in registers for the caller
421
+    TRANSPOSE4x4D  4,5,6,7,8
422
+    mova    [%2+8*2], m4
423
+%else
424
+    mova         [%1], m7     ; no spare register: park m7 in the input buffer to use it as the temporary
425
+    TRANSPOSE4x4D   0,1,2,3,7
426
+    mova           m7, [%1]
427
+    mova    [%2     ], m0
428
+    mova    [%2+16*2], m1
429
+    mova    [%2+32*2], m2
430
+    mova    [%2+48*2], m3
431
+    TRANSPOSE4x4D   4,5,6,7,3 ; m3 is free again after the store above
432
+    mova    [%2+ 8*2], m4
433
+    mova    [%2+24*2], m5
434
+    mova    [%2+40*2], m6
435
+    mova    [%2+56*2], m7
436
+%endif
437
+%endmacro
438
+
439
+; %1=pixel *dst, %2=dctcoef *block, %3=int stride
440
+%macro IDCT8_ADD_SSE_END 3 ; second 1-D pass over the scratch data at %2, then add eight rows to the pixels at %1 (stride %3); advances %1
441
+    IDCT8_1D_FULL %2
442
+    mova  [%2     ], m6       ; spill m6/m7 so they can serve as temporary and zero below
443
+    mova  [%2+16*2], m7
444
+
445
+    pxor         m7, m7       ; zero = lower clip bound
446
+    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
447
+    lea          %1, [%1+%3*2]
448
+    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
449
+    mova         m0, [%2     ] ; reload the two spilled rows
450
+    mova         m1, [%2+16*2]
451
+    lea          %1, [%1+%3*2]
452
+    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
453
+    lea          %1, [%1+%3*2]
454
+    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
455
+%endmacro
456
+
457
+%macro IDCT8_ADD 1 ; emits h264_idct8_add_10_<%1> and the shared worker h264_idct8_add1_10_<%1> (r0 = dst, r1 = block, r2 = stride)
458
+cglobal h264_idct8_add_10_%1, 3,4,16
459
+%ifndef UNIX64
460
+    %assign pad 16-gprsize-(stack_offset&15)
461
+    sub  rsp, pad
462
+    call h264_idct8_add1_10_%1
463
+    add  rsp, pad
464
+    RET
465
+%endif                        ; on UNIX64 the wrapper above is not assembled and the entry point falls through into the worker
466
+
467
+ALIGN 16
468
+; TODO: does not need to use stack
469
+h264_idct8_add1_10_%1:
470
+%assign pad 256+16-gprsize
471
+    sub          rsp, pad     ; scratch area for the intermediate results
472
+    add   dword [r1], 32      ; rounding bias added to the DC coefficient in place (the caller's block is modified)
473
+
474
+%ifdef ARCH_X86_64
475
+    IDCT8_ADD_SSE_START r1, rsp
476
+    SWAP 1,  9                ; keep the first-half results that were not stored under the names m9-m15
477
+    SWAP 2, 10
478
+    SWAP 3, 11
479
+    SWAP 5, 13
480
+    SWAP 6, 14
481
+    SWAP 7, 15
482
+    IDCT8_ADD_SSE_START r1+16, rsp+128
483
+    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
484
+    IDCT8_1D [rsp], [rsp+128]
485
+    SWAP 0,  8                ; move the finished half to m8-m15 before processing the other half
486
+    SWAP 1,  9
487
+    SWAP 2, 10
488
+    SWAP 3, 11
489
+    SWAP 4, 12
490
+    SWAP 5, 13
491
+    SWAP 6, 14
492
+    SWAP 7, 15
493
+    IDCT8_1D [rsp+16], [rsp+144]
494
+    psrad         m8, 6       ; row 0 is stored by hand because m8 is needed for the pixel maximum afterwards
495
+    psrad         m0, 6
496
+    packssdw      m8, m0
497
+    paddsw        m8, [r0]
498
+    pxor          m0, m0      ; zero = lower clip bound for all rows
499
+    CLIPW         m8, m0, [pw_pixel_max]
500
+    mova        [r0], m8
501
+    mova          m8, [pw_pixel_max] ; upper clip bound for the remaining rows
502
+    STORE_DIFF16  m9, m1, m0, m8, r0+r2
503
+    lea           r0, [r0+r2*2]
504
+    STORE_DIFF16 m10, m2, m0, m8, r0
505
+    STORE_DIFF16 m11, m3, m0, m8, r0+r2
506
+    lea           r0, [r0+r2*2]
507
+    STORE_DIFF16 m12, m4, m0, m8, r0
508
+    STORE_DIFF16 m13, m5, m0, m8, r0+r2
509
+    lea           r0, [r0+r2*2]
510
+    STORE_DIFF16 m14, m6, m0, m8, r0
511
+    STORE_DIFF16 m15, m7, m0, m8, r0+r2
512
+%else
513
+    IDCT8_ADD_SSE_START r1,    rsp
514
+    IDCT8_ADD_SSE_START r1+16, rsp+128
515
+    lea           r3, [r0+8]  ; dst for the right-hand half: 8 bytes further on
516
+    IDCT8_ADD_SSE_END r0, rsp,    r2
517
+    IDCT8_ADD_SSE_END r3, rsp+16, r2
518
+%endif ; ARCH_X86_64
519
+
520
+    add          rsp, pad
521
+    ret
522
+%endmacro
523
+
524
+INIT_XMM
525
+IDCT8_ADD sse2
526
+%ifdef HAVE_AVX
527
+INIT_AVX
528
+IDCT8_ADD avx
529
+%endif
530
+
531
+;-----------------------------------------------------------------------------
532
+; h264_idct8_add4(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
533
+;-----------------------------------------------------------------------------
534
+;;;;;;; NO FATE SAMPLES TRIGGER THIS
535
+%macro IDCT8_ADD4_OP 3 ; one 8x8 block of h264_idct8_add4: %1 = instruction-set suffix, %2 = block index, %3 = offset of this block in nnzc (r4)
536
+    cmp       byte [r4+%3], 0
537
+    jz .skipblock%2           ; nnzc entry is zero: nothing to add for this block
538
+    mov      r0d, dword [r6+%2*4] ; block_offset[%2]; r6 holds block_offset here
539
+    add       r0, r5          ; plus the dst base kept in r5
540
+    call h264_idct8_add1_10_%1
541
+.skipblock%2:
542
+%if %2<12
543
+    add       r1, 256         ; next 8x8 block: 64 coefficients * 4 bytes (skipped after the last block)
544
+%endif
545
+%endmacro
546
+
547
+%macro IDCT8_ADD4 1 ; emits h264_idct8_add4_10_<%1>: loads its arguments by hand so r0/r1/r2 match what h264_idct8_add1_10_<%1> expects
548
+cglobal h264_idct8_add4_10_%1, 0,7,16
549
+    %assign pad 16-gprsize-(stack_offset&15)
550
+    SUB      rsp, pad
551
+    mov       r5, r0mp        ; r5 = dst base
552
+    mov       r6, r1mp        ; r6 = block_offset
553
+    mov       r1, r2mp        ; r1 = block
554
+    mov      r2d, r3m         ; r2 = stride
555
+    movifnidn r4, r4mp        ; r4 = nnzc
556
+    IDCT8_ADD4_OP %1,  0, 4+1*8
557
+    IDCT8_ADD4_OP %1,  4, 6+1*8
558
+    IDCT8_ADD4_OP %1,  8, 4+3*8
559
+    IDCT8_ADD4_OP %1, 12, 6+3*8
560
+    ADD       rsp, pad
561
+    RET
562
+%endmacro ; IDCT8_ADD4
563
+
564
+INIT_XMM
565
+IDCT8_ADD4 sse2
566
+%ifdef HAVE_AVX
567
+INIT_AVX
568
+IDCT8_ADD4 avx
569
+%endif
... ...
@@ -27,38 +27,61 @@ DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;
27 27
 
28 28
 /***********************************/
29 29
 /* IDCT */
30
+#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
31
+void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);
32
+
33
+IDCT_ADD_FUNC(, 8, mmx)
34
+IDCT_ADD_FUNC(, 10, sse2)
35
+IDCT_ADD_FUNC(_dc, 8, mmx2)
36
+IDCT_ADD_FUNC(_dc, 10, mmx2)
37
+IDCT_ADD_FUNC(8_dc, 8, mmx2)
38
+IDCT_ADD_FUNC(8_dc, 10, sse2)
39
+IDCT_ADD_FUNC(8, 8, mmx)
40
+IDCT_ADD_FUNC(8, 8, sse2)
41
+IDCT_ADD_FUNC(8, 10, sse2)
42
+#if HAVE_AVX
43
+IDCT_ADD_FUNC(, 10, avx)
44
+IDCT_ADD_FUNC(8_dc, 10, avx)
45
+IDCT_ADD_FUNC(8, 10, avx)
46
+#endif
47
+
48
+
49
+#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
50
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
51
+                              (uint8_t *dst, const int *block_offset, \
52
+                              DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
53
+
54
+IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
55
+IDCT_ADD_REP_FUNC(8, 4, 8, mmx2)
56
+IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
57
+IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
58
+IDCT_ADD_REP_FUNC(8, 4, 10, avx)
59
+IDCT_ADD_REP_FUNC(, 16, 8, mmx)
60
+IDCT_ADD_REP_FUNC(, 16, 8, mmx2)
61
+IDCT_ADD_REP_FUNC(, 16, 8, sse2)
62
+IDCT_ADD_REP_FUNC(, 16, 10, sse2)
63
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
64
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2)
65
+IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
66
+IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
67
+#if HAVE_AVX
68
+IDCT_ADD_REP_FUNC(, 16, 10, avx)
69
+IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
70
+#endif
71
+
72
+
73
+#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
74
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
75
+                              (uint8_t **dst, const int *block_offset, \
76
+                              DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
77
+IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
78
+IDCT_ADD_REP_FUNC2(, 8, 8, mmx2)
79
+IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
80
+IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
81
+#if HAVE_AVX
82
+IDCT_ADD_REP_FUNC2(, 8, 10, avx)
83
+#endif
30 84
 
31
-void ff_h264_idct_add_mmx     (uint8_t *dst, int16_t *block, int stride);
32
-void ff_h264_idct8_add_mmx    (uint8_t *dst, int16_t *block, int stride);
33
-void ff_h264_idct8_add_sse2   (uint8_t *dst, int16_t *block, int stride);
34
-void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride);
35
-void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride);
36
-
37
-void ff_h264_idct_add16_mmx      (uint8_t *dst, const int *block_offset,
38
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
39
-void ff_h264_idct8_add4_mmx      (uint8_t *dst, const int *block_offset,
40
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
41
-void ff_h264_idct_add16_mmx2     (uint8_t *dst, const int *block_offset,
42
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
43
-void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset,
44
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
45
-void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
46
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
47
-void ff_h264_idct8_add4_mmx2     (uint8_t *dst, const int *block_offset,
48
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
49
-void ff_h264_idct8_add4_sse2     (uint8_t *dst, const int *block_offset,
50
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
51
-void ff_h264_idct_add8_mmx       (uint8_t **dest, const int *block_offset,
52
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
53
-void ff_h264_idct_add8_mmx2      (uint8_t **dest, const int *block_offset,
54
-                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
55
-
56
-void ff_h264_idct_add16_sse2     (uint8_t *dst, const int *block_offset, DCTELEM *block,
57
-                                  int stride, const uint8_t nnzc[6*8]);
58
-void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block,
59
-                                  int stride, const uint8_t nnzc[6*8]);
60
-void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
61
-                                  int stride, const uint8_t nnzc[6*8]);
62 85
 void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
63 86
 void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
64 87
 
... ...
@@ -313,24 +336,24 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
313 313
     }
314 314
 #if HAVE_YASM
315 315
     if (mm_flags & AV_CPU_FLAG_MMX) {
316
-        c->h264_idct_dc_add=
317
-        c->h264_idct_add= ff_h264_idct_add_mmx;
318
-        c->h264_idct8_dc_add=
319
-        c->h264_idct8_add= ff_h264_idct8_add_mmx;
320
-
321
-        c->h264_idct_add16     = ff_h264_idct_add16_mmx;
322
-        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
323
-        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
324
-        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
316
+        c->h264_idct_dc_add         =
317
+        c->h264_idct_add            = ff_h264_idct_add_8_mmx;
318
+        c->h264_idct8_dc_add        =
319
+        c->h264_idct8_add           = ff_h264_idct8_add_8_mmx;
320
+
321
+        c->h264_idct_add16          = ff_h264_idct_add16_8_mmx;
322
+        c->h264_idct8_add4          = ff_h264_idct8_add4_8_mmx;
323
+        c->h264_idct_add8           = ff_h264_idct_add8_8_mmx;
324
+        c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_mmx;
325 325
         c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
326 326
 
327 327
         if (mm_flags & AV_CPU_FLAG_MMX2) {
328
-            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
329
-            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
330
-            c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
331
-            c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
332
-            c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
333
-            c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
328
+            c->h264_idct_dc_add    = ff_h264_idct_dc_add_8_mmx2;
329
+            c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_8_mmx2;
330
+            c->h264_idct_add16     = ff_h264_idct_add16_8_mmx2;
331
+            c->h264_idct8_add4     = ff_h264_idct8_add4_8_mmx2;
332
+            c->h264_idct_add8      = ff_h264_idct_add8_8_mmx2;
333
+            c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2;
334 334
 
335 335
             c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
336 336
             c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
... ...
@@ -361,8 +384,12 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
361 361
             c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
362 362
 
363 363
             if (mm_flags&AV_CPU_FLAG_SSE2) {
364
-                c->h264_idct8_add = ff_h264_idct8_add_sse2;
365
-                c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
364
+                c->h264_idct8_add           = ff_h264_idct8_add_8_sse2;
365
+
366
+                c->h264_idct_add16          = ff_h264_idct_add16_8_sse2;
367
+                c->h264_idct8_add4          = ff_h264_idct8_add4_8_sse2;
368
+                c->h264_idct_add8           = ff_h264_idct_add8_8_sse2;
369
+                c->h264_idct_add16intra     = ff_h264_idct_add16intra_8_sse2;
366 370
                 c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
367 371
 
368 372
                 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
... ...
@@ -383,10 +410,6 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
383 383
                 c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
384 384
                 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
385 385
 #endif
386
-
387
-                c->h264_idct_add16 = ff_h264_idct_add16_sse2;
388
-                c->h264_idct_add8  = ff_h264_idct_add8_sse2;
389
-                c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
390 386
             }
391 387
             if (mm_flags&AV_CPU_FLAG_SSSE3) {
392 388
                 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
... ...
@@ -418,7 +441,19 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
418 418
             c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
419 419
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
420 420
 #endif
421
+            c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
421 422
             if (mm_flags&AV_CPU_FLAG_SSE2) {
423
+                c->h264_idct_add       = ff_h264_idct_add_10_sse2;
424
+                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_sse2;
425
+
426
+                c->h264_idct_add16     = ff_h264_idct_add16_10_sse2;
427
+                c->h264_idct_add8      = ff_h264_idct_add8_10_sse2;
428
+                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;
429
+#if HAVE_ALIGNED_STACK
430
+                c->h264_idct8_add      = ff_h264_idct8_add_10_sse2;
431
+                c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
432
+#endif
433
+
422 434
                 c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
423 435
                 c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
424 436
 #if HAVE_ALIGNED_STACK
... ...
@@ -428,7 +463,20 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
428 428
                 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
429 429
 #endif
430 430
             }
431
+#if HAVE_AVX
431 432
             if (mm_flags&AV_CPU_FLAG_AVX) {
433
+                c->h264_idct_dc_add    =
434
+                c->h264_idct_add       = ff_h264_idct_add_10_avx;
435
+                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_avx;
436
+
437
+                c->h264_idct_add16     = ff_h264_idct_add16_10_avx;
438
+                c->h264_idct_add8      = ff_h264_idct_add8_10_avx;
439
+                c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;
440
+#if HAVE_ALIGNED_STACK
441
+                c->h264_idct8_add      = ff_h264_idct8_add_10_avx;
442
+                c->h264_idct8_add4     = ff_h264_idct8_add4_10_avx;
443
+#endif
444
+
432 445
                 c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
433 446
                 c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
434 447
 #if HAVE_ALIGNED_STACK
... ...
@@ -438,6 +486,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
438 438
                 c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
439 439
 #endif
440 440
             }
441
+#endif /* HAVE_AVX */
441 442
         }
442 443
     }
443 444
 #endif
... ...
@@ -153,7 +153,7 @@ static int movie_init(AVFilterContext *ctx)
153 153
     movie->w = movie->codec_ctx->width;
154 154
     movie->h = movie->codec_ctx->height;
155 155
 
156
-    av_log(ctx, AV_LOG_INFO, "seek_point:%lld format_name:%s file_name:%s stream_index:%d\n",
156
+    av_log(ctx, AV_LOG_INFO, "seek_point:%"PRIi64" format_name:%s file_name:%s stream_index:%d\n",
157 157
            movie->seek_point, movie->format_name, movie->file_name,
158 158
            movie->stream_index);
159 159
 
... ...
@@ -101,14 +101,14 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx)
101 101
 
102 102
     av_log(s, AV_LOG_DEBUG, "Descriptor Block:\n\n");
103 103
     av_log(s, AV_LOG_DEBUG, "magic                = \"%c%c%c%c\"\n", ape_ctx->magic[0], ape_ctx->magic[1], ape_ctx->magic[2], ape_ctx->magic[3]);
104
-    av_log(s, AV_LOG_DEBUG, "fileversion          = %d\n", ape_ctx->fileversion);
105
-    av_log(s, AV_LOG_DEBUG, "descriptorlength     = %d\n", ape_ctx->descriptorlength);
106
-    av_log(s, AV_LOG_DEBUG, "headerlength         = %d\n", ape_ctx->headerlength);
107
-    av_log(s, AV_LOG_DEBUG, "seektablelength      = %d\n", ape_ctx->seektablelength);
108
-    av_log(s, AV_LOG_DEBUG, "wavheaderlength      = %d\n", ape_ctx->wavheaderlength);
109
-    av_log(s, AV_LOG_DEBUG, "audiodatalength      = %d\n", ape_ctx->audiodatalength);
110
-    av_log(s, AV_LOG_DEBUG, "audiodatalength_high = %d\n", ape_ctx->audiodatalength_high);
111
-    av_log(s, AV_LOG_DEBUG, "wavtaillength        = %d\n", ape_ctx->wavtaillength);
104
+    av_log(s, AV_LOG_DEBUG, "fileversion          = %"PRId16"\n", ape_ctx->fileversion);
105
+    av_log(s, AV_LOG_DEBUG, "descriptorlength     = %"PRIu32"\n", ape_ctx->descriptorlength);
106
+    av_log(s, AV_LOG_DEBUG, "headerlength         = %"PRIu32"\n", ape_ctx->headerlength);
107
+    av_log(s, AV_LOG_DEBUG, "seektablelength      = %"PRIu32"\n", ape_ctx->seektablelength);
108
+    av_log(s, AV_LOG_DEBUG, "wavheaderlength      = %"PRIu32"\n", ape_ctx->wavheaderlength);
109
+    av_log(s, AV_LOG_DEBUG, "audiodatalength      = %"PRIu32"\n", ape_ctx->audiodatalength);
110
+    av_log(s, AV_LOG_DEBUG, "audiodatalength_high = %"PRIu32"\n", ape_ctx->audiodatalength_high);
111
+    av_log(s, AV_LOG_DEBUG, "wavtaillength        = %"PRIu32"\n", ape_ctx->wavtaillength);
112 112
     av_log(s, AV_LOG_DEBUG, "md5                  = ");
113 113
     for (i = 0; i < 16; i++)
114 114
          av_log(s, AV_LOG_DEBUG, "%02x", ape_ctx->md5[i]);
... ...
@@ -116,14 +116,14 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx)
116 116
 
117 117
     av_log(s, AV_LOG_DEBUG, "\nHeader Block:\n\n");
118 118
 
119
-    av_log(s, AV_LOG_DEBUG, "compressiontype      = %d\n", ape_ctx->compressiontype);
120
-    av_log(s, AV_LOG_DEBUG, "formatflags          = %d\n", ape_ctx->formatflags);
121
-    av_log(s, AV_LOG_DEBUG, "blocksperframe       = %d\n", ape_ctx->blocksperframe);
122
-    av_log(s, AV_LOG_DEBUG, "finalframeblocks     = %d\n", ape_ctx->finalframeblocks);
123
-    av_log(s, AV_LOG_DEBUG, "totalframes          = %d\n", ape_ctx->totalframes);
124
-    av_log(s, AV_LOG_DEBUG, "bps                  = %d\n", ape_ctx->bps);
125
-    av_log(s, AV_LOG_DEBUG, "channels             = %d\n", ape_ctx->channels);
126
-    av_log(s, AV_LOG_DEBUG, "samplerate           = %d\n", ape_ctx->samplerate);
119
+    av_log(s, AV_LOG_DEBUG, "compressiontype      = %"PRIu16"\n", ape_ctx->compressiontype);
120
+    av_log(s, AV_LOG_DEBUG, "formatflags          = %"PRIu16"\n", ape_ctx->formatflags);
121
+    av_log(s, AV_LOG_DEBUG, "blocksperframe       = %"PRIu32"\n", ape_ctx->blocksperframe);
122
+    av_log(s, AV_LOG_DEBUG, "finalframeblocks     = %"PRIu32"\n", ape_ctx->finalframeblocks);
123
+    av_log(s, AV_LOG_DEBUG, "totalframes          = %"PRIu32"\n", ape_ctx->totalframes);
124
+    av_log(s, AV_LOG_DEBUG, "bps                  = %"PRIu16"\n", ape_ctx->bps);
125
+    av_log(s, AV_LOG_DEBUG, "channels             = %"PRIu16"\n", ape_ctx->channels);
126
+    av_log(s, AV_LOG_DEBUG, "samplerate           = %"PRIu32"\n", ape_ctx->samplerate);
127 127
 
128 128
     av_log(s, AV_LOG_DEBUG, "\nSeektable\n\n");
129 129
     if ((ape_ctx->seektablelength / sizeof(uint32_t)) != ape_ctx->totalframes) {
... ...
@@ -140,12 +140,14 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx)
140 140
 
141 141
     av_log(s, AV_LOG_DEBUG, "\nFrames\n\n");
142 142
     for (i = 0; i < ape_ctx->totalframes; i++)
143
-        av_log(s, AV_LOG_DEBUG, "%8d   %8lld %8d (%d samples)\n", i, ape_ctx->frames[i].pos, ape_ctx->frames[i].size, ape_ctx->frames[i].nblocks);
143
+        av_log(s, AV_LOG_DEBUG, "%8d   %8"PRId64" %8d (%d samples)\n", i,
144
+               ape_ctx->frames[i].pos, ape_ctx->frames[i].size,
145
+               ape_ctx->frames[i].nblocks);
144 146
 
145 147
     av_log(s, AV_LOG_DEBUG, "\nCalculated information:\n\n");
146
-    av_log(s, AV_LOG_DEBUG, "junklength           = %d\n", ape_ctx->junklength);
147
-    av_log(s, AV_LOG_DEBUG, "firstframe           = %d\n", ape_ctx->firstframe);
148
-    av_log(s, AV_LOG_DEBUG, "totalsamples         = %d\n", ape_ctx->totalsamples);
148
+    av_log(s, AV_LOG_DEBUG, "junklength           = %"PRIu32"\n", ape_ctx->junklength);
149
+    av_log(s, AV_LOG_DEBUG, "firstframe           = %"PRIu32"\n", ape_ctx->firstframe);
150
+    av_log(s, AV_LOG_DEBUG, "totalsamples         = %"PRIu32"\n", ape_ctx->totalsamples);
149 151
 #endif
150 152
 }
151 153
 
... ...
@@ -169,7 +171,8 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
169 169
     ape->fileversion = avio_rl16(pb);
170 170
 
171 171
     if (ape->fileversion < APE_MIN_VERSION || ape->fileversion > APE_MAX_VERSION) {
172
-        av_log(s, AV_LOG_ERROR, "Unsupported file version - %d.%02d\n", ape->fileversion / 1000, (ape->fileversion % 1000) / 10);
172
+        av_log(s, AV_LOG_ERROR, "Unsupported file version - %"PRId16".%02"PRId16"\n",
173
+               ape->fileversion / 1000, (ape->fileversion % 1000) / 10);
173 174
         return -1;
174 175
     }
175 176
 
... ...
@@ -247,11 +250,12 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
247 247
         return AVERROR(EINVAL);
248 248
     }
249 249
     if(ape->totalframes > UINT_MAX / sizeof(APEFrame)){
250
-        av_log(s, AV_LOG_ERROR, "Too many frames: %d\n", ape->totalframes);
250
+        av_log(s, AV_LOG_ERROR, "Too many frames: %"PRIu32"\n",
251
+               ape->totalframes);
251 252
         return -1;
252 253
     }
253 254
     if (ape->seektablelength && (ape->seektablelength / sizeof(*ape->seektable)) < ape->totalframes) {
254
-        av_log(s, AV_LOG_ERROR, "Number of seek entries is less than number of frames: %zd vs. %d\n",
255
+        av_log(s, AV_LOG_ERROR, "Number of seek entries is less than number of frames: %zu vs. %"PRIu32"\n", /* entry count is a size_t (division by sizeof), so %zu, not %ld */
255 256
                ape->seektablelength / sizeof(*ape->seektable), ape->totalframes);
256 257
         return AVERROR_INVALIDDATA;
257 258
     }
... ...
@@ -301,7 +305,9 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap)
301 301
         avio_seek(pb, 0, SEEK_SET);
302 302
     }
303 303
 
304
-    av_log(s, AV_LOG_DEBUG, "Decoding file - v%d.%02d, compression level %d\n", ape->fileversion / 1000, (ape->fileversion % 1000) / 10, ape->compressiontype);
304
+    av_log(s, AV_LOG_DEBUG, "Decoding file - v%d.%02d, compression level %"PRIu16"\n",
305
+           ape->fileversion / 1000, (ape->fileversion % 1000) / 10,
306
+           ape->compressiontype);
305 307
 
306 308
     /* now we are ready: build format streams */
307 309
     st = av_new_stream(s, 0);
... ...
@@ -522,8 +522,8 @@ static int mxf_read_index_table_segment(void *arg, AVIOContext *pb, int tag, int
522 522
     case 0x3F06: av_dlog(NULL, "IndexSID %d\n", avio_rb32(pb)); break;
523 523
     case 0x3F07: av_dlog(NULL, "BodySID %d\n", avio_rb32(pb)); break;
524 524
     case 0x3F0B: av_dlog(NULL, "IndexEditRate %d/%d\n", avio_rb32(pb), avio_rb32(pb)); break;
525
-    case 0x3F0C: av_dlog(NULL, "IndexStartPosition %"PRId64"\n", avio_rb64(pb)); break;
526
-    case 0x3F0D: av_dlog(NULL, "IndexDuration %"PRId64"\n", avio_rb64(pb)); break;
525
+    case 0x3F0C: av_dlog(NULL, "IndexStartPosition %"PRIu64"\n", avio_rb64(pb)); break;
526
+    case 0x3F0D: av_dlog(NULL, "IndexDuration %"PRIu64"\n", avio_rb64(pb)); break;
527 527
     }
528 528
     return 0;
529 529
 }
... ...
@@ -920,7 +920,7 @@ static int mxf_read_header(AVFormatContext *s, AVFormatParameters *ap)
920 920
         if (klv_read_packet(&klv, s->pb) < 0)
921 921
             return -1;
922 922
         PRINT_KEY(s, "read header", klv.key);
923
-        av_dlog(s, "size %"PRId64" offset %#"PRIx64"\n", klv.length, klv.offset);
923
+        av_dlog(s, "size %"PRIu64" offset %#"PRIx64"\n", klv.length, klv.offset);
924 924
         if (IS_KLV_KEY(klv.key, mxf_encrypted_triplet_key) ||
925 925
             IS_KLV_KEY(klv.key, mxf_essence_element_key)) {
926 926
             /* FIXME avoid seek */
... ...
@@ -43,7 +43,7 @@ static int read_atom(AVFormatContext *s, Atom *atom)
43 43
     if (atom->size < 8)
44 44
         return -1;
45 45
     atom->tag = avio_rl32(s->pb);
46
-    av_dlog(s, "atom %d %.4s offset %#"PRIx64"\n",
46
+    av_dlog(s, "atom %u %.4s offset %#"PRIx64"\n",
47 47
             atom->size, (char*)&atom->tag, atom->offset);
48 48
     return atom->size;
49 49
 }
... ...
@@ -356,7 +356,8 @@ static int r3d_seek(AVFormatContext *s, int stream_index, int64_t sample_time, i
356 356
 
357 357
     frame_num = sample_time*st->codec->time_base.den/
358 358
         ((int64_t)st->codec->time_base.num*st->time_base.den);
359
-    av_dlog(s, "seek frame num %d timestamp %"PRId64"\n", frame_num, sample_time);
359
+    av_dlog(s, "seek frame num %d timestamp %"PRId64"\n",
360
+            frame_num, sample_time);
360 361
 
361 362
     if (frame_num < r3d->video_offsets_count) {
362 363
         avio_seek(s->pb, r3d->video_offsets_count, SEEK_SET);