* qatar/master:
ARM: ac3dsp: optimised update_bap_counts()
mpegaudiodec: Fix av_dlog() invocation.
h264/10bit: add HAVE_ALIGNED_STACK checks.
Update 8-bit H.264 IDCT function names to reflect bit-depth.
Add IDCT functions for 10-bit H.264.
mpegaudioenc: Fix broken av_dlog statement.
Employ correct printf format specifiers, mostly in debug output.
ARM: fix MUL64 inline asm for pre-armv6
Conflicts:
libavcodec/mpegaudioenc.c
libavformat/ape.c
libavformat/mxfdec.c
libavformat/r3d.c
Merged-by: Michael Niedermayer <michaelni@gmx.at>
| 5 | 6 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,35 @@ |
| 0 |
+/* |
|
| 1 |
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> |
|
| 2 |
+ * |
|
| 3 |
+ * This file is part of Libav. |
|
| 4 |
+ * |
|
| 5 |
+ * Libav is free software; you can redistribute it and/or |
|
| 6 |
+ * modify it under the terms of the GNU Lesser General Public |
|
| 7 |
+ * License as published by the Free Software Foundation; either |
|
| 8 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
| 9 |
+ * |
|
| 10 |
+ * Libav is distributed in the hope that it will be useful, |
|
| 11 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 12 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 13 |
+ * Lesser General Public License for more details. |
|
| 14 |
+ * |
|
| 15 |
+ * You should have received a copy of the GNU Lesser General Public |
|
| 16 |
+ * License along with Libav; if not, write to the Free Software |
|
| 17 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 18 |
+ */ |
|
| 19 |
+ |
|
| 20 |
+#include "asm.S" |
|
| 21 |
+ |
|
| 22 |
+function ff_ac3_update_bap_counts_arm, export=1 |
|
| 23 |
+ push {lr}
|
|
| 24 |
+ ldrb lr, [r1], #1 |
|
| 25 |
+1: |
|
| 26 |
+ lsl r3, lr, #1 |
|
| 27 |
+ ldrh r12, [r0, r3] |
|
| 28 |
+ subs r2, r2, #1 |
|
| 29 |
+ ldrbgt lr, [r1], #1 |
|
| 30 |
+ add r12, r12, #1 |
|
| 31 |
+ strh r12, [r0, r3] |
|
| 32 |
+ bgt 1b |
|
| 33 |
+ pop {pc}
|
|
| 34 |
+endfunc |
| ... | ... |
@@ -35,10 +35,12 @@ void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd, |
| 35 | 35 |
int snr_offset, int floor, |
| 36 | 36 |
const uint8_t *bap_tab, uint8_t *bap); |
| 37 | 37 |
|
| 38 |
-int ff_ac3_compute_mantissa_size_arm(int cnt[5], uint8_t *bap, int nb_coefs); |
|
| 38 |
+void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len); |
|
| 39 | 39 |
|
| 40 | 40 |
av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact) |
| 41 | 41 |
{
|
| 42 |
+ c->update_bap_counts = ff_ac3_update_bap_counts_arm; |
|
| 43 |
+ |
|
| 42 | 44 |
if (HAVE_ARMV6) {
|
| 43 | 45 |
c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6; |
| 44 | 46 |
} |
| ... | ... |
@@ -41,6 +41,8 @@ static inline av_const int MULL(int a, int b, unsigned shift) |
| 41 | 41 |
} |
| 42 | 42 |
|
| 43 | 43 |
#define MULH MULH |
| 44 |
+#define MUL64 MUL64 |
|
| 45 |
+ |
|
| 44 | 46 |
#if HAVE_ARMV6 |
| 45 | 47 |
static inline av_const int MULH(int a, int b) |
| 46 | 48 |
{
|
| ... | ... |
@@ -48,6 +50,13 @@ static inline av_const int MULH(int a, int b) |
| 48 | 48 |
__asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
|
| 49 | 49 |
return r; |
| 50 | 50 |
} |
| 51 |
+ |
|
| 52 |
+static inline av_const int64_t MUL64(int a, int b) |
|
| 53 |
+{
|
|
| 54 |
+ int64_t x; |
|
| 55 |
+ __asm__ ("smull %Q0, %R0, %1, %2" : "=r"(x) : "r"(a), "r"(b));
|
|
| 56 |
+ return x; |
|
| 57 |
+} |
|
| 51 | 58 |
#else |
| 52 | 59 |
static inline av_const int MULH(int a, int b) |
| 53 | 60 |
{
|
| ... | ... |
@@ -55,15 +64,14 @@ static inline av_const int MULH(int a, int b) |
| 55 | 55 |
__asm__ ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));
|
| 56 | 56 |
return hi; |
| 57 | 57 |
} |
| 58 |
-#endif |
|
| 59 | 58 |
|
| 60 | 59 |
static inline av_const int64_t MUL64(int a, int b) |
| 61 | 60 |
{
|
| 62 | 61 |
int64_t x; |
| 63 |
- __asm__ ("smull %Q0, %R0, %1, %2" : "=r"(x) : "r"(a), "r"(b));
|
|
| 62 |
+ __asm__ ("smull %Q0, %R0, %1, %2" : "=&r"(x) : "r"(a), "r"(b));
|
|
| 64 | 63 |
return x; |
| 65 | 64 |
} |
| 66 |
-#define MUL64 MUL64 |
|
| 65 |
+#endif |
|
| 67 | 66 |
|
| 68 | 67 |
static inline av_const int64_t MAC64(int64_t d, int a, int b) |
| 69 | 68 |
{
|
| ... | ... |
@@ -66,7 +66,6 @@ typedef struct H264DSPContext{
|
| 66 | 66 |
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); |
| 67 | 67 |
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); |
| 68 | 68 |
|
| 69 |
- void (*h264_dct)(DCTELEM block[4][4]); |
|
| 70 | 69 |
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); |
| 71 | 70 |
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); |
| 72 | 71 |
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); |
| ... | ... |
@@ -406,8 +406,9 @@ static av_cold int decode_init(AVCodecContext * avctx) |
| 406 | 406 |
k = i & 1; |
| 407 | 407 |
is_table_lsf[j][k ^ 1][i] = FIXR(f); |
| 408 | 408 |
is_table_lsf[j][k][i] = FIXR(1.0); |
| 409 |
- av_dlog(avctx, "is_table_lsf %d %d: %x %x\n", |
|
| 410 |
- i, j, is_table_lsf[j][0][i], is_table_lsf[j][1][i]); |
|
| 409 |
+ av_dlog(avctx, "is_table_lsf %d %d: %f %f\n", |
|
| 410 |
+ i, j, (float) is_table_lsf[j][0][i], |
|
| 411 |
+ (float) is_table_lsf[j][1][i]); |
|
| 411 | 412 |
} |
| 412 | 413 |
} |
| 413 | 414 |
|
| ... | ... |
@@ -548,13 +548,11 @@ static void compute_bit_allocation(MpegAudioContext *s, |
| 548 | 548 |
} |
| 549 | 549 |
} |
| 550 | 550 |
} |
| 551 |
-#if 0 |
|
| 552 |
- printf("current=%d max=%d max_sb=%d alloc=%d\n",
|
|
| 553 |
- current_frame_size, max_frame_size, max_sb, |
|
| 554 |
- bit_alloc[max_sb]); |
|
| 555 |
-#endif |
|
| 556 | 551 |
if (max_sb < 0) |
| 557 | 552 |
break; |
| 553 |
+ av_dlog(NULL, "current=%d max=%d max_sb=%d max_ch=%d alloc=%d\n", |
|
| 554 |
+ current_frame_size, max_frame_size, max_sb, max_ch, |
|
| 555 |
+ bit_alloc[max_ch][max_sb]); |
|
| 558 | 556 |
|
| 559 | 557 |
/* find alloc table entry (XXX: not optimal, should use |
| 560 | 558 |
pointer table) */ |
| ... | ... |
@@ -12,8 +12,9 @@ YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ |
| 12 | 12 |
MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o |
| 13 | 13 |
YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ |
| 14 | 14 |
x86/h264_deblock_10bit.o \ |
| 15 |
- x86/h264_weight.o \ |
|
| 16 | 15 |
x86/h264_idct.o \ |
| 16 |
+ x86/h264_idct_10bit.o \ |
|
| 17 |
+ x86/h264_weight.o \ |
|
| 17 | 18 |
|
| 18 | 19 |
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o |
| 19 | 20 |
MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o |
| ... | ... |
@@ -73,7 +73,7 @@ SECTION .text |
| 73 | 73 |
|
| 74 | 74 |
INIT_MMX |
| 75 | 75 |
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) |
| 76 |
-cglobal h264_idct_add_mmx, 3, 3, 0 |
|
| 76 |
+cglobal h264_idct_add_8_mmx, 3, 3, 0 |
|
| 77 | 77 |
IDCT4_ADD r0, r1, r2 |
| 78 | 78 |
RET |
| 79 | 79 |
|
| ... | ... |
@@ -125,7 +125,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0 |
| 125 | 125 |
SUMSUB_BA w, 0, 4 |
| 126 | 126 |
SUMSUB_BA w, 3, 2 |
| 127 | 127 |
SUMSUB_BA w, 1, 5 |
| 128 |
- SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 |
|
| 128 |
+ SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 |
|
| 129 | 129 |
%endmacro |
| 130 | 130 |
|
| 131 | 131 |
%macro IDCT8_1D_FULL 1 |
| ... | ... |
@@ -177,7 +177,7 @@ cglobal h264_idct_add_mmx, 3, 3, 0 |
| 177 | 177 |
|
| 178 | 178 |
INIT_MMX |
| 179 | 179 |
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) |
| 180 |
-cglobal h264_idct8_add_mmx, 3, 4, 0 |
|
| 180 |
+cglobal h264_idct8_add_8_mmx, 3, 4, 0 |
|
| 181 | 181 |
%assign pad 128+4-(stack_offset&7) |
| 182 | 182 |
SUB rsp, pad |
| 183 | 183 |
|
| ... | ... |
@@ -237,7 +237,7 @@ cglobal h264_idct8_add_mmx, 3, 4, 0 |
| 237 | 237 |
|
| 238 | 238 |
INIT_XMM |
| 239 | 239 |
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) |
| 240 |
-cglobal h264_idct8_add_sse2, 3, 4, 10 |
|
| 240 |
+cglobal h264_idct8_add_8_sse2, 3, 4, 10 |
|
| 241 | 241 |
IDCT8_ADD_SSE r0, r1, r2, r3 |
| 242 | 242 |
RET |
| 243 | 243 |
|
| ... | ... |
@@ -261,7 +261,7 @@ cglobal h264_idct8_add_sse2, 3, 4, 10 |
| 261 | 261 |
packuswb m1, m1 |
| 262 | 262 |
%endmacro |
| 263 | 263 |
|
| 264 |
-%macro DC_ADD_MMX2_OP 3-4 |
|
| 264 |
+%macro DC_ADD_MMX2_OP 4 |
|
| 265 | 265 |
%1 m2, [%2 ] |
| 266 | 266 |
%1 m3, [%2+%3 ] |
| 267 | 267 |
%1 m4, [%2+%3*2] |
| ... | ... |
@@ -282,13 +282,13 @@ cglobal h264_idct8_add_sse2, 3, 4, 10 |
| 282 | 282 |
|
| 283 | 283 |
INIT_MMX |
| 284 | 284 |
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) |
| 285 |
-cglobal h264_idct_dc_add_mmx2, 3, 3, 0 |
|
| 285 |
+cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0 |
|
| 286 | 286 |
DC_ADD_MMX2_INIT r1, r2 |
| 287 | 287 |
DC_ADD_MMX2_OP movh, r0, r2, r1 |
| 288 | 288 |
RET |
| 289 | 289 |
|
| 290 | 290 |
; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) |
| 291 |
-cglobal h264_idct8_dc_add_mmx2, 3, 3, 0 |
|
| 291 |
+cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0 |
|
| 292 | 292 |
DC_ADD_MMX2_INIT r1, r2 |
| 293 | 293 |
DC_ADD_MMX2_OP mova, r0, r2, r1 |
| 294 | 294 |
lea r0, [r0+r2*4] |
| ... | ... |
@@ -297,7 +297,7 @@ cglobal h264_idct8_dc_add_mmx2, 3, 3, 0 |
| 297 | 297 |
|
| 298 | 298 |
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, |
| 299 | 299 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 300 |
-cglobal h264_idct_add16_mmx, 5, 7, 0 |
|
| 300 |
+cglobal h264_idct_add16_8_mmx, 5, 7, 0 |
|
| 301 | 301 |
xor r5, r5 |
| 302 | 302 |
%ifdef PIC |
| 303 | 303 |
lea r11, [scan8_mem] |
| ... | ... |
@@ -319,7 +319,7 @@ cglobal h264_idct_add16_mmx, 5, 7, 0 |
| 319 | 319 |
|
| 320 | 320 |
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, |
| 321 | 321 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 322 |
-cglobal h264_idct8_add4_mmx, 5, 7, 0 |
|
| 322 |
+cglobal h264_idct8_add4_8_mmx, 5, 7, 0 |
|
| 323 | 323 |
%assign pad 128+4-(stack_offset&7) |
| 324 | 324 |
SUB rsp, pad |
| 325 | 325 |
|
| ... | ... |
@@ -351,7 +351,7 @@ cglobal h264_idct8_add4_mmx, 5, 7, 0 |
| 351 | 351 |
|
| 352 | 352 |
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, |
| 353 | 353 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 354 |
-cglobal h264_idct_add16_mmx2, 5, 7, 0 |
|
| 354 |
+cglobal h264_idct_add16_8_mmx2, 5, 7, 0 |
|
| 355 | 355 |
xor r5, r5 |
| 356 | 356 |
%ifdef PIC |
| 357 | 357 |
lea r11, [scan8_mem] |
| ... | ... |
@@ -398,7 +398,7 @@ cglobal h264_idct_add16_mmx2, 5, 7, 0 |
| 398 | 398 |
|
| 399 | 399 |
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, |
| 400 | 400 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 401 |
-cglobal h264_idct_add16intra_mmx, 5, 7, 0 |
|
| 401 |
+cglobal h264_idct_add16intra_8_mmx, 5, 7, 0 |
|
| 402 | 402 |
xor r5, r5 |
| 403 | 403 |
%ifdef PIC |
| 404 | 404 |
lea r11, [scan8_mem] |
| ... | ... |
@@ -421,7 +421,7 @@ cglobal h264_idct_add16intra_mmx, 5, 7, 0 |
| 421 | 421 |
|
| 422 | 422 |
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, |
| 423 | 423 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 424 |
-cglobal h264_idct_add16intra_mmx2, 5, 7, 0 |
|
| 424 |
+cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0 |
|
| 425 | 425 |
xor r5, r5 |
| 426 | 426 |
%ifdef PIC |
| 427 | 427 |
lea r11, [scan8_mem] |
| ... | ... |
@@ -466,7 +466,7 @@ cglobal h264_idct_add16intra_mmx2, 5, 7, 0 |
| 466 | 466 |
|
| 467 | 467 |
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, |
| 468 | 468 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 469 |
-cglobal h264_idct8_add4_mmx2, 5, 7, 0 |
|
| 469 |
+cglobal h264_idct8_add4_8_mmx2, 5, 7, 0 |
|
| 470 | 470 |
%assign pad 128+4-(stack_offset&7) |
| 471 | 471 |
SUB rsp, pad |
| 472 | 472 |
|
| ... | ... |
@@ -529,7 +529,7 @@ cglobal h264_idct8_add4_mmx2, 5, 7, 0 |
| 529 | 529 |
INIT_XMM |
| 530 | 530 |
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, |
| 531 | 531 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 532 |
-cglobal h264_idct8_add4_sse2, 5, 7, 10 |
|
| 532 |
+cglobal h264_idct8_add4_8_sse2, 5, 7, 10 |
|
| 533 | 533 |
xor r5, r5 |
| 534 | 534 |
%ifdef PIC |
| 535 | 535 |
lea r11, [scan8_mem] |
| ... | ... |
@@ -607,7 +607,7 @@ h264_idct_add8_mmx_plane: |
| 607 | 607 |
|
| 608 | 608 |
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, |
| 609 | 609 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 610 |
-cglobal h264_idct_add8_mmx, 5, 7, 0 |
|
| 610 |
+cglobal h264_idct_add8_8_mmx, 5, 7, 0 |
|
| 611 | 611 |
mov r5, 16 |
| 612 | 612 |
add r2, 512 |
| 613 | 613 |
%ifdef PIC |
| ... | ... |
@@ -668,7 +668,7 @@ h264_idct_add8_mmx2_plane |
| 668 | 668 |
|
| 669 | 669 |
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, |
| 670 | 670 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 671 |
-cglobal h264_idct_add8_mmx2, 5, 7, 0 |
|
| 671 |
+cglobal h264_idct_add8_8_mmx2, 5, 7, 0 |
|
| 672 | 672 |
mov r5, 16 |
| 673 | 673 |
add r2, 512 |
| 674 | 674 |
%ifdef ARCH_X86_64 |
| ... | ... |
@@ -744,7 +744,7 @@ x264_add8x4_idct_sse2: |
| 744 | 744 |
|
| 745 | 745 |
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, |
| 746 | 746 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 747 |
-cglobal h264_idct_add16_sse2, 5, 5, 8 |
|
| 747 |
+cglobal h264_idct_add16_8_sse2, 5, 5, 8 |
|
| 748 | 748 |
%ifdef ARCH_X86_64 |
| 749 | 749 |
mov r10, r0 |
| 750 | 750 |
%endif |
| ... | ... |
@@ -791,7 +791,7 @@ cglobal h264_idct_add16_sse2, 5, 5, 8 |
| 791 | 791 |
|
| 792 | 792 |
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, |
| 793 | 793 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 794 |
-cglobal h264_idct_add16intra_sse2, 5, 7, 8 |
|
| 794 |
+cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 |
|
| 795 | 795 |
%ifdef ARCH_X86_64 |
| 796 | 796 |
mov r10, r0 |
| 797 | 797 |
%endif |
| ... | ... |
@@ -840,7 +840,7 @@ cglobal h264_idct_add16intra_sse2, 5, 7, 8 |
| 840 | 840 |
|
| 841 | 841 |
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, |
| 842 | 842 |
; DCTELEM *block, int stride, const uint8_t nnzc[6*8]) |
| 843 |
-cglobal h264_idct_add8_sse2, 5, 7, 8 |
|
| 843 |
+cglobal h264_idct_add8_8_sse2, 5, 7, 8 |
|
| 844 | 844 |
add r2, 512 |
| 845 | 845 |
%ifdef ARCH_X86_64 |
| 846 | 846 |
mov r10, r0 |
| 847 | 847 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,570 @@ |
| 0 |
+;***************************************************************************** |
|
| 1 |
+;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code |
|
| 2 |
+;***************************************************************************** |
|
| 3 |
+;* Copyright (C) 2005-2011 x264 project |
|
| 4 |
+;* |
|
| 5 |
+;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
|
| 6 |
+;* |
|
| 7 |
+;* This file is part of Libav. |
|
| 8 |
+;* |
|
| 9 |
+;* Libav is free software; you can redistribute it and/or |
|
| 10 |
+;* modify it under the terms of the GNU Lesser General Public |
|
| 11 |
+;* License as published by the Free Software Foundation; either |
|
| 12 |
+;* version 2.1 of the License, or (at your option) any later version. |
|
| 13 |
+;* |
|
| 14 |
+;* Libav is distributed in the hope that it will be useful, |
|
| 15 |
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 16 |
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
| 17 |
+;* Lesser General Public License for more details. |
|
| 18 |
+;* |
|
| 19 |
+;* You should have received a copy of the GNU Lesser General Public |
|
| 20 |
+;* License along with Libav; if not, write to the Free Software |
|
| 21 |
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
| 22 |
+;****************************************************************************** |
|
| 23 |
+ |
|
| 24 |
+%include "x86inc.asm" |
|
| 25 |
+%include "x86util.asm" |
|
| 26 |
+ |
|
| 27 |
+SECTION_RODATA |
|
| 28 |
+ |
|
| 29 |
+pw_pixel_max: times 8 dw ((1 << 10)-1) |
|
| 30 |
+pd_32: times 4 dd 32 |
|
| 31 |
+scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 |
|
| 32 |
+ db 6+1*8, 7+1*8, 6+2*8, 7+2*8 |
|
| 33 |
+ db 4+3*8, 5+3*8, 4+4*8, 5+4*8 |
|
| 34 |
+ db 6+3*8, 7+3*8, 6+4*8, 7+4*8 |
|
| 35 |
+ db 1+1*8, 2+1*8 |
|
| 36 |
+ db 1+2*8, 2+2*8 |
|
| 37 |
+ db 1+4*8, 2+4*8 |
|
| 38 |
+ db 1+5*8, 2+5*8 |
|
| 39 |
+ |
|
| 40 |
+%ifdef PIC |
|
| 41 |
+%define scan8 r11 |
|
| 42 |
+%else |
|
| 43 |
+%define scan8 scan8_mem |
|
| 44 |
+%endif |
|
| 45 |
+ |
|
| 46 |
+SECTION .text |
|
| 47 |
+ |
|
| 48 |
+;----------------------------------------------------------------------------- |
|
| 49 |
+; void h264_idct_add(pixel *dst, dctcoef *block, int stride) |
|
| 50 |
+;----------------------------------------------------------------------------- |
|
| 51 |
+%macro STORE_DIFFx2 6 |
|
| 52 |
+ psrad %1, 6 |
|
| 53 |
+ psrad %2, 6 |
|
| 54 |
+ packssdw %1, %2 |
|
| 55 |
+ movq %3, [%5] |
|
| 56 |
+ movhps %3, [%5+%6] |
|
| 57 |
+ paddsw %1, %3 |
|
| 58 |
+ CLIPW %1, %4, [pw_pixel_max] |
|
| 59 |
+ movq [%5], %1 |
|
| 60 |
+ movhps [%5+%6], %1 |
|
| 61 |
+%endmacro |
|
| 62 |
+ |
|
| 63 |
+%macro STORE_DIFF16 5 |
|
| 64 |
+ psrad %1, 6 |
|
| 65 |
+ psrad %2, 6 |
|
| 66 |
+ packssdw %1, %2 |
|
| 67 |
+ paddsw %1, [%5] |
|
| 68 |
+ CLIPW %1, %3, %4 |
|
| 69 |
+ mova [%5], %1 |
|
| 70 |
+%endmacro |
|
| 71 |
+ |
|
| 72 |
+;dst, in, stride |
|
| 73 |
+%macro IDCT4_ADD_10 3 |
|
| 74 |
+ mova m0, [%2+ 0] |
|
| 75 |
+ mova m1, [%2+16] |
|
| 76 |
+ mova m2, [%2+32] |
|
| 77 |
+ mova m3, [%2+48] |
|
| 78 |
+ IDCT4_1D d,0,1,2,3,4,5 |
|
| 79 |
+ TRANSPOSE4x4D 0,1,2,3,4 |
|
| 80 |
+ paddd m0, [pd_32] |
|
| 81 |
+ IDCT4_1D d,0,1,2,3,4,5 |
|
| 82 |
+ pxor m5, m5 |
|
| 83 |
+ STORE_DIFFx2 m0, m1, m4, m5, %1, %3 |
|
| 84 |
+ lea %1, [%1+%3*2] |
|
| 85 |
+ STORE_DIFFx2 m2, m3, m4, m5, %1, %3 |
|
| 86 |
+%endmacro |
|
| 87 |
+ |
|
| 88 |
+%macro IDCT_ADD_10 1 |
|
| 89 |
+cglobal h264_idct_add_10_%1, 3,3 |
|
| 90 |
+ IDCT4_ADD_10 r0, r1, r2 |
|
| 91 |
+ RET |
|
| 92 |
+%endmacro |
|
| 93 |
+ |
|
| 94 |
+INIT_XMM |
|
| 95 |
+IDCT_ADD_10 sse2 |
|
| 96 |
+%ifdef HAVE_AVX |
|
| 97 |
+INIT_AVX |
|
| 98 |
+IDCT_ADD_10 avx |
|
| 99 |
+%endif |
|
| 100 |
+ |
|
| 101 |
+;----------------------------------------------------------------------------- |
|
| 102 |
+; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
|
| 103 |
+;----------------------------------------------------------------------------- |
|
| 104 |
+;;;;;;; NO FATE SAMPLES TRIGGER THIS |
|
| 105 |
+%macro ADD4x4IDCT 1 |
|
| 106 |
+add4x4_idct_%1: |
|
| 107 |
+ add r5, r0 |
|
| 108 |
+ mova m0, [r2+ 0] |
|
| 109 |
+ mova m1, [r2+16] |
|
| 110 |
+ mova m2, [r2+32] |
|
| 111 |
+ mova m3, [r2+48] |
|
| 112 |
+ IDCT4_1D d,0,1,2,3,4,5 |
|
| 113 |
+ TRANSPOSE4x4D 0,1,2,3,4 |
|
| 114 |
+ paddd m0, [pd_32] |
|
| 115 |
+ IDCT4_1D d,0,1,2,3,4,5 |
|
| 116 |
+ pxor m5, m5 |
|
| 117 |
+ STORE_DIFFx2 m0, m1, m4, m5, r5, r3 |
|
| 118 |
+ lea r5, [r5+r3*2] |
|
| 119 |
+ STORE_DIFFx2 m2, m3, m4, m5, r5, r3 |
|
| 120 |
+ ret |
|
| 121 |
+%endmacro |
|
| 122 |
+ |
|
| 123 |
+INIT_XMM |
|
| 124 |
+ALIGN 16 |
|
| 125 |
+ADD4x4IDCT sse2 |
|
| 126 |
+%ifdef HAVE_AVX |
|
| 127 |
+INIT_AVX |
|
| 128 |
+ALIGN 16 |
|
| 129 |
+ADD4x4IDCT avx |
|
| 130 |
+%endif |
|
| 131 |
+ |
|
| 132 |
+%macro ADD16_OP 3 |
|
| 133 |
+ cmp byte [r4+%3], 0 |
|
| 134 |
+ jz .skipblock%2 |
|
| 135 |
+ mov r5d, dword [r1+%2*4] |
|
| 136 |
+ call add4x4_idct_%1 |
|
| 137 |
+.skipblock%2: |
|
| 138 |
+%if %2<15 |
|
| 139 |
+ add r2, 64 |
|
| 140 |
+%endif |
|
| 141 |
+%endmacro |
|
| 142 |
+ |
|
| 143 |
+%macro IDCT_ADD16_10 1 |
|
| 144 |
+cglobal h264_idct_add16_10_%1, 5,6 |
|
| 145 |
+ ADD16_OP %1, 0, 4+1*8 |
|
| 146 |
+ ADD16_OP %1, 1, 5+1*8 |
|
| 147 |
+ ADD16_OP %1, 2, 4+2*8 |
|
| 148 |
+ ADD16_OP %1, 3, 5+2*8 |
|
| 149 |
+ ADD16_OP %1, 4, 6+1*8 |
|
| 150 |
+ ADD16_OP %1, 5, 7+1*8 |
|
| 151 |
+ ADD16_OP %1, 6, 6+2*8 |
|
| 152 |
+ ADD16_OP %1, 7, 7+2*8 |
|
| 153 |
+ ADD16_OP %1, 8, 4+3*8 |
|
| 154 |
+ ADD16_OP %1, 9, 5+3*8 |
|
| 155 |
+ ADD16_OP %1, 10, 4+4*8 |
|
| 156 |
+ ADD16_OP %1, 11, 5+4*8 |
|
| 157 |
+ ADD16_OP %1, 12, 6+3*8 |
|
| 158 |
+ ADD16_OP %1, 13, 7+3*8 |
|
| 159 |
+ ADD16_OP %1, 14, 6+4*8 |
|
| 160 |
+ ADD16_OP %1, 15, 7+4*8 |
|
| 161 |
+ RET |
|
| 162 |
+%endmacro |
|
| 163 |
+ |
|
| 164 |
+INIT_XMM |
|
| 165 |
+IDCT_ADD16_10 sse2 |
|
| 166 |
+%ifdef HAVE_AVX |
|
| 167 |
+INIT_AVX |
|
| 168 |
+IDCT_ADD16_10 avx |
|
| 169 |
+%endif |
|
| 170 |
+ |
|
| 171 |
+;----------------------------------------------------------------------------- |
|
| 172 |
+; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) |
|
| 173 |
+;----------------------------------------------------------------------------- |
|
| 174 |
+%macro IDCT_DC_ADD_OP_10 3 |
|
| 175 |
+ pxor m5, m5 |
|
| 176 |
+%if avx_enabled |
|
| 177 |
+ paddw m1, m0, [%1+0 ] |
|
| 178 |
+ paddw m2, m0, [%1+%2 ] |
|
| 179 |
+ paddw m3, m0, [%1+%2*2] |
|
| 180 |
+ paddw m4, m0, [%1+%3 ] |
|
| 181 |
+%else |
|
| 182 |
+ mova m1, [%1+0 ] |
|
| 183 |
+ mova m2, [%1+%2 ] |
|
| 184 |
+ mova m3, [%1+%2*2] |
|
| 185 |
+ mova m4, [%1+%3 ] |
|
| 186 |
+ paddw m1, m0 |
|
| 187 |
+ paddw m2, m0 |
|
| 188 |
+ paddw m3, m0 |
|
| 189 |
+ paddw m4, m0 |
|
| 190 |
+%endif |
|
| 191 |
+ CLIPW m1, m5, m6 |
|
| 192 |
+ CLIPW m2, m5, m6 |
|
| 193 |
+ CLIPW m3, m5, m6 |
|
| 194 |
+ CLIPW m4, m5, m6 |
|
| 195 |
+ mova [%1+0 ], m1 |
|
| 196 |
+ mova [%1+%2 ], m2 |
|
| 197 |
+ mova [%1+%2*2], m3 |
|
| 198 |
+ mova [%1+%3 ], m4 |
|
| 199 |
+%endmacro |
|
| 200 |
+ |
|
| 201 |
+INIT_MMX |
|
| 202 |
+cglobal h264_idct_dc_add_10_mmx2,3,3 |
|
| 203 |
+ movd m0, dword [r1] |
|
| 204 |
+ paddd m0, [pd_32] |
|
| 205 |
+ psrad m0, 6 |
|
| 206 |
+ lea r1, [r2*3] |
|
| 207 |
+ pshufw m0, m0, 0 |
|
| 208 |
+ mova m6, [pw_pixel_max] |
|
| 209 |
+ IDCT_DC_ADD_OP_10 r0, r2, r1 |
|
| 210 |
+ RET |
|
| 211 |
+ |
|
| 212 |
+;----------------------------------------------------------------------------- |
|
| 213 |
+; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) |
|
| 214 |
+;----------------------------------------------------------------------------- |
|
| 215 |
+%macro IDCT8_DC_ADD 1 |
|
| 216 |
+cglobal h264_idct8_dc_add_10_%1,3,3,7 |
|
| 217 |
+ mov r1d, dword [r1] |
|
| 218 |
+ add r1, 32 |
|
| 219 |
+ sar r1, 6 |
|
| 220 |
+ movd m0, r1d |
|
| 221 |
+ lea r1, [r2*3] |
|
| 222 |
+ SPLATW m0, m0, 0 |
|
| 223 |
+ mova m6, [pw_pixel_max] |
|
| 224 |
+ IDCT_DC_ADD_OP_10 r0, r2, r1 |
|
| 225 |
+ lea r0, [r0+r2*4] |
|
| 226 |
+ IDCT_DC_ADD_OP_10 r0, r2, r1 |
|
| 227 |
+ RET |
|
| 228 |
+%endmacro |
|
| 229 |
+ |
|
| 230 |
+INIT_XMM |
|
| 231 |
+IDCT8_DC_ADD sse2 |
|
| 232 |
+%ifdef HAVE_AVX |
|
| 233 |
+INIT_AVX |
|
| 234 |
+IDCT8_DC_ADD avx |
|
| 235 |
+%endif |
|
| 236 |
+ |
|
| 237 |
+;----------------------------------------------------------------------------- |
|
| 238 |
+; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
|
| 239 |
+;----------------------------------------------------------------------------- |
|
| 240 |
+%macro AC 2 |
|
| 241 |
+.ac%2 |
|
| 242 |
+ mov r5d, dword [r1+(%2+0)*4] |
|
| 243 |
+ call add4x4_idct_%1 |
|
| 244 |
+ mov r5d, dword [r1+(%2+1)*4] |
|
| 245 |
+ add r2, 64 |
|
| 246 |
+ call add4x4_idct_%1 |
|
| 247 |
+ add r2, 64 |
|
| 248 |
+ jmp .skipadd%2 |
|
| 249 |
+%endmacro |
|
| 250 |
+ |
|
| 251 |
+%macro ADD16_OP_INTRA 3 |
|
| 252 |
+ cmp word [r4+%3], 0 |
|
| 253 |
+ jnz .ac%2 |
|
| 254 |
+ mov r6d, dword [r2+ 0] |
|
| 255 |
+ or r6d, dword [r2+64] |
|
| 256 |
+ jz .skipblock%2 |
|
| 257 |
+ mov r5d, dword [r1+(%2+0)*4] |
|
| 258 |
+ call idct_dc_add_%1 |
|
| 259 |
+.skipblock%2: |
|
| 260 |
+%if %2<15 |
|
| 261 |
+ add r2, 128 |
|
| 262 |
+%endif |
|
| 263 |
+.skipadd%2: |
|
| 264 |
+%endmacro |
|
| 265 |
+ |
|
| 266 |
+%macro IDCT_ADD16INTRA_10 1 |
|
| 267 |
+idct_dc_add_%1: |
|
| 268 |
+ add r5, r0 |
|
| 269 |
+ movq m0, [r2+ 0] |
|
| 270 |
+ movhps m0, [r2+64] |
|
| 271 |
+ paddd m0, [pd_32] |
|
| 272 |
+ psrad m0, 6 |
|
| 273 |
+ pshufhw m0, m0, 0 |
|
| 274 |
+ pshuflw m0, m0, 0 |
|
| 275 |
+ lea r6, [r3*3] |
|
| 276 |
+ mova m6, [pw_pixel_max] |
|
| 277 |
+ IDCT_DC_ADD_OP_10 r5, r3, r6 |
|
| 278 |
+ ret |
|
| 279 |
+ |
|
| 280 |
+cglobal h264_idct_add16intra_10_%1,5,7,8 |
|
| 281 |
+ ADD16_OP_INTRA %1, 0, 4+1*8 |
|
| 282 |
+ ADD16_OP_INTRA %1, 2, 4+2*8 |
|
| 283 |
+ ADD16_OP_INTRA %1, 4, 6+1*8 |
|
| 284 |
+ ADD16_OP_INTRA %1, 6, 6+2*8 |
|
| 285 |
+ ADD16_OP_INTRA %1, 8, 4+3*8 |
|
| 286 |
+ ADD16_OP_INTRA %1, 10, 4+4*8 |
|
| 287 |
+ ADD16_OP_INTRA %1, 12, 6+3*8 |
|
| 288 |
+ ADD16_OP_INTRA %1, 14, 6+4*8 |
|
| 289 |
+ RET |
|
| 290 |
+%assign i 14 |
|
| 291 |
+%rep 8 |
|
| 292 |
+ AC %1, i |
|
| 293 |
+%assign i i-2 |
|
| 294 |
+%endrep |
|
| 295 |
+%endmacro |
|
| 296 |
+ |
|
| 297 |
+INIT_XMM |
|
| 298 |
+IDCT_ADD16INTRA_10 sse2 |
|
| 299 |
+%ifdef HAVE_AVX |
|
| 300 |
+INIT_AVX |
|
| 301 |
+IDCT_ADD16INTRA_10 avx |
|
| 302 |
+%endif |
|
| 303 |
+ |
|
| 304 |
+;----------------------------------------------------------------------------- |
|
| 305 |
+; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
|
| 306 |
+;----------------------------------------------------------------------------- |
|
| 307 |
+%macro IDCT_ADD8 1 |
|
| 308 |
+cglobal h264_idct_add8_10_%1,5,7 |
|
| 309 |
+ mov r5, 16 |
|
| 310 |
+ add r2, 1024 |
|
| 311 |
+%ifdef PIC |
|
| 312 |
+ lea r11, [scan8_mem] |
|
| 313 |
+%endif |
|
| 314 |
+%ifdef ARCH_X86_64 |
|
| 315 |
+ mov r10, r0 |
|
| 316 |
+%endif |
|
| 317 |
+.nextblock: |
|
| 318 |
+ movzx r6, byte [scan8+r5] |
|
| 319 |
+ movzx r6, byte [r4+r6] |
|
| 320 |
+ or r6d, dword [r2] |
|
| 321 |
+ test r6, r6 |
|
| 322 |
+ jz .skipblock |
|
| 323 |
+%ifdef ARCH_X86_64 |
|
| 324 |
+ mov r0d, dword [r1+r5*4] |
|
| 325 |
+ add r0, [r10] |
|
| 326 |
+%else |
|
| 327 |
+ mov r0, r0m |
|
| 328 |
+ mov r0, [r0] |
|
| 329 |
+ add r0, dword [r1+r5*4] |
|
| 330 |
+%endif |
|
| 331 |
+ IDCT4_ADD_10 r0, r2, r3 |
|
| 332 |
+.skipblock: |
|
| 333 |
+ inc r5 |
|
| 334 |
+ add r2, 64 |
|
| 335 |
+ test r5, 3 |
|
| 336 |
+ jnz .nextblock |
|
| 337 |
+%ifdef ARCH_X86_64 |
|
| 338 |
+ add r10, gprsize |
|
| 339 |
+%else |
|
| 340 |
+ add r0mp, gprsize |
|
| 341 |
+%endif |
|
| 342 |
+ test r5, 4 |
|
| 343 |
+ jnz .nextblock |
|
| 344 |
+ REP_RET |
|
| 345 |
+%endmacro ; IDCT_ADD8 |
|
| 346 |
+ |
|
| 347 |
+INIT_XMM |
|
| 348 |
+IDCT_ADD8 sse2 |
|
| 349 |
+%ifdef HAVE_AVX |
|
| 350 |
+INIT_AVX |
|
| 351 |
+IDCT_ADD8 avx |
|
| 352 |
+%endif |
|
| 353 |
+ |
|
| 354 |
+;----------------------------------------------------------------------------- |
|
| 355 |
+; void h264_idct8_add(pixel *dst, dctcoef *block, int stride) |
|
| 356 |
+;----------------------------------------------------------------------------- |
|
| 357 |
+%macro IDCT8_1D 2 |
|
| 358 |
+ SWAP 0, 1 |
|
| 359 |
+ psrad m4, m5, 1 |
|
| 360 |
+ psrad m1, m0, 1 |
|
| 361 |
+ paddd m4, m5 |
|
| 362 |
+ paddd m1, m0 |
|
| 363 |
+ paddd m4, m7 |
|
| 364 |
+ paddd m1, m5 |
|
| 365 |
+ psubd m4, m0 |
|
| 366 |
+ paddd m1, m3 |
|
| 367 |
+ |
|
| 368 |
+ psubd m0, m3 |
|
| 369 |
+ psubd m5, m3 |
|
| 370 |
+ paddd m0, m7 |
|
| 371 |
+ psubd m5, m7 |
|
| 372 |
+ psrad m3, 1 |
|
| 373 |
+ psrad m7, 1 |
|
| 374 |
+ psubd m0, m3 |
|
| 375 |
+ psubd m5, m7 |
|
| 376 |
+ |
|
| 377 |
+ SWAP 1, 7 |
|
| 378 |
+ psrad m1, m7, 2 |
|
| 379 |
+ psrad m3, m4, 2 |
|
| 380 |
+ paddd m3, m0 |
|
| 381 |
+ psrad m0, 2 |
|
| 382 |
+ paddd m1, m5 |
|
| 383 |
+ psrad m5, 2 |
|
| 384 |
+ psubd m0, m4 |
|
| 385 |
+ psubd m7, m5 |
|
| 386 |
+ |
|
| 387 |
+ SWAP 5, 6 |
|
| 388 |
+ psrad m4, m2, 1 |
|
| 389 |
+ psrad m6, m5, 1 |
|
| 390 |
+ psubd m4, m5 |
|
| 391 |
+ paddd m6, m2 |
|
| 392 |
+ |
|
| 393 |
+ mova m2, %1 |
|
| 394 |
+ mova m5, %2 |
|
| 395 |
+ SUMSUB_BA d, 5, 2 |
|
| 396 |
+ SUMSUB_BA d, 6, 5 |
|
| 397 |
+ SUMSUB_BA d, 4, 2 |
|
| 398 |
+ SUMSUB_BA d, 7, 6 |
|
| 399 |
+ SUMSUB_BA d, 0, 4 |
|
| 400 |
+ SUMSUB_BA d, 3, 2 |
|
| 401 |
+ SUMSUB_BA d, 1, 5 |
|
| 402 |
+ SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 |
|
| 403 |
+%endmacro |
|
| 404 |
+ |
|
| 405 |
+%macro IDCT8_1D_FULL 1 |
|
| 406 |
+ mova m7, [%1+112*2] |
|
| 407 |
+ mova m6, [%1+ 96*2] |
|
| 408 |
+ mova m5, [%1+ 80*2] |
|
| 409 |
+ mova m3, [%1+ 48*2] |
|
| 410 |
+ mova m2, [%1+ 32*2] |
|
| 411 |
+ mova m1, [%1+ 16*2] |
|
| 412 |
+ IDCT8_1D [%1], [%1+ 64*2] |
|
| 413 |
+%endmacro |
|
| 414 |
+ |
|
| 415 |
+; %1=dctcoef *block, %2=dctcoef *dstblock |
|
| 416 |
+%macro IDCT8_ADD_SSE_START 2 |
|
| 417 |
+ IDCT8_1D_FULL %1 |
|
| 418 |
+%ifdef ARCH_X86_64 |
|
| 419 |
+ TRANSPOSE4x4D 0,1,2,3,8 |
|
| 420 |
+ mova [%2 ], m0 |
|
| 421 |
+ TRANSPOSE4x4D 4,5,6,7,8 |
|
| 422 |
+ mova [%2+8*2], m4 |
|
| 423 |
+%else |
|
| 424 |
+ mova [%1], m7 |
|
| 425 |
+ TRANSPOSE4x4D 0,1,2,3,7 |
|
| 426 |
+ mova m7, [%1] |
|
| 427 |
+ mova [%2 ], m0 |
|
| 428 |
+ mova [%2+16*2], m1 |
|
| 429 |
+ mova [%2+32*2], m2 |
|
| 430 |
+ mova [%2+48*2], m3 |
|
| 431 |
+ TRANSPOSE4x4D 4,5,6,7,3 |
|
| 432 |
+ mova [%2+ 8*2], m4 |
|
| 433 |
+ mova [%2+24*2], m5 |
|
| 434 |
+ mova [%2+40*2], m6 |
|
| 435 |
+ mova [%2+56*2], m7 |
|
| 436 |
+%endif |
|
| 437 |
+%endmacro |
|
| 438 |
+ |
|
| 439 |
+; %1=pixel *dst, %2=dctcoef *block, %3=int stride |
|
| 440 |
+%macro IDCT8_ADD_SSE_END 3 |
|
| 441 |
+ IDCT8_1D_FULL %2 |
|
| 442 |
+ mova [%2 ], m6 |
|
| 443 |
+ mova [%2+16*2], m7 |
|
| 444 |
+ |
|
| 445 |
+ pxor m7, m7 |
|
| 446 |
+ STORE_DIFFx2 m0, m1, m6, m7, %1, %3 |
|
| 447 |
+ lea %1, [%1+%3*2] |
|
| 448 |
+ STORE_DIFFx2 m2, m3, m6, m7, %1, %3 |
|
| 449 |
+ mova m0, [%2 ] |
|
| 450 |
+ mova m1, [%2+16*2] |
|
| 451 |
+ lea %1, [%1+%3*2] |
|
| 452 |
+ STORE_DIFFx2 m4, m5, m6, m7, %1, %3 |
|
| 453 |
+ lea %1, [%1+%3*2] |
|
| 454 |
+ STORE_DIFFx2 m0, m1, m6, m7, %1, %3 |
|
| 455 |
+%endmacro |
|
| 456 |
+ |
|
| 457 |
+%macro IDCT8_ADD 1 |
|
| 458 |
+cglobal h264_idct8_add_10_%1, 3,4,16 |
|
| 459 |
+%ifndef UNIX64 |
|
| 460 |
+ %assign pad 16-gprsize-(stack_offset&15) |
|
| 461 |
+ sub rsp, pad |
|
| 462 |
+ call h264_idct8_add1_10_%1 |
|
| 463 |
+ add rsp, pad |
|
| 464 |
+ RET |
|
| 465 |
+%endif |
|
| 466 |
+ |
|
| 467 |
+ALIGN 16 |
|
| 468 |
+; TODO: does not need to use stack |
|
| 469 |
+h264_idct8_add1_10_%1: |
|
| 470 |
+%assign pad 256+16-gprsize |
|
| 471 |
+ sub rsp, pad |
|
| 472 |
+ add dword [r1], 32 |
|
| 473 |
+ |
|
| 474 |
+%ifdef ARCH_X86_64 |
|
| 475 |
+ IDCT8_ADD_SSE_START r1, rsp |
|
| 476 |
+ SWAP 1, 9 |
|
| 477 |
+ SWAP 2, 10 |
|
| 478 |
+ SWAP 3, 11 |
|
| 479 |
+ SWAP 5, 13 |
|
| 480 |
+ SWAP 6, 14 |
|
| 481 |
+ SWAP 7, 15 |
|
| 482 |
+ IDCT8_ADD_SSE_START r1+16, rsp+128 |
|
| 483 |
+ PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7 |
|
| 484 |
+ IDCT8_1D [rsp], [rsp+128] |
|
| 485 |
+ SWAP 0, 8 |
|
| 486 |
+ SWAP 1, 9 |
|
| 487 |
+ SWAP 2, 10 |
|
| 488 |
+ SWAP 3, 11 |
|
| 489 |
+ SWAP 4, 12 |
|
| 490 |
+ SWAP 5, 13 |
|
| 491 |
+ SWAP 6, 14 |
|
| 492 |
+ SWAP 7, 15 |
|
| 493 |
+ IDCT8_1D [rsp+16], [rsp+144] |
|
| 494 |
+ psrad m8, 6 |
|
| 495 |
+ psrad m0, 6 |
|
| 496 |
+ packssdw m8, m0 |
|
| 497 |
+ paddsw m8, [r0] |
|
| 498 |
+ pxor m0, m0 |
|
| 499 |
+ CLIPW m8, m0, [pw_pixel_max] |
|
| 500 |
+ mova [r0], m8 |
|
| 501 |
+ mova m8, [pw_pixel_max] |
|
| 502 |
+ STORE_DIFF16 m9, m1, m0, m8, r0+r2 |
|
| 503 |
+ lea r0, [r0+r2*2] |
|
| 504 |
+ STORE_DIFF16 m10, m2, m0, m8, r0 |
|
| 505 |
+ STORE_DIFF16 m11, m3, m0, m8, r0+r2 |
|
| 506 |
+ lea r0, [r0+r2*2] |
|
| 507 |
+ STORE_DIFF16 m12, m4, m0, m8, r0 |
|
| 508 |
+ STORE_DIFF16 m13, m5, m0, m8, r0+r2 |
|
| 509 |
+ lea r0, [r0+r2*2] |
|
| 510 |
+ STORE_DIFF16 m14, m6, m0, m8, r0 |
|
| 511 |
+ STORE_DIFF16 m15, m7, m0, m8, r0+r2 |
|
| 512 |
+%else |
|
| 513 |
+ IDCT8_ADD_SSE_START r1, rsp |
|
| 514 |
+ IDCT8_ADD_SSE_START r1+16, rsp+128 |
|
| 515 |
+ lea r3, [r0+8] |
|
| 516 |
+ IDCT8_ADD_SSE_END r0, rsp, r2 |
|
| 517 |
+ IDCT8_ADD_SSE_END r3, rsp+16, r2 |
|
| 518 |
+%endif ; ARCH_X86_64 |
|
| 519 |
+ |
|
| 520 |
+ add rsp, pad |
|
| 521 |
+ ret |
|
| 522 |
+%endmacro |
|
| 523 |
+ |
|
| 524 |
+INIT_XMM |
|
| 525 |
+IDCT8_ADD sse2 |
|
| 526 |
+%ifdef HAVE_AVX |
|
| 527 |
+INIT_AVX |
|
| 528 |
+IDCT8_ADD avx |
|
| 529 |
+%endif |
|
| 530 |
+ |
|
| 531 |
+;----------------------------------------------------------------------------- |
|
| 532 |
+; h264_idct8_add4(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
|
| 533 |
+;----------------------------------------------------------------------------- |
|
| 534 |
+;;;;;;; NO FATE SAMPLES TRIGGER THIS |
|
| 535 |
+%macro IDCT8_ADD4_OP 3 |
|
| 536 |
+ cmp byte [r4+%3], 0 |
|
| 537 |
+ jz .skipblock%2 |
|
| 538 |
+ mov r0d, dword [r6+%2*4] |
|
| 539 |
+ add r0, r5 |
|
| 540 |
+ call h264_idct8_add1_10_%1 |
|
| 541 |
+.skipblock%2: |
|
| 542 |
+%if %2<12 |
|
| 543 |
+ add r1, 256 |
|
| 544 |
+%endif |
|
| 545 |
+%endmacro |
|
| 546 |
+ |
|
| 547 |
+%macro IDCT8_ADD4 1 |
|
| 548 |
+cglobal h264_idct8_add4_10_%1, 0,7,16 |
|
| 549 |
+ %assign pad 16-gprsize-(stack_offset&15) |
|
| 550 |
+ SUB rsp, pad |
|
| 551 |
+ mov r5, r0mp |
|
| 552 |
+ mov r6, r1mp |
|
| 553 |
+ mov r1, r2mp |
|
| 554 |
+ mov r2d, r3m |
|
| 555 |
+ movifnidn r4, r4mp |
|
| 556 |
+ IDCT8_ADD4_OP %1, 0, 4+1*8 |
|
| 557 |
+ IDCT8_ADD4_OP %1, 4, 6+1*8 |
|
| 558 |
+ IDCT8_ADD4_OP %1, 8, 4+3*8 |
|
| 559 |
+ IDCT8_ADD4_OP %1, 12, 6+3*8 |
|
| 560 |
+ ADD rsp, pad |
|
| 561 |
+ RET |
|
| 562 |
+%endmacro ; IDCT8_ADD4 |
|
| 563 |
+ |
|
| 564 |
+INIT_XMM |
|
| 565 |
+IDCT8_ADD4 sse2 |
|
| 566 |
+%ifdef HAVE_AVX |
|
| 567 |
+INIT_AVX |
|
| 568 |
+IDCT8_ADD4 avx |
|
| 569 |
+%endif |
| ... | ... |
@@ -27,38 +27,61 @@ DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; |
| 27 | 27 |
|
| 28 | 28 |
/***********************************/ |
| 29 | 29 |
/* IDCT */ |
| 30 |
+#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \ |
|
| 31 |
+void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride); |
|
| 32 |
+ |
|
| 33 |
+IDCT_ADD_FUNC(, 8, mmx) |
|
| 34 |
+IDCT_ADD_FUNC(, 10, sse2) |
|
| 35 |
+IDCT_ADD_FUNC(_dc, 8, mmx2) |
|
| 36 |
+IDCT_ADD_FUNC(_dc, 10, mmx2) |
|
| 37 |
+IDCT_ADD_FUNC(8_dc, 8, mmx2) |
|
| 38 |
+IDCT_ADD_FUNC(8_dc, 10, sse2) |
|
| 39 |
+IDCT_ADD_FUNC(8, 8, mmx) |
|
| 40 |
+IDCT_ADD_FUNC(8, 8, sse2) |
|
| 41 |
+IDCT_ADD_FUNC(8, 10, sse2) |
|
| 42 |
+#if HAVE_AVX |
|
| 43 |
+IDCT_ADD_FUNC(, 10, avx) |
|
| 44 |
+IDCT_ADD_FUNC(8_dc, 10, avx) |
|
| 45 |
+IDCT_ADD_FUNC(8, 10, avx) |
|
| 46 |
+#endif |
|
| 47 |
+ |
|
| 48 |
+ |
|
| 49 |
+#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \ |
|
| 50 |
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ |
|
| 51 |
+ (uint8_t *dst, const int *block_offset, \ |
|
| 52 |
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 53 |
+ |
|
| 54 |
+IDCT_ADD_REP_FUNC(8, 4, 8, mmx) |
|
| 55 |
+IDCT_ADD_REP_FUNC(8, 4, 8, mmx2) |
|
| 56 |
+IDCT_ADD_REP_FUNC(8, 4, 8, sse2) |
|
| 57 |
+IDCT_ADD_REP_FUNC(8, 4, 10, sse2) |
|
| 58 |
+IDCT_ADD_REP_FUNC(8, 4, 10, avx) |
|
| 59 |
+IDCT_ADD_REP_FUNC(, 16, 8, mmx) |
|
| 60 |
+IDCT_ADD_REP_FUNC(, 16, 8, mmx2) |
|
| 61 |
+IDCT_ADD_REP_FUNC(, 16, 8, sse2) |
|
| 62 |
+IDCT_ADD_REP_FUNC(, 16, 10, sse2) |
|
| 63 |
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) |
|
| 64 |
+IDCT_ADD_REP_FUNC(, 16intra, 8, mmx2) |
|
| 65 |
+IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) |
|
| 66 |
+IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) |
|
| 67 |
+#if HAVE_AVX |
|
| 68 |
+IDCT_ADD_REP_FUNC(, 16, 10, avx) |
|
| 69 |
+IDCT_ADD_REP_FUNC(, 16intra, 10, avx) |
|
| 70 |
+#endif |
|
| 71 |
+ |
|
| 72 |
+ |
|
| 73 |
+#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \ |
|
| 74 |
+void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ |
|
| 75 |
+ (uint8_t **dst, const int *block_offset, \ |
|
| 76 |
+ DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 77 |
+IDCT_ADD_REP_FUNC2(, 8, 8, mmx) |
|
| 78 |
+IDCT_ADD_REP_FUNC2(, 8, 8, mmx2) |
|
| 79 |
+IDCT_ADD_REP_FUNC2(, 8, 8, sse2) |
|
| 80 |
+IDCT_ADD_REP_FUNC2(, 8, 10, sse2) |
|
| 81 |
+#if HAVE_AVX |
|
| 82 |
+IDCT_ADD_REP_FUNC2(, 8, 10, avx) |
|
| 83 |
+#endif |
|
| 30 | 84 |
|
| 31 |
-void ff_h264_idct_add_mmx (uint8_t *dst, int16_t *block, int stride); |
|
| 32 |
-void ff_h264_idct8_add_mmx (uint8_t *dst, int16_t *block, int stride); |
|
| 33 |
-void ff_h264_idct8_add_sse2 (uint8_t *dst, int16_t *block, int stride); |
|
| 34 |
-void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride); |
|
| 35 |
-void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride); |
|
| 36 |
- |
|
| 37 |
-void ff_h264_idct_add16_mmx (uint8_t *dst, const int *block_offset, |
|
| 38 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 39 |
-void ff_h264_idct8_add4_mmx (uint8_t *dst, const int *block_offset, |
|
| 40 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 41 |
-void ff_h264_idct_add16_mmx2 (uint8_t *dst, const int *block_offset, |
|
| 42 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 43 |
-void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset, |
|
| 44 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 45 |
-void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, |
|
| 46 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 47 |
-void ff_h264_idct8_add4_mmx2 (uint8_t *dst, const int *block_offset, |
|
| 48 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 49 |
-void ff_h264_idct8_add4_sse2 (uint8_t *dst, const int *block_offset, |
|
| 50 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 51 |
-void ff_h264_idct_add8_mmx (uint8_t **dest, const int *block_offset, |
|
| 52 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 53 |
-void ff_h264_idct_add8_mmx2 (uint8_t **dest, const int *block_offset, |
|
| 54 |
- DCTELEM *block, int stride, const uint8_t nnzc[6*8]); |
|
| 55 |
- |
|
| 56 |
-void ff_h264_idct_add16_sse2 (uint8_t *dst, const int *block_offset, DCTELEM *block, |
|
| 57 |
- int stride, const uint8_t nnzc[6*8]); |
|
| 58 |
-void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, |
|
| 59 |
- int stride, const uint8_t nnzc[6*8]); |
|
| 60 |
-void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, |
|
| 61 |
- int stride, const uint8_t nnzc[6*8]); |
|
| 62 | 85 |
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); |
| 63 | 86 |
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); |
| 64 | 87 |
|
| ... | ... |
@@ -313,24 +336,24 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) |
| 313 | 313 |
} |
| 314 | 314 |
#if HAVE_YASM |
| 315 | 315 |
if (mm_flags & AV_CPU_FLAG_MMX) {
|
| 316 |
- c->h264_idct_dc_add= |
|
| 317 |
- c->h264_idct_add= ff_h264_idct_add_mmx; |
|
| 318 |
- c->h264_idct8_dc_add= |
|
| 319 |
- c->h264_idct8_add= ff_h264_idct8_add_mmx; |
|
| 320 |
- |
|
| 321 |
- c->h264_idct_add16 = ff_h264_idct_add16_mmx; |
|
| 322 |
- c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; |
|
| 323 |
- c->h264_idct_add8 = ff_h264_idct_add8_mmx; |
|
| 324 |
- c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; |
|
| 316 |
+ c->h264_idct_dc_add = |
|
| 317 |
+ c->h264_idct_add = ff_h264_idct_add_8_mmx; |
|
| 318 |
+ c->h264_idct8_dc_add = |
|
| 319 |
+ c->h264_idct8_add = ff_h264_idct8_add_8_mmx; |
|
| 320 |
+ |
|
| 321 |
+ c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; |
|
| 322 |
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; |
|
| 323 |
+ c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; |
|
| 324 |
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; |
|
| 325 | 325 |
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; |
| 326 | 326 |
|
| 327 | 327 |
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
| 328 |
- c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; |
|
| 329 |
- c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; |
|
| 330 |
- c->h264_idct_add16 = ff_h264_idct_add16_mmx2; |
|
| 331 |
- c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; |
|
| 332 |
- c->h264_idct_add8 = ff_h264_idct_add8_mmx2; |
|
| 333 |
- c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; |
|
| 328 |
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmx2; |
|
| 329 |
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmx2; |
|
| 330 |
+ c->h264_idct_add16 = ff_h264_idct_add16_8_mmx2; |
|
| 331 |
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx2; |
|
| 332 |
+ c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2; |
|
| 333 |
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2; |
|
| 334 | 334 |
|
| 335 | 335 |
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext; |
| 336 | 336 |
c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; |
| ... | ... |
@@ -361,8 +384,12 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) |
| 361 | 361 |
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; |
| 362 | 362 |
|
| 363 | 363 |
if (mm_flags&AV_CPU_FLAG_SSE2) {
|
| 364 |
- c->h264_idct8_add = ff_h264_idct8_add_sse2; |
|
| 365 |
- c->h264_idct8_add4= ff_h264_idct8_add4_sse2; |
|
| 364 |
+ c->h264_idct8_add = ff_h264_idct8_add_8_sse2; |
|
| 365 |
+ |
|
| 366 |
+ c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; |
|
| 367 |
+ c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; |
|
| 368 |
+ c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; |
|
| 369 |
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; |
|
| 366 | 370 |
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; |
| 367 | 371 |
|
| 368 | 372 |
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; |
| ... | ... |
@@ -383,10 +410,6 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) |
| 383 | 383 |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; |
| 384 | 384 |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; |
| 385 | 385 |
#endif |
| 386 |
- |
|
| 387 |
- c->h264_idct_add16 = ff_h264_idct_add16_sse2; |
|
| 388 |
- c->h264_idct_add8 = ff_h264_idct_add8_sse2; |
|
| 389 |
- c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; |
|
| 390 | 386 |
} |
| 391 | 387 |
if (mm_flags&AV_CPU_FLAG_SSSE3) {
|
| 392 | 388 |
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; |
| ... | ... |
@@ -418,7 +441,19 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) |
| 418 | 418 |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; |
| 419 | 419 |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; |
| 420 | 420 |
#endif |
| 421 |
+ c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2; |
|
| 421 | 422 |
if (mm_flags&AV_CPU_FLAG_SSE2) {
|
| 423 |
+ c->h264_idct_add = ff_h264_idct_add_10_sse2; |
|
| 424 |
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; |
|
| 425 |
+ |
|
| 426 |
+ c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; |
|
| 427 |
+ c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; |
|
| 428 |
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2; |
|
| 429 |
+#if HAVE_ALIGNED_STACK |
|
| 430 |
+ c->h264_idct8_add = ff_h264_idct8_add_10_sse2; |
|
| 431 |
+ c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; |
|
| 432 |
+#endif |
|
| 433 |
+ |
|
| 422 | 434 |
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2; |
| 423 | 435 |
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2; |
| 424 | 436 |
#if HAVE_ALIGNED_STACK |
| ... | ... |
@@ -428,7 +463,20 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) |
| 428 | 428 |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; |
| 429 | 429 |
#endif |
| 430 | 430 |
} |
| 431 |
+#if HAVE_AVX |
|
| 431 | 432 |
if (mm_flags&AV_CPU_FLAG_AVX) {
|
| 433 |
+ c->h264_idct_dc_add = |
|
| 434 |
+ c->h264_idct_add = ff_h264_idct_add_10_avx; |
|
| 435 |
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; |
|
| 436 |
+ |
|
| 437 |
+ c->h264_idct_add16 = ff_h264_idct_add16_10_avx; |
|
| 438 |
+ c->h264_idct_add8 = ff_h264_idct_add8_10_avx; |
|
| 439 |
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx; |
|
| 440 |
+#if HAVE_ALIGNED_STACK |
|
| 441 |
+ c->h264_idct8_add = ff_h264_idct8_add_10_avx; |
|
| 442 |
+ c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; |
|
| 443 |
+#endif |
|
| 444 |
+ |
|
| 432 | 445 |
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx; |
| 433 | 446 |
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx; |
| 434 | 447 |
#if HAVE_ALIGNED_STACK |
| ... | ... |
@@ -438,6 +486,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth) |
| 438 | 438 |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; |
| 439 | 439 |
#endif |
| 440 | 440 |
} |
| 441 |
+#endif /* HAVE_AVX */ |
|
| 441 | 442 |
} |
| 442 | 443 |
} |
| 443 | 444 |
#endif |
| ... | ... |
@@ -153,7 +153,7 @@ static int movie_init(AVFilterContext *ctx) |
| 153 | 153 |
movie->w = movie->codec_ctx->width; |
| 154 | 154 |
movie->h = movie->codec_ctx->height; |
| 155 | 155 |
|
| 156 |
- av_log(ctx, AV_LOG_INFO, "seek_point:%lld format_name:%s file_name:%s stream_index:%d\n", |
|
| 156 |
+ av_log(ctx, AV_LOG_INFO, "seek_point:%"PRIi64" format_name:%s file_name:%s stream_index:%d\n", |
|
| 157 | 157 |
movie->seek_point, movie->format_name, movie->file_name, |
| 158 | 158 |
movie->stream_index); |
| 159 | 159 |
|
| ... | ... |
@@ -101,14 +101,14 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx) |
| 101 | 101 |
|
| 102 | 102 |
av_log(s, AV_LOG_DEBUG, "Descriptor Block:\n\n"); |
| 103 | 103 |
av_log(s, AV_LOG_DEBUG, "magic = \"%c%c%c%c\"\n", ape_ctx->magic[0], ape_ctx->magic[1], ape_ctx->magic[2], ape_ctx->magic[3]); |
| 104 |
- av_log(s, AV_LOG_DEBUG, "fileversion = %d\n", ape_ctx->fileversion); |
|
| 105 |
- av_log(s, AV_LOG_DEBUG, "descriptorlength = %d\n", ape_ctx->descriptorlength); |
|
| 106 |
- av_log(s, AV_LOG_DEBUG, "headerlength = %d\n", ape_ctx->headerlength); |
|
| 107 |
- av_log(s, AV_LOG_DEBUG, "seektablelength = %d\n", ape_ctx->seektablelength); |
|
| 108 |
- av_log(s, AV_LOG_DEBUG, "wavheaderlength = %d\n", ape_ctx->wavheaderlength); |
|
| 109 |
- av_log(s, AV_LOG_DEBUG, "audiodatalength = %d\n", ape_ctx->audiodatalength); |
|
| 110 |
- av_log(s, AV_LOG_DEBUG, "audiodatalength_high = %d\n", ape_ctx->audiodatalength_high); |
|
| 111 |
- av_log(s, AV_LOG_DEBUG, "wavtaillength = %d\n", ape_ctx->wavtaillength); |
|
| 104 |
+ av_log(s, AV_LOG_DEBUG, "fileversion = %"PRId16"\n", ape_ctx->fileversion); |
|
| 105 |
+ av_log(s, AV_LOG_DEBUG, "descriptorlength = %"PRIu32"\n", ape_ctx->descriptorlength); |
|
| 106 |
+ av_log(s, AV_LOG_DEBUG, "headerlength = %"PRIu32"\n", ape_ctx->headerlength); |
|
| 107 |
+ av_log(s, AV_LOG_DEBUG, "seektablelength = %"PRIu32"\n", ape_ctx->seektablelength); |
|
| 108 |
+ av_log(s, AV_LOG_DEBUG, "wavheaderlength = %"PRIu32"\n", ape_ctx->wavheaderlength); |
|
| 109 |
+ av_log(s, AV_LOG_DEBUG, "audiodatalength = %"PRIu32"\n", ape_ctx->audiodatalength); |
|
| 110 |
+ av_log(s, AV_LOG_DEBUG, "audiodatalength_high = %"PRIu32"\n", ape_ctx->audiodatalength_high); |
|
| 111 |
+ av_log(s, AV_LOG_DEBUG, "wavtaillength = %"PRIu32"\n", ape_ctx->wavtaillength); |
|
| 112 | 112 |
av_log(s, AV_LOG_DEBUG, "md5 = "); |
| 113 | 113 |
for (i = 0; i < 16; i++) |
| 114 | 114 |
av_log(s, AV_LOG_DEBUG, "%02x", ape_ctx->md5[i]); |
| ... | ... |
@@ -116,14 +116,14 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx) |
| 116 | 116 |
|
| 117 | 117 |
av_log(s, AV_LOG_DEBUG, "\nHeader Block:\n\n"); |
| 118 | 118 |
|
| 119 |
- av_log(s, AV_LOG_DEBUG, "compressiontype = %d\n", ape_ctx->compressiontype); |
|
| 120 |
- av_log(s, AV_LOG_DEBUG, "formatflags = %d\n", ape_ctx->formatflags); |
|
| 121 |
- av_log(s, AV_LOG_DEBUG, "blocksperframe = %d\n", ape_ctx->blocksperframe); |
|
| 122 |
- av_log(s, AV_LOG_DEBUG, "finalframeblocks = %d\n", ape_ctx->finalframeblocks); |
|
| 123 |
- av_log(s, AV_LOG_DEBUG, "totalframes = %d\n", ape_ctx->totalframes); |
|
| 124 |
- av_log(s, AV_LOG_DEBUG, "bps = %d\n", ape_ctx->bps); |
|
| 125 |
- av_log(s, AV_LOG_DEBUG, "channels = %d\n", ape_ctx->channels); |
|
| 126 |
- av_log(s, AV_LOG_DEBUG, "samplerate = %d\n", ape_ctx->samplerate); |
|
| 119 |
+ av_log(s, AV_LOG_DEBUG, "compressiontype = %"PRIu16"\n", ape_ctx->compressiontype); |
|
| 120 |
+ av_log(s, AV_LOG_DEBUG, "formatflags = %"PRIu16"\n", ape_ctx->formatflags); |
|
| 121 |
+ av_log(s, AV_LOG_DEBUG, "blocksperframe = %"PRIu32"\n", ape_ctx->blocksperframe); |
|
| 122 |
+ av_log(s, AV_LOG_DEBUG, "finalframeblocks = %"PRIu32"\n", ape_ctx->finalframeblocks); |
|
| 123 |
+ av_log(s, AV_LOG_DEBUG, "totalframes = %"PRIu32"\n", ape_ctx->totalframes); |
|
| 124 |
+ av_log(s, AV_LOG_DEBUG, "bps = %"PRIu16"\n", ape_ctx->bps); |
|
| 125 |
+ av_log(s, AV_LOG_DEBUG, "channels = %"PRIu16"\n", ape_ctx->channels); |
|
| 126 |
+ av_log(s, AV_LOG_DEBUG, "samplerate = %"PRIu32"\n", ape_ctx->samplerate); |
|
| 127 | 127 |
|
| 128 | 128 |
av_log(s, AV_LOG_DEBUG, "\nSeektable\n\n"); |
| 129 | 129 |
if ((ape_ctx->seektablelength / sizeof(uint32_t)) != ape_ctx->totalframes) {
|
| ... | ... |
@@ -140,12 +140,14 @@ static void ape_dumpinfo(AVFormatContext * s, APEContext * ape_ctx) |
| 140 | 140 |
|
| 141 | 141 |
av_log(s, AV_LOG_DEBUG, "\nFrames\n\n"); |
| 142 | 142 |
for (i = 0; i < ape_ctx->totalframes; i++) |
| 143 |
- av_log(s, AV_LOG_DEBUG, "%8d %8lld %8d (%d samples)\n", i, ape_ctx->frames[i].pos, ape_ctx->frames[i].size, ape_ctx->frames[i].nblocks); |
|
| 143 |
+ av_log(s, AV_LOG_DEBUG, "%8d %8"PRId64" %8d (%d samples)\n", i, |
|
| 144 |
+ ape_ctx->frames[i].pos, ape_ctx->frames[i].size, |
|
| 145 |
+ ape_ctx->frames[i].nblocks); |
|
| 144 | 146 |
|
| 145 | 147 |
av_log(s, AV_LOG_DEBUG, "\nCalculated information:\n\n"); |
| 146 |
- av_log(s, AV_LOG_DEBUG, "junklength = %d\n", ape_ctx->junklength); |
|
| 147 |
- av_log(s, AV_LOG_DEBUG, "firstframe = %d\n", ape_ctx->firstframe); |
|
| 148 |
- av_log(s, AV_LOG_DEBUG, "totalsamples = %d\n", ape_ctx->totalsamples); |
|
| 148 |
+ av_log(s, AV_LOG_DEBUG, "junklength = %"PRIu32"\n", ape_ctx->junklength); |
|
| 149 |
+ av_log(s, AV_LOG_DEBUG, "firstframe = %"PRIu32"\n", ape_ctx->firstframe); |
|
| 150 |
+ av_log(s, AV_LOG_DEBUG, "totalsamples = %"PRIu32"\n", ape_ctx->totalsamples); |
|
| 149 | 151 |
#endif |
| 150 | 152 |
} |
| 151 | 153 |
|
| ... | ... |
@@ -169,7 +171,8 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap) |
| 169 | 169 |
ape->fileversion = avio_rl16(pb); |
| 170 | 170 |
|
| 171 | 171 |
if (ape->fileversion < APE_MIN_VERSION || ape->fileversion > APE_MAX_VERSION) {
|
| 172 |
- av_log(s, AV_LOG_ERROR, "Unsupported file version - %d.%02d\n", ape->fileversion / 1000, (ape->fileversion % 1000) / 10); |
|
| 172 |
+ av_log(s, AV_LOG_ERROR, "Unsupported file version - %"PRId16".%02"PRId16"\n", |
|
| 173 |
+ ape->fileversion / 1000, (ape->fileversion % 1000) / 10); |
|
| 173 | 174 |
return -1; |
| 174 | 175 |
} |
| 175 | 176 |
|
| ... | ... |
@@ -247,11 +250,12 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap) |
| 247 | 247 |
return AVERROR(EINVAL); |
| 248 | 248 |
} |
| 249 | 249 |
if(ape->totalframes > UINT_MAX / sizeof(APEFrame)){
|
| 250 |
- av_log(s, AV_LOG_ERROR, "Too many frames: %d\n", ape->totalframes); |
|
| 250 |
+ av_log(s, AV_LOG_ERROR, "Too many frames: %"PRIu32"\n", |
|
| 251 |
+ ape->totalframes); |
|
| 251 | 252 |
return -1; |
| 252 | 253 |
} |
| 253 | 254 |
if (ape->seektablelength && (ape->seektablelength / sizeof(*ape->seektable)) < ape->totalframes) {
|
| 254 |
- av_log(s, AV_LOG_ERROR, "Number of seek entries is less than number of frames: %zd vs. %d\n", |
|
| 255 |
+ av_log(s, AV_LOG_ERROR, "Number of seek entries is less than number of frames: %ld vs. %"PRIu32"\n", |
|
| 255 | 256 |
ape->seektablelength / sizeof(*ape->seektable), ape->totalframes); |
| 256 | 257 |
return AVERROR_INVALIDDATA; |
| 257 | 258 |
} |
| ... | ... |
@@ -301,7 +305,9 @@ static int ape_read_header(AVFormatContext * s, AVFormatParameters * ap) |
| 301 | 301 |
avio_seek(pb, 0, SEEK_SET); |
| 302 | 302 |
} |
| 303 | 303 |
|
| 304 |
- av_log(s, AV_LOG_DEBUG, "Decoding file - v%d.%02d, compression level %d\n", ape->fileversion / 1000, (ape->fileversion % 1000) / 10, ape->compressiontype); |
|
| 304 |
+ av_log(s, AV_LOG_DEBUG, "Decoding file - v%d.%02d, compression level %"PRIu16"\n", |
|
| 305 |
+ ape->fileversion / 1000, (ape->fileversion % 1000) / 10, |
|
| 306 |
+ ape->compressiontype); |
|
| 305 | 307 |
|
| 306 | 308 |
/* now we are ready: build format streams */ |
| 307 | 309 |
st = av_new_stream(s, 0); |
| ... | ... |
@@ -522,8 +522,8 @@ static int mxf_read_index_table_segment(void *arg, AVIOContext *pb, int tag, int |
| 522 | 522 |
case 0x3F06: av_dlog(NULL, "IndexSID %d\n", avio_rb32(pb)); break; |
| 523 | 523 |
case 0x3F07: av_dlog(NULL, "BodySID %d\n", avio_rb32(pb)); break; |
| 524 | 524 |
case 0x3F0B: av_dlog(NULL, "IndexEditRate %d/%d\n", avio_rb32(pb), avio_rb32(pb)); break; |
| 525 |
- case 0x3F0C: av_dlog(NULL, "IndexStartPosition %"PRId64"\n", avio_rb64(pb)); break; |
|
| 526 |
- case 0x3F0D: av_dlog(NULL, "IndexDuration %"PRId64"\n", avio_rb64(pb)); break; |
|
| 525 |
+ case 0x3F0C: av_dlog(NULL, "IndexStartPosition %"PRIu64"\n", avio_rb64(pb)); break; |
|
| 526 |
+ case 0x3F0D: av_dlog(NULL, "IndexDuration %"PRIu64"\n", avio_rb64(pb)); break; |
|
| 527 | 527 |
} |
| 528 | 528 |
return 0; |
| 529 | 529 |
} |
| ... | ... |
@@ -920,7 +920,7 @@ static int mxf_read_header(AVFormatContext *s, AVFormatParameters *ap) |
| 920 | 920 |
if (klv_read_packet(&klv, s->pb) < 0) |
| 921 | 921 |
return -1; |
| 922 | 922 |
PRINT_KEY(s, "read header", klv.key); |
| 923 |
- av_dlog(s, "size %"PRId64" offset %#"PRIx64"\n", klv.length, klv.offset); |
|
| 923 |
+ av_dlog(s, "size %"PRIu64" offset %#"PRIx64"\n", klv.length, klv.offset); |
|
| 924 | 924 |
if (IS_KLV_KEY(klv.key, mxf_encrypted_triplet_key) || |
| 925 | 925 |
IS_KLV_KEY(klv.key, mxf_essence_element_key)) {
|
| 926 | 926 |
/* FIXME avoid seek */ |
| ... | ... |
@@ -43,7 +43,7 @@ static int read_atom(AVFormatContext *s, Atom *atom) |
| 43 | 43 |
if (atom->size < 8) |
| 44 | 44 |
return -1; |
| 45 | 45 |
atom->tag = avio_rl32(s->pb); |
| 46 |
- av_dlog(s, "atom %d %.4s offset %#"PRIx64"\n", |
|
| 46 |
+ av_dlog(s, "atom %u %.4s offset %#"PRIx64"\n", |
|
| 47 | 47 |
atom->size, (char*)&atom->tag, atom->offset); |
| 48 | 48 |
return atom->size; |
| 49 | 49 |
} |
| ... | ... |
@@ -356,7 +356,8 @@ static int r3d_seek(AVFormatContext *s, int stream_index, int64_t sample_time, i |
| 356 | 356 |
|
| 357 | 357 |
frame_num = sample_time*st->codec->time_base.den/ |
| 358 | 358 |
((int64_t)st->codec->time_base.num*st->time_base.den); |
| 359 |
- av_dlog(s, "seek frame num %d timestamp %"PRId64"\n", frame_num, sample_time); |
|
| 359 |
+ av_dlog(s, "seek frame num %d timestamp %"PRId64"\n", |
|
| 360 |
+ frame_num, sample_time); |
|
| 360 | 361 |
|
| 361 | 362 |
if (frame_num < r3d->video_offsets_count) {
|
| 362 | 363 |
avio_seek(s->pb, r3d->video_offsets_count, SEEK_SET); |