| ... | ... |
@@ -2509,6 +2509,18 @@ static void butterflies_float_c(float *restrict v1, float *restrict v2, |
| 2509 | 2509 |
} |
| 2510 | 2510 |
} |
| 2511 | 2511 |
|
| 2512 |
/**
 * Compute the element-wise sum and difference of two float vectors and
 * store the results interleaved in a separate output vector:
 *
 *     dst[2*i]     = src0[i] + src1[i]
 *     dst[2*i + 1] = src0[i] - src1[i]
 *
 * @param dst  output vector; 2 * len floats are written
 * @param src0 first input vector; len floats are read
 * @param src1 second input vector; len floats are read
 * @param len  number of elements in each input vector; nothing is
 *             written when len <= 0
 */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        /* Load both operands before storing either result. */
        float f1 = src0[i];
        float f2 = src1[i];
        dst[2*i    ] = f1 + f2;
        dst[2*i + 1] = f1 - f2;
    }
}
|
| 2523 |
+ |
|
| 2512 | 2524 |
static float scalarproduct_float_c(const float *v1, const float *v2, int len) |
| 2513 | 2525 |
{
|
| 2514 | 2526 |
float p = 0.0; |
| ... | ... |
@@ -3036,6 +3048,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx) |
| 3036 | 3036 |
c->vector_clip_int32 = vector_clip_int32_c; |
| 3037 | 3037 |
c->scalarproduct_float = scalarproduct_float_c; |
| 3038 | 3038 |
c->butterflies_float = butterflies_float_c; |
| 3039 |
+ c->butterflies_float_interleave = butterflies_float_interleave_c; |
|
| 3039 | 3040 |
c->vector_fmul_scalar = vector_fmul_scalar_c; |
| 3040 | 3041 |
c->vector_fmac_scalar = vector_fmac_scalar_c; |
| 3041 | 3042 |
|
| ... | ... |
@@ -453,6 +453,23 @@ typedef struct DSPContext {
|
| 453 | 453 |
*/ |
| 454 | 454 |
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len); |
| 455 | 455 |
|
| 456 |
/**
 * Calculate the sum and difference of two vectors of floats and interleave
 * results into a separate output vector of floats, with each sum
 * positioned before the corresponding difference.
 *
 * @param dst  output vector; receives 2 * len floats
 *             constraints: 16-byte aligned
 * @param src0 first input vector
 *             constraints: 32-byte aligned
 * @param src1 second input vector
 *             constraints: 32-byte aligned
 * @param len  number of elements in each input vector
 *             constraints: multiple of 8
 */
void (*butterflies_float_interleave)(float *dst, const float *src0,
                                     const float *src1, int len);
|
| 472 |
+ |
|
| 456 | 473 |
/* (I)DCT */ |
| 457 | 474 |
void (*fdct)(DCTELEM *block/* align 16*/); |
| 458 | 475 |
void (*fdct248)(DCTELEM *block/* align 16*/); |
| ... | ... |
@@ -665,8 +665,9 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype, |
| 665 | 665 |
float *out) |
| 666 | 666 |
{
|
| 667 | 667 |
const ModeTab *mtab = tctx->mtab; |
| 668 |
+ int size1, size2; |
|
| 668 | 669 |
float *prev_buf = tctx->prev_frame + tctx->last_block_pos[0]; |
| 669 |
- int i, j; |
|
| 670 |
+ int i; |
|
| 670 | 671 |
|
| 671 | 672 |
for (i = 0; i < tctx->avctx->channels; i++) {
|
| 672 | 673 |
imdct_and_window(tctx, ftype, wtype, |
| ... | ... |
@@ -675,27 +676,24 @@ static void imdct_output(TwinContext *tctx, enum FrameType ftype, int wtype, |
| 675 | 675 |
i); |
| 676 | 676 |
} |
| 677 | 677 |
|
| 678 |
+ size2 = tctx->last_block_pos[0]; |
|
| 679 |
+ size1 = mtab->size - size2; |
|
| 678 | 680 |
if (tctx->avctx->channels == 2) {
|
| 679 |
- for (i = 0; i < mtab->size - tctx->last_block_pos[0]; i++) {
|
|
| 680 |
- float f1 = prev_buf[ i]; |
|
| 681 |
- float f2 = prev_buf[2*mtab->size + i]; |
|
| 682 |
- out[2*i ] = f1 + f2; |
|
| 683 |
- out[2*i + 1] = f1 - f2; |
|
| 684 |
- } |
|
| 685 |
- for (j = 0; i < mtab->size; j++,i++) {
|
|
| 686 |
- float f1 = tctx->curr_frame[ j]; |
|
| 687 |
- float f2 = tctx->curr_frame[2*mtab->size + j]; |
|
| 688 |
- out[2*i ] = f1 + f2; |
|
| 689 |
- out[2*i + 1] = f1 - f2; |
|
| 690 |
- } |
|
| 681 |
+ tctx->dsp.butterflies_float_interleave(out, prev_buf, |
|
| 682 |
+ &prev_buf[2*mtab->size], |
|
| 683 |
+ size1); |
|
| 684 |
+ |
|
| 685 |
+ out += 2 * size1; |
|
| 686 |
+ |
|
| 687 |
+ tctx->dsp.butterflies_float_interleave(out, tctx->curr_frame, |
|
| 688 |
+ &tctx->curr_frame[2*mtab->size], |
|
| 689 |
+ size2); |
|
| 691 | 690 |
} else {
|
| 692 |
- memcpy(out, prev_buf, |
|
| 693 |
- (mtab->size - tctx->last_block_pos[0]) * sizeof(*out)); |
|
| 691 |
+ memcpy(out, prev_buf, size1 * sizeof(*out)); |
|
| 694 | 692 |
|
| 695 |
- out += mtab->size - tctx->last_block_pos[0]; |
|
| 693 |
+ out += size1; |
|
| 696 | 694 |
|
| 697 |
- memcpy(out, tctx->curr_frame, |
|
| 698 |
- (tctx->last_block_pos[0]) * sizeof(*out)); |
|
| 695 |
+ memcpy(out, tctx->curr_frame, size2 * sizeof(*out)); |
|
| 699 | 696 |
} |
| 700 | 697 |
|
| 701 | 698 |
} |
| ... | ... |
@@ -2424,6 +2424,11 @@ void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min |
| 2424 | 2424 |
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, int32_t min, |
| 2425 | 2425 |
int32_t max, unsigned int len); |
| 2426 | 2426 |
|
| 2427 |
/*
 * Prototypes for the SSE and AVX butterflies_float_interleave routines.
 * NOTE(review): presumably these are the two instantiations of the
 * BUTTERFLIES_FLOAT_INTERLEAVE assembly macro -- confirm against the
 * .asm source. Declared without a redundant "extern", like the other
 * ff_* prototypes around them.
 */
void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                         const float *src1, int len);
void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                         const float *src1, int len);
|
| 2431 |
+ |
|
| 2427 | 2432 |
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| 2428 | 2433 |
{
|
| 2429 | 2434 |
int mm_flags = av_get_cpu_flags(); |
| ... | ... |
@@ -2868,6 +2873,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| 2868 | 2868 |
c->vector_clipf = vector_clipf_sse; |
| 2869 | 2869 |
#if HAVE_YASM |
| 2870 | 2870 |
c->scalarproduct_float = ff_scalarproduct_float_sse; |
| 2871 |
+ c->butterflies_float_interleave = ff_butterflies_float_interleave_sse; |
|
| 2871 | 2872 |
#endif |
| 2872 | 2873 |
} |
| 2873 | 2874 |
if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) |
| ... | ... |
@@ -2925,6 +2931,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) |
| 2925 | 2925 |
c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; |
| 2926 | 2926 |
c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx; |
| 2927 | 2927 |
} |
| 2928 |
+ c->butterflies_float_interleave = ff_butterflies_float_interleave_avx; |
|
| 2928 | 2929 |
} |
| 2929 | 2930 |
#endif |
| 2930 | 2931 |
} |
| ... | ... |
@@ -1129,3 +1129,51 @@ VECTOR_CLIP_INT32 11, 1, 1, 0 |
| 1129 | 1129 |
%else |
| 1130 | 1130 |
VECTOR_CLIP_INT32 6, 1, 0, 0 |
| 1131 | 1131 |
%endif |
| 1132 |
+ |
|
| 1133 |
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;
; For each i in [0, len):
;     dst[2*i]     = src0[i] + src1[i]
;     dst[2*i + 1] = src0[i] - src1[i]
; Each loop iteration consumes mmsize bytes from each input and produces
; 2*mmsize bytes of output.
;-----------------------------------------------------------------------------

%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%ifdef ARCH_X86_64
    movsxd    lenq, lend
%endif
    ; Skip the loop for len <= 0, as a plain "for (i = 0; i < len; i++)"
    ; loop would. A bare zero test would let a negative len run one full
    ; vector iteration.
    test      lenq, lenq
    jle .end
    ; Convert the element count to a byte count, point every pointer at the
    ; end of its buffer (dst holds twice as many bytes), and walk a negative
    ; offset up towards zero.
    shl       lenq, 2
    lea      src0q, [src0q +   lenq]
    lea      src1q, [src1q +   lenq]
    lea       dstq, [ dstq + 2*lenq]
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1              ; m2 = src0 - src1
    addps       m0, m0, m1              ; m0 = src0 + src1
    unpcklps    m1, m0, m2              ; interleave low sum/diff pairs
    unpckhps    m0, m0, m2              ; interleave high sum/diff pairs
%if cpuflag(avx)
    ; With ymm registers unpck{l,h}ps interleave within each 128-bit lane,
    ; so the four 128-bit halves are stored in the order
    ; m1.lo, m0.lo, m1.hi, m0.hi to restore sequential output.
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova [dstq + 2*lenq         ], m1
    mova [dstq + 2*lenq + mmsize], m0
%endif
    add       lenq, mmsize
    jl .loop
%if mmsize == 32
    vzeroupper
    RET
%endif
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE