Signed-off-by: Justin Ruggles <justin.ruggles@gmail.com>
Kieran Kunhya authored on 2012/04/21 04:49:30
@@ -267,8 +267,8 @@ static const int8_t sbr_offset[6][16] = {
 };

 ///< window coefficients for analysis/synthesis QMF banks
-static DECLARE_ALIGNED(16, float, sbr_qmf_window_ds)[320];
-static DECLARE_ALIGNED(16, float, sbr_qmf_window_us)[640] = {
+static DECLARE_ALIGNED(32, float, sbr_qmf_window_ds)[320];
+static DECLARE_ALIGNED(32, float, sbr_qmf_window_us)[640] = {
      0.0000000000, -0.0005525286, -0.0005617692, -0.0004947518,
     -0.0004875227, -0.0004893791, -0.0005040714, -0.0005226564,
     -0.0005466565, -0.0005677802, -0.0005870930, -0.0006132747,

@@ -33,8 +33,8 @@

 #include <stdint.h>

-DECLARE_ALIGNED(16, float, ff_aac_kbd_long_1024)[1024];
-DECLARE_ALIGNED(16, float, ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, float, ff_aac_kbd_long_1024)[1024];
+DECLARE_ALIGNED(32, float, ff_aac_kbd_short_128)[128];

 const uint8_t ff_aac_num_swb_1024[] = {
     41, 41, 47, 49, 49, 51, 47, 47, 43, 43, 43, 40, 40

@@ -44,8 +44,8 @@
 /* @name window coefficients
  * @{
  */
-DECLARE_ALIGNED(16, extern float, ff_aac_kbd_long_1024)[1024];
-DECLARE_ALIGNED(16, extern float, ff_aac_kbd_short_128)[128];
+DECLARE_ALIGNED(32, extern float, ff_aac_kbd_long_1024)[1024];
+DECLARE_ALIGNED(32, extern float, ff_aac_kbd_short_128)[128];
 // @}

 /* @name number of scalefactor window bands for long and short transform windows respectively

@@ -398,7 +398,7 @@ typedef struct DSPContext {
     /* assume len is a multiple of 4, and arrays are 16-byte aligned */
     void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
     void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
-    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
+    /* assume len is a multiple of 16, and arrays are 32-byte aligned */
     void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len);
     void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
     /* assume len is a multiple of 8, and src arrays are 16-byte aligned */

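Note: the comment change above is the contract behind every other hunk in
this patch. Callers of vector_fmul() must now pad len to a multiple of 16
floats and align all three arrays to 32 bytes, so the AVX version can use
aligned 32-byte loads. As a minimal scalar sketch of what the SIMD versions
compute (illustration only, not part of the patch; the C fallback in
dsputil.c behaves like this):

    /* Scalar sketch of the vector_fmul() contract. */
    static void vector_fmul_scalar(float *dst, const float *src0,
                                   const float *src1, int len)
    {
        int i;
        for (i = 0; i < len; i++)   /* len is pre-padded; the extra lanes
                                       are multiplied too and ignored */
            dst[i] = src0[i] * src1[i];
    }
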
@@ -38,8 +38,8 @@
 typedef struct {
     AVFrame frame;
     DSPContext dsp;
-    DECLARE_ALIGNED(16, float, sp_lpc)[FFALIGN(36, 8)];   ///< LPC coefficients for speech data (spec: A)
-    DECLARE_ALIGNED(16, float, gain_lpc)[FFALIGN(10, 8)]; ///< LPC coefficients for gain (spec: GB)
+    DECLARE_ALIGNED(32, float, sp_lpc)[FFALIGN(36, 16)];   ///< LPC coefficients for speech data (spec: A)
+    DECLARE_ALIGNED(32, float, gain_lpc)[FFALIGN(10, 16)]; ///< LPC coefficients for gain (spec: GB)

     /** speech data history (spec: SB).
      *  Its first 70 coefficients are updated only at backward filtering.

@@ -133,11 +133,11 @@ static void do_hybrid_window(RA288Context *ractx,
     int i;
     float buffer1[MAX_BACKWARD_FILTER_ORDER + 1];
     float buffer2[MAX_BACKWARD_FILTER_ORDER + 1];
-    LOCAL_ALIGNED_16(float, work, [FFALIGN(MAX_BACKWARD_FILTER_ORDER +
-                                           MAX_BACKWARD_FILTER_LEN   +
-                                           MAX_BACKWARD_FILTER_NONREC, 8)]);
+    LOCAL_ALIGNED(32, float, work, [FFALIGN(MAX_BACKWARD_FILTER_ORDER +
+                                            MAX_BACKWARD_FILTER_LEN   +
+                                            MAX_BACKWARD_FILTER_NONREC, 16)]);

-    ractx->dsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 8));
+    ractx->dsp.vector_fmul(work, window, hist, FFALIGN(order + n + non_rec, 16));

     convolve(buffer1, work + order    , n      , order);
     convolve(buffer2, work + order + n, non_rec, order);

@@ -164,7 +164,7 @@ static void backward_filter(RA288Context *ractx,
     do_hybrid_window(ractx, order, n, non_rec, temp, hist, rec, window);

     if (!compute_lpc_coefs(temp, order, lpc, 0, 1, 1))
-        ractx->dsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 8));
+        ractx->dsp.vector_fmul(lpc, lpc, tab, FFALIGN(order, 16));

     memmove(hist, hist + n, move_size*sizeof(*hist));
 }

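Note: both RealAudio call sites rely on FFALIGN to round the semantic
length (order, or order + n + non_rec) up to the new multiple of 16, and
the buffers and tables are padded with the same expression so the extra
SIMD lanes read defined memory. A sketch of the macro as defined in
libavutil/common.h, with the values it produces here:

    /* Round x up to the next multiple of a (a must be a power of two). */
    #define FFALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    /* FFALIGN(36, 16) == 48 and FFALIGN(10, 16) == 16, so sp_lpc grows
     * from 40 to 48 floats while gain_lpc stays at 16. */
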
@@ -97,7 +97,7 @@ static const int16_t codetable[128][5]={
     { 3746,  -606,    53,  -269, -3301}, {  606,  2018, -1316,  4064,   398}
 };

-DECLARE_ALIGNED(16, static const float, syn_window)[FFALIGN(111, 8)]={
+DECLARE_ALIGNED(32, static const float, syn_window)[FFALIGN(111, 16)]={
     0.576690972, 0.580838025, 0.585013986, 0.589219987, 0.59345597,  0.597723007,
     0.602020264, 0.606384277, 0.610748291, 0.615142822, 0.619598389, 0.624084473,
     0.628570557, 0.633117676, 0.637695313, 0.642272949, 0.646911621, 0.651580811,

@@ -119,7 +119,7 @@ DECLARE_ALIGNED(16, static const float, syn_window)[FFALIGN(111, 8)]={
     0.142852783, 0.0954284668, 0.0477600098
 };

-DECLARE_ALIGNED(16, static const float, gain_window)[FFALIGN(38, 8)]={
+DECLARE_ALIGNED(32, static const float, gain_window)[FFALIGN(38, 16)]={
     0.505699992, 0.524200022, 0.54339999,  0.563300014, 0.583953857, 0.60534668,
     0.627502441, 0.650482178, 0.674316406, 0.699005127, 0.724578857, 0.75112915,
     0.778625488, 0.807128906, 0.836669922, 0.86730957,  0.899078369, 0.932006836,

@@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, static const float, gain_window)[FFALIGN(38, 8)]={
 };

 /** synthesis bandwidth broadening table */
-DECLARE_ALIGNED(16, static const float, syn_bw_tab)[FFALIGN(36, 8)] = {
+DECLARE_ALIGNED(32, static const float, syn_bw_tab)[FFALIGN(36, 16)] = {
     0.98828125,  0.976699829, 0.965254128, 0.953942537, 0.942763507, 0.931715488,
     0.920796931, 0.910006344, 0.899342179, 0.888803005, 0.878387332, 0.868093729,
     0.857920766, 0.847867012, 0.837931097, 0.828111589, 0.818407178, 0.808816493,

@@ -140,7 +140,7 @@ DECLARE_ALIGNED(16, static const float, syn_bw_tab)[FFALIGN(36, 8)] = {
 };

 /** gain bandwidth broadening table */
-DECLARE_ALIGNED(16, static const float, gain_bw_tab)[FFALIGN(10, 8)] = {
+DECLARE_ALIGNED(32, static const float, gain_bw_tab)[FFALIGN(10, 16)] = {
     0.90625,     0.821289063, 0.74432373,  0.674499512, 0.61126709,
     0.553955078, 0.50201416,  0.454956055, 0.41229248,  0.373657227
 };

@@ -78,8 +78,8 @@ typedef struct {
      * @name State variables
      * @{
      */
-    DECLARE_ALIGNED(16, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
-    DECLARE_ALIGNED(16, float, analysis_filterbank_samples) [1312];
+    DECLARE_ALIGNED(32, float, synthesis_filterbank_samples)[SBR_SYNTHESIS_BUF_SIZE];
+    DECLARE_ALIGNED(32, float, analysis_filterbank_samples) [1312];
     int synthesis_filterbank_samples_offset;
     ///l_APrev and l_A
     int e_a[2];

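Note: every table and struct-member hunk in this patch is the same
mechanical change, DECLARE_ALIGNED(16, ...) to DECLARE_ALIGNED(32, ...), so
that aligned AVX loads (vmovaps into a ymm register) cannot fault on these
buffers. As a sketch of the expansion on a GCC-compatible compiler
(assumption: libavutil/mem.h selects a per-toolchain variant, e.g.
__declspec(align(n)) on MSVC):

    /* Assumed GCC-style expansion; illustration only. */
    #define DECLARE_ALIGNED(n, t, v) t __attribute__ ((aligned (n))) v

    /* The sbr.h member above therefore becomes: */
    float __attribute__ ((aligned (32))) analysis_filterbank_samples[1312];
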
@@ -2348,135 +2348,6 @@ static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
     }
 }

-static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1,
-                              int len)
-{
-    x86_reg i = (len - 4) * 4;
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movq    (%2, %0), %%mm0        \n\t"
-        "movq   8(%2, %0), %%mm1        \n\t"
-        "pfmul   (%3, %0), %%mm0        \n\t"
-        "pfmul  8(%3, %0), %%mm1        \n\t"
-        "movq   %%mm0,  (%1, %0)        \n\t"
-        "movq   %%mm1, 8(%1, %0)        \n\t"
-        "sub    $16, %0                 \n\t"
-        "jge    1b                      \n\t"
-        "femms                          \n\t"
-        : "+r"(i)
-        : "r"(dst), "r"(src0), "r"(src1)
-        : "memory"
-    );
-}
-
-static void vector_fmul_sse(float *dst, const float *src0, const float *src1,
-                            int len)
-{
-    x86_reg i = (len - 8) * 4;
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movaps    (%2, %0), %%xmm0     \n\t"
-        "movaps  16(%2, %0), %%xmm1     \n\t"
-        "mulps     (%3, %0), %%xmm0     \n\t"
-        "mulps   16(%3, %0), %%xmm1     \n\t"
-        "movaps  %%xmm0,   (%1, %0)     \n\t"
-        "movaps  %%xmm1, 16(%1, %0)     \n\t"
-        "sub     $32, %0                \n\t"
-        "jge     1b                     \n\t"
-        : "+r"(i)
-        : "r"(dst), "r"(src0), "r"(src1)
-        : "memory"
-    );
-}
-
-static void vector_fmul_reverse_3dnow2(float *dst, const float *src0,
-                                       const float *src1, int len)
-{
-    x86_reg i = len * 4 - 16;
-    __asm__ volatile (
-        "1:                             \n\t"
-        "pswapd    8(%1), %%mm0         \n\t"
-        "pswapd     (%1), %%mm1         \n\t"
-        "pfmul   (%3, %0), %%mm0        \n\t"
-        "pfmul  8(%3, %0), %%mm1        \n\t"
-        "movq   %%mm0,  (%2, %0)        \n\t"
-        "movq   %%mm1, 8(%2, %0)        \n\t"
-        "add    $16, %1                 \n\t"
-        "sub    $16, %0                 \n\t"
-        "jge    1b                      \n\t"
-        : "+r"(i), "+r"(src1)
-        : "r"(dst), "r"(src0)
-    );
-    __asm__ volatile ("femms");
-}
-
-static void vector_fmul_reverse_sse(float *dst, const float *src0,
-                                    const float *src1, int len)
-{
-    x86_reg i = len * 4 - 32;
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movaps        16(%1), %%xmm0   \n\t"
-        "movaps          (%1), %%xmm1   \n\t"
-        "shufps $0x1b, %%xmm0, %%xmm0   \n\t"
-        "shufps $0x1b, %%xmm1, %%xmm1   \n\t"
-        "mulps      (%3, %0), %%xmm0    \n\t"
-        "mulps    16(%3, %0), %%xmm1    \n\t"
-        "movaps  %%xmm0,   (%2, %0)     \n\t"
-        "movaps  %%xmm1, 16(%2, %0)     \n\t"
-        "add     $32, %1                \n\t"
-        "sub     $32, %0                \n\t"
-        "jge     1b                     \n\t"
-        : "+r"(i), "+r"(src1)
-        : "r"(dst), "r"(src0)
-    );
-}
-
-static void vector_fmul_add_3dnow(float *dst, const float *src0,
-                                  const float *src1, const float *src2, int len)
-{
-    x86_reg i = (len - 4) * 4;
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movq    (%2, %0), %%mm0        \n\t"
-        "movq   8(%2, %0), %%mm1        \n\t"
-        "pfmul   (%3, %0), %%mm0        \n\t"
-        "pfmul  8(%3, %0), %%mm1        \n\t"
-        "pfadd   (%4, %0), %%mm0        \n\t"
-        "pfadd  8(%4, %0), %%mm1        \n\t"
-        "movq   %%mm0,  (%1, %0)        \n\t"
-        "movq   %%mm1, 8(%1, %0)        \n\t"
-        "sub    $16, %0                 \n\t"
-        "jge    1b                      \n\t"
-        : "+r"(i)
-        : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
-        : "memory"
-    );
-    __asm__ volatile ("femms");
-}
-
-static void vector_fmul_add_sse(float *dst, const float *src0,
-                                const float *src1, const float *src2, int len)
-{
-    x86_reg i = (len - 8) * 4;
-    __asm__ volatile (
-        "1:                             \n\t"
-        "movaps    (%2, %0), %%xmm0     \n\t"
-        "movaps  16(%2, %0), %%xmm1     \n\t"
-        "mulps     (%3, %0), %%xmm0     \n\t"
-        "mulps   16(%3, %0), %%xmm1     \n\t"
-        "addps     (%4, %0), %%xmm0     \n\t"
-        "addps   16(%4, %0), %%xmm1     \n\t"
-        "movaps  %%xmm0,   (%1, %0)     \n\t"
-        "movaps  %%xmm1, 16(%1, %0)     \n\t"
-        "sub     $32, %0                \n\t"
-        "jge     1b                     \n\t"
-        : "+r"(i)
-        : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
-        : "memory"
-    );
-}
-
 #if HAVE_6REGS
 static void vector_fmul_window_3dnow2(float *dst, const float *src0,
                                       const float *src1, const float *win,

@@ -2631,6 +2502,21 @@ int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,

 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

+void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
+                        int len);
+void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
+                        int len);
+
+void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
+                                const float *src1, int len);
+void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
+                                const float *src1, int len);
+
+void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
+                            const float *src2, int len);
+void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
+                            const float *src2, int len);
+
 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                    int32_t min, int32_t max, unsigned int len);
 void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,

@@ -2918,8 +2804,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 #endif

     c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
-    c->vector_fmul = vector_fmul_3dnow;
-    c->vector_fmul_add = vector_fmul_add_3dnow;

 #if HAVE_7REGS
     c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;

@@ -2929,7 +2813,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
                                 int mm_flags)
 {
-    c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
 #if HAVE_6REGS
     c->vector_fmul_window = vector_fmul_window_3dnow2;
 #endif

@@ -2949,11 +2832,11 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)

     c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
     c->ac3_downmix = ac3_downmix_sse;
-    c->vector_fmul = vector_fmul_sse;
-    c->vector_fmul_reverse = vector_fmul_reverse_sse;
-
-    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
-        c->vector_fmul_add = vector_fmul_add_sse;
+#if HAVE_YASM
+    c->vector_fmul = ff_vector_fmul_sse;
+    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
+    c->vector_fmul_add = ff_vector_fmul_add_sse;
+#endif

 #if HAVE_6REGS
     c->vector_fmul_window = vector_fmul_window_sse;

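Note: the new HAVE_YASM guard makes the assignments conditional on the
assembled versions actually being built, and the old 3DNow! exclusion for
vector_fmul_add_sse disappears along with the inline-asm implementations.
Roughly, the runtime selection layers up as follows (simplified sketch of
the init cascade in dsputil_mmx.c, flag handling abbreviated; later calls
overwrite the pointers set by earlier ones):

    /* Simplified illustration, not the literal init code. */
    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags); /* vector_fmul = ff_vector_fmul_sse */
    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags); /* vector_fmul = ff_vector_fmul_avx */
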
@@ -3112,6 +2995,9 @@ static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
         }
     }
     c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
+    c->vector_fmul = ff_vector_fmul_avx;
+    c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
+    c->vector_fmul_add = ff_vector_fmul_add_avx;
 #endif
 }

@@ -1130,6 +1130,111 @@ VECTOR_CLIP_INT32 6, 1, 0, 0
 %endif

 ;-----------------------------------------------------------------------------
+; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL 0
+cglobal vector_fmul, 4,4,2, dst, src0, src1, len
+    lea       lenq, [lend*4 - 2*mmsize]
+ALIGN 16
+.loop
+    mova      m0, [src0q + lenq]
+    mova      m1, [src0q + lenq + mmsize]
+    mulps     m0, m0, [src1q + lenq]
+    mulps     m1, m1, [src1q + lenq + mmsize]
+    mova      [dstq + lenq], m0
+    mova      [dstq + lenq + mmsize], m1
+
+    sub       lenq, 2*mmsize
+    jge       .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse
+VECTOR_FMUL
+INIT_YMM avx
+VECTOR_FMUL
+
+;-----------------------------------------------------------------------------
+; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
+;                          int len)
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL_REVERSE 0
+cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
+    lea       lenq, [lend*4 - 2*mmsize]
+ALIGN 16
+.loop
+%if cpuflag(avx)
+    vmovaps     xmm0, [src1q + 16]
+    vinsertf128 m0, m0, [src1q], 1
+    vshufps     m0, m0, m0, q0123
+    vmovaps     xmm1, [src1q + mmsize + 16]
+    vinsertf128 m1, m1, [src1q + mmsize], 1
+    vshufps     m1, m1, m1, q0123
+%else
+    mova        m0, [src1q]
+    mova        m1, [src1q + mmsize]
+    shufps      m0, m0, q0123
+    shufps      m1, m1, q0123
+%endif
+    mulps       m0, m0, [src0q + lenq + mmsize]
+    mulps       m1, m1, [src0q + lenq]
+    mova        [dstq + lenq + mmsize], m0
+    mova        [dstq + lenq], m1
+    add         src1q, 2*mmsize
+    sub         lenq,  2*mmsize
+    jge         .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse
+VECTOR_FMUL_REVERSE
+INIT_YMM avx
+VECTOR_FMUL_REVERSE
+
+;-----------------------------------------------------------------------------
+; vector_fmul_add(float *dst, const float *src0, const float *src1,
+;                 const float *src2, int len)
+;-----------------------------------------------------------------------------
+%macro VECTOR_FMUL_ADD 0
+cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
+    lea       lenq, [lend*4 - 2*mmsize]
+ALIGN 16
+.loop
+    mova      m0, [src0q + lenq]
+    mova      m1, [src0q + lenq + mmsize]
+    mulps     m0, m0, [src1q + lenq]
+    mulps     m1, m1, [src1q + lenq + mmsize]
+    addps     m0, m0, [src2q + lenq]
+    addps     m1, m1, [src2q + lenq + mmsize]
+    mova      [dstq + lenq], m0
+    mova      [dstq + lenq + mmsize], m1
+
+    sub       lenq, 2*mmsize
+    jge       .loop
+%if mmsize == 32
+    vzeroupper
+    RET
+%else
+    REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse
+VECTOR_FMUL_ADD
+INIT_YMM avx
+VECTOR_FMUL_ADD
+
+;-----------------------------------------------------------------------------
 ; void ff_butterflies_float_interleave(float *dst, const float *src0,
 ;                                      const float *src1, int len);
 ;-----------------------------------------------------------------------------
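
Note: each %macro body above is assembled twice. INIT_XMM sse sets mmsize
to 16, so mova/mulps operate on xmm registers and one loop iteration
handles 8 floats; INIT_YMM avx sets mmsize to 32 for ymm registers and 16
floats per iteration, which is where the new multiple-of-16 length rule
comes from. cglobal derives the exported symbol from the current
instruction set, yielding the ff_vector_fmul_{sse,avx} family declared in
the C hunks above. An illustrative caller through the DSPContext pointer
(buffer names are hypothetical, not from the patch):

    /* Illustration only: len must be a multiple of 16 and all three
     * buffers 32-byte aligned, per the updated dsputil.h contract. */
    DECLARE_ALIGNED(32, static float, src0)[16];
    DECLARE_ALIGNED(32, static float, src1)[16];
    DECLARE_ALIGNED(32, static float, prod)[16];

    /* len = 16: two SSE iterations, or a single AVX iteration. */
    c->vector_fmul(prod, src0, src1, 16);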