AC3DSPContext.ac3_max_msb_abs_int16() finds the maximum MSB of the absolute
value of each element in an array of int16_t.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
| ... | ... |
@@ -42,9 +42,18 @@ static void ac3_exponent_min_c(uint8_t *exp, int num_reuse_blocks, int nb_coefs) |
| 42 | 42 |
} |
| 43 | 43 |
} |
| 44 | 44 |
|
| 45 |
/**
 * Portable C version of AC3DSPContext.ac3_max_msb_abs_int16().
 *
 * ORs together the absolute values of all elements, so the result has
 * every bit set that is set in any abs(src[i]); in particular its most
 * significant bit is the MSB of the largest magnitude in the array.
 *
 * @param src input array of int16_t samples
 * @param len number of elements to scan; nothing is read and 0 is
 *            returned when len <= 0
 * @return bitwise OR of abs(src[i]) for all i in [0, len)
 */
static int ac3_max_msb_abs_int16_c(const int16_t *src, int len)
{
    const int16_t *cur = src;
    int remaining;
    int acc = 0;

    /* each sample is promoted to int before abs(), so -32768 cannot overflow */
    for (remaining = len; remaining > 0; remaining--)
        acc |= abs(*cur++);

    return acc;
}
|
| 52 |
+ |
|
| 45 | 53 |
av_cold void ff_ac3dsp_init(AC3DSPContext *c) |
| 46 | 54 |
{
|
| 47 | 55 |
c->ac3_exponent_min = ac3_exponent_min_c; |
| 56 |
+ c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c; |
|
| 48 | 57 |
|
| 49 | 58 |
if (HAVE_MMX) |
| 50 | 59 |
ff_ac3dsp_init_x86(c); |
| ... | ... |
@@ -35,6 +35,17 @@ typedef struct AC3DSPContext {
|
| 35 | 35 |
* @param nb_coefs number of frequency coefficients. |
| 36 | 36 |
*/ |
| 37 | 37 |
void (*ac3_exponent_min)(uint8_t *exp, int num_reuse_blocks, int nb_coefs); |
| 38 |
+ |
|
| 39 |
+ /** |
|
| 40 |
+ * Calculate the maximum MSB of the absolute value of each element in an |
|
| 41 |
+ * array of int16_t. |
|
| 42 |
+ * @param src input array |
|
| 43 |
+ * constraints: align 16. values must be in range [-32767,32767] |
|
| 44 |
+ * @param len number of values in the array |
|
| 45 |
+ * constraints: multiple of 16 greater than 0 |
|
| 46 |
+ * @return a value with the same MSB as max(abs(src[])) |
|
| 47 |
+ */ |
|
| 48 |
+ int (*ac3_max_msb_abs_int16)(const int16_t *src, int len); |
|
| 38 | 49 |
} AC3DSPContext; |
| 39 | 50 |
|
| 40 | 51 |
void ff_ac3dsp_init (AC3DSPContext *c); |
| ... | ... |
@@ -270,14 +270,9 @@ static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input, |
| 270 | 270 |
* @param len number of values in the array |
| 271 | 271 |
* @return log2(max(abs(src[]))) |
| 272 | 272 |
*/ |
| 273 |
-static int log2_tab(int16_t *tab, int n) |
|
| 273 |
+static int log2_tab(AC3EncodeContext *s, int16_t *src, int len) |
|
| 274 | 274 |
{
|
| 275 |
- int i, v; |
|
| 276 |
- |
|
| 277 |
- v = 0; |
|
| 278 |
- for (i = 0; i < n; i++) |
|
| 279 |
- v |= abs(tab[i]); |
|
| 280 |
- |
|
| 275 |
+ int v = s->ac3dsp.ac3_max_msb_abs_int16(src, len); |
|
| 281 | 276 |
return av_log2(v); |
| 282 | 277 |
} |
| 283 | 278 |
|
| ... | ... |
@@ -308,7 +303,7 @@ static void lshift_tab(int16_t *tab, int n, unsigned int lshift) |
| 308 | 308 |
*/ |
| 309 | 309 |
static int normalize_samples(AC3EncodeContext *s)
{
    /* left-shift amount: 14 minus log2 of the windowed samples' peak magnitude */
    int shift = 14 - log2_tab(s, s->windowed_samples, AC3_WINDOW_SIZE);

    /* scale the windowed samples in place by that shift */
    lshift_tab(s->windowed_samples, AC3_WINDOW_SIZE, shift);

    /* callers receive the applied shift offset by 9 */
    return shift - 9;
}
| ... | ... |
@@ -65,3 +65,72 @@ AC3_EXPONENT_MIN sse2 |
| 65 | 65 |
%endif |
| 66 | 66 |
%undef PMINUB |
| 67 | 67 |
%undef LOOP_ALIGN |
| 68 |
+ |
|
| 69 |
+;----------------------------------------------------------------------------- |
|
| 70 |
+; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len) |
|
| 71 |
+; |
|
| 72 |
+; This function uses 2 different methods to calculate a valid result. |
|
| 73 |
+; 1) logical 'or' of abs of each element |
|
| 74 |
+; This is used for ssse3 because of the pabsw instruction. |
|
| 75 |
+; It is also used for mmx because of the lack of min/max instructions. |
|
| 76 |
+; 2) calculate min/max for the array, then or(abs(min),abs(max)) |
|
| 77 |
+; This is used for mmxext and sse2 because they have pminsw/pmaxsw. |
|
| 78 |
+;----------------------------------------------------------------------------- |
|
| 79 |
+ |
|
| 80 |
; AC3_MAX_MSB_ABS_INT16 <cpu suffix>, <method>
;   %1: suffix appended to the function name (mmx, mmxext, sse2, ssse3)
;   %2: or_abs  - accumulate the 'or' of abs() of every element
;       min_max - track signed min/max, then or(abs(min), abs(max)) at the end
; len counts int16_t elements; every iteration consumes 2*mmsize bytes,
; i.e. mmsize elements, from src.
%macro AC3_MAX_MSB_ABS_INT16 2
cglobal ac3_max_msb_abs_int16_%1, 2,2,5, src, len
    pxor        m2, m2              ; or_abs: running 'or'; min_max: running minimum
    pxor        m3, m3              ; min_max: running maximum
.loop:
%ifidn %2, min_max
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    pminsw      m2, m0
    pminsw      m2, m1
    pmaxsw      m3, m0
    pmaxsw      m3, m1
%else ; or_abs
%ifidn %1, mmx
    mova        m0, [srcq]
    mova        m1, [srcq+mmsize]
    ABS2        m0, m1, m3, m4      ; m3/m4 are handed to ABS2 as temporaries
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw       m0, [srcq]
    pabsw       m1, [srcq+mmsize]
%endif
    por         m2, m0
    por         m2, m1
%endif
    add       srcq, mmsize*2
    sub       lend, mmsize
    ja .loop                        ; continue while elements remain
%ifidn %2, min_max
    ; fold the signed extremes into a single 'or' of their magnitudes
    ABS2        m2, m3, m0, m1
    por         m2, m3
%endif
%ifidn mmsize, 16
    ; xmm: fold the high 64 bits onto the low 64 bits
    mova        m0, m2
    punpckhqdq  m0, m0
    por         m2, m0
%endif
%ifidn %1, mmx
    ; pshufw is an SSE/MMXEXT instruction and is not available on CPUs that
    ; only report baseline MMX, so reduce the four words with 64-bit shifts.
    mova        m0, m2
    psrlq       m0, 32
    por         m2, m0              ; word0 |= word2, word1 |= word3
    mova        m0, m2
    psrlq       m0, 16
    por         m2, m0              ; word0 |= word1
%else
    PSHUFLW     m0, m2, 0xe
    por         m2, m0              ; word0 |= word2, word1 |= word3
    PSHUFLW     m0, m2, 0x1
    por         m2, m0              ; word0 |= word1
%endif
    movd       eax, m2
    and        eax, 0xFFFF          ; only the low word holds the reduced result
    RET
%endmacro

INIT_MMX
%define ABS2 ABS2_MMX
%define PSHUFLW pshufw              ; used by the mmxext build only; mmx avoids it
AC3_MAX_MSB_ABS_INT16 mmx, or_abs
%define ABS2 ABS2_MMX2
AC3_MAX_MSB_ABS_INT16 mmxext, min_max
INIT_XMM
%define PSHUFLW pshuflw
AC3_MAX_MSB_ABS_INT16 sse2, min_max
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 ssse3, or_abs
| ... | ... |
@@ -27,6 +27,11 @@ extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int n |
| 27 | 27 |
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); |
| 28 | 28 |
extern void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); |
| 29 | 29 |
|
| 30 |
+extern int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); |
|
| 31 |
+extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len); |
|
| 32 |
+extern int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); |
|
| 33 |
+extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len); |
|
| 34 |
+ |
|
| 30 | 35 |
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) |
| 31 | 36 |
{
|
| 32 | 37 |
int mm_flags = av_get_cpu_flags(); |
| ... | ... |
@@ -34,12 +39,18 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) |
| 34 | 34 |
#if HAVE_YASM |
| 35 | 35 |
if (mm_flags & AV_CPU_FLAG_MMX) {
|
| 36 | 36 |
c->ac3_exponent_min = ff_ac3_exponent_min_mmx; |
| 37 |
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; |
|
| 37 | 38 |
} |
| 38 | 39 |
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
|
| 39 | 40 |
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; |
| 41 |
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; |
|
| 40 | 42 |
} |
| 41 | 43 |
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
|
| 42 | 44 |
c->ac3_exponent_min = ff_ac3_exponent_min_sse2; |
| 45 |
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; |
|
| 46 |
+ } |
|
| 47 |
+ if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
|
|
| 48 |
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; |
|
| 43 | 49 |
} |
| 44 | 50 |
#endif |
| 45 | 51 |
} |