Browse code

rv34: NEON optimised inverse transform functions

Signed-off-by: Mans Rullgard <mans@mansr.com>

Janne Grunau authored on 2011/09/24 20:05:55
Showing 5 changed files
... ...
@@ -62,6 +62,12 @@ NEON-OBJS-$(CONFIG_AC3DSP)             += arm/ac3dsp_neon.o
62 62
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
63 63
                                           arm/synth_filter_neon.o       \
64 64
 
65
+NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_init_neon.o       \
66
+                                          arm/rv34dsp_neon.o            \
67
+
68
+NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_init_neon.o       \
69
+                                          arm/rv34dsp_neon.o            \
70
+
65 71
 NEON-OBJS-$(CONFIG_VP3_DECODER)        += arm/vp3dsp_neon.o
66 72
 
67 73
 NEON-OBJS-$(CONFIG_VP5_DECODER)        += arm/vp56dsp_neon.o            \
68 74
new file mode 100644
... ...
@@ -0,0 +1,33 @@
0
+/*
1
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
2
+ *
3
+ * This file is part of Libav.
4
+ *
5
+ * Libav is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * Libav is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with Libav; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include <stdint.h>
21
+
22
+#include "libavcodec/avcodec.h"
23
+#include "libavcodec/rv34dsp.h"
24
+
25
+void ff_rv34_inv_transform_neon(DCTELEM *block);
26
+void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
27
+
28
+void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) /* install the NEON inverse transforms in c; dsp is not referenced in this body */
29
+{
30
+    c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;         /* variant that narrows with a rounding >> 10 */
31
+    c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon; /* variant that scales by 3 and narrows with a plain >> 11 */
32
+}
0 33
new file mode 100644
... ...
@@ -0,0 +1,109 @@
0
+/*
1
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
2
+ *
3
+ * This file is part of Libav.
4
+ *
5
+ * Libav is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * Libav is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with Libav; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "asm.S"
21
+
22
+.macro rv34_inv_transform       @ 4x4 inverse transform core. In: r0 = block (the :64 hints require 8-byte alignment). Out: q1, q2, q3, q15 = unshifted 32-bit results, r1 = 16. Also overwrites r0, d0 and q8-q14.
23
+        mov             r1,  #16                @ byte step between loaded rows (8 coefficients of 16 bit)
24
+        vld1.16         {d28}, [r0,:64], r1     @ block[i+8*0]
25
+        vld1.16         {d29}, [r0,:64], r1     @ block[i+8*1]
26
+        vld1.16         {d30}, [r0,:64], r1     @ block[i+8*2]
27
+        vld1.16         {d31}, [r0,:64], r1     @ block[i+8*3]
28
+        vmov.s16        d0,  #13                @ d0 = 13 in every 16-bit lane
29
+        vshll.s16       q12, d29, #3            @ q12 = 8*block[i+8*1], widened to 32 bit
30
+        vshll.s16       q13, d29, #4            @ q13 = 16*block[i+8*1], widened to 32 bit
31
+        vshll.s16       q9,  d31, #3            @ q9  = 8*block[i+8*3], widened to 32 bit
32
+        vshll.s16       q1,  d31, #4            @ q1  = 16*block[i+8*3], widened to 32 bit
33
+        vmull.s16       q10, d28, d0            @ q10 = 13*block[i+8*0]
34
+        vmlal.s16       q10, d30, d0            @ z0 = 13*(block[i+8*0] + block[i+8*2])
35
+        vmull.s16       q11, d28, d0            @ q11 = 13*block[i+8*0]
36
+        vmlsl.s16       q11, d30, d0            @ z1 = 13*(block[i+8*0] - block[i+8*2])
37
+        vsubw.s16       q12, q12, d29   @ z2 = block[i+8*1]*7
38
+        vaddw.s16       q13, q13, d29   @ z3 = block[i+8*1]*17
39
+        vsubw.s16       q9,  q9,  d31   @ q9 = block[i+8*3]*7
40
+        vaddw.s16       q1,  q1,  d31   @ q1 = block[i+8*3]*17
41
+        vadd.s32        q13, q13, q9    @ z3 = 17*block[i+8*1] +  7*block[i+8*3]
42
+        vsub.s32        q12, q12, q1    @ z2 = 7*block[i+8*1]  - 17*block[i+8*3]
43
+        vadd.s32        q1,  q10, q13   @ z0 + z3
44
+        vadd.s32        q2,  q11, q12   @ z1 + z2
45
+        vsub.s32        q8,  q10, q13   @ z0 - z3
46
+        vsub.s32        q3,  q11, q12   @ z1 - z2
47
+        vtrn.32         q1,  q2         @ this and the next three instructions transpose the 4x4 matrix of 32-bit values whose rows are q1, q2, q3, q8
48
+        vtrn.32         q3,  q8
49
+        vswp            d3,  d6         @ exchange high half of q1 with low half of q3
50
+        vswp            d5,  d16        @ exchange high half of q2 with low half of q8; now q1, q2, q3, q8 = t0, t1, t2, t3 (transposed rows)
51
+        vmov.s32        d0,  #13        @ d0 = 13 in every 32-bit lane, used as scalar d0[0] below
52
+        vadd.s32        q10, q1,  q3    @ t0 + t2
53
+        vsub.s32        q11, q1,  q3    @ t0 - t2
54
+        vshl.s32        q12, q2,  #3    @ 8*t1
55
+        vshl.s32        q9,  q2,  #4    @ 16*t1
56
+        vmul.s32        q13, q11, d0[0] @ z1 = 13*(t0 - t2)
57
+        vshl.s32        q11, q8,  #4    @ 16*t3
58
+        vadd.s32        q9,  q9,  q2    @ 17*t1
59
+        vshl.s32        q15, q8,  #3    @ 8*t3
60
+        vsub.s32        q12, q12, q2    @ 7*t1
61
+        vadd.s32        q11, q11, q8    @ 17*t3
62
+        vmul.s32        q14, q10, d0[0] @ z0 = 13*(t0 + t2)
63
+        vsub.s32        q8,  q15, q8    @ 7*t3
64
+        vsub.s32        q12, q12, q11   @ z2 = 7*t1 - 17*t3
65
+        vadd.s32        q9,  q9,  q8    @ z3 = 17*t1 + 7*t3
66
+        vadd.s32        q2,  q13, q12   @ z1 + z2
67
+        vadd.s32        q1,  q14, q9    @ z0 + z3
68
+        vsub.s32        q3,  q13, q12   @ z1 - z2
69
+        vsub.s32        q15, q14, q9    @ z0 - z3
70
+.endm
71
+
72
+/* void ff_rv34_inv_transform_neon(DCTELEM *block); */
73
+function ff_rv34_inv_transform_neon, export=1  @ in-place transform of the block at r0, results rounded and shifted right by 10
74
+        mov             r2,  r0         @ keep the block pointer for the stores; the macro advances r0
75
+        rv34_inv_transform              @ leaves 32-bit results in q1, q2, q3, q15 and r1 = 16
76
+        vrshrn.s32      d1,  q2,  #10   @ (z1 + z2) >> 10, rounded, narrowed to 16 bit
77
+        vrshrn.s32      d0,  q1,  #10   @ (z0 + z3) >> 10, rounded, narrowed to 16 bit
78
+        vrshrn.s32      d2,  q3,  #10   @ (z1 - z2) >> 10, rounded, narrowed to 16 bit
79
+        vrshrn.s32      d3,  q15, #10   @ (z0 - z3) >> 10, rounded, narrowed to 16 bit
80
+        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1     @ lane 0 of d0-d3 -> four consecutive 16-bit values, then r2 += r1
81
+        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1     @ lane 1, next row
82
+        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1     @ lane 2, next row
83
+        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1     @ lane 3, last row
84
+        bx              lr
85
+endfunc
86
+
87
+/* void ff_rv34_inv_transform_noround_neon(DCTELEM *block); */
88
+function ff_rv34_inv_transform_noround_neon, export=1  @ in-place transform of the block at r0, results scaled by 3 and shifted right by 11 without rounding
89
+        mov             r2,  r0         @ keep the block pointer for the stores; the macro advances r0
90
+        rv34_inv_transform              @ leaves 32-bit results in q1, q2, q3, q15 and r1 = 16
91
+        vshl.s32        q11, q2,  #1    @ 2*(z1 + z2)
92
+        vshl.s32        q10, q1,  #1    @ 2*(z0 + z3)
93
+        vshl.s32        q12, q3,  #1    @ 2*(z1 - z2)
94
+        vshl.s32        q13, q15, #1    @ 2*(z0 - z3)
95
+        vadd.s32        q11, q11, q2    @ 3*(z1 + z2)
96
+        vadd.s32        q10, q10, q1    @ 3*(z0 + z3)
97
+        vadd.s32        q12, q12, q3    @ 3*(z1 - z2)
98
+        vadd.s32        q13, q13, q15   @ 3*(z0 - z3)
99
+        vshrn.s32       d0,  q10, #11   @ (z0 + z3)*3 >> 11, narrowed to 16 bit
100
+        vshrn.s32       d1,  q11, #11   @ (z1 + z2)*3 >> 11, narrowed to 16 bit
101
+        vshrn.s32       d2,  q12, #11   @ (z1 - z2)*3 >> 11, narrowed to 16 bit
102
+        vshrn.s32       d3,  q13, #11   @ (z0 - z3)*3 >> 11, narrowed to 16 bit
103
+        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r2,:64], r1     @ lane 0 of d0-d3 -> four consecutive 16-bit values, then r2 += r1
104
+        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2,:64], r1     @ lane 1, next row
105
+        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2,:64], r1     @ lane 2, next row
106
+        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1     @ lane 3, last row
107
+        bx              lr
108
+endfunc
... ...
@@ -103,4 +103,7 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){
103 103
 av_cold void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp) {
104 104
     c->rv34_inv_transform_tab[0] = rv34_inv_transform_c;         /* C implementations are installed first */
105 105
     c->rv34_inv_transform_tab[1] = rv34_inv_transform_noround_c;
106
+
107
+    if (HAVE_NEON)                                               /* NOTE(review): presumably a build-time configuration macro - confirm */
108
+        ff_rv34dsp_init_neon(c, dsp);                            /* overwrites both table entries with the NEON versions */
106 109
 }
... ...
@@ -56,6 +56,8 @@ void ff_rv30dsp_init(RV34DSPContext *c, DSPContext* dsp);
56 56
 void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp);
57 57
 void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
58 58
 
59
+void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
60
+
59 61
 void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
60 62
 
61 63
 #endif /* AVCODEC_RV34DSP_H */