Browse code

swscale/arm/yuv2rgb: remove 32bit code path

Matthieu Bouron authored on 2016/03/23 01:46:10
Showing 2 changed files
... ...
@@ -61,14 +61,14 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[
61 61
     return 0;
62 62
 }
63 63
 
64
-#define YUV_TO_RGB_TABLE(precision)                                                         \
65
-        c->yuv2rgb_v2r_coeff / ((precision) == 16 ? 1 << 7 : 1),                            \
66
-        c->yuv2rgb_u2g_coeff / ((precision) == 16 ? 1 << 7 : 1),                            \
67
-        c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1),                            \
68
-        c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 1 << 7 : 1),                            \
69
-
70
-#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt, precision)                                \
71
-int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h,                                  \
64
+#define YUV_TO_RGB_TABLE                                                                    \
65
+        c->yuv2rgb_v2r_coeff / (1 << 7),                                                    \
66
+        c->yuv2rgb_u2g_coeff / (1 << 7),                                                    \
67
+        c->yuv2rgb_v2g_coeff / (1 << 7),                                                    \
68
+        c->yuv2rgb_u2b_coeff / (1 << 7),                                                    \
69
+
70
+#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt)                                           \
71
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
72 72
                                  uint8_t *dst, int linesize,                                \
73 73
                                  const uint8_t *srcY, int linesizeY,                        \
74 74
                                  const uint8_t *srcU, int linesizeU,                        \
... ...
@@ -77,37 +77,34 @@ int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h,
77 77
                                  int y_offset,                                              \
78 78
                                  int y_coeff);                                              \
79 79
                                                                                             \
80
-static int ifmt##_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[], \
80
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
81 81
                                            int srcStride[], int srcSliceY, int srcSliceH,   \
82 82
                                            uint8_t *dst[], int dstStride[]) {               \
83
-    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) };                        \
83
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
84 84
                                                                                             \
85
-    ff_##ifmt##_to_##ofmt##_neon_##precision(c->srcW, srcSliceH,                            \
85
+    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                                        \
86 86
                                  dst[0] + srcSliceY * dstStride[0], dstStride[0],           \
87 87
                                  src[0], srcStride[0],                                      \
88 88
                                  src[1], srcStride[1],                                      \
89 89
                                  src[2], srcStride[2],                                      \
90 90
                                  yuv2rgb_table,                                             \
91 91
                                  c->yuv2rgb_y_offset >> 9,                                  \
92
-                                 c->yuv2rgb_y_coeff / ((precision) == 16 ? 1 << 7 : 1));    \
92
+                                 c->yuv2rgb_y_coeff / (1 << 7));                            \
93 93
                                                                                             \
94 94
     return 0;                                                                               \
95 95
 }                                                                                           \
96 96
 
97
-#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx, precision)                                  \
98
-DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb, precision)                                        \
99
-DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba, precision)                                        \
100
-DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr, precision)                                        \
101
-DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra, precision)                                        \
97
+#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx)                                             \
98
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb)                                                   \
99
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba)                                                   \
100
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr)                                                   \
101
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra)                                                   \
102 102
 
103
-#define DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuvx)                               \
104
-DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx, 16)                                                 \
103
+DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
104
+DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
105 105
 
106
-DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuv420p)
107
-DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuv422p)
108
-
109
-#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt, precision)                                 \
110
-int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h,                                  \
106
+#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt)                                            \
107
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h,                                              \
111 108
                                  uint8_t *dst, int linesize,                                \
112 109
                                  const uint8_t *srcY, int linesizeY,                        \
113 110
                                  const uint8_t *srcC, int linesizeC,                        \
... ...
@@ -115,32 +112,29 @@ int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h,
115 115
                                  int y_offset,                                              \
116 116
                                  int y_coeff);                                              \
117 117
                                                                                             \
118
-static int ifmt##_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[], \
118
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[],             \
119 119
                                            int srcStride[], int srcSliceY, int srcSliceH,   \
120 120
                                            uint8_t *dst[], int dstStride[]) {               \
121
-    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) };                        \
121
+    const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE };                                   \
122 122
                                                                                             \
123
-    ff_##ifmt##_to_##ofmt##_neon_##precision(c->srcW, srcSliceH,                            \
123
+    ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH,                            \
124 124
                                  dst[0] + srcSliceY * dstStride[0], dstStride[0],           \
125 125
                                  src[0], srcStride[0], src[1], srcStride[1],                \
126 126
                                  yuv2rgb_table,                                             \
127 127
                                  c->yuv2rgb_y_offset >> 9,                                  \
128
-                                 c->yuv2rgb_y_coeff / ((precision) == 16 ? 1 << 7 : 1));    \
128
+                                 c->yuv2rgb_y_coeff / (1 << 7));                            \
129 129
                                                                                             \
130 130
     return 0;                                                                               \
131 131
 }                                                                                           \
132 132
 
133
-#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx, precision)                                    \
134
-DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb, precision)                                          \
135
-DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba, precision)                                          \
136
-DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr, precision)                                          \
137
-DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra, precision)                                          \
138
-
139
-#define DECLARE_FF_NVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(nvx)                                 \
140
-DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx, 16)                                                   \
133
+#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx)                                               \
134
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb)                                                     \
135
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba)                                                     \
136
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr)                                                     \
137
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra)                                                     \
141 138
 
142
-DECLARE_FF_NVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(nv12)
143
-DECLARE_FF_NVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(nv21)
139
+DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
140
+DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
144 141
 
145 142
 /* We need a 16 pixel width alignment. This constraint can easily be removed
146 143
  * for input reading but for the output which is 4-bytes per pixel (RGBA) the
... ...
@@ -152,7 +146,7 @@ DECLARE_FF_NVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(nv21)
152 152
         && !(c->srcH & 1)                                                                   \
153 153
         && !(c->srcW & 15)                                                                  \
154 154
         && !accurate_rnd) {                                                                 \
155
-        c->swscale = ifmt##_to_##ofmt##_neon_wrapper_16;                                    \
155
+        c->swscale = ifmt##_to_##ofmt##_neon_wrapper;                                       \
156 156
     }                                                                                       \
157 157
 } while (0)
158 158
 
... ...
@@ -22,7 +22,7 @@
22 22
 #include "libavutil/arm/asm.S"
23 23
 
24 24
 
25
-.macro compute_premult_16 half_u1, half_u2, half_v1, half_v2
25
+.macro compute_premult half_u1, half_u2, half_v1, half_v2
26 26
     vmov                d2, \half_u1                                   @ copy left q14 to left q1
27 27
     vmov                d3, \half_u1                                   @ copy left q14 to right q1
28 28
     vmov                d4, \half_u2                                   @ copy right q14 to left q2
... ...
@@ -49,56 +49,22 @@
49 49
     vmul.s16            q13, q2, d1[3]                                 @  U * u2b             (right, blue)
50 50
 .endm
51 51
 
52
-.macro compute_premult_32 half_u half_v
53
-    vmov                d2, \half_u                                    @ copy left q14 to left q1
54
-    vmov                d3, \half_u                                    @ copy left q14 to right q1
55
-    vmov                d4, \half_v                                    @ copy left q15 to left q2
56
-    vmov                d5, \half_v                                    @ copy left q15 to right q2
57
-
58
-    vzip.16             d2, d3                                         @ U1U1U2U2U3U3U4U4
59
-    vzip.16             d4, d5                                         @ V1V1V2V2V3V3V4V4
60
-
61
-    vmull.s16           q8,  d4, d1[0]                                 @  V * v2r             (left,  red)
62
-    vmull.s16           q9,  d5, d1[0]                                 @  V * v2r             (right, red)
63
-    vmull.s16           q10, d2, d1[1]                                 @  U * u2g
64
-    vmull.s16           q11, d3, d1[1]                                 @  U * u2g
65
-    vmlal.s16           q10, d4, d1[2]                                 @  U * u2g + V * v2g   (left,  green)
66
-    vmlal.s16           q11, d5, d1[2]                                 @  U * u2g + V * v2g   (right, green)
67
-    vmull.s16           q12, d2, d1[3]                                 @  U * u2b             (left,  blue)
68
-    vmull.s16           q13, d3, d1[3]                                 @  U * u2b             (right, blue)
69
-.endm
70
-
71
-.macro compute_color_16 dst_comp1 dst_comp2 pre1 pre2
52
+.macro compute_color dst_comp1 dst_comp2 pre1 pre2
72 53
     vadd.s16            q1, q14, \pre1
73 54
     vadd.s16            q2, q15, \pre2
74 55
     vqrshrun.s16        \dst_comp1, q1, #6
75 56
     vqrshrun.s16        \dst_comp2, q2, #6
76 57
 .endm
77 58
 
78
-.macro compute_color_32 dst_comp pre1 pre2
79
-    vadd.s32            q3, q1, \pre1
80
-    vadd.s32            q4, q2, \pre2
81
-    vqrshrun.s32        d10, q3, #13
82
-    vqrshrun.s32        d11, q4, #13                                   @ q5 = ({q3,q4} + (1<<12)) >> 13
83
-    vqmovn.u16          \dst_comp, q5                                  @ saturate 16bit -> 8bit
84
-.endm
85
-
86
-.macro compute_rgba_16 r1 r2 g1 g2 b1 b2 a1 a2
87
-    compute_color_16    \r1, \r2, q8,  q9
88
-    compute_color_16    \g1, \g2, q10, q11
89
-    compute_color_16    \b1, \b2, q12, q13
59
+.macro compute_rgba r1 r2 g1 g2 b1 b2 a1 a2
60
+    compute_color       \r1, \r2, q8,  q9
61
+    compute_color       \g1, \g2, q10, q11
62
+    compute_color       \b1, \b2, q12, q13
90 63
     vmov.u8             \a1, #255
91 64
     vmov.u8             \a2, #255
92 65
 .endm
93 66
 
94
-.macro compute_rgba_32 r g b a
95
-    compute_color_32    \r, q8,  q9
96
-    compute_color_32    \g, q10, q11
97
-    compute_color_32    \b, q12, q13
98
-    vmov.u8             \a, #255
99
-.endm
100
-
101
-.macro compute_16px_16 dst y0 y1 ofmt
67
+.macro compute_16px dst y0 y1 ofmt
102 68
     vmovl.u8            q14, \y0                                       @ 8px of y
103 69
     vmovl.u8            q15, \y1                                       @ 8px of y
104 70
 
... ...
@@ -114,91 +80,39 @@
114 114
 
115 115
 
116 116
 .ifc \ofmt,argb
117
-    compute_rgba_16     d7, d11, d8, d12, d9, d13, d6, d10
117
+    compute_rgba        d7, d11, d8, d12, d9, d13, d6, d10
118 118
 .endif
119 119
 
120 120
 .ifc \ofmt,rgba
121
-    compute_rgba_16     d6, d10, d7, d11, d8, d12, d9, d13
121
+    compute_rgba        d6, d10, d7, d11, d8, d12, d9, d13
122 122
 .endif
123 123
 
124 124
 .ifc \ofmt,abgr
125
-    compute_rgba_16     d9, d13, d8, d12, d7, d11, d6, d10
125
+    compute_rgba        d9, d13, d8, d12, d7, d11, d6, d10
126 126
 .endif
127 127
 
128 128
 .ifc \ofmt,bgra
129
-    compute_rgba_16     d8, d12, d7, d11, d6, d10, d9, d13
129
+    compute_rgba        d8, d12, d7, d11, d6, d10, d9, d13
130 130
 .endif
131 131
     vst4.8              {q3, q4}, [\dst,:128]!
132 132
     vst4.8              {q5, q6}, [\dst,:128]!
133 133
 
134 134
 .endm
135 135
 
136
-.macro compute_8px_32 dst half_y ofmt
137
-    vmovl.u8            q7, \half_y                                    @ 8px of Y
138
-    vdup.16             q5, r9
139
-    vsub.s16            q7, q5
140
-    vmull.s16           q1, d14, d0                                    @ q1 = (srcY - y_offset) * y_coeff (left)
141
-    vmull.s16           q2, d15, d0                                    @ q2 = (srcY - y_offset) * y_coeff (right)
142
-
143
-.ifc \ofmt,argb
144
-    compute_rgba_32     d13, d14, d15, d12
145
-.endif
146
-
147
-.ifc \ofmt,rgba
148
-    compute_rgba_32     d12, d13, d14, d15
149
-.endif
150
-
151
-.ifc \ofmt,abgr
152
-    compute_rgba_32     d15, d14, d13, d12
153
-.endif
154
-
155
-.ifc \ofmt,bgra
156
-    compute_rgba_32     d14, d13, d12, d15
157
-.endif
158
-
159
-    vst4.8              {q6, q7}, [\dst,:128]!
160
-.endm
161
-
162
-.macro process_1l_16px_16 ofmt
163
-    compute_premult_16  d28, d29, d30, d31
136
+.macro process_1l_16px ofmt
137
+    compute_premult     d28, d29, d30, d31
164 138
     vld1.8              {q7}, [r4]!
165
-    compute_16px_16     r2, d14, d15, \ofmt
139
+    compute_16px        r2, d14, d15, \ofmt
166 140
 .endm
167 141
 
168
-.macro process_1l_16px_32 ofmt
169
-    compute_premult_32  d28, d30
170
-    vld1.8              {q7}, [r4]!
171
-    vmov                d28, d15                                       @ save right of the line of luma for later use
172
-    compute_8px_32      r2, d14, \ofmt
173
-
174
-    compute_premult_32  d29, d31
175
-    compute_8px_32      r2,  d28, \ofmt
176
-.endm
177
-
178
-.macro process_2l_16px_16 ofmt
179
-    compute_premult_16  d28, d29, d30, d31
180
-
181
-    vld1.8              {q7}, [r4]!                                    @ first line of luma
182
-    compute_16px_16     r2, d14, d15, \ofmt
183
-
184
-    vld1.8              {q7}, [r12]!                                   @ second line of luma
185
-    compute_16px_16     r11, d14, d15, \ofmt
186
-.endm
187
-
188
-.macro process_2l_16px_32 ofmt
189
-    compute_premult_32  d28, d30
142
+.macro process_2l_16px ofmt
143
+    compute_premult     d28, d29, d30, d31
190 144
 
191 145
     vld1.8              {q7}, [r4]!                                    @ first line of luma
192
-    vmov                d28, d15                                       @ save right of the first line of luma for later use
193
-    compute_8px_32      r2, d14, \ofmt
146
+    compute_16px        r2, d14, d15, \ofmt
194 147
 
195 148
     vld1.8              {q7}, [r12]!                                   @ second line of luma
196
-    vmov                d30, d15                                       @ save right of the second line of luma for later use
197
-    compute_8px_32      r11, d14, \ofmt
198
-
199
-    compute_premult_32  d29, d31
200
-    compute_8px_32      r2,  d28, \ofmt
201
-    compute_8px_32      r11, d30, \ofmt
149
+    compute_16px        r11, d14, d15, \ofmt
202 150
 .endm
203 151
 
204 152
 .macro load_args_nvx
... ...
@@ -266,8 +180,8 @@
266 266
     ldr                 r10,[sp, #120]                                 @ r10 = srcV
267 267
 .endm
268 268
 
269
-.macro declare_func ifmt ofmt precision
270
-function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
269
+.macro declare_func ifmt ofmt
270
+function ff_\ifmt\()_to_\ofmt\()_neon, export=1
271 271
 
272 272
 .ifc \ifmt,nv12
273 273
     load_args_nvx
... ...
@@ -301,7 +215,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
301 301
     vsubl.u8            q14, d2, d10                                   @ q14 = U - 128
302 302
     vsubl.u8            q15, d3, d10                                   @ q15 = V - 128
303 303
 
304
-    process_2l_16px_\precision \ofmt
304
+    process_2l_16px \ofmt
305 305
 .endif
306 306
 
307 307
 .ifc \ifmt,nv21
... ...
@@ -311,7 +225,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
311 311
     vsubl.u8            q14, d3, d10                                   @ q14 = U - 128
312 312
     vsubl.u8            q15, d2, d10                                   @ q15 = V - 128
313 313
 
314
-    process_2l_16px_\precision \ofmt
314
+    process_2l_16px \ofmt
315 315
 .endif
316 316
 
317 317
 .ifc \ifmt,yuv420p
... ...
@@ -323,7 +237,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
323 323
     vsubl.u8            q14, d2, d10                                   @ q14 = U - 128
324 324
     vsubl.u8            q15, d3, d10                                   @ q15 = V - 128
325 325
 
326
-    process_2l_16px_\precision \ofmt
326
+    process_2l_16px \ofmt
327 327
 .endif
328 328
 
329 329
 .ifc \ifmt,yuv422p
... ...
@@ -334,7 +248,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
334 334
     vsubl.u8            q14, d2, d10                                   @ q14 = U - 128
335 335
     vsubl.u8            q15, d3, d10                                   @ q15 = V - 128
336 336
 
337
-    process_1l_16px_\precision \ofmt
337
+    process_1l_16px \ofmt
338 338
 .endif
339 339
 
340 340
     subs                r8, r8, #16                                    @ width -= 16
... ...
@@ -390,18 +304,14 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1
390 390
 endfunc
391 391
 .endm
392 392
 
393
-.macro declare_rgb_funcs ifmt precision
394
-    declare_func \ifmt, argb, \precision
395
-    declare_func \ifmt, rgba, \precision
396
-    declare_func \ifmt, abgr, \precision
397
-    declare_func \ifmt, bgra, \precision
393
+.macro declare_rgb_funcs ifmt
394
+    declare_func \ifmt, argb
395
+    declare_func \ifmt, rgba
396
+    declare_func \ifmt, abgr
397
+    declare_func \ifmt, bgra
398 398
 .endm
399 399
 
400
-declare_rgb_funcs nv12, 16
401
-declare_rgb_funcs nv21, 16
402
-declare_rgb_funcs nv12, 32
403
-declare_rgb_funcs nv21, 32
404
-declare_rgb_funcs yuv420p, 16
405
-declare_rgb_funcs yuv420p, 32
406
-declare_rgb_funcs yuv422p, 16
407
-declare_rgb_funcs yuv422p, 32
400
+declare_rgb_funcs nv12
401
+declare_rgb_funcs nv21
402
+declare_rgb_funcs yuv420p
403
+declare_rgb_funcs yuv422p