| ... | ... |
@@ -63,6 +63,50 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[ |
| 63 | 63 |
} |
| 64 | 64 |
#endif |
| 65 | 65 |
|
| 66 |
+#define YUV_TO_RGB_TABLE(precision) \ |
|
| 67 |
+ c->yuv2rgb_v2r_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 68 |
+ c->yuv2rgb_u2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 69 |
+ c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 70 |
+ c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 71 |
+ |
|
| 72 |
+#define DECLARE_FF_YUV420P_TO_RGBX_FUNCS(ofmt, precision) \ |
|
| 73 |
+int ff_yuv420p_to_##ofmt##_neon_##precision(int w, int h, \ |
|
| 74 |
+ uint8_t *dst, int linesize, \ |
|
| 75 |
+ const uint8_t *srcY, int linesizeY, \ |
|
| 76 |
+ const uint8_t *srcU, int linesizeU, \ |
|
| 77 |
+ const uint8_t *srcV, int linesizeV, \ |
|
| 78 |
+ const int16_t *table, \ |
|
| 79 |
+ int y_offset, \ |
|
| 80 |
+ int y_coeff); \ |
|
| 81 |
+ \ |
|
| 82 |
+static int yuv420p_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[],\ |
|
| 83 |
+ int srcStride[], int srcSliceY, int srcSliceH, \ |
|
| 84 |
+ uint8_t *dst[], int dstStride[]) { \
|
|
| 85 |
+ const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) }; \
|
|
| 86 |
+ \ |
|
| 87 |
+ ff_yuv420p_to_##ofmt##_neon_##precision(c->srcW, srcSliceH, \ |
|
| 88 |
+ dst[0] + srcSliceY * dstStride[0], dstStride[0], \ |
|
| 89 |
+ src[0], srcStride[0], \ |
|
| 90 |
+ src[1], srcStride[1], \ |
|
| 91 |
+ src[2], srcStride[2], \ |
|
| 92 |
+ yuv2rgb_table, \ |
|
| 93 |
+ c->yuv2rgb_y_offset >> 9, \ |
|
| 94 |
+ c->yuv2rgb_y_coeff / ((precision) == 16 ? 1 << 7 : 1)); \ |
|
| 95 |
+ \ |
|
| 96 |
+ return 0; \ |
|
| 97 |
+} \ |
|
| 98 |
+ |
|
| 99 |
+#define DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(precision) \ |
|
| 100 |
+DECLARE_FF_YUV420P_TO_RGBX_FUNCS(argb, precision) \ |
|
| 101 |
+DECLARE_FF_YUV420P_TO_RGBX_FUNCS(rgba, precision) \ |
|
| 102 |
+DECLARE_FF_YUV420P_TO_RGBX_FUNCS(abgr, precision) \ |
|
| 103 |
+DECLARE_FF_YUV420P_TO_RGBX_FUNCS(bgra, precision) \ |
|
| 104 |
+ |
|
| 105 |
+#define DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS \ |
|
| 106 |
+DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(16) \ |
|
| 107 |
+ |
|
| 108 |
+DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS |
|
| 109 |
+ |
|
| 66 | 110 |
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt, precision) \ |
| 67 | 111 |
int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h, \ |
| 68 | 112 |
uint8_t *dst, int linesize, \ |
| ... | ... |
@@ -75,12 +119,7 @@ int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h, |
| 75 | 75 |
static int ifmt##_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[], \ |
| 76 | 76 |
int srcStride[], int srcSliceY, int srcSliceH, \ |
| 77 | 77 |
uint8_t *dst[], int dstStride[]) { \
|
| 78 |
- const int16_t yuv2rgb_table[] = { \
|
|
| 79 |
- c->yuv2rgb_v2r_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 80 |
- c->yuv2rgb_u2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 81 |
- c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 82 |
- c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
|
| 83 |
- }; \ |
|
| 78 |
+ const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) }; \
|
|
| 84 | 79 |
\ |
| 85 | 80 |
ff_##ifmt##_to_##ofmt##_neon_##precision(c->srcW, srcSliceH, \ |
| 86 | 81 |
dst[0] + srcSliceY * dstStride[0], dstStride[0], \ |
| ... | ... |
@@ -138,6 +177,7 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
|
| 138 | 138 |
|
| 139 | 139 |
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd); |
| 140 | 140 |
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); |
| 141 |
+ SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); |
|
| 141 | 142 |
} |
| 142 | 143 |
|
| 143 | 144 |
void ff_get_unscaled_swscale_arm(SwsContext *c) |
| ... | ... |
@@ -103,7 +103,8 @@ |
| 103 | 103 |
vmovl.u8 q15, \y1 @ 8px of y |
| 104 | 104 |
|
| 105 | 105 |
vdup.16 q5, r9 @ q5 = y_offset |
| 106 |
- vdup.16 q7, r10 @ q7 = y_coeff |
|
| 106 |
+ vmov d14, d0 @ d14 (low half of q7) = y_coeff |
|
| 107 |
+ vmov d15, d0 @ d15 (high half of q7) = y_coeff |
|
| 107 | 108 |
|
| 108 | 109 |
vsub.s16 q14, q5 |
| 109 | 110 |
vsub.s16 q15, q5 |
| ... | ... |
@@ -184,7 +185,7 @@ |
| 184 | 184 |
compute_8px_32 r11, d30, \ofmt |
| 185 | 185 |
.endm |
| 186 | 186 |
|
| 187 |
-.macro load_args |
|
| 187 |
+.macro load_args_nvx |
|
| 188 | 188 |
push {r4-r12, lr}
|
| 189 | 189 |
vpush {q4-q7}
|
| 190 | 190 |
ldr r4, [sp, #104] @ r4 = srcY |
| ... | ... |
@@ -206,9 +207,42 @@ |
| 206 | 206 |
sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) |
| 207 | 207 |
.endm |
| 208 | 208 |
|
| 209 |
+.macro load_args_yuv420p |
|
| 210 |
+ push {r4-r12, lr}
|
|
| 211 |
+ vpush {q4-q7}
|
|
| 212 |
+ ldr r4, [sp, #104] @ r4 = srcY |
|
| 213 |
+ ldr r5, [sp, #108] @ r5 = linesizeY |
|
| 214 |
+ ldr r6, [sp, #112] @ r6 = srcU |
|
| 215 |
+ ldr r8, [sp, #128] @ r8 = table |
|
| 216 |
+ ldr r9, [sp, #132] @ r9 = y_offset |
|
| 217 |
+ ldr r10,[sp, #136] @ r10 = y_coeff |
|
| 218 |
+ vdup.16 d0, r10 @ d0 = y_coeff |
|
| 219 |
+ vld1.16 {d1}, [r8] @ d1 = *table
|
|
| 220 |
+ add r11, r2, r3 @ r11 = dst + linesize (dst2) |
|
| 221 |
+ add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) |
|
| 222 |
+ lsl r3, r3, #1 |
|
| 223 |
+ lsl r5, r5, #1 |
|
| 224 |
+ lsl r8, r0, #2 |
|
| 225 |
+ sub r3, r3, r8 @ r3 = linesize * 2 - width * 4 (padding) |
|
| 226 |
+ sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) |
|
| 227 |
+ ldr r10,[sp, #120] @ r10 = srcV |
|
| 228 |
+.endm |
|
| 229 |
+ |
|
| 209 | 230 |
.macro declare_func ifmt ofmt precision |
| 210 | 231 |
function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 |
| 211 |
- load_args |
|
| 232 |
+ |
|
| 233 |
+.ifc \ifmt,nv12 |
|
| 234 |
+ load_args_nvx |
|
| 235 |
+.endif |
|
| 236 |
+ |
|
| 237 |
+.ifc \ifmt,nv21 |
|
| 238 |
+ load_args_nvx |
|
| 239 |
+.endif |
|
| 240 |
+ |
|
| 241 |
+.ifc \ifmt,yuv420p |
|
| 242 |
+ load_args_yuv420p |
|
| 243 |
+.endif |
|
| 244 |
+ |
|
| 212 | 245 |
1: |
| 213 | 246 |
mov r8, r0 @ r8 = width |
| 214 | 247 |
2: |
| ... | ... |
@@ -216,16 +250,30 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 |
| 216 | 216 |
pld [r4, #64*3] |
| 217 | 217 |
pld [r12, #64*3] |
| 218 | 218 |
|
| 219 |
- vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
|
| 220 | 219 |
vmov.i8 d10, #128 |
| 220 |
+ |
|
| 221 | 221 |
.ifc \ifmt,nv12 |
| 222 |
+ vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
|
| 222 | 223 |
vsubl.u8 q14, d2, d10 @ q14 = U - 128 |
| 223 | 224 |
vsubl.u8 q15, d3, d10 @ q15 = V - 128 |
| 224 |
-.else |
|
| 225 |
+.endif |
|
| 226 |
+ |
|
| 227 |
+.ifc \ifmt,nv21 |
|
| 228 |
+ vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
|
| 225 | 229 |
vsubl.u8 q14, d3, d10 @ q14 = U - 128 |
| 226 | 230 |
vsubl.u8 q15, d2, d10 @ q15 = V - 128 |
| 227 | 231 |
.endif |
| 228 | 232 |
|
| 233 |
+.ifc \ifmt,yuv420p |
|
| 234 |
+ pld [r10, #64*3] |
|
| 235 |
+ |
|
| 236 |
+ vld1.8 d2, [r6]! @ d2: U (chroma blue, Cb) line |
|
| 237 |
+ vld1.8 d3, [r10]! @ d3: V (chroma red, Cr) line |
|
| 238 |
+ vsubl.u8 q14, d2, d10 @ q14 = U - 128 |
|
| 239 |
+ vsubl.u8 q15, d3, d10 @ q15 = V - 128 |
|
| 240 |
+.endif |
|
| 241 |
+ |
|
| 242 |
+ |
|
| 229 | 243 |
process_16px_\precision \ofmt |
| 230 | 244 |
|
| 231 | 245 |
subs r8, r8, #16 @ width -= 16 |
| ... | ... |
@@ -235,7 +283,24 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 |
| 235 | 235 |
add r4, r4, r5 @ srcY += paddingY |
| 236 | 236 |
add r11, r11, r3 @ dst2 += padding |
| 237 | 237 |
add r12, r12, r5 @ srcY2 += paddingY |
| 238 |
+ |
|
| 239 |
+.ifc \ifmt,nv12 |
|
| 238 | 240 |
add r6, r6, r7 @ srcC += paddingC |
| 241 |
+.endif |
|
| 242 |
+ |
|
| 243 |
+.ifc \ifmt,nv21 |
|
| 244 |
+ add r6, r6, r7 @ srcC += paddingC |
|
| 245 |
+.endif |
|
| 246 |
+ |
|
| 247 |
+.ifc \ifmt,yuv420p |
|
| 248 |
+ ldr r7, [sp, #116] @ r7 = linesizeU |
|
| 249 |
+ sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) |
|
| 250 |
+ add r6, r6, r7 @ srcU += paddingU |
|
| 251 |
+ |
|
| 252 |
+ ldr r7, [sp, #124] @ r7 = linesizeV |
|
| 253 |
+ sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) |
|
| 254 |
+ add r10, r10, r7 @ srcV += paddingV |
|
| 255 |
+.endif |
|
| 239 | 256 |
|
| 240 | 257 |
subs r1, r1, #2 @ height -= 2 |
| 241 | 258 |
bgt 1b |
| ... | ... |
@@ -257,3 +322,5 @@ declare_rgb_funcs nv12, 16 |
| 257 | 257 |
declare_rgb_funcs nv21, 16 |
| 258 | 258 |
declare_rgb_funcs nv12, 32 |
| 259 | 259 |
declare_rgb_funcs nv21, 32 |
| 260 |
+declare_rgb_funcs yuv420p, 16 |
|
| 261 |
+declare_rgb_funcs yuv420p, 32 |