| ... | ... |
@@ -69,8 +69,8 @@ static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[ |
| 69 | 69 |
c->yuv2rgb_v2g_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
| 70 | 70 |
c->yuv2rgb_u2b_coeff / ((precision) == 16 ? 1 << 7 : 1), \ |
| 71 | 71 |
|
| 72 |
-#define DECLARE_FF_YUV420P_TO_RGBX_FUNCS(ofmt, precision) \ |
|
| 73 |
-int ff_yuv420p_to_##ofmt##_neon_##precision(int w, int h, \ |
|
| 72 |
+#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt, precision) \ |
|
| 73 |
+int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h, \ |
|
| 74 | 74 |
uint8_t *dst, int linesize, \ |
| 75 | 75 |
const uint8_t *srcY, int linesizeY, \ |
| 76 | 76 |
const uint8_t *srcU, int linesizeU, \ |
| ... | ... |
@@ -79,12 +79,12 @@ int ff_yuv420p_to_##ofmt##_neon_##precision(int w, int h, |
| 79 | 79 |
int y_offset, \ |
| 80 | 80 |
int y_coeff); \ |
| 81 | 81 |
\ |
| 82 |
-static int yuv420p_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[],\ |
|
| 82 |
+static int ifmt##_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uint8_t *src[], \ |
|
| 83 | 83 |
int srcStride[], int srcSliceY, int srcSliceH, \ |
| 84 | 84 |
uint8_t *dst[], int dstStride[]) { \
|
| 85 | 85 |
const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE(precision) }; \
|
| 86 | 86 |
\ |
| 87 |
- ff_yuv420p_to_##ofmt##_neon_##precision(c->srcW, srcSliceH, \ |
|
| 87 |
+ ff_##ifmt##_to_##ofmt##_neon_##precision(c->srcW, srcSliceH, \ |
|
| 88 | 88 |
dst[0] + srcSliceY * dstStride[0], dstStride[0], \ |
| 89 | 89 |
src[0], srcStride[0], \ |
| 90 | 90 |
src[1], srcStride[1], \ |
| ... | ... |
@@ -96,16 +96,17 @@ static int yuv420p_to_##ofmt##_neon_wrapper_##precision(SwsContext *c, const uin |
| 96 | 96 |
return 0; \ |
| 97 | 97 |
} \ |
| 98 | 98 |
|
| 99 |
-#define DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(precision) \ |
|
| 100 |
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(argb, precision) \ |
|
| 101 |
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(rgba, precision) \ |
|
| 102 |
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(abgr, precision) \ |
|
| 103 |
-DECLARE_FF_YUV420P_TO_RGBX_FUNCS(bgra, precision) \ |
|
| 99 |
+#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx, precision) \ |
|
| 100 |
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb, precision) \ |
|
| 101 |
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba, precision) \ |
|
| 102 |
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr, precision) \ |
|
| 103 |
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra, precision) \ |
|
| 104 | 104 |
|
| 105 |
-#define DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS \ |
|
| 106 |
-DECLARE_FF_YUV420P_TO_ALL_RGBX_FUNCS(16) \ |
|
| 105 |
+#define DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuvx) \ |
|
| 106 |
+DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx, 16) \ |
|
| 107 | 107 |
|
| 108 |
-DECLARE_FF_YUV420P_TO_ALL_RGBX_ALL_PRECISION_FUNCS |
|
| 108 |
+DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuv420p) |
|
| 109 |
+DECLARE_FF_YUVX_TO_ALL_RGBX_ALL_PRECISION_FUNCS(yuv422p) |
|
| 109 | 110 |
|
| 110 | 111 |
#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt, precision) \ |
| 111 | 112 |
int ff_##ifmt##_to_##ofmt##_neon_##precision(int w, int h, \ |
| ... | ... |
@@ -178,6 +179,7 @@ static void get_unscaled_swscale_neon(SwsContext *c) {
|
| 178 | 178 |
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd); |
| 179 | 179 |
SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); |
| 180 | 180 |
SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); |
| 181 |
+ SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd); |
|
| 181 | 182 |
} |
| 182 | 183 |
|
| 183 | 184 |
void ff_get_unscaled_swscale_arm(SwsContext *c) |
| ... | ... |
@@ -159,7 +159,23 @@ |
| 159 | 159 |
vst4.8 {q6, q7}, [\dst,:128]!
|
| 160 | 160 |
.endm |
| 161 | 161 |
|
| 162 |
-.macro process_16px_16 ofmt |
|
| 162 |
+.macro process_1l_16px_16 ofmt |
|
| 163 |
+ compute_premult_16 d28, d29, d30, d31 @ d28-d31 = q14/q15, which the caller fills with U - 128 and V - 128; helper is defined outside this view |
|
| 164 |
+ vld1.8 {q7}, [r4]! @ load 16 luma bytes from [r4] into q7; r4 post-increments by 16
|
|
| 165 |
+ compute_16px_16 r2, d14, d15, \ofmt @ d14/d15 = low/high halves of q7; presumably writes 16 pixels at r2 - confirm in compute_16px_16 |
|
| 166 |
+.endm |
|
| 167 |
+ |
|
| 168 |
+.macro process_1l_16px_32 ofmt |
|
| 168 |
+ compute_premult_32 d28, d30 @ low halves of q14 (U - 128) and q15 (V - 128); helper is defined outside this view |
|
| 169 |
+ vld1.8 {q7}, [r4]! @ load 16 luma bytes from [r4] into q7; r4 post-increments by 16
|
|
| 170 |
+ vmov d28, d15 @ keep the upper 8 luma bytes (d15) in d28 for the second compute_8px_32 call below |
|
| 171 |
+ compute_8px_32 r2, d14, \ofmt @ first 8 luma bytes (d14 = low half of q7) |
|
| 172 |
+ |
|
| 173 |
+ compute_premult_32 d29, d31 @ high halves of q14 (U - 128) and q15 (V - 128) |
|
| 174 |
+ compute_8px_32 r2, d28, \ofmt @ remaining 8 luma bytes, saved in d28 above |
|
| 175 |
+.endm |
|
| 177 |
+ |
|
| 178 |
+.macro process_2l_16px_16 ofmt |
|
| 163 | 179 |
compute_premult_16 d28, d29, d30, d31 |
| 164 | 180 |
|
| 165 | 181 |
vld1.8 {q7}, [r4]! @ first line of luma
|
| ... | ... |
@@ -169,7 +185,7 @@ |
| 169 | 169 |
compute_16px_16 r11, d14, d15, \ofmt |
| 170 | 170 |
.endm |
| 171 | 171 |
|
| 172 |
-.macro process_16px_32 ofmt |
|
| 172 |
+.macro process_2l_16px_32 ofmt |
|
| 173 | 173 |
compute_premult_32 d28, d30 |
| 174 | 174 |
|
| 175 | 175 |
vld1.8 {q7}, [r4]! @ first line of luma
|
| ... | ... |
@@ -228,6 +244,28 @@ |
| 228 | 228 |
ldr r10,[sp, #120] @ r10 = srcV |
| 229 | 229 |
.endm |
| 230 | 230 |
|
| 231 |
+.macro load_args_yuv422p |
|
| 232 |
+ push {r4-r12, lr} @ 10 core registers = 40 bytes
|
|
| 233 |
+ vpush {q4-q7} @ 4 q registers = 64 bytes, so the stack arguments start at sp + 104
|
|
| 234 |
+ ldr r4, [sp, #104] @ r4 = srcY |
|
| 235 |
+ ldr r5, [sp, #108] @ r5 = linesizeY |
|
| 236 |
+ ldr r6, [sp, #112] @ r6 = srcU |
|
| 237 |
+ ldr r7, [sp, #116] @ r7 = linesizeU |
|
| 238 |
+ ldr r12,[sp, #124] @ r12 = linesizeV |
|
| 239 |
+ ldr r8, [sp, #128] @ r8 = table |
|
| 240 |
+ ldr r9, [sp, #132] @ r9 = y_offset |
|
| 241 |
+ ldr r10,[sp, #136] @ r10 = y_coeff |
|
| 242 |
+ vdup.16 d0, r10 @ d0 = y_coeff |
|
| 243 |
+ vld1.16 {d1}, [r8] @ d1 = *table
|
|
| 244 |
+ add r11, r2, r3 @ r11 = dst + linesize (dst2); NOTE(review): not referenced by the yuv422p code visible here - confirm it is needed |
|
| 245 |
+ lsl r8, r0, #2 |
|
| 246 |
+ sub r3, r3, r8 @ r3 = linesize - width * 4 (padding) |
|
| 247 |
+ sub r5, r5, r0 @ r5 = linesizeY - width (paddingY) |
|
| 248 |
+ sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) |
|
| 249 |
+ sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV) |
|
| 250 |
+ ldr r10,[sp, #120] @ r10 = srcV |
|
| 251 |
+.endm |
|
| 252 |
+ |
|
| 231 | 253 |
.macro declare_func ifmt ofmt precision |
| 232 | 254 |
function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 |
| 233 | 255 |
|
| ... | ... |
@@ -243,56 +281,89 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 |
| 243 | 243 |
load_args_yuv420p |
| 244 | 244 |
.endif |
| 245 | 245 |
|
| 246 |
+ |
|
| 247 |
+.ifc \ifmt,yuv422p |
|
| 248 |
+ load_args_yuv422p |
|
| 249 |
+.endif |
|
| 250 |
+ |
|
| 246 | 251 |
1: |
| 247 | 252 |
mov r8, r0 @ r8 = width |
| 248 | 253 |
2: |
| 249 | 254 |
pld [r6, #64*3] |
| 250 | 255 |
pld [r4, #64*3] |
| 251 |
- pld [r12, #64*3] |
|
| 252 | 256 |
|
| 253 | 257 |
vmov.i8 d10, #128 |
| 254 | 258 |
|
| 255 | 259 |
.ifc \ifmt,nv12 |
| 260 |
+ pld [r12, #64*3] |
|
| 261 |
+ |
|
| 256 | 262 |
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
| 257 | 263 |
vsubl.u8 q14, d2, d10 @ q14 = U - 128 |
| 258 | 264 |
vsubl.u8 q15, d3, d10 @ q15 = V - 128 |
| 265 |
+ |
|
| 266 |
+ process_2l_16px_\precision \ofmt |
|
| 259 | 267 |
.endif |
| 260 | 268 |
|
| 261 | 269 |
.ifc \ifmt,nv21 |
| 270 |
+ pld [r12, #64*3] |
|
| 271 |
+ |
|
| 262 | 272 |
vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
|
| 263 | 273 |
vsubl.u8 q14, d3, d10 @ q14 = U - 128 |
| 264 | 274 |
vsubl.u8 q15, d2, d10 @ q15 = V - 128 |
| 275 |
+ |
|
| 276 |
+ process_2l_16px_\precision \ofmt |
|
| 265 | 277 |
.endif |
| 266 | 278 |
|
| 267 | 279 |
.ifc \ifmt,yuv420p |
| 268 | 280 |
pld [r10, #64*3] |
| 281 |
+ pld [r12, #64*3] |
|
| 269 | 282 |
|
| 270 | 283 |
vld1.8 d2, [r6]! @ d2: chroma red line |
| 271 | 284 |
vld1.8 d3, [r10]! @ d3: chroma blue line |
| 272 | 285 |
vsubl.u8 q14, d2, d10 @ q14 = U - 128 |
| 273 | 286 |
vsubl.u8 q15, d3, d10 @ q15 = V - 128 |
| 287 |
+ |
|
| 288 |
+ process_2l_16px_\precision \ofmt |
|
| 274 | 289 |
.endif |
| 275 | 290 |
|
| 291 |
+.ifc \ifmt,yuv422p |
|
| 292 |
+ pld [r10, #64*3] |
|
| 276 | 293 |
|
| 277 |
- process_16px_\precision \ofmt |
|
| 294 |
+ vld1.8 d2, [r6]! @ d2: chroma red line |
|
| 295 |
+ vld1.8 d3, [r10]! @ d3: chroma blue line |
|
| 296 |
+ vsubl.u8 q14, d2, d10 @ q14 = U - 128 |
|
| 297 |
+ vsubl.u8 q15, d3, d10 @ q15 = V - 128 |
|
| 298 |
+ |
|
| 299 |
+ process_1l_16px_\precision \ofmt |
|
| 300 |
+.endif |
|
| 278 | 301 |
|
| 279 | 302 |
subs r8, r8, #16 @ width -= 16 |
| 280 | 303 |
bgt 2b |
| 281 | 304 |
|
| 282 | 305 |
add r2, r2, r3 @ dst += padding |
| 283 | 306 |
add r4, r4, r5 @ srcY += paddingY |
| 307 |
+ |
|
| 308 |
+.ifc \ifmt,nv12 |
|
| 284 | 309 |
add r11, r11, r3 @ dst2 += padding |
| 285 | 310 |
add r12, r12, r5 @ srcY2 += paddingY |
| 286 | 311 |
|
| 287 |
-.ifc \ifmt,nv12 |
|
| 288 | 312 |
add r6, r6, r7 @ srcC += paddingC |
| 313 |
+ |
|
| 314 |
+ subs r1, r1, #2 @ height -= 2 |
|
| 289 | 315 |
.endif |
| 290 | 316 |
|
| 291 | 317 |
.ifc \ifmt,nv21 |
| 318 |
+ add r11, r11, r3 @ dst2 += padding |
|
| 319 |
+ add r12, r12, r5 @ srcY2 += paddingY |
|
| 320 |
+ |
|
| 292 | 321 |
add r6, r6, r7 @ srcC += paddingC |
| 322 |
+ subs r1, r1, #2 @ height -= 2 |
|
| 293 | 323 |
.endif |
| 294 | 324 |
|
| 295 | 325 |
.ifc \ifmt,yuv420p |
| 326 |
+ add r11, r11, r3 @ dst2 += padding |
|
| 327 |
+ add r12, r12, r5 @ srcY2 += paddingY |
|
| 328 |
+ |
|
| 296 | 329 |
ldr r7, [sp, #116] @ r7 = linesizeU |
| 297 | 330 |
sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) |
| 298 | 331 |
add r6, r6, r7 @ srcU += paddingU |
| ... | ... |
@@ -300,9 +371,17 @@ function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1 |
| 300 | 300 |
ldr r7, [sp, #124] @ r7 = linesizeV |
| 301 | 301 |
sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) |
| 302 | 302 |
add r10, r10, r7 @ srcV += paddingV |
| 303 |
-.endif |
|
| 304 | 303 |
|
| 305 | 304 |
subs r1, r1, #2 @ height -= 2 |
| 305 |
+.endif |
|
| 306 |
+ |
|
| 307 |
+.ifc \ifmt,yuv422p |
|
| 308 |
+ add r6, r6, r7 @ srcU += paddingU |
|
| 309 |
+ add r10,r10,r12 @ srcV += paddingV |
|
| 310 |
+ |
|
| 311 |
+ subs r1, r1, #1 @ height -= 1 |
|
| 312 |
+.endif |
|
| 313 |
+ |
|
| 306 | 314 |
bgt 1b |
| 307 | 315 |
|
| 308 | 316 |
vpop {q4-q7}
|
| ... | ... |
@@ -324,3 +403,5 @@ declare_rgb_funcs nv12, 32 |
| 324 | 324 |
declare_rgb_funcs nv21, 32 |
| 325 | 325 |
declare_rgb_funcs yuv420p, 16 |
| 326 | 326 |
declare_rgb_funcs yuv420p, 32 |
| 327 |
+declare_rgb_funcs yuv422p, 16 |
|
| 328 |
+declare_rgb_funcs yuv422p, 32 |