6 | 8 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,70 @@ |
0 |
+/* |
|
1 |
+ * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com> |
|
2 |
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com> |
|
3 |
+ * |
|
4 |
+ * This file is part of FFmpeg. |
|
5 |
+ * |
|
6 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
7 |
+ * modify it under the terms of the GNU Lesser General Public |
|
8 |
+ * License as published by the Free Software Foundation; either |
|
9 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
10 |
+ * |
|
11 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
12 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
14 |
+ * Lesser General Public License for more details. |
|
15 |
+ * |
|
16 |
+ * You should have received a copy of the GNU Lesser General Public |
|
17 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
18 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
19 |
+ */ |
|
20 |
+ |
|
21 |
+#include "libavutil/arm/asm.S" |
|
22 |
+ |
|
23 |
/*
 * void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
 *                             const uint8_t *src, const int16_t *filter,
 *                             const int32_t *filterPos, int filterSize);
 *
 * Horizontal scaling of 8-bit samples: every output sample is the dot
 * product of filterSize source bytes (starting at src + filterPos[i]) with
 * filterSize 16-bit coefficients, arithmetically shifted right by 7 and
 * saturated to a signed 16-bit value.
 *
 * Arguments (AAPCS):
 *   r0          c          (unused, r0 is recycled as the source cursor)
 *   r1          dst
 *   r2          dstW
 *   r3          src
 *   [sp, #0]    filter     (offsets are relative to sp on entry)
 *   [sp, #4]    filterPos
 *   [sp, #8]    filterSize
 *
 * Constraints imposed by the implementation:
 *   - filterSize must be a positive multiple of 8: the inner loop always
 *     consumes 8 taps per iteration and runs at least once.
 *   - Two output samples are produced per outer iteration, so two
 *     filterPos entries and two filter rows are read and two int16_t are
 *     stored each time; with an odd dstW one extra element is processed.
 *
 * Register roles:
 *   r4  = coefficient cursor for the even output sample
 *   r10 = coefficient cursor for the odd output sample
 *   r5  = filterPos cursor          r6  = filterSize
 *   r7  = taps left in inner loop   r8/r9 = filterPos[0]/filterPos[1]
 *   q4/q5 = 32-bit accumulators (callee-saved, hence the vpush/vpop)
 */
function ff_hscale_8_to_15_neon, export=1
    push                {r4-r12, lr}                                   @ 10 core registers = 40 bytes
    vpush               {q4-q5}                                        @ only d8-d11 are clobbered = 32 bytes
    ldr                 r4, [sp, #72]                                  @ filter     (40 + 32 + 0)
    ldr                 r5, [sp, #76]                                  @ filterPos  (40 + 32 + 4)
    ldr                 r6, [sp, #80]                                  @ filterSize (40 + 32 + 8)
    add                 r10, r4, r6, lsl #1                            @ filter2 = filter + filterSize * 2
1:  ldr                 r8, [r5], #4                                   @ filterPos[0]
    ldr                 r9, [r5], #4                                   @ filterPos[1]
    vmov.s32            q4, #0                                         @ val accumulator (even sample)
    vmov.s32            q5, #0                                         @ val accumulator (odd sample)
    mov                 r7, r6                                         @ tmpfilterSize = filterSize
    mov                 r0, r3                                         @ srcp
2:  add                 r11, r0, r8                                    @ srcp + filterPos[0]
    add                 r12, r0, r9                                    @ srcp + filterPos[1]
    vld1.8              {d0}, [r11]                                    @ srcp[filterPos[0] + {0..7}]
    vld1.8              {d2}, [r12]                                    @ srcp[filterPos[1] + {0..7}]
    vld1.16             {q2}, [r4]!                                    @ load 8x16-bit filter values
    vld1.16             {q3}, [r10]!                                   @ load 8x16-bit filter values
    vmovl.u8            q0, d0                                         @ unpack src values to 16-bit
    vmovl.u8            q1, d2                                         @ unpack src values to 16-bit
    vmull.s16           q8, d0, d4                                     @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
    vmull.s16           q9, d1, d5                                     @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
    vmull.s16           q10, d2, d6                                    @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
    vmull.s16           q11, d3, d7                                    @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
    vpadd.s32           d16, d16, d17                                  @ pairwise add 8x32-bit products into 4x32-bit (part 1)
    vpadd.s32           d17, d18, d19                                  @ pairwise add 8x32-bit products into 4x32-bit (part 2)
    vpadd.s32           d20, d20, d21                                  @ pairwise add 8x32-bit products into 4x32-bit (part 1)
    vpadd.s32           d21, d22, d23                                  @ pairwise add 8x32-bit products into 4x32-bit (part 2)
    vadd.s32            q4, q8                                         @ update val accumulator
    vadd.s32            q5, q10                                        @ update val accumulator
    add                 r0, #8                                         @ srcp += 8
    subs                r7, #8                                         @ tmpfilterSize -= 8
    bgt                 2b                                             @ loop until tmpfilterSize is consumed
    mov                 r4, r10                                        @ filter = filter2
    add                 r10, r10, r6, lsl #1                           @ filter2 += filterSize * 2
    vpadd.s32           d8, d8, d9                                     @ reduce even-sample accumulator to 2x32-bit
    vpadd.s32           d9, d10, d11                                   @ reduce odd-sample accumulator to 2x32-bit
    vpadd.s32           d8, d8, d9                                     @ d8 = { sum for even sample, sum for odd sample }
    vqshrn.s32          d8, q4, #7                                     @ shift by 7 and saturate; the low 2x16-bit lanes hold the results
    vst1.32             {d8[0]},[r1]!                                  @ store the two 16-bit results, dst += 2
    subs                r2, #2                                         @ dstW -= 2
    bgt                 1b                                             @ loop until end of line
    vpop                {q4-q5}
    pop                 {r4-r12, pc}                                   @ restore and return (interworking-safe load to pc)
endfunc
0 | 70 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,37 @@ |
0 |
+/* |
|
1 |
+ * This file is part of FFmpeg. |
|
2 |
+ * |
|
3 |
+ * FFmpeg is free software; you can redistribute it and/or |
|
4 |
+ * modify it under the terms of the GNU Lesser General Public |
|
5 |
+ * License as published by the Free Software Foundation; either |
|
6 |
+ * version 2.1 of the License, or (at your option) any later version. |
|
7 |
+ * |
|
8 |
+ * FFmpeg is distributed in the hope that it will be useful, |
|
9 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
11 |
+ * Lesser General Public License for more details. |
|
12 |
+ * |
|
13 |
+ * You should have received a copy of the GNU Lesser General Public |
|
14 |
+ * License along with FFmpeg; if not, write to the Free Software |
|
15 |
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
16 |
+ */ |
|
17 |
+ |
|
18 |
+#include "config.h" |
|
19 |
+#include "libswscale/swscale.h" |
|
20 |
+#include "libswscale/swscale_internal.h" |
|
21 |
+#include "libavutil/arm/cpu.h" |
|
22 |
+ |
|
23 |
+void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW, |
|
24 |
+ const uint8_t *src, const int16_t *filter, |
|
25 |
+ const int32_t *filterPos, int filterSize); |
|
26 |
+ |
|
27 |
+av_cold void ff_sws_init_swscale_arm(SwsContext *c) |
|
28 |
+{ |
|
29 |
+ int cpu_flags = av_get_cpu_flags(); |
|
30 |
+ |
|
31 |
+ if (have_neon(cpu_flags)) { |
|
32 |
+ if (c->srcBpc == 8 && c->dstBpc <= 14) { |
|
33 |
+ c->hyScale = c->hcScale = ff_hscale_8_to_15_neon; |
|
34 |
+ } |
|
35 |
+ } |
|
36 |
+} |
... | ... |
@@ -892,6 +892,7 @@ void ff_sws_init_output_funcs(SwsContext *c, |
892 | 892 |
void ff_sws_init_swscale_ppc(SwsContext *c); |
893 | 893 |
void ff_sws_init_swscale_x86(SwsContext *c); |
894 | 894 |
void ff_sws_init_swscale_aarch64(SwsContext *c); |
895 |
+void ff_sws_init_swscale_arm(SwsContext *c); |
|
895 | 896 |
|
896 | 897 |
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, |
897 | 898 |
const uint8_t *src, int srcW, int xInc); |