Signed-off-by: Martin Storsjö <martin@martin.st>
Ronald S. Bultje authored on 2013/03/11 08:16:45
@@ -31,6 +31,11 @@ OBJS-$(CONFIG_H264DSP)          += arm/h264dsp_init_arm.o
 OBJS-$(CONFIG_H264PRED)        += arm/h264pred_init_arm.o
 OBJS-$(CONFIG_H264QPEL)        += arm/h264qpel_init_arm.o
 
+OBJS-$(CONFIG_HPELDSP)         += arm/hpeldsp_init_arm.o        \
+                                  arm/hpeldsp_arm.o
+ARMV6-OBJS-$(CONFIG_HPELDSP)   += arm/hpeldsp_init_armv6.o      \
+                                  arm/hpeldsp_armv6.o
+
 OBJS-$(CONFIG_RV30_DECODER)    += arm/rv34dsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)    += arm/rv34dsp_init_arm.o        \
                                   arm/rv40dsp_init_arm.o        \
...
@@ -84,6 +89,9 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/sbrdsp_neon.o     \
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o     \
                                           arm/synth_filter_neon.o \
 
+NEON-OBJS-$(CONFIG_HPELDSP)            += arm/hpeldsp_init_neon.o \
+                                          arm/hpeldsp_neon.o
+
 NEON-OBJS-$(CONFIG_MPEGVIDEO)          += arm/mpegvideo_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o    \
...
@@ -26,590 +26,6 @@
 #define pld @
 #endif
 
-.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
-        mov     \Rd0, \Rn0, lsr #(\shift * 8)
-        mov     \Rd1, \Rn1, lsr #(\shift * 8)
-        mov     \Rd2, \Rn2, lsr #(\shift * 8)
-        mov     \Rd3, \Rn3, lsr #(\shift * 8)
-        orr     \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
-        orr     \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
-        orr     \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
-        orr     \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
-.endm
-.macro ALIGN_DWORD shift, R0, R1, R2
-        mov     \R0, \R0, lsr #(\shift * 8)
-        orr     \R0, \R0, \R1, lsl #(32 - \shift * 8)
-        mov     \R1, \R1, lsr #(\shift * 8)
-        orr     \R1, \R1, \R2, lsl #(32 - \shift * 8)
-.endm
-.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
-        mov     \Rdst0, \Rsrc0, lsr #(\shift * 8)
-        mov     \Rdst1, \Rsrc1, lsr #(\shift * 8)
-        orr     \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
-        orr     \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
-.endm
-
-.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
-        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
-        @ Rmask = 0xFEFEFEFE
-        @ Rn = destroy
-        eor     \Rd0, \Rn0, \Rm0
-        eor     \Rd1, \Rn1, \Rm1
-        orr     \Rn0, \Rn0, \Rm0
-        orr     \Rn1, \Rn1, \Rm1
-        and     \Rd0, \Rd0, \Rmask
-        and     \Rd1, \Rd1, \Rmask
-        sub     \Rd0, \Rn0, \Rd0, lsr #1
-        sub     \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
-        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
-        @ Rmask = 0xFEFEFEFE
-        @ Rn = destroy
-        eor     \Rd0, \Rn0, \Rm0
-        eor     \Rd1, \Rn1, \Rm1
-        and     \Rn0, \Rn0, \Rm0
-        and     \Rn1, \Rn1, \Rm1
-        and     \Rd0, \Rd0, \Rmask
-        and     \Rd1, \Rd1, \Rmask
-        add     \Rd0, \Rn0, \Rd0, lsr #1
-        add     \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro JMP_ALIGN tmp, reg
-        ands    \tmp, \reg, #3
-        bic     \reg, \reg, #3
-        beq     1f
-        subs    \tmp, \tmp, #1
-        beq     2f
-        subs    \tmp, \tmp, #1
-        beq     3f
-        b       4f
-.endm
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels16_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r11, lr}
-        JMP_ALIGN r5, r1
-1:
-        ldm     r1, {r4-r7}
-        add     r1, r1, r2
-        stm     r0, {r4-r7}
-        pld     [r1]
-        subs    r3, r3, #1
-        add     r0, r0, r2
-        bne     1b
-        pop     {r4-r11, pc}
-        .align 5
-2:
-        ldm     r1, {r4-r8}
-        add     r1, r1, r2
-        ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
-        pld     [r1]
-        subs    r3, r3, #1
-        stm     r0, {r9-r12}
-        add     r0, r0, r2
-        bne     2b
-        pop     {r4-r11, pc}
-        .align 5
-3:
-        ldm     r1, {r4-r8}
-        add     r1, r1, r2
-        ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
-        pld     [r1]
-        subs    r3, r3, #1
-        stm     r0, {r9-r12}
-        add     r0, r0, r2
-        bne     3b
-        pop     {r4-r11, pc}
-        .align 5
-4:
-        ldm     r1, {r4-r8}
-        add     r1, r1, r2
-        ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
-        pld     [r1]
-        subs    r3, r3, #1
-        stm     r0, {r9-r12}
-        add     r0, r0, r2
-        bne     4b
-        pop     {r4-r11,pc}
-endfunc
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels8_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r5,lr}
-        JMP_ALIGN r5, r1
-1:
-        ldm     r1, {r4-r5}
-        add     r1, r1, r2
-        subs    r3, r3, #1
-        pld     [r1]
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     1b
-        pop     {r4-r5,pc}
-        .align 5
-2:
-        ldm     r1, {r4-r5, r12}
-        add     r1, r1, r2
-        ALIGN_DWORD 1, r4, r5, r12
-        pld     [r1]
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     2b
-        pop     {r4-r5,pc}
-        .align 5
-3:
-        ldm     r1, {r4-r5, r12}
-        add     r1, r1, r2
-        ALIGN_DWORD 2, r4, r5, r12
-        pld     [r1]
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     3b
-        pop     {r4-r5,pc}
-        .align 5
-4:
-        ldm     r1, {r4-r5, r12}
-        add     r1, r1, r2
-        ALIGN_DWORD 3, r4, r5, r12
-        pld     [r1]
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     4b
-        pop     {r4-r5,pc}
-endfunc
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels8_x2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r10,lr}
-        ldr     r12, =0xfefefefe
-        JMP_ALIGN r5, r1
-1:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
-        pld     [r1]
-        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
-        subs    r3, r3, #1
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        bne     1b
-        pop     {r4-r10,pc}
-        .align 5
-2:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
-        ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
-        pld     [r1]
-        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     2b
-        pop     {r4-r10,pc}
-        .align 5
-3:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
-        ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
-        pld     [r1]
-        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     3b
-        pop     {r4-r10,pc}
-        .align 5
-4:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
-        pld     [r1]
-        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
-        subs    r3, r3, #1
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        bne     4b
-        pop     {r4-r10,pc}
-endfunc
-
-        .align 5
-function ff_put_no_rnd_pixels8_x2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r10,lr}
-        ldr     r12, =0xfefefefe
-        JMP_ALIGN r5, r1
-1:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
-        pld     [r1]
-        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
-        subs    r3, r3, #1
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        bne     1b
-        pop     {r4-r10,pc}
-        .align 5
-2:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
-        ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
-        pld     [r1]
-        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     2b
-        pop     {r4-r10,pc}
-        .align 5
-3:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
-        ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
-        pld     [r1]
-        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
-        subs    r3, r3, #1
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bne     3b
-        pop     {r4-r10,pc}
-        .align 5
-4:
-        ldm     r1, {r4-r5, r10}
-        add     r1, r1, r2
-        ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
-        pld     [r1]
-        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
-        subs    r3, r3, #1
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        bne     4b
-        pop     {r4-r10,pc}
-endfunc
-
-
-@ ----------------------------------------------------------------
-        .align 5
-function ff_put_pixels8_y2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r11,lr}
-        mov     r3, r3, lsr #1
-        ldr     r12, =0xfefefefe
-        JMP_ALIGN r5, r1
-1:
-        ldm     r1, {r4-r5}
-        add     r1, r1, r2
-6:      ldm     r1, {r6-r7}
-        add     r1, r1, r2
-        pld     [r1]
-        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
-        ldm     r1, {r4-r5}
-        add     r1, r1, r2
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        pld     [r1]
-        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
-        subs    r3, r3, #1
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-        .align 5
-2:
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 1, r4, r5, r6
-6:      ldm     r1, {r7-r9}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 1, r7, r8, r9
-        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 1, r4, r5, r6
-        subs    r3, r3, #1
-        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-        .align 5
-3:
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 2, r4, r5, r6
-6:      ldm     r1, {r7-r9}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 2, r7, r8, r9
-        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 2, r4, r5, r6
-        subs    r3, r3, #1
-        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-        .align 5
-4:
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 3, r4, r5, r6
-6:      ldm     r1, {r7-r9}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 3, r7, r8, r9
-        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 3, r4, r5, r6
-        subs    r3, r3, #1
-        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-endfunc
-
-        .align 5
-function ff_put_no_rnd_pixels8_y2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r11,lr}
-        mov     r3, r3, lsr #1
-        ldr     r12, =0xfefefefe
-        JMP_ALIGN r5, r1
-1:
-        ldm     r1, {r4-r5}
-        add     r1, r1, r2
-6:      ldm     r1, {r6-r7}
-        add     r1, r1, r2
-        pld     [r1]
-        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
-        ldm     r1, {r4-r5}
-        add     r1, r1, r2
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        pld     [r1]
-        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
-        subs    r3, r3, #1
-        stm     r0, {r8-r9}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-        .align 5
-2:
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 1, r4, r5, r6
-6:      ldm     r1, {r7-r9}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 1, r7, r8, r9
-        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 1, r4, r5, r6
-        subs    r3, r3, #1
-        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-        .align 5
-3:
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 2, r4, r5, r6
-6:      ldm     r1, {r7-r9}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 2, r7, r8, r9
-        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 2, r4, r5, r6
-        subs    r3, r3, #1
-        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-        .align 5
-4:
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 3, r4, r5, r6
-6:      ldm     r1, {r7-r9}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 3, r7, r8, r9
-        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        ldm     r1, {r4-r6}
-        add     r1, r1, r2
-        pld     [r1]
-        ALIGN_DWORD 3, r4, r5, r6
-        subs    r3, r3, #1
-        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
-        stm     r0, {r10-r11}
-        add     r0, r0, r2
-        bne     6b
-        pop     {r4-r11,pc}
-endfunc
-
-        .ltorg
-
-@ ----------------------------------------------------------------
-.macro RND_XY2_IT align, rnd
-        @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
-        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
-.if \align == 0
-        ldm     r1, {r6-r8}
-.elseif \align == 3
-        ldm     r1, {r5-r7}
-.else
-        ldm     r1, {r8-r10}
-.endif
-        add     r1, r1, r2
-        pld     [r1]
-.if \align == 0
-        ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
-.elseif \align == 1
-        ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
-        ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
-.elseif \align == 2
-        ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
-        ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
-.elseif \align == 3
-        ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
-.endif
-        ldr     r14, =0x03030303
-        tst     r3, #1
-        and     r8, r4, r14
-        and     r9, r5, r14
-        and     r10, r6, r14
-        and     r11, r7, r14
-        it      eq
-        andeq   r14, r14, r14, \rnd #1
-        add     r8, r8, r10
-        add     r9, r9, r11
-        ldr     r12, =0xfcfcfcfc >> 2
-        itt     eq
-        addeq   r8, r8, r14
-        addeq   r9, r9, r14
-        and     r4, r12, r4, lsr #2
-        and     r5, r12, r5, lsr #2
-        and     r6, r12, r6, lsr #2
-        and     r7, r12, r7, lsr #2
-        add     r10, r4, r6
-        add     r11, r5, r7
-        subs    r3, r3, #1
-.endm
-
-.macro RND_XY2_EXPAND align, rnd
-        RND_XY2_IT \align, \rnd
-6:      push    {r8-r11}
-        RND_XY2_IT \align, \rnd
-        pop     {r4-r7}
-        add     r4, r4, r8
-        add     r5, r5, r9
-        ldr     r14, =0x0f0f0f0f
-        add     r6, r6, r10
-        add     r7, r7, r11
-        and     r4, r14, r4, lsr #2
-        and     r5, r14, r5, lsr #2
-        add     r4, r4, r6
-        add     r5, r5, r7
-        stm     r0, {r4-r5}
-        add     r0, r0, r2
-        bge     6b
-        pop     {r4-r11,pc}
-.endm
-
-        .align 5
-function ff_put_pixels8_xy2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r11,lr} @ R14 is also called LR
-        JMP_ALIGN r5, r1
-1:      RND_XY2_EXPAND 0, lsl
-        .align 5
-2:      RND_XY2_EXPAND 1, lsl
-        .align 5
-3:      RND_XY2_EXPAND 2, lsl
-        .align 5
-4:      RND_XY2_EXPAND 3, lsl
-endfunc
-
-        .align 5
-function ff_put_no_rnd_pixels8_xy2_arm, export=1
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-        @ block = word aligned, pixles = unaligned
-        pld     [r1]
-        push    {r4-r11,lr}
-        JMP_ALIGN r5, r1
-1:      RND_XY2_EXPAND 0, lsr
-        .align 5
-2:      RND_XY2_EXPAND 1, lsr
-        .align 5
-3:      RND_XY2_EXPAND 2, lsr
-        .align 5
-4:      RND_XY2_EXPAND 3, lsr
-endfunc
-
         .align 5
 @ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
 function ff_add_pixels_clamped_arm, export=1
...
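
The ARMv4 code above packs four pixels into each 32-bit register and averages them with the classic SWAR (SIMD-within-a-register) identities; the same macros reappear unchanged in the new hpeldsp assembly added at the end of this commit. As a C sketch of what RND_AVG32 and NO_RND_AVG32 compute per word (helper names here are illustrative; FFmpeg's generic C code relies on the same identities):

    #include <stdint.h>

    /* Rounded per-byte average, (a + b + 1) >> 1 in every byte lane:
     * (a | b) = (a & b) + (a ^ b), so subtracting the masked half of
     * a ^ b leaves (a & b) + ceil((a ^ b) / 2).  The 0xFEFEFEFE mask
     * keeps shifted bits from leaking into the byte lane below. */
    static uint32_t rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a | b) - (((a ^ b) & 0xFEFEFEFEu) >> 1);
    }

    /* Truncating per-byte average, (a + b) >> 1 in every byte lane.
     * Note the assembly implements an add here; the minus sign in the
     * NO_RND_AVG32 block comment appears to be a typo. */
    static uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a & b) + (((a ^ b) & 0xFEFEFEFEu) >> 1);
    }

JMP_ALIGN then branches to one of four copies of each loop according to pixels & 3, so the unaligned source can be loaded word-wise and realigned in registers by the ALIGN_* macros.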
@@ -20,244 +20,6 @@
 
 #include "libavutil/arm/asm.S"
 
-.macro call_2x_pixels type, subp
-function ff_\type\()_pixels16\subp\()_armv6, export=1
-        push    {r0-r3, lr}
-        bl      ff_\type\()_pixels8\subp\()_armv6
-        pop     {r0-r3, lr}
-        add     r0, r0, #8
-        add     r1, r1, #8
-        b       ff_\type\()_pixels8\subp\()_armv6
-endfunc
-.endm
-
-call_2x_pixels avg
-call_2x_pixels put, _x2
-call_2x_pixels put, _y2
-call_2x_pixels put, _x2_no_rnd
-call_2x_pixels put, _y2_no_rnd
-
-function ff_put_pixels16_armv6, export=1
-        push    {r4-r11}
-1:
-        ldr     r5, [r1, #4]
-        ldr     r6, [r1, #8]
-        ldr     r7, [r1, #12]
-        ldr_post r4, r1, r2
-        strd    r6, r7, [r0, #8]
-        ldr     r9, [r1, #4]
-        strd_post r4, r5, r0, r2
-        ldr     r10, [r1, #8]
-        ldr     r11, [r1, #12]
-        ldr_post r8, r1, r2
-        strd    r10, r11, [r0, #8]
-        subs    r3, r3, #2
-        strd_post r8, r9, r0, r2
-        bne     1b
-
-        pop     {r4-r11}
-        bx      lr
-endfunc
-
-function ff_put_pixels8_armv6, export=1
-        push    {r4-r7}
-1:
-        ldr     r5, [r1, #4]
-        ldr_post r4, r1, r2
-        ldr     r7, [r1, #4]
-        strd_post r4, r5, r0, r2
-        ldr_post r6, r1, r2
-        subs    r3, r3, #2
-        strd_post r6, r7, r0, r2
-        bne     1b
-
-        pop     {r4-r7}
-        bx      lr
-endfunc
-
-function ff_put_pixels8_x2_armv6, export=1
-        push    {r4-r11, lr}
-        mov     r12, #1
-        orr     r12, r12, r12, lsl #8
-        orr     r12, r12, r12, lsl #16
-1:
-        ldr     r4, [r1]
-        subs    r3, r3, #2
-        ldr     r5, [r1, #4]
-        ldr     r7, [r1, #5]
-        lsr     r6, r4, #8
-        ldr_pre r8, r1, r2
-        orr     r6, r6, r5, lsl #24
-        ldr     r9, [r1, #4]
-        ldr     r11, [r1, #5]
-        lsr     r10, r8, #8
-        add     r1, r1, r2
-        orr     r10, r10, r9, lsl #24
-        eor     r14, r4, r6
-        uhadd8  r4, r4, r6
-        eor     r6, r5, r7
-        uhadd8  r5, r5, r7
-        and     r14, r14, r12
-        and     r6, r6, r12
-        uadd8   r4, r4, r14
-        eor     r14, r8, r10
-        uadd8   r5, r5, r6
-        eor     r6, r9, r11
-        uhadd8  r8, r8, r10
-        and     r14, r14, r12
-        uhadd8  r9, r9, r11
-        and     r6, r6, r12
-        uadd8   r8, r8, r14
-        strd_post r4, r5, r0, r2
-        uadd8   r9, r9, r6
-        strd_post r8, r9, r0, r2
-        bne     1b
-
-        pop     {r4-r11, pc}
-endfunc
-
-function ff_put_pixels8_y2_armv6, export=1
-        push    {r4-r11}
-        mov     r12, #1
-        orr     r12, r12, r12, lsl #8
-        orr     r12, r12, r12, lsl #16
-        ldr     r4, [r1]
-        ldr     r5, [r1, #4]
-        ldr_pre r6, r1, r2
-        ldr     r7, [r1, #4]
-1:
-        subs    r3, r3, #2
-        uhadd8  r8, r4, r6
-        eor     r10, r4, r6
-        uhadd8  r9, r5, r7
-        eor     r11, r5, r7
-        and     r10, r10, r12
-        ldr_pre r4, r1, r2
-        uadd8   r8, r8, r10
-        and     r11, r11, r12
-        uadd8   r9, r9, r11
-        ldr     r5, [r1, #4]
-        uhadd8  r10, r4, r6
-        eor     r6, r4, r6
-        uhadd8  r11, r5, r7
-        and     r6, r6, r12
-        eor     r7, r5, r7
-        uadd8   r10, r10, r6
-        and     r7, r7, r12
-        ldr_pre r6, r1, r2
-        uadd8   r11, r11, r7
-        strd_post r8, r9, r0, r2
-        ldr     r7, [r1, #4]
-        strd_post r10, r11, r0, r2
-        bne     1b
-
-        pop     {r4-r11}
-        bx      lr
-endfunc
-
-function ff_put_pixels8_x2_no_rnd_armv6, export=1
-        push    {r4-r9, lr}
-1:
-        subs    r3, r3, #2
-        ldr     r4, [r1]
-        ldr     r5, [r1, #4]
-        ldr     r7, [r1, #5]
-        ldr_pre r8, r1, r2
-        ldr     r9, [r1, #4]
-        ldr     r14, [r1, #5]
-        add     r1, r1, r2
-        lsr     r6, r4, #8
-        orr     r6, r6, r5, lsl #24
-        lsr     r12, r8, #8
-        orr     r12, r12, r9, lsl #24
-        uhadd8  r4, r4, r6
-        uhadd8  r5, r5, r7
-        uhadd8  r8, r8, r12
-        uhadd8  r9, r9, r14
-        stm     r0, {r4,r5}
-        add     r0, r0, r2
-        stm     r0, {r8,r9}
-        add     r0, r0, r2
-        bne     1b
-
-        pop     {r4-r9, pc}
-endfunc
-
-function ff_put_pixels8_y2_no_rnd_armv6, export=1
-        push    {r4-r9, lr}
-        ldr     r4, [r1]
-        ldr     r5, [r1, #4]
-        ldr_pre r6, r1, r2
-        ldr     r7, [r1, #4]
-1:
-        subs    r3, r3, #2
-        uhadd8  r8, r4, r6
-        ldr_pre r4, r1, r2
-        uhadd8  r9, r5, r7
-        ldr     r5, [r1, #4]
-        uhadd8  r12, r4, r6
-        ldr_pre r6, r1, r2
-        uhadd8  r14, r5, r7
-        ldr     r7, [r1, #4]
-        stm     r0, {r8,r9}
-        add     r0, r0, r2
-        stm     r0, {r12,r14}
-        add     r0, r0, r2
-        bne     1b
-
-        pop     {r4-r9, pc}
-endfunc
-
-function ff_avg_pixels8_armv6, export=1
-        pld     [r1, r2]
-        push    {r4-r10, lr}
-        mov     lr, #1
-        orr     lr, lr, lr, lsl #8
-        orr     lr, lr, lr, lsl #16
-        ldrd    r4, r5, [r0]
-        ldr     r10, [r1, #4]
-        ldr_post r9, r1, r2
-        subs    r3, r3, #2
-1:
-        pld     [r1, r2]
-        eor     r8, r4, r9
-        uhadd8  r4, r4, r9
-        eor     r12, r5, r10
-        ldrd_reg r6, r7, r0, r2
-        uhadd8  r5, r5, r10
-        and     r8, r8, lr
-        ldr     r10, [r1, #4]
-        and     r12, r12, lr
-        uadd8   r4, r4, r8
-        ldr_post r9, r1, r2
-        eor     r8, r6, r9
-        uadd8   r5, r5, r12
-        pld     [r1, r2, lsl #1]
-        eor     r12, r7, r10
-        uhadd8  r6, r6, r9
-        strd_post r4, r5, r0, r2
-        uhadd8  r7, r7, r10
-        beq     2f
-        and     r8, r8, lr
-        ldrd_reg r4, r5, r0, r2
-        uadd8   r6, r6, r8
-        ldr     r10, [r1, #4]
-        and     r12, r12, lr
-        subs    r3, r3, #2
-        uadd8   r7, r7, r12
-        ldr_post r9, r1, r2
-        strd_post r6, r7, r0, r2
-        b       1b
-2:
-        and     r8, r8, lr
-        and     r12, r12, lr
-        uadd8   r6, r6, r8
-        uadd8   r7, r7, r12
-        strd_post r6, r7, r0, r2
-
-        pop     {r4-r10, pc}
-endfunc
-
 function ff_add_pixels_clamped_armv6, export=1
         push    {r4-r8,lr}
         mov     r3, #8
...
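
On ARMv6 the same averages are done with the SIMD-in-registers instructions instead of masks and shifts: uhadd8 is a truncating per-byte halving add, and the eor/and/uadd8 sequence restores the rounding bit that truncation dropped. A sketch of one byte lane, assuming the bias register (r12 or lr above) holds 0x01010101 as set up by the mov/orr prologue:

    #include <stdint.h>

    /* Models: eor t, a, b; uhadd8 d, a, b; and t, t, 0x01010101; uadd8 d, d, t.
     * The low bit of a ^ b is exactly the carry a truncating average loses,
     * so adding it back turns (a + b) >> 1 into (a + b + 1) >> 1. */
    static uint8_t rounded_halve(uint8_t a, uint8_t b)
    {
        uint8_t half = (uint8_t)((a + b) >> 1); /* uhadd8 lane result */
        return (uint8_t)(half + ((a ^ b) & 1)); /* uadd8 correction   */
    }

The _no_rnd variants simply omit the correction and keep the bare uhadd8, which is why ff_put_pixels8_x2_no_rnd_armv6 above needs no bias register at all.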
@@ -30,24 +30,6 @@ void ff_simple_idct_arm(int16_t *data);
 static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
 static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
 
-void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-
-void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-
-void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-
-CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,         8)
-CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,         8)
-CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,        8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm,  8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm,  8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm, 8)
-
 void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
                                int line_size);
 
...
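
CALL_2X_PIXELS synthesizes the 16-pixel-wide entry points from the 8-pixel routines by running the narrow function on each half of the block. The real macro lives in a shared FFmpeg header; its effect is roughly the following (a sketch, with the parameter names chosen here):

    #include <stddef.h>
    #include <stdint.h>

    /* Define function `a` as two side-by-side calls to `b`, offset by n. */
    #define CALL_2X_PIXELS(a, b, n)                                   \
    static void a(uint8_t *block, const uint8_t *pixels,              \
                  ptrdiff_t line_size, int h)                         \
    {                                                                 \
        b(block,     pixels,     line_size, h);                       \
        b(block + n, pixels + n, line_size, h);                       \
    }

The call_2x_pixels assembler macro removed from the ARMv6 file above does the same job in assembly, calling the 8-pixel routine once and then tail-branching to it again 8 bytes further into the block.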
@@ -76,7 +58,6 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
 
 av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
 {
-    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
     int cpu_flags = av_get_cpu_flags();
 
     ff_put_pixels_clamped = c->put_pixels_clamped;
...
@@ -99,26 +80,6 @@ av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
 
     c->add_pixels_clamped = ff_add_pixels_clamped_arm;
 
-    if (!high_bit_depth) {
-        c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
-        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
-        c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
-        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
-
-        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
-        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
-        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
-        c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
-        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
-        c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
-    }
-
     if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx);
     if (have_armv6(cpu_flags))   ff_dsputil_init_armv6(c, avctx);
     if (have_neon(cpu_flags))    ff_dsputil_init_neon(c, avctx);
...
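
For orientation, the tables being emptied here are indexed by block size and half-pel position: put_pixels_tab[0][*] holds the 16x16 functions and put_pixels_tab[1][*] the 8x8 ones, with index 0 for the integer position and 1, 2, 3 for the horizontal, vertical and diagonal half-pel cases. A caller then dispatches with something like the following (hypothetical wrapper; it assumes the DSPContext definition and op_pixels_func signature visible in the declarations above):

    /* dxy packs the half-pel flags: 0 = none, 1 = x, 2 = y, 3 = both. */
    static void mc_put(DSPContext *c, uint8_t *dst, const uint8_t *src,
                       ptrdiff_t stride, int h, int is_8x8, int dxy)
    {
        c->put_pixels_tab[is_8x8][dxy](dst, src, stride, h);
    }

After this commit the equivalent tables are filled in by the new hpeldsp init code instead, which is why only the clamped-pixel hooks and the per-CPU init calls remain in this file.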
@@ -27,24 +27,6 @@ void ff_simple_idct_armv6(int16_t *data);
 void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
 void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
 
-void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
 void ff_add_pixels_clamped_armv6(const int16_t *block,
                                  uint8_t *restrict pixels,
                                  int line_size);
...
@@ -82,29 +64,6 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx)
         c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
     }
 
-    if (!high_bit_depth) {
-        c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
-/*      c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
-        c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
-/*      c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
-
-        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
-        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
-        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
-/*      c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
-        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
-/*      c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
-
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
-    }
-
     if (!high_bit_depth)
         c->get_pixels = ff_get_pixels_armv6;
     c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
...
@@ -32,33 +32,6 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
 void ff_clear_block_neon(int16_t *block);
 void ff_clear_blocks_neon(int16_t *blocks);
 
-void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
 void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
 void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
 void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
...
@@ -92,38 +65,6 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     if (!high_bit_depth) {
         c->clear_block  = ff_clear_block_neon;
        c->clear_blocks = ff_clear_blocks_neon;
-
-        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
-        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
-        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
-        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
-
-        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
-        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
-        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
-        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
-        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
-        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
-
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
-        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
-        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
-        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
-        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
-        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
-
-        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
-        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
-        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
-        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
     }
 
     c->add_pixels_clamped = ff_add_pixels_clamped_neon;
...
@@ -37,394 +37,6 @@ function ff_clear_blocks_neon, export=1
         bx      lr
 endfunc
 
-.macro pixels16 rnd=1, avg=0
-  .if \avg
-        mov     r12, r0
-  .endif
-1:      vld1.8  {q0}, [r1], r2
-        vld1.8  {q1}, [r1], r2
-        vld1.8  {q2}, [r1], r2
-        pld     [r1, r2, lsl #2]
-        vld1.8  {q3}, [r1], r2
-        pld     [r1]
-        pld     [r1, r2]
-        pld     [r1, r2, lsl #1]
-  .if \avg
-        vld1.8  {q8}, [r12,:128], r2
-        vrhadd.u8 q0, q0, q8
-        vld1.8  {q9}, [r12,:128], r2
-        vrhadd.u8 q1, q1, q9
-        vld1.8  {q10}, [r12,:128], r2
-        vrhadd.u8 q2, q2, q10
-        vld1.8  {q11}, [r12,:128], r2
-        vrhadd.u8 q3, q3, q11
-  .endif
-        subs    r3, r3, #4
-        vst1.64 {q0}, [r0,:128], r2
-        vst1.64 {q1}, [r0,:128], r2
-        vst1.64 {q2}, [r0,:128], r2
-        vst1.64 {q3}, [r0,:128], r2
-        bne     1b
-        bx      lr
-.endm
-
-.macro pixels16_x2 rnd=1, avg=0
-1:      vld1.8  {d0-d2}, [r1], r2
-        vld1.8  {d4-d6}, [r1], r2
-        pld     [r1]
-        pld     [r1, r2]
-        subs    r3, r3, #2
-        vext.8  q1, q0, q1, #1
-        avg     q0, q0, q1
-        vext.8  q3, q2, q3, #1
-        avg     q2, q2, q3
-  .if \avg
-        vld1.8  {q1}, [r0,:128], r2
-        vld1.8  {q3}, [r0,:128]
-        vrhadd.u8 q0, q0, q1
-        vrhadd.u8 q2, q2, q3
-        sub     r0, r0, r2
-  .endif
-        vst1.8  {q0}, [r0,:128], r2
-        vst1.8  {q2}, [r0,:128], r2
-        bne     1b
-        bx      lr
-.endm
-
-.macro pixels16_y2 rnd=1, avg=0
-        sub     r3, r3, #2
-        vld1.8  {q0}, [r1], r2
-        vld1.8  {q1}, [r1], r2
-1:      subs    r3, r3, #2
-        avg     q2, q0, q1
-        vld1.8  {q0}, [r1], r2
-        avg     q3, q0, q1
-        vld1.8  {q1}, [r1], r2
-        pld     [r1]
-        pld     [r1, r2]
-  .if \avg
-        vld1.8  {q8}, [r0,:128], r2
-        vld1.8  {q9}, [r0,:128]
-        vrhadd.u8 q2, q2, q8
-        vrhadd.u8 q3, q3, q9
-        sub     r0, r0, r2
-  .endif
-        vst1.8  {q2}, [r0,:128], r2
-        vst1.8  {q3}, [r0,:128], r2
-        bne     1b
-
-        avg     q2, q0, q1
-        vld1.8  {q0}, [r1], r2
-        avg     q3, q0, q1
-  .if \avg
-        vld1.8  {q8}, [r0,:128], r2
-        vld1.8  {q9}, [r0,:128]
-        vrhadd.u8 q2, q2, q8
-        vrhadd.u8 q3, q3, q9
-        sub     r0, r0, r2
-  .endif
-        vst1.8  {q2}, [r0,:128], r2
-        vst1.8  {q3}, [r0,:128], r2
-
-        bx      lr
-.endm
-
-.macro pixels16_xy2 rnd=1, avg=0
-        sub     r3, r3, #2
-        vld1.8  {d0-d2}, [r1], r2
-        vld1.8  {d4-d6}, [r1], r2
-NRND    vmov.i16 q13, #1
-        pld     [r1]
-        pld     [r1, r2]
-        vext.8  q1, q0, q1, #1
-        vext.8  q3, q2, q3, #1
-        vaddl.u8 q8, d0, d2
-        vaddl.u8 q10, d1, d3
-        vaddl.u8 q9, d4, d6
-        vaddl.u8 q11, d5, d7
-1:      subs    r3, r3, #2
-        vld1.8  {d0-d2}, [r1], r2
-        vadd.u16 q12, q8, q9
-        pld     [r1]
-NRND    vadd.u16 q12, q12, q13
-        vext.8  q15, q0, q1, #1
-        vadd.u16 q1, q10, q11
-        shrn    d28, q12, #2
-NRND    vadd.u16 q1, q1, q13
-        shrn    d29, q1, #2
-  .if \avg
-        vld1.8  {q8}, [r0,:128]
-        vrhadd.u8 q14, q14, q8
-  .endif
-        vaddl.u8 q8, d0, d30
-        vld1.8  {d2-d4}, [r1], r2
-        vaddl.u8 q10, d1, d31
-        vst1.8  {q14}, [r0,:128], r2
-        vadd.u16 q12, q8, q9
-        pld     [r1, r2]
-NRND    vadd.u16 q12, q12, q13
-        vext.8  q2, q1, q2, #1
-        vadd.u16 q0, q10, q11
-        shrn    d30, q12, #2
-NRND    vadd.u16 q0, q0, q13
-        shrn    d31, q0, #2
-  .if \avg
-        vld1.8  {q9}, [r0,:128]
-        vrhadd.u8 q15, q15, q9
-  .endif
-        vaddl.u8 q9, d2, d4
-        vaddl.u8 q11, d3, d5
-        vst1.8  {q15}, [r0,:128], r2
-        bgt     1b
-
-        vld1.8  {d0-d2}, [r1], r2
-        vadd.u16 q12, q8, q9
-NRND    vadd.u16 q12, q12, q13
-        vext.8  q15, q0, q1, #1
-        vadd.u16 q1, q10, q11
-        shrn    d28, q12, #2
-NRND    vadd.u16 q1, q1, q13
-        shrn    d29, q1, #2
-  .if \avg
-        vld1.8  {q8}, [r0,:128]
-        vrhadd.u8 q14, q14, q8
-  .endif
-        vaddl.u8 q8, d0, d30
-        vaddl.u8 q10, d1, d31
-        vst1.8  {q14}, [r0,:128], r2
-        vadd.u16 q12, q8, q9
-NRND    vadd.u16 q12, q12, q13
-        vadd.u16 q0, q10, q11
-        shrn    d30, q12, #2
-NRND    vadd.u16 q0, q0, q13
-        shrn    d31, q0, #2
-  .if \avg
-        vld1.8  {q9}, [r0,:128]
-        vrhadd.u8 q15, q15, q9
-  .endif
-        vst1.8  {q15}, [r0,:128], r2
-
-        bx      lr
-.endm
-
-.macro pixels8 rnd=1, avg=0
-1:      vld1.8  {d0}, [r1], r2
-        vld1.8  {d1}, [r1], r2
-        vld1.8  {d2}, [r1], r2
-        pld     [r1, r2, lsl #2]
-        vld1.8  {d3}, [r1], r2
-        pld     [r1]
-        pld     [r1, r2]
-        pld     [r1, r2, lsl #1]
-  .if \avg
-        vld1.8  {d4}, [r0,:64], r2
-        vrhadd.u8 d0, d0, d4
-        vld1.8  {d5}, [r0,:64], r2
-        vrhadd.u8 d1, d1, d5
-        vld1.8  {d6}, [r0,:64], r2
-        vrhadd.u8 d2, d2, d6
-        vld1.8  {d7}, [r0,:64], r2
-        vrhadd.u8 d3, d3, d7
-        sub     r0, r0, r2, lsl #2
-  .endif
-        subs    r3, r3, #4
-        vst1.8  {d0}, [r0,:64], r2
-        vst1.8  {d1}, [r0,:64], r2
-        vst1.8  {d2}, [r0,:64], r2
-        vst1.8  {d3}, [r0,:64], r2
-        bne     1b
-        bx      lr
-.endm
-
-.macro pixels8_x2 rnd=1, avg=0
-1:      vld1.8  {q0}, [r1], r2
-        vext.8  d1, d0, d1, #1
-        vld1.8  {q1}, [r1], r2
-        vext.8  d3, d2, d3, #1
-        pld     [r1]
-        pld     [r1, r2]
-        subs    r3, r3, #2
-        vswp    d1, d2
-        avg     q0, q0, q1
-  .if \avg
-        vld1.8  {d4}, [r0,:64], r2
-        vld1.8  {d5}, [r0,:64]
-        vrhadd.u8 q0, q0, q2
-        sub     r0, r0, r2
-  .endif
-        vst1.8  {d0}, [r0,:64], r2
-        vst1.8  {d1}, [r0,:64], r2
-        bne     1b
-        bx      lr
-.endm
-
-.macro pixels8_y2 rnd=1, avg=0
-        sub     r3, r3, #2
-        vld1.8  {d0}, [r1], r2
-        vld1.8  {d1}, [r1], r2
-1:      subs    r3, r3, #2
-        avg     d4, d0, d1
-        vld1.8  {d0}, [r1], r2
-        avg     d5, d0, d1
-        vld1.8  {d1}, [r1], r2
-        pld     [r1]
-        pld     [r1, r2]
-  .if \avg
-        vld1.8  {d2}, [r0,:64], r2
-        vld1.8  {d3}, [r0,:64]
-        vrhadd.u8 q2, q2, q1
-        sub     r0, r0, r2
-  .endif
-        vst1.8  {d4}, [r0,:64], r2
-        vst1.8  {d5}, [r0,:64], r2
-        bne     1b
-
-        avg     d4, d0, d1
-        vld1.8  {d0}, [r1], r2
-        avg     d5, d0, d1
-  .if \avg
-        vld1.8  {d2}, [r0,:64], r2
-        vld1.8  {d3}, [r0,:64]
-        vrhadd.u8 q2, q2, q1
-        sub     r0, r0, r2
-  .endif
-        vst1.8  {d4}, [r0,:64], r2
-        vst1.8  {d5}, [r0,:64], r2
-
-        bx      lr
-.endm
-
-.macro pixels8_xy2 rnd=1, avg=0
-        sub     r3, r3, #2
-        vld1.8  {q0}, [r1], r2
-        vld1.8  {q1}, [r1], r2
-NRND    vmov.i16 q11, #1
-        pld     [r1]
-        pld     [r1, r2]
-        vext.8  d4, d0, d1, #1
-        vext.8  d6, d2, d3, #1
-        vaddl.u8 q8, d0, d4
-        vaddl.u8 q9, d2, d6
-1:      subs    r3, r3, #2
-        vld1.8  {q0}, [r1], r2
-        pld     [r1]
-        vadd.u16 q10, q8, q9
-        vext.8  d4, d0, d1, #1
-NRND    vadd.u16 q10, q10, q11
-        vaddl.u8 q8, d0, d4
-        shrn    d5, q10, #2
-        vld1.8  {q1}, [r1], r2
-        vadd.u16 q10, q8, q9
-        pld     [r1, r2]
-  .if \avg
-        vld1.8  {d7}, [r0,:64]
-        vrhadd.u8 d5, d5, d7
-  .endif
-NRND    vadd.u16 q10, q10, q11
-        vst1.8  {d5}, [r0,:64], r2
-        shrn    d7, q10, #2
-  .if \avg
-        vld1.8  {d5}, [r0,:64]
-        vrhadd.u8 d7, d7, d5
-  .endif
-        vext.8  d6, d2, d3, #1
-        vaddl.u8 q9, d2, d6
-        vst1.8  {d7}, [r0,:64], r2
-        bgt     1b
-
-        vld1.8  {q0}, [r1], r2
-        vadd.u16 q10, q8, q9
-        vext.8  d4, d0, d1, #1
-NRND    vadd.u16 q10, q10, q11
-        vaddl.u8 q8, d0, d4
-        shrn    d5, q10, #2
-        vadd.u16 q10, q8, q9
-  .if \avg
-        vld1.8  {d7}, [r0,:64]
-        vrhadd.u8 d5, d5, d7
-  .endif
-NRND    vadd.u16 q10, q10, q11
-        vst1.8  {d5}, [r0,:64], r2
-        shrn    d7, q10, #2
-  .if \avg
-        vld1.8  {d5}, [r0,:64]
-        vrhadd.u8 d7, d7, d5
-  .endif
-        vst1.8  {d7}, [r0,:64], r2
-
-        bx      lr
-.endm
-
-.macro pixfunc pfx, name, suf, rnd=1, avg=0
-  .if \rnd
-    .macro avg rd, rn, rm
-        vrhadd.u8 \rd, \rn, \rm
-    .endm
-    .macro shrn rd, rn, rm
-        vrshrn.u16 \rd, \rn, \rm
-    .endm
-    .macro NRND insn:vararg
-    .endm
-  .else
-    .macro avg rd, rn, rm
-        vhadd.u8 \rd, \rn, \rm
-    .endm
-    .macro shrn rd, rn, rm
-        vshrn.u16 \rd, \rn, \rm
-    .endm
-    .macro NRND insn:vararg
-        \insn
-    .endm
-  .endif
-function ff_\pfx\name\suf\()_neon, export=1
-        \name   \rnd, \avg
-endfunc
-        .purgem avg
-        .purgem shrn
-        .purgem NRND
-.endm
-
-.macro pixfunc2 pfx, name, avg=0
-        pixfunc \pfx, \name, rnd=1, avg=\avg
-        pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
-.endm
-
-function ff_put_h264_qpel16_mc00_neon, export=1
-        mov     r3, #16
-endfunc
-
-        pixfunc put_, pixels16, avg=0
-        pixfunc2 put_, pixels16_x2, avg=0
-        pixfunc2 put_, pixels16_y2, avg=0
-        pixfunc2 put_, pixels16_xy2, avg=0
-
-function ff_avg_h264_qpel16_mc00_neon, export=1
-        mov     r3, #16
-endfunc
-
-        pixfunc avg_, pixels16, avg=1
-        pixfunc2 avg_, pixels16_x2, avg=1
-        pixfunc2 avg_, pixels16_y2, avg=1
-        pixfunc2 avg_, pixels16_xy2, avg=1
-
-function ff_put_h264_qpel8_mc00_neon, export=1
-        mov     r3, #8
-endfunc
-
-        pixfunc put_, pixels8, avg=0
-        pixfunc2 put_, pixels8_x2, avg=0
-        pixfunc2 put_, pixels8_y2, avg=0
-        pixfunc2 put_, pixels8_xy2, avg=0
-
-function ff_avg_h264_qpel8_mc00_neon, export=1
-        mov     r3, #8
-endfunc
-
-        pixfunc avg_, pixels8, avg=1
-        pixfunc avg_, pixels8_x2, avg=1
-        pixfunc avg_, pixels8_y2, avg=1
-        pixfunc avg_, pixels8_xy2, avg=1
-
 function ff_put_pixels_clamped_neon, export=1
         vld1.16 {d16-d19}, [r0,:128]!
         vqmovun.s16 d0, q8
...
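
Everything removed from the NEON file above is template machinery: pixfunc instantiates one function body twice, once with rounding and once without, by redefining three helper macros. avg becomes vrhadd.u8 (rounding halving add) or vhadd.u8 (truncating), shrn becomes vrshrn.u16 or vshrn.u16, and NRND either swallows its argument or emits it, which in the no-rounding build adds a bias of 1 before the truncating narrowing shift. The net per-pixel arithmetic of the xy2 variants, sketched in C:

    #include <stdint.h>

    /* a, b, c, d are the four neighbouring source bytes of one output pixel.
     * rnd = 1 models vrshrn (rounds, i.e. adds 2 before >> 2); rnd = 0 models
     * the NRND add of 1 followed by the truncating vshrn. */
    static uint8_t xy2_pixel(uint8_t a, uint8_t b, uint8_t c, uint8_t d, int rnd)
    {
        unsigned sum = (unsigned)a + b + c + d;
        return (uint8_t)((sum + (rnd ? 2 : 1)) >> 2);
    }

The ff_*_h264_qpel*_mc00_neon entry points are another small trick visible above: each just loads the block height into r3 and falls through into the plain pixels16/pixels8 function emitted immediately after it.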
@@ -0,0 +1,611 @@ |
0 |
+@ |
|
1 |
+@ ARMv4 optimized DSP utils |
|
2 |
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> |
|
3 |
+@ |
|
4 |
+@ This file is part of Libav. |
|
5 |
+@ |
|
6 |
+@ Libav is free software; you can redistribute it and/or |
|
7 |
+@ modify it under the terms of the GNU Lesser General Public |
|
8 |
+@ License as published by the Free Software Foundation; either |
|
9 |
+@ version 2.1 of the License, or (at your option) any later version. |
|
10 |
+@ |
|
11 |
+@ Libav is distributed in the hope that it will be useful, |
|
12 |
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
14 |
+@ Lesser General Public License for more details. |
|
15 |
+@ |
|
16 |
+@ You should have received a copy of the GNU Lesser General Public |
|
17 |
+@ License along with Libav; if not, write to the Free Software |
|
18 |
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
19 |
+@ |
|
20 |
+ |
|
21 |
+#include "config.h" |
|
22 |
+#include "libavutil/arm/asm.S" |
|
23 |
+ |
|
24 |
+#if !HAVE_ARMV5TE_EXTERNAL |
|
25 |
+#define pld @ |
|
26 |
+#endif |
|
27 |
+ |
|
28 |
+.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 |
|
29 |
+ mov \Rd0, \Rn0, lsr #(\shift * 8) |
|
30 |
+ mov \Rd1, \Rn1, lsr #(\shift * 8) |
|
31 |
+ mov \Rd2, \Rn2, lsr #(\shift * 8) |
|
32 |
+ mov \Rd3, \Rn3, lsr #(\shift * 8) |
|
33 |
+ orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) |
|
34 |
+ orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) |
|
35 |
+ orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) |
|
36 |
+ orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) |
|
37 |
+.endm |
|
38 |
+.macro ALIGN_DWORD shift, R0, R1, R2 |
|
39 |
+ mov \R0, \R0, lsr #(\shift * 8) |
|
40 |
+ orr \R0, \R0, \R1, lsl #(32 - \shift * 8) |
|
41 |
+ mov \R1, \R1, lsr #(\shift * 8) |
|
42 |
+ orr \R1, \R1, \R2, lsl #(32 - \shift * 8) |
|
43 |
+.endm |
|
44 |
+.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 |
|
45 |
+ mov \Rdst0, \Rsrc0, lsr #(\shift * 8) |
|
46 |
+ mov \Rdst1, \Rsrc1, lsr #(\shift * 8) |
|
47 |
+ orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) |
|
48 |
+ orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) |
|
49 |
+.endm |
|
50 |
+ |
|
51 |
+.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask |
|
52 |
+ @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) |
|
53 |
+ @ Rmask = 0xFEFEFEFE |
|
54 |
+ @ Rn = destroy |
|
55 |
+ eor \Rd0, \Rn0, \Rm0 |
|
56 |
+ eor \Rd1, \Rn1, \Rm1 |
|
57 |
+ orr \Rn0, \Rn0, \Rm0 |
|
58 |
+ orr \Rn1, \Rn1, \Rm1 |
|
59 |
+ and \Rd0, \Rd0, \Rmask |
|
60 |
+ and \Rd1, \Rd1, \Rmask |
|
61 |
+ sub \Rd0, \Rn0, \Rd0, lsr #1 |
|
62 |
+ sub \Rd1, \Rn1, \Rd1, lsr #1 |
|
63 |
+.endm |
|
64 |
+ |
|
65 |
+.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask |
|
66 |
+ @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) |
|
67 |
+ @ Rmask = 0xFEFEFEFE |
|
68 |
+ @ Rn = destroy |
|
69 |
+ eor \Rd0, \Rn0, \Rm0 |
|
70 |
+ eor \Rd1, \Rn1, \Rm1 |
|
71 |
+ and \Rn0, \Rn0, \Rm0 |
|
72 |
+ and \Rn1, \Rn1, \Rm1 |
|
73 |
+ and \Rd0, \Rd0, \Rmask |
|
74 |
+ and \Rd1, \Rd1, \Rmask |
|
75 |
+ add \Rd0, \Rn0, \Rd0, lsr #1 |
|
76 |
+ add \Rd1, \Rn1, \Rd1, lsr #1 |
|
77 |
+.endm |
|
78 |
+ |
|
79 |
+.macro JMP_ALIGN tmp, reg |
|
80 |
+ ands \tmp, \reg, #3 |
|
81 |
+ bic \reg, \reg, #3 |
|
82 |
+ beq 1f |
|
83 |
+ subs \tmp, \tmp, #1 |
|
84 |
+ beq 2f |
|
85 |
+ subs \tmp, \tmp, #1 |
|
86 |
+ beq 3f |
|
87 |
+ b 4f |
|
88 |
+.endm |
|
89 |
+ |
|
90 |
+@ ---------------------------------------------------------------- |
|
91 |
+ .align 5 |
|
92 |
+function ff_put_pixels16_arm, export=1 |
|
93 |
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
94 |
+ @ block = word aligned, pixles = unaligned |
|
95 |
+ pld [r1] |
|
96 |
+ push {r4-r11, lr} |
|
97 |
+ JMP_ALIGN r5, r1 |
|
98 |
+1: |
|
99 |
+ ldm r1, {r4-r7} |
|
100 |
+ add r1, r1, r2 |
|
101 |
+ stm r0, {r4-r7} |
|
102 |
+ pld [r1] |
|
103 |
+ subs r3, r3, #1 |
|
104 |
+ add r0, r0, r2 |
|
105 |
+ bne 1b |
|
106 |
+ pop {r4-r11, pc} |
|
107 |
+ .align 5 |
|
108 |
+2: |
|
109 |
+ ldm r1, {r4-r8} |
|
110 |
+ add r1, r1, r2 |
|
111 |
+ ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 |
|
112 |
+ pld [r1] |
|
113 |
+ subs r3, r3, #1 |
|
114 |
+ stm r0, {r9-r12} |
|
115 |
+ add r0, r0, r2 |
|
116 |
+ bne 2b |
|
117 |
+ pop {r4-r11, pc} |
|
118 |
+ .align 5 |
|
119 |
+3: |
|
120 |
+ ldm r1, {r4-r8} |
|
121 |
+ add r1, r1, r2 |
|
122 |
+ ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 |
|
123 |
+ pld [r1] |
|
124 |
+ subs r3, r3, #1 |
|
125 |
+ stm r0, {r9-r12} |
|
126 |
+ add r0, r0, r2 |
|
127 |
+ bne 3b |
|
128 |
+ pop {r4-r11, pc} |
|
129 |
+ .align 5 |
|
130 |
+4: |
|
131 |
+ ldm r1, {r4-r8} |
|
132 |
+ add r1, r1, r2 |
|
133 |
+ ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 |
|
134 |
+ pld [r1] |
|
135 |
+ subs r3, r3, #1 |
|
136 |
+ stm r0, {r9-r12} |
|
137 |
+ add r0, r0, r2 |
|
138 |
+ bne 4b |
|
139 |
+ pop {r4-r11,pc} |
|
140 |
+endfunc |
|
141 |
+ |
|
142 |
+@ ---------------------------------------------------------------- |
|
143 |
+ .align 5 |
|
144 |
+function ff_put_pixels8_arm, export=1 |
|
145 |
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) |
|
146 |
+ @ block = word aligned, pixles = unaligned |
|
147 |
+ pld [r1] |
|
148 |
+ push {r4-r5,lr} |
|
149 |
+ JMP_ALIGN r5, r1 |
|
150 |
+1: |
|
151 |
+ ldm r1, {r4-r5} |
|
152 |
+ add r1, r1, r2 |
|
153 |
+ subs r3, r3, #1 |
|
154 |
+ pld [r1] |
|
155 |
+ stm r0, {r4-r5} |
|
156 |
+ add r0, r0, r2 |
|
157 |
+ bne 1b |
|
158 |
+ pop {r4-r5,pc} |
|
159 |
+ .align 5 |
|
160 |
+2: |
|
161 |
+ ldm r1, {r4-r5, r12} |
|
162 |
+ add r1, r1, r2 |
|
163 |
+ ALIGN_DWORD 1, r4, r5, r12 |
|
164 |
+ pld [r1] |
|
165 |
+ subs r3, r3, #1 |
|
166 |
+ stm r0, {r4-r5} |
|
167 |
+ add r0, r0, r2 |
|
168 |
+ bne 2b |
|
169 |
+ pop {r4-r5,pc} |
|
170 |
+ .align 5 |
|
171 |
+3: |
|
172 |
+ ldm r1, {r4-r5, r12} |
|
173 |
+ add r1, r1, r2 |
|
174 |
+ ALIGN_DWORD 2, r4, r5, r12 |
|
175 |
+ pld [r1] |
|
176 |
+ subs r3, r3, #1 |
|
177 |
+ stm r0, {r4-r5} |
|
178 |
+ add r0, r0, r2 |
|
179 |
+ bne 3b |
|
180 |
+ pop {r4-r5,pc} |
|
181 |
+ .align 5 |
|
182 |
+4: |
|
183 |
+ ldm r1, {r4-r5, r12} |
|
184 |
+ add r1, r1, r2 |
|
185 |
+ ALIGN_DWORD 3, r4, r5, r12 |
|
186 |
+ pld [r1] |
|
187 |
+ subs r3, r3, #1 |
|
188 |
+ stm r0, {r4-r5} |
|
189 |
+ add r0, r0, r2 |
|
190 |
+ bne 4b |
|
191 |
+ pop {r4-r5,pc} |
|
192 |
+endfunc |
|
193 |
+ |
|
194 |
+@ ----------------------------------------------------------------
+        .align 5
+function ff_put_pixels8_x2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld      [r1]
+        push     {r4-r10, lr}
+        ldr      r12, =0xfefefefe
+        JMP_ALIGN r5, r1
+1:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+        pld      [r1]
+        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+        subs     r3, r3, #1
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        bne      1b
+        pop      {r4-r10, pc}
+        .align 5
+2:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+        ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
+        pld      [r1]
+        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+        subs     r3, r3, #1
+        stm      r0, {r4-r5}
+        add      r0, r0, r2
+        bne      2b
+        pop      {r4-r10, pc}
+        .align 5
+3:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
+        ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
+        pld      [r1]
+        RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+        subs     r3, r3, #1
+        stm      r0, {r4-r5}
+        add      r0, r0, r2
+        bne      3b
+        pop      {r4-r10, pc}
+        .align 5
+4:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
+        pld      [r1]
+        RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+        subs     r3, r3, #1
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        bne      4b
+        pop      {r4-r10, pc}
+endfunc
+
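The x2 case averages every pixel with its right-hand neighbour, which is why
each path builds two byte-shifted views of the same row and hands them to
RND_AVG32. A scalar reference of the intended result (assumed semantics,
matching the half-pel definition used by hpeldsp):

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x] + pixels[x + 1] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }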
+        .align 5
+function ff_put_no_rnd_pixels8_x2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld      [r1]
+        push     {r4-r10, lr}
+        ldr      r12, =0xfefefefe
+        JMP_ALIGN r5, r1
+1:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+        pld      [r1]
+        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+        subs     r3, r3, #1
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        bne      1b
+        pop      {r4-r10, pc}
+        .align 5
+2:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+        ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
+        pld      [r1]
+        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+        subs     r3, r3, #1
+        stm      r0, {r4-r5}
+        add      r0, r0, r2
+        bne      2b
+        pop      {r4-r10, pc}
+        .align 5
+3:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
+        ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
+        pld      [r1]
+        NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+        subs     r3, r3, #1
+        stm      r0, {r4-r5}
+        add      r0, r0, r2
+        bne      3b
+        pop      {r4-r10, pc}
+        .align 5
+4:
+        ldm      r1, {r4-r5, r10}
+        add      r1, r1, r2
+        ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
+        pld      [r1]
+        NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+        subs     r3, r3, #1
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        bne      4b
+        pop      {r4-r10, pc}
+endfunc
+
+
+@ ----------------------------------------------------------------
+        .align 5
+function ff_put_pixels8_y2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld      [r1]
+        push     {r4-r11, lr}
+        mov      r3, r3, lsr #1
+        ldr      r12, =0xfefefefe
+        JMP_ALIGN r5, r1
+1:
+        ldm      r1, {r4-r5}
+        add      r1, r1, r2
+6:      ldm      r1, {r6-r7}
+        add      r1, r1, r2
+        pld      [r1]
+        RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+        ldm      r1, {r4-r5}
+        add      r1, r1, r2
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        pld      [r1]
+        RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+        subs     r3, r3, #1
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+        .align 5
+2:
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 1, r4, r5, r6
+6:      ldm      r1, {r7-r9}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 1, r7, r8, r9
+        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 1, r4, r5, r6
+        subs     r3, r3, #1
+        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+        .align 5
+3:
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 2, r4, r5, r6
+6:      ldm      r1, {r7-r9}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 2, r7, r8, r9
+        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 2, r4, r5, r6
+        subs     r3, r3, #1
+        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+        .align 5
+4:
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 3, r4, r5, r6
+6:      ldm      r1, {r7-r9}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 3, r7, r8, r9
+        RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 3, r4, r5, r6
+        subs     r3, r3, #1
+        RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+endfunc
+
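The y2 functions halve the row counter up front (mov r3, r3, lsr #1) because
every pass of the 6: loop emits two output rows, reusing the source row the
previous average already loaded. Scalar reference (assumed semantics):

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++)
            block[x] = (pixels[x] + pixels[x + line_size] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }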
+        .align 5
+function ff_put_no_rnd_pixels8_y2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld      [r1]
+        push     {r4-r11, lr}
+        mov      r3, r3, lsr #1
+        ldr      r12, =0xfefefefe
+        JMP_ALIGN r5, r1
+1:
+        ldm      r1, {r4-r5}
+        add      r1, r1, r2
+6:      ldm      r1, {r6-r7}
+        add      r1, r1, r2
+        pld      [r1]
+        NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+        ldm      r1, {r4-r5}
+        add      r1, r1, r2
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        pld      [r1]
+        NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+        subs     r3, r3, #1
+        stm      r0, {r8-r9}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+        .align 5
+2:
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 1, r4, r5, r6
+6:      ldm      r1, {r7-r9}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 1, r7, r8, r9
+        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 1, r4, r5, r6
+        subs     r3, r3, #1
+        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+        .align 5
+3:
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 2, r4, r5, r6
+6:      ldm      r1, {r7-r9}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 2, r7, r8, r9
+        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 2, r4, r5, r6
+        subs     r3, r3, #1
+        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+        .align 5
+4:
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 3, r4, r5, r6
+6:      ldm      r1, {r7-r9}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 3, r7, r8, r9
+        NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        ldm      r1, {r4-r6}
+        add      r1, r1, r2
+        pld      [r1]
+        ALIGN_DWORD 3, r4, r5, r6
+        subs     r3, r3, #1
+        NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+        stm      r0, {r10-r11}
+        add      r0, r0, r2
+        bne      6b
+        pop      {r4-r11, pc}
+endfunc
+
+        .ltorg
+
+@ ----------------------------------------------------------------
+.macro  RND_XY2_IT align, rnd
+        @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
+        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
+.if \align == 0
+        ldm      r1, {r6-r8}
+.elseif \align == 3
+        ldm      r1, {r5-r7}
+.else
+        ldm      r1, {r8-r10}
+.endif
+        add      r1, r1, r2
+        pld      [r1]
+.if \align == 0
+        ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
+.elseif \align == 1
+        ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
+        ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
+.elseif \align == 2
+        ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
+        ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
+.elseif \align == 3
+        ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
+.endif
+        ldr      r14, =0x03030303
+        tst      r3, #1
+        and      r8, r4, r14
+        and      r9, r5, r14
+        and      r10, r6, r14
+        and      r11, r7, r14
+        it       eq
+        andeq    r14, r14, r14, \rnd #1
+        add      r8, r8, r10
+        add      r9, r9, r11
+        ldr      r12, =0xfcfcfcfc >> 2
+        itt      eq
+        addeq    r8, r8, r14
+        addeq    r9, r9, r14
+        and      r4, r12, r4, lsr #2
+        and      r5, r12, r5, lsr #2
+        and      r6, r12, r6, lsr #2
+        and      r7, r12, r7, lsr #2
+        add      r10, r4, r6
+        add      r11, r5, r7
+        subs     r3, r3, #1
+.endm
+
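RND_XY2_IT splits each byte of the two horizontally adjacent views into its
low two bits and its high six bits, so four packed bytes can be summed
without overflowing their lanes. The rounding bias (0x02020202 via lsl in
the rounding variant, 0x01010101 via lsr in the no-rnd one) is added only on
alternate iterations so each output row receives it exactly once. Scalar
model of one input row (reference only):

    uint32_t lo = (a & 0x03030303U) + (b & 0x03030303U);  /* + bias on alternate rows */
    uint32_t hi = ((a >> 2) & 0x3f3f3f3fU) + ((b >> 2) & 0x3f3f3f3fU);

RND_XY2_EXPAND below then combines two consecutive rows into one output word:

    dst = hi0 + hi1 + (((lo0 + lo1) >> 2) & 0x0f0f0f0fU);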
+.macro  RND_XY2_EXPAND align, rnd
+        RND_XY2_IT \align, \rnd
+6:      push     {r8-r11}
+        RND_XY2_IT \align, \rnd
+        pop      {r4-r7}
+        add      r4, r4, r8
+        add      r5, r5, r9
+        ldr      r14, =0x0f0f0f0f
+        add      r6, r6, r10
+        add      r7, r7, r11
+        and      r4, r14, r4, lsr #2
+        and      r5, r14, r5, lsr #2
+        add      r4, r4, r6
+        add      r5, r5, r7
+        stm      r0, {r4-r5}
+        add      r0, r0, r2
+        bge      6b
+        pop      {r4-r11, pc}
+.endm
+
+        .align 5
+function ff_put_pixels8_xy2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld      [r1]
+        push     {r4-r11, lr}   @ R14 is also called LR
+        JMP_ALIGN r5, r1
+1:      RND_XY2_EXPAND 0, lsl
+        .align 5
+2:      RND_XY2_EXPAND 1, lsl
+        .align 5
+3:      RND_XY2_EXPAND 2, lsl
+        .align 5
+4:      RND_XY2_EXPAND 3, lsl
+endfunc
+
+        .align 5
+function ff_put_no_rnd_pixels8_xy2_arm, export=1
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+        @ block = word aligned, pixels = unaligned
+        pld      [r1]
+        push     {r4-r11, lr}
+        JMP_ALIGN r5, r1
+1:      RND_XY2_EXPAND 0, lsr
+        .align 5
+2:      RND_XY2_EXPAND 1, lsr
+        .align 5
+3:      RND_XY2_EXPAND 2, lsr
+        .align 5
+4:      RND_XY2_EXPAND 3, lsr
+endfunc

new file mode 100644
@@ -0,0 +1,27 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HPELDSP_H
+#define AVCODEC_ARM_HPELDSP_H
+
+#include "libavcodec/hpeldsp.h"
+
+void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
+
+#endif /* AVCODEC_ARM_HPELDSP_H */

new file mode 100644
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro  call_2x_pixels type, subp
+function ff_\type\()_pixels16\subp\()_armv6, export=1
+        push     {r0-r3, lr}
+        bl       ff_\type\()_pixels8\subp\()_armv6
+        pop      {r0-r3, lr}
+        add      r0, r0, #8
+        add      r1, r1, #8
+        b        ff_\type\()_pixels8\subp\()_armv6
+endfunc
+.endm
+
+call_2x_pixels  avg
+call_2x_pixels  put, _x2
+call_2x_pixels  put, _y2
+call_2x_pixels  put, _x2_no_rnd
+call_2x_pixels  put, _y2_no_rnd
+
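call_2x_pixels assembles the 16-pixel-wide entry points from two 8-pixel
calls: a real call for the left half, then a tail branch with both pointers
advanced by 8 for the right half. The same idea in C (hypothetical helper
names, mirroring the CALL_2X_PIXELS macro used by the init code further
down):

    static void put_pixels16(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
    {
        put_pixels8(block,     pixels,     line_size, h); /* left half  */
        put_pixels8(block + 8, pixels + 8, line_size, h); /* right half */
    }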
+function ff_put_pixels16_armv6, export=1
+        push     {r4-r11}
+1:
+        ldr      r5,  [r1, #4]
+        ldr      r6,  [r1, #8]
+        ldr      r7,  [r1, #12]
+        ldr_post r4,  r1,  r2
+        strd     r6,  r7,  [r0, #8]
+        ldr      r9,  [r1, #4]
+        strd_post r4, r5,  r0,  r2
+        ldr      r10, [r1, #8]
+        ldr      r11, [r1, #12]
+        ldr_post r8,  r1,  r2
+        strd     r10, r11, [r0, #8]
+        subs     r3,  r3,  #2
+        strd_post r8, r9,  r0,  r2
+        bne      1b
+
+        pop      {r4-r11}
+        bx       lr
+endfunc
+
+function ff_put_pixels8_armv6, export=1
+        push     {r4-r7}
+1:
+        ldr      r5, [r1, #4]
+        ldr_post r4, r1, r2
+        ldr      r7, [r1, #4]
+        strd_post r4, r5, r0, r2
+        ldr_post r6, r1, r2
+        subs     r3, r3, #2
+        strd_post r6, r7, r0, r2
+        bne      1b
+
+        pop      {r4-r7}
+        bx       lr
+endfunc
+
+function ff_put_pixels8_x2_armv6, export=1
+        push     {r4-r11, lr}
+        mov      r12, #1
+        orr      r12, r12, r12, lsl #8
+        orr      r12, r12, r12, lsl #16
+1:
+        ldr      r4,  [r1]
+        subs     r3,  r3,  #2
+        ldr      r5,  [r1, #4]
+        ldr      r7,  [r1, #5]
+        lsr      r6,  r4,  #8
+        ldr_pre  r8,  r1,  r2
+        orr      r6,  r6,  r5,  lsl #24
+        ldr      r9,  [r1, #4]
+        ldr      r11, [r1, #5]
+        lsr      r10, r8,  #8
+        add      r1,  r1,  r2
+        orr      r10, r10, r9,  lsl #24
+        eor      r14, r4,  r6
+        uhadd8   r4,  r4,  r6
+        eor      r6,  r5,  r7
+        uhadd8   r5,  r5,  r7
+        and      r14, r14, r12
+        and      r6,  r6,  r12
+        uadd8    r4,  r4,  r14
+        eor      r14, r8,  r10
+        uadd8    r5,  r5,  r6
+        eor      r6,  r9,  r11
+        uhadd8   r8,  r8,  r10
+        and      r14, r14, r12
+        uhadd8   r9,  r9,  r11
+        and      r6,  r6,  r12
+        uadd8    r8,  r8,  r14
+        strd_post r4, r5,  r0,  r2
+        uadd8    r9,  r9,  r6
+        strd_post r8, r9,  r0,  r2
+        bne      1b
+
+        pop      {r4-r11, pc}
+endfunc
+
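The eor/uhadd8/uadd8 sequence above is the ARMv6 SIMD form of a rounded
per-byte average: uhadd8 gives the truncating mean of each byte pair, and
adding back (a ^ b) & 0x01010101 restores the rounding bit the halving
dropped. Scalar model (reference only):

    static uint32_t rnd_avg_bytes(uint32_t a, uint32_t b)
    {
        uint32_t lost = (a ^ b) & 0x01010101U;    /* bytes where a + b is odd */
        uint32_t avg  = ((a >> 1) & 0x7f7f7f7fU)  /* uhadd8: (a + b) >> 1     */
                      + ((b >> 1) & 0x7f7f7f7fU)
                      + (a & b & 0x01010101U);
        return avg + lost;                        /* uadd8: round up odd sums */
    }

The no_rnd variants further down simply keep the bare uhadd8 result.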
+function ff_put_pixels8_y2_armv6, export=1
+        push     {r4-r11}
+        mov      r12, #1
+        orr      r12, r12, r12, lsl #8
+        orr      r12, r12, r12, lsl #16
+        ldr      r4,  [r1]
+        ldr      r5,  [r1, #4]
+        ldr_pre  r6,  r1,  r2
+        ldr      r7,  [r1, #4]
+1:
+        subs     r3,  r3,  #2
+        uhadd8   r8,  r4,  r6
+        eor      r10, r4,  r6
+        uhadd8   r9,  r5,  r7
+        eor      r11, r5,  r7
+        and      r10, r10, r12
+        ldr_pre  r4,  r1,  r2
+        uadd8    r8,  r8,  r10
+        and      r11, r11, r12
+        uadd8    r9,  r9,  r11
+        ldr      r5,  [r1, #4]
+        uhadd8   r10, r4,  r6
+        eor      r6,  r4,  r6
+        uhadd8   r11, r5,  r7
+        and      r6,  r6,  r12
+        eor      r7,  r5,  r7
+        uadd8    r10, r10, r6
+        and      r7,  r7,  r12
+        ldr_pre  r6,  r1,  r2
+        uadd8    r11, r11, r7
+        strd_post r8, r9,  r0,  r2
+        ldr      r7,  [r1, #4]
+        strd_post r10, r11, r0, r2
+        bne      1b
+
+        pop      {r4-r11}
+        bx       lr
+endfunc
+
+function ff_put_pixels8_x2_no_rnd_armv6, export=1
+        push     {r4-r9, lr}
+1:
+        subs     r3,  r3,  #2
+        ldr      r4,  [r1]
+        ldr      r5,  [r1, #4]
+        ldr      r7,  [r1, #5]
+        ldr_pre  r8,  r1,  r2
+        ldr      r9,  [r1, #4]
+        ldr      r14, [r1, #5]
+        add      r1,  r1,  r2
+        lsr      r6,  r4,  #8
+        orr      r6,  r6,  r5,  lsl #24
+        lsr      r12, r8,  #8
+        orr      r12, r12, r9,  lsl #24
+        uhadd8   r4,  r4,  r6
+        uhadd8   r5,  r5,  r7
+        uhadd8   r8,  r8,  r12
+        uhadd8   r9,  r9,  r14
+        stm      r0,  {r4, r5}
+        add      r0,  r0,  r2
+        stm      r0,  {r8, r9}
+        add      r0,  r0,  r2
+        bne      1b
+
+        pop      {r4-r9, pc}
+endfunc
+
+function ff_put_pixels8_y2_no_rnd_armv6, export=1
+        push     {r4-r9, lr}
+        ldr      r4,  [r1]
+        ldr      r5,  [r1, #4]
+        ldr_pre  r6,  r1,  r2
+        ldr      r7,  [r1, #4]
+1:
+        subs     r3,  r3,  #2
+        uhadd8   r8,  r4,  r6
+        ldr_pre  r4,  r1,  r2
+        uhadd8   r9,  r5,  r7
+        ldr      r5,  [r1, #4]
+        uhadd8   r12, r4,  r6
+        ldr_pre  r6,  r1,  r2
+        uhadd8   r14, r5,  r7
+        ldr      r7,  [r1, #4]
+        stm      r0,  {r8, r9}
+        add      r0,  r0,  r2
+        stm      r0,  {r12, r14}
+        add      r0,  r0,  r2
+        bne      1b
+
+        pop      {r4-r9, pc}
+endfunc
+
+function ff_avg_pixels8_armv6, export=1
+        pld      [r1, r2]
+        push     {r4-r10, lr}
+        mov      lr,  #1
+        orr      lr,  lr,  lr,  lsl #8
+        orr      lr,  lr,  lr,  lsl #16
+        ldrd     r4,  r5,  [r0]
+        ldr      r10, [r1, #4]
+        ldr_post r9,  r1,  r2
+        subs     r3,  r3,  #2
+1:
+        pld      [r1, r2]
+        eor      r8,  r4,  r9
+        uhadd8   r4,  r4,  r9
+        eor      r12, r5,  r10
+        ldrd_reg r6,  r7,  r0,  r2
+        uhadd8   r5,  r5,  r10
+        and      r8,  r8,  lr
+        ldr      r10, [r1, #4]
+        and      r12, r12, lr
+        uadd8    r4,  r4,  r8
+        ldr_post r9,  r1,  r2
+        eor      r8,  r6,  r9
+        uadd8    r5,  r5,  r12
+        pld      [r1, r2,  lsl #1]
+        eor      r12, r7,  r10
+        uhadd8   r6,  r6,  r9
+        strd_post r4, r5,  r0,  r2
+        uhadd8   r7,  r7,  r10
+        beq      2f
+        and      r8,  r8,  lr
+        ldrd_reg r4,  r5,  r0,  r2
+        uadd8    r6,  r6,  r8
+        ldr      r10, [r1, #4]
+        and      r12, r12, lr
+        subs     r3,  r3,  #2
+        uadd8    r7,  r7,  r12
+        ldr_post r9,  r1,  r2
+        strd_post r6, r7,  r0,  r2
+        b        1b
+2:
+        and      r8,  r8,  lr
+        and      r12, r12, lr
+        uadd8    r6,  r6,  r8
+        uadd8    r7,  r7,  r12
+        strd_post r6, r7,  r0,  r2
+
+        pop      {r4-r10, pc}
+endfunc

new file mode 100644
@@ -0,0 +1,71 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/rnd_avg.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,         8)
+CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,         8)
+CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,        8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm,  8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm,  8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm, 8)
+
+av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
+
+    if (have_armv6(cpu_flags))
+        ff_hpeldsp_init_armv6(c, flags);
+    if (have_neon(cpu_flags))
+        ff_hpeldsp_init_neon(c, flags);
+}
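Decoders never call these symbols directly; they fill an HpelDSPContext and
index its tables. A hypothetical caller (variable names are illustrative;
the table layout follows the hpeldsp convention of [0] for 16-pixel and [1]
for 8-pixel widths, with the second index 0-3 selecting the full-pel, x2, y2
and xy2 half-pel cases):

    HpelDSPContext h;
    ff_hpeldsp_init(&h, 0);   /* dispatches to ff_hpeldsp_init_arm() on ARM */
    /* copy a 16x16 block with horizontal half-pel interpolation: */
    h.put_pixels_tab[0][1](dst, src, stride, 16);

new file mode 100644
@@ -0,0 +1,67 @@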
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
+/*  c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
+    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
+/*  c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
+/*  c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
+/*  c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
+}

new file mode 100644
@@ -0,0 +1,88 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+    c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+    c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
+    c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+    c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+    c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+}

new file mode 100644
@@ -0,0 +1,410 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro  pixels16 rnd=1, avg=0
+  .if \avg
+        mov      r12, r0
+  .endif
+1:      vld1.8   {q0},  [r1], r2
+        vld1.8   {q1},  [r1], r2
+        vld1.8   {q2},  [r1], r2
+        pld      [r1, r2, lsl #2]
+        vld1.8   {q3},  [r1], r2
+        pld      [r1]
+        pld      [r1, r2]
+        pld      [r1, r2, lsl #1]
+  .if \avg
+        vld1.8   {q8},  [r12,:128], r2
+        vrhadd.u8 q0,  q0,  q8
+        vld1.8   {q9},  [r12,:128], r2
+        vrhadd.u8 q1,  q1,  q9
+        vld1.8   {q10}, [r12,:128], r2
+        vrhadd.u8 q2,  q2,  q10
+        vld1.8   {q11}, [r12,:128], r2
+        vrhadd.u8 q3,  q3,  q11
+  .endif
+        subs     r3,  r3,  #4
+        vst1.64  {q0},  [r0,:128], r2
+        vst1.64  {q1},  [r0,:128], r2
+        vst1.64  {q2},  [r0,:128], r2
+        vst1.64  {q3},  [r0,:128], r2
+        bne      1b
+        bx       lr
+.endm
+
+.macro  pixels16_x2 rnd=1, avg=0
+1:      vld1.8   {d0-d2}, [r1], r2
+        vld1.8   {d4-d6}, [r1], r2
+        pld      [r1]
+        pld      [r1, r2]
+        subs     r3,  r3,  #2
+        vext.8   q1,  q0,  q1,  #1
+        avg      q0,  q0,  q1
+        vext.8   q3,  q2,  q3,  #1
+        avg      q2,  q2,  q3
+  .if \avg
+        vld1.8   {q1},  [r0,:128], r2
+        vld1.8   {q3},  [r0,:128]
+        vrhadd.u8 q0,  q0,  q1
+        vrhadd.u8 q2,  q2,  q3
+        sub      r0,  r0,  r2
+  .endif
+        vst1.8   {q0},  [r0,:128], r2
+        vst1.8   {q2},  [r0,:128], r2
+        bne      1b
+        bx       lr
+.endm
+
+.macro  pixels16_y2 rnd=1, avg=0
+        sub      r3,  r3,  #2
+        vld1.8   {q0},  [r1], r2
+        vld1.8   {q1},  [r1], r2
+1:      subs     r3,  r3,  #2
+        avg      q2,  q0,  q1
+        vld1.8   {q0},  [r1], r2
+        avg      q3,  q0,  q1
+        vld1.8   {q1},  [r1], r2
+        pld      [r1]
+        pld      [r1, r2]
+  .if \avg
+        vld1.8   {q8},  [r0,:128], r2
+        vld1.8   {q9},  [r0,:128]
+        vrhadd.u8 q2,  q2,  q8
+        vrhadd.u8 q3,  q3,  q9
+        sub      r0,  r0,  r2
+  .endif
+        vst1.8   {q2},  [r0,:128], r2
+        vst1.8   {q3},  [r0,:128], r2
+        bne      1b
+
+        avg      q2,  q0,  q1
+        vld1.8   {q0},  [r1], r2
+        avg      q3,  q0,  q1
+  .if \avg
+        vld1.8   {q8},  [r0,:128], r2
+        vld1.8   {q9},  [r0,:128]
+        vrhadd.u8 q2,  q2,  q8
+        vrhadd.u8 q3,  q3,  q9
+        sub      r0,  r0,  r2
+  .endif
+        vst1.8   {q2},  [r0,:128], r2
+        vst1.8   {q3},  [r0,:128], r2
+
+        bx       lr
+.endm
+
+.macro  pixels16_xy2 rnd=1, avg=0
+        sub      r3,  r3,  #2
+        vld1.8   {d0-d2}, [r1], r2
+        vld1.8   {d4-d6}, [r1], r2
+NRND    vmov.i16 q13, #1
+        pld      [r1]
+        pld      [r1, r2]
+        vext.8   q1,  q0,  q1,  #1
+        vext.8   q3,  q2,  q3,  #1
+        vaddl.u8 q8,  d0,  d2
+        vaddl.u8 q10, d1,  d3
+        vaddl.u8 q9,  d4,  d6
+        vaddl.u8 q11, d5,  d7
+1:      subs     r3,  r3,  #2
+        vld1.8   {d0-d2}, [r1], r2
+        vadd.u16 q12, q8,  q9
+        pld      [r1]
+NRND    vadd.u16 q12, q12, q13
+        vext.8   q15, q0,  q1,  #1
+        vadd.u16 q1,  q10, q11
+        shrn     d28, q12, #2
+NRND    vadd.u16 q1,  q1,  q13
+        shrn     d29, q1,  #2
+  .if \avg
+        vld1.8   {q8},  [r0,:128]
+        vrhadd.u8 q14, q14, q8
+  .endif
+        vaddl.u8 q8,  d0,  d30
+        vld1.8   {d2-d4}, [r1], r2
+        vaddl.u8 q10, d1,  d31
+        vst1.8   {q14}, [r0,:128], r2
+        vadd.u16 q12, q8,  q9
+        pld      [r1, r2]
+NRND    vadd.u16 q12, q12, q13
+        vext.8   q2,  q1,  q2,  #1
+        vadd.u16 q0,  q10, q11
+        shrn     d30, q12, #2
+NRND    vadd.u16 q0,  q0,  q13
+        shrn     d31, q0,  #2
+  .if \avg
+        vld1.8   {q9},  [r0,:128]
+        vrhadd.u8 q15, q15, q9
+  .endif
+        vaddl.u8 q9,  d2,  d4
+        vaddl.u8 q11, d3,  d5
+        vst1.8   {q15}, [r0,:128], r2
+        bgt      1b
+
+        vld1.8   {d0-d2}, [r1], r2
+        vadd.u16 q12, q8,  q9
+NRND    vadd.u16 q12, q12, q13
+        vext.8   q15, q0,  q1,  #1
+        vadd.u16 q1,  q10, q11
+        shrn     d28, q12, #2
+NRND    vadd.u16 q1,  q1,  q13
+        shrn     d29, q1,  #2
+  .if \avg
+        vld1.8   {q8},  [r0,:128]
+        vrhadd.u8 q14, q14, q8
+  .endif
+        vaddl.u8 q8,  d0,  d30
+        vaddl.u8 q10, d1,  d31
+        vst1.8   {q14}, [r0,:128], r2
+        vadd.u16 q12, q8,  q9
+NRND    vadd.u16 q12, q12, q13
+        vadd.u16 q0,  q10, q11
+        shrn     d30, q12, #2
+NRND    vadd.u16 q0,  q0,  q13
+        shrn     d31, q0,  #2
+  .if \avg
+        vld1.8   {q9},  [r0,:128]
+        vrhadd.u8 q15, q15, q9
+  .endif
+        vst1.8   {q15}, [r0,:128], r2
+
+        bx       lr
+.endm
+
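pixels16_xy2 keeps the per-pixel sums in 16-bit lanes: vaddl.u8 widens while
adding the horizontal pair, vadd.u16 accumulates across rows, and shrn
narrows back by two bits at the end, so no masking tricks are needed. What
it computes, as a scalar reference (assumed semantics of the xy2 case):

    for (int y = 0; y < h; y++)
        for (int x = 0; x < 16; x++)
            dst[y * stride + x] =
                (src[ y      * stride + x] + src[ y      * stride + x + 1] +
                 src[(y + 1) * stride + x] + src[(y + 1) * stride + x + 1] +
                 2) >> 2;   /* the no-rnd variant adds 1 instead of 2 */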
+.macro  pixels8 rnd=1, avg=0
+1:      vld1.8   {d0},  [r1], r2
+        vld1.8   {d1},  [r1], r2
+        vld1.8   {d2},  [r1], r2
+        pld      [r1, r2, lsl #2]
+        vld1.8   {d3},  [r1], r2
+        pld      [r1]
+        pld      [r1, r2]
+        pld      [r1, r2, lsl #1]
+  .if \avg
+        vld1.8   {d4},  [r0,:64], r2
+        vrhadd.u8 d0,  d0,  d4
+        vld1.8   {d5},  [r0,:64], r2
+        vrhadd.u8 d1,  d1,  d5
+        vld1.8   {d6},  [r0,:64], r2
+        vrhadd.u8 d2,  d2,  d6
+        vld1.8   {d7},  [r0,:64], r2
+        vrhadd.u8 d3,  d3,  d7
+        sub      r0,  r0,  r2,  lsl #2
+  .endif
+        subs     r3,  r3,  #4
+        vst1.8   {d0},  [r0,:64], r2
+        vst1.8   {d1},  [r0,:64], r2
+        vst1.8   {d2},  [r0,:64], r2
+        vst1.8   {d3},  [r0,:64], r2
+        bne      1b
+        bx       lr
+.endm
+
+.macro  pixels8_x2 rnd=1, avg=0
+1:      vld1.8   {q0},  [r1], r2
+        vext.8   d1,  d0,  d1,  #1
+        vld1.8   {q1},  [r1], r2
+        vext.8   d3,  d2,  d3,  #1
+        pld      [r1]
+        pld      [r1, r2]
+        subs     r3,  r3,  #2
+        vswp     d1,  d2
+        avg      q0,  q0,  q1
+  .if \avg
+        vld1.8   {d4},  [r0,:64], r2
+        vld1.8   {d5},  [r0,:64]
+        vrhadd.u8 q0,  q0,  q2
+        sub      r0,  r0,  r2
+  .endif
+        vst1.8   {d0},  [r0,:64], r2
+        vst1.8   {d1},  [r0,:64], r2
+        bne      1b
+        bx       lr
+.endm
+
+.macro  pixels8_y2 rnd=1, avg=0
+        sub      r3,  r3,  #2
+        vld1.8   {d0},  [r1], r2
+        vld1.8   {d1},  [r1], r2
+1:      subs     r3,  r3,  #2
+        avg      d4,  d0,  d1
+        vld1.8   {d0},  [r1], r2
+        avg      d5,  d0,  d1
+        vld1.8   {d1},  [r1], r2
+        pld      [r1]
+        pld      [r1, r2]
+  .if \avg
+        vld1.8   {d2},  [r0,:64], r2
+        vld1.8   {d3},  [r0,:64]
+        vrhadd.u8 q2,  q2,  q1
+        sub      r0,  r0,  r2
+  .endif
+        vst1.8   {d4},  [r0,:64], r2
+        vst1.8   {d5},  [r0,:64], r2
+        bne      1b
+
+        avg      d4,  d0,  d1
+        vld1.8   {d0},  [r1], r2
+        avg      d5,  d0,  d1
+  .if \avg
+        vld1.8   {d2},  [r0,:64], r2
+        vld1.8   {d3},  [r0,:64]
+        vrhadd.u8 q2,  q2,  q1
+        sub      r0,  r0,  r2
+  .endif
+        vst1.8   {d4},  [r0,:64], r2
+        vst1.8   {d5},  [r0,:64], r2
+
+        bx       lr
+.endm
+
+.macro  pixels8_xy2 rnd=1, avg=0
+        sub      r3,  r3,  #2
+        vld1.8   {q0},  [r1], r2
+        vld1.8   {q1},  [r1], r2
+NRND    vmov.i16 q11, #1
+        pld      [r1]
+        pld      [r1, r2]
+        vext.8   d4,  d0,  d1,  #1
+        vext.8   d6,  d2,  d3,  #1
+        vaddl.u8 q8,  d0,  d4
+        vaddl.u8 q9,  d2,  d6
+1:      subs     r3,  r3,  #2
+        vld1.8   {q0},  [r1], r2
+        pld      [r1]
+        vadd.u16 q10, q8,  q9
+        vext.8   d4,  d0,  d1,  #1
+NRND    vadd.u16 q10, q10, q11
+        vaddl.u8 q8,  d0,  d4
+        shrn     d5,  q10, #2
+        vld1.8   {q1},  [r1], r2
+        vadd.u16 q10, q8,  q9
+        pld      [r1, r2]
+  .if \avg
+        vld1.8   {d7},  [r0,:64]
+        vrhadd.u8 d5,  d5,  d7
+  .endif
+NRND    vadd.u16 q10, q10, q11
+        vst1.8   {d5},  [r0,:64], r2
+        shrn     d7,  q10, #2
+  .if \avg
+        vld1.8   {d5},  [r0,:64]
+        vrhadd.u8 d7,  d7,  d5
+  .endif
+        vext.8   d6,  d2,  d3,  #1
+        vaddl.u8 q9,  d2,  d6
+        vst1.8   {d7},  [r0,:64], r2
+        bgt      1b
+
+        vld1.8   {q0},  [r1], r2
+        vadd.u16 q10, q8,  q9
+        vext.8   d4,  d0,  d1,  #1
+NRND    vadd.u16 q10, q10, q11
+        vaddl.u8 q8,  d0,  d4
+        shrn     d5,  q10, #2
+        vadd.u16 q10, q8,  q9
+  .if \avg
+        vld1.8   {d7},  [r0,:64]
+        vrhadd.u8 d5,  d5,  d7
+  .endif
+NRND    vadd.u16 q10, q10, q11
+        vst1.8   {d5},  [r0,:64], r2
+        shrn     d7,  q10, #2
+  .if \avg
+        vld1.8   {d5},  [r0,:64]
+        vrhadd.u8 d7,  d7,  d5
+  .endif
+        vst1.8   {d7},  [r0,:64], r2
+
+        bx       lr
+.endm
+
+.macro  pixfunc pfx, name, suf, rnd=1, avg=0
+  .if \rnd
+    .macro avg rd, rn, rm
+        vrhadd.u8 \rd, \rn, \rm
+    .endm
+    .macro shrn rd, rn, rm
+        vrshrn.u16 \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+    .endm
+  .else
+    .macro avg rd, rn, rm
+        vhadd.u8 \rd, \rn, \rm
+    .endm
+    .macro shrn rd, rn, rm
+        vshrn.u16 \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+        \insn
+    .endm
+  .endif
+function ff_\pfx\name\suf\()_neon, export=1
+        \name    \rnd, \avg
+endfunc
+        .purgem  avg
+        .purgem  shrn
+        .purgem  NRND
+.endm
+
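pixfunc instantiates one template under a choice of rounding behaviour: with
rnd=1 it binds avg to vrhadd.u8 (rounding halving add), shrn to vrshrn.u16
(rounding narrow) and makes NRND swallow its argument; with rnd=0 it binds
the truncating vhadd.u8/vshrn.u16 pair and lets NRND emit the +1 bias adds.
For example (illustrative expansion, not literal assembler output):

    @ "pixfunc2 put_, pixels8_x2" generates two functions:
    @   ff_put_pixels8_x2_neon        - avg = vrhadd.u8, NRND lines dropped
    @   ff_put_pixels8_x2_no_rnd_neon - avg = vhadd.u8,  NRND lines kept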
+.macro  pixfunc2 pfx, name, avg=0
+        pixfunc  \pfx, \name,          rnd=1, avg=\avg
+        pixfunc  \pfx, \name, _no_rnd, rnd=0, avg=\avg
+.endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+        mov      r3,  #16
+endfunc
+
+        pixfunc  put_, pixels16,     avg=0
+        pixfunc2 put_, pixels16_x2,  avg=0
+        pixfunc2 put_, pixels16_y2,  avg=0
+        pixfunc2 put_, pixels16_xy2, avg=0
+
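The ff_*_h264_qpel*_mc00_neon entries have no return of their own: each
loads the block height into r3 and falls straight through into the pixel
function that pixfunc emits immediately afterwards. In effect (signature
assumed from the qpel mc00 convention, reference only):

    void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
    {
        ff_put_pixels16_neon(dst, src, stride, 16);
    }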
+function ff_avg_h264_qpel16_mc00_neon, export=1
+        mov      r3,  #16
+endfunc
+
+        pixfunc  avg_, pixels16,     avg=1
+        pixfunc2 avg_, pixels16_x2,  avg=1
+        pixfunc2 avg_, pixels16_y2,  avg=1
+        pixfunc2 avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+        mov      r3,  #8
+endfunc
+
+        pixfunc  put_, pixels8,     avg=0
+        pixfunc2 put_, pixels8_x2,  avg=0
+        pixfunc2 put_, pixels8_y2,  avg=0
+        pixfunc2 put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+        mov      r3,  #8
+endfunc
+
+        pixfunc  avg_, pixels8,     avg=1
+        pixfunc  avg_, pixels8_x2,  avg=1
+        pixfunc  avg_, pixels8_y2,  avg=1
+        pixfunc  avg_, pixels8_xy2, avg=1
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
 