
arm: hpeldsp: Move half-pel assembly from dsputil to hpeldsp

Signed-off-by: Martin Storsjö <martin@martin.st>

Ronald S. Bultje authored on 2013/03/11 08:16:45
Showing 16 changed files
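The removed dsputil assembly below implements the half-pel put/avg primitives with a SIMD-within-a-register trick: four pixels are averaged per 32-bit word without carries leaking between byte lanes. As a reading aid only — this sketch is not part of the patch and not the hpeldsp API, and the function names are chosen here purely for illustration — the following minimal C program shows the packed-byte averaging identities that the RND_AVG32 and NO_RND_AVG32 macros encode.

#include <inttypes.h>
#include <stdio.h>

/* Per byte lane:
 *   rounded average:    (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)
 *   truncating average: (a + b)     >> 1 == (a & b) + (((a ^ b) & 0xfe) >> 1)
 * Masking with 0xfefefefe before the shift keeps each byte's low bit from
 * spilling into its neighbour, so one 32-bit operation averages four
 * pixels at once. */
static uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & 0xfefefefeu) >> 1);
}

static uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & 0xfefefefeu) >> 1);
}

int main(void)
{
    uint32_t a = 0x10ff7f01u, b = 0x20007e02u;
    printf("rounded:    %08" PRIx32 "\n", rnd_avg32(a, b));    /* per-byte (a+b+1)>>1 */
    printf("truncating: %08" PRIx32 "\n", no_rnd_avg32(a, b)); /* per-byte (a+b)>>1   */
    return 0;
}

The NEON path being moved does not need these identities: as the pixels16/pixels8 macros further down show, it uses vrhadd.u8 and vhadd.u8 directly for the rounded and truncating cases.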
... ...
@@ -31,6 +31,11 @@ OBJS-$(CONFIG_H264DSP)                 += arm/h264dsp_init_arm.o
 OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
 OBJS-$(CONFIG_H264QPEL)                += arm/h264qpel_init_arm.o
 
+OBJS-$(CONFIG_HPELDSP)                 += arm/hpeldsp_init_arm.o        \
+                                          arm/hpeldsp_arm.o
+ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
+                                          arm/hpeldsp_armv6.o
+
 OBJS-$(CONFIG_RV30_DECODER)            += arm/rv34dsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)            += arm/rv34dsp_init_arm.o        \
                                           arm/rv40dsp_init_arm.o        \
... ...
@@ -84,6 +89,9 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/sbrdsp_neon.o             \
 NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/dcadsp_neon.o             \
                                           arm/synth_filter_neon.o       \
 
+NEON-OBJS-$(CONFIG_HPELDSP)            += arm/hpeldsp_init_neon.o       \
+                                          arm/hpeldsp_neon.o
+
 NEON-OBJS-$(CONFIG_MPEGVIDEO)          += arm/mpegvideo_neon.o
 NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
... ...
@@ -26,590 +26,6 @@
26 26
 #define pld @
27 27
 #endif
28 28
 
29
-.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
30
-        mov             \Rd0, \Rn0, lsr #(\shift * 8)
31
-        mov             \Rd1, \Rn1, lsr #(\shift * 8)
32
-        mov             \Rd2, \Rn2, lsr #(\shift * 8)
33
-        mov             \Rd3, \Rn3, lsr #(\shift * 8)
34
-        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
35
-        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
36
-        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
37
-        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
38
-.endm
39
-.macro  ALIGN_DWORD shift, R0, R1, R2
40
-        mov             \R0, \R0, lsr #(\shift * 8)
41
-        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
42
-        mov             \R1, \R1, lsr #(\shift * 8)
43
-        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
44
-.endm
45
-.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
46
-        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
47
-        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
48
-        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
49
-        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
50
-.endm
51
-
52
-.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
53
-        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
54
-        @ Rmask = 0xFEFEFEFE
55
-        @ Rn = destroy
56
-        eor             \Rd0, \Rn0, \Rm0
57
-        eor             \Rd1, \Rn1, \Rm1
58
-        orr             \Rn0, \Rn0, \Rm0
59
-        orr             \Rn1, \Rn1, \Rm1
60
-        and             \Rd0, \Rd0, \Rmask
61
-        and             \Rd1, \Rd1, \Rmask
62
-        sub             \Rd0, \Rn0, \Rd0, lsr #1
63
-        sub             \Rd1, \Rn1, \Rd1, lsr #1
64
-.endm
65
-
66
-.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
67
-        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
68
-        @ Rmask = 0xFEFEFEFE
69
-        @ Rn = destroy
70
-        eor             \Rd0, \Rn0, \Rm0
71
-        eor             \Rd1, \Rn1, \Rm1
72
-        and             \Rn0, \Rn0, \Rm0
73
-        and             \Rn1, \Rn1, \Rm1
74
-        and             \Rd0, \Rd0, \Rmask
75
-        and             \Rd1, \Rd1, \Rmask
76
-        add             \Rd0, \Rn0, \Rd0, lsr #1
77
-        add             \Rd1, \Rn1, \Rd1, lsr #1
78
-.endm
79
-
80
-.macro  JMP_ALIGN tmp, reg
81
-        ands            \tmp, \reg, #3
82
-        bic             \reg, \reg, #3
83
-        beq             1f
84
-        subs            \tmp, \tmp, #1
85
-        beq             2f
86
-        subs            \tmp, \tmp, #1
87
-        beq             3f
88
-        b    4f
89
-.endm
90
-
91
-@ ----------------------------------------------------------------
92
-        .align 5
93
-function ff_put_pixels16_arm, export=1
94
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
95
-        @ block = word aligned, pixles = unaligned
96
-        pld             [r1]
97
-        push            {r4-r11, lr}
98
-        JMP_ALIGN       r5,  r1
99
-1:
100
-        ldm             r1,  {r4-r7}
101
-        add             r1,  r1,  r2
102
-        stm             r0,  {r4-r7}
103
-        pld             [r1]
104
-        subs            r3,  r3,  #1
105
-        add             r0,  r0,  r2
106
-        bne             1b
107
-        pop             {r4-r11, pc}
108
-        .align 5
109
-2:
110
-        ldm             r1,  {r4-r8}
111
-        add             r1,  r1,  r2
112
-        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
113
-        pld             [r1]
114
-        subs            r3,  r3,  #1
115
-        stm             r0,  {r9-r12}
116
-        add             r0,  r0,  r2
117
-        bne             2b
118
-        pop             {r4-r11, pc}
119
-        .align 5
120
-3:
121
-        ldm             r1,  {r4-r8}
122
-        add             r1,  r1,  r2
123
-        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
124
-        pld             [r1]
125
-        subs            r3,  r3,  #1
126
-        stm             r0,  {r9-r12}
127
-        add             r0,  r0,  r2
128
-        bne             3b
129
-        pop             {r4-r11, pc}
130
-        .align 5
131
-4:
132
-        ldm             r1,  {r4-r8}
133
-        add             r1,  r1,  r2
134
-        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
135
-        pld             [r1]
136
-        subs            r3,  r3,  #1
137
-        stm             r0,  {r9-r12}
138
-        add             r0,  r0,  r2
139
-        bne             4b
140
-        pop             {r4-r11,pc}
141
-endfunc
142
-
143
-@ ----------------------------------------------------------------
144
-        .align 5
145
-function ff_put_pixels8_arm, export=1
146
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
147
-        @ block = word aligned, pixles = unaligned
148
-        pld             [r1]
149
-        push            {r4-r5,lr}
150
-        JMP_ALIGN       r5,  r1
151
-1:
152
-        ldm             r1,  {r4-r5}
153
-        add             r1,  r1,  r2
154
-        subs            r3,  r3,  #1
155
-        pld             [r1]
156
-        stm             r0,  {r4-r5}
157
-        add             r0,  r0,  r2
158
-        bne             1b
159
-        pop             {r4-r5,pc}
160
-        .align 5
161
-2:
162
-        ldm             r1,  {r4-r5, r12}
163
-        add             r1,  r1,  r2
164
-        ALIGN_DWORD     1,   r4,  r5,  r12
165
-        pld             [r1]
166
-        subs            r3,  r3,  #1
167
-        stm             r0,  {r4-r5}
168
-        add             r0,  r0,  r2
169
-        bne             2b
170
-        pop             {r4-r5,pc}
171
-        .align 5
172
-3:
173
-        ldm             r1,  {r4-r5, r12}
174
-        add             r1,  r1,  r2
175
-        ALIGN_DWORD     2,   r4,  r5,  r12
176
-        pld             [r1]
177
-        subs            r3,  r3,  #1
178
-        stm             r0,  {r4-r5}
179
-        add             r0,  r0,  r2
180
-        bne             3b
181
-        pop             {r4-r5,pc}
182
-        .align 5
183
-4:
184
-        ldm             r1,  {r4-r5, r12}
185
-        add             r1,  r1,  r2
186
-        ALIGN_DWORD     3,   r4,  r5,  r12
187
-        pld             [r1]
188
-        subs            r3,  r3,  #1
189
-        stm             r0,  {r4-r5}
190
-        add             r0,  r0,  r2
191
-        bne             4b
192
-        pop             {r4-r5,pc}
193
-endfunc
194
-
195
-@ ----------------------------------------------------------------
196
-        .align 5
197
-function ff_put_pixels8_x2_arm, export=1
198
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
199
-        @ block = word aligned, pixles = unaligned
200
-        pld             [r1]
201
-        push            {r4-r10,lr}
202
-        ldr             r12, =0xfefefefe
203
-        JMP_ALIGN       r5,  r1
204
-1:
205
-        ldm             r1,  {r4-r5, r10}
206
-        add             r1,  r1,  r2
207
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
208
-        pld             [r1]
209
-        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
210
-        subs            r3,  r3,  #1
211
-        stm             r0,  {r8-r9}
212
-        add             r0,  r0,  r2
213
-        bne             1b
214
-        pop             {r4-r10,pc}
215
-        .align 5
216
-2:
217
-        ldm             r1,  {r4-r5, r10}
218
-        add             r1,  r1,  r2
219
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
220
-        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
221
-        pld             [r1]
222
-        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
223
-        subs            r3,  r3,  #1
224
-        stm             r0,  {r4-r5}
225
-        add             r0,  r0,  r2
226
-        bne             2b
227
-        pop             {r4-r10,pc}
228
-        .align 5
229
-3:
230
-        ldm             r1,  {r4-r5, r10}
231
-        add             r1,  r1,  r2
232
-        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
233
-        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
234
-        pld             [r1]
235
-        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
236
-        subs            r3,  r3,  #1
237
-        stm             r0,  {r4-r5}
238
-        add             r0,  r0,  r2
239
-        bne             3b
240
-        pop             {r4-r10,pc}
241
-        .align 5
242
-4:
243
-        ldm             r1,  {r4-r5, r10}
244
-        add             r1,  r1,  r2
245
-        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
246
-        pld             [r1]
247
-        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
248
-        subs            r3,  r3,  #1
249
-        stm             r0,  {r8-r9}
250
-        add             r0,  r0,  r2
251
-        bne             4b
252
-        pop             {r4-r10,pc}
253
-endfunc
254
-
255
-        .align 5
256
-function ff_put_no_rnd_pixels8_x2_arm, export=1
257
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
258
-        @ block = word aligned, pixles = unaligned
259
-        pld             [r1]
260
-        push            {r4-r10,lr}
261
-        ldr             r12, =0xfefefefe
262
-        JMP_ALIGN       r5,  r1
263
-1:
264
-        ldm             r1,  {r4-r5, r10}
265
-        add             r1,  r1,  r2
266
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
267
-        pld             [r1]
268
-        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
269
-        subs            r3,  r3,  #1
270
-        stm             r0,  {r8-r9}
271
-        add             r0,  r0,  r2
272
-        bne             1b
273
-        pop             {r4-r10,pc}
274
-        .align 5
275
-2:
276
-        ldm             r1,  {r4-r5, r10}
277
-        add             r1,  r1,  r2
278
-        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
279
-        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
280
-        pld             [r1]
281
-        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
282
-        subs            r3,  r3,  #1
283
-        stm             r0,  {r4-r5}
284
-        add             r0,  r0,  r2
285
-        bne             2b
286
-        pop             {r4-r10,pc}
287
-        .align 5
288
-3:
289
-        ldm             r1,  {r4-r5, r10}
290
-        add             r1,  r1,  r2
291
-        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
292
-        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
293
-        pld             [r1]
294
-        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
295
-        subs            r3,  r3,  #1
296
-        stm             r0,  {r4-r5}
297
-        add             r0,  r0,  r2
298
-        bne             3b
299
-        pop             {r4-r10,pc}
300
-        .align 5
301
-4:
302
-        ldm             r1,  {r4-r5, r10}
303
-        add             r1,  r1,  r2
304
-        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
305
-        pld             [r1]
306
-        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
307
-        subs            r3,  r3,  #1
308
-        stm             r0,  {r8-r9}
309
-        add             r0,  r0,  r2
310
-        bne             4b
311
-        pop             {r4-r10,pc}
312
-endfunc
313
-
314
-
315
-@ ----------------------------------------------------------------
316
-        .align 5
317
-function ff_put_pixels8_y2_arm, export=1
318
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
319
-        @ block = word aligned, pixles = unaligned
320
-        pld             [r1]
321
-        push            {r4-r11,lr}
322
-        mov             r3,  r3,  lsr #1
323
-        ldr             r12, =0xfefefefe
324
-        JMP_ALIGN       r5,  r1
325
-1:
326
-        ldm             r1,  {r4-r5}
327
-        add             r1,  r1,  r2
328
-6:      ldm             r1,  {r6-r7}
329
-        add             r1,  r1,  r2
330
-        pld             [r1]
331
-        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
332
-        ldm             r1,  {r4-r5}
333
-        add             r1,  r1,  r2
334
-        stm             r0,  {r8-r9}
335
-        add             r0,  r0,  r2
336
-        pld             [r1]
337
-        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
338
-        subs            r3,  r3,  #1
339
-        stm             r0,  {r8-r9}
340
-        add             r0,  r0,  r2
341
-        bne             6b
342
-        pop             {r4-r11,pc}
343
-        .align 5
344
-2:
345
-        ldm             r1,  {r4-r6}
346
-        add             r1,  r1,  r2
347
-        pld             [r1]
348
-        ALIGN_DWORD     1,   r4,  r5,  r6
349
-6:      ldm             r1,  {r7-r9}
350
-        add             r1,  r1,  r2
351
-        pld             [r1]
352
-        ALIGN_DWORD     1,   r7,  r8,  r9
353
-        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
354
-        stm             r0,  {r10-r11}
355
-        add             r0,  r0,  r2
356
-        ldm             r1,  {r4-r6}
357
-        add             r1,  r1,  r2
358
-        pld             [r1]
359
-        ALIGN_DWORD     1,   r4,  r5,  r6
360
-        subs            r3,  r3,  #1
361
-        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
362
-        stm             r0,  {r10-r11}
363
-        add             r0,  r0,  r2
364
-        bne             6b
365
-        pop             {r4-r11,pc}
366
-        .align 5
367
-3:
368
-        ldm             r1,  {r4-r6}
369
-        add             r1,  r1,  r2
370
-        pld             [r1]
371
-        ALIGN_DWORD     2,   r4,  r5,  r6
372
-6:      ldm             r1,  {r7-r9}
373
-        add             r1,  r1,  r2
374
-        pld             [r1]
375
-        ALIGN_DWORD     2,   r7,  r8,  r9
376
-        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
377
-        stm             r0,  {r10-r11}
378
-        add             r0,  r0,  r2
379
-        ldm             r1,  {r4-r6}
380
-        add             r1,  r1,  r2
381
-        pld             [r1]
382
-        ALIGN_DWORD     2,   r4,  r5,  r6
383
-        subs            r3,  r3,  #1
384
-        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
385
-        stm             r0,  {r10-r11}
386
-        add             r0,  r0,  r2
387
-        bne             6b
388
-        pop             {r4-r11,pc}
389
-        .align 5
390
-4:
391
-        ldm             r1,  {r4-r6}
392
-        add             r1,  r1,  r2
393
-        pld             [r1]
394
-        ALIGN_DWORD     3,   r4,  r5,  r6
395
-6:      ldm             r1,  {r7-r9}
396
-        add             r1,  r1,  r2
397
-        pld             [r1]
398
-        ALIGN_DWORD     3,   r7,  r8,  r9
399
-        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
400
-        stm             r0,  {r10-r11}
401
-        add             r0,  r0,  r2
402
-        ldm             r1,  {r4-r6}
403
-        add             r1,  r1,  r2
404
-        pld             [r1]
405
-        ALIGN_DWORD     3,   r4,  r5,  r6
406
-        subs            r3,  r3,  #1
407
-        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
408
-        stm             r0,  {r10-r11}
409
-        add             r0,  r0,  r2
410
-        bne             6b
411
-        pop             {r4-r11,pc}
412
-endfunc
413
-
414
-        .align 5
415
-function ff_put_no_rnd_pixels8_y2_arm, export=1
416
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
417
-        @ block = word aligned, pixles = unaligned
418
-        pld             [r1]
419
-        push            {r4-r11,lr}
420
-        mov             r3,  r3,  lsr #1
421
-        ldr             r12, =0xfefefefe
422
-        JMP_ALIGN       r5,  r1
423
-1:
424
-        ldm             r1,  {r4-r5}
425
-        add             r1,  r1,  r2
426
-6:      ldm             r1,  {r6-r7}
427
-        add             r1,  r1,  r2
428
-        pld             [r1]
429
-        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
430
-        ldm             r1,  {r4-r5}
431
-        add             r1,  r1,  r2
432
-        stm             r0,  {r8-r9}
433
-        add             r0,  r0,  r2
434
-        pld             [r1]
435
-        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
436
-        subs            r3,  r3,  #1
437
-        stm             r0,  {r8-r9}
438
-        add             r0,  r0,  r2
439
-        bne             6b
440
-        pop             {r4-r11,pc}
441
-        .align 5
442
-2:
443
-        ldm             r1,  {r4-r6}
444
-        add             r1,  r1,  r2
445
-        pld             [r1]
446
-        ALIGN_DWORD     1,   r4,  r5,  r6
447
-6:      ldm             r1,  {r7-r9}
448
-        add             r1,  r1,  r2
449
-        pld             [r1]
450
-        ALIGN_DWORD     1,   r7,  r8,  r9
451
-        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
452
-        stm             r0,  {r10-r11}
453
-        add             r0,  r0,  r2
454
-        ldm             r1,  {r4-r6}
455
-        add             r1,  r1,  r2
456
-        pld             [r1]
457
-        ALIGN_DWORD     1,   r4,  r5,  r6
458
-        subs            r3,  r3,  #1
459
-        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
460
-        stm             r0,  {r10-r11}
461
-        add             r0,  r0,  r2
462
-        bne             6b
463
-        pop             {r4-r11,pc}
464
-        .align 5
465
-3:
466
-        ldm             r1,  {r4-r6}
467
-        add             r1,  r1,  r2
468
-        pld             [r1]
469
-        ALIGN_DWORD     2,   r4,  r5,  r6
470
-6:      ldm             r1,  {r7-r9}
471
-        add             r1,  r1,  r2
472
-        pld             [r1]
473
-        ALIGN_DWORD     2,   r7,  r8,  r9
474
-        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
475
-        stm             r0,  {r10-r11}
476
-        add             r0,  r0,  r2
477
-        ldm             r1,  {r4-r6}
478
-        add             r1,  r1,  r2
479
-        pld             [r1]
480
-        ALIGN_DWORD     2,   r4,  r5,  r6
481
-        subs            r3,  r3,  #1
482
-        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
483
-        stm             r0,  {r10-r11}
484
-        add             r0,  r0,  r2
485
-        bne             6b
486
-        pop             {r4-r11,pc}
487
-        .align 5
488
-4:
489
-        ldm             r1,  {r4-r6}
490
-        add             r1,  r1,  r2
491
-        pld             [r1]
492
-        ALIGN_DWORD     3,   r4,  r5,  r6
493
-6:      ldm             r1,  {r7-r9}
494
-        add             r1,  r1,  r2
495
-        pld             [r1]
496
-        ALIGN_DWORD     3,   r7,  r8,  r9
497
-        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
498
-        stm             r0,  {r10-r11}
499
-        add             r0,  r0,  r2
500
-        ldm             r1,  {r4-r6}
501
-        add             r1,  r1,  r2
502
-        pld             [r1]
503
-        ALIGN_DWORD     3,   r4,  r5,  r6
504
-        subs            r3,  r3,  #1
505
-        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
506
-        stm             r0,  {r10-r11}
507
-        add             r0,  r0,  r2
508
-        bne             6b
509
-        pop             {r4-r11,pc}
510
-endfunc
511
-
512
-        .ltorg
513
-
514
-@ ----------------------------------------------------------------
515
-.macro  RND_XY2_IT align, rnd
516
-        @ l1=  (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
517
-        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
518
-.if \align == 0
519
-        ldm             r1,  {r6-r8}
520
-.elseif \align == 3
521
-        ldm             r1,  {r5-r7}
522
-.else
523
-        ldm             r1,  {r8-r10}
524
-.endif
525
-        add             r1,  r1,  r2
526
-        pld             [r1]
527
-.if \align == 0
528
-        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
529
-.elseif \align == 1
530
-        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
531
-        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
532
-.elseif \align == 2
533
-        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
534
-        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
535
-.elseif \align == 3
536
-        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
537
-.endif
538
-        ldr             r14, =0x03030303
539
-        tst             r3,  #1
540
-        and             r8,  r4,  r14
541
-        and             r9,  r5,  r14
542
-        and             r10, r6,  r14
543
-        and             r11, r7,  r14
544
-        it              eq
545
-        andeq           r14, r14, r14, \rnd #1
546
-        add             r8,  r8,  r10
547
-        add             r9,  r9,  r11
548
-        ldr             r12, =0xfcfcfcfc >> 2
549
-        itt             eq
550
-        addeq           r8,  r8,  r14
551
-        addeq           r9,  r9,  r14
552
-        and             r4,  r12, r4,  lsr #2
553
-        and             r5,  r12, r5,  lsr #2
554
-        and             r6,  r12, r6,  lsr #2
555
-        and             r7,  r12, r7,  lsr #2
556
-        add             r10, r4,  r6
557
-        add             r11, r5,  r7
558
-        subs            r3,  r3,  #1
559
-.endm
560
-
561
-.macro RND_XY2_EXPAND align, rnd
562
-        RND_XY2_IT      \align, \rnd
563
-6:      push            {r8-r11}
564
-        RND_XY2_IT      \align, \rnd
565
-        pop             {r4-r7}
566
-        add             r4,  r4,  r8
567
-        add             r5,  r5,  r9
568
-        ldr             r14, =0x0f0f0f0f
569
-        add             r6,  r6,  r10
570
-        add             r7,  r7,  r11
571
-        and             r4,  r14, r4,  lsr #2
572
-        and             r5,  r14, r5,  lsr #2
573
-        add             r4,  r4,  r6
574
-        add             r5,  r5,  r7
575
-        stm             r0,  {r4-r5}
576
-        add             r0,  r0,  r2
577
-        bge             6b
578
-        pop             {r4-r11,pc}
579
-.endm
580
-
581
-        .align 5
582
-function ff_put_pixels8_xy2_arm, export=1
583
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
584
-        @ block = word aligned, pixles = unaligned
585
-        pld             [r1]
586
-        push            {r4-r11,lr} @ R14 is also called LR
587
-        JMP_ALIGN       r5,  r1
588
-1:      RND_XY2_EXPAND  0, lsl
589
-        .align 5
590
-2:      RND_XY2_EXPAND  1, lsl
591
-        .align 5
592
-3:      RND_XY2_EXPAND  2, lsl
593
-        .align 5
594
-4:      RND_XY2_EXPAND  3, lsl
595
-endfunc
596
-
597
-        .align 5
598
-function ff_put_no_rnd_pixels8_xy2_arm, export=1
599
-        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
600
-        @ block = word aligned, pixles = unaligned
601
-        pld             [r1]
602
-        push            {r4-r11,lr}
603
-        JMP_ALIGN       r5,  r1
604
-1:      RND_XY2_EXPAND  0, lsr
605
-        .align 5
606
-2:      RND_XY2_EXPAND  1, lsr
607
-        .align 5
608
-3:      RND_XY2_EXPAND  2, lsr
609
-        .align 5
610
-4:      RND_XY2_EXPAND  3, lsr
611
-endfunc
612
-
613 29
         .align 5
614 30
 @ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
615 31
 function ff_add_pixels_clamped_arm, export=1
... ...
@@ -20,244 +20,6 @@
20 20
 
21 21
 #include "libavutil/arm/asm.S"
22 22
 
23
-.macro  call_2x_pixels  type, subp
24
-function ff_\type\()_pixels16\subp\()_armv6, export=1
25
-        push            {r0-r3, lr}
26
-        bl              ff_\type\()_pixels8\subp\()_armv6
27
-        pop             {r0-r3, lr}
28
-        add             r0,  r0,  #8
29
-        add             r1,  r1,  #8
30
-        b               ff_\type\()_pixels8\subp\()_armv6
31
-endfunc
32
-.endm
33
-
34
-call_2x_pixels          avg
35
-call_2x_pixels          put, _x2
36
-call_2x_pixels          put, _y2
37
-call_2x_pixels          put, _x2_no_rnd
38
-call_2x_pixels          put, _y2_no_rnd
39
-
40
-function ff_put_pixels16_armv6, export=1
41
-        push            {r4-r11}
42
-1:
43
-        ldr             r5,  [r1, #4]
44
-        ldr             r6,  [r1, #8]
45
-        ldr             r7,  [r1, #12]
46
-        ldr_post        r4,  r1,  r2
47
-        strd            r6,  r7,  [r0, #8]
48
-        ldr             r9,  [r1, #4]
49
-        strd_post       r4,  r5,  r0,  r2
50
-        ldr             r10, [r1, #8]
51
-        ldr             r11, [r1, #12]
52
-        ldr_post        r8,  r1,  r2
53
-        strd            r10, r11, [r0, #8]
54
-        subs            r3,  r3,  #2
55
-        strd_post       r8,  r9,  r0,  r2
56
-        bne             1b
57
-
58
-        pop             {r4-r11}
59
-        bx              lr
60
-endfunc
61
-
62
-function ff_put_pixels8_armv6, export=1
63
-        push            {r4-r7}
64
-1:
65
-        ldr             r5,  [r1, #4]
66
-        ldr_post        r4,  r1,  r2
67
-        ldr             r7,  [r1, #4]
68
-        strd_post       r4,  r5,  r0,  r2
69
-        ldr_post        r6,  r1,  r2
70
-        subs            r3,  r3,  #2
71
-        strd_post       r6,  r7,  r0,  r2
72
-        bne             1b
73
-
74
-        pop             {r4-r7}
75
-        bx              lr
76
-endfunc
77
-
78
-function ff_put_pixels8_x2_armv6, export=1
79
-        push            {r4-r11, lr}
80
-        mov             r12, #1
81
-        orr             r12, r12, r12, lsl #8
82
-        orr             r12, r12, r12, lsl #16
83
-1:
84
-        ldr             r4,  [r1]
85
-        subs            r3,  r3,  #2
86
-        ldr             r5,  [r1, #4]
87
-        ldr             r7,  [r1, #5]
88
-        lsr             r6,  r4,  #8
89
-        ldr_pre         r8,  r1,  r2
90
-        orr             r6,  r6,  r5,  lsl #24
91
-        ldr             r9,  [r1, #4]
92
-        ldr             r11, [r1, #5]
93
-        lsr             r10, r8,  #8
94
-        add             r1,  r1,  r2
95
-        orr             r10, r10, r9,  lsl #24
96
-        eor             r14, r4,  r6
97
-        uhadd8          r4,  r4,  r6
98
-        eor             r6,  r5,  r7
99
-        uhadd8          r5,  r5,  r7
100
-        and             r14, r14, r12
101
-        and             r6,  r6,  r12
102
-        uadd8           r4,  r4,  r14
103
-        eor             r14, r8,  r10
104
-        uadd8           r5,  r5,  r6
105
-        eor             r6,  r9,  r11
106
-        uhadd8          r8,  r8,  r10
107
-        and             r14, r14, r12
108
-        uhadd8          r9,  r9,  r11
109
-        and             r6,  r6,  r12
110
-        uadd8           r8,  r8,  r14
111
-        strd_post       r4,  r5,  r0,  r2
112
-        uadd8           r9,  r9,  r6
113
-        strd_post       r8,  r9,  r0,  r2
114
-        bne             1b
115
-
116
-        pop             {r4-r11, pc}
117
-endfunc
118
-
119
-function ff_put_pixels8_y2_armv6, export=1
120
-        push            {r4-r11}
121
-        mov             r12, #1
122
-        orr             r12, r12, r12, lsl #8
123
-        orr             r12, r12, r12, lsl #16
124
-        ldr             r4,  [r1]
125
-        ldr             r5,  [r1, #4]
126
-        ldr_pre         r6,  r1,  r2
127
-        ldr             r7,  [r1, #4]
128
-1:
129
-        subs            r3,  r3,  #2
130
-        uhadd8          r8,  r4,  r6
131
-        eor             r10, r4,  r6
132
-        uhadd8          r9,  r5,  r7
133
-        eor             r11, r5,  r7
134
-        and             r10, r10, r12
135
-        ldr_pre         r4,  r1,  r2
136
-        uadd8           r8,  r8,  r10
137
-        and             r11, r11, r12
138
-        uadd8           r9,  r9,  r11
139
-        ldr             r5,  [r1, #4]
140
-        uhadd8          r10, r4,  r6
141
-        eor             r6,  r4,  r6
142
-        uhadd8          r11, r5,  r7
143
-        and             r6,  r6,  r12
144
-        eor             r7,  r5,  r7
145
-        uadd8           r10, r10, r6
146
-        and             r7,  r7,  r12
147
-        ldr_pre         r6,  r1,  r2
148
-        uadd8           r11, r11, r7
149
-        strd_post       r8,  r9,  r0,  r2
150
-        ldr             r7,  [r1, #4]
151
-        strd_post       r10, r11, r0,  r2
152
-        bne             1b
153
-
154
-        pop             {r4-r11}
155
-        bx              lr
156
-endfunc
157
-
158
-function ff_put_pixels8_x2_no_rnd_armv6, export=1
159
-        push            {r4-r9, lr}
160
-1:
161
-        subs            r3,  r3,  #2
162
-        ldr             r4,  [r1]
163
-        ldr             r5,  [r1, #4]
164
-        ldr             r7,  [r1, #5]
165
-        ldr_pre         r8,  r1,  r2
166
-        ldr             r9,  [r1, #4]
167
-        ldr             r14, [r1, #5]
168
-        add             r1,  r1,  r2
169
-        lsr             r6,  r4,  #8
170
-        orr             r6,  r6,  r5,  lsl #24
171
-        lsr             r12, r8,  #8
172
-        orr             r12, r12, r9,  lsl #24
173
-        uhadd8          r4,  r4,  r6
174
-        uhadd8          r5,  r5,  r7
175
-        uhadd8          r8,  r8,  r12
176
-        uhadd8          r9,  r9,  r14
177
-        stm             r0,  {r4,r5}
178
-        add             r0,  r0,  r2
179
-        stm             r0,  {r8,r9}
180
-        add             r0,  r0,  r2
181
-        bne             1b
182
-
183
-        pop             {r4-r9, pc}
184
-endfunc
185
-
186
-function ff_put_pixels8_y2_no_rnd_armv6, export=1
187
-        push            {r4-r9, lr}
188
-        ldr             r4,  [r1]
189
-        ldr             r5,  [r1, #4]
190
-        ldr_pre         r6,  r1,  r2
191
-        ldr             r7,  [r1, #4]
192
-1:
193
-        subs            r3,  r3,  #2
194
-        uhadd8          r8,  r4,  r6
195
-        ldr_pre         r4,  r1,  r2
196
-        uhadd8          r9,  r5,  r7
197
-        ldr             r5,  [r1, #4]
198
-        uhadd8          r12, r4,  r6
199
-        ldr_pre         r6,  r1,  r2
200
-        uhadd8          r14, r5,  r7
201
-        ldr             r7,  [r1, #4]
202
-        stm             r0,  {r8,r9}
203
-        add             r0,  r0,  r2
204
-        stm             r0,  {r12,r14}
205
-        add             r0,  r0,  r2
206
-        bne             1b
207
-
208
-        pop             {r4-r9, pc}
209
-endfunc
210
-
211
-function ff_avg_pixels8_armv6, export=1
212
-        pld             [r1, r2]
213
-        push            {r4-r10, lr}
214
-        mov             lr,  #1
215
-        orr             lr,  lr,  lr,  lsl #8
216
-        orr             lr,  lr,  lr,  lsl #16
217
-        ldrd            r4,  r5,  [r0]
218
-        ldr             r10, [r1, #4]
219
-        ldr_post        r9,  r1,  r2
220
-        subs            r3,  r3,  #2
221
-1:
222
-        pld             [r1, r2]
223
-        eor             r8,  r4,  r9
224
-        uhadd8          r4,  r4,  r9
225
-        eor             r12, r5,  r10
226
-        ldrd_reg        r6,  r7,  r0,  r2
227
-        uhadd8          r5,  r5,  r10
228
-        and             r8,  r8,  lr
229
-        ldr             r10, [r1, #4]
230
-        and             r12, r12, lr
231
-        uadd8           r4,  r4,  r8
232
-        ldr_post        r9,  r1,  r2
233
-        eor             r8,  r6,  r9
234
-        uadd8           r5,  r5,  r12
235
-        pld             [r1, r2,  lsl #1]
236
-        eor             r12, r7,  r10
237
-        uhadd8          r6,  r6,  r9
238
-        strd_post       r4,  r5,  r0,  r2
239
-        uhadd8          r7,  r7,  r10
240
-        beq             2f
241
-        and             r8,  r8,  lr
242
-        ldrd_reg        r4,  r5,  r0,  r2
243
-        uadd8           r6,  r6,  r8
244
-        ldr             r10, [r1, #4]
245
-        and             r12, r12, lr
246
-        subs            r3,  r3,  #2
247
-        uadd8           r7,  r7,  r12
248
-        ldr_post        r9,  r1,  r2
249
-        strd_post       r6,  r7,  r0,  r2
250
-        b               1b
251
-2:
252
-        and             r8,  r8,  lr
253
-        and             r12, r12, lr
254
-        uadd8           r6,  r6,  r8
255
-        uadd8           r7,  r7,  r12
256
-        strd_post       r6,  r7,  r0,  r2
257
-
258
-        pop             {r4-r10, pc}
259
-endfunc
260
-
261 23
 function ff_add_pixels_clamped_armv6, export=1
262 24
         push            {r4-r8,lr}
263 25
         mov             r3,  #8
... ...
@@ -30,24 +30,6 @@ void ff_simple_idct_arm(int16_t *data);
30 30
 static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
31 31
 static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
32 32
 
33
-void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
34
-void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
35
-void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
36
-void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
37
-
38
-void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
39
-void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
40
-void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
41
-
42
-void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
43
-
44
-CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,        8)
45
-CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,        8)
46
-CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,       8)
47
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm, 8)
48
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm, 8)
49
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
50
-
51 33
 void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
52 34
                                int line_size);
53 35
 
... ...
@@ -76,7 +58,6 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
76 76
 
77 77
 av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
78 78
 {
79
-    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
80 79
     int cpu_flags = av_get_cpu_flags();
81 80
 
82 81
     ff_put_pixels_clamped = c->put_pixels_clamped;
... ...
@@ -99,26 +80,6 @@ av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
99 99
 
100 100
     c->add_pixels_clamped = ff_add_pixels_clamped_arm;
101 101
 
102
-    if (!high_bit_depth) {
103
-    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
104
-    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
105
-    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
106
-    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
107
-    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
108
-    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
109
-    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
110
-    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
111
-
112
-    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
113
-    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
114
-    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
115
-    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
116
-    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
117
-    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
118
-    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
119
-    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
120
-    }
121
-
122 102
     if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx);
123 103
     if (have_armv6(cpu_flags))   ff_dsputil_init_armv6(c, avctx);
124 104
     if (have_neon(cpu_flags))    ff_dsputil_init_neon(c, avctx);
... ...
@@ -27,24 +27,6 @@ void ff_simple_idct_armv6(int16_t *data);
27 27
 void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
28 28
 void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
29 29
 
30
-void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
31
-void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
32
-void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
33
-
34
-void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
35
-void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
36
-
37
-void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
38
-
39
-void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
40
-void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
41
-void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
42
-
43
-void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
44
-void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
45
-
46
-void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
47
-
48 30
 void ff_add_pixels_clamped_armv6(const int16_t *block,
49 31
                                  uint8_t *restrict pixels,
50 32
                                  int line_size);
... ...
@@ -82,29 +64,6 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx)
82 82
         c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
83 83
     }
84 84
 
85
-    if (!high_bit_depth) {
86
-    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
87
-    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
88
-    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
89
-/*     c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
90
-    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
91
-    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
92
-    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
93
-/*     c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
94
-
95
-    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
96
-    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
97
-    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
98
-/*     c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
99
-    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
100
-    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
101
-    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
102
-/*     c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
103
-
104
-    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
105
-    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
106
-    }
107
-
108 85
     if (!high_bit_depth)
109 86
         c->get_pixels = ff_get_pixels_armv6;
110 87
     c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
... ...
@@ -32,33 +32,6 @@ void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
32 32
 void ff_clear_block_neon(int16_t *block);
33 33
 void ff_clear_blocks_neon(int16_t *blocks);
34 34
 
35
-void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
36
-void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
37
-void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
38
-void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
39
-void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
40
-void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
41
-void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
42
-void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
43
-void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
44
-void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
45
-void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
46
-void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
47
-void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
48
-void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
49
-
50
-void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
51
-void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
52
-void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
53
-void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
54
-void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
55
-void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
56
-void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
57
-void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
58
-void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
59
-void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
60
-void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
61
-
62 35
 void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
63 36
 void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
64 37
 void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
... ...
@@ -92,38 +65,6 @@ av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
92 92
     if (!high_bit_depth) {
93 93
         c->clear_block  = ff_clear_block_neon;
94 94
         c->clear_blocks = ff_clear_blocks_neon;
95
-
96
-        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
97
-        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
98
-        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
99
-        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
100
-        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
101
-        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
102
-        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
103
-        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
104
-
105
-        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
106
-        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
107
-        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
108
-        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
109
-        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
110
-        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
111
-        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
112
-        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
113
-
114
-        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
115
-        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
116
-        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
117
-        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
118
-        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
119
-        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
120
-        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
121
-        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
122
-
123
-        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
124
-        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
125
-        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
126
-        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
127 95
     }
128 96
 
129 97
     c->add_pixels_clamped = ff_add_pixels_clamped_neon;
... ...
@@ -37,394 +37,6 @@ function ff_clear_blocks_neon, export=1
37 37
         bx              lr
38 38
 endfunc
39 39
 
40
-.macro  pixels16        rnd=1, avg=0
41
-  .if \avg
42
-        mov             r12, r0
43
-  .endif
44
-1:      vld1.8          {q0},     [r1], r2
45
-        vld1.8          {q1},     [r1], r2
46
-        vld1.8          {q2},     [r1], r2
47
-        pld             [r1, r2, lsl #2]
48
-        vld1.8          {q3},     [r1], r2
49
-        pld             [r1]
50
-        pld             [r1, r2]
51
-        pld             [r1, r2, lsl #1]
52
-  .if \avg
53
-        vld1.8          {q8},     [r12,:128], r2
54
-        vrhadd.u8       q0,  q0,  q8
55
-        vld1.8          {q9},     [r12,:128], r2
56
-        vrhadd.u8       q1,  q1,  q9
57
-        vld1.8          {q10},    [r12,:128], r2
58
-        vrhadd.u8       q2,  q2,  q10
59
-        vld1.8          {q11},    [r12,:128], r2
60
-        vrhadd.u8       q3,  q3,  q11
61
-  .endif
62
-        subs            r3,  r3,  #4
63
-        vst1.64         {q0},     [r0,:128], r2
64
-        vst1.64         {q1},     [r0,:128], r2
65
-        vst1.64         {q2},     [r0,:128], r2
66
-        vst1.64         {q3},     [r0,:128], r2
67
-        bne             1b
68
-        bx              lr
69
-.endm
70
-
71
-.macro  pixels16_x2     rnd=1, avg=0
72
-1:      vld1.8          {d0-d2},  [r1], r2
73
-        vld1.8          {d4-d6},  [r1], r2
74
-        pld             [r1]
75
-        pld             [r1, r2]
76
-        subs            r3,  r3,  #2
77
-        vext.8          q1,  q0,  q1,  #1
78
-        avg             q0,  q0,  q1
79
-        vext.8          q3,  q2,  q3,  #1
80
-        avg             q2,  q2,  q3
81
-  .if \avg
82
-        vld1.8          {q1},     [r0,:128], r2
83
-        vld1.8          {q3},     [r0,:128]
84
-        vrhadd.u8       q0,  q0,  q1
85
-        vrhadd.u8       q2,  q2,  q3
86
-        sub             r0,  r0,  r2
87
-  .endif
88
-        vst1.8          {q0},     [r0,:128], r2
89
-        vst1.8          {q2},     [r0,:128], r2
90
-        bne             1b
91
-        bx              lr
92
-.endm
93
-
94
-.macro  pixels16_y2     rnd=1, avg=0
95
-        sub             r3,  r3,  #2
96
-        vld1.8          {q0},     [r1], r2
97
-        vld1.8          {q1},     [r1], r2
98
-1:      subs            r3,  r3,  #2
99
-        avg             q2,  q0,  q1
100
-        vld1.8          {q0},     [r1], r2
101
-        avg             q3,  q0,  q1
102
-        vld1.8          {q1},     [r1], r2
103
-        pld             [r1]
104
-        pld             [r1, r2]
105
-  .if \avg
106
-        vld1.8          {q8},     [r0,:128], r2
107
-        vld1.8          {q9},     [r0,:128]
108
-        vrhadd.u8       q2,  q2,  q8
109
-        vrhadd.u8       q3,  q3,  q9
110
-        sub             r0,  r0,  r2
111
-  .endif
112
-        vst1.8          {q2},     [r0,:128], r2
113
-        vst1.8          {q3},     [r0,:128], r2
114
-        bne             1b
115
-
116
-        avg             q2,  q0,  q1
117
-        vld1.8          {q0},     [r1], r2
118
-        avg             q3,  q0,  q1
119
-  .if \avg
120
-        vld1.8          {q8},     [r0,:128], r2
121
-        vld1.8          {q9},     [r0,:128]
122
-        vrhadd.u8       q2,  q2,  q8
123
-        vrhadd.u8       q3,  q3,  q9
124
-        sub             r0,  r0,  r2
125
-  .endif
126
-        vst1.8          {q2},     [r0,:128], r2
127
-        vst1.8          {q3},     [r0,:128], r2
128
-
129
-        bx              lr
130
-.endm
131
-
132
-.macro  pixels16_xy2    rnd=1, avg=0
133
-        sub             r3,  r3,  #2
134
-        vld1.8          {d0-d2},  [r1], r2
135
-        vld1.8          {d4-d6},  [r1], r2
136
-NRND    vmov.i16        q13, #1
137
-        pld             [r1]
138
-        pld             [r1, r2]
139
-        vext.8          q1,  q0,  q1,  #1
140
-        vext.8          q3,  q2,  q3,  #1
141
-        vaddl.u8        q8,  d0,  d2
142
-        vaddl.u8        q10, d1,  d3
143
-        vaddl.u8        q9,  d4,  d6
144
-        vaddl.u8        q11, d5,  d7
145
-1:      subs            r3,  r3,  #2
146
-        vld1.8          {d0-d2},  [r1], r2
147
-        vadd.u16        q12, q8,  q9
148
-        pld             [r1]
149
-NRND    vadd.u16        q12, q12, q13
150
-        vext.8          q15, q0,  q1,  #1
151
-        vadd.u16        q1 , q10, q11
152
-        shrn            d28, q12, #2
153
-NRND    vadd.u16        q1,  q1,  q13
154
-        shrn            d29, q1,  #2
155
-  .if \avg
156
-        vld1.8          {q8},     [r0,:128]
157
-        vrhadd.u8       q14, q14, q8
158
-  .endif
159
-        vaddl.u8        q8,  d0,  d30
160
-        vld1.8          {d2-d4},  [r1], r2
161
-        vaddl.u8        q10, d1,  d31
162
-        vst1.8          {q14},    [r0,:128], r2
163
-        vadd.u16        q12, q8,  q9
164
-        pld             [r1, r2]
165
-NRND    vadd.u16        q12, q12, q13
166
-        vext.8          q2,  q1,  q2,  #1
167
-        vadd.u16        q0,  q10, q11
168
-        shrn            d30, q12, #2
169
-NRND    vadd.u16        q0,  q0,  q13
170
-        shrn            d31, q0,  #2
171
-  .if \avg
172
-        vld1.8          {q9},     [r0,:128]
173
-        vrhadd.u8       q15, q15, q9
174
-  .endif
175
-        vaddl.u8        q9,  d2,  d4
176
-        vaddl.u8        q11, d3,  d5
177
-        vst1.8          {q15},    [r0,:128], r2
178
-        bgt             1b
179
-
180
-        vld1.8          {d0-d2},  [r1], r2
181
-        vadd.u16        q12, q8,  q9
182
-NRND    vadd.u16        q12, q12, q13
183
-        vext.8          q15, q0,  q1,  #1
184
-        vadd.u16        q1 , q10, q11
185
-        shrn            d28, q12, #2
186
-NRND    vadd.u16        q1,  q1,  q13
187
-        shrn            d29, q1,  #2
188
-  .if \avg
189
-        vld1.8          {q8},     [r0,:128]
190
-        vrhadd.u8       q14, q14, q8
191
-  .endif
192
-        vaddl.u8        q8,  d0,  d30
193
-        vaddl.u8        q10, d1,  d31
194
-        vst1.8          {q14},    [r0,:128], r2
195
-        vadd.u16        q12, q8,  q9
196
-NRND    vadd.u16        q12, q12, q13
197
-        vadd.u16        q0,  q10, q11
198
-        shrn            d30, q12, #2
199
-NRND    vadd.u16        q0,  q0,  q13
200
-        shrn            d31, q0,  #2
201
-  .if \avg
202
-        vld1.8          {q9},     [r0,:128]
203
-        vrhadd.u8       q15, q15, q9
204
-  .endif
205
-        vst1.8          {q15},    [r0,:128], r2
206
-
207
-        bx              lr
208
-.endm
209
-
210
-.macro  pixels8         rnd=1, avg=0
211
-1:      vld1.8          {d0},     [r1], r2
212
-        vld1.8          {d1},     [r1], r2
213
-        vld1.8          {d2},     [r1], r2
214
-        pld             [r1, r2, lsl #2]
215
-        vld1.8          {d3},     [r1], r2
216
-        pld             [r1]
217
-        pld             [r1, r2]
218
-        pld             [r1, r2, lsl #1]
219
-  .if \avg
220
-        vld1.8          {d4},     [r0,:64], r2
221
-        vrhadd.u8       d0,  d0,  d4
222
-        vld1.8          {d5},     [r0,:64], r2
223
-        vrhadd.u8       d1,  d1,  d5
224
-        vld1.8          {d6},     [r0,:64], r2
225
-        vrhadd.u8       d2,  d2,  d6
226
-        vld1.8          {d7},     [r0,:64], r2
227
-        vrhadd.u8       d3,  d3,  d7
228
-        sub             r0,  r0,  r2,  lsl #2
229
-  .endif
230
-        subs            r3,  r3,  #4
231
-        vst1.8          {d0},     [r0,:64], r2
232
-        vst1.8          {d1},     [r0,:64], r2
233
-        vst1.8          {d2},     [r0,:64], r2
234
-        vst1.8          {d3},     [r0,:64], r2
235
-        bne             1b
236
-        bx              lr
237
-.endm
238
-
239
-.macro  pixels8_x2      rnd=1, avg=0
240
-1:      vld1.8          {q0},     [r1], r2
241
-        vext.8          d1,  d0,  d1,  #1
242
-        vld1.8          {q1},     [r1], r2
243
-        vext.8          d3,  d2,  d3,  #1
244
-        pld             [r1]
245
-        pld             [r1, r2]
246
-        subs            r3,  r3,  #2
247
-        vswp            d1,  d2
248
-        avg             q0,  q0,  q1
249
-  .if \avg
250
-        vld1.8          {d4},     [r0,:64], r2
251
-        vld1.8          {d5},     [r0,:64]
252
-        vrhadd.u8       q0,  q0,  q2
253
-        sub             r0,  r0,  r2
254
-  .endif
255
-        vst1.8          {d0},     [r0,:64], r2
256
-        vst1.8          {d1},     [r0,:64], r2
257
-        bne             1b
258
-        bx              lr
259
-.endm
260
-
261
-.macro  pixels8_y2      rnd=1, avg=0
262
-        sub             r3,  r3,  #2
263
-        vld1.8          {d0},     [r1], r2
264
-        vld1.8          {d1},     [r1], r2
265
-1:      subs            r3,  r3,  #2
266
-        avg             d4,  d0,  d1
267
-        vld1.8          {d0},     [r1], r2
268
-        avg             d5,  d0,  d1
269
-        vld1.8          {d1},     [r1], r2
270
-        pld             [r1]
271
-        pld             [r1, r2]
272
-  .if \avg
273
-        vld1.8          {d2},     [r0,:64], r2
274
-        vld1.8          {d3},     [r0,:64]
275
-        vrhadd.u8       q2,  q2,  q1
276
-        sub             r0,  r0,  r2
277
-  .endif
278
-        vst1.8          {d4},     [r0,:64], r2
279
-        vst1.8          {d5},     [r0,:64], r2
280
-        bne             1b
281
-
282
-        avg             d4,  d0,  d1
283
-        vld1.8          {d0},     [r1], r2
284
-        avg             d5,  d0,  d1
285
-  .if \avg
286
-        vld1.8          {d2},     [r0,:64], r2
287
-        vld1.8          {d3},     [r0,:64]
288
-        vrhadd.u8       q2,  q2,  q1
289
-        sub             r0,  r0,  r2
290
-  .endif
291
-        vst1.8          {d4},     [r0,:64], r2
292
-        vst1.8          {d5},     [r0,:64], r2
293
-
294
-        bx              lr
295
-.endm
296
-
297
-.macro  pixels8_xy2     rnd=1, avg=0
298
-        sub             r3,  r3,  #2
299
-        vld1.8          {q0},     [r1], r2
300
-        vld1.8          {q1},     [r1], r2
301
-NRND    vmov.i16        q11, #1
302
-        pld             [r1]
303
-        pld             [r1, r2]
304
-        vext.8          d4,  d0,  d1,  #1
305
-        vext.8          d6,  d2,  d3,  #1
306
-        vaddl.u8        q8,  d0,  d4
307
-        vaddl.u8        q9,  d2,  d6
308
-1:      subs            r3,  r3,  #2
309
-        vld1.8          {q0},     [r1], r2
310
-        pld             [r1]
311
-        vadd.u16        q10, q8,  q9
312
-        vext.8          d4,  d0,  d1,  #1
313
-NRND    vadd.u16        q10, q10, q11
314
-        vaddl.u8        q8,  d0,  d4
315
-        shrn            d5,  q10, #2
316
-        vld1.8          {q1},     [r1], r2
317
-        vadd.u16        q10, q8,  q9
318
-        pld             [r1, r2]
319
-  .if \avg
320
-        vld1.8          {d7},     [r0,:64]
321
-        vrhadd.u8       d5,  d5,  d7
322
-  .endif
323
-NRND    vadd.u16        q10, q10, q11
324
-        vst1.8          {d5},     [r0,:64], r2
325
-        shrn            d7,  q10, #2
326
-  .if \avg
327
-        vld1.8          {d5},     [r0,:64]
328
-        vrhadd.u8       d7,  d7,  d5
329
-  .endif
330
-        vext.8          d6,  d2,  d3,  #1
331
-        vaddl.u8        q9,  d2,  d6
332
-        vst1.8          {d7},     [r0,:64], r2
333
-        bgt             1b
334
-
335
-        vld1.8          {q0},     [r1], r2
336
-        vadd.u16        q10, q8,  q9
337
-        vext.8          d4,  d0,  d1,  #1
338
-NRND    vadd.u16        q10, q10, q11
339
-        vaddl.u8        q8,  d0,  d4
340
-        shrn            d5,  q10, #2
341
-        vadd.u16        q10, q8,  q9
342
-  .if \avg
343
-        vld1.8          {d7},     [r0,:64]
344
-        vrhadd.u8       d5,  d5,  d7
345
-  .endif
346
-NRND    vadd.u16        q10, q10, q11
347
-        vst1.8          {d5},     [r0,:64], r2
348
-        shrn            d7,  q10, #2
349
-  .if \avg
350
-        vld1.8          {d5},     [r0,:64]
351
-        vrhadd.u8       d7,  d7,  d5
352
-  .endif
353
-        vst1.8          {d7},     [r0,:64], r2
354
-
355
-        bx              lr
356
-.endm
357
-
358
-.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
359
-  .if \rnd
360
-    .macro avg  rd, rn, rm
361
-        vrhadd.u8       \rd, \rn, \rm
362
-    .endm
363
-    .macro shrn rd, rn, rm
364
-        vrshrn.u16      \rd, \rn, \rm
365
-    .endm
366
-    .macro NRND insn:vararg
367
-    .endm
368
-  .else
369
-    .macro avg  rd, rn, rm
370
-        vhadd.u8        \rd, \rn, \rm
371
-    .endm
372
-    .macro shrn rd, rn, rm
373
-        vshrn.u16       \rd, \rn, \rm
374
-    .endm
375
-    .macro NRND insn:vararg
376
-        \insn
377
-    .endm
378
-  .endif
379
-function ff_\pfx\name\suf\()_neon, export=1
380
-        \name           \rnd, \avg
381
-endfunc
382
-        .purgem         avg
383
-        .purgem         shrn
384
-        .purgem         NRND
385
-.endm
386
-
387
-.macro  pixfunc2        pfx, name, avg=0
388
-        pixfunc         \pfx, \name,          rnd=1, avg=\avg
389
-        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
390
-.endm
391
-
392
-function ff_put_h264_qpel16_mc00_neon, export=1
393
-        mov             r3,  #16
394
-endfunc
395
-
396
-        pixfunc         put_, pixels16,     avg=0
397
-        pixfunc2        put_, pixels16_x2,  avg=0
398
-        pixfunc2        put_, pixels16_y2,  avg=0
399
-        pixfunc2        put_, pixels16_xy2, avg=0
400
-
401
-function ff_avg_h264_qpel16_mc00_neon, export=1
402
-        mov             r3,  #16
403
-endfunc
404
-
405
-        pixfunc         avg_, pixels16,     avg=1
406
-        pixfunc2        avg_, pixels16_x2,  avg=1
407
-        pixfunc2        avg_, pixels16_y2,  avg=1
408
-        pixfunc2        avg_, pixels16_xy2, avg=1
409
-
410
-function ff_put_h264_qpel8_mc00_neon, export=1
411
-        mov             r3,  #8
412
-endfunc
413
-
414
-        pixfunc         put_, pixels8,     avg=0
415
-        pixfunc2        put_, pixels8_x2,  avg=0
416
-        pixfunc2        put_, pixels8_y2,  avg=0
417
-        pixfunc2        put_, pixels8_xy2, avg=0
418
-
419
-function ff_avg_h264_qpel8_mc00_neon, export=1
420
-        mov             r3,  #8
421
-endfunc
422
-
423
-        pixfunc         avg_, pixels8,     avg=1
424
-        pixfunc         avg_, pixels8_x2,  avg=1
425
-        pixfunc         avg_, pixels8_y2,  avg=1
426
-        pixfunc         avg_, pixels8_xy2, avg=1
427
-
428 40
 function ff_put_pixels_clamped_neon, export=1
429 41
         vld1.16         {d16-d19}, [r0,:128]!
430 42
         vqmovun.s16     d0, q8
431 43
new file mode 100644
... ...
@@ -0,0 +1,611 @@
0
+@
1
+@ ARMv4 optimized DSP utils
2
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
3
+@
4
+@ This file is part of Libav.
5
+@
6
+@ Libav is free software; you can redistribute it and/or
7
+@ modify it under the terms of the GNU Lesser General Public
8
+@ License as published by the Free Software Foundation; either
9
+@ version 2.1 of the License, or (at your option) any later version.
10
+@
11
+@ Libav is distributed in the hope that it will be useful,
12
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+@ Lesser General Public License for more details.
15
+@
16
+@ You should have received a copy of the GNU Lesser General Public
17
+@ License along with Libav; if not, write to the Free Software
18
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+@
20
+
21
+#include "config.h"
22
+#include "libavutil/arm/asm.S"
23
+
24
+#if !HAVE_ARMV5TE_EXTERNAL
25
+#define pld @
26
+#endif
27
+
28
+.macro  ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
29
+        mov             \Rd0, \Rn0, lsr #(\shift * 8)
30
+        mov             \Rd1, \Rn1, lsr #(\shift * 8)
31
+        mov             \Rd2, \Rn2, lsr #(\shift * 8)
32
+        mov             \Rd3, \Rn3, lsr #(\shift * 8)
33
+        orr             \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
34
+        orr             \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
35
+        orr             \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
36
+        orr             \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
37
+.endm
38
+.macro  ALIGN_DWORD shift, R0, R1, R2
39
+        mov             \R0, \R0, lsr #(\shift * 8)
40
+        orr             \R0, \R0, \R1, lsl #(32 - \shift * 8)
41
+        mov             \R1, \R1, lsr #(\shift * 8)
42
+        orr             \R1, \R1, \R2, lsl #(32 - \shift * 8)
43
+.endm
44
+.macro  ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
45
+        mov             \Rdst0, \Rsrc0, lsr #(\shift * 8)
46
+        mov             \Rdst1, \Rsrc1, lsr #(\shift * 8)
47
+        orr             \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
48
+        orr             \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
49
+.endm
50
+
51
+.macro  RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
52
+        @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
53
+        @ Rmask = 0xFEFEFEFE
54
+        @ Rn = destroy
55
+        eor             \Rd0, \Rn0, \Rm0
56
+        eor             \Rd1, \Rn1, \Rm1
57
+        orr             \Rn0, \Rn0, \Rm0
58
+        orr             \Rn1, \Rn1, \Rm1
59
+        and             \Rd0, \Rd0, \Rmask
60
+        and             \Rd1, \Rd1, \Rmask
61
+        sub             \Rd0, \Rn0, \Rd0, lsr #1
62
+        sub             \Rd1, \Rn1, \Rd1, lsr #1
63
+.endm
64
+
65
+.macro  NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
66
+        @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
67
+        @ Rmask = 0xFEFEFEFE
68
+        @ Rn = destroy
69
+        eor             \Rd0, \Rn0, \Rm0
70
+        eor             \Rd1, \Rn1, \Rm1
71
+        and             \Rn0, \Rn0, \Rm0
72
+        and             \Rn1, \Rn1, \Rm1
73
+        and             \Rd0, \Rd0, \Rmask
74
+        and             \Rd1, \Rd1, \Rmask
75
+        add             \Rd0, \Rn0, \Rd0, lsr #1
76
+        add             \Rd1, \Rn1, \Rd1, lsr #1
77
+.endm
78
+
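Editorial aside: RND_AVG32 and NO_RND_AVG32 above are the usual SWAR byte-wise averages, computing a rounded respectively truncated (a + b) / 2 for four packed bytes per 32-bit word without carries leaking between bytes. A scalar C sketch of the same arithmetic (names here are illustrative; equivalent helpers are expected to live in libavcodec/rnd_avg.h, which the new init file further down includes):

    #include <stdint.h>

    /* Rounded packed-byte average: per byte, (a + b + 1) >> 1.
     * The 0xFEFEFEFE mask (~0x01010101) drops the bit that would otherwise
     * be shifted across a byte boundary. */
    static inline uint32_t rnd_avg32_sketch(uint32_t a, uint32_t b)
    {
        return (a | b) - (((a ^ b) & 0xFEFEFEFEu) >> 1);
    }

    /* Truncated packed-byte average: per byte, (a + b) >> 1. */
    static inline uint32_t no_rnd_avg32_sketch(uint32_t a, uint32_t b)
    {
        return (a & b) + (((a ^ b) & 0xFEFEFEFEu) >> 1);
    }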
79
+.macro  JMP_ALIGN tmp, reg
80
+        ands            \tmp, \reg, #3
81
+        bic             \reg, \reg, #3
82
+        beq             1f
83
+        subs            \tmp, \tmp, #1
84
+        beq             2f
85
+        subs            \tmp, \tmp, #1
86
+        beq             3f
87
+        b               4f
88
+.endm
89
+
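Editorial aside: JMP_ALIGN rounds the source pointer down to a word boundary and dispatches to one of four copies of each loop, one per byte offset; the ALIGN_DWORD/ALIGN_QWORD_D macros then rebuild every unaligned word from two aligned loads. A little-endian C sketch of that reassembly, purely illustrative and not part of the patch:

    #include <stdint.h>

    /* One unaligned 32-bit load built from two word-aligned loads shifted by
     * the pointer's byte offset -- the scalar model of JMP_ALIGN's dispatch
     * plus ALIGN_DWORD (little-endian assumed). */
    static inline uint32_t load32_unaligned_sketch(const uint8_t *p)
    {
        const uint32_t *w  = (const uint32_t *)((uintptr_t)p & ~(uintptr_t)3);
        unsigned        sh = ((uintptr_t)p & 3) * 8;

        return sh ? (w[0] >> sh) | (w[1] << (32 - sh)) : w[0];
    }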
90
+@ ----------------------------------------------------------------
91
+        .align 5
92
+function ff_put_pixels16_arm, export=1
93
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
94
+        @ block = word aligned, pixels = unaligned
95
+        pld             [r1]
96
+        push            {r4-r11, lr}
97
+        JMP_ALIGN       r5,  r1
98
+1:
99
+        ldm             r1,  {r4-r7}
100
+        add             r1,  r1,  r2
101
+        stm             r0,  {r4-r7}
102
+        pld             [r1]
103
+        subs            r3,  r3,  #1
104
+        add             r0,  r0,  r2
105
+        bne             1b
106
+        pop             {r4-r11, pc}
107
+        .align 5
108
+2:
109
+        ldm             r1,  {r4-r8}
110
+        add             r1,  r1,  r2
111
+        ALIGN_QWORD_D   1,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
112
+        pld             [r1]
113
+        subs            r3,  r3,  #1
114
+        stm             r0,  {r9-r12}
115
+        add             r0,  r0,  r2
116
+        bne             2b
117
+        pop             {r4-r11, pc}
118
+        .align 5
119
+3:
120
+        ldm             r1,  {r4-r8}
121
+        add             r1,  r1,  r2
122
+        ALIGN_QWORD_D   2,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
123
+        pld             [r1]
124
+        subs            r3,  r3,  #1
125
+        stm             r0,  {r9-r12}
126
+        add             r0,  r0,  r2
127
+        bne             3b
128
+        pop             {r4-r11, pc}
129
+        .align 5
130
+4:
131
+        ldm             r1,  {r4-r8}
132
+        add             r1,  r1,  r2
133
+        ALIGN_QWORD_D   3,   r9,  r10, r11, r12, r4,  r5,  r6,  r7,  r8
134
+        pld             [r1]
135
+        subs            r3,  r3,  #1
136
+        stm             r0,  {r9-r12}
137
+        add             r0,  r0,  r2
138
+        bne             4b
139
+        pop             {r4-r11,pc}
140
+endfunc
141
+
142
+@ ----------------------------------------------------------------
143
+        .align 5
144
+function ff_put_pixels8_arm, export=1
145
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
146
+        @ block = word aligned, pixels = unaligned
147
+        pld             [r1]
148
+        push            {r4-r5,lr}
149
+        JMP_ALIGN       r5,  r1
150
+1:
151
+        ldm             r1,  {r4-r5}
152
+        add             r1,  r1,  r2
153
+        subs            r3,  r3,  #1
154
+        pld             [r1]
155
+        stm             r0,  {r4-r5}
156
+        add             r0,  r0,  r2
157
+        bne             1b
158
+        pop             {r4-r5,pc}
159
+        .align 5
160
+2:
161
+        ldm             r1,  {r4-r5, r12}
162
+        add             r1,  r1,  r2
163
+        ALIGN_DWORD     1,   r4,  r5,  r12
164
+        pld             [r1]
165
+        subs            r3,  r3,  #1
166
+        stm             r0,  {r4-r5}
167
+        add             r0,  r0,  r2
168
+        bne             2b
169
+        pop             {r4-r5,pc}
170
+        .align 5
171
+3:
172
+        ldm             r1,  {r4-r5, r12}
173
+        add             r1,  r1,  r2
174
+        ALIGN_DWORD     2,   r4,  r5,  r12
175
+        pld             [r1]
176
+        subs            r3,  r3,  #1
177
+        stm             r0,  {r4-r5}
178
+        add             r0,  r0,  r2
179
+        bne             3b
180
+        pop             {r4-r5,pc}
181
+        .align 5
182
+4:
183
+        ldm             r1,  {r4-r5, r12}
184
+        add             r1,  r1,  r2
185
+        ALIGN_DWORD     3,   r4,  r5,  r12
186
+        pld             [r1]
187
+        subs            r3,  r3,  #1
188
+        stm             r0,  {r4-r5}
189
+        add             r0,  r0,  r2
190
+        bne             4b
191
+        pop             {r4-r5,pc}
192
+endfunc
193
+
194
+@ ----------------------------------------------------------------
195
+        .align 5
196
+function ff_put_pixels8_x2_arm, export=1
197
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
198
+        @ block = word aligned, pixels = unaligned
199
+        pld             [r1]
200
+        push            {r4-r10,lr}
201
+        ldr             r12, =0xfefefefe
202
+        JMP_ALIGN       r5,  r1
203
+1:
204
+        ldm             r1,  {r4-r5, r10}
205
+        add             r1,  r1,  r2
206
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
207
+        pld             [r1]
208
+        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
209
+        subs            r3,  r3,  #1
210
+        stm             r0,  {r8-r9}
211
+        add             r0,  r0,  r2
212
+        bne             1b
213
+        pop             {r4-r10,pc}
214
+        .align 5
215
+2:
216
+        ldm             r1,  {r4-r5, r10}
217
+        add             r1,  r1,  r2
218
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
219
+        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
220
+        pld             [r1]
221
+        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
222
+        subs            r3,  r3,  #1
223
+        stm             r0,  {r4-r5}
224
+        add             r0,  r0,  r2
225
+        bne             2b
226
+        pop             {r4-r10,pc}
227
+        .align 5
228
+3:
229
+        ldm             r1,  {r4-r5, r10}
230
+        add             r1,  r1,  r2
231
+        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
232
+        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
233
+        pld             [r1]
234
+        RND_AVG32       r4,  r5,  r6,  r7,  r8,  r9,  r12
235
+        subs            r3,  r3,  #1
236
+        stm             r0,  {r4-r5}
237
+        add             r0,  r0,  r2
238
+        bne             3b
239
+        pop             {r4-r10,pc}
240
+        .align 5
241
+4:
242
+        ldm             r1,  {r4-r5, r10}
243
+        add             r1,  r1,  r2
244
+        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
245
+        pld             [r1]
246
+        RND_AVG32       r8,  r9,  r6,  r7,  r5,  r10, r12
247
+        subs            r3,  r3,  #1
248
+        stm             r0,  {r8-r9}
249
+        add             r0,  r0,  r2
250
+        bne             4b
251
+        pop             {r4-r10,pc}
252
+endfunc
253
+
254
+        .align 5
255
+function ff_put_no_rnd_pixels8_x2_arm, export=1
256
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
257
+        @ block = word aligned, pixels = unaligned
258
+        pld             [r1]
259
+        push            {r4-r10,lr}
260
+        ldr             r12, =0xfefefefe
261
+        JMP_ALIGN       r5,  r1
262
+1:
263
+        ldm             r1,  {r4-r5, r10}
264
+        add             r1,  r1,  r2
265
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
266
+        pld             [r1]
267
+        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
268
+        subs            r3,  r3,  #1
269
+        stm             r0,  {r8-r9}
270
+        add             r0,  r0,  r2
271
+        bne             1b
272
+        pop             {r4-r10,pc}
273
+        .align 5
274
+2:
275
+        ldm             r1,  {r4-r5, r10}
276
+        add             r1,  r1,  r2
277
+        ALIGN_DWORD_D   1,   r6,  r7,  r4,  r5,  r10
278
+        ALIGN_DWORD_D   2,   r8,  r9,  r4,  r5,  r10
279
+        pld             [r1]
280
+        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
281
+        subs            r3,  r3,  #1
282
+        stm             r0,  {r4-r5}
283
+        add             r0,  r0,  r2
284
+        bne             2b
285
+        pop             {r4-r10,pc}
286
+        .align 5
287
+3:
288
+        ldm             r1,  {r4-r5, r10}
289
+        add             r1,  r1,  r2
290
+        ALIGN_DWORD_D   2,   r6,  r7,  r4,  r5,  r10
291
+        ALIGN_DWORD_D   3,   r8,  r9,  r4,  r5,  r10
292
+        pld             [r1]
293
+        NO_RND_AVG32    r4,  r5,  r6,  r7,  r8,  r9,  r12
294
+        subs            r3,  r3,  #1
295
+        stm             r0,  {r4-r5}
296
+        add             r0,  r0,  r2
297
+        bne             3b
298
+        pop             {r4-r10,pc}
299
+        .align 5
300
+4:
301
+        ldm             r1,  {r4-r5, r10}
302
+        add             r1,  r1,  r2
303
+        ALIGN_DWORD_D   3,   r6,  r7,  r4,  r5,  r10
304
+        pld             [r1]
305
+        NO_RND_AVG32    r8,  r9,  r6,  r7,  r5,  r10, r12
306
+        subs            r3,  r3,  #1
307
+        stm             r0,  {r8-r9}
308
+        add             r0,  r0,  r2
309
+        bne             4b
310
+        pop             {r4-r10,pc}
311
+endfunc
312
+
313
+
314
+@ ----------------------------------------------------------------
315
+        .align 5
316
+function ff_put_pixels8_y2_arm, export=1
317
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
318
+        @ block = word aligned, pixels = unaligned
319
+        pld             [r1]
320
+        push            {r4-r11,lr}
321
+        mov             r3,  r3,  lsr #1
322
+        ldr             r12, =0xfefefefe
323
+        JMP_ALIGN       r5,  r1
324
+1:
325
+        ldm             r1,  {r4-r5}
326
+        add             r1,  r1,  r2
327
+6:      ldm             r1,  {r6-r7}
328
+        add             r1,  r1,  r2
329
+        pld             [r1]
330
+        RND_AVG32       r8,  r9,  r4,  r5,  r6,  r7,  r12
331
+        ldm             r1,  {r4-r5}
332
+        add             r1,  r1,  r2
333
+        stm             r0,  {r8-r9}
334
+        add             r0,  r0,  r2
335
+        pld             [r1]
336
+        RND_AVG32       r8,  r9,  r6,  r7,  r4,  r5,  r12
337
+        subs            r3,  r3,  #1
338
+        stm             r0,  {r8-r9}
339
+        add             r0,  r0,  r2
340
+        bne             6b
341
+        pop             {r4-r11,pc}
342
+        .align 5
343
+2:
344
+        ldm             r1,  {r4-r6}
345
+        add             r1,  r1,  r2
346
+        pld             [r1]
347
+        ALIGN_DWORD     1,   r4,  r5,  r6
348
+6:      ldm             r1,  {r7-r9}
349
+        add             r1,  r1,  r2
350
+        pld             [r1]
351
+        ALIGN_DWORD     1,   r7,  r8,  r9
352
+        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
353
+        stm             r0,  {r10-r11}
354
+        add             r0,  r0,  r2
355
+        ldm             r1,  {r4-r6}
356
+        add             r1,  r1,  r2
357
+        pld             [r1]
358
+        ALIGN_DWORD     1,   r4,  r5,  r6
359
+        subs            r3,  r3,  #1
360
+        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
361
+        stm             r0,  {r10-r11}
362
+        add             r0,  r0,  r2
363
+        bne             6b
364
+        pop             {r4-r11,pc}
365
+        .align 5
366
+3:
367
+        ldm             r1,  {r4-r6}
368
+        add             r1,  r1,  r2
369
+        pld             [r1]
370
+        ALIGN_DWORD     2,   r4,  r5,  r6
371
+6:      ldm             r1,  {r7-r9}
372
+        add             r1,  r1,  r2
373
+        pld             [r1]
374
+        ALIGN_DWORD     2,   r7,  r8,  r9
375
+        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
376
+        stm             r0,  {r10-r11}
377
+        add             r0,  r0,  r2
378
+        ldm             r1,  {r4-r6}
379
+        add             r1,  r1,  r2
380
+        pld             [r1]
381
+        ALIGN_DWORD     2,   r4,  r5,  r6
382
+        subs            r3,  r3,  #1
383
+        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
384
+        stm             r0,  {r10-r11}
385
+        add             r0,  r0,  r2
386
+        bne             6b
387
+        pop             {r4-r11,pc}
388
+        .align 5
389
+4:
390
+        ldm             r1,  {r4-r6}
391
+        add             r1,  r1,  r2
392
+        pld             [r1]
393
+        ALIGN_DWORD     3,   r4,  r5,  r6
394
+6:      ldm             r1,  {r7-r9}
395
+        add             r1,  r1,  r2
396
+        pld             [r1]
397
+        ALIGN_DWORD     3,   r7,  r8,  r9
398
+        RND_AVG32       r10, r11, r4,  r5,  r7,  r8,  r12
399
+        stm             r0,  {r10-r11}
400
+        add             r0,  r0,  r2
401
+        ldm             r1,  {r4-r6}
402
+        add             r1,  r1,  r2
403
+        pld             [r1]
404
+        ALIGN_DWORD     3,   r4,  r5,  r6
405
+        subs            r3,  r3,  #1
406
+        RND_AVG32       r10, r11, r7,  r8,  r4,  r5,  r12
407
+        stm             r0,  {r10-r11}
408
+        add             r0,  r0,  r2
409
+        bne             6b
410
+        pop             {r4-r11,pc}
411
+endfunc
412
+
413
+        .align 5
414
+function ff_put_no_rnd_pixels8_y2_arm, export=1
415
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
416
+        @ block = word aligned, pixels = unaligned
417
+        pld             [r1]
418
+        push            {r4-r11,lr}
419
+        mov             r3,  r3,  lsr #1
420
+        ldr             r12, =0xfefefefe
421
+        JMP_ALIGN       r5,  r1
422
+1:
423
+        ldm             r1,  {r4-r5}
424
+        add             r1,  r1,  r2
425
+6:      ldm             r1,  {r6-r7}
426
+        add             r1,  r1,  r2
427
+        pld             [r1]
428
+        NO_RND_AVG32    r8,  r9,  r4,  r5,  r6,  r7,  r12
429
+        ldm             r1,  {r4-r5}
430
+        add             r1,  r1,  r2
431
+        stm             r0,  {r8-r9}
432
+        add             r0,  r0,  r2
433
+        pld             [r1]
434
+        NO_RND_AVG32    r8,  r9,  r6,  r7,  r4,  r5,  r12
435
+        subs            r3,  r3,  #1
436
+        stm             r0,  {r8-r9}
437
+        add             r0,  r0,  r2
438
+        bne             6b
439
+        pop             {r4-r11,pc}
440
+        .align 5
441
+2:
442
+        ldm             r1,  {r4-r6}
443
+        add             r1,  r1,  r2
444
+        pld             [r1]
445
+        ALIGN_DWORD     1,   r4,  r5,  r6
446
+6:      ldm             r1,  {r7-r9}
447
+        add             r1,  r1,  r2
448
+        pld             [r1]
449
+        ALIGN_DWORD     1,   r7,  r8,  r9
450
+        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
451
+        stm             r0,  {r10-r11}
452
+        add             r0,  r0,  r2
453
+        ldm             r1,  {r4-r6}
454
+        add             r1,  r1,  r2
455
+        pld             [r1]
456
+        ALIGN_DWORD     1,   r4,  r5,  r6
457
+        subs            r3,  r3,  #1
458
+        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
459
+        stm             r0,  {r10-r11}
460
+        add             r0,  r0,  r2
461
+        bne             6b
462
+        pop             {r4-r11,pc}
463
+        .align 5
464
+3:
465
+        ldm             r1,  {r4-r6}
466
+        add             r1,  r1,  r2
467
+        pld             [r1]
468
+        ALIGN_DWORD     2,   r4,  r5,  r6
469
+6:      ldm             r1,  {r7-r9}
470
+        add             r1,  r1,  r2
471
+        pld             [r1]
472
+        ALIGN_DWORD     2,   r7,  r8,  r9
473
+        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
474
+        stm             r0,  {r10-r11}
475
+        add             r0,  r0,  r2
476
+        ldm             r1,  {r4-r6}
477
+        add             r1,  r1,  r2
478
+        pld             [r1]
479
+        ALIGN_DWORD     2,   r4,  r5,  r6
480
+        subs            r3,  r3,  #1
481
+        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
482
+        stm             r0,  {r10-r11}
483
+        add             r0,  r0,  r2
484
+        bne             6b
485
+        pop             {r4-r11,pc}
486
+        .align 5
487
+4:
488
+        ldm             r1,  {r4-r6}
489
+        add             r1,  r1,  r2
490
+        pld             [r1]
491
+        ALIGN_DWORD     3,   r4,  r5,  r6
492
+6:      ldm             r1,  {r7-r9}
493
+        add             r1,  r1,  r2
494
+        pld             [r1]
495
+        ALIGN_DWORD     3,   r7,  r8,  r9
496
+        NO_RND_AVG32    r10, r11, r4,  r5,  r7,  r8,  r12
497
+        stm             r0,  {r10-r11}
498
+        add             r0,  r0,  r2
499
+        ldm             r1,  {r4-r6}
500
+        add             r1,  r1,  r2
501
+        pld             [r1]
502
+        ALIGN_DWORD     3,   r4,  r5,  r6
503
+        subs            r3,  r3,  #1
504
+        NO_RND_AVG32    r10, r11, r7,  r8,  r4,  r5,  r12
505
+        stm             r0,  {r10-r11}
506
+        add             r0,  r0,  r2
507
+        bne             6b
508
+        pop             {r4-r11,pc}
509
+endfunc
510
+
511
+        .ltorg
512
+
513
+@ ----------------------------------------------------------------
514
+.macro  RND_XY2_IT align, rnd
515
+        @ l1=  (a & 0x03030303) + (b & 0x03030303) (+ 0x02020202 when rounding)
516
+        @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
517
+.if \align == 0
518
+        ldm             r1,  {r6-r8}
519
+.elseif \align == 3
520
+        ldm             r1,  {r5-r7}
521
+.else
522
+        ldm             r1,  {r8-r10}
523
+.endif
524
+        add             r1,  r1,  r2
525
+        pld             [r1]
526
+.if \align == 0
527
+        ALIGN_DWORD_D   1,   r4,  r5,  r6,  r7,  r8
528
+.elseif \align == 1
529
+        ALIGN_DWORD_D   1,   r4,  r5,  r8,  r9,  r10
530
+        ALIGN_DWORD_D   2,   r6,  r7,  r8,  r9,  r10
531
+.elseif \align == 2
532
+        ALIGN_DWORD_D   2,   r4,  r5,  r8,  r9,  r10
533
+        ALIGN_DWORD_D   3,   r6,  r7,  r8,  r9,  r10
534
+.elseif \align == 3
535
+        ALIGN_DWORD_D   3,   r4,  r5,  r5,  r6,  r7
536
+.endif
537
+        ldr             r14, =0x03030303
538
+        tst             r3,  #1
539
+        and             r8,  r4,  r14
540
+        and             r9,  r5,  r14
541
+        and             r10, r6,  r14
542
+        and             r11, r7,  r14
543
+        it              eq
544
+        andeq           r14, r14, r14, \rnd #1
545
+        add             r8,  r8,  r10
546
+        add             r9,  r9,  r11
547
+        ldr             r12, =0xfcfcfcfc >> 2
548
+        itt             eq
549
+        addeq           r8,  r8,  r14
550
+        addeq           r9,  r9,  r14
551
+        and             r4,  r12, r4,  lsr #2
552
+        and             r5,  r12, r5,  lsr #2
553
+        and             r6,  r12, r6,  lsr #2
554
+        and             r7,  r12, r7,  lsr #2
555
+        add             r10, r4,  r6
556
+        add             r11, r5,  r7
557
+        subs            r3,  r3,  #1
558
+.endm
559
+
560
+.macro RND_XY2_EXPAND align, rnd
561
+        RND_XY2_IT      \align, \rnd
562
+6:      push            {r8-r11}
563
+        RND_XY2_IT      \align, \rnd
564
+        pop             {r4-r7}
565
+        add             r4,  r4,  r8
566
+        add             r5,  r5,  r9
567
+        ldr             r14, =0x0f0f0f0f
568
+        add             r6,  r6,  r10
569
+        add             r7,  r7,  r11
570
+        and             r4,  r14, r4,  lsr #2
571
+        and             r5,  r14, r5,  lsr #2
572
+        add             r4,  r4,  r6
573
+        add             r5,  r5,  r7
574
+        stm             r0,  {r4-r5}
575
+        add             r0,  r0,  r2
576
+        bge             6b
577
+        pop             {r4-r11,pc}
578
+.endm
579
+
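Editorial aside: RND_XY2_IT/RND_XY2_EXPAND compute, per output byte, the average of the four source pixels around a position that is half-pel in both directions. Splitting each byte into its low two bits (l1) and high six bits (h1), as the comment above describes, lets four bytes be summed per 32-bit word without lanes overflowing into each other. A scalar sketch of the per-byte value these functions are meant to produce, with the rounding bias as an explicit parameter (2 for the put variant, 1 for put_no_rnd):

    /* (a + b + c + d + bias) >> 2, evaluated the way the assembly does it:
     * high 6 bits and low 2 bits of each pixel summed separately. */
    static inline unsigned avg4_sketch(unsigned a, unsigned b, unsigned c,
                                       unsigned d, unsigned bias)
    {
        unsigned lo = (a & 3) + (b & 3) + (c & 3) + (d & 3) + bias;
        unsigned hi = (a >> 2) + (b >> 2) + (c >> 2) + (d >> 2);

        return hi + (lo >> 2);
    }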
580
+        .align 5
581
+function ff_put_pixels8_xy2_arm, export=1
582
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
583
+        @ block = word aligned, pixels = unaligned
584
+        pld             [r1]
585
+        push            {r4-r11,lr} @ R14 is also called LR
586
+        JMP_ALIGN       r5,  r1
587
+1:      RND_XY2_EXPAND  0, lsl
588
+        .align 5
589
+2:      RND_XY2_EXPAND  1, lsl
590
+        .align 5
591
+3:      RND_XY2_EXPAND  2, lsl
592
+        .align 5
593
+4:      RND_XY2_EXPAND  3, lsl
594
+endfunc
595
+
596
+        .align 5
597
+function ff_put_no_rnd_pixels8_xy2_arm, export=1
598
+        @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
599
+        @ block = word aligned, pixels = unaligned
600
+        pld             [r1]
601
+        push            {r4-r11,lr}
602
+        JMP_ALIGN       r5,  r1
603
+1:      RND_XY2_EXPAND  0, lsr
604
+        .align 5
605
+2:      RND_XY2_EXPAND  1, lsr
606
+        .align 5
607
+3:      RND_XY2_EXPAND  2, lsr
608
+        .align 5
609
+4:      RND_XY2_EXPAND  3, lsr
610
+endfunc
0 611
new file mode 100644
... ...
@@ -0,0 +1,27 @@
0
+/*
1
+ * This file is part of Libav.
2
+ *
3
+ * Libav is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * Libav is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with Libav; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#ifndef AVCODEC_ARM_HPELDSP_H
19
+#define AVCODEC_ARM_HPELDSP_H
20
+
21
+#include "libavcodec/hpeldsp.h"
22
+
23
+void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
24
+void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
25
+
26
+#endif /* AVCODEC_ARM_HPELDSP_H */
0 27
new file mode 100644
... ...
@@ -0,0 +1,259 @@
0
+/*
1
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
2
+ *
3
+ * This file is part of Libav.
4
+ *
5
+ * Libav is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * Libav is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with Libav; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include "libavutil/arm/asm.S"
21
+
22
+.macro  call_2x_pixels  type, subp
23
+function ff_\type\()_pixels16\subp\()_armv6, export=1
24
+        push            {r0-r3, lr}
25
+        bl              ff_\type\()_pixels8\subp\()_armv6
26
+        pop             {r0-r3, lr}
27
+        add             r0,  r0,  #8
28
+        add             r1,  r1,  #8
29
+        b               ff_\type\()_pixels8\subp\()_armv6
30
+endfunc
31
+.endm
32
+
33
+call_2x_pixels          avg
34
+call_2x_pixels          put, _x2
35
+call_2x_pixels          put, _y2
36
+call_2x_pixels          put, _x2_no_rnd
37
+call_2x_pixels          put, _y2_no_rnd
38
+
39
+function ff_put_pixels16_armv6, export=1
40
+        push            {r4-r11}
41
+1:
42
+        ldr             r5,  [r1, #4]
43
+        ldr             r6,  [r1, #8]
44
+        ldr             r7,  [r1, #12]
45
+        ldr_post        r4,  r1,  r2
46
+        strd            r6,  r7,  [r0, #8]
47
+        ldr             r9,  [r1, #4]
48
+        strd_post       r4,  r5,  r0,  r2
49
+        ldr             r10, [r1, #8]
50
+        ldr             r11, [r1, #12]
51
+        ldr_post        r8,  r1,  r2
52
+        strd            r10, r11, [r0, #8]
53
+        subs            r3,  r3,  #2
54
+        strd_post       r8,  r9,  r0,  r2
55
+        bne             1b
56
+
57
+        pop             {r4-r11}
58
+        bx              lr
59
+endfunc
60
+
61
+function ff_put_pixels8_armv6, export=1
62
+        push            {r4-r7}
63
+1:
64
+        ldr             r5,  [r1, #4]
65
+        ldr_post        r4,  r1,  r2
66
+        ldr             r7,  [r1, #4]
67
+        strd_post       r4,  r5,  r0,  r2
68
+        ldr_post        r6,  r1,  r2
69
+        subs            r3,  r3,  #2
70
+        strd_post       r6,  r7,  r0,  r2
71
+        bne             1b
72
+
73
+        pop             {r4-r7}
74
+        bx              lr
75
+endfunc
76
+
77
+function ff_put_pixels8_x2_armv6, export=1
78
+        push            {r4-r11, lr}
79
+        mov             r12, #1
80
+        orr             r12, r12, r12, lsl #8
81
+        orr             r12, r12, r12, lsl #16
82
+1:
83
+        ldr             r4,  [r1]
84
+        subs            r3,  r3,  #2
85
+        ldr             r5,  [r1, #4]
86
+        ldr             r7,  [r1, #5]
87
+        lsr             r6,  r4,  #8
88
+        ldr_pre         r8,  r1,  r2
89
+        orr             r6,  r6,  r5,  lsl #24
90
+        ldr             r9,  [r1, #4]
91
+        ldr             r11, [r1, #5]
92
+        lsr             r10, r8,  #8
93
+        add             r1,  r1,  r2
94
+        orr             r10, r10, r9,  lsl #24
95
+        eor             r14, r4,  r6
96
+        uhadd8          r4,  r4,  r6
97
+        eor             r6,  r5,  r7
98
+        uhadd8          r5,  r5,  r7
99
+        and             r14, r14, r12
100
+        and             r6,  r6,  r12
101
+        uadd8           r4,  r4,  r14
102
+        eor             r14, r8,  r10
103
+        uadd8           r5,  r5,  r6
104
+        eor             r6,  r9,  r11
105
+        uhadd8          r8,  r8,  r10
106
+        and             r14, r14, r12
107
+        uhadd8          r9,  r9,  r11
108
+        and             r6,  r6,  r12
109
+        uadd8           r8,  r8,  r14
110
+        strd_post       r4,  r5,  r0,  r2
111
+        uadd8           r9,  r9,  r6
112
+        strd_post       r8,  r9,  r0,  r2
113
+        bne             1b
114
+
115
+        pop             {r4-r11, pc}
116
+endfunc
117
+
118
+function ff_put_pixels8_y2_armv6, export=1
119
+        push            {r4-r11}
120
+        mov             r12, #1
121
+        orr             r12, r12, r12, lsl #8
122
+        orr             r12, r12, r12, lsl #16
123
+        ldr             r4,  [r1]
124
+        ldr             r5,  [r1, #4]
125
+        ldr_pre         r6,  r1,  r2
126
+        ldr             r7,  [r1, #4]
127
+1:
128
+        subs            r3,  r3,  #2
129
+        uhadd8          r8,  r4,  r6
130
+        eor             r10, r4,  r6
131
+        uhadd8          r9,  r5,  r7
132
+        eor             r11, r5,  r7
133
+        and             r10, r10, r12
134
+        ldr_pre         r4,  r1,  r2
135
+        uadd8           r8,  r8,  r10
136
+        and             r11, r11, r12
137
+        uadd8           r9,  r9,  r11
138
+        ldr             r5,  [r1, #4]
139
+        uhadd8          r10, r4,  r6
140
+        eor             r6,  r4,  r6
141
+        uhadd8          r11, r5,  r7
142
+        and             r6,  r6,  r12
143
+        eor             r7,  r5,  r7
144
+        uadd8           r10, r10, r6
145
+        and             r7,  r7,  r12
146
+        ldr_pre         r6,  r1,  r2
147
+        uadd8           r11, r11, r7
148
+        strd_post       r8,  r9,  r0,  r2
149
+        ldr             r7,  [r1, #4]
150
+        strd_post       r10, r11, r0,  r2
151
+        bne             1b
152
+
153
+        pop             {r4-r11}
154
+        bx              lr
155
+endfunc
156
+
157
+function ff_put_pixels8_x2_no_rnd_armv6, export=1
158
+        push            {r4-r9, lr}
159
+1:
160
+        subs            r3,  r3,  #2
161
+        ldr             r4,  [r1]
162
+        ldr             r5,  [r1, #4]
163
+        ldr             r7,  [r1, #5]
164
+        ldr_pre         r8,  r1,  r2
165
+        ldr             r9,  [r1, #4]
166
+        ldr             r14, [r1, #5]
167
+        add             r1,  r1,  r2
168
+        lsr             r6,  r4,  #8
169
+        orr             r6,  r6,  r5,  lsl #24
170
+        lsr             r12, r8,  #8
171
+        orr             r12, r12, r9,  lsl #24
172
+        uhadd8          r4,  r4,  r6
173
+        uhadd8          r5,  r5,  r7
174
+        uhadd8          r8,  r8,  r12
175
+        uhadd8          r9,  r9,  r14
176
+        stm             r0,  {r4,r5}
177
+        add             r0,  r0,  r2
178
+        stm             r0,  {r8,r9}
179
+        add             r0,  r0,  r2
180
+        bne             1b
181
+
182
+        pop             {r4-r9, pc}
183
+endfunc
184
+
185
+function ff_put_pixels8_y2_no_rnd_armv6, export=1
186
+        push            {r4-r9, lr}
187
+        ldr             r4,  [r1]
188
+        ldr             r5,  [r1, #4]
189
+        ldr_pre         r6,  r1,  r2
190
+        ldr             r7,  [r1, #4]
191
+1:
192
+        subs            r3,  r3,  #2
193
+        uhadd8          r8,  r4,  r6
194
+        ldr_pre         r4,  r1,  r2
195
+        uhadd8          r9,  r5,  r7
196
+        ldr             r5,  [r1, #4]
197
+        uhadd8          r12, r4,  r6
198
+        ldr_pre         r6,  r1,  r2
199
+        uhadd8          r14, r5,  r7
200
+        ldr             r7,  [r1, #4]
201
+        stm             r0,  {r8,r9}
202
+        add             r0,  r0,  r2
203
+        stm             r0,  {r12,r14}
204
+        add             r0,  r0,  r2
205
+        bne             1b
206
+
207
+        pop             {r4-r9, pc}
208
+endfunc
209
+
210
+function ff_avg_pixels8_armv6, export=1
211
+        pld             [r1, r2]
212
+        push            {r4-r10, lr}
213
+        mov             lr,  #1
214
+        orr             lr,  lr,  lr,  lsl #8
215
+        orr             lr,  lr,  lr,  lsl #16
216
+        ldrd            r4,  r5,  [r0]
217
+        ldr             r10, [r1, #4]
218
+        ldr_post        r9,  r1,  r2
219
+        subs            r3,  r3,  #2
220
+1:
221
+        pld             [r1, r2]
222
+        eor             r8,  r4,  r9
223
+        uhadd8          r4,  r4,  r9
224
+        eor             r12, r5,  r10
225
+        ldrd_reg        r6,  r7,  r0,  r2
226
+        uhadd8          r5,  r5,  r10
227
+        and             r8,  r8,  lr
228
+        ldr             r10, [r1, #4]
229
+        and             r12, r12, lr
230
+        uadd8           r4,  r4,  r8
231
+        ldr_post        r9,  r1,  r2
232
+        eor             r8,  r6,  r9
233
+        uadd8           r5,  r5,  r12
234
+        pld             [r1, r2,  lsl #1]
235
+        eor             r12, r7,  r10
236
+        uhadd8          r6,  r6,  r9
237
+        strd_post       r4,  r5,  r0,  r2
238
+        uhadd8          r7,  r7,  r10
239
+        beq             2f
240
+        and             r8,  r8,  lr
241
+        ldrd_reg        r4,  r5,  r0,  r2
242
+        uadd8           r6,  r6,  r8
243
+        ldr             r10, [r1, #4]
244
+        and             r12, r12, lr
245
+        subs            r3,  r3,  #2
246
+        uadd8           r7,  r7,  r12
247
+        ldr_post        r9,  r1,  r2
248
+        strd_post       r6,  r7,  r0,  r2
249
+        b               1b
250
+2:
251
+        and             r8,  r8,  lr
252
+        and             r12, r12, lr
253
+        uadd8           r6,  r6,  r8
254
+        uadd8           r7,  r7,  r12
255
+        strd_post       r6,  r7,  r0,  r2
256
+
257
+        pop             {r4-r10, pc}
258
+endfunc
0 259
new file mode 100644
... ...
@@ -0,0 +1,71 @@
0
+/*
1
+ * ARM optimized DSP utils
2
+ * Copyright (c) 2001 Lionel Ulmer
3
+ *
4
+ * This file is part of Libav.
5
+ *
6
+ * Libav is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * Libav is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with Libav; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#include "libavutil/arm/cpu.h"
22
+#include "libavutil/attributes.h"
23
+#include "libavcodec/rnd_avg.h"
24
+#include "hpeldsp_arm.h"
25
+
26
+void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
27
+void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
28
+void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
29
+void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
30
+
31
+void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
32
+void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
33
+void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
34
+
35
+void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
36
+
37
+CALL_2X_PIXELS(ff_put_pixels16_x2_arm,         ff_put_pixels8_x2_arm,        8)
38
+CALL_2X_PIXELS(ff_put_pixels16_y2_arm,         ff_put_pixels8_y2_arm,        8)
39
+CALL_2X_PIXELS(ff_put_pixels16_xy2_arm,        ff_put_pixels8_xy2_arm,       8)
40
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm,  ff_put_no_rnd_pixels8_x2_arm, 8)
41
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm,  ff_put_no_rnd_pixels8_y2_arm, 8)
42
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8)
43
+
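Editorial aside: CALL_2X_PIXELS (provided by the common libavcodec code, not defined in this patch) builds each 16-pixel-wide function out of two calls to the 8-pixel-wide one, offset by 8 bytes. Roughly what the first wrapper above amounts to, as a sketch rather than the literal macro body:

    /* Sketch of CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8);
     * types and the ff_put_pixels8_x2_arm prototype are those declared above. */
    static void put_pixels16_x2_sketch(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
    {
        ff_put_pixels8_x2_arm(block,     pixels,     line_size, h);
        ff_put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
    }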
44
+av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
45
+{
46
+    int cpu_flags = av_get_cpu_flags();
47
+
48
+    c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
49
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
50
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
51
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
52
+    c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
53
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
54
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
55
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
56
+
57
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
58
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
59
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
60
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
61
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
62
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
63
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
64
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
65
+
66
+    if (have_armv6(cpu_flags))
67
+        ff_hpeldsp_init_armv6(c, flags);
68
+    if (have_neon(cpu_flags))
69
+        ff_hpeldsp_init_neon(c, flags);
70
+}
0 71
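Editorial aside: the tables filled in above follow the usual hpeldsp layout: the first index selects the block width (0 for 16-pixel rows, 1 for 8-pixel rows), the second the half-pel position (0 = integer, 1 = horizontal, 2 = vertical, 3 = both). A hypothetical caller, shown only to illustrate that indexing convention and not taken from this patch:

    /* Hypothetical helper: pick the copy routine for an 8-pixel-wide block
     * from the half-pel bits of a motion vector (mx, my in half-pel units). */
    static void copy_block8_sketch(HpelDSPContext *c, uint8_t *dst,
                                   const uint8_t *src, ptrdiff_t stride,
                                   int h, int mx, int my)
    {
        int dxy = (mx & 1) | ((my & 1) << 1);

        c->put_pixels_tab[1][dxy](dst, src + (mx >> 1) + (my >> 1) * stride,
                                  stride, h);
    }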
new file mode 100644
... ...
@@ -0,0 +1,67 @@
0
+/*
1
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
2
+ *
3
+ * This file is part of Libav.
4
+ *
5
+ * Libav is free software; you can redistribute it and/or
6
+ * modify it under the terms of the GNU Lesser General Public
7
+ * License as published by the Free Software Foundation; either
8
+ * version 2.1 of the License, or (at your option) any later version.
9
+ *
10
+ * Libav is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public
16
+ * License along with Libav; if not, write to the Free Software
17
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
+#include <stddef.h>
21
+#include <stdint.h>
22
+
23
+#include "libavutil/attributes.h"
24
+#include "hpeldsp_arm.h"
25
+
26
+void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
27
+void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
28
+void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
29
+
30
+void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
31
+void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
32
+
33
+void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
34
+
35
+void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
36
+void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
37
+void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
38
+
39
+void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
40
+void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
41
+
42
+void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
43
+
44
+av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
45
+{
46
+    c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
47
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
48
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
49
+/*     c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
50
+    c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
51
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
52
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
53
+/*     c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
54
+
55
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
56
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
57
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
58
+/*     c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
59
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
60
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
61
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
62
+/*     c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
63
+
64
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
65
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
66
+}
0 67
new file mode 100644
... ...
@@ -0,0 +1,88 @@
0
+/*
1
+ * ARM NEON optimised DSP functions
2
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
+ *
4
+ * This file is part of Libav.
5
+ *
6
+ * Libav is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * Libav is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with Libav; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#include <stddef.h>
22
+#include <stdint.h>
23
+
24
+#include "libavutil/attributes.h"
25
+#include "hpeldsp_arm.h"
26
+
27
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
28
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
29
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
30
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
31
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
32
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
33
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
34
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
35
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
36
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
37
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
38
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
39
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
40
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
41
+
42
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
43
+void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
44
+void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
45
+void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
46
+void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
47
+void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
48
+void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
49
+void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
50
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
51
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
52
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
53
+
54
+av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
55
+{
56
+    c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
57
+    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
58
+    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
59
+    c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
60
+    c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
61
+    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
62
+    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
63
+    c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
64
+
65
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
66
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
67
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
68
+    c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
69
+    c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
70
+    c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
71
+    c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
72
+    c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
73
+
74
+    c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
75
+    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
76
+    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
77
+    c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
78
+    c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
79
+    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
80
+    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
81
+    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
82
+
83
+    c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
84
+    c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
85
+    c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
86
+    c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
87
+}
0 88
new file mode 100644
... ...
@@ -0,0 +1,410 @@
0
+/*
1
+ * ARM NEON optimised DSP functions
2
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
+ *
4
+ * This file is part of Libav.
5
+ *
6
+ * Libav is free software; you can redistribute it and/or
7
+ * modify it under the terms of the GNU Lesser General Public
8
+ * License as published by the Free Software Foundation; either
9
+ * version 2.1 of the License, or (at your option) any later version.
10
+ *
11
+ * Libav is distributed in the hope that it will be useful,
12
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
+ * Lesser General Public License for more details.
15
+ *
16
+ * You should have received a copy of the GNU Lesser General Public
17
+ * License along with Libav; if not, write to the Free Software
18
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ */
20
+
21
+#include "libavutil/arm/asm.S"
22
+
23
+.macro  pixels16        rnd=1, avg=0
24
+  .if \avg
25
+        mov             r12, r0
26
+  .endif
27
+1:      vld1.8          {q0},     [r1], r2
28
+        vld1.8          {q1},     [r1], r2
29
+        vld1.8          {q2},     [r1], r2
30
+        pld             [r1, r2, lsl #2]
31
+        vld1.8          {q3},     [r1], r2
32
+        pld             [r1]
33
+        pld             [r1, r2]
34
+        pld             [r1, r2, lsl #1]
35
+  .if \avg
36
+        vld1.8          {q8},     [r12,:128], r2
37
+        vrhadd.u8       q0,  q0,  q8
38
+        vld1.8          {q9},     [r12,:128], r2
39
+        vrhadd.u8       q1,  q1,  q9
40
+        vld1.8          {q10},    [r12,:128], r2
41
+        vrhadd.u8       q2,  q2,  q10
42
+        vld1.8          {q11},    [r12,:128], r2
43
+        vrhadd.u8       q3,  q3,  q11
44
+  .endif
45
+        subs            r3,  r3,  #4
46
+        vst1.64         {q0},     [r0,:128], r2
47
+        vst1.64         {q1},     [r0,:128], r2
48
+        vst1.64         {q2},     [r0,:128], r2
49
+        vst1.64         {q3},     [r0,:128], r2
50
+        bne             1b
51
+        bx              lr
52
+.endm
53
+
54
+.macro  pixels16_x2     rnd=1, avg=0
55
+1:      vld1.8          {d0-d2},  [r1], r2
56
+        vld1.8          {d4-d6},  [r1], r2
57
+        pld             [r1]
58
+        pld             [r1, r2]
59
+        subs            r3,  r3,  #2
60
+        vext.8          q1,  q0,  q1,  #1
61
+        avg             q0,  q0,  q1
62
+        vext.8          q3,  q2,  q3,  #1
63
+        avg             q2,  q2,  q3
64
+  .if \avg
65
+        vld1.8          {q1},     [r0,:128], r2
66
+        vld1.8          {q3},     [r0,:128]
67
+        vrhadd.u8       q0,  q0,  q1
68
+        vrhadd.u8       q2,  q2,  q3
69
+        sub             r0,  r0,  r2
70
+  .endif
71
+        vst1.8          {q0},     [r0,:128], r2
72
+        vst1.8          {q2},     [r0,:128], r2
73
+        bne             1b
74
+        bx              lr
75
+.endm
76
+
77
+.macro  pixels16_y2     rnd=1, avg=0
78
+        sub             r3,  r3,  #2
79
+        vld1.8          {q0},     [r1], r2
80
+        vld1.8          {q1},     [r1], r2
81
+1:      subs            r3,  r3,  #2
82
+        avg             q2,  q0,  q1
83
+        vld1.8          {q0},     [r1], r2
84
+        avg             q3,  q0,  q1
85
+        vld1.8          {q1},     [r1], r2
86
+        pld             [r1]
87
+        pld             [r1, r2]
88
+  .if \avg
89
+        vld1.8          {q8},     [r0,:128], r2
90
+        vld1.8          {q9},     [r0,:128]
91
+        vrhadd.u8       q2,  q2,  q8
92
+        vrhadd.u8       q3,  q3,  q9
93
+        sub             r0,  r0,  r2
94
+  .endif
95
+        vst1.8          {q2},     [r0,:128], r2
96
+        vst1.8          {q3},     [r0,:128], r2
97
+        bne             1b
98
+
99
+        avg             q2,  q0,  q1
100
+        vld1.8          {q0},     [r1], r2
101
+        avg             q3,  q0,  q1
102
+  .if \avg
103
+        vld1.8          {q8},     [r0,:128], r2
104
+        vld1.8          {q9},     [r0,:128]
105
+        vrhadd.u8       q2,  q2,  q8
106
+        vrhadd.u8       q3,  q3,  q9
107
+        sub             r0,  r0,  r2
108
+  .endif
109
+        vst1.8          {q2},     [r0,:128], r2
110
+        vst1.8          {q3},     [r0,:128], r2
111
+
112
+        bx              lr
113
+.endm
114
+
115
+.macro  pixels16_xy2    rnd=1, avg=0
+        sub             r3,  r3,  #2
+        vld1.8          {d0-d2},  [r1], r2
+        vld1.8          {d4-d6},  [r1], r2
+NRND    vmov.i16        q13, #1
+        pld             [r1]
+        pld             [r1, r2]
+        vext.8          q1,  q0,  q1,  #1
+        vext.8          q3,  q2,  q3,  #1
+        vaddl.u8        q8,  d0,  d2
+        vaddl.u8        q10, d1,  d3
+        vaddl.u8        q9,  d4,  d6
+        vaddl.u8        q11, d5,  d7
+1:      subs            r3,  r3,  #2
+        vld1.8          {d0-d2},  [r1], r2
+        vadd.u16        q12, q8,  q9
+        pld             [r1]
+NRND    vadd.u16        q12, q12, q13
+        vext.8          q15, q0,  q1,  #1
+        vadd.u16        q1 , q10, q11
+        shrn            d28, q12, #2
+NRND    vadd.u16        q1,  q1,  q13
+        shrn            d29, q1,  #2
+  .if \avg
+        vld1.8          {q8},     [r0,:128]
+        vrhadd.u8       q14, q14, q8
+  .endif
+        vaddl.u8        q8,  d0,  d30
+        vld1.8          {d2-d4},  [r1], r2
+        vaddl.u8        q10, d1,  d31
+        vst1.8          {q14},    [r0,:128], r2
+        vadd.u16        q12, q8,  q9
+        pld             [r1, r2]
+NRND    vadd.u16        q12, q12, q13
+        vext.8          q2,  q1,  q2,  #1
+        vadd.u16        q0,  q10, q11
+        shrn            d30, q12, #2
+NRND    vadd.u16        q0,  q0,  q13
+        shrn            d31, q0,  #2
+  .if \avg
+        vld1.8          {q9},     [r0,:128]
+        vrhadd.u8       q15, q15, q9
+  .endif
+        vaddl.u8        q9,  d2,  d4
+        vaddl.u8        q11, d3,  d5
+        vst1.8          {q15},    [r0,:128], r2
+        bgt             1b
+
+        vld1.8          {d0-d2},  [r1], r2
+        vadd.u16        q12, q8,  q9
+NRND    vadd.u16        q12, q12, q13
+        vext.8          q15, q0,  q1,  #1
+        vadd.u16        q1 , q10, q11
+        shrn            d28, q12, #2
+NRND    vadd.u16        q1,  q1,  q13
+        shrn            d29, q1,  #2
+  .if \avg
+        vld1.8          {q8},     [r0,:128]
+        vrhadd.u8       q14, q14, q8
+  .endif
+        vaddl.u8        q8,  d0,  d30
+        vaddl.u8        q10, d1,  d31
+        vst1.8          {q14},    [r0,:128], r2
+        vadd.u16        q12, q8,  q9
+NRND    vadd.u16        q12, q12, q13
+        vadd.u16        q0,  q10, q11
+        shrn            d30, q12, #2
+NRND    vadd.u16        q0,  q0,  q13
+        shrn            d31, q0,  #2
+  .if \avg
+        vld1.8          {q9},     [r0,:128]
+        vrhadd.u8       q15, q15, q9
+  .endif
+        vst1.8          {q15},    [r0,:128], r2
+
+        bx              lr
+.endm
+
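The xy2 case averages a 2x2 source neighbourhood. In the rounding variant the vrshrn supplies the +2 bias implicitly; in the no-rnd variant the NRND-guarded adds of q13 (a constant 1) combined with the truncating vshrn give a +1 bias instead. As a C model (illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative C model of pixels16_xy2: 2x2 neighbourhood average with a
     * bias of +2 (rnd=1, vrshrn) or +1 (rnd=0, NRND adds + vshrn). */
    static void pixels16_xy2_c(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h, int rnd, int avg)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++) {
                int v = (pixels[x]             + pixels[x + 1] +
                         pixels[x + line_size] + pixels[x + line_size + 1] +
                         (rnd ? 2 : 1)) >> 2;
                block[x] = avg ? (block[x] + v + 1) >> 1 : v;
            }
            pixels += line_size;
            block  += line_size;
        }
    }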
193
+.macro  pixels8         rnd=1, avg=0
+1:      vld1.8          {d0},     [r1], r2
+        vld1.8          {d1},     [r1], r2
+        vld1.8          {d2},     [r1], r2
+        pld             [r1, r2, lsl #2]
+        vld1.8          {d3},     [r1], r2
+        pld             [r1]
+        pld             [r1, r2]
+        pld             [r1, r2, lsl #1]
+  .if \avg
+        vld1.8          {d4},     [r0,:64], r2
+        vrhadd.u8       d0,  d0,  d4
+        vld1.8          {d5},     [r0,:64], r2
+        vrhadd.u8       d1,  d1,  d5
+        vld1.8          {d6},     [r0,:64], r2
+        vrhadd.u8       d2,  d2,  d6
+        vld1.8          {d7},     [r0,:64], r2
+        vrhadd.u8       d3,  d3,  d7
+        sub             r0,  r0,  r2,  lsl #2
+  .endif
+        subs            r3,  r3,  #4
+        vst1.8          {d0},     [r0,:64], r2
+        vst1.8          {d1},     [r0,:64], r2
+        vst1.8          {d2},     [r0,:64], r2
+        vst1.8          {d3},     [r0,:64], r2
+        bne             1b
+        bx              lr
+.endm
+
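pixels8 is the plain copy case (or, with avg=1, an average with the existing destination); the rnd parameter is accepted only for interface uniformity and has no effect here. The assembly unrolls four rows per iteration, hence "subs r3, r3, #4". A C model (illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative C model of pixels8: straight 8-byte row copy, or a rounded
     * average with the bytes already in the destination when avg=1. */
    static void pixels8_c(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h, int avg)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = avg ? (block[x] + pixels[x] + 1) >> 1 : pixels[x];
            pixels += line_size;
            block  += line_size;
        }
    }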
222
+.macro  pixels8_x2      rnd=1, avg=0
+1:      vld1.8          {q0},     [r1], r2
+        vext.8          d1,  d0,  d1,  #1
+        vld1.8          {q1},     [r1], r2
+        vext.8          d3,  d2,  d3,  #1
+        pld             [r1]
+        pld             [r1, r2]
+        subs            r3,  r3,  #2
+        vswp            d1,  d2
+        avg             q0,  q0,  q1
+  .if \avg
+        vld1.8          {d4},     [r0,:64], r2
+        vld1.8          {d5},     [r0,:64]
+        vrhadd.u8       q0,  q0,  q2
+        sub             r0,  r0,  r2
+  .endif
+        vst1.8          {d0},     [r0,:64], r2
+        vst1.8          {d1},     [r0,:64], r2
+        bne             1b
+        bx              lr
+.endm
+
+.macro  pixels8_y2      rnd=1, avg=0
+        sub             r3,  r3,  #2
+        vld1.8          {d0},     [r1], r2
+        vld1.8          {d1},     [r1], r2
+1:      subs            r3,  r3,  #2
+        avg             d4,  d0,  d1
+        vld1.8          {d0},     [r1], r2
+        avg             d5,  d0,  d1
+        vld1.8          {d1},     [r1], r2
+        pld             [r1]
+        pld             [r1, r2]
+  .if \avg
+        vld1.8          {d2},     [r0,:64], r2
+        vld1.8          {d3},     [r0,:64]
+        vrhadd.u8       q2,  q2,  q1
+        sub             r0,  r0,  r2
+  .endif
+        vst1.8          {d4},     [r0,:64], r2
+        vst1.8          {d5},     [r0,:64], r2
+        bne             1b
+
+        avg             d4,  d0,  d1
+        vld1.8          {d0},     [r1], r2
+        avg             d5,  d0,  d1
+  .if \avg
+        vld1.8          {d2},     [r0,:64], r2
+        vld1.8          {d3},     [r0,:64]
+        vrhadd.u8       q2,  q2,  q1
+        sub             r0,  r0,  r2
+  .endif
+        vst1.8          {d4},     [r0,:64], r2
+        vst1.8          {d5},     [r0,:64], r2
+
+        bx              lr
+.endm
+
+.macro  pixels8_xy2     rnd=1, avg=0
+        sub             r3,  r3,  #2
+        vld1.8          {q0},     [r1], r2
+        vld1.8          {q1},     [r1], r2
+NRND    vmov.i16        q11, #1
+        pld             [r1]
+        pld             [r1, r2]
+        vext.8          d4,  d0,  d1,  #1
+        vext.8          d6,  d2,  d3,  #1
+        vaddl.u8        q8,  d0,  d4
+        vaddl.u8        q9,  d2,  d6
+1:      subs            r3,  r3,  #2
+        vld1.8          {q0},     [r1], r2
+        pld             [r1]
+        vadd.u16        q10, q8,  q9
+        vext.8          d4,  d0,  d1,  #1
+NRND    vadd.u16        q10, q10, q11
+        vaddl.u8        q8,  d0,  d4
+        shrn            d5,  q10, #2
+        vld1.8          {q1},     [r1], r2
+        vadd.u16        q10, q8,  q9
+        pld             [r1, r2]
+  .if \avg
+        vld1.8          {d7},     [r0,:64]
+        vrhadd.u8       d5,  d5,  d7
+  .endif
+NRND    vadd.u16        q10, q10, q11
+        vst1.8          {d5},     [r0,:64], r2
+        shrn            d7,  q10, #2
+  .if \avg
+        vld1.8          {d5},     [r0,:64]
+        vrhadd.u8       d7,  d7,  d5
+  .endif
+        vext.8          d6,  d2,  d3,  #1
+        vaddl.u8        q9,  d2,  d6
+        vst1.8          {d7},     [r0,:64], r2
+        bgt             1b
+
+        vld1.8          {q0},     [r1], r2
+        vadd.u16        q10, q8,  q9
+        vext.8          d4,  d0,  d1,  #1
+NRND    vadd.u16        q10, q10, q11
+        vaddl.u8        q8,  d0,  d4
+        shrn            d5,  q10, #2
+        vadd.u16        q10, q8,  q9
+  .if \avg
+        vld1.8          {d7},     [r0,:64]
+        vrhadd.u8       d5,  d5,  d7
+  .endif
+NRND    vadd.u16        q10, q10, q11
+        vst1.8          {d5},     [r0,:64], r2
+        shrn            d7,  q10, #2
+  .if \avg
+        vld1.8          {d5},     [r0,:64]
+        vrhadd.u8       d7,  d7,  d5
+  .endif
+        vst1.8          {d7},     [r0,:64], r2
+
+        bx              lr
+.endm
+
341
+.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
+  .if \rnd
+    .macro avg  rd, rn, rm
+        vrhadd.u8       \rd, \rn, \rm
+    .endm
+    .macro shrn rd, rn, rm
+        vrshrn.u16      \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+    .endm
+  .else
+    .macro avg  rd, rn, rm
+        vhadd.u8        \rd, \rn, \rm
+    .endm
+    .macro shrn rd, rn, rm
+        vshrn.u16       \rd, \rn, \rm
+    .endm
+    .macro NRND insn:vararg
+        \insn
+    .endm
+  .endif
+function ff_\pfx\name\suf\()_neon, export=1
+        \name           \rnd, \avg
+endfunc
+        .purgem         avg
+        .purgem         shrn
+        .purgem         NRND
+.endm
+
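In other words, pixfunc instantiates one macro body twice with different helper expansions. In C terms, the two halving averages it switches between are roughly (illustrative, not part of the patch):

    #include <stdint.h>

    /* rnd=1: avg -> vrhadd.u8, shrn -> vrshrn.u16, NRND expands to nothing.
     * rnd=0: avg -> vhadd.u8,  shrn -> vshrn.u16,  NRND emits its argument,
     *        so the extra "+1" bias adds only appear in the no-rnd variants. */
    static inline uint8_t havg_rnd(uint8_t a, uint8_t b)    { return (a + b + 1) >> 1; }
    static inline uint8_t havg_no_rnd(uint8_t a, uint8_t b) { return (a + b) >> 1; }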
370
+.macro  pixfunc2        pfx, name, avg=0
+        pixfunc         \pfx, \name,          rnd=1, avg=\avg
+        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
+.endm
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+        mov             r3,  #16
+endfunc
+
+        pixfunc         put_, pixels16,     avg=0
+        pixfunc2        put_, pixels16_x2,  avg=0
+        pixfunc2        put_, pixels16_y2,  avg=0
+        pixfunc2        put_, pixels16_xy2, avg=0
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+        mov             r3,  #16
+endfunc
+
+        pixfunc         avg_, pixels16,     avg=1
+        pixfunc2        avg_, pixels16_x2,  avg=1
+        pixfunc2        avg_, pixels16_y2,  avg=1
+        pixfunc2        avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+        mov             r3,  #8
+endfunc
+
+        pixfunc         put_, pixels8,     avg=0
+        pixfunc2        put_, pixels8_x2,  avg=0
+        pixfunc2        put_, pixels8_y2,  avg=0
+        pixfunc2        put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+        mov             r3,  #8
+endfunc
+
+        pixfunc         avg_, pixels8,     avg=1
+        pixfunc         avg_, pixels8_x2,  avg=1
+        pixfunc         avg_, pixels8_y2,  avg=1
+        pixfunc         avg_, pixels8_xy2, avg=1
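These instantiations expand into the exported ff_{put,avg}_pixels{16,8}*_neon entry points; the h264_qpel*_mc00 labels merely preload the height into r3 and fall through into the put/avg function that follows them. A rough C equivalent of that fall-through, with prototypes that are my assumption rather than taken from this diff:

    #include <stddef.h>
    #include <stdint.h>

    /* Assumed prototypes: the hpeldsp-style functions take an explicit height,
     * while the qpel mc00 ("no interpolation") entry points do not. */
    void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

    void ff_put_h264_qpel16_mc00_neon(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride)
    {
        ff_put_pixels16_neon(dst, src, stride, 16);  /* mov r3, #16; fall through */
    }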
... ...
@@ -54,6 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
     hpel_funcs(avg, [3],  2);
     hpel_funcs(avg_no_rnd,, 16);
 
+    if (ARCH_ARM)
+        ff_hpeldsp_init_arm(c, flags);
     if (ARCH_PPC)
         ff_hpeldsp_init_ppc(c, flags);
     if (ARCH_X86)
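The arm/hpeldsp_init_arm.c file referenced here is not part of this excerpt; the following is only a plausible sketch of the runtime dispatch such an init function performs, with the sub-init names assumed from the new ARMv6 and NEON objects rather than taken from this diff:

    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavutil/arm/cpu.h"
    #include "libavcodec/hpeldsp.h"

    /* Hypothetical declarations; the real ones live in the ARM init files. */
    void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
    void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);

    av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
    {
        int cpu_flags = av_get_cpu_flags();   /* probe CPU features once */

        if (have_armv6(cpu_flags))
            ff_hpeldsp_init_armv6(c, flags);
        if (have_neon(cpu_flags))
            ff_hpeldsp_init_neon(c, flags);
    }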
... ...
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);