Signed-off-by: Mans Rullgard <mans@mansr.com>
| ... | ... |
@@ -113,6 +113,12 @@ T add \rn, \rn, \rm |
| 113 | 113 |
T ldr \rt, [\rn] |
| 114 | 114 |
.endm |
| 115 | 115 |
|
| 116 |
+.macro ldr_dpre rt, rn, rm:vararg |
|
| 117 |
+A ldr \rt, [\rn, -\rm]! |
|
| 118 |
+T sub \rn, \rn, \rm |
|
| 119 |
+T ldr \rt, [\rn] |
|
| 120 |
+.endm |
|
| 121 |
+ |
|
| 116 | 122 |
.macro ldr_post rt, rn, rm:vararg |
| 117 | 123 |
A ldr \rt, [\rn], \rm |
| 118 | 124 |
T ldr \rt, [\rn] |
| ... | ... |
@@ -54,6 +54,13 @@ void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); |
| 54 | 54 |
void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int); |
| 55 | 55 |
void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int); |
| 56 | 56 |
|
| 57 |
+int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, int stride, |
|
| 58 |
+ int beta, int beta2, int edge, |
|
| 59 |
+ int *p1, int *q1); |
|
| 60 |
+int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride, |
|
| 61 |
+ int beta, int beta2, int edge, |
|
| 62 |
+ int *p1, int *q1); |
|
| 63 |
+ |
|
| 57 | 64 |
void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) |
| 58 | 65 |
{
|
| 59 | 66 |
c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon; |
| ... | ... |
@@ -116,4 +123,7 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) |
| 116 | 116 |
|
| 117 | 117 |
c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon; |
| 118 | 118 |
c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon; |
| 119 |
+ |
|
| 120 |
+ c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; |
|
| 121 |
+ c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; |
|
| 119 | 122 |
} |
| ... | ... |
@@ -722,3 +722,89 @@ function ff_rv40_weight_func_8_neon, export=1 |
| 722 | 722 |
bne 1b |
| 723 | 723 |
bx lr |
| 724 | 724 |
endfunc |
| 725 |
+ |
|
| 726 |
+function ff_rv40_h_loop_filter_strength_neon, export=1 |
|
| 727 |
+ pkhbt r2, r3, r2, lsl #18 |
|
| 728 |
+ |
|
| 729 |
+ ldr r3, [r0] |
|
| 730 |
+ ldr_dpre r12, r0, r1 |
|
| 731 |
+ teq r3, r12 |
|
| 732 |
+ beq 1f |
|
| 733 |
+ |
|
| 734 |
+ sub r0, r0, r1, lsl #1 |
|
| 735 |
+ |
|
| 736 |
+ vld1.32 {d4[]}, [r0,:32], r1 @ -3
|
|
| 737 |
+ vld1.32 {d0[]}, [r0,:32], r1 @ -2
|
|
| 738 |
+ vld1.32 {d4[1]}, [r0,:32], r1 @ -1
|
|
| 739 |
+ vld1.32 {d5[]}, [r0,:32], r1 @ 0
|
|
| 740 |
+ vld1.32 {d1[]}, [r0,:32], r1 @ 1
|
|
| 741 |
+ vld1.32 {d5[0]}, [r0,:32], r1 @ 2
|
|
| 742 |
+ |
|
| 743 |
+ vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1 |
|
| 744 |
+ vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0 |
|
| 745 |
+ vdup.32 d30, r2 @ beta2, beta << 2 |
|
| 746 |
+ vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1 |
|
| 747 |
+ vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0 |
|
| 748 |
+ vabd.u16 d16, d18, d16 |
|
| 749 |
+ vclt.u16 d16, d16, d30 |
|
| 750 |
+ |
|
| 751 |
+ ldrd r2, r3, [sp, #4] |
|
| 752 |
+ vmovl.u16 q12, d16 |
|
| 753 |
+ vtrn.16 d16, d17 |
|
| 754 |
+ vshr.u32 q12, q12, #15 |
|
| 755 |
+ ldr r0, [sp] |
|
| 756 |
+ vst1.32 {d24[1]}, [r2,:32]
|
|
| 757 |
+ vst1.32 {d25[1]}, [r3,:32]
|
|
| 758 |
+ |
|
| 759 |
+ cmp r0, #0 |
|
| 760 |
+ it eq |
|
| 761 |
+ bxeq lr |
|
| 762 |
+ |
|
| 763 |
+ vand d18, d16, d17 |
|
| 764 |
+ vtrn.32 d18, d19 |
|
| 765 |
+ vand d18, d18, d19 |
|
| 766 |
+ vmov.u16 r0, d18[0] |
|
| 767 |
+ bx lr |
|
| 768 |
+1: |
|
| 769 |
+ ldrd r2, r3, [sp, #4] |
|
| 770 |
+ mov r0, #0 |
|
| 771 |
+ str r0, [r2] |
|
| 772 |
+ str r0, [r3] |
|
| 773 |
+ bx lr |
|
| 774 |
+endfunc |
|
| 775 |
+ |
|
| 776 |
+function ff_rv40_v_loop_filter_strength_neon, export=1 |
|
| 777 |
+ sub r0, r0, #3 |
|
| 778 |
+ pkhbt r2, r3, r2, lsl #18 |
|
| 779 |
+ |
|
| 780 |
+ vld1.8 {d0}, [r0], r1
|
|
| 781 |
+ vld1.8 {d1}, [r0], r1
|
|
| 782 |
+ vld1.8 {d2}, [r0], r1
|
|
| 783 |
+ vld1.8 {d3}, [r0], r1
|
|
| 784 |
+ |
|
| 785 |
+ vaddl.u8 q0, d0, d1 |
|
| 786 |
+ vaddl.u8 q1, d2, d3 |
|
| 787 |
+ vdup.32 q15, r2 |
|
| 788 |
+ vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2 |
|
| 789 |
+ vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2 |
|
| 790 |
+ vabd.u16 q0, q1, q0 |
|
| 791 |
+ vclt.u16 q0, q0, q15 |
|
| 792 |
+ |
|
| 793 |
+ ldrd r2, r3, [sp, #4] |
|
| 794 |
+ vmovl.u16 q1, d0 |
|
| 795 |
+ vext.16 d1, d0, d1, #3 |
|
| 796 |
+ vshr.u32 q1, q1, #15 |
|
| 797 |
+ ldr r0, [sp] |
|
| 798 |
+ vst1.32 {d2[1]}, [r2,:32]
|
|
| 799 |
+ vst1.32 {d3[1]}, [r3,:32]
|
|
| 800 |
+ |
|
| 801 |
+ cmp r0, #0 |
|
| 802 |
+ it eq |
|
| 803 |
+ bxeq lr |
|
| 804 |
+ |
|
| 805 |
+ vand d0, d0, d1 |
|
| 806 |
+ vtrn.16 d0, d1 |
|
| 807 |
+ vand d0, d0, d1 |
|
| 808 |
+ vmov.u16 r0, d0[0] |
|
| 809 |
+ bx lr |
|
| 810 |
+endfunc |