Browse code

rv40: NEON optimised loop filter strength selection

Signed-off-by: Mans Rullgard <mans@mansr.com>

Mans Rullgard authored on 2011/12/08 13:26:12
Showing 3 changed files
... ...
@@ -113,6 +113,12 @@ T       add             \rn, \rn, \rm
113 113
 T       ldr             \rt, [\rn]
114 114
 .endm
115 115
 
116
+.macro  ldr_dpre        rt,  rn,  rm:vararg
117
+A       ldr             \rt, [\rn, -\rm]!       @ pre-indexed load with writeback: rn = rn - rm, then rt = [rn]
118
+T       sub             \rn, \rn, \rm           @ two-instruction form with the same net effect: rn = rn - rm ...
119
+T       ldr             \rt, [\rn]              @ ... then rt = [rn]. NOTE(review): A/T prefixes are defined outside this view, presumably ARM vs Thumb - confirm
120
+.endm
121
+
116 122
 .macro  ldr_post        rt,  rn,  rm:vararg
117 123
 A       ldr             \rt, [\rn], \rm
118 124
 T       ldr             \rt, [\rn]
... ...
@@ -54,6 +54,13 @@ void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
54 54
 void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int);
55 55
 void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int);
56 56
 
57
+int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, int stride,
58
+                                        int beta, int beta2, int edge,
59
+                                        int *p1, int *q1);
60
+int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, int stride,
61
+                                        int beta, int beta2, int edge,
62
+                                        int *p1, int *q1);
63
+
57 64
 void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
58 65
 {
59 66
     c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
... ...
@@ -116,4 +123,7 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
116 116
 
117 117
     c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
118 118
     c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;
119
+
120
+    c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
121
+    c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
119 122
 }
... ...
@@ -722,3 +722,89 @@ function ff_rv40_weight_func_8_neon, export=1
722 722
         bne             1b
723 723
         bx              lr
724 724
 endfunc
725
+
726
+function ff_rv40_h_loop_filter_strength_neon, export=1  @ compares 4-byte sums of rows -3..2 around src against beta2 and beta << 2; writes *p1, *q1, returns the combined flag
727
+        pkhbt           r2,  r3,  r2,  lsl #18         @ r2 = (beta2 & 0xffff) | ((beta << 2) << 16); on entry r2 = beta, r3 = beta2 per the C prototype
728
+
729
+        ldr             r3,  [r0]                      @ r3 = the 4 bytes of row 0 (r0 = src)
730
+        ldr_dpre        r12, r0,  r1
731
+        teq             r3,  r12                       @ r12 = the 4 bytes of row -1; ldr_dpre left r0 = src - stride
732
+        beq             1f                             @ rows 0 and -1 identical: shortcut at 1: clears *p1, *q1 and returns 0
733
+
734
+        sub             r0,  r0,  r1,  lsl #1          @ r0 = src - 3 * stride; numbers in the comments below are row offsets from src
735
+
736
+        vld1.32         {d4[]},   [r0,:32], r1  @ -3
737
+        vld1.32         {d0[]},   [r0,:32], r1  @ -2
738
+        vld1.32         {d4[1]},  [r0,:32], r1  @ -1
739
+        vld1.32         {d5[]},   [r0,:32], r1  @  0
740
+        vld1.32         {d1[]},   [r0,:32], r1  @  1
741
+        vld1.32         {d5[0]},  [r0,:32], r1  @  2
742
+
743
+        vpaddl.u8       q8,  q0                 @ -2, -2, -2, -2,  1,  1,  1,  1
744
+        vpaddl.u8       q9,  q2                 @ -3, -3, -1, -1,  2,  2,  0,  0
745
+        vdup.32         d30, r2                 @ beta2, beta << 2
746
+        vpadd.u16       d16, d16, d17           @ -2, -2,  1,  1
747
+        vpadd.u16       d18, d18, d19           @ -3, -1,  2,  0
748
+        vabd.u16        d16, d18, d16           @ absolute differences of the 4-byte row sums: |-3 - -2|, |-1 - -2|, |2 - 1|, |0 - 1|
749
+        vclt.u16        d16, d16, d30           @ masks 0..3: 0xffff where the difference is below beta2 (lanes 0, 2) or beta << 2 (lanes 1, 3), else 0
750
+
751
+        ldrd            r2,  r3,  [sp, #4]      @ r2 = p1, r3 = q1 (stack arguments following edge)
752
+        vmovl.u16       q12, d16                @ widen the four masks to 32 bits each
753
+        vtrn.16         d16, d17                @ d16 lanes 0, 2 = masks 0, 2 and d17 lanes 0, 2 = masks 1, 3; only lane 0 of the final result is read
754
+        vshr.u32        q12, q12, #15           @ 0x0000ffff -> 1, 0 -> 0
755
+        ldr             r0,  [sp]               @ r0 = edge
756
+        vst1.32         {d24[1]}, [r2,:32]      @ *p1 = (|-1 - -2| < beta << 2) as 0 or 1
757
+        vst1.32         {d25[1]}, [r3,:32]      @ *q1 = (| 0 -  1| < beta << 2) as 0 or 1
758
+
759
+        cmp             r0,  #0                 @ edge == 0: return now, r0 already holds the return value 0
760
+        it              eq
761
+        bxeq            lr
762
+
763
+        vand            d18, d16, d17           @ lane 0 = mask 0 & mask 1, lane 2 = mask 2 & mask 3
764
+        vtrn.32         d18, d19                @ moves the upper 32 bits of d18 (holding lane 2) into the low half of d19
765
+        vand            d18, d18, d19           @ lane 0 = AND of all four masks
766
+        vmov.u16        r0,  d18[0]             @ return value: 0xffff if all four comparisons held, else 0
767
+        bx              lr
768
+1:
769
+        ldrd            r2,  r3,  [sp, #4]      @ r2 = p1, r3 = q1
770
+        mov             r0,  #0
771
+        str             r0,  [r2]               @ *p1 = 0
772
+        str             r0,  [r3]               @ *q1 = 0
773
+        bx              lr                      @ return 0
774
+endfunc
775
+
776
+function ff_rv40_v_loop_filter_strength_neon, export=1  @ compares 4-row sums of columns -3..2 around src against beta2 and beta << 2; writes *p1, *q1, returns the combined flag
777
+        sub             r0,  r0,  #3                   @ r0 = src - 3, so each 8-byte load below covers columns -3..4
778
+        pkhbt           r2,  r3,  r2,  lsl #18         @ r2 = (beta2 & 0xffff) | ((beta << 2) << 16); on entry r2 = beta, r3 = beta2 per the C prototype
779
+
780
+        vld1.8          {d0},     [r0], r1             @ row 0
781
+        vld1.8          {d1},     [r0], r1             @ row 1
782
+        vld1.8          {d2},     [r0], r1             @ row 2
783
+        vld1.8          {d3},     [r0], r1             @ row 3
784
+
785
+        vaddl.u8        q0,  d0,  d1                   @ rows 0 + 1, widened to 16 bits per column
786
+        vaddl.u8        q1,  d2,  d3                   @ rows 2 + 3, widened to 16 bits per column
787
+        vdup.32         q15, r2                        @ beta2 in even 16-bit lanes, beta << 2 in odd lanes
788
+        vadd.u16        q0,  q0,  q1            @ -3, -2, -1,  0,  1,  2
789
+        vext.16         q1,  q0,  q0,  #1       @ -2, -1,  0,  1,  2
790
+        vabd.u16        q0,  q1,  q0            @ lane i = |column sum i+1 - column sum i|, lane 0 being |-2 - -3|
791
+        vclt.u16        q0,  q0,  q15           @ masks: 0xffff where below beta2 (even lanes) or beta << 2 (odd lanes), else 0
792
+
793
+        ldrd            r2,  r3,  [sp, #4]      @ r2 = p1, r3 = q1 (stack arguments following edge)
794
+        vmovl.u16       q1,  d0                 @ widen masks 0..3 to 32 bits each
795
+        vext.16         d1,  d0,  d1,  #3       @ d1 = masks 3, 4, 5, 6
796
+        vshr.u32        q1,  q1,  #15           @ 0x0000ffff -> 1, 0 -> 0
797
+        ldr             r0,  [sp]               @ r0 = edge
798
+        vst1.32         {d2[1]},  [r2,:32]      @ *p1 = mask 1 = (|-1 - -2| < beta << 2) as 0 or 1
799
+        vst1.32         {d3[1]},  [r3,:32]      @ *q1 = mask 3 = (| 1 -  0| < beta << 2) as 0 or 1
800
+
801
+        cmp             r0,  #0                 @ edge == 0: return now, r0 already holds the return value 0
802
+        it              eq
803
+        bxeq            lr
804
+
805
+        vand            d0,  d0,  d1            @ lane 0 = mask 0 & mask 3, lane 1 = mask 1 & mask 4
806
+        vtrn.16         d0,  d1                 @ moves lane 1 of d0 into lane 0 of d1
807
+        vand            d0,  d0,  d1            @ lane 0 = masks 0 & 1 & 3 & 4
808
+        vmov.u16        r0,  d0[0]              @ return value: 0xffff if all four comparisons held, else 0
809
+        bx              lr
810
+endfunc