Browse code

Assembly version of put_pixels. put_pixels is currently the function that takes the most time, and writing it in assembly allows for more efficient unaligned access and better control over memory latencies.

Originally committed as revision 711 to svn://svn.ffmpeg.org/ffmpeg/trunk

Falk Hüffner authored on 2002/07/02 08:47:01
Showing 2 changed files
... ...
@@ -22,6 +22,8 @@
22 22
 
23 23
 void simple_idct_axp(DCTELEM *block);
24 24
 
25
+/* Assembly routine; dsputil_init_alpha() installs it as entry 0 of both
+   put_pixels_tab and put_no_rnd_pixels_tab. */
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
26
+			int line_size, int h);
25 27
 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
26 28
 				int line_size);
27 29
 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
... ...
@@ -232,12 +234,12 @@ static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
232 232
 
233 233
 void dsputil_init_alpha(void)
234 234
 {
235
-    put_pixels_tab[0] = put_pixels_axp;
235
+    put_pixels_tab[0] = put_pixels_axp_asm;
236 236
     put_pixels_tab[1] = put_pixels_x2_axp;
237 237
     put_pixels_tab[2] = put_pixels_y2_axp;
238 238
     put_pixels_tab[3] = put_pixels_xy2_axp;
239 239
 
240
-    put_no_rnd_pixels_tab[0] = put_pixels_axp;
240
+    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
241 241
     put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
242 242
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
243 243
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
... ...
@@ -44,6 +44,123 @@
44 44
         .text
45 45
 
46 46
 /************************************************************************
47
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
48
+ *                         int line_size, int h)
+ *
+ * Copies h rows of 8 bytes each from pixels to block; both pointers
+ * advance by line_size bytes from one row to the next.
+ *
+ * In:  a0 = block (destination), a1 = pixels (source),
+ *      a2 = line_size, a3 = h (rows still to copy)
+ *
+ * A source pointer that is a multiple of 8 takes the $aligned loop
+ * (8 rows per iteration); any other source pointer takes the $unaligned
+ * loop (4 rows per iteration).
+ *
+ * NOTE(review): each loop exits only when the row counter reaches exactly
+ * zero, so h has to be a positive multiple of 8 on the aligned path and
+ * of 4 on the unaligned path -- confirm that every caller guarantees it.
+ * NOTE(review): the unaligned path reads the byte offset from a1 after a1
+ * has already been advanced by line_size, which is only right when
+ * line_size is a multiple of 8 -- confirm.
+ * NOTE(review): rows are written with stq, so block is presumably 8-byte
+ * aligned -- confirm.
49
+ */
50
+        .align 6
51
+        .globl put_pixels_axp_asm
52
+        .ent put_pixels_axp_asm
53
+put_pixels_axp_asm:
54
+        .frame sp, 0, ra                /* frame size 0: no stack space is used */
55
+        .prologue 0
56
+
57
+#ifdef HAVE_GPROF
58
+        lda     AT, _mcount             /* HAVE_GPROF builds only: call _mcount on entry */
59
+        jsr     AT, (AT), _mcount
60
+#endif
61
+
62
+        and     a1, 7, t0               /* t0 = low three bits of the source pointer */
63
+        beq     t0, $aligned            /* zero: source is 8-byte aligned */
64
+
65
+        .align 4
66
+$unaligned:                             /* loop body: 4 rows per iteration */
67
+        ldq_u   t0, 0(a1)               /* row 0: the two aligned quadwords that */
68
+        ldq_u   t1, 8(a1)               /*   cover the 8 unaligned source bytes  */
69
+        addq    a1, a2, a1              /* pixels += line_size */
70
+        nop                             /* NOTE(review): presumably scheduling padding */
71
+
72
+        ldq_u   t2, 0(a1)               /* row 1: low / high quadword */
73
+        ldq_u   t3, 8(a1)
74
+        addq    a1, a2, a1              /* pixels += line_size */
75
+        nop
76
+
77
+	ldq_u   t4, 0(a1)               /* row 2: low / high quadword */
78
+        ldq_u   t5, 8(a1)
79
+        addq    a1, a2, a1              /* pixels += line_size */
80
+        nop
81
+
82
+        ldq_u   t6, 0(a1)               /* row 3: low / high quadword */
83
+        ldq_u   t7, 8(a1)
84
+        extql   t0, a1, t0              /* row 0 low part; byte offset = low bits of a1 (already advanced) */
85
+        addq    a1, a2, a1              /* pixels += line_size (start of the next group of 4) */
86
+
87
+        extqh   t1, a1, t1              /* row 0 high part */
88
+        addq    a0, a2, t8              /* t8 = destination of row 1 */
89
+        extql   t2, a1, t2              /* row 1 low part */
90
+        addq    t8, a2, t9              /* t9 = destination of row 2 */
91
+
92
+        extqh   t3, a1, t3              /* row 1 high part */
93
+        addq    t9, a2, ta              /* ta = destination of row 3; NOTE(review): ta..te are presumably register aliases defined outside this hunk */
94
+        extql   t4, a1, t4              /* row 2 low part */
95
+        or      t0, t1, t0              /* t0 = merged 8 bytes of row 0 */
96
+
97
+        extqh   t5, a1, t5              /* row 2 high part */
98
+        or      t2, t3, t2              /* t2 = merged 8 bytes of row 1 */
99
+        extql   t6, a1, t6              /* row 3 low part */
100
+        or      t4, t5, t4              /* t4 = merged 8 bytes of row 2 */
101
+
102
+        extqh   t7, a1, t7              /* row 3 high part */
103
+        or      t6, t7, t6              /* t6 = merged 8 bytes of row 3 */
104
+        stq     t0, 0(a0)               /* store row 0 */
105
+        stq     t2, 0(t8)               /* store row 1 */
106
+
107
+        stq     t4, 0(t9)               /* store row 2 */
108
+        subq    a3, 4, a3               /* h -= 4 */
109
+        stq     t6, 0(ta)               /* store row 3 */
110
+        addq    ta, a2, a0              /* block = destination of the next row */
111
+
112
+        bne     a3, $unaligned          /* repeat until h reaches exactly 0 */
113
+        ret
114
+
115
+        .align 4
116
+$aligned:                               /* loop body: 8 rows per iteration */
117
+        ldq     t0, 0(a1)               /* load rows 0..7 into t0..t7, */
118
+        addq    a1, a2, a1              /*   stepping pixels by line_size after each */
119
+        ldq     t1, 0(a1)
120
+        addq    a1, a2, a1
121
+
122
+        ldq     t2, 0(a1)
123
+        addq    a1, a2, a1
124
+        ldq     t3, 0(a1)
125
+        addq    a1, a2, a1
126
+
127
+        ldq     t4, 0(a1)
128
+        addq    a1, a2, a1
129
+        ldq     t5, 0(a1)
130
+        addq    a1, a2, a1
131
+
132
+        ldq     t6, 0(a1)
133
+        addq    a1, a2, a1
134
+        ldq     t7, 0(a1)
135
+        addq    a1, a2, a1              /* pixels now points at the next group of 8 rows */
136
+
137
+        addq    a0, a2, t8              /* each addq forms the next row's destination */
138
+        stq     t0, 0(a0)               /* store row 0 */
139
+        addq    t8, a2, t9
140
+        stq     t1, 0(t8)               /* store row 1 */
141
+
142
+        addq    t9, a2, ta
143
+        stq     t2, 0(t9)               /* store row 2 */
144
+        addq    ta, a2, tb
145
+        stq     t3, 0(ta)               /* store row 3 */
146
+
147
+        addq    tb, a2, tc
148
+        stq     t4, 0(tb)               /* store row 4 */
149
+        addq    tc, a2, td
150
+        stq     t5, 0(tc)               /* store row 5 */
151
+
152
+        addq    td, a2, te
153
+        stq     t6, 0(td)               /* store row 6 */
154
+        addq    te, a2, a0              /* block = destination of the next row */
155
+        stq     t7, 0(te)               /* store row 7 */
156
+
157
+        subq    a3, 8, a3               /* h -= 8 */
158
+        bne     a3, $aligned            /* repeat until h reaches exactly 0 */
159
+
160
+        ret
161
+        .end put_pixels_axp_asm
162
+
163
+/************************************************************************
47 164
  * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, 
48 165
  *                                 int line_size)
49 166
  */