Browse code

Add Alpha assembly for pix_abs16x16. Optimized for pca56, no large win on ev6.

Originally committed as revision 979 to svn://svn.ffmpeg.org/ffmpeg/trunk

Falk Hüffner authored on 2002/09/30 00:14:28
Showing 4 changed files
... ...
@@ -68,7 +68,7 @@ endif
68 68
 # alpha specific stuff
69 69
 ifeq ($(TARGET_ARCH_ALPHA),yes)
70 70
 OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o alpha/motion_est_alpha.o
71
-ASM_OBJS += alpha/dsputil_alpha_asm.o
71
+ASM_OBJS += alpha/dsputil_alpha_asm.o alpha/motion_est_mvi_asm.o
72 72
 CFLAGS += -fforce-addr -freduce-all-givs
73 73
 endif
74 74
 
... ...
@@ -34,7 +34,7 @@ void get_pixels_mvi(DCTELEM *restrict block,
34 34
 void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
35 35
                      int stride);
36 36
 int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
37
-int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
37
+int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size);
38 38
 int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
39 39
 int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
40 40
 int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
... ...
@@ -335,7 +335,7 @@ void dsputil_init_alpha(void)
335 335
         get_pixels       = get_pixels_mvi;
336 336
         diff_pixels      = diff_pixels_mvi;
337 337
         pix_abs8x8       = pix_abs8x8_mvi;
338
-        pix_abs16x16     = pix_abs16x16_mvi;
338
+        pix_abs16x16     = pix_abs16x16_mvi_asm;
339 339
         pix_abs16x16_x2  = pix_abs16x16_x2_mvi;
340 340
         pix_abs16x16_y2  = pix_abs16x16_y2_mvi;
341 341
         pix_abs16x16_xy2 = pix_abs16x16_xy2_mvi;
... ...
@@ -117,6 +117,7 @@ int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
117 117
     return result;
118 118
 }
119 119
 
120
+#if 0				/* now done in assembly */
120 121
 int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
121 122
 {
122 123
     int result = 0;
... ...
@@ -157,6 +158,7 @@ int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
157 157
 
158 158
     return result;
159 159
 }
160
+#endif
160 161
 
161 162
 int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
162 163
 {
163 164
new file mode 100644
... ...
@@ -0,0 +1,186 @@
0
+/*
1
+ * Alpha optimized DSP utils
2
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
3
+ *
4
+ * This program is free software; you can redistribute it and/or modify
5
+ * it under the terms of the GNU General Public License as published by
6
+ * the Free Software Foundation; either version 2 of the License, or
7
+ * (at your option) any later version.
8
+ *
9
+ * This program is distributed in the hope that it will be useful,
10
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
+ * GNU General Public License for more details.
13
+ *
14
+ * You should have received a copy of the GNU General Public License
15
+ * along with this program; if not, write to the Free Software
16
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
+ */
18
+
19
+#include "regdef.h"
20
+#ifdef HAVE_AV_CONFIG_H	
21
+#include "config.h"
22
+#endif
23
+
24
+/* Some nicer register names for the extra temporaries used by pix_abs16x16_mvi_asm.  */
25
+#define ta t10
26
+#define tb t11
27
+#define tc t12
28
+#define td AT
29
+/* Danger: these overlap with the argument list (a3-a5) and the return value (v0); the routine below takes only three arguments (a0-a2) and accumulates its result in v0 */
30
+#define te a5
31
+#define tf a4
32
+#define tg a3
33
+#define th v0
34
+        
35
+        .set noat               /* AT is used explicitly as the temporary td above */
36
+        .set noreorder
37
+        .arch pca56             /* target pca56, which provides the MVI perr instruction used below */
38
+        .text
39
+
40
+/*****************************************************************************
41
+ * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
42
+ *
43
+ * This code is written with a pca56 in mind. For ev6, one should
44
+ * really take the increased latency of 3 cycles for MVI instructions
45
+ * into account.
46
+ *
47
+ * It is important to keep the loading and first use of a register as
48
+ * far apart as possible, because if a register is accessed before it
49
+ * has been fetched from memory, the CPU will stall.
50
+ */
51
+        .align 4
52
+        .globl pix_abs16x16_mvi_asm
53
+        .ent pix_abs16x16_mvi_asm
54
+pix_abs16x16_mvi_asm:
55
+        .frame sp, 0, ra, 0     /* frame size 0: nothing is saved on the stack */
56
+        .prologue 0
57
+
58
+#ifdef HAVE_GPROF
59
+        lda     AT, _mcount     # profiling hook, only assembled with HAVE_GPROF
60
+        jsr     AT, (AT), _mcount
61
+#endif
62
+
63
+        and     a1, 7, t0       # t0 = pix2 & 7, byte offset inside a quadword
64
+        clr     v0              # v0 = running sum of absolute differences
65
+        lda     a3, 16          # a3 = rows left to process
66
+        beq     t0, $aligned    # pix2 quadword aligned: plain ldq loop, else fall through
67
+        .align 4
68
+$unaligned:
69
+        /* Registers:
70
+           line 0:
71
+           t0:  left_u -> left lo -> left
72
+           t1:  mid
73
+           t2:  right_u -> right hi -> right
74
+           t3:  ref left
75
+           t4:  ref right
76
+           line 1:
77
+           t5:  left_u -> left lo -> left
78
+           t6:  mid
79
+           t7:  right_u -> right hi -> right
80
+           t8:  ref left
81
+           t9:  ref right
82
+           temp:
83
+           ta:  left hi
84
+           tb:  right lo
85
+           tc:  error left
86
+           td:  error right  */
87
+
88
+        /* load line 0; pix1 (a0) is read with plain ldq, so it is assumed to be 8-byte aligned -- TODO confirm against callers */
89
+        ldq_u   t0, 0(a1)       # left_u
90
+        ldq_u   t1, 8(a1)       # mid
91
+        ldq_u   t2, 16(a1)      # right_u
92
+        ldq     t3, 0(a0)       # ref left
93
+        ldq     t4, 8(a0)       # ref right
94
+        addq    a0, a2, a0      # pix1 += line_size
95
+        addq    a1, a2, a1      # pix2 += line_size
96
+        /* load line 1 */        
97
+        ldq_u   t5, 0(a1)       # left_u
98
+        ldq_u   t6, 8(a1)       # mid
99
+        ldq_u   t7, 16(a1)      # right_u
100
+        ldq     t8, 0(a0)       # ref left
101
+        ldq     t9, 8(a0)       # ref right
102
+        addq    a0, a2, a0      # pix1 += line_size
103
+        addq    a1, a2, a1      # pix2 += line_size
104
+        /* calc line 0; the extract shift count comes from a1 after it was advanced, which assumes line_size is a multiple of 8 -- TODO confirm */
105
+        extql   t0, a1, t0      # left lo
106
+        extqh   t1, a1, ta      # left hi
107
+        extql   t1, a1, tb      # right lo
108
+        or      t0, ta, t0      # left
109
+        extqh   t2, a1, t2      # right hi
110
+        perr    t3, t0, tc      # error left
111
+        or      t2, tb, t2      # right
112
+        perr    t4, t2, td      # error right
113
+        addq    v0, tc, v0      # add error left
114
+        addq    v0, td, v0      # add error right
115
+        /* calc line 1 */
116
+        extql   t5, a1, t5      # left lo
117
+        extqh   t6, a1, ta      # left hi
118
+        extql   t6, a1, tb      # right lo
119
+        or      t5, ta, t5      # left
120
+        extqh   t7, a1, t7      # right hi
121
+        perr    t8, t5, tc      # error left
122
+        or      t7, tb, t7      # right
123
+        perr    t9, t7, td      # error right
124
+        addq    v0, tc, v0      # add error left
125
+        addq    v0, td, v0      # add error right
126
+        /* loop */
127
+        subq    a3,  2, a3      # h -= 2
128
+        bne     a3, $unaligned  # rows remaining: next pair of lines
129
+        ret                     # sum is returned in v0
130
+
131
+        .align 4
132
+$aligned:
133
+        /* load line 0 (four lines are loaded first, then compared, per iteration) */
134
+        ldq     t0, 0(a1)       # left
135
+        ldq     t1, 8(a1)       # right
136
+        addq    a1, a2, a1      # pix2 += line_size
137
+        ldq     t2, 0(a0)       # ref left
138
+        ldq     t3, 8(a0)       # ref right
139
+        addq    a0, a2, a0      # pix1 += line_size
140
+        /* load line 1 */
141
+        ldq     t4, 0(a1)       # left
142
+        ldq     t5, 8(a1)       # right
143
+        addq    a1, a2, a1      # pix2 += line_size
144
+        ldq     t6, 0(a0)       # ref left
145
+        ldq     t7, 8(a0)       # ref right
146
+        addq    a0, a2, a0      # pix1 += line_size
147
+        /* load line 2 */
148
+        ldq     t8, 0(a1)       # left
149
+        ldq     t9, 8(a1)       # right
150
+        addq    a1, a2, a1      # pix2 += line_size
151
+        ldq     ta, 0(a0)       # ref left
152
+        ldq     tb, 8(a0)       # ref right
153
+        addq    a0, a2, a0      # pix1 += line_size
154
+        /* load line 3; this line borrows the unused argument registers a5 and a4 as temporaries */
155
+        ldq     tc, 0(a1)       # left
156
+        ldq     td, 8(a1)       # right
157
+        addq    a1, a2, a1      # pix2 += line_size
158
+        ldq     te, 0(a0)       # ref left
159
+        ldq     tf, 8(a0)       # ref right
160
+        /* calc line 0; t0 and t1 are reused below for the per-line errors */
161
+        perr    t0, t2, t0      # error left
162
+        addq    a0, a2, a0      # pix1 += line_size (deferred from load line 3)
163
+        perr    t1, t3, t1      # error right
164
+        addq    v0, t0, v0      # add error left
165
+        /* calc line 1 */
166
+        perr    t4, t6, t0      # error left
167
+        addq    v0, t1, v0      # add error right
168
+        perr    t5, t7, t1      # error right
169
+        addq    v0, t0, v0      # add error left
170
+        /* calc line 2 */
171
+        perr    t8, ta, t0      # error left
172
+        addq    v0, t1, v0      # add error right
173
+        perr    t9, tb, t1      # error right
174
+        addq    v0, t0, v0      # add error left
175
+        /* calc line 3 */
176
+        perr    tc, te, t0      # error left
177
+        addq    v0, t1, v0      # add error right
178
+        perr    td, tf, t1      # error right
179
+        addq    v0, t0, v0      # add error left
180
+        addq    v0, t1, v0      # add error right
181
+        /* loop */
182
+        subq    a3,  4, a3      # h -= 4
183
+        bne     a3, $aligned    # rows remaining: next four lines
184
+        ret                     # sum is returned in v0
185
+        .end pix_abs16x16_mvi_asm