Originally committed as revision 979 to svn://svn.ffmpeg.org/ffmpeg/trunk
| ... | ... |
@@ -68,7 +68,7 @@ endif |
| 68 | 68 |
# alpha specific stuff |
| 69 | 69 |
ifeq ($(TARGET_ARCH_ALPHA),yes) |
| 70 | 70 |
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o alpha/motion_est_alpha.o |
| 71 |
-ASM_OBJS += alpha/dsputil_alpha_asm.o |
|
| 71 |
+ASM_OBJS += alpha/dsputil_alpha_asm.o alpha/motion_est_mvi_asm.o |
|
| 72 | 72 |
CFLAGS += -fforce-addr -freduce-all-givs |
| 73 | 73 |
endif |
| 74 | 74 |
|
| ... | ... |
@@ -34,7 +34,7 @@ void get_pixels_mvi(DCTELEM *restrict block, |
| 34 | 34 |
void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, |
| 35 | 35 |
int stride); |
| 36 | 36 |
int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size); |
| 37 |
-int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size); |
|
| 37 |
+int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size); |
|
| 38 | 38 |
int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size); |
| 39 | 39 |
int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size); |
| 40 | 40 |
int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size); |
| ... | ... |
@@ -335,7 +335,7 @@ void dsputil_init_alpha(void) |
| 335 | 335 |
get_pixels = get_pixels_mvi; |
| 336 | 336 |
diff_pixels = diff_pixels_mvi; |
| 337 | 337 |
pix_abs8x8 = pix_abs8x8_mvi; |
| 338 |
- pix_abs16x16 = pix_abs16x16_mvi; |
|
| 338 |
+ pix_abs16x16 = pix_abs16x16_mvi_asm; |
|
| 339 | 339 |
pix_abs16x16_x2 = pix_abs16x16_x2_mvi; |
| 340 | 340 |
pix_abs16x16_y2 = pix_abs16x16_y2_mvi; |
| 341 | 341 |
pix_abs16x16_xy2 = pix_abs16x16_xy2_mvi; |
| ... | ... |
@@ -117,6 +117,7 @@ int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size) |
| 117 | 117 |
return result; |
| 118 | 118 |
} |
| 119 | 119 |
|
| 120 |
+#if 0 /* now done in assembly */ |
|
| 120 | 121 |
int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size) |
| 121 | 122 |
{
|
| 122 | 123 |
int result = 0; |
| ... | ... |
@@ -157,6 +158,7 @@ int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size) |
| 157 | 157 |
|
| 158 | 158 |
return result; |
| 159 | 159 |
} |
| 160 |
+#endif |
|
| 160 | 161 |
|
| 161 | 162 |
int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size) |
| 162 | 163 |
{
|
| 163 | 164 |
new file mode 100644 |
| ... | ... |
@@ -0,0 +1,186 @@ |
| 0 |
+/* |
|
| 1 |
+ * Alpha optimized DSP utils |
|
| 2 |
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org> |
|
| 3 |
+ * |
|
| 4 |
+ * This program is free software; you can redistribute it and/or modify |
|
| 5 |
+ * it under the terms of the GNU General Public License as published by |
|
| 6 |
+ * the Free Software Foundation; either version 2 of the License, or |
|
| 7 |
+ * (at your option) any later version. |
|
| 8 |
+ * |
|
| 9 |
+ * This program is distributed in the hope that it will be useful, |
|
| 10 |
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 11 |
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 12 |
+ * GNU General Public License for more details. |
|
| 13 |
+ * |
|
| 14 |
+ * You should have received a copy of the GNU General Public License |
|
| 15 |
+ * along with this program; if not, write to the Free Software |
|
| 16 |
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|
| 17 |
+ */ |
|
| 18 |
+ |
|
| 19 |
+#include "regdef.h" |
|
| 20 |
+#ifdef HAVE_AV_CONFIG_H |
|
| 21 |
+#include "config.h" |
|
| 22 |
+#endif |
|
| 23 |
+ |
|
| 24 |
+/* Some nicer register names. */ |
|
| 25 |
+#define ta t10 |
|
| 26 |
+#define tb t11 |
|
| 27 |
+#define tc t12 |
|
| 28 |
+#define td AT |
|
| 29 |
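+/* Danger: these alias argument registers (te = a5, tf = a4, tg = a3) and the return value (th = v0). In the routine below a3 holds the remaining row count and v0 the running sum, so tg and th must not be used as scratch there. */ |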
|
| 30 |
+#define te a5 |
|
| 31 |
+#define tf a4 |
|
| 32 |
+#define tg a3 |
|
| 33 |
+#define th v0 |
|
| 34 |
+ |
|
| 35 |
+ .set noat |
|
| 36 |
+ .set noreorder |
|
| 37 |
+ .arch pca56 |
|
| 38 |
+ .text |
|
| 39 |
+ |
|
| 40 |
+/***************************************************************************** |
|
| 41 |
+ * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size) |
|
| 42 |
+ * |
|
| 43 |
+ * This code is written with a pca56 in mind. For ev6, one should |
|
| 44 |
+ * really take the increased latency of 3 cycles for MVI instructions |
|
| 45 |
+ * into account. |
|
| 46 |
+ * |
|
| 47 |
+ * It is important to keep the loading and first use of a register as |
|
| 48 |
+ * far apart as possible, because if a register is accessed before it |
|
| 49 |
+ * has been fetched from memory, the CPU will stall. |
|
| 50 |
+ */ |
|
| 51 |
+ .align 4 |
|
| 52 |
+ .globl pix_abs16x16_mvi_asm |
|
| 53 |
+ .ent pix_abs16x16_mvi_asm |
|
| 54 |
+pix_abs16x16_mvi_asm: |
|
| 55 |
+ .frame sp, 0, ra, 0 |
|
| 56 |
+ .prologue 0 |
|
| 57 |
+ |
|
| 58 |
+#ifdef HAVE_GPROF |
|
| 59 |
+ lda AT, _mcount |
|
| 60 |
+ jsr AT, (AT), _mcount |
|
| 61 |
+#endif |
|
| 62 |
+ |
|
| 63 |
+ and a1, 7, t0 |
|
| 64 |
+ clr v0 |
|
| 65 |
+ lda a3, 16 |
|
| 66 |
+ beq t0, $aligned |
|
| 67 |
+ .align 4 |
|
| 68 |
+$unaligned: |
|
| 69 |
+ /* Registers: |
|
| 70 |
+ line 0: |
|
| 71 |
+ t0: left_u -> left lo -> left |
|
| 72 |
+ t1: mid |
|
| 73 |
+ t2: right_u -> right hi -> right |
|
| 74 |
+ t3: ref left |
|
| 75 |
+ t4: ref right |
|
| 76 |
+ line 1: |
|
| 77 |
+ t5: left_u -> left lo -> left |
|
| 78 |
+ t6: mid |
|
| 79 |
+ t7: right_u -> right hi -> right |
|
| 80 |
+ t8: ref left |
|
| 81 |
+ t9: ref right |
|
| 82 |
+ temp: |
|
| 83 |
+ ta: left hi |
|
| 84 |
+ tb: right lo |
|
| 85 |
+ tc: error left |
|
| 86 |
+ td: error right */ |
|
| 87 |
+ |
|
| 88 |
+ /* load line 0 */ |
|
| 89 |
+ ldq_u t0, 0(a1) # left_u |
|
| 90 |
+ ldq_u t1, 8(a1) # mid |
|
| 91 |
+ ldq_u t2, 16(a1) # right_u |
|
| 92 |
+ ldq t3, 0(a0) # ref left |
|
| 93 |
+ ldq t4, 8(a0) # ref right |
|
| 94 |
+ addq a0, a2, a0 # pix1 |
|
| 95 |
+ addq a1, a2, a1 # pix2 |
|
| 96 |
+ /* load line 1 */ |
|
| 97 |
+ ldq_u t5, 0(a1) # left_u |
|
| 98 |
+ ldq_u t6, 8(a1) # mid |
|
| 99 |
+ ldq_u t7, 16(a1) # right_u |
|
| 100 |
+ ldq t8, 0(a0) # ref left |
|
| 101 |
+ ldq t9, 8(a0) # ref right |
|
| 102 |
+ addq a0, a2, a0 # pix1 |
|
| 103 |
+ addq a1, a2, a1 # pix2 |
|
| 104 |
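+ /* calc line 0 -- a1 has already been advanced by two strides here; extql/extqh use only its low 3 bits, so this (like the one-time alignment test above) assumes line_size is a multiple of 8 */ |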
|
| 105 |
+ extql t0, a1, t0 # left lo |
|
| 106 |
+ extqh t1, a1, ta # left hi |
|
| 107 |
+ extql t1, a1, tb # right lo |
|
| 108 |
+ or t0, ta, t0 # left |
|
| 109 |
+ extqh t2, a1, t2 # right hi |
|
| 110 |
+ perr t3, t0, tc # error left |
|
| 111 |
+ or t2, tb, t2 # right |
|
| 112 |
+ perr t4, t2, td # error right |
|
| 113 |
+ addq v0, tc, v0 # add error left |
|
| 114 |
+ addq v0, td, v0 # add error right |
|
| 115 |
+ /* calc line 1 */ |
|
| 116 |
+ extql t5, a1, t5 # left lo |
|
| 117 |
+ extqh t6, a1, ta # left hi |
|
| 118 |
+ extql t6, a1, tb # right lo |
|
| 119 |
+ or t5, ta, t5 # left |
|
| 120 |
+ extqh t7, a1, t7 # right hi |
|
| 121 |
+ perr t8, t5, tc # error left |
|
| 122 |
+ or t7, tb, t7 # right |
|
| 123 |
+ perr t9, t7, td # error right |
|
| 124 |
+ addq v0, tc, v0 # add error left |
|
| 125 |
+ addq v0, td, v0 # add error right |
|
| 126 |
+ /* loop */ |
|
| 127 |
+ subq a3, 2, a3 # h -= 2 |
|
| 128 |
+ bne a3, $unaligned |
|
| 129 |
+ ret |
|
| 130 |
+ |
|
| 131 |
+ .align 4 |
|
| 132 |
+$aligned: |
|
| 133 |
+ /* load line 0 */ |
|
| 134 |
+ ldq t0, 0(a1) # left |
|
| 135 |
+ ldq t1, 8(a1) # right |
|
| 136 |
+ addq a1, a2, a1 # pix2 |
|
| 137 |
+ ldq t2, 0(a0) # ref left |
|
| 138 |
+ ldq t3, 8(a0) # ref right |
|
| 139 |
+ addq a0, a2, a0 # pix1 |
|
| 140 |
+ /* load line 1 */ |
|
| 141 |
+ ldq t4, 0(a1) # left |
|
| 142 |
+ ldq t5, 8(a1) # right |
|
| 143 |
+ addq a1, a2, a1 # pix2 |
|
| 144 |
+ ldq t6, 0(a0) # ref left |
|
| 145 |
+ ldq t7, 8(a0) # ref right |
|
| 146 |
+ addq a0, a2, a0 # pix1 |
|
| 147 |
+ /* load line 2 */ |
|
| 148 |
+ ldq t8, 0(a1) # left |
|
| 149 |
+ ldq t9, 8(a1) # right |
|
| 150 |
+ addq a1, a2, a1 # pix2 |
|
| 151 |
+ ldq ta, 0(a0) # ref left |
|
| 152 |
+ ldq tb, 8(a0) # ref right |
|
| 153 |
+ addq a0, a2, a0 # pix1 |
|
| 154 |
+ /* load line 3 */ |
|
| 155 |
+ ldq tc, 0(a1) # left |
|
| 156 |
+ ldq td, 8(a1) # right |
|
| 157 |
+ addq a1, a2, a1 # pix2 |
|
| 158 |
+ ldq te, 0(a0) # ref left |
|
| 159 |
+ ldq tf, 8(a0) # ref right |
|
| 160 |
+ /* calc line 0 */ |
|
| 161 |
+ perr t0, t2, t0 # error left |
|
| 162 |
+ addq a0, a2, a0 # pix1 |
|
| 163 |
+ perr t1, t3, t1 # error right |
|
| 164 |
+ addq v0, t0, v0 # add error left |
|
| 165 |
+ /* calc line 1 */ |
|
| 166 |
+ perr t4, t6, t0 # error left |
|
| 167 |
+ addq v0, t1, v0 # add error right |
|
| 168 |
+ perr t5, t7, t1 # error right |
|
| 169 |
+ addq v0, t0, v0 # add error left |
|
| 170 |
+ /* calc line 2 */ |
|
| 171 |
+ perr t8, ta, t0 # error left |
|
| 172 |
+ addq v0, t1, v0 # add error right |
|
| 173 |
+ perr t9, tb, t1 # error right |
|
| 174 |
+ addq v0, t0, v0 # add error left |
|
| 175 |
+ /* calc line 3 */ |
|
| 176 |
+ perr tc, te, t0 # error left |
|
| 177 |
+ addq v0, t1, v0 # add error right |
|
| 178 |
+ perr td, tf, t1 # error right |
|
| 179 |
+ addq v0, t0, v0 # add error left |
|
| 180 |
+ addq v0, t1, v0 # add error right |
|
| 181 |
+ /* loop */ |
|
| 182 |
+ subq a3, 4, a3 # h -= 4 |
|
| 183 |
+ bne a3, $aligned |
|
| 184 |
+ ret |
|
| 185 |
+ .end pix_abs16x16_mvi_asm |