Originally committed as revision 4384 to svn://svn.ffmpeg.org/ffmpeg/trunk
| ... | ... |
@@ -56,13 +56,33 @@ static inline uint64_t WORD_VEC(uint64_t x) |
| 56 | 56 |
return x; |
| 57 | 57 |
} |
| 58 | 58 |
|
| 59 |
-#define ldq(p) (*(const uint64_t *) (p)) |
|
| 60 |
-#define ldl(p) (*(const int32_t *) (p)) |
|
| 61 |
-#define stl(l, p) do { *(uint32_t *) (p) = (l); } while (0)
|
|
| 62 |
-#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)
|
|
| 63 | 59 |
#define sextw(x) ((int16_t) (x)) |
| 64 | 60 |
|
| 65 | 61 |
#ifdef __GNUC__ |
| 62 |
+#define ldq(p) \ |
|
| 63 |
+ (((union { \
|
|
| 64 |
+ uint64_t __l; \ |
|
| 65 |
+ __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)]; \ |
|
| 66 |
+ } *) (p))->__l) |
|
| 67 |
+#define ldl(p) \ |
|
| 68 |
+ (((union { \
|
|
| 69 |
+ int32_t __l; \ |
|
| 70 |
+ __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)]; \ |
|
| 71 |
+ } *) (p))->__l) |
|
| 72 |
+#define stq(l, p) \ |
|
| 73 |
+ do { \
|
|
| 74 |
+ (((union { \
|
|
| 75 |
+ uint64_t __l; \ |
|
| 76 |
+ __typeof__(*(p)) __s[sizeof (uint64_t) / sizeof *(p)]; \ |
|
| 77 |
+ } *) (p))->__l) = l; \ |
|
| 78 |
+ } while (0) |
|
| 79 |
+#define stl(l, p) \ |
|
| 80 |
+ do { \
|
|
| 81 |
+ (((union { \
|
|
| 82 |
+ int32_t __l; \ |
|
| 83 |
+ __typeof__(*(p)) __s[sizeof (int32_t) / sizeof *(p)]; \ |
|
| 84 |
+ } *) (p))->__l) = l; \ |
|
| 85 |
+ } while (0) |
|
| 66 | 86 |
struct unaligned_long { uint64_t l; } __attribute__((packed));
|
| 67 | 87 |
#define ldq_u(p) (*(const uint64_t *) (((uint64_t) (p)) & ~7ul)) |
| 68 | 88 |
#define uldq(a) (((const struct unaligned_long *) (a))->l) |
| ... | ... |
@@ -132,6 +152,10 @@ struct unaligned_long { uint64_t l; } __attribute__((packed));
|
| 132 | 132 |
#elif defined(__DECC) /* Digital/Compaq/hp "ccc" compiler */ |
| 133 | 133 |
|
| 134 | 134 |
#include <c_asm.h> |
| 135 |
+#define ldq(p) (*(const uint64_t *) (p)) |
|
| 136 |
+#define ldl(p) (*(const int32_t *) (p)) |
|
| 137 |
+#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)
|
|
| 138 |
+#define stl(l, p) do { *(int32_t *) (p) = (l); } while (0)
|
|
| 135 | 139 |
#define ldq_u(a) asm ("ldq_u %v0,0(%a0)", a)
|
| 136 | 140 |
#define uldq(a) (*(const __unaligned uint64_t *) (a)) |
| 137 | 141 |
#define cmpbge(a, b) asm ("cmpbge %a0,%a1,%v0", a, b)
|
| ... | ... |
@@ -235,25 +235,22 @@ static inline void idct_col2(DCTELEM *col) |
| 235 | 235 |
{
|
| 236 | 236 |
int i; |
| 237 | 237 |
uint64_t l, r; |
| 238 |
- uint64_t *lcol = (uint64_t *) col; |
|
| 239 | 238 |
|
| 240 | 239 |
for (i = 0; i < 8; ++i) {
|
| 241 |
- int_fast32_t a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4; |
|
| 240 |
+ int_fast32_t a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4; |
|
| 242 | 241 |
|
| 243 | 242 |
a0 *= W4; |
| 244 |
- col[0] = a0 >> COL_SHIFT; |
|
| 245 |
- ++col; |
|
| 243 |
+ col[i] = a0 >> COL_SHIFT; |
|
| 246 | 244 |
} |
| 247 | 245 |
|
| 248 |
- l = lcol[0]; |
|
| 249 |
- r = lcol[1]; |
|
| 250 |
- lcol[ 2] = l; lcol[ 3] = r; |
|
| 251 |
- lcol[ 4] = l; lcol[ 5] = r; |
|
| 252 |
- lcol[ 6] = l; lcol[ 7] = r; |
|
| 253 |
- lcol[ 8] = l; lcol[ 9] = r; |
|
| 254 |
- lcol[10] = l; lcol[11] = r; |
|
| 255 |
- lcol[12] = l; lcol[13] = r; |
|
| 256 |
- lcol[14] = l; lcol[15] = r; |
|
| 246 |
+ l = ldq(col + 0 * 4); r = ldq(col + 1 * 4); |
|
| 247 |
+ stq(l, col + 2 * 4); stq(r, col + 3 * 4); |
|
| 248 |
+ stq(l, col + 4 * 4); stq(r, col + 5 * 4); |
|
| 249 |
+ stq(l, col + 6 * 4); stq(r, col + 7 * 4); |
|
| 250 |
+ stq(l, col + 8 * 4); stq(r, col + 9 * 4); |
|
| 251 |
+ stq(l, col + 10 * 4); stq(r, col + 11 * 4); |
|
| 252 |
+ stq(l, col + 12 * 4); stq(r, col + 13 * 4); |
|
| 253 |
+ stq(l, col + 14 * 4); stq(r, col + 15 * 4); |
|
| 257 | 254 |
} |
| 258 | 255 |
|
| 259 | 256 |
void simple_idct_axp(DCTELEM *block) |
| ... | ... |
@@ -275,22 +272,20 @@ void simple_idct_axp(DCTELEM *block) |
| 275 | 275 |
if (rowsZero) {
|
| 276 | 276 |
idct_col2(block); |
| 277 | 277 |
} else if (rowsConstant) {
|
| 278 |
- uint64_t *lblock = (uint64_t *) block; |
|
| 279 |
- |
|
| 280 | 278 |
idct_col(block); |
| 281 | 279 |
for (i = 0; i < 8; i += 2) {
|
| 282 |
- uint64_t v = (uint16_t) block[i * 8]; |
|
| 283 |
- uint64_t w = (uint16_t) block[i * 8 + 8]; |
|
| 280 |
+ uint64_t v = (uint16_t) block[0]; |
|
| 281 |
+ uint64_t w = (uint16_t) block[8]; |
|
| 284 | 282 |
|
| 285 | 283 |
v |= v << 16; |
| 286 | 284 |
w |= w << 16; |
| 287 | 285 |
v |= v << 32; |
| 288 | 286 |
w |= w << 32; |
| 289 |
- lblock[0] = v; |
|
| 290 |
- lblock[1] = v; |
|
| 291 |
- lblock[2] = w; |
|
| 292 |
- lblock[3] = w; |
|
| 293 |
- lblock += 4; |
|
| 287 |
+ stq(v, block + 0 * 4); |
|
| 288 |
+ stq(v, block + 1 * 4); |
|
| 289 |
+ stq(w, block + 2 * 4); |
|
| 290 |
+ stq(w, block + 3 * 4); |
|
| 291 |
+ block += 4 * 4; |
|
| 294 | 292 |
} |
| 295 | 293 |
} else {
|
| 296 | 294 |
for (i = 0; i < 8; i++) |