GitList

Browse code

Implement put_pixels_clamped and add_pixels_clamped in Assembler. This allows better scheduling of the memory accesses, and is portable among all compilers.

Originally committed as revision 709 to svn://svn.ffmpeg.org/ffmpeg/trunk

Falk Hüffner authored on 2002/07/01 13:26:07
Showing 4 changed files

libavcodec/Makefile index fd0903a..3d69b42 100644
libavcodec/alpha/dsputil_alpha.c index 06d2fda..5e1aa20 100644
libavcodec/alpha/dsputil_alpha_asm.S index 0000000..9e2476d
libavcodec/alpha/regdef.h index 0000000..7e7fc06

@@ -63,10 +63,11 @@ endif
                      # alpha specific stuff
                      ifeq ($(TARGET_ARCH_ALPHA),yes)
                      OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o
                     +ASM_OBJS += alpha/dsputil_alpha_asm.o
                      CFLAGS += -Wa,-mpca56
                      endif
                     -SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.s)
                     +SRCS := $(OBJS:.o=.c) $(ASM_OBJS:.o=.S)
                      OBJS := $(OBJS) $(ASM_OBJS)
                      LIB= libavcodec.a

libavcodec/alpha/dsputil_alpha.c

History View file @ bb7d493

@@ -22,64 +22,86 @@
                      void simple_idct_axp(DCTELEM *block);
                     -static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
                     -				   int line_size)
                     +void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                     +				int line_size);
                     +void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                     +				int line_size);
+                    +
                     +#if 0
                     +/* These C versions were the base for the hand-scheduled assembler routines
                     +   in dsputil_alpha_asm.S; they stay here, compiled out, as documentation.  */
                     +static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                     +                                   int line_size)
+                     {
                          int i = 8;
                     +    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff: 255 per 16-bit lane */
                          ASM_ACCEPT_MVI;
                          do {
                     -	UINT64 shorts;
                     +        uint64_t shorts0, shorts1;
                     -	shorts = ldq(block);
                     -	shorts = maxsw4(shorts, 0);
                     -	shorts = minsw4(shorts, WORD_VEC(0x00ff));
                     -	stl(pkwb(shorts), pixels);
                     +        shorts0 = ldq(block);
                     +        shorts0 = maxsw4(shorts0, 0);
                     +        shorts0 = minsw4(shorts0, clampmask);
                     +        stl(pkwb(shorts0), pixels);
                     -	shorts = ldq(block + 4);
                     -	shorts = maxsw4(shorts, 0);
                     -	shorts = minsw4(shorts, WORD_VEC(0x00ff));
                     -	stl(pkwb(shorts), pixels + 4);
                     +        shorts1 = ldq(block + 4);
                     +        shorts1 = maxsw4(shorts1, 0);
                     +        shorts1 = minsw4(shorts1, clampmask);
                     +        stl(pkwb(shorts1), pixels + 4);
                     -	pixels += line_size;
                     -	block += 8;
                     +        pixels += line_size;
                     +        block += 8;
                          } while (--i);
+                     }
                     -static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
                     -				   int line_size)
                     +void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                     +                            int line_size)
+                     {
                     -    int i = 8;
                     +    int h = 8;
                     +    /* Keep this function a leaf function by generating the constants
                     +       manually (mainly for the hack value ;-).  */
                     +    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff: 255 per 16-bit lane */
                     +    uint64_t signmask  = zap(-1, 0x33); /* 0xffff0000ffff0000 */
                     +    signmask ^= signmask >> 1;  /* 0x8000800080008000: sign bit of each lane */
                          ASM_ACCEPT_MVI;
                          do {
                     -	UINT64 shorts;
+                    -
                     -	shorts = ldq(block);
                     -	shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
                     -	shorts += unpkbw(ldl(pixels));
                     -	shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
                     -	shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
                     -	shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
                     -	shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
                     -	stl(pkwb(shorts), pixels);
+                    -
                     -	/* next 4 */
                     -	shorts = ldq(block + 4);
                     -	shorts &= ~WORD_VEC(0x8000);
                     -	shorts += unpkbw(ldl(pixels + 4));
                     -	shorts &= ~WORD_VEC(0x8000);
                     -	shorts = minuw4(shorts, WORD_VEC(0x4000));
                     -	shorts &= ~WORD_VEC(0x4000);
                     -	shorts = minsw4(shorts, WORD_VEC(0x00ff));
                     -	stl(pkwb(shorts), pixels + 4);
+                    -
                     -	pixels += line_size;
                     -	block += 8;
                     -    } while (--i);
                     +        uint64_t shorts0, pix0, signs0;
                     +        uint64_t shorts1, pix1, signs1;
+                    +
                     +        shorts0 = ldq(block);
                     +        shorts1 = ldq(block + 4);
+                    +
                     +        pix0    = unpkbw(ldl(pixels));
                     +        /* Signed subword add (MMX paddw): mask sign bits, add, xor them back.  */
                     +        signs0  = shorts0 & signmask;
                     +        shorts0 &= ~signmask;
                     +        shorts0 += pix0;
                     +        shorts0 ^= signs0;
                     +        /* Clamp each lane to [0, 255].  */
                     +        shorts0 = maxsw4(shorts0, 0);
                     +        shorts0 = minsw4(shorts0, clampmask);
+                    +
                     +        /* Next 4 pixels, same steps.  */
                     +        pix1    = unpkbw(ldl(pixels + 4));
                     +        signs1  = shorts1 & signmask;
                     +        shorts1 &= ~signmask;
                     +        shorts1 += pix1;
                     +        shorts1 ^= signs1;
                     +        shorts1 = maxsw4(shorts1, 0);
                     +        shorts1 = minsw4(shorts1, clampmask);
+                    +
                     +        stl(pkwb(shorts0), pixels);
                     +        stl(pkwb(shorts1), pixels + 4);
+                    +
                     +        pixels += line_size;
                     +        block += 8;
                     +    } while (--h);
+                     }
                     +#endif
                      /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
                         Since the immediate result could be greater than 255, we do the
@@ -222,7 +244,7 @@ void dsputil_init_alpha(void)
                          /* amask clears all bits that correspond to present features.  */
                          if (amask(AMASK_MVI) == 0) {
                     -	put_pixels_clamped = put_pixels_clamped_axp;
                     -	add_pixels_clamped = add_pixels_clamped_axp;
                     +        put_pixels_clamped = put_pixels_clamped_mvi_asm;
                     +        add_pixels_clamped = add_pixels_clamped_mvi_asm;
+                         }
+                     }

libavcodec/alpha/dsputil_alpha_asm.S

History View file @ bb7d493

                     new file mode 100644
@@ -0,0 +1,176 @@
                     +/*
                     + * Alpha optimized DSP utils
                     + * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
                     + *
                     + * This program is free software; you can redistribute it and/or modify
                     + * it under the terms of the GNU General Public License as published by
                     + * the Free Software Foundation; either version 2 of the License, or
                     + * (at your option) any later version.
                     + *
                     + * This program is distributed in the hope that it will be useful,
                     + * but WITHOUT ANY WARRANTY; without even the implied warranty of
                     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     + * GNU General Public License for more details.
                     + *
                     + * You should have received a copy of the GNU General Public License
                     + * along with this program; if not, write to the Free Software
                     + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
                     + */
+                    +
                     +/*
                     + * These functions are scheduled for pca56. They should work
                     + * reasonably on ev6, though.
                     + */
+                    +
                     +#include "regdef.h"
+                    +
                     +/* Some nicer register names.  */
                     +#define ta t10
                     +#define tb t11
                     +#define tc t12
                     +#define td AT
                     +/* Danger: these overlap with the argument list and the return value */
                     +#define te a5
                     +#define tf a4
                     +#define tg a3
                     +#define th v0
+                    +
                     +        .set noat
                     +        .set noreorder
                     +        .arch pca56
                     +        .text
+                    +
                     +/************************************************************************
                     + * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                     + *                                 int line_size)
                     + * a0 = block, a1 = pixels, a2 = line_size; writes 8 rows of 8 clamped bytes.  */
                     +        .align 6
                     +        .globl put_pixels_clamped_mvi_asm
                     +        .ent put_pixels_clamped_mvi_asm
                     +put_pixels_clamped_mvi_asm:
                     +        .frame sp, 0, ra
                     +        .prologue 0
+                    +
                     +        lda     t8, -1          # all ones
                     +        lda     t9, 8           # loop counter (rows left, two rows per pass)
                     +        zap     t8, 0xaa, t8    # 00ff00ff00ff00ff (upper clamp, 255 per word)
+                    +
                     +        .align 4
                     +1:      ldq     t0,  0(a0)      # row 0, coefficients 0-3
                     +        ldq     t1,  8(a0)      # row 0, coefficients 4-7
                     +        ldq     t2, 16(a0)      # row 1, coefficients 0-3
                     +        ldq     t3, 24(a0)      # row 1, coefficients 4-7
+                    +
                     +        maxsw4  t0, zero, t0    # clamp below at 0
                     +        subq    t9, 2, t9       # two rows handled per pass
                     +        maxsw4  t1, zero, t1
                     +        lda     a0, 32(a0)      # block += 16
+                    +
                     +        maxsw4  t2, zero, t2
                     +        addq    a1, a2, ta      # ta = pixels + line_size (row 1)
                     +        maxsw4  t3, zero, t3
                     +        minsw4  t0, t8, t0      # clamp above at 255
+                    +
                     +        minsw4  t1, t8, t1
                     +        minsw4  t2, t8, t2
                     +        minsw4  t3, t8, t3
                     +        pkwb    t0, t0          # pack 4 words into 4 bytes
+                    +
                     +        pkwb    t1, t1
                     +        pkwb    t2, t2
                     +        pkwb    t3, t3
                     +        stl     t0, 0(a1)       # row 0, bytes 0-3
+                    +
                     +        stl     t1, 4(a1)       # row 0, bytes 4-7
                     +        addq    ta, a2, a1      # pixels = ta + line_size (two rows further)
                     +        stl     t2, 0(ta)       # row 1, bytes 0-3
                     +        stl     t3, 4(ta)       # row 1, bytes 4-7
+                    +
                     +        bne     t9, 1b          # loop until the counter reaches zero
                     +        ret
                     +        .end put_pixels_clamped_mvi_asm
+                    +
                     +/************************************************************************
                     + * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                     + *                                 int line_size)
                     + * a0 = block, a1 = pixels, a2 = line_size; adds block to 8 rows in place.  */
                     +        .align 6
                     +        .globl add_pixels_clamped_mvi_asm
                     +        .ent add_pixels_clamped_mvi_asm
                     +add_pixels_clamped_mvi_asm:
                     +        .frame sp, 0, ra
                     +        .prologue 0
+                    +
                     +        lda     t1, -1          # all ones
                     +        lda     th, 8           # loop counter (rows left, two rows per pass)
                     +        zap     t1, 0x33, tg    # 0xffff0000ffff0000
                     +        nop
+                    +
                     +        srl     tg, 1, t0       # 0x7fff80007fff8000
                     +        xor     tg, t0, tg      # 0x8000800080008000
                     +        zap     t1, 0xaa, tf    # 0x00ff00ff00ff00ff
+                    +
                     +        .align 4
                     +1:      ldl     t1, 0(a1)       # pix0 (try to hit cache line soon)
                     +        ldl     t4, 4(a1)       # pix1
                     +        addq    a1, a2, te      # te = pixels + line_size (second row)
                     +        ldq     t0, 0(a0)       # shorts0
+                    +
                     +        ldl     t7, 0(te)       # pix2 (try to hit cache line soon)
                     +        ldl     ta, 4(te)       # pix3
                     +        ldq     t3, 8(a0)       # shorts1
                     +        ldq     t6, 16(a0)      # shorts2
+                    +
                     +        ldq     t9, 24(a0)      # shorts3
                     +        unpkbw  t1, t1          # 0 0 (quarter/op no.)
                     +        and     t0, tg, t2      # 0 1
                     +        unpkbw  t4, t4          # 1 0
+                    +
                     +        bic     t0, tg, t0      # 0 2
                     +        unpkbw  t7, t7          # 2 0
                     +        and     t3, tg, t5      # 1 1
                     +        addq    t0, t1, t0      # 0 3
+                    +
                     +        xor     t0, t2, t0      # 0 4
                     +        unpkbw  ta, ta          # 3 0
                     +        and     t6, tg, t8      # 2 1
                     +        maxsw4  t0, zero, t0    # 0 5
+                    +
                     +        bic     t3, tg, t3      # 1 2
                     +        bic     t6, tg, t6      # 2 2
                     +        minsw4  t0, tf, t0      # 0 6
                     +        addq    t3, t4, t3      # 1 3
+                    +
                     +        pkwb    t0, t0          # 0 7
                     +        xor     t3, t5, t3      # 1 4
                     +        maxsw4  t3, zero, t3    # 1 5
                     +        addq    t6, t7, t6      # 2 3
+                    +
                     +        xor     t6, t8, t6      # 2 4
                     +        and     t9, tg, tb      # 3 1
                     +        minsw4  t3, tf, t3      # 1 6
                     +        bic     t9, tg, t9      # 3 2
+                    +
                     +        maxsw4  t6, zero, t6    # 2 5
                     +        addq    t9, ta, t9      # 3 3
                     +        stl     t0, 0(a1)       # 0 8
                     +        minsw4  t6, tf, t6      # 2 6
+                    +
                     +        xor     t9, tb, t9      # 3 4
                     +        maxsw4  t9, zero, t9    # 3 5
                     +        lda     a0, 32(a0)      # block += 16;
                     +        pkwb    t3, t3          # 1 7
+                    +
                     +        minsw4  t9, tf, t9      # 3 6
                     +        subq    th, 2, th       # two rows handled per pass
                     +        pkwb    t6, t6          # 2 7
                     +        pkwb    t9, t9          # 3 7
+                    +
                     +        stl     t3, 4(a1)       # 1 8
                     +        addq    te, a2, a1      # pixels = te + line_size (two rows further)
                     +        stl     t6, 0(te)       # 2 8
                     +        stl     t9, 4(te)       # 3 8
+                    +
                     +        bne     th, 1b          # loop until the counter reaches zero
                     +        ret
                     +        .end add_pixels_clamped_mvi_asm

libavcodec/alpha/regdef.h

History View file @ bb7d493

                     new file mode 100644
@@ -0,0 +1,45 @@
                     +/* Symbolic Alpha register names; kept locally as some BSDs lack regdef.h.  */
                     +#ifndef alpha_regdef_h
                     +#define alpha_regdef_h
+                    +
                     +#define v0      $0      /* function return value */
+                    +
                     +#define t0      $1      /* temporary registers (caller-saved) */
                     +#define t1      $2
                     +#define t2      $3
                     +#define t3      $4
                     +#define t4      $5
                     +#define t5      $6
                     +#define t6      $7
                     +#define t7      $8
+                    +
                     +#define s0      $9      /* saved-registers (callee-saved registers) */
                     +#define s1      $10
                     +#define s2      $11
                     +#define s3      $12
                     +#define s4      $13
                     +#define s5      $14
                     +#define s6      $15
                     +#define fp      s6      /* frame-pointer (s6 in frame-less procedures) */
+                    +
                     +#define a0      $16     /* argument registers (caller-saved) */
                     +#define a1      $17
                     +#define a2      $18
                     +#define a3      $19
                     +#define a4      $20
                     +#define a5      $21
+                    +
                     +#define t8      $22     /* more temps (caller-saved) */
                     +#define t9      $23
                     +#define t10     $24
                     +#define t11     $25
                     +#define ra      $26     /* return address register */
                     +#define t12     $27     /* same register as pv below */
+                    +
                     +#define pv      t12     /* procedure-variable register */
                     +#define AT      $at     /* assembler temporary */
                     +#define gp      $29     /* global pointer */
                     +#define sp      $30     /* stack pointer */
                     +#define zero    $31     /* reads as zero, writes are noops */
+                    +
                     +#endif /* alpha_regdef_h */