
Another (final?) patch for libpostproc. This one replaces horizClassify with a transpose/(use Vert)/transpose sequence, which adds the LowPass and DefFilter for "free". I also fixed the header in postprocess.c and special-cased the well-aligned code paths (all the horizontal work is well aligned).

patch by (Romain Dolbeau <dolbeau at irisa dot fr>)

Originally committed as revision 3175 to svn://svn.ffmpeg.org/ffmpeg/trunk

Romain Dolbeau authored on 2004/05/30 10:53:43
Showing 3 changed files
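To make the diff below easier to follow: the new H_DEBLOCK path transposes the 8-line block around the edge into a 16-byte aligned scratch buffer with a fixed stride of 16, runs the existing vertical AltiVec classifier and filters on it, and transposes the result back only when a filter actually ran. The sketch below is condensed from the last hunk of this patch (same calls and offsets, only pulled out of RENAME(postProcess) for readability); the wrapper name is hypothetical, and PPContext, dstBlock and the AltiVec helpers are libpostproc internals, so this is not a standalone program and only builds under HAVE_ALTIVEC.

/* Hypothetical wrapper; the body is condensed from the H_DEBLOCK branch
 * this patch adds to RENAME(postProcess) under HAVE_ALTIVEC. */
static inline void horizDeblock_via_transpose(uint8_t *dstBlock, int stride, PPContext *c)
{
  /* 16-byte aligned scratch block: every row starts on a vector
   * boundary and the stride is a constant 16. */
  unsigned char __attribute__ ((aligned(16))) tempBlock[272];

  /* Gather the (possibly unaligned) pixels around the block edge into
   * the aligned buffer, turning rows into columns. */
  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);

  /* Run the existing *vertical* classifier on the transposed block.
   * The -48 compensates for the stride*3 / stride*4 offsets the
   * vertical helpers apply to their pointer internally. */
  const int t = vertClassify_altivec(tempBlock - 48, 16, c);
  if (t == 1) {
    doVertLowPass_altivec(tempBlock - 48, 16, c);
    transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
  } else if (t == 2) {
    doVertDefFilter_altivec(tempBlock - 48, 16, c);
    transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
  }
  /* If the classifier returns 0 nothing was filtered and the scratch
   * buffer is simply discarded; no transpose back is needed. */
}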

@@ -29,10 +29,10 @@ isVertDC		Ec	Ec			Ec
 isVertMinMaxOk		Ec	Ec			Ec
 doVertLowPass		E		e	e	Ec
 doVertDefFilter		Ec	Ec	e	e	Ec
-isHorizDC		Ec	Ec
-isHorizMinMaxOk		a	E
-doHorizLowPass		E		e	e
-doHorizDefFilter	Ec	Ec	e	e
+isHorizDC		Ec	Ec			Ec
+isHorizMinMaxOk		a	E			Ec
+doHorizLowPass		E		e	e	Ec
+doHorizDefFilter	Ec	Ec	e	e	Ec
 do_a_deblock		Ec	E	Ec	E
 deRing			E		e	e*	Ecp
 Vertical RKAlgo1	E		a	a
@@ -43,7 +43,7 @@ LinIpolDeinterlace	e		E	E*
 CubicIpolDeinterlace	a		e	e*
 LinBlendDeinterlace	e		E	E*
 MedianDeinterlace#	E	Ec	Ec
-TempDeNoiser#		E		e	e
+TempDeNoiser#		E		e	e	Ec

 * i dont have a 3dnow CPU -> its untested, but noone said it doesnt work so it seems to work
 # more or less selfinvented filters so the exactness isnt too meaningfull

@@ -73,7 +73,9 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
   vector signed short v2QP;
   vector unsigned short v4QP;
   vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
+  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
   const vector signed int zero = vec_splat_s32(0);
   const vector signed short mask = vec_splat_s16(1);
   vector signed int v_numEq = vec_splat_s32(0);

@@ -90,6 +92,8 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)

   src2 += stride * 4;

+  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
+
 #define LOAD_LINE(i)							\
   register int j##i = i * stride;					\
   vector unsigned char perm##i = vec_lvsl(j##i, src2);			\

@@ -99,139 +103,41 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
     v_srcA2##i = vec_ld(j##i + 16, src2);				\
   const vector unsigned char v_srcA##i =				\
     vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)

-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-#undef LOAD_LINE
-
-#define ITER(i, j)							\
-  const vector signed short v_diff##i =					\
-    vec_sub(v_srcAss##i, v_srcAss##j);					\
-  const vector signed short v_sum##i =					\
-    vec_add(v_diff##i, v_dcOffset);					\
-  const vector signed short v_comp##i =					\
-    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,	\
-				   v_dcThreshold);			\
-  const vector signed short v_part##i = vec_and(mask, v_comp##i);	\
-  v_numEq = vec_sum4s(v_part##i, v_numEq);
-
-  ITER(0, 1);
-  ITER(1, 2);
-  ITER(2, 3);
-  ITER(3, 4);
-  ITER(4, 5);
-  ITER(5, 6);
-  ITER(6, 7);
-#undef ITER
-
-  v_numEq = vec_sums(v_numEq, zero);
-
-  v_numEq = vec_splat(v_numEq, 3);
-  vec_ste(v_numEq, 0, &numEq);
-
-  if (numEq > c->ppMode.flatnessThreshold)
-    {
-      const vector unsigned char mmoP1 = (const vector unsigned char)
-	AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
-	    0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
-      const vector unsigned char mmoP2 = (const vector unsigned char)
-	AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
-	    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
-      const vector unsigned char mmoP = (const vector unsigned char)
-	vec_lvsl(8, (unsigned char*)0);
-
-      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
-      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
-      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
-      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
-      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
-      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
-      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
-      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
-
-      if (vec_any_gt(mmoSum, v4QP))
-	return 0;
-      else
-	return 1;
-    }
-  else return 2;
-}
-
-/* this is the same as vertClassify_altivec,
-   with an added 8x8 transpose after the loading,
-   and w/o the stride*4 offset */
-static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c) {
-  /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true.
-  */
-  register int y;
-  short __attribute__ ((aligned(16))) data[8];
-  int numEq;
-  uint8_t *src2 = src;
-  vector signed short v_dcOffset;
-  vector signed short v2QP;
-  vector unsigned short v4QP;
-  vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
-  const vector signed int zero = vec_splat_s32(0);
-  const vector signed short mask = vec_splat_s16(1);
-  vector signed int v_numEq = vec_splat_s32(0);
-
-  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-  data[1] = data[0] * 2 + 1;
-  data[2] = c->QP * 2;
-  data[3] = c->QP * 4;
-  vector signed short v_data = vec_ld(0, data);
-  v_dcOffset = vec_splat(v_data, 0);
-  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
-  v2QP = vec_splat(v_data, 2);
-  v4QP = (vector unsigned short)vec_splat(v_data, 3);
-
-  //  src2 += stride * 4;
-
-#define LOAD_LINE(i)							\
-  register int j##i = i * stride;					\
-  vector unsigned char perm##i = vec_lvsl(j##i, src2);			\
-  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);		\
-  vector unsigned char v_srcA2##i;					\
-  if (two_vectors)							\
-    v_srcA2##i = vec_ld(j##i + 16, src2);				\
-  const vector unsigned char v_srcA##i =				\
-    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)

-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks.
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+    }
 #undef LOAD_LINE
-
-  ALTIVEC_TRANSPOSE_8x8_SHORT(v_srcAss0,
-			      v_srcAss1,
-			      v_srcAss2,
-			      v_srcAss3,
-			      v_srcAss4,
-			      v_srcAss5,
-			      v_srcAss6,
-			      v_srcAss7);
+#undef LOAD_LINE_ALIGNED

 #define ITER(i, j)							\
   const vector signed short v_diff##i =					\
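For context on what the aligned special case buys: LOAD_LINE is the classic AltiVec idiom for reading 16 bytes from a possibly unaligned address (a permute mask from vec_lvsl, up to two aligned loads, one vec_perm), while LOAD_LINE_ALIGNED collapses it to a single vec_ld. A generic sketch of the two paths, with hypothetical helper names, not code from the patch:

#include <altivec.h>

/* Generic sketch of the two load paths the patch switches between;
 * not code from the patch itself. */
static inline vector unsigned char load16_unaligned(const unsigned char *p)
{
    /* vec_lvsl derives a permute mask from the low four address bits;
     * the two aligned loads straddling p are then combined into the
     * 16 bytes starting at p.  When p happens to be aligned the second
     * load is unnecessary, which is why the patch reads it only
     * conditionally (the two_vectors test). */
    const vector unsigned char perm = vec_lvsl(0, p);
    const vector unsigned char lo   = vec_ld(0, p);
    const vector unsigned char hi   = vec_ld(16, p);
    return vec_perm(lo, hi, perm);
}

static inline vector unsigned char load16_aligned(const unsigned char *p)
{
    /* With p on a 16-byte boundary a single load is enough: no permute
     * mask, no second load, fewer instructions per line of the block. */
    return vec_ld(0, p);
}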

@@ -286,7 +192,6 @@ static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c)
   else return 2;
 }

-
 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
   /*
     this code makes no assumption on src or stride.

@@ -298,37 +203,65 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   */
   uint8_t *src2 = src;
   const vector signed int zero = vec_splat_s32(0);
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
   short __attribute__ ((aligned(16))) qp[8];
   qp[0] = c->QP;
   vector signed short vqp = vec_ld(0, qp);
   vqp = vec_splat(vqp, 0);

+  src2 += stride*3;
+
+  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
+  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
+  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
+  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
+
 #define LOAD_LINE(i)                                                    \
   const vector unsigned char perml##i =					\
     vec_lvsl(i * stride, src2);						\
-  const vector unsigned char vbA##i =					\
-    vec_ld(i * stride, src2);						\
-  const vector unsigned char vbB##i =					\
-    vec_ld(i * stride + 16, src2);					\
-  const vector unsigned char vbT##i =					\
-    vec_perm(vbA##i, vbB##i, perml##i);					\
-  const vector signed short vb##i =					\
+  vbA##i = vec_ld(i * stride, src2);                                    \
+  vbB##i = vec_ld(i * stride + 16, src2);                               \
+  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
+  vb##i =                                                               \
     (vector signed short)vec_mergeh((vector unsigned char)zero,		\
 				    (vector unsigned char)vbT##i)
-
-  src2 += stride*3;

-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-  LOAD_LINE(8);
-  LOAD_LINE(9);
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  vbT##i = vec_ld(j##i, src2);                                          \
+  vb##i =                                                               \
+    (vector signed short)vec_mergeh((vector signed char)zero,		\
+				    (vector signed char)vbT##i)
+
+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks.
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+      LOAD_LINE_ALIGNED(8);
+      LOAD_LINE_ALIGNED(9);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+      LOAD_LINE(8);
+      LOAD_LINE(9);
+    }
 #undef LOAD_LINE
+#undef LOAD_LINE_ALIGNED

   const vector unsigned short v_1 = vec_splat_u16(1);
   const vector unsigned short v_2 = vec_splat_u16(2);

@@ -413,16 +346,37 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   vec_st(svA##i, i * stride, src2);				\
   vec_st(svB##i, i * stride + 16, src2)

-  PACK_AND_STORE(1);
-  PACK_AND_STORE(2);
-  PACK_AND_STORE(3);
-  PACK_AND_STORE(4);
-  PACK_AND_STORE(5);
-  PACK_AND_STORE(6);
-  PACK_AND_STORE(7);
-  PACK_AND_STORE(8);
-
+#define PACK_AND_STORE_ALIGNED(i)				\
+  const vector unsigned char vf##i =				\
+    vec_packsu(vr##i, (vector signed short)zero);		\
+  const vector unsigned char vg##i =				\
+    vec_perm(vf##i, vbT##i, permHH);				\
+  vec_st(vg##i, i * stride, src2)
+
+  // special casing the aligned case is worthwhile, as all calls from
+  // the (transposed) horizontal deblocks will be aligned, in addition
+  // to the naturally aligned vertical deblocks.
+  if (properStride && srcAlign) {
+    PACK_AND_STORE_ALIGNED(1);
+    PACK_AND_STORE_ALIGNED(2);
+    PACK_AND_STORE_ALIGNED(3);
+    PACK_AND_STORE_ALIGNED(4);
+    PACK_AND_STORE_ALIGNED(5);
+    PACK_AND_STORE_ALIGNED(6);
+    PACK_AND_STORE_ALIGNED(7);
+    PACK_AND_STORE_ALIGNED(8);
+  } else {
+    PACK_AND_STORE(1);
+    PACK_AND_STORE(2);
+    PACK_AND_STORE(3);
+    PACK_AND_STORE(4);
+    PACK_AND_STORE(5);
+    PACK_AND_STORE(6);
+    PACK_AND_STORE(7);
+    PACK_AND_STORE(8);
+  }
 #undef PACK_AND_STORE
+#undef PACK_AND_STORE_ALIGNED
 }

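The same idea applies on the store side. permHH is defined earlier in the function and is not visible in this hunk; assuming it selects the eight filtered bytes from its first operand and the remaining eight original bytes from its second, PACK_AND_STORE_ALIGNED for one row boils down to roughly the sketch below (hypothetical helper name and GCC-style vector literal, not code from the patch):

#include <altivec.h>

/* Sketch of what PACK_AND_STORE_ALIGNED does for one row, assuming permHH
 * picks bytes 0..7 from the packed result and bytes 8..15 from the
 * originally loaded destination row.  Not code from the patch. */
static inline void store_row_aligned(vector signed short result,
                                     vector unsigned char original,
                                     unsigned char *row /* 16-byte aligned */)
{
    const vector signed short zero = vec_splat_s16(0);
    /* Assumed equivalent of permHH: keep the 8 filtered bytes, preserve
     * the other 8 bytes of the destination vector. */
    const vector unsigned char permHH =
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};

    /* Saturate the 8 filtered shorts back down to unsigned bytes. */
    const vector unsigned char packed = vec_packsu(result, zero);
    /* Merge the filtered bytes with the untouched half of the row. */
    const vector unsigned char merged = vec_perm(packed, original, permHH);
    /* One aligned store replaces the blend-into-two-vectors and double
     * store needed on the unaligned path. */
    vec_st(merged, 0, row);
}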
 

@@ -1043,3 +997,200 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
   PACK_AND_STORE(tempBlured, 7);
 #undef PACK_AND_STORE
 }
+
+static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+
+#define LOAD_DOUBLE_LINE(i, j)						\
+  vector unsigned char perm1##i = vec_lvsl(i * stride, src);		\
+  vector unsigned char perm2##i = vec_lvsl(j * stride, src);		\
+  vector unsigned char srcA##i = vec_ld(i * stride, src);		\
+  vector unsigned char srcB##i = vec_ld(i * stride + 16, src);          \
+  vector unsigned char srcC##i = vec_ld(j * stride, src);		\
+  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);           \
+  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i);	\
+  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
+
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+#undef LOAD_DOUBLE_LINE
+
+  vector unsigned char tempA = vec_mergeh(src0, zero);
+  vector unsigned char tempB = vec_mergel(src0, zero);
+  vector unsigned char tempC = vec_mergeh(src1, zero);
+  vector unsigned char tempD = vec_mergel(src1, zero);
+  vector unsigned char tempE = vec_mergeh(src2, zero);
+  vector unsigned char tempF = vec_mergel(src2, zero);
+  vector unsigned char tempG = vec_mergeh(src3, zero);
+  vector unsigned char tempH = vec_mergel(src3, zero);
+  vector unsigned char tempI = vec_mergeh(src4, zero);
+  vector unsigned char tempJ = vec_mergel(src4, zero);
+  vector unsigned char tempK = vec_mergeh(src5, zero);
+  vector unsigned char tempL = vec_mergel(src5, zero);
+  vector unsigned char tempM = vec_mergeh(src6, zero);
+  vector unsigned char tempN = vec_mergel(src6, zero);
+  vector unsigned char tempO = vec_mergeh(src7, zero);
+  vector unsigned char tempP = vec_mergel(src7, zero);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
+  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
+  vector unsigned char temp7 = vec_mergel(tempD, tempL);
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
+  vector unsigned char temp11 = vec_mergel(tempF, tempN);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
+  vector unsigned char temp15 = vec_mergel(tempH, tempP);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempE = vec_mergeh(temp2, temp10);
+  tempF = vec_mergel(temp2, temp10);
+  tempG = vec_mergeh(temp3, temp11);
+  tempH = vec_mergel(temp3, temp11);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+  tempM = vec_mergeh(temp6, temp14);
+  tempN = vec_mergel(temp6, temp14);
+  tempO = vec_mergeh(temp7, temp15);
+  tempP = vec_mergel(temp7, temp15);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+  temp8 = vec_mergeh(tempE, tempM);
+  temp9 = vec_mergel(tempE, tempM);
+  temp10 = vec_mergeh(tempF, tempN);
+  temp11 = vec_mergel(tempF, tempN);
+  temp12 = vec_mergeh(tempG, tempO);
+  temp13 = vec_mergel(tempG, tempO);
+  temp14 = vec_mergeh(tempH, tempP);
+  temp15 = vec_mergel(tempH, tempP);
+
+  vec_st(temp0, 0, dst);
+  vec_st(temp1, 16, dst);
+  vec_st(temp2, 32, dst);
+  vec_st(temp3, 48, dst);
+  vec_st(temp4, 64, dst);
+  vec_st(temp5, 80, dst);
+  vec_st(temp6, 96, dst);
+  vec_st(temp7, 112, dst);
+  vec_st(temp8, 128, dst);
+  vec_st(temp9, 144, dst);
+  vec_st(temp10, 160, dst);
+  vec_st(temp11, 176, dst);
+  vec_st(temp12, 192, dst);
+  vec_st(temp13, 208, dst);
+  vec_st(temp14, 224, dst);
+  vec_st(temp15, 240, dst);
+}
+
+static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+  const vector unsigned char magic_perm = (const vector unsigned char)
+    AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
+#define LOAD_DOUBLE_LINE(i, j)			    		\
+  vector unsigned char src##i = vec_ld(i * 16, src);		\
+  vector unsigned char src##j = vec_ld(j * 16, src)
+
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+  LOAD_DOUBLE_LINE(8, 9);
+  LOAD_DOUBLE_LINE(10, 11);
+  LOAD_DOUBLE_LINE(12, 13);
+  LOAD_DOUBLE_LINE(14, 15);
+#undef LOAD_DOUBLE_LINE
+
+  vector unsigned char tempA = vec_mergeh(src0, src8);
+  vector unsigned char tempB;
+  vector unsigned char tempC = vec_mergeh(src1, src9);
+  vector unsigned char tempD;
+  vector unsigned char tempE = vec_mergeh(src2, src10);
+  vector unsigned char tempG = vec_mergeh(src3, src11);
+  vector unsigned char tempI = vec_mergeh(src4, src12);
+  vector unsigned char tempJ;
+  vector unsigned char tempK = vec_mergeh(src5, src13);
+  vector unsigned char tempL;
+  vector unsigned char tempM = vec_mergeh(src6, src14);
+  vector unsigned char tempO = vec_mergeh(src7, src15);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2;
+  vector unsigned char temp3;
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6;
+  vector unsigned char temp7;
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+#define STORE_DOUBLE_LINE(i, j)						\
+  vector unsigned char dstA##i = vec_ld(i * stride, dst);		\
+  vector unsigned char dstB##i = vec_ld(i * stride + 16, dst);		\
+  vector unsigned char dstA##j = vec_ld(j * stride, dst);		\
+  vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst);		\
+  vector unsigned char align##i = vec_lvsr(i * stride, dst);		\
+  vector unsigned char align##j = vec_lvsr(j * stride, dst);		\
+  vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i);	\
+  vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j);	\
+  vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);	\
+  vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);	\
+  vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i);	\
+  vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i);	\
+  vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j);	\
+  vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j);	\
+  vec_st(dstAF##i, i * stride, dst);					\
+  vec_st(dstBF##i, i * stride + 16, dst);				\
+  vec_st(dstAF##j, j * stride, dst);					\
+  vec_st(dstBF##j, j * stride + 16, dst)
+
+  STORE_DOUBLE_LINE(0,1);
+  STORE_DOUBLE_LINE(2,3);
+  STORE_DOUBLE_LINE(4,5);
+  STORE_DOUBLE_LINE(6,7);
+}
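Read scalar-wise, the two new transpose helpers implement the following layout, as far as can be told from the merge trees above: the forward transpose writes column x of the 8-line source block into the first 8 bytes of row x of a packed buffer with stride 16 (the upper 8 bytes of each packed row end up zero and are never read back), and the inverse reads those bytes back into 8 destination rows of 16 pixels. A plain C illustration with hypothetical names, not replacement code:

/* Scalar reading of the two AltiVec transpose helpers above; the vector
 * versions compute the same result with vec_mergeh/vec_mergel trees.
 * Illustration only, not replacement code. */

/* 8 source rows of 16 pixels -> 16 packed rows of stride 16, each
 * holding one source column in its first 8 bytes. */
static void transpose_16x8_to_packed_scalar(unsigned char *dst,
                                            const unsigned char *src, int stride)
{
    for (int x = 0; x < 16; x++) {
        for (int y = 0; y < 8; y++)
            dst[x * 16 + y] = src[y * stride + x];
        for (int y = 8; y < 16; y++)
            dst[x * 16 + y] = 0;   /* padding; the filters and the inverse
                                    * transpose only touch the first 8 bytes */
    }
}

/* Inverse: 16 packed rows -> 8 destination rows of 16 pixels. */
static void transpose_8x16_from_packed_scalar(unsigned char *dst,
                                              const unsigned char *src, int stride)
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 16; x++)
            dst[y * stride + x] = src[x * 16 + y];
}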

@@ -3684,12 +3684,27 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 					horizX1Filter(dstBlock-4, stride, QP);
 				else if(mode & H_DEBLOCK)
 				{
+#ifdef HAVE_ALTIVEC
+					unsigned char __attribute__ ((aligned(16))) tempBlock[272];
+					transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
+
+					const int t=vertClassify_altivec(tempBlock-48, 16, &c);
+					if(t==1) {
+						doVertLowPass_altivec(tempBlock-48, 16, &c);
+						transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+					}
+					else if(t==2) {
+						doVertDefFilter_altivec(tempBlock-48, 16, &c);
+						transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+					}
+#else
 					const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);

 					if(t==1)
 						RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
 					else if(t==2)
 						RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
+#endif
 				}else if(mode & H_A_DEBLOCK){
 					RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
 				}