
Another (final?) patch for libpostproc. This one replaces horizClassify with a transpose/(use Vert)/transpose sequence, which adds the LowPass and DefFilter for "free". I also fixed the header in postprocess.c and special-cased the well-aligned code paths (all the horizontal work is well aligned).

patch by (Romain Dolbeau <dolbeau at irisa dot fr>)

Originally committed as revision 3175 to svn://svn.ffmpeg.org/ffmpeg/trunk

Romain Dolbeau authored on 2004/05/30 10:53:43
Showing 3 changed files
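To make the diff below easier to follow: the new H_DEBLOCK path transposes the 8-line block around the edge into a 16-byte aligned scratch buffer with a fixed stride of 16, runs the existing vertical AltiVec classifier and filters on it, and transposes the result back only when a filter actually ran. The sketch below is condensed from the last hunk of this patch (same calls and offsets, only pulled out of RENAME(postProcess) for readability); the wrapper name is hypothetical, and PPContext, dstBlock and the AltiVec helpers are libpostproc internals, so this is not a standalone program and only builds under HAVE_ALTIVEC.

/* Hypothetical wrapper; the body is condensed from the H_DEBLOCK branch
 * this patch adds to RENAME(postProcess) under HAVE_ALTIVEC. */
static inline void horizDeblock_via_transpose(uint8_t *dstBlock, int stride, PPContext *c)
{
  /* 16-byte aligned scratch block: every row starts on a vector
   * boundary and the stride is a constant 16. */
  unsigned char __attribute__ ((aligned(16))) tempBlock[272];

  /* Gather the (possibly unaligned) pixels around the block edge into
   * the aligned buffer, turning rows into columns. */
  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);

  /* Run the existing *vertical* classifier on the transposed block.
   * The -48 compensates for the stride*3 / stride*4 offsets the
   * vertical helpers apply to their pointer internally. */
  const int t = vertClassify_altivec(tempBlock - 48, 16, c);
  if (t == 1) {
    doVertLowPass_altivec(tempBlock - 48, 16, c);
    transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
  } else if (t == 2) {
    doVertDefFilter_altivec(tempBlock - 48, 16, c);
    transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
  }
  /* If the classifier returns 0 nothing was filtered and the scratch
   * buffer is simply discarded; no transpose back is needed. */
}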

@@ -29,10 +29,10 @@ isVertDC		Ec	Ec			Ec
 isVertMinMaxOk		Ec	Ec			Ec
 doVertLowPass		E		e	e	Ec
 doVertDefFilter		Ec	Ec	e	e	Ec
-isHorizDC		Ec	Ec
-isHorizMinMaxOk		a	E
-doHorizLowPass		E		e	e
-doHorizDefFilter	Ec	Ec	e	e
+isHorizDC		Ec	Ec			Ec
+isHorizMinMaxOk		a	E			Ec
+doHorizLowPass		E		e	e	Ec
+doHorizDefFilter	Ec	Ec	e	e	Ec
 do_a_deblock		Ec	E	Ec	E
 deRing			E		e	e*	Ecp
 Vertical RKAlgo1	E		a	a
@@ -43,7 +43,7 @@ LinIpolDeinterlace	e		E	E*
 CubicIpolDeinterlace	a		e	e*
 LinBlendDeinterlace	e		E	E*
 MedianDeinterlace#	E	Ec	Ec
-TempDeNoiser#		E		e	e
+TempDeNoiser#		E		e	e	Ec

 * i dont have a 3dnow CPU -> its untested, but noone said it doesnt work so it seems to work
 # more or less selfinvented filters so the exactness isnt too meaningfull

@@ -73,7 +73,9 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
   vector signed short v2QP;
   vector unsigned short v4QP;
   vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
+  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
   const vector signed int zero = vec_splat_s32(0);
   const vector signed short mask = vec_splat_s16(1);
   vector signed int v_numEq = vec_splat_s32(0);

@@ -90,6 +92,8 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)

   src2 += stride * 4;

+  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
+
 #define LOAD_LINE(i)							\
   register int j##i = i * stride;					\
   vector unsigned char perm##i = vec_lvsl(j##i, src2);			\

@@ -99,139 +103,41 @@ static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
     v_srcA2##i = vec_ld(j##i + 16, src2);				\
   const vector unsigned char v_srcA##i =				\
     vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)

-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-#undef LOAD_LINE
-
-#define ITER(i, j)							\
-  const vector signed short v_diff##i =					\
-    vec_sub(v_srcAss##i, v_srcAss##j);					\
-  const vector signed short v_sum##i =					\
-    vec_add(v_diff##i, v_dcOffset);					\
-  const vector signed short v_comp##i =					\
-    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,	\
-				   v_dcThreshold);			\
-  const vector signed short v_part##i = vec_and(mask, v_comp##i);	\
-  v_numEq = vec_sum4s(v_part##i, v_numEq);
-
-  ITER(0, 1);
-  ITER(1, 2);
-  ITER(2, 3);
-  ITER(3, 4);
-  ITER(4, 5);
-  ITER(5, 6);
-  ITER(6, 7);
-#undef ITER
-
-  v_numEq = vec_sums(v_numEq, zero);
-
-  v_numEq = vec_splat(v_numEq, 3);
-  vec_ste(v_numEq, 0, &numEq);
-
-  if (numEq > c->ppMode.flatnessThreshold)
-    {
-      const vector unsigned char mmoP1 = (const vector unsigned char)
-	AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
-	    0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
-      const vector unsigned char mmoP2 = (const vector unsigned char)
-	AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
-	    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
-      const vector unsigned char mmoP = (const vector unsigned char)
-	vec_lvsl(8, (unsigned char*)0);
-
-      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
-      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
-      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
-      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
-      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
-      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
-      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
-      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
-
-      if (vec_any_gt(mmoSum, v4QP))
-	return 0;
-      else
-	return 1;
-    }
-  else return 2;
-}
-
-/* this is the same as vertClassify_altivec,
-   with an added 8x8 transpose after the loading,
-   and w/o the stride*4 offset */
-static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c) {
-  /*
-    this code makes no assumption on src or stride.
-    One could remove the recomputation of the perm
-    vector by assuming (stride % 16) == 0, unfortunately
-    this is not always true.
-  */
-  register int y;
-  short __attribute__ ((aligned(16))) data[8];
-  int numEq;
-  uint8_t *src2 = src;
-  vector signed short v_dcOffset;
-  vector signed short v2QP;
-  vector unsigned short v4QP;
-  vector unsigned short v_dcThreshold;
-  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
-  const vector signed int zero = vec_splat_s32(0);
-  const vector signed short mask = vec_splat_s16(1);
-  vector signed int v_numEq = vec_splat_s32(0);
-
-  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
-  data[1] = data[0] * 2 + 1;
-  data[2] = c->QP * 2;
-  data[3] = c->QP * 4;
-  vector signed short v_data = vec_ld(0, data);
-  v_dcOffset = vec_splat(v_data, 0);
-  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
-  v2QP = vec_splat(v_data, 2);
-  v4QP = (vector unsigned short)vec_splat(v_data, 3);
-
-  //  src2 += stride * 4;
-
-#define LOAD_LINE(i)							\
-  register int j##i = i * stride;					\
-  vector unsigned char perm##i = vec_lvsl(j##i, src2);			\
-  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);		\
-  vector unsigned char v_srcA2##i;					\
-  if (two_vectors)							\
-    v_srcA2##i = vec_ld(j##i + 16, src2);				\
-  const vector unsigned char v_srcA##i =				\
-    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);				\
-  vector signed short v_srcAss##i =					\
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  const vector unsigned char v_srcA##i = vec_ld(j##i, src2);            \
+  v_srcAss##i =                                                         \
     (vector signed short)vec_mergeh((vector signed char)zero,		\
 				    (vector signed char)v_srcA##i)

-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks.
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+    }
 #undef LOAD_LINE
-
-  ALTIVEC_TRANSPOSE_8x8_SHORT(v_srcAss0,
-			      v_srcAss1,
-			      v_srcAss2,
-			      v_srcAss3,
-			      v_srcAss4,
-			      v_srcAss5,
-			      v_srcAss6,
-			      v_srcAss7);
+#undef LOAD_LINE_ALIGNED

 #define ITER(i, j)							\
   const vector signed short v_diff##i =					\
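For context on what the aligned special case buys: LOAD_LINE is the classic AltiVec idiom for reading 16 bytes from a possibly unaligned address (a permute mask from vec_lvsl, up to two aligned loads, one vec_perm), while LOAD_LINE_ALIGNED collapses it to a single vec_ld. A generic sketch of the two paths, with hypothetical helper names, not code from the patch:

#include <altivec.h>

/* Generic sketch of the two load paths the patch switches between;
 * not code from the patch itself. */
static inline vector unsigned char load16_unaligned(const unsigned char *p)
{
    /* vec_lvsl derives a permute mask from the low four address bits;
     * the two aligned loads straddling p are then combined into the
     * 16 bytes starting at p.  When p happens to be aligned the second
     * load is unnecessary, which is why the patch reads it only
     * conditionally (the two_vectors test). */
    const vector unsigned char perm = vec_lvsl(0, p);
    const vector unsigned char lo   = vec_ld(0, p);
    const vector unsigned char hi   = vec_ld(16, p);
    return vec_perm(lo, hi, perm);
}

static inline vector unsigned char load16_aligned(const unsigned char *p)
{
    /* With p on a 16-byte boundary a single load is enough: no permute
     * mask, no second load, fewer instructions per line of the block. */
    return vec_ld(0, p);
}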

@@ -286,7 +192,6 @@ static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c)
   else return 2;
 }

-
 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
   /*
     this code makes no assumption on src or stride.

@@ -298,37 +203,65 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   */
   uint8_t *src2 = src;
   const vector signed int zero = vec_splat_s32(0);
+  const int properStride = (stride % 16);
+  const int srcAlign = ((unsigned long)src2 % 16);
   short __attribute__ ((aligned(16))) qp[8];
   qp[0] = c->QP;
   vector signed short vqp = vec_ld(0, qp);
   vqp = vec_splat(vqp, 0);

+  src2 += stride*3;
+
+  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
+  vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9;
+  vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9;
+  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
+
 #define LOAD_LINE(i)                                                    \
   const vector unsigned char perml##i =					\
     vec_lvsl(i * stride, src2);						\
-  const vector unsigned char vbA##i =					\
-    vec_ld(i * stride, src2);						\
-  const vector unsigned char vbB##i =					\
-    vec_ld(i * stride + 16, src2);					\
-  const vector unsigned char vbT##i =					\
-    vec_perm(vbA##i, vbB##i, perml##i);					\
-  const vector signed short vb##i =					\
+  vbA##i = vec_ld(i * stride, src2);                                    \
+  vbB##i = vec_ld(i * stride + 16, src2);                               \
+  vbT##i = vec_perm(vbA##i, vbB##i, perml##i);                          \
+  vb##i =                                                               \
     (vector signed short)vec_mergeh((vector unsigned char)zero,		\
 				    (vector unsigned char)vbT##i)
-
-  src2 += stride*3;

-  LOAD_LINE(0);
-  LOAD_LINE(1);
-  LOAD_LINE(2);
-  LOAD_LINE(3);
-  LOAD_LINE(4);
-  LOAD_LINE(5);
-  LOAD_LINE(6);
-  LOAD_LINE(7);
-  LOAD_LINE(8);
-  LOAD_LINE(9);
+#define LOAD_LINE_ALIGNED(i)                                            \
+  register int j##i = i * stride;                                       \
+  vbT##i = vec_ld(j##i, src2);                                          \
+  vb##i =                                                               \
+    (vector signed short)vec_mergeh((vector signed char)zero,		\
+				    (vector signed char)vbT##i)
+
+    // special casing the aligned case is worthwhile, as all calls from
+    // the (transposed) horizontal deblocks will be aligned, in addition
+    // to the naturally aligned vertical deblocks.
+    if (properStride && srcAlign) {
+      LOAD_LINE_ALIGNED(0);
+      LOAD_LINE_ALIGNED(1);
+      LOAD_LINE_ALIGNED(2);
+      LOAD_LINE_ALIGNED(3);
+      LOAD_LINE_ALIGNED(4);
+      LOAD_LINE_ALIGNED(5);
+      LOAD_LINE_ALIGNED(6);
+      LOAD_LINE_ALIGNED(7);
+      LOAD_LINE_ALIGNED(8);
+      LOAD_LINE_ALIGNED(9);
+    } else {
+      LOAD_LINE(0);
+      LOAD_LINE(1);
+      LOAD_LINE(2);
+      LOAD_LINE(3);
+      LOAD_LINE(4);
+      LOAD_LINE(5);
+      LOAD_LINE(6);
+      LOAD_LINE(7);
+      LOAD_LINE(8);
+      LOAD_LINE(9);
+    }
 #undef LOAD_LINE
+#undef LOAD_LINE_ALIGNED

   const vector unsigned short v_1 = vec_splat_u16(1);
   const vector unsigned short v_2 = vec_splat_u16(2);

@@ -413,16 +346,37 @@ static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
   vec_st(svA##i, i * stride, src2);				\
   vec_st(svB##i, i * stride + 16, src2)

-  PACK_AND_STORE(1);
-  PACK_AND_STORE(2);
-  PACK_AND_STORE(3);
-  PACK_AND_STORE(4);
-  PACK_AND_STORE(5);
-  PACK_AND_STORE(6);
-  PACK_AND_STORE(7);
-  PACK_AND_STORE(8);
-
+#define PACK_AND_STORE_ALIGNED(i)				\
+  const vector unsigned char vf##i =				\
+    vec_packsu(vr##i, (vector signed short)zero);		\
+  const vector unsigned char vg##i =				\
+    vec_perm(vf##i, vbT##i, permHH);				\
+  vec_st(vg##i, i * stride, src2)
+
+  // special casing the aligned case is worthwhile, as all calls from
+  // the (transposed) horizontal deblocks will be aligned, in addition
+  // to the naturally aligned vertical deblocks.
+  if (properStride && srcAlign) {
+    PACK_AND_STORE_ALIGNED(1);
+    PACK_AND_STORE_ALIGNED(2);
+    PACK_AND_STORE_ALIGNED(3);
+    PACK_AND_STORE_ALIGNED(4);
+    PACK_AND_STORE_ALIGNED(5);
+    PACK_AND_STORE_ALIGNED(6);
+    PACK_AND_STORE_ALIGNED(7);
+    PACK_AND_STORE_ALIGNED(8);
+  } else {
+    PACK_AND_STORE(1);
+    PACK_AND_STORE(2);
+    PACK_AND_STORE(3);
+    PACK_AND_STORE(4);
+    PACK_AND_STORE(5);
+    PACK_AND_STORE(6);
+    PACK_AND_STORE(7);
+    PACK_AND_STORE(8);
+  }
 #undef PACK_AND_STORE
+#undef PACK_AND_STORE_ALIGNED
 }

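The same idea applies on the store side. permHH is defined earlier in the function and is not visible in this hunk; assuming it selects the eight filtered bytes from its first operand and the remaining eight original bytes from its second, PACK_AND_STORE_ALIGNED for one row boils down to roughly the sketch below (hypothetical helper name and GCC-style vector literal, not code from the patch):

#include <altivec.h>

/* Sketch of what PACK_AND_STORE_ALIGNED does for one row, assuming permHH
 * picks bytes 0..7 from the packed result and bytes 8..15 from the
 * originally loaded destination row.  Not code from the patch. */
static inline void store_row_aligned(vector signed short result,
                                     vector unsigned char original,
                                     unsigned char *row /* 16-byte aligned */)
{
    const vector signed short zero = vec_splat_s16(0);
    /* Assumed equivalent of permHH: keep the 8 filtered bytes, preserve
     * the other 8 bytes of the destination vector. */
    const vector unsigned char permHH =
        {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
         0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};

    /* Saturate the 8 filtered shorts back down to unsigned bytes. */
    const vector unsigned char packed = vec_packsu(result, zero);
    /* Merge the filtered bytes with the untouched half of the row. */
    const vector unsigned char merged = vec_perm(packed, original, permHH);
    /* One aligned store replaces the blend-into-two-vectors and double
     * store needed on the unaligned path. */
    vec_st(merged, 0, row);
}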
 

@@ -1043,3 +997,200 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
   PACK_AND_STORE(tempBlured, 7);
 #undef PACK_AND_STORE
 }
+
+static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+
+#define LOAD_DOUBLE_LINE(i, j)						\
+  vector unsigned char perm1##i = vec_lvsl(i * stride, src);		\
+  vector unsigned char perm2##i = vec_lvsl(j * stride, src);		\
+  vector unsigned char srcA##i = vec_ld(i * stride, src);		\
+  vector unsigned char srcB##i = vec_ld(i * stride + 16, src);          \
+  vector unsigned char srcC##i = vec_ld(j * stride, src);		\
+  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src);           \
+  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i);	\
+  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
+
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+#undef LOAD_DOUBLE_LINE
+
+  vector unsigned char tempA = vec_mergeh(src0, zero);
+  vector unsigned char tempB = vec_mergel(src0, zero);
+  vector unsigned char tempC = vec_mergeh(src1, zero);
+  vector unsigned char tempD = vec_mergel(src1, zero);
+  vector unsigned char tempE = vec_mergeh(src2, zero);
+  vector unsigned char tempF = vec_mergel(src2, zero);
+  vector unsigned char tempG = vec_mergeh(src3, zero);
+  vector unsigned char tempH = vec_mergel(src3, zero);
+  vector unsigned char tempI = vec_mergeh(src4, zero);
+  vector unsigned char tempJ = vec_mergel(src4, zero);
+  vector unsigned char tempK = vec_mergeh(src5, zero);
+  vector unsigned char tempL = vec_mergel(src5, zero);
+  vector unsigned char tempM = vec_mergeh(src6, zero);
+  vector unsigned char tempN = vec_mergel(src6, zero);
+  vector unsigned char tempO = vec_mergeh(src7, zero);
+  vector unsigned char tempP = vec_mergel(src7, zero);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
+  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
+  vector unsigned char temp7 = vec_mergel(tempD, tempL);
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
+  vector unsigned char temp11 = vec_mergel(tempF, tempN);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
+  vector unsigned char temp15 = vec_mergel(tempH, tempP);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempE = vec_mergeh(temp2, temp10);
+  tempF = vec_mergel(temp2, temp10);
+  tempG = vec_mergeh(temp3, temp11);
+  tempH = vec_mergel(temp3, temp11);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+  tempM = vec_mergeh(temp6, temp14);
+  tempN = vec_mergel(temp6, temp14);
+  tempO = vec_mergeh(temp7, temp15);
+  tempP = vec_mergel(temp7, temp15);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+  temp8 = vec_mergeh(tempE, tempM);
+  temp9 = vec_mergel(tempE, tempM);
+  temp10 = vec_mergeh(tempF, tempN);
+  temp11 = vec_mergel(tempF, tempN);
+  temp12 = vec_mergeh(tempG, tempO);
+  temp13 = vec_mergel(tempG, tempO);
+  temp14 = vec_mergeh(tempH, tempP);
+  temp15 = vec_mergel(tempH, tempP);
+
+  vec_st(temp0, 0, dst);
+  vec_st(temp1, 16, dst);
+  vec_st(temp2, 32, dst);
+  vec_st(temp3, 48, dst);
+  vec_st(temp4, 64, dst);
+  vec_st(temp5, 80, dst);
+  vec_st(temp6, 96, dst);
+  vec_st(temp7, 112, dst);
+  vec_st(temp8, 128, dst);
+  vec_st(temp9, 144, dst);
+  vec_st(temp10, 160, dst);
+  vec_st(temp11, 176, dst);
+  vec_st(temp12, 192, dst);
+  vec_st(temp13, 208, dst);
+  vec_st(temp14, 224, dst);
+  vec_st(temp15, 240, dst);
+}
+
+static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
+  const vector unsigned char zero = vec_splat_u8(0);
+  const vector unsigned char magic_perm = (const vector unsigned char)
+    AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+	0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
+#define LOAD_DOUBLE_LINE(i, j)			    		\
+  vector unsigned char src##i = vec_ld(i * 16, src);		\
+  vector unsigned char src##j = vec_ld(j * 16, src)
+
+  LOAD_DOUBLE_LINE(0, 1);
+  LOAD_DOUBLE_LINE(2, 3);
+  LOAD_DOUBLE_LINE(4, 5);
+  LOAD_DOUBLE_LINE(6, 7);
+  LOAD_DOUBLE_LINE(8, 9);
+  LOAD_DOUBLE_LINE(10, 11);
+  LOAD_DOUBLE_LINE(12, 13);
+  LOAD_DOUBLE_LINE(14, 15);
+#undef LOAD_DOUBLE_LINE
+
+  vector unsigned char tempA = vec_mergeh(src0, src8);
+  vector unsigned char tempB;
+  vector unsigned char tempC = vec_mergeh(src1, src9);
+  vector unsigned char tempD;
+  vector unsigned char tempE = vec_mergeh(src2, src10);
+  vector unsigned char tempG = vec_mergeh(src3, src11);
+  vector unsigned char tempI = vec_mergeh(src4, src12);
+  vector unsigned char tempJ;
+  vector unsigned char tempK = vec_mergeh(src5, src13);
+  vector unsigned char tempL;
+  vector unsigned char tempM = vec_mergeh(src6, src14);
+  vector unsigned char tempO = vec_mergeh(src7, src15);
+
+  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
+  vector unsigned char temp1 = vec_mergel(tempA, tempI);
+  vector unsigned char temp2;
+  vector unsigned char temp3;
+  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
+  vector unsigned char temp5 = vec_mergel(tempC, tempK);
+  vector unsigned char temp6;
+  vector unsigned char temp7;
+  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
+  vector unsigned char temp9 = vec_mergel(tempE, tempM);
+  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
+  vector unsigned char temp13 = vec_mergel(tempG, tempO);
+
+  tempA = vec_mergeh(temp0, temp8);
+  tempB = vec_mergel(temp0, temp8);
+  tempC = vec_mergeh(temp1, temp9);
+  tempD = vec_mergel(temp1, temp9);
+  tempI = vec_mergeh(temp4, temp12);
+  tempJ = vec_mergel(temp4, temp12);
+  tempK = vec_mergeh(temp5, temp13);
+  tempL = vec_mergel(temp5, temp13);
+
+  temp0 = vec_mergeh(tempA, tempI);
+  temp1 = vec_mergel(tempA, tempI);
+  temp2 = vec_mergeh(tempB, tempJ);
+  temp3 = vec_mergel(tempB, tempJ);
+  temp4 = vec_mergeh(tempC, tempK);
+  temp5 = vec_mergel(tempC, tempK);
+  temp6 = vec_mergeh(tempD, tempL);
+  temp7 = vec_mergel(tempD, tempL);
+
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+#define STORE_DOUBLE_LINE(i, j)						\
+  vector unsigned char dstA##i = vec_ld(i * stride, dst);		\
+  vector unsigned char dstB##i = vec_ld(i * stride + 16, dst);		\
+  vector unsigned char dstA##j = vec_ld(j * stride, dst);		\
+  vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst);		\
+  vector unsigned char align##i = vec_lvsr(i * stride, dst);		\
+  vector unsigned char align##j = vec_lvsr(j * stride, dst);		\
+  vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i);	\
+  vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j);	\
+  vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i);	\
+  vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j);	\
+  vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i);	\
+  vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i);	\
+  vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j);	\
+  vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j);	\
+  vec_st(dstAF##i, i * stride, dst);					\
+  vec_st(dstBF##i, i * stride + 16, dst);				\
+  vec_st(dstAF##j, j * stride, dst);					\
+  vec_st(dstBF##j, j * stride + 16, dst)
+
+  STORE_DOUBLE_LINE(0,1);
+  STORE_DOUBLE_LINE(2,3);
+  STORE_DOUBLE_LINE(4,5);
+  STORE_DOUBLE_LINE(6,7);
+}
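Read scalar-wise, the two new transpose helpers implement the following layout, as far as can be told from the merge trees above: the forward transpose writes column x of the 8-line source block into the first 8 bytes of row x of a packed buffer with stride 16 (the upper 8 bytes of each packed row end up zero and are never read back), and the inverse reads those bytes back into 8 destination rows of 16 pixels. A plain C illustration with hypothetical names, not replacement code:

/* Scalar reading of the two AltiVec transpose helpers above; the vector
 * versions compute the same result with vec_mergeh/vec_mergel trees.
 * Illustration only, not replacement code. */

/* 8 source rows of 16 pixels -> 16 packed rows of stride 16, each
 * holding one source column in its first 8 bytes. */
static void transpose_16x8_to_packed_scalar(unsigned char *dst,
                                            const unsigned char *src, int stride)
{
    for (int x = 0; x < 16; x++) {
        for (int y = 0; y < 8; y++)
            dst[x * 16 + y] = src[y * stride + x];
        for (int y = 8; y < 16; y++)
            dst[x * 16 + y] = 0;   /* padding; the filters and the inverse
                                    * transpose only touch the first 8 bytes */
    }
}

/* Inverse: 16 packed rows -> 8 destination rows of 16 pixels. */
static void transpose_8x16_from_packed_scalar(unsigned char *dst,
                                              const unsigned char *src, int stride)
{
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 16; x++)
            dst[y * stride + x] = src[x * 16 + y];
}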

@@ -3684,12 +3684,27 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int
 					horizX1Filter(dstBlock-4, stride, QP);
 				else if(mode & H_DEBLOCK)
 				{
+#ifdef HAVE_ALTIVEC
+					unsigned char __attribute__ ((aligned(16))) tempBlock[272];
+					transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
+
+					const int t=vertClassify_altivec(tempBlock-48, 16, &c);
+					if(t==1) {
+						doVertLowPass_altivec(tempBlock-48, 16, &c);
+						transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+					}
+					else if(t==2) {
+						doVertDefFilter_altivec(tempBlock-48, 16, &c);
+						transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
+					}
+#else
 					const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);

 					if(t==1)
 						RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
 					else if(t==2)
 						RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
+#endif
 				}else if(mode & H_A_DEBLOCK){
 					RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
 				}