Browse code

Fixed a rounding bug in the X1 filter; changed the X1 filter slightly so that flat blocks look as they do with the 9-tap LPF; made a minor change to the -pp numbers and added decimal numbers in comments; added a new experimental horizontal deblocking filter.

Originally committed as revision 2180 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

Michael Niedermayer authored on 2001/10/13 11:31:15
Showing 3 changed files
... ...
@@ -27,8 +27,9 @@ isHorizMinMaxOk		a
27 27
 doHorizLowPass		E		a	a*
28 28
 doHorizDefFilter	E	ac	ac
29 29
 deRing
30
-RKAlgo1			E		a	a*
31
-X1			a		E	E*
30
+Vertical RKAlgo1	E		a	a*
31
+Vertical X1		a		E	E*
32
+Horizontal X1		a		E	E*
32 33
 
33 34
 
34 35
 * I don't have a 3DNow! CPU -> it's untested
... ...
@@ -40,7 +41,7 @@ c = checked against the other implementations (-vo md5)
40 40
 
41 41
 /*
42 42
 TODO:
43
-verify that everything workes as it should
43
+verify that everything works as it should (how?)
44 44
 reduce the time wasted on the mem transfer
45 45
 implement dering
46 46
 implement everything in C at least (done at the moment but ...)
... ...
@@ -51,6 +52,9 @@ write a faster and higher quality deblocking filter :)
51 51
 do something about the speed of the horizontal filters
52 52
 make the mainloop more flexible (variable number of blocks at once
53 53
 	(the if/else stuff per block is slowing things down)
54
+compare the quality & speed of all filters
55
+implement a few simple deinterlacing filters
56
+split this huge file
54 57
 ...
55 58
 
56 59
 Notes:
... ...
@@ -58,7 +62,7 @@ Notes:
58 58
 */
59 59
 
60 60
 /*
61
-Changelog:
61
+Changelog: use the CVS log
62 62
 0.1.3
63 63
 	bugfixes: last 3 lines not brightness/contrast corrected
64 64
 		brightness statistics messed up with initial black pic
... ...
@@ -99,11 +103,13 @@ static uint64_t bm10000000=	0xFF00000000000000LL;
99 99
 static uint64_t bm10000001=	0xFF000000000000FFLL;
100 100
 static uint64_t bm11000011=	0xFFFF00000000FFFFLL;
101 101
 static uint64_t bm00000011=	0x000000000000FFFFLL;
102
+static uint64_t bm11111110=	0xFFFFFFFFFFFFFF00LL;
102 103
 static uint64_t bm11000000=	0xFFFF000000000000LL;
103 104
 static uint64_t bm00011000=	0x000000FFFF000000LL;
104 105
 static uint64_t bm00110011=	0x0000FFFF0000FFFFLL;
105 106
 static uint64_t bm11001100=	0xFFFF0000FFFF0000LL;
106 107
 static uint64_t b00= 		0x0000000000000000LL;
108
+static uint64_t b01= 		0x0101010101010101LL;
107 109
 static uint64_t b02= 		0x0202020202020202LL;
108 110
 static uint64_t b0F= 		0x0F0F0F0F0F0F0F0FLL;
109 111
 static uint64_t bFF= 		0xFFFFFFFFFFFFFFFFLL;
... ...
@@ -544,7 +550,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
544 544
 	x/8 = 1
545 545
 	1 12 12 23
546 546
  */
547
-static inline void vertRKFilter(uint8_t *src, int stride, int QP)
547
+static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
548 548
 {
549 549
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
550 550
 // FIXME rounding
... ...
@@ -638,7 +644,8 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
638 638
 
639 639
 /**
640 640
  * Experimental Filter 1
641
- * will nor damage linear gradients
641
+ * will not damage linear gradients
642
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
642 643
  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
643 644
  * MMX2 version does correct clipping C version doesnt
644 645
  */
... ...
@@ -675,9 +682,13 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
675 675
 		"movq %%mm4, %%mm3				\n\t" // d
676 676
 		"psubusb pQPb, %%mm4				\n\t"
677 677
 		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= QP ? -1 : 0
678
+		"psubusb b01, %%mm3				\n\t"
678 679
 		"pand %%mm4, %%mm3				\n\t" // d <= QP ? d : 0
679 680
 
680 681
 		PAVGB(%%mm7, %%mm3)				      // d/2
682
+		"movq %%mm3, %%mm1				\n\t" // d/2
683
+		PAVGB(%%mm7, %%mm3)				      // d/4
684
+		PAVGB(%%mm1, %%mm3)				      // 3*d/8
681 685
 
682 686
 		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
683 687
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
... ...
@@ -691,31 +702,31 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
691 691
 		"pxor %%mm2, %%mm0				\n\t"
692 692
 		"movq %%mm0, (%%ebx)				\n\t" // line 5
693 693
 
694
-		PAVGB(%%mm7, %%mm3)				      // d/4
694
+		PAVGB(%%mm7, %%mm1)				      // d/4
695 695
 
696 696
 		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
697 697
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
698
-		"psubusb %%mm3, %%mm0				\n\t"
698
+		"psubusb %%mm1, %%mm0				\n\t"
699 699
 		"pxor %%mm2, %%mm0				\n\t"
700 700
 		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
701 701
 
702 702
 		"movq (%%ebx, %1), %%mm0			\n\t" // line 6
703 703
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
704
-		"paddusb %%mm3, %%mm0				\n\t"
704
+		"paddusb %%mm1, %%mm0				\n\t"
705 705
 		"pxor %%mm2, %%mm0				\n\t"
706 706
 		"movq %%mm0, (%%ebx, %1)			\n\t" // line 6
707 707
 
708
-		PAVGB(%%mm7, %%mm3)				      // d/8
708
+		PAVGB(%%mm7, %%mm1)				      // d/8
709 709
 
710 710
 		"movq (%%eax, %1), %%mm0			\n\t" // line 2
711 711
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
712
-		"psubusb %%mm3, %%mm0				\n\t"
712
+		"psubusb %%mm1, %%mm0				\n\t"
713 713
 		"pxor %%mm2, %%mm0				\n\t"
714 714
 		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
715 715
 
716 716
 		"movq (%%ebx, %1, 2), %%mm0			\n\t" // line 7
717 717
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
718
-		"paddusb %%mm3, %%mm0				\n\t"
718
+		"paddusb %%mm1, %%mm0				\n\t"
719 719
 		"pxor %%mm2, %%mm0				\n\t"
720 720
 		"movq %%mm0, (%%ebx, %1, 2)			\n\t" // line 7
721 721
 
... ...
@@ -739,7 +750,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
739 739
 	{
740 740
 		int a= src[l3] - src[l4];
741 741
 		int b= src[l4] - src[l5];
742
-		int c= src[l6] - src[l7];
742
+		int c= src[l5] - src[l6];
743 743
 
744 744
 		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
745 745
 
... ...
@@ -749,8 +760,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
749 749
 
750 750
 			src[l2] +=v/8;
751 751
 			src[l3] +=v/4;
752
-			src[l4] +=v/2;
753
-			src[l5] -=v/2;
752
+			src[l4] +=3*v/8;
753
+			src[l5] -=3*v/8;
754 754
 			src[l6] -=v/4;
755 755
 			src[l7] -=v/8;
756 756
 
... ...
@@ -789,6 +800,211 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
789 789
 #endif
790 790
 }
791 791
 
792
+/**
793
+ * Experimental Filter 1 (Horizontal): filters 8 horizontally adjacent pixels per row, row by row (rows are stride bytes apart)
794
+ * will not damage linear gradients
795
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
796
+ * can only smooth blocks at the expected locations (it can't smooth them if they have moved)
797
+ * MMX2 version does correct clipping, C version doesn't
798
+ * not identical with the vertical one; the C fallback only filters a row when d < QP
799
+ */
800
+static inline void horizX1Filter(uint8_t *src, int stride, int QP)
801
+{
802
+	int y;
803
+	static uint64_t *lut= NULL;	// 256-entry correction table, built once on the first call
804
+	if(lut==NULL)
805
+	{
806
+		int i;
807
+		lut= (uint64_t*)memalign(8, 256*8);
808
+		for(i=0; i<256; i++)
809
+		{
810
+			int v= i < 128 ? 2*i : 2*(i-256);	// table index read as a signed byte, doubled
811
+/*
812
+//Simulate 112242211 9-Tap filter
813
+			uint64_t a= (v/16) & 0xFF;
814
+			uint64_t b= (v/8) & 0xFF;
815
+			uint64_t c= (v/4) & 0xFF;
816
+			uint64_t d= (3*v/8) & 0xFF;
817
+*/
818
+//Simulate piecewise linear interpolation
819
+			uint64_t a= (v/16) & 0xFF;
820
+			uint64_t b= (v*3/16) & 0xFF;
821
+			uint64_t c= (v*5/16) & 0xFF;
822
+			uint64_t d= (7*v/16) & 0xFF;
823
+			uint64_t A= (0x100 - a)&0xFF;	// A..D: byte-wise two's complement negations of a..d
824
+			uint64_t B= (0x100 - b)&0xFF;
825
+			uint64_t C= (0x100 - c)&0xFF;
826
+			uint64_t D= (0x100 - d)&0xFF;	// negation of d (was computed from c, a copy-paste slip)
827
+
828
+			lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |	// bytes 7..4: +a,+b,+c,+d
829
+				(D<<24) | (C<<16) | (B<<8) | (A);	// bytes 3..0: -d,-c,-b,-a
830
+			//lut[i] = (v<<32) | (v<<24);
831
+		}
832
+	}
833
+
834
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
835
+	asm volatile(
836
+		"pxor %%mm7, %%mm7				\n\t" // 0
837
+//		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
838
+		"leal (%0, %1), %%eax				\n\t"
839
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
840
+
841
+		"movq b80, %%mm6				\n\t"
842
+		"movd %2, %%mm5					\n\t" // QP
843
+		"movq %%mm5, %%mm4				\n\t"
844
+		"paddusb %%mm5, %%mm5				\n\t" // 2QP
845
+		"paddusb %%mm5, %%mm4				\n\t" // 3QP
846
+		"pxor %%mm5, %%mm5				\n\t" // 0
847
+		"psubb %%mm4, %%mm5				\n\t" // -3QP
848
+		"por bm11111110, %%mm5				\n\t" // ...,FF,FF,-3QP
849
+		"psllq $24, %%mm5				\n\t"
850
+
851
+//	0	1	2	3	4	5	6	7	8	9
852
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
853
+
854
+#define HX1old(a) \
855
+		"movd " #a ", %%mm0				\n\t"\
856
+		"movd 4" #a ", %%mm1				\n\t"\
857
+		"punpckldq %%mm1, %%mm0				\n\t"\
858
+		"movq %%mm0, %%mm1				\n\t"\
859
+		"movq %%mm0, %%mm2				\n\t"\
860
+		"psrlq $8, %%mm1				\n\t"\
861
+		"psubusb %%mm1, %%mm2				\n\t"\
862
+		"psubusb %%mm0, %%mm1				\n\t"\
863

                
864

                
865

                
866

                
867

                
868
+		"psubusb %%mm3, %%mm1			\n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
869
+		"paddb %%mm5, %%mm1				\n\t"\
870
+		"psubusb %%mm5, %%mm1				\n\t"\
871
+		PAVGB(%%mm7, %%mm1)\
872
+		"pxor %%mm2, %%mm1				\n\t"\
873
+		"psubb %%mm2, %%mm1				\n\t"\
874
+		"psrlq $24, %%mm1				\n\t"\
875
+		"movd %%mm1, %%ecx				\n\t"\
876
+		"paddb %%mm6, %%mm0				\n\t"\
877
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
878
+		"paddb %%mm6, %%mm0				\n\t"\
879
+		"movq %%mm0, " #a "				\n\t"\
880
+
881
+/*
882
+HX1old((%0))
883
+HX1old((%%eax))
884
+HX1old((%%eax, %1))
885
+HX1old((%%eax, %1, 2))
886
+HX1old((%0, %1, 4))
887
+HX1old((%%ebx))
888
+HX1old((%%ebx, %1))
889
+HX1old((%%ebx, %1, 2))
890
+*/
891
+
892
+//FIXME add some comments, it's unreadable ...
893
+#define HX1b(a, c, b, d) \
894
+		"movd " #a ", %%mm0				\n\t"\
895
+		"movd 4" #a ", %%mm1				\n\t"\
896
+		"punpckldq %%mm1, %%mm0				\n\t"\
897
+		"movd " #b ", %%mm4				\n\t"\
898
+		"movq %%mm0, %%mm1				\n\t"\
899
+		"movq %%mm0, %%mm2				\n\t"\
900
+		"psrlq $8, %%mm1				\n\t"\
901
+		"movd 4" #b ", %%mm3				\n\t"\
902
+		"psubusb %%mm1, %%mm2				\n\t"\
903
+		"psubusb %%mm0, %%mm1				\n\t"\
904

                
905

                
906
+		"punpckldq %%mm3, %%mm4				\n\t"\
907
+		"movq %%mm1, %%mm3				\n\t"\
908

                
909

                
910
+		"paddb %%mm6, %%mm0				\n\t"\
911

                
912
+		"psubusb %%mm3, %%mm1			\n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
913
+		"movq %%mm4, %%mm3				\n\t"\
914
+		"paddb %%mm5, %%mm1				\n\t"\
915
+		"psubusb %%mm5, %%mm1				\n\t"\
916
+		"psrlq $8, %%mm3				\n\t"\
917
+		PAVGB(%%mm7, %%mm1)\
918
+		"pxor %%mm2, %%mm1				\n\t"\
919
+		"psubb %%mm2, %%mm1				\n\t"\
920
+		"movq %%mm4, %%mm2				\n\t"\
921
+		"psrlq $24, %%mm1				\n\t"\
922
+		"psubusb %%mm3, %%mm2				\n\t"\
923
+		"movd %%mm1, %%ecx				\n\t"\
924
+		"psubusb %%mm4, %%mm3				\n\t"\
925
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
926

                
927
+		"paddb %%mm6, %%mm0				\n\t"\
928

                
929
+		"movq %%mm3, %%mm1				\n\t"\
930

                
931
+		"movq %%mm0, " #a "				\n\t"\
932

                
933
+		"paddb %%mm6, %%mm4				\n\t"\
934

                
935
+		"psubusb %%mm1, %%mm3			\n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
936
+		"paddb %%mm5, %%mm3				\n\t"\
937
+		"psubusb %%mm5, %%mm3				\n\t"\
938
+		PAVGB(%%mm7, %%mm3)\
939
+		"pxor %%mm2, %%mm3				\n\t"\
940
+		"psubb %%mm2, %%mm3				\n\t"\
941
+		"psrlq $24, %%mm3				\n\t"\
942
+		"movd " #c ", %%mm0				\n\t"\
943
+		"movd 4" #c ", %%mm1				\n\t"\
944
+		"punpckldq %%mm1, %%mm0				\n\t"\
945
+		"paddb %%mm6, %%mm0				\n\t"\
946
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
947
+		"paddb %%mm6, %%mm0				\n\t"\
948
+		"movq %%mm0, " #c "				\n\t"\
949
+		"movd %%mm3, %%ecx				\n\t"\
950
+		"movd " #d ", %%mm0				\n\t"\
951
+		"paddsb (%3, %%ecx, 8), %%mm4			\n\t"\
952
+		"movd 4" #d ", %%mm1				\n\t"\
953
+		"paddb %%mm6, %%mm4				\n\t"\
954
+		"punpckldq %%mm1, %%mm0				\n\t"\
955
+		"movq %%mm4, " #b "				\n\t"\
956
+		"paddb %%mm6, %%mm0				\n\t"\
957
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
958
+		"paddb %%mm6, %%mm0				\n\t"\
959
+		"movq %%mm0, " #d "				\n\t"\
960
+
961
+HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
962
+HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
963
+
964
+
965
+		:
966
+		: "r" (src), "r" (stride), "r" (QP), "r" (lut)
967
+		: "%eax", "%ebx", "%ecx"
968
+	);
969
+#else
970
+
971
+//FIXME (has little in common with the mmx2 version)
972
+	for(y=0; y<BLOCK_SIZE; y++)
973
+	{
974
+		int a= src[1] - src[2];
975
+		int b= src[3] - src[4];	// step across the middle pixel pair (3|4)
976
+		int c= src[5] - src[6];
977
+
978
+		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);	// middle step minus the average of the two outer steps, floored at 0
979
+
980
+		if(d < QP)
981
+		{
982
+			int v = d * SIGN(-b);
983
+
984
+			src[1] +=v/8;
985
+			src[2] +=v/4;
986
+			src[3] +=3*v/8;
987
+			src[4] -=3*v/8;
988
+			src[5] -=v/4;
989
+			src[6] -=v/8;
990
+
991
+		}
992
+		src+=stride;
993
+	}
994
+#endif
995
+}
+
792 997
 
793 998
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
794 999
 {
... ...
@@ -1638,13 +1854,14 @@ void  postprocess(unsigned char * src[], int src_stride,
1638 1638
 	vertical_size   >>= 1;
1639 1639
 	src_stride      >>= 1;
1640 1640
 	dst_stride      >>= 1;
1641
+	mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
1641 1642
 
1642 1643
 	if(1)
1643 1644
 	{
1644 1645
 		postProcess(src[1], src_stride, dst[1], dst_stride,
1645
-			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
1646
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1646 1647
 		postProcess(src[2], src_stride, dst[2], dst_stride,
1647
-			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
1648
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1648 1649
 	}
1649 1650
 	else
1650 1651
 	{
... ...
@@ -1929,9 +2146,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
1929 1929
 #endif
1930 1930
 				if(mode & V_DEBLOCK)
1931 1931
 				{
1932
-					if(mode & RK_FILTER)
1933
-						vertRKFilter(vertBlock, stride, QP);
1934
-					else if(mode & X1_FILTER)
1932
+					if(mode & V_RK1_FILTER)
1933
+						vertRK1Filter(vertBlock, stride, QP);
1934
+					else if(mode & V_X1_FILTER)
1935 1935
 						vertX1Filter(vertBlock, stride, QP);
1936 1936
 					else
1937 1937
 					{
... ...
@@ -1962,13 +2179,18 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
1962 1962
 #endif
1963 1963
 				if(mode & H_DEBLOCK)
1964 1964
 				{
1965
-					if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1965
+					if(mode & H_X1_FILTER)
1966
+						horizX1Filter(dstBlock-4, stride, QP);
1967
+					else
1966 1968
 					{
1967
-						if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
1968
-							doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
1969
+						if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1970
+						{
1971
+							if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
1972
+								doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
1973
+						}
1974
+						else
1975
+							doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1969 1976
 					}
1970
-					else
1971
-						doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1972 1977
 				}
1973 1978
 #ifdef MORE_TIMEING
1974 1979
 				T1= rdtsc();
... ...
@@ -28,24 +28,23 @@
28 28
 #define DERING		0x04
29 29
 #define LEVEL_FIX	0x08 /* Brightness & Contrast */
30 30
 
31
-#define LUM_V_DEBLOCK	V_DEBLOCK
32
-#define LUM_H_DEBLOCK	H_DEBLOCK
33
-#define CHROM_V_DEBLOCK	(V_DEBLOCK<<4)
34
-#define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)
35
-#define LUM_DERING	DERING
36
-#define CHROM_DERING	(DERING<<4)
37
-#define LUM_LEVEL_FIX	LEVEL_FIX
31
+#define LUM_V_DEBLOCK	V_DEBLOCK		//   1
32
+#define LUM_H_DEBLOCK	H_DEBLOCK		//   2
33
+#define CHROM_V_DEBLOCK	(V_DEBLOCK<<4)		//  16
34
+#define CHROM_H_DEBLOCK	(H_DEBLOCK<<4)		//  32
35
+#define LUM_DERING	DERING			//   4
36
+#define CHROM_DERING	(DERING<<4)		//  64
37
+#define LUM_LEVEL_FIX	LEVEL_FIX		//   8
38 38
 //not supported currently
39
-#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)
39
+#define CHROM_LEVEL_FIX	(LEVEL_FIX<<4)		// 128
40 40
 
41
-// Experimental stuff
42
-#define RK_FILTER		0x0100
43
-#define LUM_V_RK_FILTER		RK_FILTER
44
-#define CHROM_V_RK_FILTER	(RK_FILTER<<4)
41
+// Experimental vertical filters
42
+#define V_RK1_FILTER	0x0100			// 256
43
+#define V_X1_FILTER	0x0200			// 512
45 44
 
46
-#define X1_FILTER		0x0200
47
-#define LUM_V_X1_FILTER		X1_FILTER
48
-#define CHROM_V_X1_FILTER	(X1_FILTER<<4)
45
+// Experimental horizontal filters
46
+#define H_RK1_FILTER	0x1000			// 4096
47
+#define H_X1_FILTER	0x2000			// 8192
49 48
 
50 49
 
51 50
 #define TIMEING
... ...
@@ -27,8 +27,9 @@ isHorizMinMaxOk		a
27 27
 doHorizLowPass		E		a	a*
28 28
 doHorizDefFilter	E	ac	ac
29 29
 deRing
30
-RKAlgo1			E		a	a*
31
-X1			a		E	E*
30
+Vertical RKAlgo1	E		a	a*
31
+Vertical X1		a		E	E*
32
+Horizontal X1		a		E	E*
32 33
 
33 34
 
34 35
 * I don't have a 3DNow! CPU -> it's untested
... ...
@@ -40,7 +41,7 @@ c = checked against the other implementations (-vo md5)
40 40
 
41 41
 /*
42 42
 TODO:
43
-verify that everything workes as it should
43
+verify that everything works as it should (how?)
44 44
 reduce the time wasted on the mem transfer
45 45
 implement dering
46 46
 implement everything in C at least (done at the moment but ...)
... ...
@@ -51,6 +52,9 @@ write a faster and higher quality deblocking filter :)
51 51
 do something about the speed of the horizontal filters
52 52
 make the mainloop more flexible (variable number of blocks at once
53 53
 	(the if/else stuff per block is slowing things down)
54
+compare the quality & speed of all filters
55
+implement a few simple deinterlacing filters
56
+split this huge file
54 57
 ...
55 58
 
56 59
 Notes:
... ...
@@ -58,7 +62,7 @@ Notes:
58 58
 */
59 59
 
60 60
 /*
61
-Changelog:
61
+Changelog: use the CVS log
62 62
 0.1.3
63 63
 	bugfixes: last 3 lines not brightness/contrast corrected
64 64
 		brightness statistics messed up with initial black pic
... ...
@@ -99,11 +103,13 @@ static uint64_t bm10000000=	0xFF00000000000000LL;
99 99
 static uint64_t bm10000001=	0xFF000000000000FFLL;
100 100
 static uint64_t bm11000011=	0xFFFF00000000FFFFLL;
101 101
 static uint64_t bm00000011=	0x000000000000FFFFLL;
102
+static uint64_t bm11111110=	0xFFFFFFFFFFFFFF00LL;
102 103
 static uint64_t bm11000000=	0xFFFF000000000000LL;
103 104
 static uint64_t bm00011000=	0x000000FFFF000000LL;
104 105
 static uint64_t bm00110011=	0x0000FFFF0000FFFFLL;
105 106
 static uint64_t bm11001100=	0xFFFF0000FFFF0000LL;
106 107
 static uint64_t b00= 		0x0000000000000000LL;
108
+static uint64_t b01= 		0x0101010101010101LL;
107 109
 static uint64_t b02= 		0x0202020202020202LL;
108 110
 static uint64_t b0F= 		0x0F0F0F0F0F0F0F0FLL;
109 111
 static uint64_t bFF= 		0xFFFFFFFFFFFFFFFFLL;
... ...
@@ -544,7 +550,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP)
544 544
 	x/8 = 1
545 545
 	1 12 12 23
546 546
  */
547
-static inline void vertRKFilter(uint8_t *src, int stride, int QP)
547
+static inline void vertRK1Filter(uint8_t *src, int stride, int QP)
548 548
 {
549 549
 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
550 550
 // FIXME rounding
... ...
@@ -638,7 +644,8 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP)
638 638
 
639 639
 /**
640 640
  * Experimental Filter 1
641
- * will nor damage linear gradients
641
+ * will not damage linear gradients
642
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
642 643
  * can only smooth blocks at the expected locations (it cant smooth them if they did move)
643 644
  * MMX2 version does correct clipping C version doesnt
644 645
  */
... ...
@@ -675,9 +682,13 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
675 675
 		"movq %%mm4, %%mm3				\n\t" // d
676 676
 		"psubusb pQPb, %%mm4				\n\t"
677 677
 		"pcmpeqb %%mm7, %%mm4				\n\t" // d <= QP ? -1 : 0
678
+		"psubusb b01, %%mm3				\n\t"
678 679
 		"pand %%mm4, %%mm3				\n\t" // d <= QP ? d : 0
679 680
 
680 681
 		PAVGB(%%mm7, %%mm3)				      // d/2
682
+		"movq %%mm3, %%mm1				\n\t" // d/2
683
+		PAVGB(%%mm7, %%mm3)				      // d/4
684
+		PAVGB(%%mm1, %%mm3)				      // 3*d/8
681 685
 
682 686
 		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
683 687
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
... ...
@@ -691,31 +702,31 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
691 691
 		"pxor %%mm2, %%mm0				\n\t"
692 692
 		"movq %%mm0, (%%ebx)				\n\t" // line 5
693 693
 
694
-		PAVGB(%%mm7, %%mm3)				      // d/4
694
+		PAVGB(%%mm7, %%mm1)				      // d/4
695 695
 
696 696
 		"movq (%%eax, %1, 2), %%mm0			\n\t" // line 3
697 697
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
698
-		"psubusb %%mm3, %%mm0				\n\t"
698
+		"psubusb %%mm1, %%mm0				\n\t"
699 699
 		"pxor %%mm2, %%mm0				\n\t"
700 700
 		"movq %%mm0, (%%eax, %1, 2)			\n\t" // line 3
701 701
 
702 702
 		"movq (%%ebx, %1), %%mm0			\n\t" // line 6
703 703
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
704
-		"paddusb %%mm3, %%mm0				\n\t"
704
+		"paddusb %%mm1, %%mm0				\n\t"
705 705
 		"pxor %%mm2, %%mm0				\n\t"
706 706
 		"movq %%mm0, (%%ebx, %1)			\n\t" // line 6
707 707
 
708
-		PAVGB(%%mm7, %%mm3)				      // d/8
708
+		PAVGB(%%mm7, %%mm1)				      // d/8
709 709
 
710 710
 		"movq (%%eax, %1), %%mm0			\n\t" // line 2
711 711
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
712
-		"psubusb %%mm3, %%mm0				\n\t"
712
+		"psubusb %%mm1, %%mm0				\n\t"
713 713
 		"pxor %%mm2, %%mm0				\n\t"
714 714
 		"movq %%mm0, (%%eax, %1)			\n\t" // line 2
715 715
 
716 716
 		"movq (%%ebx, %1, 2), %%mm0			\n\t" // line 7
717 717
 		"pxor %%mm2, %%mm0				\n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
718
-		"paddusb %%mm3, %%mm0				\n\t"
718
+		"paddusb %%mm1, %%mm0				\n\t"
719 719
 		"pxor %%mm2, %%mm0				\n\t"
720 720
 		"movq %%mm0, (%%ebx, %1, 2)			\n\t" // line 7
721 721
 
... ...
@@ -739,7 +750,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
739 739
 	{
740 740
 		int a= src[l3] - src[l4];
741 741
 		int b= src[l4] - src[l5];
742
-		int c= src[l6] - src[l7];
742
+		int c= src[l5] - src[l6];
743 743
 
744 744
 		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);
745 745
 
... ...
@@ -749,8 +760,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
749 749
 
750 750
 			src[l2] +=v/8;
751 751
 			src[l3] +=v/4;
752
-			src[l4] +=v/2;
753
-			src[l5] -=v/2;
752
+			src[l4] +=3*v/8;
753
+			src[l5] -=3*v/8;
754 754
 			src[l6] -=v/4;
755 755
 			src[l7] -=v/8;
756 756
 
... ...
@@ -789,6 +800,211 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP)
789 789
 #endif
790 790
 }
791 791
 
792
+/**
793
+ * Experimental Filter 1 (Horizontal): filters 8 horizontally adjacent pixels per row, row by row (rows are stride bytes apart)
794
+ * will not damage linear gradients
795
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
796
+ * can only smooth blocks at the expected locations (it can't smooth them if they have moved)
797
+ * MMX2 version does correct clipping, C version doesn't
798
+ * not identical with the vertical one; the C fallback only filters a row when d < QP
799
+ */
800
+static inline void horizX1Filter(uint8_t *src, int stride, int QP)
801
+{
802
+	int y;
803
+	static uint64_t *lut= NULL;	// 256-entry correction table, built once on the first call
804
+	if(lut==NULL)
805
+	{
806
+		int i;
807
+		lut= (uint64_t*)memalign(8, 256*8);
808
+		for(i=0; i<256; i++)
809
+		{
810
+			int v= i < 128 ? 2*i : 2*(i-256);	// table index read as a signed byte, doubled
811
+/*
812
+//Simulate 112242211 9-Tap filter
813
+			uint64_t a= (v/16) & 0xFF;
814
+			uint64_t b= (v/8) & 0xFF;
815
+			uint64_t c= (v/4) & 0xFF;
816
+			uint64_t d= (3*v/8) & 0xFF;
817
+*/
818
+//Simulate piecewise linear interpolation
819
+			uint64_t a= (v/16) & 0xFF;
820
+			uint64_t b= (v*3/16) & 0xFF;
821
+			uint64_t c= (v*5/16) & 0xFF;
822
+			uint64_t d= (7*v/16) & 0xFF;
823
+			uint64_t A= (0x100 - a)&0xFF;	// A..D: byte-wise two's complement negations of a..d
824
+			uint64_t B= (0x100 - b)&0xFF;
825
+			uint64_t C= (0x100 - c)&0xFF;
826
+			uint64_t D= (0x100 - d)&0xFF;	// negation of d (was computed from c, a copy-paste slip)
827
+
828
+			lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |	// bytes 7..4: +a,+b,+c,+d
829
+				(D<<24) | (C<<16) | (B<<8) | (A);	// bytes 3..0: -d,-c,-b,-a
830
+			//lut[i] = (v<<32) | (v<<24);
831
+		}
832
+	}
833
+
834
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
835
+	asm volatile(
836
+		"pxor %%mm7, %%mm7				\n\t" // 0
837
+//		"movq b80, %%mm6				\n\t" // MIN_SIGNED_BYTE
838
+		"leal (%0, %1), %%eax				\n\t"
839
+		"leal (%%eax, %1, 4), %%ebx			\n\t"
840
+
841
+		"movq b80, %%mm6				\n\t"
842
+		"movd %2, %%mm5					\n\t" // QP
843
+		"movq %%mm5, %%mm4				\n\t"
844
+		"paddusb %%mm5, %%mm5				\n\t" // 2QP
845
+		"paddusb %%mm5, %%mm4				\n\t" // 3QP
846
+		"pxor %%mm5, %%mm5				\n\t" // 0
847
+		"psubb %%mm4, %%mm5				\n\t" // -3QP
848
+		"por bm11111110, %%mm5				\n\t" // ...,FF,FF,-3QP
849
+		"psllq $24, %%mm5				\n\t"
850
+
851
+//	0	1	2	3	4	5	6	7	8	9
852
+//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
853
+
854
+#define HX1old(a) \
855
+		"movd " #a ", %%mm0				\n\t"\
856
+		"movd 4" #a ", %%mm1				\n\t"\
857
+		"punpckldq %%mm1, %%mm0				\n\t"\
858
+		"movq %%mm0, %%mm1				\n\t"\
859
+		"movq %%mm0, %%mm2				\n\t"\
860
+		"psrlq $8, %%mm1				\n\t"\
861
+		"psubusb %%mm1, %%mm2				\n\t"\
862
+		"psubusb %%mm0, %%mm1				\n\t"\
863

                
864

                
865

                
866

                
867

                
868
+		"psubusb %%mm3, %%mm1			\n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
869
+		"paddb %%mm5, %%mm1				\n\t"\
870
+		"psubusb %%mm5, %%mm1				\n\t"\
871
+		PAVGB(%%mm7, %%mm1)\
872
+		"pxor %%mm2, %%mm1				\n\t"\
873
+		"psubb %%mm2, %%mm1				\n\t"\
874
+		"psrlq $24, %%mm1				\n\t"\
875
+		"movd %%mm1, %%ecx				\n\t"\
876
+		"paddb %%mm6, %%mm0				\n\t"\
877
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
878
+		"paddb %%mm6, %%mm0				\n\t"\
879
+		"movq %%mm0, " #a "				\n\t"\
880
+
881
+/*
882
+HX1old((%0))
883
+HX1old((%%eax))
884
+HX1old((%%eax, %1))
885
+HX1old((%%eax, %1, 2))
886
+HX1old((%0, %1, 4))
887
+HX1old((%%ebx))
888
+HX1old((%%ebx, %1))
889
+HX1old((%%ebx, %1, 2))
890
+*/
891
+
892
+//FIXME add some comments, it's unreadable ...
893
+#define HX1b(a, c, b, d) \
894
+		"movd " #a ", %%mm0				\n\t"\
895
+		"movd 4" #a ", %%mm1				\n\t"\
896
+		"punpckldq %%mm1, %%mm0				\n\t"\
897
+		"movd " #b ", %%mm4				\n\t"\
898
+		"movq %%mm0, %%mm1				\n\t"\
899
+		"movq %%mm0, %%mm2				\n\t"\
900
+		"psrlq $8, %%mm1				\n\t"\
901
+		"movd 4" #b ", %%mm3				\n\t"\
902
+		"psubusb %%mm1, %%mm2				\n\t"\
903
+		"psubusb %%mm0, %%mm1				\n\t"\
904

                
905

                
906
+		"punpckldq %%mm3, %%mm4				\n\t"\
907
+		"movq %%mm1, %%mm3				\n\t"\
908

                
909

                
910
+		"paddb %%mm6, %%mm0				\n\t"\
911

                
912
+		"psubusb %%mm3, %%mm1			\n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
913
+		"movq %%mm4, %%mm3				\n\t"\
914
+		"paddb %%mm5, %%mm1				\n\t"\
915
+		"psubusb %%mm5, %%mm1				\n\t"\
916
+		"psrlq $8, %%mm3				\n\t"\
917
+		PAVGB(%%mm7, %%mm1)\
918
+		"pxor %%mm2, %%mm1				\n\t"\
919
+		"psubb %%mm2, %%mm1				\n\t"\
920
+		"movq %%mm4, %%mm2				\n\t"\
921
+		"psrlq $24, %%mm1				\n\t"\
922
+		"psubusb %%mm3, %%mm2				\n\t"\
923
+		"movd %%mm1, %%ecx				\n\t"\
924
+		"psubusb %%mm4, %%mm3				\n\t"\
925
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
926

                
927
+		"paddb %%mm6, %%mm0				\n\t"\
928

                
929
+		"movq %%mm3, %%mm1				\n\t"\
930

                
931
+		"movq %%mm0, " #a "				\n\t"\
932

                
933
+		"paddb %%mm6, %%mm4				\n\t"\
934

                
935
+		"psubusb %%mm1, %%mm3			\n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\
936
+		"paddb %%mm5, %%mm3				\n\t"\
937
+		"psubusb %%mm5, %%mm3				\n\t"\
938
+		PAVGB(%%mm7, %%mm3)\
939
+		"pxor %%mm2, %%mm3				\n\t"\
940
+		"psubb %%mm2, %%mm3				\n\t"\
941
+		"psrlq $24, %%mm3				\n\t"\
942
+		"movd " #c ", %%mm0				\n\t"\
943
+		"movd 4" #c ", %%mm1				\n\t"\
944
+		"punpckldq %%mm1, %%mm0				\n\t"\
945
+		"paddb %%mm6, %%mm0				\n\t"\
946
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
947
+		"paddb %%mm6, %%mm0				\n\t"\
948
+		"movq %%mm0, " #c "				\n\t"\
949
+		"movd %%mm3, %%ecx				\n\t"\
950
+		"movd " #d ", %%mm0				\n\t"\
951
+		"paddsb (%3, %%ecx, 8), %%mm4			\n\t"\
952
+		"movd 4" #d ", %%mm1				\n\t"\
953
+		"paddb %%mm6, %%mm4				\n\t"\
954
+		"punpckldq %%mm1, %%mm0				\n\t"\
955
+		"movq %%mm4, " #b "				\n\t"\
956
+		"paddb %%mm6, %%mm0				\n\t"\
957
+		"paddsb (%3, %%ecx, 8), %%mm0			\n\t"\
958
+		"paddb %%mm6, %%mm0				\n\t"\
959
+		"movq %%mm0, " #d "				\n\t"\
960
+
961
+HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2))
962
+HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2))
963
+
964
+
965
+		:
966
+		: "r" (src), "r" (stride), "r" (QP), "r" (lut)
967
+		: "%eax", "%ebx", "%ecx"
968
+	);
969
+#else
970
+
971
+//FIXME (has little in common with the mmx2 version)
972
+	for(y=0; y<BLOCK_SIZE; y++)
973
+	{
974
+		int a= src[1] - src[2];
975
+		int b= src[3] - src[4];	// step across the middle pixel pair (3|4)
976
+		int c= src[5] - src[6];
977
+
978
+		int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0);	// middle step minus the average of the two outer steps, floored at 0
979
+
980
+		if(d < QP)
981
+		{
982
+			int v = d * SIGN(-b);
983
+
984
+			src[1] +=v/8;
985
+			src[2] +=v/4;
986
+			src[3] +=3*v/8;
987
+			src[4] -=3*v/8;
988
+			src[5] -=v/4;
989
+			src[6] -=v/8;
990
+
991
+		}
992
+		src+=stride;
993
+	}
994
+#endif
995
+}
+
792 997
 
793 998
 static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
794 999
 {
... ...
@@ -1638,13 +1854,14 @@ void  postprocess(unsigned char * src[], int src_stride,
1638 1638
 	vertical_size   >>= 1;
1639 1639
 	src_stride      >>= 1;
1640 1640
 	dst_stride      >>= 1;
1641
+	mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00);
1641 1642
 
1642 1643
 	if(1)
1643 1644
 	{
1644 1645
 		postProcess(src[1], src_stride, dst[1], dst_stride,
1645
-			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
1646
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1646 1647
 		postProcess(src[2], src_stride, dst[2], dst_stride,
1647
-			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4);
1648
+			horizontal_size, vertical_size, QP_store, QP_stride, 1, mode);
1648 1649
 	}
1649 1650
 	else
1650 1651
 	{
... ...
@@ -1929,9 +2146,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
1929 1929
 #endif
1930 1930
 				if(mode & V_DEBLOCK)
1931 1931
 				{
1932
-					if(mode & RK_FILTER)
1933
-						vertRKFilter(vertBlock, stride, QP);
1934
-					else if(mode & X1_FILTER)
1932
+					if(mode & V_RK1_FILTER)
1933
+						vertRK1Filter(vertBlock, stride, QP);
1934
+					else if(mode & V_X1_FILTER)
1935 1935
 						vertX1Filter(vertBlock, stride, QP);
1936 1936
 					else
1937 1937
 					{
... ...
@@ -1962,13 +2179,18 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int
1962 1962
 #endif
1963 1963
 				if(mode & H_DEBLOCK)
1964 1964
 				{
1965
-					if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1965
+					if(mode & H_X1_FILTER)
1966
+						horizX1Filter(dstBlock-4, stride, QP);
1967
+					else
1966 1968
 					{
1967
-						if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
1968
-							doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
1969
+						if( isHorizDCAndCopy2Temp(dstBlock-4, stride))
1970
+						{
1971
+							if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP))
1972
+								doHorizLowPassAndCopyBack(dstBlock-4, stride, QP);
1973
+						}
1974
+						else
1975
+							doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1969 1976
 					}
1970
-					else
1971
-						doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP);
1972 1977
 				}
1973 1978
 #ifdef MORE_TIMEING
1974 1979
 				T1= rdtsc();