Originally committed as revision 2180 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
Michael Niedermayer authored on 2001/10/13 11:31:15... | ... |
@@ -27,8 +27,9 @@ isHorizMinMaxOk a |
27 | 27 |
doHorizLowPass E a a* |
28 | 28 |
doHorizDefFilter E ac ac |
29 | 29 |
deRing |
30 |
-RKAlgo1 E a a* |
|
31 |
-X1 a E E* |
|
30 |
+Vertical RKAlgo1 E a a* |
|
31 |
+Vertical X1 a E E* |
|
32 |
+Horizontal X1 a E E* |
|
32 | 33 |
|
33 | 34 |
|
34 | 35 |
* I don't have a 3DNow CPU -> it's untested |
... | ... |
@@ -40,7 +41,7 @@ c = checked against the other implementations (-vo md5) |
40 | 40 |
|
41 | 41 |
/* |
42 | 42 |
TODO: |
43 |
-verify that everything workes as it should |
|
43 |
+verify that everything works as it should (how?) |
|
44 | 44 |
reduce the time wasted on the mem transfer |
45 | 45 |
implement dering |
46 | 46 |
implement everything in C at least (done at the moment but ...) |
... | ... |
@@ -51,6 +52,9 @@ write a faster and higher quality deblocking filter :) |
51 | 51 |
do something about the speed of the horizontal filters |
52 | 52 |
make the mainloop more flexible (variable number of blocks at once |
53 | 53 |
(the if/else stuff per block is slowing things down) |
54 |
+compare the quality & speed of all filters |
|
55 |
+implement a few simple deinterlacing filters |
|
56 |
+split this huge file |
|
54 | 57 |
... |
55 | 58 |
|
56 | 59 |
Notes: |
... | ... |
@@ -58,7 +62,7 @@ Notes: |
58 | 58 |
*/ |
59 | 59 |
|
60 | 60 |
/* |
61 |
-Changelog: |
|
61 |
+Changelog: use the CVS log |
|
62 | 62 |
0.1.3 |
63 | 63 |
bugfixes: last 3 lines not brightness/contrast corrected |
64 | 64 |
brightness statistics messed up with initial black pic |
... | ... |
@@ -99,11 +103,13 @@ static uint64_t bm10000000= 0xFF00000000000000LL; |
99 | 99 |
static uint64_t bm10000001= 0xFF000000000000FFLL; |
100 | 100 |
static uint64_t bm11000011= 0xFFFF00000000FFFFLL; |
101 | 101 |
static uint64_t bm00000011= 0x000000000000FFFFLL; |
102 |
+static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL; |
|
102 | 103 |
static uint64_t bm11000000= 0xFFFF000000000000LL; |
103 | 104 |
static uint64_t bm00011000= 0x000000FFFF000000LL; |
104 | 105 |
static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; |
105 | 106 |
static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; |
106 | 107 |
static uint64_t b00= 0x0000000000000000LL; |
108 |
+static uint64_t b01= 0x0101010101010101LL; |
|
107 | 109 |
static uint64_t b02= 0x0202020202020202LL; |
108 | 110 |
static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; |
109 | 111 |
static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; |
... | ... |
@@ -544,7 +550,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP) |
544 | 544 |
x/8 = 1 |
545 | 545 |
1 12 12 23 |
546 | 546 |
*/ |
547 |
-static inline void vertRKFilter(uint8_t *src, int stride, int QP) |
|
547 |
+static inline void vertRK1Filter(uint8_t *src, int stride, int QP) |
|
548 | 548 |
{ |
549 | 549 |
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
550 | 550 |
// FIXME rounding |
... | ... |
@@ -638,7 +644,8 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP) |
638 | 638 |
|
639 | 639 |
/** |
640 | 640 |
* Experimental Filter 1 |
641 |
- * will nor damage linear gradients |
|
641 |
+ * will not damage linear gradients |
|
642 |
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
642 | 643 |
* can only smooth blocks at the expected locations (it can't smooth them if they moved) |
643 | 644 |
* MMX2 version does correct clipping, C version doesn't |
644 | 645 |
*/ |
... | ... |
@@ -675,9 +682,13 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
675 | 675 |
"movq %%mm4, %%mm3 \n\t" // d |
676 | 676 |
"psubusb pQPb, %%mm4 \n\t" |
677 | 677 |
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
678 |
+ "psubusb b01, %%mm3 \n\t" |
|
678 | 679 |
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
679 | 680 |
|
680 | 681 |
PAVGB(%%mm7, %%mm3) // d/2 |
682 |
+ "movq %%mm3, %%mm1 \n\t" // d/2 |
|
683 |
+ PAVGB(%%mm7, %%mm3) // d/4 |
|
684 |
+ PAVGB(%%mm1, %%mm3) // 3*d/8 |
|
681 | 685 |
|
682 | 686 |
"movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
683 | 687 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
... | ... |
@@ -691,31 +702,31 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
691 | 691 |
"pxor %%mm2, %%mm0 \n\t" |
692 | 692 |
"movq %%mm0, (%%ebx) \n\t" // line 5 |
693 | 693 |
|
694 |
- PAVGB(%%mm7, %%mm3) // d/4 |
|
694 |
+ PAVGB(%%mm7, %%mm1) // d/4 |
|
695 | 695 |
|
696 | 696 |
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
697 | 697 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
698 |
- "psubusb %%mm3, %%mm0 \n\t" |
|
698 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
699 | 699 |
"pxor %%mm2, %%mm0 \n\t" |
700 | 700 |
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
701 | 701 |
|
702 | 702 |
"movq (%%ebx, %1), %%mm0 \n\t" // line 6 |
703 | 703 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
704 |
- "paddusb %%mm3, %%mm0 \n\t" |
|
704 |
+ "paddusb %%mm1, %%mm0 \n\t" |
|
705 | 705 |
"pxor %%mm2, %%mm0 \n\t" |
706 | 706 |
"movq %%mm0, (%%ebx, %1) \n\t" // line 6 |
707 | 707 |
|
708 |
- PAVGB(%%mm7, %%mm3) // d/8 |
|
708 |
+ PAVGB(%%mm7, %%mm1) // d/8 |
|
709 | 709 |
|
710 | 710 |
"movq (%%eax, %1), %%mm0 \n\t" // line 2 |
711 | 711 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
712 |
- "psubusb %%mm3, %%mm0 \n\t" |
|
712 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
713 | 713 |
"pxor %%mm2, %%mm0 \n\t" |
714 | 714 |
"movq %%mm0, (%%eax, %1) \n\t" // line 2 |
715 | 715 |
|
716 | 716 |
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 |
717 | 717 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
718 |
- "paddusb %%mm3, %%mm0 \n\t" |
|
718 |
+ "paddusb %%mm1, %%mm0 \n\t" |
|
719 | 719 |
"pxor %%mm2, %%mm0 \n\t" |
720 | 720 |
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 |
721 | 721 |
|
... | ... |
@@ -739,7 +750,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
739 | 739 |
{ |
740 | 740 |
int a= src[l3] - src[l4]; |
741 | 741 |
int b= src[l4] - src[l5]; |
742 |
- int c= src[l6] - src[l7]; |
|
742 |
+ int c= src[l5] - src[l6]; |
|
743 | 743 |
|
744 | 744 |
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
745 | 745 |
|
... | ... |
@@ -749,8 +760,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
749 | 749 |
|
750 | 750 |
src[l2] +=v/8; |
751 | 751 |
src[l3] +=v/4; |
752 |
- src[l4] +=v/2; |
|
753 |
- src[l5] -=v/2; |
|
752 |
+ src[l4] +=3*v/8; |
|
753 |
+ src[l5] -=3*v/8; |
|
754 | 754 |
src[l6] -=v/4; |
755 | 755 |
src[l7] -=v/8; |
756 | 756 |
|
... | ... |
@@ -789,6 +800,211 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
789 | 789 |
#endif |
790 | 790 |
} |
791 | 791 |
|
792 |
+/** |
|
793 |
+ * Experimental Filter 1 (Horizontal) |
|
794 |
+ * will not damage linear gradients |
|
795 |
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
796 |
+ * can only smooth blocks at the expected locations (it can't smooth them if they moved) |
|
797 |
+ * MMX2 version does correct clipping, C version doesn't |
|
798 |
+ * not identical with the vertical one |
|
799 |
+ */ |
|
800 |
+static inline void horizX1Filter(uint8_t *src, int stride, int QP) |
|
801 |
+{ |
|
802 |
+ int y; |
|
803 |
+ static uint64_t *lut= NULL; |
|
804 |
+ if(lut==NULL) |
|
805 |
+ { |
|
806 |
+ int i; |
|
807 |
+ lut= (uint64_t*)memalign(8, 256*8); |
|
808 |
+ for(i=0; i<256; i++) |
|
809 |
+ { |
|
810 |
+ int v= i < 128 ? 2*i : 2*(i-256); |
|
811 |
+/* |
|
812 |
+//Simulate 112242211 9-Tap filter |
|
813 |
+ uint64_t a= (v/16) & 0xFF; |
|
814 |
+ uint64_t b= (v/8) & 0xFF; |
|
815 |
+ uint64_t c= (v/4) & 0xFF; |
|
816 |
+ uint64_t d= (3*v/8) & 0xFF; |
|
817 |
+*/ |
|
818 |
+//Simulate piecewise linear interpolation |
|
819 |
+ uint64_t a= (v/16) & 0xFF; |
|
820 |
+ uint64_t b= (v*3/16) & 0xFF; |
|
821 |
+ uint64_t c= (v*5/16) & 0xFF; |
|
822 |
+ uint64_t d= (7*v/16) & 0xFF; |
|
823 |
+ uint64_t A= (0x100 - a)&0xFF; |
|
824 |
+ uint64_t B= (0x100 - b)&0xFF; |
|
825 |
+ uint64_t C= (0x100 - c)&0xFF; |
|
826 |
+ uint64_t D= (0x100 - c)&0xFF; |
|
827 |
+ |
|
828 |
+ lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | |
|
829 |
+ (D<<24) | (C<<16) | (B<<8) | (A); |
|
830 |
+ //lut[i] = (v<<32) | (v<<24); |
|
831 |
+ } |
|
832 |
+ } |
|
833 |
+ |
|
834 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
835 |
+ asm volatile( |
|
836 |
+ "pxor %%mm7, %%mm7 \n\t" // 0 |
|
837 |
+// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
|
838 |
+ "leal (%0, %1), %%eax \n\t" |
|
839 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
840 |
+ |
|
841 |
+ "movq b80, %%mm6 \n\t" |
|
842 |
+ "movd %2, %%mm5 \n\t" // QP |
|
843 |
+ "movq %%mm5, %%mm4 \n\t" |
|
844 |
+ "paddusb %%mm5, %%mm5 \n\t" // 2QP |
|
845 |
+ "paddusb %%mm5, %%mm4 \n\t" // 3QP |
|
846 |
+ "pxor %%mm5, %%mm5 \n\t" // 0 |
|
847 |
+ "psubb %%mm4, %%mm5 \n\t" // -3QP |
|
848 |
+ "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP |
|
849 |
+ "psllq $24, %%mm5 \n\t" |
|
850 |
+ |
|
851 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
852 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
853 |
+ |
|
854 |
+#define HX1old(a) \ |
|
855 |
+ "movd " #a ", %%mm0 \n\t"\ |
|
856 |
+ "movd 4" #a ", %%mm1 \n\t"\ |
|
857 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
858 |
+ "movq %%mm0, %%mm1 \n\t"\ |
|
859 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
860 |
+ "psrlq $8, %%mm1 \n\t"\ |
|
861 |
+ "psubusb %%mm1, %%mm2 \n\t"\ |
|
862 |
+ "psubusb %%mm0, %%mm1 \n\t"\ |
|
863 | ||
864 | ||
865 | ||
866 | ||
867 | ||
868 |
+ "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ |
|
869 |
+ "paddb %%mm5, %%mm1 \n\t"\ |
|
870 |
+ "psubusb %%mm5, %%mm1 \n\t"\ |
|
871 |
+ PAVGB(%%mm7, %%mm1)\ |
|
872 |
+ "pxor %%mm2, %%mm1 \n\t"\ |
|
873 |
+ "psubb %%mm2, %%mm1 \n\t"\ |
|
874 |
+ "psrlq $24, %%mm1 \n\t"\ |
|
875 |
+ "movd %%mm1, %%ecx \n\t"\ |
|
876 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
877 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
878 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
879 |
+ "movq %%mm0, " #a " \n\t"\ |
|
880 |
+ |
|
881 |
+/* |
|
882 |
+HX1old((%0)) |
|
883 |
+HX1old((%%eax)) |
|
884 |
+HX1old((%%eax, %1)) |
|
885 |
+HX1old((%%eax, %1, 2)) |
|
886 |
+HX1old((%0, %1, 4)) |
|
887 |
+HX1old((%%ebx)) |
|
888 |
+HX1old((%%ebx, %1)) |
|
889 |
+HX1old((%%ebx, %1, 2)) |
|
890 |
+*/ |
|
891 |
+ |
|
892 |
+//FIXME add some comments, it's unreadable ... |
|
893 |
+#define HX1b(a, c, b, d) \ |
|
894 |
+ "movd " #a ", %%mm0 \n\t"\ |
|
895 |
+ "movd 4" #a ", %%mm1 \n\t"\ |
|
896 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
897 |
+ "movd " #b ", %%mm4 \n\t"\ |
|
898 |
+ "movq %%mm0, %%mm1 \n\t"\ |
|
899 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
900 |
+ "psrlq $8, %%mm1 \n\t"\ |
|
901 |
+ "movd 4" #b ", %%mm3 \n\t"\ |
|
902 |
+ "psubusb %%mm1, %%mm2 \n\t"\ |
|
903 |
+ "psubusb %%mm0, %%mm1 \n\t"\ |
|
904 | ||
905 | ||
906 |
+ "punpckldq %%mm3, %%mm4 \n\t"\ |
|
907 |
+ "movq %%mm1, %%mm3 \n\t"\ |
|
908 | ||
909 | ||
910 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
911 | ||
912 |
+ "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ |
|
913 |
+ "movq %%mm4, %%mm3 \n\t"\ |
|
914 |
+ "paddb %%mm5, %%mm1 \n\t"\ |
|
915 |
+ "psubusb %%mm5, %%mm1 \n\t"\ |
|
916 |
+ "psrlq $8, %%mm3 \n\t"\ |
|
917 |
+ PAVGB(%%mm7, %%mm1)\ |
|
918 |
+ "pxor %%mm2, %%mm1 \n\t"\ |
|
919 |
+ "psubb %%mm2, %%mm1 \n\t"\ |
|
920 |
+ "movq %%mm4, %%mm2 \n\t"\ |
|
921 |
+ "psrlq $24, %%mm1 \n\t"\ |
|
922 |
+ "psubusb %%mm3, %%mm2 \n\t"\ |
|
923 |
+ "movd %%mm1, %%ecx \n\t"\ |
|
924 |
+ "psubusb %%mm4, %%mm3 \n\t"\ |
|
925 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
926 | ||
927 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
928 | ||
929 |
+ "movq %%mm3, %%mm1 \n\t"\ |
|
930 | ||
931 |
+ "movq %%mm0, " #a " \n\t"\ |
|
932 | ||
933 |
+ "paddb %%mm6, %%mm4 \n\t"\ |
|
934 | ||
935 |
+ "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ |
|
936 |
+ "paddb %%mm5, %%mm3 \n\t"\ |
|
937 |
+ "psubusb %%mm5, %%mm3 \n\t"\ |
|
938 |
+ PAVGB(%%mm7, %%mm3)\ |
|
939 |
+ "pxor %%mm2, %%mm3 \n\t"\ |
|
940 |
+ "psubb %%mm2, %%mm3 \n\t"\ |
|
941 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
942 |
+ "movd " #c ", %%mm0 \n\t"\ |
|
943 |
+ "movd 4" #c ", %%mm1 \n\t"\ |
|
944 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
945 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
946 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
947 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
948 |
+ "movq %%mm0, " #c " \n\t"\ |
|
949 |
+ "movd %%mm3, %%ecx \n\t"\ |
|
950 |
+ "movd " #d ", %%mm0 \n\t"\ |
|
951 |
+ "paddsb (%3, %%ecx, 8), %%mm4 \n\t"\ |
|
952 |
+ "movd 4" #d ", %%mm1 \n\t"\ |
|
953 |
+ "paddb %%mm6, %%mm4 \n\t"\ |
|
954 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
955 |
+ "movq %%mm4, " #b " \n\t"\ |
|
956 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
957 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
958 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
959 |
+ "movq %%mm0, " #d " \n\t"\ |
|
960 |
+ |
|
961 |
+HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) |
|
962 |
+HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) |
|
963 |
+ |
|
964 |
+ |
|
965 |
+ : |
|
966 |
+ : "r" (src), "r" (stride), "r" (QP), "r" (lut) |
|
967 |
+ : "%eax", "%ebx", "%ecx" |
|
968 |
+ ); |
|
969 |
+#else |
|
970 |
+ |
|
971 |
+//FIXME (has little in common with the mmx2 version) |
|
972 |
+ for(y=0; y<BLOCK_SIZE; y++) |
|
973 |
+ { |
|
974 |
+ int a= src[1] - src[2]; |
|
975 |
+ int b= src[3] - src[4]; |
|
976 |
+ int c= src[5] - src[6]; |
|
977 |
+ |
|
978 |
+ int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
|
979 |
+ |
|
980 |
+ if(d < QP) |
|
981 |
+ { |
|
982 |
+ int v = d * SIGN(-b); |
|
983 |
+ |
|
984 |
+ src[1] +=v/8; |
|
985 |
+ src[2] +=v/4; |
|
986 |
+ src[3] +=3*v/8; |
|
987 |
+ src[4] -=3*v/8; |
|
988 |
+ src[5] -=v/4; |
|
989 |
+ src[6] -=v/8; |
|
990 |
+ |
|
991 |
+ } |
|
992 |
+ src+=stride; |
|
993 |
+ } |
|
994 |
+#endif |
|
995 |
+} |
|
996 |
+ |
|
792 | 997 |
|
793 | 998 |
static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
794 | 999 |
{ |
... | ... |
@@ -1638,13 +1854,14 @@ void postprocess(unsigned char * src[], int src_stride, |
1638 | 1638 |
vertical_size >>= 1; |
1639 | 1639 |
src_stride >>= 1; |
1640 | 1640 |
dst_stride >>= 1; |
1641 |
+ mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); |
|
1641 | 1642 |
|
1642 | 1643 |
if(1) |
1643 | 1644 |
{ |
1644 | 1645 |
postProcess(src[1], src_stride, dst[1], dst_stride, |
1645 |
- horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); |
|
1646 |
+ horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
|
1646 | 1647 |
postProcess(src[2], src_stride, dst[2], dst_stride, |
1647 |
- horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); |
|
1648 |
+ horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
|
1648 | 1649 |
} |
1649 | 1650 |
else |
1650 | 1651 |
{ |
... | ... |
@@ -1929,9 +2146,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int |
1929 | 1929 |
#endif |
1930 | 1930 |
if(mode & V_DEBLOCK) |
1931 | 1931 |
{ |
1932 |
- if(mode & RK_FILTER) |
|
1933 |
- vertRKFilter(vertBlock, stride, QP); |
|
1934 |
- else if(mode & X1_FILTER) |
|
1932 |
+ if(mode & V_RK1_FILTER) |
|
1933 |
+ vertRK1Filter(vertBlock, stride, QP); |
|
1934 |
+ else if(mode & V_X1_FILTER) |
|
1935 | 1935 |
vertX1Filter(vertBlock, stride, QP); |
1936 | 1936 |
else |
1937 | 1937 |
{ |
... | ... |
@@ -1962,13 +2179,18 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int |
1962 | 1962 |
#endif |
1963 | 1963 |
if(mode & H_DEBLOCK) |
1964 | 1964 |
{ |
1965 |
- if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
|
1965 |
+ if(mode & H_X1_FILTER) |
|
1966 |
+ horizX1Filter(dstBlock-4, stride, QP); |
|
1967 |
+ else |
|
1966 | 1968 |
{ |
1967 |
- if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) |
|
1968 |
- doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); |
|
1969 |
+ if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
|
1970 |
+ { |
|
1971 |
+ if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) |
|
1972 |
+ doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); |
|
1973 |
+ } |
|
1974 |
+ else |
|
1975 |
+ doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); |
|
1969 | 1976 |
} |
1970 |
- else |
|
1971 |
- doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); |
|
1972 | 1977 |
} |
1973 | 1978 |
#ifdef MORE_TIMEING |
1974 | 1979 |
T1= rdtsc(); |
... | ... |
@@ -28,24 +28,23 @@ |
28 | 28 |
#define DERING 0x04 |
29 | 29 |
#define LEVEL_FIX 0x08 /* Brightness & Contrast */ |
30 | 30 |
|
31 |
-#define LUM_V_DEBLOCK V_DEBLOCK |
|
32 |
-#define LUM_H_DEBLOCK H_DEBLOCK |
|
33 |
-#define CHROM_V_DEBLOCK (V_DEBLOCK<<4) |
|
34 |
-#define CHROM_H_DEBLOCK (H_DEBLOCK<<4) |
|
35 |
-#define LUM_DERING DERING |
|
36 |
-#define CHROM_DERING (DERING<<4) |
|
37 |
-#define LUM_LEVEL_FIX LEVEL_FIX |
|
31 |
+#define LUM_V_DEBLOCK V_DEBLOCK // 1 |
|
32 |
+#define LUM_H_DEBLOCK H_DEBLOCK // 2 |
|
33 |
+#define CHROM_V_DEBLOCK (V_DEBLOCK<<4) // 16 |
|
34 |
+#define CHROM_H_DEBLOCK (H_DEBLOCK<<4) // 32 |
|
35 |
+#define LUM_DERING DERING // 4 |
|
36 |
+#define CHROM_DERING (DERING<<4) // 64 |
|
37 |
+#define LUM_LEVEL_FIX LEVEL_FIX // 8 |
|
38 | 38 |
//not supported currently |
39 |
-#define CHROM_LEVEL_FIX (LEVEL_FIX<<4) |
|
39 |
+#define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128 |
|
40 | 40 |
|
41 |
-// Experimental stuff |
|
42 |
-#define RK_FILTER 0x0100 |
|
43 |
-#define LUM_V_RK_FILTER RK_FILTER |
|
44 |
-#define CHROM_V_RK_FILTER (RK_FILTER<<4) |
|
41 |
+// Experimental vertical filters |
|
42 |
+#define V_RK1_FILTER 0x0100 // 256 |
|
43 |
+#define V_X1_FILTER 0x0200 // 512 |
|
45 | 44 |
|
46 |
-#define X1_FILTER 0x0200 |
|
47 |
-#define LUM_V_X1_FILTER X1_FILTER |
|
48 |
-#define CHROM_V_X1_FILTER (X1_FILTER<<4) |
|
45 |
+// Experimental horizontal filters |
|
46 |
+#define H_RK1_FILTER 0x1000 // 4096 |
|
47 |
+#define H_X1_FILTER 0x2000 // 8192 |
|
49 | 48 |
|
50 | 49 |
|
51 | 50 |
#define TIMEING |
... | ... |
@@ -27,8 +27,9 @@ isHorizMinMaxOk a |
27 | 27 |
doHorizLowPass E a a* |
28 | 28 |
doHorizDefFilter E ac ac |
29 | 29 |
deRing |
30 |
-RKAlgo1 E a a* |
|
31 |
-X1 a E E* |
|
30 |
+Vertical RKAlgo1 E a a* |
|
31 |
+Vertical X1 a E E* |
|
32 |
+Horizontal X1 a E E* |
|
32 | 33 |
|
33 | 34 |
|
34 | 35 |
* I don't have a 3DNow CPU -> it's untested |
... | ... |
@@ -40,7 +41,7 @@ c = checked against the other implementations (-vo md5) |
40 | 40 |
|
41 | 41 |
/* |
42 | 42 |
TODO: |
43 |
-verify that everything workes as it should |
|
43 |
+verify that everything works as it should (how?) |
|
44 | 44 |
reduce the time wasted on the mem transfer |
45 | 45 |
implement dering |
46 | 46 |
implement everything in C at least (done at the moment but ...) |
... | ... |
@@ -51,6 +52,9 @@ write a faster and higher quality deblocking filter :) |
51 | 51 |
do something about the speed of the horizontal filters |
52 | 52 |
make the mainloop more flexible (variable number of blocks at once |
53 | 53 |
(the if/else stuff per block is slowing things down) |
54 |
+compare the quality & speed of all filters |
|
55 |
+implement a few simple deinterlacing filters |
|
56 |
+split this huge file |
|
54 | 57 |
... |
55 | 58 |
|
56 | 59 |
Notes: |
... | ... |
@@ -58,7 +62,7 @@ Notes: |
58 | 58 |
*/ |
59 | 59 |
|
60 | 60 |
/* |
61 |
-Changelog: |
|
61 |
+Changelog: use the CVS log |
|
62 | 62 |
0.1.3 |
63 | 63 |
bugfixes: last 3 lines not brightness/contrast corrected |
64 | 64 |
brightness statistics messed up with initial black pic |
... | ... |
@@ -99,11 +103,13 @@ static uint64_t bm10000000= 0xFF00000000000000LL; |
99 | 99 |
static uint64_t bm10000001= 0xFF000000000000FFLL; |
100 | 100 |
static uint64_t bm11000011= 0xFFFF00000000FFFFLL; |
101 | 101 |
static uint64_t bm00000011= 0x000000000000FFFFLL; |
102 |
+static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL; |
|
102 | 103 |
static uint64_t bm11000000= 0xFFFF000000000000LL; |
103 | 104 |
static uint64_t bm00011000= 0x000000FFFF000000LL; |
104 | 105 |
static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; |
105 | 106 |
static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; |
106 | 107 |
static uint64_t b00= 0x0000000000000000LL; |
108 |
+static uint64_t b01= 0x0101010101010101LL; |
|
107 | 109 |
static uint64_t b02= 0x0202020202020202LL; |
108 | 110 |
static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; |
109 | 111 |
static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; |
... | ... |
@@ -544,7 +550,7 @@ static inline void doVertLowPass(uint8_t *src, int stride, int QP) |
544 | 544 |
x/8 = 1 |
545 | 545 |
1 12 12 23 |
546 | 546 |
*/ |
547 |
-static inline void vertRKFilter(uint8_t *src, int stride, int QP) |
|
547 |
+static inline void vertRK1Filter(uint8_t *src, int stride, int QP) |
|
548 | 548 |
{ |
549 | 549 |
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
550 | 550 |
// FIXME rounding |
... | ... |
@@ -638,7 +644,8 @@ static inline void vertRKFilter(uint8_t *src, int stride, int QP) |
638 | 638 |
|
639 | 639 |
/** |
640 | 640 |
* Experimental Filter 1 |
641 |
- * will nor damage linear gradients |
|
641 |
+ * will not damage linear gradients |
|
642 |
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
642 | 643 |
* can only smooth blocks at the expected locations (it can't smooth them if they moved) |
643 | 644 |
* MMX2 version does correct clipping, C version doesn't |
644 | 645 |
*/ |
... | ... |
@@ -675,9 +682,13 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
675 | 675 |
"movq %%mm4, %%mm3 \n\t" // d |
676 | 676 |
"psubusb pQPb, %%mm4 \n\t" |
677 | 677 |
"pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
678 |
+ "psubusb b01, %%mm3 \n\t" |
|
678 | 679 |
"pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
679 | 680 |
|
680 | 681 |
PAVGB(%%mm7, %%mm3) // d/2 |
682 |
+ "movq %%mm3, %%mm1 \n\t" // d/2 |
|
683 |
+ PAVGB(%%mm7, %%mm3) // d/4 |
|
684 |
+ PAVGB(%%mm1, %%mm3) // 3*d/8 |
|
681 | 685 |
|
682 | 686 |
"movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
683 | 687 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
... | ... |
@@ -691,31 +702,31 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
691 | 691 |
"pxor %%mm2, %%mm0 \n\t" |
692 | 692 |
"movq %%mm0, (%%ebx) \n\t" // line 5 |
693 | 693 |
|
694 |
- PAVGB(%%mm7, %%mm3) // d/4 |
|
694 |
+ PAVGB(%%mm7, %%mm1) // d/4 |
|
695 | 695 |
|
696 | 696 |
"movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
697 | 697 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
698 |
- "psubusb %%mm3, %%mm0 \n\t" |
|
698 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
699 | 699 |
"pxor %%mm2, %%mm0 \n\t" |
700 | 700 |
"movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
701 | 701 |
|
702 | 702 |
"movq (%%ebx, %1), %%mm0 \n\t" // line 6 |
703 | 703 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
704 |
- "paddusb %%mm3, %%mm0 \n\t" |
|
704 |
+ "paddusb %%mm1, %%mm0 \n\t" |
|
705 | 705 |
"pxor %%mm2, %%mm0 \n\t" |
706 | 706 |
"movq %%mm0, (%%ebx, %1) \n\t" // line 6 |
707 | 707 |
|
708 |
- PAVGB(%%mm7, %%mm3) // d/8 |
|
708 |
+ PAVGB(%%mm7, %%mm1) // d/8 |
|
709 | 709 |
|
710 | 710 |
"movq (%%eax, %1), %%mm0 \n\t" // line 2 |
711 | 711 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
712 |
- "psubusb %%mm3, %%mm0 \n\t" |
|
712 |
+ "psubusb %%mm1, %%mm0 \n\t" |
|
713 | 713 |
"pxor %%mm2, %%mm0 \n\t" |
714 | 714 |
"movq %%mm0, (%%eax, %1) \n\t" // line 2 |
715 | 715 |
|
716 | 716 |
"movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 |
717 | 717 |
"pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
718 |
- "paddusb %%mm3, %%mm0 \n\t" |
|
718 |
+ "paddusb %%mm1, %%mm0 \n\t" |
|
719 | 719 |
"pxor %%mm2, %%mm0 \n\t" |
720 | 720 |
"movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 |
721 | 721 |
|
... | ... |
@@ -739,7 +750,7 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
739 | 739 |
{ |
740 | 740 |
int a= src[l3] - src[l4]; |
741 | 741 |
int b= src[l4] - src[l5]; |
742 |
- int c= src[l6] - src[l7]; |
|
742 |
+ int c= src[l5] - src[l6]; |
|
743 | 743 |
|
744 | 744 |
int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
745 | 745 |
|
... | ... |
@@ -749,8 +760,8 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
749 | 749 |
|
750 | 750 |
src[l2] +=v/8; |
751 | 751 |
src[l3] +=v/4; |
752 |
- src[l4] +=v/2; |
|
753 |
- src[l5] -=v/2; |
|
752 |
+ src[l4] +=3*v/8; |
|
753 |
+ src[l5] -=3*v/8; |
|
754 | 754 |
src[l6] -=v/4; |
755 | 755 |
src[l7] -=v/8; |
756 | 756 |
|
... | ... |
@@ -789,6 +800,211 @@ static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
789 | 789 |
#endif |
790 | 790 |
} |
791 | 791 |
|
792 |
+/** |
|
793 |
+ * Experimental Filter 1 (Horizontal) |
|
794 |
+ * will not damage linear gradients |
|
795 |
+ * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
|
796 |
+ * can only smooth blocks at the expected locations (it can't smooth them if they moved) |
|
797 |
+ * MMX2 version does correct clipping, C version doesn't |
|
798 |
+ * not identical with the vertical one |
|
799 |
+ */ |
|
800 |
+static inline void horizX1Filter(uint8_t *src, int stride, int QP) |
|
801 |
+{ |
|
802 |
+ int y; |
|
803 |
+ static uint64_t *lut= NULL; |
|
804 |
+ if(lut==NULL) |
|
805 |
+ { |
|
806 |
+ int i; |
|
807 |
+ lut= (uint64_t*)memalign(8, 256*8); |
|
808 |
+ for(i=0; i<256; i++) |
|
809 |
+ { |
|
810 |
+ int v= i < 128 ? 2*i : 2*(i-256); |
|
811 |
+/* |
|
812 |
+//Simulate 112242211 9-Tap filter |
|
813 |
+ uint64_t a= (v/16) & 0xFF; |
|
814 |
+ uint64_t b= (v/8) & 0xFF; |
|
815 |
+ uint64_t c= (v/4) & 0xFF; |
|
816 |
+ uint64_t d= (3*v/8) & 0xFF; |
|
817 |
+*/ |
|
818 |
+//Simulate piecewise linear interpolation |
|
819 |
+ uint64_t a= (v/16) & 0xFF; |
|
820 |
+ uint64_t b= (v*3/16) & 0xFF; |
|
821 |
+ uint64_t c= (v*5/16) & 0xFF; |
|
822 |
+ uint64_t d= (7*v/16) & 0xFF; |
|
823 |
+ uint64_t A= (0x100 - a)&0xFF; |
|
824 |
+ uint64_t B= (0x100 - b)&0xFF; |
|
825 |
+ uint64_t C= (0x100 - c)&0xFF; |
|
826 |
+ uint64_t D= (0x100 - c)&0xFF; |
|
827 |
+ |
|
828 |
+ lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | |
|
829 |
+ (D<<24) | (C<<16) | (B<<8) | (A); |
|
830 |
+ //lut[i] = (v<<32) | (v<<24); |
|
831 |
+ } |
|
832 |
+ } |
|
833 |
+ |
|
834 |
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
835 |
+ asm volatile( |
|
836 |
+ "pxor %%mm7, %%mm7 \n\t" // 0 |
|
837 |
+// "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
|
838 |
+ "leal (%0, %1), %%eax \n\t" |
|
839 |
+ "leal (%%eax, %1, 4), %%ebx \n\t" |
|
840 |
+ |
|
841 |
+ "movq b80, %%mm6 \n\t" |
|
842 |
+ "movd %2, %%mm5 \n\t" // QP |
|
843 |
+ "movq %%mm5, %%mm4 \n\t" |
|
844 |
+ "paddusb %%mm5, %%mm5 \n\t" // 2QP |
|
845 |
+ "paddusb %%mm5, %%mm4 \n\t" // 3QP |
|
846 |
+ "pxor %%mm5, %%mm5 \n\t" // 0 |
|
847 |
+ "psubb %%mm4, %%mm5 \n\t" // -3QP |
|
848 |
+ "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP |
|
849 |
+ "psllq $24, %%mm5 \n\t" |
|
850 |
+ |
|
851 |
+// 0 1 2 3 4 5 6 7 8 9 |
|
852 |
+// %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
853 |
+ |
|
854 |
+#define HX1old(a) \ |
|
855 |
+ "movd " #a ", %%mm0 \n\t"\ |
|
856 |
+ "movd 4" #a ", %%mm1 \n\t"\ |
|
857 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
858 |
+ "movq %%mm0, %%mm1 \n\t"\ |
|
859 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
860 |
+ "psrlq $8, %%mm1 \n\t"\ |
|
861 |
+ "psubusb %%mm1, %%mm2 \n\t"\ |
|
862 |
+ "psubusb %%mm0, %%mm1 \n\t"\ |
|
863 | ||
864 | ||
865 | ||
866 | ||
867 | ||
868 |
+ "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ |
|
869 |
+ "paddb %%mm5, %%mm1 \n\t"\ |
|
870 |
+ "psubusb %%mm5, %%mm1 \n\t"\ |
|
871 |
+ PAVGB(%%mm7, %%mm1)\ |
|
872 |
+ "pxor %%mm2, %%mm1 \n\t"\ |
|
873 |
+ "psubb %%mm2, %%mm1 \n\t"\ |
|
874 |
+ "psrlq $24, %%mm1 \n\t"\ |
|
875 |
+ "movd %%mm1, %%ecx \n\t"\ |
|
876 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
877 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
878 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
879 |
+ "movq %%mm0, " #a " \n\t"\ |
|
880 |
+ |
|
881 |
+/* |
|
882 |
+HX1old((%0)) |
|
883 |
+HX1old((%%eax)) |
|
884 |
+HX1old((%%eax, %1)) |
|
885 |
+HX1old((%%eax, %1, 2)) |
|
886 |
+HX1old((%0, %1, 4)) |
|
887 |
+HX1old((%%ebx)) |
|
888 |
+HX1old((%%ebx, %1)) |
|
889 |
+HX1old((%%ebx, %1, 2)) |
|
890 |
+*/ |
|
891 |
+ |
|
892 |
+//FIXME add some comments, it's unreadable ... |
|
893 |
+#define HX1b(a, c, b, d) \ |
|
894 |
+ "movd " #a ", %%mm0 \n\t"\ |
|
895 |
+ "movd 4" #a ", %%mm1 \n\t"\ |
|
896 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
897 |
+ "movd " #b ", %%mm4 \n\t"\ |
|
898 |
+ "movq %%mm0, %%mm1 \n\t"\ |
|
899 |
+ "movq %%mm0, %%mm2 \n\t"\ |
|
900 |
+ "psrlq $8, %%mm1 \n\t"\ |
|
901 |
+ "movd 4" #b ", %%mm3 \n\t"\ |
|
902 |
+ "psubusb %%mm1, %%mm2 \n\t"\ |
|
903 |
+ "psubusb %%mm0, %%mm1 \n\t"\ |
|
904 | ||
905 | ||
906 |
+ "punpckldq %%mm3, %%mm4 \n\t"\ |
|
907 |
+ "movq %%mm1, %%mm3 \n\t"\ |
|
908 | ||
909 | ||
910 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
911 | ||
912 |
+ "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ |
|
913 |
+ "movq %%mm4, %%mm3 \n\t"\ |
|
914 |
+ "paddb %%mm5, %%mm1 \n\t"\ |
|
915 |
+ "psubusb %%mm5, %%mm1 \n\t"\ |
|
916 |
+ "psrlq $8, %%mm3 \n\t"\ |
|
917 |
+ PAVGB(%%mm7, %%mm1)\ |
|
918 |
+ "pxor %%mm2, %%mm1 \n\t"\ |
|
919 |
+ "psubb %%mm2, %%mm1 \n\t"\ |
|
920 |
+ "movq %%mm4, %%mm2 \n\t"\ |
|
921 |
+ "psrlq $24, %%mm1 \n\t"\ |
|
922 |
+ "psubusb %%mm3, %%mm2 \n\t"\ |
|
923 |
+ "movd %%mm1, %%ecx \n\t"\ |
|
924 |
+ "psubusb %%mm4, %%mm3 \n\t"\ |
|
925 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
926 | ||
927 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
928 | ||
929 |
+ "movq %%mm3, %%mm1 \n\t"\ |
|
930 | ||
931 |
+ "movq %%mm0, " #a " \n\t"\ |
|
932 | ||
933 |
+ "paddb %%mm6, %%mm4 \n\t"\ |
|
934 | ||
935 |
+ "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ |
|
936 |
+ "paddb %%mm5, %%mm3 \n\t"\ |
|
937 |
+ "psubusb %%mm5, %%mm3 \n\t"\ |
|
938 |
+ PAVGB(%%mm7, %%mm3)\ |
|
939 |
+ "pxor %%mm2, %%mm3 \n\t"\ |
|
940 |
+ "psubb %%mm2, %%mm3 \n\t"\ |
|
941 |
+ "psrlq $24, %%mm3 \n\t"\ |
|
942 |
+ "movd " #c ", %%mm0 \n\t"\ |
|
943 |
+ "movd 4" #c ", %%mm1 \n\t"\ |
|
944 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
945 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
946 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
947 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
948 |
+ "movq %%mm0, " #c " \n\t"\ |
|
949 |
+ "movd %%mm3, %%ecx \n\t"\ |
|
950 |
+ "movd " #d ", %%mm0 \n\t"\ |
|
951 |
+ "paddsb (%3, %%ecx, 8), %%mm4 \n\t"\ |
|
952 |
+ "movd 4" #d ", %%mm1 \n\t"\ |
|
953 |
+ "paddb %%mm6, %%mm4 \n\t"\ |
|
954 |
+ "punpckldq %%mm1, %%mm0 \n\t"\ |
|
955 |
+ "movq %%mm4, " #b " \n\t"\ |
|
956 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
957 |
+ "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ |
|
958 |
+ "paddb %%mm6, %%mm0 \n\t"\ |
|
959 |
+ "movq %%mm0, " #d " \n\t"\ |
|
960 |
+ |
|
961 |
+HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) |
|
962 |
+HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) |
|
963 |
+ |
|
964 |
+ |
|
965 |
+ : |
|
966 |
+ : "r" (src), "r" (stride), "r" (QP), "r" (lut) |
|
967 |
+ : "%eax", "%ebx", "%ecx" |
|
968 |
+ ); |
|
969 |
+#else |
|
970 |
+ |
|
971 |
+//FIXME (has little in common with the mmx2 version) |
|
972 |
+ for(y=0; y<BLOCK_SIZE; y++) |
|
973 |
+ { |
|
974 |
+ int a= src[1] - src[2]; |
|
975 |
+ int b= src[3] - src[4]; |
|
976 |
+ int c= src[5] - src[6]; |
|
977 |
+ |
|
978 |
+ int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
|
979 |
+ |
|
980 |
+ if(d < QP) |
|
981 |
+ { |
|
982 |
+ int v = d * SIGN(-b); |
|
983 |
+ |
|
984 |
+ src[1] +=v/8; |
|
985 |
+ src[2] +=v/4; |
|
986 |
+ src[3] +=3*v/8; |
|
987 |
+ src[4] -=3*v/8; |
|
988 |
+ src[5] -=v/4; |
|
989 |
+ src[6] -=v/8; |
|
990 |
+ |
|
991 |
+ } |
|
992 |
+ src+=stride; |
|
993 |
+ } |
|
994 |
+#endif |
|
995 |
+} |
|
996 |
+ |
|
792 | 997 |
|
793 | 998 |
static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
794 | 999 |
{ |
... | ... |
@@ -1638,13 +1854,14 @@ void postprocess(unsigned char * src[], int src_stride, |
1638 | 1638 |
vertical_size >>= 1; |
1639 | 1639 |
src_stride >>= 1; |
1640 | 1640 |
dst_stride >>= 1; |
1641 |
+ mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); |
|
1641 | 1642 |
|
1642 | 1643 |
if(1) |
1643 | 1644 |
{ |
1644 | 1645 |
postProcess(src[1], src_stride, dst[1], dst_stride, |
1645 |
- horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); |
|
1646 |
+ horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
|
1646 | 1647 |
postProcess(src[2], src_stride, dst[2], dst_stride, |
1647 |
- horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); |
|
1648 |
+ horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
|
1648 | 1649 |
} |
1649 | 1650 |
else |
1650 | 1651 |
{ |
... | ... |
@@ -1929,9 +2146,9 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int |
1929 | 1929 |
#endif |
1930 | 1930 |
if(mode & V_DEBLOCK) |
1931 | 1931 |
{ |
1932 |
- if(mode & RK_FILTER) |
|
1933 |
- vertRKFilter(vertBlock, stride, QP); |
|
1934 |
- else if(mode & X1_FILTER) |
|
1932 |
+ if(mode & V_RK1_FILTER) |
|
1933 |
+ vertRK1Filter(vertBlock, stride, QP); |
|
1934 |
+ else if(mode & V_X1_FILTER) |
|
1935 | 1935 |
vertX1Filter(vertBlock, stride, QP); |
1936 | 1936 |
else |
1937 | 1937 |
{ |
... | ... |
@@ -1962,13 +2179,18 @@ void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int |
1962 | 1962 |
#endif |
1963 | 1963 |
if(mode & H_DEBLOCK) |
1964 | 1964 |
{ |
1965 |
- if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
|
1965 |
+ if(mode & H_X1_FILTER) |
|
1966 |
+ horizX1Filter(dstBlock-4, stride, QP); |
|
1967 |
+ else |
|
1966 | 1968 |
{ |
1967 |
- if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) |
|
1968 |
- doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); |
|
1969 |
+ if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
|
1970 |
+ { |
|
1971 |
+ if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) |
|
1972 |
+ doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); |
|
1973 |
+ } |
|
1974 |
+ else |
|
1975 |
+ doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); |
|
1969 | 1976 |
} |
1970 |
- else |
|
1971 |
- doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); |
|
1972 | 1977 |
} |
1973 | 1978 |
#ifdef MORE_TIMEING |
1974 | 1979 |
T1= rdtsc(); |