Browse code

qpel in mmx2/3dnow; qpel refinement quality parameter

Originally committed as revision 1393 to svn://svn.ffmpeg.org/ffmpeg/trunk

Michael Niedermayer authored on 2003/01/06 00:57:10
Showing 11 changed files
... ...
@@ -5,8 +5,8 @@
5 5
 
6 6
 #define LIBAVCODEC_VERSION_INT 0x000406
7 7
 #define LIBAVCODEC_VERSION     "0.4.6"
8
-#define LIBAVCODEC_BUILD       4651
9
-#define LIBAVCODEC_BUILD_STR   "4651"
8
+#define LIBAVCODEC_BUILD       4652
9
+#define LIBAVCODEC_BUILD_STR   "4652"
10 10
 
11 11
 enum CodecID {
12 12
     CODEC_ID_NONE, 
... ...
@@ -909,7 +909,7 @@ typedef struct AVCodecContext {
909 909
      * decoding: unused
910 910
      */
911 911
     int me_pre_cmp;
912
-    
912
+
913 913
     /**
914 914
      * ME pre pass diamond size & shape
915 915
      * encoding: set by user.
... ...
@@ -917,6 +917,13 @@ typedef struct AVCodecContext {
917 917
      */
918 918
     int pre_dia_size;
919 919
 
920
+    /**
921
+     * subpel ME quality
922
+     * encoding: set by user.
923
+     * decoding: unused
924
+     */
925
+    int me_subpel_quality;
926
+
920 927
 } AVCodecContext;
921 928
 
922 929
 typedef struct AVCodec {
... ...
@@ -781,6 +781,7 @@ static inline void copy_block9(UINT8 *dst, UINT8 *src, int dstStride, int srcStr
781 781
     }
782 782
 }
783 783
 
784
+
784 785
 #define QPEL_MC(r, OPNAME, RND, OP) \
785 786
 static void OPNAME ## mpeg4_qpel8_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
786 787
     UINT8 *cm = cropTbl + MAX_NEG_CROP;\
... ...
@@ -830,6 +831,7 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass(UINT8 *dst, UINT8 *src, int dstStrid
830 830
 static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h){\
831 831
     UINT8 *cm = cropTbl + MAX_NEG_CROP;\
832 832
     int i;\
833
+    \
833 834
     for(i=0; i<h; i++)\
834 835
     {\
835 836
         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
... ...
@@ -853,9 +855,10 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass(UINT8 *dst, UINT8 *src, int dstStri
853 853
     }\
854 854
 }\
855 855
 \
856
-static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w){\
856
+static void OPNAME ## mpeg4_qpel16_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride){\
857 857
     UINT8 *cm = cropTbl + MAX_NEG_CROP;\
858 858
     int i;\
859
+    const int w=16;\
859 860
     for(i=0; i<w; i++)\
860 861
     {\
861 862
         const int src0= src[0*srcStride];\
... ...
@@ -1046,21 +1049,21 @@ static void OPNAME ## qpel16_mc01_c(UINT8 *dst, UINT8 *src, int stride){\
1046 1046
     UINT8 full[24*17];\
1047 1047
     UINT8 half[256];\
1048 1048
     copy_block17(full, src, 24, stride, 17);\
1049
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1049
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1050 1050
     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1051 1051
 }\
1052 1052
 \
1053 1053
 static void OPNAME ## qpel16_mc02_c(UINT8 *dst, UINT8 *src, int stride){\
1054 1054
     UINT8 full[24*17];\
1055 1055
     copy_block17(full, src, 24, stride, 17);\
1056
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24, 16);\
1056
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1057 1057
 }\
1058 1058
 \
1059 1059
 static void OPNAME ## qpel16_mc03_c(UINT8 *dst, UINT8 *src, int stride){\
1060 1060
     UINT8 full[24*17];\
1061 1061
     UINT8 half[256];\
1062 1062
     copy_block17(full, src, 24, stride, 17);\
1063
-    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24, 16);\
1063
+    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1064 1064
     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1065 1065
 }\
1066 1066
 static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
... ...
@@ -1070,8 +1073,8 @@ static void OPNAME ## qpel16_mc11_c(UINT8 *dst, UINT8 *src, int stride){\
1070 1070
     UINT8 halfHV[256];\
1071 1071
     copy_block17(full, src, 24, stride, 17);\
1072 1072
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1073
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1074
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1073
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1074
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1075 1075
     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1076 1076
 }\
1077 1077
 static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
... ...
@@ -1081,8 +1084,8 @@ static void OPNAME ## qpel16_mc31_c(UINT8 *dst, UINT8 *src, int stride){\
1081 1081
     UINT8 halfHV[256];\
1082 1082
     copy_block17(full, src, 24, stride, 17);\
1083 1083
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1084
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1085
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1084
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1085
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1086 1086
     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1087 1087
 }\
1088 1088
 static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
... ...
@@ -1092,8 +1095,8 @@ static void OPNAME ## qpel16_mc13_c(UINT8 *dst, UINT8 *src, int stride){\
1092 1092
     UINT8 halfHV[256];\
1093 1093
     copy_block17(full, src, 24, stride, 17);\
1094 1094
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1095
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1096
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1095
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1096
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1097 1097
     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1098 1098
 }\
1099 1099
 static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
... ...
@@ -1103,22 +1106,22 @@ static void OPNAME ## qpel16_mc33_c(UINT8 *dst, UINT8 *src, int stride){\
1103 1103
     UINT8 halfHV[256];\
1104 1104
     copy_block17(full, src, 24, stride, 17);\
1105 1105
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1106
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1107
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1106
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1107
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1108 1108
     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1109 1109
 }\
1110 1110
 static void OPNAME ## qpel16_mc21_c(UINT8 *dst, UINT8 *src, int stride){\
1111 1111
     UINT8 halfH[272];\
1112 1112
     UINT8 halfHV[256];\
1113 1113
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1114
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1114
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1115 1115
     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1116 1116
 }\
1117 1117
 static void OPNAME ## qpel16_mc23_c(UINT8 *dst, UINT8 *src, int stride){\
1118 1118
     UINT8 halfH[272];\
1119 1119
     UINT8 halfHV[256];\
1120 1120
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1121
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1121
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1122 1122
     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1123 1123
 }\
1124 1124
 static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
... ...
@@ -1128,8 +1131,8 @@ static void OPNAME ## qpel16_mc12_c(UINT8 *dst, UINT8 *src, int stride){\
1128 1128
     UINT8 halfHV[256];\
1129 1129
     copy_block17(full, src, 24, stride, 17);\
1130 1130
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1131
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24, 16);\
1132
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1131
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1132
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1133 1133
     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1134 1134
 }\
1135 1135
 static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
... ...
@@ -1139,14 +1142,14 @@ static void OPNAME ## qpel16_mc32_c(UINT8 *dst, UINT8 *src, int stride){\
1139 1139
     UINT8 halfHV[256];\
1140 1140
     copy_block17(full, src, 24, stride, 17);\
1141 1141
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24, 16);\
1143
-    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16, 16);\
1142
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1143
+    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144 1144
     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1145 1145
 }\
1146 1146
 static void OPNAME ## qpel16_mc22_c(UINT8 *dst, UINT8 *src, int stride){\
1147 1147
     UINT8 halfH[272];\
1148 1148
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1149
-    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16, 16);\
1149
+    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1150 1150
 }
1151 1151
 
1152 1152
 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
... ...
@@ -102,6 +102,7 @@ typedef struct DSPContext {
102 102
     me_cmp_func quant_psnr[2];
103 103
     int (*hadamard8_abs )(uint8_t *src, int stride, int mean);
104 104
 
105
+    me_cmp_func me_pre_cmp[11];
105 106
     me_cmp_func me_cmp[11];
106 107
     me_cmp_func me_sub_cmp[11];
107 108
     me_cmp_func mb_cmp[11];
... ...
@@ -53,6 +53,11 @@ static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101U
53 53
 static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
54 54
 static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
55 55
 
56
+static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
57
+static const uint64_t ff_pw_3  __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
58
+static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
59
+static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
60
+
56 61
 #define JUMPALIGN() __asm __volatile (".balign 8"::)
57 62
 #define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
58 63
 
... ...
@@ -646,10 +651,698 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride)
646 646
 
647 647
 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
648 648
 
649
+#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
650
+        "paddw " #m4 ", " #m3 "		\n\t" /* x1 */\
651
+        "movq " #pw_20 ", %%mm4		\n\t" /* 20 */\
652
+        "pmullw " #m3 ", %%mm4		\n\t" /* 20x1 */\
653
+        "movq "#in7", " #m3 "		\n\t" /* d */\
654
+        "movq "#in0", %%mm5		\n\t" /* D */\
655
+        "paddw " #m3 ", %%mm5		\n\t" /* x4 */\
656
+        "psubw %%mm5, %%mm4		\n\t" /* 20x1 - x4 */\
657
+        "movq "#in1", %%mm5		\n\t" /* C */\
658
+        "movq "#in2", %%mm6		\n\t" /* B */\
659
+        "paddw " #m6 ", %%mm5		\n\t" /* x3 */\
660
+        "paddw " #m5 ", %%mm6		\n\t" /* x2 */\
661
+        "paddw %%mm6, %%mm6		\n\t" /* 2x2 */\
662
+        "psubw %%mm6, %%mm5		\n\t" /* -2x2 + x3 */\
663
+        "pmullw " #pw_3 ", %%mm5	\n\t" /* -6x2 + 3x3 */\
664
+        "paddw " #rnd ", %%mm4		\n\t" /* x2 */\
665
+        "paddw %%mm4, %%mm5		\n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
666
+        "psraw $5, %%mm5		\n\t"\
667
+        "packuswb %%mm5, %%mm5		\n\t"\
668
+        OP(%%mm5, out, %%mm7, d)
669
+
670
+#define QPEL_BASE(OPNAME, ROUNDER, RND, OP)\
671
+void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
672
+    uint64_t temp;\
673
+\
674
+    asm volatile(\
675
+        "pxor %%mm7, %%mm7		\n\t"\
676
+        "1:				\n\t"\
677
+        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
678
+        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
679
+        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
680
+        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
681
+        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
682
+        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
683
+        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
684
+        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
685
+        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
686
+        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
687
+        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
688
+        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
689
+        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
690
+        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
691
+        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
692
+        "paddw %%mm3, %%mm5		\n\t" /* b */\
693
+        "paddw %%mm2, %%mm6		\n\t" /* c */\
694
+        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
695
+        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
696
+        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
697
+        "pmullw %6, %%mm6		\n\t" /* 3c - 6b */\
698
+        "paddw %%mm4, %%mm0		\n\t" /* a */\
699
+        "paddw %%mm1, %%mm5		\n\t" /* d */\
700
+        "pmullw %5, %%mm0		\n\t" /* 20a */\
701
+        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
702
+        "paddw %8, %%mm6		\n\t"\
703
+        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
704
+        "psraw $5, %%mm0		\n\t"\
705
+        "movq %%mm0, %7			\n\t"\
706
+        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
707
+        \
708
+        "movq 5(%0), %%mm0		\n\t" /* FGHIJKLM */\
709
+        "movq %%mm0, %%mm5		\n\t" /* FGHIJKLM */\
710
+        "movq %%mm0, %%mm6		\n\t" /* FGHIJKLM */\
711
+        "psrlq $8, %%mm0		\n\t" /* GHIJKLM0 */\
712
+        "psrlq $16, %%mm5		\n\t" /* HIJKLM00 */\
713
+        "punpcklbw %%mm7, %%mm0		\n\t" /* 0G0H0I0J */\
714
+        "punpcklbw %%mm7, %%mm5		\n\t" /* 0H0I0J0K */\
715
+        "paddw %%mm0, %%mm2		\n\t" /* b */\
716
+        "paddw %%mm5, %%mm3		\n\t" /* c */\
717
+        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
718
+        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
719
+        "movq %%mm6, %%mm2		\n\t" /* FGHIJKLM */\
720
+        "psrlq $24, %%mm6		\n\t" /* IJKLM000 */\
721
+        "punpcklbw %%mm7, %%mm2		\n\t" /* 0F0G0H0I */\
722
+        "punpcklbw %%mm7, %%mm6		\n\t" /* 0I0J0K0L */\
723
+        "pmullw %6, %%mm3		\n\t" /* 3c - 6b */\
724
+        "paddw %%mm2, %%mm1		\n\t" /* a */\
725
+        "paddw %%mm6, %%mm4		\n\t" /* d */\
726
+        "pmullw %5, %%mm1		\n\t" /* 20a */\
727
+        "psubw %%mm4, %%mm3		\n\t" /* - 6b +3c - d */\
728
+        "paddw %8, %%mm1		\n\t"\
729
+        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b +3c - d */\
730
+        "psraw $5, %%mm3		\n\t"\
731
+        "movq %7, %%mm1			\n\t"\
732
+        "packuswb %%mm3, %%mm1		\n\t"\
733
+        OP(%%mm1, (%1),%%mm4, q)\
734
+        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
735
+        \
736
+        "movq 9(%0), %%mm1		\n\t" /* JKLMNOPQ */\
737
+        "movq %%mm1, %%mm4		\n\t" /* JKLMNOPQ */\
738
+        "movq %%mm1, %%mm3		\n\t" /* JKLMNOPQ */\
739
+        "psrlq $8, %%mm1		\n\t" /* KLMNOPQ0 */\
740
+        "psrlq $16, %%mm4		\n\t" /* LMNOPQ00 */\
741
+        "punpcklbw %%mm7, %%mm1		\n\t" /* 0K0L0M0N */\
742
+        "punpcklbw %%mm7, %%mm4		\n\t" /* 0L0M0N0O */\
743
+        "paddw %%mm1, %%mm5		\n\t" /* b */\
744
+        "paddw %%mm4, %%mm0		\n\t" /* c */\
745
+        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
746
+        "psubw %%mm5, %%mm0		\n\t" /* c - 2b */\
747
+        "movq %%mm3, %%mm5		\n\t" /* JKLMNOPQ */\
748
+        "psrlq $24, %%mm3		\n\t" /* MNOPQ000 */\
749
+        "pmullw %6, %%mm0		\n\t" /* 3c - 6b */\
750
+        "punpcklbw %%mm7, %%mm3		\n\t" /* 0M0N0O0P */\
751
+        "paddw %%mm3, %%mm2		\n\t" /* d */\
752
+        "psubw %%mm2, %%mm0		\n\t" /* -6b + 3c - d */\
753
+        "movq %%mm5, %%mm2		\n\t" /* JKLMNOPQ */\
754
+        "punpcklbw %%mm7, %%mm2		\n\t" /* 0J0K0L0M */\
755
+        "punpckhbw %%mm7, %%mm5		\n\t" /* 0N0O0P0Q */\
756
+        "paddw %%mm2, %%mm6		\n\t" /* a */\
757
+        "pmullw %5, %%mm6		\n\t" /* 20a */\
758
+        "paddw %8, %%mm0		\n\t"\
759
+        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
760
+        "psraw $5, %%mm0		\n\t"\
761
+        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
762
+        \
763
+        "paddw %%mm5, %%mm3		\n\t" /* a */\
764
+        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0O0P0Q0Q */\
765
+        "paddw %%mm4, %%mm6		\n\t" /* b */\
766
+        "pshufw $0xBE, %%mm5, %%mm4	\n\t" /* 0P0Q0Q0P */\
767
+        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0Q0Q0P0O */\
768
+        "paddw %%mm1, %%mm4		\n\t" /* c */\
769
+        "paddw %%mm2, %%mm5		\n\t" /* d */\
770
+        "paddw %%mm6, %%mm6		\n\t" /* 2b */\
771
+        "psubw %%mm6, %%mm4		\n\t" /* c - 2b */\
772
+        "pmullw %5, %%mm3		\n\t" /* 20a */\
773
+        "pmullw %6, %%mm4		\n\t" /* 3c - 6b */\
774
+        "psubw %%mm5, %%mm3		\n\t" /* -6b + 3c - d */\
775
+        "paddw %8, %%mm4		\n\t"\
776
+        "paddw %%mm3, %%mm4		\n\t" /* 20a - 6b + 3c - d */\
777
+        "psraw $5, %%mm4		\n\t"\
778
+        "packuswb %%mm4, %%mm0		\n\t"\
779
+        OP(%%mm0, 8(%1), %%mm4, q)\
780
+        \
781
+        "addl %3, %0			\n\t"\
782
+        "addl %4, %1			\n\t"\
783
+        "decl %2			\n\t"\
784
+        " jnz 1b				\n\t"\
785
+        : "+r"(src), "+r"(dst), "+g"(h)\
786
+        : "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
787
+    );\
788
+}\
789
+\
790
+static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
791
+    int i;\
792
+    int16_t temp[16];\
793
+    /* quick HACK, XXX FIXME MUST be optimized */\
794
+    for(i=0; i<h; i++)\
795
+    {\
796
+        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
797
+        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
798
+        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
799
+        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
800
+        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
801
+        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
802
+        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
803
+        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
804
+        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
805
+        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
806
+        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
807
+        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
808
+        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
809
+        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
810
+        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
811
+        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
812
+        asm volatile(\
813
+            "movq (%0), %%mm0		\n\t"\
814
+            "movq 8(%0), %%mm1		\n\t"\
815
+            "paddw %2, %%mm0		\n\t"\
816
+            "paddw %2, %%mm1		\n\t"\
817
+            "psraw $5, %%mm0		\n\t"\
818
+            "psraw $5, %%mm1		\n\t"\
819
+            "packuswb %%mm1, %%mm0	\n\t"\
820
+            OP(%%mm0, (%1), %%mm1, q)\
821
+            "movq 16(%0), %%mm0		\n\t"\
822
+            "movq 24(%0), %%mm1		\n\t"\
823
+            "paddw %2, %%mm0		\n\t"\
824
+            "paddw %2, %%mm1		\n\t"\
825
+            "psraw $5, %%mm0		\n\t"\
826
+            "psraw $5, %%mm1		\n\t"\
827
+            "packuswb %%mm1, %%mm0	\n\t"\
828
+            OP(%%mm0, 8(%1), %%mm1, q)\
829
+            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
830
+        );\
831
+        dst+=dstStride;\
832
+        src+=srcStride;\
833
+    }\
834
+}\
835
+\
836
+void OPNAME ## mpeg4_qpel16_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
837
+    uint64_t temp[17*4];\
838
+    uint64_t *temp_ptr= temp;\
839
+    int count= 17;\
840
+\
841
+    /*FIXME unroll */\
842
+    asm volatile(\
843
+        "pxor %%mm7, %%mm7		\n\t"\
844
+        "1:				\n\t"\
845
+        "movq (%0), %%mm0		\n\t"\
846
+        "movq (%0), %%mm1		\n\t"\
847
+        "movq 8(%0), %%mm2		\n\t"\
848
+        "movq 8(%0), %%mm3		\n\t"\
849
+        "punpcklbw %%mm7, %%mm0		\n\t"\
850
+        "punpckhbw %%mm7, %%mm1		\n\t"\
851
+        "punpcklbw %%mm7, %%mm2		\n\t"\
852
+        "punpckhbw %%mm7, %%mm3		\n\t"\
853
+        "movq %%mm0, (%1)		\n\t"\
854
+        "movq %%mm1, 17*8(%1)		\n\t"\
855
+        "movq %%mm2, (%1, %4)		\n\t"\
856
+        "movq %%mm3, (%1, %5)		\n\t"\
857
+        "addl $8, %1			\n\t"\
858
+        "addl %3, %0			\n\t"\
859
+        "decl %2			\n\t"\
860
+        " jnz 1b			\n\t"\
861
+        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
862
+        : "r" (srcStride), "r"(2*8*17), "r"(3*8*17)\
863
+    );\
864
+    \
865
+    temp_ptr= temp;\
866
+    count=4;\
867
+    \
868
+/*FIXME reorder for speed */\
869
+    asm volatile(\
870
+        /*"pxor %%mm7, %%mm7		\n\t"*/\
871
+        "1:				\n\t"\
872
+        "movq (%0), %%mm0		\n\t"\
873
+        "movq 8(%0), %%mm1		\n\t"\
874
+        "movq 16(%0), %%mm2		\n\t"\
875
+        "movq 24(%0), %%mm3		\n\t"\
876
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
877
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
878
+        "addl %4, %1			\n\t"\
879
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
880
+        \
881
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
882
+        "addl %4, %1			\n\t"\
883
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
884
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
885
+        "addl %4, %1			\n\t"\
886
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
887
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
888
+        "addl %4, %1			\n\t"\
889
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
890
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
891
+        "addl %4, %1			\n\t"\
892
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
893
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
894
+        "addl %4, %1			\n\t"\
895
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
896
+        \
897
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
898
+        "addl %4, %1			\n\t"  \
899
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
900
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
901
+        \
902
+        "addl $136, %0			\n\t"\
903
+        "addl %8, %1			\n\t"\
904
+        "decl %2			\n\t"\
905
+        " jnz 1b			\n\t"\
906
+         \
907
+        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
908
+        : "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*dstStride)\
909
+    );\
910
+}\
911
+void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
912
+    uint64_t temp;\
913
+\
914
+    asm volatile(\
915
+        "pxor %%mm7, %%mm7		\n\t"\
916
+        "1:				\n\t"\
917
+        "movq  (%0), %%mm0		\n\t" /* ABCDEFGH */\
918
+        "movq %%mm0, %%mm1		\n\t" /* ABCDEFGH */\
919
+        "movq %%mm0, %%mm2		\n\t" /* ABCDEFGH */\
920
+        "punpcklbw %%mm7, %%mm0		\n\t" /* 0A0B0C0D */\
921
+        "punpckhbw %%mm7, %%mm1		\n\t" /* 0E0F0G0H */\
922
+        "pshufw $0x90, %%mm0, %%mm5	\n\t" /* 0A0A0B0C */\
923
+        "pshufw $0x41, %%mm0, %%mm6	\n\t" /* 0B0A0A0B */\
924
+        "movq %%mm2, %%mm3		\n\t" /* ABCDEFGH */\
925
+        "movq %%mm2, %%mm4		\n\t" /* ABCDEFGH */\
926
+        "psllq $8, %%mm2		\n\t" /* 0ABCDEFG */\
927
+        "psllq $16, %%mm3		\n\t" /* 00ABCDEF */\
928
+        "psllq $24, %%mm4		\n\t" /* 000ABCDE */\
929
+        "punpckhbw %%mm7, %%mm2		\n\t" /* 0D0E0F0G */\
930
+        "punpckhbw %%mm7, %%mm3		\n\t" /* 0C0D0E0F */\
931
+        "punpckhbw %%mm7, %%mm4		\n\t" /* 0B0C0D0E */\
932
+        "paddw %%mm3, %%mm5		\n\t" /* b */\
933
+        "paddw %%mm2, %%mm6		\n\t" /* c */\
934
+        "paddw %%mm5, %%mm5		\n\t" /* 2b */\
935
+        "psubw %%mm5, %%mm6		\n\t" /* c - 2b */\
936
+        "pshufw $0x06, %%mm0, %%mm5	\n\t" /* 0C0B0A0A */\
937
+        "pmullw %6, %%mm6		\n\t" /* 3c - 6b */\
938
+        "paddw %%mm4, %%mm0		\n\t" /* a */\
939
+        "paddw %%mm1, %%mm5		\n\t" /* d */\
940
+        "pmullw %5, %%mm0		\n\t" /* 20a */\
941
+        "psubw %%mm5, %%mm0		\n\t" /* 20a - d */\
942
+        "paddw %8, %%mm6		\n\t"\
943
+        "paddw %%mm6, %%mm0		\n\t" /* 20a - 6b + 3c - d */\
944
+        "psraw $5, %%mm0		\n\t"\
945
+        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
946
+        \
947
+        "movd 5(%0), %%mm5		\n\t" /* FGHI */\
948
+        "punpcklbw %%mm7, %%mm5		\n\t" /* 0F0G0H0I */\
949
+        "pshufw $0xF9, %%mm5, %%mm6	\n\t" /* 0G0H0I0I */\
950
+        "paddw %%mm5, %%mm1		\n\t" /* a */\
951
+        "paddw %%mm6, %%mm2		\n\t" /* b */\
952
+        "pshufw $0xBE, %%mm5, %%mm6	\n\t" /* 0H0I0I0H */\
953
+        "pshufw $0x6F, %%mm5, %%mm5	\n\t" /* 0I0I0H0G */\
954
+        "paddw %%mm6, %%mm3		\n\t" /* c */\
955
+        "paddw %%mm5, %%mm4		\n\t" /* d */\
956
+        "paddw %%mm2, %%mm2		\n\t" /* 2b */\
957
+        "psubw %%mm2, %%mm3		\n\t" /* c - 2b */\
958
+        "pmullw %5, %%mm1		\n\t" /* 20a */\
959
+        "pmullw %6, %%mm3		\n\t" /* 3c - 6b */\
960
+        "psubw %%mm4, %%mm3		\n\t" /* -6b + 3c - d */\
961
+        "paddw %8, %%mm1		\n\t"\
962
+        "paddw %%mm1, %%mm3		\n\t" /* 20a - 6b + 3c - d */\
963
+        "psraw $5, %%mm3		\n\t"\
964
+        "packuswb %%mm3, %%mm0		\n\t"\
965
+        OP(%%mm0, (%1), %%mm4, q)\
966
+        \
967
+        "addl %3, %0			\n\t"\
968
+        "addl %4, %1			\n\t"\
969
+        "decl %2			\n\t"\
970
+        " jnz 1b				\n\t"\
971
+        : "+r"(src), "+r"(dst), "+g"(h)\
972
+        : "r"(srcStride), "r"(dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\
973
+    );\
974
+}\
975
+\
976
+static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
977
+    int i;\
978
+    int16_t temp[8];\
979
+    /* quick HACK, XXX FIXME MUST be optimized */\
980
+    for(i=0; i<h; i++)\
981
+    {\
982
+        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
983
+        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
984
+        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
985
+        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
986
+        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
987
+        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
988
+        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
989
+        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
990
+        asm volatile(\
991
+            "movq (%0), %%mm0		\n\t"\
992
+            "movq 8(%0), %%mm1		\n\t"\
993
+            "paddw %2, %%mm0		\n\t"\
994
+            "paddw %2, %%mm1		\n\t"\
995
+            "psraw $5, %%mm0		\n\t"\
996
+            "psraw $5, %%mm1		\n\t"\
997
+            "packuswb %%mm1, %%mm0	\n\t"\
998
+            OP(%%mm0, (%1), %%mm1, q)\
999
+            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1000
+        );\
1001
+        dst+=dstStride;\
1002
+        src+=srcStride;\
1003
+    }\
1004
+}\
1005
+\
1006
+void OPNAME ## mpeg4_qpel8_v_lowpass_mmx(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1007
+    uint64_t temp[9*4];\
1008
+    uint64_t *temp_ptr= temp;\
1009
+    int count= 9;\
1010
+\
1011
+    /*FIXME unroll */\
1012
+    asm volatile(\
1013
+        "pxor %%mm7, %%mm7		\n\t"\
1014
+        "1:				\n\t"\
1015
+        "movq (%0), %%mm0		\n\t"\
1016
+        "movq (%0), %%mm1		\n\t"\
1017
+        "punpcklbw %%mm7, %%mm0		\n\t"\
1018
+        "punpckhbw %%mm7, %%mm1		\n\t"\
1019
+        "movq %%mm0, (%1)		\n\t"\
1020
+        "movq %%mm1, 9*8(%1)		\n\t"\
1021
+        "addl $8, %1			\n\t"\
1022
+        "addl %3, %0			\n\t"\
1023
+        "decl %2			\n\t"\
1024
+        " jnz 1b			\n\t"\
1025
+        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1026
+        : "r" (srcStride)\
1027
+    );\
1028
+    \
1029
+    temp_ptr= temp;\
1030
+    count=2;\
1031
+    \
1032
+/*FIXME reorder for speed */\
1033
+    asm volatile(\
1034
+        /*"pxor %%mm7, %%mm7		\n\t"*/\
1035
+        "1:				\n\t"\
1036
+        "movq (%0), %%mm0		\n\t"\
1037
+        "movq 8(%0), %%mm1		\n\t"\
1038
+        "movq 16(%0), %%mm2		\n\t"\
1039
+        "movq 24(%0), %%mm3		\n\t"\
1040
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)\
1041
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP)\
1042
+        "addl %4, %1			\n\t"\
1043
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)\
1044
+        \
1045
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1046
+        "addl %4, %1			\n\t"\
1047
+        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %7,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1048
+        \
1049
+        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %7, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1050
+        "addl %4, %1			\n\t"\
1051
+        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %7, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1052
+        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %7, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1053
+                \
1054
+        "addl $72, %0			\n\t"\
1055
+        "addl %8, %1			\n\t"\
1056
+        "decl %2			\n\t"\
1057
+        " jnz 1b			\n\t"\
1058
+         \
1059
+        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1060
+        : "r"(dstStride), "r"(2*dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*dstStride)\
1061
+    );\
1062
+}
1063
+
1064
+#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1065
+\
1066
+static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
1067
+    put_pixels8_mmx(dst, src, stride, 8);\
1068
+}\
1069
+\
1070
+static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1071
+    uint64_t temp[32];\
1072
+    uint8_t * const half= (uint8_t*)temp;\
1073
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1074
+    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1075
+}\
1076
+\
1077
+static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1078
+    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1079
+}\
1080
+\
1081
+static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1082
+    uint64_t temp[32];\
1083
+    uint8_t * const half= (uint8_t*)temp;\
1084
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1085
+    OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
1086
+}\
1087
+\
1088
+static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1089
+    uint64_t temp[32];\
1090
+    uint8_t * const half= (uint8_t*)temp;\
1091
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\
1092
+    OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
1093
+}\
1094
+\
1095
+static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1096
+    OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, src, stride, stride);\
1097
+}\
1098
+\
1099
+static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1100
+    uint64_t temp[32];\
1101
+    uint8_t * const half= (uint8_t*)temp;\
1102
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(half, src, 8, stride);\
1103
+    OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
1104
+}\
1105
+static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1106
+    uint64_t half[8*2 + 8*2 + 18*2];\
1107
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
1108
+    uint8_t * const halfV= ((uint8_t*)half);\
1109
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1110
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1111
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
1112
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1113
+    OPNAME ## pixels8_l4_mmx(dst, src, (uint8_t*)half, stride, 8);\
1114
+}\
1115
+static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1116
+    uint64_t half[8*2 + 8*2 + 18*2];\
1117
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64 + 8;\
1118
+    uint8_t * const halfV= ((uint8_t*)half);\
1119
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1120
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1121
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
1122
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1123
+    OPNAME ## pixels8_l4_mmx(dst, src+1, (uint8_t*)half, stride, 8);\
1124
+}\
1125
+static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1126
+    uint64_t half[8*2 + 8*2 + 9*2];\
1127
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1128
+    uint8_t * const halfV= ((uint8_t*)half);\
1129
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1130
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1131
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
1132
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1133
+    OPNAME ## pixels8_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 8);\
1134
+}\
1135
+static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1136
+    uint64_t half[8*2 + 8*2 + 9*2];\
1137
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1138
+    uint8_t * const halfV= ((uint8_t*)half);\
1139
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1140
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src  , 8, stride, 9);\
1141
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
1142
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1143
+    OPNAME ## pixels8_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 8);\
1144
+}\
1145
+static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1146
+    uint64_t half[8*2 + 9*2];\
1147
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1148
+    uint8_t * const halfHV= ((uint8_t*)half);\
1149
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1150
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1151
+    OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
1152
+}\
1153
+static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1154
+    uint64_t half[8*2 + 9*2];\
1155
+    uint8_t * const halfH= ((uint8_t*)half) + 64;\
1156
+    uint8_t * const halfHV= ((uint8_t*)half);\
1157
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1158
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1159
+    OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
1160
+}\
1161
+static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1162
+    uint64_t half[8*2 + 8*2 + 9*2];\
1163
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1164
+    uint8_t * const halfV= ((uint8_t*)half);\
1165
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1166
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1167
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src, 8, stride);\
1168
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1169
+    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1170
+}\
1171
+static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1172
+    uint64_t half[8*2 + 8*2 + 9*2];\
1173
+    uint8_t * const halfH= ((uint8_t*)half) + 2*64;\
1174
+    uint8_t * const halfV= ((uint8_t*)half);\
1175
+    uint8_t * const halfHV= ((uint8_t*)half) + 64;\
1176
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1177
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfV, src+1, 8, stride);\
1178
+    put ## RND ## mpeg4_qpel8_v_lowpass_mmx(halfHV, halfH, 8, 8);\
1179
+    OPNAME ## pixels8_l2_mmx(dst, halfV, halfHV, stride, 8, 8);\
1180
+}\
1181
+static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1182
+    uint64_t half[9*2];\
1183
+    uint8_t * const halfH= ((uint8_t*)half);\
1184
+    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1185
+    OPNAME ## mpeg4_qpel8_v_lowpass_mmx(dst, halfH, stride, 8);\
1186
+}\
1187
+static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
1188
+    put_pixels16_mmx(dst, src, stride, 16);\
1189
+}\
1190
+\
1191
+static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1192
+    uint64_t temp[32];\
1193
+    uint8_t * const half= (uint8_t*)temp;\
1194
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1195
+    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1196
+}\
1197
+\
1198
+static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1199
+    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1200
+}\
1201
+\
1202
+static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1203
+    uint64_t temp[32];\
1204
+    uint8_t * const half= (uint8_t*)temp;\
1205
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1206
+    OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
1207
+}\
1208
+\
1209
+static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1210
+    uint64_t temp[32];\
1211
+    uint8_t * const half= (uint8_t*)temp;\
1212
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\
1213
+    OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
1214
+}\
1215
+\
1216
+static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1217
+    OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, src, stride, stride);\
1218
+}\
1219
+\
1220
+static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1221
+    uint64_t temp[32];\
1222
+    uint8_t * const half= (uint8_t*)temp;\
1223
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(half, src, 16, stride);\
1224
+    OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
1225
+}\
1226
+static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1227
+    uint64_t half[16*2 + 16*2 + 18*2];\
1228
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
1229
+    uint8_t * const halfV= ((uint8_t*)half);\
1230
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1231
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1232
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
1233
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1234
+    OPNAME ## pixels16_l4_mmx(dst, src, (uint8_t*)half, stride, 16);\
1235
+}\
1236
+static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1237
+    uint64_t half[16*2 + 16*2 + 18*2];\
1238
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256 + 16;\
1239
+    uint8_t * const halfV= ((uint8_t*)half);\
1240
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1241
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1242
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
1243
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1244
+    OPNAME ## pixels16_l4_mmx(dst, src+1, (uint8_t*)half, stride, 16);\
1245
+}\
1246
+static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1247
+    uint64_t half[16*2 + 16*2 + 17*2];\
1248
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1249
+    uint8_t * const halfV= ((uint8_t*)half);\
1250
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1251
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1252
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
1253
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1254
+    OPNAME ## pixels16_l4_mmx(dst, src+stride, (uint8_t*)half, stride, 16);\
1255
+}\
1256
+static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1257
+    uint64_t half[16*2 + 16*2 + 17*2];\
1258
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1259
+    uint8_t * const halfV= ((uint8_t*)half);\
1260
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1261
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src  , 16, stride, 17);\
1262
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
1263
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1264
+    OPNAME ## pixels16_l4_mmx(dst, src+stride+1, (uint8_t*)half, stride, 16);\
1265
+}\
1266
+static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1267
+    uint64_t half[16*2 + 17*2];\
1268
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1269
+    uint8_t * const halfHV= ((uint8_t*)half);\
1270
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1271
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1272
+    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
1273
+}\
1274
+static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1275
+    uint64_t half[16*2 + 17*2];\
1276
+    uint8_t * const halfH= ((uint8_t*)half) + 256;\
1277
+    uint8_t * const halfHV= ((uint8_t*)half);\
1278
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1279
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1280
+    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
1281
+}\
1282
+static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1283
+    uint64_t half[16*2 + 16*2 + 17*2];\
1284
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1285
+    uint8_t * const halfV= ((uint8_t*)half);\
1286
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1287
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1288
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src, 16, stride);\
1289
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1290
+    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1291
+}\
1292
+static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1293
+    uint64_t half[16*2 + 16*2 + 17*2];\
1294
+    uint8_t * const halfH= ((uint8_t*)half) + 2*256;\
1295
+    uint8_t * const halfV= ((uint8_t*)half);\
1296
+    uint8_t * const halfHV= ((uint8_t*)half) + 256;\
1297
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1298
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfV, src+1, 16, stride);\
1299
+    put ## RND ## mpeg4_qpel16_v_lowpass_mmx(halfHV, halfH, 16, 16);\
1300
+    OPNAME ## pixels16_l2_mmx(dst, halfV, halfHV, stride, 16, 16);\
1301
+}\
1302
+static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
1303
+    uint64_t half[17*2];\
1304
+    uint8_t * const halfH= ((uint8_t*)half);\
1305
+    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1306
+    OPNAME ## mpeg4_qpel16_v_lowpass_mmx(dst, halfH, stride, 16);\
1307
+}
1308
+
1309
+
1310
+/* Store-op macros plugged into the QPEL_BASE/QPEL_OP generators:
+ *   PUT_OP(a,b,temp,size) - plain store: mov<size> a -> b (temp unused).
+ *   AVG_OP(a,b,temp,size) - load old destination b into temp, average it
+ *     into a with the 3DNow! `pavgusb` (rounding byte average), store back.
+ * NOTE(review): bare numeric lines are commit-viewer residue, not code. */
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "	\n\t"
1311
+#define AVG_OP(a,b,temp, size) \
1312
+"mov" #size " " #b ", " #temp "	\n\t"\
1313
+"pavgusb " #temp ", " #a "	\n\t"\
1314
+"mov" #size " " #a ", " #b "	\n\t"
1315
+
1316
+/* Instantiate the shared lowpass helpers (QPEL_BASE) and the full set of
+ * qpel mc functions (QPEL_OP) for the 3DNow! flavour.  ff_pw_16/ff_pw_15
+ * are the rounders for the rounding / no-rounding variants; AVG_OP here is
+ * still the pavgusb version defined just above. */
+QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP)
1317
+QPEL_BASE(avg_       , ff_pw_16, _       , AVG_OP)
1318
+QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1319
+QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
1320
+QPEL_OP(avg_       , ff_pw_16, _       , AVG_OP, 3dnow)
1321
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1322
+
1323
+/* Redefine AVG_OP to use the MMX2/SSE-integer `pavgb` instruction instead of
+ * the 3DNow! `pavgusb`, then instantiate the same three qpel variants for
+ * the mmx2 flavour.  The two instructions compute the same rounding byte
+ * average; only the required CPU feature differs. */
+#undef AVG_OP
1324
+#define AVG_OP(a,b,temp, size) \
1325
+"mov" #size " " #b ", " #temp "	\n\t"\
1326
+"pavgb " #temp ", " #a "	\n\t"\
1327
+"mov" #size " " #a ", " #b "	\n\t"
1328
+QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
1329
+QPEL_OP(avg_       , ff_pw_16, _       , AVG_OP, mmx2)
1330
+QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1331
+
649 1332
 #if 0
650 1333
 static void just_return() { return; }
651 1334
 #endif
652 1335
 
1336
+/* Wire one qpel motion-compensation slot in the DSPContext `c`: installs the
+ * put, put_no_rnd and avg versions of `postfix2` into the corresponding
+ * function-pointer tables named by `postfix1`.  Used below in
+ * dsputil_init_mmx for all 16 mc positions x 2 block sizes. */
+#define SET_QPEL_FUNC(postfix1, postfix2) \
1337
+    c->put_ ## postfix1 = put_ ## postfix2;\
1338
+    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
1339
+    c->avg_ ## postfix1 = avg_ ## postfix2;
1340
+    
653 1341
 void dsputil_init_mmx(DSPContext* c, unsigned mask)
654 1342
 {
655 1343
     mm_flags = mm_support();
... ...
@@ -724,7 +1417,7 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
724 724
         c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
725 725
         c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
726 726
         c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
727
-        
727
+                
728 728
         c->add_bytes= add_bytes_mmx;
729 729
         c->diff_bytes= diff_bytes_mmx;
730 730
         
... ...
@@ -767,6 +1460,38 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
767 767
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
768 768
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
769 769
             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
770
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
771
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
772
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
773
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
774
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
775
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
776
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
777
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
778
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
779
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
780
+            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
781
+            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
782
+            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
783
+            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
784
+            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
785
+            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
786
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
787
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
788
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
789
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
790
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
791
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
792
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
793
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
794
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
795
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
796
+            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
797
+            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
798
+            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
799
+            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
800
+            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
801
+            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
770 802
         } else if (mm_flags & MM_3DNOW) {
771 803
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
772 804
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
... ...
@@ -787,6 +1512,39 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
787 787
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
788 788
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
789 789
             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
790
+        
791
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
792
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
793
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
794
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
795
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
796
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
797
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
798
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
799
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
800
+            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
801
+            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
802
+            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
803
+            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
804
+            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
805
+            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
806
+            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
807
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
808
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
809
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
810
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
811
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
812
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
813
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
814
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
815
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
816
+            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
817
+            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
818
+            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
819
+            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
820
+            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
821
+            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
822
+            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
790 823
         }
791 824
     }
792 825
 
... ...
@@ -53,6 +53,38 @@ static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size
53 53
 	:"%eax", "memory");
54 54
 }
55 55
 
56
+/* Average two 8-pixel-wide sources into dst, 4 rows per loop iteration.
+ * src1 is a strided image plane (advanced by src1Stride per row); src2 is a
+ * packed buffer with rows 8 bytes apart (offsets 0/8/16/24, advanced by 32
+ * per iteration).  PAVGB expands to pavgb/pavgusb depending on the template
+ * flavour, giving a rounded byte average.
+ * NOTE(review): `subl $4`/`jnz` means h must be a multiple of 4; the mmx
+ * registers used are not listed as clobbers (period style) - confirm callers
+ * do not hold FP/MMX state across this. */
+static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
57
+{
58
+    __asm __volatile(
59
+	"1:				\n\t"
60
+	"movq	(%1), %%mm0		\n\t"
61
+	"addl	%4, %1			\n\t"
62
+	"movq	(%1), %%mm1		\n\t"
63
+	"addl	%4, %1			\n\t"
64
+	PAVGB" (%2), %%mm0		\n\t"
65
+	PAVGB" 8(%2), %%mm1		\n\t"
66
+	"movq	%%mm0, (%3)		\n\t"
67
+	"addl	%5, %3			\n\t"
68
+	"movq	%%mm1, (%3)		\n\t"
69
+	"addl	%5, %3			\n\t"
70
+	"movq	(%1), %%mm0		\n\t"
71
+	"addl	%4, %1			\n\t"
72
+	"movq	(%1), %%mm1		\n\t"
73
+	"addl	%4, %1			\n\t"
74
+	PAVGB" 16(%2), %%mm0		\n\t"
75
+	PAVGB" 24(%2), %%mm1		\n\t"
76
+	"movq	%%mm0, (%3)		\n\t"
77
+	"addl	%5, %3			\n\t"
78
+	"movq	%%mm1, (%3)		\n\t"
79
+	"addl	%5, %3			\n\t"
80
+        "addl	$32, %2			\n\t"
81
+	"subl	$4, %0			\n\t"
82
+	"jnz	1b			\n\t"
83
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
84
+	:"r"(src1Stride), "r"(dstStride)
85
+	:"memory");
86
+}
87
+
56 88
 static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
57 89
 {
58 90
     __asm __volatile(
... ...
@@ -92,6 +124,34 @@ static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
92 92
 	:"r" (line_size)
93 93
 	:"%eax", "memory");
94 94
 }
95
+
96
+/* 16-pixel-wide variant of put_pixels*_l2: averages src1 (strided plane)
+ * with src2 (packed buffer, rows 16 bytes apart) into dst, two rows per
+ * loop iteration using two movq halves per row.
+ * NOTE(review): `subl $2`/`jnz` requires h to be even; mmx registers are not
+ * declared as clobbers (period style). */
+static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
97
+{
98
+    __asm __volatile(
99
+	"1:				\n\t"
100
+	"movq	(%1), %%mm0		\n\t"
101
+	"movq	8(%1), %%mm1		\n\t"
102
+	"addl	%4, %1			\n\t"
103
+	PAVGB" (%2), %%mm0		\n\t"
104
+	PAVGB" 8(%2), %%mm1		\n\t"
105
+	"movq	%%mm0, (%3)		\n\t"
106
+	"movq	%%mm1, 8(%3)		\n\t"
107
+	"addl	%5, %3			\n\t"
108
+	"movq	(%1), %%mm0		\n\t"
109
+	"movq	8(%1), %%mm1		\n\t"
110
+	"addl	%4, %1			\n\t"
111
+	PAVGB" 16(%2), %%mm0		\n\t"
112
+	PAVGB" 24(%2), %%mm1		\n\t"
113
+	"movq	%%mm0, (%3)		\n\t"
114
+	"movq	%%mm1, 8(%3)		\n\t"
115
+	"addl	%5, %3			\n\t"
116
+        "addl	$32, %2			\n\t"
117
+	"subl	$2, %0			\n\t"
118
+	"jnz	1b			\n\t"
119
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
120
+	:"r"(src1Stride), "r"(dstStride)
121
+	:"memory");
122
+}
95 123
  
96 124
 /* GL: this function does incorrect rounding if overflow */
97 125
 static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
... ...
@@ -54,6 +54,42 @@ static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
54 54
 	:"eax", "memory");
55 55
 }
56 56
 
57
+static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
58
+{
59
+    MOVQ_BFE(mm6);
60
+    __asm __volatile(
61
+	".balign 8			\n\t"
62
+	"1:				\n\t"
63
+	"movq	(%1), %%mm0		\n\t"
64
+	"movq	(%2), %%mm1		\n\t"
65
+	"addl	%4, %1			\n\t"
66
+	"movq	(%1), %%mm2		\n\t"
67
+	"movq	8(%2), %%mm3		\n\t"
68
+	"addl	%4, %1			\n\t"
69
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
70
+	"movq	%%mm4, (%3)		\n\t"
71
+	"addl	%5, %3			\n\t"
72
+	"movq	%%mm5, (%3)		\n\t"
73
+	"addl	%5, %3			\n\t"
74
+	"movq	(%1), %%mm0		\n\t"
75
+	"movq	16(%2), %%mm1		\n\t"
76
+	"addl	%4, %1			\n\t"
77
+	"movq	(%1), %%mm2		\n\t"
78
+	"movq	24(%2), %%mm3		\n\t"
79
+	"addl	%4, %1			\n\t"
80
+	"addl	$32, %2			\n\t"
81
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
82
+	"movq	%%mm4, (%3)		\n\t"
83
+	"addl	%5, %3			\n\t"
84
+	"movq	%%mm5, (%3)		\n\t"
85
+	"addl	%5, %3			\n\t"
86
+	"subl	$4, %0			\n\t"
87
+	"jnz	1b			\n\t"
88
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
89
+	:"r"(src1Stride), "r"(dstStride)
90
+	:"memory");
91
+}
92
+
57 93
 static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
58 94
 {
59 95
     MOVQ_BFE(mm6);
... ...
@@ -90,7 +126,7 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
90 90
 	"movq	9(%1, %3), %%mm3	\n\t"
91 91
 	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
92 92
 	"movq	%%mm4, 8(%2)		\n\t"
93
-	"movq	%%mm5, 8(%2, %3)		\n\t"
93
+	"movq	%%mm5, 8(%2, %3)	\n\t"
94 94
 	"addl	%%eax, %1		\n\t"
95 95
 	"addl	%%eax, %2		\n\t"
96 96
 	"subl	$4, %0			\n\t"
... ...
@@ -100,6 +136,38 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
100 100
 	:"eax", "memory");
101 101
 }
102 102
 
103
+static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
104
+{
105
+    MOVQ_BFE(mm6);
106
+    __asm __volatile(
107
+	".balign 8			\n\t"
108
+	"1:				\n\t"
109
+	"movq	(%1), %%mm0		\n\t"
110
+	"movq	(%2), %%mm1		\n\t"
111
+	"movq	8(%1), %%mm2		\n\t"
112
+	"movq	8(%2), %%mm3		\n\t"
113
+	"addl	%4, %1			\n\t"
114
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
115
+	"movq	%%mm4, (%3)		\n\t"
116
+	"movq	%%mm5, 8(%3)		\n\t"
117
+	"addl	%5, %3			\n\t"
118
+	"movq	(%1), %%mm0		\n\t"
119
+	"movq	16(%2), %%mm1		\n\t"
120
+	"movq	8(%1), %%mm2		\n\t"
121
+	"movq	24(%2), %%mm3		\n\t"
122
+	"addl	%4, %1			\n\t"
123
+	PAVGBP(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
124
+	"movq	%%mm4, (%3)		\n\t"
125
+	"movq	%%mm5, 8(%3)		\n\t"
126
+	"addl	%5, %3			\n\t"
127
+	"addl	$32, %2			\n\t"
128
+	"subl	$2, %0			\n\t"
129
+	"jnz	1b			\n\t"
130
+	:"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
131
+	:"r"(src1Stride), "r"(dstStride)
132
+	:"memory");
133
+}
134
+
103 135
 static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
104 136
 {
105 137
     MOVQ_BFE(mm6);
... ...
@@ -195,6 +263,124 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
195 195
 	:"eax", "memory");
196 196
 }
197 197
 
198
+static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
199
+{
200
+    MOVQ_ZERO(mm7);
201
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
202
+    __asm __volatile(
203
+	".balign 8      		\n\t"
204
+	"1:				\n\t"
205
+	"movq	(%1), %%mm0		\n\t"
206
+	"movq	(%2), %%mm1		\n\t"
207
+	"movq	64(%2), %%mm2		\n\t"
208
+	"movq	136(%2), %%mm3		\n\t"
209
+	"punpcklbw %%mm7, %%mm0		\n\t"
210
+	"punpcklbw %%mm7, %%mm1		\n\t"
211
+	"punpcklbw %%mm7, %%mm2		\n\t"
212
+	"punpcklbw %%mm7, %%mm3		\n\t"
213
+	"paddusw %%mm6, %%mm0		\n\t"
214
+	"paddusw %%mm0, %%mm1		\n\t"
215
+	"paddusw %%mm2, %%mm3		\n\t"
216
+	"paddusw %%mm1, %%mm3		\n\t"
217
+	"psrlw	$2, %%mm3		\n\t"
218
+	"movq	(%1), %%mm0		\n\t"
219
+	"movq	(%2), %%mm1		\n\t"
220
+	"movq	64(%2), %%mm2		\n\t"
221
+	"movq	136(%2), %%mm4		\n\t"
222
+	"punpckhbw %%mm7, %%mm0		\n\t"
223
+	"punpckhbw %%mm7, %%mm1		\n\t"
224
+	"punpckhbw %%mm7, %%mm2		\n\t"
225
+	"punpckhbw %%mm7, %%mm4		\n\t"
226
+	"paddusw %%mm6, %%mm0		\n\t"
227
+	"paddusw %%mm0, %%mm1		\n\t"
228
+	"paddusw %%mm2, %%mm4		\n\t"
229
+	"paddusw %%mm1, %%mm4		\n\t"
230
+	"psrlw	$2, %%mm4		\n\t"
231
+	"packuswb  %%mm4, %%mm3		\n\t"
232
+	"movq	%%mm3, (%0)		\n\t"
233
+        "addl	%4, %0			\n\t"
234
+        "addl	%4, %1			\n\t"
235
+        "addl	$8, %2			\n\t" 
236
+        "decl	%3			\n\t"
237
+	"jnz	1b			\n\t"
238
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
239
+	:"r"(stride)
240
+	:"memory");
241
+}
242
+
243
+static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
244
+{
245
+    MOVQ_ZERO(mm7);
246
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
247
+    __asm __volatile(
248
+	".balign 8      		\n\t"
249
+	"1:				\n\t"
250
+	"movq	(%1), %%mm0		\n\t"
251
+	"movq	(%2), %%mm1		\n\t"
252
+	"movq	256(%2), %%mm2		\n\t"
253
+	"movq	528(%2), %%mm3		\n\t"
254
+	"punpcklbw %%mm7, %%mm0		\n\t"
255
+	"punpcklbw %%mm7, %%mm1		\n\t"
256
+	"punpcklbw %%mm7, %%mm2		\n\t"
257
+	"punpcklbw %%mm7, %%mm3		\n\t"
258
+	"paddusw %%mm6, %%mm0		\n\t"
259
+	"paddusw %%mm0, %%mm1		\n\t"
260
+	"paddusw %%mm2, %%mm3		\n\t"
261
+	"paddusw %%mm1, %%mm3		\n\t"
262
+	"psrlw	$2, %%mm3		\n\t"
263
+	"movq	(%1), %%mm0		\n\t"
264
+	"movq	(%2), %%mm1		\n\t"
265
+	"movq	256(%2), %%mm2		\n\t"
266
+	"movq	528(%2), %%mm4		\n\t"
267
+	"punpckhbw %%mm7, %%mm0		\n\t"
268
+	"punpckhbw %%mm7, %%mm1		\n\t"
269
+	"punpckhbw %%mm7, %%mm2		\n\t"
270
+	"punpckhbw %%mm7, %%mm4		\n\t"
271
+	"paddusw %%mm6, %%mm0		\n\t"
272
+	"paddusw %%mm0, %%mm1		\n\t"
273
+	"paddusw %%mm2, %%mm4		\n\t"
274
+	"paddusw %%mm1, %%mm4		\n\t"
275
+	"psrlw	$2, %%mm4		\n\t"
276
+	"packuswb  %%mm4, %%mm3		\n\t"
277
+	"movq	%%mm3, (%0)		\n\t"
278
+	"movq	8(%1), %%mm0		\n\t"
279
+	"movq	8(%2), %%mm1		\n\t"
280
+	"movq	264(%2), %%mm2		\n\t"
281
+	"movq	536(%2), %%mm3		\n\t"
282
+	"punpcklbw %%mm7, %%mm0		\n\t"
283
+	"punpcklbw %%mm7, %%mm1		\n\t"
284
+	"punpcklbw %%mm7, %%mm2		\n\t"
285
+	"punpcklbw %%mm7, %%mm3		\n\t"
286
+	"paddusw %%mm6, %%mm0		\n\t"
287
+	"paddusw %%mm0, %%mm1		\n\t"
288
+	"paddusw %%mm2, %%mm3		\n\t"
289
+	"paddusw %%mm1, %%mm3		\n\t"
290
+	"psrlw	$2, %%mm3		\n\t"
291
+	"movq	8(%1), %%mm0		\n\t"
292
+	"movq	8(%2), %%mm1		\n\t"
293
+	"movq	264(%2), %%mm2		\n\t"
294
+	"movq	536(%2), %%mm4		\n\t"
295
+	"punpckhbw %%mm7, %%mm0		\n\t"
296
+	"punpckhbw %%mm7, %%mm1		\n\t"
297
+	"punpckhbw %%mm7, %%mm2		\n\t"
298
+	"punpckhbw %%mm7, %%mm4		\n\t"
299
+	"paddusw %%mm6, %%mm0		\n\t"
300
+	"paddusw %%mm0, %%mm1		\n\t"
301
+	"paddusw %%mm2, %%mm4		\n\t"
302
+	"paddusw %%mm1, %%mm4		\n\t"
303
+	"psrlw	$2, %%mm4		\n\t"
304
+	"packuswb  %%mm4, %%mm3		\n\t"
305
+	"movq	%%mm3, 8(%0)		\n\t"
306
+        "addl	%4, %0			\n\t"
307
+        "addl	%4, %1			\n\t"
308
+        "addl	$16, %2			\n\t" 
309
+        "decl	%3			\n\t"
310
+	"jnz	1b			\n\t"
311
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
312
+	:"r"(stride)
313
+	:"memory");
314
+}
315
+
198 316
 // avg_pixels
199 317
 // in case more speed is needed - unroling would certainly help
200 318
 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
... ...
@@ -259,6 +445,27 @@ static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz
259 259
     } while (--h);
260 260
 }
261 261
 
262
+/* avg variant for the rounding-template file: computes the rounded average
+ * of src1 and src2 (PAVGB macro with the constant MOVQ_BFE loads into mm6 -
+ * presumably the bias for round-to-nearest; defined elsewhere), then
+ * averages that result with the existing dst contents and stores back
+ * (read-modify-write prediction averaging).  One 8-byte row per iteration;
+ * src2 rows are packed 8 bytes apart. */
+static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
263
+{
264
+    MOVQ_BFE(mm6);
265
+    JUMPALIGN();
266
+    do {
267
+	__asm __volatile(
268
+	    "movq  %1, %%mm0		\n\t"
269
+	    "movq  %2, %%mm1		\n\t"
270
+	    "movq  %0, %%mm3		\n\t"
271
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
272
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
273
+	    "movq  %%mm0, %0		\n\t"
274
+	    :"+m"(*dst)
275
+	    :"m"(*src1), "m"(*src2)
276
+	    :"memory");
277
+	dst += dstStride;
278
+        src1 += src1Stride;
279
+        src2 += 8;
280
+    } while (--h);
281
+}
282
+
262 283
 static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
263 284
 {
264 285
     MOVQ_BFE(mm6);
... ...
@@ -285,6 +492,33 @@ static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si
285 285
     } while (--h);
286 286
 }
287 287
 
288
+static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
289
+{
290
+    MOVQ_BFE(mm6);
291
+    JUMPALIGN();
292
+    do {
293
+	__asm __volatile(
294
+	    "movq  %1, %%mm0		\n\t"
295
+	    "movq  %2, %%mm1		\n\t"
296
+	    "movq  %0, %%mm3		\n\t"
297
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
298
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
299
+	    "movq  %%mm0, %0		\n\t"
300
+	    "movq  8%1, %%mm0		\n\t"
301
+	    "movq  8%2, %%mm1		\n\t"
302
+	    "movq  8%0, %%mm3		\n\t"
303
+	    PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
304
+	    PAVGB(%%mm3, %%mm2, %%mm0, %%mm6)
305
+	    "movq  %%mm0, 8%0		\n\t"
306
+	    :"+m"(*dst)
307
+	    :"m"(*src1), "m"(*src2)
308
+	    :"memory");
309
+	dst += dstStride;
310
+        src1 += src1Stride;
311
+        src2 += 16;
312
+    } while (--h);
313
+}
314
+
288 315
 static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
289 316
 {
290 317
     MOVQ_BFE(mm6);
... ...
@@ -399,6 +633,133 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
399 399
 	:"eax", "memory");
400 400
 }
401 401
 
402
+static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
403
+{
404
+    MOVQ_ZERO(mm7);
405
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
406
+    MOVQ_BFE(mm5);
407
+    __asm __volatile(
408
+	".balign 8      		\n\t"
409
+	"1:				\n\t"
410
+	"movq	(%1), %%mm0		\n\t"
411
+	"movq	(%2), %%mm1		\n\t"
412
+	"movq	64(%2), %%mm2		\n\t"
413
+	"movq	136(%2), %%mm3		\n\t"
414
+	"punpcklbw %%mm7, %%mm0		\n\t"
415
+	"punpcklbw %%mm7, %%mm1		\n\t"
416
+	"punpcklbw %%mm7, %%mm2		\n\t"
417
+	"punpcklbw %%mm7, %%mm3		\n\t"
418
+	"paddusw %%mm6, %%mm0		\n\t"
419
+	"paddusw %%mm0, %%mm1		\n\t"
420
+	"paddusw %%mm2, %%mm3		\n\t"
421
+	"paddusw %%mm1, %%mm3		\n\t"
422
+	"psrlw	$2, %%mm3		\n\t"
423
+	"movq	(%1), %%mm0		\n\t"
424
+	"movq	(%2), %%mm1		\n\t"
425
+	"movq	64(%2), %%mm2		\n\t"
426
+	"movq	136(%4), %%mm4		\n\t"
427
+	"punpckhbw %%mm7, %%mm0		\n\t"
428
+	"punpckhbw %%mm7, %%mm1		\n\t"
429
+	"punpckhbw %%mm7, %%mm2		\n\t"
430
+	"punpckhbw %%mm7, %%mm4		\n\t"
431
+	"paddusw %%mm6, %%mm0		\n\t"
432
+	"paddusw %%mm0, %%mm1		\n\t"
433
+	"paddusw %%mm2, %%mm4		\n\t"
434
+	"paddusw %%mm1, %%mm4		\n\t"
435
+	"psrlw	$2, %%mm4		\n\t"
436
+	"packuswb  %%mm4, %%mm3		\n\t"
437
+	"movq	(%0), %%mm4		\n\t"
438
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
439
+	"movq	%%mm3, (%0)		\n\t"
440
+        "addl	%4, %0			\n\t"
441
+        "addl	%4, %1			\n\t"
442
+        "addl	$8, %2			\n\t" 
443
+        "decl	%3			\n\t"
444
+	"jnz	1b			\n\t"
445
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
446
+	:"r"(stride)
447
+	:"memory");
448
+}
449
+
450
+static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
451
+{
452
+    MOVQ_ZERO(mm7);
453
+    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
454
+    MOVQ_BFE(mm5);
455
+    __asm __volatile(
456
+	".balign 8      		\n\t"
457
+	"1:				\n\t"
458
+	"movq	(%1), %%mm0		\n\t"
459
+	"movq	(%2), %%mm1		\n\t"
460
+	"movq	256(%2), %%mm2		\n\t"
461
+	"movq	528(%2), %%mm3		\n\t"
462
+	"punpcklbw %%mm7, %%mm0		\n\t"
463
+	"punpcklbw %%mm7, %%mm1		\n\t"
464
+	"punpcklbw %%mm7, %%mm2		\n\t"
465
+	"punpcklbw %%mm7, %%mm3		\n\t"
466
+	"paddusw %%mm6, %%mm0		\n\t"
467
+	"paddusw %%mm0, %%mm1		\n\t"
468
+	"paddusw %%mm2, %%mm3		\n\t"
469
+	"paddusw %%mm1, %%mm3		\n\t"
470
+	"psrlw	$2, %%mm3		\n\t"
471
+	"movq	(%1), %%mm0		\n\t"
472
+	"movq	(%2), %%mm1		\n\t"
473
+	"movq	256(%2), %%mm2		\n\t"
474
+	"movq	528(%4), %%mm4		\n\t"
475
+	"punpckhbw %%mm7, %%mm0		\n\t"
476
+	"punpckhbw %%mm7, %%mm1		\n\t"
477
+	"punpckhbw %%mm7, %%mm2		\n\t"
478
+	"punpckhbw %%mm7, %%mm4		\n\t"
479
+	"paddusw %%mm6, %%mm0		\n\t"
480
+	"paddusw %%mm0, %%mm1		\n\t"
481
+	"paddusw %%mm2, %%mm4		\n\t"
482
+	"paddusw %%mm1, %%mm4		\n\t"
483
+	"psrlw	$2, %%mm4		\n\t"
484
+	"packuswb  %%mm4, %%mm3		\n\t"
485
+	"movq	(%0), %%mm4		\n\t"
486
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
487
+	"movq	%%mm3, (%0)		\n\t"
488
+	"movq	8(%1), %%mm0		\n\t"
489
+	"movq	8(%2), %%mm1		\n\t"
490
+	"movq	264(%2), %%mm2		\n\t"
491
+	"movq	536(%2), %%mm3		\n\t"
492
+	"punpcklbw %%mm7, %%mm0		\n\t"
493
+	"punpcklbw %%mm7, %%mm1		\n\t"
494
+	"punpcklbw %%mm7, %%mm2		\n\t"
495
+	"punpcklbw %%mm7, %%mm3		\n\t"
496
+	"paddusw %%mm6, %%mm0		\n\t"
497
+	"paddusw %%mm0, %%mm1		\n\t"
498
+	"paddusw %%mm2, %%mm3		\n\t"
499
+	"paddusw %%mm1, %%mm3		\n\t"
500
+	"psrlw	$2, %%mm3		\n\t"
501
+	"movq	8(%1), %%mm0		\n\t"
502
+	"movq	8(%2), %%mm1		\n\t"
503
+	"movq	264(%2), %%mm2		\n\t"
504
+	"movq	536(%4), %%mm4		\n\t"
505
+	"punpckhbw %%mm7, %%mm0		\n\t"
506
+	"punpckhbw %%mm7, %%mm1		\n\t"
507
+	"punpckhbw %%mm7, %%mm2		\n\t"
508
+	"punpckhbw %%mm7, %%mm4		\n\t"
509
+	"paddusw %%mm6, %%mm0		\n\t"
510
+	"paddusw %%mm0, %%mm1		\n\t"
511
+	"paddusw %%mm2, %%mm4		\n\t"
512
+	"paddusw %%mm1, %%mm4		\n\t"
513
+	"psrlw	$2, %%mm4		\n\t"
514
+	"packuswb  %%mm4, %%mm3		\n\t"
515
+	"movq	8(%0), %%mm4		\n\t"
516
+        PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
517
+	"movq	%%mm3, 8(%0)		\n\t"
518
+        "addl	%4, %0			\n\t"
519
+        "addl	%4, %1			\n\t"
520
+        "addl	$16, %2			\n\t" 
521
+        "decl	%3			\n\t"
522
+	"jnz	1b			\n\t"
523
+	:"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
524
+	:"r"(stride)
525
+	:"memory");
526
+}
527
+
528
+
402 529
 //FIXME optimize
403 530
 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
404 531
     DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
... ...
@@ -311,6 +311,7 @@ static inline int get_penalty_factor(MpegEncContext *s, int type){
311 311
 }
312 312
 
313 313
 void ff_init_me(MpegEncContext *s){
314
+    set_cmp(s, s->dsp.me_pre_cmp, s->avctx->me_pre_cmp);
314 315
     set_cmp(s, s->dsp.me_cmp, s->avctx->me_cmp);
315 316
     set_cmp(s, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
316 317
     set_cmp(s, s->dsp.mb_cmp, s->avctx->mb_cmp);
... ...
@@ -336,6 +337,12 @@ void ff_init_me(MpegEncContext *s){
336 336
         s->me.motion_search[0]= simple_epzs_motion_search;
337 337
         s->me.motion_search[1]= simple_epzs_motion_search4;
338 338
     }
339
+    
340
+    if(s->avctx->me_pre_cmp&FF_CMP_CHROMA){
341
+        s->me.pre_motion_search= simple_chroma_epzs_motion_search;
342
+    }else{
343
+        s->me.pre_motion_search= simple_epzs_motion_search;
344
+    }
339 345
 }
340 346
       
341 347
 static int pix_dev(UINT8 * pix, int line_size, int mean)
... ...
@@ -1037,7 +1044,7 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
1037 1037
     
1038 1038
     assert(s->quarter_sample==0 || s->quarter_sample==1);
1039 1039
 
1040
-    s->me.penalty_factor    = get_penalty_factor(s, s->avctx->me_cmp);
1040
+    s->me.pre_penalty_factor    = get_penalty_factor(s, s->avctx->me_pre_cmp);
1041 1041
 
1042 1042
     get_limits(s, &range, &xmin, &ymin, &xmax, &ymax, s->f_code);
1043 1043
     rel_xmin= xmin - mb_x*16;
... ...
@@ -1072,8 +1079,8 @@ int ff_pre_estimate_p_frame_motion(MpegEncContext * s,
1072 1072
         pred_x = P_MEDIAN[0];
1073 1073
         pred_y = P_MEDIAN[1];
1074 1074
     }
1075
-    dmin = s->me.motion_search[0](s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
1076
-                                  &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);
1075
+    dmin = s->me.pre_motion_search(s, 0, &mx, &my, P, pred_x, pred_y, rel_xmin, rel_ymin, rel_xmax, rel_ymax, 
1076
+                                   &s->last_picture, s->p_mv_table, (1<<16)>>shift, mv_penalty);
1077 1077
 
1078 1078
     s->p_mv_table[xy][0] = mx<<shift;
1079 1079
     s->p_mv_table[xy][1] = my<<shift;
... ...
@@ -268,6 +268,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
268 268
     const int my = *my_ptr;   
269 269
     const int penalty_factor= s->me.sub_penalty_factor;
270 270
     const int map_generation= s->me.map_generation;
271
+    const int subpel_quality= s->avctx->me_subpel_quality;
271 272
     uint32_t *map= s->me.map;
272 273
     me_cmp_func cmp, chroma_cmp;
273 274
     me_cmp_func cmp_sub, chroma_cmp_sub;
... ...
@@ -309,7 +310,7 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
309 309
         
310 310
         memset(best, 64, sizeof(int)*8);
311 311
 #if 1
312
-        if(s->avctx->dia_size>=2){        
312
+        if(s->me.dia_size>=2){        
313 313
             const int tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
314 314
             const int bl= score_map[(index+(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
315 315
             const int tr= score_map[(index-(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
... ...
@@ -388,24 +389,34 @@ static int RENAME(qpel_motion_search)(MpegEncContext * s,
388 388
                 }
389 389
             }            
390 390
         }
391
-        for(i=0; i<8; i++){
391
+        for(i=0; i<subpel_quality; i++){
392 392
             nx= best_pos[i][0];
393 393
             ny= best_pos[i][1];
394 394
             CHECK_QUARTER_MV(nx&3, ny&3, nx>>2, ny>>2)
395 395
         }
396
+
396 397
 #if 0
397
-            nx= FFMAX(4*mx - bx, bx - 4*mx);
398
-            ny= FFMAX(4*my - by, by - 4*my);
398
+            const int tl= score_map[(index-(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
399
+            const int bl= score_map[(index+(1<<ME_MAP_SHIFT)-1)&(ME_MAP_SIZE-1)];
400
+            const int tr= score_map[(index-(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
401
+            const int br= score_map[(index+(1<<ME_MAP_SHIFT)+1)&(ME_MAP_SIZE-1)];
402
+//            if(l < r && l < t && l < b && l < tl && l < bl && l < tr && l < br && bl < tl){
403
+            if(tl<br){
404
+
405
+//            nx= FFMAX(4*mx - bx, bx - 4*mx);
406
+//            ny= FFMAX(4*my - by, by - 4*my);
399 407
             
400
-            static int stats[4][4];
401
-            stats[nx][ny]++;
402
-            if(256*256*256*64 % (stats[0][0]+1) ==0){
403
-                for(i=0; i<16; i++){
404
-                    if((i&3)==0) printf("\n");
408
+            static int stats[7][7], count;
409
+            count++;
410
+            stats[4*mx - bx + 3][4*my - by + 3]++;
411
+            if(256*256*256*64 % count ==0){
412
+                for(i=0; i<49; i++){
413
+                    if((i%7)==0) printf("\n");
405 414
                     printf("%6d ", stats[0][i]);
406 415
                 }
407 416
                 printf("\n");
408 417
             }
418
+            }
409 419
 #endif
410 420
 #else
411 421
 
... ...
@@ -659,7 +670,7 @@ static inline int RENAME(sab_diamond_search)(MpegEncContext * s, int *best, int
659 659
 {
660 660
     me_cmp_func cmp, chroma_cmp;
661 661
     Minima minima[MAX_SAB_SIZE];
662
-    const int minima_count= ABS(s->avctx->dia_size);
662
+    const int minima_count= ABS(s->me.dia_size);
663 663
     int i, j;
664 664
     LOAD_COMMON(s->mb_x*16, s->mb_y*16);
665 665
     
... ...
@@ -744,7 +755,7 @@ static inline int RENAME(var_diamond_search)(MpegEncContext * s, int *best, int
744 744
     cmp= s->dsp.me_cmp[size];
745 745
     chroma_cmp= s->dsp.me_cmp[size+1];
746 746
 
747
-    for(dia_size=1; dia_size<=s->avctx->dia_size; dia_size++){
747
+    for(dia_size=1; dia_size<=s->me.dia_size; dia_size++){
748 748
         int dir, start, end;
749 749
         const int x= best[0];
750 750
         const int y= best[1];
... ...
@@ -893,15 +904,15 @@ static int RENAME(epzs_motion_search)(MpegEncContext * s, int block,
893 893
     }
894 894
 
895 895
 //check(best[0],best[1],0, b0)
896
-    if(s->avctx->dia_size==-1)
896
+    if(s->me.dia_size==-1)
897 897
         dmin= RENAME(funny_diamond_search)(s, best, dmin, ref_picture,
898 898
                                    pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
899 899
 				   shift, map, map_generation, size, mv_penalty);
900
-    else if(s->avctx->dia_size<-1)
900
+    else if(s->me.dia_size<-1)
901 901
         dmin= RENAME(sab_diamond_search)(s, best, dmin, ref_picture,
902 902
                                    pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
903 903
 				   shift, map, map_generation, size, mv_penalty);
904
-    else if(s->avctx->dia_size<2)
904
+    else if(s->me.dia_size<2)
905 905
         dmin= RENAME(small_diamond_search)(s, best, dmin, ref_picture,
906 906
                                    pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
907 907
 				   shift, map, map_generation, size, mv_penalty);
... ...
@@ -969,15 +980,15 @@ static int RENAME(epzs_motion_search4)(MpegEncContext * s, int block,
969 969
                         (last_mv[ref_mv_xy+ref_mv_stride][1]*ref_mv_scale + (1<<15))>>16)
970 970
     }
971 971
 
972
-    if(s->avctx->dia_size==-1)
972
+    if(s->me.dia_size==-1)
973 973
         dmin= RENAME(funny_diamond_search)(s, best, dmin, ref_picture,
974 974
                                    pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
975 975
 				   shift, map, map_generation, size, mv_penalty);
976
-    else if(s->avctx->dia_size<-1)
976
+    else if(s->me.dia_size<-1)
977 977
         dmin= RENAME(sab_diamond_search)(s, best, dmin, ref_picture,
978 978
                                    pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
979 979
 				   shift, map, map_generation, size, mv_penalty);
980
-    else if(s->avctx->dia_size<2)
980
+    else if(s->me.dia_size<2)
981 981
         dmin= RENAME(small_diamond_search)(s, best, dmin, ref_picture,
982 982
                                    pred_x, pred_y, penalty_factor, xmin, ymin, xmax, ymax, 
983 983
 				   shift, map, map_generation, size, mv_penalty);
... ...
@@ -2786,12 +2786,12 @@ static void encode_picture(MpegEncContext *s, int picture_number)
2786 2786
         else if(s->pict_type!=B_TYPE)
2787 2787
             s->no_rounding ^= 1;          
2788 2788
     }
2789
-
2790 2789
     /* Estimate motion for every MB */
2791 2790
     if(s->pict_type != I_TYPE){
2792 2791
         if(s->pict_type != B_TYPE){
2793 2792
             if((s->avctx->pre_me && s->last_non_b_pict_type==I_TYPE) || s->avctx->pre_me==2){
2794 2793
                 s->me.pre_pass=1;
2794
+                s->me.dia_size= s->avctx->pre_dia_size;
2795 2795
 
2796 2796
                 for(mb_y=s->mb_height-1; mb_y >=0 ; mb_y--) {
2797 2797
                     for(mb_x=s->mb_width-1; mb_x >=0 ; mb_x--) {
... ...
@@ -2804,6 +2804,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
2804 2804
             }
2805 2805
         }
2806 2806
 
2807
+        s->me.dia_size= s->avctx->dia_size;
2807 2808
         for(mb_y=0; mb_y < s->mb_height; mb_y++) {
2808 2809
             s->block_index[0]= s->block_wrap[0]*(mb_y*2 + 1) - 1;
2809 2810
             s->block_index[1]= s->block_wrap[0]*(mb_y*2 + 1);
... ...
@@ -2816,7 +2817,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
2816 2816
                 s->block_index[1]+=2;
2817 2817
                 s->block_index[2]+=2;
2818 2818
                 s->block_index[3]+=2;
2819
-
2819
+                
2820 2820
                 /* compute motion vector & mb_type and store in context */
2821 2821
                 if(s->pict_type==B_TYPE)
2822 2822
                     ff_estimate_b_frame_motion(s, mb_x, mb_y);
... ...
@@ -139,9 +139,11 @@ typedef struct MotionEstContext{
139 139
     uint32_t *map;                     /* map to avoid duplicate evaluations */
140 140
     uint32_t *score_map;               /* map to store the scores */
141 141
     int map_generation;  
142
+    int pre_penalty_factor;
142 143
     int penalty_factor;
143 144
     int sub_penalty_factor;
144 145
     int pre_pass;                      /* = 1 for the pre pass */
146
+    int dia_size;
145 147
     UINT16 (*mv_penalty)[MAX_MV*2+1];  /* amount of bits needed to encode a MV */
146 148
     int (*sub_motion_search)(struct MpegEncContext * s,
147 149
 				  int *mx_ptr, int *my_ptr, int dmin,
... ...
@@ -153,6 +155,11 @@ typedef struct MotionEstContext{
153 153
                              int P[10][2], int pred_x, int pred_y,
154 154
                              int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
155 155
                              int ref_mv_scale, uint16_t * const mv_penalty);
156
+    int (*pre_motion_search)(struct MpegEncContext * s, int block,
157
+                             int *mx_ptr, int *my_ptr,
158
+                             int P[10][2], int pred_x, int pred_y,
159
+                             int xmin, int ymin, int xmax, int ymax, Picture *ref_picture, int16_t (*last_mv)[2], 
160
+                             int ref_mv_scale, uint16_t * const mv_penalty);
156 161
 }MotionEstContext;
157 162
 
158 163
 typedef struct MpegEncContext {
... ...
@@ -234,6 +234,7 @@ void avcodec_get_context_defaults(AVCodecContext *s){
234 234
     s->me_method= ME_EPZS;
235 235
     s->get_buffer= avcodec_default_get_buffer;
236 236
     s->release_buffer= avcodec_default_release_buffer;
237
+    s->me_subpel_quality=8;
237 238
 }
238 239
 
239 240
 /**