libavcodec/dsputil.c
de6d9b64
 /*
  * DSP utils
406792e7
  * Copyright (c) 2000, 2001 Fabrice Bellard
8f2ab833
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64
  *
7b94177e
  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  *
b78e7197
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
ff4ec49e
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
b78e7197
  * version 2.1 of the License, or (at your option) any later version.
de6d9b64
  *
b78e7197
  * FFmpeg is distributed in the hope that it will be useful,
de6d9b64
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
de6d9b64
  *
ff4ec49e
  * You should have received a copy of the GNU Lesser General Public
b78e7197
  * License along with FFmpeg; if not, write to the Free Software
5509bffa
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
de6d9b64
  */
115329f1
 
983e3246
 /**
ba87f080
  * @file
983e3246
  * DSP utils
  */
115329f1
 
6fee1b90
 #include "libavutil/attributes.h"
737eb597
 #include "libavutil/imgutils.h"
218aefce
 #include "libavutil/internal.h"
de6d9b64
 #include "avcodec.h"
b9ba5253
 #include "copy_block.h"
5d3d39c7
 #include "dct.h"
de6d9b64
 #include "dsputil.h"
b0368839
 #include "simple_idct.h"
65e4c8c9
 #include "faandct.h"
6f08c541
 #include "faanidct.h"
35685a3c
 #include "imgconvert.h"
199436b9
 #include "mathops.h"
af818f7a
 #include "mpegvideo.h"
 #include "config.h"
5d50fcc5
 #include "diracdsp.h"
5596c60c
 
1d503957
 /* Lookup table that pix_norm1_c and the sse*_c routines in this file access
  * through the pointer ff_squareTbl + 256, so offsets from -256 to 255 are in
  * range. It is zero-initialized here; presumably it is filled with squares by
  * init code that is not visible in this part of the file -- TODO confirm. */
 uint32_t ff_squareTbl[512] = {0, };
de6d9b64
 
4a88d81c
 /* dsputil_template.c is included twice, once per BIT_DEPTH value (16, then 8).
  * NOTE(review): the _8_c functions called further down (e.g. put_pixels2_8_c)
  * presumably come from the BIT_DEPTH 8 inclusion -- confirm against the
  * template. BIT_DEPTH is not #undef'd again after the second inclusion in the
  * part of the file visible here. */
 #define BIT_DEPTH 16
e8b9e1e2
 #include "dsputil_template.c"
 #undef BIT_DEPTH
 
8dbe5856
 #define BIT_DEPTH 8
325eefa2
 #include "dsputil_template.c"
8dffcca5
 
917f55cc
 /* Byte-replicated constants: every byte of an unsigned long holds 0x7f (pb_7f)
  * or 0x80 (pb_80). ~0UL/255 is 0x01 repeated in every byte, so pb_7f is
  * 0x7f7f7f7f when unsigned long is 32 bits wide and 0x7f7f7f7f7f7f7f7f when it
  * is 64 bits wide. */
 #define pb_7f (~0UL/255 * 0x7f)
 #define pb_80 (~0UL/255 * 0x80)
469bd7b1
 
10acc479
 /* Specific zigzag scan for the 248 IDCT. NOTE: unlike the
    specification, we interleave the fields.
    The table has 64 entries, each a value in the range 0..63. */
 const uint8_t ff_zigzag248_direct[64] = {
      0,  8,  1,  9, 16, 24,  2, 10,
     17, 25, 32, 40, 48, 56, 33, 41,
     18, 26,  3, 11,  4, 12, 19, 27,
     34, 42, 49, 57, 50, 58, 35, 43,
     20, 28,  5, 13,  6, 14, 21, 29,
     36, 44, 51, 59, 52, 60, 37, 45,
     22, 30,  7, 15, 23, 31, 38, 46,
     53, 61, 54, 62, 39, 47, 55, 63,
 };
 
0c1a9eda
 /* Alternate horizontal scan order: 64 entries, each a value in 0..63.
  * NOTE(review): entries are presumably raster positions (row * 8 + column)
  * inside an 8x8 block, listed in scan order -- confirm against the users of
  * this table. */
 const uint8_t ff_alternate_horizontal_scan[64] = {
115329f1
     0,  1,   2,  3,  8,  9, 16, 17,
e0eac44e
     10, 11,  4,  5,  6,  7, 15, 14,
115329f1
     13, 12, 19, 18, 24, 25, 32, 33,
e0eac44e
     26, 27, 20, 21, 22, 23, 28, 29,
115329f1
     30, 31, 34, 35, 40, 41, 48, 49,
e0eac44e
     42, 43, 36, 37, 38, 39, 44, 45,
115329f1
     46, 47, 50, 51, 56, 57, 58, 59,
e0eac44e
     52, 53, 54, 55, 60, 61, 62, 63,
 };
 
0c1a9eda
 /* Alternate vertical scan order: 64 entries, each a value in 0..63.
  * NOTE(review): entries are presumably raster positions (row * 8 + column)
  * inside an 8x8 block, listed in scan order -- confirm against the users of
  * this table. */
 const uint8_t ff_alternate_vertical_scan[64] = {
115329f1
     0,  8,  16, 24,  1,  9,  2, 10,
e0eac44e
     17, 25, 32, 40, 48, 56, 57, 49,
115329f1
     41, 33, 26, 18,  3, 11,  4, 12,
e0eac44e
     19, 27, 34, 42, 50, 58, 35, 43,
115329f1
     51, 59, 20, 28,  5, 13,  6, 14,
e0eac44e
     21, 29, 36, 44, 52, 60, 37, 45,
115329f1
     53, 61, 22, 30,  7, 15, 23, 31,
e0eac44e
     38, 46, 54, 62, 39, 47, 55, 63,
 };
 
b0368839
 /* Input permutation for the simple_idct_mmx.
  * ff_init_scantable_permutation() copies these 64 entries verbatim into the
  * caller's table when the permutation type is FF_SIMPLE_IDCT_PERM. */
 static const uint8_t simple_mmx_permutation[64]={
bb270c08
         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
b0368839
 };
 
0e956ba2
 /* Replacement for the low three bits of a coefficient index when the
  * permutation type is FF_SSE2_IDCT_PERM (see ff_init_scantable_permutation). */
 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 
6fee1b90
 /**
  * Fill a ScanTable from a source scan order and a permutation.
  *
  * @param permutation   table indexed by the values found in src_scantable
  * @param st            table to fill: scantable is set to src_scantable,
  *                      permutated[i] to permutation[src_scantable[i]], and
  *                      raster_end[i] to the largest permutated value seen at
  *                      positions 0..i
  * @param src_scantable 64-entry source scan order
  */
 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                                const uint8_t *src_scantable)
 {
     int pos;
     int highest = -1;

     st->scantable = src_scantable;

     /* Route every scan position through the permutation. */
     for (pos = 0; pos < 64; pos++)
         st->permutated[pos] = permutation[src_scantable[pos]];

     /* Running maximum of the permuted indices. */
     for (pos = 0; pos < 64; pos++) {
         const int idx = st->permutated[pos];

         if (idx > highest)
             highest = idx;
         st->raster_end[pos] = highest;
     }
 }
 
6fee1b90
 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
                                            int idct_permutation_type)
92fb52d9
 {
     int i;
 
     switch(idct_permutation_type){
     case FF_NO_IDCT_PERM:
         for(i=0; i<64; i++)
             idct_permutation[i]= i;
         break;
     case FF_LIBMPEG2_IDCT_PERM:
         for(i=0; i<64; i++)
             idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
         break;
     case FF_SIMPLE_IDCT_PERM:
         for(i=0; i<64; i++)
             idct_permutation[i]= simple_mmx_permutation[i];
         break;
     case FF_TRANSPOSE_IDCT_PERM:
         for(i=0; i<64; i++)
             idct_permutation[i]= ((i&7)<<3) | (i>>3);
         break;
     case FF_PARTTRANS_IDCT_PERM:
         for(i=0; i<64; i++)
             idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
         break;
     case FF_SSE2_IDCT_PERM:
         for(i=0; i<64; i++)
             idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
         break;
     default:
         av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
     }
 }
 
0c1a9eda
 /**
  * Sum the pixel values of a 16x16 block.
  *
  * @param pix       first pixel of the block
  * @param line_size distance in bytes between the starts of consecutive rows
  * @return sum of the 256 pixel values
  */
 static int pix_sum_c(uint8_t * pix, int line_size)
 {
     int total = 0;
     int row, col;

     for (row = 0; row < 16; row++) {
         const uint8_t *line = pix + row * line_size;

         for (col = 0; col < 16; col++)
             total += line[col];
     }
     return total;
 }
 
0c1a9eda
 static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
 {
     int s, i, j;
1d503957
     uint32_t *sq = ff_squareTbl + 256;
3aa102be
 
     s = 0;
     for (i = 0; i < 16; i++) {
bb270c08
         for (j = 0; j < 16; j += 8) {
e463f0e9
 #if 0
             s += sq[pix[0]];
             s += sq[pix[1]];
             s += sq[pix[2]];
             s += sq[pix[3]];
             s += sq[pix[4]];
             s += sq[pix[5]];
             s += sq[pix[6]];
             s += sq[pix[7]];
 #else
d9a9f50a
 #if HAVE_FAST_64BIT
bb270c08
             register uint64_t x=*(uint64_t*)pix;
             s += sq[x&0xff];
             s += sq[(x>>8)&0xff];
             s += sq[(x>>16)&0xff];
             s += sq[(x>>24)&0xff];
2a006cd3
             s += sq[(x>>32)&0xff];
             s += sq[(x>>40)&0xff];
             s += sq[(x>>48)&0xff];
             s += sq[(x>>56)&0xff];
 #else
bb270c08
             register uint32_t x=*(uint32_t*)pix;
             s += sq[x&0xff];
             s += sq[(x>>8)&0xff];
             s += sq[(x>>16)&0xff];
             s += sq[(x>>24)&0xff];
2a006cd3
             x=*(uint32_t*)(pix+4);
             s += sq[x&0xff];
             s += sq[(x>>8)&0xff];
             s += sq[(x>>16)&0xff];
             s += sq[(x>>24)&0xff];
 #endif
e463f0e9
 #endif
bb270c08
             pix += 8;
         }
         pix += line_size - 16;
3aa102be
     }
     return s;
 }
 
96711ecf
 /**
  * Byte-swap w 32-bit words from src into dst, in ascending index order.
  * A count of zero or less copies nothing.
  */
 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
     int n;

     for (n = 0; n < w; n++)
         dst[n] = av_bswap32(src[n]);
 }
3aa102be
 
381d37fd
 /**
  * Byte-swap len 16-bit words from src into dst, in ascending index order.
  *
  * A count of zero or less copies nothing, matching bswap_buf. The earlier
  * `while (len--)` form kept running past both buffers when len was negative.
  */
 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
 {
     int i;

     for (i = 0; i < len; i++)
         dst[i] = av_bswap16(src[i]);
 }
 
26efc54e
 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 {
     int s, i;
1d503957
     uint32_t *sq = ff_squareTbl + 256;
26efc54e
 
     s = 0;
     for (i = 0; i < h; i++) {
         s += sq[pix1[0] - pix2[0]];
         s += sq[pix1[1] - pix2[1]];
         s += sq[pix1[2] - pix2[2]];
         s += sq[pix1[3] - pix2[3]];
         pix1 += line_size;
         pix2 += line_size;
     }
     return s;
 }
 
bb198e19
 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
 {
     int s, i;
1d503957
     uint32_t *sq = ff_squareTbl + 256;
1457ab52
 
     s = 0;
bb198e19
     for (i = 0; i < h; i++) {
1457ab52
         s += sq[pix1[0] - pix2[0]];
         s += sq[pix1[1] - pix2[1]];
         s += sq[pix1[2] - pix2[2]];
         s += sq[pix1[3] - pix2[3]];
         s += sq[pix1[4] - pix2[4]];
         s += sq[pix1[5] - pix2[5]];
         s += sq[pix1[6] - pix2[6]];
         s += sq[pix1[7] - pix2[7]];
         pix1 += line_size;
         pix2 += line_size;
     }
     return s;
 }
 
bb198e19
 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48
 {
6b026927
     int s, i;
1d503957
     uint32_t *sq = ff_squareTbl + 256;
9c76bd48
 
     s = 0;
bb198e19
     for (i = 0; i < h; i++) {
6b026927
         s += sq[pix1[ 0] - pix2[ 0]];
         s += sq[pix1[ 1] - pix2[ 1]];
         s += sq[pix1[ 2] - pix2[ 2]];
         s += sq[pix1[ 3] - pix2[ 3]];
         s += sq[pix1[ 4] - pix2[ 4]];
         s += sq[pix1[ 5] - pix2[ 5]];
         s += sq[pix1[ 6] - pix2[ 6]];
         s += sq[pix1[ 7] - pix2[ 7]];
         s += sq[pix1[ 8] - pix2[ 8]];
         s += sq[pix1[ 9] - pix2[ 9]];
         s += sq[pix1[10] - pix2[10]];
         s += sq[pix1[11] - pix2[11]];
         s += sq[pix1[12] - pix2[12]];
         s += sq[pix1[13] - pix2[13]];
         s += sq[pix1[14] - pix2[14]];
         s += sq[pix1[15] - pix2[15]];
2a006cd3
 
6b026927
         pix1 += line_size;
         pix2 += line_size;
9c76bd48
     }
     return s;
 }
 
ac898759
 /**
  * Store the per-pixel difference s1 - s2 of two 8x8 pixel areas.
  *
  * @param block  output, 64 coefficients stored as 8 rows of 8
  * @param s1     minuend pixels
  * @param s2     subtrahend pixels
  * @param stride row stride in bytes, shared by s1 and s2
  */
 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                           const uint8_t *s2, int stride){
     int row, col;

     for (row = 0; row < 8; row++) {
         for (col = 0; col < 8; col++)
             block[col] = s1[col] - s2[col];
         s1    += stride;
         s2    += stride;
         block += 8;
     }
 }
 
ac898759
 /**
  * Write an 8x8 coefficient block to pixels, passing every value through
  * av_clip_uint8().
  *
  * @param block     64 coefficients stored as 8 rows of 8
  * @param pixels    destination
  * @param line_size destination row stride in bytes
  */
 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
 {
     int row, col;

     for (row = 0; row < 8; row++) {
         for (col = 0; col < 8; col++)
             pixels[col] = av_clip_uint8(block[col]);
         pixels += line_size;
         block  += 8;
     }
 }
 
ac898759
 /**
  * Write the top-left 4x4 part of a coefficient block to pixels, passing every
  * value through av_clip_uint8().
  *
  * @param block     coefficients; consecutive rows are 8 entries apart
  * @param pixels    destination
  * @param line_size destination row stride in bytes
  */
 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
 {
     int row, col;

     for (row = 0; row < 4; row++) {
         for (col = 0; col < 4; col++)
             pixels[col] = av_clip_uint8(block[col]);
         pixels += line_size;
         block  += 8;
     }
 }
 
ac898759
 /**
  * Write the top-left 2x2 part of a coefficient block to pixels, passing every
  * value through av_clip_uint8().
  *
  * @param block     coefficients; consecutive rows are 8 entries apart
  * @param pixels    destination
  * @param line_size destination row stride in bytes
  */
 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
 {
     int row, col;

     for (row = 0; row < 2; row++) {
         for (col = 0; col < 2; col++)
             pixels[col] = av_clip_uint8(block[col]);
         pixels += line_size;
         block  += 8;
     }
 }
 
88bd7fdc
 static void put_signed_pixels_clamped_c(const int16_t *block,
e62e455f
                                         uint8_t *av_restrict pixels,
dbc9f84e
                                         int line_size)
f9ed9d85
 {
     int i, j;
 
     for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
             if (*block < -128)
                 *pixels = 0;
             else if (*block > 127)
                 *pixels = 255;
             else
                 *pixels = (uint8_t)(*block + 128);
             block++;
             pixels++;
         }
         pixels += (line_size - 8);
     }
 }
 
7ff1a4b1
 /**
  * Add an 8x8 coefficient block onto pixels without clamping; each result is
  * stored back into a uint8_t.
  *
  * @param pixels    destination, updated in place
  * @param block     64 coefficients stored as 8 rows of 8
  * @param line_size destination row stride in bytes
  */
 static void add_pixels8_c(uint8_t *av_restrict pixels,
                           int16_t *block,
                           int line_size)
 {
     int row, col;

     for (row = 0; row < 8; row++) {
         for (col = 0; col < 8; col++)
             pixels[col] += block[col];
         pixels += line_size;
         block  += 8;
     }
 }
 
ac898759
 /**
  * Add an 8x8 coefficient block onto pixels, passing every sum through
  * av_clip_uint8().
  *
  * @param block     64 coefficients stored as 8 rows of 8
  * @param pixels    destination, updated in place
  * @param line_size destination row stride in bytes
  */
 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
 {
     int row, col;

     for (row = 0; row < 8; row++) {
         for (col = 0; col < 8; col++)
             pixels[col] = av_clip_uint8(pixels[col] + block[col]);
         pixels += line_size;
         block  += 8;
     }
 }
178fcca8
 
ac898759
 /**
  * Add the top-left 4x4 part of a coefficient block onto pixels, passing every
  * sum through av_clip_uint8().
  *
  * @param block     coefficients; consecutive rows are 8 entries apart
  * @param pixels    destination, updated in place
  * @param line_size destination row stride in bytes
  */
 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
                           int line_size)
 {
     int row, col;

     for (row = 0; row < 4; row++) {
         for (col = 0; col < 4; col++)
             pixels[col] = av_clip_uint8(pixels[col] + block[col]);
         pixels += line_size;
         block  += 8;
     }
 }
 
ac898759
 /**
  * Add the top-left 2x2 part of a coefficient block onto pixels, passing every
  * sum through av_clip_uint8().
  *
  * @param block     coefficients; consecutive rows are 8 entries apart
  * @param pixels    destination, updated in place
  * @param line_size destination row stride in bytes
  */
 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
                           int line_size)
 {
     int row, col;

     for (row = 0; row < 2; row++) {
         for (col = 0; col < 2; col++)
             pixels[col] = av_clip_uint8(pixels[col] + block[col]);
         pixels += line_size;
         block  += 8;
     }
 }
 
88bd7fdc
 /**
  * Return the sum of FFABS() over the 64 coefficients of block.
  */
 static int sum_abs_dctelem_c(int16_t *block)
 {
     const int16_t *cur = block;
     const int16_t *const end = block + 64;
     int total = 0;

     while (cur < end) {
         total += FFABS(*cur);
         cur++;
     }
     return total;
 }
 
342c7dfd
 /**
  * Set h rows of 16 bytes each to value.
  *
  * @param block     first byte of the first row
  * @param value     byte to store
  * @param line_size distance in bytes between the starts of consecutive rows
  * @param h         number of rows; nothing is written when h <= 0
  */
 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
     uint8_t *row = block;
     int remaining = h;

     while (remaining-- > 0) {
         memset(row, value, 16);
         row += line_size;
     }
 }
 
 /**
  * Set h rows of 8 bytes each to value.
  *
  * @param block     first byte of the first row
  * @param value     byte to store
  * @param line_size distance in bytes between the starts of consecutive rows
  * @param h         number of rows; nothing is written when h <= 0
  */
 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 {
     uint8_t *row = block;
     int remaining = h;

     while (remaining-- > 0) {
         memset(row, value, 8);
         row += line_size;
     }
 }
 
de6d9b64
 /* Rounded averages of two and of four values: (sum + half) >> log2(count).
  * Every argument is parenthesized so that an argument containing an operator
  * of lower precedence than '+' (for example '&', '<<' or '?:') is averaged as
  * a whole. Each argument is evaluated exactly once. */
 #define avg2(a,b) (((a)+(b)+1)>>1)
 #define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
 
0c1a9eda
 /**
  * Bilinear interpolation of a block 8 pixels wide and h rows high at a
  * constant sub-pixel offset given in 1/16 units.
  *
  * Each output pixel is the weighted sum of the source pixel, its right
  * neighbour, the pixel below and the pixel below-right, plus rounder,
  * shifted right by 8. The four weights sum to 256.
  *
  * @param dst     destination
  * @param src     source; 9 pixels per row and h + 1 rows are read
  * @param stride  row stride in bytes, shared by dst and src
  * @param h       number of output rows
  * @param x16     horizontal offset, multiplies the right-hand neighbours
  * @param y16     vertical offset, multiplies the lower neighbours
  * @param rounder value added before the shift
  */
 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
 {
     const int w_tl = (16 - x16) * (16 - y16);
     const int w_tr = (     x16) * (16 - y16);
     const int w_bl = (16 - x16) * (     y16);
     const int w_br = (     x16) * (     y16);
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 8; col++) {
             dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                         w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                         rounder) >> 8;
         }
         dst += stride;
         src += stride;
     }
 }
 
703c8195
 /**
  * Motion compensation with a per-pixel source position, producing a block
  * 8 pixels wide and h rows high.
  *
  * The source position starts at (ox, oy) for each row, advances by (dxx, dyx)
  * per output pixel and the row start advances by (dxy, dyy) per output row.
  * A position is shifted right by 16, its low `shift` bits are taken as the
  * interpolation fraction and the remaining bits as the integer sample
  * coordinate.
  *
  * @param dst    destination, written at dst[y*stride + x]
  * @param src    source samples, addressed as src[x + y*stride]
  * @param stride row stride, shared by dst and src
  * @param h      number of output rows
  * @param shift  number of fraction bits; the result is shifted right by 2*shift
  * @param r      rounding term added before the final shift
  * @param width  source width; coordinates are tested and clipped against width-1
  * @param height source height; coordinates are tested and clipped against height-1
  */
 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
073b013d
                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
 {
     int y, vx, vy;
     const int s= 1<<shift;
115329f1
 
073b013d
     /* from here on width/height hold the largest valid coordinate */
     width--;
     height--;
 
     for(y=0; y<h; y++){
         int x;
 
         vx= ox;
         vy= oy;
         for(x=0; x<8; x++){ //XXX FIXME optimize
             int src_x, src_y, frac_x, frac_y, index;
 
             src_x= vx>>16;
             src_y= vy>>16;
             frac_x= src_x&(s-1);
             frac_y= src_y&(s-1);
             src_x>>=shift;
             src_y>>=shift;
115329f1
 
073b013d
             /* the unsigned casts make negative coordinates fail the range test */
             if((unsigned)src_x < width){
                 if((unsigned)src_y < height){
                     /* both coordinates in range: blend the four neighbouring samples */
                     index= src_x + src_y*stride;
                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                            + src[index       +1]*   frac_x )*(s-frac_y)
                                         + (  src[index+stride  ]*(s-frac_x)
                                            + src[index+stride+1]*   frac_x )*   frac_y
                                         + r)>>(shift*2);
                 }else{
                     /* only y out of range: clip y, blend horizontally */
f66e4f5f
                     index= src_x + av_clip(src_y, 0, height)*stride;
115329f1
                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
073b013d
                                           + src[index       +1]*   frac_x )*s
                                         + r)>>(shift*2);
                 }
             }else{
                 if((unsigned)src_y < height){
                     /* only x out of range: clip x, blend vertically */
f66e4f5f
                     index= av_clip(src_x, 0, width) + src_y*stride;
115329f1
                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
073b013d
                                            + src[index+stride  ]*   frac_y )*s
                                         + r)>>(shift*2);
                 }else{
                     /* both out of range: copy the sample at the clipped position */
f66e4f5f
                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
073b013d
                     dst[y*stride + x]=    src[index         ];
                 }
             }
115329f1
 
073b013d
             vx+= dxx;
             vy+= dyx;
         }
         ox += dxy;
         oy += dyy;
     }
 }
669ac79c
 
 /* Full-position case: forward to the put_pixels*_8_c routine matching the
  * block width (2, 4, 8 or 16). Any other width does nothing. */
 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     if (width == 2)
         put_pixels2_8_c (dst, src, stride, height);
     else if (width == 4)
         put_pixels4_8_c (dst, src, stride, height);
     else if (width == 8)
         put_pixels8_8_c (dst, src, stride, height);
     else if (width == 16)
         put_pixels16_8_c(dst, src, stride, height);
 }
 
 /* Horizontal blend with weights 2:1 (current pixel : right neighbour).
  * The weighted sum plus 1 is multiplied by 683 and shifted right by 11;
  * 683/2048 is approximately 1/3. */
 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = 2*src[col] + src[col+1] + 1;

             dst[col] = (683*acc) >> 11;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Horizontal blend with weights 1:2 (current pixel : right neighbour).
  * The weighted sum plus 1 is multiplied by 683 and shifted right by 11;
  * 683/2048 is approximately 1/3. */
 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = src[col] + 2*src[col+1] + 1;

             dst[col] = (683*acc) >> 11;
         }
         src += stride;
         dst += stride;
     }
 }
115329f1
 
669ac79c
 /* Vertical blend with weights 2:1 (current pixel : pixel one row below).
  * The weighted sum plus 1 is multiplied by 683 and shifted right by 11;
  * 683/2048 is approximately 1/3. */
 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = 2*src[col] + src[col+stride] + 1;

             dst[col] = (683*acc) >> 11;
         }
         src += stride;
         dst += stride;
     }
 }
115329f1
 
669ac79c
 /* Two-dimensional blend of a 2x2 neighbourhood with weights
  *   4 3
  *   3 2
  * (top row first). The weighted sum plus 6 is multiplied by 2731 and shifted
  * right by 15; 2731/32768 is approximately 1/12. */
 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = 4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6;

             dst[col] = (2731*acc) >> 15;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Two-dimensional blend of a 2x2 neighbourhood with weights
  *   3 2
  *   4 3
  * (top row first). The weighted sum plus 6 is multiplied by 2731 and shifted
  * right by 15; 2731/32768 is approximately 1/12. */
 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = 3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6;

             dst[col] = (2731*acc) >> 15;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Vertical blend with weights 1:2 (current pixel : pixel one row below).
  * The weighted sum plus 1 is multiplied by 683 and shifted right by 11;
  * 683/2048 is approximately 1/3. */
 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = src[col] + 2*src[col+stride] + 1;

             dst[col] = (683*acc) >> 11;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Two-dimensional blend of a 2x2 neighbourhood with weights
  *   3 4
  *   2 3
  * (top row first). The weighted sum plus 6 is multiplied by 2731 and shifted
  * right by 15; 2731/32768 is approximately 1/12. */
 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = 3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6;

             dst[col] = (2731*acc) >> 15;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Two-dimensional blend of a 2x2 neighbourhood with weights
  *   2 3
  *   3 4
  * (top row first). The weighted sum plus 6 is multiplied by 2731 and shifted
  * right by 15; 2731/32768 is approximately 1/12. */
 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int acc = 2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6;

             dst[col] = (2731*acc) >> 15;
         }
         src += stride;
         dst += stride;
     }
 }
da3b9756
 
 /* Full-position case: forward to the avg_pixels*_8_c routine matching the
  * block width (2, 4, 8 or 16). Any other width does nothing. */
 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     if (width == 2)
         avg_pixels2_8_c (dst, src, stride, height);
     else if (width == 4)
         avg_pixels4_8_c (dst, src, stride, height);
     else if (width == 8)
         avg_pixels8_8_c (dst, src, stride, height);
     else if (width == 16)
         avg_pixels16_8_c(dst, src, stride, height);
 }
 
 /* Horizontal 2:1 blend (current pixel : right neighbour), scaled by
  * 683 >> 11 (about 1/3), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Horizontal 1:2 blend (current pixel : right neighbour), scaled by
  * 683 >> 11 (about 1/3), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
115329f1
 
da3b9756
 /* Vertical 2:1 blend (current pixel : pixel one row below), scaled by
  * 683 >> 11 (about 1/3), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
115329f1
 
da3b9756
 /* 2x2 blend with weights 4 3 / 3 2 (top row first), scaled by 2731 >> 15
  * (about 1/12), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (2731*(4*src[col] + 3*src[col+1] + 3*src[col+stride] + 2*src[col+stride+1] + 6)) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* 2x2 blend with weights 3 2 / 4 3 (top row first), scaled by 2731 >> 15
  * (about 1/12), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (2731*(3*src[col] + 2*src[col+1] + 4*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* Vertical 1:2 blend (current pixel : pixel one row below), scaled by
  * 683 >> 11 (about 1/3), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* 2x2 blend with weights 3 4 / 2 3 (top row first), scaled by 2731 >> 15
  * (about 1/12), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (2731*(3*src[col] + 4*src[col+1] + 2*src[col+stride] + 3*src[col+stride+1] + 6)) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
 
 /* 2x2 blend with weights 2 3 / 3 4 (top row first), scaled by 2731 >> 15
  * (about 1/12), then averaged with the existing destination pixel,
  * rounding up. */
 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
     int row, col;

     for (row = 0; row < height; row++) {
         for (col = 0; col < width; col++) {
             const int pred = (2731*(2*src[col] + 3*src[col+1] + 3*src[col+stride] + 4*src[col+stride+1] + 6)) >> 15;

             dst[col] = (dst[col] + pred + 1) >> 1;
         }
         src += stride;
         dst += stride;
     }
 }
669ac79c
 
b3184779
 #define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
c09da45f
     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
     int i;\
     for(i=0; i<h; i++)\
     {\
         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
         dst+=dstStride;\
         src+=srcStride;\
     }\
44eb4951
 }\
 \
0c1a9eda
 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953
     const int w=8;\
c09da45f
     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
     int i;\
     for(i=0; i<w; i++)\
     {\
         const int src0= src[0*srcStride];\
         const int src1= src[1*srcStride];\
         const int src2= src[2*srcStride];\
         const int src3= src[3*srcStride];\
         const int src4= src[4*srcStride];\
         const int src5= src[5*srcStride];\
         const int src6= src[6*srcStride];\
         const int src7= src[7*srcStride];\
         const int src8= src[8*srcStride];\
         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
         dst++;\
         src++;\
     }\
 }\
 \
0c1a9eda
 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
c09da45f
     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
     int i;\
826f429a
     \
b3184779
     for(i=0; i<h; i++)\
     {\
         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
         dst+=dstStride;\
         src+=srcStride;\
     }\
 }\
 \
0c1a9eda
 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
c09da45f
     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
     int i;\
826f429a
     const int w=16;\
b3184779
     for(i=0; i<w; i++)\
     {\
         const int src0= src[0*srcStride];\
         const int src1= src[1*srcStride];\
         const int src2= src[2*srcStride];\
         const int src3= src[3*srcStride];\
         const int src4= src[4*srcStride];\
         const int src5= src[5*srcStride];\
         const int src6= src[6*srcStride];\
         const int src7= src[7*srcStride];\
         const int src8= src[8*srcStride];\
         const int src9= src[9*srcStride];\
         const int src10= src[10*srcStride];\
         const int src11= src[11*srcStride];\
         const int src12= src[12*srcStride];\
         const int src13= src[13*srcStride];\
         const int src14= src[14*srcStride];\
         const int src15= src[15*srcStride];\
         const int src16= src[16*srcStride];\
         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
         dst++;\
         src++;\
     }\
 }\
 \
a8b60158
 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t half[64];\
b3184779
     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
44eb4951
 }\
 \
a8b60158
 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
b3184779
     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
 }\
 \
a8b60158
 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t half[64];\
b3184779
     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
 }\
 \
a8b60158
 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t half[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
44eb4951
 }\
 \
a8b60158
 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
db794953
     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
 }\
 \
a8b60158
 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t half[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
44eb4951
 }\
a8b60158
 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfV[64];\
     uint8_t halfHV[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfHV[64];\
db794953
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
8dbe5856
     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
db794953
 }\
a8b60158
 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfV[64];\
     uint8_t halfHV[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfHV[64];\
db794953
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
8dbe5856
     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
db794953
 }\
a8b60158
 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfV[64];\
     uint8_t halfHV[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfHV[64];\
db794953
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
8dbe5856
     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
db794953
 }\
a8b60158
 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfV[64];\
     uint8_t halfHV[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfHV[64];\
db794953
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
8dbe5856
     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
db794953
 }\
a8b60158
 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t halfH[72];\
     uint8_t halfHV[64];\
b3184779
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t halfH[72];\
     uint8_t halfHV[64];\
b3184779
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951
 }\
a8b60158
 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfV[64];\
     uint8_t halfHV[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
db794953
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
8dbe5856
     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
db794953
     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
 }\
a8b60158
 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
     uint8_t halfV[64];\
     uint8_t halfHV[64];\
b3184779
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
8dbe5856
     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951
 }\
a8b60158
 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[16*9];\
     uint8_t halfH[72];\
db794953
     copy_block9(full, src, 16, stride, 9);\
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
8dbe5856
     put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
db794953
     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
 }\
a8b60158
 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t halfH[72];\
b3184779
     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953
     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779
 }\
 \
a8b60158
 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t half[256];\
b3184779
     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
b3184779
 }\
 \
a8b60158
 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
b3184779
     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951
 }\
b3184779
 \
a8b60158
 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t half[256];\
b3184779
     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
b3184779
 }\
 \
a8b60158
 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t half[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
b3184779
 }\
 \
a8b60158
 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
826f429a
     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
 }\
 \
a8b60158
 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t half[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
b3184779
 }\
a8b60158
 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfV[256];\
     uint8_t halfHV[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfHV[256];\
db794953
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
8dbe5856
     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
db794953
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
db794953
 }\
a8b60158
 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfV[256];\
     uint8_t halfHV[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfHV[256];\
db794953
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
8dbe5856
     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
db794953
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
db794953
 }\
a8b60158
 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfV[256];\
     uint8_t halfHV[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfHV[256];\
db794953
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
8dbe5856
     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
db794953
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
db794953
 }\
a8b60158
 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfV[256];\
     uint8_t halfHV[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfHV[256];\
db794953
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
8dbe5856
     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
db794953
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
db794953
 }\
a8b60158
 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t halfH[272];\
     uint8_t halfHV[256];\
b3184779
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t halfH[272];\
     uint8_t halfHV[256];\
b3184779
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
b3184779
 }\
a8b60158
 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfV[256];\
     uint8_t halfHV[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
db794953
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
8dbe5856
     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
db794953
     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 }\
a8b60158
 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
     uint8_t halfV[256];\
     uint8_t halfHV[256];\
b3184779
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
8dbe5856
     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
b3184779
 }\
a8b60158
 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t full[24*17];\
     uint8_t halfH[272];\
db794953
     copy_block17(full, src, 24, stride, 17);\
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
8dbe5856
     put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
db794953
     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
 }\
a8b60158
 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
 {\
0c1a9eda
     uint8_t halfH[272];\
b3184779
     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a
     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457
 }
44eb4951
 
b3184779
 /*
  * Store operators handed to QPEL_MC as its last argument.
  * "b" is the raw filter sum; every operator adds a bias (16, or 15 in the
  * no_rnd forms), shifts right by 5 and uses the result as an index into
  * "cm", which has to be a variable in scope wherever the operator expands.
  *   op_put, op_put_no_rnd: overwrite "a" with the looked-up value.
  *   op_avg, op_avg_no_rnd: replace "a" with half the sum of its previous
  *                          value and the looked-up value; the rounding form
  *                          adds 1 before halving.
  * Note that "a" is evaluated more than once, so it must be free of side
  * effects.
  */
 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
 #define op_put(a, b) a = cm[((b) + 16)>>5]
 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
 
 /*
  * Generate the qpel function families with prefixes put_, put_no_rnd_ and
  * avg_. The third argument is pasted between "put" and "mpeg4_qpel..." to
  * pick the helper filters used for intermediate buffers, so the avg_ family
  * uses the put_mpeg4_* helpers for its temporaries.
  * NOTE(review): the meaning of the first argument (0/1) is defined on the
  * QPEL_MC header line, which is not part of this excerpt.
  * op_avg_no_rnd is defined above but not passed to any active instantiation.
  */
 QPEL_MC(0, put_       , _       , op_put)
 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
 QPEL_MC(0, avg_       , _       , op_avg)
 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
 /* The operators are only meant for the instantiations above. */
 #undef op_avg
 #undef op_avg_no_rnd
 #undef op_put
 #undef op_put_no_rnd
44eb4951
 
db4e4f76
 /**
  * Fixed-height entry point: forwards dst, src and stride to
  * put_pixels8_8_c() with the row count set to 8.
  */
 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
6d25c9db
     put_pixels8_8_c(dst, src, stride, 8);
9d40f710
 }
db4e4f76
 /**
  * Fixed-height entry point: forwards dst, src and stride to
  * avg_pixels8_8_c() with the row count set to 8.
  */
 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
6d25c9db
     avg_pixels8_8_c(dst, src, stride, 8);
9d40f710
 }
db4e4f76
 /**
  * Fixed-height entry point: forwards dst, src and stride to
  * put_pixels16_8_c() with the row count set to 16.
  */
 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
6d25c9db
     put_pixels16_8_c(dst, src, stride, 16);
9d40f710
 }
db4e4f76
 /**
  * Fixed-height entry point: forwards dst, src and stride to
  * avg_pixels16_8_c() with the row count set to 16.
  */
 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
6d25c9db
     avg_pixels16_8_c(dst, src, stride, 16);
9d40f710
 }
 
3d1b1caa
 /*
  * The mc00 names are not generated by QPEL_MC in the code shown here; they
  * are plain aliases for the fixed-size ff_{put,avg}_pixels functions.
  * The put_no_rnd_ aliases resolve to the same functions as the put_ ones.
  */
 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
 #define put_qpel16_mc00_c ff_put_pixels16x16_c
 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
9d40f710
 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
3d1b1caa
 
1457ab52
 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
c09da45f
     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
     int i;
 
     for(i=0; i<h; i++){
         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
         dst+=dstStride;
115329f1
         src+=srcStride;
1457ab52
     }
 }
 
b250f9c6
 #if CONFIG_RV40_DECODER
a8b60158
 /** Forwards to put_pixels16_xy2_8_c() with a fixed row count of 16. */
 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
8dbe5856
     put_pixels16_xy2_8_c(dst, src, stride, 16);
2d8a0815
 }
a8b60158
 /** Forwards to avg_pixels16_xy2_8_c() with a fixed row count of 16. */
 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
8dbe5856
     avg_pixels16_xy2_8_c(dst, src, stride, 16);
2d8a0815
 }
a8b60158
 /** Forwards to put_pixels8_xy2_8_c() with a fixed row count of 8. */
 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
8dbe5856
     put_pixels8_xy2_8_c(dst, src, stride, 8);
2d8a0815
 }
a8b60158
 /** Forwards to avg_pixels8_xy2_8_c() with a fixed row count of 8. */
 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
8dbe5856
     avg_pixels8_xy2_8_c(dst, src, stride, 8);
2d8a0815
 }
 #endif /* CONFIG_RV40_DECODER */
 
5d50fcc5
 #if CONFIG_DIRAC_DECODER
 /*
  * DIRAC_MC(OPNAME) defines nine functions named
  *   ff_<OPNAME>_dirac_pixels{8,16,32}{,_l2,_l4}_c(dst, src, stride, h)
  * where src is an array of source pointers:
  *   - the plain variants pass src[0] to <OPNAME>_pixelsN_8_c();
  *   - the _l2 variants pass src[0] and src[1] to <OPNAME>_pixelsN_l2_8();
  *   - the _l4 variants pass src[0]..src[3] to <OPNAME>_pixelsN_l4_8().
  * The same stride is used for the destination and for every source.
  * The 32-wide variants are built from two 16-wide calls, at byte offsets
  * 0 and 16. None of the generated functions reads src[4].
  * The macro is instantiated below for "put" and "avg".
  */
 #define DIRAC_MC(OPNAME)\
 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
      OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
     OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
     OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
 }\
 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
 {\
     OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
     OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
 }
 DIRAC_MC(put)
 DIRAC_MC(avg)
 #endif
 
1457ab52
 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
c09da45f
     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
     int i;
 
     for(i=0; i<w; i++){
         const int src_1= src[ -srcStride];
         const int src0 = src[0          ];
         const int src1 = src[  srcStride];
         const int src2 = src[2*srcStride];
         const int src3 = src[3*srcStride];
         const int src4 = src[4*srcStride];
         const int src5 = src[5*srcStride];
         const int src6 = src[6*srcStride];
         const int src7 = src[7*srcStride];
         const int src8 = src[8*srcStride];
         const int src9 = src[9*srcStride];
         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
         src++;
         dst++;
     }
 }
 
a8b60158
 /**
  * Runs the horizontal filter over an 8x8 block into a temporary buffer and
  * hands that buffer, together with the unfiltered block at src, to
  * put_pixels8_l2_8() which writes dst.
  * NOTE(review): put_pixels8_l2_8() is defined elsewhere; presumably it
  * averages its two inputs.
  */
 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
     uint8_t filtered[8 * 8]; /* tightly packed, 8 bytes per row */

     wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
     put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
 }
 
a8b60158
 /**
  * Writes the horizontally filtered 8x8 block straight to dst: calls
  * wmv2_mspel8_h_lowpass() for 8 rows, using stride for both source and
  * destination.
  */
 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
1457ab52
     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
 }
 
a8b60158
 /**
  * Runs the horizontal filter over an 8x8 block into a temporary buffer and
  * hands that buffer, together with the unfiltered block one pixel to the
  * right of src, to put_pixels8_l2_8() which writes dst.
  * NOTE(review): put_pixels8_l2_8() is defined elsewhere; presumably it
  * averages its two inputs.
  */
 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
     uint8_t filtered[8 * 8]; /* tightly packed, 8 bytes per row */

     wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
     put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
 }
 
a8b60158
 /**
  * Writes the vertically filtered 8x8 block straight to dst: calls
  * wmv2_mspel8_v_lowpass() for 8 columns, using stride for both source and
  * destination.
  */
 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
1457ab52
     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
 }
 
a8b60158
 /**
  * Builds two 8x8 intermediates and passes both to put_pixels8_l2_8():
  *   - the vertical filter applied directly to src;
  *   - the vertical filter applied to a horizontally filtered copy. The
  *     horizontal pass covers 11 rows starting one row above src, because the
  *     vertical filter reads rows -1..9 around its start row.
  * NOTE(review): put_pixels8_l2_8() is defined elsewhere; presumably it
  * averages its two inputs.
  */
 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
     uint8_t h_rows[8 * 11]; /* horizontal pass, rows -1..9, 8 bytes per row */
     uint8_t v_only[8 * 8];
     uint8_t h_then_v[8 * 8];

     /* Vertical-only intermediate; does not depend on h_rows. */
     wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);

     /* Horizontal pass first, then vertical from its second row (source row 0). */
     wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
     wmv2_mspel8_v_lowpass(h_then_v, h_rows + 8, 8, 8, 8);

     put_pixels8_l2_8(dst, v_only, h_then_v, stride, 8, 8, 8);
 }
a8b60158
 /**
  * Builds two 8x8 intermediates and passes both to put_pixels8_l2_8():
  *   - the vertical filter applied to the block one pixel to the right of src;
  *   - the vertical filter applied to a horizontally filtered copy. The
  *     horizontal pass covers 11 rows starting one row above src, because the
  *     vertical filter reads rows -1..9 around its start row.
  * NOTE(review): put_pixels8_l2_8() is defined elsewhere; presumably it
  * averages its two inputs.
  */
 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
     uint8_t h_rows[8 * 11]; /* horizontal pass, rows -1..9, 8 bytes per row */
     uint8_t v_only[8 * 8];
     uint8_t h_then_v[8 * 8];

     /* Vertical-only intermediate; does not depend on h_rows. */
     wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);

     /* Horizontal pass first, then vertical from its second row (source row 0). */
     wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
     wmv2_mspel8_v_lowpass(h_then_v, h_rows + 8, 8, 8, 8);

     put_pixels8_l2_8(dst, v_only, h_then_v, stride, 8, 8, 8);
 }
a8b60158
 /**
  * Applies the horizontal filter to 11 rows starting one row above src, then
  * applies the vertical filter to that result and writes the 8x8 output to
  * dst. The extra rows are needed because the vertical filter reads rows
  * -1..9 around its start row.
  */
 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
 {
     uint8_t h_rows[8 * 11]; /* horizontal pass, rows -1..9, 8 bytes per row */

     wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
     /* Start at the second buffered row, which corresponds to source row 0. */
     wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
 }
 
bb198e19
 /**
  * Sum of absolute differences between two blocks that are 16 pixels wide
  * and h rows tall.
  *
  * @param v         not used by this function
  * @param pix1      first block
  * @param pix2      second block
  * @param line_size distance between consecutive rows, shared by both blocks
  * @param h         number of rows
  * @return the accumulated absolute difference (0 when h <= 0)
  */
 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     int sum = 0;
     int row, x;

     for (row = 0; row < h; row++) {
         for (x = 0; x < 16; x++)
             sum += abs(pix1[x] - pix2[x]);
         pix1 += line_size;
         pix2 += line_size;
     }
     return sum;
 }
 
bb198e19
 /**
  * Sum of absolute differences between pix1 and a horizontally interpolated
  * version of pix2, over a block 16 pixels wide and h rows tall.
  *
  * Each pix1 pixel is compared against avg2() of the pix2 pixel at the same
  * column and its right-hand neighbour, so columns 0..16 of pix2 are read.
  *
  * @param v         not used by this function
  * @param pix1      reference block
  * @param pix2      block to interpolate
  * @param line_size distance between consecutive rows, shared by both blocks
  * @param h         number of rows
  * @return the accumulated absolute difference
  */
 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     int sum = 0;
     int row, x;

     for (row = 0; row < h; row++) {
         for (x = 0; x < 16; x++)
             sum += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
         pix1 += line_size;
         pix2 += line_size;
     }
     return sum;
 }
 
bb198e19
 /**
  * Sum of absolute differences between pix1 and a vertically interpolated
  * version of pix2, over a block 16 pixels wide and h rows tall.
  *
  * Each pix1 pixel is compared against avg2() of the pix2 pixel at the same
  * position and the one directly below it, so h + 1 rows of pix2 are read.
  *
  * @param v         not used by this function
  * @param pix1      reference block
  * @param pix2      block to interpolate
  * @param line_size distance between consecutive rows, shared by both blocks
  * @param h         number of rows
  * @return the accumulated absolute difference
  */
 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     uint8_t *below = pix2 + line_size; /* row underneath the current pix2 row */
     int sum = 0;
     int row, x;

     for (row = 0; row < h; row++) {
         for (x = 0; x < 16; x++)
             sum += abs(pix1[x] - avg2(pix2[x], below[x]));
         pix1  += line_size;
         pix2  += line_size;
         below += line_size;
     }
     return sum;
 }
 
bb198e19
 /**
  * Sum of absolute differences over a 16-pixel-wide block, where every
  * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the pixel, its
  * right neighbour, and the two pixels directly below those).
  *
  * Seventeen bytes per row and one extra row of the reference are read.
  *
  * @param v         unused
  * @param pix1      first block
  * @param pix2      second (reference) block
  * @param line_size offset in bytes between consecutive rows of both blocks
  * @param h         number of rows to process
  * @return accumulated absolute difference
  */
 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     uint8_t *next_row = pix2 + line_size;
     int sad = 0;
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 16; col++)
             sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                         next_row[col], next_row[col + 1]));
         pix1     += line_size;
         pix2     += line_size;
         next_row += line_size;
     }
     return sad;
 }
 
bb198e19
 /**
  * Sum of absolute differences between two 8-pixel-wide blocks.
  *
  * @param v         unused
  * @param pix1      first block
  * @param pix2      second block
  * @param line_size offset in bytes between consecutive rows of both blocks
  * @param h         number of rows to process
  * @return accumulated absolute difference
  */
 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     int sad = 0;
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 8; col++)
             sad += abs(pix1[col] - pix2[col]);
         pix1 += line_size;
         pix2 += line_size;
     }
     return sad;
 }
 
bb198e19
 /**
  * Sum of absolute differences over an 8-pixel-wide block, where every
  * reference sample is avg2() of a pix2 pixel and its right-hand neighbour.
  *
  * Nine bytes (pix2[0] .. pix2[8]) are read from each reference row.
  *
  * @param v         unused
  * @param pix1      first block
  * @param pix2      second (reference) block
  * @param line_size offset in bytes between consecutive rows of both blocks
  * @param h         number of rows to process
  * @return accumulated absolute difference
  */
 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     int sad = 0;
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 8; col++)
             sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
         pix1 += line_size;
         pix2 += line_size;
     }
     return sad;
 }
 
bb198e19
 /**
  * Sum of absolute differences over an 8-pixel-wide block, where every
  * reference sample is avg2() of a pix2 pixel and the pixel one row below it.
  *
  * Row h of the reference (one row past the block) is read as well.
  *
  * @param v         unused
  * @param pix1      first block
  * @param pix2      second (reference) block
  * @param line_size offset in bytes between consecutive rows of both blocks
  * @param h         number of rows to process
  * @return accumulated absolute difference
  */
 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     uint8_t *next_row = pix2 + line_size;
     int sad = 0;
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 8; col++)
             sad += abs(pix1[col] - avg2(pix2[col], next_row[col]));
         pix1     += line_size;
         pix2     += line_size;
         next_row += line_size;
     }
     return sad;
 }
 
bb198e19
 /**
  * Sum of absolute differences over an 8-pixel-wide block, where every
  * reference sample is avg4() of a 2x2 neighbourhood of pix2 (the pixel, its
  * right neighbour, and the two pixels directly below those).
  *
  * Nine bytes per row and one extra row of the reference are read.
  *
  * @param v         unused
  * @param pix1      first block
  * @param pix2      second (reference) block
  * @param line_size offset in bytes between consecutive rows of both blocks
  * @param h         number of rows to process
  * @return accumulated absolute difference
  */
 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 {
     uint8_t *next_row = pix2 + line_size;
     int sad = 0;
     int row, col;

     for (row = 0; row < h; row++) {
         for (col = 0; col < 8; col++)
             sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                         next_row[col], next_row[col + 1]));
         pix1     += line_size;
         pix2     += line_size;
         next_row += line_size;
     }
     return sad;
 }
 
bf4e3bd2
 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
     MpegEncContext *c = v;
e6a2ac34
     int score1=0;
     int score2=0;
     int x,y;
d4c5d2ad
 
e6a2ac34
     for(y=0; y<h; y++){
         for(x=0; x<16; x++){
             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
         }
         if(y+1<h){
             for(x=0; x<15; x++){
c26abfa5
                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
e6a2ac34
                              - s1[x+1] + s1[x+1+stride])
c26abfa5
                         -FFABS(  s2[x  ] - s2[x  +stride]
e6a2ac34
                              - s2[x+1] + s2[x+1+stride]);
             }
         }
         s1+= stride;
         s2+= stride;
     }
d4c5d2ad
 
c26abfa5
     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
     else  return score1 + FFABS(score2)*8;
e6a2ac34
 }
 
bf4e3bd2
 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
     MpegEncContext *c = v;
e6a2ac34
     int score1=0;
     int score2=0;
     int x,y;
115329f1
 
e6a2ac34
     for(y=0; y<h; y++){
         for(x=0; x<8; x++){
             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
         }
         if(y+1<h){
             for(x=0; x<7; x++){
c26abfa5
                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
e6a2ac34
                              - s1[x+1] + s1[x+1+stride])
c26abfa5
                         -FFABS(  s2[x  ] - s2[x  +stride]
e6a2ac34
                              - s2[x+1] + s2[x+1+stride]);
             }
         }
         s1+= stride;
         s2+= stride;
     }
115329f1
 
c26abfa5
     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
     else  return score1 + FFABS(score2)*8;
e6a2ac34
 }
 
364a1797
 /**
  * Evaluate the weighted squared error that would result from adding
  * scale * basis to the remainder, without modifying rem.
  *
  * @param rem    64 remainder values
  * @param weight 64 per-coefficient weights
  * @param basis  64 basis-function samples
  * @param scale  multiplier applied to basis
  * @return sum over all 64 positions of ((weight * b)^2 >> 4), shifted right
  *         by 2, where b is the updated remainder shifted down by RECON_SHIFT
  */
 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     unsigned int acc = 0;
     int k;

     for (k = 0; k < 8 * 8; k++) {
         /* scaled basis sample, rounded down to the precision of rem[] */
         const int delta = (basis[k] * scale + (1 << (BASIS_SHIFT - RECON_SHIFT - 1)))
                           >> (BASIS_SHIFT - RECON_SHIFT);
         int b = rem[k] + delta;
         int wb;

         b >>= RECON_SHIFT;
         av_assert2(-512<b && b<512);

         wb   = weight[k] * b;
         acc += wb * wb >> 4;
     }
     return acc >> 2;
 }
 
 /**
  * Add scale * basis, rounded and shifted right by
  * (BASIS_SHIFT - RECON_SHIFT), to each of the 64 remainder values in place.
  *
  * @param rem   64 remainder values, updated in place
  * @param basis 64 basis-function samples
  * @param scale multiplier applied to basis
  */
 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
     int k;

     for (k = 0; k < 8 * 8; k++) {
         const int delta = (basis[k] * scale + (1 << (BASIS_SHIFT - RECON_SHIFT - 1)))
                           >> (BASIS_SHIFT - RECON_SHIFT);
         rem[k] += delta;
     }
 }
 
622348f9
 /**
  * Comparison function that ignores all of its arguments and always
  * reports a score of 0.
  */
 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
 {
     return 0;
 }
 
 /**
  * Fill a table of six comparison functions according to a FF_CMP_* type.
  *
  * The table is cleared first; each entry i is then copied from the
  * DSPContext array matching the low 8 bits of type (or set to zero_cmp for
  * FF_CMP_ZERO).  For an unrecognised type an error is logged for every
  * entry and the table is left zeroed.
  *
  * @param c    DSPContext supplying the per-metric function tables
  * @param cmp  destination table with room for 6 entries
  * @param type FF_CMP_* selector; only (type & 0xFF) is examined
  */
 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
     int i;

     /* size the clear by the element type: entries are function pointers,
      * which need not have the same size as void* */
     memset(cmp, 0, sizeof(*cmp)*6);

     for(i=0; i<6; i++){
         switch(type&0xFF){
         case FF_CMP_SAD:
             cmp[i]= c->sad[i];
             break;
         case FF_CMP_SATD:
             cmp[i]= c->hadamard8_diff[i];
             break;
         case FF_CMP_SSE:
             cmp[i]= c->sse[i];
             break;
         case FF_CMP_DCT:
             cmp[i]= c->dct_sad[i];
             break;
         case FF_CMP_DCT264:
             cmp[i]= c->dct264_sad[i];
             break;
         case FF_CMP_DCTMAX:
             cmp[i]= c->dct_max[i];
             break;
         case FF_CMP_PSNR:
             cmp[i]= c->quant_psnr[i];
             break;
         case FF_CMP_BIT:
             cmp[i]= c->bit[i];
             break;
         case FF_CMP_RD:
             cmp[i]= c->rd[i];
             break;
         case FF_CMP_VSAD:
             cmp[i]= c->vsad[i];
             break;
         case FF_CMP_VSSE:
             cmp[i]= c->vsse[i];
             break;
         case FF_CMP_ZERO:
             cmp[i]= zero_cmp;
             break;
         case FF_CMP_NSSE:
             cmp[i]= c->nsse[i];
             break;
 #if CONFIG_DWT
         case FF_CMP_W53:
             cmp[i]= c->w53[i];
             break;
         case FF_CMP_W97:
             cmp[i]= c->w97[i];
             break;
 #endif
         default:
             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
         }
     }
 }
 
11f18faf
 /**
  * Add src to dst in place, byte by byte: dst[i] += src[i] for i in [0, w),
  * with every byte wrapping independently.
  *
  * The first loop handles sizeof(long) bytes per iteration through long
  * loads and stores; the second loop finishes the remaining bytes one at a
  * time.
  *
  * @param dst destination, updated in place
  * @param src bytes to add
  * @param w   number of bytes to process
  */
 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
469bd7b1
     long i;
f21ca39e
     for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
469bd7b1
         long a = *(long*)(src+i);
         long b = *(long*)(dst+i);
         // Add with the pb_80 bits masked off, then patch those bits back in
         // with XOR. Presumably pb_7f / pb_80 hold 0x7f / 0x80 replicated in
         // every byte of a long (they are defined elsewhere in this file), so
         // that no carry crosses a byte boundary -- confirm against their
         // definitions.
         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
11f18faf
     }
     // scalar tail for the bytes the word loop did not cover
     for(; i<w; i++)
         dst[i+0] += src[i+0];
 }
 
dd2631a6
 /**
  * Byte-wise difference: dst[i] = src1[i] - src2[i] for i in [0, w), with
  * every byte wrapping independently.
  *
  * When HAVE_FAST_UNALIGNED is not set and src2 is not aligned to
  * sizeof(long), an 8x unrolled byte loop is used; otherwise
  * sizeof(long) bytes are handled per iteration through long loads and
  * stores.  A final byte loop finishes whatever remains in either case.
  *
  * @param dst  destination
  * @param src1 minuend bytes
  * @param src2 subtrahend bytes
  * @param w    number of bytes to process
  */
 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
469bd7b1
     long i;
b250f9c6
 #if !HAVE_FAST_UNALIGNED
469bd7b1
     // only the alignment of src2 is tested here
     if((long)src2 & (sizeof(long)-1)){
31304587
         for(i=0; i+7<w; i+=8){
             dst[i+0] = src1[i+0]-src2[i+0];
             dst[i+1] = src1[i+1]-src2[i+1];
             dst[i+2] = src1[i+2]-src2[i+2];
             dst[i+3] = src1[i+3]-src2[i+3];
             dst[i+4] = src1[i+4]-src2[i+4];
             dst[i+5] = src1[i+5]-src2[i+5];
             dst[i+6] = src1[i+6]-src2[i+6];
             dst[i+7] = src1[i+7]-src2[i+7];
         }
469bd7b1
     }else
 #endif
f21ca39e
     // NOTE: when the #if above is active, this loop is the body of the "else"
     for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
469bd7b1
         long a = *(long*)(src1+i);
         long b = *(long*)(src2+i);
         // Subtract with the pb_80 bits forced on in a and masked off in b,
         // then fix those bits up with XOR. Presumably pb_7f / pb_80 hold
         // 0x7f / 0x80 replicated in every byte of a long (defined elsewhere
         // in this file), so that no borrow crosses a byte boundary --
         // confirm against their definitions.
         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
     }
11f18faf
     // scalar tail for the bytes the loops above did not cover
     for(; i<w; i++)
         dst[i+0] = src1[i+0]-src2[i+0];
 }
 
e17ccf60
 /**
  * Reconstruct w bytes from residuals using a median predictor.
  *
  * For every position the predictor is mid_pred() of the previous output
  * byte, src1[i], and (previous + src1[i] - previous src1 value) & 0xFF;
  * diff[i] is added to it and the result is truncated to 8 bits.
  *
  * @param dst      reconstructed bytes
  * @param src1     reference bytes used by the predictor
  * @param diff     residuals to add to the prediction
  * @param w        number of bytes to process
  * @param left     in: initial "previous output" value; out: last output byte
  * @param left_top in: initial "previous src1" value; out: last src1 byte read
  */
 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
     uint8_t prev     = *left;
     uint8_t prev_ref = *left_top;
     int k;

     for (k = 0; k < w; k++) {
         prev     = mid_pred(prev, src1[k], (prev + src1[k] - prev_ref)&0xFF) + diff[k];
         prev_ref = src1[k];
         dst[k]   = prev;
     }

     *left     = prev;
     *left_top = prev_ref;
 }
 
e17ccf60
 /**
  * Compute residuals of w bytes against a median predictor.
  *
  * For every position the predictor is mid_pred() of the previous src2
  * byte, src1[i], and (previous + src1[i] - previous src1 value) & 0xFF;
  * dst[i] receives src2[i] minus that prediction.
  *
  * @param dst      residual bytes
  * @param src1     reference bytes used by the predictor
  * @param src2     bytes being predicted
  * @param w        number of bytes to process
  * @param left     in: initial "previous src2" value; out: last src2 byte read
  * @param left_top in: initial "previous src1" value; out: last src1 byte read
  */
 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
     uint8_t prev     = *left;
     uint8_t prev_ref = *left_top;
     int k;

     for (k = 0; k < w; k++) {
         const int pred = mid_pred(prev, src1[k], (prev + src1[k] - prev_ref)&0xFF);

         prev_ref = src1[k];
         prev     = src2[k];
         dst[k]   = prev - pred;
     }

     *left     = prev;
     *left_top = prev_ref;
 }
 
2d4bbdec
 /**
  * Running-sum reconstruction: each output byte is the low 8 bits of the
  * accumulator after adding the corresponding source byte.
  *
  * @param dst destination bytes
  * @param src source bytes added to the accumulator one by one
  * @param w   number of bytes to process
  * @param acc initial accumulator value
  * @return the accumulator after the last addition (not truncated to 8 bits)
  */
 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
     int k = 0;

     while (k < w) {
         acc   += src[k];
         dst[k] = acc;
         k++;
     }

     return acc;
 }
 
 /* Byte offsets of the four channels inside a 4-byte pixel; the order is
  * reversed when HAVE_BIGENDIAN is set. */
 #if HAVE_BIGENDIAN
 #define B 3
 #define G 2
 #define R 1
 #define A 0
 #else
 #define B 0
 #define G 1
 #define R 2
 #define A 3
 #endif
 /**
  * Per-channel running-sum reconstruction of w 4-byte pixels.
  *
  * Each channel keeps its own accumulator; every output byte is the low
  * 8 bits of that channel's accumulator after adding the source byte.
  *
  * @param dst   destination pixels (4 bytes each)
  * @param src   source pixels (4 bytes each)
  * @param w     number of pixels to process
  * @param red   in: initial accumulator for the R channel; out: final value
  * @param green in: initial accumulator for the G channel; out: final value
  * @param blue  in: initial accumulator for the B channel; out: final value
  * @param alpha in: initial accumulator for the A channel; out: final value
  */
 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
     int acc_r = *red;
     int acc_g = *green;
     int acc_b = *blue;
     int acc_a = *alpha;
     int px;

     for (px = 0; px < w; px++) {
         const uint8_t *in  = src + 4 * px;
         uint8_t       *out = dst + 4 * px;

         acc_b += in[B];
         acc_g += in[G];
         acc_r += in[R];
         acc_a += in[A];

         out[B] = acc_b;
         out[G] = acc_g;
         out[R] = acc_r;
         out[A] = acc_a;
     }

     *red   = acc_r;
     *green = acc_g;
     *blue  = acc_b;
     *alpha = acc_a;
 }
 #undef B
 #undef G
 #undef R
 #undef A
73c6f598
 
1457ab52
 /* Out-of-place butterfly: o1 = i1 + i2, o2 = i1 - i2.
  * Expands to two statements and evaluates i1 and i2 twice each. */
 #define BUTTERFLY2(o1,o2,i1,i2) \
 o1= (i1)+(i2);\
 o2= (i1)-(i2);
 
 /* In-place butterfly: (x, y) <- (x + y, x - y), using int temporaries so
  * that both results are computed from the original values. */
 #define BUTTERFLY1(x,y) \
 {\
     int a,b;\
     a= x;\
     b= y;\
     x= a+b;\
     y= a-b;\
 }
 
c26abfa5
 /* Absolute-value butterfly: |x + y| + |x - y|, without storing anything. */
 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1457ab52
 
bb198e19
 /**
  * Score an 8x8 block by transforming the pixel difference (src - dst) with
  * three butterfly stages along each row and three along each column (a
  * Hadamard transform, per the function name) and summing the absolute
  * values of the resulting 64 coefficients.
  *
  * @param s      unused
  * @param dst    first block
  * @param src    second block
  * @param stride offset in bytes between consecutive rows of both blocks
  * @param h      must be 8 (checked with av_assert2)
  * @return sum of absolute transform coefficients
  */
 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1457ab52
     int i;
     int temp[64];
     int sum=0;
115329f1
 
70a97345
     av_assert2(h==8);
1457ab52
 
     // horizontal pass: row i of the difference is transformed into temp[8*i .. 8*i+7]
     for(i=0; i<8; i++){
         //FIXME try pointer walks
         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
115329f1
 
1457ab52
         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
115329f1
 
1457ab52
         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
     }
 
     // vertical pass over column i; the last stage is folded into the
     // absolute-value accumulation via BUTTERFLYA instead of being stored
     for(i=0; i<8; i++){
         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
115329f1
 
1457ab52
         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
 
115329f1
         sum +=
1457ab52
              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
     }
     return sum;
 }
 
622348f9
 /**
  * Score a single 8x8 block by transforming its pixels with three butterfly
  * stages along each row and three along each column (a Hadamard transform,
  * per the function name) and summing the absolute values of the resulting
  * coefficients, excluding the term |temp[0] + temp[32]| that the final
  * stage contributes for column 0.
  *
  * @param s      unused
  * @param src    block to score
  * @param dummy  unused
  * @param stride offset in bytes between consecutive rows of src
  * @param h      must be 8 (checked with av_assert2)
  * @return sum of absolute transform coefficients without the excluded term
  */
 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
1457ab52
     int i;
     int temp[64];
     int sum=0;
115329f1
 
70a97345
     av_assert2(h==8);
115329f1
 
1457ab52
     // horizontal pass: row i of src is transformed into temp[8*i .. 8*i+7]
     for(i=0; i<8; i++){
         //FIXME try pointer walks
622348f9
         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
115329f1
 
1457ab52
         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
115329f1
 
1457ab52
         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
     }
 
     // vertical pass over column i; the last stage is folded into the
     // absolute-value accumulation via BUTTERFLYA instead of being stored
     for(i=0; i<8; i++){
         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
115329f1
 
1457ab52
         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
115329f1
 
         sum +=
1457ab52
              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
     }
115329f1
 
c26abfa5
     // remove the |x+y| half of BUTTERFLYA(temp[0], temp[32]) that was added above
     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
115329f1
 
1457ab52
     return sum;
 }
 
bb198e19
 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52
     MpegEncContext * const s= (MpegEncContext *)c;
88bd7fdc
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
115329f1
 
70a97345
     av_assert2(h==8);
1457ab52
 
     s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839
     s->dsp.fdct(temp);
1edbfe19
     return s->dsp.sum_abs_dctelem(temp);
1457ab52
 }
 
b250f9c6
 #if CONFIG_GPL
27c61ac5
 /* One 8-point integer transform built from adds, subtracts and shifts.
  * Inputs are read through SRC(n) and each result is handed to DST(n, value);
  * both macros must be defined by the user before expansion, with an index
  * variable named i in scope wherever they refer to it. */
 #define DCT8_1D {\
     const int s07 = SRC(0) + SRC(7);\
     const int s16 = SRC(1) + SRC(6);\
     const int s25 = SRC(2) + SRC(5);\
     const int s34 = SRC(3) + SRC(4);\
     const int a0 = s07 + s34;\
     const int a1 = s16 + s25;\
     const int a2 = s07 - s34;\
     const int a3 = s16 - s25;\
     const int d07 = SRC(0) - SRC(7);\
     const int d16 = SRC(1) - SRC(6);\
     const int d25 = SRC(2) - SRC(5);\
     const int d34 = SRC(3) - SRC(4);\
     const int a4 = d16 + d25 + (d07 + (d07>>1));\
     const int a5 = d07 - d34 - (d25 + (d25>>1));\
     const int a6 = d07 + d34 - (d16 + (d16>>1));\
     const int a7 = d16 - d25 + (d34 + (d34>>1));\
     DST(0,  a0 + a1     ) ;\
     DST(1,  a4 + (a7>>2)) ;\
     DST(2,  a2 + (a3>>1)) ;\
     DST(3,  a5 + (a6>>2)) ;\
     DST(4,  a0 - a1     ) ;\
     DST(5,  a6 - (a5>>2)) ;\
     DST(6, (a2>>1) - a3 ) ;\
     DST(7, (a4>>2) - a7 ) ;\
 }
 
 /**
  * Score an 8x8 block pair by applying DCT8_1D to the rows and then to the
  * columns of the pixel difference, and summing the absolute values of the
  * outputs of the second pass.
  *
  * @param c      MpegEncContext whose dsp.diff_pixels is used
  * @param src1   first block
  * @param src2   second block
  * @param stride offset in bytes between consecutive rows of both blocks
  * @param h      not examined by this function
  * @return sum of absolute second-pass outputs
  */
 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
     MpegEncContext * const s= (MpegEncContext *)c;
88bd7fdc
     int16_t dct[8][8];
27c61ac5
     int i;
     int sum=0;
 
8d15910a
     s->dsp.diff_pixels(dct[0], src1, src2, stride);
27c61ac5
 
 // first pass: transform each row in place
 #define SRC(x) dct[i][x]
 #define DST(x,v) dct[i][x]= v
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
 #undef DST
 
 // second pass: transform each column; results are not stored, only their
 // absolute values are accumulated
 #define SRC(x) dct[x][i]
c26abfa5
 #define DST(x,v) sum += FFABS(v)
27c61ac5
     for( i = 0; i < 8; i++ )
         DCT8_1D
 #undef SRC
 #undef DST
     return sum;
 }
 #endif
 
0fd6aea1
 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
     MpegEncContext * const s= (MpegEncContext *)c;
88bd7fdc
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
0fd6aea1
     int sum=0, i;
115329f1
 
70a97345
     av_assert2(h==8);
0fd6aea1
 
     s->dsp.diff_pixels(temp, src1, src2, stride);
     s->dsp.fdct(temp);
 
     for(i=0; i<64; i++)
c26abfa5
         sum= FFMAX(sum, FFABS(temp[i]));
115329f1
 
0fd6aea1
     return sum;
 }
 
bb198e19
 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52
     MpegEncContext * const s= (MpegEncContext *)c;
88bd7fdc
     LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
     int16_t * const bak = temp+64;
1457ab52
     int sum=0, i;
 
70a97345
     av_assert2(h==8);
1457ab52
     s->mb_intra=0;
115329f1
 
1457ab52
     s->dsp.diff_pixels(temp, src1, src2, stride);
115329f1
 
88bd7fdc
     memcpy(bak, temp, 64*sizeof(int16_t));
115329f1
 
67725183
     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd
     s->dct_unquantize_inter(s, temp, 0, s->qscale);
e7a972e1
     ff_simple_idct_8(temp); //FIXME
115329f1
 
1457ab52
     for(i=0; i<64; i++)
         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
115329f1
 
1457ab52
     return sum;
 }
 
bb198e19
 /**
  * Rate-distortion score of an 8x8 block pair.
  *
  * Both blocks are copied into local 8x8 buffers.  Their difference is
  * quantised with fast_dct_quantize and the number of bits needed for the
  * resulting coefficients is looked up in the VLC length tables of the
  * context.  The coefficients are then dequantised and added back onto the
  * local copy of src2 with dsp.idct_add, and the squared error against the
  * local copy of src1 is measured with dsp.sse[1].
  *
  * Side effect: overwrites block_last_index[0] in the context.
  *
  * @param c      MpegEncContext providing tables, function pointers, qscale
  *               and mb_intra
  * @param src1   first block
  * @param src2   second block
  * @param stride offset in bytes between consecutive rows of both blocks
  * @param h      must be 8 (checked with av_assert2)
  * @return distortion + ((bits * qscale * qscale * 109 + 64) >> 7)
  */
 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94
     MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda
     const uint8_t *scantable= s->intra_scantable.permutated;
88bd7fdc
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
40d11227
     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
e6dba5df
     int i, last, run, bits, level, distortion, start_i;
3a87ac94
     const int esc_length= s->ac_esc_length;
     uint8_t * length;
     uint8_t * last_length;
115329f1
 
70a97345
     av_assert2(h==8);
bb198e19
 
90d43b52
     // work on packed copies (row stride 8) so the inputs are never modified
     copy_block8(lsrc1, src1, 8, stride, 8);
     copy_block8(lsrc2, src2, 8, stride, 8);
3a87ac94
 
90d43b52
     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
67725183
 
     // i only receives the last out-parameter here; it is reused as a loop index below
     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
 
     bits=0;
115329f1
 
3a87ac94
     // intra blocks charge temp[0] separately and start the run/level scan at index 1
     if (s->mb_intra) {
115329f1
         start_i = 1;
3a87ac94
         length     = s->intra_ac_vlc_length;
         last_length= s->intra_ac_vlc_last_length;
67725183
         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
     } else {
         start_i = 0;
         length     = s->inter_ac_vlc_length;
         last_length= s->inter_ac_vlc_last_length;
     }
115329f1
 
67725183
     if(last>=start_i){
3a87ac94
         run=0;
         // all coefficients before the last one use the "length" table
         for(i=start_i; i<last; i++){
             int j= scantable[i];
             level= temp[j];
115329f1
 
3a87ac94
             if(level){
                 // bias by 64: values that land in [0, 127] index the table,
                 // anything else is charged the escape length
                 level+=64;
                 if((level&(~127)) == 0){
                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
                 }else
                     bits+= esc_length;
                 run=0;
             }else
                 run++;
         }
         i= scantable[last];
115329f1
 
3a87ac94
         level= temp[i] + 64;
1d0eab1d
 
72c2d8a3
         // the coefficient at the last index is expected to be non-zero
         av_assert2(level - 64);
115329f1
 
3a87ac94
         // the final coefficient uses the "last_length" table
         if((level&(~127)) == 0){
             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
         }else
             bits+= esc_length;
115329f1
 
67725183
     }
 
     if(last>=0){
d50635cd
         if(s->mb_intra)
             s->dct_unquantize_intra(s, temp, 0, s->qscale);
         else
             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94
     }
115329f1
 
90d43b52
     // reconstruct on top of the local copy of src2
     s->dsp.idct_add(lsrc2, 8, temp);
115329f1
 
90d43b52
     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3a87ac94
 
e6dba5df
     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
 }
 
bb198e19
 /**
  * Bit-cost score of an 8x8 block pair.
  *
  * The pixel difference is quantised with fast_dct_quantize and the number
  * of bits needed for the resulting coefficients is looked up in the VLC
  * length tables of the context.
  *
  * Side effect: overwrites block_last_index[0] in the context.
  *
  * @param c      MpegEncContext providing tables, function pointers, qscale
  *               and mb_intra
  * @param src1   first block
  * @param src2   second block
  * @param stride offset in bytes between consecutive rows of both blocks
  * @param h      must be 8 (checked with av_assert2)
  * @return accumulated bit count
  */
 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94
     MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda
     const uint8_t *scantable= s->intra_scantable.permutated;
88bd7fdc
     LOCAL_ALIGNED_16(int16_t, temp, [64]);
3a87ac94
     int i, last, run, bits, level, start_i;
     const int esc_length= s->ac_esc_length;
     uint8_t * length;
     uint8_t * last_length;
bb198e19
 
70a97345
     av_assert2(h==8);
115329f1
 
67725183
     s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94
 
67725183
     // i only receives the last out-parameter here; it is reused as a loop index below
     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
 
     bits=0;
115329f1
 
3a87ac94
     // intra blocks charge temp[0] separately and start the run/level scan at index 1
     if (s->mb_intra) {
115329f1
         start_i = 1;
3a87ac94
         length     = s->intra_ac_vlc_length;
         last_length= s->intra_ac_vlc_last_length;
67725183
         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
     } else {
         start_i = 0;
         length     = s->inter_ac_vlc_length;
         last_length= s->inter_ac_vlc_last_length;
     }
115329f1
 
67725183
     if(last>=start_i){
3a87ac94
         run=0;
         // all coefficients before the last one use the "length" table
         for(i=start_i; i<last; i++){
             int j= scantable[i];
             level= temp[j];
115329f1
 
3a87ac94
             if(level){
                 // bias by 64: values that land in [0, 127] index the table,
                 // anything else is charged the escape length
                 level+=64;
                 if((level&(~127)) == 0){
                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
                 }else
                     bits+= esc_length;
                 run=0;
             }else
                 run++;
         }
         i= scantable[last];
115329f1
 
67725183
         level= temp[i] + 64;
115329f1
 
72c2d8a3
         // the coefficient at the last index is expected to be non-zero
         av_assert2(level - 64);
115329f1
 
3a87ac94
         // the final coefficient uses the "last_length" table
         if((level&(~127)) == 0){
             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
         }else
             bits+= esc_length;
     }
 
     return bits;
 }
 
7fb7f636
 /* Generates vsad_intra<size>_c(): the sum, over rows 1 .. h-1 and the first
  * <size> columns, of the absolute difference between each pixel and the
  * pixel directly below it.  The context and the second block pointer are
  * ignored. */
 #define VSAD_INTRA(size) \
 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
     int total = 0;                                                          \
     int row, col;                                                           \
                                                                             \
     for (row = 1; row < h; row++) {                                         \
         const uint8_t *below = s + stride;                                  \
                                                                             \
         for (col = 0; col < size; col++)                                    \
             total += FFABS(s[col] - below[col]);                            \
         s += stride;                                                        \
     }                                                                       \
                                                                             \
     return total;                                                           \
 }
 VSAD_INTRA(8)
 VSAD_INTRA(16)
622348f9
 
 /**
  * Vertical-gradient SAD of the difference between two 16-pixel-wide
  * blocks: for rows 1 .. h-1, sums the absolute value of
  * (s1 - s2) at a pixel minus (s1 - s2) at the pixel directly below it.
  *
  * @param c      unused
  * @param s1     first block
  * @param s2     second block
  * @param stride offset in bytes between consecutive rows of both blocks
  * @param h      number of rows; h - 1 row pairs are evaluated
  * @return accumulated score
  */
 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
     int total = 0;
     int row, col;

     for (row = 1; row < h; row++) {
         const uint8_t *below1 = s1 + stride;
         const uint8_t *below2 = s2 + stride;

         for (col = 0; col < 16; col++)
             total += FFABS(s1[col] - s2[col] - below1[col] + below2[col]);

         s1 += stride;
         s2 += stride;
     }

     return total;
 }
 
 /* Square of the argument; the argument is evaluated twice. */
 #define SQ(a) ((a)*(a))
 /* Generates vsse_intra<size>_c(): the sum, over rows 1 .. h-1 and the first
  * <size> columns, of the squared difference between each pixel and the
  * pixel directly below it.  The context and the second block pointer are
  * ignored. */
 #define VSSE_INTRA(size) \
 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
     int total = 0;                                                          \
     int row, col;                                                           \
                                                                             \
     for (row = 1; row < h; row++) {                                         \
         const uint8_t *below = s + stride;                                  \
                                                                             \
         for (col = 0; col < size; col++)                                    \
             total += SQ(s[col] - below[col]);                               \
         s += stride;                                                        \
     }                                                                       \
                                                                             \
     return total;                                                           \
 }
 VSSE_INTRA(8)
 VSSE_INTRA(16)
622348f9
 
 /**
  * Vertical-gradient SSE of the difference between two 16-pixel-wide
  * blocks: for rows 1 .. h-1, sums the square of
  * (s1 - s2) at a pixel minus (s1 - s2) at the pixel directly below it.
  *
  * @param c      unused
  * @param s1     first block
  * @param s2     second block
  * @param stride offset in bytes between consecutive rows of both blocks
  * @param h      number of rows; h - 1 row pairs are evaluated
  * @return accumulated score
  */
 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
     int total = 0;
     int row, col;

     for (row = 1; row < h; row++) {
         const uint8_t *below1 = s1 + stride;
         const uint8_t *below2 = s2 + stride;

         for (col = 0; col < 16; col++) {
             const int d = s1[col] - s2[col] - below1[col] + below2[col];
             total += SQ(d);
         }

         s1 += stride;
         s2 += stride;
     }

     return total;
 }
 
a00177a9
 /**
  * Sum of squared differences between an int8_t array and an int16_t array.
  *
  * @param pix1 first array
  * @param pix2 second array
  * @param size number of elements to compare
  * @return accumulated squared difference
  */
 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                                int size){
     int total = 0;
     int k;

     for (k = 0; k < size; k++) {
         const int d = pix1[k] - pix2[k];
         total += d * d;
     }
     return total;
 }
 
059866eb
 /* Generates name16(): a 16-pixel-wide comparison built from the 8x8
  * function name8.  The left and right 8x8 halves of the top 8 rows are
  * always scored; when h is 16 the two halves of the next 8 rows are scored
  * as well.  The calls are made strictly in that order and the scores are
  * added. */
 #define WRAPPER8_16_SQ(name8, name16)\
 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
     const int bands = (h == 16) ? 2 : 1;\
     int total = 0;\
     int band;\
     for (band = 0; band < bands; band++) {\
         const int off = band * 8 * stride;\
         total += name8(s, dst + off    , src + off    , stride, 8);\
         total += name8(s, dst + off + 8, src + off + 8, stride, 8);\
     }\
     return total;\
 }
 
9fbd14ac
 /* 16-wide variants of the 8x8 comparison functions, generated with
  * WRAPPER8_16_SQ; the dct264 variant exists only when CONFIG_GPL is set. */
 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
b250f9c6
 #if CONFIG_GPL
9fbd14ac
 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
60900991
 #endif
9fbd14ac
 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
1457ab52
 
0a68cd87
 /**
  * Clip one value using unsigned integer comparisons only.
  *
  * @param a        value to clip
  * @param mini     returned when a > mini (unsigned compare)
  * @param maxi     returned when (a with bit 31 flipped) > maxisign
  * @param maxisign threshold for the second comparison; the caller in this
  *                 file passes maxi with bit 31 flipped
  * @return mini, maxi or a, tested in that order
  */
 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                    uint32_t maxi, uint32_t maxisign)
 {
     const uint32_t flipped = a ^ (1U<<31);

     if (a > mini)
         return mini;
     if (flipped > maxisign)
         return maxi;
     return a;
 }
 
50e23ae9
 /**
  * Clip an array of floats through clipf_c_one(), operating on the raw
  * 32-bit patterns of the values.
  *
  * The only caller in this file uses it when min < 0 and max > 0.
  * Elements are processed in groups of 8, so when len is not a multiple
  * of 8 the last group reads and writes past index len - 1.
  *
  * @param dst destination array
  * @param src source array
  * @param min pointer to the lower bound
  * @param max pointer to the upper bound
  * @param len number of elements; processing is rounded up to a multiple of 8
  */
 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
     const uint32_t lo      = *(uint32_t*)min;
     const uint32_t hi      = *(uint32_t*)max;
     const uint32_t hi_flip = hi ^ (1U<<31);
     uint32_t *out          = (uint32_t*)dst;
     const uint32_t *in     = (const uint32_t*)src;
     int base, k;

     for (base = 0; base < len; base += 8) {
         for (k = 0; k < 8; k++)
             out[base + k] = clipf_c_one(in[base + k], lo, hi, hi_flip);
     }
 }
50e23ae9
 /**
  * Clip every element of src to [min, max] and store the result in dst.
  *
  * When min is negative and max is positive the work is delegated to
  * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied.
  * Elements are processed in groups of 8, so when len is not a multiple
  * of 8 the last group reads and writes past index len - 1.
  *
  * @param dst destination array
  * @param src source array
  * @param min lower bound
  * @param max upper bound
  * @param len number of elements; processing is rounded up to a multiple of 8
  */
 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
     int base, k;

     if (min < 0 && max > 0) {
         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
         return;
     }

     for (base = 0; base < len; base += 8) {
         for (k = 0; k < 8; k++)
             dst[base + k] = av_clipf(src[base + k], min, max);
     }
 }
 
7e1ce6a6
 /**
  * Dot product of two int16_t vectors.
  *
  * @param v1    first vector
  * @param v2    second vector
  * @param order number of elements in each vector
  * @return sum of v1[i] * v2[i]
  */
 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
 {
     int acc = 0;
     int k   = 0;

     for (; order != 0; order--, k++)
         acc += v1[k] * v2[k];

     return acc;
 }
 
b3858964
 /**
  * Compute the dot product of v1 and v2 and, in the same pass, update v1
  * in place with v1[i] += mul * v3[i].  Each product uses the value of
  * v1[i] from before its update.
  *
  * @param v1    first vector; updated in place
  * @param v2    second vector of the dot product
  * @param v3    vector scaled by mul and added to v1
  * @param order number of elements in each vector
  * @param mul   multiplier applied to v3
  * @return sum of (original v1[i]) * v2[i]
  */
 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
 {
     int acc = 0;
     int k   = 0;

     for (; order != 0; order--, k++) {
         acc   += v1[k] * v2[k];
         v1[k] += mul * v3[k];
     }
     return acc;
 }
 
6054cd25
 /**
  * Clip every element of src to [min, max] and store the result in dst.
  *
  * Exactly len elements are processed.  The earlier do/while form always
  * handled a group of 8 before testing the unsigned counter, so a len of 0
  * or any len that was not a multiple of 8 made the counter wrap around and
  * the loop run far past the end of both buffers; for positive multiples
  * of 8 the result is unchanged.
  *
  * @param dst destination array
  * @param src source array
  * @param min lower bound
  * @param max upper bound
  * @param len number of elements to process
  */
 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                 int32_t max, unsigned int len)
 {
     unsigned int i;

     for (i = 0; i < len; i++)
         dst[i] = av_clip(src[i], min, max);
 }
 
088f38a4
 /**
  * Run ff_j_rev_dct() on block, then hand the block to
  * put_pixels_clamped_c() together with dest and line_size.
  */
 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
     put_pixels_clamped_c(block, dest, line_size);
 }
088f38a4
 /**
  * Run ff_j_rev_dct() on block, then hand the block to
  * add_pixels_clamped_c() together with dest and line_size.
  */
 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct(block);
     add_pixels_clamped_c(block, dest, line_size);
 }
 
ac898759
 /**
  * Run ff_j_rev_dct4() on block, then hand the block to
  * put_pixels_clamped4_c() together with dest and line_size.
  */
 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct4(block);
     put_pixels_clamped4_c(block, dest, line_size);
 }
ac898759
 /**
  * Run ff_j_rev_dct4() on block, then hand the block to
  * add_pixels_clamped4_c() together with dest and line_size.
  */
 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct4(block);
     add_pixels_clamped4_c(block, dest, line_size);
 }
 
ac898759
 /**
  * Run ff_j_rev_dct2() on block, then hand the block to
  * put_pixels_clamped2_c() together with dest and line_size.
  */
 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct2(block);
     put_pixels_clamped2_c(block, dest, line_size);
 }
ac898759
 /**
  * Run ff_j_rev_dct2() on block, then hand the block to
  * add_pixels_clamped2_c() together with dest and line_size.
  */
 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
 {
     ff_j_rev_dct2(block);
     add_pixels_clamped2_c(block, dest, line_size);
 }
 
ac898759
 /**
  * Single-coefficient variant: store (block[0] + 4) >> 3, clipped with
  * av_clip_uint8(), into dest[0]. line_size is not used.
  */
 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
 {
     const int rounded = (block[0] + 4) >> 3;

     dest[0] = av_clip_uint8(rounded);
 }
ac898759
 /**
  * Single-coefficient variant: add (block[0] + 4) >> 3 to dest[0] and store
  * the sum back, clipped with av_clip_uint8(). line_size is not used.
  */
 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
 {
     const int rounded = (block[0] + 4) >> 3;

     dest[0] = av_clip_uint8(dest[0] + rounded);
 }
 
59cf08ce
 /* init static data */
9cf0841e
 av_cold void ff_dsputil_static_init(void)
e0eac44e
 {
d2975f8d
     int i;
e0eac44e
 
59cf08ce
     for(i=0;i<512;i++) {
1d503957
         ff_squareTbl[i] = (i - 256) * (i - 256);
59cf08ce
     }
 }
92ddb692
 
6dc7d5da
 int ff_check_alignment(void){
     static int did_fail=0;
29b42c66
     LOCAL_ALIGNED_16(int, aligned, [4]);
6dc7d5da
 
29b42c66
     if((intptr_t)aligned & 15){
6dc7d5da
         if(!did_fail){
b250f9c6
 #if HAVE_MMX || HAVE_ALTIVEC
6dc7d5da
             av_log(NULL, AV_LOG_ERROR,
c1173617
                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
5e4c7ca2
                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2d2b5a14
                 "Do not report crashes to FFmpeg developers.\n");
6dc7d5da
 #endif
             did_fail=1;
         }
         return -1;
     }
     return 0;
 }
92ddb692
 
9cf0841e
 /**
  * Populate a DSPContext with the C implementations defined in this file and
  * then call the per-architecture init functions selected at build time.
  *
  * The forward DCT (encoder builds only) is chosen from
  * avctx->bits_per_raw_sample and avctx->dct_algo; the inverse DCT from
  * avctx->lowres, avctx->bits_per_raw_sample and avctx->idct_algo.
  * The final step builds c->idct_permutation from c->idct_permutation_type,
  * after the architecture init functions have run.
  *
  * @param c     context whose function pointers and tables are filled in
  * @param avctx codec context supplying the selection parameters
  */
 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
59cf08ce
 {
6dc7d5da
     /* The return value is ignored; the call only triggers the check and
      * its one-time log message. */
     ff_check_alignment();
 
b250f9c6
 #if CONFIG_ENCODERS
0a72533e
     /* Forward DCT: 10-bit input gets dedicated functions, everything else
      * is selected by dct_algo. */
     if (avctx->bits_per_raw_sample == 10) {
         c->fdct    = ff_jpeg_fdct_islow_10;
         c->fdct248 = ff_fdct248_islow_10;
     } else {
         if(avctx->dct_algo==FF_DCT_FASTINT) {
3e2efacd
             c->fdct    = ff_fdct_ifast;
             c->fdct248 = ff_fdct_ifast248;
0a72533e
         }
         else if(avctx->dct_algo==FF_DCT_FAAN) {
             c->fdct    = ff_faandct;
             c->fdct248 = ff_faandct248;
         }
         else {
             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
             c->fdct248 = ff_fdct248_islow_8;
         }
10acc479
     }
b0368839
 #endif //CONFIG_ENCODERS
 
70d54392
     /* Inverse DCT: lowres 1/2/3 select the idct4/idct2/idct1 wrappers and
      * take priority over bit depth and idct_algo. */
     if(avctx->lowres==1){
         c->idct_put= ff_jref_idct4_put;
         c->idct_add= ff_jref_idct4_add;
         c->idct    = ff_j_rev_dct4;
         c->idct_permutation_type= FF_NO_IDCT_PERM;
     }else if(avctx->lowres==2){
         c->idct_put= ff_jref_idct2_put;
         c->idct_add= ff_jref_idct2_add;
         c->idct    = ff_j_rev_dct2;
         c->idct_permutation_type= FF_NO_IDCT_PERM;
     }else if(avctx->lowres==3){
         c->idct_put= ff_jref_idct1_put;
         c->idct_add= ff_jref_idct1_add;
         c->idct    = ff_j_rev_dct1;
         c->idct_permutation_type= FF_NO_IDCT_PERM;
     }else{
         /* Full resolution: 10- and 12-bit use the matching simple IDCT;
          * any other depth falls through to the idct_algo selection. */
         if (avctx->bits_per_raw_sample == 10) {
             c->idct_put              = ff_simple_idct_put_10;
             c->idct_add              = ff_simple_idct_add_10;
             c->idct                  = ff_simple_idct_10;
             c->idct_permutation_type = FF_NO_IDCT_PERM;
ae57e824
         } else if (avctx->bits_per_raw_sample == 12) {
             c->idct_put              = ff_simple_idct_put_12;
             c->idct_add              = ff_simple_idct_add_12;
             c->idct                  = ff_simple_idct_12;
             c->idct_permutation_type = FF_NO_IDCT_PERM;
70d54392
         } else {
178fcca8
         /* Only the FF_IDCT_INT path uses a non-trivial coefficient
          * permutation (FF_LIBMPEG2_IDCT_PERM). */
         if(avctx->idct_algo==FF_IDCT_INT){
088f38a4
             c->idct_put= jref_idct_put;
             c->idct_add= jref_idct_add;
c8e1b2fb
             c->idct    = ff_j_rev_dct;
178fcca8
             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
6f08c541
         }else if(avctx->idct_algo==FF_IDCT_FAAN){
             c->idct_put= ff_faanidct_put;
             c->idct_add= ff_faanidct_add;
             c->idct    = ff_faanidct;
             c->idct_permutation_type= FF_NO_IDCT_PERM;
178fcca8
         }else{ //accurate/default
e7a972e1
             c->idct_put = ff_simple_idct_put_8;
             c->idct_add = ff_simple_idct_add_8;
             c->idct     = ff_simple_idct_8;
178fcca8
             c->idct_permutation_type= FF_NO_IDCT_PERM;
         }
70d54392
         }
b0368839
     }
 
eb4b3dd3
     /* Pixel-level helpers: C implementations from this file. */
     c->diff_pixels = diff_pixels_c;
dbc9f84e
     c->put_pixels_clamped = put_pixels_clamped_c;
     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
     c->add_pixels_clamped = add_pixels_clamped_c;
1edbfe19
     c->sum_abs_dctelem = sum_abs_dctelem_c;
eb4b3dd3
     c->gmc1 = gmc1_c;
703c8195
     c->gmc = ff_gmc_c;
eb4b3dd3
     c->pix_sum = pix_sum_c;
     c->pix_norm1 = pix_norm1_c;
 
342c7dfd
     c->fill_block_tab[0] = fill_block16_c;
     c->fill_block_tab[1] = fill_block8_c;
 
45553457
     /* TODO [0] 16  [1] 8 */
bb198e19
     /* pix_abs[size][variant]: first index 0 holds the 16-wide functions,
      * 1 the 8-wide ones; second index is plain, x2, y2, xy2. */
     c->pix_abs[0][0] = pix_abs16_c;
     c->pix_abs[0][1] = pix_abs16_x2_c;
     c->pix_abs[0][2] = pix_abs16_y2_c;
     c->pix_abs[0][3] = pix_abs16_xy2_c;
     c->pix_abs[1][0] = pix_abs8_c;
     c->pix_abs[1][1] = pix_abs8_x2_c;
     c->pix_abs[1][2] = pix_abs8_y2_c;
     c->pix_abs[1][3] = pix_abs8_xy2_c;
eb4b3dd3
 
669ac79c
     /* tpel tables: only indices 0-2, 4-6 and 8-10 are assigned; entries
      * 3 and 7 are left untouched here. */
     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
 
da3b9756
     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
 
45553457
 /* dspfunc(PFX, IDX, NUM) fills the 16 entries of c->PFX_pixels_tab[IDX]
  * with PFX<NUM>_mcXY_c, where entry index = X + 4 * Y. */
 #define dspfunc(PFX, IDX, NUM) \
     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
 
     /* qpel tables: index 0 is filled with the 16-sized functions, index 1
      * with the 8-sized ones. The avg_no_rnd variants are not installed. */
     dspfunc(put_qpel, 0, 16);
     dspfunc(put_no_rnd_qpel, 0, 16);
 
     dspfunc(avg_qpel, 0, 16);
     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
 
     dspfunc(put_qpel, 1, 8);
     dspfunc(put_no_rnd_qpel, 1, 8);
 
     dspfunc(avg_qpel, 1, 8);
     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
0da71265
 
45553457
 #undef dspfunc
5a6a9e78
 
3d1b1caa
     /* mspel table: entry 0 is the plain 8x8 copy. */
     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
1457ab52
     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
115329f1
 
bb198e19
 /* SET_CMP_FUNC(name) installs name16_c in c->name[0] and name8x8_c in
  * c->name[1]. The macro is not #undef'd in this function. */
 #define SET_CMP_FUNC(name) \
     c->name[0]= name ## 16_c;\
     c->name[1]= name ## 8x8_c;
115329f1
 
bb198e19
     /* Comparison functions; slots 4 and 5 hold the intra variants where
      * they are assigned. */
     SET_CMP_FUNC(hadamard8_diff)
622348f9
     c->hadamard8_diff[4]= hadamard8_intra16_c;
7fb7f636
     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
bb198e19
     SET_CMP_FUNC(dct_sad)
0fd6aea1
     SET_CMP_FUNC(dct_max)
b250f9c6
 #if CONFIG_GPL
27c61ac5
     SET_CMP_FUNC(dct264_sad)
60900991
 #endif
bb198e19
     c->sad[0]= pix_abs16_c;
     c->sad[1]= pix_abs8_c;
     c->sse[0]= sse16_c;
     c->sse[1]= sse8_c;
26efc54e
     c->sse[2]= sse4_c;
bb198e19
     SET_CMP_FUNC(quant_psnr)
     SET_CMP_FUNC(rd)
     SET_CMP_FUNC(bit)
622348f9
     c->vsad[0]= vsad16_c;
     c->vsad[4]= vsad_intra16_c;
7fb7f636
     c->vsad[5]= vsad_intra8_c;
622348f9
     c->vsse[0]= vsse16_c;
     c->vsse[4]= vsse_intra16_c;
7fb7f636
     c->vsse[5]= vsse_intra8_c;
e6a2ac34
     c->nsse[0]= nsse16_c;
     c->nsse[1]= nsse8_c;
aeef76b3
 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
05aec7bb
     ff_dsputil_init_dwt(c);
3a6fc8fa
 #endif
26efc54e
 
59006372
     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
 
11f18faf
     /* Byte-buffer and prediction helpers. */
     c->add_bytes= add_bytes_c;
     c->diff_bytes= diff_bytes_c;
3daa434a
     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
84705403
     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
73c6f598
     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3d2e8cce
     c->bswap_buf= bswap_buf;
381d37fd
     c->bswap16_buf = bswap16_buf;
42251a2a
 
364a1797
     c->try_8x8basis= try_8x8basis_c;
     c->add_8x8basis= add_8x8basis_c;
11f18faf
 
0a68cd87
     /* Vector helpers. */
     c->vector_clipf = vector_clipf_c;
88c0536a
     c->scalarproduct_int16 = scalarproduct_int16_c;
b1159ad9
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
6054cd25
     c->vector_clip_int32 = vector_clip_int32_c;
53b57211
 
9686abb8
     /* shrink[0] is a plain plane copy; the other entries are the
      * ff_shrink22/44/88 functions. */
     c->shrink[0]= av_image_copy_plane;
54009d42
     c->shrink[1]= ff_shrink22;
     c->shrink[2]= ff_shrink44;
     c->shrink[3]= ff_shrink88;
 
7ff1a4b1
     c->add_pixels8 = add_pixels8_c;
 
8dbe5856
 /* Redefine FUNC/FUNCC so the bit depth is passed explicitly and pasted
  * into the function name (f_<depth> / f_<depth>_c). */
 #undef FUNC
 #undef FUNCC
 #define FUNC(f, depth) f ## _ ## depth
 #define FUNCC(f, depth) f ## _ ## depth ## _c
 
8a523cfa
     /* These three always use the 8-bit variants, whatever the bit depth. */
     c->draw_edges                    = FUNCC(draw_edges, 8);
     c->clear_block                   = FUNCC(clear_block, 8);
     c->clear_blocks                  = FUNCC(clear_blocks, 8);
8dbe5856
 
619e0da1
 /* BIT_DEPTH_FUNCS(depth) installs the get_pixels variant for that depth. */
 #define BIT_DEPTH_FUNCS(depth) \
d4d186d1
     c->get_pixels                    = FUNCC(get_pixels,   depth);
19a0729b
 
a82beafd
     /* 9/10/12/14-bit content gets the 16-bit get_pixels. Otherwise the
      * 8-bit version is installed when bits_per_raw_sample <= 8 or the codec
      * is not video.
      * NOTE(review): for any other depth on a video codec, get_pixels is
      * not assigned by this switch - confirm that this is intended. */
     switch (avctx->bits_per_raw_sample) {
     case 9:
     case 10:
e8b9e1e2
     case 12:
     case 14:
4a88d81c
         BIT_DEPTH_FUNCS(16);
e8b9e1e2
         break;
a82beafd
     default:
71c689e8
         if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
4a88d81c
             BIT_DEPTH_FUNCS(8);
71c689e8
         }
a82beafd
         break;
8dbe5856
     }
 
 
67e6a9f5
     /* Architecture-specific init, called after all C defaults are set so
      * it can replace entries (presumably with optimized versions). The
      * conditions are build-time constants. */
     if (ARCH_ALPHA)
         ff_dsputil_init_alpha(c, avctx);
     if (ARCH_ARM)
         ff_dsputil_init_arm(c, avctx);
     if (ARCH_BFIN)
         ff_dsputil_init_bfin(c, avctx);
     if (ARCH_PPC)
         ff_dsputil_init_ppc(c, avctx);
     if (ARCH_SH4)
         ff_dsputil_init_sh4(c, avctx);
     if (HAVE_VIS)
         ff_dsputil_init_vis(c, avctx);
     if (ARCH_X86)
         ff_dsputil_init_x86(c, avctx);
43f1708f
 
92fb52d9
     /* Build the permutation table last, from whatever permutation type is
      * set once the architecture init functions have run. */
     ff_init_scantable_permutation(c->idct_permutation,
                                   c->idct_permutation_type);
57060b1e
 }
6cb20852
 
 /**
  * Thin wrapper: forwards both arguments unchanged to ff_dsputil_init().
  *
  * @param c     context to initialize
  * @param avctx codec context passed through to ff_dsputil_init()
  */
 av_cold void dsputil_init(DSPContext *c, AVCodecContext *avctx)
 {
     ff_dsputil_init(c, avctx);
 }
1e3104cd
 
 /**
  * Thin wrapper: forwards both arguments unchanged to ff_dsputil_init().
  *
  * @param c     context to initialize
  * @param avctx codec context passed through to ff_dsputil_init()
  */
 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
 {
     ff_dsputil_init(c, avctx);
 }