Browse code

huffyuvencdsp: move shared functions to a new lossless_videoencdsp context

Signed-off-by: James Almer <jamrial@gmail.com>

James Almer authored on 2017/01/08 11:13:48
Showing 17 changed files
... ...
@@ -2115,6 +2115,7 @@ CONFIG_EXTRA="
2115 2115
     libx262
2116 2116
     llauddsp
2117 2117
     llviddsp
2118
+    llvidencdsp
2118 2119
     lpc
2119 2120
     lzf
2120 2121
     me_cmp
... ...
@@ -2366,7 +2367,7 @@ amv_decoder_select="sp5x_decoder exif"
2366 2366
 amv_encoder_select="aandcttables jpegtables mpegvideoenc"
2367 2367
 ape_decoder_select="bswapdsp llauddsp"
2368 2368
 apng_decoder_select="zlib"
2369
-apng_encoder_select="huffyuvencdsp zlib"
2369
+apng_encoder_select="llvidencdsp zlib"
2370 2370
 asv1_decoder_select="blockdsp bswapdsp idctdsp"
2371 2371
 asv1_encoder_select="bswapdsp fdctdsp pixblockdsp"
2372 2372
 asv2_decoder_select="blockdsp bswapdsp idctdsp"
... ...
@@ -2430,7 +2431,7 @@ hap_encoder_deps="libsnappy"
2430 2430
 hap_encoder_select="texturedspenc"
2431 2431
 hevc_decoder_select="bswapdsp cabac golomb videodsp"
2432 2432
 huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
2433
-huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp"
2433
+huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
2434 2434
 iac_decoder_select="imc_decoder"
2435 2435
 imc_decoder_select="bswapdsp fft mdct sinewin"
2436 2436
 indeo3_decoder_select="hpeldsp"
... ...
@@ -2491,7 +2492,7 @@ on2avc_decoder_select="mdct"
2491 2491
 opus_decoder_deps="swresample"
2492 2492
 opus_decoder_select="imdct15"
2493 2493
 png_decoder_select="zlib"
2494
-png_encoder_select="huffyuvencdsp zlib"
2494
+png_encoder_select="llvidencdsp zlib"
2495 2495
 prores_decoder_select="blockdsp idctdsp"
2496 2496
 prores_encoder_select="fdctdsp"
2497 2497
 qcelp_decoder_select="lsp"
... ...
@@ -2534,7 +2535,7 @@ tscc_decoder_select="zlib"
2534 2534
 twinvq_decoder_select="mdct lsp sinewin"
2535 2535
 txd_decoder_select="texturedsp"
2536 2536
 utvideo_decoder_select="bswapdsp llviddsp"
2537
-utvideo_encoder_select="bswapdsp huffman huffyuvencdsp"
2537
+utvideo_encoder_select="bswapdsp huffman llvidencdsp"
2538 2538
 vble_decoder_select="llviddsp"
2539 2539
 vc1_decoder_select="blockdsp h263_decoder h264qpel intrax8 mpegvideo vc1dsp"
2540 2540
 vc1_qsv_decoder_deps="libmfx"
... ...
@@ -91,6 +91,7 @@ OBJS-$(CONFIG_JPEGTABLES)              += jpegtables.o
91 91
 OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
92 92
 OBJS-$(CONFIG_LLAUDDSP)                += lossless_audiodsp.o
93 93
 OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
94
+OBJS-$(CONFIG_LLVIDENCDSP)             += lossless_videoencdsp.o
94 95
 OBJS-$(CONFIG_LPC)                     += lpc.o
95 96
 OBJS-$(CONFIG_LSP)                     += lsp.o
96 97
 OBJS-$(CONFIG_LZF)                     += lzf.o
... ...
@@ -38,6 +38,7 @@
38 38
 #include "huffyuvencdsp.h"
39 39
 #include "put_bits.h"
40 40
 #include "lossless_videodsp.h"
41
+#include "lossless_videoencdsp.h"
41 42
 
42 43
 #define VLC_BITS 12
43 44
 
... ...
@@ -89,6 +90,7 @@ typedef struct HYuvContext {
89 89
     HuffYUVDSPContext hdsp;
90 90
     HuffYUVEncDSPContext hencdsp;
91 91
     LLVidDSPContext llviddsp;
92
+    LLVidEncDSPContext llvidencdsp;
92 93
     int non_determ; // non-deterministic, multi-threaded encoder allowed
93 94
 } HYuvContext;
94 95
 
... ...
@@ -33,6 +33,7 @@
33 33
 #include "huffman.h"
34 34
 #include "huffyuvencdsp.h"
35 35
 #include "internal.h"
36
+#include "lossless_videoencdsp.h"
36 37
 #include "put_bits.h"
37 38
 #include "libavutil/opt.h"
38 39
 #include "libavutil/pixdesc.h"
... ...
@@ -41,7 +42,7 @@ static inline void diff_bytes(HYuvContext *s, uint8_t *dst,
41 41
                               const uint8_t *src0, const uint8_t *src1, int w)
42 42
 {
43 43
     if (s->bps <= 8) {
44
-        s->hencdsp.diff_bytes(dst, src0, src1, w);
44
+        s->llvidencdsp.diff_bytes(dst, src0, src1, w);
45 45
     } else {
46 46
         s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
47 47
     }
... ...
@@ -65,7 +66,7 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
65 65
                 dst[i] = temp - left;
66 66
                 left   = temp;
67 67
             }
68
-            s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
68
+            s->llvidencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
69 69
             return src[w-1];
70 70
         }
71 71
     } else {
... ...
@@ -117,7 +118,7 @@ static inline void sub_left_prediction_bgr32(HYuvContext *s, uint8_t *dst,
117 117
         a = at;
118 118
     }
119 119
 
120
-    s->hencdsp.diff_bytes(dst + 16, src + 16, src + 12, w * 4 - 16);
120
+    s->llvidencdsp.diff_bytes(dst + 16, src + 16, src + 12, w * 4 - 16);
121 121
 
122 122
     *red   = src[(w - 1) * 4 + R];
123 123
     *green = src[(w - 1) * 4 + G];
... ...
@@ -146,7 +147,7 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
146 146
         b = bt;
147 147
     }
148 148
 
149
-    s->hencdsp.diff_bytes(dst + 48, src + 48, src + 48 - 3, w * 3 - 48);
149
+    s->llvidencdsp.diff_bytes(dst + 48, src + 48, src + 48 - 3, w * 3 - 48);
150 150
 
151 151
     *red   = src[(w - 1) * 3 + 0];
152 152
     *green = src[(w - 1) * 3 + 1];
... ...
@@ -156,7 +157,7 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
156 156
 static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
157 157
 {
158 158
     if (s->bps <= 8) {
159
-        s->hencdsp.sub_hfyu_median_pred(dst, src1, src2, w , left, left_top);
159
+        s->llvidencdsp.sub_median_pred(dst, src1, src2, w , left, left_top);
160 160
     } else {
161 161
         s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
162 162
     }
... ...
@@ -218,6 +219,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
218 218
 
219 219
     ff_huffyuv_common_init(avctx);
220 220
     ff_huffyuvencdsp_init(&s->hencdsp, avctx);
221
+    ff_llvidencdsp_init(&s->llvidencdsp);
221 222
 
222 223
     avctx->extradata = av_mallocz(3*MAX_N + 4);
223 224
     if (s->flags&AV_CODEC_FLAG_PASS1) {
... ...
@@ -823,9 +825,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
823 823
             lefttopy = p->data[0][3];
824 824
             lefttopu = p->data[1][1];
825 825
             lefttopv = p->data[2][1];
826
-            s->hencdsp.sub_hfyu_median_pred(s->temp[0], p->data[0] + 4, p->data[0] + fake_ystride + 4, width  - 4, &lefty, &lefttopy);
827
-            s->hencdsp.sub_hfyu_median_pred(s->temp[1], p->data[1] + 2, p->data[1] + fake_ustride + 2, width2 - 2, &leftu, &lefttopu);
828
-            s->hencdsp.sub_hfyu_median_pred(s->temp[2], p->data[2] + 2, p->data[2] + fake_vstride + 2, width2 - 2, &leftv, &lefttopv);
826
+            s->llvidencdsp.sub_median_pred(s->temp[0], p->data[0] + 4, p->data[0] + fake_ystride + 4, width  - 4, &lefty, &lefttopy);
827
+            s->llvidencdsp.sub_median_pred(s->temp[1], p->data[1] + 2, p->data[1] + fake_ustride + 2, width2 - 2, &leftu, &lefttopu);
828
+            s->llvidencdsp.sub_median_pred(s->temp[2], p->data[2] + 2, p->data[2] + fake_vstride + 2, width2 - 2, &leftv, &lefttopv);
829 829
             encode_422_bitstream(s, 0, width - 4);
830 830
             y++; cy++;
831 831
 
... ...
@@ -835,7 +837,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
835 835
                 if (s->bitstream_bpp == 12) {
836 836
                     while (2 * cy > y) {
837 837
                         ydst = p->data[0] + p->linesize[0] * y;
838
-                        s->hencdsp.sub_hfyu_median_pred(s->temp[0], ydst - fake_ystride, ydst, width, &lefty, &lefttopy);
838
+                        s->llvidencdsp.sub_median_pred(s->temp[0], ydst - fake_ystride, ydst, width, &lefty, &lefttopy);
839 839
                         encode_gray_bitstream(s, width);
840 840
                         y++;
841 841
                     }
... ...
@@ -845,9 +847,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
845 845
                 udst = p->data[1] + p->linesize[1] * cy;
846 846
                 vdst = p->data[2] + p->linesize[2] * cy;
847 847
 
848
-                s->hencdsp.sub_hfyu_median_pred(s->temp[0], ydst - fake_ystride, ydst, width,  &lefty, &lefttopy);
849
-                s->hencdsp.sub_hfyu_median_pred(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
850
-                s->hencdsp.sub_hfyu_median_pred(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
848
+                s->llvidencdsp.sub_median_pred(s->temp[0], ydst - fake_ystride, ydst, width,  &lefty, &lefttopy);
849
+                s->llvidencdsp.sub_median_pred(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
850
+                s->llvidencdsp.sub_median_pred(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
851 851
 
852 852
                 encode_422_bitstream(s, 0, width);
853 853
             }
... ...
@@ -860,7 +862,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
860 860
                     ydst = p->data[0] + p->linesize[0] * y;
861 861
 
862 862
                     if (s->predictor == PLANE && s->interlaced < y) {
863
-                        s->hencdsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
863
+                        s->llvidencdsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
864 864
 
865 865
                         lefty = sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
866 866
                     } else {
... ...
@@ -876,9 +878,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
876 876
                 vdst = p->data[2] + p->linesize[2] * cy;
877 877
 
878 878
                 if (s->predictor == PLANE && s->interlaced < cy) {
879
-                    s->hencdsp.diff_bytes(s->temp[1],          ydst, ydst - fake_ystride, width);
880
-                    s->hencdsp.diff_bytes(s->temp[2],          udst, udst - fake_ustride, width2);
881
-                    s->hencdsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
879
+                    s->llvidencdsp.diff_bytes(s->temp[1],          ydst, ydst - fake_ystride, width);
880
+                    s->llvidencdsp.diff_bytes(s->temp[2],          udst, udst - fake_ustride, width2);
881
+                    s->llvidencdsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
882 882
 
883 883
                     lefty = sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
884 884
                     leftu = sub_left_prediction(s, s->temp[1], s->temp[2], width2, leftu);
... ...
@@ -911,7 +913,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
911 911
         for (y = 1; y < s->height; y++) {
912 912
             uint8_t *dst = data + y*stride;
913 913
             if (s->predictor == PLANE && s->interlaced < y) {
914
-                s->hencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride, width * 4);
914
+                s->llvidencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride, width * 4);
915 915
                 sub_left_prediction_bgr32(s, s->temp[0], s->temp[1], width,
916 916
                                           &leftr, &leftg, &leftb, &lefta);
917 917
             } else {
... ...
@@ -939,7 +941,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
939 939
         for (y = 1; y < s->height; y++) {
940 940
             uint8_t *dst = data + y * stride;
941 941
             if (s->predictor == PLANE && s->interlaced < y) {
942
-                s->hencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride,
942
+                s->llvidencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride,
943 943
                                       width * 3);
944 944
                 sub_left_prediction_rgb24(s, s->temp[0], s->temp[1], width,
945 945
                                           &leftr, &leftg, &leftb);
... ...
@@ -21,38 +21,6 @@
21 21
 #include "huffyuvencdsp.h"
22 22
 #include "mathops.h"
23 23
 
24
-// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
25
-#define pb_7f (~0UL / 255 * 0x7f)
26
-#define pb_80 (~0UL / 255 * 0x80)
27
-
28
-static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w)
29
-{
30
-    long i;
31
-
32
-#if !HAVE_FAST_UNALIGNED
33
-    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
34
-        for (i = 0; i + 7 < w; i += 8) {
35
-            dst[i + 0] = src1[i + 0] - src2[i + 0];
36
-            dst[i + 1] = src1[i + 1] - src2[i + 1];
37
-            dst[i + 2] = src1[i + 2] - src2[i + 2];
38
-            dst[i + 3] = src1[i + 3] - src2[i + 3];
39
-            dst[i + 4] = src1[i + 4] - src2[i + 4];
40
-            dst[i + 5] = src1[i + 5] - src2[i + 5];
41
-            dst[i + 6] = src1[i + 6] - src2[i + 6];
42
-            dst[i + 7] = src1[i + 7] - src2[i + 7];
43
-        }
44
-    } else
45
-#endif
46
-    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
47
-        long a = *(long *) (src1 + i);
48
-        long b = *(long *) (src2 + i);
49
-        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
50
-                              ((a ^ b ^ pb_80) & pb_80);
51
-    }
52
-    for (; i < w; i++)
53
-        dst[i + 0] = src1[i + 0] - src2[i + 0];
54
-}
55
-
56 24
 static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
57 25
     long i;
58 26
 #if !HAVE_FAST_UNALIGNED
... ...
@@ -79,27 +47,6 @@ static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *sr
79 79
         dst[i] = (src1[i] - src2[i]) & mask;
80 80
 }
81 81
 
82
-static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
83
-                                   const uint8_t *src2, intptr_t w,
84
-                                   int *left, int *left_top)
85
-{
86
-    int i;
87
-    uint8_t l, lt;
88
-
89
-    l  = *left;
90
-    lt = *left_top;
91
-
92
-    for (i = 0; i < w; i++) {
93
-        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
94
-        lt     = src1[i];
95
-        l      = src2[i];
96
-        dst[i] = l - pred;
97
-    }
98
-
99
-    *left     = l;
100
-    *left_top = lt;
101
-}
102
-
103 82
 static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
104 83
     int i;
105 84
     uint16_t l, lt;
... ...
@@ -120,9 +67,7 @@ static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, co
120 120
 
121 121
 av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
122 122
 {
123
-    c->diff_bytes           = diff_bytes_c;
124 123
     c->diff_int16           = diff_int16_c;
125
-    c->sub_hfyu_median_pred = sub_hfyu_median_pred_c;
126 124
     c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
127 125
 
128 126
     if (ARCH_X86)
... ...
@@ -24,22 +24,11 @@
24 24
 #include "avcodec.h"
25 25
 
26 26
 typedef struct HuffYUVEncDSPContext {
27
-    void (*diff_bytes)(uint8_t *dst /* align 16 */,
28
-                       const uint8_t *src1 /* align 16 */,
29
-                       const uint8_t *src2 /* align 1 */,
30
-                       intptr_t w);
31 27
     void (*diff_int16)(uint16_t *dst /* align 16 */,
32 28
                        const uint16_t *src1 /* align 16 */,
33 29
                        const uint16_t *src2 /* align 1 */,
34 30
                        unsigned mask, int w);
35 31
 
36
-    /**
37
-     * Subtract HuffYUV's variant of median prediction.
38
-     * Note, this might read from src1[-1], src2[-1].
39
-     */
40
-    void (*sub_hfyu_median_pred)(uint8_t *dst, const uint8_t *src1,
41
-                                 const uint8_t *src2, intptr_t w,
42
-                                 int *left, int *left_top);
43 32
     void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1,
44 33
                                        const uint16_t *src2, unsigned mask,
45 34
                                        int w, int *left, int *left_top);
46 35
new file mode 100644
... ...
@@ -0,0 +1,84 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#include "config.h"
19
+#include "libavutil/attributes.h"
20
+#include "lossless_videoencdsp.h"
21
+#include "mathops.h"
22
+
23
+// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
24
+#define pb_7f (~0UL / 255 * 0x7f)
25
+#define pb_80 (~0UL / 255 * 0x80)
26
+
27
+static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w)
28
+{
29
+    long i;
30
+
31
+#if !HAVE_FAST_UNALIGNED
32
+    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
33
+        for (i = 0; i + 7 < w; i += 8) {
34
+            dst[i + 0] = src1[i + 0] - src2[i + 0];
35
+            dst[i + 1] = src1[i + 1] - src2[i + 1];
36
+            dst[i + 2] = src1[i + 2] - src2[i + 2];
37
+            dst[i + 3] = src1[i + 3] - src2[i + 3];
38
+            dst[i + 4] = src1[i + 4] - src2[i + 4];
39
+            dst[i + 5] = src1[i + 5] - src2[i + 5];
40
+            dst[i + 6] = src1[i + 6] - src2[i + 6];
41
+            dst[i + 7] = src1[i + 7] - src2[i + 7];
42
+        }
43
+    } else
44
+#endif
45
+    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
46
+        long a = *(long *) (src1 + i);
47
+        long b = *(long *) (src2 + i);
48
+        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
49
+                              ((a ^ b ^ pb_80) & pb_80);
50
+    }
51
+    for (; i < w; i++)
52
+        dst[i + 0] = src1[i + 0] - src2[i + 0];
53
+}
54
+
55
+static void sub_median_pred_c(uint8_t *dst, const uint8_t *src1,
56
+                              const uint8_t *src2, intptr_t w,
57
+                              int *left, int *left_top)
58
+{
59
+    int i;
60
+    uint8_t l, lt;
61
+
62
+    l  = *left;
63
+    lt = *left_top;
64
+
65
+    for (i = 0; i < w; i++) {
66
+        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
67
+        lt     = src1[i];
68
+        l      = src2[i];
69
+        dst[i] = l - pred;
70
+    }
71
+
72
+    *left     = l;
73
+    *left_top = lt;
74
+}
75
+
76
+av_cold void ff_llvidencdsp_init(LLVidEncDSPContext *c)
77
+{
78
+    c->diff_bytes      = diff_bytes_c;
79
+    c->sub_median_pred = sub_median_pred_c;
80
+
81
+    if (ARCH_X86)
82
+        ff_llvidencdsp_init_x86(c);
83
+}
0 84
new file mode 100644
... ...
@@ -0,0 +1,41 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#ifndef AVCODEC_LOSSLESS_VIDEOENCDSP_H
19
+#define AVCODEC_LOSSLESS_VIDEOENCDSP_H
20
+
21
+#include <stdint.h>
22
+
23
+typedef struct LLVidEncDSPContext {
24
+    void (*diff_bytes)(uint8_t *dst /* align 16 */,
25
+                       const uint8_t *src1 /* align 16 */,
26
+                       const uint8_t *src2 /* align 1 */,
27
+                       intptr_t w);
28
+    /**
29
+     * Subtract HuffYUV's variant of median prediction.
30
+     * Note, this might read from src1[-1], src2[-1].
31
+     */
32
+    void (*sub_median_pred)(uint8_t *dst, const uint8_t *src1,
33
+                            const uint8_t *src2, intptr_t w,
34
+                            int *left, int *left_top);
35
+} LLVidEncDSPContext;
36
+
37
+void ff_llvidencdsp_init(LLVidEncDSPContext *c);
38
+void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c);
39
+
40
+#endif /* AVCODEC_LOSSLESS_VIDEOENCDSP_H */
... ...
@@ -22,7 +22,7 @@
22 22
 #include "avcodec.h"
23 23
 #include "internal.h"
24 24
 #include "bytestream.h"
25
-#include "huffyuvencdsp.h"
25
+#include "lossless_videoencdsp.h"
26 26
 #include "png.h"
27 27
 #include "apng.h"
28 28
 
... ...
@@ -47,7 +47,7 @@ typedef struct APNGFctlChunk {
47 47
 
48 48
 typedef struct PNGEncContext {
49 49
     AVClass *class;
50
-    HuffYUVEncDSPContext hdsp;
50
+    LLVidEncDSPContext llvidencdsp;
51 51
 
52 52
     uint8_t *bytestream;
53 53
     uint8_t *bytestream_start;
... ...
@@ -159,7 +159,7 @@ static void sub_left_prediction(PNGEncContext *c, uint8_t *dst, const uint8_t *s
159 159
     for (x = 0; x < unaligned_w; x++)
160 160
         *dst++ = *src1++ - *src2++;
161 161
     size -= unaligned_w;
162
-    c->hdsp.diff_bytes(dst, src1, src2, size);
162
+    c->llvidencdsp.diff_bytes(dst, src1, src2, size);
163 163
 }
164 164
 
165 165
 static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
... ...
@@ -175,7 +175,7 @@ static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
175 175
         sub_left_prediction(c, dst, src, bpp, size);
176 176
         break;
177 177
     case PNG_FILTER_VALUE_UP:
178
-        c->hdsp.diff_bytes(dst, src, top, size);
178
+        c->llvidencdsp.diff_bytes(dst, src, top, size);
179 179
         break;
180 180
     case PNG_FILTER_VALUE_AVG:
181 181
         for (i = 0; i < bpp; i++)
... ...
@@ -1015,7 +1015,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
1015 1015
 FF_ENABLE_DEPRECATION_WARNINGS
1016 1016
 #endif
1017 1017
 
1018
-    ff_huffyuvencdsp_init(&s->hdsp, avctx);
1018
+    ff_llvidencdsp_init(&s->llvidencdsp);
1019 1019
 
1020 1020
 #if FF_API_PRIVATE_OPT
1021 1021
 FF_DISABLE_DEPRECATION_WARNINGS
... ...
@@ -30,8 +30,8 @@
30 30
 #include "libavutil/common.h"
31 31
 #include "avcodec.h"
32 32
 #include "bswapdsp.h"
33
-#include "huffyuvencdsp.h"
34 33
 #include "lossless_videodsp.h"
34
+#include "lossless_videoencdsp.h"
35 35
 
36 36
 enum {
37 37
     PRED_NONE = 0,
... ...
@@ -70,8 +70,8 @@ typedef struct UtvideoContext {
70 70
     const AVClass *class;
71 71
     AVCodecContext *avctx;
72 72
     BswapDSPContext bdsp;
73
-    HuffYUVEncDSPContext hdsp;
74 73
     LLVidDSPContext llviddsp;
74
+    LLVidEncDSPContext llvidencdsp;
75 75
 
76 76
     uint32_t frame_info_size, flags, frame_info;
77 77
     int      planes;
... ...
@@ -33,7 +33,6 @@
33 33
 #include "bswapdsp.h"
34 34
 #include "bytestream.h"
35 35
 #include "put_bits.h"
36
-#include "huffyuvencdsp.h"
37 36
 #include "mathops.h"
38 37
 #include "utvideo.h"
39 38
 #include "huffman.h"
... ...
@@ -120,7 +119,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
120 120
     }
121 121
 
122 122
     ff_bswapdsp_init(&c->bdsp);
123
-    ff_huffyuvencdsp_init(&c->hdsp, avctx);
123
+    ff_llvidencdsp_init(&c->llvidencdsp);
124 124
 
125 125
 #if FF_API_PRIVATE_OPT
126 126
 FF_DISABLE_DEPRECATION_WARNINGS
... ...
@@ -324,7 +323,7 @@ static void median_predict(UtvideoContext *c, uint8_t *src, uint8_t *dst, int st
324 324
 
325 325
     /* Rest of the coded part uses median prediction */
326 326
     for (j = 1; j < height; j++) {
327
-        c->hdsp.sub_hfyu_median_pred(dst, src - stride, src, width, &A, &B);
327
+        c->llvidencdsp.sub_median_pred(dst, src - stride, src, width, &A, &B);
328 328
         dst += width;
329 329
         src += stride;
330 330
     }
... ...
@@ -20,8 +20,9 @@ OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
20 20
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
21 21
 OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
22 22
 OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
23
+OBJS-$(CONFIG_LLVIDENCDSP)             += x86/lossless_videoencdsp_init.o
23 24
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
24
-OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
25
+OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
25 26
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
26 27
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
27 28
 OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
... ...
@@ -114,6 +115,7 @@ YASM-OBJS-$(CONFIG_HUFFYUVENCDSP)      += x86/huffyuvencdsp.o
114 114
 YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
115 115
 YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
116 116
 YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
117
+YASM-OBJS-$(CONFIG_LLVIDENCDSP)        += x86/lossless_videoencdsp.o
117 118
 YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
118 119
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
119 120
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
... ...
@@ -27,128 +27,8 @@
27 27
 
28 28
 section .text
29 29
 
30
-; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
31
-;                    intptr_t w);
32
-%macro DIFF_BYTES_PROLOGUE 0
33
-%if ARCH_X86_32
34
-cglobal diff_bytes, 3,5,2, dst, src1, src2
35
-%define wq r4q
36
-    DECLARE_REG_TMP 3
37
-    mov               wq, r3mp
38
-%else
39
-cglobal diff_bytes, 4,5,2, dst, src1, src2, w
40
-    DECLARE_REG_TMP 4
41
-%endif ; ARCH_X86_32
42
-%define i t0q
43
-%endmacro
44
-
45
-; label to jump to if w < regsize
46
-%macro DIFF_BYTES_LOOP_PREP 1
47
-    mov                i, wq
48
-    and                i, -2 * regsize
49
-        jz            %1
50
-    add             dstq, i
51
-    add            src1q, i
52
-    add            src2q, i
53
-    neg                i
54
-%endmacro
55
-
56
-; mov type used for src1q, dstq, first reg, second reg
57
-%macro DIFF_BYTES_LOOP_CORE 4
58
-%if mmsize != 16
59
-    mov%1             %3, [src1q + i]
60
-    mov%1             %4, [src1q + i + regsize]
61
-    psubb             %3, [src2q + i]
62
-    psubb             %4, [src2q + i + regsize]
63
-    mov%2           [dstq + i], %3
64
-    mov%2 [regsize + dstq + i], %4
65
-%else
66
-    ; SSE enforces alignment of psubb operand
67
-    mov%1             %3, [src1q + i]
68
-    movu              %4, [src2q + i]
69
-    psubb             %3, %4
70
-    mov%2     [dstq + i], %3
71
-    mov%1             %3, [src1q + i + regsize]
72
-    movu              %4, [src2q + i + regsize]
73
-    psubb             %3, %4
74
-    mov%2 [regsize + dstq + i], %3
75
-%endif
76
-%endmacro
77
-
78
-%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
79
-    %define regsize mmsize
80
-.loop_%1%2:
81
-    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
82
-    add                i, 2 * regsize
83
-        jl    .loop_%1%2
84
-.skip_main_%1%2:
85
-    and               wq, 2 * regsize - 1
86
-        jz     .end_%1%2
87
-%if mmsize > 16
88
-    ; fall back to narrower xmm
89
-    %define regsize mmsize / 2
90
-    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
91
-.loop2_%1%2:
92
-    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
93
-    add                i, 2 * regsize
94
-        jl   .loop2_%1%2
95
-.setup_loop_gpr_%1%2:
96
-    and               wq, 2 * regsize - 1
97
-        jz     .end_%1%2
98
-%endif
99
-    add             dstq, wq
100
-    add            src1q, wq
101
-    add            src2q, wq
102
-    neg               wq
103
-.loop_gpr_%1%2:
104
-    mov              t0b, [src1q + wq]
105
-    sub              t0b, [src2q + wq]
106
-    mov      [dstq + wq], t0b
107
-    inc               wq
108
-        jl .loop_gpr_%1%2
109
-.end_%1%2:
110
-    REP_RET
111
-%endmacro
112
-
113
-%if ARCH_X86_32
114
-INIT_MMX mmx
115
-DIFF_BYTES_PROLOGUE
116
-    %define regsize mmsize
117
-    DIFF_BYTES_LOOP_PREP .skip_main_aa
118
-    DIFF_BYTES_BODY    a, a
119
-%undef i
120
-%endif
121
-
122
-INIT_XMM sse2
123
-DIFF_BYTES_PROLOGUE
124
-    %define regsize mmsize
125
-    DIFF_BYTES_LOOP_PREP .skip_main_aa
126
-    test            dstq, regsize - 1
127
-        jnz     .loop_uu
128
-    test           src1q, regsize - 1
129
-        jnz     .loop_ua
130
-    DIFF_BYTES_BODY    a, a
131
-    DIFF_BYTES_BODY    u, a
132
-    DIFF_BYTES_BODY    u, u
133
-%undef i
134
-
135
-%if HAVE_AVX2_EXTERNAL
136
-INIT_YMM avx2
137
-DIFF_BYTES_PROLOGUE
138
-    %define regsize mmsize
139
-    ; Directly using unaligned SSE2 version is marginally faster than
140
-    ; branching based on arguments.
141
-    DIFF_BYTES_LOOP_PREP .skip_main_uu
142
-    test            dstq, regsize - 1
143
-        jnz     .loop_uu
144
-    test           src1q, regsize - 1
145
-        jnz     .loop_ua
146
-    DIFF_BYTES_BODY    a, a
147
-    DIFF_BYTES_BODY    u, a
148
-    DIFF_BYTES_BODY    u, u
149
-%undef i
150
-%endif
151
-
30
+; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
31
+;                    unsigned mask, int w);
152 32
 %macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
153 33
     movd    m4, maskd
154 34
     SPLATW  m4, m4
155 35
new file mode 100644
... ...
@@ -0,0 +1,54 @@
0
+/*
1
+ * SIMD-optimized HuffYUV encoding functions
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+ *
7
+ * This file is part of FFmpeg.
8
+ *
9
+ * FFmpeg is free software; you can redistribute it and/or
10
+ * modify it under the terms of the GNU Lesser General Public
11
+ * License as published by the Free Software Foundation; either
12
+ * version 2.1 of the License, or (at your option) any later version.
13
+ *
14
+ * FFmpeg is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+ * Lesser General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public
20
+ * License along with FFmpeg; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ */
23
+
24
+#include "libavutil/attributes.h"
25
+#include "libavutil/cpu.h"
26
+#include "libavutil/pixdesc.h"
27
+#include "libavutil/x86/cpu.h"
28
+#include "libavcodec/huffyuvencdsp.h"
29
+
30
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
31
+                        unsigned mask, int w);
32
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
33
+                        unsigned mask, int w);
34
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
35
+                                          unsigned mask, int w, int *left, int *left_top);
36
+
37
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
38
+{
39
+    av_unused int cpu_flags = av_get_cpu_flags();
40
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
41
+
42
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
43
+        c->diff_int16 = ff_diff_int16_mmx;
44
+    }
45
+
46
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
47
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
48
+    }
49
+
50
+    if (EXTERNAL_SSE2(cpu_flags)) {
51
+        c->diff_int16 = ff_diff_int16_sse2;
52
+    }
53
+}
0 54
deleted file mode 100644
... ...
@@ -1,118 +0,0 @@
1
-/*
2
- * SIMD-optimized HuffYUV encoding functions
3
- * Copyright (c) 2000, 2001 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
- *
8
- * This file is part of FFmpeg.
9
- *
10
- * FFmpeg is free software; you can redistribute it and/or
11
- * modify it under the terms of the GNU Lesser General Public
12
- * License as published by the Free Software Foundation; either
13
- * version 2.1 of the License, or (at your option) any later version.
14
- *
15
- * FFmpeg is distributed in the hope that it will be useful,
16
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
- * Lesser General Public License for more details.
19
- *
20
- * You should have received a copy of the GNU Lesser General Public
21
- * License along with FFmpeg; if not, write to the Free Software
22
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
- */
24
-
25
-#include "libavutil/attributes.h"
26
-#include "libavutil/cpu.h"
27
-#include "libavutil/pixdesc.h"
28
-#include "libavutil/x86/asm.h"
29
-#include "libavutil/x86/cpu.h"
30
-#include "libavcodec/huffyuvencdsp.h"
31
-#include "libavcodec/mathops.h"
32
-
33
-void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
34
-                       intptr_t w);
35
-void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
36
-                        intptr_t w);
37
-void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
38
-                        intptr_t w);
39
-void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
40
-                        unsigned mask, int w);
41
-void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
42
-                        unsigned mask, int w);
43
-void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
44
-                                          unsigned mask, int w, int *left, int *left_top);
45
-
46
-#if HAVE_INLINE_ASM
47
-
48
-static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
49
-                                        const uint8_t *src2, intptr_t w,
50
-                                        int *left, int *left_top)
51
-{
52
-    x86_reg i = 0;
53
-    uint8_t l, lt;
54
-
55
-    __asm__ volatile (
56
-        "movq  (%1, %0), %%mm0          \n\t" // LT
57
-        "psllq $8, %%mm0                \n\t"
58
-        "1:                             \n\t"
59
-        "movq  (%1, %0), %%mm1          \n\t" // T
60
-        "movq  -1(%2, %0), %%mm2        \n\t" // L
61
-        "movq  (%2, %0), %%mm3          \n\t" // X
62
-        "movq %%mm2, %%mm4              \n\t" // L
63
-        "psubb %%mm0, %%mm2             \n\t"
64
-        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
65
-        "movq %%mm4, %%mm5              \n\t" // L
66
-        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
67
-        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
68
-        "pminub %%mm2, %%mm4            \n\t"
69
-        "pmaxub %%mm1, %%mm4            \n\t"
70
-        "psubb %%mm4, %%mm3             \n\t" // dst - pred
71
-        "movq %%mm3, (%3, %0)           \n\t"
72
-        "add $8, %0                     \n\t"
73
-        "movq -1(%1, %0), %%mm0         \n\t" // LT
74
-        "cmp %4, %0                     \n\t"
75
-        " jb 1b                         \n\t"
76
-        : "+r" (i)
77
-        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));
78
-
79
-    l  = *left;
80
-    lt = *left_top;
81
-
82
-    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);
83
-
84
-    *left_top = src1[w - 1];
85
-    *left     = src2[w - 1];
86
-}
87
-
88
-#endif /* HAVE_INLINE_ASM */
89
-
90
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
91
-{
92
-    av_unused int cpu_flags = av_get_cpu_flags();
93
-    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
94
-
95
-    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
96
-        c->diff_bytes = ff_diff_bytes_mmx;
97
-        c->diff_int16 = ff_diff_int16_mmx;
98
-    }
99
-
100
-#if HAVE_INLINE_ASM
101
-    if (INLINE_MMXEXT(cpu_flags)) {
102
-        c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
103
-    }
104
-#endif /* HAVE_INLINE_ASM */
105
-
106
-    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
107
-        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
108
-    }
109
-
110
-    if (EXTERNAL_SSE2(cpu_flags)) {
111
-        c->diff_bytes = ff_diff_bytes_sse2;
112
-        c->diff_int16 = ff_diff_int16_sse2;
113
-    }
114
-
115
-    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
116
-        c->diff_bytes = ff_diff_bytes_avx2;
117
-    }
118
-}
119 1
new file mode 100644
... ...
@@ -0,0 +1,150 @@
0
+;******************************************************************************
1
+;* SIMD-optimized lossless video encoding functions
2
+;* Copyright (c) 2000, 2001 Fabrice Bellard
3
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+;*
5
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
7
+;*
8
+;* This file is part of FFmpeg.
9
+;*
10
+;* FFmpeg is free software; you can redistribute it and/or
11
+;* modify it under the terms of the GNU Lesser General Public
12
+;* License as published by the Free Software Foundation; either
13
+;* version 2.1 of the License, or (at your option) any later version.
14
+;*
15
+;* FFmpeg is distributed in the hope that it will be useful,
16
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
+;* Lesser General Public License for more details.
19
+;*
20
+;* You should have received a copy of the GNU Lesser General Public
21
+;* License along with FFmpeg; if not, write to the Free Software
22
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
+;******************************************************************************
24
+
25
+%include "libavutil/x86/x86util.asm"
26
+
27
+section .text
28
+
29
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
30
+;                    intptr_t w);
31
+%macro DIFF_BYTES_PROLOGUE 0
32
+%if ARCH_X86_32
33
+cglobal diff_bytes, 3,5,2, dst, src1, src2
34
+%define wq r4q
35
+    DECLARE_REG_TMP 3
36
+    mov               wq, r3mp
37
+%else
38
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
39
+    DECLARE_REG_TMP 4
40
+%endif ; ARCH_X86_32
41
+%define i t0q
42
+%endmacro
43
+
44
+; label to jump to if w < regsize
45
+%macro DIFF_BYTES_LOOP_PREP 1
46
+    mov                i, wq
47
+    and                i, -2 * regsize
48
+        jz            %1
49
+    add             dstq, i
50
+    add            src1q, i
51
+    add            src2q, i
52
+    neg                i
53
+%endmacro
54
+
55
+; mov type used for src1q, dstq, first reg, second reg
56
+%macro DIFF_BYTES_LOOP_CORE 4
57
+%if mmsize != 16
58
+    mov%1             %3, [src1q + i]
59
+    mov%1             %4, [src1q + i + regsize]
60
+    psubb             %3, [src2q + i]
61
+    psubb             %4, [src2q + i + regsize]
62
+    mov%2           [dstq + i], %3
63
+    mov%2 [regsize + dstq + i], %4
64
+%else
65
+    ; SSE enforces alignment of psubb operand
66
+    mov%1             %3, [src1q + i]
67
+    movu              %4, [src2q + i]
68
+    psubb             %3, %4
69
+    mov%2     [dstq + i], %3
70
+    mov%1             %3, [src1q + i + regsize]
71
+    movu              %4, [src2q + i + regsize]
72
+    psubb             %3, %4
73
+    mov%2 [regsize + dstq + i], %3
74
+%endif
75
+%endmacro
76
+
77
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
78
+    %define regsize mmsize
79
+.loop_%1%2:
80
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
81
+    add                i, 2 * regsize
82
+        jl    .loop_%1%2
83
+.skip_main_%1%2:
84
+    and               wq, 2 * regsize - 1
85
+        jz     .end_%1%2
86
+%if mmsize > 16
87
+    ; fall back to narrower xmm
88
+    %define regsize mmsize / 2
89
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
90
+.loop2_%1%2:
91
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
92
+    add                i, 2 * regsize
93
+        jl   .loop2_%1%2
94
+.setup_loop_gpr_%1%2:
95
+    and               wq, 2 * regsize - 1
96
+        jz     .end_%1%2
97
+%endif
98
+    add             dstq, wq
99
+    add            src1q, wq
100
+    add            src2q, wq
101
+    neg               wq
102
+.loop_gpr_%1%2:
103
+    mov              t0b, [src1q + wq]
104
+    sub              t0b, [src2q + wq]
105
+    mov      [dstq + wq], t0b
106
+    inc               wq
107
+        jl .loop_gpr_%1%2
108
+.end_%1%2:
109
+    REP_RET
110
+%endmacro
111
+
112
+%if ARCH_X86_32
113
+INIT_MMX mmx
114
+DIFF_BYTES_PROLOGUE
115
+    %define regsize mmsize
116
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
117
+    DIFF_BYTES_BODY    a, a
118
+%undef i
119
+%endif
120
+
121
+INIT_XMM sse2
122
+DIFF_BYTES_PROLOGUE
123
+    %define regsize mmsize
124
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
125
+    test            dstq, regsize - 1
126
+        jnz     .loop_uu
127
+    test           src1q, regsize - 1
128
+        jnz     .loop_ua
129
+    DIFF_BYTES_BODY    a, a
130
+    DIFF_BYTES_BODY    u, a
131
+    DIFF_BYTES_BODY    u, u
132
+%undef i
133
+
134
+%if HAVE_AVX2_EXTERNAL
135
+INIT_YMM avx2
136
+DIFF_BYTES_PROLOGUE
137
+    %define regsize mmsize
138
+    ; Directly using unaligned SSE2 version is marginally faster than
139
+    ; branching based on arguments.
140
+    DIFF_BYTES_LOOP_PREP .skip_main_uu
141
+    test            dstq, regsize - 1
142
+        jnz     .loop_uu
143
+    test           src1q, regsize - 1
144
+        jnz     .loop_ua
145
+    DIFF_BYTES_BODY    a, a
146
+    DIFF_BYTES_BODY    u, a
147
+    DIFF_BYTES_BODY    u, u
148
+%undef i
149
+%endif
0 150
new file mode 100644
... ...
@@ -0,0 +1,104 @@
0
+/*
1
+ * SIMD-optimized lossless video encoding functions
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+ *
7
+ * This file is part of FFmpeg.
8
+ *
9
+ * FFmpeg is free software; you can redistribute it and/or
10
+ * modify it under the terms of the GNU Lesser General Public
11
+ * License as published by the Free Software Foundation; either
12
+ * version 2.1 of the License, or (at your option) any later version.
13
+ *
14
+ * FFmpeg is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+ * Lesser General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public
20
+ * License along with FFmpeg; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ */
23
+
24
+#include "libavutil/attributes.h"
25
+#include "libavutil/cpu.h"
26
+#include "libavutil/x86/asm.h"
27
+#include "libavutil/x86/cpu.h"
28
+#include "libavcodec/lossless_videoencdsp.h"
29
+#include "libavcodec/mathops.h"
30
+
31
+void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
32
+                       intptr_t w);
33
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
34
+                        intptr_t w);
35
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
36
+                        intptr_t w);
37
+
38
+#if HAVE_INLINE_ASM
39
+
40
+static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
41
+                                   const uint8_t *src2, intptr_t w,
42
+                                   int *left, int *left_top)
43
+{
44
+    x86_reg i = 0;
45
+    uint8_t l, lt;
46
+
47
+    __asm__ volatile (
48
+        "movq  (%1, %0), %%mm0          \n\t" // LT
49
+        "psllq $8, %%mm0                \n\t"
50
+        "1:                             \n\t"
51
+        "movq  (%1, %0), %%mm1          \n\t" // T
52
+        "movq  -1(%2, %0), %%mm2        \n\t" // L
53
+        "movq  (%2, %0), %%mm3          \n\t" // X
54
+        "movq %%mm2, %%mm4              \n\t" // L
55
+        "psubb %%mm0, %%mm2             \n\t"
56
+        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
57
+        "movq %%mm4, %%mm5              \n\t" // L
58
+        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
59
+        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
60
+        "pminub %%mm2, %%mm4            \n\t"
61
+        "pmaxub %%mm1, %%mm4            \n\t"
62
+        "psubb %%mm4, %%mm3             \n\t" // dst - pred
63
+        "movq %%mm3, (%3, %0)           \n\t"
64
+        "add $8, %0                     \n\t"
65
+        "movq -1(%1, %0), %%mm0         \n\t" // LT
66
+        "cmp %4, %0                     \n\t"
67
+        " jb 1b                         \n\t"
68
+        : "+r" (i)
69
+        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));
70
+
71
+    l  = *left;
72
+    lt = *left_top;
73
+
74
+    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);
75
+
76
+    *left_top = src1[w - 1];
77
+    *left     = src2[w - 1];
78
+}
79
+
80
+#endif /* HAVE_INLINE_ASM */
81
+
82
+av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
83
+{
84
+    av_unused int cpu_flags = av_get_cpu_flags();
85
+
86
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
87
+        c->diff_bytes = ff_diff_bytes_mmx;
88
+    }
89
+
90
+#if HAVE_INLINE_ASM
91
+    if (INLINE_MMXEXT(cpu_flags)) {
92
+        c->sub_median_pred = sub_median_pred_mmxext;
93
+    }
94
+#endif /* HAVE_INLINE_ASM */
95
+
96
+    if (EXTERNAL_SSE2(cpu_flags)) {
97
+        c->diff_bytes = ff_diff_bytes_sse2;
98
+    }
99
+
100
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
101
+        c->diff_bytes = ff_diff_bytes_avx2;
102
+    }
103
+}