Browse code

huffyuvencdsp: move shared functions to a new lossless_videoencdsp context

Signed-off-by: James Almer <jamrial@gmail.com>

James Almer authored on 2017/01/08 11:13:48
Showing 17 changed files
... ...
@@ -2115,6 +2115,7 @@ CONFIG_EXTRA="
2115 2115
     libx262
2116 2116
     llauddsp
2117 2117
     llviddsp
2118
+    llvidencdsp
2118 2119
     lpc
2119 2120
     lzf
2120 2121
     me_cmp
... ...
@@ -2366,7 +2367,7 @@ amv_decoder_select="sp5x_decoder exif"
2366 2366
 amv_encoder_select="aandcttables jpegtables mpegvideoenc"
2367 2367
 ape_decoder_select="bswapdsp llauddsp"
2368 2368
 apng_decoder_select="zlib"
2369
-apng_encoder_select="huffyuvencdsp zlib"
2369
+apng_encoder_select="llvidencdsp zlib"
2370 2370
 asv1_decoder_select="blockdsp bswapdsp idctdsp"
2371 2371
 asv1_encoder_select="bswapdsp fdctdsp pixblockdsp"
2372 2372
 asv2_decoder_select="blockdsp bswapdsp idctdsp"
... ...
@@ -2430,7 +2431,7 @@ hap_encoder_deps="libsnappy"
2430 2430
 hap_encoder_select="texturedspenc"
2431 2431
 hevc_decoder_select="bswapdsp cabac golomb videodsp"
2432 2432
 huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
2433
-huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp"
2433
+huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
2434 2434
 iac_decoder_select="imc_decoder"
2435 2435
 imc_decoder_select="bswapdsp fft mdct sinewin"
2436 2436
 indeo3_decoder_select="hpeldsp"
... ...
@@ -2491,7 +2492,7 @@ on2avc_decoder_select="mdct"
2491 2491
 opus_decoder_deps="swresample"
2492 2492
 opus_decoder_select="imdct15"
2493 2493
 png_decoder_select="zlib"
2494
-png_encoder_select="huffyuvencdsp zlib"
2494
+png_encoder_select="llvidencdsp zlib"
2495 2495
 prores_decoder_select="blockdsp idctdsp"
2496 2496
 prores_encoder_select="fdctdsp"
2497 2497
 qcelp_decoder_select="lsp"
... ...
@@ -2534,7 +2535,7 @@ tscc_decoder_select="zlib"
2534 2534
 twinvq_decoder_select="mdct lsp sinewin"
2535 2535
 txd_decoder_select="texturedsp"
2536 2536
 utvideo_decoder_select="bswapdsp llviddsp"
2537
-utvideo_encoder_select="bswapdsp huffman huffyuvencdsp"
2537
+utvideo_encoder_select="bswapdsp huffman llvidencdsp"
2538 2538
 vble_decoder_select="llviddsp"
2539 2539
 vc1_decoder_select="blockdsp h263_decoder h264qpel intrax8 mpegvideo vc1dsp"
2540 2540
 vc1_qsv_decoder_deps="libmfx"
... ...
@@ -91,6 +91,7 @@ OBJS-$(CONFIG_JPEGTABLES)              += jpegtables.o
91 91
 OBJS-$(CONFIG_LIBXVID)                 += libxvid_rc.o
92 92
 OBJS-$(CONFIG_LLAUDDSP)                += lossless_audiodsp.o
93 93
 OBJS-$(CONFIG_LLVIDDSP)                += lossless_videodsp.o
94
+OBJS-$(CONFIG_LLVIDENCDSP)             += lossless_videoencdsp.o
94 95
 OBJS-$(CONFIG_LPC)                     += lpc.o
95 96
 OBJS-$(CONFIG_LSP)                     += lsp.o
96 97
 OBJS-$(CONFIG_LZF)                     += lzf.o
... ...
@@ -38,6 +38,7 @@
38 38
 #include "huffyuvencdsp.h"
39 39
 #include "put_bits.h"
40 40
 #include "lossless_videodsp.h"
41
+#include "lossless_videoencdsp.h"
41 42
 
42 43
 #define VLC_BITS 12
43 44
 
... ...
@@ -89,6 +90,7 @@ typedef struct HYuvContext {
89 89
     HuffYUVDSPContext hdsp;
90 90
     HuffYUVEncDSPContext hencdsp;
91 91
     LLVidDSPContext llviddsp;
92
+    LLVidEncDSPContext llvidencdsp;
92 93
     int non_determ; // non-deterministic, multi-threaded encoder allowed
93 94
 } HYuvContext;
94 95
 
... ...
@@ -33,6 +33,7 @@
33 33
 #include "huffman.h"
34 34
 #include "huffyuvencdsp.h"
35 35
 #include "internal.h"
36
+#include "lossless_videoencdsp.h"
36 37
 #include "put_bits.h"
37 38
 #include "libavutil/opt.h"
38 39
 #include "libavutil/pixdesc.h"
... ...
@@ -41,7 +42,7 @@ static inline void diff_bytes(HYuvContext *s, uint8_t *dst,
41 41
                               const uint8_t *src0, const uint8_t *src1, int w)
42 42
 {
43 43
     if (s->bps <= 8) {
44
-        s->hencdsp.diff_bytes(dst, src0, src1, w);
44
+        s->llvidencdsp.diff_bytes(dst, src0, src1, w);
45 45
     } else {
46 46
         s->hencdsp.diff_int16((uint16_t *)dst, (const uint16_t *)src0, (const uint16_t *)src1, s->n - 1, w);
47 47
     }
... ...
@@ -65,7 +66,7 @@ static inline int sub_left_prediction(HYuvContext *s, uint8_t *dst,
65 65
                 dst[i] = temp - left;
66 66
                 left   = temp;
67 67
             }
68
-            s->hencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
68
+            s->llvidencdsp.diff_bytes(dst + 32, src + 32, src + 31, w - 32);
69 69
             return src[w-1];
70 70
         }
71 71
     } else {
... ...
@@ -117,7 +118,7 @@ static inline void sub_left_prediction_bgr32(HYuvContext *s, uint8_t *dst,
117 117
         a = at;
118 118
     }
119 119
 
120
-    s->hencdsp.diff_bytes(dst + 16, src + 16, src + 12, w * 4 - 16);
120
+    s->llvidencdsp.diff_bytes(dst + 16, src + 16, src + 12, w * 4 - 16);
121 121
 
122 122
     *red   = src[(w - 1) * 4 + R];
123 123
     *green = src[(w - 1) * 4 + G];
... ...
@@ -146,7 +147,7 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
146 146
         b = bt;
147 147
     }
148 148
 
149
-    s->hencdsp.diff_bytes(dst + 48, src + 48, src + 48 - 3, w * 3 - 48);
149
+    s->llvidencdsp.diff_bytes(dst + 48, src + 48, src + 48 - 3, w * 3 - 48);
150 150
 
151 151
     *red   = src[(w - 1) * 3 + 0];
152 152
     *green = src[(w - 1) * 3 + 1];
... ...
@@ -156,7 +157,7 @@ static inline void sub_left_prediction_rgb24(HYuvContext *s, uint8_t *dst,
156 156
 static void sub_median_prediction(HYuvContext *s, uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top)
157 157
 {
158 158
     if (s->bps <= 8) {
159
-        s->hencdsp.sub_hfyu_median_pred(dst, src1, src2, w , left, left_top);
159
+        s->llvidencdsp.sub_median_pred(dst, src1, src2, w , left, left_top);
160 160
     } else {
161 161
         s->hencdsp.sub_hfyu_median_pred_int16((uint16_t *)dst, (const uint16_t *)src1, (const uint16_t *)src2, s->n - 1, w , left, left_top);
162 162
     }
... ...
@@ -218,6 +219,7 @@ static av_cold int encode_init(AVCodecContext *avctx)
218 218
 
219 219
     ff_huffyuv_common_init(avctx);
220 220
     ff_huffyuvencdsp_init(&s->hencdsp, avctx);
221
+    ff_llvidencdsp_init(&s->llvidencdsp);
221 222
 
222 223
     avctx->extradata = av_mallocz(3*MAX_N + 4);
223 224
     if (s->flags&AV_CODEC_FLAG_PASS1) {
... ...
@@ -823,9 +825,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
823 823
             lefttopy = p->data[0][3];
824 824
             lefttopu = p->data[1][1];
825 825
             lefttopv = p->data[2][1];
826
-            s->hencdsp.sub_hfyu_median_pred(s->temp[0], p->data[0] + 4, p->data[0] + fake_ystride + 4, width  - 4, &lefty, &lefttopy);
827
-            s->hencdsp.sub_hfyu_median_pred(s->temp[1], p->data[1] + 2, p->data[1] + fake_ustride + 2, width2 - 2, &leftu, &lefttopu);
828
-            s->hencdsp.sub_hfyu_median_pred(s->temp[2], p->data[2] + 2, p->data[2] + fake_vstride + 2, width2 - 2, &leftv, &lefttopv);
826
+            s->llvidencdsp.sub_median_pred(s->temp[0], p->data[0] + 4, p->data[0] + fake_ystride + 4, width  - 4, &lefty, &lefttopy);
827
+            s->llvidencdsp.sub_median_pred(s->temp[1], p->data[1] + 2, p->data[1] + fake_ustride + 2, width2 - 2, &leftu, &lefttopu);
828
+            s->llvidencdsp.sub_median_pred(s->temp[2], p->data[2] + 2, p->data[2] + fake_vstride + 2, width2 - 2, &leftv, &lefttopv);
829 829
             encode_422_bitstream(s, 0, width - 4);
830 830
             y++; cy++;
831 831
 
... ...
@@ -835,7 +837,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
835 835
                 if (s->bitstream_bpp == 12) {
836 836
                     while (2 * cy > y) {
837 837
                         ydst = p->data[0] + p->linesize[0] * y;
838
-                        s->hencdsp.sub_hfyu_median_pred(s->temp[0], ydst - fake_ystride, ydst, width, &lefty, &lefttopy);
838
+                        s->llvidencdsp.sub_median_pred(s->temp[0], ydst - fake_ystride, ydst, width, &lefty, &lefttopy);
839 839
                         encode_gray_bitstream(s, width);
840 840
                         y++;
841 841
                     }
... ...
@@ -845,9 +847,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
845 845
                 udst = p->data[1] + p->linesize[1] * cy;
846 846
                 vdst = p->data[2] + p->linesize[2] * cy;
847 847
 
848
-                s->hencdsp.sub_hfyu_median_pred(s->temp[0], ydst - fake_ystride, ydst, width,  &lefty, &lefttopy);
849
-                s->hencdsp.sub_hfyu_median_pred(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
850
-                s->hencdsp.sub_hfyu_median_pred(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
848
+                s->llvidencdsp.sub_median_pred(s->temp[0], ydst - fake_ystride, ydst, width,  &lefty, &lefttopy);
849
+                s->llvidencdsp.sub_median_pred(s->temp[1], udst - fake_ustride, udst, width2, &leftu, &lefttopu);
850
+                s->llvidencdsp.sub_median_pred(s->temp[2], vdst - fake_vstride, vdst, width2, &leftv, &lefttopv);
851 851
 
852 852
                 encode_422_bitstream(s, 0, width);
853 853
             }
... ...
@@ -860,7 +862,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
860 860
                     ydst = p->data[0] + p->linesize[0] * y;
861 861
 
862 862
                     if (s->predictor == PLANE && s->interlaced < y) {
863
-                        s->hencdsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
863
+                        s->llvidencdsp.diff_bytes(s->temp[1], ydst, ydst - fake_ystride, width);
864 864
 
865 865
                         lefty = sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
866 866
                     } else {
... ...
@@ -876,9 +878,9 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
876 876
                 vdst = p->data[2] + p->linesize[2] * cy;
877 877
 
878 878
                 if (s->predictor == PLANE && s->interlaced < cy) {
879
-                    s->hencdsp.diff_bytes(s->temp[1],          ydst, ydst - fake_ystride, width);
880
-                    s->hencdsp.diff_bytes(s->temp[2],          udst, udst - fake_ustride, width2);
881
-                    s->hencdsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
879
+                    s->llvidencdsp.diff_bytes(s->temp[1],          ydst, ydst - fake_ystride, width);
880
+                    s->llvidencdsp.diff_bytes(s->temp[2],          udst, udst - fake_ustride, width2);
881
+                    s->llvidencdsp.diff_bytes(s->temp[2] + width2, vdst, vdst - fake_vstride, width2);
882 882
 
883 883
                     lefty = sub_left_prediction(s, s->temp[0], s->temp[1], width , lefty);
884 884
                     leftu = sub_left_prediction(s, s->temp[1], s->temp[2], width2, leftu);
... ...
@@ -911,7 +913,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
911 911
         for (y = 1; y < s->height; y++) {
912 912
             uint8_t *dst = data + y*stride;
913 913
             if (s->predictor == PLANE && s->interlaced < y) {
914
-                s->hencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride, width * 4);
914
+                s->llvidencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride, width * 4);
915 915
                 sub_left_prediction_bgr32(s, s->temp[0], s->temp[1], width,
916 916
                                           &leftr, &leftg, &leftb, &lefta);
917 917
             } else {
... ...
@@ -939,7 +941,7 @@ static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
939 939
         for (y = 1; y < s->height; y++) {
940 940
             uint8_t *dst = data + y * stride;
941 941
             if (s->predictor == PLANE && s->interlaced < y) {
942
-                s->hencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride,
942
+                s->llvidencdsp.diff_bytes(s->temp[1], dst, dst - fake_stride,
943 943
                                       width * 3);
944 944
                 sub_left_prediction_rgb24(s, s->temp[0], s->temp[1], width,
945 945
                                           &leftr, &leftg, &leftb);
... ...
@@ -21,38 +21,6 @@
21 21
 #include "huffyuvencdsp.h"
22 22
 #include "mathops.h"
23 23
 
24
-// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
25
-#define pb_7f (~0UL / 255 * 0x7f)
26
-#define pb_80 (~0UL / 255 * 0x80)
27
-
28
-static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w)
29
-{
30
-    long i;
31
-
32
-#if !HAVE_FAST_UNALIGNED
33
-    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
34
-        for (i = 0; i + 7 < w; i += 8) {
35
-            dst[i + 0] = src1[i + 0] - src2[i + 0];
36
-            dst[i + 1] = src1[i + 1] - src2[i + 1];
37
-            dst[i + 2] = src1[i + 2] - src2[i + 2];
38
-            dst[i + 3] = src1[i + 3] - src2[i + 3];
39
-            dst[i + 4] = src1[i + 4] - src2[i + 4];
40
-            dst[i + 5] = src1[i + 5] - src2[i + 5];
41
-            dst[i + 6] = src1[i + 6] - src2[i + 6];
42
-            dst[i + 7] = src1[i + 7] - src2[i + 7];
43
-        }
44
-    } else
45
-#endif
46
-    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
47
-        long a = *(long *) (src1 + i);
48
-        long b = *(long *) (src2 + i);
49
-        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
50
-                              ((a ^ b ^ pb_80) & pb_80);
51
-    }
52
-    for (; i < w; i++)
53
-        dst[i + 0] = src1[i + 0] - src2[i + 0];
54
-}
55
-
56 24
 static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w){
57 25
     long i;
58 26
 #if !HAVE_FAST_UNALIGNED
... ...
@@ -79,27 +47,6 @@ static void diff_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *sr
79 79
         dst[i] = (src1[i] - src2[i]) & mask;
80 80
 }
81 81
 
82
-static void sub_hfyu_median_pred_c(uint8_t *dst, const uint8_t *src1,
83
-                                   const uint8_t *src2, intptr_t w,
84
-                                   int *left, int *left_top)
85
-{
86
-    int i;
87
-    uint8_t l, lt;
88
-
89
-    l  = *left;
90
-    lt = *left_top;
91
-
92
-    for (i = 0; i < w; i++) {
93
-        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
94
-        lt     = src1[i];
95
-        l      = src2[i];
96
-        dst[i] = l - pred;
97
-    }
98
-
99
-    *left     = l;
100
-    *left_top = lt;
101
-}
102
-
103 82
 static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, unsigned mask, int w, int *left, int *left_top){
104 83
     int i;
105 84
     uint16_t l, lt;
... ...
@@ -120,9 +67,7 @@ static void sub_hfyu_median_pred_int16_c(uint16_t *dst, const uint16_t *src1, co
120 120
 
121 121
 av_cold void ff_huffyuvencdsp_init(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
122 122
 {
123
-    c->diff_bytes           = diff_bytes_c;
124 123
     c->diff_int16           = diff_int16_c;
125
-    c->sub_hfyu_median_pred = sub_hfyu_median_pred_c;
126 124
     c->sub_hfyu_median_pred_int16 = sub_hfyu_median_pred_int16_c;
127 125
 
128 126
     if (ARCH_X86)
... ...
@@ -24,22 +24,11 @@
24 24
 #include "avcodec.h"
25 25
 
26 26
 typedef struct HuffYUVEncDSPContext {
27
-    void (*diff_bytes)(uint8_t *dst /* align 16 */,
28
-                       const uint8_t *src1 /* align 16 */,
29
-                       const uint8_t *src2 /* align 1 */,
30
-                       intptr_t w);
31 27
     void (*diff_int16)(uint16_t *dst /* align 16 */,
32 28
                        const uint16_t *src1 /* align 16 */,
33 29
                        const uint16_t *src2 /* align 1 */,
34 30
                        unsigned mask, int w);
35 31
 
36
-    /**
37
-     * Subtract HuffYUV's variant of median prediction.
38
-     * Note, this might read from src1[-1], src2[-1].
39
-     */
40
-    void (*sub_hfyu_median_pred)(uint8_t *dst, const uint8_t *src1,
41
-                                 const uint8_t *src2, intptr_t w,
42
-                                 int *left, int *left_top);
43 32
     void (*sub_hfyu_median_pred_int16)(uint16_t *dst, const uint16_t *src1,
44 33
                                        const uint16_t *src2, unsigned mask,
45 34
                                        int w, int *left, int *left_top);
46 35
new file mode 100644
... ...
@@ -0,0 +1,84 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#include "config.h"
19
+#include "libavutil/attributes.h"
20
+#include "lossless_videoencdsp.h"
21
+#include "mathops.h"
22
+
23
+// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
24
+#define pb_7f (~0UL / 255 * 0x7f)
25
+#define pb_80 (~0UL / 255 * 0x80)
26
+
27
+static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w)
28
+{
29
+    long i;
30
+
31
+#if !HAVE_FAST_UNALIGNED
32
+    if (((long)src1 | (long)src2) & (sizeof(long) - 1)) {
33
+        for (i = 0; i + 7 < w; i += 8) {
34
+            dst[i + 0] = src1[i + 0] - src2[i + 0];
35
+            dst[i + 1] = src1[i + 1] - src2[i + 1];
36
+            dst[i + 2] = src1[i + 2] - src2[i + 2];
37
+            dst[i + 3] = src1[i + 3] - src2[i + 3];
38
+            dst[i + 4] = src1[i + 4] - src2[i + 4];
39
+            dst[i + 5] = src1[i + 5] - src2[i + 5];
40
+            dst[i + 6] = src1[i + 6] - src2[i + 6];
41
+            dst[i + 7] = src1[i + 7] - src2[i + 7];
42
+        }
43
+    } else
44
+#endif
45
+    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
46
+        long a = *(long *) (src1 + i);
47
+        long b = *(long *) (src2 + i);
48
+        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
49
+                              ((a ^ b ^ pb_80) & pb_80);
50
+    }
51
+    for (; i < w; i++)
52
+        dst[i + 0] = src1[i + 0] - src2[i + 0];
53
+}
54
+
55
+static void sub_median_pred_c(uint8_t *dst, const uint8_t *src1,
56
+                              const uint8_t *src2, intptr_t w,
57
+                              int *left, int *left_top)
58
+{
59
+    int i;
60
+    uint8_t l, lt;
61
+
62
+    l  = *left;
63
+    lt = *left_top;
64
+
65
+    for (i = 0; i < w; i++) {
66
+        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
67
+        lt     = src1[i];
68
+        l      = src2[i];
69
+        dst[i] = l - pred;
70
+    }
71
+
72
+    *left     = l;
73
+    *left_top = lt;
74
+}
75
+
76
+av_cold void ff_llvidencdsp_init(LLVidEncDSPContext *c)
77
+{
78
+    c->diff_bytes      = diff_bytes_c;
79
+    c->sub_median_pred = sub_median_pred_c;
80
+
81
+    if (ARCH_X86)
82
+        ff_llvidencdsp_init_x86(c);
83
+}
0 84
new file mode 100644
... ...
@@ -0,0 +1,41 @@
0
+/*
1
+ * This file is part of FFmpeg.
2
+ *
3
+ * FFmpeg is free software; you can redistribute it and/or
4
+ * modify it under the terms of the GNU Lesser General Public
5
+ * License as published by the Free Software Foundation; either
6
+ * version 2.1 of the License, or (at your option) any later version.
7
+ *
8
+ * FFmpeg is distributed in the hope that it will be useful,
9
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
+ * Lesser General Public License for more details.
12
+ *
13
+ * You should have received a copy of the GNU Lesser General Public
14
+ * License along with FFmpeg; if not, write to the Free Software
15
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
+ */
17
+
18
+#ifndef AVCODEC_LOSSLESS_VIDEOENCDSP_H
19
+#define AVCODEC_LOSSLESS_VIDEOENCDSP_H
20
+
21
+#include <stdint.h>
22
+
23
+typedef struct LLVidEncDSPContext {
24
+    void (*diff_bytes)(uint8_t *dst /* align 16 */,
25
+                       const uint8_t *src1 /* align 16 */,
26
+                       const uint8_t *src2 /* align 1 */,
27
+                       intptr_t w);
28
+    /**
29
+     * Subtract HuffYUV's variant of median prediction.
30
+     * Note, this might read from src1[-1], src2[-1].
31
+     */
32
+    void (*sub_median_pred)(uint8_t *dst, const uint8_t *src1,
33
+                            const uint8_t *src2, intptr_t w,
34
+                            int *left, int *left_top);
35
+} LLVidEncDSPContext;
36
+
37
+void ff_llvidencdsp_init(LLVidEncDSPContext *c);
38
+void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c);
39
+
40
+#endif /* AVCODEC_LOSSLESS_VIDEOENCDSP_H */
... ...
@@ -22,7 +22,7 @@
22 22
 #include "avcodec.h"
23 23
 #include "internal.h"
24 24
 #include "bytestream.h"
25
-#include "huffyuvencdsp.h"
25
+#include "lossless_videoencdsp.h"
26 26
 #include "png.h"
27 27
 #include "apng.h"
28 28
 
... ...
@@ -47,7 +47,7 @@ typedef struct APNGFctlChunk {
47 47
 
48 48
 typedef struct PNGEncContext {
49 49
     AVClass *class;
50
-    HuffYUVEncDSPContext hdsp;
50
+    LLVidEncDSPContext llvidencdsp;
51 51
 
52 52
     uint8_t *bytestream;
53 53
     uint8_t *bytestream_start;
... ...
@@ -159,7 +159,7 @@ static void sub_left_prediction(PNGEncContext *c, uint8_t *dst, const uint8_t *s
159 159
     for (x = 0; x < unaligned_w; x++)
160 160
         *dst++ = *src1++ - *src2++;
161 161
     size -= unaligned_w;
162
-    c->hdsp.diff_bytes(dst, src1, src2, size);
162
+    c->llvidencdsp.diff_bytes(dst, src1, src2, size);
163 163
 }
164 164
 
165 165
 static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
... ...
@@ -175,7 +175,7 @@ static void png_filter_row(PNGEncContext *c, uint8_t *dst, int filter_type,
175 175
         sub_left_prediction(c, dst, src, bpp, size);
176 176
         break;
177 177
     case PNG_FILTER_VALUE_UP:
178
-        c->hdsp.diff_bytes(dst, src, top, size);
178
+        c->llvidencdsp.diff_bytes(dst, src, top, size);
179 179
         break;
180 180
     case PNG_FILTER_VALUE_AVG:
181 181
         for (i = 0; i < bpp; i++)
... ...
@@ -1015,7 +1015,7 @@ FF_DISABLE_DEPRECATION_WARNINGS
1015 1015
 FF_ENABLE_DEPRECATION_WARNINGS
1016 1016
 #endif
1017 1017
 
1018
-    ff_huffyuvencdsp_init(&s->hdsp, avctx);
1018
+    ff_llvidencdsp_init(&s->llvidencdsp);
1019 1019
 
1020 1020
 #if FF_API_PRIVATE_OPT
1021 1021
 FF_DISABLE_DEPRECATION_WARNINGS
... ...
@@ -30,8 +30,8 @@
30 30
 #include "libavutil/common.h"
31 31
 #include "avcodec.h"
32 32
 #include "bswapdsp.h"
33
-#include "huffyuvencdsp.h"
34 33
 #include "lossless_videodsp.h"
34
+#include "lossless_videoencdsp.h"
35 35
 
36 36
 enum {
37 37
     PRED_NONE = 0,
... ...
@@ -70,8 +70,8 @@ typedef struct UtvideoContext {
70 70
     const AVClass *class;
71 71
     AVCodecContext *avctx;
72 72
     BswapDSPContext bdsp;
73
-    HuffYUVEncDSPContext hdsp;
74 73
     LLVidDSPContext llviddsp;
74
+    LLVidEncDSPContext llvidencdsp;
75 75
 
76 76
     uint32_t frame_info_size, flags, frame_info;
77 77
     int      planes;
... ...
@@ -33,7 +33,6 @@
33 33
 #include "bswapdsp.h"
34 34
 #include "bytestream.h"
35 35
 #include "put_bits.h"
36
-#include "huffyuvencdsp.h"
37 36
 #include "mathops.h"
38 37
 #include "utvideo.h"
39 38
 #include "huffman.h"
... ...
@@ -120,7 +119,7 @@ static av_cold int utvideo_encode_init(AVCodecContext *avctx)
120 120
     }
121 121
 
122 122
     ff_bswapdsp_init(&c->bdsp);
123
-    ff_huffyuvencdsp_init(&c->hdsp, avctx);
123
+    ff_llvidencdsp_init(&c->llvidencdsp);
124 124
 
125 125
 #if FF_API_PRIVATE_OPT
126 126
 FF_DISABLE_DEPRECATION_WARNINGS
... ...
@@ -324,7 +323,7 @@ static void median_predict(UtvideoContext *c, uint8_t *src, uint8_t *dst, int st
324 324
 
325 325
     /* Rest of the coded part uses median prediction */
326 326
     for (j = 1; j < height; j++) {
327
-        c->hdsp.sub_hfyu_median_pred(dst, src - stride, src, width, &A, &B);
327
+        c->llvidencdsp.sub_median_pred(dst, src - stride, src, width, &A, &B);
328 328
         dst += width;
329 329
         src += stride;
330 330
     }
... ...
@@ -20,8 +20,9 @@ OBJS-$(CONFIG_H264QPEL)                += x86/h264_qpel.o
20 20
 OBJS-$(CONFIG_HPELDSP)                 += x86/hpeldsp_init.o
21 21
 OBJS-$(CONFIG_LLAUDDSP)                += x86/lossless_audiodsp_init.o
22 22
 OBJS-$(CONFIG_LLVIDDSP)                += x86/lossless_videodsp_init.o
23
+OBJS-$(CONFIG_LLVIDENCDSP)             += x86/lossless_videoencdsp_init.o
23 24
 OBJS-$(CONFIG_HUFFYUVDSP)              += x86/huffyuvdsp_init.o
24
-OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_mmx.o
25
+OBJS-$(CONFIG_HUFFYUVENCDSP)           += x86/huffyuvencdsp_init.o
25 26
 OBJS-$(CONFIG_IDCTDSP)                 += x86/idctdsp_init.o
26 27
 OBJS-$(CONFIG_LPC)                     += x86/lpc.o
27 28
 OBJS-$(CONFIG_ME_CMP)                  += x86/me_cmp_init.o
... ...
@@ -114,6 +115,7 @@ YASM-OBJS-$(CONFIG_HUFFYUVENCDSP)      += x86/huffyuvencdsp.o
114 114
 YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/idctdsp.o
115 115
 YASM-OBJS-$(CONFIG_LLAUDDSP)           += x86/lossless_audiodsp.o
116 116
 YASM-OBJS-$(CONFIG_LLVIDDSP)           += x86/lossless_videodsp.o
117
+YASM-OBJS-$(CONFIG_LLVIDENCDSP)        += x86/lossless_videoencdsp.o
117 118
 YASM-OBJS-$(CONFIG_ME_CMP)             += x86/me_cmp.o
118 119
 YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
119 120
 YASM-OBJS-$(CONFIG_MPEGVIDEOENC)       += x86/mpegvideoencdsp.o
... ...
@@ -27,128 +27,8 @@
27 27
 
28 28
 section .text
29 29
 
30
-; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
31
-;                    intptr_t w);
32
-%macro DIFF_BYTES_PROLOGUE 0
33
-%if ARCH_X86_32
34
-cglobal diff_bytes, 3,5,2, dst, src1, src2
35
-%define wq r4q
36
-    DECLARE_REG_TMP 3
37
-    mov               wq, r3mp
38
-%else
39
-cglobal diff_bytes, 4,5,2, dst, src1, src2, w
40
-    DECLARE_REG_TMP 4
41
-%endif ; ARCH_X86_32
42
-%define i t0q
43
-%endmacro
44
-
45
-; label to jump to if w < regsize
46
-%macro DIFF_BYTES_LOOP_PREP 1
47
-    mov                i, wq
48
-    and                i, -2 * regsize
49
-        jz            %1
50
-    add             dstq, i
51
-    add            src1q, i
52
-    add            src2q, i
53
-    neg                i
54
-%endmacro
55
-
56
-; mov type used for src1q, dstq, first reg, second reg
57
-%macro DIFF_BYTES_LOOP_CORE 4
58
-%if mmsize != 16
59
-    mov%1             %3, [src1q + i]
60
-    mov%1             %4, [src1q + i + regsize]
61
-    psubb             %3, [src2q + i]
62
-    psubb             %4, [src2q + i + regsize]
63
-    mov%2           [dstq + i], %3
64
-    mov%2 [regsize + dstq + i], %4
65
-%else
66
-    ; SSE enforces alignment of psubb operand
67
-    mov%1             %3, [src1q + i]
68
-    movu              %4, [src2q + i]
69
-    psubb             %3, %4
70
-    mov%2     [dstq + i], %3
71
-    mov%1             %3, [src1q + i + regsize]
72
-    movu              %4, [src2q + i + regsize]
73
-    psubb             %3, %4
74
-    mov%2 [regsize + dstq + i], %3
75
-%endif
76
-%endmacro
77
-
78
-%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
79
-    %define regsize mmsize
80
-.loop_%1%2:
81
-    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
82
-    add                i, 2 * regsize
83
-        jl    .loop_%1%2
84
-.skip_main_%1%2:
85
-    and               wq, 2 * regsize - 1
86
-        jz     .end_%1%2
87
-%if mmsize > 16
88
-    ; fall back to narrower xmm
89
-    %define regsize mmsize / 2
90
-    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
91
-.loop2_%1%2:
92
-    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
93
-    add                i, 2 * regsize
94
-        jl   .loop2_%1%2
95
-.setup_loop_gpr_%1%2:
96
-    and               wq, 2 * regsize - 1
97
-        jz     .end_%1%2
98
-%endif
99
-    add             dstq, wq
100
-    add            src1q, wq
101
-    add            src2q, wq
102
-    neg               wq
103
-.loop_gpr_%1%2:
104
-    mov              t0b, [src1q + wq]
105
-    sub              t0b, [src2q + wq]
106
-    mov      [dstq + wq], t0b
107
-    inc               wq
108
-        jl .loop_gpr_%1%2
109
-.end_%1%2:
110
-    REP_RET
111
-%endmacro
112
-
113
-%if ARCH_X86_32
114
-INIT_MMX mmx
115
-DIFF_BYTES_PROLOGUE
116
-    %define regsize mmsize
117
-    DIFF_BYTES_LOOP_PREP .skip_main_aa
118
-    DIFF_BYTES_BODY    a, a
119
-%undef i
120
-%endif
121
-
122
-INIT_XMM sse2
123
-DIFF_BYTES_PROLOGUE
124
-    %define regsize mmsize
125
-    DIFF_BYTES_LOOP_PREP .skip_main_aa
126
-    test            dstq, regsize - 1
127
-        jnz     .loop_uu
128
-    test           src1q, regsize - 1
129
-        jnz     .loop_ua
130
-    DIFF_BYTES_BODY    a, a
131
-    DIFF_BYTES_BODY    u, a
132
-    DIFF_BYTES_BODY    u, u
133
-%undef i
134
-
135
-%if HAVE_AVX2_EXTERNAL
136
-INIT_YMM avx2
137
-DIFF_BYTES_PROLOGUE
138
-    %define regsize mmsize
139
-    ; Directly using unaligned SSE2 version is marginally faster than
140
-    ; branching based on arguments.
141
-    DIFF_BYTES_LOOP_PREP .skip_main_uu
142
-    test            dstq, regsize - 1
143
-        jnz     .loop_uu
144
-    test           src1q, regsize - 1
145
-        jnz     .loop_ua
146
-    DIFF_BYTES_BODY    a, a
147
-    DIFF_BYTES_BODY    u, a
148
-    DIFF_BYTES_BODY    u, u
149
-%undef i
150
-%endif
151
-
30
+; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
31
+;                    unsigned mask, int w);
152 32
 %macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
153 33
     movd    m4, maskd
154 34
     SPLATW  m4, m4
155 35
new file mode 100644
... ...
@@ -0,0 +1,54 @@
0
+/*
1
+ * SIMD-optimized HuffYUV encoding functions
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+ *
7
+ * This file is part of FFmpeg.
8
+ *
9
+ * FFmpeg is free software; you can redistribute it and/or
10
+ * modify it under the terms of the GNU Lesser General Public
11
+ * License as published by the Free Software Foundation; either
12
+ * version 2.1 of the License, or (at your option) any later version.
13
+ *
14
+ * FFmpeg is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+ * Lesser General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public
20
+ * License along with FFmpeg; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ */
23
+
24
+#include "libavutil/attributes.h"
25
+#include "libavutil/cpu.h"
26
+#include "libavutil/pixdesc.h"
27
+#include "libavutil/x86/cpu.h"
28
+#include "libavcodec/huffyuvencdsp.h"
29
+
30
+void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
31
+                        unsigned mask, int w);
32
+void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
33
+                        unsigned mask, int w);
34
+void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
35
+                                          unsigned mask, int w, int *left, int *left_top);
36
+
37
+av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
38
+{
39
+    av_unused int cpu_flags = av_get_cpu_flags();
40
+    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
41
+
42
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
43
+        c->diff_int16 = ff_diff_int16_mmx;
44
+    }
45
+
46
+    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
47
+        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
48
+    }
49
+
50
+    if (EXTERNAL_SSE2(cpu_flags)) {
51
+        c->diff_int16 = ff_diff_int16_sse2;
52
+    }
53
+}
0 54
deleted file mode 100644
... ...
@@ -1,118 +0,0 @@
1
-/*
2
- * SIMD-optimized HuffYUV encoding functions
3
- * Copyright (c) 2000, 2001 Fabrice Bellard
4
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5
- *
6
- * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7
- *
8
- * This file is part of FFmpeg.
9
- *
10
- * FFmpeg is free software; you can redistribute it and/or
11
- * modify it under the terms of the GNU Lesser General Public
12
- * License as published by the Free Software Foundation; either
13
- * version 2.1 of the License, or (at your option) any later version.
14
- *
15
- * FFmpeg is distributed in the hope that it will be useful,
16
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
17
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
- * Lesser General Public License for more details.
19
- *
20
- * You should have received a copy of the GNU Lesser General Public
21
- * License along with FFmpeg; if not, write to the Free Software
22
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
- */
24
-
25
-#include "libavutil/attributes.h"
26
-#include "libavutil/cpu.h"
27
-#include "libavutil/pixdesc.h"
28
-#include "libavutil/x86/asm.h"
29
-#include "libavutil/x86/cpu.h"
30
-#include "libavcodec/huffyuvencdsp.h"
31
-#include "libavcodec/mathops.h"
32
-
33
-void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
34
-                       intptr_t w);
35
-void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
36
-                        intptr_t w);
37
-void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
38
-                        intptr_t w);
39
-void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
40
-                        unsigned mask, int w);
41
-void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
42
-                        unsigned mask, int w);
43
-void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2,
44
-                                          unsigned mask, int w, int *left, int *left_top);
45
-
46
-#if HAVE_INLINE_ASM
47
-
48
-static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
49
-                                        const uint8_t *src2, intptr_t w,
50
-                                        int *left, int *left_top)
51
-{
52
-    x86_reg i = 0;
53
-    uint8_t l, lt;
54
-
55
-    __asm__ volatile (
56
-        "movq  (%1, %0), %%mm0          \n\t" // LT
57
-        "psllq $8, %%mm0                \n\t"
58
-        "1:                             \n\t"
59
-        "movq  (%1, %0), %%mm1          \n\t" // T
60
-        "movq  -1(%2, %0), %%mm2        \n\t" // L
61
-        "movq  (%2, %0), %%mm3          \n\t" // X
62
-        "movq %%mm2, %%mm4              \n\t" // L
63
-        "psubb %%mm0, %%mm2             \n\t"
64
-        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
65
-        "movq %%mm4, %%mm5              \n\t" // L
66
-        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
67
-        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
68
-        "pminub %%mm2, %%mm4            \n\t"
69
-        "pmaxub %%mm1, %%mm4            \n\t"
70
-        "psubb %%mm4, %%mm3             \n\t" // dst - pred
71
-        "movq %%mm3, (%3, %0)           \n\t"
72
-        "add $8, %0                     \n\t"
73
-        "movq -1(%1, %0), %%mm0         \n\t" // LT
74
-        "cmp %4, %0                     \n\t"
75
-        " jb 1b                         \n\t"
76
-        : "+r" (i)
77
-        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));
78
-
79
-    l  = *left;
80
-    lt = *left_top;
81
-
82
-    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);
83
-
84
-    *left_top = src1[w - 1];
85
-    *left     = src2[w - 1];
86
-}
87
-
88
-#endif /* HAVE_INLINE_ASM */
89
-
90
-av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx)
91
-{
92
-    av_unused int cpu_flags = av_get_cpu_flags();
93
-    const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt);
94
-
95
-    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
96
-        c->diff_bytes = ff_diff_bytes_mmx;
97
-        c->diff_int16 = ff_diff_int16_mmx;
98
-    }
99
-
100
-#if HAVE_INLINE_ASM
101
-    if (INLINE_MMXEXT(cpu_flags)) {
102
-        c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext;
103
-    }
104
-#endif /* HAVE_INLINE_ASM */
105
-
106
-    if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) {
107
-        c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext;
108
-    }
109
-
110
-    if (EXTERNAL_SSE2(cpu_flags)) {
111
-        c->diff_bytes = ff_diff_bytes_sse2;
112
-        c->diff_int16 = ff_diff_int16_sse2;
113
-    }
114
-
115
-    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
116
-        c->diff_bytes = ff_diff_bytes_avx2;
117
-    }
118
-}
119 1
new file mode 100644
... ...
@@ -0,0 +1,150 @@
0
+;******************************************************************************
1
+;* SIMD-optimized lossless video encoding functions
2
+;* Copyright (c) 2000, 2001 Fabrice Bellard
3
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+;*
5
+;* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com>
7
+;*
8
+;* This file is part of FFmpeg.
9
+;*
10
+;* FFmpeg is free software; you can redistribute it and/or
11
+;* modify it under the terms of the GNU Lesser General Public
12
+;* License as published by the Free Software Foundation; either
13
+;* version 2.1 of the License, or (at your option) any later version.
14
+;*
15
+;* FFmpeg is distributed in the hope that it will be useful,
16
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18
+;* Lesser General Public License for more details.
19
+;*
20
+;* You should have received a copy of the GNU Lesser General Public
21
+;* License along with FFmpeg; if not, write to the Free Software
22
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23
+;******************************************************************************
24
+
25
+%include "libavutil/x86/x86util.asm"
26
+
27
+section .text
28
+
29
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
30
+;                    intptr_t w);
31
+%macro DIFF_BYTES_PROLOGUE 0
32
+%if ARCH_X86_32
33
+cglobal diff_bytes, 3,5,2, dst, src1, src2
34
+%define wq r4q
35
+    DECLARE_REG_TMP 3
36
+    mov               wq, r3mp
37
+%else
38
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
39
+    DECLARE_REG_TMP 4
40
+%endif ; ARCH_X86_32
41
+%define i t0q
42
+%endmacro
43
+
44
+; label to jump to if w < regsize
45
+%macro DIFF_BYTES_LOOP_PREP 1
46
+    mov                i, wq
47
+    and                i, -2 * regsize
48
+        jz            %1
49
+    add             dstq, i
50
+    add            src1q, i
51
+    add            src2q, i
52
+    neg                i
53
+%endmacro
54
+
55
+; mov type used for src1q, dstq, first reg, second reg
56
+%macro DIFF_BYTES_LOOP_CORE 4
57
+%if mmsize != 16
58
+    mov%1             %3, [src1q + i]
59
+    mov%1             %4, [src1q + i + regsize]
60
+    psubb             %3, [src2q + i]
61
+    psubb             %4, [src2q + i + regsize]
62
+    mov%2           [dstq + i], %3
63
+    mov%2 [regsize + dstq + i], %4
64
+%else
65
+    ; SSE enforces alignment of psubb operand
66
+    mov%1             %3, [src1q + i]
67
+    movu              %4, [src2q + i]
68
+    psubb             %3, %4
69
+    mov%2     [dstq + i], %3
70
+    mov%1             %3, [src1q + i + regsize]
71
+    movu              %4, [src2q + i + regsize]
72
+    psubb             %3, %4
73
+    mov%2 [regsize + dstq + i], %3
74
+%endif
75
+%endmacro
76
+
77
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
78
+    %define regsize mmsize
79
+.loop_%1%2:
80
+    DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
81
+    add                i, 2 * regsize
82
+        jl    .loop_%1%2
83
+.skip_main_%1%2:
84
+    and               wq, 2 * regsize - 1
85
+        jz     .end_%1%2
86
+%if mmsize > 16
87
+    ; fall back to narrower xmm
88
+    %define regsize mmsize / 2
89
+    DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
90
+.loop2_%1%2:
91
+    DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
92
+    add                i, 2 * regsize
93
+        jl   .loop2_%1%2
94
+.setup_loop_gpr_%1%2:
95
+    and               wq, 2 * regsize - 1
96
+        jz     .end_%1%2
97
+%endif
98
+    add             dstq, wq
99
+    add            src1q, wq
100
+    add            src2q, wq
101
+    neg               wq
102
+.loop_gpr_%1%2:
103
+    mov              t0b, [src1q + wq]
104
+    sub              t0b, [src2q + wq]
105
+    mov      [dstq + wq], t0b
106
+    inc               wq
107
+        jl .loop_gpr_%1%2
108
+.end_%1%2:
109
+    REP_RET
110
+%endmacro
111
+
112
+%if ARCH_X86_32
113
+INIT_MMX mmx
114
+DIFF_BYTES_PROLOGUE
115
+    %define regsize mmsize
116
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
117
+    DIFF_BYTES_BODY    a, a
118
+%undef i
119
+%endif
120
+
121
+INIT_XMM sse2
122
+DIFF_BYTES_PROLOGUE
123
+    %define regsize mmsize
124
+    DIFF_BYTES_LOOP_PREP .skip_main_aa
125
+    test            dstq, regsize - 1
126
+        jnz     .loop_uu
127
+    test           src1q, regsize - 1
128
+        jnz     .loop_ua
129
+    DIFF_BYTES_BODY    a, a
130
+    DIFF_BYTES_BODY    u, a
131
+    DIFF_BYTES_BODY    u, u
132
+%undef i
133
+
134
+%if HAVE_AVX2_EXTERNAL
135
+INIT_YMM avx2
136
+DIFF_BYTES_PROLOGUE
137
+    %define regsize mmsize
138
+    ; Directly using unaligned SSE2 version is marginally faster than
139
+    ; branching based on arguments.
140
+    DIFF_BYTES_LOOP_PREP .skip_main_uu
141
+    test            dstq, regsize - 1
142
+        jnz     .loop_uu
143
+    test           src1q, regsize - 1
144
+        jnz     .loop_ua
145
+    DIFF_BYTES_BODY    a, a
146
+    DIFF_BYTES_BODY    u, a
147
+    DIFF_BYTES_BODY    u, u
148
+%undef i
149
+%endif
0 150
new file mode 100644
... ...
@@ -0,0 +1,104 @@
0
+/*
1
+ * SIMD-optimized lossless video encoding functions
2
+ * Copyright (c) 2000, 2001 Fabrice Bellard
3
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4
+ *
5
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
6
+ *
7
+ * This file is part of FFmpeg.
8
+ *
9
+ * FFmpeg is free software; you can redistribute it and/or
10
+ * modify it under the terms of the GNU Lesser General Public
11
+ * License as published by the Free Software Foundation; either
12
+ * version 2.1 of the License, or (at your option) any later version.
13
+ *
14
+ * FFmpeg is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
+ * Lesser General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public
20
+ * License along with FFmpeg; if not, write to the Free Software
21
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ */
23
+
24
+#include "libavutil/attributes.h"
25
+#include "libavutil/cpu.h"
26
+#include "libavutil/x86/asm.h"
27
+#include "libavutil/x86/cpu.h"
28
+#include "libavcodec/lossless_videoencdsp.h"
29
+#include "libavcodec/mathops.h"
30
+
31
+void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
32
+                       intptr_t w);
33
+void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
34
+                        intptr_t w);
35
+void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
36
+                        intptr_t w);
37
+
38
+#if HAVE_INLINE_ASM
39
+
40
+static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1,
41
+                                   const uint8_t *src2, intptr_t w,
42
+                                   int *left, int *left_top)
43
+{
44
+    x86_reg i = 0;
45
+    uint8_t l, lt;
46
+
47
+    __asm__ volatile (
48
+        "movq  (%1, %0), %%mm0          \n\t" // LT
49
+        "psllq $8, %%mm0                \n\t"
50
+        "1:                             \n\t"
51
+        "movq  (%1, %0), %%mm1          \n\t" // T
52
+        "movq  -1(%2, %0), %%mm2        \n\t" // L
53
+        "movq  (%2, %0), %%mm3          \n\t" // X
54
+        "movq %%mm2, %%mm4              \n\t" // L
55
+        "psubb %%mm0, %%mm2             \n\t"
56
+        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
57
+        "movq %%mm4, %%mm5              \n\t" // L
58
+        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
59
+        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
60
+        "pminub %%mm2, %%mm4            \n\t"
61
+        "pmaxub %%mm1, %%mm4            \n\t"
62
+        "psubb %%mm4, %%mm3             \n\t" // dst - pred
63
+        "movq %%mm3, (%3, %0)           \n\t"
64
+        "add $8, %0                     \n\t"
65
+        "movq -1(%1, %0), %%mm0         \n\t" // LT
66
+        "cmp %4, %0                     \n\t"
67
+        " jb 1b                         \n\t"
68
+        : "+r" (i)
69
+        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));
70
+
71
+    l  = *left;
72
+    lt = *left_top;
73
+
74
+    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);
75
+
76
+    *left_top = src1[w - 1];
77
+    *left     = src2[w - 1];
78
+}
79
+
80
+#endif /* HAVE_INLINE_ASM */
81
+
82
+av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c)
83
+{
84
+    av_unused int cpu_flags = av_get_cpu_flags();
85
+
86
+    if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) {
87
+        c->diff_bytes = ff_diff_bytes_mmx;
88
+    }
89
+
90
+#if HAVE_INLINE_ASM
91
+    if (INLINE_MMXEXT(cpu_flags)) {
92
+        c->sub_median_pred = sub_median_pred_mmxext;
93
+    }
94
+#endif /* HAVE_INLINE_ASM */
95
+
96
+    if (EXTERNAL_SSE2(cpu_flags)) {
97
+        c->diff_bytes = ff_diff_bytes_sse2;
98
+    }
99
+
100
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
101
+        c->diff_bytes = ff_diff_bytes_avx2;
102
+    }
103
+}