Browse code

vendor: github.com/klauspost/compress v1.15.9

various fixes in zstd compression

- https://github.com/klauspost/compress/releases/tag/v1.15.9
- https://github.com/klauspost/compress/releases/tag/v1.15.8
- https://github.com/klauspost/compress/releases/tag/v1.15.7
- https://github.com/klauspost/compress/releases/tag/v1.15.6
- https://github.com/klauspost/compress/releases/tag/v1.15.5
- https://github.com/klauspost/compress/releases/tag/v1.15.4
- https://github.com/klauspost/compress/releases/tag/v1.15.3
- https://github.com/klauspost/compress/releases/tag/v1.15.2

full diff: https://github.com/klauspost/compress/compare/v1.15.1...v1.15.9

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>

Sebastiaan van Stijn authored on 2022/08/26 08:46:28
Showing 47 changed files
... ...
@@ -47,7 +47,7 @@ require (
47 47
 	github.com/hashicorp/serf v0.8.5
48 48
 	github.com/imdario/mergo v0.3.12
49 49
 	github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062
50
-	github.com/klauspost/compress v1.15.1
50
+	github.com/klauspost/compress v1.15.9
51 51
 	github.com/miekg/dns v1.1.27
52 52
 	github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible
53 53
 	github.com/moby/buildkit v0.10.4
... ...
@@ -691,8 +691,9 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
691 691
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
692 692
 github.com/klauspost/compress v1.11.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
693 693
 github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
694
-github.com/klauspost/compress v1.15.1 h1:y9FcTHGyrebwfP0ZZqFiaxTaiDnUrGkJkI+f583BL1A=
695 694
 github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
695
+github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
696
+github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
696 697
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
697 698
 github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
698 699
 github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
... ...
@@ -23,3 +23,10 @@ _testmain.go
23 23
 *.test
24 24
 *.prof
25 25
 /s2/cmd/_s2sx/sfx-exe
26
+
27
+# Linux perf files
28
+perf.data
29
+perf.data.old
30
+
31
+# gdb history
32
+.gdb_history
... ...
@@ -17,6 +17,72 @@ This package provides various compression algorithms.
17 17
 
18 18
 # changelog
19 19
 
20
+* July 13, 2022 (v1.15.8)
21
+
22
+	* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
23
+	* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
24
+	* zstd: Optimize seqdec amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
25
+	* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
26
+	* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
27
+	* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
28
+	* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
29
+
30
+* June 29, 2022 (v1.15.7)
31
+
32
+	* s2: Fix absolute forward seeks  https://github.com/klauspost/compress/pull/633
33
+	* zip: Merge upstream  https://github.com/klauspost/compress/pull/631
34
+	* zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
35
+	* zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
36
+	* flate: Faster histograms  https://github.com/klauspost/compress/pull/620
37
+	* deflate: Use compound hcode  https://github.com/klauspost/compress/pull/622
38
+
39
+* June 3, 2022 (v1.15.6)
40
+	* s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
41
+	* s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
42
+	* zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
43
+	* zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
44
+	* zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
45
+	* gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
46
+	* s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
47
+	* s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
48
+	* snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
49
+	* s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
50
+
51
+* May 25, 2022 (v1.15.5)
52
+	* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
53
+	* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
54
+	* huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
55
+	* zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
56
+	* zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
57
+	* zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
58
+	* zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
59
+	* huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
60
+	* flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590
61
+
62
+
63
+* May 11, 2022 (v1.15.4)
64
+	* huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
65
+	* inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
66
+	* zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
67
+	* zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
68
+
69
+* May 5, 2022 (v1.15.3)
70
+	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
71
+	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
72
+
73
+* Apr 26, 2022 (v1.15.2)
74
+	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
75
+	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
76
+	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
77
+	* Minimum version is Go 1.16, added CI test on 1.18.
78
+
79
+* Mar 11, 2022 (v1.15.1)
80
+	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
81
+	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
82
+	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
83
+	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
84
+	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
85
+
20 86
 * Mar 3, 2022 (v1.15.0)
21 87
 	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
22 88
 	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
... ...
@@ -60,6 +126,9 @@ While the release has been extensively tested, it is recommended to testing when
60 60
 	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
61 61
 	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
62 62
 
63
+<details>
64
+	<summary>See changes to v1.13.x</summary>
65
+	
63 66
 * Aug 30, 2021 (v1.13.5)
64 67
 	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
65 68
 	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
... ...
@@ -88,6 +157,8 @@ While the release has been extensively tested, it is recommended to testing when
88 88
 	* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
89 89
 	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
90 90
 	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
91
+</details>
92
+
91 93
 
92 94
 <details>
93 95
 	<summary>See changes to v1.12.x</summary>
94 96
deleted file mode 100644
... ...
@@ -1,5 +0,0 @@
1
-package huff0
2
-
3
-//go:generate go run generate.go
4
-//go:generate asmfmt -w decompress_amd64.s
5
-//go:generate asmfmt -w decompress_8b_amd64.s
... ...
@@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
165 165
 	return uint16(b.value >> ((64 - n) & 63))
166 166
 }
167 167
 
168
-// peekTopBits(n) is equivalent to peekBitsFast(64 - n)
169
-func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
170
-	return uint16(b.value >> n)
171
-}
172
-
173 168
 func (b *bitReaderShifted) advance(n uint8) {
174 169
 	b.bitsRead += n
175 170
 	b.value <<= n & 63
... ...
@@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
220 220
 	}
221 221
 }
222 222
 
223
-// finished returns true if all bits have been read from the bit stream.
224
-func (b *bitReaderShifted) finished() bool {
225
-	return b.off == 0 && b.bitsRead >= 64
226
-}
227
-
228 223
 func (b *bitReaderShifted) remaining() uint {
229 224
 	return b.off*8 + uint(64-b.bitsRead)
230 225
 }
... ...
@@ -5,8 +5,6 @@
5 5
 
6 6
 package huff0
7 7
 
8
-import "fmt"
9
-
10 8
 // bitWriter will write bits.
11 9
 // First bit will be LSB of the first byte of output.
12 10
 type bitWriter struct {
... ...
@@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
23 23
 	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
24 24
 	0xFFFF, 0xFFFF} /* up to 16 bits */
25 25
 
26
-// addBits16NC will add up to 16 bits.
27
-// It will not check if there is space for them,
28
-// so the caller must ensure that it has flushed recently.
29
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
30
-	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
31
-	b.nBits += bits
32
-}
33
-
34 26
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
35 27
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
36 28
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
... ...
@@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
70 70
 	b.nBits += encA.nBits + encB.nBits
71 71
 }
72 72
 
73
-// addBits16ZeroNC will add up to 16 bits.
74
-// It will not check if there is space for them,
75
-// so the caller must ensure that it has flushed recently.
76
-// This is fastest if bits can be zero.
77
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
78
-	if bits == 0 {
79
-		return
80
-	}
81
-	value <<= (16 - bits) & 15
82
-	value >>= (16 - bits) & 15
83
-	b.bitContainer |= uint64(value) << (b.nBits & 63)
84
-	b.nBits += bits
85
-}
86
-
87
-// flush will flush all pending full bytes.
88
-// There will be at least 56 bits available for writing when this has been called.
89
-// Using flush32 is faster, but leaves less space for writing.
90
-func (b *bitWriter) flush() {
91
-	v := b.nBits >> 3
92
-	switch v {
93
-	case 0:
94
-		return
95
-	case 1:
96
-		b.out = append(b.out,
97
-			byte(b.bitContainer),
98
-		)
99
-		b.bitContainer >>= 1 << 3
100
-	case 2:
101
-		b.out = append(b.out,
102
-			byte(b.bitContainer),
103
-			byte(b.bitContainer>>8),
104
-		)
105
-		b.bitContainer >>= 2 << 3
106
-	case 3:
107
-		b.out = append(b.out,
108
-			byte(b.bitContainer),
109
-			byte(b.bitContainer>>8),
110
-			byte(b.bitContainer>>16),
111
-		)
112
-		b.bitContainer >>= 3 << 3
113
-	case 4:
114
-		b.out = append(b.out,
115
-			byte(b.bitContainer),
116
-			byte(b.bitContainer>>8),
117
-			byte(b.bitContainer>>16),
118
-			byte(b.bitContainer>>24),
119
-		)
120
-		b.bitContainer >>= 4 << 3
121
-	case 5:
122
-		b.out = append(b.out,
123
-			byte(b.bitContainer),
124
-			byte(b.bitContainer>>8),
125
-			byte(b.bitContainer>>16),
126
-			byte(b.bitContainer>>24),
127
-			byte(b.bitContainer>>32),
128
-		)
129
-		b.bitContainer >>= 5 << 3
130
-	case 6:
131
-		b.out = append(b.out,
132
-			byte(b.bitContainer),
133
-			byte(b.bitContainer>>8),
134
-			byte(b.bitContainer>>16),
135
-			byte(b.bitContainer>>24),
136
-			byte(b.bitContainer>>32),
137
-			byte(b.bitContainer>>40),
138
-		)
139
-		b.bitContainer >>= 6 << 3
140
-	case 7:
141
-		b.out = append(b.out,
142
-			byte(b.bitContainer),
143
-			byte(b.bitContainer>>8),
144
-			byte(b.bitContainer>>16),
145
-			byte(b.bitContainer>>24),
146
-			byte(b.bitContainer>>32),
147
-			byte(b.bitContainer>>40),
148
-			byte(b.bitContainer>>48),
149
-		)
150
-		b.bitContainer >>= 7 << 3
151
-	case 8:
152
-		b.out = append(b.out,
153
-			byte(b.bitContainer),
154
-			byte(b.bitContainer>>8),
155
-			byte(b.bitContainer>>16),
156
-			byte(b.bitContainer>>24),
157
-			byte(b.bitContainer>>32),
158
-			byte(b.bitContainer>>40),
159
-			byte(b.bitContainer>>48),
160
-			byte(b.bitContainer>>56),
161
-		)
162
-		b.bitContainer = 0
163
-		b.nBits = 0
164
-		return
165
-	default:
166
-		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
167
-	}
168
-	b.nBits &= 7
169
-}
170
-
171 73
 // flush32 will flush out, so there are at least 32 bits available for writing.
172 74
 func (b *bitWriter) flush32() {
173 75
 	if b.nBits < 32 {
... ...
@@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
201 201
 	b.flushAlign()
202 202
 	return nil
203 203
 }
204
-
205
-// reset and continue writing by appending to out.
206
-func (b *bitWriter) reset(out []byte) {
207
-	b.bitContainer = 0
208
-	b.nBits = 0
209
-	b.out = out
210
-}
... ...
@@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
20 20
 	b.off = 0
21 21
 }
22 22
 
23
-// advance the stream by n bytes.
24
-func (b *byteReader) advance(n uint) {
25
-	b.off += int(n)
26
-}
27
-
28 23
 // Int32 returns a little endian int32 starting at current offset.
29 24
 func (b byteReader) Int32() int32 {
30 25
 	v3 := int32(b.b[b.off+3])
... ...
@@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
43 43
 	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
44 44
 }
45 45
 
46
-// unread returns the unread portion of the input.
47
-func (b byteReader) unread() []byte {
48
-	return b.b[b.off:]
49
-}
50
-
51 46
 // remain will return the number of bytes remaining.
52 47
 func (b byteReader) remain() int {
53 48
 	return len(b.b) - b.off
... ...
@@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
404 404
 	return true
405 405
 }
406 406
 
407
+//lint:ignore U1000 used for debugging
407 408
 func (s *Scratch) validateTable(c cTable) bool {
408 409
 	if len(c) < int(s.symbolLen) {
409 410
 		return false
... ...
@@ -11,7 +11,6 @@ import (
11 11
 
12 12
 type dTable struct {
13 13
 	single []dEntrySingle
14
-	double []dEntryDouble
15 14
 }
16 15
 
17 16
 // single-symbols decoding
... ...
@@ -19,13 +18,6 @@ type dEntrySingle struct {
19 19
 	entry uint16
20 20
 }
21 21
 
22
-// double-symbols decoding
23
-type dEntryDouble struct {
24
-	seq   [4]byte
25
-	nBits uint8
26
-	len   uint8
27
-}
28
-
29 22
 // Uses special code for all tables that are < 8 bits.
30 23
 const use8BitTables = true
31 24
 
... ...
@@ -35,7 +27,7 @@ const use8BitTables = true
35 35
 // If no Scratch is provided a new one is allocated.
36 36
 // The returned Scratch can be used for encoding or decoding input using this table.
37 37
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
38
-	s, err = s.prepare(in)
38
+	s, err = s.prepare(nil)
39 39
 	if err != nil {
40 40
 		return s, nil, err
41 41
 	}
... ...
@@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
236 236
 	return &[4][256]byte{}
237 237
 }
238 238
 
239
-// Decompress1X will decompress a 1X encoded stream.
240
-// The cap of the output buffer will be the maximum decompressed size.
241
-// The length of the supplied input must match the end of a block exactly.
242
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
243
-	if len(d.dt.single) == 0 {
244
-		return nil, errors.New("no table loaded")
245
-	}
246
-	if use8BitTables && d.actualTableLog <= 8 {
247
-		return d.decompress1X8Bit(dst, src)
248
-	}
249
-	var br bitReaderShifted
250
-	err := br.init(src)
251
-	if err != nil {
252
-		return dst, err
253
-	}
254
-	maxDecodedSize := cap(dst)
255
-	dst = dst[:0]
256
-
257
-	// Avoid bounds check by always having full sized table.
258
-	const tlSize = 1 << tableLogMax
259
-	const tlMask = tlSize - 1
260
-	dt := d.dt.single[:tlSize]
261
-
262
-	// Use temp table to avoid bound checks/append penalty.
263
-	bufs := d.buffer()
264
-	buf := &bufs[0]
265
-	var off uint8
266
-
267
-	for br.off >= 8 {
268
-		br.fillFast()
269
-		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
270
-		br.advance(uint8(v.entry))
271
-		buf[off+0] = uint8(v.entry >> 8)
272
-
273
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
274
-		br.advance(uint8(v.entry))
275
-		buf[off+1] = uint8(v.entry >> 8)
276
-
277
-		// Refill
278
-		br.fillFast()
279
-
280
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
281
-		br.advance(uint8(v.entry))
282
-		buf[off+2] = uint8(v.entry >> 8)
283
-
284
-		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
285
-		br.advance(uint8(v.entry))
286
-		buf[off+3] = uint8(v.entry >> 8)
287
-
288
-		off += 4
289
-		if off == 0 {
290
-			if len(dst)+256 > maxDecodedSize {
291
-				br.close()
292
-				d.bufs.Put(bufs)
293
-				return nil, ErrMaxDecodedSizeExceeded
294
-			}
295
-			dst = append(dst, buf[:]...)
296
-		}
297
-	}
298
-
299
-	if len(dst)+int(off) > maxDecodedSize {
300
-		d.bufs.Put(bufs)
301
-		br.close()
302
-		return nil, ErrMaxDecodedSizeExceeded
303
-	}
304
-	dst = append(dst, buf[:off]...)
305
-
306
-	// br < 8, so uint8 is fine
307
-	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
308
-	for bitsLeft > 0 {
309
-		br.fill()
310
-		if false && br.bitsRead >= 32 {
311
-			if br.off >= 4 {
312
-				v := br.in[br.off-4:]
313
-				v = v[:4]
314
-				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
315
-				br.value = (br.value << 32) | uint64(low)
316
-				br.bitsRead -= 32
317
-				br.off -= 4
318
-			} else {
319
-				for br.off > 0 {
320
-					br.value = (br.value << 8) | uint64(br.in[br.off-1])
321
-					br.bitsRead -= 8
322
-					br.off--
323
-				}
324
-			}
325
-		}
326
-		if len(dst) >= maxDecodedSize {
327
-			d.bufs.Put(bufs)
328
-			br.close()
329
-			return nil, ErrMaxDecodedSizeExceeded
330
-		}
331
-		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
332
-		nBits := uint8(v.entry)
333
-		br.advance(nBits)
334
-		bitsLeft -= nBits
335
-		dst = append(dst, uint8(v.entry>>8))
336
-	}
337
-	d.bufs.Put(bufs)
338
-	return dst, br.close()
339
-}
340
-
341 239
 // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
342 240
 // The cap of the output buffer will be the maximum decompressed size.
343 241
 // The length of the supplied input must match the end of a block exactly.
... ...
@@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
995 995
 
996 996
 	const shift = 56
997 997
 	const tlSize = 1 << 8
998
-	const tlMask = tlSize - 1
999 998
 	single := d.dt.single[:tlSize]
1000 999
 
1001 1000
 	// Use temp table to avoid bound checks/append penalty.
1002 1001
deleted file mode 100644
... ...
@@ -1,488 +0,0 @@
1
-// +build !appengine
2
-// +build gc
3
-// +build !noasm
4
-
5
-#include "textflag.h"
6
-#include "funcdata.h"
7
-#include "go_asm.h"
8
-
9
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
10
-
11
-// func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
12
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
13
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
14
-#define off             R8
15
-#define buffer          DI
16
-#define table           SI
17
-
18
-#define br_bits_read    R9
19
-#define br_value        R10
20
-#define br_offset       R11
21
-#define peek_bits       R12
22
-#define exhausted       DX
23
-
24
-#define br0             R13
25
-#define br1             R14
26
-#define br2             R15
27
-#define br3             BP
28
-
29
-	MOVQ BP, 0(SP)
30
-
31
-	XORQ exhausted, exhausted // exhausted = false
32
-	XORQ off, off             // off = 0
33
-
34
-	MOVBQZX peekBits+32(FP), peek_bits
35
-	MOVQ    buf+40(FP), buffer
36
-	MOVQ    tbl+48(FP), table
37
-
38
-	MOVQ pbr0+0(FP), br0
39
-	MOVQ pbr1+8(FP), br1
40
-	MOVQ pbr2+16(FP), br2
41
-	MOVQ pbr3+24(FP), br3
42
-
43
-main_loop:
44
-
45
-	// const stream = 0
46
-	// br0.fillFast()
47
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
48
-	MOVQ    bitReaderShifted_value(br0), br_value
49
-	MOVQ    bitReaderShifted_off(br0), br_offset
50
-
51
-	// if b.bitsRead >= 32 {
52
-	CMPQ br_bits_read, $32
53
-	JB   skip_fill0
54
-
55
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
56
-	SUBQ $4, br_offset     // b.off -= 4
57
-
58
-	// v := b.in[b.off-4 : b.off]
59
-	// v = v[:4]
60
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
61
-	MOVQ bitReaderShifted_in(br0), AX
62
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
63
-
64
-	// b.value |= uint64(low) << (b.bitsRead & 63)
65
-	MOVQ br_bits_read, CX
66
-	SHLQ CL, AX
67
-	ORQ  AX, br_value
68
-
69
-	// exhausted = exhausted || (br0.off < 4)
70
-	CMPQ  br_offset, $4
71
-	SETLT DL
72
-	ORB   DL, DH
73
-
74
-	// }
75
-skip_fill0:
76
-
77
-	// val0 := br0.peekTopBits(peekBits)
78
-	MOVQ br_value, AX
79
-	MOVQ peek_bits, CX
80
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
81
-
82
-	// v0 := table[val0&mask]
83
-	MOVW 0(table)(AX*2), AX // AX - v0
84
-
85
-	// br0.advance(uint8(v0.entry))
86
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
87
-	MOVBQZX AL, CX
88
-	SHLQ    CL, br_value     // value <<= n
89
-	ADDQ    CX, br_bits_read // bits_read += n
90
-
91
-	// val1 := br0.peekTopBits(peekBits)
92
-	MOVQ peek_bits, CX
93
-	MOVQ br_value, AX
94
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
95
-
96
-	// v1 := table[val1&mask]
97
-	MOVW 0(table)(AX*2), AX // AX - v1
98
-
99
-	// br0.advance(uint8(v1.entry))
100
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
101
-	MOVBQZX AL, CX
102
-	SHLQ    CX, br_value     // value <<= n
103
-	ADDQ    CX, br_bits_read // bits_read += n
104
-
105
-	// these two writes get coalesced
106
-	// buf[stream][off] = uint8(v0.entry >> 8)
107
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
108
-	MOVW BX, 0(buffer)(off*1)
109
-
110
-	// SECOND PART:
111
-	// val2 := br0.peekTopBits(peekBits)
112
-	MOVQ br_value, AX
113
-	MOVQ peek_bits, CX
114
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
115
-
116
-	// v2 := table[val0&mask]
117
-	MOVW 0(table)(AX*2), AX // AX - v0
118
-
119
-	// br0.advance(uint8(v0.entry))
120
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
121
-	MOVBQZX AL, CX
122
-	SHLQ    CL, br_value     // value <<= n
123
-	ADDQ    CX, br_bits_read // bits_read += n
124
-
125
-	// val3 := br0.peekTopBits(peekBits)
126
-	MOVQ peek_bits, CX
127
-	MOVQ br_value, AX
128
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
129
-
130
-	// v3 := table[val1&mask]
131
-	MOVW 0(table)(AX*2), AX // AX - v1
132
-
133
-	// br0.advance(uint8(v1.entry))
134
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
135
-	MOVBQZX AL, CX
136
-	SHLQ    CX, br_value     // value <<= n
137
-	ADDQ    CX, br_bits_read // bits_read += n
138
-
139
-	// these two writes get coalesced
140
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
141
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
142
-	MOVW BX, 0+2(buffer)(off*1)
143
-
144
-	// update the bit reader structure
145
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
146
-	MOVQ br_value, bitReaderShifted_value(br0)
147
-	MOVQ br_offset, bitReaderShifted_off(br0)
148
-
149
-	// const stream = 1
150
-	// br1.fillFast()
151
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
152
-	MOVQ    bitReaderShifted_value(br1), br_value
153
-	MOVQ    bitReaderShifted_off(br1), br_offset
154
-
155
-	// if b.bitsRead >= 32 {
156
-	CMPQ br_bits_read, $32
157
-	JB   skip_fill1
158
-
159
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
160
-	SUBQ $4, br_offset     // b.off -= 4
161
-
162
-	// v := b.in[b.off-4 : b.off]
163
-	// v = v[:4]
164
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
165
-	MOVQ bitReaderShifted_in(br1), AX
166
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
167
-
168
-	// b.value |= uint64(low) << (b.bitsRead & 63)
169
-	MOVQ br_bits_read, CX
170
-	SHLQ CL, AX
171
-	ORQ  AX, br_value
172
-
173
-	// exhausted = exhausted || (br1.off < 4)
174
-	CMPQ  br_offset, $4
175
-	SETLT DL
176
-	ORB   DL, DH
177
-
178
-	// }
179
-skip_fill1:
180
-
181
-	// val0 := br1.peekTopBits(peekBits)
182
-	MOVQ br_value, AX
183
-	MOVQ peek_bits, CX
184
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
185
-
186
-	// v0 := table[val0&mask]
187
-	MOVW 0(table)(AX*2), AX // AX - v0
188
-
189
-	// br1.advance(uint8(v0.entry))
190
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
191
-	MOVBQZX AL, CX
192
-	SHLQ    CL, br_value     // value <<= n
193
-	ADDQ    CX, br_bits_read // bits_read += n
194
-
195
-	// val1 := br1.peekTopBits(peekBits)
196
-	MOVQ peek_bits, CX
197
-	MOVQ br_value, AX
198
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
199
-
200
-	// v1 := table[val1&mask]
201
-	MOVW 0(table)(AX*2), AX // AX - v1
202
-
203
-	// br1.advance(uint8(v1.entry))
204
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
205
-	MOVBQZX AL, CX
206
-	SHLQ    CX, br_value     // value <<= n
207
-	ADDQ    CX, br_bits_read // bits_read += n
208
-
209
-	// these two writes get coalesced
210
-	// buf[stream][off] = uint8(v0.entry >> 8)
211
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
212
-	MOVW BX, 256(buffer)(off*1)
213
-
214
-	// SECOND PART:
215
-	// val2 := br1.peekTopBits(peekBits)
216
-	MOVQ br_value, AX
217
-	MOVQ peek_bits, CX
218
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
219
-
220
-	// v2 := table[val0&mask]
221
-	MOVW 0(table)(AX*2), AX // AX - v0
222
-
223
-	// br1.advance(uint8(v0.entry))
224
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
225
-	MOVBQZX AL, CX
226
-	SHLQ    CL, br_value     // value <<= n
227
-	ADDQ    CX, br_bits_read // bits_read += n
228
-
229
-	// val3 := br1.peekTopBits(peekBits)
230
-	MOVQ peek_bits, CX
231
-	MOVQ br_value, AX
232
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
233
-
234
-	// v3 := table[val1&mask]
235
-	MOVW 0(table)(AX*2), AX // AX - v1
236
-
237
-	// br1.advance(uint8(v1.entry))
238
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
239
-	MOVBQZX AL, CX
240
-	SHLQ    CX, br_value     // value <<= n
241
-	ADDQ    CX, br_bits_read // bits_read += n
242
-
243
-	// these two writes get coalesced
244
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
245
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
246
-	MOVW BX, 256+2(buffer)(off*1)
247
-
248
-	// update the bit reader structure
249
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
250
-	MOVQ br_value, bitReaderShifted_value(br1)
251
-	MOVQ br_offset, bitReaderShifted_off(br1)
252
-
253
-	// const stream = 2
254
-	// br2.fillFast()
255
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
256
-	MOVQ    bitReaderShifted_value(br2), br_value
257
-	MOVQ    bitReaderShifted_off(br2), br_offset
258
-
259
-	// if b.bitsRead >= 32 {
260
-	CMPQ br_bits_read, $32
261
-	JB   skip_fill2
262
-
263
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
264
-	SUBQ $4, br_offset     // b.off -= 4
265
-
266
-	// v := b.in[b.off-4 : b.off]
267
-	// v = v[:4]
268
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
269
-	MOVQ bitReaderShifted_in(br2), AX
270
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
271
-
272
-	// b.value |= uint64(low) << (b.bitsRead & 63)
273
-	MOVQ br_bits_read, CX
274
-	SHLQ CL, AX
275
-	ORQ  AX, br_value
276
-
277
-	// exhausted = exhausted || (br2.off < 4)
278
-	CMPQ  br_offset, $4
279
-	SETLT DL
280
-	ORB   DL, DH
281
-
282
-	// }
283
-skip_fill2:
284
-
285
-	// val0 := br2.peekTopBits(peekBits)
286
-	MOVQ br_value, AX
287
-	MOVQ peek_bits, CX
288
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
289
-
290
-	// v0 := table[val0&mask]
291
-	MOVW 0(table)(AX*2), AX // AX - v0
292
-
293
-	// br2.advance(uint8(v0.entry))
294
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
295
-	MOVBQZX AL, CX
296
-	SHLQ    CL, br_value     // value <<= n
297
-	ADDQ    CX, br_bits_read // bits_read += n
298
-
299
-	// val1 := br2.peekTopBits(peekBits)
300
-	MOVQ peek_bits, CX
301
-	MOVQ br_value, AX
302
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
303
-
304
-	// v1 := table[val1&mask]
305
-	MOVW 0(table)(AX*2), AX // AX - v1
306
-
307
-	// br2.advance(uint8(v1.entry))
308
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
309
-	MOVBQZX AL, CX
310
-	SHLQ    CX, br_value     // value <<= n
311
-	ADDQ    CX, br_bits_read // bits_read += n
312
-
313
-	// these two writes get coalesced
314
-	// buf[stream][off] = uint8(v0.entry >> 8)
315
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
316
-	MOVW BX, 512(buffer)(off*1)
317
-
318
-	// SECOND PART:
319
-	// val2 := br2.peekTopBits(peekBits)
320
-	MOVQ br_value, AX
321
-	MOVQ peek_bits, CX
322
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
323
-
324
-	// v2 := table[val0&mask]
325
-	MOVW 0(table)(AX*2), AX // AX - v0
326
-
327
-	// br2.advance(uint8(v0.entry))
328
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
329
-	MOVBQZX AL, CX
330
-	SHLQ    CL, br_value     // value <<= n
331
-	ADDQ    CX, br_bits_read // bits_read += n
332
-
333
-	// val3 := br2.peekTopBits(peekBits)
334
-	MOVQ peek_bits, CX
335
-	MOVQ br_value, AX
336
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
337
-
338
-	// v3 := table[val1&mask]
339
-	MOVW 0(table)(AX*2), AX // AX - v1
340
-
341
-	// br2.advance(uint8(v1.entry))
342
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
343
-	MOVBQZX AL, CX
344
-	SHLQ    CX, br_value     // value <<= n
345
-	ADDQ    CX, br_bits_read // bits_read += n
346
-
347
-	// these two writes get coalesced
348
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
349
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
350
-	MOVW BX, 512+2(buffer)(off*1)
351
-
352
-	// update the bit reader structure
353
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
354
-	MOVQ br_value, bitReaderShifted_value(br2)
355
-	MOVQ br_offset, bitReaderShifted_off(br2)
356
-
357
-	// const stream = 3
358
-	// br3.fillFast()
359
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
360
-	MOVQ    bitReaderShifted_value(br3), br_value
361
-	MOVQ    bitReaderShifted_off(br3), br_offset
362
-
363
-	// if b.bitsRead >= 32 {
364
-	CMPQ br_bits_read, $32
365
-	JB   skip_fill3
366
-
367
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
368
-	SUBQ $4, br_offset     // b.off -= 4
369
-
370
-	// v := b.in[b.off-4 : b.off]
371
-	// v = v[:4]
372
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
373
-	MOVQ bitReaderShifted_in(br3), AX
374
-	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
375
-
376
-	// b.value |= uint64(low) << (b.bitsRead & 63)
377
-	MOVQ br_bits_read, CX
378
-	SHLQ CL, AX
379
-	ORQ  AX, br_value
380
-
381
-	// exhausted = exhausted || (br3.off < 4)
382
-	CMPQ  br_offset, $4
383
-	SETLT DL
384
-	ORB   DL, DH
385
-
386
-	// }
387
-skip_fill3:
388
-
389
-	// val0 := br3.peekTopBits(peekBits)
390
-	MOVQ br_value, AX
391
-	MOVQ peek_bits, CX
392
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
393
-
394
-	// v0 := table[val0&mask]
395
-	MOVW 0(table)(AX*2), AX // AX - v0
396
-
397
-	// br3.advance(uint8(v0.entry))
398
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
399
-	MOVBQZX AL, CX
400
-	SHLQ    CL, br_value     // value <<= n
401
-	ADDQ    CX, br_bits_read // bits_read += n
402
-
403
-	// val1 := br3.peekTopBits(peekBits)
404
-	MOVQ peek_bits, CX
405
-	MOVQ br_value, AX
406
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
407
-
408
-	// v1 := table[val1&mask]
409
-	MOVW 0(table)(AX*2), AX // AX - v1
410
-
411
-	// br3.advance(uint8(v1.entry))
412
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
413
-	MOVBQZX AL, CX
414
-	SHLQ    CX, br_value     // value <<= n
415
-	ADDQ    CX, br_bits_read // bits_read += n
416
-
417
-	// these two writes get coalesced
418
-	// buf[stream][off] = uint8(v0.entry >> 8)
419
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
420
-	MOVW BX, 768(buffer)(off*1)
421
-
422
-	// SECOND PART:
423
-	// val2 := br3.peekTopBits(peekBits)
424
-	MOVQ br_value, AX
425
-	MOVQ peek_bits, CX
426
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
427
-
428
-	// v2 := table[val0&mask]
429
-	MOVW 0(table)(AX*2), AX // AX - v0
430
-
431
-	// br3.advance(uint8(v0.entry))
432
-	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
433
-	MOVBQZX AL, CX
434
-	SHLQ    CL, br_value     // value <<= n
435
-	ADDQ    CX, br_bits_read // bits_read += n
436
-
437
-	// val3 := br3.peekTopBits(peekBits)
438
-	MOVQ peek_bits, CX
439
-	MOVQ br_value, AX
440
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
441
-
442
-	// v3 := table[val1&mask]
443
-	MOVW 0(table)(AX*2), AX // AX - v1
444
-
445
-	// br3.advance(uint8(v1.entry))
446
-	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
447
-	MOVBQZX AL, CX
448
-	SHLQ    CX, br_value     // value <<= n
449
-	ADDQ    CX, br_bits_read // bits_read += n
450
-
451
-	// these two writes get coalesced
452
-	// buf[stream][off+2] = uint8(v2.entry >> 8)
453
-	// buf[stream][off+3] = uint8(v3.entry >> 8)
454
-	MOVW BX, 768+2(buffer)(off*1)
455
-
456
-	// update the bit reader structure
457
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
458
-	MOVQ br_value, bitReaderShifted_value(br3)
459
-	MOVQ br_offset, bitReaderShifted_off(br3)
460
-
461
-	ADDQ $4, off // off += 4
462
-
463
-	TESTB DH, DH // any br[i].ofs < 4?
464
-	JNZ   end
465
-
466
-	CMPQ off, $bufoff
467
-	JL   main_loop
468
-
469
-end:
470
-	MOVQ 0(SP), BP
471
-
472
-	MOVB off, ret+56(FP)
473
-	RET
474
-
475
-#undef off
476
-#undef buffer
477
-#undef table
478
-
479
-#undef br_bits_read
480
-#undef br_value
481
-#undef br_offset
482
-#undef peek_bits
483
-#undef exhausted
484
-
485
-#undef br0
486
-#undef br1
487
-#undef br2
488
-#undef br3
489 1
deleted file mode 100644
... ...
@@ -1,197 +0,0 @@
1
-// +build !appengine
2
-// +build gc
3
-// +build !noasm
4
-
5
-#include "textflag.h"
6
-#include "funcdata.h"
7
-#include "go_asm.h"
8
-
9
-
10
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
11
-
12
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
13
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
14
-TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
15
-#define off             R8
16
-#define buffer          DI
17
-#define table           SI
18
-
19
-#define br_bits_read    R9
20
-#define br_value        R10
21
-#define br_offset       R11
22
-#define peek_bits       R12
23
-#define exhausted       DX
24
-
25
-#define br0             R13
26
-#define br1             R14
27
-#define br2             R15
28
-#define br3             BP
29
-
30
-    MOVQ    BP, 0(SP)
31
-
32
-    XORQ    exhausted, exhausted    // exhausted = false
33
-    XORQ    off, off                // off = 0
34
-
35
-    MOVBQZX peekBits+32(FP), peek_bits
36
-    MOVQ    buf+40(FP), buffer
37
-    MOVQ    tbl+48(FP), table
38
-
39
-    MOVQ    pbr0+0(FP), br0
40
-    MOVQ    pbr1+8(FP), br1
41
-    MOVQ    pbr2+16(FP), br2
42
-    MOVQ    pbr3+24(FP), br3
43
-
44
-main_loop:
45
-{{ define "decode_2_values_x86" }}
46
-    // const stream = {{ var "id" }}
47
-    // br{{ var "id"}}.fillFast()
48
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
49
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
50
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
51
-
52
-	// if b.bitsRead >= 32 {
53
-    CMPQ    br_bits_read, $32
54
-    JB      skip_fill{{ var "id" }}
55
-
56
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
57
-    SUBQ    $4, br_offset           // b.off -= 4
58
-
59
-	// v := b.in[b.off-4 : b.off]
60
-	// v = v[:4]
61
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
62
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
63
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
64
-
65
-	// b.value |= uint64(low) << (b.bitsRead & 63)
66
-    MOVQ    br_bits_read, CX
67
-    SHLQ    CL, AX
68
-    ORQ     AX, br_value
69
-
70
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
71
-    CMPQ    br_offset, $4
72
-    SETLT   DL
73
-    ORB     DL, DH
74
-    // }
75
-skip_fill{{ var "id" }}:
76
-
77
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
78
-    MOVQ    br_value, AX
79
-    MOVQ    peek_bits, CX
80
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
81
-
82
-    // v0 := table[val0&mask]
83
-    MOVW    0(table)(AX*2), AX      // AX - v0
84
-
85
-    // br{{ var "id"}}.advance(uint8(v0.entry))
86
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
87
-    MOVBQZX AL, CX
88
-    SHLQ    CL, br_value            // value <<= n
89
-    ADDQ    CX, br_bits_read        // bits_read += n
90
-
91
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
92
-    MOVQ    peek_bits, CX
93
-    MOVQ    br_value, AX
94
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
95
-
96
-    // v1 := table[val1&mask]
97
-    MOVW    0(table)(AX*2), AX      // AX - v1
98
-
99
-    // br{{ var "id"}}.advance(uint8(v1.entry))
100
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
101
-    MOVBQZX AL, CX
102
-    SHLQ    CX, br_value            // value <<= n
103
-    ADDQ    CX, br_bits_read        // bits_read += n
104
-
105
-
106
-    // these two writes get coalesced
107
-    // buf[stream][off] = uint8(v0.entry >> 8)
108
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
109
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
110
-
111
-    // SECOND PART:
112
-    // val2 := br{{ var "id"}}.peekTopBits(peekBits)
113
-    MOVQ    br_value, AX
114
-    MOVQ    peek_bits, CX
115
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
116
-
117
-    // v2 := table[val0&mask]
118
-    MOVW    0(table)(AX*2), AX      // AX - v0
119
-
120
-    // br{{ var "id"}}.advance(uint8(v0.entry))
121
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
122
-    MOVBQZX AL, CX
123
-    SHLQ    CL, br_value            // value <<= n
124
-    ADDQ    CX, br_bits_read        // bits_read += n
125
-
126
-    // val3 := br{{ var "id"}}.peekTopBits(peekBits)
127
-    MOVQ    peek_bits, CX
128
-    MOVQ    br_value, AX
129
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
130
-
131
-    // v3 := table[val1&mask]
132
-    MOVW    0(table)(AX*2), AX      // AX - v1
133
-
134
-    // br{{ var "id"}}.advance(uint8(v1.entry))
135
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
136
-    MOVBQZX AL, CX
137
-    SHLQ    CX, br_value            // value <<= n
138
-    ADDQ    CX, br_bits_read        // bits_read += n
139
-
140
-
141
-    // these two writes get coalesced
142
-    // buf[stream][off+2] = uint8(v2.entry >> 8)
143
-    // buf[stream][off+3] = uint8(v3.entry >> 8)
144
-    MOVW    BX, {{ var "bufofs" }}+2(buffer)(off*1)
145
-
146
-    // update the bitrader reader structure
147
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
148
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
149
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
150
-{{ end }}
151
-
152
-    {{ set "id" "0" }}
153
-    {{ set "ofs" "0" }}
154
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
155
-    {{ template "decode_2_values_x86" . }}
156
-
157
-    {{ set "id" "1" }}
158
-    {{ set "ofs" "8" }}
159
-    {{ set "bufofs" "256" }}
160
-    {{ template "decode_2_values_x86" . }}
161
-
162
-    {{ set "id" "2" }}
163
-    {{ set "ofs" "16" }}
164
-    {{ set "bufofs" "512" }}
165
-    {{ template "decode_2_values_x86" . }}
166
-
167
-    {{ set "id" "3" }}
168
-    {{ set "ofs" "24" }}
169
-    {{ set "bufofs" "768" }}
170
-    {{ template "decode_2_values_x86" . }}
171
-
172
-    ADDQ    $4, off     // off += 2
173
-
174
-    TESTB   DH, DH      // any br[i].ofs < 4?
175
-    JNZ     end
176
-
177
-    CMPQ    off, $bufoff
178
-    JL      main_loop
179
-end:
180
-    MOVQ    0(SP), BP
181
-
182
-    MOVB    off, ret+56(FP)
183
-    RET
184
-#undef  off
185
-#undef  buffer
186
-#undef  table
187
-
188
-#undef  br_bits_read
189
-#undef  br_value
190
-#undef  br_offset
191
-#undef  peek_bits
192
-#undef  exhausted
193
-
194
-#undef  br0
195
-#undef  br1
196
-#undef  br2
197
-#undef  br3
... ...
@@ -2,30 +2,40 @@
2 2
 // +build amd64,!appengine,!noasm,gc
3 3
 
4 4
 // This file contains the specialisation of Decoder.Decompress4X
5
-// that uses an asm implementation of its main loop.
5
+// and Decoder.Decompress1X that use an asm implementation of their main loops.
6 6
 package huff0
7 7
 
8 8
 import (
9 9
 	"errors"
10 10
 	"fmt"
11
+
12
+	"github.com/klauspost/compress/internal/cpuinfo"
11 13
 )
12 14
 
13 15
 // decompress4x_main_loop_amd64 is an x86 assembler implementation
14 16
 // of Decompress4X when tablelog > 8.
15
-// go:noescape
16
-func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
17
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
17
+//go:noescape
18
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
18 19
 
19 20
 // decompress4x_8b_main_loop_amd64 is an x86 assembler implementation
20 21
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
21 22
 // per loop.
22
-// go:noescape
23
-func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
24
-	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
23
+//go:noescape
24
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
25 25
 
26 26
 // fallback8BitSize is the size where using Go version is faster.
27 27
 const fallback8BitSize = 800
28 28
 
29
+type decompress4xContext struct {
30
+	pbr      *[4]bitReaderShifted
31
+	peekBits uint8
32
+	out      *byte
33
+	dstEvery int
34
+	tbl      *dEntrySingle
35
+	decoded  int
36
+	limit    *byte
37
+}
38
+
29 39
 // Decompress4X will decompress a 4X encoded stream.
30 40
 // The length of the supplied input must match the end of a block exactly.
31 41
 // The *capacity* of the dst slice must match the destination size of
... ...
@@ -42,6 +52,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
42 42
 	if cap(dst) < fallback8BitSize && use8BitTables {
43 43
 		return d.decompress4X8bit(dst, src)
44 44
 	}
45
+
45 46
 	var br [4]bitReaderShifted
46 47
 	// Decode "jump table"
47 48
 	start := 6
... ...
@@ -71,70 +82,25 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
71 71
 	const tlMask = tlSize - 1
72 72
 	single := d.dt.single[:tlSize]
73 73
 
74
-	// Use temp table to avoid bound checks/append penalty.
75
-	buf := d.buffer()
76
-	var off uint8
77 74
 	var decoded int
78 75
 
79
-	const debug = false
80
-
81
-	// see: bitReaderShifted.peekBitsFast()
82
-	peekBits := uint8((64 - d.actualTableLog) & 63)
83
-
84
-	// Decode 2 values from each decoder/loop.
85
-	const bufoff = 256
86
-	for {
87
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
88
-			break
76
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
77
+		ctx := decompress4xContext{
78
+			pbr:      &br,
79
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
80
+			out:      &out[0],
81
+			dstEvery: dstEvery,
82
+			tbl:      &single[0],
83
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
89 84
 		}
90
-
91 85
 		if use8BitTables {
92
-			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
86
+			decompress4x_8b_main_loop_amd64(&ctx)
93 87
 		} else {
94
-			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
95
-		}
96
-		if debug {
97
-			fmt.Print("DEBUG: ")
98
-			fmt.Printf("off=%d,", off)
99
-			for i := 0; i < 4; i++ {
100
-				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
101
-					i, br[i].bitsRead, br[i].value, br[i].off)
102
-			}
103
-			fmt.Println("")
104
-		}
105
-
106
-		if off != 0 {
107
-			break
88
+			decompress4x_main_loop_amd64(&ctx)
108 89
 		}
109 90
 
110
-		if bufoff > dstEvery {
111
-			d.bufs.Put(buf)
112
-			return nil, errors.New("corruption detected: stream overrun 1")
113
-		}
114
-		copy(out, buf[0][:])
115
-		copy(out[dstEvery:], buf[1][:])
116
-		copy(out[dstEvery*2:], buf[2][:])
117
-		copy(out[dstEvery*3:], buf[3][:])
118
-		out = out[bufoff:]
119
-		decoded += bufoff * 4
120
-		// There must at least be 3 buffers left.
121
-		if len(out) < dstEvery*3 {
122
-			d.bufs.Put(buf)
123
-			return nil, errors.New("corruption detected: stream overrun 2")
124
-		}
125
-	}
126
-	if off > 0 {
127
-		ioff := int(off)
128
-		if len(out) < dstEvery*3+ioff {
129
-			d.bufs.Put(buf)
130
-			return nil, errors.New("corruption detected: stream overrun 3")
131
-		}
132
-		copy(out, buf[0][:off])
133
-		copy(out[dstEvery:], buf[1][:off])
134
-		copy(out[dstEvery*2:], buf[2][:off])
135
-		copy(out[dstEvery*3:], buf[3][:off])
136
-		decoded += int(off) * 4
137
-		out = out[off:]
91
+		decoded = ctx.decoded
92
+		out = out[decoded/4:]
138 93
 	}
139 94
 
140 95
 	// Decode remaining.
... ...
@@ -150,7 +116,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
150 150
 		for bitsLeft > 0 {
151 151
 			br.fill()
152 152
 			if offset >= endsAt {
153
-				d.bufs.Put(buf)
154 153
 				return nil, errors.New("corruption detected: stream overrun 4")
155 154
 			}
156 155
 
... ...
@@ -164,7 +129,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
164 164
 			offset++
165 165
 		}
166 166
 		if offset != endsAt {
167
-			d.bufs.Put(buf)
168 167
 			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
169 168
 		}
170 169
 		decoded += offset - dstEvery*i
... ...
@@ -173,9 +137,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
173 173
 			return nil, err
174 174
 		}
175 175
 	}
176
-	d.bufs.Put(buf)
177 176
 	if dstSize != decoded {
178 177
 		return nil, errors.New("corruption detected: short output block")
179 178
 	}
180 179
 	return dst, nil
181 180
 }
181
+
182
+// decompress1x_main_loop_amd64 is an x86 assembler implementation
183
+// of Decompress1X when tablelog > 8.
184
+//go:noescape
185
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
186
+
187
+// decompress1x_main_loop_bmi2 is an x86 with BMI2 assembler implementation
188
+// of Decompress1X when tablelog > 8.
189
+//go:noescape
190
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
191
+
192
+type decompress1xContext struct {
193
+	pbr      *bitReaderShifted
194
+	peekBits uint8
195
+	out      *byte
196
+	outCap   int
197
+	tbl      *dEntrySingle
198
+	decoded  int
199
+}
200
+
201
+// Error reported by asm implementations
202
+const error_max_decoded_size_exeeded = -1
203
+
204
+// Decompress1X will decompress a 1X encoded stream.
205
+// The cap of the output buffer will be the maximum decompressed size.
206
+// The length of the supplied input must match the end of a block exactly.
207
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
208
+	if len(d.dt.single) == 0 {
209
+		return nil, errors.New("no table loaded")
210
+	}
211
+	var br bitReaderShifted
212
+	err := br.init(src)
213
+	if err != nil {
214
+		return dst, err
215
+	}
216
+	maxDecodedSize := cap(dst)
217
+	dst = dst[:maxDecodedSize]
218
+
219
+	const tlSize = 1 << tableLogMax
220
+	const tlMask = tlSize - 1
221
+
222
+	if maxDecodedSize >= 4 {
223
+		ctx := decompress1xContext{
224
+			pbr:      &br,
225
+			out:      &dst[0],
226
+			outCap:   maxDecodedSize,
227
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
228
+			tbl:      &d.dt.single[0],
229
+		}
230
+
231
+		if cpuinfo.HasBMI2() {
232
+			decompress1x_main_loop_bmi2(&ctx)
233
+		} else {
234
+			decompress1x_main_loop_amd64(&ctx)
235
+		}
236
+		if ctx.decoded == error_max_decoded_size_exeeded {
237
+			return nil, ErrMaxDecodedSizeExceeded
238
+		}
239
+
240
+		dst = dst[:ctx.decoded]
241
+	}
242
+
243
+	// br < 8, so uint8 is fine
244
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
245
+	for bitsLeft > 0 {
246
+		br.fill()
247
+		if len(dst) >= maxDecodedSize {
248
+			br.close()
249
+			return nil, ErrMaxDecodedSizeExceeded
250
+		}
251
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
252
+		nBits := uint8(v.entry)
253
+		br.advance(nBits)
254
+		bitsLeft -= nBits
255
+		dst = append(dst, uint8(v.entry>>8))
256
+	}
257
+	return dst, br.close()
258
+}
... ...
@@ -1,506 +1,847 @@
1
-// +build !appengine
2
-// +build gc
3
-// +build !noasm
4
-
5
-#include "textflag.h"
6
-#include "funcdata.h"
7
-#include "go_asm.h"
8
-
9
-#ifdef GOAMD64_v4
10
-#ifndef GOAMD64_v3
11
-#define GOAMD64_v3
12
-#endif
13
-#endif
14
-
15
-#define bufoff      256 // see decompress.go, we're using [4][256]byte table
16
-
17
-// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
18
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
19
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
20
-#define off             R8
21
-#define buffer          DI
22
-#define table           SI
23
-
24
-#define br_bits_read    R9
25
-#define br_value        R10
26
-#define br_offset       R11
27
-#define peek_bits       R12
28
-#define exhausted       DX
29
-
30
-#define br0             R13
31
-#define br1             R14
32
-#define br2             R15
33
-#define br3             BP
34
-
35
-	MOVQ BP, 0(SP)
36
-
37
-	XORQ exhausted, exhausted // exhausted = false
38
-	XORQ off, off             // off = 0
39
-
40
-	MOVBQZX peekBits+32(FP), peek_bits
41
-	MOVQ    buf+40(FP), buffer
42
-	MOVQ    tbl+48(FP), table
43
-
44
-	MOVQ pbr0+0(FP), br0
45
-	MOVQ pbr1+8(FP), br1
46
-	MOVQ pbr2+16(FP), br2
47
-	MOVQ pbr3+24(FP), br3
1
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
48 2
 
49
-main_loop:
50
-
51
-	// const stream = 0
52
-	// br0.fillFast()
53
-	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
54
-	MOVQ    bitReaderShifted_value(br0), br_value
55
-	MOVQ    bitReaderShifted_off(br0), br_offset
3
+//go:build amd64 && !appengine && !noasm && gc
4
+// +build amd64,!appengine,!noasm,gc
56 5
 
57
-	// We must have at least 2 * max tablelog left
58
-	CMPQ br_bits_read, $64-22
59
-	JBE  skip_fill0
6
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
7
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
8
+	XORQ DX, DX
60 9
 
61
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
62
-	SUBQ $4, br_offset     // b.off -= 4
10
+	// Preload values
11
+	MOVQ    ctx+0(FP), AX
12
+	MOVBQZX 8(AX), DI
13
+	MOVQ    16(AX), SI
14
+	MOVQ    48(AX), BX
15
+	MOVQ    24(AX), R9
16
+	MOVQ    32(AX), R10
17
+	MOVQ    (AX), R11
63 18
 
64
-	// v := b.in[b.off-4 : b.off]
65
-	// v = v[:4]
66
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
67
-	MOVQ bitReaderShifted_in(br0), AX
19
+	// Main loop
20
+main_loop:
21
+	MOVQ  SI, R8
22
+	CMPQ  R8, BX
23
+	SETGE DL
24
+
25
+	// br0.fillFast32()
26
+	MOVQ    32(R11), R12
27
+	MOVBQZX 40(R11), R13
28
+	CMPQ    R13, $0x20
29
+	JBE     skip_fill0
30
+	MOVQ    24(R11), AX
31
+	SUBQ    $0x20, R13
32
+	SUBQ    $0x04, AX
33
+	MOVQ    (R11), R14
68 34
 
69 35
 	// b.value |= uint64(low) << (b.bitsRead & 63)
70
-#ifdef GOAMD64_v3
71
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
72
-
73
-#else
74
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
75
-	MOVQ br_bits_read, CX
76
-	SHLQ CL, AX
77
-
78
-#endif
79
-
80
-	ORQ AX, br_value
36
+	MOVL (AX)(R14*1), R14
37
+	MOVQ R13, CX
38
+	SHLQ CL, R14
39
+	MOVQ AX, 24(R11)
40
+	ORQ  R14, R12
81 41
 
82 42
 	// exhausted = exhausted || (br0.off < 4)
83
-	CMPQ  br_offset, $4
84
-	SETLT DL
85
-	ORB   DL, DH
43
+	CMPQ  AX, $0x04
44
+	SETLT AL
45
+	ORB   AL, DL
86 46
 
87
-	// }
88 47
 skip_fill0:
89
-
90 48
 	// val0 := br0.peekTopBits(peekBits)
91
-#ifdef GOAMD64_v3
92
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
93
-
94
-#else
95
-	MOVQ br_value, AX
96
-	MOVQ peek_bits, CX
97
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
98
-
99
-#endif
49
+	MOVQ R12, R14
50
+	MOVQ DI, CX
51
+	SHRQ CL, R14
100 52
 
101 53
 	// v0 := table[val0&mask]
102
-	MOVW 0(table)(AX*2), AX // AX - v0
103
-
104
-	// br0.advance(uint8(v0.entry))
105
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
106
-
107
-#ifdef GOAMD64_v3
108
-	MOVBQZX AL, CX
109
-	SHLXQ   AX, br_value, br_value // value <<= n
110
-
111
-#else
112
-	MOVBQZX AL, CX
113
-	SHLQ    CL, br_value // value <<= n
114
-
115
-#endif
116
-
117
-	ADDQ CX, br_bits_read // bits_read += n
54
+	MOVW (R10)(R14*2), CX
118 55
 
119
-#ifdef GOAMD64_v3
120
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
56
+	// br0.advance(uint8(v0.entry))
57
+	MOVB CH, AL
58
+	SHLQ CL, R12
59
+	ADDB CL, R13
121 60
 
122
-#else
123 61
 	// val1 := br0.peekTopBits(peekBits)
124
-	MOVQ peek_bits, CX
125
-	MOVQ br_value, AX
126
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
127
-
128
-#endif
62
+	MOVQ DI, CX
63
+	MOVQ R12, R14
64
+	SHRQ CL, R14
129 65
 
130 66
 	// v1 := table[val1&mask]
131
-	MOVW 0(table)(AX*2), AX // AX - v1
67
+	MOVW (R10)(R14*2), CX
132 68
 
133 69
 	// br0.advance(uint8(v1.entry))
134
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
135
-
136
-#ifdef GOAMD64_v3
137
-	MOVBQZX AL, CX
138
-	SHLXQ   AX, br_value, br_value // value <<= n
139
-
140
-#else
141
-	MOVBQZX AL, CX
142
-	SHLQ    CL, br_value // value <<= n
143
-
144
-#endif
145
-
146
-	ADDQ CX, br_bits_read // bits_read += n
70
+	MOVB CH, AH
71
+	SHLQ CL, R12
72
+	ADDB CL, R13
147 73
 
148 74
 	// these two writes get coalesced
149
-	// buf[stream][off] = uint8(v0.entry >> 8)
150
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
151
-	MOVW BX, 0(buffer)(off*1)
152
-
153
-	// update the bitrader reader structure
154
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
155
-	MOVQ br_value, bitReaderShifted_value(br0)
156
-	MOVQ br_offset, bitReaderShifted_off(br0)
157
-
158
-	// const stream = 1
159
-	// br1.fillFast()
160
-	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
161
-	MOVQ    bitReaderShifted_value(br1), br_value
162
-	MOVQ    bitReaderShifted_off(br1), br_offset
163
-
164
-	// We must have at least 2 * max tablelog left
165
-	CMPQ br_bits_read, $64-22
166
-	JBE  skip_fill1
167
-
168
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
169
-	SUBQ $4, br_offset     // b.off -= 4
170
-
171
-	// v := b.in[b.off-4 : b.off]
172
-	// v = v[:4]
173
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
174
-	MOVQ bitReaderShifted_in(br1), AX
75
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
76
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
77
+	MOVW AX, (R8)
78
+
79
+	// update the bitreader structure
80
+	MOVQ R12, 32(R11)
81
+	MOVB R13, 40(R11)
82
+	ADDQ R9, R8
83
+
84
+	// br1.fillFast32()
85
+	MOVQ    80(R11), R12
86
+	MOVBQZX 88(R11), R13
87
+	CMPQ    R13, $0x20
88
+	JBE     skip_fill1
89
+	MOVQ    72(R11), AX
90
+	SUBQ    $0x20, R13
91
+	SUBQ    $0x04, AX
92
+	MOVQ    48(R11), R14
175 93
 
176 94
 	// b.value |= uint64(low) << (b.bitsRead & 63)
177
-#ifdef GOAMD64_v3
178
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
179
-
180
-#else
181
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
182
-	MOVQ br_bits_read, CX
183
-	SHLQ CL, AX
184
-
185
-#endif
186
-
187
-	ORQ AX, br_value
95
+	MOVL (AX)(R14*1), R14
96
+	MOVQ R13, CX
97
+	SHLQ CL, R14
98
+	MOVQ AX, 72(R11)
99
+	ORQ  R14, R12
188 100
 
189 101
 	// exhausted = exhausted || (br1.off < 4)
190
-	CMPQ  br_offset, $4
191
-	SETLT DL
192
-	ORB   DL, DH
102
+	CMPQ  AX, $0x04
103
+	SETLT AL
104
+	ORB   AL, DL
193 105
 
194
-	// }
195 106
 skip_fill1:
196
-
197 107
 	// val0 := br1.peekTopBits(peekBits)
198
-#ifdef GOAMD64_v3
199
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
200
-
201
-#else
202
-	MOVQ br_value, AX
203
-	MOVQ peek_bits, CX
204
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
205
-
206
-#endif
108
+	MOVQ R12, R14
109
+	MOVQ DI, CX
110
+	SHRQ CL, R14
207 111
 
208 112
 	// v0 := table[val0&mask]
209
-	MOVW 0(table)(AX*2), AX // AX - v0
210
-
211
-	// br1.advance(uint8(v0.entry))
212
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
213
-
214
-#ifdef GOAMD64_v3
215
-	MOVBQZX AL, CX
216
-	SHLXQ   AX, br_value, br_value // value <<= n
217
-
218
-#else
219
-	MOVBQZX AL, CX
220
-	SHLQ    CL, br_value // value <<= n
221
-
222
-#endif
223
-
224
-	ADDQ CX, br_bits_read // bits_read += n
113
+	MOVW (R10)(R14*2), CX
225 114
 
226
-#ifdef GOAMD64_v3
227
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
115
+	// br1.advance(uint8(v0.entry))
116
+	MOVB CH, AL
117
+	SHLQ CL, R12
118
+	ADDB CL, R13
228 119
 
229
-#else
230 120
 	// val1 := br1.peekTopBits(peekBits)
231
-	MOVQ peek_bits, CX
232
-	MOVQ br_value, AX
233
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
234
-
235
-#endif
121
+	MOVQ DI, CX
122
+	MOVQ R12, R14
123
+	SHRQ CL, R14
236 124
 
237 125
 	// v1 := table[val1&mask]
238
-	MOVW 0(table)(AX*2), AX // AX - v1
126
+	MOVW (R10)(R14*2), CX
239 127
 
240 128
 	// br1.advance(uint8(v1.entry))
241
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
242
-
243
-#ifdef GOAMD64_v3
244
-	MOVBQZX AL, CX
245
-	SHLXQ   AX, br_value, br_value // value <<= n
246
-
247
-#else
248
-	MOVBQZX AL, CX
249
-	SHLQ    CL, br_value // value <<= n
250
-
251
-#endif
252
-
253
-	ADDQ CX, br_bits_read // bits_read += n
129
+	MOVB CH, AH
130
+	SHLQ CL, R12
131
+	ADDB CL, R13
254 132
 
255 133
 	// these two writes get coalesced
256
-	// buf[stream][off] = uint8(v0.entry >> 8)
257
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
258
-	MOVW BX, 256(buffer)(off*1)
259
-
260
-	// update the bitrader reader structure
261
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
262
-	MOVQ br_value, bitReaderShifted_value(br1)
263
-	MOVQ br_offset, bitReaderShifted_off(br1)
264
-
265
-	// const stream = 2
266
-	// br2.fillFast()
267
-	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
268
-	MOVQ    bitReaderShifted_value(br2), br_value
269
-	MOVQ    bitReaderShifted_off(br2), br_offset
270
-
271
-	// We must have at least 2 * max tablelog left
272
-	CMPQ br_bits_read, $64-22
273
-	JBE  skip_fill2
274
-
275
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
276
-	SUBQ $4, br_offset     // b.off -= 4
277
-
278
-	// v := b.in[b.off-4 : b.off]
279
-	// v = v[:4]
280
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
281
-	MOVQ bitReaderShifted_in(br2), AX
134
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
135
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
136
+	MOVW AX, (R8)
137
+
138
+	// update the bitreader structure
139
+	MOVQ R12, 80(R11)
140
+	MOVB R13, 88(R11)
141
+	ADDQ R9, R8
142
+
143
+	// br2.fillFast32()
144
+	MOVQ    128(R11), R12
145
+	MOVBQZX 136(R11), R13
146
+	CMPQ    R13, $0x20
147
+	JBE     skip_fill2
148
+	MOVQ    120(R11), AX
149
+	SUBQ    $0x20, R13
150
+	SUBQ    $0x04, AX
151
+	MOVQ    96(R11), R14
282 152
 
283 153
 	// b.value |= uint64(low) << (b.bitsRead & 63)
284
-#ifdef GOAMD64_v3
285
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
286
-
287
-#else
288
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
289
-	MOVQ br_bits_read, CX
290
-	SHLQ CL, AX
291
-
292
-#endif
293
-
294
-	ORQ AX, br_value
154
+	MOVL (AX)(R14*1), R14
155
+	MOVQ R13, CX
156
+	SHLQ CL, R14
157
+	MOVQ AX, 120(R11)
158
+	ORQ  R14, R12
295 159
 
296 160
 	// exhausted = exhausted || (br2.off < 4)
297
-	CMPQ  br_offset, $4
298
-	SETLT DL
299
-	ORB   DL, DH
161
+	CMPQ  AX, $0x04
162
+	SETLT AL
163
+	ORB   AL, DL
300 164
 
301
-	// }
302 165
 skip_fill2:
303
-
304 166
 	// val0 := br2.peekTopBits(peekBits)
305
-#ifdef GOAMD64_v3
306
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
307
-
308
-#else
309
-	MOVQ br_value, AX
310
-	MOVQ peek_bits, CX
311
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
312
-
313
-#endif
167
+	MOVQ R12, R14
168
+	MOVQ DI, CX
169
+	SHRQ CL, R14
314 170
 
315 171
 	// v0 := table[val0&mask]
316
-	MOVW 0(table)(AX*2), AX // AX - v0
317
-
318
-	// br2.advance(uint8(v0.entry))
319
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
320
-
321
-#ifdef GOAMD64_v3
322
-	MOVBQZX AL, CX
323
-	SHLXQ   AX, br_value, br_value // value <<= n
324
-
325
-#else
326
-	MOVBQZX AL, CX
327
-	SHLQ    CL, br_value // value <<= n
328
-
329
-#endif
172
+	MOVW (R10)(R14*2), CX
330 173
 
331
-	ADDQ CX, br_bits_read // bits_read += n
174
+	// br2.advance(uint8(v0.entry))
175
+	MOVB CH, AL
176
+	SHLQ CL, R12
177
+	ADDB CL, R13
332 178
 
333
-#ifdef GOAMD64_v3
334
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
335
-
336
-#else
337 179
 	// val1 := br2.peekTopBits(peekBits)
338
-	MOVQ peek_bits, CX
339
-	MOVQ br_value, AX
340
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
341
-
342
-#endif
180
+	MOVQ DI, CX
181
+	MOVQ R12, R14
182
+	SHRQ CL, R14
343 183
 
344 184
 	// v1 := table[val1&mask]
345
-	MOVW 0(table)(AX*2), AX // AX - v1
185
+	MOVW (R10)(R14*2), CX
346 186
 
347 187
 	// br2.advance(uint8(v1.entry))
348
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
188
+	MOVB CH, AH
189
+	SHLQ CL, R12
190
+	ADDB CL, R13
349 191
 
350
-#ifdef GOAMD64_v3
351
-	MOVBQZX AL, CX
352
-	SHLXQ   AX, br_value, br_value // value <<= n
353
-
354
-#else
355
-	MOVBQZX AL, CX
356
-	SHLQ    CL, br_value // value <<= n
192
+	// these two writes get coalesced
193
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
194
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
195
+	MOVW AX, (R8)
196
+
197
+	// update the bitreader structure
198
+	MOVQ R12, 128(R11)
199
+	MOVB R13, 136(R11)
200
+	ADDQ R9, R8
201
+
202
+	// br3.fillFast32()
203
+	MOVQ    176(R11), R12
204
+	MOVBQZX 184(R11), R13
205
+	CMPQ    R13, $0x20
206
+	JBE     skip_fill3
207
+	MOVQ    168(R11), AX
208
+	SUBQ    $0x20, R13
209
+	SUBQ    $0x04, AX
210
+	MOVQ    144(R11), R14
357 211
 
358
-#endif
212
+	// b.value |= uint64(low) << (b.bitsRead & 63)
213
+	MOVL (AX)(R14*1), R14
214
+	MOVQ R13, CX
215
+	SHLQ CL, R14
216
+	MOVQ AX, 168(R11)
217
+	ORQ  R14, R12
359 218
 
360
-	ADDQ CX, br_bits_read // bits_read += n
219
+	// exhausted = exhausted || (br3.off < 4)
220
+	CMPQ  AX, $0x04
221
+	SETLT AL
222
+	ORB   AL, DL
361 223
 
362
-	// these two writes get coalesced
363
-	// buf[stream][off] = uint8(v0.entry >> 8)
364
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
365
-	MOVW BX, 512(buffer)(off*1)
224
+skip_fill3:
225
+	// val0 := br3.peekTopBits(peekBits)
226
+	MOVQ R12, R14
227
+	MOVQ DI, CX
228
+	SHRQ CL, R14
366 229
 
367
-	// update the bitrader reader structure
368
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
369
-	MOVQ br_value, bitReaderShifted_value(br2)
370
-	MOVQ br_offset, bitReaderShifted_off(br2)
230
+	// v0 := table[val0&mask]
231
+	MOVW (R10)(R14*2), CX
371 232
 
372
-	// const stream = 3
373
-	// br3.fillFast()
374
-	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
375
-	MOVQ    bitReaderShifted_value(br3), br_value
376
-	MOVQ    bitReaderShifted_off(br3), br_offset
233
+	// br3.advance(uint8(v0.entry))
234
+	MOVB CH, AL
235
+	SHLQ CL, R12
236
+	ADDB CL, R13
377 237
 
378
-	// We must have at least 2 * max tablelog left
379
-	CMPQ br_bits_read, $64-22
380
-	JBE  skip_fill3
238
+	// val1 := br3.peekTopBits(peekBits)
239
+	MOVQ DI, CX
240
+	MOVQ R12, R14
241
+	SHRQ CL, R14
381 242
 
382
-	SUBQ $32, br_bits_read // b.bitsRead -= 32
383
-	SUBQ $4, br_offset     // b.off -= 4
243
+	// v1 := table[val1&mask]
244
+	MOVW (R10)(R14*2), CX
384 245
 
385
-	// v := b.in[b.off-4 : b.off]
386
-	// v = v[:4]
387
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
388
-	MOVQ bitReaderShifted_in(br3), AX
246
+	// br3.advance(uint8(v1.entry))
247
+	MOVB CH, AH
248
+	SHLQ CL, R12
249
+	ADDB CL, R13
389 250
 
390
-	// b.value |= uint64(low) << (b.bitsRead & 63)
391
-#ifdef GOAMD64_v3
392
-	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
251
+	// these two writes get coalesced
252
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
253
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
254
+	MOVW AX, (R8)
255
+
256
+	// update the bitreader structure
257
+	MOVQ  R12, 176(R11)
258
+	MOVB  R13, 184(R11)
259
+	ADDQ  $0x02, SI
260
+	TESTB DL, DL
261
+	JZ    main_loop
262
+	MOVQ  ctx+0(FP), AX
263
+	SUBQ  16(AX), SI
264
+	SHLQ  $0x02, SI
265
+	MOVQ  SI, 40(AX)
266
+	RET
393 267
 
394
-#else
395
-	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
396
-	MOVQ br_bits_read, CX
397
-	SHLQ CL, AX
268
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
269
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
270
+	XORQ DX, DX
398 271
 
399
-#endif
272
+	// Preload values
273
+	MOVQ    ctx+0(FP), CX
274
+	MOVBQZX 8(CX), DI
275
+	MOVQ    16(CX), BX
276
+	MOVQ    48(CX), SI
277
+	MOVQ    24(CX), R9
278
+	MOVQ    32(CX), R10
279
+	MOVQ    (CX), R11
400 280
 
401
-	ORQ AX, br_value
281
+	// Main loop
282
+main_loop:
283
+	MOVQ  BX, R8
284
+	CMPQ  R8, SI
285
+	SETGE DL
286
+
287
+	// br0.fillFast32()
288
+	MOVQ    32(R11), R12
289
+	MOVBQZX 40(R11), R13
290
+	CMPQ    R13, $0x20
291
+	JBE     skip_fill0
292
+	MOVQ    24(R11), R14
293
+	SUBQ    $0x20, R13
294
+	SUBQ    $0x04, R14
295
+	MOVQ    (R11), R15
402 296
 
403
-	// exhausted = exhausted || (br3.off < 4)
404
-	CMPQ  br_offset, $4
405
-	SETLT DL
406
-	ORB   DL, DH
297
+	// b.value |= uint64(low) << (b.bitsRead & 63)
298
+	MOVL (R14)(R15*1), R15
299
+	MOVQ R13, CX
300
+	SHLQ CL, R15
301
+	MOVQ R14, 24(R11)
302
+	ORQ  R15, R12
407 303
 
408
-	// }
409
-skip_fill3:
304
+	// exhausted = exhausted || (br0.off < 4)
305
+	CMPQ  R14, $0x04
306
+	SETLT AL
307
+	ORB   AL, DL
410 308
 
411
-	// val0 := br3.peekTopBits(peekBits)
412
-#ifdef GOAMD64_v3
413
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
309
+skip_fill0:
310
+	// val0 := br0.peekTopBits(peekBits)
311
+	MOVQ R12, R14
312
+	MOVQ DI, CX
313
+	SHRQ CL, R14
414 314
 
415
-#else
416
-	MOVQ br_value, AX
417
-	MOVQ peek_bits, CX
418
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
315
+	// v0 := table[val0&mask]
316
+	MOVW (R10)(R14*2), CX
419 317
 
420
-#endif
318
+	// br0.advance(uint8(v0.entry)
319
+	MOVB CH, AL
320
+	SHLQ CL, R12
321
+	ADDB CL, R13
421 322
 
422
-	// v0 := table[val0&mask]
423
-	MOVW 0(table)(AX*2), AX // AX - v0
323
+	// val1 := br0.peekTopBits(peekBits)
324
+	MOVQ R12, R14
325
+	MOVQ DI, CX
326
+	SHRQ CL, R14
327
+
328
+	// v1 := table[val1&mask]
329
+	MOVW (R10)(R14*2), CX
330
+
331
+	// br0.advance(uint8(v1.entry)
332
+	MOVB   CH, AH
333
+	SHLQ   CL, R12
334
+	ADDB   CL, R13
335
+	BSWAPL AX
336
+
337
+	// val2 := br0.peekTopBits(peekBits)
338
+	MOVQ R12, R14
339
+	MOVQ DI, CX
340
+	SHRQ CL, R14
341
+
342
+	// v2 := table[val2&mask]
343
+	MOVW (R10)(R14*2), CX
344
+
345
+	// br0.advance(uint8(v2.entry)
346
+	MOVB CH, AH
347
+	SHLQ CL, R12
348
+	ADDB CL, R13
349
+
350
+	// val3 := br0.peekTopBits(peekBits)
351
+	MOVQ R12, R14
352
+	MOVQ DI, CX
353
+	SHRQ CL, R14
354
+
355
+	// v3 := table[val3&mask]
356
+	MOVW (R10)(R14*2), CX
357
+
358
+	// br0.advance(uint8(v3.entry)
359
+	MOVB   CH, AL
360
+	SHLQ   CL, R12
361
+	ADDB   CL, R13
362
+	BSWAPL AX
363
+
364
+	// these four writes get coalesced
365
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
366
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
367
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
368
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
369
+	MOVL AX, (R8)
370
+
371
+	// update the bitreader structure
372
+	MOVQ R12, 32(R11)
373
+	MOVB R13, 40(R11)
374
+	ADDQ R9, R8
375
+
376
+	// br1.fillFast32()
377
+	MOVQ    80(R11), R12
378
+	MOVBQZX 88(R11), R13
379
+	CMPQ    R13, $0x20
380
+	JBE     skip_fill1
381
+	MOVQ    72(R11), R14
382
+	SUBQ    $0x20, R13
383
+	SUBQ    $0x04, R14
384
+	MOVQ    48(R11), R15
424 385
 
425
-	// br3.advance(uint8(v0.entry))
426
-	MOVB AH, BL // BL = uint8(v0.entry >> 8)
386
+	// b.value |= uint64(low) << (b.bitsRead & 63)
387
+	MOVL (R14)(R15*1), R15
388
+	MOVQ R13, CX
389
+	SHLQ CL, R15
390
+	MOVQ R14, 72(R11)
391
+	ORQ  R15, R12
427 392
 
428
-#ifdef GOAMD64_v3
429
-	MOVBQZX AL, CX
430
-	SHLXQ   AX, br_value, br_value // value <<= n
393
+	// exhausted = exhausted || (br1.off < 4)
394
+	CMPQ  R14, $0x04
395
+	SETLT AL
396
+	ORB   AL, DL
431 397
 
432
-#else
433
-	MOVBQZX AL, CX
434
-	SHLQ    CL, br_value // value <<= n
398
+skip_fill1:
399
+	// val0 := br1.peekTopBits(peekBits)
400
+	MOVQ R12, R14
401
+	MOVQ DI, CX
402
+	SHRQ CL, R14
435 403
 
436
-#endif
404
+	// v0 := table[val0&mask]
405
+	MOVW (R10)(R14*2), CX
437 406
 
438
-	ADDQ CX, br_bits_read // bits_read += n
407
+	// br1.advance(uint8(v0.entry)
408
+	MOVB CH, AL
409
+	SHLQ CL, R12
410
+	ADDB CL, R13
439 411
 
440
-#ifdef GOAMD64_v3
441
-	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
412
+	// val1 := br1.peekTopBits(peekBits)
413
+	MOVQ R12, R14
414
+	MOVQ DI, CX
415
+	SHRQ CL, R14
416
+
417
+	// v1 := table[val1&mask]
418
+	MOVW (R10)(R14*2), CX
419
+
420
+	// br1.advance(uint8(v1.entry)
421
+	MOVB   CH, AH
422
+	SHLQ   CL, R12
423
+	ADDB   CL, R13
424
+	BSWAPL AX
425
+
426
+	// val2 := br1.peekTopBits(peekBits)
427
+	MOVQ R12, R14
428
+	MOVQ DI, CX
429
+	SHRQ CL, R14
430
+
431
+	// v2 := table[val2&mask]
432
+	MOVW (R10)(R14*2), CX
433
+
434
+	// br1.advance(uint8(v2.entry)
435
+	MOVB CH, AH
436
+	SHLQ CL, R12
437
+	ADDB CL, R13
438
+
439
+	// val3 := br1.peekTopBits(peekBits)
440
+	MOVQ R12, R14
441
+	MOVQ DI, CX
442
+	SHRQ CL, R14
443
+
444
+	// v3 := table[val3&mask]
445
+	MOVW (R10)(R14*2), CX
446
+
447
+	// br1.advance(uint8(v3.entry)
448
+	MOVB   CH, AL
449
+	SHLQ   CL, R12
450
+	ADDB   CL, R13
451
+	BSWAPL AX
452
+
453
+	// these four writes get coalesced
454
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
455
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
456
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
457
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
458
+	MOVL AX, (R8)
459
+
460
+	// update the bitreader structure
461
+	MOVQ R12, 80(R11)
462
+	MOVB R13, 88(R11)
463
+	ADDQ R9, R8
464
+
465
+	// br2.fillFast32()
466
+	MOVQ    128(R11), R12
467
+	MOVBQZX 136(R11), R13
468
+	CMPQ    R13, $0x20
469
+	JBE     skip_fill2
470
+	MOVQ    120(R11), R14
471
+	SUBQ    $0x20, R13
472
+	SUBQ    $0x04, R14
473
+	MOVQ    96(R11), R15
442 474
 
443
-#else
444
-	// val1 := br3.peekTopBits(peekBits)
445
-	MOVQ peek_bits, CX
446
-	MOVQ br_value, AX
447
-	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
475
+	// b.value |= uint64(low) << (b.bitsRead & 63)
476
+	MOVL (R14)(R15*1), R15
477
+	MOVQ R13, CX
478
+	SHLQ CL, R15
479
+	MOVQ R14, 120(R11)
480
+	ORQ  R15, R12
448 481
 
449
-#endif
482
+	// exhausted = exhausted || (br2.off < 4)
483
+	CMPQ  R14, $0x04
484
+	SETLT AL
485
+	ORB   AL, DL
450 486
 
451
-	// v1 := table[val1&mask]
452
-	MOVW 0(table)(AX*2), AX // AX - v1
487
+skip_fill2:
488
+	// val0 := br2.peekTopBits(peekBits)
489
+	MOVQ R12, R14
490
+	MOVQ DI, CX
491
+	SHRQ CL, R14
453 492
 
454
-	// br3.advance(uint8(v1.entry))
455
-	MOVB AH, BH // BH = uint8(v1.entry >> 8)
493
+	// v0 := table[val0&mask]
494
+	MOVW (R10)(R14*2), CX
456 495
 
457
-#ifdef GOAMD64_v3
458
-	MOVBQZX AL, CX
459
-	SHLXQ   AX, br_value, br_value // value <<= n
496
+	// br2.advance(uint8(v0.entry)
497
+	MOVB CH, AL
498
+	SHLQ CL, R12
499
+	ADDB CL, R13
460 500
 
461
-#else
462
-	MOVBQZX AL, CX
463
-	SHLQ    CL, br_value // value <<= n
501
+	// val1 := br2.peekTopBits(peekBits)
502
+	MOVQ R12, R14
503
+	MOVQ DI, CX
504
+	SHRQ CL, R14
505
+
506
+	// v1 := table[val1&mask]
507
+	MOVW (R10)(R14*2), CX
508
+
509
+	// br2.advance(uint8(v1.entry)
510
+	MOVB   CH, AH
511
+	SHLQ   CL, R12
512
+	ADDB   CL, R13
513
+	BSWAPL AX
514
+
515
+	// val2 := br2.peekTopBits(peekBits)
516
+	MOVQ R12, R14
517
+	MOVQ DI, CX
518
+	SHRQ CL, R14
519
+
520
+	// v2 := table[val2&mask]
521
+	MOVW (R10)(R14*2), CX
522
+
523
+	// br2.advance(uint8(v2.entry)
524
+	MOVB CH, AH
525
+	SHLQ CL, R12
526
+	ADDB CL, R13
527
+
528
+	// val3 := br2.peekTopBits(peekBits)
529
+	MOVQ R12, R14
530
+	MOVQ DI, CX
531
+	SHRQ CL, R14
532
+
533
+	// v3 := table[val3&mask]
534
+	MOVW (R10)(R14*2), CX
535
+
536
+	// br2.advance(uint8(v3.entry)
537
+	MOVB   CH, AL
538
+	SHLQ   CL, R12
539
+	ADDB   CL, R13
540
+	BSWAPL AX
541
+
542
+	// these four writes get coalesced
543
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
544
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
545
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
546
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
547
+	MOVL AX, (R8)
548
+
549
+	// update the bitreader structure
550
+	MOVQ R12, 128(R11)
551
+	MOVB R13, 136(R11)
552
+	ADDQ R9, R8
553
+
554
+	// br3.fillFast32()
555
+	MOVQ    176(R11), R12
556
+	MOVBQZX 184(R11), R13
557
+	CMPQ    R13, $0x20
558
+	JBE     skip_fill3
559
+	MOVQ    168(R11), R14
560
+	SUBQ    $0x20, R13
561
+	SUBQ    $0x04, R14
562
+	MOVQ    144(R11), R15
464 563
 
465
-#endif
564
+	// b.value |= uint64(low) << (b.bitsRead & 63)
565
+	MOVL (R14)(R15*1), R15
566
+	MOVQ R13, CX
567
+	SHLQ CL, R15
568
+	MOVQ R14, 168(R11)
569
+	ORQ  R15, R12
466 570
 
467
-	ADDQ CX, br_bits_read // bits_read += n
571
+	// exhausted = exhausted || (br3.off < 4)
572
+	CMPQ  R14, $0x04
573
+	SETLT AL
574
+	ORB   AL, DL
468 575
 
469
-	// these two writes get coalesced
470
-	// buf[stream][off] = uint8(v0.entry >> 8)
471
-	// buf[stream][off+1] = uint8(v1.entry >> 8)
472
-	MOVW BX, 768(buffer)(off*1)
576
+skip_fill3:
577
+	// val0 := br3.peekTopBits(peekBits)
578
+	MOVQ R12, R14
579
+	MOVQ DI, CX
580
+	SHRQ CL, R14
473 581
 
474
-	// update the bitrader reader structure
475
-	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
476
-	MOVQ br_value, bitReaderShifted_value(br3)
477
-	MOVQ br_offset, bitReaderShifted_off(br3)
582
+	// v0 := table[val0&mask]
583
+	MOVW (R10)(R14*2), CX
478 584
 
479
-	ADDQ $2, off // off += 2
585
+	// br3.advance(uint8(v0.entry)
586
+	MOVB CH, AL
587
+	SHLQ CL, R12
588
+	ADDB CL, R13
480 589
 
481
-	TESTB DH, DH // any br[i].ofs < 4?
482
-	JNZ   end
590
+	// val1 := br3.peekTopBits(peekBits)
591
+	MOVQ R12, R14
592
+	MOVQ DI, CX
593
+	SHRQ CL, R14
594
+
595
+	// v1 := table[val1&mask]
596
+	MOVW (R10)(R14*2), CX
597
+
598
+	// br3.advance(uint8(v1.entry)
599
+	MOVB   CH, AH
600
+	SHLQ   CL, R12
601
+	ADDB   CL, R13
602
+	BSWAPL AX
603
+
604
+	// val2 := br3.peekTopBits(peekBits)
605
+	MOVQ R12, R14
606
+	MOVQ DI, CX
607
+	SHRQ CL, R14
608
+
609
+	// v2 := table[val2&mask]
610
+	MOVW (R10)(R14*2), CX
611
+
612
+	// br3.advance(uint8(v2.entry)
613
+	MOVB CH, AH
614
+	SHLQ CL, R12
615
+	ADDB CL, R13
616
+
617
+	// val3 := br3.peekTopBits(peekBits)
618
+	MOVQ R12, R14
619
+	MOVQ DI, CX
620
+	SHRQ CL, R14
621
+
622
+	// v3 := table[val3&mask]
623
+	MOVW (R10)(R14*2), CX
624
+
625
+	// br3.advance(uint8(v3.entry)
626
+	MOVB   CH, AL
627
+	SHLQ   CL, R12
628
+	ADDB   CL, R13
629
+	BSWAPL AX
630
+
631
+	// these four writes get coalesced
632
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
633
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
634
+	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
635
+	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
636
+	MOVL AX, (R8)
637
+
638
+	// update the bitreader structure
639
+	MOVQ  R12, 176(R11)
640
+	MOVB  R13, 184(R11)
641
+	ADDQ  $0x04, BX
642
+	TESTB DL, DL
643
+	JZ    main_loop
644
+	MOVQ  ctx+0(FP), AX
645
+	SUBQ  16(AX), BX
646
+	SHLQ  $0x02, BX
647
+	MOVQ  BX, 40(AX)
648
+	RET
483 649
 
484
-	CMPQ off, $bufoff
485
-	JL   main_loop
650
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
651
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
652
+	MOVQ    ctx+0(FP), CX
653
+	MOVQ    16(CX), DX
654
+	MOVQ    24(CX), BX
655
+	CMPQ    BX, $0x04
656
+	JB      error_max_decoded_size_exeeded
657
+	LEAQ    (DX)(BX*1), BX
658
+	MOVQ    (CX), SI
659
+	MOVQ    (SI), R8
660
+	MOVQ    24(SI), R9
661
+	MOVQ    32(SI), R10
662
+	MOVBQZX 40(SI), R11
663
+	MOVQ    32(CX), SI
664
+	MOVBQZX 8(CX), DI
665
+	JMP     loop_condition
486 666
 
487
-end:
488
-	MOVQ 0(SP), BP
667
+main_loop:
668
+	// Check if we have room for 4 bytes in the output buffer
669
+	LEAQ 4(DX), CX
670
+	CMPQ CX, BX
671
+	JGE  error_max_decoded_size_exeeded
672
+
673
+	// Decode 4 values
674
+	CMPQ R11, $0x20
675
+	JL   bitReader_fillFast_1_end
676
+	SUBQ $0x20, R11
677
+	SUBQ $0x04, R9
678
+	MOVL (R8)(R9*1), R12
679
+	MOVQ R11, CX
680
+	SHLQ CL, R12
681
+	ORQ  R12, R10
682
+
683
+bitReader_fillFast_1_end:
684
+	MOVQ    DI, CX
685
+	MOVQ    R10, R12
686
+	SHRQ    CL, R12
687
+	MOVW    (SI)(R12*2), CX
688
+	MOVB    CH, AL
689
+	MOVBQZX CL, CX
690
+	ADDQ    CX, R11
691
+	SHLQ    CL, R10
692
+	MOVQ    DI, CX
693
+	MOVQ    R10, R12
694
+	SHRQ    CL, R12
695
+	MOVW    (SI)(R12*2), CX
696
+	MOVB    CH, AH
697
+	MOVBQZX CL, CX
698
+	ADDQ    CX, R11
699
+	SHLQ    CL, R10
700
+	BSWAPL  AX
701
+	CMPQ    R11, $0x20
702
+	JL      bitReader_fillFast_2_end
703
+	SUBQ    $0x20, R11
704
+	SUBQ    $0x04, R9
705
+	MOVL    (R8)(R9*1), R12
706
+	MOVQ    R11, CX
707
+	SHLQ    CL, R12
708
+	ORQ     R12, R10
709
+
710
+bitReader_fillFast_2_end:
711
+	MOVQ    DI, CX
712
+	MOVQ    R10, R12
713
+	SHRQ    CL, R12
714
+	MOVW    (SI)(R12*2), CX
715
+	MOVB    CH, AH
716
+	MOVBQZX CL, CX
717
+	ADDQ    CX, R11
718
+	SHLQ    CL, R10
719
+	MOVQ    DI, CX
720
+	MOVQ    R10, R12
721
+	SHRQ    CL, R12
722
+	MOVW    (SI)(R12*2), CX
723
+	MOVB    CH, AL
724
+	MOVBQZX CL, CX
725
+	ADDQ    CX, R11
726
+	SHLQ    CL, R10
727
+	BSWAPL  AX
728
+
729
+	// Store the decoded values
730
+	MOVL AX, (DX)
731
+	ADDQ $0x04, DX
732
+
733
+loop_condition:
734
+	CMPQ R9, $0x08
735
+	JGE  main_loop
736
+
737
+	// Update ctx structure
738
+	MOVQ ctx+0(FP), AX
739
+	SUBQ 16(AX), DX
740
+	MOVQ DX, 40(AX)
741
+	MOVQ (AX), AX
742
+	MOVQ R9, 24(AX)
743
+	MOVQ R10, 32(AX)
744
+	MOVB R11, 40(AX)
745
+	RET
489 746
 
490
-	MOVB off, ret+56(FP)
747
+	// Report error
748
+error_max_decoded_size_exeeded:
749
+	MOVQ ctx+0(FP), AX
750
+	MOVQ $-1, CX
751
+	MOVQ CX, 40(AX)
491 752
 	RET
492 753
 
493
-#undef off
494
-#undef buffer
495
-#undef table
754
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
755
+// Requires: BMI2
756
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
757
+	MOVQ    ctx+0(FP), CX
758
+	MOVQ    16(CX), DX
759
+	MOVQ    24(CX), BX
760
+	CMPQ    BX, $0x04
761
+	JB      error_max_decoded_size_exeeded
762
+	LEAQ    (DX)(BX*1), BX
763
+	MOVQ    (CX), SI
764
+	MOVQ    (SI), R8
765
+	MOVQ    24(SI), R9
766
+	MOVQ    32(SI), R10
767
+	MOVBQZX 40(SI), R11
768
+	MOVQ    32(CX), SI
769
+	MOVBQZX 8(CX), DI
770
+	JMP     loop_condition
496 771
 
497
-#undef br_bits_read
498
-#undef br_value
499
-#undef br_offset
500
-#undef peek_bits
501
-#undef exhausted
772
+main_loop:
773
+	// Check if we have room for 4 bytes in the output buffer
774
+	LEAQ 4(DX), CX
775
+	CMPQ CX, BX
776
+	JGE  error_max_decoded_size_exeeded
777
+
778
+	// Decode 4 values
779
+	CMPQ  R11, $0x20
780
+	JL    bitReader_fillFast_1_end
781
+	SUBQ  $0x20, R11
782
+	SUBQ  $0x04, R9
783
+	MOVL  (R8)(R9*1), CX
784
+	SHLXQ R11, CX, CX
785
+	ORQ   CX, R10
786
+
787
+bitReader_fillFast_1_end:
788
+	SHRXQ   DI, R10, CX
789
+	MOVW    (SI)(CX*2), CX
790
+	MOVB    CH, AL
791
+	MOVBQZX CL, CX
792
+	ADDQ    CX, R11
793
+	SHLXQ   CX, R10, R10
794
+	SHRXQ   DI, R10, CX
795
+	MOVW    (SI)(CX*2), CX
796
+	MOVB    CH, AH
797
+	MOVBQZX CL, CX
798
+	ADDQ    CX, R11
799
+	SHLXQ   CX, R10, R10
800
+	BSWAPL  AX
801
+	CMPQ    R11, $0x20
802
+	JL      bitReader_fillFast_2_end
803
+	SUBQ    $0x20, R11
804
+	SUBQ    $0x04, R9
805
+	MOVL    (R8)(R9*1), CX
806
+	SHLXQ   R11, CX, CX
807
+	ORQ     CX, R10
808
+
809
+bitReader_fillFast_2_end:
810
+	SHRXQ   DI, R10, CX
811
+	MOVW    (SI)(CX*2), CX
812
+	MOVB    CH, AH
813
+	MOVBQZX CL, CX
814
+	ADDQ    CX, R11
815
+	SHLXQ   CX, R10, R10
816
+	SHRXQ   DI, R10, CX
817
+	MOVW    (SI)(CX*2), CX
818
+	MOVB    CH, AL
819
+	MOVBQZX CL, CX
820
+	ADDQ    CX, R11
821
+	SHLXQ   CX, R10, R10
822
+	BSWAPL  AX
823
+
824
+	// Store the decoded values
825
+	MOVL AX, (DX)
826
+	ADDQ $0x04, DX
827
+
828
+loop_condition:
829
+	CMPQ R9, $0x08
830
+	JGE  main_loop
831
+
832
+	// Update ctx structure
833
+	MOVQ ctx+0(FP), AX
834
+	SUBQ 16(AX), DX
835
+	MOVQ DX, 40(AX)
836
+	MOVQ (AX), AX
837
+	MOVQ R9, 24(AX)
838
+	MOVQ R10, 32(AX)
839
+	MOVB R11, 40(AX)
840
+	RET
502 841
 
503
-#undef br0
504
-#undef br1
505
-#undef br2
506
-#undef br3
842
+	// Report error
843
+error_max_decoded_size_exeeded:
844
+	MOVQ ctx+0(FP), AX
845
+	MOVQ $-1, CX
846
+	MOVQ CX, 40(AX)
847
+	RET
507 848
deleted file mode 100644
... ...
@@ -1,195 +0,0 @@
1
-// +build !appengine
2
-// +build gc
3
-// +build !noasm
4
-
5
-#include "textflag.h"
6
-#include "funcdata.h"
7
-#include "go_asm.h"
8
-
9
-#ifdef GOAMD64_v4
10
-#ifndef GOAMD64_v3
11
-#define GOAMD64_v3
12
-#endif
13
-#endif
14
-
15
-#define bufoff      256     // see decompress.go, we're using [4][256]byte table
16
-
17
-//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
18
-//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
19
-TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
20
-#define off             R8
21
-#define buffer          DI
22
-#define table           SI
23
-
24
-#define br_bits_read    R9
25
-#define br_value        R10
26
-#define br_offset       R11
27
-#define peek_bits       R12
28
-#define exhausted       DX
29
-
30
-#define br0             R13
31
-#define br1             R14
32
-#define br2             R15
33
-#define br3             BP
34
-
35
-    MOVQ    BP, 0(SP)
36
-
37
-    XORQ    exhausted, exhausted    // exhausted = false
38
-    XORQ    off, off                // off = 0
39
-
40
-    MOVBQZX peekBits+32(FP), peek_bits
41
-    MOVQ    buf+40(FP), buffer
42
-    MOVQ    tbl+48(FP), table
43
-
44
-    MOVQ    pbr0+0(FP), br0
45
-    MOVQ    pbr1+8(FP), br1
46
-    MOVQ    pbr2+16(FP), br2
47
-    MOVQ    pbr3+24(FP), br3
48
-
49
-main_loop:
50
-{{ define "decode_2_values_x86" }}
51
-    // const stream = {{ var "id" }}
52
-    // br{{ var "id"}}.fillFast()
53
-    MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
54
-    MOVQ    bitReaderShifted_value(br{{ var "id" }}), br_value
55
-    MOVQ    bitReaderShifted_off(br{{ var "id" }}), br_offset
56
-
57
-    // We must have at least 2 * max tablelog left
58
-    CMPQ    br_bits_read, $64-22
59
-    JBE     skip_fill{{ var "id" }}
60
-
61
-    SUBQ    $32, br_bits_read       // b.bitsRead -= 32
62
-    SUBQ    $4, br_offset           // b.off -= 4
63
-
64
-	// v := b.in[b.off-4 : b.off]
65
-	// v = v[:4]
66
-	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
67
-    MOVQ    bitReaderShifted_in(br{{ var "id" }}), AX
68
-
69
-	// b.value |= uint64(low) << (b.bitsRead & 63)
70
-#ifdef GOAMD64_v3
71
-    SHLXQ   br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
72
-#else
73
-    MOVL    0(br_offset)(AX*1), AX  // AX = uint32(b.in[b.off:b.off+4])
74
-    MOVQ    br_bits_read, CX
75
-    SHLQ    CL, AX
76
-#endif
77
-
78
-    ORQ     AX, br_value
79
-
80
-    // exhausted = exhausted || (br{{ var "id"}}.off < 4)
81
-    CMPQ    br_offset, $4
82
-    SETLT   DL
83
-    ORB     DL, DH
84
-    // }
85
-skip_fill{{ var "id" }}:
86
-
87
-    // val0 := br{{ var "id"}}.peekTopBits(peekBits)
88
-#ifdef GOAMD64_v3
89
-    SHRXQ   peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
90
-#else
91
-    MOVQ    br_value, AX
92
-    MOVQ    peek_bits, CX
93
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
94
-#endif
95
-
96
-    // v0 := table[val0&mask]
97
-    MOVW    0(table)(AX*2), AX      // AX - v0
98
-
99
-    // br{{ var "id"}}.advance(uint8(v0.entry))
100
-    MOVB    AH, BL                  // BL = uint8(v0.entry >> 8)
101
-
102
-#ifdef GOAMD64_v3
103
-    MOVBQZX AL, CX
104
-    SHLXQ   AX, br_value, br_value // value <<= n
105
-#else
106
-    MOVBQZX AL, CX
107
-    SHLQ    CL, br_value            // value <<= n
108
-#endif
109
-
110
-    ADDQ    CX, br_bits_read        // bits_read += n
111
-
112
-
113
-#ifdef GOAMD64_v3
114
-    SHRXQ    peek_bits, br_value, AX  // AX = (value >> peek_bits) & mask
115
-#else
116
-    // val1 := br{{ var "id"}}.peekTopBits(peekBits)
117
-    MOVQ    peek_bits, CX
118
-    MOVQ    br_value, AX
119
-    SHRQ    CL, AX                  // AX = (value >> peek_bits) & mask
120
-#endif
121
-
122
-    // v1 := table[val1&mask]
123
-    MOVW    0(table)(AX*2), AX      // AX - v1
124
-
125
-    // br{{ var "id"}}.advance(uint8(v1.entry))
126
-    MOVB    AH, BH                  // BH = uint8(v1.entry >> 8)
127
-
128
-#ifdef GOAMD64_v3
129
-    MOVBQZX AL, CX
130
-    SHLXQ   AX, br_value, br_value // value <<= n
131
-#else
132
-    MOVBQZX AL, CX
133
-    SHLQ    CL, br_value            // value <<= n
134
-#endif
135
-
136
-    ADDQ    CX, br_bits_read        // bits_read += n
137
-
138
-
139
-    // these two writes get coalesced
140
-    // buf[stream][off] = uint8(v0.entry >> 8)
141
-    // buf[stream][off+1] = uint8(v1.entry >> 8)
142
-    MOVW    BX, {{ var "bufofs" }}(buffer)(off*1)
143
-
144
-    // update the bitrader reader structure
145
-    MOVB    br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
146
-    MOVQ    br_value, bitReaderShifted_value(br{{ var "id" }})
147
-    MOVQ    br_offset, bitReaderShifted_off(br{{ var "id" }})
148
-{{ end }}
149
-
150
-    {{ set "id" "0" }}
151
-    {{ set "ofs" "0" }}
152
-    {{ set "bufofs" "0" }} {{/* id * bufoff */}}
153
-    {{ template "decode_2_values_x86" . }}
154
-
155
-    {{ set "id" "1" }}
156
-    {{ set "ofs" "8" }}
157
-    {{ set "bufofs" "256" }}
158
-    {{ template "decode_2_values_x86" . }}
159
-
160
-    {{ set "id" "2" }}
161
-    {{ set "ofs" "16" }}
162
-    {{ set "bufofs" "512" }}
163
-    {{ template "decode_2_values_x86" . }}
164
-
165
-    {{ set "id" "3" }}
166
-    {{ set "ofs" "24" }}
167
-    {{ set "bufofs" "768" }}
168
-    {{ template "decode_2_values_x86" . }}
169
-
170
-    ADDQ    $2, off     // off += 2
171
-
172
-    TESTB   DH, DH      // any br[i].ofs < 4?
173
-    JNZ     end
174
-
175
-    CMPQ    off, $bufoff
176
-    JL      main_loop
177
-end:
178
-    MOVQ    0(SP), BP
179
-
180
-    MOVB    off, ret+56(FP)
181
-    RET
182
-#undef  off
183
-#undef  buffer
184
-#undef  table
185
-
186
-#undef  br_bits_read
187
-#undef  br_value
188
-#undef  br_offset
189
-#undef  peek_bits
190
-#undef  exhausted
191
-
192
-#undef  br0
193
-#undef  br1
194
-#undef  br2
195
-#undef  br3
... ...
@@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
191 191
 	}
192 192
 	return dst, nil
193 193
 }
194
+
195
+// Decompress1X will decompress a 1X encoded stream.
196
+// The cap of the output buffer will be the maximum decompressed size.
197
+// The length of the supplied input must match the end of a block exactly.
198
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
199
+	if len(d.dt.single) == 0 {
200
+		return nil, errors.New("no table loaded")
201
+	}
202
+	if use8BitTables && d.actualTableLog <= 8 {
203
+		return d.decompress1X8Bit(dst, src)
204
+	}
205
+	var br bitReaderShifted
206
+	err := br.init(src)
207
+	if err != nil {
208
+		return dst, err
209
+	}
210
+	maxDecodedSize := cap(dst)
211
+	dst = dst[:0]
212
+
213
+	// Avoid bounds check by always having full sized table.
214
+	const tlSize = 1 << tableLogMax
215
+	const tlMask = tlSize - 1
216
+	dt := d.dt.single[:tlSize]
217
+
218
+	// Use temp table to avoid bound checks/append penalty.
219
+	bufs := d.buffer()
220
+	buf := &bufs[0]
221
+	var off uint8
222
+
223
+	for br.off >= 8 {
224
+		br.fillFast()
225
+		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
226
+		br.advance(uint8(v.entry))
227
+		buf[off+0] = uint8(v.entry >> 8)
228
+
229
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
230
+		br.advance(uint8(v.entry))
231
+		buf[off+1] = uint8(v.entry >> 8)
232
+
233
+		// Refill
234
+		br.fillFast()
235
+
236
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
237
+		br.advance(uint8(v.entry))
238
+		buf[off+2] = uint8(v.entry >> 8)
239
+
240
+		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
241
+		br.advance(uint8(v.entry))
242
+		buf[off+3] = uint8(v.entry >> 8)
243
+
244
+		off += 4
245
+		if off == 0 {
246
+			if len(dst)+256 > maxDecodedSize {
247
+				br.close()
248
+				d.bufs.Put(bufs)
249
+				return nil, ErrMaxDecodedSizeExceeded
250
+			}
251
+			dst = append(dst, buf[:]...)
252
+		}
253
+	}
254
+
255
+	if len(dst)+int(off) > maxDecodedSize {
256
+		d.bufs.Put(bufs)
257
+		br.close()
258
+		return nil, ErrMaxDecodedSizeExceeded
259
+	}
260
+	dst = append(dst, buf[:off]...)
261
+
262
+	// br.off < 8, so uint8 is fine
263
+	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
264
+	for bitsLeft > 0 {
265
+		br.fill()
266
+		if false && br.bitsRead >= 32 {
267
+			if br.off >= 4 {
268
+				v := br.in[br.off-4:]
269
+				v = v[:4]
270
+				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
271
+				br.value = (br.value << 32) | uint64(low)
272
+				br.bitsRead -= 32
273
+				br.off -= 4
274
+			} else {
275
+				for br.off > 0 {
276
+					br.value = (br.value << 8) | uint64(br.in[br.off-1])
277
+					br.bitsRead -= 8
278
+					br.off--
279
+				}
280
+			}
281
+		}
282
+		if len(dst) >= maxDecodedSize {
283
+			d.bufs.Put(bufs)
284
+			br.close()
285
+			return nil, ErrMaxDecodedSizeExceeded
286
+		}
287
+		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
288
+		nBits := uint8(v.entry)
289
+		br.advance(nBits)
290
+		bitsLeft -= nBits
291
+		dst = append(dst, uint8(v.entry>>8))
292
+	}
293
+	d.bufs.Put(bufs)
294
+	return dst, br.close()
295
+}
194 296
new file mode 100644
... ...
@@ -0,0 +1,34 @@
0
+// Package cpuinfo gives runtime info about the current CPU.
1
+//
2
+// This is a very limited module meant for use internally
3
+// in this project. For a more versatile solution check
4
+// https://github.com/klauspost/cpuid.
5
+package cpuinfo
6
+
7
+// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
8
+func HasBMI1() bool {
9
+	return hasBMI1
10
+}
11
+
12
+// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
13
+func HasBMI2() bool {
14
+	return hasBMI2
15
+}
16
+
17
+// DisableBMI2 will disable BMI2, for testing purposes.
18
+// Call returned function to restore previous state.
19
+func DisableBMI2() func() {
20
+	old := hasBMI2
21
+	hasBMI2 = false
22
+	return func() {
23
+		hasBMI2 = old
24
+	}
25
+}
26
+
27
+// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
28
+func HasBMI() bool {
29
+	return HasBMI1() && HasBMI2()
30
+}
31
+
32
+var hasBMI1 bool
33
+var hasBMI2 bool
0 34
new file mode 100644
... ...
@@ -0,0 +1,11 @@
0
+//go:build amd64 && !appengine && !noasm && gc
1
+// +build amd64,!appengine,!noasm,gc
2
+
3
+package cpuinfo
4
+
5
+// x86extensions is implemented in assembly. It reports the BMI1 and BMI2
+// feature bits returned by CPUID leaf 7 (EBX bits 3 and 8), and returns
+// false for both when the CPU does not support leaf 7.
+//
+//go:noescape
6
+func x86extensions() (bmi1, bmi2 bool)
7
+
8
+func init() {
9
+	hasBMI1, hasBMI2 = x86extensions()
10
+}
0 11
new file mode 100644
... ...
@@ -0,0 +1,36 @@
0
+// +build !appengine
1
+// +build gc
2
+// +build !noasm
3
+
4
+#include "textflag.h"
5
+#include "funcdata.h"
6
+#include "go_asm.h"
7
+
8
+TEXT ·x86extensions(SB), NOSPLIT, $0
9
+	// 1. determine max EAX value
10
+	XORQ AX, AX
11
+	CPUID
12
+
13
+	CMPQ AX, $7
14
+	JB   unsupported
15
+
16
+	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
17
+	MOVQ $7, AX
18
+	MOVQ $0, CX
19
+	CPUID
20
+
21
+	BTQ   $3, BX // bit 3 = BMI1
22
+	SETCS AL
23
+
24
+	BTQ   $8, BX // bit 8 = BMI2
25
+	SETCS AH
26
+
27
+	MOVB AL, bmi1+0(FP)
28
+	MOVB AH, bmi2+1(FP)
29
+	RET
30
+
31
+unsupported:
32
+	XORQ AX, AX
33
+	MOVB AL, bmi1+0(FP)
34
+	MOVB AL, bmi2+1(FP)
35
+	RET
... ...
@@ -386,47 +386,31 @@ In practice this means that concurrency is often limited to utilizing about 3 co
386 386
   
387 387
 ### Benchmarks
388 388
 
389
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
390
-
391 389
 The first two benchmarks are streaming decodes; the rest are smaller inputs.
392
- 
390
+
391
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
392
+
393 393
 ```
394
-BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
395
-BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
396
-
397
-BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
398
-BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
399
-
400
-Concurrent performance:
401
-
402
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
403
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
404
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
405
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
406
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
407
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
408
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
409
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
410
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
411
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
412
-BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
413
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
414
-
415
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
416
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
417
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
418
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
419
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
420
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
421
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
422
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
423
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
424
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
425
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
426
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
394
+BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
395
+BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
396
+
397
+Concurrent blocks, performance:
398
+
399
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
400
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
401
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
402
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
403
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
404
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
405
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
406
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
407
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
408
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
409
+BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
410
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
427 411
 ```
428 412
 
429
-This reflects the performance around May 2020, but this may be out of date.
413
+This reflects the performance around May 2022, but this may be out of date.
430 414
 
431 415
 ## Zstd inside ZIP files
432 416
 
... ...
@@ -63,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 {
63 63
 	return v
64 64
 }
65 65
 
66
-func (b *bitReader) get16BitsFast(n uint8) uint16 {
67
-	const regMask = 64 - 1
68
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
69
-	b.bitsRead += n
70
-	return v
71
-}
72
-
73 66
 // fillFast() will make sure at least 32 bits are available.
74 67
 // There must be at least 4 bytes available.
75 68
 func (b *bitReader) fillFast() {
... ...
@@ -5,8 +5,6 @@
5 5
 
6 6
 package zstd
7 7
 
8
-import "fmt"
9
-
10 8
 // bitWriter will write bits.
11 9
 // First bit will be LSB of the first byte of output.
12 10
 type bitWriter struct {
... ...
@@ -73,80 +71,6 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
73 73
 	b.nBits += bits
74 74
 }
75 75
 
76
-// flush will flush all pending full bytes.
77
-// There will be at least 56 bits available for writing when this has been called.
78
-// Using flush32 is faster, but leaves less space for writing.
79
-func (b *bitWriter) flush() {
80
-	v := b.nBits >> 3
81
-	switch v {
82
-	case 0:
83
-	case 1:
84
-		b.out = append(b.out,
85
-			byte(b.bitContainer),
86
-		)
87
-	case 2:
88
-		b.out = append(b.out,
89
-			byte(b.bitContainer),
90
-			byte(b.bitContainer>>8),
91
-		)
92
-	case 3:
93
-		b.out = append(b.out,
94
-			byte(b.bitContainer),
95
-			byte(b.bitContainer>>8),
96
-			byte(b.bitContainer>>16),
97
-		)
98
-	case 4:
99
-		b.out = append(b.out,
100
-			byte(b.bitContainer),
101
-			byte(b.bitContainer>>8),
102
-			byte(b.bitContainer>>16),
103
-			byte(b.bitContainer>>24),
104
-		)
105
-	case 5:
106
-		b.out = append(b.out,
107
-			byte(b.bitContainer),
108
-			byte(b.bitContainer>>8),
109
-			byte(b.bitContainer>>16),
110
-			byte(b.bitContainer>>24),
111
-			byte(b.bitContainer>>32),
112
-		)
113
-	case 6:
114
-		b.out = append(b.out,
115
-			byte(b.bitContainer),
116
-			byte(b.bitContainer>>8),
117
-			byte(b.bitContainer>>16),
118
-			byte(b.bitContainer>>24),
119
-			byte(b.bitContainer>>32),
120
-			byte(b.bitContainer>>40),
121
-		)
122
-	case 7:
123
-		b.out = append(b.out,
124
-			byte(b.bitContainer),
125
-			byte(b.bitContainer>>8),
126
-			byte(b.bitContainer>>16),
127
-			byte(b.bitContainer>>24),
128
-			byte(b.bitContainer>>32),
129
-			byte(b.bitContainer>>40),
130
-			byte(b.bitContainer>>48),
131
-		)
132
-	case 8:
133
-		b.out = append(b.out,
134
-			byte(b.bitContainer),
135
-			byte(b.bitContainer>>8),
136
-			byte(b.bitContainer>>16),
137
-			byte(b.bitContainer>>24),
138
-			byte(b.bitContainer>>32),
139
-			byte(b.bitContainer>>40),
140
-			byte(b.bitContainer>>48),
141
-			byte(b.bitContainer>>56),
142
-		)
143
-	default:
144
-		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
145
-	}
146
-	b.bitContainer >>= v << 3
147
-	b.nBits &= 7
148
-}
149
-
150 76
 // flush32 will flush out, so there are at least 32 bits available for writing.
151 77
 func (b *bitWriter) flush32() {
152 78
 	if b.nBits < 32 {
... ...
@@ -5,9 +5,14 @@
5 5
 package zstd
6 6
 
7 7
 import (
8
+	"bytes"
9
+	"encoding/binary"
8 10
 	"errors"
9 11
 	"fmt"
10 12
 	"io"
13
+	"io/ioutil"
14
+	"os"
15
+	"path/filepath"
11 16
 	"sync"
12 17
 
13 18
 	"github.com/klauspost/compress/huff0"
... ...
@@ -38,14 +43,14 @@ const (
38 38
 	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
39 39
 	maxCompressedBlockSize = 128 << 10
40 40
 
41
+	compressedBlockOverAlloc    = 16
42
+	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
43
+
41 44
 	// Maximum possible block size (all Raw+Uncompressed).
42 45
 	maxBlockSize = (1 << 21) - 1
43 46
 
44
-	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
45
-	maxCompressedLiteralSize = 1 << 18
46
-	maxRLELiteralSize        = 1 << 20
47
-	maxMatchLen              = 131074
48
-	maxSequences             = 0x7f00 + 0xffff
47
+	maxMatchLen  = 131074
48
+	maxSequences = 0x7f00 + 0xffff
49 49
 
50 50
 	// We support slightly less than the reference decoder to be able to
51 51
 	// use ints on 32 bit archs.
... ...
@@ -97,7 +102,6 @@ type blockDec struct {
97 97
 
98 98
 	// Block is RLE, this is the size.
99 99
 	RLESize uint32
100
-	tmp     [4]byte
101 100
 
102 101
 	Type blockType
103 102
 
... ...
@@ -136,7 +140,7 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
136 136
 	b.Type = blockType((bh >> 1) & 3)
137 137
 	// find size.
138 138
 	cSize := int(bh >> 3)
139
-	maxSize := maxBlockSize
139
+	maxSize := maxCompressedBlockSizeAlloc
140 140
 	switch b.Type {
141 141
 	case blockTypeReserved:
142 142
 		return ErrReservedBlockType
... ...
@@ -157,9 +161,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
157 157
 			println("Data size on stream:", cSize)
158 158
 		}
159 159
 		b.RLESize = 0
160
-		maxSize = maxCompressedBlockSize
160
+		maxSize = maxCompressedBlockSizeAlloc
161 161
 		if windowSize < maxCompressedBlockSize && b.lowMem {
162
-			maxSize = int(windowSize)
162
+			maxSize = int(windowSize) + compressedBlockOverAlloc
163 163
 		}
164 164
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
165 165
 			if debugDecoder {
... ...
@@ -190,9 +194,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
190 190
 	// Read block data.
191 191
 	if cap(b.dataStorage) < cSize {
192 192
 		if b.lowMem || cSize > maxCompressedBlockSize {
193
-			b.dataStorage = make([]byte, 0, cSize)
193
+			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
194 194
 		} else {
195
-			b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
195
+			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
196 196
 		}
197 197
 	}
198 198
 	if cap(b.dst) <= maxSize {
... ...
@@ -360,14 +364,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
360 360
 		}
361 361
 		if cap(b.literalBuf) < litRegenSize {
362 362
 			if b.lowMem {
363
-				b.literalBuf = make([]byte, litRegenSize)
363
+				b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
364 364
 			} else {
365
-				if litRegenSize > maxCompressedLiteralSize {
366
-					// Exceptional
367
-					b.literalBuf = make([]byte, litRegenSize)
368
-				} else {
369
-					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
370
-				}
365
+				b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
371 366
 			}
372 367
 		}
373 368
 		literals = b.literalBuf[:litRegenSize]
... ...
@@ -397,14 +396,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
397 397
 		// Ensure we have space to store it.
398 398
 		if cap(b.literalBuf) < litRegenSize {
399 399
 			if b.lowMem {
400
-				b.literalBuf = make([]byte, 0, litRegenSize)
400
+				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
401 401
 			} else {
402
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
402
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
403 403
 			}
404 404
 		}
405 405
 		var err error
406 406
 		// Use our out buffer.
407
-		huff.MaxDecodedSize = maxCompressedBlockSize
407
+		huff.MaxDecodedSize = litRegenSize
408 408
 		if fourStreams {
409 409
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
410 410
 		} else {
... ...
@@ -429,9 +428,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
429 429
 		// Ensure we have space to store it.
430 430
 		if cap(b.literalBuf) < litRegenSize {
431 431
 			if b.lowMem {
432
-				b.literalBuf = make([]byte, 0, litRegenSize)
432
+				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
433 433
 			} else {
434
-				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
434
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
435 435
 			}
436 436
 		}
437 437
 		huff := hist.huffTree
... ...
@@ -448,7 +447,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
448 448
 			return in, err
449 449
 		}
450 450
 		hist.huffTree = huff
451
-		huff.MaxDecodedSize = maxCompressedBlockSize
451
+		huff.MaxDecodedSize = litRegenSize
452 452
 		// Use our out buffer.
453 453
 		if fourStreams {
454 454
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
... ...
@@ -463,6 +462,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
463 463
 		if len(literals) != litRegenSize {
464 464
 			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
465 465
 		}
466
+		// Re-cap to get extra size.
467
+		literals = b.literalBuf[:len(literals)]
466 468
 		if debugDecoder {
467 469
 			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
468 470
 		}
... ...
@@ -486,10 +487,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
486 486
 		b.dst = append(b.dst, hist.decoders.literals...)
487 487
 		return nil
488 488
 	}
489
-	err = hist.decoders.decodeSync(hist)
489
+	before := len(hist.decoders.out)
490
+	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
490 491
 	if err != nil {
491 492
 		return err
492 493
 	}
494
+	if hist.decoders.maxSyncLen > 0 {
495
+		hist.decoders.maxSyncLen += uint64(before)
496
+		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
497
+	}
493 498
 	b.dst = hist.decoders.out
494 499
 	hist.recentOffsets = hist.decoders.prevOffset
495 500
 	return nil
... ...
@@ -632,6 +638,22 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
632 632
 		println("initializing sequences:", err)
633 633
 		return err
634 634
 	}
635
+	// Extract blocks...
636
+	if false && hist.dict == nil {
637
+		fatalErr := func(err error) {
638
+			if err != nil {
639
+				panic(err)
640
+			}
641
+		}
642
+		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
643
+		var buf bytes.Buffer
644
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
645
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
646
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
647
+		buf.Write(in)
648
+		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
649
+	}
650
+
635 651
 	return nil
636 652
 }
637 653
 
... ...
@@ -650,6 +672,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
650 650
 	}
651 651
 	hist.decoders.windowSize = hist.windowSize
652 652
 	hist.decoders.prevOffset = hist.recentOffsets
653
+
653 654
 	err := hist.decoders.decode(b.sequence)
654 655
 	hist.recentOffsets = hist.decoders.prevOffset
655 656
 	return err
... ...
@@ -23,7 +23,7 @@ type byteBuffer interface {
23 23
 	readByte() (byte, error)
24 24
 
25 25
 	// Skip n bytes.
26
-	skipN(n int) error
26
+	skipN(n int64) error
27 27
 }
28 28
 
29 29
 // in-memory buffer
... ...
@@ -52,10 +52,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
52 52
 	return r, nil
53 53
 }
54 54
 
55
-func (b *byteBuf) remain() []byte {
56
-	return *b
57
-}
58
-
59 55
 func (b *byteBuf) readByte() (byte, error) {
60 56
 	bb := *b
61 57
 	if len(bb) < 1 {
... ...
@@ -66,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) {
66 66
 	return r, nil
67 67
 }
68 68
 
69
-func (b *byteBuf) skipN(n int) error {
69
+func (b *byteBuf) skipN(n int64) error {
70 70
 	bb := *b
71
-	if len(bb) < n {
71
+	if n < 0 {
72
+		return fmt.Errorf("negative skip (%d) requested", n)
73
+	}
74
+	if int64(len(bb)) < n {
72 75
 		return io.ErrUnexpectedEOF
73 76
 	}
74 77
 	*b = bb[n:]
... ...
@@ -124,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) {
124 124
 	return r.tmp[0], nil
125 125
 }
126 126
 
127
-func (r *readerWrapper) skipN(n int) error {
128
-	n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
129
-	if n2 != int64(n) {
127
+func (r *readerWrapper) skipN(n int64) error {
128
+	n2, err := io.CopyN(ioutil.Discard, r.r, n)
129
+	if n2 != n {
130 130
 		err = io.ErrUnexpectedEOF
131 131
 	}
132 132
 	return err
... ...
@@ -13,12 +13,6 @@ type byteReader struct {
13 13
 	off int
14 14
 }
15 15
 
16
-// init will initialize the reader and set the input.
17
-func (b *byteReader) init(in []byte) {
18
-	b.b = in
19
-	b.off = 0
20
-}
21
-
22 16
 // advance the stream by n bytes.
23 17
 func (b *byteReader) advance(n uint) {
24 18
 	b.off += int(n)
... ...
@@ -347,18 +347,23 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
347 347
 			}
348 348
 			frame.history.setDict(&dict)
349 349
 		}
350
-
351
-		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
352
-			return dst, ErrDecoderSizeExceeded
350
+		if frame.WindowSize > d.o.maxWindowSize {
351
+			if debugDecoder {
352
+				println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
353
+			}
354
+			return dst, ErrWindowSizeExceeded
353 355
 		}
354
-		if frame.FrameContentSize < 1<<30 {
355
-			// Never preallocate more than 1 GB up front.
356
+		if frame.FrameContentSize != fcsUnknown {
357
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
358
+				return dst, ErrDecoderSizeExceeded
359
+			}
356 360
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
357
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
361
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
358 362
 				copy(dst2, dst)
359 363
 				dst = dst2
360 364
 			}
361 365
 		}
366
+
362 367
 		if cap(dst) == 0 {
363 368
 			// Allocate len(input) * 2 by default if nothing is provided
364 369
 			// and we didn't get frame content size.
... ...
@@ -437,7 +442,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
437 437
 		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
438 438
 	}
439 439
 
440
-	if len(next.b) > 0 {
440
+	if !d.o.ignoreChecksum && len(next.b) > 0 {
441 441
 		n, err := d.current.crc.Write(next.b)
442 442
 		if err == nil {
443 443
 			if n != len(next.b) {
... ...
@@ -449,7 +454,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
449 449
 		got := d.current.crc.Sum64()
450 450
 		var tmp [4]byte
451 451
 		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
452
-		if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
452
+		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
453 453
 			if debugDecoder {
454 454
 				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
455 455
 			}
... ...
@@ -533,9 +538,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
533 533
 
534 534
 		// Update/Check CRC
535 535
 		if d.frame.HasCheckSum {
536
-			d.frame.crc.Write(d.current.b)
536
+			if !d.o.ignoreChecksum {
537
+				d.frame.crc.Write(d.current.b)
538
+			}
537 539
 			if d.current.d.Last {
538
-				d.current.err = d.frame.checkCRC()
540
+				if !d.o.ignoreChecksum {
541
+					d.current.err = d.frame.checkCRC()
542
+				} else {
543
+					d.current.err = d.frame.consumeCRC()
544
+				}
539 545
 				if d.current.err != nil {
540 546
 					println("CRC error:", d.current.err)
541 547
 					return false
... ...
@@ -629,60 +640,18 @@ func (d *Decoder) startSyncDecoder(r io.Reader) error {
629 629
 
630 630
 // Create Decoder:
631 631
 // ASYNC:
632
-// Spawn 4 go routines.
633
-// 0: Read frames and decode blocks.
634
-// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
635
-// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
636
-// 3: Wait for stream history, execute sequences, send stream history.
632
+// Spawn 3 go routines.
633
+// 0: Read frames and decode block literals.
634
+// 1: Decode sequences.
635
+// 2: Execute sequences, send to output.
637 636
 func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
638 637
 	defer d.streamWg.Done()
639 638
 	br := readerWrapper{r: r}
640 639
 
641
-	var seqPrepare = make(chan *blockDec, d.o.concurrent)
642 640
 	var seqDecode = make(chan *blockDec, d.o.concurrent)
643 641
 	var seqExecute = make(chan *blockDec, d.o.concurrent)
644 642
 
645
-	// Async 1: Prepare blocks...
646
-	go func() {
647
-		var hist history
648
-		var hasErr bool
649
-		for block := range seqPrepare {
650
-			if hasErr {
651
-				if block != nil {
652
-					seqDecode <- block
653
-				}
654
-				continue
655
-			}
656
-			if block.async.newHist != nil {
657
-				if debugDecoder {
658
-					println("Async 1: new history")
659
-				}
660
-				hist.reset()
661
-				if block.async.newHist.dict != nil {
662
-					hist.setDict(block.async.newHist.dict)
663
-				}
664
-			}
665
-			if block.err != nil || block.Type != blockTypeCompressed {
666
-				hasErr = block.err != nil
667
-				seqDecode <- block
668
-				continue
669
-			}
670
-
671
-			remain, err := block.decodeLiterals(block.data, &hist)
672
-			block.err = err
673
-			hasErr = block.err != nil
674
-			if err == nil {
675
-				block.async.literals = hist.decoders.literals
676
-				block.async.seqData = remain
677
-			} else if debugDecoder {
678
-				println("decodeLiterals error:", err)
679
-			}
680
-			seqDecode <- block
681
-		}
682
-		close(seqDecode)
683
-	}()
684
-
685
-	// Async 2: Decode sequences...
643
+	// Async 1: Decode sequences...
686 644
 	go func() {
687 645
 		var hist history
688 646
 		var hasErr bool
... ...
@@ -696,7 +665,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
696 696
 			}
697 697
 			if block.async.newHist != nil {
698 698
 				if debugDecoder {
699
-					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
699
+					println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
700 700
 				}
701 701
 				hist.decoders = block.async.newHist.decoders
702 702
 				hist.recentOffsets = block.async.newHist.recentOffsets
... ...
@@ -750,7 +719,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
750 750
 			}
751 751
 			if block.async.newHist != nil {
752 752
 				if debugDecoder {
753
-					println("Async 3: new history")
753
+					println("Async 2: new history")
754 754
 				}
755 755
 				hist.windowSize = block.async.newHist.windowSize
756 756
 				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
... ...
@@ -837,6 +806,33 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
837 837
 
838 838
 decodeStream:
839 839
 	for {
840
+		var hist history
841
+		var hasErr bool
842
+
843
+		decodeBlock := func(block *blockDec) {
844
+			if hasErr {
845
+				if block != nil {
846
+					seqDecode <- block
847
+				}
848
+				return
849
+			}
850
+			if block.err != nil || block.Type != blockTypeCompressed {
851
+				hasErr = block.err != nil
852
+				seqDecode <- block
853
+				return
854
+			}
855
+
856
+			remain, err := block.decodeLiterals(block.data, &hist)
857
+			block.err = err
858
+			hasErr = block.err != nil
859
+			if err == nil {
860
+				block.async.literals = hist.decoders.literals
861
+				block.async.seqData = remain
862
+			} else if debugDecoder {
863
+				println("decodeLiterals error:", err)
864
+			}
865
+			seqDecode <- block
866
+		}
840 867
 		frame := d.frame
841 868
 		if debugDecoder {
842 869
 			println("New frame...")
... ...
@@ -863,7 +859,7 @@ decodeStream:
863 863
 			case <-ctx.Done():
864 864
 			case dec := <-d.decoders:
865 865
 				dec.sendErr(err)
866
-				seqPrepare <- dec
866
+				decodeBlock(dec)
867 867
 			}
868 868
 			break decodeStream
869 869
 		}
... ...
@@ -883,6 +879,10 @@ decodeStream:
883 883
 				if debugDecoder {
884 884
 					println("Alloc History:", h.allocFrameBuffer)
885 885
 				}
886
+				hist.reset()
887
+				if h.dict != nil {
888
+					hist.setDict(h.dict)
889
+				}
886 890
 				dec.async.newHist = &h
887 891
 				dec.async.fcs = frame.FrameContentSize
888 892
 				historySent = true
... ...
@@ -909,7 +909,7 @@ decodeStream:
909 909
 			}
910 910
 			err = dec.err
911 911
 			last := dec.Last
912
-			seqPrepare <- dec
912
+			decodeBlock(dec)
913 913
 			if err != nil {
914 914
 				break decodeStream
915 915
 			}
... ...
@@ -918,7 +918,7 @@ decodeStream:
918 918
 			}
919 919
 		}
920 920
 	}
921
-	close(seqPrepare)
921
+	close(seqDecode)
922 922
 	wg.Wait()
923 923
 	d.frame.history.b = frameHistCache
924 924
 }
... ...
@@ -19,6 +19,7 @@ type decoderOptions struct {
19 19
 	maxDecodedSize uint64
20 20
 	maxWindowSize  uint64
21 21
 	dicts          []dict
22
+	ignoreChecksum bool
22 23
 }
23 24
 
24 25
 func (o *decoderOptions) setDefault() {
... ...
@@ -31,7 +32,7 @@ func (o *decoderOptions) setDefault() {
31 31
 	if o.concurrent > 4 {
32 32
 		o.concurrent = 4
33 33
 	}
34
-	o.maxDecodedSize = 1 << 63
34
+	o.maxDecodedSize = 64 << 30
35 35
 }
36 36
 
37 37
 // WithDecoderLowmem will set whether to use a lower amount of memory,
... ...
@@ -66,7 +67,7 @@ func WithDecoderConcurrency(n int) DOption {
66 66
 // WithDecoderMaxMemory sets a maximum decoded size for in-memory
67 67
 // non-streaming operations or maximum window size for streaming operations.
68 68
 // This can be used to control memory usage of potentially hostile content.
69
-// Maximum and default is 1 << 63 bytes.
69
+// Maximum is 1 << 63 bytes. Default is 64GiB.
70 70
 func WithDecoderMaxMemory(n uint64) DOption {
71 71
 	return func(o *decoderOptions) error {
72 72
 		if n == 0 {
... ...
@@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
112 112
 		return nil
113 113
 	}
114 114
 }
115
+
116
+// IgnoreChecksum sets whether checksum verification is skipped; pass true to ignore checksums.
117
+func IgnoreChecksum(b bool) DOption {
118
+	return func(o *decoderOptions) error {
119
+		o.ignoreChecksum = b
120
+		return nil
121
+	}
122
+}
... ...
@@ -156,8 +156,8 @@ encodeLoop:
156 156
 				panic("offset0 was 0")
157 157
 			}
158 158
 
159
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
160 159
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
160
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
161 161
 			candidateL := e.longTable[nextHashL]
162 162
 			candidateS := e.table[nextHashS]
163 163
 
... ...
@@ -518,8 +518,8 @@ encodeLoop:
518 518
 			}
519 519
 
520 520
 			// Store this, since we have it.
521
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
522 521
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
522
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
523 523
 
524 524
 			// We have at least 4 byte match.
525 525
 			// No need to check backwards. We come straight from a match
... ...
@@ -674,8 +674,8 @@ encodeLoop:
674 674
 				panic("offset0 was 0")
675 675
 			}
676 676
 
677
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
678 677
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
678
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
679 679
 			candidateL := e.longTable[nextHashL]
680 680
 			candidateS := e.table[nextHashS]
681 681
 
... ...
@@ -1047,8 +1047,8 @@ encodeLoop:
1047 1047
 			}
1048 1048
 
1049 1049
 			// Store this, since we have it.
1050
-			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
1051 1050
 			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
1051
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
1052 1052
 
1053 1053
 			// We have at least 4 byte match.
1054 1054
 			// No need to check backwards. We come straight from a match
... ...
@@ -127,8 +127,8 @@ encodeLoop:
127 127
 				panic("offset0 was 0")
128 128
 			}
129 129
 
130
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
131 130
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
131
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
132 132
 			candidateL := e.longTable[nextHashL]
133 133
 			candidateS := e.table[nextHashS]
134 134
 
... ...
@@ -439,8 +439,8 @@ encodeLoop:
439 439
 		var t int32
440 440
 		for {
441 441
 
442
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
443 442
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
443
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
444 444
 			candidateL := e.longTable[nextHashL]
445 445
 			candidateS := e.table[nextHashS]
446 446
 
... ...
@@ -785,8 +785,8 @@ encodeLoop:
785 785
 				panic("offset0 was 0")
786 786
 			}
787 787
 
788
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
789 788
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
789
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
790 790
 			candidateL := e.longTable[nextHashL]
791 791
 			candidateS := e.table[nextHashS]
792 792
 
... ...
@@ -969,7 +969,7 @@ encodeLoop:
969 969
 		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
970 970
 		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
971 971
 		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
972
-		longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
972
+		longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
973 973
 		e.longTable[longHash1] = te0
974 974
 		e.longTable[longHash2] = te1
975 975
 		e.markLongShardDirty(longHash1)
... ...
@@ -1002,8 +1002,8 @@ encodeLoop:
1002 1002
 			}
1003 1003
 
1004 1004
 			// Store this, since we have it.
1005
-			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
1006 1005
 			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
1006
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
1007 1007
 
1008 1008
 			// We have at least 4 byte match.
1009 1009
 			// No need to check backwards. We come straight from a match
... ...
@@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
528 528
 		// If a non-single block is needed the encoder will reset again.
529 529
 		e.encoders <- enc
530 530
 	}()
531
-	// Use single segments when above minimum window and below 1MB.
532
-	single := len(src) < 1<<20 && len(src) > MinWindowSize
531
+	// Use single segments when above minimum window and below window size.
532
+	single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
533 533
 	if e.o.single != nil {
534 534
 		single = *e.o.single
535 535
 	}
... ...
@@ -551,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
551 551
 	}
552 552
 
553 553
 	// If we can do everything in one block, prefer that.
554
-	if len(src) <= maxCompressedBlockSize {
554
+	if len(src) <= e.o.blockSize {
555 555
 		enc.Reset(e.o.dict, true)
556 556
 		// Slightly faster with no history and everything in one block.
557 557
 		if e.o.crc {
... ...
@@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
283 283
 // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
284 284
 // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
285 285
 // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
286
-// If this is not specified, block encodes will automatically choose this based on the input size.
286
+// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
287 287
 // This setting has no effect on streamed encodes.
288 288
 func WithSingleSegment(b bool) EOption {
289 289
 	return func(o *encoderOptions) error {
... ...
@@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
106 106
 		}
107 107
 		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
108 108
 		println("Skipping frame with", n, "bytes.")
109
-		err = br.skipN(int(n))
109
+		err = br.skipN(int64(n))
110 110
 		if err != nil {
111 111
 			if debugDecoder {
112 112
 				println("Reading discarded frame", err)
... ...
@@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
231 231
 		d.crc.Reset()
232 232
 	}
233 233
 
234
+	if d.WindowSize > d.o.maxWindowSize {
235
+		if debugDecoder {
236
+			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
237
+		}
238
+		return ErrWindowSizeExceeded
239
+	}
240
+
234 241
 	if d.WindowSize == 0 && d.SingleSegment {
235 242
 		// We may not need window in this case.
236 243
 		d.WindowSize = d.FrameContentSize
237 244
 		if d.WindowSize < MinWindowSize {
238 245
 			d.WindowSize = MinWindowSize
239 246
 		}
240
-	}
241
-
242
-	if d.WindowSize > uint64(d.o.maxWindowSize) {
243
-		if debugDecoder {
244
-			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
247
+		if d.WindowSize > d.o.maxDecodedSize {
248
+			if debugDecoder {
249
+				printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
250
+			}
251
+			return ErrDecoderSizeExceeded
245 252
 		}
246
-		return ErrWindowSizeExceeded
247 253
 	}
254
+
248 255
 	// The minimum Window_Size is 1 KB.
249 256
 	if d.WindowSize < MinWindowSize {
250 257
 		if debugDecoder {
... ...
@@ -253,10 +260,11 @@ func (d *frameDec) reset(br byteBuffer) error {
253 253
 		return ErrWindowSizeTooSmall
254 254
 	}
255 255
 	d.history.windowSize = int(d.WindowSize)
256
-	if d.o.lowMem && d.history.windowSize < maxBlockSize {
256
+	if !d.o.lowMem || d.history.windowSize < maxBlockSize {
257
+		// Alloc 2x window size if not low-mem, or very small window size.
257 258
 		d.history.allocFrameBuffer = d.history.windowSize * 2
258
-		// TODO: Maybe use FrameContent size
259 259
 	} else {
260
+		// Alloc with one additional block
260 261
 		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
261 262
 	}
262 263
 
... ...
@@ -290,13 +298,6 @@ func (d *frameDec) checkCRC() error {
290 290
 	if !d.HasCheckSum {
291 291
 		return nil
292 292
 	}
293
-	var tmp [4]byte
294
-	got := d.crc.Sum64()
295
-	// Flip to match file order.
296
-	tmp[0] = byte(got >> 0)
297
-	tmp[1] = byte(got >> 8)
298
-	tmp[2] = byte(got >> 16)
299
-	tmp[3] = byte(got >> 24)
300 293
 
301 294
 	// Read the 4 checksum bytes stored on the stream.
302 295
 	want, err := d.rawInput.readSmall(4)
... ...
@@ -305,7 +306,19 @@ func (d *frameDec) checkCRC() error {
305 305
 		return err
306 306
 	}
307 307
 
308
-	if !bytes.Equal(tmp[:], want) && !ignoreCRC {
308
+	if d.o.ignoreChecksum {
309
+		return nil
310
+	}
311
+
312
+	var tmp [4]byte
313
+	got := d.crc.Sum64()
314
+	// Flip to match file order.
315
+	tmp[0] = byte(got >> 0)
316
+	tmp[1] = byte(got >> 8)
317
+	tmp[2] = byte(got >> 16)
318
+	tmp[3] = byte(got >> 24)
319
+
320
+	if !bytes.Equal(tmp[:], want) {
309 321
 		if debugDecoder {
310 322
 			println("CRC Check Failed:", tmp[:], "!=", want)
311 323
 		}
... ...
@@ -317,6 +330,19 @@ func (d *frameDec) checkCRC() error {
317 317
 	return nil
318 318
 }
319 319
 
320
+// consumeCRC reads the checksum data if the frame has one.
321
+func (d *frameDec) consumeCRC() error {
322
+	if d.HasCheckSum {
323
+		_, err := d.rawInput.readSmall(4)
324
+		if err != nil {
325
+			println("CRC missing?", err)
326
+			return err
327
+		}
328
+	}
329
+
330
+	return nil
331
+}
332
+
320 333
 // runDecoder will create a sync decoder that will decode a block of data.
321 334
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
322 335
 	saved := d.history.b
... ...
@@ -326,6 +352,19 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
326 326
 	d.history.ignoreBuffer = len(dst)
327 327
 	// Store input length, so we only check new data.
328 328
 	crcStart := len(dst)
329
+	d.history.decoders.maxSyncLen = 0
330
+	if d.FrameContentSize != fcsUnknown {
331
+		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
332
+		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
333
+			return dst, ErrDecoderSizeExceeded
334
+		}
335
+		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
336
+			// Alloc for output
337
+			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
338
+			copy(dst2, dst)
339
+			dst = dst2
340
+		}
341
+	}
329 342
 	var err error
330 343
 	for {
331 344
 		err = dec.reset(d.rawInput, d.WindowSize)
... ...
@@ -360,13 +399,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
360 360
 		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
361 361
 			err = ErrFrameSizeMismatch
362 362
 		} else if d.HasCheckSum {
363
-			var n int
364
-			n, err = d.crc.Write(dst[crcStart:])
365
-			if err == nil {
366
-				if n != len(dst)-crcStart {
367
-					err = io.ErrShortWrite
368
-				} else {
369
-					err = d.checkCRC()
363
+			if d.o.ignoreChecksum {
364
+				err = d.consumeCRC()
365
+			} else {
366
+				var n int
367
+				n, err = d.crc.Write(dst[crcStart:])
368
+				if err == nil {
369
+					if n != len(dst)-crcStart {
370
+						err = io.ErrShortWrite
371
+					} else {
372
+						err = d.checkCRC()
373
+					}
370 374
 				}
371 375
 			}
372 376
 		}
... ...
@@ -5,8 +5,10 @@
5 5
 package zstd
6 6
 
7 7
 import (
8
+	"encoding/binary"
8 9
 	"errors"
9 10
 	"fmt"
11
+	"io"
10 12
 )
11 13
 
12 14
 const (
... ...
@@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
178 178
 		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
179 179
 	}
180 180
 	b.advance((bitCount + 7) >> 3)
181
-	// println(s.norm[:s.symbolLen], s.symbolLen)
182 181
 	return s.buildDtable()
183 182
 }
184 183
 
184
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
185
+	fatalErr := func(err error) {
186
+		if err != nil {
187
+			panic(err)
188
+		}
189
+	}
190
+	// 	dt             [maxTablesize]decSymbol // Decompression table.
191
+	//	symbolLen      uint16                  // Length of active part of the symbol table.
192
+	//	actualTableLog uint8                   // Selected tablelog.
193
+	//	maxBits        uint8                   // Maximum number of additional bits
194
+	//	// used for table creation to avoid allocations.
195
+	//	stateTable [256]uint16
196
+	//	norm       [maxSymbolValue + 1]int16
197
+	//	preDefined bool
198
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
199
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
200
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
201
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
202
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
203
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
204
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
205
+}
206
+
185 207
 // decSymbol contains information about a state entry,
186 208
 // Including the state offset base, the output symbol and
187 209
 // the number of bits to read for the low part of the destination state.
... ...
@@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 {
204 204
 	return uint16(d >> 16)
205 205
 }
206 206
 
207
-func (d decSymbol) baseline() uint32 {
208
-	return uint32(d >> 32)
209
-}
210
-
211 207
 // baselineInt returns the bits of d above bit 31 (where setExt stores the baseline) as an int.
 func (d decSymbol) baselineInt() int {
212 208
 	return int(d >> 32)
213 209
 }
214 210
 
215
-func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
216
-	*d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
217
-}
218
-
219 211
 func (d *decSymbol) setNBits(nBits uint8) {
220 212
 	const mask = 0xffffffffffffff00
221 213
 	*d = (*d & mask) | decSymbol(nBits)
... ...
@@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) {
231 231
 	*d = (*d & mask) | decSymbol(state)<<16
232 232
 }
233 233
 
234
-func (d *decSymbol) setBaseline(baseline uint32) {
235
-	const mask = 0xffffffff
236
-	*d = (*d & mask) | decSymbol(baseline)<<32
237
-}
238
-
239 234
 func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
240 235
 	const mask = 0xffff00ff
241 236
 	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
... ...
@@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
257 257
 	s.dt[0] = symbol
258 258
 }
259 259
 
260
-// buildDtable will build the decoding table.
261
-func (s *fseDecoder) buildDtable() error {
262
-	tableSize := uint32(1 << s.actualTableLog)
263
-	highThreshold := tableSize - 1
264
-	symbolNext := s.stateTable[:256]
265
-
266
-	// Init, lay down lowprob symbols
267
-	{
268
-		for i, v := range s.norm[:s.symbolLen] {
269
-			if v == -1 {
270
-				s.dt[highThreshold].setAddBits(uint8(i))
271
-				highThreshold--
272
-				symbolNext[i] = 1
273
-			} else {
274
-				symbolNext[i] = uint16(v)
275
-			}
276
-		}
277
-	}
278
-	// Spread symbols
279
-	{
280
-		tableMask := tableSize - 1
281
-		step := tableStep(tableSize)
282
-		position := uint32(0)
283
-		for ss, v := range s.norm[:s.symbolLen] {
284
-			for i := 0; i < int(v); i++ {
285
-				s.dt[position].setAddBits(uint8(ss))
286
-				position = (position + step) & tableMask
287
-				for position > highThreshold {
288
-					// lowprob area
289
-					position = (position + step) & tableMask
290
-				}
291
-			}
292
-		}
293
-		if position != 0 {
294
-			// position must reach all cells once, otherwise normalizedCounter is incorrect
295
-			return errors.New("corrupted input (position != 0)")
296
-		}
297
-	}
298
-
299
-	// Build Decoding table
300
-	{
301
-		tableSize := uint16(1 << s.actualTableLog)
302
-		for u, v := range s.dt[:tableSize] {
303
-			symbol := v.addBits()
304
-			nextState := symbolNext[symbol]
305
-			symbolNext[symbol] = nextState + 1
306
-			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
307
-			s.dt[u&maxTableMask].setNBits(nBits)
308
-			newState := (nextState << nBits) - tableSize
309
-			if newState > tableSize {
310
-				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
311
-			}
312
-			if newState == uint16(u) && nBits == 0 {
313
-				// Seems weird that this is possible with nbits > 0.
314
-				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
315
-			}
316
-			s.dt[u&maxTableMask].setNewState(newState)
317
-		}
318
-	}
319
-	return nil
320
-}
321
-
322 260
 // transform will transform the decoder table into a table usable for
323 261
 // decoding without having to apply the transformation while decoding.
324 262
 // The state will contain the base value and the number of bits to read.
... ...
@@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
352 352
 	s.state = dt[br.getBits(tableLog)]
353 353
 }
354 354
 
355
-// next returns the current symbol and sets the next state.
356
-// At least tablelog bits must be available in the bit reader.
357
-func (s *fseState) next(br *bitReader) {
358
-	lowBits := uint16(br.getBits(s.state.nbBits()))
359
-	s.state = s.dt[s.state.newState()+lowBits]
360
-}
361
-
362
-// finished returns true if all bits have been read from the bitstream
363
-// and the next state would require reading bits from the input.
364
-func (s *fseState) finished(br *bitReader) bool {
365
-	return br.finished() && s.state.nbBits() > 0
366
-}
367
-
368
-// final returns the current state symbol without decoding the next.
369
-func (s *fseState) final() (int, uint8) {
370
-	return s.state.baselineInt(), s.state.addBits()
371
-}
372
-
373 355
 // final returns the baseline (as int) and the addBits value of the current state symbol without decoding the next.
374 356
 func (s decSymbol) final() (int, uint8) {
375 357
 	return s.baselineInt(), s.addBits()
376 358
 }
377
-
378
-// nextFast returns the next symbol and sets the next state.
379
-// This can only be used if no symbols are 0 bits.
380
-// At least tablelog bits must be available in the bit reader.
381
-func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
382
-	lowBits := br.get16BitsFast(s.state.nbBits())
383
-	s.state = s.dt[s.state.newState()+lowBits]
384
-	return s.state.baseline(), s.state.addBits()
385
-}
386 359
new file mode 100644
... ...
@@ -0,0 +1,64 @@
0
+//go:build amd64 && !appengine && !noasm && gc
1
+// +build amd64,!appengine,!noasm,gc
2
+
3
+package zstd
4
+
5
+import (
6
+	"fmt"
7
+)
8
+
9
+type buildDtableAsmContext struct {
10
+	// inputs --- pointers to the first element of the decoder's stateTable, norm and dt arrays
11
+	stateTable *uint16
12
+	norm       *int16
13
+	dt         *uint64
14
+
15
+	// outputs --- set by the procedure in the case of error; their meaning depends on the
16
+	// returned code, for interpretation please see the error handling in buildDtable below
17
+	errParam1 uint64
18
+	errParam2 uint64
19
+}
20
+
21
+// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
22
+// Function returns non-zero exit code on error.
23
+// go:noescape
24
+func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
25
+
26
+// Non-zero codes returned by buildDtable_asm; please keep in sync with _generate/gen_fse.go
27
+const (
28
+	errorCorruptedNormalizedCounter = 1
29
+	errorNewStateTooBig             = 2
30
+	errorNewStateNoBits             = 3
31
+)
32
+
33
+// buildDtable will build the decoding table.
34
+func (s *fseDecoder) buildDtable() error {
35
+	ctx := buildDtableAsmContext{
36
+		stateTable: &s.stateTable[0],
37
+		norm:       &s.norm[0],
38
+		dt:         (*uint64)(&s.dt[0]),
39
+	}
40
+	code := buildDtable_asm(s, &ctx)
41
+
42
+	if code != 0 {
43
+		switch code {
44
+		case errorCorruptedNormalizedCounter:
45
+			position := ctx.errParam1
46
+			return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
47
+
48
+		case errorNewStateTooBig:
49
+			newState := decSymbol(ctx.errParam1)
50
+			size := ctx.errParam2
51
+			return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
52
+
53
+		case errorNewStateNoBits:
54
+			newState := decSymbol(ctx.errParam1)
55
+			oldState := decSymbol(ctx.errParam2)
56
+			return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
57
+
58
+		default:
59
+			return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
60
+		}
61
+	}
62
+	return nil
63
+}
0 64
new file mode 100644
... ...
@@ -0,0 +1,127 @@
0
+// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
1
+
2
+//go:build !appengine && !noasm && gc && !noasm
3
+// +build !appengine,!noasm,gc,!noasm
4
+
5
+// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
6
+TEXT ·buildDtable_asm(SB), $0-24
7
+	MOVQ ctx+8(FP), CX
8
+	MOVQ s+0(FP), DI
9
+
10
+	// Load values
11
+	MOVBQZX 4098(DI), DX
12
+	XORQ    AX, AX
13
+	BTSQ    DX, AX
14
+	MOVQ    (CX), BX
15
+	MOVQ    16(CX), SI
16
+	LEAQ    -1(AX), R8
17
+	MOVQ    8(CX), CX
18
+	MOVWQZX 4096(DI), DI
19
+
20
+	// End load values
21
+	// Init, lay down lowprob symbols
22
+	XORQ R9, R9
23
+	JMP  init_main_loop_condition
24
+
25
+init_main_loop:
26
+	MOVWQSX (CX)(R9*2), R10
27
+	CMPW    R10, $-1
28
+	JNE     do_not_update_high_threshold
29
+	MOVB    R9, 1(SI)(R8*8)
30
+	DECQ    R8
31
+	MOVQ    $0x0000000000000001, R10
32
+
33
+do_not_update_high_threshold:
34
+	MOVW R10, (BX)(R9*2)
35
+	INCQ R9
36
+
37
+init_main_loop_condition:
38
+	CMPQ R9, DI
39
+	JL   init_main_loop
40
+
41
+	// Spread symbols
42
+	// Calculate table step
43
+	MOVQ AX, R9
44
+	SHRQ $0x01, R9
45
+	MOVQ AX, R10
46
+	SHRQ $0x03, R10
47
+	LEAQ 3(R9)(R10*1), R9
48
+
49
+	// Fill add bits values
50
+	LEAQ -1(AX), R10
51
+	XORQ R11, R11
52
+	XORQ R12, R12
53
+	JMP  spread_main_loop_condition
54
+
55
+spread_main_loop:
56
+	XORQ    R13, R13
57
+	MOVWQSX (CX)(R12*2), R14
58
+	JMP     spread_inner_loop_condition
59
+
60
+spread_inner_loop:
61
+	MOVB R12, 1(SI)(R11*8)
62
+
63
+adjust_position:
64
+	ADDQ R9, R11
65
+	ANDQ R10, R11
66
+	CMPQ R11, R8
67
+	JG   adjust_position
68
+	INCQ R13
69
+
70
+spread_inner_loop_condition:
71
+	CMPQ R13, R14
72
+	JL   spread_inner_loop
73
+	INCQ R12
74
+
75
+spread_main_loop_condition:
76
+	CMPQ  R12, DI
77
+	JL    spread_main_loop
78
+	TESTQ R11, R11
79
+	JZ    spread_check_ok
80
+	MOVQ  ctx+8(FP), AX
81
+	MOVQ  R11, 24(AX)
82
+	MOVQ  $+1, ret+16(FP)
83
+	RET
84
+
85
+spread_check_ok:
86
+	// Build Decoding table
87
+	XORQ DI, DI
88
+
89
+build_table_main_table:
90
+	MOVBQZX 1(SI)(DI*8), CX
91
+	MOVWQZX (BX)(CX*2), R8
92
+	LEAQ    1(R8), R9
93
+	MOVW    R9, (BX)(CX*2)
94
+	MOVQ    R8, R9
95
+	BSRQ    R9, R9
96
+	MOVQ    DX, CX
97
+	SUBQ    R9, CX
98
+	SHLQ    CL, R8
99
+	SUBQ    AX, R8
100
+	MOVB    CL, (SI)(DI*8)
101
+	MOVW    R8, 2(SI)(DI*8)
102
+	CMPQ    R8, AX
103
+	JLE     build_table_check1_ok
104
+	MOVQ    ctx+8(FP), CX
105
+	MOVQ    R8, 24(CX)
106
+	MOVQ    AX, 32(CX)
107
+	MOVQ    $+2, ret+16(FP)
108
+	RET
109
+
110
+build_table_check1_ok:
111
+	TESTB CL, CL
112
+	JNZ   build_table_check2_ok
113
+	CMPW  R8, DI
114
+	JNE   build_table_check2_ok
115
+	MOVQ  ctx+8(FP), AX
116
+	MOVQ  R8, 24(AX)
117
+	MOVQ  DI, 32(AX)
118
+	MOVQ  $+3, ret+16(FP)
119
+	RET
120
+
121
+build_table_check2_ok:
122
+	INCQ DI
123
+	CMPQ DI, AX
124
+	JL   build_table_main_table
125
+	MOVQ $+0, ret+16(FP)
126
+	RET
0 127
new file mode 100644
... ...
@@ -0,0 +1,72 @@
0
+//go:build !amd64 || appengine || !gc || noasm
1
+// +build !amd64 appengine !gc noasm
2
+
3
+package zstd
4
+
5
+import (
6
+	"errors"
7
+	"fmt"
8
+)
9
+
10
+// buildDtable will build the decoding table.
11
+func (s *fseDecoder) buildDtable() error {
12
+	tableSize := uint32(1 << s.actualTableLog)
13
+	highThreshold := tableSize - 1
14
+	symbolNext := s.stateTable[:256]
15
+
16
+	// Init, lay down lowprob symbols
17
+	{
18
+		for i, v := range s.norm[:s.symbolLen] {
19
+			if v == -1 {
20
+				s.dt[highThreshold].setAddBits(uint8(i))
21
+				highThreshold--
22
+				symbolNext[i] = 1
23
+			} else {
24
+				symbolNext[i] = uint16(v)
25
+			}
26
+		}
27
+	}
28
+
29
+	// Spread symbols
30
+	{
31
+		tableMask := tableSize - 1
32
+		step := tableStep(tableSize)
33
+		position := uint32(0)
34
+		for ss, v := range s.norm[:s.symbolLen] {
35
+			for i := 0; i < int(v); i++ {
36
+				s.dt[position].setAddBits(uint8(ss))
37
+				position = (position + step) & tableMask
38
+				for position > highThreshold {
39
+					// lowprob area
40
+					position = (position + step) & tableMask
41
+				}
42
+			}
43
+		}
44
+		if position != 0 {
45
+			// position must reach all cells once, otherwise normalizedCounter is incorrect
46
+			return errors.New("corrupted input (position != 0)")
47
+		}
48
+	}
49
+
50
+	// Build Decoding table
51
+	{
52
+		tableSize := uint16(1 << s.actualTableLog)
53
+		for u, v := range s.dt[:tableSize] {
54
+			symbol := v.addBits()
55
+			nextState := symbolNext[symbol]
56
+			symbolNext[symbol] = nextState + 1
57
+			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
58
+			s.dt[u&maxTableMask].setNBits(nBits)
59
+			newState := (nextState << nBits) - tableSize
60
+			if newState > tableSize {
61
+				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
62
+			}
63
+			if newState == uint16(u) && nBits == 0 {
64
+				// Seems weird that this is possible with nbits > 0.
65
+				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
66
+			}
67
+			s.dt[u&maxTableMask].setNewState(newState)
68
+		}
69
+	}
70
+	return nil
71
+}
... ...
@@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
76 76
 	s.clearCount = maxCount != 0
77 77
 }
78 78
 
79
-// prepare will prepare and allocate scratch tables used for both compression and decompression.
80
-func (s *fseEncoder) prepare() (*fseEncoder, error) {
81
-	if s == nil {
82
-		s = &fseEncoder{}
83
-	}
84
-	s.useRLE = false
85
-	if s.clearCount && s.maxCount == 0 {
86
-		for i := range s.count {
87
-			s.count[i] = 0
88
-		}
89
-		s.clearCount = false
90
-	}
91
-	return s, nil
92
-}
93
-
94 79
 // allocCtable will allocate tables needed for compression.
95 80
 // If existing tables are big enough, they are simply re-used.
96 81
 func (s *fseEncoder) allocCtable() {
... ...
@@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
709 709
 	c.state = c.stateTable[lu]
710 710
 }
711 711
 
712
-// encode the output symbol provided and write it to the bitstream.
713
-func (c *cState) encode(symbolTT symbolTransform) {
714
-	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
715
-	dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
716
-	c.bw.addBits16NC(c.state, uint8(nbBitsOut))
717
-	c.state = c.stateTable[dstState]
718
-}
719
-
720 712
 // flush will write the tablelog to the output and flush the remaining full bytes.
721 713
 func (c *cState) flush(tableLog uint8) {
722 714
 	c.bw.flush32()
723 715
deleted file mode 100644
... ...
@@ -1,11 +0,0 @@
1
-//go:build ignorecrc
2
-// +build ignorecrc
3
-
4
-// Copyright 2019+ Klaus Post. All rights reserved.
5
-// License information can be found in the LICENSE file.
6
-// Based on work by Yann Collet, released under BSD License.
7
-
8
-package zstd
9
-
10
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
11
-const ignoreCRC = true
12 1
deleted file mode 100644
... ...
@@ -1,11 +0,0 @@
1
-//go:build !ignorecrc
2
-// +build !ignorecrc
3
-
4
-// Copyright 2019+ Klaus Post. All rights reserved.
5
-// License information can be found in the LICENSE file.
6
-// Based on work by Yann Collet, released under BSD License.
7
-
8
-package zstd
9
-
10
-// ignoreCRC can be used for fuzz testing to ignore CRC values...
11
-const ignoreCRC = false
... ...
@@ -33,9 +33,3 @@ func hashLen(u uint64, length, mls uint8) uint32 {
33 33
 		return (uint32(u) * prime4bytes) >> (32 - length)
34 34
 	}
35 35
 }
36
-
37
-// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
38
-// Preferably h should be a constant and should always be <32.
39
-func hash3(u uint32, h uint8) uint32 {
40
-	return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
41
-}
... ...
@@ -73,6 +73,7 @@ type sequenceDecs struct {
73 73
 	seqSize      int
74 74
 	windowSize   int
75 75
 	maxBits      uint8
76
+	maxSyncLen   uint64
76 77
 }
77 78
 
78 79
 // initialize all 3 decoders from the stream input.
... ...
@@ -98,153 +99,13 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) erro
98 98
 	return nil
99 99
 }
100 100
 
101
-// decode sequences from the stream with the provided history.
102
-func (s *sequenceDecs) decode(seqs []seqVals) error {
103
-	br := s.br
104
-
105
-	// Grab full sizes tables, to avoid bounds checks.
106
-	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
107
-	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
108
-	s.seqSize = 0
109
-	litRemain := len(s.literals)
110
-	maxBlockSize := maxCompressedBlockSize
111
-	if s.windowSize < maxBlockSize {
112
-		maxBlockSize = s.windowSize
113
-	}
114
-	for i := range seqs {
115
-		var ll, mo, ml int
116
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
117
-			// inlined function:
118
-			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
119
-
120
-			// Final will not read from stream.
121
-			var llB, mlB, moB uint8
122
-			ll, llB = llState.final()
123
-			ml, mlB = mlState.final()
124
-			mo, moB = ofState.final()
125
-
126
-			// extra bits are stored in reverse order.
127
-			br.fillFast()
128
-			mo += br.getBits(moB)
129
-			if s.maxBits > 32 {
130
-				br.fillFast()
131
-			}
132
-			ml += br.getBits(mlB)
133
-			ll += br.getBits(llB)
134
-
135
-			if moB > 1 {
136
-				s.prevOffset[2] = s.prevOffset[1]
137
-				s.prevOffset[1] = s.prevOffset[0]
138
-				s.prevOffset[0] = mo
139
-			} else {
140
-				// mo = s.adjustOffset(mo, ll, moB)
141
-				// Inlined for rather big speedup
142
-				if ll == 0 {
143
-					// There is an exception though, when current sequence's literals_length = 0.
144
-					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
145
-					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
146
-					mo++
147
-				}
148
-
149
-				if mo == 0 {
150
-					mo = s.prevOffset[0]
151
-				} else {
152
-					var temp int
153
-					if mo == 3 {
154
-						temp = s.prevOffset[0] - 1
155
-					} else {
156
-						temp = s.prevOffset[mo]
157
-					}
158
-
159
-					if temp == 0 {
160
-						// 0 is not valid; input is corrupted; force offset to 1
161
-						println("WARNING: temp was 0")
162
-						temp = 1
163
-					}
164
-
165
-					if mo != 1 {
166
-						s.prevOffset[2] = s.prevOffset[1]
167
-					}
168
-					s.prevOffset[1] = s.prevOffset[0]
169
-					s.prevOffset[0] = temp
170
-					mo = temp
171
-				}
172
-			}
173
-			br.fillFast()
174
-		} else {
175
-			if br.overread() {
176
-				if debugDecoder {
177
-					printf("reading sequence %d, exceeded available data\n", i)
178
-				}
179
-				return io.ErrUnexpectedEOF
180
-			}
181
-			ll, mo, ml = s.next(br, llState, mlState, ofState)
182
-			br.fill()
183
-		}
184
-
185
-		if debugSequences {
186
-			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
187
-		}
188
-		// Evaluate.
189
-		// We might be doing this async, so do it early.
190
-		if mo == 0 && ml > 0 {
191
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
192
-		}
193
-		if ml > maxMatchLen {
194
-			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
195
-		}
196
-		s.seqSize += ll + ml
197
-		if s.seqSize > maxBlockSize {
198
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
199
-		}
200
-		litRemain -= ll
201
-		if litRemain < 0 {
202
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
203
-		}
204
-		seqs[i] = seqVals{
205
-			ll: ll,
206
-			ml: ml,
207
-			mo: mo,
208
-		}
209
-		if i == len(seqs)-1 {
210
-			// This is the last sequence, so we shouldn't update state.
211
-			break
212
-		}
213
-
214
-		// Manually inlined, ~ 5-20% faster
215
-		// Update all 3 states at once. Approx 20% faster.
216
-		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
217
-		if nBits == 0 {
218
-			llState = llTable[llState.newState()&maxTableMask]
219
-			mlState = mlTable[mlState.newState()&maxTableMask]
220
-			ofState = ofTable[ofState.newState()&maxTableMask]
221
-		} else {
222
-			bits := br.get32BitsFast(nBits)
223
-			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
224
-			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
225
-
226
-			lowBits = uint16(bits >> (ofState.nbBits() & 31))
227
-			lowBits &= bitMask[mlState.nbBits()&15]
228
-			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
229
-
230
-			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
231
-			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
232
-		}
233
-	}
234
-	s.seqSize += litRemain
235
-	if s.seqSize > maxBlockSize {
236
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
237
-	}
238
-	err := br.close()
239
-	if err != nil {
240
-		printf("Closing sequences: %v, %+v\n", err, *br)
241
-	}
242
-	return err
243
-}
244
-
245 101
 // execute will execute the decoded sequence with the provided history.
246 102
 // The sequence must be evaluated before being sent.
247 103
 func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
104
+	if len(s.dict) == 0 {
105
+		return s.executeSimple(seqs, hist)
106
+	}
107
+
248 108
 	// Ensure we have enough output size...
249 109
 	if len(s.out)+s.seqSize > cap(s.out) {
250 110
 		addBytes := s.seqSize + len(s.out)
... ...
@@ -327,6 +188,7 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
327 327
 			}
328 328
 		}
329 329
 	}
330
+
330 331
 	// Add final literals
331 332
 	copy(out[t:], s.literals)
332 333
 	if debugDecoder {
... ...
@@ -341,14 +203,18 @@ func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
341 341
 }
342 342
 
343 343
 // decode sequences from the stream with the provided history.
344
-func (s *sequenceDecs) decodeSync(history *history) error {
344
+func (s *sequenceDecs) decodeSync(hist []byte) error {
345
+	supported, err := s.decodeSyncSimple(hist)
346
+	if supported {
347
+		return err
348
+	}
349
+
345 350
 	br := s.br
346 351
 	seqs := s.nSeqs
347 352
 	startSize := len(s.out)
348 353
 	// Grab full sizes tables, to avoid bounds checks.
349 354
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
350 355
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
351
-	hist := history.b[history.ignoreBuffer:]
352 356
 	out := s.out
353 357
 	maxBlockSize := maxCompressedBlockSize
354 358
 	if s.windowSize < maxBlockSize {
... ...
@@ -433,7 +299,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
433 433
 		}
434 434
 		size := ll + ml + len(out)
435 435
 		if size-startSize > maxBlockSize {
436
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
436
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
437 437
 		}
438 438
 		if size > cap(out) {
439 439
 			// Not enough size, which can happen under high volume block streaming conditions
... ...
@@ -463,13 +329,13 @@ func (s *sequenceDecs) decodeSync(history *history) error {
463 463
 
464 464
 		if mo > len(out)+len(hist) || mo > s.windowSize {
465 465
 			if len(s.dict) == 0 {
466
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
466
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
467 467
 			}
468 468
 
469 469
 			// we may be in dictionary.
470 470
 			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
471 471
 			if dictO < 0 || dictO >= len(s.dict) {
472
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist))
472
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
473 473
 			}
474 474
 			end := dictO + ml
475 475
 			if end > len(s.dict) {
... ...
@@ -530,6 +396,7 @@ func (s *sequenceDecs) decodeSync(history *history) error {
530 530
 			ofState = ofTable[ofState.newState()&maxTableMask]
531 531
 		} else {
532 532
 			bits := br.get32BitsFast(nBits)
533
+
533 534
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
534 535
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
535 536
 
... ...
@@ -543,8 +410,8 @@ func (s *sequenceDecs) decodeSync(history *history) error {
543 543
 	}
544 544
 
545 545
 	// Check if space for literals
546
-	if len(s.literals)+len(s.out)-startSize > maxBlockSize {
547
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", len(s.out), maxBlockSize)
546
+	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
547
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
548 548
 	}
549 549
 
550 550
 	// Add final literals
... ...
@@ -552,16 +419,6 @@ func (s *sequenceDecs) decodeSync(history *history) error {
552 552
 	return br.close()
553 553
 }
554 554
 
555
-// update states, at least 27 bits must be available.
556
-func (s *sequenceDecs) update(br *bitReader) {
557
-	// Max 8 bits
558
-	s.litLengths.state.next(br)
559
-	// Max 9 bits
560
-	s.matchLengths.state.next(br)
561
-	// Max 8 bits
562
-	s.offsets.state.next(br)
563
-}
564
-
565 555
 var bitMask [16]uint16
566 556
 
567 557
 func init() {
... ...
@@ -570,87 +427,6 @@ func init() {
570 570
 	}
571 571
 }
572 572
 
573
-// update states, at least 27 bits must be available.
574
-func (s *sequenceDecs) updateAlt(br *bitReader) {
575
-	// Update all 3 states at once. Approx 20% faster.
576
-	a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
577
-
578
-	nBits := a.nbBits() + b.nbBits() + c.nbBits()
579
-	if nBits == 0 {
580
-		s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
581
-		s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
582
-		s.offsets.state.state = s.offsets.state.dt[c.newState()]
583
-		return
584
-	}
585
-	bits := br.get32BitsFast(nBits)
586
-	lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
587
-	s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
588
-
589
-	lowBits = uint16(bits >> (c.nbBits() & 31))
590
-	lowBits &= bitMask[b.nbBits()&15]
591
-	s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
592
-
593
-	lowBits = uint16(bits) & bitMask[c.nbBits()&15]
594
-	s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
595
-}
596
-
597
-// nextFast will return new states when there are at least 4 unused bytes left on the stream when done.
598
-func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
599
-	// Final will not read from stream.
600
-	ll, llB := llState.final()
601
-	ml, mlB := mlState.final()
602
-	mo, moB := ofState.final()
603
-
604
-	// extra bits are stored in reverse order.
605
-	br.fillFast()
606
-	mo += br.getBits(moB)
607
-	if s.maxBits > 32 {
608
-		br.fillFast()
609
-	}
610
-	ml += br.getBits(mlB)
611
-	ll += br.getBits(llB)
612
-
613
-	if moB > 1 {
614
-		s.prevOffset[2] = s.prevOffset[1]
615
-		s.prevOffset[1] = s.prevOffset[0]
616
-		s.prevOffset[0] = mo
617
-		return
618
-	}
619
-	// mo = s.adjustOffset(mo, ll, moB)
620
-	// Inlined for rather big speedup
621
-	if ll == 0 {
622
-		// There is an exception though, when current sequence's literals_length = 0.
623
-		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
624
-		// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
625
-		mo++
626
-	}
627
-
628
-	if mo == 0 {
629
-		mo = s.prevOffset[0]
630
-		return
631
-	}
632
-	var temp int
633
-	if mo == 3 {
634
-		temp = s.prevOffset[0] - 1
635
-	} else {
636
-		temp = s.prevOffset[mo]
637
-	}
638
-
639
-	if temp == 0 {
640
-		// 0 is not valid; input is corrupted; force offset to 1
641
-		println("temp was 0")
642
-		temp = 1
643
-	}
644
-
645
-	if mo != 1 {
646
-		s.prevOffset[2] = s.prevOffset[1]
647
-	}
648
-	s.prevOffset[1] = s.prevOffset[0]
649
-	s.prevOffset[0] = temp
650
-	mo = temp
651
-	return
652
-}
653
-
654 573
 func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
655 574
 	// Final will not read from stream.
656 575
 	ll, llB := llState.final()
657 576
new file mode 100644
... ...
@@ -0,0 +1,368 @@
0
+//go:build amd64 && !appengine && !noasm && gc
1
+// +build amd64,!appengine,!noasm,gc
2
+
3
+package zstd
4
+
5
+import (
6
+	"fmt"
7
+
8
+	"github.com/klauspost/compress/internal/cpuinfo"
9
+)
10
+
11
+type decodeSyncAsmContext struct {
12
+	llTable     []decSymbol
13
+	mlTable     []decSymbol
14
+	ofTable     []decSymbol
15
+	llState     uint64
16
+	mlState     uint64
17
+	ofState     uint64
18
+	iteration   int
19
+	litRemain   int
20
+	out         []byte
21
+	outPosition int
22
+	literals    []byte
23
+	litPosition int
24
+	history     []byte
25
+	windowSize  int
26
+	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
27
+	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
28
+	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
29
+}
30
+
31
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
32
+//
33
+// Please refer to seqdec_generic.go for the reference implementation.
34
+//go:noescape
35
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
36
+
37
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
38
+//go:noescape
39
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
40
+
41
+// sequenceDecs_decodeSync_safe_amd64 does the same as sequenceDecs_decodeSync_amd64, but does not write beyond the output buffer.
42
+//go:noescape
43
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
44
+
45
+// sequenceDecs_decodeSync_safe_bmi2 does the same as sequenceDecs_decodeSync_bmi2, but does not write beyond the output buffer.
46
+//go:noescape
47
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
48
+
49
+// decodeSyncSimple decodes sequences from the stream using the provided history but without a dictionary.
50
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
51
+	if len(s.dict) > 0 {
52
+		return false, nil
53
+	}
54
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
55
+		return false, nil
56
+	}
57
+
58
+	// FIXME: Using unsafe memory copies leads to rare, random crashes
59
+	// with fuzz testing. It is therefore disabled for now.
60
+	const useSafe = true
61
+	/*
62
+		useSafe := false
63
+		if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
64
+			useSafe = true
65
+		}
66
+		if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
67
+			useSafe = true
68
+		}
69
+		if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
70
+			useSafe = true
71
+		}
72
+	*/
73
+
74
+	br := s.br
75
+
76
+	maxBlockSize := maxCompressedBlockSize
77
+	if s.windowSize < maxBlockSize {
78
+		maxBlockSize = s.windowSize
79
+	}
80
+
81
+	ctx := decodeSyncAsmContext{
82
+		llTable:     s.litLengths.fse.dt[:maxTablesize],
83
+		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
84
+		ofTable:     s.offsets.fse.dt[:maxTablesize],
85
+		llState:     uint64(s.litLengths.state.state),
86
+		mlState:     uint64(s.matchLengths.state.state),
87
+		ofState:     uint64(s.offsets.state.state),
88
+		iteration:   s.nSeqs - 1,
89
+		litRemain:   len(s.literals),
90
+		out:         s.out,
91
+		outPosition: len(s.out),
92
+		literals:    s.literals,
93
+		windowSize:  s.windowSize,
94
+		history:     hist,
95
+	}
96
+
97
+	s.seqSize = 0
98
+	startSize := len(s.out)
99
+
100
+	var errCode int
101
+	if cpuinfo.HasBMI2() {
102
+		if useSafe {
103
+			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
104
+		} else {
105
+			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
106
+		}
107
+	} else {
108
+		if useSafe {
109
+			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
110
+		} else {
111
+			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
112
+		}
113
+	}
114
+	switch errCode {
115
+	case noError:
116
+		break
117
+
118
+	case errorMatchLenOfsMismatch:
119
+		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
120
+
121
+	case errorMatchLenTooBig:
122
+		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
123
+
124
+	case errorMatchOffTooBig:
125
+		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
126
+			ctx.mo, ctx.outPosition+len(hist)-startSize)
127
+
128
+	case errorNotEnoughLiterals:
129
+		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
130
+			ctx.ll, ctx.litRemain+ctx.ll)
131
+
132
+	case errorNotEnoughSpace:
133
+		size := ctx.outPosition + ctx.ll + ctx.ml
134
+		if debugDecoder {
135
+			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
136
+		}
137
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
138
+
139
+	default:
140
+		return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
141
+	}
142
+
143
+	s.seqSize += ctx.litRemain
144
+	if s.seqSize > maxBlockSize {
145
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
146
+	}
147
+	err := br.close()
148
+	if err != nil {
149
+		printf("Closing sequences: %v, %+v\n", err, *br)
150
+		return true, err
151
+	}
152
+
153
+	s.literals = s.literals[ctx.litPosition:]
154
+	t := ctx.outPosition
155
+	s.out = s.out[:t]
156
+
157
+	// Add final literals
158
+	s.out = append(s.out, s.literals...)
159
+	if debugDecoder {
160
+		t += len(s.literals)
161
+		if t != len(s.out) {
162
+			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
163
+		}
164
+	}
165
+
166
+	return true, nil
167
+}
168
+
169
+// --------------------------------------------------------------------------------
170
+
171
+type decodeAsmContext struct {
172
+	llTable   []decSymbol
173
+	mlTable   []decSymbol
174
+	ofTable   []decSymbol
175
+	llState   uint64
176
+	mlState   uint64
177
+	ofState   uint64
178
+	iteration int
179
+	seqs      []seqVals
180
+	litRemain int
181
+}
182
+
183
+const noError = 0
184
+
185
+// error reported when mo == 0 && ml > 0
186
+const errorMatchLenOfsMismatch = 1
187
+
188
+// error reported when ml > maxMatchLen
189
+const errorMatchLenTooBig = 2
190
+
191
+// error reported when mo > available history or mo > s.windowSize
192
+const errorMatchOffTooBig = 3
193
+
194
+// error reported when the sum of literal lengths exceeds the literal buffer size
195
+const errorNotEnoughLiterals = 4
196
+
197
+// error reported when capacity of `out` is too small
198
+const errorNotEnoughSpace = 5
199
+
200
+// sequenceDecs_decode_amd64 implements the main loop of sequenceDecs.decode in x86 asm.
201
+//
202
+// Please refer to seqdec_generic.go for the reference implementation.
203
+//go:noescape
204
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
205
+
206
+// sequenceDecs_decode_56_amd64 implements the main loop of sequenceDecs.decode in x86 asm; decode selects it when s.maxBits plus the three FSE table logs is at most 56.
207
+//
208
+// Please refer to seqdec_generic.go for the reference implementation.
209
+//go:noescape
210
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
211
+
212
+// sequenceDecs_decode_bmi2 implements the main loop of sequenceDecs.decode in x86 asm with BMI2 extensions.
213
+//go:noescape
214
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
215
+
216
+// sequenceDecs_decode_56_bmi2 implements the main loop of sequenceDecs.decode in x86 asm with BMI2 extensions; decode selects it when s.maxBits plus the three FSE table logs is at most 56.
217
+//go:noescape
218
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
219
+
220
+// decode decodes the sequences from the stream into seqs, without using any history.
221
+func (s *sequenceDecs) decode(seqs []seqVals) error {
222
+	br := s.br
223
+
224
+	maxBlockSize := maxCompressedBlockSize
225
+	if s.windowSize < maxBlockSize {
226
+		maxBlockSize = s.windowSize
227
+	}
228
+
229
+	ctx := decodeAsmContext{
230
+		llTable:   s.litLengths.fse.dt[:maxTablesize],
231
+		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
232
+		ofTable:   s.offsets.fse.dt[:maxTablesize],
233
+		llState:   uint64(s.litLengths.state.state),
234
+		mlState:   uint64(s.matchLengths.state.state),
235
+		ofState:   uint64(s.offsets.state.state),
236
+		seqs:      seqs,
237
+		iteration: len(seqs) - 1,
238
+		litRemain: len(s.literals),
239
+	}
240
+
241
+	s.seqSize = 0
242
+	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
243
+	var errCode int
244
+	if cpuinfo.HasBMI2() {
245
+		if lte56bits {
246
+			errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
247
+		} else {
248
+			errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
249
+		}
250
+	} else {
251
+		if lte56bits {
252
+			errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
253
+		} else {
254
+			errCode = sequenceDecs_decode_amd64(s, br, &ctx)
255
+		}
256
+	}
257
+	if errCode != 0 {
258
+		i := len(seqs) - ctx.iteration - 1
259
+		switch errCode {
260
+		case errorMatchLenOfsMismatch:
261
+			ml := ctx.seqs[i].ml
262
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
263
+
264
+		case errorMatchLenTooBig:
265
+			ml := ctx.seqs[i].ml
266
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
267
+
268
+		case errorNotEnoughLiterals:
269
+			ll := ctx.seqs[i].ll
270
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
271
+		}
272
+
273
+		return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
274
+	}
275
+
276
+	if ctx.litRemain < 0 {
277
+		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
278
+			len(s.literals), len(s.literals)-ctx.litRemain)
279
+	}
280
+
281
+	s.seqSize += ctx.litRemain
282
+	if s.seqSize > maxBlockSize {
283
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
284
+	}
285
+	err := br.close()
286
+	if err != nil {
287
+		printf("Closing sequences: %v, %+v\n", err, *br)
288
+	}
289
+	return err
290
+}
291
+
292
+// --------------------------------------------------------------------------------
293
+
294
+type executeAsmContext struct {
295
+	seqs        []seqVals
296
+	seqIndex    int
297
+	out         []byte
298
+	history     []byte
299
+	literals    []byte
300
+	outPosition int
301
+	litPosition int
302
+	windowSize  int
303
+}
304
+
305
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
306
+//
307
+// Returns false if a match offset is too big.
308
+//
309
+// Please refer to seqdec_generic.go for the reference implementation.
310
+//go:noescape
311
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
312
+
313
+// sequenceDecs_executeSimple_safe_amd64 is the same as sequenceDecs_executeSimple_amd64, but with safe memcopies.
314
+//go:noescape
315
+func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
316
+
317
+// executeSimple handles cases when dictionary is not used.
318
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
319
+	// Ensure we have enough output size...
320
+	if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
321
+		addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
322
+		s.out = append(s.out, make([]byte, addBytes)...)
323
+		s.out = s.out[:len(s.out)-addBytes]
324
+	}
325
+
326
+	if debugDecoder {
327
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
328
+	}
329
+
330
+	var t = len(s.out)
331
+	out := s.out[:t+s.seqSize]
332
+
333
+	ctx := executeAsmContext{
334
+		seqs:        seqs,
335
+		seqIndex:    0,
336
+		out:         out,
337
+		history:     hist,
338
+		outPosition: t,
339
+		litPosition: 0,
340
+		literals:    s.literals,
341
+		windowSize:  s.windowSize,
342
+	}
343
+	var ok bool
344
+	if cap(s.literals) < len(s.literals)+compressedBlockOverAlloc {
345
+		ok = sequenceDecs_executeSimple_safe_amd64(&ctx)
346
+	} else {
347
+		ok = sequenceDecs_executeSimple_amd64(&ctx)
348
+	}
349
+	if !ok {
350
+		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
351
+			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
352
+	}
353
+	s.literals = s.literals[ctx.litPosition:]
354
+	t = ctx.outPosition
355
+
356
+	// Add final literals
357
+	copy(out[t:], s.literals)
358
+	if debugDecoder {
359
+		t += len(s.literals)
360
+		if t != len(out) {
361
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
362
+		}
363
+	}
364
+	s.out = out
365
+
366
+	return nil
367
+}
0 368
new file mode 100644
... ...
@@ -0,0 +1,4100 @@
0
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
1
+
2
+//go:build !appengine && !noasm && gc && !noasm
3
+// +build !appengine,!noasm,gc,!noasm
4
+
5
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
6
+// Requires: CMOV
7
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
8
+	MOVQ    br+8(FP), AX
9
+	MOVQ    32(AX), DX
10
+	MOVBQZX 40(AX), BX
11
+	MOVQ    24(AX), SI
12
+	MOVQ    (AX), AX
13
+	ADDQ    SI, AX
14
+	MOVQ    AX, (SP)
15
+	MOVQ    ctx+16(FP), AX
16
+	MOVQ    72(AX), DI
17
+	MOVQ    80(AX), R8
18
+	MOVQ    88(AX), R9
19
+	MOVQ    104(AX), R10
20
+	MOVQ    s+0(FP), AX
21
+	MOVQ    144(AX), R11
22
+	MOVQ    152(AX), R12
23
+	MOVQ    160(AX), R13
24
+
25
+sequenceDecs_decode_amd64_main_loop:
26
+	MOVQ (SP), R14
27
+
28
+	// Fill bitreader to have enough for the offset and match length.
29
+	CMPQ SI, $0x08
30
+	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
31
+	MOVQ BX, AX
32
+	SHRQ $0x03, AX
33
+	SUBQ AX, R14
34
+	MOVQ (R14), DX
35
+	SUBQ AX, SI
36
+	ANDQ $0x07, BX
37
+	JMP  sequenceDecs_decode_amd64_fill_end
38
+
39
+sequenceDecs_decode_amd64_fill_byte_by_byte:
40
+	CMPQ    SI, $0x00
41
+	JLE     sequenceDecs_decode_amd64_fill_end
42
+	CMPQ    BX, $0x07
43
+	JLE     sequenceDecs_decode_amd64_fill_end
44
+	SHLQ    $0x08, DX
45
+	SUBQ    $0x01, R14
46
+	SUBQ    $0x01, SI
47
+	SUBQ    $0x08, BX
48
+	MOVBQZX (R14), AX
49
+	ORQ     AX, DX
50
+	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
51
+
52
+sequenceDecs_decode_amd64_fill_end:
53
+	// Update offset
54
+	MOVQ  R9, AX
55
+	MOVQ  BX, CX
56
+	MOVQ  DX, R15
57
+	SHLQ  CL, R15
58
+	MOVB  AH, CL
59
+	SHRQ  $0x20, AX
60
+	TESTQ CX, CX
61
+	JZ    sequenceDecs_decode_amd64_of_update_zero
62
+	ADDQ  CX, BX
63
+	CMPQ  BX, $0x40
64
+	JA    sequenceDecs_decode_amd64_of_update_zero
65
+	CMPQ  CX, $0x40
66
+	JAE   sequenceDecs_decode_amd64_of_update_zero
67
+	NEGQ  CX
68
+	SHRQ  CL, R15
69
+	ADDQ  R15, AX
70
+
71
+sequenceDecs_decode_amd64_of_update_zero:
72
+	MOVQ AX, 16(R10)
73
+
74
+	// Update match length
75
+	MOVQ  R8, AX
76
+	MOVQ  BX, CX
77
+	MOVQ  DX, R15
78
+	SHLQ  CL, R15
79
+	MOVB  AH, CL
80
+	SHRQ  $0x20, AX
81
+	TESTQ CX, CX
82
+	JZ    sequenceDecs_decode_amd64_ml_update_zero
83
+	ADDQ  CX, BX
84
+	CMPQ  BX, $0x40
85
+	JA    sequenceDecs_decode_amd64_ml_update_zero
86
+	CMPQ  CX, $0x40
87
+	JAE   sequenceDecs_decode_amd64_ml_update_zero
88
+	NEGQ  CX
89
+	SHRQ  CL, R15
90
+	ADDQ  R15, AX
91
+
92
+sequenceDecs_decode_amd64_ml_update_zero:
93
+	MOVQ AX, 8(R10)
94
+
95
+	// Fill bitreader to have enough for the remaining
96
+	CMPQ SI, $0x08
97
+	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
98
+	MOVQ BX, AX
99
+	SHRQ $0x03, AX
100
+	SUBQ AX, R14
101
+	MOVQ (R14), DX
102
+	SUBQ AX, SI
103
+	ANDQ $0x07, BX
104
+	JMP  sequenceDecs_decode_amd64_fill_2_end
105
+
106
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
107
+	CMPQ    SI, $0x00
108
+	JLE     sequenceDecs_decode_amd64_fill_2_end
109
+	CMPQ    BX, $0x07
110
+	JLE     sequenceDecs_decode_amd64_fill_2_end
111
+	SHLQ    $0x08, DX
112
+	SUBQ    $0x01, R14
113
+	SUBQ    $0x01, SI
114
+	SUBQ    $0x08, BX
115
+	MOVBQZX (R14), AX
116
+	ORQ     AX, DX
117
+	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
118
+
119
+sequenceDecs_decode_amd64_fill_2_end:
120
+	// Update literal length
121
+	MOVQ  DI, AX
122
+	MOVQ  BX, CX
123
+	MOVQ  DX, R15
124
+	SHLQ  CL, R15
125
+	MOVB  AH, CL
126
+	SHRQ  $0x20, AX
127
+	TESTQ CX, CX
128
+	JZ    sequenceDecs_decode_amd64_ll_update_zero
129
+	ADDQ  CX, BX
130
+	CMPQ  BX, $0x40
131
+	JA    sequenceDecs_decode_amd64_ll_update_zero
132
+	CMPQ  CX, $0x40
133
+	JAE   sequenceDecs_decode_amd64_ll_update_zero
134
+	NEGQ  CX
135
+	SHRQ  CL, R15
136
+	ADDQ  R15, AX
137
+
138
+sequenceDecs_decode_amd64_ll_update_zero:
139
+	MOVQ AX, (R10)
140
+
141
+	// Fill bitreader for state updates
142
+	MOVQ    R14, (SP)
143
+	MOVQ    R9, AX
144
+	SHRQ    $0x08, AX
145
+	MOVBQZX AL, AX
146
+	MOVQ    ctx+16(FP), CX
147
+	CMPQ    96(CX), $0x00
148
+	JZ      sequenceDecs_decode_amd64_skip_update
149
+
150
+	// Update Literal Length State
151
+	MOVBQZX DI, R14
152
+	SHRQ    $0x10, DI
153
+	MOVWQZX DI, DI
154
+	LEAQ    (BX)(R14*1), CX
155
+	MOVQ    DX, R15
156
+	MOVQ    CX, BX
157
+	ROLQ    CL, R15
158
+	MOVL    $0x00000001, BP
159
+	MOVB    R14, CL
160
+	SHLL    CL, BP
161
+	DECL    BP
162
+	ANDQ    BP, R15
163
+	ADDQ    R15, DI
164
+
165
+	// Load ctx.llTable
166
+	MOVQ ctx+16(FP), CX
167
+	MOVQ (CX), CX
168
+	MOVQ (CX)(DI*8), DI
169
+
170
+	// Update Match Length State
171
+	MOVBQZX R8, R14
172
+	SHRQ    $0x10, R8
173
+	MOVWQZX R8, R8
174
+	LEAQ    (BX)(R14*1), CX
175
+	MOVQ    DX, R15
176
+	MOVQ    CX, BX
177
+	ROLQ    CL, R15
178
+	MOVL    $0x00000001, BP
179
+	MOVB    R14, CL
180
+	SHLL    CL, BP
181
+	DECL    BP
182
+	ANDQ    BP, R15
183
+	ADDQ    R15, R8
184
+
185
+	// Load ctx.mlTable
186
+	MOVQ ctx+16(FP), CX
187
+	MOVQ 24(CX), CX
188
+	MOVQ (CX)(R8*8), R8
189
+
190
+	// Update Offset State
191
+	MOVBQZX R9, R14
192
+	SHRQ    $0x10, R9
193
+	MOVWQZX R9, R9
194
+	LEAQ    (BX)(R14*1), CX
195
+	MOVQ    DX, R15
196
+	MOVQ    CX, BX
197
+	ROLQ    CL, R15
198
+	MOVL    $0x00000001, BP
199
+	MOVB    R14, CL
200
+	SHLL    CL, BP
201
+	DECL    BP
202
+	ANDQ    BP, R15
203
+	ADDQ    R15, R9
204
+
205
+	// Load ctx.ofTable
206
+	MOVQ ctx+16(FP), CX
207
+	MOVQ 48(CX), CX
208
+	MOVQ (CX)(R9*8), R9
209
+
210
+sequenceDecs_decode_amd64_skip_update:
211
+	// Adjust offset
212
+	MOVQ 16(R10), CX
213
+	CMPQ AX, $0x01
214
+	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
215
+	MOVQ R12, R13
216
+	MOVQ R11, R12
217
+	MOVQ CX, R11
218
+	JMP  sequenceDecs_decode_amd64_after_adjust
219
+
220
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
221
+	CMPQ (R10), $0x00000000
222
+	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
223
+	INCQ CX
224
+	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
225
+
226
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
227
+	TESTQ CX, CX
228
+	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
229
+	MOVQ  R11, CX
230
+	JMP   sequenceDecs_decode_amd64_after_adjust
231
+
232
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
233
+	CMPQ CX, $0x01
234
+	JB   sequenceDecs_decode_amd64_adjust_zero
235
+	JEQ  sequenceDecs_decode_amd64_adjust_one
236
+	CMPQ CX, $0x02
237
+	JA   sequenceDecs_decode_amd64_adjust_three
238
+	JMP  sequenceDecs_decode_amd64_adjust_two
239
+
240
+sequenceDecs_decode_amd64_adjust_zero:
241
+	MOVQ R11, AX
242
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
243
+
244
+sequenceDecs_decode_amd64_adjust_one:
245
+	MOVQ R12, AX
246
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
247
+
248
+sequenceDecs_decode_amd64_adjust_two:
249
+	MOVQ R13, AX
250
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
251
+
252
+sequenceDecs_decode_amd64_adjust_three:
253
+	LEAQ -1(R11), AX
254
+
255
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
256
+	TESTQ AX, AX
257
+	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
258
+	MOVQ  $0x00000001, AX
259
+
260
+sequenceDecs_decode_amd64_adjust_temp_valid:
261
+	CMPQ    CX, $0x01
262
+	CMOVQNE R12, R13
263
+	MOVQ    R11, R12
264
+	MOVQ    AX, R11
265
+	MOVQ    AX, CX
266
+
267
+sequenceDecs_decode_amd64_after_adjust:
268
+	MOVQ CX, 16(R10)
269
+
270
+	// Check values
271
+	MOVQ  8(R10), AX
272
+	MOVQ  (R10), R14
273
+	LEAQ  (AX)(R14*1), R15
274
+	MOVQ  s+0(FP), BP
275
+	ADDQ  R15, 256(BP)
276
+	MOVQ  ctx+16(FP), R15
277
+	SUBQ  R14, 128(R15)
278
+	JS    error_not_enough_literals
279
+	CMPQ  AX, $0x00020002
280
+	JA    sequenceDecs_decode_amd64_error_match_len_too_big
281
+	TESTQ CX, CX
282
+	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
283
+	TESTQ AX, AX
284
+	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
285
+
286
+sequenceDecs_decode_amd64_match_len_ofs_ok:
287
+	ADDQ $0x18, R10
288
+	MOVQ ctx+16(FP), AX
289
+	DECQ 96(AX)
290
+	JNS  sequenceDecs_decode_amd64_main_loop
291
+	MOVQ s+0(FP), AX
292
+	MOVQ R11, 144(AX)
293
+	MOVQ R12, 152(AX)
294
+	MOVQ R13, 160(AX)
295
+	MOVQ br+8(FP), AX
296
+	MOVQ DX, 32(AX)
297
+	MOVB BL, 40(AX)
298
+	MOVQ SI, 24(AX)
299
+
300
+	// Return success
301
+	MOVQ $0x00000000, ret+24(FP)
302
+	RET
303
+
304
+	// Return with match length error
305
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
306
+	MOVQ $0x00000001, ret+24(FP)
307
+	RET
308
+
309
+	// Return with match too long error
310
+sequenceDecs_decode_amd64_error_match_len_too_big:
311
+	MOVQ $0x00000002, ret+24(FP)
312
+	RET
313
+
314
+	// Return with match offset too long error
315
+	MOVQ $0x00000003, ret+24(FP)
316
+	RET
317
+
318
+	// Return with not enough literals error
319
+error_not_enough_literals:
320
+	MOVQ $0x00000004, ret+24(FP)
321
+	RET
322
+
323
+	// Return with not enough output space error
324
+	MOVQ $0x00000005, ret+24(FP)
325
+	RET
326
+
327
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
328
+// Requires: CMOV
329
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
330
+	MOVQ    br+8(FP), AX
331
+	MOVQ    32(AX), DX
332
+	MOVBQZX 40(AX), BX
333
+	MOVQ    24(AX), SI
334
+	MOVQ    (AX), AX
335
+	ADDQ    SI, AX
336
+	MOVQ    AX, (SP)
337
+	MOVQ    ctx+16(FP), AX
338
+	MOVQ    72(AX), DI
339
+	MOVQ    80(AX), R8
340
+	MOVQ    88(AX), R9
341
+	MOVQ    104(AX), R10
342
+	MOVQ    s+0(FP), AX
343
+	MOVQ    144(AX), R11
344
+	MOVQ    152(AX), R12
345
+	MOVQ    160(AX), R13
346
+
347
+sequenceDecs_decode_56_amd64_main_loop:
348
+	MOVQ (SP), R14
349
+
350
+	// Fill bitreader to have enough for the offset and match length.
351
+	CMPQ SI, $0x08
352
+	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
353
+	MOVQ BX, AX
354
+	SHRQ $0x03, AX
355
+	SUBQ AX, R14
356
+	MOVQ (R14), DX
357
+	SUBQ AX, SI
358
+	ANDQ $0x07, BX
359
+	JMP  sequenceDecs_decode_56_amd64_fill_end
360
+
361
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
362
+	CMPQ    SI, $0x00
363
+	JLE     sequenceDecs_decode_56_amd64_fill_end
364
+	CMPQ    BX, $0x07
365
+	JLE     sequenceDecs_decode_56_amd64_fill_end
366
+	SHLQ    $0x08, DX
367
+	SUBQ    $0x01, R14
368
+	SUBQ    $0x01, SI
369
+	SUBQ    $0x08, BX
370
+	MOVBQZX (R14), AX
371
+	ORQ     AX, DX
372
+	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
373
+
374
+sequenceDecs_decode_56_amd64_fill_end:
375
+	// Update offset
376
+	MOVQ  R9, AX
377
+	MOVQ  BX, CX
378
+	MOVQ  DX, R15
379
+	SHLQ  CL, R15
380
+	MOVB  AH, CL
381
+	SHRQ  $0x20, AX
382
+	TESTQ CX, CX
383
+	JZ    sequenceDecs_decode_56_amd64_of_update_zero
384
+	ADDQ  CX, BX
385
+	CMPQ  BX, $0x40
386
+	JA    sequenceDecs_decode_56_amd64_of_update_zero
387
+	CMPQ  CX, $0x40
388
+	JAE   sequenceDecs_decode_56_amd64_of_update_zero
389
+	NEGQ  CX
390
+	SHRQ  CL, R15
391
+	ADDQ  R15, AX
392
+
393
+sequenceDecs_decode_56_amd64_of_update_zero:
394
+	MOVQ AX, 16(R10)
395
+
396
+	// Update match length
397
+	MOVQ  R8, AX
398
+	MOVQ  BX, CX
399
+	MOVQ  DX, R15
400
+	SHLQ  CL, R15
401
+	MOVB  AH, CL
402
+	SHRQ  $0x20, AX
403
+	TESTQ CX, CX
404
+	JZ    sequenceDecs_decode_56_amd64_ml_update_zero
405
+	ADDQ  CX, BX
406
+	CMPQ  BX, $0x40
407
+	JA    sequenceDecs_decode_56_amd64_ml_update_zero
408
+	CMPQ  CX, $0x40
409
+	JAE   sequenceDecs_decode_56_amd64_ml_update_zero
410
+	NEGQ  CX
411
+	SHRQ  CL, R15
412
+	ADDQ  R15, AX
413
+
414
+sequenceDecs_decode_56_amd64_ml_update_zero:
415
+	MOVQ AX, 8(R10)
416
+
417
+	// Update literal length
418
+	MOVQ  DI, AX
419
+	MOVQ  BX, CX
420
+	MOVQ  DX, R15
421
+	SHLQ  CL, R15
422
+	MOVB  AH, CL
423
+	SHRQ  $0x20, AX
424
+	TESTQ CX, CX
425
+	JZ    sequenceDecs_decode_56_amd64_ll_update_zero
426
+	ADDQ  CX, BX
427
+	CMPQ  BX, $0x40
428
+	JA    sequenceDecs_decode_56_amd64_ll_update_zero
429
+	CMPQ  CX, $0x40
430
+	JAE   sequenceDecs_decode_56_amd64_ll_update_zero
431
+	NEGQ  CX
432
+	SHRQ  CL, R15
433
+	ADDQ  R15, AX
434
+
435
+sequenceDecs_decode_56_amd64_ll_update_zero:
436
+	MOVQ AX, (R10)
437
+
438
+	// Fill bitreader for state updates
439
+	MOVQ    R14, (SP)
440
+	MOVQ    R9, AX
441
+	SHRQ    $0x08, AX
442
+	MOVBQZX AL, AX
443
+	MOVQ    ctx+16(FP), CX
444
+	CMPQ    96(CX), $0x00
445
+	JZ      sequenceDecs_decode_56_amd64_skip_update
446
+
447
+	// Update Literal Length State
448
+	MOVBQZX DI, R14
449
+	SHRQ    $0x10, DI
450
+	MOVWQZX DI, DI
451
+	LEAQ    (BX)(R14*1), CX
452
+	MOVQ    DX, R15
453
+	MOVQ    CX, BX
454
+	ROLQ    CL, R15
455
+	MOVL    $0x00000001, BP
456
+	MOVB    R14, CL
457
+	SHLL    CL, BP
458
+	DECL    BP
459
+	ANDQ    BP, R15
460
+	ADDQ    R15, DI
461
+
462
+	// Load ctx.llTable
463
+	MOVQ ctx+16(FP), CX
464
+	MOVQ (CX), CX
465
+	MOVQ (CX)(DI*8), DI
466
+
467
+	// Update Match Length State
468
+	MOVBQZX R8, R14
469
+	SHRQ    $0x10, R8
470
+	MOVWQZX R8, R8
471
+	LEAQ    (BX)(R14*1), CX
472
+	MOVQ    DX, R15
473
+	MOVQ    CX, BX
474
+	ROLQ    CL, R15
475
+	MOVL    $0x00000001, BP
476
+	MOVB    R14, CL
477
+	SHLL    CL, BP
478
+	DECL    BP
479
+	ANDQ    BP, R15
480
+	ADDQ    R15, R8
481
+
482
+	// Load ctx.mlTable
483
+	MOVQ ctx+16(FP), CX
484
+	MOVQ 24(CX), CX
485
+	MOVQ (CX)(R8*8), R8
486
+
487
+	// Update Offset State
488
+	MOVBQZX R9, R14
489
+	SHRQ    $0x10, R9
490
+	MOVWQZX R9, R9
491
+	LEAQ    (BX)(R14*1), CX
492
+	MOVQ    DX, R15
493
+	MOVQ    CX, BX
494
+	ROLQ    CL, R15
495
+	MOVL    $0x00000001, BP
496
+	MOVB    R14, CL
497
+	SHLL    CL, BP
498
+	DECL    BP
499
+	ANDQ    BP, R15
500
+	ADDQ    R15, R9
501
+
502
+	// Load ctx.ofTable
503
+	MOVQ ctx+16(FP), CX
504
+	MOVQ 48(CX), CX
505
+	MOVQ (CX)(R9*8), R9
506
+
507
+sequenceDecs_decode_56_amd64_skip_update:
508
+	// Adjust offset
509
+	MOVQ 16(R10), CX
510
+	CMPQ AX, $0x01
511
+	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
512
+	MOVQ R12, R13
513
+	MOVQ R11, R12
514
+	MOVQ CX, R11
515
+	JMP  sequenceDecs_decode_56_amd64_after_adjust
516
+
517
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
518
+	CMPQ (R10), $0x00000000
519
+	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
520
+	INCQ CX
521
+	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
522
+
523
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
524
+	TESTQ CX, CX
525
+	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
526
+	MOVQ  R11, CX
527
+	JMP   sequenceDecs_decode_56_amd64_after_adjust
528
+
529
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
530
+	CMPQ CX, $0x01
531
+	JB   sequenceDecs_decode_56_amd64_adjust_zero
532
+	JEQ  sequenceDecs_decode_56_amd64_adjust_one
533
+	CMPQ CX, $0x02
534
+	JA   sequenceDecs_decode_56_amd64_adjust_three
535
+	JMP  sequenceDecs_decode_56_amd64_adjust_two
536
+
537
+sequenceDecs_decode_56_amd64_adjust_zero:
538
+	MOVQ R11, AX
539
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
540
+
541
+sequenceDecs_decode_56_amd64_adjust_one:
542
+	MOVQ R12, AX
543
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
544
+
545
+sequenceDecs_decode_56_amd64_adjust_two:
546
+	MOVQ R13, AX
547
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
548
+
549
+sequenceDecs_decode_56_amd64_adjust_three:
550
+	LEAQ -1(R11), AX
551
+
552
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
553
+	TESTQ AX, AX
554
+	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
555
+	MOVQ  $0x00000001, AX
556
+
557
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
558
+	CMPQ    CX, $0x01
559
+	CMOVQNE R12, R13
560
+	MOVQ    R11, R12
561
+	MOVQ    AX, R11
562
+	MOVQ    AX, CX
563
+
564
+sequenceDecs_decode_56_amd64_after_adjust:
565
+	MOVQ CX, 16(R10)
566
+
567
+	// Check values
568
+	MOVQ  8(R10), AX
569
+	MOVQ  (R10), R14
570
+	LEAQ  (AX)(R14*1), R15
571
+	MOVQ  s+0(FP), BP
572
+	ADDQ  R15, 256(BP)
573
+	MOVQ  ctx+16(FP), R15
574
+	SUBQ  R14, 128(R15)
575
+	JS    error_not_enough_literals
576
+	CMPQ  AX, $0x00020002
577
+	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
578
+	TESTQ CX, CX
579
+	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
580
+	TESTQ AX, AX
581
+	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
582
+
583
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
584
+	ADDQ $0x18, R10
585
+	MOVQ ctx+16(FP), AX
586
+	DECQ 96(AX)
587
+	JNS  sequenceDecs_decode_56_amd64_main_loop
588
+	MOVQ s+0(FP), AX
589
+	MOVQ R11, 144(AX)
590
+	MOVQ R12, 152(AX)
591
+	MOVQ R13, 160(AX)
592
+	MOVQ br+8(FP), AX
593
+	MOVQ DX, 32(AX)
594
+	MOVB BL, 40(AX)
595
+	MOVQ SI, 24(AX)
596
+
597
+	// Return success
598
+	MOVQ $0x00000000, ret+24(FP)
599
+	RET
600
+
601
+	// Return with match length error
602
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
603
+	MOVQ $0x00000001, ret+24(FP)
604
+	RET
605
+
606
+	// Return with match too long error
607
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
608
+	MOVQ $0x00000002, ret+24(FP)
609
+	RET
610
+
611
+	// Return with match offset too long error
612
+	MOVQ $0x00000003, ret+24(FP)
613
+	RET
614
+
615
+	// Return with not enough literals error
616
+error_not_enough_literals:
617
+	MOVQ $0x00000004, ret+24(FP)
618
+	RET
619
+
620
+	// Return with not enough output space error
621
+	MOVQ $0x00000005, ret+24(FP)
622
+	RET
623
+
624
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
625
+// Requires: BMI, BMI2, CMOV
626
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
627
+	MOVQ    br+8(FP), CX
628
+	MOVQ    32(CX), AX
629
+	MOVBQZX 40(CX), DX
630
+	MOVQ    24(CX), BX
631
+	MOVQ    (CX), CX
632
+	ADDQ    BX, CX
633
+	MOVQ    CX, (SP)
634
+	MOVQ    ctx+16(FP), CX
635
+	MOVQ    72(CX), SI
636
+	MOVQ    80(CX), DI
637
+	MOVQ    88(CX), R8
638
+	MOVQ    104(CX), R9
639
+	MOVQ    s+0(FP), CX
640
+	MOVQ    144(CX), R10
641
+	MOVQ    152(CX), R11
642
+	MOVQ    160(CX), R12
643
+
644
+sequenceDecs_decode_bmi2_main_loop:
645
+	MOVQ (SP), R13
646
+
647
+	// Fill bitreader to have enough for the offset and match length.
648
+	CMPQ BX, $0x08
649
+	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
650
+	MOVQ DX, CX
651
+	SHRQ $0x03, CX
652
+	SUBQ CX, R13
653
+	MOVQ (R13), AX
654
+	SUBQ CX, BX
655
+	ANDQ $0x07, DX
656
+	JMP  sequenceDecs_decode_bmi2_fill_end
657
+
658
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
659
+	CMPQ    BX, $0x00
660
+	JLE     sequenceDecs_decode_bmi2_fill_end
661
+	CMPQ    DX, $0x07
662
+	JLE     sequenceDecs_decode_bmi2_fill_end
663
+	SHLQ    $0x08, AX
664
+	SUBQ    $0x01, R13
665
+	SUBQ    $0x01, BX
666
+	SUBQ    $0x08, DX
667
+	MOVBQZX (R13), CX
668
+	ORQ     CX, AX
669
+	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
670
+
671
+sequenceDecs_decode_bmi2_fill_end:
672
+	// Update offset
673
+	MOVQ   $0x00000808, CX
674
+	BEXTRQ CX, R8, R14
675
+	MOVQ   AX, R15
676
+	LEAQ   (DX)(R14*1), CX
677
+	ROLQ   CL, R15
678
+	BZHIQ  R14, R15, R15
679
+	MOVQ   CX, DX
680
+	MOVQ   R8, CX
681
+	SHRQ   $0x20, CX
682
+	ADDQ   R15, CX
683
+	MOVQ   CX, 16(R9)
684
+
685
+	// Update match length
686
+	MOVQ   $0x00000808, CX
687
+	BEXTRQ CX, DI, R14
688
+	MOVQ   AX, R15
689
+	LEAQ   (DX)(R14*1), CX
690
+	ROLQ   CL, R15
691
+	BZHIQ  R14, R15, R15
692
+	MOVQ   CX, DX
693
+	MOVQ   DI, CX
694
+	SHRQ   $0x20, CX
695
+	ADDQ   R15, CX
696
+	MOVQ   CX, 8(R9)
697
+
698
+	// Fill bitreader to have enough for the remaining
699
+	CMPQ BX, $0x08
700
+	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
701
+	MOVQ DX, CX
702
+	SHRQ $0x03, CX
703
+	SUBQ CX, R13
704
+	MOVQ (R13), AX
705
+	SUBQ CX, BX
706
+	ANDQ $0x07, DX
707
+	JMP  sequenceDecs_decode_bmi2_fill_2_end
708
+
709
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
710
+	CMPQ    BX, $0x00
711
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
712
+	CMPQ    DX, $0x07
713
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
714
+	SHLQ    $0x08, AX
715
+	SUBQ    $0x01, R13
716
+	SUBQ    $0x01, BX
717
+	SUBQ    $0x08, DX
718
+	MOVBQZX (R13), CX
719
+	ORQ     CX, AX
720
+	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
721
+
722
+sequenceDecs_decode_bmi2_fill_2_end:
723
+	// Update literal length
724
+	MOVQ   $0x00000808, CX
725
+	BEXTRQ CX, SI, R14
726
+	MOVQ   AX, R15
727
+	LEAQ   (DX)(R14*1), CX
728
+	ROLQ   CL, R15
729
+	BZHIQ  R14, R15, R15
730
+	MOVQ   CX, DX
731
+	MOVQ   SI, CX
732
+	SHRQ   $0x20, CX
733
+	ADDQ   R15, CX
734
+	MOVQ   CX, (R9)
735
+
736
+	// Fill bitreader for state updates
737
+	MOVQ    R13, (SP)
738
+	MOVQ    $0x00000808, CX
739
+	BEXTRQ  CX, R8, R13
740
+	MOVQ    ctx+16(FP), CX
741
+	CMPQ    96(CX), $0x00
742
+	JZ      sequenceDecs_decode_bmi2_skip_update
743
+	LEAQ    (SI)(DI*1), R14
744
+	ADDQ    R8, R14
745
+	MOVBQZX R14, R14
746
+	LEAQ    (DX)(R14*1), CX
747
+	MOVQ    AX, R15
748
+	MOVQ    CX, DX
749
+	ROLQ    CL, R15
750
+	BZHIQ   R14, R15, R15
751
+
752
+	// Update Offset State
753
+	BZHIQ  R8, R15, CX
754
+	SHRXQ  R8, R15, R15
755
+	MOVQ   $0x00001010, R14
756
+	BEXTRQ R14, R8, R8
757
+	ADDQ   CX, R8
758
+
759
+	// Load ctx.ofTable
760
+	MOVQ ctx+16(FP), CX
761
+	MOVQ 48(CX), CX
762
+	MOVQ (CX)(R8*8), R8
763
+
764
+	// Update Match Length State
765
+	BZHIQ  DI, R15, CX
766
+	SHRXQ  DI, R15, R15
767
+	MOVQ   $0x00001010, R14
768
+	BEXTRQ R14, DI, DI
769
+	ADDQ   CX, DI
770
+
771
+	// Load ctx.mlTable
772
+	MOVQ ctx+16(FP), CX
773
+	MOVQ 24(CX), CX
774
+	MOVQ (CX)(DI*8), DI
775
+
776
+	// Update Literal Length State
777
+	BZHIQ  SI, R15, CX
778
+	MOVQ   $0x00001010, R14
779
+	BEXTRQ R14, SI, SI
780
+	ADDQ   CX, SI
781
+
782
+	// Load ctx.llTable
783
+	MOVQ ctx+16(FP), CX
784
+	MOVQ (CX), CX
785
+	MOVQ (CX)(SI*8), SI
786
+
787
+sequenceDecs_decode_bmi2_skip_update:
788
+	// Adjust offset
789
+	MOVQ 16(R9), CX
790
+	CMPQ R13, $0x01
791
+	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
792
+	MOVQ R11, R12
793
+	MOVQ R10, R11
794
+	MOVQ CX, R10
795
+	JMP  sequenceDecs_decode_bmi2_after_adjust
796
+
797
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
798
+	CMPQ (R9), $0x00000000
799
+	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
800
+	INCQ CX
801
+	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
802
+
803
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
804
+	TESTQ CX, CX
805
+	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
806
+	MOVQ  R10, CX
807
+	JMP   sequenceDecs_decode_bmi2_after_adjust
808
+
809
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
810
+	CMPQ CX, $0x01
811
+	JB   sequenceDecs_decode_bmi2_adjust_zero
812
+	JEQ  sequenceDecs_decode_bmi2_adjust_one
813
+	CMPQ CX, $0x02
814
+	JA   sequenceDecs_decode_bmi2_adjust_three
815
+	JMP  sequenceDecs_decode_bmi2_adjust_two
816
+
817
+sequenceDecs_decode_bmi2_adjust_zero:
818
+	MOVQ R10, R13
819
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
820
+
821
+sequenceDecs_decode_bmi2_adjust_one:
822
+	MOVQ R11, R13
823
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
824
+
825
+sequenceDecs_decode_bmi2_adjust_two:
826
+	MOVQ R12, R13
827
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
828
+
829
+sequenceDecs_decode_bmi2_adjust_three:
830
+	LEAQ -1(R10), R13
831
+
832
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
833
+	TESTQ R13, R13
834
+	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
835
+	MOVQ  $0x00000001, R13
836
+
837
+sequenceDecs_decode_bmi2_adjust_temp_valid:
838
+	CMPQ    CX, $0x01
839
+	CMOVQNE R11, R12
840
+	MOVQ    R10, R11
841
+	MOVQ    R13, R10
842
+	MOVQ    R13, CX
843
+
844
+sequenceDecs_decode_bmi2_after_adjust:
845
+	MOVQ CX, 16(R9)
846
+
847
+	// Check values
848
+	MOVQ  8(R9), R13
849
+	MOVQ  (R9), R14
850
+	LEAQ  (R13)(R14*1), R15
851
+	MOVQ  s+0(FP), BP
852
+	ADDQ  R15, 256(BP)
853
+	MOVQ  ctx+16(FP), R15
854
+	SUBQ  R14, 128(R15)
855
+	JS    error_not_enough_literals
856
+	CMPQ  R13, $0x00020002
857
+	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
858
+	TESTQ CX, CX
859
+	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
860
+	TESTQ R13, R13
861
+	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
862
+
863
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
864
+	ADDQ $0x18, R9
865
+	MOVQ ctx+16(FP), CX
866
+	DECQ 96(CX)
867
+	JNS  sequenceDecs_decode_bmi2_main_loop
868
+	MOVQ s+0(FP), CX
869
+	MOVQ R10, 144(CX)
870
+	MOVQ R11, 152(CX)
871
+	MOVQ R12, 160(CX)
872
+	MOVQ br+8(FP), CX
873
+	MOVQ AX, 32(CX)
874
+	MOVB DL, 40(CX)
875
+	MOVQ BX, 24(CX)
876
+
877
+	// Return success
878
+	MOVQ $0x00000000, ret+24(FP)
879
+	RET
880
+
881
+	// Return with match length error
882
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
883
+	MOVQ $0x00000001, ret+24(FP)
884
+	RET
885
+
886
+	// Return with match too long error
887
+sequenceDecs_decode_bmi2_error_match_len_too_big:
888
+	MOVQ $0x00000002, ret+24(FP)
889
+	RET
890
+
891
+	// Return with match offset too long error
892
+	MOVQ $0x00000003, ret+24(FP)
893
+	RET
894
+
895
+	// Return with not enough literals error
896
+error_not_enough_literals:
897
+	MOVQ $0x00000004, ret+24(FP)
898
+	RET
899
+
900
+	// Return with not enough output space error
901
+	MOVQ $0x00000005, ret+24(FP)
902
+	RET
903
+
904
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
905
+// Requires: BMI, BMI2, CMOV. Decodes sequences into 24-byte records (literal length, match length, offset) starting at the pointer held at ctx+104; returns 0 on success, or 1, 2 or 4 on the error paths reachable from the loop.
906
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 // 8-byte frame: (SP) caches the input read pointer between iterations
907
+	MOVQ    br+8(FP), CX
908
+	MOVQ    32(CX), AX // AX = bit buffer; bits are consumed from its most significant end
909
+	MOVBQZX 40(CX), DX // DX = number of bits of AX already consumed
910
+	MOVQ    24(CX), BX // BX = byte offset of the read pointer from the input base
911
+	MOVQ    (CX), CX
912
+	ADDQ    BX, CX
913
+	MOVQ    CX, (SP) // (SP) = input base + offset, i.e. the current read pointer
914
+	MOVQ    ctx+16(FP), CX
915
+	MOVQ    72(CX), SI // SI = literal length state entry
916
+	MOVQ    80(CX), DI // DI = match length state entry
917
+	MOVQ    88(CX), R8 // R8 = offset state entry
918
+	MOVQ    104(CX), R9 // R9 = output cursor, advanced by 24 bytes per sequence
919
+	MOVQ    s+0(FP), CX
920
+	MOVQ    144(CX), R10 // R10, R11, R12 = the three previous offsets kept at s+144/152/160 (written back on success)
921
+	MOVQ    152(CX), R11
922
+	MOVQ    160(CX), R12
923
+
924
+sequenceDecs_decode_56_bmi2_main_loop:
925
+	MOVQ (SP), R13 // R13 = input read pointer for this iteration
926
+
927
+	// Fill bitreader to have enough for the offset and match length. This variant does not refill again before reading the literal length.
928
+	CMPQ BX, $0x08
929
+	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
930
+	MOVQ DX, CX
931
+	SHRQ $0x03, CX // CX = whole bytes already consumed
932
+	SUBQ CX, R13 // the read pointer moves towards lower addresses
933
+	MOVQ (R13), AX // reload 8 bytes at the new position
934
+	SUBQ CX, BX
935
+	ANDQ $0x07, DX // keep only the sub-byte bit count
936
+	JMP  sequenceDecs_decode_56_bmi2_fill_end
937
+
938
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
939
+	CMPQ    BX, $0x00
940
+	JLE     sequenceDecs_decode_56_bmi2_fill_end // no input bytes left
941
+	CMPQ    DX, $0x07
942
+	JLE     sequenceDecs_decode_56_bmi2_fill_end // less than one whole byte consumed: nothing to refill
943
+	SHLQ    $0x08, AX
944
+	SUBQ    $0x01, R13
945
+	SUBQ    $0x01, BX
946
+	SUBQ    $0x08, DX
947
+	MOVBQZX (R13), CX
948
+	ORQ     CX, AX // shift in the next lower-addressed input byte
949
+	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
950
+
951
+sequenceDecs_decode_56_bmi2_fill_end:
952
+	// Update offset
953
+	MOVQ   $0x00000808, CX // BEXTR control word: start bit 8, length 8
954
+	BEXTRQ CX, R8, R14 // R14 = bits 8..15 of the state entry = number of extra bits to read
955
+	MOVQ   AX, R15
956
+	LEAQ   (DX)(R14*1), CX
957
+	ROLQ   CL, R15 // rotate the next R14 unread bits into the low end
958
+	BZHIQ  R14, R15, R15 // keep only those R14 bits
959
+	MOVQ   CX, DX // account for the bits just read
960
+	MOVQ   R8, CX
961
+	SHRQ   $0x20, CX // upper 32 bits of the entry = baseline value
962
+	ADDQ   R15, CX
963
+	MOVQ   CX, 16(R9) // record offset = baseline + extra bits
964
+
965
+	// Update match length
966
+	MOVQ   $0x00000808, CX
967
+	BEXTRQ CX, DI, R14
968
+	MOVQ   AX, R15
969
+	LEAQ   (DX)(R14*1), CX
970
+	ROLQ   CL, R15
971
+	BZHIQ  R14, R15, R15
972
+	MOVQ   CX, DX
973
+	MOVQ   DI, CX
974
+	SHRQ   $0x20, CX
975
+	ADDQ   R15, CX
976
+	MOVQ   CX, 8(R9) // record match length = baseline + extra bits
977
+
978
+	// Update literal length
979
+	MOVQ   $0x00000808, CX
980
+	BEXTRQ CX, SI, R14
981
+	MOVQ   AX, R15
982
+	LEAQ   (DX)(R14*1), CX
983
+	ROLQ   CL, R15
984
+	BZHIQ  R14, R15, R15
985
+	MOVQ   CX, DX
986
+	MOVQ   SI, CX
987
+	SHRQ   $0x20, CX
988
+	ADDQ   R15, CX
989
+	MOVQ   CX, (R9) // record literal length = baseline + extra bits
990
+
991
+	// Fill bitreader for state updates
992
+	MOVQ    R13, (SP) // save the read pointer for the next iteration
993
+	MOVQ    $0x00000808, CX
994
+	BEXTRQ  CX, R8, R13 // R13 = extra-bit count of the current offset state, used by "Adjust offset" below
995
+	MOVQ    ctx+16(FP), CX
996
+	CMPQ    96(CX), $0x00
997
+	JZ      sequenceDecs_decode_56_bmi2_skip_update // counter at ctx+96 is zero: last sequence, states are not advanced
998
+	LEAQ    (SI)(DI*1), R14
999
+	ADDQ    R8, R14
1000
+	MOVBQZX R14, R14 // low byte of the summed entries = the three per-state bit counts added together
1001
+	LEAQ    (DX)(R14*1), CX
1002
+	MOVQ    AX, R15
1003
+	MOVQ    CX, DX
1004
+	ROLQ    CL, R15
1005
+	BZHIQ   R14, R15, R15 // R15 = all state-update bits for this sequence
1006
+
1007
+	// Update Offset State
1008
+	BZHIQ  R8, R15, CX // CX = low n bits of R15, n taken from the low byte of R8
1009
+	SHRXQ  R8, R15, R15 // drop the bits just used
1010
+	MOVQ   $0x00001010, R14 // BEXTR control word: start bit 16, length 16
1011
+	BEXTRQ R14, R8, R8 // bits 16..31 of the entry = base of the next state
1012
+	ADDQ   CX, R8 // index of the next state
1013
+
1014
+	// Load ctx.ofTable
1015
+	MOVQ ctx+16(FP), CX
1016
+	MOVQ 48(CX), CX
1017
+	MOVQ (CX)(R8*8), R8 // fetch the 8-byte entry of the new state
1018
+
1019
+	// Update Match Length State
1020
+	BZHIQ  DI, R15, CX
1021
+	SHRXQ  DI, R15, R15
1022
+	MOVQ   $0x00001010, R14
1023
+	BEXTRQ R14, DI, DI
1024
+	ADDQ   CX, DI
1025
+
1026
+	// Load ctx.mlTable
1027
+	MOVQ ctx+16(FP), CX
1028
+	MOVQ 24(CX), CX
1029
+	MOVQ (CX)(DI*8), DI
1030
+
1031
+	// Update Literal Length State
1032
+	BZHIQ  SI, R15, CX
1033
+	MOVQ   $0x00001010, R14
1034
+	BEXTRQ R14, SI, SI
1035
+	ADDQ   CX, SI
1036
+
1037
+	// Load ctx.llTable
1038
+	MOVQ ctx+16(FP), CX
1039
+	MOVQ (CX), CX
1040
+	MOVQ (CX)(SI*8), SI
1041
+
1042
+sequenceDecs_decode_56_bmi2_skip_update:
1043
+	// Adjust offset
1044
+	MOVQ 16(R9), CX // CX = offset value decoded above
1045
+	CMPQ R13, $0x01
1046
+	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 // 0 or 1 extra bits: the value selects from the previous offsets
1047
+	MOVQ R11, R12 // otherwise the value is a new offset: push it onto the history
1048
+	MOVQ R10, R11
1049
+	MOVQ CX, R10
1050
+	JMP  sequenceDecs_decode_56_bmi2_after_adjust
1051
+
1052
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
1053
+	CMPQ (R9), $0x00000000 // is the literal length zero?
1054
+	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
1055
+	INCQ CX // with no literals the selector is shifted up by one
1056
+	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
1057
+
1058
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
1059
+	TESTQ CX, CX
1060
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
1061
+	MOVQ  R10, CX // selector 0: reuse the most recent offset, history unchanged
1062
+	JMP   sequenceDecs_decode_56_bmi2_after_adjust
1063
+
1064
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
1065
+	CMPQ CX, $0x01 // dispatch on the selector: 0, 1, 2 or greater
1066
+	JB   sequenceDecs_decode_56_bmi2_adjust_zero
1067
+	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
1068
+	CMPQ CX, $0x02
1069
+	JA   sequenceDecs_decode_56_bmi2_adjust_three
1070
+	JMP  sequenceDecs_decode_56_bmi2_adjust_two
1071
+
1072
+sequenceDecs_decode_56_bmi2_adjust_zero:
1073
+	MOVQ R10, R13
1074
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1075
+
1076
+sequenceDecs_decode_56_bmi2_adjust_one:
1077
+	MOVQ R11, R13
1078
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1079
+
1080
+sequenceDecs_decode_56_bmi2_adjust_two:
1081
+	MOVQ R12, R13
1082
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
1083
+
1084
+sequenceDecs_decode_56_bmi2_adjust_three:
1085
+	LEAQ -1(R10), R13 // selector above 2: most recent offset minus one
1086
+
1087
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
1088
+	TESTQ R13, R13
1089
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
1090
+	MOVQ  $0x00000001, R13 // a zero candidate offset is replaced by 1
1091
+
1092
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
1093
+	CMPQ    CX, $0x01
1094
+	CMOVQNE R11, R12 // selector != 1: the second entry moves down to third
1095
+	MOVQ    R10, R11
1096
+	MOVQ    R13, R10 // the chosen offset becomes the most recent one
1097
+	MOVQ    R13, CX
1098
+
1099
+sequenceDecs_decode_56_bmi2_after_adjust:
1100
+	MOVQ CX, 16(R9) // store the final offset in the record
1101
+
1102
+	// Check values
1103
+	MOVQ  8(R9), R13
1104
+	MOVQ  (R9), R14
1105
+	LEAQ  (R13)(R14*1), R15
1106
+	MOVQ  s+0(FP), BP
1107
+	ADDQ  R15, 256(BP) // add match length + literal length to the running total at s+256
1108
+	MOVQ  ctx+16(FP), R15
1109
+	SUBQ  R14, 128(R15) // subtract the literal length from the counter at ctx+128
1110
+	JS    error_not_enough_literals // counter went negative
1111
+	CMPQ  R13, $0x00020002
1112
+	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big // match length above 0x20002 (131074)
1113
+	TESTQ CX, CX
1114
+	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
1115
+	TESTQ R13, R13
1116
+	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // zero offset with a non-zero match length
1117
+
1118
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
1119
+	ADDQ $0x18, R9 // advance to the next 24-byte record
1120
+	MOVQ ctx+16(FP), CX
1121
+	DECQ 96(CX)
1122
+	JNS  sequenceDecs_decode_56_bmi2_main_loop // loop while the counter at ctx+96 stays non-negative
1123
+	MOVQ s+0(FP), CX
1124
+	MOVQ R10, 144(CX) // write the offset history back to s
1125
+	MOVQ R11, 152(CX)
1126
+	MOVQ R12, 160(CX)
1127
+	MOVQ br+8(FP), CX
1128
+	MOVQ AX, 32(CX) // write the bit-reader state back to br
1129
+	MOVB DL, 40(CX)
1130
+	MOVQ BX, 24(CX)
1131
+
1132
+	// Return success
1133
+	MOVQ $0x00000000, ret+24(FP)
1134
+	RET
1135
+
1136
+	// Return with match length error (the error returns below do not write s or br state back)
1137
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
1138
+	MOVQ $0x00000001, ret+24(FP)
1139
+	RET
1140
+
1141
+	// Return with match too long error
1142
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
1143
+	MOVQ $0x00000002, ret+24(FP)
1144
+	RET
1145
+
1146
+	// Return with match offset too long error (no label precedes this code, so nothing in this function jumps to it)
1147
+	MOVQ $0x00000003, ret+24(FP)
1148
+	RET
1149
+
1150
+	// Return with not enough literals error
1151
+error_not_enough_literals:
1152
+	MOVQ $0x00000004, ret+24(FP)
1153
+	RET
1154
+
1155
+	// Return with not enough output space error (no label precedes this code, so nothing in this function jumps to it)
1156
+	MOVQ $0x00000005, ret+24(FP)
1157
+	RET
1158
+
1159
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
1160
+// Requires: SSE. Executes 24-byte sequence records: copies each record's literals and then its match into the output; returns false when a match offset is out of range, true otherwise. The literal and non-overlapping match copies run in 16-byte steps and may touch up to 15 bytes past the requested length.
1161
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
1162
+	MOVQ  ctx+0(FP), R10
1163
+	MOVQ  8(R10), CX // CX = number of sequences (the loop runs while the index in DX is below it)
1164
+	TESTQ CX, CX
1165
+	JZ    empty_seqs
1166
+	MOVQ  (R10), AX // AX = base of the sequence records
1167
+	MOVQ  24(R10), DX // DX = index of the next sequence
1168
+	MOVQ  32(R10), BX // BX = output base
1169
+	MOVQ  80(R10), SI // SI = literals read pointer
1170
+	MOVQ  104(R10), DI // DI = output position
1171
+	MOVQ  120(R10), R8 // R8 = window size limit checked in check_offset
1172
+	MOVQ  56(R10), R9
1173
+	MOVQ  64(R10), R10 // R10 = history length
1174
+	ADDQ  R10, R9 // R9 = one past the end of the history buffer
1175
+
1176
+	// seqsBase += 24 * seqIndex
1177
+	LEAQ (DX)(DX*2), R11
1178
+	SHLQ $0x03, R11
1179
+	ADDQ R11, AX
1180
+
1181
+	// outBase += outPosition
1182
+	ADDQ DI, BX
1183
+
1184
+main_loop:
1185
+	MOVQ (AX), R11 // R11 = literal length
1186
+	MOVQ 16(AX), R12 // R12 = match offset
1187
+	MOVQ 8(AX), R13 // R13 = match length
1188
+
1189
+	// Copy literals
1190
+	TESTQ R11, R11
1191
+	JZ    check_offset
1192
+	XORQ  R14, R14
1193
+
1194
+copy_1:
1195
+	MOVUPS (SI)(R14*1), X0
1196
+	MOVUPS X0, (BX)(R14*1)
1197
+	ADDQ   $0x10, R14
1198
+	CMPQ   R14, R11
1199
+	JB     copy_1 // 16 bytes per step; the last step may run up to 15 bytes past the literal length. NOTE(review): presumably callers guarantee that slack in both buffers - confirm
1200
+	ADDQ   R11, SI
1201
+	ADDQ   R11, BX
1202
+	ADDQ   R11, DI
1203
+
1204
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
1205
+check_offset:
1206
+	LEAQ (DI)(R10*1), R11
1207
+	CMPQ R12, R11
1208
+	JG   error_match_off_too_big
1209
+	CMPQ R12, R8
1210
+	JG   error_match_off_too_big
1211
+
1212
+	// Copy match from history
1213
+	MOVQ R12, R11
1214
+	SUBQ DI, R11 // R11 = number of bytes the match reaches back into the history
1215
+	JLS  copy_match // offset <= output position (unsigned): the match lies entirely in the current output
1216
+	MOVQ R9, R14
1217
+	SUBQ R11, R14 // R14 = source pointer inside the history
1218
+	CMPQ R13, R11
1219
+	JG   copy_all_from_history // match is longer than its history part: copy that part, then continue from the output
1220
+	MOVQ R13, R11
1221
+	SUBQ $0x10, R11
1222
+	JB   copy_4_small
1223
+
1224
+copy_4_loop:
1225
+	MOVUPS (R14), X0
1226
+	MOVUPS X0, (BX)
1227
+	ADDQ   $0x10, R14
1228
+	ADDQ   $0x10, BX
1229
+	SUBQ   $0x10, R11
1230
+	JAE    copy_4_loop
1231
+	LEAQ   16(R14)(R11*1), R14 // R11 is now the negative remainder: step both pointers to the exact end
1232
+	LEAQ   16(BX)(R11*1), BX
1233
+	MOVUPS -16(R14), X0 // copy the final 16 bytes ending exactly at the end (overlaps the previous step)
1234
+	MOVUPS X0, -16(BX)
1235
+	JMP    copy_4_end
1236
+
1237
+copy_4_small:
1238
+	CMPQ R13, $0x03 // match length below 16; there is no 1-or-2 byte case here. NOTE(review): presumably match lengths are >= 3 on this path - confirm
1239
+	JE   copy_4_move_3
1240
+	CMPQ R13, $0x08
1241
+	JB   copy_4_move_4through7
1242
+	JMP  copy_4_move_8through16
1243
+
1244
+copy_4_move_3:
1245
+	MOVW (R14), R11
1246
+	MOVB 2(R14), R12
1247
+	MOVW R11, (BX)
1248
+	MOVB R12, 2(BX)
1249
+	ADDQ R13, R14
1250
+	ADDQ R13, BX
1251
+	JMP  copy_4_end
1252
+
1253
+copy_4_move_4through7:
1254
+	MOVL (R14), R11 // two 4-byte moves, first and last four bytes, together cover the whole length
1255
+	MOVL -4(R14)(R13*1), R12
1256
+	MOVL R11, (BX)
1257
+	MOVL R12, -4(BX)(R13*1)
1258
+	ADDQ R13, R14
1259
+	ADDQ R13, BX
1260
+	JMP  copy_4_end
1261
+
1262
+copy_4_move_8through16:
1263
+	MOVQ (R14), R11 // two 8-byte moves, first and last eight bytes, together cover the whole length
1264
+	MOVQ -8(R14)(R13*1), R12
1265
+	MOVQ R11, (BX)
1266
+	MOVQ R12, -8(BX)(R13*1)
1267
+	ADDQ R13, R14
1268
+	ADDQ R13, BX
1269
+
1270
+copy_4_end:
1271
+	ADDQ R13, DI
1272
+	ADDQ $0x18, AX // next 24-byte record
1273
+	INCQ DX
1274
+	CMPQ DX, CX
1275
+	JB   main_loop
1276
+	JMP  loop_finished
1277
+
1278
+copy_all_from_history:
1279
+	MOVQ R11, R15 // copy the R11 bytes that remain in the history, from R14 up to its end
1280
+	SUBQ $0x10, R15
1281
+	JB   copy_5_small
1282
+
1283
+copy_5_loop:
1284
+	MOVUPS (R14), X0
1285
+	MOVUPS X0, (BX)
1286
+	ADDQ   $0x10, R14
1287
+	ADDQ   $0x10, BX
1288
+	SUBQ   $0x10, R15
1289
+	JAE    copy_5_loop
1290
+	LEAQ   16(R14)(R15*1), R14
1291
+	LEAQ   16(BX)(R15*1), BX
1292
+	MOVUPS -16(R14), X0
1293
+	MOVUPS X0, -16(BX)
1294
+	JMP    copy_5_end
1295
+
1296
+copy_5_small:
1297
+	CMPQ R11, $0x03
1298
+	JE   copy_5_move_3
1299
+	JB   copy_5_move_1or2
1300
+	CMPQ R11, $0x08
1301
+	JB   copy_5_move_4through7
1302
+	JMP  copy_5_move_8through16
1303
+
1304
+copy_5_move_1or2:
1305
+	MOVB (R14), R15
1306
+	MOVB -1(R14)(R11*1), BP
1307
+	MOVB R15, (BX)
1308
+	MOVB BP, -1(BX)(R11*1)
1309
+	ADDQ R11, R14
1310
+	ADDQ R11, BX
1311
+	JMP  copy_5_end
1312
+
1313
+copy_5_move_3:
1314
+	MOVW (R14), R15
1315
+	MOVB 2(R14), BP
1316
+	MOVW R15, (BX)
1317
+	MOVB BP, 2(BX)
1318
+	ADDQ R11, R14
1319
+	ADDQ R11, BX
1320
+	JMP  copy_5_end
1321
+
1322
+copy_5_move_4through7:
1323
+	MOVL (R14), R15
1324
+	MOVL -4(R14)(R11*1), BP
1325
+	MOVL R15, (BX)
1326
+	MOVL BP, -4(BX)(R11*1)
1327
+	ADDQ R11, R14
1328
+	ADDQ R11, BX
1329
+	JMP  copy_5_end
1330
+
1331
+copy_5_move_8through16:
1332
+	MOVQ (R14), R15
1333
+	MOVQ -8(R14)(R11*1), BP
1334
+	MOVQ R15, (BX)
1335
+	MOVQ BP, -8(BX)(R11*1)
1336
+	ADDQ R11, R14
1337
+	ADDQ R11, BX
1338
+
1339
+copy_5_end:
1340
+	ADDQ R11, DI
1341
+	SUBQ R11, R13 // the rest of the match comes from the current output buffer
1342
+
1343
+	// Copy match from the current buffer
1344
+copy_match:
1345
+	MOVQ BX, R11
1346
+	SUBQ R12, R11 // R11 = source pointer: write pointer minus match offset
1347
+
1348
+	// ml <= mo
1349
+	CMPQ R13, R12
1350
+	JA   copy_overlapping_match // ml > mo: source and destination overlap, copy byte by byte
1351
+
1352
+	// Copy non-overlapping match
1353
+	ADDQ R13, DI
1354
+	MOVQ BX, R12
1355
+	ADDQ R13, BX
1356
+
1357
+copy_2:
1358
+	MOVUPS (R11), X0
1359
+	MOVUPS X0, (R12)
1360
+	ADDQ   $0x10, R11
1361
+	ADDQ   $0x10, R12
1362
+	SUBQ   $0x10, R13
1363
+	JHI    copy_2 // 16 bytes per step while bytes remain; the last step may run up to 15 bytes past the match length
1364
+	JMP    handle_loop
1365
+
1366
+	// Copy overlapping match
1367
+copy_overlapping_match:
1368
+	ADDQ R13, DI
1369
+
1370
+copy_slow_3:
1371
+	MOVB (R11), R12
1372
+	MOVB R12, (BX)
1373
+	INCQ R11
1374
+	INCQ BX
1375
+	DECQ R13
1376
+	JNZ  copy_slow_3
1377
+
1378
+handle_loop:
1379
+	ADDQ $0x18, AX // next 24-byte record
1380
+	INCQ DX
1381
+	CMPQ DX, CX
1382
+	JB   main_loop
1383
+
1384
+loop_finished:
1385
+	// Return value
1386
+	MOVB $0x01, ret+8(FP)
1387
+
1388
+	// Update the context
1389
+	MOVQ ctx+0(FP), AX
1390
+	MOVQ DX, 24(AX) // sequence index reached
1391
+	MOVQ DI, 104(AX) // output position reached
1392
+	MOVQ 80(AX), CX
1393
+	SUBQ CX, SI
1394
+	MOVQ SI, 112(AX) // number of literal bytes consumed (read pointer minus the literals base at ctx+80)
1395
+	RET
1396
+
1397
+error_match_off_too_big:
1398
+	// Return value
1399
+	MOVB $0x00, ret+8(FP)
1400
+
1401
+	// Update the context (same fields as on success, so the caller can see where execution stopped)
1402
+	MOVQ ctx+0(FP), AX
1403
+	MOVQ DX, 24(AX)
1404
+	MOVQ DI, 104(AX)
1405
+	MOVQ 80(AX), CX
1406
+	SUBQ CX, SI
1407
+	MOVQ SI, 112(AX)
1408
+	RET
1409
+
1410
+empty_seqs:
1411
+	// Return value (no sequences: report success without touching the context)
1412
+	MOVB $0x01, ret+8(FP)
1413
+	RET
1414
+
1415
+// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
1416
+// Requires: SSE. Executes 24-byte sequence records: copies each record's literals and then its match into the output; returns false when a match offset is out of range, true otherwise. Every copy here is exact-length: no load or store touches bytes beyond the requested length.
1417
+TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
1418
+	MOVQ  ctx+0(FP), R10
1419
+	MOVQ  8(R10), CX // CX = number of sequences (the loop runs while the index in DX is below it)
1420
+	TESTQ CX, CX
1421
+	JZ    empty_seqs
1422
+	MOVQ  (R10), AX // AX = base of the sequence records
1423
+	MOVQ  24(R10), DX // DX = index of the next sequence
1424
+	MOVQ  32(R10), BX // BX = output base
1425
+	MOVQ  80(R10), SI // SI = literals read pointer
1426
+	MOVQ  104(R10), DI // DI = output position
1427
+	MOVQ  120(R10), R8 // R8 = window size limit checked in check_offset
1428
+	MOVQ  56(R10), R9
1429
+	MOVQ  64(R10), R10 // R10 = history length
1430
+	ADDQ  R10, R9 // R9 = one past the end of the history buffer
1431
+
1432
+	// seqsBase += 24 * seqIndex
1433
+	LEAQ (DX)(DX*2), R11
1434
+	SHLQ $0x03, R11
1435
+	ADDQ R11, AX
1436
+
1437
+	// outBase += outPosition
1438
+	ADDQ DI, BX
1439
+
1440
+main_loop:
1441
+	MOVQ (AX), R11 // R11 = literal length
1442
+	MOVQ 16(AX), R12 // R12 = match offset
1443
+	MOVQ 8(AX), R13 // R13 = match length
1444
+
1445
+	// Copy literals
1446
+	TESTQ R11, R11
1447
+	JZ    check_offset
1448
+	MOVQ  R11, R14
1449
+	SUBQ  $0x10, R14
1450
+	JB    copy_1_small // fewer than 16 literal bytes: use the size-specific moves
1451
+
1452
+copy_1_loop:
1453
+	MOVUPS (SI), X0
1454
+	MOVUPS X0, (BX)
1455
+	ADDQ   $0x10, SI
1456
+	ADDQ   $0x10, BX
1457
+	SUBQ   $0x10, R14
1458
+	JAE    copy_1_loop
1459
+	LEAQ   16(SI)(R14*1), SI // R14 is now the negative remainder: step both pointers to the exact end
1460
+	LEAQ   16(BX)(R14*1), BX
1461
+	MOVUPS -16(SI), X0 // copy the final 16 bytes ending exactly at the end (overlaps the previous step)
1462
+	MOVUPS X0, -16(BX)
1463
+	JMP    copy_1_end
1464
+
1465
+copy_1_small:
1466
+	CMPQ R11, $0x03
1467
+	JE   copy_1_move_3
1468
+	JB   copy_1_move_1or2
1469
+	CMPQ R11, $0x08
1470
+	JB   copy_1_move_4through7
1471
+	JMP  copy_1_move_8through16
1472
+
1473
+copy_1_move_1or2:
1474
+	MOVB (SI), R14 // first and last byte; they coincide when the length is 1
1475
+	MOVB -1(SI)(R11*1), R15
1476
+	MOVB R14, (BX)
1477
+	MOVB R15, -1(BX)(R11*1)
1478
+	ADDQ R11, SI
1479
+	ADDQ R11, BX
1480
+	JMP  copy_1_end
1481
+
1482
+copy_1_move_3:
1483
+	MOVW (SI), R14
1484
+	MOVB 2(SI), R15
1485
+	MOVW R14, (BX)
1486
+	MOVB R15, 2(BX)
1487
+	ADDQ R11, SI
1488
+	ADDQ R11, BX
1489
+	JMP  copy_1_end
1490
+
1491
+copy_1_move_4through7:
1492
+	MOVL (SI), R14 // two 4-byte moves, first and last four bytes, together cover the whole length
1493
+	MOVL -4(SI)(R11*1), R15
1494
+	MOVL R14, (BX)
1495
+	MOVL R15, -4(BX)(R11*1)
1496
+	ADDQ R11, SI
1497
+	ADDQ R11, BX
1498
+	JMP  copy_1_end
1499
+
1500
+copy_1_move_8through16:
1501
+	MOVQ (SI), R14 // two 8-byte moves, first and last eight bytes, together cover the whole length
1502
+	MOVQ -8(SI)(R11*1), R15
1503
+	MOVQ R14, (BX)
1504
+	MOVQ R15, -8(BX)(R11*1)
1505
+	ADDQ R11, SI
1506
+	ADDQ R11, BX
1507
+
1508
+copy_1_end:
1509
+	ADDQ R11, DI
1510
+
1511
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
1512
+check_offset:
1513
+	LEAQ (DI)(R10*1), R11
1514
+	CMPQ R12, R11
1515
+	JG   error_match_off_too_big
1516
+	CMPQ R12, R8
1517
+	JG   error_match_off_too_big
1518
+
1519
+	// Copy match from history
1520
+	MOVQ R12, R11
1521
+	SUBQ DI, R11 // R11 = number of bytes the match reaches back into the history
1522
+	JLS  copy_match // offset <= output position (unsigned): the match lies entirely in the current output
1523
+	MOVQ R9, R14
1524
+	SUBQ R11, R14 // R14 = source pointer inside the history
1525
+	CMPQ R13, R11
1526
+	JG   copy_all_from_history // match is longer than its history part: copy that part, then continue from the output
1527
+	MOVQ R13, R11
1528
+	SUBQ $0x10, R11
1529
+	JB   copy_4_small
1530
+
1531
+copy_4_loop:
1532
+	MOVUPS (R14), X0
1533
+	MOVUPS X0, (BX)
1534
+	ADDQ   $0x10, R14
1535
+	ADDQ   $0x10, BX
1536
+	SUBQ   $0x10, R11
1537
+	JAE    copy_4_loop
1538
+	LEAQ   16(R14)(R11*1), R14
1539
+	LEAQ   16(BX)(R11*1), BX
1540
+	MOVUPS -16(R14), X0
1541
+	MOVUPS X0, -16(BX)
1542
+	JMP    copy_4_end
1543
+
1544
+copy_4_small:
1545
+	CMPQ R13, $0x03 // match length below 16; there is no 1-or-2 byte case here. NOTE(review): presumably match lengths are >= 3 on this path - confirm
1546
+	JE   copy_4_move_3
1547
+	CMPQ R13, $0x08
1548
+	JB   copy_4_move_4through7
1549
+	JMP  copy_4_move_8through16
1550
+
1551
+copy_4_move_3:
1552
+	MOVW (R14), R11
1553
+	MOVB 2(R14), R12
1554
+	MOVW R11, (BX)
1555
+	MOVB R12, 2(BX)
1556
+	ADDQ R13, R14
1557
+	ADDQ R13, BX
1558
+	JMP  copy_4_end
1559
+
1560
+copy_4_move_4through7:
1561
+	MOVL (R14), R11
1562
+	MOVL -4(R14)(R13*1), R12
1563
+	MOVL R11, (BX)
1564
+	MOVL R12, -4(BX)(R13*1)
1565
+	ADDQ R13, R14
1566
+	ADDQ R13, BX
1567
+	JMP  copy_4_end
1568
+
1569
+copy_4_move_8through16:
1570
+	MOVQ (R14), R11
1571
+	MOVQ -8(R14)(R13*1), R12
1572
+	MOVQ R11, (BX)
1573
+	MOVQ R12, -8(BX)(R13*1)
1574
+	ADDQ R13, R14
1575
+	ADDQ R13, BX
1576
+
1577
+copy_4_end:
1578
+	ADDQ R13, DI
1579
+	ADDQ $0x18, AX // next 24-byte record
1580
+	INCQ DX
1581
+	CMPQ DX, CX
1582
+	JB   main_loop
1583
+	JMP  loop_finished
1584
+
1585
+copy_all_from_history:
1586
+	MOVQ R11, R15 // copy the R11 bytes that remain in the history, from R14 up to its end
1587
+	SUBQ $0x10, R15
1588
+	JB   copy_5_small
1589
+
1590
+copy_5_loop:
1591
+	MOVUPS (R14), X0
1592
+	MOVUPS X0, (BX)
1593
+	ADDQ   $0x10, R14
1594
+	ADDQ   $0x10, BX
1595
+	SUBQ   $0x10, R15
1596
+	JAE    copy_5_loop
1597
+	LEAQ   16(R14)(R15*1), R14
1598
+	LEAQ   16(BX)(R15*1), BX
1599
+	MOVUPS -16(R14), X0
1600
+	MOVUPS X0, -16(BX)
1601
+	JMP    copy_5_end
1602
+
1603
+copy_5_small:
1604
+	CMPQ R11, $0x03
1605
+	JE   copy_5_move_3
1606
+	JB   copy_5_move_1or2
1607
+	CMPQ R11, $0x08
1608
+	JB   copy_5_move_4through7
1609
+	JMP  copy_5_move_8through16
1610
+
1611
+copy_5_move_1or2:
1612
+	MOVB (R14), R15
1613
+	MOVB -1(R14)(R11*1), BP
1614
+	MOVB R15, (BX)
1615
+	MOVB BP, -1(BX)(R11*1)
1616
+	ADDQ R11, R14
1617
+	ADDQ R11, BX
1618
+	JMP  copy_5_end
1619
+
1620
+copy_5_move_3:
1621
+	MOVW (R14), R15
1622
+	MOVB 2(R14), BP
1623
+	MOVW R15, (BX)
1624
+	MOVB BP, 2(BX)
1625
+	ADDQ R11, R14
1626
+	ADDQ R11, BX
1627
+	JMP  copy_5_end
1628
+
1629
+copy_5_move_4through7:
1630
+	MOVL (R14), R15
1631
+	MOVL -4(R14)(R11*1), BP
1632
+	MOVL R15, (BX)
1633
+	MOVL BP, -4(BX)(R11*1)
1634
+	ADDQ R11, R14
1635
+	ADDQ R11, BX
1636
+	JMP  copy_5_end
1637
+
1638
+copy_5_move_8through16:
1639
+	MOVQ (R14), R15
1640
+	MOVQ -8(R14)(R11*1), BP
1641
+	MOVQ R15, (BX)
1642
+	MOVQ BP, -8(BX)(R11*1)
1643
+	ADDQ R11, R14
1644
+	ADDQ R11, BX
1645
+
1646
+copy_5_end:
1647
+	ADDQ R11, DI
1648
+	SUBQ R11, R13 // the rest of the match comes from the current output buffer
1649
+
1650
+	// Copy match from the current buffer
1651
+copy_match:
1652
+	MOVQ BX, R11
1653
+	SUBQ R12, R11 // R11 = source pointer: write pointer minus match offset
1654
+
1655
+	// ml <= mo
1656
+	CMPQ R13, R12
1657
+	JA   copy_overlapping_match // ml > mo: source and destination overlap, copy byte by byte
1658
+
1659
+	// Copy non-overlapping match
1660
+	ADDQ R13, DI
1661
+	MOVQ R13, R12
1662
+	SUBQ $0x10, R12
1663
+	JB   copy_2_small // fewer than 16 match bytes: use the size-specific moves
1664
+
1665
+copy_2_loop:
1666
+	MOVUPS (R11), X0
1667
+	MOVUPS X0, (BX)
1668
+	ADDQ   $0x10, R11
1669
+	ADDQ   $0x10, BX
1670
+	SUBQ   $0x10, R12
1671
+	JAE    copy_2_loop
1672
+	LEAQ   16(R11)(R12*1), R11 // R12 is now the negative remainder: step both pointers to the exact end
1673
+	LEAQ   16(BX)(R12*1), BX
1674
+	MOVUPS -16(R11), X0 // copy the final 16 bytes ending exactly at the end (overlaps the previous step)
1675
+	MOVUPS X0, -16(BX)
1676
+	JMP    copy_2_end
1677
+
1678
+copy_2_small:
1679
+	CMPQ R13, $0x03
1680
+	JE   copy_2_move_3
1681
+	JB   copy_2_move_1or2
1682
+	CMPQ R13, $0x08
1683
+	JB   copy_2_move_4through7
1684
+	JMP  copy_2_move_8through16
1685
+
1686
+copy_2_move_1or2:
1687
+	MOVB (R11), R12
1688
+	MOVB -1(R11)(R13*1), R14
1689
+	MOVB R12, (BX)
1690
+	MOVB R14, -1(BX)(R13*1)
1691
+	ADDQ R13, R11
1692
+	ADDQ R13, BX
1693
+	JMP  copy_2_end
1694
+
1695
+copy_2_move_3:
1696
+	MOVW (R11), R12
1697
+	MOVB 2(R11), R14
1698
+	MOVW R12, (BX)
1699
+	MOVB R14, 2(BX)
1700
+	ADDQ R13, R11
1701
+	ADDQ R13, BX
1702
+	JMP  copy_2_end
1703
+
1704
+copy_2_move_4through7:
1705
+	MOVL (R11), R12
1706
+	MOVL -4(R11)(R13*1), R14
1707
+	MOVL R12, (BX)
1708
+	MOVL R14, -4(BX)(R13*1)
1709
+	ADDQ R13, R11
1710
+	ADDQ R13, BX
1711
+	JMP  copy_2_end
1712
+
1713
+copy_2_move_8through16:
1714
+	MOVQ (R11), R12
1715
+	MOVQ -8(R11)(R13*1), R14
1716
+	MOVQ R12, (BX)
1717
+	MOVQ R14, -8(BX)(R13*1)
1718
+	ADDQ R13, R11
1719
+	ADDQ R13, BX
1720
+
1721
+copy_2_end:
1722
+	JMP handle_loop
1723
+
1724
+	// Copy overlapping match
1725
+copy_overlapping_match:
1726
+	ADDQ R13, DI
1727
+
1728
+copy_slow_3:
1729
+	MOVB (R11), R12
1730
+	MOVB R12, (BX)
1731
+	INCQ R11
1732
+	INCQ BX
1733
+	DECQ R13
1734
+	JNZ  copy_slow_3
1735
+
1736
+handle_loop:
1737
+	ADDQ $0x18, AX // next 24-byte record
1738
+	INCQ DX
1739
+	CMPQ DX, CX
1740
+	JB   main_loop
1741
+
1742
+loop_finished:
1743
+	// Return value
1744
+	MOVB $0x01, ret+8(FP)
1745
+
1746
+	// Update the context
1747
+	MOVQ ctx+0(FP), AX
1748
+	MOVQ DX, 24(AX) // sequence index reached
1749
+	MOVQ DI, 104(AX) // output position reached
1750
+	MOVQ 80(AX), CX
1751
+	SUBQ CX, SI
1752
+	MOVQ SI, 112(AX) // number of literal bytes consumed (read pointer minus the literals base at ctx+80)
1753
+	RET
1754
+
1755
+error_match_off_too_big:
1756
+	// Return value
1757
+	MOVB $0x00, ret+8(FP)
1758
+
1759
+	// Update the context (same fields as on success, so the caller can see where execution stopped)
1760
+	MOVQ ctx+0(FP), AX
1761
+	MOVQ DX, 24(AX)
1762
+	MOVQ DI, 104(AX)
1763
+	MOVQ 80(AX), CX
1764
+	SUBQ CX, SI
1765
+	MOVQ SI, 112(AX)
1766
+	RET
1767
+
1768
+empty_seqs:
1769
+	// Return value (no sequences: report success without touching the context)
1770
+	MOVB $0x01, ret+8(FP)
1771
+	RET
1772
+
1773
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
1774
+// Requires: CMOV, SSE
1775
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
1776
+	MOVQ    br+8(FP), AX
1777
+	MOVQ    32(AX), DX
1778
+	MOVBQZX 40(AX), BX
1779
+	MOVQ    24(AX), SI
1780
+	MOVQ    (AX), AX
1781
+	ADDQ    SI, AX
1782
+	MOVQ    AX, (SP)
1783
+	MOVQ    ctx+16(FP), AX
1784
+	MOVQ    72(AX), DI
1785
+	MOVQ    80(AX), R8
1786
+	MOVQ    88(AX), R9
1787
+	XORQ    CX, CX
1788
+	MOVQ    CX, 8(SP)
1789
+	MOVQ    CX, 16(SP)
1790
+	MOVQ    CX, 24(SP)
1791
+	MOVQ    112(AX), R10
1792
+	MOVQ    128(AX), CX
1793
+	MOVQ    CX, 32(SP)
1794
+	MOVQ    144(AX), R11
1795
+	MOVQ    136(AX), R12
1796
+	MOVQ    200(AX), CX
1797
+	MOVQ    CX, 56(SP)
1798
+	MOVQ    176(AX), CX
1799
+	MOVQ    CX, 48(SP)
1800
+	MOVQ    184(AX), AX
1801
+	MOVQ    AX, 40(SP)
1802
+	MOVQ    40(SP), AX
1803
+	ADDQ    AX, 48(SP)
1804
+
1805
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
1806
+	ADDQ R10, 32(SP)
1807
+
1808
+	// outBase += outPosition
1809
+	ADDQ R12, R10
1810
+
1811
+sequenceDecs_decodeSync_amd64_main_loop:
1812
+	MOVQ (SP), R13
1813
+
1814
+	// Fill bitreader to have enough for the offset and match length.
1815
+	CMPQ SI, $0x08
1816
+	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1817
+	MOVQ BX, AX
1818
+	SHRQ $0x03, AX
1819
+	SUBQ AX, R13
1820
+	MOVQ (R13), DX
1821
+	SUBQ AX, SI
1822
+	ANDQ $0x07, BX
1823
+	JMP  sequenceDecs_decodeSync_amd64_fill_end
1824
+
1825
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
1826
+	CMPQ    SI, $0x00
1827
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
1828
+	CMPQ    BX, $0x07
1829
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
1830
+	SHLQ    $0x08, DX
1831
+	SUBQ    $0x01, R13
1832
+	SUBQ    $0x01, SI
1833
+	SUBQ    $0x08, BX
1834
+	MOVBQZX (R13), AX
1835
+	ORQ     AX, DX
1836
+	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
1837
+
1838
+sequenceDecs_decodeSync_amd64_fill_end:
1839
+	// Update offset
1840
+	MOVQ  R9, AX
1841
+	MOVQ  BX, CX
1842
+	MOVQ  DX, R14
1843
+	SHLQ  CL, R14
1844
+	MOVB  AH, CL
1845
+	SHRQ  $0x20, AX
1846
+	TESTQ CX, CX
1847
+	JZ    sequenceDecs_decodeSync_amd64_of_update_zero
1848
+	ADDQ  CX, BX
1849
+	CMPQ  BX, $0x40
1850
+	JA    sequenceDecs_decodeSync_amd64_of_update_zero
1851
+	CMPQ  CX, $0x40
1852
+	JAE   sequenceDecs_decodeSync_amd64_of_update_zero
1853
+	NEGQ  CX
1854
+	SHRQ  CL, R14
1855
+	ADDQ  R14, AX
1856
+
1857
+sequenceDecs_decodeSync_amd64_of_update_zero:
1858
+	MOVQ AX, 8(SP)
1859
+
1860
+	// Update match length
1861
+	MOVQ  R8, AX
1862
+	MOVQ  BX, CX
1863
+	MOVQ  DX, R14
1864
+	SHLQ  CL, R14
1865
+	MOVB  AH, CL
1866
+	SHRQ  $0x20, AX
1867
+	TESTQ CX, CX
1868
+	JZ    sequenceDecs_decodeSync_amd64_ml_update_zero
1869
+	ADDQ  CX, BX
1870
+	CMPQ  BX, $0x40
1871
+	JA    sequenceDecs_decodeSync_amd64_ml_update_zero
1872
+	CMPQ  CX, $0x40
1873
+	JAE   sequenceDecs_decodeSync_amd64_ml_update_zero
1874
+	NEGQ  CX
1875
+	SHRQ  CL, R14
1876
+	ADDQ  R14, AX
1877
+
1878
+sequenceDecs_decodeSync_amd64_ml_update_zero:
1879
+	MOVQ AX, 16(SP)
1880
+
1881
+	// Fill bitreader to have enough for the remaining
1882
+	CMPQ SI, $0x08
1883
+	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1884
+	MOVQ BX, AX
1885
+	SHRQ $0x03, AX
1886
+	SUBQ AX, R13
1887
+	MOVQ (R13), DX
1888
+	SUBQ AX, SI
1889
+	ANDQ $0x07, BX
1890
+	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
1891
+
1892
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
1893
+	CMPQ    SI, $0x00
1894
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
1895
+	CMPQ    BX, $0x07
1896
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
1897
+	SHLQ    $0x08, DX
1898
+	SUBQ    $0x01, R13
1899
+	SUBQ    $0x01, SI
1900
+	SUBQ    $0x08, BX
1901
+	MOVBQZX (R13), AX
1902
+	ORQ     AX, DX
1903
+	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
1904
+
1905
+sequenceDecs_decodeSync_amd64_fill_2_end:
1906
+	// Update literal length
1907
+	MOVQ  DI, AX
1908
+	MOVQ  BX, CX
1909
+	MOVQ  DX, R14
1910
+	SHLQ  CL, R14
1911
+	MOVB  AH, CL
1912
+	SHRQ  $0x20, AX
1913
+	TESTQ CX, CX
1914
+	JZ    sequenceDecs_decodeSync_amd64_ll_update_zero
1915
+	ADDQ  CX, BX
1916
+	CMPQ  BX, $0x40
1917
+	JA    sequenceDecs_decodeSync_amd64_ll_update_zero
1918
+	CMPQ  CX, $0x40
1919
+	JAE   sequenceDecs_decodeSync_amd64_ll_update_zero
1920
+	NEGQ  CX
1921
+	SHRQ  CL, R14
1922
+	ADDQ  R14, AX
1923
+
1924
+sequenceDecs_decodeSync_amd64_ll_update_zero:
1925
+	MOVQ AX, 24(SP)
1926
+
1927
+	// Fill bitreader for state updates
1928
+	MOVQ    R13, (SP)
1929
+	MOVQ    R9, AX
1930
+	SHRQ    $0x08, AX
1931
+	MOVBQZX AL, AX
1932
+	MOVQ    ctx+16(FP), CX
1933
+	CMPQ    96(CX), $0x00
1934
+	JZ      sequenceDecs_decodeSync_amd64_skip_update
1935
+
1936
+	// Update Literal Length State
1937
+	MOVBQZX DI, R13
1938
+	SHRQ    $0x10, DI
1939
+	MOVWQZX DI, DI
1940
+	LEAQ    (BX)(R13*1), CX
1941
+	MOVQ    DX, R14
1942
+	MOVQ    CX, BX
1943
+	ROLQ    CL, R14
1944
+	MOVL    $0x00000001, R15
1945
+	MOVB    R13, CL
1946
+	SHLL    CL, R15
1947
+	DECL    R15
1948
+	ANDQ    R15, R14
1949
+	ADDQ    R14, DI
1950
+
1951
+	// Load ctx.llTable
1952
+	MOVQ ctx+16(FP), CX
1953
+	MOVQ (CX), CX
1954
+	MOVQ (CX)(DI*8), DI
1955
+
1956
+	// Update Match Length State
1957
+	MOVBQZX R8, R13
1958
+	SHRQ    $0x10, R8
1959
+	MOVWQZX R8, R8
1960
+	LEAQ    (BX)(R13*1), CX
1961
+	MOVQ    DX, R14
1962
+	MOVQ    CX, BX
1963
+	ROLQ    CL, R14
1964
+	MOVL    $0x00000001, R15
1965
+	MOVB    R13, CL
1966
+	SHLL    CL, R15
1967
+	DECL    R15
1968
+	ANDQ    R15, R14
1969
+	ADDQ    R14, R8
1970
+
1971
+	// Load ctx.mlTable
1972
+	MOVQ ctx+16(FP), CX
1973
+	MOVQ 24(CX), CX
1974
+	MOVQ (CX)(R8*8), R8
1975
+
1976
+	// Update Offset State
1977
+	MOVBQZX R9, R13
1978
+	SHRQ    $0x10, R9
1979
+	MOVWQZX R9, R9
1980
+	LEAQ    (BX)(R13*1), CX
1981
+	MOVQ    DX, R14
1982
+	MOVQ    CX, BX
1983
+	ROLQ    CL, R14
1984
+	MOVL    $0x00000001, R15
1985
+	MOVB    R13, CL
1986
+	SHLL    CL, R15
1987
+	DECL    R15
1988
+	ANDQ    R15, R14
1989
+	ADDQ    R14, R9
1990
+
1991
+	// Load ctx.ofTable
1992
+	MOVQ ctx+16(FP), CX
1993
+	MOVQ 48(CX), CX
1994
+	MOVQ (CX)(R9*8), R9
1995
+
1996
+sequenceDecs_decodeSync_amd64_skip_update:
1997
+	// Adjust offset
1998
+	MOVQ   s+0(FP), CX
1999
+	MOVQ   8(SP), R13
2000
+	CMPQ   AX, $0x01
2001
+	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
2002
+	MOVUPS 144(CX), X0
2003
+	MOVQ   R13, 144(CX)
2004
+	MOVUPS X0, 152(CX)
2005
+	JMP    sequenceDecs_decodeSync_amd64_after_adjust
2006
+
2007
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
2008
+	CMPQ 24(SP), $0x00000000
2009
+	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
2010
+	INCQ R13
2011
+	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2012
+
2013
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
2014
+	TESTQ R13, R13
2015
+	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
2016
+	MOVQ  144(CX), R13
2017
+	JMP   sequenceDecs_decodeSync_amd64_after_adjust
2018
+
2019
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
2020
+	MOVQ    R13, AX
2021
+	XORQ    R14, R14
2022
+	MOVQ    $-1, R15
2023
+	CMPQ    R13, $0x03
2024
+	CMOVQEQ R14, AX
2025
+	CMOVQEQ R15, R14
2026
+	ADDQ    144(CX)(AX*8), R14
2027
+	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
2028
+	MOVQ    $0x00000001, R14
2029
+
2030
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
2031
+	CMPQ R13, $0x01
2032
+	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
2033
+	MOVQ 152(CX), AX
2034
+	MOVQ AX, 160(CX)
2035
+
2036
+sequenceDecs_decodeSync_amd64_adjust_skip:
2037
+	MOVQ 144(CX), AX
2038
+	MOVQ AX, 152(CX)
2039
+	MOVQ R14, 144(CX)
2040
+	MOVQ R14, R13
2041
+
2042
+sequenceDecs_decodeSync_amd64_after_adjust:
2043
+	MOVQ R13, 8(SP)
2044
+
2045
+	// Check values
2046
+	MOVQ  16(SP), AX
2047
+	MOVQ  24(SP), CX
2048
+	LEAQ  (AX)(CX*1), R14
2049
+	MOVQ  s+0(FP), R15
2050
+	ADDQ  R14, 256(R15)
2051
+	MOVQ  ctx+16(FP), R14
2052
+	SUBQ  CX, 104(R14)
2053
+	JS    error_not_enough_literals
2054
+	CMPQ  AX, $0x00020002
2055
+	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
2056
+	TESTQ R13, R13
2057
+	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
2058
+	TESTQ AX, AX
2059
+	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
2060
+
2061
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
2062
+	MOVQ 24(SP), AX
2063
+	MOVQ 8(SP), CX
2064
+	MOVQ 16(SP), R13
2065
+
2066
+	// Check if we have enough space in s.out
2067
+	LEAQ (AX)(R13*1), R14
2068
+	ADDQ R10, R14
2069
+	CMPQ R14, 32(SP)
2070
+	JA   error_not_enough_space
2071
+
2072
+	// Copy literals
2073
+	TESTQ AX, AX
2074
+	JZ    check_offset
2075
+	XORQ  R14, R14
2076
+
2077
+copy_1:
2078
+	MOVUPS (R11)(R14*1), X0
2079
+	MOVUPS X0, (R10)(R14*1)
2080
+	ADDQ   $0x10, R14
2081
+	CMPQ   R14, AX
2082
+	JB     copy_1
2083
+	ADDQ   AX, R11
2084
+	ADDQ   AX, R10
2085
+	ADDQ   AX, R12
2086
+
2087
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
2088
+check_offset:
2089
+	MOVQ R12, AX
2090
+	ADDQ 40(SP), AX
2091
+	CMPQ CX, AX
2092
+	JG   error_match_off_too_big
2093
+	CMPQ CX, 56(SP)
2094
+	JG   error_match_off_too_big
2095
+
2096
+	// Copy match from history
2097
+	MOVQ CX, AX
2098
+	SUBQ R12, AX
2099
+	JLS  copy_match
2100
+	MOVQ 48(SP), R14
2101
+	SUBQ AX, R14
2102
+	CMPQ R13, AX
2103
+	JG   copy_all_from_history
2104
+	MOVQ R13, AX
2105
+	SUBQ $0x10, AX
2106
+	JB   copy_4_small
2107
+
2108
+copy_4_loop:
2109
+	MOVUPS (R14), X0
2110
+	MOVUPS X0, (R10)
2111
+	ADDQ   $0x10, R14
2112
+	ADDQ   $0x10, R10
2113
+	SUBQ   $0x10, AX
2114
+	JAE    copy_4_loop
2115
+	LEAQ   16(R14)(AX*1), R14
2116
+	LEAQ   16(R10)(AX*1), R10
2117
+	MOVUPS -16(R14), X0
2118
+	MOVUPS X0, -16(R10)
2119
+	JMP    copy_4_end
2120
+
2121
+copy_4_small:
2122
+	CMPQ R13, $0x03
2123
+	JE   copy_4_move_3
2124
+	CMPQ R13, $0x08
2125
+	JB   copy_4_move_4through7
2126
+	JMP  copy_4_move_8through16
2127
+
2128
+copy_4_move_3:
2129
+	MOVW (R14), AX
2130
+	MOVB 2(R14), CL
2131
+	MOVW AX, (R10)
2132
+	MOVB CL, 2(R10)
2133
+	ADDQ R13, R14
2134
+	ADDQ R13, R10
2135
+	JMP  copy_4_end
2136
+
2137
+copy_4_move_4through7:
2138
+	MOVL (R14), AX
2139
+	MOVL -4(R14)(R13*1), CX
2140
+	MOVL AX, (R10)
2141
+	MOVL CX, -4(R10)(R13*1)
2142
+	ADDQ R13, R14
2143
+	ADDQ R13, R10
2144
+	JMP  copy_4_end
2145
+
2146
+copy_4_move_8through16:
2147
+	MOVQ (R14), AX
2148
+	MOVQ -8(R14)(R13*1), CX
2149
+	MOVQ AX, (R10)
2150
+	MOVQ CX, -8(R10)(R13*1)
2151
+	ADDQ R13, R14
2152
+	ADDQ R13, R10
2153
+
2154
+copy_4_end:
2155
+	ADDQ R13, R12
2156
+	JMP  handle_loop
2157
+	JMP loop_finished
2158
+
2159
+copy_all_from_history:
2160
+	MOVQ AX, R15
2161
+	SUBQ $0x10, R15
2162
+	JB   copy_5_small
2163
+
2164
+copy_5_loop:
2165
+	MOVUPS (R14), X0
2166
+	MOVUPS X0, (R10)
2167
+	ADDQ   $0x10, R14
2168
+	ADDQ   $0x10, R10
2169
+	SUBQ   $0x10, R15
2170
+	JAE    copy_5_loop
2171
+	LEAQ   16(R14)(R15*1), R14
2172
+	LEAQ   16(R10)(R15*1), R10
2173
+	MOVUPS -16(R14), X0
2174
+	MOVUPS X0, -16(R10)
2175
+	JMP    copy_5_end
2176
+
2177
+copy_5_small:
2178
+	CMPQ AX, $0x03
2179
+	JE   copy_5_move_3
2180
+	JB   copy_5_move_1or2
2181
+	CMPQ AX, $0x08
2182
+	JB   copy_5_move_4through7
2183
+	JMP  copy_5_move_8through16
2184
+
2185
+copy_5_move_1or2:
2186
+	MOVB (R14), R15
2187
+	MOVB -1(R14)(AX*1), BP
2188
+	MOVB R15, (R10)
2189
+	MOVB BP, -1(R10)(AX*1)
2190
+	ADDQ AX, R14
2191
+	ADDQ AX, R10
2192
+	JMP  copy_5_end
2193
+
2194
+copy_5_move_3:
2195
+	MOVW (R14), R15
2196
+	MOVB 2(R14), BP
2197
+	MOVW R15, (R10)
2198
+	MOVB BP, 2(R10)
2199
+	ADDQ AX, R14
2200
+	ADDQ AX, R10
2201
+	JMP  copy_5_end
2202
+
2203
+copy_5_move_4through7:
2204
+	MOVL (R14), R15
2205
+	MOVL -4(R14)(AX*1), BP
2206
+	MOVL R15, (R10)
2207
+	MOVL BP, -4(R10)(AX*1)
2208
+	ADDQ AX, R14
2209
+	ADDQ AX, R10
2210
+	JMP  copy_5_end
2211
+
2212
+copy_5_move_8through16:
2213
+	MOVQ (R14), R15
2214
+	MOVQ -8(R14)(AX*1), BP
2215
+	MOVQ R15, (R10)
2216
+	MOVQ BP, -8(R10)(AX*1)
2217
+	ADDQ AX, R14
2218
+	ADDQ AX, R10
2219
+
2220
+copy_5_end:
2221
+	ADDQ AX, R12
2222
+	SUBQ AX, R13
2223
+
2224
+	// Copy match from the current buffer
2225
+copy_match:
2226
+	MOVQ R10, AX
2227
+	SUBQ CX, AX
2228
+
2229
+	// ml <= mo
2230
+	CMPQ R13, CX
2231
+	JA   copy_overlapping_match
2232
+
2233
+	// Copy non-overlapping match
2234
+	ADDQ R13, R12
2235
+	MOVQ R10, CX
2236
+	ADDQ R13, R10
2237
+
2238
+copy_2:
2239
+	MOVUPS (AX), X0
2240
+	MOVUPS X0, (CX)
2241
+	ADDQ   $0x10, AX
2242
+	ADDQ   $0x10, CX
2243
+	SUBQ   $0x10, R13
2244
+	JHI    copy_2
2245
+	JMP    handle_loop
2246
+
2247
+	// Copy overlapping match
2248
+copy_overlapping_match:
2249
+	ADDQ R13, R12
2250
+
2251
+copy_slow_3:
2252
+	MOVB (AX), CL
2253
+	MOVB CL, (R10)
2254
+	INCQ AX
2255
+	INCQ R10
2256
+	DECQ R13
2257
+	JNZ  copy_slow_3
2258
+
2259
+handle_loop:
2260
+	MOVQ ctx+16(FP), AX
2261
+	DECQ 96(AX)
2262
+	JNS  sequenceDecs_decodeSync_amd64_main_loop
2263
+
2264
+loop_finished:
2265
+	MOVQ br+8(FP), AX
2266
+	MOVQ DX, 32(AX)
2267
+	MOVB BL, 40(AX)
2268
+	MOVQ SI, 24(AX)
2269
+
2270
+	// Update the context
2271
+	MOVQ ctx+16(FP), AX
2272
+	MOVQ R12, 136(AX)
2273
+	MOVQ 144(AX), CX
2274
+	SUBQ CX, R11
2275
+	MOVQ R11, 168(AX)
2276
+
2277
+	// Return success
2278
+	MOVQ $0x00000000, ret+24(FP)
2279
+	RET
2280
+
2281
+	// Return with match length error
2282
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
2283
+	MOVQ 16(SP), AX
2284
+	MOVQ ctx+16(FP), CX
2285
+	MOVQ AX, 216(CX)
2286
+	MOVQ $0x00000001, ret+24(FP)
2287
+	RET
2288
+
2289
+	// Return with match too long error
2290
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
2291
+	MOVQ ctx+16(FP), AX
2292
+	MOVQ 16(SP), CX
2293
+	MOVQ CX, 216(AX)
2294
+	MOVQ $0x00000002, ret+24(FP)
2295
+	RET
2296
+
2297
+	// Return with match offset too long error
2298
+error_match_off_too_big:
2299
+	MOVQ ctx+16(FP), AX
2300
+	MOVQ 8(SP), CX
2301
+	MOVQ CX, 224(AX)
2302
+	MOVQ R12, 136(AX)
2303
+	MOVQ $0x00000003, ret+24(FP)
2304
+	RET
2305
+
2306
+	// Return with not enough literals error
2307
+error_not_enough_literals:
2308
+	MOVQ ctx+16(FP), AX
2309
+	MOVQ 24(SP), CX
2310
+	MOVQ CX, 208(AX)
2311
+	MOVQ $0x00000004, ret+24(FP)
2312
+	RET
2313
+
2314
+	// Return with not enough output space error
2315
+error_not_enough_space:
2316
+	MOVQ ctx+16(FP), AX
2317
+	MOVQ 24(SP), CX
2318
+	MOVQ CX, 208(AX)
2319
+	MOVQ 16(SP), CX
2320
+	MOVQ CX, 216(AX)
2321
+	MOVQ R12, 136(AX)
2322
+	MOVQ $0x00000005, ret+24(FP)
2323
+	RET
2324
+
2325
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2326
+// Requires: BMI, BMI2, CMOV, SSE
2327
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 // decode loop that also copies literals and matches to the output; returns 0 on success or an error code 1-5 (set at the error labels at the end)
2328
+	MOVQ    br+8(FP), CX
2329
+	MOVQ    32(CX), AX // AX = bit buffer (br+32); written back at loop_finished
2330
+	MOVBQZX 40(CX), DX // DX = number of bits already consumed from AX (br+40)
2331
+	MOVQ    24(CX), BX // BX = count of input bytes not yet loaded (br+24)
2332
+	MOVQ    (CX), CX
2333
+	ADDQ    BX, CX
2334
+	MOVQ    CX, (SP) // (SP) = input read pointer; it moves downwards as bytes are consumed
2335
+	MOVQ    ctx+16(FP), CX
2336
+	MOVQ    72(CX), SI // SI = literal length state
2337
+	MOVQ    80(CX), DI // DI = match length state
2338
+	MOVQ    88(CX), R8 // R8 = offset state
2339
+	XORQ    R9, R9
2340
+	MOVQ    R9, 8(SP) // 8(SP) = decoded offset
2341
+	MOVQ    R9, 16(SP) // 16(SP) = decoded match length
2342
+	MOVQ    R9, 24(SP) // 24(SP) = decoded literal length
2343
+	MOVQ    112(CX), R9 // R9 = output base pointer
2344
+	MOVQ    128(CX), R10
2345
+	MOVQ    R10, 32(SP) // 32(SP) = output capacity, turned into an end pointer below
2346
+	MOVQ    144(CX), R10 // R10 = literals source pointer (read by copy_1)
2347
+	MOVQ    136(CX), R11 // R11 = output position
2348
+	MOVQ    200(CX), R12
2349
+	MOVQ    R12, 56(SP) // 56(SP) = window size compared in check_offset
2350
+	MOVQ    176(CX), R12
2351
+	MOVQ    R12, 48(SP)
2352
+	MOVQ    184(CX), CX
2353
+	MOVQ    CX, 40(SP) // 40(SP) = history length used in check_offset
2354
+	MOVQ    40(SP), CX
2355
+	ADDQ    CX, 48(SP) // 48(SP) = value from ctx+176 plus history length: end-of-history pointer
2356
+
2357
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2358
+	ADDQ R9, 32(SP)
2359
+
2360
+	// outBase += outPosition
2361
+	ADDQ R11, R9
2362
+
2363
+sequenceDecs_decodeSync_bmi2_main_loop:
2364
+	MOVQ (SP), R12 // R12 = input read pointer for this iteration
2365
+
2366
+	// Fill bitreader to have enough for the offset and match length.
2367
+	CMPQ BX, $0x08
2368
+	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte // fewer than 8 input bytes left: refill one byte at a time
2369
+	MOVQ DX, CX
2370
+	SHRQ $0x03, CX // CX = whole bytes consumed so far
2371
+	SUBQ CX, R12
2372
+	MOVQ (R12), AX // reload 8 bytes from the new read position
2373
+	SUBQ CX, BX
2374
+	ANDQ $0x07, DX // keep only the sub-byte bit count
2375
+	JMP  sequenceDecs_decodeSync_bmi2_fill_end
2376
+
2377
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
2378
+	CMPQ    BX, $0x00
2379
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end // no input bytes left
2380
+	CMPQ    DX, $0x07
2381
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end // less than a full byte consumed: nothing to refill
2382
+	SHLQ    $0x08, AX // make room for one more input byte in the low 8 bits
2383
+	SUBQ    $0x01, R12
2384
+	SUBQ    $0x01, BX
2385
+	SUBQ    $0x08, DX
2386
+	MOVBQZX (R12), CX
2387
+	ORQ     CX, AX
2388
+	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
2389
+
2390
+sequenceDecs_decodeSync_bmi2_fill_end:
2391
+	// Update offset
2392
+	MOVQ   $0x00000808, CX // BEXTR control word: start bit 8, length 8
2393
+	BEXTRQ CX, R8, R13 // R13 = bits 8-15 of the offset state: number of extra bits to read
2394
+	MOVQ   AX, R14
2395
+	LEAQ   (DX)(R13*1), CX
2396
+	ROLQ   CL, R14
2397
+	BZHIQ  R13, R14, R14 // R14 = the next R13 bits of the bit buffer
2398
+	MOVQ   CX, DX // account for the bits just read
2399
+	MOVQ   R8, CX
2400
+	SHRQ   $0x20, CX // upper 32 bits of the state hold the base value
2401
+	ADDQ   R14, CX
2402
+	MOVQ   CX, 8(SP)
2403
+
2404
+	// Update match length
2405
+	MOVQ   $0x00000808, CX
2406
+	BEXTRQ CX, DI, R13 // same extraction as for the offset, using the match length state
2407
+	MOVQ   AX, R14
2408
+	LEAQ   (DX)(R13*1), CX
2409
+	ROLQ   CL, R14
2410
+	BZHIQ  R13, R14, R14
2411
+	MOVQ   CX, DX
2412
+	MOVQ   DI, CX
2413
+	SHRQ   $0x20, CX
2414
+	ADDQ   R14, CX
2415
+	MOVQ   CX, 16(SP)
2416
+
2417
+	// Fill bitreader to have enough for the remaining
2418
+	CMPQ BX, $0x08
2419
+	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2420
+	MOVQ DX, CX
2421
+	SHRQ $0x03, CX
2422
+	SUBQ CX, R12
2423
+	MOVQ (R12), AX
2424
+	SUBQ CX, BX
2425
+	ANDQ $0x07, DX
2426
+	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
2427
+
2428
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
2429
+	CMPQ    BX, $0x00
2430
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
2431
+	CMPQ    DX, $0x07
2432
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
2433
+	SHLQ    $0x08, AX
2434
+	SUBQ    $0x01, R12
2435
+	SUBQ    $0x01, BX
2436
+	SUBQ    $0x08, DX
2437
+	MOVBQZX (R12), CX
2438
+	ORQ     CX, AX
2439
+	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
2440
+
2441
+sequenceDecs_decodeSync_bmi2_fill_2_end:
2442
+	// Update literal length
2443
+	MOVQ   $0x00000808, CX
2444
+	BEXTRQ CX, SI, R13 // same extraction as for the offset, using the literal length state
2445
+	MOVQ   AX, R14
2446
+	LEAQ   (DX)(R13*1), CX
2447
+	ROLQ   CL, R14
2448
+	BZHIQ  R13, R14, R14
2449
+	MOVQ   CX, DX
2450
+	MOVQ   SI, CX
2451
+	SHRQ   $0x20, CX
2452
+	ADDQ   R14, CX
2453
+	MOVQ   CX, 24(SP)
2454
+
2455
+	// Fill bitreader for state updates
2456
+	MOVQ    R12, (SP) // save the advanced read pointer for the next iteration
2457
+	MOVQ    $0x00000808, CX
2458
+	BEXTRQ  CX, R8, R12 // R12 = extra-bit count of the current offset state; tested under "Adjust offset"
2459
+	MOVQ    ctx+16(FP), CX
2460
+	CMPQ    96(CX), $0x00
2461
+	JZ      sequenceDecs_decodeSync_bmi2_skip_update // loop counter at ctx+96 is zero: this is the last iteration, no state update
2462
+	LEAQ    (SI)(DI*1), R13
2463
+	ADDQ    R8, R13
2464
+	MOVBQZX R13, R13 // low byte of SI+DI+R8: combined bit count of the three state updates
2465
+	LEAQ    (DX)(R13*1), CX
2466
+	MOVQ    AX, R14
2467
+	MOVQ    CX, DX
2468
+	ROLQ    CL, R14
2469
+	BZHIQ   R13, R14, R14 // R14 = bits for all three state updates, read in one go
2470
+
2471
+	// Update Offset State
2472
+	BZHIQ  R8, R14, CX // CX = low bits of R14; the count comes from the low byte of R8
2473
+	SHRXQ  R8, R14, R14 // drop the bits just taken
2474
+	MOVQ   $0x00001010, R13 // BEXTR control word: start bit 16, length 16
2475
+	BEXTRQ R13, R8, R8
2476
+	ADDQ   CX, R8 // R8 = index of the next table entry
2477
+
2478
+	// Load ctx.ofTable
2479
+	MOVQ ctx+16(FP), CX
2480
+	MOVQ 48(CX), CX
2481
+	MOVQ (CX)(R8*8), R8 // table entries are 8 bytes wide
2482
+
2483
+	// Update Match Length State
2484
+	BZHIQ  DI, R14, CX
2485
+	SHRXQ  DI, R14, R14
2486
+	MOVQ   $0x00001010, R13
2487
+	BEXTRQ R13, DI, DI
2488
+	ADDQ   CX, DI
2489
+
2490
+	// Load ctx.mlTable
2491
+	MOVQ ctx+16(FP), CX
2492
+	MOVQ 24(CX), CX
2493
+	MOVQ (CX)(DI*8), DI
2494
+
2495
+	// Update Literal Length State
2496
+	BZHIQ  SI, R14, CX // last of the three: no shift of R14 needed afterwards
2497
+	MOVQ   $0x00001010, R13
2498
+	BEXTRQ R13, SI, SI
2499
+	ADDQ   CX, SI
2500
+
2501
+	// Load ctx.llTable
2502
+	MOVQ ctx+16(FP), CX
2503
+	MOVQ (CX), CX
2504
+	MOVQ (CX)(SI*8), SI
2505
+
2506
+sequenceDecs_decodeSync_bmi2_skip_update:
2507
+	// Adjust offset
2508
+	MOVQ   s+0(FP), CX
2509
+	MOVQ   8(SP), R13
2510
+	CMPQ   R12, $0x01
2511
+	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
2512
+	MOVUPS 144(CX), X0 // more than one extra bit: store the new offset at s+144 and move the previous s+144/s+152 values to s+152/s+160
2513
+	MOVQ   R13, 144(CX)
2514
+	MOVUPS X0, 152(CX)
2515
+	JMP    sequenceDecs_decodeSync_bmi2_after_adjust
2516
+
2517
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
2518
+	CMPQ 24(SP), $0x00000000
2519
+	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
2520
+	INCQ R13 // literal length is zero: the offset value is bumped by one
2521
+	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2522
+
2523
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
2524
+	TESTQ R13, R13
2525
+	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
2526
+	MOVQ  144(CX), R13 // offset value 0: use the value at s+144 and leave the stored offsets untouched
2527
+	JMP   sequenceDecs_decodeSync_bmi2_after_adjust
2528
+
2529
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
2530
+	MOVQ    R13, R12
2531
+	XORQ    R14, R14
2532
+	MOVQ    $-1, R15
2533
+	CMPQ    R13, $0x03
2534
+	CMOVQEQ R14, R12 // value 3: index 0 ...
2535
+	CMOVQEQ R15, R14 // ... and the result is that entry minus one
2536
+	ADDQ    144(CX)(R12*8), R14 // R14 = stored offset selected by R12 (plus the -1 adjustment for value 3)
2537
+	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
2538
+	MOVQ    $0x00000001, R14 // a zero result is replaced by 1
2539
+
2540
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
2541
+	CMPQ R13, $0x01
2542
+	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip // value 1: the slot at s+160 keeps its content
2543
+	MOVQ 152(CX), R12
2544
+	MOVQ R12, 160(CX)
2545
+
2546
+sequenceDecs_decodeSync_bmi2_adjust_skip:
2547
+	MOVQ 144(CX), R12
2548
+	MOVQ R12, 152(CX)
2549
+	MOVQ R14, 144(CX)
2550
+	MOVQ R14, R13
2551
+
2552
+sequenceDecs_decodeSync_bmi2_after_adjust:
2553
+	MOVQ R13, 8(SP) // 8(SP) now holds the final match offset
2554
+
2555
+	// Check values
2556
+	MOVQ  16(SP), CX
2557
+	MOVQ  24(SP), R12
2558
+	LEAQ  (CX)(R12*1), R14 // R14 = match length + literal length
2559
+	MOVQ  s+0(FP), R15
2560
+	ADDQ  R14, 256(R15) // accumulate into the counter at s+256
2561
+	MOVQ  ctx+16(FP), R14
2562
+	SUBQ  R12, 104(R14) // subtract the literal length from the count at ctx+104
2563
+	JS    error_not_enough_literals
2564
+	CMPQ  CX, $0x00020002 // match length limit: 131074
2565
+	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
2566
+	TESTQ R13, R13 // an offset of 0 is only accepted together with a match length of 0
2567
+	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
2568
+	TESTQ CX, CX
2569
+	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
2570
+
2571
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
2572
+	MOVQ 24(SP), CX // CX = literal length
2573
+	MOVQ 8(SP), R12 // R12 = match offset
2574
+	MOVQ 16(SP), R13 // R13 = match length
2575
+
2576
+	// Check if we have enough space in s.out
2577
+	LEAQ (CX)(R13*1), R14
2578
+	ADDQ R9, R14
2579
+	CMPQ R14, 32(SP)
2580
+	JA   error_not_enough_space
2581
+
2582
+	// Copy literals
2583
+	TESTQ CX, CX
2584
+	JZ    check_offset
2585
+	XORQ  R14, R14
2586
+
2587
+copy_1: // copies 16 bytes per step, so up to 15 bytes beyond the literal length are read and written
2588
+	MOVUPS (R10)(R14*1), X0
2589
+	MOVUPS X0, (R9)(R14*1)
2590
+	ADDQ   $0x10, R14
2591
+	CMPQ   R14, CX
2592
+	JB     copy_1
2593
+	ADDQ   CX, R10 // pointers advance by the exact literal length only
2594
+	ADDQ   CX, R9
2595
+	ADDQ   CX, R11
2596
+
2597
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
2598
+check_offset:
2599
+	MOVQ R11, CX
2600
+	ADDQ 40(SP), CX
2601
+	CMPQ R12, CX
2602
+	JG   error_match_off_too_big
2603
+	CMPQ R12, 56(SP)
2604
+	JG   error_match_off_too_big
2605
+
2606
+	// Copy match from history
2607
+	MOVQ R12, CX
2608
+	SUBQ R11, CX // CX = offset - output position: how far the match reaches back into the history
2609
+	JLS  copy_match // offset <= output position: the whole match comes from the current output
2610
+	MOVQ 48(SP), R14
2611
+	SUBQ CX, R14 // R14 = source pointer, CX bytes before the end of history
2612
+	CMPQ R13, CX
2613
+	JG   copy_all_from_history // match is longer than the part available in history
2614
+	MOVQ R13, CX
2615
+	SUBQ $0x10, CX
2616
+	JB   copy_4_small // fewer than 16 bytes to copy
2617
+
2618
+copy_4_loop:
2619
+	MOVUPS (R14), X0
2620
+	MOVUPS X0, (R9)
2621
+	ADDQ   $0x10, R14
2622
+	ADDQ   $0x10, R9
2623
+	SUBQ   $0x10, CX
2624
+	JAE    copy_4_loop
2625
+	LEAQ   16(R14)(CX*1), R14 // step to the exact end of the copy ...
2626
+	LEAQ   16(R9)(CX*1), R9
2627
+	MOVUPS -16(R14), X0 // ... and copy the final 16 bytes ending there (overlaps the previous chunk)
2628
+	MOVUPS X0, -16(R9)
2629
+	JMP    copy_4_end
2630
+
2631
+copy_4_small: // NOTE(review): no branch for lengths 1-2 here, unlike copy_5_small; presumably the match length is at least 3 at this point - confirm
2632
+	CMPQ R13, $0x03
2633
+	JE   copy_4_move_3
2634
+	CMPQ R13, $0x08
2635
+	JB   copy_4_move_4through7
2636
+	JMP  copy_4_move_8through16
2637
+
2638
+copy_4_move_3:
2639
+	MOVW (R14), CX
2640
+	MOVB 2(R14), R12
2641
+	MOVW CX, (R9)
2642
+	MOVB R12, 2(R9)
2643
+	ADDQ R13, R14
2644
+	ADDQ R13, R9
2645
+	JMP  copy_4_end
2646
+
2647
+copy_4_move_4through7:
2648
+	MOVL (R14), CX // first 4 bytes ...
2649
+	MOVL -4(R14)(R13*1), R12 // ... and last 4 bytes; the two loads may overlap
2650
+	MOVL CX, (R9)
2651
+	MOVL R12, -4(R9)(R13*1)
2652
+	ADDQ R13, R14
2653
+	ADDQ R13, R9
2654
+	JMP  copy_4_end
2655
+
2656
+copy_4_move_8through16:
2657
+	MOVQ (R14), CX // first 8 bytes ...
2658
+	MOVQ -8(R14)(R13*1), R12 // ... and last 8 bytes; the two loads may overlap
2659
+	MOVQ CX, (R9)
2660
+	MOVQ R12, -8(R9)(R13*1)
2661
+	ADDQ R13, R14
2662
+	ADDQ R13, R9
2663
+
2664
+copy_4_end:
2665
+	ADDQ R13, R11
2666
+	JMP  handle_loop
2667
+	JMP loop_finished // unreachable: directly follows an unconditional JMP
2668
+
2669
+copy_all_from_history:
2670
+	MOVQ CX, R15 // CX = number of bytes to take from history
2671
+	SUBQ $0x10, R15
2672
+	JB   copy_5_small
2673
+
2674
+copy_5_loop:
2675
+	MOVUPS (R14), X0
2676
+	MOVUPS X0, (R9)
2677
+	ADDQ   $0x10, R14
2678
+	ADDQ   $0x10, R9
2679
+	SUBQ   $0x10, R15
2680
+	JAE    copy_5_loop
2681
+	LEAQ   16(R14)(R15*1), R14
2682
+	LEAQ   16(R9)(R15*1), R9
2683
+	MOVUPS -16(R14), X0
2684
+	MOVUPS X0, -16(R9)
2685
+	JMP    copy_5_end
2686
+
2687
+copy_5_small:
2688
+	CMPQ CX, $0x03
2689
+	JE   copy_5_move_3
2690
+	JB   copy_5_move_1or2
2691
+	CMPQ CX, $0x08
2692
+	JB   copy_5_move_4through7
2693
+	JMP  copy_5_move_8through16
2694
+
2695
+copy_5_move_1or2:
2696
+	MOVB (R14), R15 // first byte ...
2697
+	MOVB -1(R14)(CX*1), BP // ... and last byte (the same byte when CX is 1)
2698
+	MOVB R15, (R9)
2699
+	MOVB BP, -1(R9)(CX*1)
2700
+	ADDQ CX, R14
2701
+	ADDQ CX, R9
2702
+	JMP  copy_5_end
2703
+
2704
+copy_5_move_3:
2705
+	MOVW (R14), R15
2706
+	MOVB 2(R14), BP
2707
+	MOVW R15, (R9)
2708
+	MOVB BP, 2(R9)
2709
+	ADDQ CX, R14
2710
+	ADDQ CX, R9
2711
+	JMP  copy_5_end
2712
+
2713
+copy_5_move_4through7:
2714
+	MOVL (R14), R15
2715
+	MOVL -4(R14)(CX*1), BP
2716
+	MOVL R15, (R9)
2717
+	MOVL BP, -4(R9)(CX*1)
2718
+	ADDQ CX, R14
2719
+	ADDQ CX, R9
2720
+	JMP  copy_5_end
2721
+
2722
+copy_5_move_8through16:
2723
+	MOVQ (R14), R15
2724
+	MOVQ -8(R14)(CX*1), BP
2725
+	MOVQ R15, (R9)
2726
+	MOVQ BP, -8(R9)(CX*1)
2727
+	ADDQ CX, R14
2728
+	ADDQ CX, R9
2729
+
2730
+copy_5_end:
2731
+	ADDQ CX, R11
2732
+	SUBQ CX, R13 // the remaining match bytes are copied from the current output below
2733
+
2734
+	// Copy match from the current buffer
2735
+copy_match:
2736
+	MOVQ R9, CX
2737
+	SUBQ R12, CX // CX = match source: output pointer minus offset
2738
+
2739
+	// ml <= mo
2740
+	CMPQ R13, R12
2741
+	JA   copy_overlapping_match
2742
+
2743
+	// Copy non-overlapping match
2744
+	ADDQ R13, R11
2745
+	MOVQ R9, R12 // R12 = destination cursor; R9 is advanced by the exact length right away
2746
+	ADDQ R13, R9
2747
+
2748
+copy_2: // 16 bytes per step; the last step may touch bytes beyond the match length
2749
+	MOVUPS (CX), X0
2750
+	MOVUPS X0, (R12)
2751
+	ADDQ   $0x10, CX
2752
+	ADDQ   $0x10, R12
2753
+	SUBQ   $0x10, R13
2754
+	JHI    copy_2
2755
+	JMP    handle_loop
2756
+
2757
+	// Copy overlapping match
2758
+copy_overlapping_match:
2759
+	ADDQ R13, R11
2760
+
2761
+copy_slow_3: // one byte at a time, so bytes written by earlier steps can be read again as source
2762
+	MOVB (CX), R12
2763
+	MOVB R12, (R9)
2764
+	INCQ CX
2765
+	INCQ R9
2766
+	DECQ R13
2767
+	JNZ  copy_slow_3
2768
+
2769
+handle_loop:
2770
+	MOVQ ctx+16(FP), CX
2771
+	DECQ 96(CX)
2772
+	JNS  sequenceDecs_decodeSync_bmi2_main_loop // keep looping while the counter at ctx+96 stays non-negative
2773
+
2774
+loop_finished:
2775
+	MOVQ br+8(FP), CX // write the bit reader state back to br
2776
+	MOVQ AX, 32(CX)
2777
+	MOVB DL, 40(CX)
2778
+	MOVQ BX, 24(CX)
2779
+
2780
+	// Update the context
2781
+	MOVQ ctx+16(FP), AX
2782
+	MOVQ R11, 136(AX) // ctx+136 = final output position
2783
+	MOVQ 144(AX), CX
2784
+	SUBQ CX, R10 // R10 = literals pointer minus its start value at ctx+144
2785
+	MOVQ R10, 168(AX)
2786
+
2787
+	// Return success
2788
+	MOVQ $0x00000000, ret+24(FP)
2789
+	RET
2790
+
2791
+	// Return with match length error
2792
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
2793
+	MOVQ 16(SP), AX
2794
+	MOVQ ctx+16(FP), CX
2795
+	MOVQ AX, 216(CX) // report the match length at ctx+216
2796
+	MOVQ $0x00000001, ret+24(FP) // return code 1
2797
+	RET
2798
+
2799
+	// Return with match too long error
2800
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
2801
+	MOVQ ctx+16(FP), AX
2802
+	MOVQ 16(SP), CX
2803
+	MOVQ CX, 216(AX) // report the match length at ctx+216
2804
+	MOVQ $0x00000002, ret+24(FP) // return code 2
2805
+	RET
2806
+
2807
+	// Return with match offset too long error
2808
+error_match_off_too_big:
2809
+	MOVQ ctx+16(FP), AX
2810
+	MOVQ 8(SP), CX
2811
+	MOVQ CX, 224(AX) // report the match offset at ctx+224
2812
+	MOVQ R11, 136(AX) // and the output position reached so far
2813
+	MOVQ $0x00000003, ret+24(FP) // return code 3
2814
+	RET
2815
+
2816
+	// Return with not enough literals error
2817
+error_not_enough_literals:
2818
+	MOVQ ctx+16(FP), AX
2819
+	MOVQ 24(SP), CX
2820
+	MOVQ CX, 208(AX) // report the literal length at ctx+208
2821
+	MOVQ $0x00000004, ret+24(FP) // return code 4
2822
+	RET
2823
+
2824
+	// Return with not enough output space error
2825
+error_not_enough_space:
2826
+	MOVQ ctx+16(FP), AX
2827
+	MOVQ 24(SP), CX
2828
+	MOVQ CX, 208(AX) // report the literal length at ctx+208
2829
+	MOVQ 16(SP), CX
2830
+	MOVQ CX, 216(AX) // the match length at ctx+216
2831
+	MOVQ R11, 136(AX) // and the output position reached so far
2832
+	MOVQ $0x00000005, ret+24(FP) // return code 5
2833
+	RET
2834
+
2835
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
2836
+// Requires: CMOV, SSE
2837
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
2838
+	MOVQ    br+8(FP), AX
2839
+	MOVQ    32(AX), DX
2840
+	MOVBQZX 40(AX), BX
2841
+	MOVQ    24(AX), SI
2842
+	MOVQ    (AX), AX
2843
+	ADDQ    SI, AX
2844
+	MOVQ    AX, (SP)
2845
+	MOVQ    ctx+16(FP), AX
2846
+	MOVQ    72(AX), DI
2847
+	MOVQ    80(AX), R8
2848
+	MOVQ    88(AX), R9
2849
+	XORQ    CX, CX
2850
+	MOVQ    CX, 8(SP)
2851
+	MOVQ    CX, 16(SP)
2852
+	MOVQ    CX, 24(SP)
2853
+	MOVQ    112(AX), R10
2854
+	MOVQ    128(AX), CX
2855
+	MOVQ    CX, 32(SP)
2856
+	MOVQ    144(AX), R11
2857
+	MOVQ    136(AX), R12
2858
+	MOVQ    200(AX), CX
2859
+	MOVQ    CX, 56(SP)
2860
+	MOVQ    176(AX), CX
2861
+	MOVQ    CX, 48(SP)
2862
+	MOVQ    184(AX), AX
2863
+	MOVQ    AX, 40(SP)
2864
+	MOVQ    40(SP), AX
2865
+	ADDQ    AX, 48(SP)
2866
+
2867
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
2868
+	ADDQ R10, 32(SP)
2869
+
2870
+	// outBase += outPosition
2871
+	ADDQ R12, R10
2872
+
2873
+sequenceDecs_decodeSync_safe_amd64_main_loop:
2874
+	MOVQ (SP), R13
2875
+
2876
+	// Fill bitreader to have enough for the offset and match length.
2877
+	CMPQ SI, $0x08
2878
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2879
+	MOVQ BX, AX
2880
+	SHRQ $0x03, AX
2881
+	SUBQ AX, R13
2882
+	MOVQ (R13), DX
2883
+	SUBQ AX, SI
2884
+	ANDQ $0x07, BX
2885
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
2886
+
2887
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
2888
+	CMPQ    SI, $0x00
2889
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
2890
+	CMPQ    BX, $0x07
2891
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
2892
+	SHLQ    $0x08, DX
2893
+	SUBQ    $0x01, R13
2894
+	SUBQ    $0x01, SI
2895
+	SUBQ    $0x08, BX
2896
+	MOVBQZX (R13), AX
2897
+	ORQ     AX, DX
2898
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
2899
+
2900
+sequenceDecs_decodeSync_safe_amd64_fill_end:
2901
+	// Update offset
2902
+	MOVQ  R9, AX
2903
+	MOVQ  BX, CX
2904
+	MOVQ  DX, R14
2905
+	SHLQ  CL, R14
2906
+	MOVB  AH, CL
2907
+	SHRQ  $0x20, AX
2908
+	TESTQ CX, CX
2909
+	JZ    sequenceDecs_decodeSync_safe_amd64_of_update_zero
2910
+	ADDQ  CX, BX
2911
+	CMPQ  BX, $0x40
2912
+	JA    sequenceDecs_decodeSync_safe_amd64_of_update_zero
2913
+	CMPQ  CX, $0x40
2914
+	JAE   sequenceDecs_decodeSync_safe_amd64_of_update_zero
2915
+	NEGQ  CX
2916
+	SHRQ  CL, R14
2917
+	ADDQ  R14, AX
2918
+
2919
+sequenceDecs_decodeSync_safe_amd64_of_update_zero:
2920
+	MOVQ AX, 8(SP)
2921
+
2922
+	// Update match length
2923
+	MOVQ  R8, AX
2924
+	MOVQ  BX, CX
2925
+	MOVQ  DX, R14
2926
+	SHLQ  CL, R14
2927
+	MOVB  AH, CL
2928
+	SHRQ  $0x20, AX
2929
+	TESTQ CX, CX
2930
+	JZ    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2931
+	ADDQ  CX, BX
2932
+	CMPQ  BX, $0x40
2933
+	JA    sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2934
+	CMPQ  CX, $0x40
2935
+	JAE   sequenceDecs_decodeSync_safe_amd64_ml_update_zero
2936
+	NEGQ  CX
2937
+	SHRQ  CL, R14
2938
+	ADDQ  R14, AX
2939
+
2940
+sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
2941
+	MOVQ AX, 16(SP)
2942
+
2943
+	// Fill bitreader to have enough for the remaining
2944
+	CMPQ SI, $0x08
2945
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
2946
+	MOVQ BX, AX
2947
+	SHRQ $0x03, AX
2948
+	SUBQ AX, R13
2949
+	MOVQ (R13), DX
2950
+	SUBQ AX, SI
2951
+	ANDQ $0x07, BX
2952
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
2953
+
2954
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
2955
+	CMPQ    SI, $0x00
2956
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
2957
+	CMPQ    BX, $0x07
2958
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
2959
+	SHLQ    $0x08, DX
2960
+	SUBQ    $0x01, R13
2961
+	SUBQ    $0x01, SI
2962
+	SUBQ    $0x08, BX
2963
+	MOVBQZX (R13), AX
2964
+	ORQ     AX, DX
2965
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
2966
+
2967
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
2968
+	// Update literal length
2969
+	MOVQ  DI, AX
2970
+	MOVQ  BX, CX
2971
+	MOVQ  DX, R14
2972
+	SHLQ  CL, R14
2973
+	MOVB  AH, CL
2974
+	SHRQ  $0x20, AX
2975
+	TESTQ CX, CX
2976
+	JZ    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
2977
+	ADDQ  CX, BX
2978
+	CMPQ  BX, $0x40
2979
+	JA    sequenceDecs_decodeSync_safe_amd64_ll_update_zero
2980
+	CMPQ  CX, $0x40
2981
+	JAE   sequenceDecs_decodeSync_safe_amd64_ll_update_zero
2982
+	NEGQ  CX
2983
+	SHRQ  CL, R14
2984
+	ADDQ  R14, AX
2985
+
2986
+sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
2987
+	MOVQ AX, 24(SP)
2988
+
2989
+	// Fill bitreader for state updates
2990
+	MOVQ    R13, (SP)
2991
+	MOVQ    R9, AX
2992
+	SHRQ    $0x08, AX
2993
+	MOVBQZX AL, AX
2994
+	MOVQ    ctx+16(FP), CX
2995
+	CMPQ    96(CX), $0x00
2996
+	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
2997
+
2998
+	// Update Literal Length State
2999
+	MOVBQZX DI, R13
3000
+	SHRQ    $0x10, DI
3001
+	MOVWQZX DI, DI
3002
+	LEAQ    (BX)(R13*1), CX
3003
+	MOVQ    DX, R14
3004
+	MOVQ    CX, BX
3005
+	ROLQ    CL, R14
3006
+	MOVL    $0x00000001, R15
3007
+	MOVB    R13, CL
3008
+	SHLL    CL, R15
3009
+	DECL    R15
3010
+	ANDQ    R15, R14
3011
+	ADDQ    R14, DI
3012
+
3013
+	// Load ctx.llTable
3014
+	MOVQ ctx+16(FP), CX
3015
+	MOVQ (CX), CX
3016
+	MOVQ (CX)(DI*8), DI
3017
+
3018
+	// Update Match Length State
3019
+	MOVBQZX R8, R13
3020
+	SHRQ    $0x10, R8
3021
+	MOVWQZX R8, R8
3022
+	LEAQ    (BX)(R13*1), CX
3023
+	MOVQ    DX, R14
3024
+	MOVQ    CX, BX
3025
+	ROLQ    CL, R14
3026
+	MOVL    $0x00000001, R15
3027
+	MOVB    R13, CL
3028
+	SHLL    CL, R15
3029
+	DECL    R15
3030
+	ANDQ    R15, R14
3031
+	ADDQ    R14, R8
3032
+
3033
+	// Load ctx.mlTable
3034
+	MOVQ ctx+16(FP), CX
3035
+	MOVQ 24(CX), CX
3036
+	MOVQ (CX)(R8*8), R8
3037
+
3038
+	// Update Offset State
3039
+	MOVBQZX R9, R13
3040
+	SHRQ    $0x10, R9
3041
+	MOVWQZX R9, R9
3042
+	LEAQ    (BX)(R13*1), CX
3043
+	MOVQ    DX, R14
3044
+	MOVQ    CX, BX
3045
+	ROLQ    CL, R14
3046
+	MOVL    $0x00000001, R15
3047
+	MOVB    R13, CL
3048
+	SHLL    CL, R15
3049
+	DECL    R15
3050
+	ANDQ    R15, R14
3051
+	ADDQ    R14, R9
3052
+
3053
+	// Load ctx.ofTable
3054
+	MOVQ ctx+16(FP), CX
3055
+	MOVQ 48(CX), CX
3056
+	MOVQ (CX)(R9*8), R9
3057
+
3058
+sequenceDecs_decodeSync_safe_amd64_skip_update:
3059
+	// Adjust offset
3060
+	MOVQ   s+0(FP), CX
3061
+	MOVQ   8(SP), R13
3062
+	CMPQ   AX, $0x01
3063
+	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
3064
+	MOVUPS 144(CX), X0
3065
+	MOVQ   R13, 144(CX)
3066
+	MOVUPS X0, 152(CX)
3067
+	JMP    sequenceDecs_decodeSync_safe_amd64_after_adjust
3068
+
3069
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
3070
+	CMPQ 24(SP), $0x00000000
3071
+	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
3072
+	INCQ R13
3073
+	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3074
+
3075
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
3076
+	TESTQ R13, R13
3077
+	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
3078
+	MOVQ  144(CX), R13
3079
+	JMP   sequenceDecs_decodeSync_safe_amd64_after_adjust
3080
+
3081
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
3082
+	MOVQ    R13, AX
3083
+	XORQ    R14, R14
3084
+	MOVQ    $-1, R15
3085
+	CMPQ    R13, $0x03
3086
+	CMOVQEQ R14, AX
3087
+	CMOVQEQ R15, R14
3088
+	ADDQ    144(CX)(AX*8), R14
3089
+	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
3090
+	MOVQ    $0x00000001, R14
3091
+
3092
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
3093
+	CMPQ R13, $0x01
3094
+	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
3095
+	MOVQ 152(CX), AX
3096
+	MOVQ AX, 160(CX)
3097
+
3098
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
3099
+	MOVQ 144(CX), AX
3100
+	MOVQ AX, 152(CX)
3101
+	MOVQ R14, 144(CX)
3102
+	MOVQ R14, R13
3103
+
3104
+sequenceDecs_decodeSync_safe_amd64_after_adjust:
3105
+	MOVQ R13, 8(SP)
3106
+
3107
+	// Check values
3108
+	MOVQ  16(SP), AX
3109
+	MOVQ  24(SP), CX
3110
+	LEAQ  (AX)(CX*1), R14
3111
+	MOVQ  s+0(FP), R15
3112
+	ADDQ  R14, 256(R15)
3113
+	MOVQ  ctx+16(FP), R14
3114
+	SUBQ  CX, 104(R14)
3115
+	JS    error_not_enough_literals
3116
+	CMPQ  AX, $0x00020002
3117
+	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
3118
+	TESTQ R13, R13
3119
+	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
3120
+	TESTQ AX, AX
3121
+	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
3122
+
3123
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
3124
+	MOVQ 24(SP), AX
3125
+	MOVQ 8(SP), CX
3126
+	MOVQ 16(SP), R13
3127
+
3128
+	// Check if we have enough space in s.out
3129
+	LEAQ (AX)(R13*1), R14
3130
+	ADDQ R10, R14
3131
+	CMPQ R14, 32(SP)
3132
+	JA   error_not_enough_space
3133
+
3134
+	// Copy literals
3135
+	TESTQ AX, AX
3136
+	JZ    check_offset
3137
+	MOVQ  AX, R14
3138
+	SUBQ  $0x10, R14
3139
+	JB    copy_1_small
3140
+
3141
+copy_1_loop:
3142
+	MOVUPS (R11), X0
3143
+	MOVUPS X0, (R10)
3144
+	ADDQ   $0x10, R11
3145
+	ADDQ   $0x10, R10
3146
+	SUBQ   $0x10, R14
3147
+	JAE    copy_1_loop
3148
+	LEAQ   16(R11)(R14*1), R11
3149
+	LEAQ   16(R10)(R14*1), R10
3150
+	MOVUPS -16(R11), X0
3151
+	MOVUPS X0, -16(R10)
3152
+	JMP    copy_1_end
3153
+
3154
+copy_1_small:
3155
+	CMPQ AX, $0x03
3156
+	JE   copy_1_move_3
3157
+	JB   copy_1_move_1or2
3158
+	CMPQ AX, $0x08
3159
+	JB   copy_1_move_4through7
3160
+	JMP  copy_1_move_8through16
3161
+
3162
+copy_1_move_1or2:
3163
+	MOVB (R11), R14
3164
+	MOVB -1(R11)(AX*1), R15
3165
+	MOVB R14, (R10)
3166
+	MOVB R15, -1(R10)(AX*1)
3167
+	ADDQ AX, R11
3168
+	ADDQ AX, R10
3169
+	JMP  copy_1_end
3170
+
3171
+copy_1_move_3:
3172
+	MOVW (R11), R14
3173
+	MOVB 2(R11), R15
3174
+	MOVW R14, (R10)
3175
+	MOVB R15, 2(R10)
3176
+	ADDQ AX, R11
3177
+	ADDQ AX, R10
3178
+	JMP  copy_1_end
3179
+
3180
+copy_1_move_4through7:
3181
+	MOVL (R11), R14
3182
+	MOVL -4(R11)(AX*1), R15
3183
+	MOVL R14, (R10)
3184
+	MOVL R15, -4(R10)(AX*1)
3185
+	ADDQ AX, R11
3186
+	ADDQ AX, R10
3187
+	JMP  copy_1_end
3188
+
3189
+copy_1_move_8through16:
3190
+	MOVQ (R11), R14
3191
+	MOVQ -8(R11)(AX*1), R15
3192
+	MOVQ R14, (R10)
3193
+	MOVQ R15, -8(R10)(AX*1)
3194
+	ADDQ AX, R11
3195
+	ADDQ AX, R10
3196
+
3197
+copy_1_end:
3198
+	ADDQ AX, R12
3199
+
3200
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
3201
+check_offset:
3202
+	MOVQ R12, AX
3203
+	ADDQ 40(SP), AX
3204
+	CMPQ CX, AX
3205
+	JG   error_match_off_too_big
3206
+	CMPQ CX, 56(SP)
3207
+	JG   error_match_off_too_big
3208
+
3209
+	// Copy match from history
3210
+	MOVQ CX, AX
3211
+	SUBQ R12, AX
3212
+	JLS  copy_match
3213
+	MOVQ 48(SP), R14
3214
+	SUBQ AX, R14
3215
+	CMPQ R13, AX
3216
+	JG   copy_all_from_history
3217
+	MOVQ R13, AX
3218
+	SUBQ $0x10, AX
3219
+	JB   copy_4_small
3220
+
3221
+copy_4_loop:
3222
+	MOVUPS (R14), X0
3223
+	MOVUPS X0, (R10)
3224
+	ADDQ   $0x10, R14
3225
+	ADDQ   $0x10, R10
3226
+	SUBQ   $0x10, AX
3227
+	JAE    copy_4_loop
3228
+	LEAQ   16(R14)(AX*1), R14
3229
+	LEAQ   16(R10)(AX*1), R10
3230
+	MOVUPS -16(R14), X0
3231
+	MOVUPS X0, -16(R10)
3232
+	JMP    copy_4_end
3233
+
3234
+copy_4_small:
3235
+	CMPQ R13, $0x03
3236
+	JE   copy_4_move_3
3237
+	CMPQ R13, $0x08
3238
+	JB   copy_4_move_4through7
3239
+	JMP  copy_4_move_8through16
3240
+
3241
+copy_4_move_3:
3242
+	MOVW (R14), AX
3243
+	MOVB 2(R14), CL
3244
+	MOVW AX, (R10)
3245
+	MOVB CL, 2(R10)
3246
+	ADDQ R13, R14
3247
+	ADDQ R13, R10
3248
+	JMP  copy_4_end
3249
+
3250
+copy_4_move_4through7:
3251
+	MOVL (R14), AX
3252
+	MOVL -4(R14)(R13*1), CX
3253
+	MOVL AX, (R10)
3254
+	MOVL CX, -4(R10)(R13*1)
3255
+	ADDQ R13, R14
3256
+	ADDQ R13, R10
3257
+	JMP  copy_4_end
3258
+
3259
+copy_4_move_8through16:
3260
+	MOVQ (R14), AX
3261
+	MOVQ -8(R14)(R13*1), CX
3262
+	MOVQ AX, (R10)
3263
+	MOVQ CX, -8(R10)(R13*1)
3264
+	ADDQ R13, R14
3265
+	ADDQ R13, R10
3266
+
3267
+copy_4_end:
3268
+	ADDQ R13, R12
3269
+	JMP  handle_loop
3270
+	JMP loop_finished
3271
+
3272
+copy_all_from_history:
3273
+	MOVQ AX, R15
3274
+	SUBQ $0x10, R15
3275
+	JB   copy_5_small
3276
+
3277
+copy_5_loop:
3278
+	MOVUPS (R14), X0
3279
+	MOVUPS X0, (R10)
3280
+	ADDQ   $0x10, R14
3281
+	ADDQ   $0x10, R10
3282
+	SUBQ   $0x10, R15
3283
+	JAE    copy_5_loop
3284
+	LEAQ   16(R14)(R15*1), R14
3285
+	LEAQ   16(R10)(R15*1), R10
3286
+	MOVUPS -16(R14), X0
3287
+	MOVUPS X0, -16(R10)
3288
+	JMP    copy_5_end
3289
+
3290
+copy_5_small:
3291
+	CMPQ AX, $0x03
3292
+	JE   copy_5_move_3
3293
+	JB   copy_5_move_1or2
3294
+	CMPQ AX, $0x08
3295
+	JB   copy_5_move_4through7
3296
+	JMP  copy_5_move_8through16
3297
+
3298
+copy_5_move_1or2:
3299
+	MOVB (R14), R15
3300
+	MOVB -1(R14)(AX*1), BP
3301
+	MOVB R15, (R10)
3302
+	MOVB BP, -1(R10)(AX*1)
3303
+	ADDQ AX, R14
3304
+	ADDQ AX, R10
3305
+	JMP  copy_5_end
3306
+
3307
+copy_5_move_3:
3308
+	MOVW (R14), R15
3309
+	MOVB 2(R14), BP
3310
+	MOVW R15, (R10)
3311
+	MOVB BP, 2(R10)
3312
+	ADDQ AX, R14
3313
+	ADDQ AX, R10
3314
+	JMP  copy_5_end
3315
+
3316
+copy_5_move_4through7:
3317
+	MOVL (R14), R15
3318
+	MOVL -4(R14)(AX*1), BP
3319
+	MOVL R15, (R10)
3320
+	MOVL BP, -4(R10)(AX*1)
3321
+	ADDQ AX, R14
3322
+	ADDQ AX, R10
3323
+	JMP  copy_5_end
3324
+
3325
+copy_5_move_8through16:
3326
+	MOVQ (R14), R15
3327
+	MOVQ -8(R14)(AX*1), BP
3328
+	MOVQ R15, (R10)
3329
+	MOVQ BP, -8(R10)(AX*1)
3330
+	ADDQ AX, R14
3331
+	ADDQ AX, R10
3332
+
3333
+copy_5_end:
3334
+	ADDQ AX, R12
3335
+	SUBQ AX, R13
3336
+
3337
+	// Copy match from the current buffer
3338
+copy_match:
3339
+	MOVQ R10, AX
3340
+	SUBQ CX, AX
3341
+
3342
+	// ml <= mo
3343
+	CMPQ R13, CX
3344
+	JA   copy_overlapping_match
3345
+
3346
+	// Copy non-overlapping match
3347
+	ADDQ R13, R12
3348
+	MOVQ R13, CX
3349
+	SUBQ $0x10, CX
3350
+	JB   copy_2_small
3351
+
3352
+copy_2_loop:
3353
+	MOVUPS (AX), X0
3354
+	MOVUPS X0, (R10)
3355
+	ADDQ   $0x10, AX
3356
+	ADDQ   $0x10, R10
3357
+	SUBQ   $0x10, CX
3358
+	JAE    copy_2_loop
3359
+	LEAQ   16(AX)(CX*1), AX
3360
+	LEAQ   16(R10)(CX*1), R10
3361
+	MOVUPS -16(AX), X0
3362
+	MOVUPS X0, -16(R10)
3363
+	JMP    copy_2_end
3364
+
3365
+copy_2_small:
3366
+	CMPQ R13, $0x03
3367
+	JE   copy_2_move_3
3368
+	JB   copy_2_move_1or2
3369
+	CMPQ R13, $0x08
3370
+	JB   copy_2_move_4through7
3371
+	JMP  copy_2_move_8through16
3372
+
3373
+copy_2_move_1or2:
3374
+	MOVB (AX), CL
3375
+	MOVB -1(AX)(R13*1), R14
3376
+	MOVB CL, (R10)
3377
+	MOVB R14, -1(R10)(R13*1)
3378
+	ADDQ R13, AX
3379
+	ADDQ R13, R10
3380
+	JMP  copy_2_end
3381
+
3382
+copy_2_move_3:
3383
+	MOVW (AX), CX
3384
+	MOVB 2(AX), R14
3385
+	MOVW CX, (R10)
3386
+	MOVB R14, 2(R10)
3387
+	ADDQ R13, AX
3388
+	ADDQ R13, R10
3389
+	JMP  copy_2_end
3390
+
3391
+copy_2_move_4through7:
3392
+	MOVL (AX), CX
3393
+	MOVL -4(AX)(R13*1), R14
3394
+	MOVL CX, (R10)
3395
+	MOVL R14, -4(R10)(R13*1)
3396
+	ADDQ R13, AX
3397
+	ADDQ R13, R10
3398
+	JMP  copy_2_end
3399
+
3400
+copy_2_move_8through16:
3401
+	MOVQ (AX), CX
3402
+	MOVQ -8(AX)(R13*1), R14
3403
+	MOVQ CX, (R10)
3404
+	MOVQ R14, -8(R10)(R13*1)
3405
+	ADDQ R13, AX
3406
+	ADDQ R13, R10
3407
+
3408
+copy_2_end:
3409
+	JMP handle_loop
3410
+
3411
+	// Copy overlapping match
3412
+copy_overlapping_match:
3413
+	ADDQ R13, R12
3414
+
3415
+copy_slow_3:
3416
+	MOVB (AX), CL
3417
+	MOVB CL, (R10)
3418
+	INCQ AX
3419
+	INCQ R10
3420
+	DECQ R13
3421
+	JNZ  copy_slow_3
3422
+
3423
+handle_loop:
3424
+	MOVQ ctx+16(FP), AX
3425
+	DECQ 96(AX)
3426
+	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
3427
+
3428
+loop_finished:
3429
+	MOVQ br+8(FP), AX
3430
+	MOVQ DX, 32(AX)
3431
+	MOVB BL, 40(AX)
3432
+	MOVQ SI, 24(AX)
3433
+
3434
+	// Update the context
3435
+	MOVQ ctx+16(FP), AX
3436
+	MOVQ R12, 136(AX)
3437
+	MOVQ 144(AX), CX
3438
+	SUBQ CX, R11
3439
+	MOVQ R11, 168(AX)
3440
+
3441
+	// Return success
3442
+	MOVQ $0x00000000, ret+24(FP)
3443
+	RET
3444
+
3445
+	// Return with match length error
3446
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
3447
+	MOVQ 16(SP), AX
3448
+	MOVQ ctx+16(FP), CX
3449
+	MOVQ AX, 216(CX)
3450
+	MOVQ $0x00000001, ret+24(FP)
3451
+	RET
3452
+
3453
+	// Return with match too long error
3454
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
3455
+	MOVQ ctx+16(FP), AX
3456
+	MOVQ 16(SP), CX
3457
+	MOVQ CX, 216(AX)
3458
+	MOVQ $0x00000002, ret+24(FP)
3459
+	RET
3460
+
3461
+	// Return with match offset too long error
3462
+error_match_off_too_big:
3463
+	MOVQ ctx+16(FP), AX
3464
+	MOVQ 8(SP), CX
3465
+	MOVQ CX, 224(AX)
3466
+	MOVQ R12, 136(AX)
3467
+	MOVQ $0x00000003, ret+24(FP)
3468
+	RET
3469
+
3470
+	// Return with not enough literals error
3471
+error_not_enough_literals:
3472
+	MOVQ ctx+16(FP), AX
3473
+	MOVQ 24(SP), CX
3474
+	MOVQ CX, 208(AX)
3475
+	MOVQ $0x00000004, ret+24(FP)
3476
+	RET
3477
+
3478
+	// Return with not enough output space error
3479
+error_not_enough_space:
3480
+	MOVQ ctx+16(FP), AX
3481
+	MOVQ 24(SP), CX
3482
+	MOVQ CX, 208(AX)
3483
+	MOVQ 16(SP), CX
3484
+	MOVQ CX, 216(AX)
3485
+	MOVQ R12, 136(AX)
3486
+	MOVQ $0x00000005, ret+24(FP)
3487
+	RET
3488
+
3489
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
3490
+// Requires: BMI, BMI2, CMOV, SSE
3491
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
3492
+	MOVQ    br+8(FP), CX
3493
+	MOVQ    32(CX), AX
3494
+	MOVBQZX 40(CX), DX
3495
+	MOVQ    24(CX), BX
3496
+	MOVQ    (CX), CX
3497
+	ADDQ    BX, CX
3498
+	MOVQ    CX, (SP)
3499
+	MOVQ    ctx+16(FP), CX
3500
+	MOVQ    72(CX), SI
3501
+	MOVQ    80(CX), DI
3502
+	MOVQ    88(CX), R8
3503
+	XORQ    R9, R9
3504
+	MOVQ    R9, 8(SP)
3505
+	MOVQ    R9, 16(SP)
3506
+	MOVQ    R9, 24(SP)
3507
+	MOVQ    112(CX), R9
3508
+	MOVQ    128(CX), R10
3509
+	MOVQ    R10, 32(SP)
3510
+	MOVQ    144(CX), R10
3511
+	MOVQ    136(CX), R11
3512
+	MOVQ    200(CX), R12
3513
+	MOVQ    R12, 56(SP)
3514
+	MOVQ    176(CX), R12
3515
+	MOVQ    R12, 48(SP)
3516
+	MOVQ    184(CX), CX
3517
+	MOVQ    CX, 40(SP)
3518
+	MOVQ    40(SP), CX
3519
+	ADDQ    CX, 48(SP)
3520
+
3521
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
3522
+	ADDQ R9, 32(SP)
3523
+
3524
+	// outBase += outPosition
3525
+	ADDQ R11, R9
3526
+
3527
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
3528
+	MOVQ (SP), R12
3529
+
3530
+	// Fill bitreader to have enough for the offset and match length.
3531
+	CMPQ BX, $0x08
3532
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3533
+	MOVQ DX, CX
3534
+	SHRQ $0x03, CX
3535
+	SUBQ CX, R12
3536
+	MOVQ (R12), AX
3537
+	SUBQ CX, BX
3538
+	ANDQ $0x07, DX
3539
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
3540
+
3541
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
3542
+	CMPQ    BX, $0x00
3543
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
3544
+	CMPQ    DX, $0x07
3545
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
3546
+	SHLQ    $0x08, AX
3547
+	SUBQ    $0x01, R12
3548
+	SUBQ    $0x01, BX
3549
+	SUBQ    $0x08, DX
3550
+	MOVBQZX (R12), CX
3551
+	ORQ     CX, AX
3552
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
3553
+
3554
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
3555
+	// Update offset
3556
+	MOVQ   $0x00000808, CX
3557
+	BEXTRQ CX, R8, R13
3558
+	MOVQ   AX, R14
3559
+	LEAQ   (DX)(R13*1), CX
3560
+	ROLQ   CL, R14
3561
+	BZHIQ  R13, R14, R14
3562
+	MOVQ   CX, DX
3563
+	MOVQ   R8, CX
3564
+	SHRQ   $0x20, CX
3565
+	ADDQ   R14, CX
3566
+	MOVQ   CX, 8(SP)
3567
+
3568
+	// Update match length
3569
+	MOVQ   $0x00000808, CX
3570
+	BEXTRQ CX, DI, R13
3571
+	MOVQ   AX, R14
3572
+	LEAQ   (DX)(R13*1), CX
3573
+	ROLQ   CL, R14
3574
+	BZHIQ  R13, R14, R14
3575
+	MOVQ   CX, DX
3576
+	MOVQ   DI, CX
3577
+	SHRQ   $0x20, CX
3578
+	ADDQ   R14, CX
3579
+	MOVQ   CX, 16(SP)
3580
+
3581
+	// Fill bitreader to have enough for the remaining
3582
+	CMPQ BX, $0x08
3583
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3584
+	MOVQ DX, CX
3585
+	SHRQ $0x03, CX
3586
+	SUBQ CX, R12
3587
+	MOVQ (R12), AX
3588
+	SUBQ CX, BX
3589
+	ANDQ $0x07, DX
3590
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3591
+
3592
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
3593
+	CMPQ    BX, $0x00
3594
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3595
+	CMPQ    DX, $0x07
3596
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
3597
+	SHLQ    $0x08, AX
3598
+	SUBQ    $0x01, R12
3599
+	SUBQ    $0x01, BX
3600
+	SUBQ    $0x08, DX
3601
+	MOVBQZX (R12), CX
3602
+	ORQ     CX, AX
3603
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
3604
+
3605
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
3606
+	// Update literal length
3607
+	MOVQ   $0x00000808, CX
3608
+	BEXTRQ CX, SI, R13
3609
+	MOVQ   AX, R14
3610
+	LEAQ   (DX)(R13*1), CX
3611
+	ROLQ   CL, R14
3612
+	BZHIQ  R13, R14, R14
3613
+	MOVQ   CX, DX
3614
+	MOVQ   SI, CX
3615
+	SHRQ   $0x20, CX
3616
+	ADDQ   R14, CX
3617
+	MOVQ   CX, 24(SP)
3618
+
3619
+	// Fill bitreader for state updates
3620
+	MOVQ    R12, (SP)
3621
+	MOVQ    $0x00000808, CX
3622
+	BEXTRQ  CX, R8, R12
3623
+	MOVQ    ctx+16(FP), CX
3624
+	CMPQ    96(CX), $0x00
3625
+	JZ      sequenceDecs_decodeSync_safe_bmi2_skip_update
3626
+	LEAQ    (SI)(DI*1), R13
3627
+	ADDQ    R8, R13
3628
+	MOVBQZX R13, R13
3629
+	LEAQ    (DX)(R13*1), CX
3630
+	MOVQ    AX, R14
3631
+	MOVQ    CX, DX
3632
+	ROLQ    CL, R14
3633
+	BZHIQ   R13, R14, R14
3634
+
3635
+	// Update Offset State
3636
+	BZHIQ  R8, R14, CX
3637
+	SHRXQ  R8, R14, R14
3638
+	MOVQ   $0x00001010, R13
3639
+	BEXTRQ R13, R8, R8
3640
+	ADDQ   CX, R8
3641
+
3642
+	// Load ctx.ofTable
3643
+	MOVQ ctx+16(FP), CX
3644
+	MOVQ 48(CX), CX
3645
+	MOVQ (CX)(R8*8), R8
3646
+
3647
+	// Update Match Length State
3648
+	BZHIQ  DI, R14, CX
3649
+	SHRXQ  DI, R14, R14
3650
+	MOVQ   $0x00001010, R13
3651
+	BEXTRQ R13, DI, DI
3652
+	ADDQ   CX, DI
3653
+
3654
+	// Load ctx.mlTable
3655
+	MOVQ ctx+16(FP), CX
3656
+	MOVQ 24(CX), CX
3657
+	MOVQ (CX)(DI*8), DI
3658
+
3659
+	// Update Literal Length State
3660
+	BZHIQ  SI, R14, CX
3661
+	MOVQ   $0x00001010, R13
3662
+	BEXTRQ R13, SI, SI
3663
+	ADDQ   CX, SI
3664
+
3665
+	// Load ctx.llTable
3666
+	MOVQ ctx+16(FP), CX
3667
+	MOVQ (CX), CX
3668
+	MOVQ (CX)(SI*8), SI
3669
+
3670
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
3671
+	// Adjust offset
3672
+	MOVQ   s+0(FP), CX
3673
+	MOVQ   8(SP), R13
3674
+	CMPQ   R12, $0x01
3675
+	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
3676
+	MOVUPS 144(CX), X0
3677
+	MOVQ   R13, 144(CX)
3678
+	MOVUPS X0, 152(CX)
3679
+	JMP    sequenceDecs_decodeSync_safe_bmi2_after_adjust
3680
+
3681
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
3682
+	CMPQ 24(SP), $0x00000000
3683
+	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
3684
+	INCQ R13
3685
+	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3686
+
3687
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
3688
+	TESTQ R13, R13
3689
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
3690
+	MOVQ  144(CX), R13
3691
+	JMP   sequenceDecs_decodeSync_safe_bmi2_after_adjust
3692
+
3693
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
3694
+	MOVQ    R13, R12
3695
+	XORQ    R14, R14
3696
+	MOVQ    $-1, R15
3697
+	CMPQ    R13, $0x03
3698
+	CMOVQEQ R14, R12
3699
+	CMOVQEQ R15, R14
3700
+	ADDQ    144(CX)(R12*8), R14
3701
+	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
3702
+	MOVQ    $0x00000001, R14
3703
+
3704
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
3705
+	CMPQ R13, $0x01
3706
+	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
3707
+	MOVQ 152(CX), R12
3708
+	MOVQ R12, 160(CX)
3709
+
3710
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
3711
+	MOVQ 144(CX), R12
3712
+	MOVQ R12, 152(CX)
3713
+	MOVQ R14, 144(CX)
3714
+	MOVQ R14, R13
3715
+
3716
+sequenceDecs_decodeSync_safe_bmi2_after_adjust:
3717
+	MOVQ R13, 8(SP)
3718
+
3719
+	// Check values
3720
+	MOVQ  16(SP), CX
3721
+	MOVQ  24(SP), R12
3722
+	LEAQ  (CX)(R12*1), R14
3723
+	MOVQ  s+0(FP), R15
3724
+	ADDQ  R14, 256(R15)
3725
+	MOVQ  ctx+16(FP), R14
3726
+	SUBQ  R12, 104(R14)
3727
+	JS    error_not_enough_literals
3728
+	CMPQ  CX, $0x00020002
3729
+	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
3730
+	TESTQ R13, R13
3731
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
3732
+	TESTQ CX, CX
3733
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
3734
+
3735
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
3736
+	MOVQ 24(SP), CX
3737
+	MOVQ 8(SP), R12
3738
+	MOVQ 16(SP), R13
3739
+
3740
+	// Check if we have enough space in s.out
3741
+	LEAQ (CX)(R13*1), R14
3742
+	ADDQ R9, R14
3743
+	CMPQ R14, 32(SP)
3744
+	JA   error_not_enough_space
3745
+
3746
+	// Copy literals
3747
+	TESTQ CX, CX
3748
+	JZ    check_offset
3749
+	MOVQ  CX, R14
3750
+	SUBQ  $0x10, R14
3751
+	JB    copy_1_small
3752
+
3753
+copy_1_loop:
3754
+	MOVUPS (R10), X0
3755
+	MOVUPS X0, (R9)
3756
+	ADDQ   $0x10, R10
3757
+	ADDQ   $0x10, R9
3758
+	SUBQ   $0x10, R14
3759
+	JAE    copy_1_loop
3760
+	LEAQ   16(R10)(R14*1), R10
3761
+	LEAQ   16(R9)(R14*1), R9
3762
+	MOVUPS -16(R10), X0
3763
+	MOVUPS X0, -16(R9)
3764
+	JMP    copy_1_end
3765
+
3766
+copy_1_small:
3767
+	CMPQ CX, $0x03
3768
+	JE   copy_1_move_3
3769
+	JB   copy_1_move_1or2
3770
+	CMPQ CX, $0x08
3771
+	JB   copy_1_move_4through7
3772
+	JMP  copy_1_move_8through16
3773
+
3774
+copy_1_move_1or2:
3775
+	MOVB (R10), R14
3776
+	MOVB -1(R10)(CX*1), R15
3777
+	MOVB R14, (R9)
3778
+	MOVB R15, -1(R9)(CX*1)
3779
+	ADDQ CX, R10
3780
+	ADDQ CX, R9
3781
+	JMP  copy_1_end
3782
+
3783
+copy_1_move_3:
3784
+	MOVW (R10), R14
3785
+	MOVB 2(R10), R15
3786
+	MOVW R14, (R9)
3787
+	MOVB R15, 2(R9)
3788
+	ADDQ CX, R10
3789
+	ADDQ CX, R9
3790
+	JMP  copy_1_end
3791
+
3792
+copy_1_move_4through7:
3793
+	MOVL (R10), R14
3794
+	MOVL -4(R10)(CX*1), R15
3795
+	MOVL R14, (R9)
3796
+	MOVL R15, -4(R9)(CX*1)
3797
+	ADDQ CX, R10
3798
+	ADDQ CX, R9
3799
+	JMP  copy_1_end
3800
+
3801
+copy_1_move_8through16:
3802
+	MOVQ (R10), R14
3803
+	MOVQ -8(R10)(CX*1), R15
3804
+	MOVQ R14, (R9)
3805
+	MOVQ R15, -8(R9)(CX*1)
3806
+	ADDQ CX, R10
3807
+	ADDQ CX, R9
3808
+
3809
+copy_1_end:
3810
+	ADDQ CX, R11
3811
+
3812
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
3813
+check_offset:
3814
+	MOVQ R11, CX
3815
+	ADDQ 40(SP), CX
3816
+	CMPQ R12, CX
3817
+	JG   error_match_off_too_big
3818
+	CMPQ R12, 56(SP)
3819
+	JG   error_match_off_too_big
3820
+
3821
+	// Copy match from history
3822
+	MOVQ R12, CX
3823
+	SUBQ R11, CX
3824
+	JLS  copy_match
3825
+	MOVQ 48(SP), R14
3826
+	SUBQ CX, R14
3827
+	CMPQ R13, CX
3828
+	JG   copy_all_from_history
3829
+	MOVQ R13, CX
3830
+	SUBQ $0x10, CX
3831
+	JB   copy_4_small
3832
+
3833
+copy_4_loop:
3834
+	MOVUPS (R14), X0
3835
+	MOVUPS X0, (R9)
3836
+	ADDQ   $0x10, R14
3837
+	ADDQ   $0x10, R9
3838
+	SUBQ   $0x10, CX
3839
+	JAE    copy_4_loop
3840
+	LEAQ   16(R14)(CX*1), R14
3841
+	LEAQ   16(R9)(CX*1), R9
3842
+	MOVUPS -16(R14), X0
3843
+	MOVUPS X0, -16(R9)
3844
+	JMP    copy_4_end
3845
+
3846
+copy_4_small:
3847
+	CMPQ R13, $0x03
3848
+	JE   copy_4_move_3
3849
+	CMPQ R13, $0x08
3850
+	JB   copy_4_move_4through7
3851
+	JMP  copy_4_move_8through16
3852
+
3853
+copy_4_move_3:
3854
+	MOVW (R14), CX
3855
+	MOVB 2(R14), R12
3856
+	MOVW CX, (R9)
3857
+	MOVB R12, 2(R9)
3858
+	ADDQ R13, R14
3859
+	ADDQ R13, R9
3860
+	JMP  copy_4_end
3861
+
3862
+copy_4_move_4through7:
3863
+	MOVL (R14), CX
3864
+	MOVL -4(R14)(R13*1), R12
3865
+	MOVL CX, (R9)
3866
+	MOVL R12, -4(R9)(R13*1)
3867
+	ADDQ R13, R14
3868
+	ADDQ R13, R9
3869
+	JMP  copy_4_end
3870
+
3871
+copy_4_move_8through16:
3872
+	MOVQ (R14), CX
3873
+	MOVQ -8(R14)(R13*1), R12
3874
+	MOVQ CX, (R9)
3875
+	MOVQ R12, -8(R9)(R13*1)
3876
+	ADDQ R13, R14
3877
+	ADDQ R13, R9
3878
+
3879
+copy_4_end:
3880
+	ADDQ R13, R11
3881
+	JMP  handle_loop
3882
+	JMP loop_finished
3883
+
3884
+copy_all_from_history:
3885
+	MOVQ CX, R15
3886
+	SUBQ $0x10, R15
3887
+	JB   copy_5_small
3888
+
3889
+copy_5_loop:
3890
+	MOVUPS (R14), X0
3891
+	MOVUPS X0, (R9)
3892
+	ADDQ   $0x10, R14
3893
+	ADDQ   $0x10, R9
3894
+	SUBQ   $0x10, R15
3895
+	JAE    copy_5_loop
3896
+	LEAQ   16(R14)(R15*1), R14
3897
+	LEAQ   16(R9)(R15*1), R9
3898
+	MOVUPS -16(R14), X0
3899
+	MOVUPS X0, -16(R9)
3900
+	JMP    copy_5_end
3901
+
3902
+copy_5_small:
3903
+	CMPQ CX, $0x03
3904
+	JE   copy_5_move_3
3905
+	JB   copy_5_move_1or2
3906
+	CMPQ CX, $0x08
3907
+	JB   copy_5_move_4through7
3908
+	JMP  copy_5_move_8through16
3909
+
3910
+copy_5_move_1or2:
3911
+	MOVB (R14), R15
3912
+	MOVB -1(R14)(CX*1), BP
3913
+	MOVB R15, (R9)
3914
+	MOVB BP, -1(R9)(CX*1)
3915
+	ADDQ CX, R14
3916
+	ADDQ CX, R9
3917
+	JMP  copy_5_end
3918
+
3919
+copy_5_move_3:
3920
+	MOVW (R14), R15
3921
+	MOVB 2(R14), BP
3922
+	MOVW R15, (R9)
3923
+	MOVB BP, 2(R9)
3924
+	ADDQ CX, R14
3925
+	ADDQ CX, R9
3926
+	JMP  copy_5_end
3927
+
3928
+copy_5_move_4through7:
3929
+	MOVL (R14), R15
3930
+	MOVL -4(R14)(CX*1), BP
3931
+	MOVL R15, (R9)
3932
+	MOVL BP, -4(R9)(CX*1)
3933
+	ADDQ CX, R14
3934
+	ADDQ CX, R9
3935
+	JMP  copy_5_end
3936
+
3937
+copy_5_move_8through16:
3938
+	MOVQ (R14), R15
3939
+	MOVQ -8(R14)(CX*1), BP
3940
+	MOVQ R15, (R9)
3941
+	MOVQ BP, -8(R9)(CX*1)
3942
+	ADDQ CX, R14
3943
+	ADDQ CX, R9
3944
+
3945
+copy_5_end:
3946
+	ADDQ CX, R11
3947
+	SUBQ CX, R13
3948
+
3949
+	// Copy match from the current buffer
3950
+copy_match:
3951
+	MOVQ R9, CX
3952
+	SUBQ R12, CX
3953
+
3954
+	// ml <= mo
3955
+	CMPQ R13, R12
3956
+	JA   copy_overlapping_match
3957
+
3958
+	// Copy non-overlapping match
3959
+	ADDQ R13, R11
3960
+	MOVQ R13, R12
3961
+	SUBQ $0x10, R12
3962
+	JB   copy_2_small
3963
+
3964
+copy_2_loop:
3965
+	MOVUPS (CX), X0
3966
+	MOVUPS X0, (R9)
3967
+	ADDQ   $0x10, CX
3968
+	ADDQ   $0x10, R9
3969
+	SUBQ   $0x10, R12
3970
+	JAE    copy_2_loop
3971
+	LEAQ   16(CX)(R12*1), CX
3972
+	LEAQ   16(R9)(R12*1), R9
3973
+	MOVUPS -16(CX), X0
3974
+	MOVUPS X0, -16(R9)
3975
+	JMP    copy_2_end
3976
+
3977
+copy_2_small:
3978
+	CMPQ R13, $0x03
3979
+	JE   copy_2_move_3
3980
+	JB   copy_2_move_1or2
3981
+	CMPQ R13, $0x08
3982
+	JB   copy_2_move_4through7
3983
+	JMP  copy_2_move_8through16
3984
+
3985
+copy_2_move_1or2:
3986
+	MOVB (CX), R12
3987
+	MOVB -1(CX)(R13*1), R14
3988
+	MOVB R12, (R9)
3989
+	MOVB R14, -1(R9)(R13*1)
3990
+	ADDQ R13, CX
3991
+	ADDQ R13, R9
3992
+	JMP  copy_2_end
3993
+
3994
+copy_2_move_3:
3995
+	MOVW (CX), R12
3996
+	MOVB 2(CX), R14
3997
+	MOVW R12, (R9)
3998
+	MOVB R14, 2(R9)
3999
+	ADDQ R13, CX
4000
+	ADDQ R13, R9
4001
+	JMP  copy_2_end
4002
+
4003
+copy_2_move_4through7:
4004
+	MOVL (CX), R12
4005
+	MOVL -4(CX)(R13*1), R14
4006
+	MOVL R12, (R9)
4007
+	MOVL R14, -4(R9)(R13*1)
4008
+	ADDQ R13, CX
4009
+	ADDQ R13, R9
4010
+	JMP  copy_2_end
4011
+
4012
+copy_2_move_8through16:
4013
+	MOVQ (CX), R12
4014
+	MOVQ -8(CX)(R13*1), R14
4015
+	MOVQ R12, (R9)
4016
+	MOVQ R14, -8(R9)(R13*1)
4017
+	ADDQ R13, CX
4018
+	ADDQ R13, R9
4019
+
4020
+copy_2_end:
4021
+	JMP handle_loop
4022
+
4023
+	// Copy overlapping match
4024
+copy_overlapping_match:
4025
+	ADDQ R13, R11
4026
+
4027
+copy_slow_3:
4028
+	MOVB (CX), R12
4029
+	MOVB R12, (R9)
4030
+	INCQ CX
4031
+	INCQ R9
4032
+	DECQ R13
4033
+	JNZ  copy_slow_3
4034
+
4035
+handle_loop:
4036
+	MOVQ ctx+16(FP), CX
4037
+	DECQ 96(CX)
4038
+	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
4039
+
4040
+loop_finished:
4041
+	MOVQ br+8(FP), CX
4042
+	MOVQ AX, 32(CX)
4043
+	MOVB DL, 40(CX)
4044
+	MOVQ BX, 24(CX)
4045
+
4046
+	// Update the context
4047
+	MOVQ ctx+16(FP), AX
4048
+	MOVQ R11, 136(AX)
4049
+	MOVQ 144(AX), CX
4050
+	SUBQ CX, R10
4051
+	MOVQ R10, 168(AX)
4052
+
4053
+	// Return success
4054
+	MOVQ $0x00000000, ret+24(FP)
4055
+	RET
4056
+
4057
+	// Return with match length error
4058
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
4059
+	MOVQ 16(SP), AX
4060
+	MOVQ ctx+16(FP), CX
4061
+	MOVQ AX, 216(CX)
4062
+	MOVQ $0x00000001, ret+24(FP)
4063
+	RET
4064
+
4065
+	// Return with match too long error
4066
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
4067
+	MOVQ ctx+16(FP), AX
4068
+	MOVQ 16(SP), CX
4069
+	MOVQ CX, 216(AX)
4070
+	MOVQ $0x00000002, ret+24(FP)
4071
+	RET
4072
+
4073
+	// Return with match offset too long error
4074
+error_match_off_too_big:
4075
+	MOVQ ctx+16(FP), AX
4076
+	MOVQ 8(SP), CX
4077
+	MOVQ CX, 224(AX)
4078
+	MOVQ R11, 136(AX)
4079
+	MOVQ $0x00000003, ret+24(FP)
4080
+	RET
4081
+
4082
+	// Return with not enough literals error
4083
+error_not_enough_literals:
4084
+	MOVQ ctx+16(FP), AX
4085
+	MOVQ 24(SP), CX
4086
+	MOVQ CX, 208(AX)
4087
+	MOVQ $0x00000004, ret+24(FP)
4088
+	RET
4089
+
4090
+	// Return with not enough output space error
4091
+error_not_enough_space:
4092
+	MOVQ ctx+16(FP), AX
4093
+	MOVQ 24(SP), CX
4094
+	MOVQ CX, 208(AX)
4095
+	MOVQ 16(SP), CX
4096
+	MOVQ CX, 216(AX)
4097
+	MOVQ R11, 136(AX)
4098
+	MOVQ $0x00000005, ret+24(FP)
4099
+	RET
0 4100
new file mode 100644
... ...
@@ -0,0 +1,237 @@
0
+//go:build !amd64 || appengine || !gc || noasm
1
+// +build !amd64 appengine !gc noasm
2
+
3
+package zstd
4
+
5
+import (
6
+	"fmt"
7
+	"io"
8
+)
9
+
10
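+// decodeSyncSimple is the entry point for decoding sequences from the stream with the provided history but without dictionary. This build's version is a stub: it decodes nothing and always returns (false, nil).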
11
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
12
+	return false, nil
13
+}
14
+
15
+// decode sequences from the stream without the provided history.
16
+func (s *sequenceDecs) decode(seqs []seqVals) error {
17
+	br := s.br
18
+
19
+	// Grab full-size tables, to avoid bounds checks.
20
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
21
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
22
+	s.seqSize = 0
23
+	litRemain := len(s.literals)
24
+
25
+	maxBlockSize := maxCompressedBlockSize
26
+	if s.windowSize < maxBlockSize {
27
+		maxBlockSize = s.windowSize
28
+	}
29
+	for i := range seqs {
30
+		var ll, mo, ml int
31
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
32
+			// inlined function:
33
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
34
+
35
+			// Final will not read from stream.
36
+			var llB, mlB, moB uint8
37
+			ll, llB = llState.final()
38
+			ml, mlB = mlState.final()
39
+			mo, moB = ofState.final()
40
+
41
+			// extra bits are stored in reverse order.
42
+			br.fillFast()
43
+			mo += br.getBits(moB)
44
+			if s.maxBits > 32 {
45
+				br.fillFast()
46
+			}
47
+			ml += br.getBits(mlB)
48
+			ll += br.getBits(llB)
49
+
50
+			if moB > 1 {
51
+				s.prevOffset[2] = s.prevOffset[1]
52
+				s.prevOffset[1] = s.prevOffset[0]
53
+				s.prevOffset[0] = mo
54
+			} else {
55
+				// mo = s.adjustOffset(mo, ll, moB)
56
+				// Inlined for rather big speedup
57
+				if ll == 0 {
58
+					// There is an exception though, when current sequence's literals_length = 0.
59
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
60
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
61
+					mo++
62
+				}
63
+
64
+				if mo == 0 {
65
+					mo = s.prevOffset[0]
66
+				} else {
67
+					var temp int
68
+					if mo == 3 {
69
+						temp = s.prevOffset[0] - 1
70
+					} else {
71
+						temp = s.prevOffset[mo]
72
+					}
73
+
74
+					if temp == 0 {
75
+						// 0 is not valid; input is corrupted; force offset to 1
76
+						println("WARNING: temp was 0")
77
+						temp = 1
78
+					}
79
+
80
+					if mo != 1 {
81
+						s.prevOffset[2] = s.prevOffset[1]
82
+					}
83
+					s.prevOffset[1] = s.prevOffset[0]
84
+					s.prevOffset[0] = temp
85
+					mo = temp
86
+				}
87
+			}
88
+			br.fillFast()
89
+		} else {
90
+			if br.overread() {
91
+				if debugDecoder {
92
+					printf("reading sequence %d, exceeded available data\n", i)
93
+				}
94
+				return io.ErrUnexpectedEOF
95
+			}
96
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
97
+			br.fill()
98
+		}
99
+
100
+		if debugSequences {
101
+			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
102
+		}
103
+		// Evaluate.
104
+		// We might be doing this async, so do it early.
105
+		if mo == 0 && ml > 0 {
106
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
107
+		}
108
+		if ml > maxMatchLen {
109
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
110
+		}
111
+		s.seqSize += ll + ml
112
+		if s.seqSize > maxBlockSize {
113
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
114
+		}
115
+		litRemain -= ll
116
+		if litRemain < 0 {
117
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
118
+		}
119
+		seqs[i] = seqVals{
120
+			ll: ll,
121
+			ml: ml,
122
+			mo: mo,
123
+		}
124
+		if i == len(seqs)-1 {
125
+			// This is the last sequence, so we shouldn't update state.
126
+			break
127
+		}
128
+
129
+		// Manually inlined, ~ 5-20% faster
130
+		// Update all 3 states at once. Approx 20% faster.
131
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
132
+		if nBits == 0 {
133
+			llState = llTable[llState.newState()&maxTableMask]
134
+			mlState = mlTable[mlState.newState()&maxTableMask]
135
+			ofState = ofTable[ofState.newState()&maxTableMask]
136
+		} else {
137
+			bits := br.get32BitsFast(nBits)
138
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
139
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
140
+
141
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
142
+			lowBits &= bitMask[mlState.nbBits()&15]
143
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
144
+
145
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
146
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
147
+		}
148
+	}
149
+	s.seqSize += litRemain
150
+	if s.seqSize > maxBlockSize {
151
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
152
+	}
153
+	err := br.close()
154
+	if err != nil {
155
+		printf("Closing sequences: %v, %+v\n", err, *br)
156
+	}
157
+	return err
158
+}
159
+
160
+// executeSimple handles cases when a dictionary is not used.
161
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
162
+	// Ensure we have enough output size...
163
+	if len(s.out)+s.seqSize > cap(s.out) {
164
+		addBytes := s.seqSize + len(s.out)
165
+		s.out = append(s.out, make([]byte, addBytes)...)
166
+		s.out = s.out[:len(s.out)-addBytes]
167
+	}
168
+
169
+	if debugDecoder {
170
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
171
+	}
172
+
173
+	var t = len(s.out)
174
+	out := s.out[:t+s.seqSize]
175
+
176
+	for _, seq := range seqs {
177
+		// Add literals
178
+		copy(out[t:], s.literals[:seq.ll])
179
+		t += seq.ll
180
+		s.literals = s.literals[seq.ll:]
181
+
182
+		// Malformed input
183
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
184
+			return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
185
+		}
186
+
187
+		// Copy from history.
188
+		if v := seq.mo - t; v > 0 {
189
+			// v is the start position in history from end.
190
+			start := len(hist) - v
191
+			if seq.ml > v {
192
+				// Some goes into the current block.
193
+				// Copy remainder of history
194
+				copy(out[t:], hist[start:])
195
+				t += v
196
+				seq.ml -= v
197
+			} else {
198
+				copy(out[t:], hist[start:start+seq.ml])
199
+				t += seq.ml
200
+				continue
201
+			}
202
+		}
203
+
204
+		// We must be in the current buffer now
205
+		if seq.ml > 0 {
206
+			start := t - seq.mo
207
+			if seq.ml <= t-start {
208
+				// No overlap
209
+				copy(out[t:], out[start:start+seq.ml])
210
+				t += seq.ml
211
+			} else {
212
+				// Overlapping copy
213
+				// Extend destination slice and copy one byte at a time.
214
+				src := out[start : start+seq.ml]
215
+				dst := out[t:]
216
+				dst = dst[:len(src)]
217
+				t += len(src)
218
+				// Destination is the space we just added.
219
+				for i := range src {
220
+					dst[i] = src[i]
221
+				}
222
+			}
223
+		}
224
+	}
225
+	// Add final literals
226
+	copy(out[t:], s.literals)
227
+	if debugDecoder {
228
+		t += len(s.literals)
229
+		if t != len(out) {
230
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
231
+		}
232
+	}
233
+	s.out = out
234
+
235
+	return nil
236
+}
... ...
@@ -18,26 +18,44 @@ const ZipMethodWinZip = 93
18 18
 // See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
19 19
 const ZipMethodPKWare = 20
20 20
 
21
-var zipReaderPool sync.Pool
21
+// zipReaderPool is the default reader pool.
22
+var zipReaderPool = sync.Pool{New: func() interface{} {
23
+	z, err := NewReader(nil, WithDecoderLowmem(true), WithDecoderMaxWindow(128<<20), WithDecoderConcurrency(1))
24
+	if err != nil {
25
+		panic(err)
26
+	}
27
+	return z
28
+}}
22 29
 
23 30
 // newZipReader creates a pooled zip decompressor.
24
-func newZipReader(r io.Reader) io.ReadCloser {
25
-	dec, ok := zipReaderPool.Get().(*Decoder)
26
-	if ok {
27
-		dec.Reset(r)
28
-	} else {
29
-		d, err := NewReader(r, WithDecoderConcurrency(1), WithDecoderLowmem(true))
30
-		if err != nil {
31
-			panic(err)
31
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
32
+	pool := &zipReaderPool
33
+	if len(opts) > 0 {
34
+		opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
35
+		// Force concurrency 1
36
+		opts = append(opts, WithDecoderConcurrency(1))
37
+		// Create our own pool
38
+		pool = &sync.Pool{}
39
+	}
40
+	return func(r io.Reader) io.ReadCloser {
41
+		dec, ok := pool.Get().(*Decoder)
42
+		if ok {
43
+			dec.Reset(r)
44
+		} else {
45
+			d, err := NewReader(r, opts...)
46
+			if err != nil {
47
+				panic(err)
48
+			}
49
+			dec = d
32 50
 		}
33
-		dec = d
51
+		return &pooledZipReader{dec: dec, pool: pool}
34 52
 	}
35
-	return &pooledZipReader{dec: dec}
36 53
 }
37 54
 
38 55
 type pooledZipReader struct {
39
-	mu  sync.Mutex // guards Close and Read
40
-	dec *Decoder
56
+	mu   sync.Mutex // guards Close and Read
57
+	pool *sync.Pool
58
+	dec  *Decoder
41 59
 }
42 60
 
43 61
 func (r *pooledZipReader) Read(p []byte) (n int, err error) {
... ...
@@ -48,8 +66,8 @@ func (r *pooledZipReader) Read(p []byte) (n int, err error) {
48 48
 	}
49 49
 	dec, err := r.dec.Read(p)
50 50
 	if err == io.EOF {
51
-		err = r.dec.Reset(nil)
52
-		zipReaderPool.Put(r.dec)
51
+		r.dec.Reset(nil)
52
+		r.pool.Put(r.dec)
53 53
 		r.dec = nil
54 54
 	}
55 55
 	return dec, err
... ...
@@ -61,7 +79,7 @@ func (r *pooledZipReader) Close() error {
61 61
 	var err error
62 62
 	if r.dec != nil {
63 63
 		err = r.dec.Reset(nil)
64
-		zipReaderPool.Put(r.dec)
64
+		r.pool.Put(r.dec)
65 65
 		r.dec = nil
66 66
 	}
67 67
 	return err
... ...
@@ -115,6 +133,9 @@ func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
115 115
 
116 116
 // ZipDecompressor returns a decompressor that can be registered with zip libraries.
117 117
 // See ZipCompressor for example.
118
-func ZipDecompressor() func(r io.Reader) io.ReadCloser {
119
-	return newZipReader
118
+// Options can be specified. WithDecoderConcurrency(1) is forced,
119
+// and by default a 128MB maximum decompression window is specified.
120
+// The window size can be overridden if required.
121
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
122
+	return newZipReader(opts...)
120 123
 }
... ...
@@ -110,17 +110,6 @@ func printf(format string, a ...interface{}) {
110 110
 	}
111 111
 }
112 112
 
113
-// matchLenFast does matching, but will not match the last up to 7 bytes.
114
-func matchLenFast(a, b []byte) int {
115
-	endI := len(a) & (math.MaxInt32 - 7)
116
-	for i := 0; i < endI; i += 8 {
117
-		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
118
-			return i + bits.TrailingZeros64(diff)>>3
119
-		}
120
-	}
121
-	return endI
122
-}
123
-
124 113
 // matchLen returns the maximum length.
125 114
 // a must be the shortest of the two.
126 115
 // The function also returns whether all bytes matched.
... ...
@@ -454,11 +454,12 @@ github.com/ishidawataru/sctp
454 454
 # github.com/jmespath/go-jmespath v0.3.0
455 455
 ## explicit; go 1.14
456 456
 github.com/jmespath/go-jmespath
457
-# github.com/klauspost/compress v1.15.1
458
-## explicit; go 1.15
457
+# github.com/klauspost/compress v1.15.9
458
+## explicit; go 1.16
459 459
 github.com/klauspost/compress
460 460
 github.com/klauspost/compress/fse
461 461
 github.com/klauspost/compress/huff0
462
+github.com/klauspost/compress/internal/cpuinfo
462 463
 github.com/klauspost/compress/internal/snapref
463 464
 github.com/klauspost/compress/zstd
464 465
 github.com/klauspost/compress/zstd/internal/xxhash