GitList

Browse code

vendor: github.com/klauspost/compress v1.15.9

various fixes in zstd compression

- https://github.com/klauspost/compress/releases/tag/v1.15.9
- https://github.com/klauspost/compress/releases/tag/v1.15.8
- https://github.com/klauspost/compress/releases/tag/v1.15.7
- https://github.com/klauspost/compress/releases/tag/v1.15.6
- https://github.com/klauspost/compress/releases/tag/v1.15.5
- https://github.com/klauspost/compress/releases/tag/v1.15.4
- https://github.com/klauspost/compress/releases/tag/v1.15.3
- https://github.com/klauspost/compress/releases/tag/v1.15.2

full diff: https://github.com/klauspost/compress/compare/v1.15.1...v1.15.9

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>

Sebastiaan van Stijn authored on 2022/08/26 08:46:28
Showing 47 changed files

vendor.mod index 1a4807c..5cae213 100644
vendor.sum index 45c9cf3..399751f 100644
vendor/github.com/klauspost/compress/.gitignore index b35f844..d31b378 100644
vendor/github.com/klauspost/compress/README.md index 0e2dc11..ad5c63a 100644
vendor/github.com/klauspost/compress/huff0/autogen.go index ff2c69d..0000000
vendor/github.com/klauspost/compress/huff0/bitreader.go index 451160e..504a7be 100644
vendor/github.com/klauspost/compress/huff0/bitwriter.go index 6bce4e8..ec71f7a 100644
vendor/github.com/klauspost/compress/huff0/bytereader.go index 50bcdf6..4dcab8d 100644
vendor/github.com/klauspost/compress/huff0/compress.go index bc95ac6..4d14542 100644
vendor/github.com/klauspost/compress/huff0/decompress.go index 04f6529..c0c48bd 100644
vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s index 0d6cb1a..0000000
vendor/github.com/klauspost/compress/huff0/decompress_8b_amd64.s.in index 6d477a2..0000000
vendor/github.com/klauspost/compress/huff0/decompress_amd64.go index d47f664..9f3e9f7 100644
vendor/github.com/klauspost/compress/huff0/decompress_amd64.s index 2edad3e..dd1a5ae 100644
vendor/github.com/klauspost/compress/huff0/decompress_amd64.s.in index 330d86a..0000000
vendor/github.com/klauspost/compress/huff0/decompress_generic.go index 126b4d6..4f6f37c 100644
vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go index 0000000..3954c51
vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go index 0000000..e802579
vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s index 0000000..4465fbe
vendor/github.com/klauspost/compress/zstd/README.md index e3445ac..beb7fa8 100644
vendor/github.com/klauspost/compress/zstd/bitreader.go index d7cd15b..97299d4 100644
vendor/github.com/klauspost/compress/zstd/bitwriter.go index b366182..78b3c61 100644
vendor/github.com/klauspost/compress/zstd/blockdec.go index 7d567a5..7eed729 100644
vendor/github.com/klauspost/compress/zstd/bytebuf.go index b80191e..2ad0207 100644
vendor/github.com/klauspost/compress/zstd/bytereader.go index 2c4fca1..0e59a24 100644
vendor/github.com/klauspost/compress/zstd/decoder.go index 9fcdaac..d212f47 100644
vendor/github.com/klauspost/compress/zstd/decoder_options.go index fd05c9b..c70e6fa 100644
vendor/github.com/klauspost/compress/zstd/enc_better.go index 602c05e..c769f69 100644
vendor/github.com/klauspost/compress/zstd/enc_dfast.go index d6b3104..7ff0c64 100644
vendor/github.com/klauspost/compress/zstd/encoder.go index dcc987a..7aaaedb 100644
vendor/github.com/klauspost/compress/zstd/encoder_options.go index 44d8dbd..a7c5e1a 100644
vendor/github.com/klauspost/compress/zstd/framedec.go index 11089d2..9568a4b 100644
vendor/github.com/klauspost/compress/zstd/fse_decoder.go index bb3d4fd..2f8860a 100644
vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go index 0000000..c881d28
vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s index 0000000..da32b44
vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go index 0000000..332e51f
vendor/github.com/klauspost/compress/zstd/fse_encoder.go index 5442061..ab26326 100644
vendor/github.com/klauspost/compress/zstd/fuzz.go index 7f2210e..0000000
vendor/github.com/klauspost/compress/zstd/fuzz_none.go index 6811c68..0000000
vendor/github.com/klauspost/compress/zstd/hash.go index cf33f29..5d73c21 100644
vendor/github.com/klauspost/compress/zstd/seqdec.go index 819f146..df04472 100644
vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go index 0000000..7598c10
vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s index 0000000..27e7677
vendor/github.com/klauspost/compress/zstd/seqdec_generic.go index 0000000..c3452bc
vendor/github.com/klauspost/compress/zstd/zip.go index ffffcbc..29c15c8 100644
vendor/github.com/klauspost/compress/zstd/zstd.go index c1c90b4..3eb3f1c 100644
vendor/modules.txt index 674309b..6e7cbb2 100644

@@ -47,7 +47,7 @@ require (
                      	github.com/hashicorp/serf v0.8.5
                      	github.com/imdario/mergo v0.3.12
                      	github.com/ishidawataru/sctp v0.0.0-20210707070123-9a39160e9062
                     -	github.com/klauspost/compress v1.15.1
                     +	github.com/klauspost/compress v1.15.9
                      	github.com/miekg/dns v1.1.27
                      	github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible
                      	github.com/moby/buildkit v0.10.4

@@ -691,8 +691,9 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI
                      github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
                      github.com/klauspost/compress v1.11.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
                      github.com/klauspost/compress v1.11.13/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
                     -github.com/klauspost/compress v1.15.1 h1:y9FcTHGyrebwfP0ZZqFiaxTaiDnUrGkJkI+f583BL1A=
                      github.com/klauspost/compress v1.15.1/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
                     +github.com/klauspost/compress v1.15.9 h1:wKRjX6JRtDdrE9qwa4b/Cip7ACOshUI4smpCQanqjSY=
                     +github.com/klauspost/compress v1.15.9/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
                      github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
                      github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
                      github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=

@@ -23,3 +23,10 @@ _testmain.go
                      *.test
                      *.prof
                      /s2/cmd/_s2sx/sfx-exe
+                    +
                     +# Linux perf files
                     +perf.data
                     +perf.data.old
+                    +
                     +# gdb history
                     +.gdb_history

@@ -17,6 +17,72 @@ This package provides various compression algorithms.
                      # changelog
                     +* July 13, 2022 (v1.15.8)
+                    +
                     +	* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641
                     +	* s2: Add Index header trim/restore https://github.com/klauspost/compress/pull/638
                     +	* zstd: Optimize seqdeq amd64 asm by @greatroar in https://github.com/klauspost/compress/pull/636
                     +	* zstd: Improve decoder memcopy https://github.com/klauspost/compress/pull/637
                     +	* huff0: Pass a single bitReader pointer to asm by @greatroar in https://github.com/klauspost/compress/pull/634
                     +	* zstd: Branchless getBits for amd64 w/o BMI2 by @greatroar in https://github.com/klauspost/compress/pull/640
                     +	* gzhttp: Remove header before writing https://github.com/klauspost/compress/pull/639
+                    +
                     +* June 29, 2022 (v1.15.7)
+                    +
                     +	* s2: Fix absolute forward seeks  https://github.com/klauspost/compress/pull/633
                     +	* zip: Merge upstream  https://github.com/klauspost/compress/pull/631
                     +	* zip: Re-add zip64 fix https://github.com/klauspost/compress/pull/624
                     +	* zstd: translate fseDecoder.buildDtable into asm by @WojciechMula in https://github.com/klauspost/compress/pull/598
                     +	* flate: Faster histograms  https://github.com/klauspost/compress/pull/620
                     +	* deflate: Use compound hcode  https://github.com/klauspost/compress/pull/622
+                    +
                     +* June 3, 2022 (v1.15.6)
                     +	* s2: Improve coding for long, close matches https://github.com/klauspost/compress/pull/613
                     +	* s2c: Add Snappy/S2 stream recompression https://github.com/klauspost/compress/pull/611
                     +	* zstd: Always use configured block size https://github.com/klauspost/compress/pull/605
                     +	* zstd: Fix incorrect hash table placement for dict encoding in default https://github.com/klauspost/compress/pull/606
                     +	* zstd: Apply default config to ZipDecompressor without options https://github.com/klauspost/compress/pull/608
                     +	* gzhttp: Exclude more common archive formats https://github.com/klauspost/compress/pull/612
                     +	* s2: Add ReaderIgnoreCRC https://github.com/klauspost/compress/pull/609
                     +	* s2: Remove sanity load on index creation https://github.com/klauspost/compress/pull/607
                     +	* snappy: Use dedicated function for scoring https://github.com/klauspost/compress/pull/614
                     +	* s2c+s2d: Use official snappy framed extension https://github.com/klauspost/compress/pull/610
+                    +
                     +* May 25, 2022 (v1.15.5)
                     +	* s2: Add concurrent stream decompression https://github.com/klauspost/compress/pull/602
                     +	* s2: Fix final emit oob read crash on amd64 https://github.com/klauspost/compress/pull/601
                     +	* huff0: asm implementation of Decompress1X by @WojciechMula https://github.com/klauspost/compress/pull/596
                     +	* zstd: Use 1 less goroutine for stream decoding https://github.com/klauspost/compress/pull/588
                     +	* zstd: Copy literal in 16 byte blocks when possible https://github.com/klauspost/compress/pull/592
                     +	* zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599
                     +	* zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593
                     +	* huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586
                     +	* flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590
+                    +
+                    +
                     +* May 11, 2022 (v1.15.4)
                     +	* huff0: decompress directly into output by @WojciechMula in [#577](https://github.com/klauspost/compress/pull/577)
                     +	* inflate: Keep dict on stack [#581](https://github.com/klauspost/compress/pull/581)
                     +	* zstd: Faster decoding memcopy in asm [#583](https://github.com/klauspost/compress/pull/583)
                     +	* zstd: Fix ignored crc [#580](https://github.com/klauspost/compress/pull/580)
+                    +
                     +* May 5, 2022 (v1.15.3)
                     +	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
                     +	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+                    +
                     +* Apr 26, 2022 (v1.15.2)
                     +	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
                     +	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
                     +	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
                     +	* Minimum version is Go 1.16, added CI test on 1.18.
+                    +
                     +* Mar 11, 2022 (v1.15.1)
                     +	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
                     +	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
                     +	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
                     +	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
                     +	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+                    +
                      * Mar 3, 2022 (v1.15.0)
                      	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
                      	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
@@ -60,6 +126,9 @@ While the release has been extensively tested, it is recommended to testing when
                      	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
                      	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
                     +<details>
                     +	<summary>See changes to v1.13.x</summary>
+                    +
                      * Aug 30, 2021 (v1.13.5)
                      	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
                      	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
@@ -88,6 +157,8 @@ While the release has been extensively tested, it is recommended to testing when
                      	* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
                      	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
                      	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
                     +</details>
+                    +
                      <details>
                      	<summary>See changes to v1.12.x</summary>

                     deleted file mode 100644
@@ -1,5 +0,0 @@
                     -package huff0
+                    -
                     -//go:generate go run generate.go
                     -//go:generate asmfmt -w decompress_amd64.s
                     -//go:generate asmfmt -w decompress_8b_amd64.s

@@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
                      	return uint16(b.value >> ((64 - n) & 63))
                      }
                     -// peekTopBits(n) is equvialent to peekBitFast(64 - n)
                     -func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
                     -	return uint16(b.value >> n)
                     -}
+                    -
                      func (b *bitReaderShifted) advance(n uint8) {
                      	b.bitsRead += n
                      	b.value <<= n & 63
@@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
                      	}
                      }
                     -// finished returns true if all bits have been read from the bit stream.
                     -func (b *bitReaderShifted) finished() bool {
                     -	return b.off == 0 && b.bitsRead >= 64
                     -}
+                    -
                      func (b *bitReaderShifted) remaining() uint {
                      	return b.off*8 + uint(64-b.bitsRead)
                      }

@@ -5,8 +5,6 @@
                      package huff0
                     -import "fmt"
+                    -
                      // bitWriter will write bits.
                      // First bit will be LSB of the first byte of output.
                      type bitWriter struct {
@@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
                      	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
                      	0xFFFF, 0xFFFF} /* up to 16 bits */
                     -// addBits16NC will add up to 16 bits.
                     -// It will not check if there is space for them,
                     -// so the caller must ensure that it has flushed recently.
                     -func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
                     -	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
                     -	b.nBits += bits
                     -}
+                    -
                      // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
                      // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
                      func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
                      	b.nBits += encA.nBits + encB.nBits
                      }
                     -// addBits16ZeroNC will add up to 16 bits.
                     -// It will not check if there is space for them,
                     -// so the caller must ensure that it has flushed recently.
                     -// This is fastest if bits can be zero.
                     -func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
                     -	if bits == 0 {
                     -		return
                     -	}
                     -	value <<= (16 - bits) & 15
                     -	value >>= (16 - bits) & 15
                     -	b.bitContainer |= uint64(value) << (b.nBits & 63)
                     -	b.nBits += bits
                     -}
+                    -
                     -// flush will flush all pending full bytes.
                     -// There will be at least 56 bits available for writing when this has been called.
                     -// Using flush32 is faster, but leaves less space for writing.
                     -func (b *bitWriter) flush() {
                     -	v := b.nBits >> 3
                     -	switch v {
                     -	case 0:
                     -		return
                     -	case 1:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -		)
                     -		b.bitContainer >>= 1 << 3
                     -	case 2:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -		)
                     -		b.bitContainer >>= 2 << 3
                     -	case 3:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -			byte(b.bitContainer>>16),
                     -		)
                     -		b.bitContainer >>= 3 << 3
                     -	case 4:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -			byte(b.bitContainer>>16),
                     -			byte(b.bitContainer>>24),
                     -		)
                     -		b.bitContainer >>= 4 << 3
                     -	case 5:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -			byte(b.bitContainer>>16),
                     -			byte(b.bitContainer>>24),
                     -			byte(b.bitContainer>>32),
                     -		)
                     -		b.bitContainer >>= 5 << 3
                     -	case 6:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -			byte(b.bitContainer>>16),
                     -			byte(b.bitContainer>>24),
                     -			byte(b.bitContainer>>32),
                     -			byte(b.bitContainer>>40),
                     -		)
                     -		b.bitContainer >>= 6 << 3
                     -	case 7:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -			byte(b.bitContainer>>16),
                     -			byte(b.bitContainer>>24),
                     -			byte(b.bitContainer>>32),
                     -			byte(b.bitContainer>>40),
                     -			byte(b.bitContainer>>48),
                     -		)
                     -		b.bitContainer >>= 7 << 3
                     -	case 8:
                     -		b.out = append(b.out,
                     -			byte(b.bitContainer),
                     -			byte(b.bitContainer>>8),
                     -			byte(b.bitContainer>>16),
                     -			byte(b.bitContainer>>24),
                     -			byte(b.bitContainer>>32),
                     -			byte(b.bitContainer>>40),
                     -			byte(b.bitContainer>>48),
                     -			byte(b.bitContainer>>56),
                     -		)
                     -		b.bitContainer = 0
                     -		b.nBits = 0
                     -		return
                     -	default:
                     -		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
                     -	}
                     -	b.nBits &= 7
                     -}
+                    -
                      // flush32 will flush out, so there are at least 32 bits available for writing.
                      func (b *bitWriter) flush32() {
                      	if b.nBits < 32 {
@@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
                      	b.flushAlign()
                      	return nil
                      }
+                    -
                     -// reset and continue writing by appending to out.
                     -func (b *bitWriter) reset(out []byte) {
                     -	b.bitContainer = 0
                     -	b.nBits = 0
                     -	b.out = out
                     -}

@@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
                      	b.off = 0
                      }
                     -// advance the stream b n bytes.
                     -func (b *byteReader) advance(n uint) {
                     -	b.off += int(n)
                     -}
+                    -
                      // Int32 returns a little endian int32 starting at current offset.
                      func (b byteReader) Int32() int32 {
                      	v3 := int32(b.b[b.off+3])
@@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
                      	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
                      }
                     -// unread returns the unread portion of the input.
                     -func (b byteReader) unread() []byte {
                     -	return b.b[b.off:]
                     -}
+                    -
                      // remain will return the number of bytes remaining.
                      func (b byteReader) remain() int {
                      	return len(b.b) - b.off

@@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
                      	return true
                      }
                     +//lint:ignore U1000 used for debugging
                      func (s *Scratch) validateTable(c cTable) bool {
                      	if len(c) < int(s.symbolLen) {
                      		return false

@@ -11,7 +11,6 @@ import (
                      type dTable struct {
                      	single []dEntrySingle
                     -	double []dEntryDouble
                      }
                      // single-symbols decoding
@@ -19,13 +18,6 @@ type dEntrySingle struct {
                      	entry uint16
                      }
                     -// double-symbols decoding
                     -type dEntryDouble struct {
                     -	seq   [4]byte
                     -	nBits uint8
                     -	len   uint8
                     -}
+                    -
                      // Uses special code for all tables that are < 8 bits.
                      const use8BitTables = true
@@ -35,7 +27,7 @@ const use8BitTables = true
                      // If no Scratch is provided a new one is allocated.
                      // The returned Scratch can be used for encoding or decoding input using this table.
                      func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
                     -	s, err = s.prepare(in)
                     +	s, err = s.prepare(nil)
                      	if err != nil {
                      		return s, nil, err
                      	}
@@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
                      	return &[4][256]byte{}
                      }
                     -// Decompress1X will decompress a 1X encoded stream.
                     -// The cap of the output buffer will be the maximum decompressed size.
                     -// The length of the supplied input must match the end of a block exactly.
                     -func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
                     -	if len(d.dt.single) == 0 {
                     -		return nil, errors.New("no table loaded")
                     -	}
                     -	if use8BitTables && d.actualTableLog <= 8 {
                     -		return d.decompress1X8Bit(dst, src)
                     -	}
                     -	var br bitReaderShifted
                     -	err := br.init(src)
                     -	if err != nil {
                     -		return dst, err
                     -	}
                     -	maxDecodedSize := cap(dst)
                     -	dst = dst[:0]
+                    -
                     -	// Avoid bounds check by always having full sized table.
                     -	const tlSize = 1 << tableLogMax
                     -	const tlMask = tlSize - 1
                     -	dt := d.dt.single[:tlSize]
+                    -
                     -	// Use temp table to avoid bound checks/append penalty.
                     -	bufs := d.buffer()
                     -	buf := &bufs[0]
                     -	var off uint8
+                    -
                     -	for br.off >= 8 {
                     -		br.fillFast()
                     -		v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
                     -		br.advance(uint8(v.entry))
                     -		buf[off+0] = uint8(v.entry >> 8)
+                    -
                     -		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
                     -		br.advance(uint8(v.entry))
                     -		buf[off+1] = uint8(v.entry >> 8)
+                    -
                     -		// Refill
                     -		br.fillFast()
+                    -
                     -		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
                     -		br.advance(uint8(v.entry))
                     -		buf[off+2] = uint8(v.entry >> 8)
+                    -
                     -		v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
                     -		br.advance(uint8(v.entry))
                     -		buf[off+3] = uint8(v.entry >> 8)
+                    -
                     -		off += 4
                     -		if off == 0 {
                     -			if len(dst)+256 > maxDecodedSize {
                     -				br.close()
                     -				d.bufs.Put(bufs)
                     -				return nil, ErrMaxDecodedSizeExceeded
                     -			}
                     -			dst = append(dst, buf[:]...)
                     -		}
                     -	}
+                    -
                     -	if len(dst)+int(off) > maxDecodedSize {
                     -		d.bufs.Put(bufs)
                     -		br.close()
                     -		return nil, ErrMaxDecodedSizeExceeded
                     -	}
                     -	dst = append(dst, buf[:off]...)
+                    -
                     -	// br < 8, so uint8 is fine
                     -	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
                     -	for bitsLeft > 0 {
                     -		br.fill()
                     -		if false && br.bitsRead >= 32 {
                     -			if br.off >= 4 {
                     -				v := br.in[br.off-4:]
                     -				v = v[:4]
                     -				low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
                     -				br.value = (br.value << 32) | uint64(low)
                     -				br.bitsRead -= 32
                     -				br.off -= 4
                     -			} else {
                     -				for br.off > 0 {
                     -					br.value = (br.value << 8) | uint64(br.in[br.off-1])
                     -					br.bitsRead -= 8
                     -					br.off--
                     -				}
                     -			}
                     -		}
                     -		if len(dst) >= maxDecodedSize {
                     -			d.bufs.Put(bufs)
                     -			br.close()
                     -			return nil, ErrMaxDecodedSizeExceeded
                     -		}
                     -		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
                     -		nBits := uint8(v.entry)
                     -		br.advance(nBits)
                     -		bitsLeft -= nBits
                     -		dst = append(dst, uint8(v.entry>>8))
                     -	}
                     -	d.bufs.Put(bufs)
                     -	return dst, br.close()
                     -}
+                    -
                      // decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
                      // The cap of the output buffer will be the maximum decompressed size.
                      // The length of the supplied input must match the end of a block exactly.
@@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
                      	const shift = 56
                      	const tlSize = 1 << 8
                     -	const tlMask = tlSize - 1
                      	single := d.dt.single[:tlSize]
                      	// Use temp table to avoid bound checks/append penalty.

                     deleted file mode 100644
@@ -1,488 +0,0 @@
                     -// +build !appengine
                     -// +build gc
                     -// +build !noasm
+                    -
                     -#include "textflag.h"
                     -#include "funcdata.h"
                     -#include "go_asm.h"
+                    -
                     -#define bufoff      256 // see decompress.go, we're using [4][256]byte table
+                    -
                     -// func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
                     -//	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
                     -TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
                     -#define off             R8
                     -#define buffer          DI
                     -#define table           SI
+                    -
                     -#define br_bits_read    R9
                     -#define br_value        R10
                     -#define br_offset       R11
                     -#define peek_bits       R12
                     -#define exhausted       DX
+                    -
                     -#define br0             R13
                     -#define br1             R14
                     -#define br2             R15
                     -#define br3             BP
+                    -
                     -	MOVQ BP, 0(SP)
+                    -
                     -	XORQ exhausted, exhausted // exhausted = false
                     -	XORQ off, off             // off = 0
+                    -
                     -	MOVBQZX peekBits+32(FP), peek_bits
                     -	MOVQ    buf+40(FP), buffer
                     -	MOVQ    tbl+48(FP), table
+                    -
                     -	MOVQ pbr0+0(FP), br0
                     -	MOVQ pbr1+8(FP), br1
                     -	MOVQ pbr2+16(FP), br2
                     -	MOVQ pbr3+24(FP), br3
+                    -
                     -main_loop:
+                    -
                     -	// const stream = 0
                     -	// br0.fillFast()
                     -	MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
                     -	MOVQ    bitReaderShifted_value(br0), br_value
                     -	MOVQ    bitReaderShifted_off(br0), br_offset
+                    -
                     -	// if b.bitsRead >= 32 {
                     -	CMPQ br_bits_read, $32
                     -	JB   skip_fill0
+                    -
                     -	SUBQ $32, br_bits_read // b.bitsRead -= 32
                     -	SUBQ $4, br_offset     // b.off -= 4
+                    -
                     -	// v := b.in[b.off-4 : b.off]
                     -	// v = v[:4]
                     -	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
                     -	MOVQ bitReaderShifted_in(br0), AX
                     -	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+                    -
                     -	// b.value |= uint64(low) << (b.bitsRead & 63)
                     -	MOVQ br_bits_read, CX
                     -	SHLQ CL, AX
                     -	ORQ  AX, br_value
+                    -
                     -	// exhausted = exhausted || (br0.off < 4)
                     -	CMPQ  br_offset, $4
                     -	SETLT DL
                     -	ORB   DL, DH
+                    -
                     -	// }
                     -skip_fill0:
+                    -
                     -	// val0 := br0.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v0 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br0.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val1 := br0.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v1 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br0.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off] = uint8(v0.entry >> 8)
                     -	// buf[stream][off+1] = uint8(v1.entry >> 8)
                     -	MOVW BX, 0(buffer)(off*1)
+                    -
                     -	// SECOND PART:
                     -	// val2 := br0.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v2 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br0.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val3 := br0.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v3 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br0.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off+2] = uint8(v2.entry >> 8)
                     -	// buf[stream][off+3] = uint8(v3.entry >> 8)
                     -	MOVW BX, 0+2(buffer)(off*1)
+                    -
                     -	// update the bitreader structure
                     -	MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
                     -	MOVQ br_value, bitReaderShifted_value(br0)
                     -	MOVQ br_offset, bitReaderShifted_off(br0)
+                    -
                     -	// const stream = 1
                     -	// br1.fillFast()
                     -	MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
                     -	MOVQ    bitReaderShifted_value(br1), br_value
                     -	MOVQ    bitReaderShifted_off(br1), br_offset
+                    -
                     -	// if b.bitsRead >= 32 {
                     -	CMPQ br_bits_read, $32
                     -	JB   skip_fill1
+                    -
                     -	SUBQ $32, br_bits_read // b.bitsRead -= 32
                     -	SUBQ $4, br_offset     // b.off -= 4
+                    -
                     -	// v := b.in[b.off-4 : b.off]
                     -	// v = v[:4]
                     -	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
                     -	MOVQ bitReaderShifted_in(br1), AX
                     -	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+                    -
                     -	// b.value |= uint64(low) << (b.bitsRead & 63)
                     -	MOVQ br_bits_read, CX
                     -	SHLQ CL, AX
                     -	ORQ  AX, br_value
+                    -
                     -	// exhausted = exhausted || (br1.off < 4)
                     -	CMPQ  br_offset, $4
                     -	SETLT DL
                     -	ORB   DL, DH
+                    -
                     -	// }
                     -skip_fill1:
+                    -
                     -	// val0 := br1.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v0 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br1.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val1 := br1.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v1 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br1.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off] = uint8(v0.entry >> 8)
                     -	// buf[stream][off+1] = uint8(v1.entry >> 8)
                     -	MOVW BX, 256(buffer)(off*1)
+                    -
                     -	// SECOND PART:
                     -	// val2 := br1.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v2 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br1.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val3 := br1.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v3 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br1.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off+2] = uint8(v2.entry >> 8)
                     -	// buf[stream][off+3] = uint8(v3.entry >> 8)
                     -	MOVW BX, 256+2(buffer)(off*1)
+                    -
                     -	// update the bitreader structure
                     -	MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
                     -	MOVQ br_value, bitReaderShifted_value(br1)
                     -	MOVQ br_offset, bitReaderShifted_off(br1)
+                    -
                     -	// const stream = 2
                     -	// br2.fillFast()
                     -	MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
                     -	MOVQ    bitReaderShifted_value(br2), br_value
                     -	MOVQ    bitReaderShifted_off(br2), br_offset
+                    -
                     -	// if b.bitsRead >= 32 {
                     -	CMPQ br_bits_read, $32
                     -	JB   skip_fill2
+                    -
                     -	SUBQ $32, br_bits_read // b.bitsRead -= 32
                     -	SUBQ $4, br_offset     // b.off -= 4
+                    -
                     -	// v := b.in[b.off-4 : b.off]
                     -	// v = v[:4]
                     -	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
                     -	MOVQ bitReaderShifted_in(br2), AX
                     -	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+                    -
                     -	// b.value |= uint64(low) << (b.bitsRead & 63)
                     -	MOVQ br_bits_read, CX
                     -	SHLQ CL, AX
                     -	ORQ  AX, br_value
+                    -
                     -	// exhausted = exhausted || (br2.off < 4)
                     -	CMPQ  br_offset, $4
                     -	SETLT DL
                     -	ORB   DL, DH
+                    -
                     -	// }
                     -skip_fill2:
+                    -
                     -	// val0 := br2.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v0 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br2.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val1 := br2.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v1 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br2.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off] = uint8(v0.entry >> 8)
                     -	// buf[stream][off+1] = uint8(v1.entry >> 8)
                     -	MOVW BX, 512(buffer)(off*1)
+                    -
                     -	// SECOND PART:
                     -	// val2 := br2.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v2 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br2.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val3 := br2.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v3 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br2.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off+2] = uint8(v2.entry >> 8)
                     -	// buf[stream][off+3] = uint8(v3.entry >> 8)
                     -	MOVW BX, 512+2(buffer)(off*1)
+                    -
                     -	// update the bitreader structure
                     -	MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
                     -	MOVQ br_value, bitReaderShifted_value(br2)
                     -	MOVQ br_offset, bitReaderShifted_off(br2)
+                    -
                     -	// const stream = 3
                     -	// br3.fillFast()
                     -	MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
                     -	MOVQ    bitReaderShifted_value(br3), br_value
                     -	MOVQ    bitReaderShifted_off(br3), br_offset
+                    -
                     -	// if b.bitsRead >= 32 {
                     -	CMPQ br_bits_read, $32
                     -	JB   skip_fill3
+                    -
                     -	SUBQ $32, br_bits_read // b.bitsRead -= 32
                     -	SUBQ $4, br_offset     // b.off -= 4
+                    -
                     -	// v := b.in[b.off-4 : b.off]
                     -	// v = v[:4]
                     -	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
                     -	MOVQ bitReaderShifted_in(br3), AX
                     -	MOVL 0(br_offset)(AX*1), AX       // AX = uint32(b.in[b.off:b.off+4])
+                    -
                     -	// b.value |= uint64(low) << (b.bitsRead & 63)
                     -	MOVQ br_bits_read, CX
                     -	SHLQ CL, AX
                     -	ORQ  AX, br_value
+                    -
                     -	// exhausted = exhausted || (br3.off < 4)
                     -	CMPQ  br_offset, $4
                     -	SETLT DL
                     -	ORB   DL, DH
+                    -
                     -	// }
                     -skip_fill3:
+                    -
                     -	// val0 := br3.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v0 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br3.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val1 := br3.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v1 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br3.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off] = uint8(v0.entry >> 8)
                     -	// buf[stream][off+1] = uint8(v1.entry >> 8)
                     -	MOVW BX, 768(buffer)(off*1)
+                    -
                     -	// SECOND PART:
                     -	// val2 := br3.peekTopBits(peekBits)
                     -	MOVQ br_value, AX
                     -	MOVQ peek_bits, CX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v2 := table[val0&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v0
+                    -
                     -	// br3.advance(uint8(v0.entry))
                     -	MOVB    AH, BL           // BL = uint8(v0.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CL, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// val3 := br3.peekTopBits(peekBits)
                     -	MOVQ peek_bits, CX
                     -	MOVQ br_value, AX
                     -	SHRQ CL, AX        // AX = (value >> peek_bits) & mask
+                    -
                     -	// v3 := table[val1&mask]
                     -	MOVW 0(table)(AX*2), AX // AX - v1
+                    -
                     -	// br3.advance(uint8(v1.entry))
                     -	MOVB    AH, BH           // BH = uint8(v1.entry >> 8)
                     -	MOVBQZX AL, CX
                     -	SHLQ    CX, br_value     // value <<= n
                     -	ADDQ    CX, br_bits_read // bits_read += n
+                    -
                     -	// these two writes get coalesced
                     -	// buf[stream][off+2] = uint8(v2.entry >> 8)
                     -	// buf[stream][off+3] = uint8(v3.entry >> 8)
                     -	MOVW BX, 768+2(buffer)(off*1)
+                    -
                     -	// update the bitreader structure
                     -	MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
                     -	MOVQ br_value, bitReaderShifted_value(br3)
                     -	MOVQ br_offset, bitReaderShifted_off(br3)
+                    -
                     -	ADDQ $4, off // off += 4
+                    -
                     -	TESTB DH, DH // any br[i].ofs < 4?
                     -	JNZ   end
+                    -
                     -	CMPQ off, $bufoff
                     -	JL   main_loop
+                    -
                     -end:
                     -	MOVQ 0(SP), BP
+                    -
                     -	MOVB off, ret+56(FP)
                     -	RET
+                    -
                     -#undef off
                     -#undef buffer
                     -#undef table
+                    -
                     -#undef br_bits_read
                     -#undef br_value
                     -#undef br_offset
                     -#undef peek_bits
                     -#undef exhausted
+                    -
                     -#undef br0
                     -#undef br1
                     -#undef br2
                     -#undef br3

@@ -2,30 +2,40 @@
                      // +build amd64,!appengine,!noasm,gc
                      // This file contains the specialisation of Decoder.Decompress4X
                     -// that uses an asm implementation of its main loop.
                     +// and Decoder.Decompress1X that use an asm implementation of their main loops.
                      package huff0
                      import (
                      	"errors"
                      	"fmt"
+                    +
                     +	"github.com/klauspost/compress/internal/cpuinfo"
+                     )
                      // decompress4x_main_loop_amd64 is an x86 assembler implementation
                      // of Decompress4X when tablelog > 8.
                     -// go:noescape
                     -func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
                     -	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
                     +//go:noescape
                     +func decompress4x_main_loop_amd64(ctx *decompress4xContext)
                      // decompress4x_8b_main_loop_amd64 is an x86 assembler implementation
                      // of Decompress4X when tablelog <= 8 which decodes 4 entries
                      // per loop.
                     -// go:noescape
                     -func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
                     -	peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
                     +//go:noescape
                     +func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
                      // fallback8BitSize is the size where using Go version is faster.
                      const fallback8BitSize = 800
                     +type decompress4xContext struct {
                     +	pbr      *[4]bitReaderShifted
                     +	peekBits uint8
                     +	out      *byte
                     +	dstEvery int
                     +	tbl      *dEntrySingle
                     +	decoded  int
                     +	limit    *byte
                     +}
+                    +
                      // Decompress4X will decompress a 4X encoded stream.
                      // The length of the supplied input must match the end of a block exactly.
                      // The *capacity* of the dst slice must match the destination size of
@@ -42,6 +52,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
                      	if cap(dst) < fallback8BitSize && use8BitTables {
                      		return d.decompress4X8bit(dst, src)
+                     	}
+                    +
                      	var br [4]bitReaderShifted
                      	// Decode "jump table"
                      	start := 6
@@ -71,70 +82,25 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
                      	const tlMask = tlSize - 1
                      	single := d.dt.single[:tlSize]
                     -	// Use temp table to avoid bound checks/append penalty.
                     -	buf := d.buffer()
                     -	var off uint8
                      	var decoded int
                     -	const debug = false
+                    -
                     -	// see: bitReaderShifted.peekBitsFast()
                     -	peekBits := uint8((64 - d.actualTableLog) & 63)
+                    -
                     -	// Decode 2 values from each decoder/loop.
                     -	const bufoff = 256
                     -	for {
                     -		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
                     -			break
                     +	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
                     +		ctx := decompress4xContext{
                     +			pbr:      &br,
                     +			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
                     +			out:      &out[0],
                     +			dstEvery: dstEvery,
                     +			tbl:      &single[0],
                     +			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
+                     		}
+                    -
                      		if use8BitTables {
                     -			off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
                     +			decompress4x_8b_main_loop_amd64(&ctx)
                      		} else {
                     -			off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
                     -		}
                     -		if debug {
                     -			fmt.Print("DEBUG: ")
                     -			fmt.Printf("off=%d,", off)
                     -			for i := 0; i < 4; i++ {
                     -				fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
                     -					i, br[i].bitsRead, br[i].value, br[i].off)
                     -			}
                     -			fmt.Println("")
                     -		}
+                    -
                     -		if off != 0 {
                     -			break
                     +			decompress4x_main_loop_amd64(&ctx)
+                     		}
                     -		if bufoff > dstEvery {
                     -			d.bufs.Put(buf)
                     -			return nil, errors.New("corruption detected: stream overrun 1")
                     -		}
                     -		copy(out, buf[0][:])
                     -		copy(out[dstEvery:], buf[1][:])
                     -		copy(out[dstEvery*2:], buf[2][:])
                     -		copy(out[dstEvery*3:], buf[3][:])
                     -		out = out[bufoff:]
                     -		decoded += bufoff * 4
                     -		// There must at least be 3 buffers left.
                     -		if len(out) < dstEvery*3 {
                     -			d.bufs.Put(buf)
                     -			return nil, errors.New("corruption detected: stream overrun 2")
                     -		}
                     -	}
                     -	if off > 0 {
                     -		ioff := int(off)
                     -		if len(out) < dstEvery*3+ioff {
                     -			d.bufs.Put(buf)
                     -			return nil, errors.New("corruption detected: stream overrun 3")
                     -		}
                     -		copy(out, buf[0][:off])
                     -		copy(out[dstEvery:], buf[1][:off])
                     -		copy(out[dstEvery*2:], buf[2][:off])
                     -		copy(out[dstEvery*3:], buf[3][:off])
                     -		decoded += int(off) * 4
                     -		out = out[off:]
                     +		decoded = ctx.decoded
                     +		out = out[decoded/4:]
+                     	}
                      	// Decode remaining.
@@ -150,7 +116,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
                      		for bitsLeft > 0 {
                      			br.fill()
                      			if offset >= endsAt {
                     -				d.bufs.Put(buf)
                      				return nil, errors.New("corruption detected: stream overrun 4")
+                     			}
@@ -164,7 +129,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
                      			offset++
+                     		}
                      		if offset != endsAt {
                     -			d.bufs.Put(buf)
                      			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+                     		}
                      		decoded += offset - dstEvery*i
@@ -173,9 +137,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
                      			return nil, err
+                     		}
+                     	}
                     -	d.bufs.Put(buf)
                      	if dstSize != decoded {
                      		return nil, errors.New("corruption detected: short output block")
+                     	}
                      	return dst, nil
+                     }
+                    +
                     +// decompress1x_main_loop_amd64 is an x86 assembler implementation
                     +// of Decompress1X when tablelog > 8.
                     +//go:noescape
                     +func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+                    +
                     +// decompress1x_main_loop_bmi2 is an x86 assembler implementation, using BMI2,
                     +// of Decompress1X when tablelog > 8.
                     +//go:noescape
                     +func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+                    +
                     +type decompress1xContext struct {
                     +	pbr      *bitReaderShifted
                     +	peekBits uint8
                     +	out      *byte
                     +	outCap   int
                     +	tbl      *dEntrySingle
                     +	decoded  int
                     +}
+                    +
                     +// Error reported by asm implementations
                     +const error_max_decoded_size_exeeded = -1
+                    +
                     +// Decompress1X will decompress a 1X encoded stream.
                     +// The cap of the output buffer will be the maximum decompressed size.
                     +// The length of the supplied input must match the end of a block exactly.
                     +func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
                     +	if len(d.dt.single) == 0 {
                     +		return nil, errors.New("no table loaded")
                     +	}
                     +	var br bitReaderShifted
                     +	err := br.init(src)
                     +	if err != nil {
                     +		return dst, err
                     +	}
                     +	maxDecodedSize := cap(dst)
                     +	dst = dst[:maxDecodedSize]
+                    +
                     +	const tlSize = 1 << tableLogMax
                     +	const tlMask = tlSize - 1
+                    +
                     +	if maxDecodedSize >= 4 {
                     +		ctx := decompress1xContext{
                     +			pbr:      &br,
                     +			out:      &dst[0],
                     +			outCap:   maxDecodedSize,
                     +			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
                     +			tbl:      &d.dt.single[0],
                     +		}
+                    +
                     +		if cpuinfo.HasBMI2() {
                     +			decompress1x_main_loop_bmi2(&ctx)
                     +		} else {
                     +			decompress1x_main_loop_amd64(&ctx)
                     +		}
                     +		if ctx.decoded == error_max_decoded_size_exeeded {
                     +			return nil, ErrMaxDecodedSizeExceeded
                     +		}
+                    +
                     +		dst = dst[:ctx.decoded]
                     +	}
+                    +
                     +	// br < 8, so uint8 is fine
                     +	bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
                     +	for bitsLeft > 0 {
                     +		br.fill()
                     +		if len(dst) >= maxDecodedSize {
                     +			br.close()
                     +			return nil, ErrMaxDecodedSizeExceeded
                     +		}
                     +		v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
                     +		nBits := uint8(v.entry)
                     +		br.advance(nBits)
                     +		bitsLeft -= nBits
                     +		dst = append(dst, uint8(v.entry>>8))
                     +	}
                     +	return dst, br.close()
                     +}

                     new file mode 100644
@@ -0,0 +1,34 @@
                     +// Package cpuinfo gives runtime info about the current CPU.
                     +//
                     +// This is a very limited module meant for use internally
                     +// in this project. For a more versatile solution, see
                     +// https://github.com/klauspost/cpuid.
                     +package cpuinfo
+                    +
                     +// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
                     +func HasBMI1() bool {
                     +	return hasBMI1
                     +}
+                    +
                     +// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
                     +func HasBMI2() bool {
                     +	return hasBMI2
                     +}
+                    +
                     +// DisableBMI2 will disable BMI2, for testing purposes.
                     +// Call returned function to restore previous state.
                     +func DisableBMI2() func() {
                     +	old := hasBMI2
                     +	hasBMI2 = false
                     +	return func() {
                     +		hasBMI2 = old
                     +	}
                     +}
+                    +
                     +// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
                     +func HasBMI() bool {
                     +	return HasBMI1() && HasBMI2()
                     +}
+                    +
                     +var hasBMI1 bool
                     +var hasBMI2 bool

                     new file mode 100644
@@ -0,0 +1,11 @@
                     +//go:build amd64 && !appengine && !noasm && gc
                     +// +build amd64,!appengine,!noasm,gc
+                    +
                     +package cpuinfo
+                    +
                     +// go:noescape
                     +func x86extensions() (bmi1, bmi2 bool)
+                    +
                     +func init() {
                     +	hasBMI1, hasBMI2 = x86extensions()
                     +}

                     new file mode 100644
@@ -0,0 +1,36 @@
                     +// +build !appengine
                     +// +build gc
                     +// +build !noasm
+                    +
                     +#include "textflag.h"
                     +#include "funcdata.h"
                     +#include "go_asm.h"
+                    +
                     +TEXT ·x86extensions(SB), NOSPLIT, $0
                     +	// 1. determine max EAX value
                     +	XORQ AX, AX
                     +	CPUID
+                    +
                     +	CMPQ AX, $7
                     +	JB   unsupported
+                    +
                     +	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
                     +	MOVQ $7, AX
                     +	MOVQ $0, CX
                     +	CPUID
+                    +
                     +	BTQ   $3, BX // bit 3 = BMI1
                     +	SETCS AL
+                    +
                     +	BTQ   $8, BX // bit 8 = BMI2
                     +	SETCS AH
+                    +
                     +	MOVB AL, bmi1+0(FP)
                     +	MOVB AH, bmi2+1(FP)
                     +	RET
+                    +
                     +unsupported:
                     +	XORQ AX, AX
                     +	MOVB AL, bmi1+0(FP)
                     +	MOVB AL, bmi2+1(FP)
                     +	RET

@@ -386,47 +386,31 @@ In practice this means that concurrency is often limited to utilizing about 3 co
                      ### Benchmarks
                     -These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
+                    -
                      The first two are streaming decodes and the last are smaller inputs.
+                    -
+                    +
                     +Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+                    +
                      ```
                     -BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
                     -BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
+                    -
                     -BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
                     -BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
+                    -
                     -Concurrent performance:
+                    -
                     -BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
+                    -
                     -BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
                     -BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
                     +BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
                     +BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
+                    +
                     +Concurrent blocks, performance:
+                    +
                     +BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
                     +BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
                      ```
                     -This reflects the performance around May 2020, but this may be out of date.
                     +This reflects the performance around May 2022, but this may be out of date.
                      ## Zstd inside ZIP files

@@ -63,13 +63,6 @@ func (b *bitReader) get32BitsFast(n uint8) uint32 {
                      	return v
+                     }
                     -func (b *bitReader) get16BitsFast(n uint8) uint16 {
                     -	const regMask = 64 - 1
                     -	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
                     -	b.bitsRead += n
                     -	return v
                     -}
+                    -
                      // fillFast() will make sure at least 32 bits are available.
                      // There must be at least 4 bytes available.
                      func (b *bitReader) fillFast() {

@@ -5,9 +5,14 @@
                      package zstd
                      import (
                     +	"bytes"
                     +	"encoding/binary"
                      	"errors"
                      	"fmt"
                      	"io"
                     +	"io/ioutil"
                     +	"os"
                     +	"path/filepath"
                      	"sync"
                      	"github.com/klauspost/compress/huff0"
@@ -38,14 +43,14 @@ const (
                      	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
                      	maxCompressedBlockSize = 128 << 10
                     +	compressedBlockOverAlloc    = 16
                     +	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+                    +
                      	// Maximum possible block size (all Raw+Uncompressed).
                      	maxBlockSize = (1 << 21) - 1
                     -	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
                     -	maxCompressedLiteralSize = 1 << 18
                     -	maxRLELiteralSize        = 1 << 20
                     -	maxMatchLen              = 131074
                     -	maxSequences             = 0x7f00 + 0xffff
                     +	maxMatchLen  = 131074
                     +	maxSequences = 0x7f00 + 0xffff
                      	// We support slightly less than the reference decoder to be able to
                      	// use ints on 32 bit archs.
@@ -97,7 +102,6 @@ type blockDec struct {
                      	// Block is RLE, this is the size.
                      	RLESize uint32
                     -	tmp     [4]byte
                      	Type blockType
@@ -136,7 +140,7 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
                      	b.Type = blockType((bh >> 1) & 3)
                      	// find size.
                      	cSize := int(bh >> 3)
                     -	maxSize := maxBlockSize
                     +	maxSize := maxCompressedBlockSizeAlloc
                      	switch b.Type {
                      	case blockTypeReserved:
                      		return ErrReservedBlockType
@@ -157,9 +161,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
                      			println("Data size on stream:", cSize)
+                     		}
                      		b.RLESize = 0
                     -		maxSize = maxCompressedBlockSize
                     +		maxSize = maxCompressedBlockSizeAlloc
                      		if windowSize < maxCompressedBlockSize && b.lowMem {
                     -			maxSize = int(windowSize)
                     +			maxSize = int(windowSize) + compressedBlockOverAlloc
+                     		}
                      		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
                      			if debugDecoder {
@@ -190,9 +194,9 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
                      	// Read block data.
                      	if cap(b.dataStorage) < cSize {
                      		if b.lowMem || cSize > maxCompressedBlockSize {
                     -			b.dataStorage = make([]byte, 0, cSize)
                     +			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
                      		} else {
                     -			b.dataStorage = make([]byte, 0, maxCompressedBlockSize)
                     +			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
+                     		}
+                     	}
                      	if cap(b.dst) <= maxSize {
@@ -360,14 +364,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
+                     		}
                      		if cap(b.literalBuf) < litRegenSize {
                      			if b.lowMem {
                     -				b.literalBuf = make([]byte, litRegenSize)
                     +				b.literalBuf = make([]byte, litRegenSize, litRegenSize+compressedBlockOverAlloc)
                      			} else {
                     -				if litRegenSize > maxCompressedLiteralSize {
                     -					// Exceptional
                     -					b.literalBuf = make([]byte, litRegenSize)
                     -				} else {
                     -					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
                     -				}
                     +				b.literalBuf = make([]byte, litRegenSize, maxCompressedBlockSize+compressedBlockOverAlloc)
+                     			}
+                     		}
                      		literals = b.literalBuf[:litRegenSize]
@@ -397,14 +396,14 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
                      		// Ensure we have space to store it.
                      		if cap(b.literalBuf) < litRegenSize {
                      			if b.lowMem {
                     -				b.literalBuf = make([]byte, 0, litRegenSize)
                     +				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
                      			} else {
                     -				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
                     +				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
+                     			}
+                     		}
                      		var err error
                      		// Use our out buffer.
                     -		huff.MaxDecodedSize = maxCompressedBlockSize
                     +		huff.MaxDecodedSize = litRegenSize
                      		if fourStreams {
                      			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
                      		} else {
@@ -429,9 +428,9 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
                      		// Ensure we have space to store it.
                      		if cap(b.literalBuf) < litRegenSize {
                      			if b.lowMem {
                     -				b.literalBuf = make([]byte, 0, litRegenSize)
                     +				b.literalBuf = make([]byte, 0, litRegenSize+compressedBlockOverAlloc)
                      			} else {
                     -				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
                     +				b.literalBuf = make([]byte, 0, maxCompressedBlockSize+compressedBlockOverAlloc)
+                     			}
+                     		}
                      		huff := hist.huffTree
@@ -448,7 +447,7 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
                      			return in, err
+                     		}
                      		hist.huffTree = huff
                     -		huff.MaxDecodedSize = maxCompressedBlockSize
                     +		huff.MaxDecodedSize = litRegenSize
                      		// Use our out buffer.
                      		if fourStreams {
                      			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -463,6 +462,8 @@ func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err
                      		if len(literals) != litRegenSize {
                      			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+                     		}
                     +		// Re-cap to get extra size.
                     +		literals = b.literalBuf[:len(literals)]
                      		if debugDecoder {
                      			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
+                     		}
@@ -486,10 +487,15 @@ func (b *blockDec) decodeCompressed(hist *history) error {
                      		b.dst = append(b.dst, hist.decoders.literals...)
                      		return nil
+                     	}
                     -	err = hist.decoders.decodeSync(hist)
                     +	before := len(hist.decoders.out)
                     +	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
                      	if err != nil {
                      		return err
+                     	}
                     +	if hist.decoders.maxSyncLen > 0 {
                     +		hist.decoders.maxSyncLen += uint64(before)
                     +		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
                     +	}
                      	b.dst = hist.decoders.out
                      	hist.recentOffsets = hist.decoders.prevOffset
                      	return nil
@@ -632,6 +638,22 @@ func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
                      		println("initializing sequences:", err)
                      		return err
+                     	}
                     +	// Extract blocks...
                     +	if false && hist.dict == nil {
                     +		fatalErr := func(err error) {
                     +			if err != nil {
                     +				panic(err)
                     +			}
                     +		}
                     +		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
                     +		var buf bytes.Buffer
                     +		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
                     +		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
                     +		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
                     +		buf.Write(in)
                     +		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
                     +	}
+                    +
                      	return nil
+                     }
@@ -650,6 +672,7 @@ func (b *blockDec) decodeSequences(hist *history) error {
+                     	}
                      	hist.decoders.windowSize = hist.windowSize
                      	hist.decoders.prevOffset = hist.recentOffsets
+                    +
                      	err := hist.decoders.decode(b.sequence)
                      	hist.recentOffsets = hist.decoders.prevOffset
                      	return err

@@ -23,7 +23,7 @@ type byteBuffer interface {
                      	readByte() (byte, error)
                      	// Skip n bytes.
                     -	skipN(n int) error
                     +	skipN(n int64) error
+                     }
                      // in-memory buffer
@@ -52,10 +52,6 @@ func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
                      	return r, nil
+                     }
                     -func (b *byteBuf) remain() []byte {
                     -	return *b
                     -}
+                    -
                      func (b *byteBuf) readByte() (byte, error) {
                      	bb := *b
                      	if len(bb) < 1 {
@@ -66,9 +62,12 @@ func (b *byteBuf) readByte() (byte, error) {
                      	return r, nil
+                     }
                     -func (b *byteBuf) skipN(n int) error {
                     +func (b *byteBuf) skipN(n int64) error {
                      	bb := *b
                     -	if len(bb) < n {
                     +	if n < 0 {
                     +		return fmt.Errorf("negative skip (%d) requested", n)
                     +	}
                     +	if int64(len(bb)) < n {
                      		return io.ErrUnexpectedEOF
+                     	}
                      	*b = bb[n:]
@@ -124,9 +123,9 @@ func (r *readerWrapper) readByte() (byte, error) {
                      	return r.tmp[0], nil
+                     }
                     -func (r *readerWrapper) skipN(n int) error {
                     -	n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
                     -	if n2 != int64(n) {
                     +func (r *readerWrapper) skipN(n int64) error {
                     +	n2, err := io.CopyN(ioutil.Discard, r.r, n)
                     +	if n2 != n {
                      		err = io.ErrUnexpectedEOF
+                     	}
                      	return err

@@ -347,18 +347,23 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
+                     			}
                      			frame.history.setDict(&dict)
+                     		}
+                    -
                     -		if frame.FrameContentSize != fcsUnknown && frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
                     -			return dst, ErrDecoderSizeExceeded
                     +		if frame.WindowSize > d.o.maxWindowSize {
                     +			if debugDecoder {
                     +				println("window size exceeded:", frame.WindowSize, ">", d.o.maxWindowSize)
                     +			}
                     +			return dst, ErrWindowSizeExceeded
+                     		}
                     -		if frame.FrameContentSize < 1<<30 {
                     -			// Never preallocate more than 1 GB up front.
                     +		if frame.FrameContentSize != fcsUnknown {
                     +			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
                     +				return dst, ErrDecoderSizeExceeded
                     +			}
                      			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
                     -				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
                     +				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
                      				copy(dst2, dst)
                      				dst = dst2
+                     			}
+                     		}
+                    +
                      		if cap(dst) == 0 {
                      			// Allocate len(input) * 2 by default if nothing is provided
                      			// and we didn't get frame content size.
@@ -437,7 +442,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
                      		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
+                     	}
                     -	if len(next.b) > 0 {
                     +	if !d.o.ignoreChecksum && len(next.b) > 0 {
                      		n, err := d.current.crc.Write(next.b)
                      		if err == nil {
                      			if n != len(next.b) {
@@ -449,7 +454,7 @@ func (d *Decoder) nextBlock(blocking bool) (ok bool) {
                      		got := d.current.crc.Sum64()
                      		var tmp [4]byte
                      		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
                     -		if !bytes.Equal(tmp[:], next.d.checkCRC) && !ignoreCRC {
                     +		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
                      			if debugDecoder {
                      				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+                     			}
@@ -533,9 +538,15 @@ func (d *Decoder) nextBlockSync() (ok bool) {
                      		// Update/Check CRC
                      		if d.frame.HasCheckSum {
                     -			d.frame.crc.Write(d.current.b)
                     +			if !d.o.ignoreChecksum {
                     +				d.frame.crc.Write(d.current.b)
                     +			}
                      			if d.current.d.Last {
                     -				d.current.err = d.frame.checkCRC()
                     +				if !d.o.ignoreChecksum {
                     +					d.current.err = d.frame.checkCRC()
                     +				} else {
                     +					d.current.err = d.frame.consumeCRC()
                     +				}
                      				if d.current.err != nil {
                      					println("CRC error:", d.current.err)
                      					return false
@@ -629,60 +640,18 @@ func (d *Decoder) startSyncDecoder(r io.Reader) error {
                      // Create Decoder:
                      // ASYNC:
                     -// Spawn 4 go routines.
                     -// 0: Read frames and decode blocks.
                     -// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
                     -// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
                     -// 3: Wait for stream history, execute sequences, send stream history.
                     +// Spawn 3 go routines.
                     +// 0: Read frames and decode block literals.
                     +// 1: Decode sequences.
                     +// 2: Execute sequences, send to output.
                      func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
                      	defer d.streamWg.Done()
                      	br := readerWrapper{r: r}
                     -	var seqPrepare = make(chan *blockDec, d.o.concurrent)
                      	var seqDecode = make(chan *blockDec, d.o.concurrent)
                      	var seqExecute = make(chan *blockDec, d.o.concurrent)
                     -	// Async 1: Prepare blocks...
                     -	go func() {
                     -		var hist history
                     -		var hasErr bool
                     -		for block := range seqPrepare {
                     -			if hasErr {
                     -				if block != nil {
                     -					seqDecode <- block
                     -				}
                     -				continue
                     -			}
                     -			if block.async.newHist != nil {
                     -				if debugDecoder {
                     -					println("Async 1: new history")
                     -				}
                     -				hist.reset()
                     -				if block.async.newHist.dict != nil {
                     -					hist.setDict(block.async.newHist.dict)
                     -				}
                     -			}
                     -			if block.err != nil || block.Type != blockTypeCompressed {
                     -				hasErr = block.err != nil
                     -				seqDecode <- block
                     -				continue
                     -			}
+                    -
                     -			remain, err := block.decodeLiterals(block.data, &hist)
                     -			block.err = err
                     -			hasErr = block.err != nil
                     -			if err == nil {
                     -				block.async.literals = hist.decoders.literals
                     -				block.async.seqData = remain
                     -			} else if debugDecoder {
                     -				println("decodeLiterals error:", err)
                     -			}
                     -			seqDecode <- block
                     -		}
                     -		close(seqDecode)
                     -	}()
+                    -
                     -	// Async 2: Decode sequences...
                     +	// Async 1: Decode sequences...
                      	go func() {
                      		var hist history
                      		var hasErr bool
@@ -696,7 +665,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
+                     			}
                      			if block.async.newHist != nil {
                      				if debugDecoder {
                     -					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
                     +					println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
+                     				}
                      				hist.decoders = block.async.newHist.decoders
                      				hist.recentOffsets = block.async.newHist.recentOffsets
@@ -750,7 +719,7 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
+                     			}
                      			if block.async.newHist != nil {
                      				if debugDecoder {
                     -					println("Async 3: new history")
                     +					println("Async 2: new history")
+                     				}
                      				hist.windowSize = block.async.newHist.windowSize
                      				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
@@ -837,6 +806,33 @@ func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output ch
                      decodeStream:
                      	for {
                     +		var hist history
                     +		var hasErr bool
+                    +
                     +		decodeBlock := func(block *blockDec) {
                     +			if hasErr {
                     +				if block != nil {
                     +					seqDecode <- block
                     +				}
                     +				return
                     +			}
                     +			if block.err != nil || block.Type != blockTypeCompressed {
                     +				hasErr = block.err != nil
                     +				seqDecode <- block
                     +				return
                     +			}
+                    +
                     +			remain, err := block.decodeLiterals(block.data, &hist)
                     +			block.err = err
                     +			hasErr = block.err != nil
                     +			if err == nil {
                     +				block.async.literals = hist.decoders.literals
                     +				block.async.seqData = remain
                     +			} else if debugDecoder {
                     +				println("decodeLiterals error:", err)
                     +			}
                     +			seqDecode <- block
                     +		}
                      		frame := d.frame
                      		if debugDecoder {
                      			println("New frame...")
@@ -863,7 +859,7 @@ decodeStream:
                      			case <-ctx.Done():
                      			case dec := <-d.decoders:
                      				dec.sendErr(err)
                     -				seqPrepare <- dec
                     +				decodeBlock(dec)
+                     			}
                      			break decodeStream
+                     		}
@@ -883,6 +879,10 @@ decodeStream:
                      				if debugDecoder {
                      					println("Alloc History:", h.allocFrameBuffer)
+                     				}
                     +				hist.reset()
                     +				if h.dict != nil {
                     +					hist.setDict(h.dict)
                     +				}
                      				dec.async.newHist = &h
                      				dec.async.fcs = frame.FrameContentSize
                      				historySent = true
@@ -909,7 +909,7 @@ decodeStream:
+                     			}
                      			err = dec.err
                      			last := dec.Last
                     -			seqPrepare <- dec
                     +			decodeBlock(dec)
                      			if err != nil {
                      				break decodeStream
+                     			}
@@ -918,7 +918,7 @@ decodeStream:
+                     			}
+                     		}
+                     	}
                     -	close(seqPrepare)
                     +	close(seqDecode)
                      	wg.Wait()
                      	d.frame.history.b = frameHistCache
+                     }

@@ -19,6 +19,7 @@ type decoderOptions struct {
                      	maxDecodedSize uint64
                      	maxWindowSize  uint64
                      	dicts          []dict
                     +	ignoreChecksum bool
+                     }
                      func (o *decoderOptions) setDefault() {
@@ -31,7 +32,7 @@ func (o *decoderOptions) setDefault() {
                      	if o.concurrent > 4 {
                      		o.concurrent = 4
+                     	}
                     -	o.maxDecodedSize = 1 << 63
                     +	o.maxDecodedSize = 64 << 30
+                     }
                      // WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -66,7 +67,7 @@ func WithDecoderConcurrency(n int) DOption {
                      // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
                      // non-streaming operations or maximum window size for streaming operations.
                      // This can be used to control memory usage of potentially hostile content.
                     -// Maximum and default is 1 << 63 bytes.
                     +// Maximum is 1 << 63 bytes. Default is 64GiB.
                      func WithDecoderMaxMemory(n uint64) DOption {
                      	return func(o *decoderOptions) error {
                      		if n == 0 {
@@ -112,3 +113,11 @@ func WithDecoderMaxWindow(size uint64) DOption {
                      		return nil
+                     	}
+                     }
+                    +
                     +// IgnoreChecksum allows to forcibly ignore checksum checking.
                     +func IgnoreChecksum(b bool) DOption {
                     +	return func(o *decoderOptions) error {
                     +		o.ignoreChecksum = b
                     +		return nil
                     +	}
                     +}

@@ -156,8 +156,8 @@ encodeLoop:
                      				panic("offset0 was 0")
+                     			}
                     -			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
                     +			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			candidateL := e.longTable[nextHashL]
                      			candidateS := e.table[nextHashS]
@@ -518,8 +518,8 @@ encodeLoop:
+                     			}
                      			// Store this, since we have it.
                     -			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
                     +			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			// We have at least 4 byte match.
                      			// No need to check backwards. We come straight from a match
@@ -674,8 +674,8 @@ encodeLoop:
                      				panic("offset0 was 0")
+                     			}
                     -			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
                     +			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			candidateL := e.longTable[nextHashL]
                      			candidateS := e.table[nextHashS]
@@ -1047,8 +1047,8 @@ encodeLoop:
+                     			}
                      			// Store this, since we have it.
                     -			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
                     +			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
                      			// We have at least 4 byte match.
                      			// No need to check backwards. We come straight from a match

@@ -127,8 +127,8 @@ encodeLoop:
                      				panic("offset0 was 0")
+                     			}
                     -			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
                     +			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			candidateL := e.longTable[nextHashL]
                      			candidateS := e.table[nextHashS]
@@ -439,8 +439,8 @@ encodeLoop:
                      		var t int32
                      		for {
                     -			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
                     +			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			candidateL := e.longTable[nextHashL]
                      			candidateS := e.table[nextHashS]
@@ -785,8 +785,8 @@ encodeLoop:
                      				panic("offset0 was 0")
+                     			}
                     -			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
                     +			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			candidateL := e.longTable[nextHashL]
                      			candidateS := e.table[nextHashS]
@@ -969,7 +969,7 @@ encodeLoop:
                      		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
                      		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
                      		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
                     -		longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
                     +		longHash2 := hashLen(cv1, dFastLongTableBits, dFastLongLen)
                      		e.longTable[longHash1] = te0
                      		e.longTable[longHash2] = te1
                      		e.markLongShardDirty(longHash1)
@@ -1002,8 +1002,8 @@ encodeLoop:
+                     			}
                      			// Store this, since we have it.
                     -			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
                     +			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
                      			// We have at least 4 byte match.
                      			// No need to check backwards. We come straight from a match

@@ -528,8 +528,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
                      		// If a non-single block is needed the encoder will reset again.
                      		e.encoders <- enc
                      	}()
                     -	// Use single segments when above minimum window and below 1MB.
                     -	single := len(src) < 1<<20 && len(src) > MinWindowSize
                     +	// Use single segments when above minimum window and below window size.
                     +	single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
                      	if e.o.single != nil {
                      		single = *e.o.single
+                     	}
@@ -551,7 +551,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
+                     	}
                      	// If we can do everything in one block, prefer that.
                     -	if len(src) <= maxCompressedBlockSize {
                     +	if len(src) <= e.o.blockSize {
                      		enc.Reset(e.o.dict, true)
                      		// Slightly faster with no history and everything in one block.
                      		if e.o.crc {

@@ -283,7 +283,7 @@ func WithNoEntropyCompression(b bool) EOption {
                      // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range.
                      // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
                      // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations.
                     -// If this is not specified, block encodes will automatically choose this based on the input size.
                     +// If this is not specified, block encodes will automatically choose this based on the input size and the window size.
                      // This setting has no effect on streamed encodes.
                      func WithSingleSegment(b bool) EOption {
                      	return func(o *encoderOptions) error {

@@ -106,7 +106,7 @@ func (d *frameDec) reset(br byteBuffer) error {
+                     		}
                      		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
                      		println("Skipping frame with", n, "bytes.")
                     -		err = br.skipN(int(n))
                     +		err = br.skipN(int64(n))
                      		if err != nil {
                      			if debugDecoder {
                      				println("Reading discarded frame", err)
@@ -231,20 +231,27 @@ func (d *frameDec) reset(br byteBuffer) error {
                      		d.crc.Reset()
+                     	}
                     +	if d.WindowSize > d.o.maxWindowSize {
                     +		if debugDecoder {
                     +			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
                     +		}
                     +		return ErrWindowSizeExceeded
                     +	}
+                    +
                      	if d.WindowSize == 0 && d.SingleSegment {
                      		// We may not need window in this case.
                      		d.WindowSize = d.FrameContentSize
                      		if d.WindowSize < MinWindowSize {
                      			d.WindowSize = MinWindowSize
+                     		}
                     -	}
+                    -
                     -	if d.WindowSize > uint64(d.o.maxWindowSize) {
                     -		if debugDecoder {
                     -			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
                     +		if d.WindowSize > d.o.maxDecodedSize {
                     +			if debugDecoder {
                     +				printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
                     +			}
                     +			return ErrDecoderSizeExceeded
+                     		}
                     -		return ErrWindowSizeExceeded
+                     	}
+                    +
                      	// The minimum Window_Size is 1 KB.
                      	if d.WindowSize < MinWindowSize {
                      		if debugDecoder {
@@ -253,10 +260,11 @@ func (d *frameDec) reset(br byteBuffer) error {
                      		return ErrWindowSizeTooSmall
+                     	}
                      	d.history.windowSize = int(d.WindowSize)
                     -	if d.o.lowMem && d.history.windowSize < maxBlockSize {
                     +	if !d.o.lowMem || d.history.windowSize < maxBlockSize {
                     +		// Alloc 2x window size if not low-mem, or very small window size.
                      		d.history.allocFrameBuffer = d.history.windowSize * 2
                     -		// TODO: Maybe use FrameContent size
                      	} else {
                     +		// Alloc with one additional block
                      		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
+                     	}
@@ -290,13 +298,6 @@ func (d *frameDec) checkCRC() error {
                      	if !d.HasCheckSum {
                      		return nil
+                     	}
                     -	var tmp [4]byte
                     -	got := d.crc.Sum64()
                     -	// Flip to match file order.
                     -	tmp[0] = byte(got >> 0)
                     -	tmp[1] = byte(got >> 8)
                     -	tmp[2] = byte(got >> 16)
                     -	tmp[3] = byte(got >> 24)
                      	// We can overwrite upper tmp now
                      	want, err := d.rawInput.readSmall(4)
@@ -305,7 +306,19 @@ func (d *frameDec) checkCRC() error {
                      		return err
+                     	}
                     -	if !bytes.Equal(tmp[:], want) && !ignoreCRC {
                     +	if d.o.ignoreChecksum {
                     +		return nil
                     +	}
+                    +
                     +	var tmp [4]byte
                     +	got := d.crc.Sum64()
                     +	// Flip to match file order.
                     +	tmp[0] = byte(got >> 0)
                     +	tmp[1] = byte(got >> 8)
                     +	tmp[2] = byte(got >> 16)
                     +	tmp[3] = byte(got >> 24)
+                    +
                     +	if !bytes.Equal(tmp[:], want) {
                      		if debugDecoder {
                      			println("CRC Check Failed:", tmp[:], "!=", want)
+                     		}
@@ -317,6 +330,19 @@ func (d *frameDec) checkCRC() error {
                      	return nil
+                     }
                     +// consumeCRC reads the checksum data if the frame has one.
                     +func (d *frameDec) consumeCRC() error {
                     +	if d.HasCheckSum {
                     +		_, err := d.rawInput.readSmall(4)
                     +		if err != nil {
                     +			println("CRC missing?", err)
                     +			return err
                     +		}
                     +	}
+                    +
                     +	return nil
                     +}
+                    +
                      // runDecoder will create a sync decoder that will decode a block of data.
                      func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
                      	saved := d.history.b
@@ -326,6 +352,19 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
                      	d.history.ignoreBuffer = len(dst)
                      	// Store input length, so we only check new data.
                      	crcStart := len(dst)
                     +	d.history.decoders.maxSyncLen = 0
                     +	if d.FrameContentSize != fcsUnknown {
                     +		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
                     +		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
                     +			return dst, ErrDecoderSizeExceeded
                     +		}
                     +		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
                     +			// Alloc for output
                     +			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
                     +			copy(dst2, dst)
                     +			dst = dst2
                     +		}
                     +	}
                      	var err error
                      	for {
                      		err = dec.reset(d.rawInput, d.WindowSize)
@@ -360,13 +399,17 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
                      		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
                      			err = ErrFrameSizeMismatch
                      		} else if d.HasCheckSum {
                     -			var n int
                     -			n, err = d.crc.Write(dst[crcStart:])
                     -			if err == nil {
                     -				if n != len(dst)-crcStart {
                     -					err = io.ErrShortWrite
                     -				} else {
                     -					err = d.checkCRC()
                     +			if d.o.ignoreChecksum {
                     +				err = d.consumeCRC()
                     +			} else {
                     +				var n int
                     +				n, err = d.crc.Write(dst[crcStart:])
                     +				if err == nil {
                     +					if n != len(dst)-crcStart {
                     +						err = io.ErrShortWrite
                     +					} else {
                     +						err = d.checkCRC()
                     +					}
+                     				}
+                     			}
+                     		}

@@ -5,8 +5,10 @@
                      package zstd
                      import (
                     +	"encoding/binary"
                      	"errors"
                      	"fmt"
                     +	"io"
+                     )
                      const (
@@ -178,10 +180,32 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
                      		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
+                     	}
                      	b.advance((bitCount + 7) >> 3)
                     -	// println(s.norm[:s.symbolLen], s.symbolLen)
                      	return s.buildDtable()
+                     }
                     +func (s *fseDecoder) mustReadFrom(r io.Reader) {
                     +	fatalErr := func(err error) {
                     +		if err != nil {
                     +			panic(err)
                     +		}
                     +	}
                     +	// 	dt             [maxTablesize]decSymbol // Decompression table.
                     +	//	symbolLen      uint16                  // Length of active part of the symbol table.
                     +	//	actualTableLog uint8                   // Selected tablelog.
                     +	//	maxBits        uint8                   // Maximum number of additional bits
                     +	//	// used for table creation to avoid allocations.
                     +	//	stateTable [256]uint16
                     +	//	norm       [maxSymbolValue + 1]int16
                     +	//	preDefined bool
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
                     +	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
                     +}
+                    +
                      // decSymbol contains information about a state entry,
                      // Including the state offset base, the output symbol and
                      // the number of bits to read for the low part of the destination state.
@@ -204,18 +228,10 @@ func (d decSymbol) newState() uint16 {
                      	return uint16(d >> 16)
+                     }
                     -func (d decSymbol) baseline() uint32 {
                     -	return uint32(d >> 32)
                     -}
+                    -
                      func (d decSymbol) baselineInt() int {
                      	return int(d >> 32)
+                     }
                     -func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
                     -	*d = decSymbol(nbits) | (decSymbol(addBits) << 8) | (decSymbol(newState) << 16) | (decSymbol(baseline) << 32)
                     -}
+                    -
                      func (d *decSymbol) setNBits(nBits uint8) {
                      	const mask = 0xffffffffffffff00
                      	*d = (*d & mask) | decSymbol(nBits)
@@ -231,11 +247,6 @@ func (d *decSymbol) setNewState(state uint16) {
                      	*d = (*d & mask) | decSymbol(state)<<16
+                     }
                     -func (d *decSymbol) setBaseline(baseline uint32) {
                     -	const mask = 0xffffffff
                     -	*d = (*d & mask) | decSymbol(baseline)<<32
                     -}
+                    -
                      func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
                      	const mask = 0xffff00ff
                      	*d = (*d & mask) | (decSymbol(addBits) << 8) | (decSymbol(baseline) << 32)
@@ -257,68 +268,6 @@ func (s *fseDecoder) setRLE(symbol decSymbol) {
                      	s.dt[0] = symbol
+                     }
                     -// buildDtable will build the decoding table.
                     -func (s *fseDecoder) buildDtable() error {
                     -	tableSize := uint32(1 << s.actualTableLog)
                     -	highThreshold := tableSize - 1
                     -	symbolNext := s.stateTable[:256]
+                    -
                     -	// Init, lay down lowprob symbols
                     -	{
                     -		for i, v := range s.norm[:s.symbolLen] {
                     -			if v == -1 {
                     -				s.dt[highThreshold].setAddBits(uint8(i))
                     -				highThreshold--
                     -				symbolNext[i] = 1
                     -			} else {
                     -				symbolNext[i] = uint16(v)
                     -			}
                     -		}
                     -	}
                     -	// Spread symbols
                     -	{
                     -		tableMask := tableSize - 1
                     -		step := tableStep(tableSize)
                     -		position := uint32(0)
                     -		for ss, v := range s.norm[:s.symbolLen] {
                     -			for i := 0; i < int(v); i++ {
                     -				s.dt[position].setAddBits(uint8(ss))
                     -				position = (position + step) & tableMask
                     -				for position > highThreshold {
                     -					// lowprob area
                     -					position = (position + step) & tableMask
                     -				}
                     -			}
                     -		}
                     -		if position != 0 {
                     -			// position must reach all cells once, otherwise normalizedCounter is incorrect
                     -			return errors.New("corrupted input (position != 0)")
                     -		}
                     -	}
+                    -
                     -	// Build Decoding table
                     -	{
                     -		tableSize := uint16(1 << s.actualTableLog)
                     -		for u, v := range s.dt[:tableSize] {
                     -			symbol := v.addBits()
                     -			nextState := symbolNext[symbol]
                     -			symbolNext[symbol] = nextState + 1
                     -			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
                     -			s.dt[u&maxTableMask].setNBits(nBits)
                     -			newState := (nextState << nBits) - tableSize
                     -			if newState > tableSize {
                     -				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
                     -			}
                     -			if newState == uint16(u) && nBits == 0 {
                     -				// Seems weird that this is possible with nbits > 0.
                     -				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
                     -			}
                     -			s.dt[u&maxTableMask].setNewState(newState)
                     -		}
                     -	}
                     -	return nil
                     -}
+                    -
                      // transform will transform the decoder table into a table usable for
                      // decoding without having to apply the transformation while decoding.
                      // The state will contain the base value and the number of bits to read.
@@ -352,34 +301,7 @@ func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
                      	s.state = dt[br.getBits(tableLog)]
+                     }
                     -// next returns the current symbol and sets the next state.
                     -// At least tablelog bits must be available in the bit reader.
                     -func (s *fseState) next(br *bitReader) {
                     -	lowBits := uint16(br.getBits(s.state.nbBits()))
                     -	s.state = s.dt[s.state.newState()+lowBits]
                     -}
+                    -
                     -// finished returns true if all bits have been read from the bitstream
                     -// and the next state would require reading bits from the input.
                     -func (s *fseState) finished(br *bitReader) bool {
                     -	return br.finished() && s.state.nbBits() > 0
                     -}
+                    -
                     -// final returns the current state symbol without decoding the next.
                     -func (s *fseState) final() (int, uint8) {
                     -	return s.state.baselineInt(), s.state.addBits()
                     -}
+                    -
                      // final returns the current state symbol without decoding the next.
                      func (s decSymbol) final() (int, uint8) {
                      	return s.baselineInt(), s.addBits()
+                     }
+                    -
                     -// nextFast returns the next symbol and sets the next state.
                     -// This can only be used if no symbols are 0 bits.
                     -// At least tablelog bits must be available in the bit reader.
                     -func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
                     -	lowBits := br.get16BitsFast(s.state.nbBits())
                     -	s.state = s.dt[s.state.newState()+lowBits]
                     -	return s.state.baseline(), s.state.addBits()
                     -}

                     new file mode 100644
@@ -0,0 +1,64 @@
                     +//go:build amd64 && !appengine && !noasm && gc
                     +// +build amd64,!appengine,!noasm,gc
+                    +
                     +package zstd
+                    +
                     +import (
                     +	"fmt"
                     +)
+                    +
                     +type buildDtableAsmContext struct {
                     +	// inputs
                     +	stateTable *uint16
                     +	norm       *int16
                     +	dt         *uint64
+                    +
                     +	// outputs --- set by the procedure in the case of error;
                     +	// for interpretation please see the error handling part below
                     +	errParam1 uint64
                     +	errParam2 uint64
                     +}
+                    +
                     +// buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
                     +// Function returns non-zero exit code on error.
                     +//go:noescape
                     +func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
+                    +
                     +// please keep in sync with _generate/gen_fse.go
                     +const (
                     +	errorCorruptedNormalizedCounter = 1
                     +	errorNewStateTooBig             = 2
                     +	errorNewStateNoBits             = 3
                     +)
+                    +
                     +// buildDtable will build the decoding table.
                     +func (s *fseDecoder) buildDtable() error {
                     +	ctx := buildDtableAsmContext{
                     +		stateTable: &s.stateTable[0],
                     +		norm:       &s.norm[0],
                     +		dt:         (*uint64)(&s.dt[0]),
                     +	}
                     +	code := buildDtable_asm(s, &ctx)
+                    +
                     +	if code != 0 {
                     +		switch code {
                     +		case errorCorruptedNormalizedCounter:
                     +			position := ctx.errParam1
                     +			return fmt.Errorf("corrupted input (position=%d, expected 0)", position)
+                    +
                     +		case errorNewStateTooBig:
                     +			newState := decSymbol(ctx.errParam1)
                     +			size := ctx.errParam2
                     +			return fmt.Errorf("newState (%d) outside table size (%d)", newState, size)
+                    +
                     +		case errorNewStateNoBits:
                     +			newState := decSymbol(ctx.errParam1)
                     +			oldState := decSymbol(ctx.errParam2)
                     +			return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, oldState)
+                    +
                     +		default:
                     +			return fmt.Errorf("buildDtable_asm returned unhandled nonzero code = %d", code)
                     +		}
                     +	}
                     +	return nil
                     +}

                     new file mode 100644
@@ -0,0 +1,127 @@
                     +// Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
+                    +
                     +//go:build !appengine && !noasm && gc && !noasm
                     +// +build !appengine,!noasm,gc,!noasm
+                    +
                     +// func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
                     +TEXT ·buildDtable_asm(SB), $0-24
                     +	MOVQ ctx+8(FP), CX
                     +	MOVQ s+0(FP), DI
+                    +
                     +	// Load values
                     +	MOVBQZX 4098(DI), DX
                     +	XORQ    AX, AX
                     +	BTSQ    DX, AX
                     +	MOVQ    (CX), BX
                     +	MOVQ    16(CX), SI
                     +	LEAQ    -1(AX), R8
                     +	MOVQ    8(CX), CX
                     +	MOVWQZX 4096(DI), DI
+                    +
                     +	// End load values
                     +	// Init, lay down lowprob symbols
                     +	XORQ R9, R9
                     +	JMP  init_main_loop_condition
+                    +
                     +init_main_loop:
                     +	MOVWQSX (CX)(R9*2), R10
                     +	CMPW    R10, $-1
                     +	JNE     do_not_update_high_threshold
                     +	MOVB    R9, 1(SI)(R8*8)
                     +	DECQ    R8
                     +	MOVQ    $0x0000000000000001, R10
+                    +
                     +do_not_update_high_threshold:
                     +	MOVW R10, (BX)(R9*2)
                     +	INCQ R9
+                    +
                     +init_main_loop_condition:
                     +	CMPQ R9, DI
                     +	JL   init_main_loop
+                    +
                     +	// Spread symbols
                     +	// Calculate table step
                     +	MOVQ AX, R9
                     +	SHRQ $0x01, R9
                     +	MOVQ AX, R10
                     +	SHRQ $0x03, R10
                     +	LEAQ 3(R9)(R10*1), R9
+                    +
                     +	// Fill add bits values
                     +	LEAQ -1(AX), R10
                     +	XORQ R11, R11
                     +	XORQ R12, R12
                     +	JMP  spread_main_loop_condition
+                    +
                     +spread_main_loop:
                     +	XORQ    R13, R13
                     +	MOVWQSX (CX)(R12*2), R14
                     +	JMP     spread_inner_loop_condition
+                    +
                     +spread_inner_loop:
                     +	MOVB R12, 1(SI)(R11*8)
+                    +
                     +adjust_position:
                     +	ADDQ R9, R11
                     +	ANDQ R10, R11
                     +	CMPQ R11, R8
                     +	JG   adjust_position
                     +	INCQ R13
+                    +
                     +spread_inner_loop_condition:
                     +	CMPQ R13, R14
                     +	JL   spread_inner_loop
                     +	INCQ R12
+                    +
                     +spread_main_loop_condition:
                     +	CMPQ  R12, DI
                     +	JL    spread_main_loop
                     +	TESTQ R11, R11
                     +	JZ    spread_check_ok
                     +	MOVQ  ctx+8(FP), AX
                     +	MOVQ  R11, 24(AX)
                     +	MOVQ  $+1, ret+16(FP)
                     +	RET
+                    +
                     +spread_check_ok:
                     +	// Build Decoding table
                     +	XORQ DI, DI
+                    +
                     +build_table_main_table:
                     +	MOVBQZX 1(SI)(DI*8), CX
                     +	MOVWQZX (BX)(CX*2), R8
                     +	LEAQ    1(R8), R9
                     +	MOVW    R9, (BX)(CX*2)
                     +	MOVQ    R8, R9
                     +	BSRQ    R9, R9
                     +	MOVQ    DX, CX
                     +	SUBQ    R9, CX
                     +	SHLQ    CL, R8
                     +	SUBQ    AX, R8
                     +	MOVB    CL, (SI)(DI*8)
                     +	MOVW    R8, 2(SI)(DI*8)
                     +	CMPQ    R8, AX
                     +	JLE     build_table_check1_ok
                     +	MOVQ    ctx+8(FP), CX
                     +	MOVQ    R8, 24(CX)
                     +	MOVQ    AX, 32(CX)
                     +	MOVQ    $+2, ret+16(FP)
                     +	RET
+                    +
                     +build_table_check1_ok:
                     +	TESTB CL, CL
                     +	JNZ   build_table_check2_ok
                     +	CMPW  R8, DI
                     +	JNE   build_table_check2_ok
                     +	MOVQ  ctx+8(FP), AX
                     +	MOVQ  R8, 24(AX)
                     +	MOVQ  DI, 32(AX)
                     +	MOVQ  $+3, ret+16(FP)
                     +	RET
+                    +
                     +build_table_check2_ok:
                     +	INCQ DI
                     +	CMPQ DI, AX
                     +	JL   build_table_main_table
                     +	MOVQ $+0, ret+16(FP)
                     +	RET

@@ -76,21 +76,6 @@ func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
                      	s.clearCount = maxCount != 0
+                     }
                     -// prepare will prepare and allocate scratch tables used for both compression and decompression.
                     -func (s *fseEncoder) prepare() (*fseEncoder, error) {
                     -	if s == nil {
                     -		s = &fseEncoder{}
                     -	}
                     -	s.useRLE = false
                     -	if s.clearCount && s.maxCount == 0 {
                     -		for i := range s.count {
                     -			s.count[i] = 0
                     -		}
                     -		s.clearCount = false
                     -	}
                     -	return s, nil
                     -}
+                    -
                      // allocCtable will allocate tables needed for compression.
                      // If existing tables are big enough, they are simply re-used.
                      func (s *fseEncoder) allocCtable() {
@@ -709,14 +694,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
                      	c.state = c.stateTable[lu]
+                     }
                     -// encode the output symbol provided and write it to the bitstream.
                     -func (c *cState) encode(symbolTT symbolTransform) {
                     -	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
                     -	dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
                     -	c.bw.addBits16NC(c.state, uint8(nbBitsOut))
                     -	c.state = c.stateTable[dstState]
                     -}
+                    -
                      // flush will write the tablelog to the output and flush the remaining full bytes.
                      func (c *cState) flush(tableLog uint8) {
                      	c.bw.flush32()