Signed-off-by: Jintao Zhang <zhangjintao9020@gmail.com>
Jintao Zhang authored on 2020/07/31 01:39:59
@@ -66,7 +66,7 @@ github.com/ugorji/go b4c50a2b199d93b13dc15e78929c
 github.com/hashicorp/consul 9a9cc9341bb487651a0399e3fc5e1e8a42e62dd9 # v0.5.2
 github.com/miekg/dns 6c0c4e6581f8e173cc562c8b3363ab984e4ae071 # v1.1.27
 github.com/ishidawataru/sctp 6e2cb1366111dcf547c13531e3a263a067715847
-go.etcd.io/bbolt a0458a2b35708eef59eb5f620ceb3cd1c01a824d # v1.3.3
+go.etcd.io/bbolt 232d8fc87f50244f9c808f4745759e08a304c029 # v1.3.5

 # get graph and distribution packages
 github.com/docker/distribution 0d3efadf0154c2b8a4e7b6621fff9809655cc580
...
@@ -152,11 +152,12 @@ are not thread safe. To work with data in multiple goroutines you must start
 a transaction for each one or use locking to ensure only one goroutine accesses
 a transaction at a time. Creating transaction from the `DB` is thread safe.

-Read-only transactions and read-write transactions should not depend on one
-another and generally shouldn't be opened simultaneously in the same goroutine.
-This can cause a deadlock as the read-write transaction needs to periodically
-re-map the data file but it cannot do so while a read-only transaction is open.
-
+Transactions should not depend on one another and generally shouldn't be opened
+simultaneously in the same goroutine. This can cause a deadlock as the read-write
+transaction needs to periodically re-map the data file but it cannot do so while
+any read-only transaction is open. Even a nested read-only transaction can cause
+a deadlock, as the child transaction can block the parent transaction from releasing
+its resources.

 #### Read-write transactions

...
@@ -275,7 +276,7 @@ should be writable.
 ### Using buckets

 Buckets are collections of key/value pairs within the database. All keys in a
-bucket must be unique. You can create a bucket using the `DB.CreateBucket()`
+bucket must be unique. You can create a bucket using the `Tx.CreateBucket()`
 function:

 ```go
@@ -923,6 +924,7 @@ Below is a list of public, open source projects that use Bolt:
 * [GoWebApp](https://github.com/josephspurrier/gowebapp) - A basic MVC web application in Go using BoltDB.
 * [GoShort](https://github.com/pankajkhairnar/goShort) - GoShort is a URL shortener written in Golang and BoltDB for persistent key/value storage and for routing it's using high performent HTTPRouter.
 * [gopherpit](https://github.com/gopherpit/gopherpit) - A web service to manage Go remote import paths with custom domains
+* [gokv](https://github.com/philippgille/gokv) - Simple key-value store abstraction and implementations for Go (Redis, Consul, etcd, bbolt, BadgerDB, LevelDB, Memcached, DynamoDB, S3, PostgreSQL, MongoDB, CockroachDB and many more)
 * [Gitchain](https://github.com/gitchain/gitchain) - Decentralized, peer-to-peer Git repositories aka "Git meets Bitcoin".
 * [InfluxDB](https://influxdata.com) - Scalable datastore for metrics, events, and real-time analytics.
 * [ipLocator](https://github.com/AndreasBriese/ipLocator) - A fast ip-geo-location-server using bolt with bloom filters.
...
@@ -935,6 +937,7 @@ Below is a list of public, open source projects that use Bolt:
 * [mbuckets](https://github.com/abhigupta912/mbuckets) - A Bolt wrapper that allows easy operations on multi level (nested) buckets.
 * [MetricBase](https://github.com/msiebuhr/MetricBase) - Single-binary version of Graphite.
 * [MuLiFS](https://github.com/dankomiocevic/mulifs) - Music Library Filesystem creates a filesystem to organise your music files.
+* [NATS](https://github.com/nats-io/nats-streaming-server) - NATS Streaming uses bbolt for message and metadata storage.
 * [Operation Go: A Routine Mission](http://gocode.io) - An online programming game for Golang using Bolt for user accounts and a leaderboard.
 * [photosite/session](https://godoc.org/bitbucket.org/kardianos/photosite/session) - Sessions for a photo viewing site.
 * [Prometheus Annotation Server](https://github.com/oliver006/prom_annotation_server) - Annotation server for PromDash & Prometheus service monitoring system.
...
@@ -1,28 +1,7 @@
 package bbolt

-import "unsafe"
-
 // maxMapSize represents the largest mmap size supported by Bolt.
 const maxMapSize = 0x7FFFFFFF // 2GB

 // maxAllocSize is the size used when creating array pointers.
 const maxAllocSize = 0xFFFFFFF
-
-// Are unaligned load/stores broken on this arch?
-var brokenUnaligned bool
-
-func init() {
-	// Simple check to see whether this arch handles unaligned load/stores
-	// correctly.
-
-	// ARM9 and older devices require load/stores to be from/to aligned
-	// addresses. If not, the lower 2 bits are cleared and that address is
-	// read in a jumbled up order.
-
-	// See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka15414.html
-
-	raw := [6]byte{0xfe, 0xef, 0x11, 0x22, 0x22, 0x11}
-	val := *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&raw)) + 2))
-
-	brokenUnaligned = val != 0x11222211
-}
...
new file mode 100644
@@ -0,0 +1,90 @@
+// +build aix
+
+package bbolt
+
+import (
+	"fmt"
+	"syscall"
+	"time"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// flock acquires an advisory lock on a file descriptor.
+func flock(db *DB, exclusive bool, timeout time.Duration) error {
+	var t time.Time
+	if timeout != 0 {
+		t = time.Now()
+	}
+	fd := db.file.Fd()
+	var lockType int16
+	if exclusive {
+		lockType = syscall.F_WRLCK
+	} else {
+		lockType = syscall.F_RDLCK
+	}
+	for {
+		// Attempt to obtain an exclusive lock.
+		lock := syscall.Flock_t{Type: lockType}
+		err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock)
+		if err == nil {
+			return nil
+		} else if err != syscall.EAGAIN {
+			return err
+		}
+
+		// If we timed out then return an error.
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
+			return ErrTimeout
+		}
+
+		// Wait for a bit and try again.
+		time.Sleep(flockRetryTimeout)
+	}
+}
+
+// funlock releases an advisory lock on a file descriptor.
+func funlock(db *DB) error {
+	var lock syscall.Flock_t
+	lock.Start = 0
+	lock.Len = 0
+	lock.Type = syscall.F_UNLCK
+	lock.Whence = 0
+	return syscall.FcntlFlock(uintptr(db.file.Fd()), syscall.F_SETLK, &lock)
+}
+
+// mmap memory maps a DB's data file.
+func mmap(db *DB, sz int) error {
+	// Map the data file to memory.
+	b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
+	if err != nil {
+		return err
+	}
+
+	// Advise the kernel that the mmap is accessed randomly.
+	if err := unix.Madvise(b, syscall.MADV_RANDOM); err != nil {
+		return fmt.Errorf("madvise: %s", err)
+	}
+
+	// Save the original byte slice and convert to a byte array pointer.
+	db.dataref = b
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
+	db.datasz = sz
+	return nil
+}
+
+// munmap unmaps a DB's data file from memory.
+func munmap(db *DB) error {
+	// Ignore the unmap if we have no mapped data.
+	if db.dataref == nil {
+		return nil
+	}
+
+	// Unmap using the original byte slice.
+	err := unix.Munmap(db.dataref)
+	db.dataref = nil
+	db.data = nil
+	db.datasz = 0
+	return err
+}
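The retry loop in flock above is what backs the open timeout: it re-tries every flockRetryTimeout until the deadline, then returns ErrTimeout. A small usage sketch (file name illustrative):

```go
package main

import (
	"log"
	"time"

	bolt "go.etcd.io/bbolt"
)

func main() {
	// If another process holds the file lock, Open keeps retrying the
	// flock loop until the timeout elapses, then returns bolt.ErrTimeout.
	db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: time.Second})
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}
```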
...
@@ -123,10 +123,12 @@ func (b *Bucket) Bucket(name []byte) *Bucket {
 func (b *Bucket) openBucket(value []byte) *Bucket {
 	var child = newBucket(b.tx)

-	// If unaligned load/stores are broken on this arch and value is
-	// unaligned simply clone to an aligned byte array.
-	unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0
-
+	// Unaligned access requires a copy to be made.
+	const unalignedMask = unsafe.Alignof(struct {
+		bucket
+		page
+	}{}) - 1
+	unaligned := uintptr(unsafe.Pointer(&value[0]))&unalignedMask != 0
 	if unaligned {
 		value = cloneBytes(value)
 	}
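unsafe.Alignof makes the mask a compile-time constant, replacing the removed runtime brokenUnaligned probe from bolt_arm.go. A standalone sketch of the same check, with uint64 standing in for the internal bucket/page pair:

```go
package main

import (
	"fmt"
	"unsafe"
)

func main() {
	// One less than the type's alignment; any of these low bits set in
	// an address means a load of that type there would be unaligned.
	const unalignedMask = unsafe.Alignof(uint64(0)) - 1

	buf := make([]byte, 16)
	for i := 0; i < 4; i++ {
		p := uintptr(unsafe.Pointer(&buf[i]))
		fmt.Printf("&buf[%d]: unaligned=%v\n", i, p&unalignedMask != 0)
	}
}
```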
...
@@ -206,7 +208,7 @@ func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
 }

 // DeleteBucket deletes a bucket at the given key.
-// Returns an error if the bucket does not exists, or if the key represents a non-bucket value.
+// Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
 func (b *Bucket) DeleteBucket(key []byte) error {
 	if b.tx.db == nil {
 		return ErrTxClosed
...
@@ -228,7 +230,7 @@ func (b *Bucket) DeleteBucket(key []byte) error {
 	// Recursively delete all child buckets.
 	child := b.Bucket(key)
 	err := child.ForEach(func(k, v []byte) error {
-		if v == nil {
+		if _, _, childFlags := child.Cursor().seek(k); (childFlags & bucketLeafFlag) != 0 {
 			if err := child.DeleteBucket(k); err != nil {
 				return fmt.Errorf("delete bucket: %s", err)
 			}
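For reference, a usage sketch of the public entry point this touches (bucket name illustrative); the patched loop above now identifies sub-buckets by bucketLeafFlag on the element rather than by a nil value:

```go
package example

import bolt "go.etcd.io/bbolt"

// deleteWidgets removes the "widgets" bucket and, recursively, every
// nested sub-bucket it contains.
func deleteWidgets(db *bolt.DB) error {
	return db.Update(func(tx *bolt.Tx) error {
		return tx.DeleteBucket([]byte("widgets"))
	})
}
```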
...
@@ -409,7 +411,7 @@ func (b *Bucket) Stats() BucketStats {

			if p.count != 0 {
				// If page has any elements, add all element headers.
-				used += leafPageElementSize * int(p.count-1)
+				used += leafPageElementSize * uintptr(p.count-1)

				// Add all element key, value sizes.
				// The computation takes advantage of the fact that the position
...
@@ -417,16 +419,16 @@ func (b *Bucket) Stats() BucketStats {
				// of all previous elements' keys and values.
				// It also includes the last element's header.
				lastElement := p.leafPageElement(p.count - 1)
-				used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
+				used += uintptr(lastElement.pos + lastElement.ksize + lastElement.vsize)
			}

			if b.root == 0 {
				// For inlined bucket just update the inline stats
-				s.InlineBucketInuse += used
+				s.InlineBucketInuse += int(used)
			} else {
				// For non-inlined bucket update all the leaf stats
				s.LeafPageN++
-				s.LeafInuse += used
+				s.LeafInuse += int(used)
				s.LeafOverflowN += int(p.overflow)

				// Collect stats from sub-buckets.
...
@@ -447,13 +449,13 @@ func (b *Bucket) Stats() BucketStats {

			// used totals the used bytes for the page
			// Add header and all element headers.
-			used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
+			used := pageHeaderSize + (branchPageElementSize * uintptr(p.count-1))

			// Add size of all keys and values.
			// Again, use the fact that last element's position equals to
			// the total of key, value sizes of all previous elements.
-			used += int(lastElement.pos + lastElement.ksize)
-			s.BranchInuse += used
+			used += uintptr(lastElement.pos + lastElement.ksize)
+			s.BranchInuse += int(used)
			s.BranchOverflowN += int(p.overflow)
		}

...
@@ -593,7 +595,7 @@ func (b *Bucket) inlineable() bool {
	// our threshold for inline bucket size.
	var size = pageHeaderSize
	for _, inode := range n.inodes {
-		size += leafPageElementSize + len(inode.key) + len(inode.value)
+		size += leafPageElementSize + uintptr(len(inode.key)) + uintptr(len(inode.value))

		if inode.flags&bucketLeafFlag != 0 {
			return false
...
@@ -606,8 +608,8 @@ func (b *Bucket) inlineable() bool {
 }

 // Returns the maximum total size of a bucket to make it a candidate for inlining.
-func (b *Bucket) maxInlineBucketSize() int {
-	return b.tx.db.pageSize / 4
+func (b *Bucket) maxInlineBucketSize() uintptr {
+	return uintptr(b.tx.db.pageSize / 4)
 }

 // write allocates and writes a bucket to a byte slice.
...
@@ -366,7 +366,7 @@ func (c *Cursor) node() *node {
	}
	for _, ref := range c.stack[:len(c.stack)-1] {
		_assert(!n.isLeaf, "expected branch node")
-		n = n.childAt(int(ref.index))
+		n = n.childAt(ref.index)
	}
	_assert(n.isLeaf, "expected leaf node")
	return n
...
@@ -206,12 +206,12 @@ func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
	}

	// Open data file and separate sync handler for metadata writes.
-	db.path = path
	var err error
-	if db.file, err = db.openFile(db.path, flag|os.O_CREATE, mode); err != nil {
+	if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
		_ = db.close()
		return nil, err
	}
+	db.path = db.file.Name()

	// Lock file so that other processes using Bolt in read-write mode cannot
	// use the database at the same time. This would cause corruption since
...
@@ -71,7 +71,7 @@ func (f *freelist) size() int {
		// The first element will be used to store the count. See freelist.write.
		n++
	}
-	return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * n)
+	return int(pageHeaderSize) + (int(unsafe.Sizeof(pgid(0))) * n)
 }

 // count returns count of pages on the freelist
...
@@ -93,7 +93,7 @@ func (f *freelist) pending_count() int {
	return count
 }

-// copyall copies into dst a list of all free ids and all pending ids in one sorted list.
+// copyall copies a list of all free ids and all pending ids in one sorted list.
 // f.count returns the minimum length required for dst.
 func (f *freelist) copyall(dst []pgid) {
	m := make(pgids, 0, f.pending_count())
...
@@ -267,17 +267,23 @@ func (f *freelist) read(p *page) {
	}
	// If the page.count is at the max uint16 value (64k) then it's considered
	// an overflow and the size of the freelist is stored as the first element.
-	idx, count := 0, int(p.count)
+	var idx, count = 0, int(p.count)
	if count == 0xFFFF {
		idx = 1
-		count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
+		c := *(*pgid)(unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)))
+		count = int(c)
+		if count < 0 {
+			panic(fmt.Sprintf("leading element count %d overflows int", c))
+		}
	}

	// Copy the list of page ids from the freelist.
	if count == 0 {
		f.ids = nil
	} else {
-		ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx : idx+count]
+		var ids []pgid
+		data := unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p), unsafe.Sizeof(ids[0]), idx)
+		unsafeSlice(unsafe.Pointer(&ids), data, count)

		// copy the ids, so we don't modify on the freelist page directly
		idsCopy := make([]pgid, count)
...
@@ -310,16 +316,22 @@ func (f *freelist) write(p *page) error {

	// The page.count can only hold up to 64k elements so if we overflow that
	// number then we handle it by putting the size in the first element.
-	lenids := f.count()
-	if lenids == 0 {
-		p.count = uint16(lenids)
-	} else if lenids < 0xFFFF {
-		p.count = uint16(lenids)
-		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:])
+	l := f.count()
+	if l == 0 {
+		p.count = uint16(l)
+	} else if l < 0xFFFF {
+		p.count = uint16(l)
+		var ids []pgid
+		data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
+		unsafeSlice(unsafe.Pointer(&ids), data, l)
+		f.copyall(ids)
	} else {
		p.count = 0xFFFF
-		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids)
-		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:])
+		var ids []pgid
+		data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
+		unsafeSlice(unsafe.Pointer(&ids), data, l+1)
+		ids[0] = pgid(l)
+		f.copyall(ids[1:])
	}

	return nil
...
@@ -41,19 +41,19 @@ func (n *node) size() int {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
-		sz += elsz + len(item.key) + len(item.value)
+		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
	}
-	return sz
+	return int(sz)
 }

 // sizeLessThan returns true if the node is less than a given size.
 // This is an optimization to avoid calculating a large node when we only need
 // to know if it fits inside a certain page size.
-func (n *node) sizeLessThan(v int) bool {
+func (n *node) sizeLessThan(v uintptr) bool {
	sz, elsz := pageHeaderSize, n.pageElementSize()
	for i := 0; i < len(n.inodes); i++ {
		item := &n.inodes[i]
-		sz += elsz + len(item.key) + len(item.value)
+		sz += elsz + uintptr(len(item.key)) + uintptr(len(item.value))
		if sz >= v {
			return false
		}
...
@@ -62,7 +62,7 @@ func (n *node) sizeLessThan(v int) bool {
 }

 // pageElementSize returns the size of each page element based on the type of node.
-func (n *node) pageElementSize() int {
+func (n *node) pageElementSize() uintptr {
	if n.isLeaf {
		return leafPageElementSize
	}
...
@@ -207,10 +207,17 @@ func (n *node) write(p *page) {
	}

	// Loop over each item and write it to the page.
-	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
+	// off tracks the offset into p of the start of the next data.
+	off := unsafe.Sizeof(*p) + n.pageElementSize()*uintptr(len(n.inodes))
	for i, item := range n.inodes {
		_assert(len(item.key) > 0, "write: zero-length inode key")

+		// Create a slice to write into of needed size and advance
+		// byte pointer for next iteration.
+		sz := len(item.key) + len(item.value)
+		b := unsafeByteSlice(unsafe.Pointer(p), off, 0, sz)
+		off += uintptr(sz)
+
		// Write the page element.
		if n.isLeaf {
			elem := p.leafPageElement(uint16(i))
...
@@ -226,20 +233,9 @@ func (n *node) write(p *page) {
			_assert(elem.pgid != p.id, "write: circular dependency occurred")
		}

-		// If the length of key+value is larger than the max allocation size
-		// then we need to reallocate the byte array pointer.
-		//
-		// See: https://github.com/boltdb/bolt/pull/335
-		klen, vlen := len(item.key), len(item.value)
-		if len(b) < klen+vlen {
-			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
-		}
-
		// Write data for the element to the end of the page.
-		copy(b[0:], item.key)
-		b = b[klen:]
-		copy(b[0:], item.value)
-		b = b[vlen:]
+		l := copy(b, item.key)
+		copy(b[l:], item.value)
	}

	// DEBUG ONLY: n.dump()
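The rewritten loop slices the page once per element and advances an offset, instead of re-slicing a fixed-size array pointer. The same copy pattern in isolation, with a plain []byte standing in for mmap'd page memory:

```go
package main

import "fmt"

func main() {
	type inode struct{ key, value []byte }
	inodes := []inode{
		{[]byte("k1"), []byte("v1")},
		{[]byte("k2"), []byte("value2")},
	}

	buf := make([]byte, 64) // stands in for the page's data area
	off := 0                // start of the next element's data
	for _, item := range inodes {
		// Slice out exactly the room this element needs, then advance
		// the offset -- mirroring how node.write advances `off`.
		sz := len(item.key) + len(item.value)
		b := buf[off : off+sz]
		l := copy(b, item.key)
		copy(b[l:], item.value)
		off += sz
	}
	fmt.Printf("%q\n", buf[:off]) // "k1v1k2value2"
}
```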
...
@@ -247,7 +243,7 @@ func (n *node) write(p *page) {

 // split breaks up a node into multiple smaller nodes, if appropriate.
 // This should only be called from the spill() function.
-func (n *node) split(pageSize int) []*node {
+func (n *node) split(pageSize uintptr) []*node {
	var nodes []*node

	node := n
...
@@ -270,7 +266,7 @@ func (n *node) split(pageSize int) []*node {

 // splitTwo breaks up a node into two smaller nodes, if appropriate.
 // This should only be called from the split() function.
-func (n *node) splitTwo(pageSize int) (*node, *node) {
+func (n *node) splitTwo(pageSize uintptr) (*node, *node) {
	// Ignore the split if the page doesn't have at least enough nodes for
	// two pages or if the nodes can fit in a single page.
	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
...
@@ -312,18 +308,18 @@ func (n *node) splitTwo(pageSize int) (*node, *node) {
 // splitIndex finds the position where a page will fill a given threshold.
 // It returns the index as well as the size of the first page.
 // This is only be called from split().
-func (n *node) splitIndex(threshold int) (index, sz int) {
+func (n *node) splitIndex(threshold int) (index, sz uintptr) {
	sz = pageHeaderSize

	// Loop until we only have the minimum number of keys required for the second page.
	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
-		index = i
+		index = uintptr(i)
		inode := n.inodes[i]
-		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
+		elsize := n.pageElementSize() + uintptr(len(inode.key)) + uintptr(len(inode.value))

		// If we have at least the minimum number of keys and adding another
		// node would put us over the threshold then exit and return.
-		if i >= minKeysPerPage && sz+elsize > threshold {
+		if index >= minKeysPerPage && sz+elsize > uintptr(threshold) {
			break
		}

...
@@ -356,7 +352,7 @@ func (n *node) spill() error {
	n.children = nil

	// Split nodes into appropriate sizes. The first node will always be n.
-	var nodes = n.split(tx.db.pageSize)
+	var nodes = n.split(uintptr(tx.db.pageSize))
	for _, node := range nodes {
		// Add node's page to the freelist if it's not new.
		if node.pgid > 0 {
...
@@ -587,9 +583,11 @@ func (n *node) dump() {

 type nodes []*node

-func (s nodes) Len() int      { return len(s) }
-func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
-func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
+func (s nodes) Len() int      { return len(s) }
+func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+func (s nodes) Less(i, j int) bool {
+	return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1
+}

 // inode represents an internal node inside of a node.
 // It can be used to point to elements in a page or point
...
@@ -7,12 +7,12 @@ import (
	"unsafe"
 )

-const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
+const pageHeaderSize = unsafe.Sizeof(page{})

 const minKeysPerPage = 2

-const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
-const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))
+const branchPageElementSize = unsafe.Sizeof(branchPageElement{})
+const leafPageElementSize = unsafe.Sizeof(leafPageElement{})

 const (
	branchPageFlag = 0x01
...
32 | 32 |
flags uint16 |
33 | 33 |
count uint16 |
34 | 34 |
overflow uint32 |
35 |
- ptr uintptr |
|
36 | 35 |
} |
37 | 36 |
|
38 | 37 |
// typ returns a human readable page type string used for debugging. |
... | ... |
@@ -51,13 +50,13 @@ func (p *page) typ() string {

 // meta returns a pointer to the metadata section of the page.
 func (p *page) meta() *meta {
-	return (*meta)(unsafe.Pointer(&p.ptr))
+	return (*meta)(unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p)))
 }

 // leafPageElement retrieves the leaf node by index
 func (p *page) leafPageElement(index uint16) *leafPageElement {
-	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
-	return n
+	return (*leafPageElement)(unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p),
+		leafPageElementSize, int(index)))
 }

 // leafPageElements retrieves a list of leaf nodes.
...
@@ -65,12 +64,16 @@ func (p *page) leafPageElements() []leafPageElement {
	if p.count == 0 {
		return nil
	}
-	return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:]
+	var elems []leafPageElement
+	data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
+	unsafeSlice(unsafe.Pointer(&elems), data, int(p.count))
+	return elems
 }

 // branchPageElement retrieves the branch node by index
 func (p *page) branchPageElement(index uint16) *branchPageElement {
-	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
+	return (*branchPageElement)(unsafeIndex(unsafe.Pointer(p), unsafe.Sizeof(*p),
+		unsafe.Sizeof(branchPageElement{}), int(index)))
 }

 // branchPageElements retrieves a list of branch nodes.
...
@@ -78,12 +81,15 @@ func (p *page) branchPageElements() []branchPageElement {
	if p.count == 0 {
		return nil
	}
-	return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:]
+	var elems []branchPageElement
+	data := unsafeAdd(unsafe.Pointer(p), unsafe.Sizeof(*p))
+	unsafeSlice(unsafe.Pointer(&elems), data, int(p.count))
+	return elems
 }

 // dump writes n bytes of the page to STDERR as hex output.
 func (p *page) hexdump(n int) {
-	buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
+	buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, n)
	fmt.Fprintf(os.Stderr, "%x\n", buf)
 }

...
@@ -102,8 +108,7 @@ type branchPageElement struct {

 // key returns a byte slice of the node key.
 func (n *branchPageElement) key() []byte {
-	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
-	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize]
+	return unsafeByteSlice(unsafe.Pointer(n), 0, int(n.pos), int(n.pos)+int(n.ksize))
 }

 // leafPageElement represents a node on a leaf page.
...
@@ -116,14 +121,16 @@ type leafPageElement struct {

 // key returns a byte slice of the node key.
 func (n *leafPageElement) key() []byte {
-	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
-	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize:n.ksize]
+	i := int(n.pos)
+	j := i + int(n.ksize)
+	return unsafeByteSlice(unsafe.Pointer(n), 0, i, j)
 }

 // value returns a byte slice of the node value.
 func (n *leafPageElement) value() []byte {
-	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
-	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos+n.ksize]))[:n.vsize:n.vsize]
+	i := int(n.pos) + int(n.ksize)
+	j := i + int(n.vsize)
+	return unsafeByteSlice(unsafe.Pointer(n), 0, i, j)
 }

 // PageInfo represents human readable information about a page.
...
@@ -523,20 +523,18 @@ func (tx *Tx) write() error {

	// Write pages to disk in order.
	for _, p := range pages {
-		size := (int(p.overflow) + 1) * tx.db.pageSize
+		rem := (uint64(p.overflow) + 1) * uint64(tx.db.pageSize)
		offset := int64(p.id) * int64(tx.db.pageSize)
+		var written uintptr

		// Write out page in "max allocation" sized chunks.
-		ptr := (*[maxAllocSize]byte)(unsafe.Pointer(p))
		for {
-			// Limit our write to our max allocation size.
-			sz := size
+			sz := rem
			if sz > maxAllocSize-1 {
				sz = maxAllocSize - 1
			}
+			buf := unsafeByteSlice(unsafe.Pointer(p), written, 0, int(sz))

-			// Write chunk to disk.
-			buf := ptr[:sz]
			if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
				return err
			}
...
@@ -545,14 +543,14 @@ func (tx *Tx) write() error {
			tx.stats.Write++

			// Exit inner for loop if we've written all the chunks.
-			size -= sz
-			if size == 0 {
+			rem -= sz
+			if rem == 0 {
				break
			}

			// Otherwise move offset forward and move pointer to next chunk.
			offset += int64(sz)
-			ptr = (*[maxAllocSize]byte)(unsafe.Pointer(&ptr[sz]))
+			written += uintptr(sz)
		}
	}

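The chunking arithmetic is easier to follow with small numbers; a sketch with maxAllocSize shrunk to 4 for illustration (bbolt's real value is 0xFFFFFFF):

```go
package main

import "fmt"

func main() {
	const maxAllocSize = 4 // shrunk for illustration
	rem := uint64(10)      // total bytes left to write for this page
	offset := int64(0)     // file offset of the next chunk
	var written uintptr    // bytes of the page already written

	for {
		sz := rem
		if sz > maxAllocSize-1 {
			sz = maxAllocSize - 1
		}
		// Emits chunks of 3, 3, 3, 1 bytes for this example.
		fmt.Printf("write %d bytes from page offset %d at file offset %d\n",
			sz, written, offset)

		rem -= sz
		if rem == 0 {
			break
		}
		offset += int64(sz)
		written += uintptr(sz)
	}
}
```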
...
@@ -571,7 +569,7 @@ func (tx *Tx) write() error {
			continue
		}

-		buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:tx.db.pageSize]
+		buf := unsafeByteSlice(unsafe.Pointer(p), 0, 0, tx.db.pageSize)

		// See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1
		for i := range buf {
...
new file mode 100644
@@ -0,0 +1,39 @@
+package bbolt
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+func unsafeAdd(base unsafe.Pointer, offset uintptr) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(base) + offset)
+}
+
+func unsafeIndex(base unsafe.Pointer, offset uintptr, elemsz uintptr, n int) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(base) + offset + uintptr(n)*elemsz)
+}
+
+func unsafeByteSlice(base unsafe.Pointer, offset uintptr, i, j int) []byte {
+	// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	//
+	// This memory is not allocated from C, but it is unmanaged by Go's
+	// garbage collector and should behave similarly, and the compiler
+	// should produce similar code. Note that this conversion allows a
+	// subslice to begin after the base address, with an optional offset,
+	// while the URL above does not cover this case and only slices from
+	// index 0. However, the wiki never says that the address must be to
+	// the beginning of a C allocation (or even that malloc was used at
+	// all), so this is believed to be correct.
+	return (*[maxAllocSize]byte)(unsafeAdd(base, offset))[i:j:j]
+}
+
+// unsafeSlice modifies the data, len, and cap of a slice variable pointed to by
+// the slice parameter. This helper should be used over other direct
+// manipulation of reflect.SliceHeader to prevent misuse, namely, converting
+// from reflect.SliceHeader to a Go slice type.
+func unsafeSlice(slice, data unsafe.Pointer, len int) {
+	s := (*reflect.SliceHeader)(slice)
+	s.Data = uintptr(data)
+	s.Cap = len
+	s.Len = len
+}
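A standalone sketch of how these helpers combine (a toy header type stands in for bbolt's page struct; the helpers are copied from the file above):

```go
package main

import (
	"fmt"
	"reflect"
	"unsafe"
)

// header stands in for bbolt's page struct: fixed-size metadata that is
// immediately followed in memory by an array of elements.
type header struct {
	id    uint64
	count uint16
}

func unsafeAdd(base unsafe.Pointer, offset uintptr) unsafe.Pointer {
	return unsafe.Pointer(uintptr(base) + offset)
}

func unsafeSlice(slice, data unsafe.Pointer, len int) {
	s := (*reflect.SliceHeader)(slice)
	s.Data = uintptr(data)
	s.Cap = len
	s.Len = len
}

func main() {
	// Lay out a header plus three uint64 elements in one allocation.
	var raw struct {
		h     header
		elems [3]uint64
	}
	raw.h.count = 3

	// Overlay a []uint64 on the memory right behind the header, the same
	// way the patched freelist.read overlays a []pgid behind a *page.
	var elems []uint64
	data := unsafeAdd(unsafe.Pointer(&raw), unsafe.Sizeof(raw.h))
	unsafeSlice(unsafe.Pointer(&elems), data, int(raw.h.count))

	elems[0], elems[1], elems[2] = 10, 20, 30
	fmt.Println(elems) // [10 20 30]
}
```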