Browse code

Vendor libkv @ 458977154600b9f23984d9f4b82e79570b5ae12b

Signed-off-by: John Howard <jhoward@microsoft.com>

This also adds go.etcd.io/bbolt as boltdb/bolt is no longer
maintained, and we need https://github.com/etcd-io/bbolt/pull/122 which
was merged in https://github.com/etcd-io/bbolt/releases/tag/v1.3.1-etcd.8
in order to fix https://github.com/docker/libnetwork/issues/1950.

Note that I can't entirely remove boltdb/bolt as it is still used by
other components. Still need to work my way through them.... These include
containerd/containerd (https://github.com/containerd/containerd/pull/2634),
docker/swarmkit; moby/buildkit. And probably more....

John Howard authored on 2018/09/14 04:31:17
Showing 31 changed files
... ...
@@ -47,7 +47,7 @@ github.com/sean-/seed e2103e2c35297fb7e17febb81e49b312087a2372
47 47
 github.com/hashicorp/go-sockaddr 6d291a969b86c4b633730bfc6b8b9d64c3aafed9
48 48
 github.com/hashicorp/go-multierror fcdddc395df1ddf4247c69bd436e84cfa0733f7e
49 49
 github.com/hashicorp/serf 598c54895cc5a7b1a24a398d635e8c0ea0959870
50
-github.com/docker/libkv 1d8431073ae03cdaedb198a89722f3aab6d418ef
50
+github.com/docker/libkv 458977154600b9f23984d9f4b82e79570b5ae12b
51 51
 github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25
52 52
 github.com/vishvananda/netlink b2de5d10e38ecce8607e6b438b6d174f389a004e
53 53
 
... ...
@@ -63,6 +63,7 @@ github.com/hashicorp/consul v0.5.2
63 63
 github.com/boltdb/bolt fff57c100f4dea1905678da7e90d92429dff2904
64 64
 github.com/miekg/dns v1.0.7
65 65
 github.com/ishidawataru/sctp 07191f837fedd2f13d1ec7b5f885f0f3ec54b1cb
66
+go.etcd.io/bbolt v1.3.1-etcd.8
66 67
 
67 68
 # get graph and distribution packages
68 69
 github.com/docker/distribution 83389a148052d74ac602f5f1d62f86ff2f3c4aa5
... ...
@@ -90,7 +90,7 @@ Calls like `WatchTree` may return different events (or number of events) dependi
90 90
 
91 91
 Only `Consul` and `etcd` have support for TLS and you should build and provide your own `config.TLS` object to feed the client. Support is planned for `zookeeper`.
92 92
 
93
-##Roadmap
93
+## Roadmap
94 94
 
95 95
 - Make the API nicer to use (using `options`)
96 96
 - Provide more options (`consistency` for example)
... ...
@@ -98,10 +98,10 @@ Only `Consul` and `etcd` have support for TLS and you should build and provide y
98 98
 - Better key formatting
99 99
 - New backends?
100 100
 
101
-##Contributing
101
+## Contributing
102 102
 
103 103
 Want to hack on libkv? [Docker's contribution guidelines](https://github.com/docker/docker/blob/master/CONTRIBUTING.md) apply.
104 104
 
105
-##Copyright and license
105
+## Copyright and license
106 106
 
107 107
 Copyright © 2014-2016 Docker, Inc. All rights reserved, except as follows. Code is released under the Apache 2.0 license. The README.md file, and files in the "docs" folder are licensed under the Creative Commons Attribution 4.0 International License under the terms and conditions set forth in the file "LICENSE.docs". You may obtain a duplicate copy of the same license, titled CC-BY-SA-4.0, at http://creativecommons.org/licenses/by/4.0/.
... ...
@@ -10,9 +10,9 @@ import (
10 10
 	"sync/atomic"
11 11
 	"time"
12 12
 
13
-	"github.com/boltdb/bolt"
14 13
 	"github.com/docker/libkv"
15 14
 	"github.com/docker/libkv/store"
15
+	bolt "go.etcd.io/bbolt"
16 16
 )
17 17
 
18 18
 var (
... ...
@@ -1,6 +1,7 @@
1 1
 package etcd
2 2
 
3 3
 import (
4
+	"context"
4 5
 	"crypto/tls"
5 6
 	"errors"
6 7
 	"log"
... ...
@@ -9,8 +10,6 @@ import (
9 9
 	"strings"
10 10
 	"time"
11 11
 
12
-	"golang.org/x/net/context"
13
-
14 12
 	etcd "github.com/coreos/etcd/client"
15 13
 	"github.com/docker/libkv"
16 14
 	"github.com/docker/libkv/store"
... ...
@@ -30,13 +29,29 @@ type Etcd struct {
30 30
 }
31 31
 
32 32
 type etcdLock struct {
33
-	client    etcd.KeysAPI
34
-	stopLock  chan struct{}
33
+	client etcd.KeysAPI
34
+	key    string
35
+	value  string
36
+	ttl    time.Duration
37
+
38
+	// Closed when the caller wants to stop renewing the lock. I'm not sure
39
+	// why this is even used - you could just call the Unlock() method.
35 40
 	stopRenew chan struct{}
36
-	key       string
37
-	value     string
38
-	last      *etcd.Response
39
-	ttl       time.Duration
41
+	// When the lock is held, this is the last modified index of the key.
42
+	// Used for conditional updates when extending the lock TTL and when
43
+	// conditionally deleting when Unlock() is called.
44
+	lastIndex uint64
45
+	// When the lock is held, this function will cancel the locked context.
46
+	// This is called both by the Unlock() method in order to stop the
47
+	// background holding goroutine and in a deferred call in that background
48
+	// holding goroutine in case the lock is lost due to an error or the
49
+	// stopRenew channel is closed. Calling this function also closes the chan
50
+	// returned by the Lock() method.
51
+	cancel context.CancelFunc
52
+	// Used to sync the Unlock() call with the background holding goroutine.
53
+	// This channel is closed when that background goroutine exits, signalling
54
+	// that it is okay to conditionally delete the key.
55
+	doneHolding chan struct{}
40 56
 }
41 57
 
42 58
 const (
... ...
@@ -472,112 +487,97 @@ func (s *Etcd) NewLock(key string, options *store.LockOptions) (lock store.Locke
472 472
 // doing so. It returns a channel that is closed if our
473 473
 // lock is lost or if an error occurs
474 474
 func (l *etcdLock) Lock(stopChan chan struct{}) (<-chan struct{}, error) {
475
-
476
-	// Lock holder channel
477
-	lockHeld := make(chan struct{})
478
-	stopLocking := l.stopRenew
479
-
475
+	// Conditional Set - only if the key does not exist.
480 476
 	setOpts := &etcd.SetOptions{
481
-		TTL: l.ttl,
477
+		TTL:       l.ttl,
478
+		PrevExist: etcd.PrevNoExist,
482 479
 	}
483 480
 
484 481
 	for {
485
-		setOpts.PrevExist = etcd.PrevNoExist
486 482
 		resp, err := l.client.Set(context.Background(), l.key, l.value, setOpts)
487
-		if err != nil {
488
-			if etcdError, ok := err.(etcd.Error); ok {
489
-				if etcdError.Code != etcd.ErrorCodeNodeExist {
490
-					return nil, err
491
-				}
492
-				setOpts.PrevIndex = ^uint64(0)
493
-			}
494
-		} else {
495
-			setOpts.PrevIndex = resp.Node.ModifiedIndex
496
-		}
497
-
498
-		setOpts.PrevExist = etcd.PrevExist
499
-		l.last, err = l.client.Set(context.Background(), l.key, l.value, setOpts)
500
-
501 483
 		if err == nil {
502
-			// Leader section
503
-			l.stopLock = stopLocking
504
-			go l.holdLock(l.key, lockHeld, stopLocking)
505
-			break
506
-		} else {
507
-			// If this is a legitimate error, return
508
-			if etcdError, ok := err.(etcd.Error); ok {
509
-				if etcdError.Code != etcd.ErrorCodeTestFailed {
510
-					return nil, err
511
-				}
512
-			}
484
+			// Acquired the lock!
485
+			l.lastIndex = resp.Node.ModifiedIndex
486
+			lockedCtx, cancel := context.WithCancel(context.Background())
487
+			l.cancel = cancel
488
+			l.doneHolding = make(chan struct{})
513 489
 
514
-			// Seeker section
515
-			errorCh := make(chan error)
516
-			chWStop := make(chan bool)
517
-			free := make(chan bool)
490
+			go l.holdLock(lockedCtx)
518 491
 
519
-			go l.waitLock(l.key, errorCh, chWStop, free)
492
+			return lockedCtx.Done(), nil
493
+		}
520 494
 
521
-			// Wait for the key to be available or for
522
-			// a signal to stop trying to lock the key
523
-			select {
524
-			case <-free:
525
-				break
526
-			case err := <-errorCh:
527
-				return nil, err
528
-			case <-stopChan:
529
-				return nil, ErrAbortTryLock
530
-			}
495
+		etcdErr, ok := err.(etcd.Error)
496
+		if !ok || etcdErr.Code != etcd.ErrorCodeNodeExist {
497
+			return nil, err // Unexpected error.
498
+		}
531 499
 
532
-			// Delete or Expire event occurred
533
-			// Retry
500
+		// Need to wait for the lock key to expire or be deleted.
501
+		if err := l.waitLock(stopChan, etcdErr.Index); err != nil {
502
+			return nil, err
534 503
 		}
535
-	}
536 504
 
537
-	return lockHeld, nil
505
+		// Delete or Expire event occurred.
506
+		// Retry
507
+	}
538 508
 }
539 509
 
540
-// Hold the lock as long as we can
510
+// Hold the lock as long as we can.
541 511
 // Updates the key ttl periodically until we receive
542
-// an explicit stop signal from the Unlock method
543
-func (l *etcdLock) holdLock(key string, lockHeld chan struct{}, stopLocking <-chan struct{}) {
544
-	defer close(lockHeld)
512
+// an explicit stop signal from the Unlock method OR
513
+// the stopRenew channel is closed.
514
+func (l *etcdLock) holdLock(ctx context.Context) {
515
+	defer close(l.doneHolding)
516
+	defer l.cancel()
545 517
 
546 518
 	update := time.NewTicker(l.ttl / 3)
547 519
 	defer update.Stop()
548 520
 
549
-	var err error
550 521
 	setOpts := &etcd.SetOptions{TTL: l.ttl}
551 522
 
552 523
 	for {
553 524
 		select {
554 525
 		case <-update.C:
555
-			setOpts.PrevIndex = l.last.Node.ModifiedIndex
556
-			l.last, err = l.client.Set(context.Background(), key, l.value, setOpts)
526
+			setOpts.PrevIndex = l.lastIndex
527
+			resp, err := l.client.Set(ctx, l.key, l.value, setOpts)
557 528
 			if err != nil {
558 529
 				return
559 530
 			}
560
-
561
-		case <-stopLocking:
531
+			l.lastIndex = resp.Node.ModifiedIndex
532
+		case <-l.stopRenew:
533
+			return
534
+		case <-ctx.Done():
562 535
 			return
563 536
 		}
564 537
 	}
565 538
 }
566 539
 
567
-// WaitLock simply waits for the key to be available for creation
568
-func (l *etcdLock) waitLock(key string, errorCh chan error, stopWatchCh chan bool, free chan<- bool) {
569
-	opts := &etcd.WatcherOptions{Recursive: false}
570
-	watcher := l.client.Watcher(key, opts)
540
+// WaitLock simply waits for the key to be available for creation.
541
+func (l *etcdLock) waitLock(stopWait <-chan struct{}, afterIndex uint64) error {
542
+	waitCtx, waitCancel := context.WithCancel(context.Background())
543
+	defer waitCancel()
544
+	go func() {
545
+		select {
546
+		case <-stopWait:
547
+			// If the caller closes the stopWait, cancel the wait context.
548
+			waitCancel()
549
+		case <-waitCtx.Done():
550
+			// No longer waiting.
551
+		}
552
+	}()
571 553
 
554
+	watcher := l.client.Watcher(l.key, &etcd.WatcherOptions{AfterIndex: afterIndex})
572 555
 	for {
573
-		event, err := watcher.Next(context.Background())
556
+		event, err := watcher.Next(waitCtx)
574 557
 		if err != nil {
575
-			errorCh <- err
576
-			return
558
+			if err == context.Canceled {
559
+				return ErrAbortTryLock
560
+			}
561
+			return err
577 562
 		}
578
-		if event.Action == "delete" || event.Action == "expire" {
579
-			free <- true
580
-			return
563
+		switch event.Action {
564
+		case "delete", "compareAndDelete", "expire":
565
+			return nil // The key has been deleted or expired.
581 566
 		}
582 567
 	}
583 568
 }
... ...
@@ -585,19 +585,17 @@ func (l *etcdLock) waitLock(key string, errorCh chan error, stopWatchCh chan boo
585 585
 // Unlock the "key". Calling unlock while
586 586
 // not holding the lock will throw an error
587 587
 func (l *etcdLock) Unlock() error {
588
-	if l.stopLock != nil {
589
-		l.stopLock <- struct{}{}
590
-	}
591
-	if l.last != nil {
588
+	l.cancel()      // Will signal the holdLock goroutine to exit.
589
+	<-l.doneHolding // Wait for the holdLock goroutine to exit.
590
+
591
+	var err error
592
+	if l.lastIndex != 0 {
592 593
 		delOpts := &etcd.DeleteOptions{
593
-			PrevIndex: l.last.Node.ModifiedIndex,
594
-		}
595
-		_, err := l.client.Delete(context.Background(), l.key, delOpts)
596
-		if err != nil {
597
-			return err
594
+			PrevIndex: l.lastIndex,
598 595
 		}
596
+		_, err = l.client.Delete(context.Background(), l.key, delOpts)
599 597
 	}
600
-	return nil
598
+	return err
601 599
 }
602 600
 
603 601
 // Close closes the client connection
604 602
new file mode 100644
... ...
@@ -0,0 +1,20 @@
0
+The MIT License (MIT)
1
+
2
+Copyright (c) 2013 Ben Johnson
3
+
4
+Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+this software and associated documentation files (the "Software"), to deal in
6
+the Software without restriction, including without limitation the rights to
7
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+the Software, and to permit persons to whom the Software is furnished to do so,
9
+subject to the following conditions:
10
+
11
+The above copyright notice and this permission notice shall be included in all
12
+copies or substantial portions of the Software.
13
+
14
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
0 20
new file mode 100644
... ...
@@ -0,0 +1,953 @@
0
+bbolt
1
+=====
2
+
3
+[![Go Report Card](https://goreportcard.com/badge/github.com/etcd-io/bbolt?style=flat-square)](https://goreportcard.com/report/github.com/etcd-io/bbolt)
4
+[![Coverage](https://codecov.io/gh/etcd-io/bbolt/branch/master/graph/badge.svg)](https://codecov.io/gh/etcd-io/bbolt)
5
+[![Build Status Travis](https://img.shields.io/travis/etcd-io/bboltlabs.svg?style=flat-square&&branch=master)](https://travis-ci.com/etcd-io/bbolt)
6
+[![Godoc](http://img.shields.io/badge/go-documentation-blue.svg?style=flat-square)](https://godoc.org/github.com/etcd-io/bbolt)
7
+[![Releases](https://img.shields.io/github/release/etcd-io/bbolt/all.svg?style=flat-square)](https://github.com/etcd-io/bbolt/releases)
8
+[![LICENSE](https://img.shields.io/github/license/etcd-io/bbolt.svg?style=flat-square)](https://github.com/etcd-io/bbolt/blob/master/LICENSE)
9
+
10
+bbolt is a fork of [Ben Johnson's][gh_ben] [Bolt][bolt] key/value
11
+store. The purpose of this fork is to provide the Go community with an active
12
+maintenance and development target for Bolt; the goal is improved reliability
13
+and stability. bbolt includes bug fixes, performance enhancements, and features
14
+not found in Bolt while preserving backwards compatibility with the Bolt API.
15
+
16
+Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas]
17
+[LMDB project][lmdb]. The goal of the project is to provide a simple,
18
+fast, and reliable database for projects that don't require a full database
19
+server such as Postgres or MySQL.
20
+
21
+Since Bolt is meant to be used as such a low-level piece of functionality,
22
+simplicity is key. The API will be small and only focus on getting values
23
+and setting values. That's it.
24
+
25
+[gh_ben]: https://github.com/benbjohnson
26
+[bolt]: https://github.com/boltdb/bolt
27
+[hyc_symas]: https://twitter.com/hyc_symas
28
+[lmdb]: http://symas.com/mdb/
29
+
30
+## Project Status
31
+
32
+Bolt is stable, the API is fixed, and the file format is fixed. Full unit
33
+test coverage and randomized black box testing are used to ensure database
34
+consistency and thread safety. Bolt is currently used in high-load production
35
+environments serving databases as large as 1TB. Many companies such as
36
+Shopify and Heroku use Bolt-backed services every day.
37
+
38
+## Project versioning
39
+
40
+bbolt uses [semantic versioning](http://semver.org).
41
+API should not change between patch and minor releases.
42
+New minor versions may add additional features to the API.
43
+
44
+## Table of Contents
45
+
46
+  - [Getting Started](#getting-started)
47
+    - [Installing](#installing)
48
+    - [Opening a database](#opening-a-database)
49
+    - [Transactions](#transactions)
50
+      - [Read-write transactions](#read-write-transactions)
51
+      - [Read-only transactions](#read-only-transactions)
52
+      - [Batch read-write transactions](#batch-read-write-transactions)
53
+      - [Managing transactions manually](#managing-transactions-manually)
54
+    - [Using buckets](#using-buckets)
55
+    - [Using key/value pairs](#using-keyvalue-pairs)
56
+    - [Autoincrementing integer for the bucket](#autoincrementing-integer-for-the-bucket)
57
+    - [Iterating over keys](#iterating-over-keys)
58
+      - [Prefix scans](#prefix-scans)
59
+      - [Range scans](#range-scans)
60
+      - [ForEach()](#foreach)
61
+    - [Nested buckets](#nested-buckets)
62
+    - [Database backups](#database-backups)
63
+    - [Statistics](#statistics)
64
+    - [Read-Only Mode](#read-only-mode)
65
+    - [Mobile Use (iOS/Android)](#mobile-use-iosandroid)
66
+  - [Resources](#resources)
67
+  - [Comparison with other databases](#comparison-with-other-databases)
68
+    - [Postgres, MySQL, & other relational databases](#postgres-mysql--other-relational-databases)
69
+    - [LevelDB, RocksDB](#leveldb-rocksdb)
70
+    - [LMDB](#lmdb)
71
+  - [Caveats & Limitations](#caveats--limitations)
72
+  - [Reading the Source](#reading-the-source)
73
+  - [Other Projects Using Bolt](#other-projects-using-bolt)
74
+
75
+## Getting Started
76
+
77
+### Installing
78
+
79
+To start using Bolt, install Go and run `go get`:
80
+
81
+```sh
82
+$ go get go.etcd.io/bbolt/...
83
+```
84
+
85
+This will retrieve the library and install the `bolt` command line utility into
86
+your `$GOBIN` path.
87
+
88
+
89
+### Importing bbolt
90
+
91
+To use bbolt as an embedded key-value store, import as:
92
+
93
+```go
94
+import bolt "go.etcd.io/bbolt"
95
+
96
+db, err := bolt.Open(path, 0666, nil)
97
+if err != nil {
98
+  return err
99
+}
100
+defer db.Close()
101
+```
102
+
103
+
104
+### Opening a database
105
+
106
+The top-level object in Bolt is a `DB`. It is represented as a single file on
107
+your disk and represents a consistent snapshot of your data.
108
+
109
+To open your database, simply use the `bolt.Open()` function:
110
+
111
+```go
112
+package main
113
+
114
+import (
115
+	"log"
116
+
117
+	bolt "go.etcd.io/bbolt"
118
+)
119
+
120
+func main() {
121
+	// Open the my.db data file in your current directory.
122
+	// It will be created if it doesn't exist.
123
+	db, err := bolt.Open("my.db", 0600, nil)
124
+	if err != nil {
125
+		log.Fatal(err)
126
+	}
127
+	defer db.Close()
128
+
129
+	...
130
+}
131
+```
132
+
133
+Please note that Bolt obtains a file lock on the data file so multiple processes
134
+cannot open the same database at the same time. Opening an already open Bolt
135
+database will cause it to hang until the other process closes it. To prevent
136
+an indefinite wait you can pass a timeout option to the `Open()` function:
137
+
138
+```go
139
+db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: 1 * time.Second})
140
+```
141
+
142
+
143
+### Transactions
144
+
145
+Bolt allows only one read-write transaction at a time but allows as many
146
+read-only transactions as you want at a time. Each transaction has a consistent
147
+view of the data as it existed when the transaction started.
148
+
149
+Individual transactions and all objects created from them (e.g. buckets, keys)
150
+are not thread safe. To work with data in multiple goroutines you must start
151
+a transaction for each one or use locking to ensure only one goroutine accesses
152
+a transaction at a time. Creating a transaction from the `DB` is thread safe.
153
+
154
+Read-only transactions and read-write transactions should not depend on one
155
+another and generally shouldn't be opened simultaneously in the same goroutine.
156
+This can cause a deadlock as the read-write transaction needs to periodically
157
+re-map the data file but it cannot do so while a read-only transaction is open.
158
+
159
+
160
+#### Read-write transactions
161
+
162
+To start a read-write transaction, you can use the `DB.Update()` function:
163
+
164
+```go
165
+err := db.Update(func(tx *bolt.Tx) error {
166
+	...
167
+	return nil
168
+})
169
+```
170
+
171
+Inside the closure, you have a consistent view of the database. You commit the
172
+transaction by returning `nil` at the end. You can also rollback the transaction
173
+at any point by returning an error. All database operations are allowed inside
174
+a read-write transaction.
175
+
176
+Always check the return error as it will report any disk failures that can cause
177
+your transaction to not complete. If you return an error within your closure
178
+it will be passed through.
179
+
180
+
181
+#### Read-only transactions
182
+
183
+To start a read-only transaction, you can use the `DB.View()` function:
184
+
185
+```go
186
+err := db.View(func(tx *bolt.Tx) error {
187
+	...
188
+	return nil
189
+})
190
+```
191
+
192
+You also get a consistent view of the database within this closure, however,
193
+no mutating operations are allowed within a read-only transaction. You can only
194
+retrieve buckets, retrieve values, and copy the database within a read-only
195
+transaction.
196
+
197
+
198
+#### Batch read-write transactions
199
+
200
+Each `DB.Update()` waits for disk to commit the writes. This overhead
201
+can be minimized by combining multiple updates with the `DB.Batch()`
202
+function:
203
+
204
+```go
205
+err := db.Batch(func(tx *bolt.Tx) error {
206
+	...
207
+	return nil
208
+})
209
+```
210
+
211
+Concurrent Batch calls are opportunistically combined into larger
212
+transactions. Batch is only useful when there are multiple goroutines
213
+calling it.
214
+
215
+The trade-off is that `Batch` can call the given
216
+function multiple times, if parts of the transaction fail. The
217
+function must be idempotent and side effects must take effect only
218
+after a successful return from `DB.Batch()`.
219
+
220
+For example: don't display messages from inside the function, instead
221
+set variables in the enclosing scope:
222
+
223
+```go
224
+var id uint64
225
+err := db.Batch(func(tx *bolt.Tx) error {
226
+	// Find last key in bucket, decode as bigendian uint64, increment
227
+	// by one, encode back to []byte, and add new key.
228
+	...
229
+	id = newValue
230
+	return nil
231
+})
232
+if err != nil {
233
+	return ...
234
+}
235
+fmt.Printf("Allocated ID %d\n", id)
236
+```
237
+
238
+
239
+#### Managing transactions manually
240
+
241
+The `DB.View()` and `DB.Update()` functions are wrappers around the `DB.Begin()`
242
+function. These helper functions will start the transaction, execute a function,
243
+and then safely close your transaction if an error is returned. This is the
244
+recommended way to use Bolt transactions.
245
+
246
+However, sometimes you may want to manually start and end your transactions.
247
+You can use the `DB.Begin()` function directly but **please** be sure to close
248
+the transaction.
249
+
250
+```go
251
+// Start a writable transaction.
252
+tx, err := db.Begin(true)
253
+if err != nil {
254
+    return err
255
+}
256
+defer tx.Rollback()
257
+
258
+// Use the transaction...
259
+_, err = tx.CreateBucket([]byte("MyBucket"))
260
+if err != nil {
261
+    return err
262
+}
263
+
264
+// Commit the transaction and check for error.
265
+if err := tx.Commit(); err != nil {
266
+    return err
267
+}
268
+```
269
+
270
+The first argument to `DB.Begin()` is a boolean stating if the transaction
271
+should be writable.
272
+
273
+
274
+### Using buckets
275
+
276
+Buckets are collections of key/value pairs within the database. All keys in a
277
+bucket must be unique. You can create a bucket using the `DB.CreateBucket()`
278
+function:
279
+
280
+```go
281
+db.Update(func(tx *bolt.Tx) error {
282
+	b, err := tx.CreateBucket([]byte("MyBucket"))
283
+	if err != nil {
284
+		return fmt.Errorf("create bucket: %s", err)
285
+	}
286
+	return nil
287
+})
288
+```
289
+
290
+You can also create a bucket only if it doesn't exist by using the
291
+`Tx.CreateBucketIfNotExists()` function. It's a common pattern to call this
292
+function for all your top-level buckets after you open your database so you can
293
+guarantee that they exist for future transactions.
294
+
295
+To delete a bucket, simply call the `Tx.DeleteBucket()` function.
296
+
297
+
298
+### Using key/value pairs
299
+
300
+To save a key/value pair to a bucket, use the `Bucket.Put()` function:
301
+
302
+```go
303
+db.Update(func(tx *bolt.Tx) error {
304
+	b := tx.Bucket([]byte("MyBucket"))
305
+	err := b.Put([]byte("answer"), []byte("42"))
306
+	return err
307
+})
308
+```
309
+
310
+This will set the value of the `"answer"` key to `"42"` in the `MyBucket`
311
+bucket. To retrieve this value, we can use the `Bucket.Get()` function:
312
+
313
+```go
314
+db.View(func(tx *bolt.Tx) error {
315
+	b := tx.Bucket([]byte("MyBucket"))
316
+	v := b.Get([]byte("answer"))
317
+	fmt.Printf("The answer is: %s\n", v)
318
+	return nil
319
+})
320
+```
321
+
322
+The `Get()` function does not return an error because its operation is
323
+guaranteed to work (unless there is some kind of system failure). If the key
324
+exists then it will return its byte slice value. If it doesn't exist then it
325
+will return `nil`. It's important to note that you can have a zero-length value
326
+set to a key which is different than the key not existing.
327
+
328
+Use the `Bucket.Delete()` function to delete a key from the bucket.
329
+
330
+Please note that values returned from `Get()` are only valid while the
331
+transaction is open. If you need to use a value outside of the transaction
332
+then you must use `copy()` to copy it to another byte slice.
333
+
334
+
335
+### Autoincrementing integer for the bucket
336
+By using the `NextSequence()` function, you can let Bolt determine a sequence
337
+which can be used as the unique identifier for your key/value pairs. See the
338
+example below.
339
+
340
+```go
341
+// CreateUser saves u to the store. The new user ID is set on u once the data is persisted.
342
+func (s *Store) CreateUser(u *User) error {
343
+    return s.db.Update(func(tx *bolt.Tx) error {
344
+        // Retrieve the users bucket.
345
+        // This should be created when the DB is first opened.
346
+        b := tx.Bucket([]byte("users"))
347
+
348
+        // Generate ID for the user.
349
+        // This returns an error only if the Tx is closed or not writeable.
350
+        // That can't happen in an Update() call so I ignore the error check.
351
+        id, _ := b.NextSequence()
352
+        u.ID = int(id)
353
+
354
+        // Marshal user data into bytes.
355
+        buf, err := json.Marshal(u)
356
+        if err != nil {
357
+            return err
358
+        }
359
+
360
+        // Persist bytes to users bucket.
361
+        return b.Put(itob(u.ID), buf)
362
+    })
363
+}
364
+
365
+// itob returns an 8-byte big endian representation of v.
366
+func itob(v int) []byte {
367
+    b := make([]byte, 8)
368
+    binary.BigEndian.PutUint64(b, uint64(v))
369
+    return b
370
+}
371
+
372
+type User struct {
373
+    ID int
374
+    ...
375
+}
376
+```
377
+
378
+### Iterating over keys
379
+
380
+Bolt stores its keys in byte-sorted order within a bucket. This makes sequential
381
+iteration over these keys extremely fast. To iterate over keys we'll use a
382
+`Cursor`:
383
+
384
+```go
385
+db.View(func(tx *bolt.Tx) error {
386
+	// Assume bucket exists and has keys
387
+	b := tx.Bucket([]byte("MyBucket"))
388
+
389
+	c := b.Cursor()
390
+
391
+	for k, v := c.First(); k != nil; k, v = c.Next() {
392
+		fmt.Printf("key=%s, value=%s\n", k, v)
393
+	}
394
+
395
+	return nil
396
+})
397
+```
398
+
399
+The cursor allows you to move to a specific point in the list of keys and move
400
+forward or backward through the keys one at a time.
401
+
402
+The following functions are available on the cursor:
403
+
404
+```
405
+First()  Move to the first key.
406
+Last()   Move to the last key.
407
+Seek()   Move to a specific key.
408
+Next()   Move to the next key.
409
+Prev()   Move to the previous key.
410
+```
411
+
412
+Each of those functions has a return signature of `(key []byte, value []byte)`.
413
+When you have iterated to the end of the cursor then `Next()` will return a
414
+`nil` key.  You must seek to a position using `First()`, `Last()`, or `Seek()`
415
+before calling `Next()` or `Prev()`. If you do not seek to a position then
416
+these functions will return a `nil` key.
417
+
418
+During iteration, if the key is non-`nil` but the value is `nil`, that means
419
+the key refers to a bucket rather than a value.  Use `Bucket.Bucket()` to
420
+access the sub-bucket.
421
+
422
+
423
+#### Prefix scans
424
+
425
+To iterate over a key prefix, you can combine `Seek()` and `bytes.HasPrefix()`:
426
+
427
+```go
428
+db.View(func(tx *bolt.Tx) error {
429
+	// Assume bucket exists and has keys
430
+	c := tx.Bucket([]byte("MyBucket")).Cursor()
431
+
432
+	prefix := []byte("1234")
433
+	for k, v := c.Seek(prefix); k != nil && bytes.HasPrefix(k, prefix); k, v = c.Next() {
434
+		fmt.Printf("key=%s, value=%s\n", k, v)
435
+	}
436
+
437
+	return nil
438
+})
439
+```
440
+
441
+#### Range scans
442
+
443
+Another common use case is scanning over a range such as a time range. If you
444
+use a sortable time encoding such as RFC3339 then you can query a specific
445
+date range like this:
446
+
447
+```go
448
+db.View(func(tx *bolt.Tx) error {
449
+	// Assume our events bucket exists and has RFC3339 encoded time keys.
450
+	c := tx.Bucket([]byte("Events")).Cursor()
451
+
452
+	// Our time range spans the 90's decade.
453
+	min := []byte("1990-01-01T00:00:00Z")
454
+	max := []byte("2000-01-01T00:00:00Z")
455
+
456
+	// Iterate over the 90's.
457
+	for k, v := c.Seek(min); k != nil && bytes.Compare(k, max) <= 0; k, v = c.Next() {
458
+		fmt.Printf("%s: %s\n", k, v)
459
+	}
460
+
461
+	return nil
462
+})
463
+```
464
+
465
+Note that, while RFC3339 is sortable, the Golang implementation of RFC3339Nano does not use a fixed number of digits after the decimal point and is therefore not sortable.
466
+
467
+
468
+#### ForEach()
469
+
470
+You can also use the function `ForEach()` if you know you'll be iterating over
471
+all the keys in a bucket:
472
+
473
+```go
474
+db.View(func(tx *bolt.Tx) error {
475
+	// Assume bucket exists and has keys
476
+	b := tx.Bucket([]byte("MyBucket"))
477
+
478
+	b.ForEach(func(k, v []byte) error {
479
+		fmt.Printf("key=%s, value=%s\n", k, v)
480
+		return nil
481
+	})
482
+	return nil
483
+})
484
+```
485
+
486
+Please note that keys and values in `ForEach()` are only valid while
487
+the transaction is open. If you need to use a key or value outside of
488
+the transaction, you must use `copy()` to copy it to another byte
489
+slice.
490
+
491
+### Nested buckets
492
+
493
+You can also store a bucket in a key to create nested buckets. The API is the
494
+same as the bucket management API on the `DB` object:
495
+
496
+```go
497
+func (*Bucket) CreateBucket(key []byte) (*Bucket, error)
498
+func (*Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error)
499
+func (*Bucket) DeleteBucket(key []byte) error
500
+```
501
+
502
+Say you had a multi-tenant application where the root level bucket was the account bucket. Inside of this bucket was a sequence of accounts which themselves are buckets. And inside the sequence bucket you could have many buckets pertaining to the Account itself (Users, Notes, etc) isolating the information into logical groupings.
503
+
504
+```go
505
+
506
+// createUser creates a new user in the given account.
507
+func createUser(accountID int, u *User) error {
508
+    // Start the transaction.
509
+    tx, err := db.Begin(true)
510
+    if err != nil {
511
+        return err
512
+    }
513
+    defer tx.Rollback()
514
+
515
+    // Retrieve the root bucket for the account.
516
+    // Assume this has already been created when the account was set up.
517
+    root := tx.Bucket([]byte(strconv.FormatUint(accountID, 10)))
518
+
519
+    // Setup the users bucket.
520
+    bkt, err := root.CreateBucketIfNotExists([]byte("USERS"))
521
+    if err != nil {
522
+        return err
523
+    }
524
+
525
+    // Generate an ID for the new user.
526
+    userID, err := bkt.NextSequence()
527
+    if err != nil {
528
+        return err
529
+    }
530
+    u.ID = userID
531
+
532
+    // Marshal and save the encoded user.
533
+    if buf, err := json.Marshal(u); err != nil {
534
+        return err
535
+    } else if err := bkt.Put([]byte(strconv.FormatUint(u.ID, 10)), buf); err != nil {
536
+        return err
537
+    }
538
+
539
+    // Commit the transaction.
540
+    if err := tx.Commit(); err != nil {
541
+        return err
542
+    }
543
+
544
+    return nil
545
+}
546
+
547
+```
548
+
549
+
550
+
551
+
552
+### Database backups
553
+
554
+Bolt is a single file so it's easy to backup. You can use the `Tx.WriteTo()`
555
+function to write a consistent view of the database to a writer. If you call
556
+this from a read-only transaction, it will perform a hot backup and not block
557
+your other database reads and writes.
558
+
559
+By default, it will use a regular file handle which will utilize the operating
560
+system's page cache. See the [`Tx`](https://godoc.org/go.etcd.io/bbolt#Tx)
561
+documentation for information about optimizing for larger-than-RAM datasets.
562
+
563
+One common use case is to backup over HTTP so you can use tools like `cURL` to
564
+do database backups:
565
+
566
+```go
567
+func BackupHandleFunc(w http.ResponseWriter, req *http.Request) {
568
+	err := db.View(func(tx *bolt.Tx) error {
569
+		w.Header().Set("Content-Type", "application/octet-stream")
570
+		w.Header().Set("Content-Disposition", `attachment; filename="my.db"`)
571
+		w.Header().Set("Content-Length", strconv.Itoa(int(tx.Size())))
572
+		_, err := tx.WriteTo(w)
573
+		return err
574
+	})
575
+	if err != nil {
576
+		http.Error(w, err.Error(), http.StatusInternalServerError)
577
+	}
578
+}
579
+```
580
+
581
+Then you can backup using this command:
582
+
583
+```sh
584
+$ curl http://localhost/backup > my.db
585
+```
586
+
587
+Or you can open your browser to `http://localhost/backup` and it will download
588
+automatically.
589
+
590
+If you want to backup to another file you can use the `Tx.CopyFile()` helper
591
+function.
592
+
593
+
594
+### Statistics
595
+
596
+The database keeps a running count of many of the internal operations it
597
+performs so you can better understand what's going on. By grabbing a snapshot
598
+of these stats at two points in time we can see what operations were performed
599
+in that time range.
600
+
601
+For example, we could start a goroutine to log stats every 10 seconds:
602
+
603
+```go
604
+go func() {
605
+	// Grab the initial stats.
606
+	prev := db.Stats()
607
+
608
+	for {
609
+		// Wait for 10s.
610
+		time.Sleep(10 * time.Second)
611
+
612
+		// Grab the current stats and diff them.
613
+		stats := db.Stats()
614
+		diff := stats.Sub(&prev)
615
+
616
+		// Encode stats to JSON and print to STDERR.
617
+		json.NewEncoder(os.Stderr).Encode(diff)
618
+
619
+		// Save stats for the next loop.
620
+		prev = stats
621
+	}
622
+}()
623
+```
624
+
625
+It's also useful to pipe these stats to a service such as statsd for monitoring
626
+or to provide an HTTP endpoint that will perform a fixed-length sample.
627
+
628
+
629
+### Read-Only Mode
630
+
631
+Sometimes it is useful to create a shared, read-only Bolt database. To do this,
632
+set the `Options.ReadOnly` flag when opening your database. Read-only mode
633
+uses a shared lock to allow multiple processes to read from the database but
634
+it will block any processes from opening the database in read-write mode.
635
+
636
+```go
637
+db, err := bolt.Open("my.db", 0666, &bolt.Options{ReadOnly: true})
638
+if err != nil {
639
+	log.Fatal(err)
640
+}
641
+```
642
+
643
+### Mobile Use (iOS/Android)
644
+
645
+Bolt is able to run on mobile devices by leveraging the binding feature of the
646
+[gomobile](https://github.com/golang/mobile) tool. Create a struct that will
647
+contain your database logic and a reference to a `*bolt.DB` with an initializing
648
+constructor that takes in a filepath where the database file will be stored.
649
+Neither Android nor iOS require extra permissions or cleanup from using this method.
650
+
651
+```go
652
+func NewBoltDB(filepath string) *BoltDB {
653
+	db, err := bolt.Open(filepath+"/demo.db", 0600, nil)
654
+	if err != nil {
655
+		log.Fatal(err)
656
+	}
657
+
658
+	return &BoltDB{db}
659
+}
660
+
661
+type BoltDB struct {
662
+	db *bolt.DB
663
+	...
664
+}
665
+
666
+func (b *BoltDB) Path() string {
667
+	return b.db.Path()
668
+}
669
+
670
+func (b *BoltDB) Close() {
671
+	b.db.Close()
672
+}
673
+```
674
+
675
+Database logic should be defined as methods on this wrapper struct.
676
+
677
+To initialize this struct from the native language, use the snippets below.
677
+Both platforms now sync their local storage to the cloud, so these snippets
678
+also disable that functionality for the database file:
680
+
681
+#### Android
682
+
683
+```java
684
+String path;
685
+if (android.os.Build.VERSION.SDK_INT >=android.os.Build.VERSION_CODES.LOLLIPOP){
686
+    path = getNoBackupFilesDir().getAbsolutePath();
687
+} else{
688
+    path = getFilesDir().getAbsolutePath();
689
+}
690
+Boltmobiledemo.BoltDB boltDB = Boltmobiledemo.NewBoltDB(path);
691
+```
692
+
693
+#### iOS
694
+
695
+```objc
696
+- (void)demo {
697
+    NSString* path = [NSSearchPathForDirectoriesInDomains(NSLibraryDirectory,
698
+                                                          NSUserDomainMask,
699
+                                                          YES) objectAtIndex:0];
700
+	GoBoltmobiledemoBoltDB * demo = GoBoltmobiledemoNewBoltDB(path);
701
+	[self addSkipBackupAttributeToItemAtPath:demo.path];
702
+	//Some DB Logic would go here
703
+	[demo close];
704
+}
705
+
706
+- (BOOL)addSkipBackupAttributeToItemAtPath:(NSString *) filePathString
707
+{
708
+    NSURL* URL= [NSURL fileURLWithPath: filePathString];
709
+    assert([[NSFileManager defaultManager] fileExistsAtPath: [URL path]]);
710
+
711
+    NSError *error = nil;
712
+    BOOL success = [URL setResourceValue: [NSNumber numberWithBool: YES]
713
+                                  forKey: NSURLIsExcludedFromBackupKey error: &error];
714
+    if(!success){
715
+        NSLog(@"Error excluding %@ from backup %@", [URL lastPathComponent], error);
716
+    }
717
+    return success;
718
+}
719
+
720
+```
721
+
722
+## Resources
723
+
724
+For more information on getting started with Bolt, check out the following articles:
725
+
726
+* [Intro to BoltDB: Painless Performant Persistence](http://npf.io/2014/07/intro-to-boltdb-painless-performant-persistence/) by [Nate Finch](https://github.com/natefinch).
727
+* [Bolt -- an embedded key/value database for Go](https://www.progville.com/go/bolt-embedded-db-golang/) by Progville
728
+
729
+
730
+## Comparison with other databases
731
+
732
+### Postgres, MySQL, & other relational databases
733
+
734
+Relational databases structure data into rows and are only accessible through
735
+the use of SQL. This approach provides flexibility in how you store and query
736
+your data but also incurs overhead in parsing and planning SQL statements. Bolt
737
+accesses all data by a byte slice key. This makes Bolt fast to read and write
738
+data by key but provides no built-in support for joining values together.
739
+
740
+Most relational databases (with the exception of SQLite) are standalone servers
741
+that run separately from your application. This gives your systems
742
+flexibility to connect multiple application servers to a single database
743
+server but also adds overhead in serializing and transporting data over the
744
+network. Bolt runs as a library included in your application so all data access
745
+has to go through your application's process. This brings data closer to your
746
+application but limits multi-process access to the data.
747
+
748
+
749
+### LevelDB, RocksDB
750
+
751
+LevelDB and its derivatives (RocksDB, HyperLevelDB) are similar to Bolt in that
752
+they are libraries bundled into the application, however, their underlying
753
+structure is a log-structured merge-tree (LSM tree). An LSM tree optimizes
754
+random writes by using a write ahead log and multi-tiered, sorted files called
755
+SSTables. Bolt uses a B+tree internally and only a single file. Both approaches
756
+have trade-offs.
757
+
758
+If you require a high random write throughput (>10,000 w/sec) or you need to use
759
+spinning disks then LevelDB could be a good choice. If your application is
760
+read-heavy or does a lot of range scans then Bolt could be a good choice.
761
+
762
+One other important consideration is that LevelDB does not have transactions.
763
+It supports batch writing of key/value pairs and it supports read snapshots
764
+but it will not give you the ability to do a compare-and-swap operation safely.
765
+Bolt supports fully serializable ACID transactions.
766
+
767
+
768
+### LMDB
769
+
770
+Bolt was originally a port of LMDB so it is architecturally similar. Both use
771
+a B+tree, have ACID semantics with fully serializable transactions, and support
772
+lock-free MVCC using a single writer and multiple readers.
773
+
774
+The two projects have somewhat diverged. LMDB heavily focuses on raw performance
775
+while Bolt has focused on simplicity and ease of use. For example, LMDB allows
776
+several unsafe actions such as direct writes for the sake of performance. Bolt
777
+opts to disallow actions which can leave the database in a corrupted state. The
778
+only exception to this in Bolt is `DB.NoSync`.
779
+
780
+There are also a few differences in API. LMDB requires a maximum mmap size when
781
+opening an `mdb_env` whereas Bolt will handle incremental mmap resizing
782
+automatically. LMDB overloads the getter and setter functions with multiple
783
+flags whereas Bolt splits these specialized cases into their own functions.
784
+
785
+
786
+## Caveats & Limitations
787
+
788
+It's important to pick the right tool for the job and Bolt is no exception.
789
+Here are a few things to note when evaluating and using Bolt:
790
+
791
+* Bolt is good for read intensive workloads. Sequential write performance is
792
+  also fast but random writes can be slow. You can use `DB.Batch()` or add a
793
+  write-ahead log to help mitigate this issue.
794
+
795
+* Bolt uses a B+tree internally so there can be a lot of random page access.
796
+  SSDs provide a significant performance boost over spinning disks.
797
+
798
+* Try to avoid long running read transactions. Bolt uses copy-on-write so
799
+  old pages cannot be reclaimed while an old transaction is using them.
800
+
801
+* Byte slices returned from Bolt are only valid during a transaction. Once the
802
+  transaction has been committed or rolled back then the memory they point to
803
+  can be reused by a new page or can be unmapped from virtual memory and you'll
804
+  see an `unexpected fault address` panic when accessing it.
805
+
806
+* Bolt uses an exclusive write lock on the database file so it cannot be
807
+  shared by multiple processes.
808
+
809
+* Be careful when using `Bucket.FillPercent`. Setting a high fill percent for
810
+  buckets that have random inserts will cause your database to have very poor
811
+  page utilization.
812
+
813
+* Use larger buckets in general. Smaller buckets cause poor page utilization
814
+  once they become larger than the page size (typically 4KB).
815
+
816
+* Bulk loading a lot of random writes into a new bucket can be slow as the
817
+  page will not split until the transaction is committed. Randomly inserting
818
+  more than 100,000 key/value pairs into a single new bucket in a single
819
+  transaction is not advised.
820
+
821
+* Bolt uses a memory-mapped file so the underlying operating system handles the
822
+  caching of the data. Typically, the OS will cache as much of the file as it
823
+  can in memory and will release memory as needed to other processes. This means
824
+  that Bolt can show very high memory usage when working with large databases.
825
+  However, this is expected and the OS will release memory as needed. Bolt can
826
+  handle databases much larger than the available physical RAM, provided its
827
+  memory-map fits in the process virtual address space. It may be problematic
828
+  on 32-bit systems.
829
+
830
+* The data structures in the Bolt database are memory mapped so the data file
831
+  will be endian specific. This means that you cannot copy a Bolt file from a
832
+  little endian machine to a big endian machine and have it work. For most
833
+  users this is not a concern since most modern CPUs are little endian.
834
+
835
+* Because of the way pages are laid out on disk, Bolt cannot truncate data files
836
+  and return free pages back to the disk. Instead, Bolt maintains a free list
837
+  of unused pages within its data file. These free pages can be reused by later
838
+  transactions. This works well for many use cases as databases generally tend
839
+  to grow. However, it's important to note that deleting large chunks of data
840
+  will not allow you to reclaim that space on disk.
841
+
842
+  For more information on page allocation, [see this comment][page-allocation].
843
+
844
+[page-allocation]: https://github.com/boltdb/bolt/issues/308#issuecomment-74811638
845
+
846
+
847
+## Reading the Source
848
+
849
+Bolt is a relatively small code base (<5KLOC) for an embedded, serializable,
850
+transactional key/value database so it can be a good starting point for people
851
+interested in how databases work.
852
+
853
+The best places to start are the main entry points into Bolt:
854
+
855
+- `Open()` - Initializes the reference to the database. It's responsible for
856
+  creating the database if it doesn't exist, obtaining an exclusive lock on the
857
+  file, reading the meta pages, & memory-mapping the file.
858
+
859
+- `DB.Begin()` - Starts a read-only or read-write transaction depending on the
860
+  value of the `writable` argument. This requires briefly obtaining the "meta"
861
+  lock to keep track of open transactions. Only one read-write transaction can
862
+  exist at a time so the "rwlock" is acquired during the life of a read-write
863
+  transaction.
864
+
865
+- `Bucket.Put()` - Writes a key/value pair into a bucket. After validating the
866
+  arguments, a cursor is used to traverse the B+tree to the page and position
867
+  where the key & value will be written. Once the position is found, the bucket
868
+  materializes the underlying page and the page's parent pages into memory as
869
+  "nodes". These nodes are where mutations occur during read-write transactions.
870
+  These changes get flushed to disk during commit.
871
+
872
+- `Bucket.Get()` - Retrieves a key/value pair from a bucket. This uses a cursor
873
+  to move to the page & position of a key/value pair. During a read-only
874
+  transaction, the key and value data is returned as a direct reference to the
875
+  underlying mmap file so there's no allocation overhead. For read-write
876
+  transactions, this data may reference the mmap file or one of the in-memory
877
+  node values.
878
+
879
+- `Cursor` - This object is simply for traversing the B+tree of on-disk pages
880
+  or in-memory nodes. It can seek to a specific key, move to the first or last
881
+  value, or it can move forward or backward. The cursor handles the movement up
882
+  and down the B+tree transparently to the end user.
883
+
884
+- `Tx.Commit()` - Converts the in-memory dirty nodes and the list of free pages
885
+  into pages to be written to disk. Writing to disk then occurs in two phases.
886
+  First, the dirty pages are written to disk and an `fsync()` occurs. Second, a
887
+  new meta page with an incremented transaction ID is written and another
888
+  `fsync()` occurs. This two phase write ensures that partially written data
889
+  pages are ignored in the event of a crash since the meta page pointing to them
890
+  is never written. Partially written meta pages are invalidated because they
891
+  are written with a checksum.
892
+
893
+If you have additional notes that could be helpful for others, please submit
894
+them via pull request.
895
+
896
+
897
+## Other Projects Using Bolt
898
+
899
+Below is a list of public, open source projects that use Bolt:
900
+
901
+* [Algernon](https://github.com/xyproto/algernon) - A HTTP/2 web server with built-in support for Lua. Uses BoltDB as the default database backend.
902
+* [Bazil](https://bazil.org/) - A file system that lets your data reside where it is most convenient for it to reside.
903
+* [bolter](https://github.com/hasit/bolter) - Command-line app for viewing BoltDB file in your terminal.
904
+* [boltcli](https://github.com/spacewander/boltcli) - the redis-cli for boltdb with Lua script support.
905
+* [BoltHold](https://github.com/timshannon/bolthold) - An embeddable NoSQL store for Go types built on BoltDB
906
+* [BoltStore](https://github.com/yosssi/boltstore) - Session store using Bolt.
907
+* [Boltdb Boilerplate](https://github.com/bobintornado/boltdb-boilerplate) - Boilerplate wrapper around bolt aiming to make simple calls one-liners.
908
+* [BoltDbWeb](https://github.com/evnix/boltdbweb) - A web based GUI for BoltDB files.
909
+* [bleve](http://www.blevesearch.com/) - A pure Go search engine similar to ElasticSearch that uses Bolt as the default storage backend.
910
+* [btcwallet](https://github.com/btcsuite/btcwallet) - A bitcoin wallet.
911
+* [buckets](https://github.com/joyrexus/buckets) - a bolt wrapper streamlining
912
+  simple tx and key scans.
913
+* [cayley](https://github.com/google/cayley) - Cayley is an open-source graph database using Bolt as optional backend.
914
+* [ChainStore](https://github.com/pressly/chainstore) - Simple key-value interface to a variety of storage engines organized as a chain of operations.
915
+* [Consul](https://github.com/hashicorp/consul) - Consul is service discovery and configuration made easy. Distributed, highly available, and datacenter-aware.
916
+* [DVID](https://github.com/janelia-flyem/dvid) - Added Bolt as optional storage engine and testing it against Basho-tuned leveldb.
917
+* [dcrwallet](https://github.com/decred/dcrwallet) - A wallet for the Decred cryptocurrency.
918
+* [drive](https://github.com/odeke-em/drive) - drive is an unofficial Google Drive command line client for \*NIX operating systems.
919
+* [event-shuttle](https://github.com/sclasen/event-shuttle) - A Unix system service to collect and reliably deliver messages to Kafka.
920
+* [Freehold](http://tshannon.bitbucket.org/freehold/) - An open, secure, and lightweight platform for your files and data.
921
+* [Go Report Card](https://goreportcard.com/) - Go code quality report cards as a (free and open source) service.
922
+* [GoWebApp](https://github.com/josephspurrier/gowebapp) - A basic MVC web application in Go using BoltDB.
923
+* [GoShort](https://github.com/pankajkhairnar/goShort) - GoShort is a URL shortener written in Golang that uses BoltDB for persistent key/value storage and the high-performance HTTPRouter for routing.
924
+* [gopherpit](https://github.com/gopherpit/gopherpit) - A web service to manage Go remote import paths with custom domains
925
+* [Gitchain](https://github.com/gitchain/gitchain) - Decentralized, peer-to-peer Git repositories aka "Git meets Bitcoin".
926
+* [InfluxDB](https://influxdata.com) - Scalable datastore for metrics, events, and real-time analytics.
927
+* [ipLocator](https://github.com/AndreasBriese/ipLocator) - A fast ip-geo-location-server using bolt with bloom filters.
928
+* [ipxed](https://github.com/kelseyhightower/ipxed) - Web interface and api for ipxed.
929
+* [Ironsmith](https://github.com/timshannon/ironsmith) - A simple, script-driven continuous integration (build -> test -> release) tool, with no external dependencies
930
+* [Kala](https://github.com/ajvb/kala) - Kala is a modern job scheduler optimized to run on a single node. It features persistence, a JSON over HTTP API, ISO 8601 duration notation, and dependent jobs.
931
+* [LedisDB](https://github.com/siddontang/ledisdb) - A high performance NoSQL, using Bolt as optional storage.
932
+* [lru](https://github.com/crowdriff/lru) - Easy to use Bolt-backed Least-Recently-Used (LRU) read-through cache with chainable remote stores.
933
+* [mbuckets](https://github.com/abhigupta912/mbuckets) - A Bolt wrapper that allows easy operations on multi level (nested) buckets.
934
+* [MetricBase](https://github.com/msiebuhr/MetricBase) - Single-binary version of Graphite.
935
+* [MuLiFS](https://github.com/dankomiocevic/mulifs) - Music Library Filesystem creates a filesystem to organise your music files.
936
+* [Operation Go: A Routine Mission](http://gocode.io) - An online programming game for Golang using Bolt for user accounts and a leaderboard.
937
+* [photosite/session](https://godoc.org/bitbucket.org/kardianos/photosite/session) - Sessions for a photo viewing site.
938
+* [Prometheus Annotation Server](https://github.com/oliver006/prom_annotation_server) - Annotation server for PromDash & Prometheus service monitoring system.
939
+* [reef-pi](https://github.com/reef-pi/reef-pi) - reef-pi is an award winning, modular, DIY reef tank controller using easy to learn electronics based on a Raspberry Pi.
940
+* [Request Baskets](https://github.com/darklynx/request-baskets) - A web service to collect arbitrary HTTP requests and inspect them via REST API or simple web UI, similar to [RequestBin](http://requestb.in/) service
941
+* [Seaweed File System](https://github.com/chrislusf/seaweedfs) - Highly scalable distributed key~file system with O(1) disk read.
942
+* [stow](https://github.com/djherbis/stow) -  a persistence manager for objects
943
+  backed by boltdb.
944
+* [Storm](https://github.com/asdine/storm) - Simple and powerful ORM for BoltDB.
945
+* [SimpleBolt](https://github.com/xyproto/simplebolt) - A simple way to use BoltDB. Deals mainly with strings.
946
+* [Skybox Analytics](https://github.com/skybox/skybox) - A standalone funnel analysis tool for web analytics.
947
+* [Scuttlebutt](https://github.com/benbjohnson/scuttlebutt) - Uses Bolt to store and process all Twitter mentions of GitHub projects.
948
+* [tentacool](https://github.com/optiflows/tentacool) - REST api server to manage system stuff (IP, DNS, Gateway...) on a linux server.
949
+* [torrent](https://github.com/anacrolix/torrent) - Full-featured BitTorrent client package and utilities in Go. BoltDB is a storage backend in development.
950
+* [Wiki](https://github.com/peterhellberg/wiki) - A tiny wiki using Goji, BoltDB and Blackfriday.
951
+
952
+If you are using Bolt in a project please send a pull request to add it to the list.
0 953
new file mode 100644
... ...
@@ -0,0 +1,10 @@
0
+package bbolt
1
+
2
+// maxMapSize represents the largest mmap size supported by Bolt.
3
+const maxMapSize = 0x7FFFFFFF // 2GB
4
+
5
+// maxAllocSize is the size used when creating array pointers.
6
+const maxAllocSize = 0xFFFFFFF
7
+
8
+// Are unaligned load/stores broken on this arch?
9
+var brokenUnaligned = false
0 10
new file mode 100644
... ...
@@ -0,0 +1,10 @@
0
+package bbolt
1
+
2
+// maxMapSize represents the largest mmap size supported by Bolt.
3
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
4
+
5
+// maxAllocSize is the size used when creating array pointers.
6
+const maxAllocSize = 0x7FFFFFFF
7
+
8
+// Are unaligned load/stores broken on this arch?
9
+var brokenUnaligned = false
0 10
new file mode 100644
... ...
@@ -0,0 +1,28 @@
0
+package bbolt
1
+
2
+import "unsafe"
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0x7FFFFFFF // 2GB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0xFFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned bool
12
+
13
+func init() {
14
+	// Simple check to see whether this arch handles unaligned load/stores
15
+	// correctly.
16
+
17
+	// ARM9 and older devices require load/stores to be from/to aligned
18
+	// addresses. If not, the lower 2 bits are cleared and that address is
19
+	// read in a jumbled up order.
20
+
21
+	// See http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka15414.html
22
+
23
+	raw := [6]byte{0xfe, 0xef, 0x11, 0x22, 0x22, 0x11}
24
+	val := *(*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(&raw)) + 2))
25
+
26
+	brokenUnaligned = val != 0x11222211
27
+}
0 28
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build arm64
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0x7FFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,10 @@
0
+package bbolt
1
+
2
+import (
3
+	"syscall"
4
+)
5
+
6
+// fdatasync flushes written data to a file descriptor.
7
+func fdatasync(db *DB) error {
8
+	return syscall.Fdatasync(int(db.file.Fd()))
9
+}
0 10
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build mips64 mips64le
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0x8000000000 // 512GB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0x7FFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build mips mipsle
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0x40000000 // 1GB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0xFFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,27 @@
0
+package bbolt
1
+
2
+import (
3
+	"syscall"
4
+	"unsafe"
5
+)
6
+
7
+const (
8
+	msAsync      = 1 << iota // perform asynchronous writes
9
+	msSync                   // perform synchronous writes
10
+	msInvalidate             // invalidate cached data
11
+)
12
+
13
+func msync(db *DB) error {
14
+	_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate)
15
+	if errno != 0 {
16
+		return errno
17
+	}
18
+	return nil
19
+}
20
+
21
+func fdatasync(db *DB) error {
22
+	if db.data != nil {
23
+		return msync(db)
24
+	}
25
+	return db.file.Sync()
26
+}
0 27
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build ppc
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0x7FFFFFFF // 2GB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0xFFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build ppc64
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0x7FFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build ppc64le
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0x7FFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+// +build s390x
1
+
2
+package bbolt
3
+
4
+// maxMapSize represents the largest mmap size supported by Bolt.
5
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
6
+
7
+// maxAllocSize is the size used when creating array pointers.
8
+const maxAllocSize = 0x7FFFFFFF
9
+
10
+// Are unaligned load/stores broken on this arch?
11
+var brokenUnaligned = false
0 12
new file mode 100644
... ...
@@ -0,0 +1,93 @@
0
+// +build !windows,!plan9,!solaris
1
+
2
+package bbolt
3
+
4
+import (
5
+	"fmt"
6
+	"syscall"
7
+	"time"
8
+	"unsafe"
9
+)
10
+
11
+// flock acquires an advisory lock on a file descriptor.
12
+func flock(db *DB, exclusive bool, timeout time.Duration) error {
13
+	var t time.Time
14
+	if timeout != 0 {
15
+		t = time.Now()
16
+	}
17
+	fd := db.file.Fd()
18
+	flag := syscall.LOCK_NB
19
+	if exclusive {
20
+		flag |= syscall.LOCK_EX
21
+	} else {
22
+		flag |= syscall.LOCK_SH
23
+	}
24
+	for {
25
+		// Attempt to obtain the requested lock (exclusive or shared).
26
+		err := syscall.Flock(int(fd), flag)
27
+		if err == nil {
28
+			return nil
29
+		} else if err != syscall.EWOULDBLOCK {
30
+			return err
31
+		}
32
+
33
+		// If we timed out then return an error.
34
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
35
+			return ErrTimeout
36
+		}
37
+
38
+		// Wait for a bit and try again.
39
+		time.Sleep(flockRetryTimeout)
40
+	}
41
+}
42
+
43
+// funlock releases an advisory lock on a file descriptor.
44
+func funlock(db *DB) error {
45
+	return syscall.Flock(int(db.file.Fd()), syscall.LOCK_UN)
46
+}
47
+
48
+// mmap memory maps a DB's data file.
49
+func mmap(db *DB, sz int) error {
50
+	// Map the data file to memory.
51
+	b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
52
+	if err != nil {
53
+		return err
54
+	}
55
+
56
+	// Advise the kernel that the mmap is accessed randomly.
57
+	err = madvise(b, syscall.MADV_RANDOM)
58
+	if err != nil && err != syscall.ENOSYS {
59
+		// Ignore not implemented error in kernel because it still works.
60
+		return fmt.Errorf("madvise: %s", err)
61
+	}
62
+
63
+	// Save the original byte slice and convert to a byte array pointer.
64
+	db.dataref = b
65
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
66
+	db.datasz = sz
67
+	return nil
68
+}
69
+
70
+// munmap unmaps a DB's data file from memory.
71
+func munmap(db *DB) error {
72
+	// Ignore the unmap if we have no mapped data.
73
+	if db.dataref == nil {
74
+		return nil
75
+	}
76
+
77
+	// Unmap using the original byte slice.
78
+	err := syscall.Munmap(db.dataref)
79
+	db.dataref = nil
80
+	db.data = nil
81
+	db.datasz = 0
82
+	return err
83
+}
84
+
85
+// NOTE: This function is copied from stdlib because it is not available on darwin.
86
+func madvise(b []byte, advice int) (err error) {
87
+	_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)), uintptr(advice))
88
+	if e1 != 0 {
89
+		err = e1
90
+	}
91
+	return
92
+}
0 93
new file mode 100644
... ...
@@ -0,0 +1,88 @@
0
+package bbolt
1
+
2
+import (
3
+	"fmt"
4
+	"syscall"
5
+	"time"
6
+	"unsafe"
7
+
8
+	"golang.org/x/sys/unix"
9
+)
10
+
11
+// flock acquires an advisory lock on a file descriptor.
12
+func flock(db *DB, exclusive bool, timeout time.Duration) error {
13
+	var t time.Time
14
+	if timeout != 0 {
15
+		t = time.Now()
16
+	}
17
+	fd := db.file.Fd()
18
+	var lockType int16
19
+	if exclusive {
20
+		lockType = syscall.F_WRLCK
21
+	} else {
22
+		lockType = syscall.F_RDLCK
23
+	}
24
+	for {
25
+		// Attempt to obtain the requested lock (exclusive or shared).
26
+		lock := syscall.Flock_t{Type: lockType}
27
+		err := syscall.FcntlFlock(fd, syscall.F_SETLK, &lock)
28
+		if err == nil {
29
+			return nil
30
+		} else if err != syscall.EAGAIN {
31
+			return err
32
+		}
33
+
34
+		// If we timed out then return an error.
35
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
36
+			return ErrTimeout
37
+		}
38
+
39
+		// Wait for a bit and try again.
40
+		time.Sleep(flockRetryTimeout)
41
+	}
42
+}
43
+
44
+// funlock releases an advisory lock on a file descriptor.
45
+func funlock(db *DB) error {
46
+	var lock syscall.Flock_t
47
+	lock.Start = 0
48
+	lock.Len = 0
49
+	lock.Type = syscall.F_UNLCK
50
+	lock.Whence = 0
51
+	return syscall.FcntlFlock(uintptr(db.file.Fd()), syscall.F_SETLK, &lock)
52
+}
53
+
54
+// mmap memory maps a DB's data file.
55
+func mmap(db *DB, sz int) error {
56
+	// Map the data file to memory.
57
+	b, err := unix.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED|db.MmapFlags)
58
+	if err != nil {
59
+		return err
60
+	}
61
+
62
+	// Advise the kernel that the mmap is accessed randomly.
63
+	if err := unix.Madvise(b, syscall.MADV_RANDOM); err != nil {
64
+		return fmt.Errorf("madvise: %s", err)
65
+	}
66
+
67
+	// Save the original byte slice and convert to a byte array pointer.
68
+	db.dataref = b
69
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
70
+	db.datasz = sz
71
+	return nil
72
+}
73
+
74
+// munmap unmaps a DB's data file from memory.
75
+func munmap(db *DB) error {
76
+	// Ignore the unmap if we have no mapped data.
77
+	if db.dataref == nil {
78
+		return nil
79
+	}
80
+
81
+	// Unmap using the original byte slice.
82
+	err := unix.Munmap(db.dataref)
83
+	db.dataref = nil
84
+	db.data = nil
85
+	db.datasz = 0
86
+	return err
87
+}
0 88
new file mode 100644
... ...
@@ -0,0 +1,141 @@
0
+package bbolt
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"syscall"
6
+	"time"
7
+	"unsafe"
8
+)
9
+
10
+// LockFileEx code derived from golang build filemutex_windows.go @ v1.5.1
11
+var (
12
+	modkernel32      = syscall.NewLazyDLL("kernel32.dll")
13
+	procLockFileEx   = modkernel32.NewProc("LockFileEx")
14
+	procUnlockFileEx = modkernel32.NewProc("UnlockFileEx")
15
+)
16
+
17
+const (
18
+	// see https://msdn.microsoft.com/en-us/library/windows/desktop/aa365203(v=vs.85).aspx
19
+	flagLockExclusive       = 2
20
+	flagLockFailImmediately = 1
21
+
22
+	// see https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx
23
+	errLockViolation syscall.Errno = 0x21
24
+)
25
+
26
+func lockFileEx(h syscall.Handle, flags, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) {
27
+	r, _, err := procLockFileEx.Call(uintptr(h), uintptr(flags), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)))
28
+	if r == 0 {
29
+		return err
30
+	}
31
+	return nil
32
+}
33
+
34
+func unlockFileEx(h syscall.Handle, reserved, locklow, lockhigh uint32, ol *syscall.Overlapped) (err error) {
35
+	r, _, err := procUnlockFileEx.Call(uintptr(h), uintptr(reserved), uintptr(locklow), uintptr(lockhigh), uintptr(unsafe.Pointer(ol)), 0)
36
+	if r == 0 {
37
+		return err
38
+	}
39
+	return nil
40
+}
41
+
42
+// fdatasync flushes written data to a file descriptor.
43
+func fdatasync(db *DB) error {
44
+	return db.file.Sync()
45
+}
46
+
47
+// flock acquires an advisory lock on a file descriptor.
48
+func flock(db *DB, exclusive bool, timeout time.Duration) error {
49
+	var t time.Time
50
+	if timeout != 0 {
51
+		t = time.Now()
52
+	}
53
+	var flag uint32 = flagLockFailImmediately
54
+	if exclusive {
55
+		flag |= flagLockExclusive
56
+	}
57
+	for {
58
+		// Fix for https://github.com/etcd-io/bbolt/issues/121. Use byte-range
59
+		// -1..0 as the lock on the database file.
60
+		var m1 uint32 = (1 << 32) - 1 // -1 in a uint32
61
+		err := lockFileEx(syscall.Handle(db.file.Fd()), flag, 0, 1, 0, &syscall.Overlapped{
62
+			Offset:     m1,
63
+			OffsetHigh: m1,
64
+		})
65
+
66
+		if err == nil {
67
+			return nil
68
+		} else if err != errLockViolation {
69
+			return err
70
+		}
71
+
72
+		// If we timed out then return an error.
73
+		if timeout != 0 && time.Since(t) > timeout-flockRetryTimeout {
74
+			return ErrTimeout
75
+		}
76
+
77
+		// Wait for a bit and try again.
78
+		time.Sleep(flockRetryTimeout)
79
+	}
80
+}
81
+
82
+// funlock releases an advisory lock on a file descriptor.
83
+func funlock(db *DB) error {
84
+	var m1 uint32 = (1 << 32) - 1 // -1 in a uint32
85
+	err := unlockFileEx(syscall.Handle(db.file.Fd()), 0, 1, 0, &syscall.Overlapped{
86
+		Offset:     m1,
87
+		OffsetHigh: m1,
88
+	})
89
+	return err
90
+}
91
+
92
+// mmap memory maps a DB's data file.
93
+// Based on: https://github.com/edsrzf/mmap-go
94
+func mmap(db *DB, sz int) error {
95
+	if !db.readOnly {
96
+		// Truncate the database to the size of the mmap.
97
+		if err := db.file.Truncate(int64(sz)); err != nil {
98
+			return fmt.Errorf("truncate: %s", err)
99
+		}
100
+	}
101
+
102
+	// Open a file mapping handle.
103
+	sizelo := uint32(sz >> 32)
104
+	sizehi := uint32(sz) & 0xffffffff
105
+	h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizelo, sizehi, nil)
106
+	if h == 0 {
107
+		return os.NewSyscallError("CreateFileMapping", errno)
108
+	}
109
+
110
+	// Create the memory map.
111
+	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(sz))
112
+	if addr == 0 {
113
+		return os.NewSyscallError("MapViewOfFile", errno)
114
+	}
115
+
116
+	// Close mapping handle.
117
+	if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
118
+		return os.NewSyscallError("CloseHandle", err)
119
+	}
120
+
121
+	// Convert to a byte array.
122
+	db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr)))
123
+	db.datasz = sz
124
+
125
+	return nil
126
+}
127
+
128
+// munmap unmaps a pointer from a file.
129
+// Based on: https://github.com/edsrzf/mmap-go
130
+func munmap(db *DB) error {
131
+	if db.data == nil {
132
+		return nil
133
+	}
134
+
135
+	addr := (uintptr)(unsafe.Pointer(&db.data[0]))
136
+	if err := syscall.UnmapViewOfFile(addr); err != nil {
137
+		return os.NewSyscallError("UnmapViewOfFile", err)
138
+	}
139
+	return nil
140
+}
0 141
new file mode 100644
... ...
@@ -0,0 +1,8 @@
0
+// +build !windows,!plan9,!linux,!openbsd
1
+
2
+package bbolt
3
+
4
+// fdatasync flushes written data to a file descriptor.
5
+func fdatasync(db *DB) error {
6
+	return db.file.Sync()
7
+}
0 8
new file mode 100644
... ...
@@ -0,0 +1,775 @@
0
+package bbolt
1
+
2
+import (
3
+	"bytes"
4
+	"fmt"
5
+	"unsafe"
6
+)
7
+
8
+const (
9
+	// MaxKeySize is the maximum length of a key, in bytes.
10
+	MaxKeySize = 32768
11
+
12
+	// MaxValueSize is the maximum length of a value, in bytes.
13
+	MaxValueSize = (1 << 31) - 2
14
+)
15
+
16
+const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
17
+
18
+const (
19
+	minFillPercent = 0.1
20
+	maxFillPercent = 1.0
21
+)
22
+
23
+// DefaultFillPercent is the percentage that split pages are filled.
24
+// This value can be changed by setting Bucket.FillPercent.
25
+const DefaultFillPercent = 0.5
26
+
27
+// Bucket represents a collection of key/value pairs inside the database.
28
+type Bucket struct {
29
+	*bucket
30
+	tx       *Tx                // the associated transaction
31
+	buckets  map[string]*Bucket // subbucket cache
32
+	page     *page              // inline page reference
33
+	rootNode *node              // materialized node for the root page.
34
+	nodes    map[pgid]*node     // node cache
35
+
36
+	// Sets the threshold for filling nodes when they split. By default,
37
+	// the bucket will fill to 50% but it can be useful to increase this
38
+	// amount if you know that your write workloads are mostly append-only.
39
+	//
40
+	// This is non-persisted across transactions so it must be set in every Tx.
41
+	FillPercent float64
42
+}
43
+
44
+// bucket represents the on-file representation of a bucket.
45
+// This is stored as the "value" of a bucket key. If the bucket is small enough,
46
+// then its root page can be stored inline in the "value", after the bucket
47
+// header. In the case of inline buckets, the "root" will be 0.
48
+type bucket struct {
49
+	root     pgid   // page id of the bucket's root-level page
50
+	sequence uint64 // monotonically incrementing, used by NextSequence()
51
+}
52
+
53
+// newBucket returns a new bucket associated with a transaction.
54
+func newBucket(tx *Tx) Bucket {
55
+	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
56
+	if tx.writable {
57
+		b.buckets = make(map[string]*Bucket)
58
+		b.nodes = make(map[pgid]*node)
59
+	}
60
+	return b
61
+}
62
+
63
+// Tx returns the tx of the bucket.
64
+func (b *Bucket) Tx() *Tx {
65
+	return b.tx
66
+}
67
+
68
+// Root returns the root of the bucket.
69
+func (b *Bucket) Root() pgid {
70
+	return b.root
71
+}
72
+
73
+// Writable returns whether the bucket is writable.
74
+func (b *Bucket) Writable() bool {
75
+	return b.tx.writable
76
+}
77
+
78
+// Cursor creates a cursor associated with the bucket.
79
+// The cursor is only valid as long as the transaction is open.
80
+// Do not use a cursor after the transaction is closed.
81
+func (b *Bucket) Cursor() *Cursor {
82
+	// Update transaction statistics.
83
+	b.tx.stats.CursorCount++
84
+
85
+	// Allocate and return a cursor.
86
+	return &Cursor{
87
+		bucket: b,
88
+		stack:  make([]elemRef, 0),
89
+	}
90
+}
91
+
92
+// Bucket retrieves a nested bucket by name.
93
+// Returns nil if the bucket does not exist.
94
+// The bucket instance is only valid for the lifetime of the transaction.
95
+func (b *Bucket) Bucket(name []byte) *Bucket {
96
+	if b.buckets != nil {
97
+		if child := b.buckets[string(name)]; child != nil {
98
+			return child
99
+		}
100
+	}
101
+
102
+	// Move cursor to key.
103
+	c := b.Cursor()
104
+	k, v, flags := c.seek(name)
105
+
106
+	// Return nil if the key doesn't exist or it is not a bucket.
107
+	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
108
+		return nil
109
+	}
110
+
111
+	// Otherwise create a bucket and cache it.
112
+	var child = b.openBucket(v)
113
+	if b.buckets != nil {
114
+		b.buckets[string(name)] = child
115
+	}
116
+
117
+	return child
118
+}
119
+
120
+// Helper method that re-interprets a sub-bucket value
121
+// from a parent into a Bucket
122
+func (b *Bucket) openBucket(value []byte) *Bucket {
123
+	var child = newBucket(b.tx)
124
+
125
+	// If unaligned load/stores are broken on this arch and value is
126
+	// unaligned simply clone to an aligned byte array.
127
+	unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0
128
+
129
+	if unaligned {
130
+		value = cloneBytes(value)
131
+	}
132
+
133
+	// If this is a writable transaction then we need to copy the bucket entry.
134
+	// Read-only transactions can point directly at the mmap entry.
135
+	if b.tx.writable && !unaligned {
136
+		child.bucket = &bucket{}
137
+		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
138
+	} else {
139
+		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
140
+	}
141
+
142
+	// Save a reference to the inline page if the bucket is inline.
143
+	if child.root == 0 {
144
+		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
145
+	}
146
+
147
+	return &child
148
+}
149
+
150
+// CreateBucket creates a new bucket at the given key and returns the new bucket.
151
+// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
152
+// The bucket instance is only valid for the lifetime of the transaction.
153
+func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
154
+	if b.tx.db == nil {
155
+		return nil, ErrTxClosed
156
+	} else if !b.tx.writable {
157
+		return nil, ErrTxNotWritable
158
+	} else if len(key) == 0 {
159
+		return nil, ErrBucketNameRequired
160
+	}
161
+
162
+	// Move cursor to correct position.
163
+	c := b.Cursor()
164
+	k, _, flags := c.seek(key)
165
+
166
+	// Return an error if there is an existing key.
167
+	if bytes.Equal(key, k) {
168
+		if (flags & bucketLeafFlag) != 0 {
169
+			return nil, ErrBucketExists
170
+		}
171
+		return nil, ErrIncompatibleValue
172
+	}
173
+
174
+	// Create empty, inline bucket.
175
+	var bucket = Bucket{
176
+		bucket:      &bucket{},
177
+		rootNode:    &node{isLeaf: true},
178
+		FillPercent: DefaultFillPercent,
179
+	}
180
+	var value = bucket.write()
181
+
182
+	// Insert into node.
183
+	key = cloneBytes(key)
184
+	c.node().put(key, key, value, 0, bucketLeafFlag)
185
+
186
+	// Since subbuckets are not allowed on inline buckets, we need to
187
+	// dereference the inline page, if it exists. This will cause the bucket
188
+	// to be treated as a regular, non-inline bucket for the rest of the tx.
189
+	b.page = nil
190
+
191
+	return b.Bucket(key), nil
192
+}
193
+
194
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
195
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
196
+// The bucket instance is only valid for the lifetime of the transaction.
197
+func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
198
+	child, err := b.CreateBucket(key)
199
+	if err == ErrBucketExists {
200
+		return b.Bucket(key), nil
201
+	} else if err != nil {
202
+		return nil, err
203
+	}
204
+	return child, nil
205
+}
206
+
207
+// DeleteBucket deletes a bucket at the given key.
208
+// Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
209
+func (b *Bucket) DeleteBucket(key []byte) error {
210
+	if b.tx.db == nil {
211
+		return ErrTxClosed
212
+	} else if !b.Writable() {
213
+		return ErrTxNotWritable
214
+	}
215
+
216
+	// Move cursor to correct position.
217
+	c := b.Cursor()
218
+	k, _, flags := c.seek(key)
219
+
220
+	// Return an error if bucket doesn't exist or is not a bucket.
221
+	if !bytes.Equal(key, k) {
222
+		return ErrBucketNotFound
223
+	} else if (flags & bucketLeafFlag) == 0 {
224
+		return ErrIncompatibleValue
225
+	}
226
+
227
+	// Recursively delete all child buckets.
228
+	child := b.Bucket(key)
229
+	err := child.ForEach(func(k, v []byte) error {
230
+		if v == nil {
231
+			if err := child.DeleteBucket(k); err != nil {
232
+				return fmt.Errorf("delete bucket: %s", err)
233
+			}
234
+		}
235
+		return nil
236
+	})
237
+	if err != nil {
238
+		return err
239
+	}
240
+
241
+	// Remove cached copy.
242
+	delete(b.buckets, string(key))
243
+
244
+	// Release all bucket pages to freelist.
245
+	child.nodes = nil
246
+	child.rootNode = nil
247
+	child.free()
248
+
249
+	// Delete the node if we have a matching key.
250
+	c.node().del(key)
251
+
252
+	return nil
253
+}
254
+
255
+// Get retrieves the value for a key in the bucket.
256
+// Returns a nil value if the key does not exist or if the key is a nested bucket.
257
+// The returned value is only valid for the life of the transaction.
258
+func (b *Bucket) Get(key []byte) []byte {
259
+	k, v, flags := b.Cursor().seek(key)
260
+
261
+	// Return nil if this is a bucket.
262
+	if (flags & bucketLeafFlag) != 0 {
263
+		return nil
264
+	}
265
+
266
+	// If our target node isn't the same key as what's passed in then return nil.
267
+	if !bytes.Equal(key, k) {
268
+		return nil
269
+	}
270
+	return v
271
+}
272
+
273
+// Put sets the value for a key in the bucket.
274
+// If the key exists then its previous value will be overwritten.
275
+// Supplied value must remain valid for the life of the transaction.
276
+// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
277
+func (b *Bucket) Put(key []byte, value []byte) error {
278
+	if b.tx.db == nil {
279
+		return ErrTxClosed
280
+	} else if !b.Writable() {
281
+		return ErrTxNotWritable
282
+	} else if len(key) == 0 {
283
+		return ErrKeyRequired
284
+	} else if len(key) > MaxKeySize {
285
+		return ErrKeyTooLarge
286
+	} else if int64(len(value)) > MaxValueSize {
287
+		return ErrValueTooLarge
288
+	}
289
+
290
+	// Move cursor to correct position.
291
+	c := b.Cursor()
292
+	k, _, flags := c.seek(key)
293
+
294
+	// Return an error if there is an existing key with a bucket value.
295
+	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
296
+		return ErrIncompatibleValue
297
+	}
298
+
299
+	// Insert into node.
300
+	key = cloneBytes(key)
301
+	c.node().put(key, key, value, 0, 0)
302
+
303
+	return nil
304
+}
305
+
306
+// Delete removes a key from the bucket.
307
+// If the key does not exist then nothing is done and a nil error is returned.
308
+// Returns an error if the bucket was created from a read-only transaction.
309
+func (b *Bucket) Delete(key []byte) error {
310
+	if b.tx.db == nil {
311
+		return ErrTxClosed
312
+	} else if !b.Writable() {
313
+		return ErrTxNotWritable
314
+	}
315
+
316
+	// Move cursor to correct position.
317
+	c := b.Cursor()
318
+	k, _, flags := c.seek(key)
319
+
320
+	// Return nil if the key doesn't exist.
321
+	if !bytes.Equal(key, k) {
322
+		return nil
323
+	}
324
+
325
+	// Return an error if there is already an existing bucket value.
326
+	if (flags & bucketLeafFlag) != 0 {
327
+		return ErrIncompatibleValue
328
+	}
329
+
330
+	// Delete the node if we have a matching key.
331
+	c.node().del(key)
332
+
333
+	return nil
334
+}
335
+
336
+// Sequence returns the current integer for the bucket without incrementing it.
337
+func (b *Bucket) Sequence() uint64 { return b.bucket.sequence }
338
+
339
+// SetSequence updates the sequence number for the bucket.
340
+func (b *Bucket) SetSequence(v uint64) error {
341
+	if b.tx.db == nil {
342
+		return ErrTxClosed
343
+	} else if !b.Writable() {
344
+		return ErrTxNotWritable
345
+	}
346
+
347
+	// Materialize the root node if it hasn't been already so that the
348
+	// bucket will be saved during commit.
349
+	if b.rootNode == nil {
350
+		_ = b.node(b.root, nil)
351
+	}
352
+
353
+	// Set the sequence to the supplied value.
354
+	b.bucket.sequence = v
355
+	return nil
356
+}
357
+
358
+// NextSequence returns an autoincrementing integer for the bucket.
359
+func (b *Bucket) NextSequence() (uint64, error) {
360
+	if b.tx.db == nil {
361
+		return 0, ErrTxClosed
362
+	} else if !b.Writable() {
363
+		return 0, ErrTxNotWritable
364
+	}
365
+
366
+	// Materialize the root node if it hasn't been already so that the
367
+	// bucket will be saved during commit.
368
+	if b.rootNode == nil {
369
+		_ = b.node(b.root, nil)
370
+	}
371
+
372
+	// Increment and return the sequence.
373
+	b.bucket.sequence++
374
+	return b.bucket.sequence, nil
375
+}
376
+
377
+// ForEach executes a function for each key/value pair in a bucket.
378
+// If the provided function returns an error then the iteration is stopped and
379
+// the error is returned to the caller. The provided function must not modify
380
+// the bucket; this will result in undefined behavior.
381
+func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
382
+	if b.tx.db == nil {
383
+		return ErrTxClosed
384
+	}
385
+	c := b.Cursor()
386
+	for k, v := c.First(); k != nil; k, v = c.Next() {
387
+		if err := fn(k, v); err != nil {
388
+			return err
389
+		}
390
+	}
391
+	return nil
392
+}
393
+
394
+// Stats returns stats on a bucket.
395
+func (b *Bucket) Stats() BucketStats {
396
+	var s, subStats BucketStats
397
+	pageSize := b.tx.db.pageSize
398
+	s.BucketN += 1
399
+	if b.root == 0 {
400
+		s.InlineBucketN += 1
401
+	}
402
+	b.forEachPage(func(p *page, depth int) {
403
+		if (p.flags & leafPageFlag) != 0 {
404
+			s.KeyN += int(p.count)
405
+
406
+			// used totals the used bytes for the page
407
+			used := pageHeaderSize
408
+
409
+			if p.count != 0 {
410
+				// If page has any elements, add all element headers.
411
+				used += leafPageElementSize * int(p.count-1)
412
+
413
+				// Add all element key, value sizes.
414
+				// The computation takes advantage of the fact that the position
415
+				// of the last element's key/value equals the total of the sizes
416
+				// of all previous elements' keys and values.
417
+				// It also includes the last element's header.
418
+				lastElement := p.leafPageElement(p.count - 1)
419
+				used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
420
+			}
421
+
422
+			if b.root == 0 {
423
+				// For inlined bucket just update the inline stats
424
+				s.InlineBucketInuse += used
425
+			} else {
426
+				// For non-inlined bucket update all the leaf stats
427
+				s.LeafPageN++
428
+				s.LeafInuse += used
429
+				s.LeafOverflowN += int(p.overflow)
430
+
431
+				// Collect stats from sub-buckets.
432
+				// Do that by iterating over all element headers
433
+				// looking for the ones with the bucketLeafFlag.
434
+				for i := uint16(0); i < p.count; i++ {
435
+					e := p.leafPageElement(i)
436
+					if (e.flags & bucketLeafFlag) != 0 {
437
+						// For any bucket element, open the element value
438
+						// and recursively call Stats on the contained bucket.
439
+						subStats.Add(b.openBucket(e.value()).Stats())
440
+					}
441
+				}
442
+			}
443
+		} else if (p.flags & branchPageFlag) != 0 {
444
+			s.BranchPageN++
445
+			lastElement := p.branchPageElement(p.count - 1)
446
+
447
+			// used totals the used bytes for the page
448
+			// Add header and all element headers.
449
+			used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
450
+
451
+			// Add size of all keys and values.
452
+			// Again, use the fact that the last element's position equals
453
+			// the total of key, value sizes of all previous elements.
454
+			used += int(lastElement.pos + lastElement.ksize)
455
+			s.BranchInuse += used
456
+			s.BranchOverflowN += int(p.overflow)
457
+		}
458
+
459
+		// Keep track of maximum page depth.
460
+		if depth+1 > s.Depth {
461
+			s.Depth = (depth + 1)
462
+		}
463
+	})
464
+
465
+	// Alloc stats can be computed from page counts and pageSize.
466
+	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
467
+	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize
468
+
469
+	// Add the max depth of sub-buckets to get total nested depth.
470
+	s.Depth += subStats.Depth
471
+	// Add the stats for all sub-buckets
472
+	s.Add(subStats)
473
+	return s
474
+}
475
+
476
+// forEachPage iterates over every page in a bucket, including inline pages.
477
+func (b *Bucket) forEachPage(fn func(*page, int)) {
478
+	// If we have an inline page then just use that.
479
+	if b.page != nil {
480
+		fn(b.page, 0)
481
+		return
482
+	}
483
+
484
+	// Otherwise traverse the page hierarchy.
485
+	b.tx.forEachPage(b.root, 0, fn)
486
+}
487
+
488
+// forEachPageNode iterates over every page (or node) in a bucket.
489
+// This also includes inline pages.
490
+func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
491
+	// If we have an inline page or root node then just use that.
492
+	if b.page != nil {
493
+		fn(b.page, nil, 0)
494
+		return
495
+	}
496
+	b._forEachPageNode(b.root, 0, fn)
497
+}
498
+
499
+func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
500
+	var p, n = b.pageNode(pgid)
501
+
502
+	// Execute function.
503
+	fn(p, n, depth)
504
+
505
+	// Recursively loop over children.
506
+	if p != nil {
507
+		if (p.flags & branchPageFlag) != 0 {
508
+			for i := 0; i < int(p.count); i++ {
509
+				elem := p.branchPageElement(uint16(i))
510
+				b._forEachPageNode(elem.pgid, depth+1, fn)
511
+			}
512
+		}
513
+	} else {
514
+		if !n.isLeaf {
515
+			for _, inode := range n.inodes {
516
+				b._forEachPageNode(inode.pgid, depth+1, fn)
517
+			}
518
+		}
519
+	}
520
+}
521
+
522
+// spill writes all the nodes for this bucket to dirty pages.
523
+func (b *Bucket) spill() error {
524
+	// Spill all child buckets first.
525
+	for name, child := range b.buckets {
526
+		// If the child bucket is small enough and it has no child buckets then
527
+		// write it inline into the parent bucket's page. Otherwise spill it
528
+		// like a normal bucket and make the parent value a pointer to the page.
529
+		var value []byte
530
+		if child.inlineable() {
531
+			child.free()
532
+			value = child.write()
533
+		} else {
534
+			if err := child.spill(); err != nil {
535
+				return err
536
+			}
537
+
538
+			// Update the child bucket header in this bucket.
539
+			value = make([]byte, unsafe.Sizeof(bucket{}))
540
+			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
541
+			*bucket = *child.bucket
542
+		}
543
+
544
+		// Skip writing the bucket if there are no materialized nodes.
545
+		if child.rootNode == nil {
546
+			continue
547
+		}
548
+
549
+		// Update parent node.
550
+		var c = b.Cursor()
551
+		k, _, flags := c.seek([]byte(name))
552
+		if !bytes.Equal([]byte(name), k) {
553
+			panic(fmt.Sprintf("misplaced bucket header: %x -> %x", []byte(name), k))
554
+		}
555
+		if flags&bucketLeafFlag == 0 {
556
+			panic(fmt.Sprintf("unexpected bucket header flag: %x", flags))
557
+		}
558
+		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
559
+	}
560
+
561
+	// Ignore if there's not a materialized root node.
562
+	if b.rootNode == nil {
563
+		return nil
564
+	}
565
+
566
+	// Spill nodes.
567
+	if err := b.rootNode.spill(); err != nil {
568
+		return err
569
+	}
570
+	b.rootNode = b.rootNode.root()
571
+
572
+	// Update the root node for this bucket.
573
+	if b.rootNode.pgid >= b.tx.meta.pgid {
574
+		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid))
575
+	}
576
+	b.root = b.rootNode.pgid
577
+
578
+	return nil
579
+}
580
+
581
+// inlineable returns true if a bucket is small enough to be written inline
582
+// and if it contains no subbuckets. Otherwise returns false.
583
+func (b *Bucket) inlineable() bool {
584
+	var n = b.rootNode
585
+
586
+	// Bucket must only contain a single leaf node.
587
+	if n == nil || !n.isLeaf {
588
+		return false
589
+	}
590
+
591
+	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
592
+	// our threshold for inline bucket size.
593
+	var size = pageHeaderSize
594
+	for _, inode := range n.inodes {
595
+		size += leafPageElementSize + len(inode.key) + len(inode.value)
596
+
597
+		if inode.flags&bucketLeafFlag != 0 {
598
+			return false
599
+		} else if size > b.maxInlineBucketSize() {
600
+			return false
601
+		}
602
+	}
603
+
604
+	return true
605
+}
606
+
607
+// Returns the maximum total size of a bucket to make it a candidate for inlining.
608
+func (b *Bucket) maxInlineBucketSize() int {
609
+	return b.tx.db.pageSize / 4
610
+}
611
+
612
+// write allocates and writes a bucket to a byte slice.
613
+func (b *Bucket) write() []byte {
614
+	// Allocate the appropriate size.
615
+	var n = b.rootNode
616
+	var value = make([]byte, bucketHeaderSize+n.size())
617
+
618
+	// Write a bucket header.
619
+	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
620
+	*bucket = *b.bucket
621
+
622
+	// Convert byte slice to a fake page and write the root node.
623
+	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
624
+	n.write(p)
625
+
626
+	return value
627
+}
628
+
629
+// rebalance attempts to balance all nodes.
630
+func (b *Bucket) rebalance() {
631
+	for _, n := range b.nodes {
632
+		n.rebalance()
633
+	}
634
+	for _, child := range b.buckets {
635
+		child.rebalance()
636
+	}
637
+}
638
+
639
+// node creates a node from a page and associates it with a given parent.
640
+func (b *Bucket) node(pgid pgid, parent *node) *node {
641
+	_assert(b.nodes != nil, "nodes map expected")
642
+
643
+	// Retrieve node if it's already been created.
644
+	if n := b.nodes[pgid]; n != nil {
645
+		return n
646
+	}
647
+
648
+	// Otherwise create a node and cache it.
649
+	n := &node{bucket: b, parent: parent}
650
+	if parent == nil {
651
+		b.rootNode = n
652
+	} else {
653
+		parent.children = append(parent.children, n)
654
+	}
655
+
656
+	// Use the inline page if this is an inline bucket.
657
+	var p = b.page
658
+	if p == nil {
659
+		p = b.tx.page(pgid)
660
+	}
661
+
662
+	// Read the page into the node and cache it.
663
+	n.read(p)
664
+	b.nodes[pgid] = n
665
+
666
+	// Update statistics.
667
+	b.tx.stats.NodeCount++
668
+
669
+	return n
670
+}
671
+
672
+// free recursively frees all pages in the bucket.
673
+func (b *Bucket) free() {
674
+	if b.root == 0 {
675
+		return
676
+	}
677
+
678
+	var tx = b.tx
679
+	b.forEachPageNode(func(p *page, n *node, _ int) {
680
+		if p != nil {
681
+			tx.db.freelist.free(tx.meta.txid, p)
682
+		} else {
683
+			n.free()
684
+		}
685
+	})
686
+	b.root = 0
687
+}
688
+
689
+// dereference removes all references to the old mmap.
690
+func (b *Bucket) dereference() {
691
+	if b.rootNode != nil {
692
+		b.rootNode.root().dereference()
693
+	}
694
+
695
+	for _, child := range b.buckets {
696
+		child.dereference()
697
+	}
698
+}
699
+
700
+// pageNode returns the in-memory node, if it exists.
701
+// Otherwise returns the underlying page.
702
+func (b *Bucket) pageNode(id pgid) (*page, *node) {
703
+	// Inline buckets have a fake page embedded in their value so treat them
704
+	// differently. We'll return the rootNode (if available) or the fake page.
705
+	if b.root == 0 {
706
+		if id != 0 {
707
+			panic(fmt.Sprintf("inline bucket non-zero page access(2): %d != 0", id))
708
+		}
709
+		if b.rootNode != nil {
710
+			return nil, b.rootNode
711
+		}
712
+		return b.page, nil
713
+	}
714
+
715
+	// Check the node cache for non-inline buckets.
716
+	if b.nodes != nil {
717
+		if n := b.nodes[id]; n != nil {
718
+			return nil, n
719
+		}
720
+	}
721
+
722
+	// Finally lookup the page from the transaction if no node is materialized.
723
+	return b.tx.page(id), nil
724
+}
725
+
726
+// BucketStats records statistics about resources used by a bucket.
727
+type BucketStats struct {
728
+	// Page count statistics.
729
+	BranchPageN     int // number of logical branch pages
730
+	BranchOverflowN int // number of physical branch overflow pages
731
+	LeafPageN       int // number of logical leaf pages
732
+	LeafOverflowN   int // number of physical leaf overflow pages
733
+
734
+	// Tree statistics.
735
+	KeyN  int // number of key/value pairs
736
+	Depth int // number of levels in B+tree
737
+
738
+	// Page size utilization.
739
+	BranchAlloc int // bytes allocated for physical branch pages
740
+	BranchInuse int // bytes actually used for branch data
741
+	LeafAlloc   int // bytes allocated for physical leaf pages
742
+	LeafInuse   int // bytes actually used for leaf data
743
+
744
+	// Bucket statistics
745
+	BucketN           int // total number of buckets including the top bucket
746
+	InlineBucketN     int // total number of inlined buckets
747
+	InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
748
+}
749
+
750
+func (s *BucketStats) Add(other BucketStats) {
751
+	s.BranchPageN += other.BranchPageN
752
+	s.BranchOverflowN += other.BranchOverflowN
753
+	s.LeafPageN += other.LeafPageN
754
+	s.LeafOverflowN += other.LeafOverflowN
755
+	s.KeyN += other.KeyN
756
+	if s.Depth < other.Depth {
757
+		s.Depth = other.Depth
758
+	}
759
+	s.BranchAlloc += other.BranchAlloc
760
+	s.BranchInuse += other.BranchInuse
761
+	s.LeafAlloc += other.LeafAlloc
762
+	s.LeafInuse += other.LeafInuse
763
+
764
+	s.BucketN += other.BucketN
765
+	s.InlineBucketN += other.InlineBucketN
766
+	s.InlineBucketInuse += other.InlineBucketInuse
767
+}
768
+
769
+// cloneBytes returns a copy of a given slice.
770
+func cloneBytes(v []byte) []byte {
771
+	var clone = make([]byte, len(v))
772
+	copy(clone, v)
773
+	return clone
774
+}
0 775
new file mode 100644
... ...
@@ -0,0 +1,396 @@
0
+package bbolt
1
+
2
+import (
3
+	"bytes"
4
+	"fmt"
5
+	"sort"
6
+)
7
+
8
+// Cursor represents an iterator that can traverse over all key/value pairs in a bucket in sorted order.
9
+// Cursors see nested buckets with value == nil.
10
+// Cursors can be obtained from a transaction and are valid as long as the transaction is open.
11
+//
12
+// Keys and values returned from the cursor are only valid for the life of the transaction.
13
+//
14
+// Changing data while traversing with a cursor may cause it to be invalidated
15
+// and return unexpected keys and/or values. You must reposition your cursor
16
+// after mutating data.
17
+type Cursor struct {
18
+	bucket *Bucket
19
+	stack  []elemRef
20
+}
21
+
22
+// Bucket returns the bucket that this cursor was created from.
23
+func (c *Cursor) Bucket() *Bucket {
24
+	return c.bucket
25
+}
26
+
27
+// First moves the cursor to the first item in the bucket and returns its key and value.
28
+// If the bucket is empty then a nil key and value are returned.
29
+// The returned key and value are only valid for the life of the transaction.
30
+func (c *Cursor) First() (key []byte, value []byte) {
31
+	_assert(c.bucket.tx.db != nil, "tx closed")
32
+	c.stack = c.stack[:0]
33
+	p, n := c.bucket.pageNode(c.bucket.root)
34
+	c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
35
+	c.first()
36
+
37
+	// If we land on an empty page then move to the next value.
38
+	// https://github.com/boltdb/bolt/issues/450
39
+	if c.stack[len(c.stack)-1].count() == 0 {
40
+		c.next()
41
+	}
42
+
43
+	k, v, flags := c.keyValue()
44
+	if (flags & uint32(bucketLeafFlag)) != 0 {
45
+		return k, nil
46
+	}
47
+	return k, v
48
+
49
+}
50
+
51
+// Last moves the cursor to the last item in the bucket and returns its key and value.
52
+// If the bucket is empty then a nil key and value are returned.
53
+// The returned key and value are only valid for the life of the transaction.
54
+func (c *Cursor) Last() (key []byte, value []byte) {
55
+	_assert(c.bucket.tx.db != nil, "tx closed")
56
+	c.stack = c.stack[:0]
57
+	p, n := c.bucket.pageNode(c.bucket.root)
58
+	ref := elemRef{page: p, node: n}
59
+	ref.index = ref.count() - 1
60
+	c.stack = append(c.stack, ref)
61
+	c.last()
62
+	k, v, flags := c.keyValue()
63
+	if (flags & uint32(bucketLeafFlag)) != 0 {
64
+		return k, nil
65
+	}
66
+	return k, v
67
+}
68
+
69
+// Next moves the cursor to the next item in the bucket and returns its key and value.
70
+// If the cursor is at the end of the bucket then a nil key and value are returned.
71
+// The returned key and value are only valid for the life of the transaction.
72
+func (c *Cursor) Next() (key []byte, value []byte) {
73
+	_assert(c.bucket.tx.db != nil, "tx closed")
74
+	k, v, flags := c.next()
75
+	if (flags & uint32(bucketLeafFlag)) != 0 {
76
+		return k, nil
77
+	}
78
+	return k, v
79
+}
80
+
81
+// Prev moves the cursor to the previous item in the bucket and returns its key and value.
82
+// If the cursor is at the beginning of the bucket then a nil key and value are returned.
83
+// The returned key and value are only valid for the life of the transaction.
84
+func (c *Cursor) Prev() (key []byte, value []byte) {
85
+	_assert(c.bucket.tx.db != nil, "tx closed")
86
+
87
+	// Attempt to move back one element until we're successful.
88
+	// Pop exhausted levels off the stack as we hit the beginning of each page.
89
+	for i := len(c.stack) - 1; i >= 0; i-- {
90
+		elem := &c.stack[i]
91
+		if elem.index > 0 {
92
+			elem.index--
93
+			break
94
+		}
95
+		c.stack = c.stack[:i]
96
+	}
97
+
98
+	// An empty stack means every level was already at index 0, i.e. there is no previous element.
99
+	if len(c.stack) == 0 {
100
+		return nil, nil
101
+	}
102
+
103
+	// Move down the stack to find the last element of the last leaf under this branch.
104
+	c.last()
105
+	k, v, flags := c.keyValue()
106
+	if (flags & uint32(bucketLeafFlag)) != 0 { // nested buckets are reported with a nil value
107
+		return k, nil
108
+	}
109
+	return k, v
110
+}
111
+
112
+// Seek moves the cursor to a given key and returns it.
113
+// If the key does not exist then the next key is used. If no keys
114
+// follow, a nil key is returned.
115
+// The returned key and value are only valid for the life of the transaction.
116
+func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
117
+	k, v, flags := c.seek(seek)
118
+
119
+	// If we ended up after the last element of a page then move to the next one.
120
+	if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() {
121
+		k, v, flags = c.next()
122
+	}
123
+
124
+	if k == nil {
125
+		return nil, nil
126
+	} else if (flags & uint32(bucketLeafFlag)) != 0 {
127
+		return k, nil
128
+	}
129
+	return k, v
130
+}
131
+
132
+// Delete removes the current key/value under the cursor from the bucket.
133
+// Delete fails if current key/value is a bucket or if the transaction is not writable.
134
+func (c *Cursor) Delete() error {
135
+	if c.bucket.tx.db == nil {
136
+		return ErrTxClosed
137
+	} else if !c.bucket.Writable() {
138
+		return ErrTxNotWritable
139
+	}
140
+
141
+	key, _, flags := c.keyValue()
142
+	// Return an error if current value is a bucket.
143
+	if (flags & bucketLeafFlag) != 0 {
144
+		return ErrIncompatibleValue
145
+	}
146
+	c.node().del(key)
147
+
148
+	return nil
149
+}
150
+
151
+// seek moves the cursor to a given key and returns the key, value and flags found there.
152
+// If the key does not exist then the next key is used.
153
+func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
154
+	_assert(c.bucket.tx.db != nil, "tx closed")
155
+
156
+	// Start from root page/node and traverse to correct page.
157
+	c.stack = c.stack[:0]
158
+	c.search(seek, c.bucket.root)
159
+
160
+	// Return the raw element, flags included; keyValue yields nils when the index is past the end of the leaf.
161
+	return c.keyValue()
162
+}
163
+
164
+// first moves the cursor to the first leaf element under the last page in the stack.
165
+func (c *Cursor) first() {
166
+	for {
167
+		// Exit when we hit a leaf page.
168
+		var ref = &c.stack[len(c.stack)-1]
169
+		if ref.isLeaf() {
170
+			break
171
+		}
172
+
173
+		// Keep adding pages pointing to the first element to the stack.
174
+		var pgid pgid
175
+		if ref.node != nil {
176
+			pgid = ref.node.inodes[ref.index].pgid
177
+		} else {
178
+			pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
179
+		}
180
+		p, n := c.bucket.pageNode(pgid)
181
+		c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
182
+	}
183
+}
184
+
185
+// last moves the cursor to the last leaf element under the last page in the stack.
186
+func (c *Cursor) last() {
187
+	for {
188
+		// Exit when we hit a leaf page.
189
+		ref := &c.stack[len(c.stack)-1]
190
+		if ref.isLeaf() {
191
+			break
192
+		}
193
+
194
+		// Keep adding pages pointing to the last element in the stack.
195
+		var pgid pgid
196
+		if ref.node != nil {
197
+			pgid = ref.node.inodes[ref.index].pgid
198
+		} else {
199
+			pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
200
+		}
201
+		p, n := c.bucket.pageNode(pgid)
202
+
203
+		var nextRef = elemRef{page: p, node: n}
204
+		nextRef.index = nextRef.count() - 1
205
+		c.stack = append(c.stack, nextRef)
206
+	}
207
+}
208
+
209
+// next moves to the next leaf element and returns the key and value.
210
+// If the cursor is at the last leaf element then it stays there and returns nil.
211
+func (c *Cursor) next() (key []byte, value []byte, flags uint32) {
212
+	for {
213
+		// Attempt to move over one element until we're successful.
214
+		// Move up the stack as we hit the end of each page in our stack.
215
+		var i int
216
+		for i = len(c.stack) - 1; i >= 0; i-- {
217
+			elem := &c.stack[i]
218
+			if elem.index < elem.count()-1 {
219
+				elem.index++
220
+				break
221
+			}
222
+		}
223
+
224
+		// If we've hit the root page then stop and return. This will leave the
225
+		// cursor on the last element of the last page.
226
+		if i == -1 {
227
+			return nil, nil, 0
228
+		}
229
+
230
+		// Otherwise start from where we left off in the stack and find the
231
+		// first element of the first leaf page.
232
+		c.stack = c.stack[:i+1]
233
+		c.first()
234
+
235
+		// If this is an empty page then restart and move back up the stack.
236
+		// https://github.com/boltdb/bolt/issues/450
237
+		if c.stack[len(c.stack)-1].count() == 0 {
238
+			continue
239
+		}
240
+
241
+		return c.keyValue()
242
+	}
243
+}
244
+
245
+// search recursively performs a binary search against a given page/node until it finds a given key.
246
+func (c *Cursor) search(key []byte, pgid pgid) {
247
+	p, n := c.bucket.pageNode(pgid)
248
+	if p != nil && (p.flags&(branchPageFlag|leafPageFlag)) == 0 {
249
+		panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags))
250
+	}
251
+	e := elemRef{page: p, node: n}
252
+	c.stack = append(c.stack, e)
253
+
254
+	// If we're on a leaf page/node then find the specific node.
255
+	if e.isLeaf() {
256
+		c.nsearch(key)
257
+		return
258
+	}
259
+
260
+	if n != nil {
261
+		c.searchNode(key, n)
262
+		return
263
+	}
264
+	c.searchPage(key, p)
265
+}
266
+
267
+func (c *Cursor) searchNode(key []byte, n *node) {
268
+	var exact bool
269
+	index := sort.Search(len(n.inodes), func(i int) bool {
270
+		// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
271
+		// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
272
+		ret := bytes.Compare(n.inodes[i].key, key)
273
+		if ret == 0 {
274
+			exact = true
275
+		}
276
+		return ret != -1
277
+	})
278
+	if !exact && index > 0 {
279
+		index--
280
+	}
281
+	c.stack[len(c.stack)-1].index = index
282
+
283
+	// Recursively search to the next page.
284
+	c.search(key, n.inodes[index].pgid)
285
+}
286
+
287
+func (c *Cursor) searchPage(key []byte, p *page) {
288
+	// Binary search for the correct range.
289
+	inodes := p.branchPageElements()
290
+
291
+	var exact bool
292
+	index := sort.Search(int(p.count), func(i int) bool {
293
+		// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
294
+		// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
295
+		ret := bytes.Compare(inodes[i].key(), key)
296
+		if ret == 0 {
297
+			exact = true
298
+		}
299
+		return ret != -1
300
+	})
301
+	if !exact && index > 0 {
302
+		index--
303
+	}
304
+	c.stack[len(c.stack)-1].index = index
305
+
306
+	// Recursively search to the next page.
307
+	c.search(key, inodes[index].pgid)
308
+}
309
+
310
+// nsearch searches the leaf node on the top of the stack for a key.
311
+func (c *Cursor) nsearch(key []byte) {
312
+	e := &c.stack[len(c.stack)-1]
313
+	p, n := e.page, e.node
314
+
315
+	// If we have a node then search its inodes.
316
+	if n != nil {
317
+		index := sort.Search(len(n.inodes), func(i int) bool {
318
+			return bytes.Compare(n.inodes[i].key, key) != -1
319
+		})
320
+		e.index = index
321
+		return
322
+	}
323
+
324
+	// If we have a page then search its leaf elements.
325
+	inodes := p.leafPageElements()
326
+	index := sort.Search(int(p.count), func(i int) bool {
327
+		return bytes.Compare(inodes[i].key(), key) != -1
328
+	})
329
+	e.index = index
330
+}
331
+
332
+// keyValue returns the key and value of the current leaf element.
333
+func (c *Cursor) keyValue() ([]byte, []byte, uint32) {
334
+	ref := &c.stack[len(c.stack)-1]
335
+
336
+	// If the cursor is pointing to the end of page/node then return nil.
337
+	if ref.count() == 0 || ref.index >= ref.count() {
338
+		return nil, nil, 0
339
+	}
340
+
341
+	// Retrieve value from node.
342
+	if ref.node != nil {
343
+		inode := &ref.node.inodes[ref.index]
344
+		return inode.key, inode.value, inode.flags
345
+	}
346
+
347
+	// Or retrieve value from page.
348
+	elem := ref.page.leafPageElement(uint16(ref.index))
349
+	return elem.key(), elem.value(), elem.flags
350
+}
351
+
352
+// node returns the node that the cursor is currently positioned on.
353
+func (c *Cursor) node() *node {
354
+	_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")
355
+
356
+	// If the top of the stack is a leaf node then just return it.
357
+	if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() {
358
+		return ref.node
359
+	}
360
+
361
+	// Start from root and traverse down the hierarchy.
362
+	var n = c.stack[0].node
363
+	if n == nil {
364
+		n = c.bucket.node(c.stack[0].page.id, nil)
365
+	}
366
+	for _, ref := range c.stack[:len(c.stack)-1] {
367
+		_assert(!n.isLeaf, "expected branch node")
368
+		n = n.childAt(int(ref.index))
369
+	}
370
+	_assert(n.isLeaf, "expected leaf node")
371
+	return n
372
+}
373
+
374
+// elemRef represents a reference to an element on a given page/node.
375
+type elemRef struct {
376
+	page  *page
377
+	node  *node
378
+	index int
379
+}
380
+
381
+// isLeaf returns whether the ref is pointing at a leaf page/node.
382
+func (r *elemRef) isLeaf() bool {
383
+	if r.node != nil {
384
+		return r.node.isLeaf
385
+	}
386
+	return (r.page.flags & leafPageFlag) != 0
387
+}
388
+
389
+// count returns the number of inodes or page elements.
390
+func (r *elemRef) count() int {
391
+	if r.node != nil {
392
+		return len(r.node.inodes)
393
+	}
394
+	return int(r.page.count)
395
+}
0 396
new file mode 100644
... ...
@@ -0,0 +1,1138 @@
0
+package bbolt
1
+
2
+import (
3
+	"errors"
4
+	"fmt"
5
+	"hash/fnv"
6
+	"log"
7
+	"os"
8
+	"runtime"
9
+	"sort"
10
+	"sync"
11
+	"time"
12
+	"unsafe"
13
+)
14
+
15
+// The largest step that can be taken when remapping the mmap.
16
+const maxMmapStep = 1 << 30 // 1GB
17
+
18
+// The data file format version.
19
+const version = 2
20
+
21
+// Represents a marker value to indicate that a file is a Bolt DB.
22
+const magic uint32 = 0xED0CDAED
23
+
24
+const pgidNoFreelist pgid = 0xffffffffffffffff
25
+
26
+// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
27
+// syncing changes to a file.  This is required as some operating systems,
28
+// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
29
+// must be synchronized using the msync(2) syscall.
30
+const IgnoreNoSync = runtime.GOOS == "openbsd"
31
+
32
+// Default values if not set in a DB instance.
33
+const (
34
+	DefaultMaxBatchSize  int = 1000
35
+	DefaultMaxBatchDelay     = 10 * time.Millisecond
36
+	DefaultAllocSize         = 16 * 1024 * 1024
37
+)
38
+
39
+// default page size for db is set to the OS page size.
40
+var defaultPageSize = os.Getpagesize()
41
+
42
+// The time elapsed between consecutive file locking attempts.
43
+const flockRetryTimeout = 50 * time.Millisecond
44
+
45
+// DB represents a collection of buckets persisted to a file on disk.
46
+// All data access is performed through transactions which can be obtained through the DB.
47
+// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
48
+type DB struct {
49
+	// When enabled, the database will perform a Check() after every commit.
50
+	// A panic is issued if the database is in an inconsistent state. This
51
+	// flag has a large performance impact so it should only be used for
52
+	// debugging purposes.
53
+	StrictMode bool
54
+
55
+	// Setting the NoSync flag will cause the database to skip fsync()
56
+	// calls after each commit. This can be useful when bulk loading data
57
+	// into a database and you can restart the bulk load in the event of
58
+	// a system failure or database corruption. Do not set this flag for
59
+	// normal use.
60
+	//
61
+	// If the package global IgnoreNoSync constant is true, this value is
62
+	// ignored.  See the comment on that constant for more details.
63
+	//
64
+	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
65
+	NoSync bool
66
+
67
+	// When true, skips syncing freelist to disk. This improves the database
68
+	// write performance under normal operation, but requires a full database
69
+	// re-sync during recovery.
70
+	NoFreelistSync bool
71
+
72
+	// When true, skips the truncate call when growing the database.
73
+	// Setting this to true is only safe on non-ext3/ext4 systems.
74
+	// Skipping truncation avoids preallocation of hard drive space and
75
+	// bypasses a truncate() and fsync() syscall on remapping.
76
+	//
77
+	// https://github.com/boltdb/bolt/issues/284
78
+	NoGrowSync bool
79
+
80
+	// If you want to read the entire database fast, you can set MmapFlags to
81
+	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
82
+	MmapFlags int
83
+
84
+	// MaxBatchSize is the maximum size of a batch. Default value is
85
+	// copied from DefaultMaxBatchSize in Open.
86
+	//
87
+	// If <=0, disables batching.
88
+	//
89
+	// Do not change concurrently with calls to Batch.
90
+	MaxBatchSize int
91
+
92
+	// MaxBatchDelay is the maximum delay before a batch starts.
93
+	// Default value is copied from DefaultMaxBatchDelay in Open.
94
+	//
95
+	// If <=0, effectively disables batching.
96
+	//
97
+	// Do not change concurrently with calls to Batch.
98
+	MaxBatchDelay time.Duration
99
+
100
+	// AllocSize is the amount of space allocated when the database
101
+	// needs to create new pages. This is done to amortize the cost
102
+	// of truncate() and fsync() when growing the data file.
103
+	AllocSize int
104
+
105
+	path     string
106
+	file     *os.File
107
+	dataref  []byte // mmap'ed readonly, write throws SEGV
108
+	data     *[maxMapSize]byte
109
+	datasz   int
110
+	filesz   int // current on disk file size
111
+	meta0    *meta
112
+	meta1    *meta
113
+	pageSize int
114
+	opened   bool
115
+	rwtx     *Tx
116
+	txs      []*Tx
117
+	stats    Stats
118
+
119
+	freelist     *freelist
120
+	freelistLoad sync.Once
121
+
122
+	pagePool sync.Pool
123
+
124
+	batchMu sync.Mutex
125
+	batch   *batch
126
+
127
+	rwlock   sync.Mutex   // Allows only one writer at a time.
128
+	metalock sync.Mutex   // Protects meta page access.
129
+	mmaplock sync.RWMutex // Protects mmap access during remapping.
130
+	statlock sync.RWMutex // Protects stats access.
131
+
132
+	ops struct {
133
+		writeAt func(b []byte, off int64) (n int, err error)
134
+	}
135
+
136
+	// Read only mode.
137
+	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
138
+	readOnly bool
139
+}
140
+
141
+// Path returns the path to currently open database file.
142
+func (db *DB) Path() string {
143
+	return db.path
144
+}
145
+
146
+// GoString returns the Go string representation of the database.
147
+func (db *DB) GoString() string {
148
+	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
149
+}
150
+
151
+// String returns the string representation of the database.
152
+func (db *DB) String() string {
153
+	return fmt.Sprintf("DB<%q>", db.path)
154
+}
155
+
156
+// Open creates and opens a database at the given path.
157
+// If the file does not exist then it will be created automatically.
158
+// Passing in nil options will cause Bolt to open the database with the default options.
159
+func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
160
+	db := &DB{
161
+		opened: true,
162
+	}
163
+	// Set default options if no options are provided.
164
+	if options == nil {
165
+		options = DefaultOptions
166
+	}
167
+	db.NoSync = options.NoSync
168
+	db.NoGrowSync = options.NoGrowSync
169
+	db.MmapFlags = options.MmapFlags
170
+	db.NoFreelistSync = options.NoFreelistSync
171
+
172
+	// Set default values for later DB operations.
173
+	db.MaxBatchSize = DefaultMaxBatchSize
174
+	db.MaxBatchDelay = DefaultMaxBatchDelay
175
+	db.AllocSize = DefaultAllocSize
176
+
177
+	flag := os.O_RDWR
178
+	if options.ReadOnly {
179
+		flag = os.O_RDONLY
180
+		db.readOnly = true
181
+	}
182
+
183
+	// Open data file and separate sync handler for metadata writes.
184
+	db.path = path
185
+	var err error
186
+	if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
187
+		_ = db.close()
188
+		return nil, err
189
+	}
190
+
191
+	// Lock file so that other processes using Bolt in read-write mode cannot
192
+	// use the database  at the same time. This would cause corruption since
193
+	// the two processes would write meta pages and free pages separately.
194
+	// The database file is locked exclusively (only one process can grab the lock)
195
+	// if !options.ReadOnly.
196
+	// The database file is locked using the shared lock (more than one process may
197
+	// hold a lock at the same time) otherwise (options.ReadOnly is set).
198
+	if err := flock(db, !db.readOnly, options.Timeout); err != nil {
199
+		_ = db.close()
200
+		return nil, err
201
+	}
202
+
203
+	// Default values for test hooks
204
+	db.ops.writeAt = db.file.WriteAt
205
+
206
+	if db.pageSize = options.PageSize; db.pageSize == 0 {
207
+		// Set the default page size to the OS page size.
208
+		db.pageSize = defaultPageSize
209
+	}
210
+
211
+	// Initialize the database if it doesn't exist.
212
+	if info, err := db.file.Stat(); err != nil {
213
+		_ = db.close()
214
+		return nil, err
215
+	} else if info.Size() == 0 {
216
+		// Initialize new files with meta pages.
217
+		if err := db.init(); err != nil {
218
+			// clean up file descriptor on initialization fail
219
+			_ = db.close()
220
+			return nil, err
221
+		}
222
+	} else {
223
+		// Read the first meta page to determine the page size.
224
+		var buf [0x1000]byte
225
+		// If we can't read the page size, but can read a page, assume
226
+		// it's the same as the OS or one given -- since that's how the
227
+		// page size was chosen in the first place.
228
+		//
229
+		// If the first page is invalid and this OS uses a different
230
+		// page size than what the database was created with then we
231
+		// are out of luck and cannot access the database.
232
+		//
233
+		// TODO: scan for next page
234
+		if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
235
+			if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
236
+				db.pageSize = int(m.pageSize)
237
+			}
238
+		} else {
239
+			_ = db.close()
240
+			return nil, ErrInvalid
241
+		}
242
+	}
243
+
244
+	// Initialize page pool.
245
+	db.pagePool = sync.Pool{
246
+		New: func() interface{} {
247
+			return make([]byte, db.pageSize)
248
+		},
249
+	}
250
+
251
+	// Memory map the data file.
252
+	if err := db.mmap(options.InitialMmapSize); err != nil {
253
+		_ = db.close()
254
+		return nil, err
255
+	}
256
+
257
+	if db.readOnly {
258
+		return db, nil
259
+	}
260
+
261
+	db.loadFreelist()
262
+
263
+	// Flush freelist when transitioning from no sync to sync so
264
+	// NoFreelistSync unaware boltdb can open the db later.
265
+	if !db.NoFreelistSync && !db.hasSyncedFreelist() {
266
+		tx, err := db.Begin(true)
267
+		if tx != nil {
268
+			err = tx.Commit()
269
+		}
270
+		if err != nil {
271
+			_ = db.close()
272
+			return nil, err
273
+		}
274
+	}
275
+
276
+	// Mark the database as opened and return.
277
+	return db, nil
278
+}
279
+
280
+// loadFreelist reads the freelist if it is synced, or reconstructs it
281
+// by scanning the DB if it is not synced. It assumes there are no
282
+// concurrent accesses being made to the freelist.
283
+func (db *DB) loadFreelist() {
284
+	db.freelistLoad.Do(func() {
285
+		db.freelist = newFreelist()
286
+		if !db.hasSyncedFreelist() {
287
+			// Reconstruct free list by scanning the DB.
288
+			db.freelist.readIDs(db.freepages())
289
+		} else {
290
+			// Read free list from freelist page.
291
+			db.freelist.read(db.page(db.meta().freelist))
292
+		}
293
+		db.stats.FreePageN = len(db.freelist.ids)
294
+	})
295
+}
296
+
297
+func (db *DB) hasSyncedFreelist() bool {
298
+	return db.meta().freelist != pgidNoFreelist
299
+}
300
+
301
+// mmap opens the underlying memory-mapped file and initializes the meta references.
302
+// minsz is the minimum size that the new mmap can be.
303
+func (db *DB) mmap(minsz int) error {
304
+	db.mmaplock.Lock()
305
+	defer db.mmaplock.Unlock()
306
+
307
+	info, err := db.file.Stat()
308
+	if err != nil {
309
+		return fmt.Errorf("mmap stat error: %s", err)
310
+	} else if int(info.Size()) < db.pageSize*2 {
311
+		return fmt.Errorf("file size too small")
312
+	}
313
+
314
+	// Ensure the size is at least the minimum size.
315
+	var size = int(info.Size())
316
+	if size < minsz {
317
+		size = minsz
318
+	}
319
+	size, err = db.mmapSize(size)
320
+	if err != nil {
321
+		return err
322
+	}
323
+
324
+	// Dereference all mmap references before unmapping.
325
+	if db.rwtx != nil {
326
+		db.rwtx.root.dereference()
327
+	}
328
+
329
+	// Unmap existing data before continuing.
330
+	if err := db.munmap(); err != nil {
331
+		return err
332
+	}
333
+
334
+	// Memory-map the data file as a byte slice.
335
+	if err := mmap(db, size); err != nil {
336
+		return err
337
+	}
338
+
339
+	// Save references to the meta pages.
340
+	db.meta0 = db.page(0).meta()
341
+	db.meta1 = db.page(1).meta()
342
+
343
+	// Validate the meta pages. We only return an error if both meta pages fail
344
+	// validation, since meta0 failing validation means that it wasn't saved
345
+	// properly -- but we can recover using meta1. And vice-versa.
346
+	err0 := db.meta0.validate()
347
+	err1 := db.meta1.validate()
348
+	if err0 != nil && err1 != nil {
349
+		return err0
350
+	}
351
+
352
+	return nil
353
+}
354
+
355
+// munmap unmaps the data file from memory. A failure is returned as a new error prefixed with "unmap error: ".
355
+func (db *DB) munmap() error {
356
+	if err := munmap(db); err != nil {
357
+		return fmt.Errorf("unmap error: %s", err) // constant format: a '%' in the error text must not be parsed as a verb
358
+	}
359
+	return nil
360
+}
362
+
363
+// mmapSize determines the appropriate size for the mmap given the current size
364
+// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
365
+// Returns an error if the requested size is greater than maxMapSize.
366
+func (db *DB) mmapSize(size int) (int, error) {
367
+	// Return the smallest power of two between 32KB (1<<15) and 1GB (1<<30) that fits size.
368
+	for i := uint(15); i <= 30; i++ {
369
+		if size <= 1<<i {
370
+			return 1 << i, nil
371
+		}
372
+	}
373
+
374
+	// Verify the requested size is not above the maximum allowed.
375
+	if size > maxMapSize {
376
+		return 0, fmt.Errorf("mmap too large")
377
+	}
378
+
379
+	// If larger than 1GB then round up to the next multiple of maxMmapStep (1GB).
380
+	sz := int64(size)
381
+	if remainder := sz % int64(maxMmapStep); remainder > 0 {
382
+		sz += int64(maxMmapStep) - remainder
383
+	}
384
+
385
+	// Ensure that the mmap size is a multiple of the page size.
386
+	// NOTE(review): presumably a no-op for power-of-two page sizes, since sz is already a multiple of 1GB.
387
+	pageSize := int64(db.pageSize)
388
+	if (sz % pageSize) != 0 {
389
+		sz = ((sz / pageSize) + 1) * pageSize
390
+	}
391
+
392
+	// If we've exceeded the max size then only grow up to the max size.
393
+	if sz > maxMapSize {
394
+		sz = maxMapSize
395
+	}
396
+
397
+	return int(sz), nil
398
+}
399
+
400
+// init writes the four initial pages (two meta pages, an empty freelist and an empty leaf) to the data file and syncs it.
401
+func (db *DB) init() error {
402
+	// Lay out all four initial pages in one buffer, starting with the two meta pages.
403
+	buf := make([]byte, db.pageSize*4)
404
+	for i := 0; i < 2; i++ {
405
+		p := db.pageInBuffer(buf[:], pgid(i))
406
+		p.id = pgid(i)
407
+		p.flags = metaPageFlag
408
+
409
+		// Initialize the meta page.
410
+		m := p.meta()
411
+		m.magic = magic
412
+		m.version = version
413
+		m.pageSize = uint32(db.pageSize)
414
+		m.freelist = 2 // pgid of the freelist page written below
415
+		m.root = bucket{root: 3} // pgid of the empty leaf page written below
416
+		m.pgid = 4 // NOTE(review): presumably the first unallocated page id, since pages 0-3 are written here
417
+		m.txid = txid(i)
418
+		m.checksum = m.sum64()
419
+	}
420
+
421
+	// Write an empty freelist as the third page (pgid 2).
422
+	p := db.pageInBuffer(buf[:], pgid(2))
423
+	p.id = pgid(2)
424
+	p.flags = freelistPageFlag
425
+	p.count = 0
426
+
427
+	// Write an empty leaf page as the fourth page (pgid 3).
428
+	p = db.pageInBuffer(buf[:], pgid(3))
429
+	p.id = pgid(3)
430
+	p.flags = leafPageFlag
431
+	p.count = 0
432
+
433
+	// Write the buffer to our data file.
434
+	if _, err := db.ops.writeAt(buf, 0); err != nil {
435
+		return err
436
+	}
437
+	if err := fdatasync(db); err != nil {
438
+		return err
439
+	}
440
+
441
+	return nil
442
+}
443
+
444
+// Close releases all database resources.
445
+// It will block waiting for any open transactions to finish
446
+// before closing the database and returning.
447
+func (db *DB) Close() error {
448
+	db.rwlock.Lock()
449
+	defer db.rwlock.Unlock()
450
+
451
+	db.metalock.Lock()
452
+	defer db.metalock.Unlock()
453
+
454
+	db.mmaplock.Lock()
455
+	defer db.mmaplock.Unlock()
456
+
457
+	return db.close()
458
+}
459
+
460
+func (db *DB) close() error {
461
+	if !db.opened {
462
+		return nil
463
+	}
464
+
465
+	db.opened = false
466
+
467
+	db.freelist = nil
468
+
469
+	// Clear ops.
470
+	db.ops.writeAt = nil
471
+
472
+	// Close the mmap.
473
+	if err := db.munmap(); err != nil {
474
+		return err
475
+	}
476
+
477
+	// Close file handles.
478
+	if db.file != nil {
479
+		// No need to unlock read-only file.
480
+		if !db.readOnly {
481
+			// Unlock the file.
482
+			if err := funlock(db); err != nil {
483
+				log.Printf("bolt.Close(): funlock error: %s", err)
484
+			}
485
+		}
486
+
487
+		// Close the file descriptor.
488
+		if err := db.file.Close(); err != nil {
489
+			return fmt.Errorf("db file close: %s", err)
490
+		}
491
+		db.file = nil
492
+	}
493
+
494
+	db.path = ""
495
+	return nil
496
+}
497
+
498
+// Begin starts a new transaction.
499
+// Multiple read-only transactions can be used concurrently but only one
500
+// write transaction can be used at a time. Starting multiple write transactions
501
+// will cause the calls to block and be serialized until the current write
502
+// transaction finishes.
503
+//
504
+// Transactions should not be dependent on one another. Opening a read
505
+// transaction and a write transaction in the same goroutine can cause the
506
+// writer to deadlock because the database periodically needs to re-mmap itself
507
+// as it grows and it cannot do that while a read transaction is open.
508
+//
509
+// If a long running read transaction (for example, a snapshot transaction) is
510
+// needed, you might want to set Options.InitialMmapSize to a large enough value
511
+// to avoid potential blocking of write transaction.
512
+//
513
+// IMPORTANT: You must close read-only transactions after you are finished or
514
+// else the database will not reclaim old pages.
515
+func (db *DB) Begin(writable bool) (*Tx, error) {
516
+	if writable {
517
+		return db.beginRWTx()
518
+	}
519
+	return db.beginTx()
520
+}
521
+
522
+func (db *DB) beginTx() (*Tx, error) {
523
+	// Lock the meta pages while we initialize the transaction. We obtain
524
+	// the meta lock before the mmap lock because that's the order that the
525
+	// write transaction will obtain them.
526
+	db.metalock.Lock()
527
+
528
+	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
529
+	// obtain a write lock so all transactions must finish before it can be
530
+	// remapped.
531
+	db.mmaplock.RLock()
532
+
533
+	// Exit if the database is not open yet.
534
+	if !db.opened {
535
+		db.mmaplock.RUnlock()
536
+		db.metalock.Unlock()
537
+		return nil, ErrDatabaseNotOpen
538
+	}
539
+
540
+	// Create a transaction associated with the database.
541
+	t := &Tx{}
542
+	t.init(db)
543
+
544
+	// Keep track of transaction until it closes.
545
+	db.txs = append(db.txs, t)
546
+	n := len(db.txs)
547
+
548
+	// Unlock the meta pages.
549
+	db.metalock.Unlock()
550
+
551
+	// Update the transaction stats.
552
+	db.statlock.Lock()
553
+	db.stats.TxN++
554
+	db.stats.OpenTxN = n
555
+	db.statlock.Unlock()
556
+
557
+	return t, nil
558
+}
559
+
560
// beginRWTx starts the read-write transaction.
//
// It returns ErrDatabaseReadOnly if db.readOnly is set and
// ErrDatabaseNotOpen if db.opened is false. On success db.rwlock is still
// held when the function returns and the transaction is recorded in db.rwtx.
func (db *DB) beginRWTx() (*Tx, error) {
	// If the database was opened with Options.ReadOnly, return an error.
	if db.readOnly {
		return nil, ErrDatabaseReadOnly
	}

	// Obtain writer lock. This is released by the transaction when it closes.
	// This enforces only one writer transaction at a time.
	db.rwlock.Lock()

	// Once we have the writer lock then we can lock the meta pages so that
	// we can set up the transaction.
	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Exit if the database is not open yet. The writer lock must be given
	// back here because no transaction exists to release it later.
	if !db.opened {
		db.rwlock.Unlock()
		return nil, ErrDatabaseNotOpen
	}

	// Create a transaction associated with the database.
	t := &Tx{writable: true}
	t.init(db)
	db.rwtx = t
	// Release pending pages while the meta lock is still held.
	db.freePages()
	return t, nil
}
588
+
589
// freePages releases any pages associated with closed read-only transactions.
//
// It sorts db.txs in place by transaction id. The only caller visible here,
// beginRWTx, invokes it while holding db.metalock.
func (db *DB) freePages() {
	// Free all pending pages prior to earliest open transaction.
	sort.Sort(txsById(db.txs))
	// With no open read transactions the maximum txid is used as the bound.
	minid := txid(0xFFFFFFFFFFFFFFFF)
	if len(db.txs) > 0 {
		minid = db.txs[0].meta.txid
	}
	if minid > 0 {
		db.freelist.release(minid - 1)
	}
	// Release unused txid extents, i.e. the gaps between the ids of the open
	// read transactions. Any page both allocated and freed in an extent is
	// safe to release.
	for _, t := range db.txs {
		db.freelist.releaseRange(minid, t.meta.txid-1)
		minid = t.meta.txid + 1
	}
	// Final extent: everything after the newest open read transaction.
	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
}
608
+
609
+type txsById []*Tx
610
+
611
+func (t txsById) Len() int           { return len(t) }
612
+func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
613
+func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
614
+
615
// removeTx removes a transaction from the database.
//
// It releases the read lock on db.mmaplock (the lock beginTx acquires),
// drops tx from db.txs, and merges the transaction's stats into db.stats.
func (db *DB) removeTx(tx *Tx) {
	// Release the read lock on the mmap.
	db.mmaplock.RUnlock()

	// Use the meta lock to restrict access to the DB object.
	db.metalock.Lock()

	// Remove the transaction. The last element is moved into the vacated
	// slot, so the order of db.txs is not preserved; the old last slot is
	// set to nil so the backing array does not keep a stale pointer.
	for i, t := range db.txs {
		if t == tx {
			last := len(db.txs) - 1
			db.txs[i] = db.txs[last]
			db.txs[last] = nil
			db.txs = db.txs[:last]
			break
		}
	}
	n := len(db.txs)

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Merge statistics.
	db.statlock.Lock()
	db.stats.OpenTxN = n
	db.stats.TxStats.add(&tx.stats)
	db.statlock.Unlock()
}
644
+
645
+// Update executes a function within the context of a read-write managed transaction.
646
+// If no error is returned from the function then the transaction is committed.
647
+// If an error is returned then the entire transaction is rolled back.
648
+// Any error that is returned from the function or returned from the commit is
649
+// returned from the Update() method.
650
+//
651
+// Attempting to manually commit or rollback within the function will cause a panic.
652
+func (db *DB) Update(fn func(*Tx) error) error {
653
+	t, err := db.Begin(true)
654
+	if err != nil {
655
+		return err
656
+	}
657
+
658
+	// Make sure the transaction rolls back in the event of a panic.
659
+	defer func() {
660
+		if t.db != nil {
661
+			t.rollback()
662
+		}
663
+	}()
664
+
665
+	// Mark as a managed tx so that the inner function cannot manually commit.
666
+	t.managed = true
667
+
668
+	// If an error is returned from the function then rollback and return error.
669
+	err = fn(t)
670
+	t.managed = false
671
+	if err != nil {
672
+		_ = t.Rollback()
673
+		return err
674
+	}
675
+
676
+	return t.Commit()
677
+}
678
+
679
+// View executes a function within the context of a managed read-only transaction.
680
+// Any error that is returned from the function is returned from the View() method.
681
+//
682
+// Attempting to manually rollback within the function will cause a panic.
683
+func (db *DB) View(fn func(*Tx) error) error {
684
+	t, err := db.Begin(false)
685
+	if err != nil {
686
+		return err
687
+	}
688
+
689
+	// Make sure the transaction rolls back in the event of a panic.
690
+	defer func() {
691
+		if t.db != nil {
692
+			t.rollback()
693
+		}
694
+	}()
695
+
696
+	// Mark as a managed tx so that the inner function cannot manually rollback.
697
+	t.managed = true
698
+
699
+	// If an error is returned from the function then pass it through.
700
+	err = fn(t)
701
+	t.managed = false
702
+	if err != nil {
703
+		_ = t.Rollback()
704
+		return err
705
+	}
706
+
707
+	return t.Rollback()
708
+}
709
+
710
+// Batch calls fn as part of a batch. It behaves similar to Update,
711
+// except:
712
+//
713
+// 1. concurrent Batch calls can be combined into a single Bolt
714
+// transaction.
715
+//
716
+// 2. the function passed to Batch may be called multiple times,
717
+// regardless of whether it returns error or not.
718
+//
719
+// This means that Batch function side effects must be idempotent and
720
+// take permanent effect only after a successful return is seen in
721
+// caller.
722
+//
723
+// The maximum batch size and delay can be adjusted with DB.MaxBatchSize
724
+// and DB.MaxBatchDelay, respectively.
725
+//
726
+// Batch is only useful when there are multiple goroutines calling it.
727
+func (db *DB) Batch(fn func(*Tx) error) error {
728
+	errCh := make(chan error, 1)
729
+
730
+	db.batchMu.Lock()
731
+	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
732
+		// There is no existing batch, or the existing batch is full; start a new one.
733
+		db.batch = &batch{
734
+			db: db,
735
+		}
736
+		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
737
+	}
738
+	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
739
+	if len(db.batch.calls) >= db.MaxBatchSize {
740
+		// wake up batch, it's ready to run
741
+		go db.batch.trigger()
742
+	}
743
+	db.batchMu.Unlock()
744
+
745
+	err := <-errCh
746
+	if err == trySolo {
747
+		err = db.Update(fn)
748
+	}
749
+	return err
750
+}
751
+
752
// call is a single function queued by DB.Batch together with the channel on
// which its result is delivered.
type call struct {
	fn  func(*Tx) error // function to execute inside the batch transaction
	err chan<- error    // receives the outcome for this call (or trySolo)
}
756
+
757
// batch collects the calls queued by DB.Batch that are to be executed
// together in one Update transaction.
type batch struct {
	db    *DB         // database the batch runs against
	timer *time.Timer // fires trigger after DB.MaxBatchDelay
	start sync.Once   // guarantees run is executed at most once
	calls []call      // queued calls, appended to by DB.Batch
}
763
+
764
// trigger runs the batch if it hasn't already been run.
//
// It may be invoked both by the batch timer and by DB.Batch once the batch
// is full; the sync.Once in b.start ensures run executes only once.
func (b *batch) trigger() {
	b.start.Do(b.run)
}
768
+
769
// run performs the transactions in the batch and communicates results
// back to DB.Batch.
//
// All queued calls are executed inside one Update transaction. If a call
// returns an error (or panics, see safelyCall), that call is removed from the
// batch, its submitter is told to retry on its own via trySolo, and the
// remaining calls are attempted again in a fresh transaction.
func (b *batch) run() {
	b.db.batchMu.Lock()
	b.timer.Stop()
	// Make sure no new work is added to this batch, but don't break
	// other batches.
	if b.db.batch == b {
		b.db.batch = nil
	}
	b.db.batchMu.Unlock()

retry:
	for len(b.calls) > 0 {
		var failIdx = -1
		err := b.db.Update(func(tx *Tx) error {
			for i, c := range b.calls {
				if err := safelyCall(c.fn, tx); err != nil {
					failIdx = i
					return err
				}
			}
			return nil
		})

		if failIdx >= 0 {
			// take the failing transaction out of the batch. it's
			// safe to shorten b.calls here because db.batch no longer
			// points to us, so DB.Batch can no longer append to this
			// batch. (batchMu is not held here; it was released above.)
			// The order of the remaining calls is not preserved.
			c := b.calls[failIdx]
			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
			// tell the submitter re-run it solo, continue with the rest of the batch
			c.err <- trySolo
			continue retry
		}

		// pass success, or bolt internal errors, to all callers
		for _, c := range b.calls {
			c.err <- err
		}
		break retry
	}
}
812
+
813
// trySolo is a special sentinel error value used for signaling that a
// transaction function should be re-run. It should never be seen by
// callers: batch.run sends it to the submitter of a failing call, and
// DB.Batch intercepts it and retries the function through DB.Update.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
817
+
818
// panicked is an error that wraps the value recovered from a panic.
type panicked struct {
	reason interface{}
}

// Error returns the wrapped error's message when the recovered value is
// itself an error; any other value is formatted as "panic: <value>".
func (p panicked) Error() string {
	switch reason := p.reason.(type) {
	case error:
		return reason.Error()
	default:
		return fmt.Sprintf("panic: %v", reason)
	}
}
828
+
829
// safelyCall invokes fn(tx) and returns its result. If fn panics, the panic
// is recovered and returned as a panicked error instead of propagating.
func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
	defer func() {
		if p := recover(); p != nil {
			// Overwrite the named result so the caller sees the panic as an error.
			err = panicked{p}
		}
	}()
	return fn(tx)
}
837
+
838
+// Sync executes fdatasync() against the database file handle.
839
+//
840
+// This is not necessary under normal operation, however, if you use NoSync
841
+// then it allows you to force the database file to sync against the disk.
842
+func (db *DB) Sync() error { return fdatasync(db) }
843
+
844
+// Stats retrieves ongoing performance stats for the database.
845
+// This is only updated when a transaction closes.
846
+func (db *DB) Stats() Stats {
847
+	db.statlock.RLock()
848
+	defer db.statlock.RUnlock()
849
+	return db.stats
850
+}
851
+
852
// Info is for internal access to the raw data bytes from the C cursor, use
// carefully, or not at all.
//
// The returned Info holds the address of the first byte of db.data and the
// database page size.
func (db *DB) Info() *Info {
	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
}
857
+
858
+// page retrieves a page reference from the mmap based on the current page size.
859
+func (db *DB) page(id pgid) *page {
860
+	pos := id * pgid(db.pageSize)
861
+	return (*page)(unsafe.Pointer(&db.data[pos]))
862
+}
863
+
864
+// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
865
+func (db *DB) pageInBuffer(b []byte, id pgid) *page {
866
+	return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
867
+}
868
+
869
+// meta retrieves the current meta page reference.
870
+func (db *DB) meta() *meta {
871
+	// We have to return the meta with the highest txid which doesn't fail
872
+	// validation. Otherwise, we can cause errors when in fact the database is
873
+	// in a consistent state. metaA is the one with the higher txid.
874
+	metaA := db.meta0
875
+	metaB := db.meta1
876
+	if db.meta1.txid > db.meta0.txid {
877
+		metaA = db.meta1
878
+		metaB = db.meta0
879
+	}
880
+
881
+	// Use higher meta page if valid. Otherwise fallback to previous, if valid.
882
+	if err := metaA.validate(); err == nil {
883
+		return metaA
884
+	} else if err := metaB.validate(); err == nil {
885
+		return metaB
886
+	}
887
+
888
+	// This should never be reached, because both meta1 and meta0 were validated
889
+	// on mmap() and we do fsync() on every write.
890
+	panic("bolt.DB.meta(): invalid meta pages")
891
+}
892
+
893
// allocate returns a contiguous block of memory starting at a given page.
//
// The returned page header lives in a freshly obtained buffer of count pages
// (taken from db.pagePool when count == 1). Its id is either a block reused
// from the freelist or, failing that, the current high water mark of the
// read-write transaction, which is then advanced by count. An error is
// returned only if the mmap has to be enlarged and that fails.
func (db *DB) allocate(txid txid, count int) (*page, error) {
	// Allocate a temporary buffer for the page.
	var buf []byte
	if count == 1 {
		buf = db.pagePool.Get().([]byte)
	} else {
		buf = make([]byte, count*db.pageSize)
	}
	p := (*page)(unsafe.Pointer(&buf[0]))
	p.overflow = uint32(count - 1)

	// Use pages from the freelist if they are available.
	// freelist.allocate returns 0 when no contiguous block was found.
	if p.id = db.freelist.allocate(txid, count); p.id != 0 {
		return p, nil
	}

	// Resize mmap() if we're at the end.
	p.id = db.rwtx.meta.pgid
	var minsz = int((p.id+pgid(count))+1) * db.pageSize
	if minsz >= db.datasz {
		if err := db.mmap(minsz); err != nil {
			return nil, fmt.Errorf("mmap allocate error: %s", err)
		}
	}

	// Move the page id high water mark.
	db.rwtx.meta.pgid += pgid(count)

	return p, nil
}
924
+
925
// grow grows the size of the database to the given sz.
//
// It is a no-op when sz does not exceed db.filesz. Otherwise the target size
// is adjusted (see below), the file is truncated to it and synced unless
// NoGrowSync or read-only mode is in effect, and db.filesz is updated.
func (db *DB) grow(sz int) error {
	// Ignore if the new size is less than available file size.
	if sz <= db.filesz {
		return nil
	}

	// If the data is smaller than the alloc size then only allocate what's needed.
	// Once it goes over the allocation size then allocate in chunks.
	if db.datasz < db.AllocSize {
		sz = db.datasz
	} else {
		sz += db.AllocSize
	}

	// Truncate and fsync to ensure file size metadata is flushed.
	// https://github.com/boltdb/bolt/issues/284
	if !db.NoGrowSync && !db.readOnly {
		// The truncate is skipped on Windows; only the sync is performed there.
		if runtime.GOOS != "windows" {
			if err := db.file.Truncate(int64(sz)); err != nil {
				return fmt.Errorf("file resize error: %s", err)
			}
		}
		if err := db.file.Sync(); err != nil {
			return fmt.Errorf("file sync error: %s", err)
		}
	}

	// Recorded even when the truncate/sync above was skipped.
	db.filesz = sz
	return nil
}
956
+
957
// IsReadOnly reports whether the database is in read-only mode
// (the value of db.readOnly).
func (db *DB) IsReadOnly() bool {
	return db.readOnly
}
960
+
961
+func (db *DB) freepages() []pgid {
962
+	tx, err := db.beginTx()
963
+	defer func() {
964
+		err = tx.Rollback()
965
+		if err != nil {
966
+			panic("freepages: failed to rollback tx")
967
+		}
968
+	}()
969
+	if err != nil {
970
+		panic("freepages: failed to open read only tx")
971
+	}
972
+
973
+	reachable := make(map[pgid]*page)
974
+	nofreed := make(map[pgid]bool)
975
+	ech := make(chan error)
976
+	go func() {
977
+		for e := range ech {
978
+			panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e))
979
+		}
980
+	}()
981
+	tx.checkBucket(&tx.root, reachable, nofreed, ech)
982
+	close(ech)
983
+
984
+	var fids []pgid
985
+	for i := pgid(2); i < db.meta().pgid; i++ {
986
+		if _, ok := reachable[i]; !ok {
987
+			fids = append(fids, i)
988
+		}
989
+	}
990
+	return fids
991
+}
992
+
993
// Options represents the options that can be set when opening a database.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely. This option is only
	// available on Darwin and Linux.
	Timeout time.Duration

	// NoGrowSync sets the DB.NoGrowSync flag before memory mapping the file.
	NoGrowSync bool

	// NoFreelistSync disables syncing the freelist to disk. This improves
	// the database write performance under normal operation, but requires a
	// full database re-sync during recovery.
	NoFreelistSync bool

	// ReadOnly opens the database in read-only mode. Uses
	// flock(..., LOCK_SH |LOCK_NB) to grab a shared lock (UNIX).
	ReadOnly bool

	// MmapFlags sets the DB.MmapFlags flag before memory mapping the file.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database
	// in bytes. Read transactions won't block write transaction
	// if the InitialMmapSize is large enough to hold database mmap
	// size. (See DB.Begin for more information)
	//
	// If <=0, the initial map size is 0.
	// If initialMmapSize is smaller than the previous database size,
	// it takes no effect.
	InitialMmapSize int

	// PageSize overrides the default OS page size.
	PageSize int

	// NoSync sets the initial value of DB.NoSync. Normally this can just be
	// set directly on the DB itself when returned from Open(), but this option
	// is useful in APIs which expose Options but not the underlying DB.
	NoSync bool
}
1032
+
1033
// DefaultOptions represent the options used if nil options are passed into Open().
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
// Every field not listed here is left at its zero value.
var DefaultOptions = &Options{
	Timeout:    0,
	NoGrowSync: false,
}
1039
+
1040
// Stats represents statistics about the database.
type Stats struct {
	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions (incremented by beginTx)
	OpenTxN int // number of currently open read transactions (len(db.txs))

	TxStats TxStats // global, ongoing stats.
}
1054
+
1055
+// Sub calculates and returns the difference between two sets of database stats.
1056
+// This is useful when obtaining stats at two different points and time and
1057
+// you need the performance counters that occurred within that time span.
1058
+func (s *Stats) Sub(other *Stats) Stats {
1059
+	if other == nil {
1060
+		return *s
1061
+	}
1062
+	var diff Stats
1063
+	diff.FreePageN = s.FreePageN
1064
+	diff.PendingPageN = s.PendingPageN
1065
+	diff.FreeAlloc = s.FreeAlloc
1066
+	diff.FreelistInuse = s.FreelistInuse
1067
+	diff.TxN = s.TxN - other.TxN
1068
+	diff.TxStats = s.TxStats.Sub(&other.TxStats)
1069
+	return diff
1070
+}
1071
+
1072
// Info exposes low-level details of the mapped database, as returned by
// DB.Info.
type Info struct {
	Data     uintptr // address of the first byte of the database's data
	PageSize int     // page size of the database
}
1076
+
1077
// meta is the content of a meta page.
type meta struct {
	magic    uint32 // compared against the package's magic constant by validate
	version  uint32 // compared against the package's version constant by validate
	pageSize uint32
	flags    uint32
	root     bucket
	freelist pgid   // page id of the freelist, or pgidNoFreelist
	pgid     pgid   // high water mark: root and freelist must lie below it (see write)
	txid     txid   // transaction id; txid % 2 selects the meta page written
	checksum uint64 // sum64 over all fields preceding checksum
}
1088
+
1089
+// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
1090
+func (m *meta) validate() error {
1091
+	if m.magic != magic {
1092
+		return ErrInvalid
1093
+	} else if m.version != version {
1094
+		return ErrVersionMismatch
1095
+	} else if m.checksum != 0 && m.checksum != m.sum64() {
1096
+		return ErrChecksum
1097
+	}
1098
+	return nil
1099
+}
1100
+
1101
// copy copies one meta object to another: every field of m is assigned
// into *dest.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
1105
+
1106
+// write writes the meta onto a page.
1107
+func (m *meta) write(p *page) {
1108
+	if m.root.root >= m.pgid {
1109
+		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
1110
+	} else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
1111
+		// TODO: reject pgidNoFreeList if !NoFreelistSync
1112
+		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
1113
+	}
1114
+
1115
+	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
1116
+	p.id = pgid(m.txid % 2)
1117
+	p.flags |= metaPageFlag
1118
+
1119
+	// Calculate the checksum.
1120
+	m.checksum = m.sum64()
1121
+
1122
+	m.copy(p.meta())
1123
+}
1124
+
1125
// sum64 generates the checksum for the meta: a 64-bit FNV-1a hash over the
// bytes of the struct that precede the checksum field.
func (m *meta) sum64() uint64 {
	var h = fnv.New64a()
	// Reinterpret the struct as a byte array that stops at the offset of the
	// checksum field, so the checksum itself is excluded from the hash.
	_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
	return h.Sum64()
}
1131
+
1132
// _assert panics with "assertion failed: " followed by the formatted message
// when condition is false; it does nothing when condition is true.
func _assert(condition bool, msg string, v ...interface{}) {
	if condition {
		return
	}
	panic(fmt.Sprintf("assertion failed: "+msg, v...))
}
0 1138
new file mode 100644
... ...
@@ -0,0 +1,44 @@
0
+/*
1
+Package bbolt implements a low-level key/value store in pure Go. It supports
2
+fully serializable transactions, ACID semantics, and lock-free MVCC with
3
+multiple readers and a single writer. Bolt can be used for projects that
4
+want a simple data store without the need to add large dependencies such as
5
+Postgres or MySQL.
6
+
7
+Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is
8
+optimized for fast read access and does not require recovery in the event of a
9
+system crash. Transactions which have not finished committing will simply be
10
+rolled back in the event of a crash.
11
+
12
+The design of Bolt is based on Howard Chu's LMDB database project.
13
+
14
+Bolt currently works on Windows, Mac OS X, and Linux.
15
+
16
+
17
+Basics
18
+
19
+There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is
20
+a collection of buckets and is represented by a single file on disk. A bucket is
21
+a collection of unique keys that are associated with values.
22
+
23
+Transactions provide either read-only or read-write access to the database.
24
+Read-only transactions can retrieve key/value pairs and can use Cursors to
25
+iterate over the dataset sequentially. Read-write transactions can create and
26
+delete buckets and can insert and remove keys. Only one read-write transaction
27
+is allowed at a time.
28
+
29
+
30
+Caveats
31
+
32
+The database uses a read-only, memory-mapped data file to ensure that
33
+applications cannot corrupt the database, however, this means that keys and
34
+values returned from Bolt cannot be changed. Writing to a read-only byte slice
35
+will cause Go to panic.
36
+
37
+Keys and values retrieved from the database are only valid for the life of
38
+the transaction. When used outside the transaction, these byte slices can
39
+point to different data or can point to invalid memory which will cause a panic.
40
+
41
+
42
+*/
43
+package bbolt
0 44
new file mode 100644
... ...
@@ -0,0 +1,71 @@
0
+package bbolt
1
+
2
+import "errors"
3
+
4
// These errors can be returned when opening or calling methods on a DB.
var (
	// ErrDatabaseNotOpen is returned when a DB instance is accessed before it
	// is opened or after it is closed.
	ErrDatabaseNotOpen = errors.New("database not open")

	// ErrDatabaseOpen is returned when opening a database that is
	// already open.
	ErrDatabaseOpen = errors.New("database already open")

	// ErrInvalid is returned when both meta pages on a database are invalid.
	// This typically occurs when a file is not a bolt database.
	// meta.validate reports it when the magic number does not match.
	ErrInvalid = errors.New("invalid database")

	// ErrVersionMismatch is returned when the data file was created with a
	// different version of Bolt.
	ErrVersionMismatch = errors.New("version mismatch")

	// ErrChecksum is returned when either meta page checksum does not match.
	ErrChecksum = errors.New("checksum error")

	// ErrTimeout is returned when a database cannot obtain an exclusive lock
	// on the data file after the timeout passed to Open().
	ErrTimeout = errors.New("timeout")
)
29
+
30
// These errors can occur when beginning or committing a Tx.
var (
	// ErrTxNotWritable is returned when performing a write operation on a
	// read-only transaction.
	ErrTxNotWritable = errors.New("tx not writable")

	// ErrTxClosed is returned when committing or rolling back a transaction
	// that has already been committed or rolled back.
	ErrTxClosed = errors.New("tx closed")

	// ErrDatabaseReadOnly is returned when a mutating transaction is started on a
	// read-only database (see DB.beginRWTx).
	ErrDatabaseReadOnly = errors.New("database is in read-only mode")
)
44
+
45
// These errors can occur when putting or deleting a value or a bucket.
var (
	// ErrBucketNotFound is returned when trying to access a bucket that has
	// not been created yet.
	ErrBucketNotFound = errors.New("bucket not found")

	// ErrBucketExists is returned when creating a bucket that already exists.
	ErrBucketExists = errors.New("bucket already exists")

	// ErrBucketNameRequired is returned when creating a bucket with a blank name.
	ErrBucketNameRequired = errors.New("bucket name required")

	// ErrKeyRequired is returned when inserting a zero-length key.
	ErrKeyRequired = errors.New("key required")

	// ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize.
	ErrKeyTooLarge = errors.New("key too large")

	// ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize.
	ErrValueTooLarge = errors.New("value too large")

	// ErrIncompatibleValue is returned when trying to create or delete a bucket
	// on an existing non-bucket key or when trying to create or delete a
	// non-bucket key on an existing bucket key.
	ErrIncompatibleValue = errors.New("incompatible value")
)
0 71
new file mode 100644
... ...
@@ -0,0 +1,333 @@
0
+package bbolt
1
+
2
+import (
3
+	"fmt"
4
+	"sort"
5
+	"unsafe"
6
+)
7
+
8
// txPending holds a list of pgids and corresponding allocation txns
// that are pending to be freed.
//
// ids and alloctx are parallel slices: alloctx[i] is the id of the
// transaction that allocated ids[i], or 0 when that is not known.
type txPending struct {
	ids              []pgid
	alloctx          []txid // txids allocating the ids
	lastReleaseBegin txid   // beginning txid of last matching releaseRange
}
15
+
16
// freelist represents a list of all pages that are available for allocation.
// It also tracks pages that have been freed but are still in use by open transactions.
type freelist struct {
	ids     []pgid              // all free and available free page ids.
	allocs  map[pgid]txid       // mapping of txid that allocated a pgid.
	pending map[txid]*txPending // mapping of soon-to-be free page ids by tx.
	cache   map[pgid]bool       // fast lookup of all free and pending page ids.
}
24
+
25
+// newFreelist returns an empty, initialized freelist.
26
+func newFreelist() *freelist {
27
+	return &freelist{
28
+		allocs:  make(map[pgid]txid),
29
+		pending: make(map[txid]*txPending),
30
+		cache:   make(map[pgid]bool),
31
+	}
32
+}
33
+
34
+// size returns the size of the page after serialization.
35
+func (f *freelist) size() int {
36
+	n := f.count()
37
+	if n >= 0xFFFF {
38
+		// The first element will be used to store the count. See freelist.write.
39
+		n++
40
+	}
41
+	return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * n)
42
+}
43
+
44
+// count returns count of pages on the freelist
45
+func (f *freelist) count() int {
46
+	return f.free_count() + f.pending_count()
47
+}
48
+
49
// free_count returns count of free pages, i.e. the length of f.ids.
// Pending pages are not included.
func (f *freelist) free_count() int {
	return len(f.ids)
}
53
+
54
+// pending_count returns count of pending pages
55
+func (f *freelist) pending_count() int {
56
+	var count int
57
+	for _, txp := range f.pending {
58
+		count += len(txp.ids)
59
+	}
60
+	return count
61
+}
62
+
63
+// copyall copies into dst a list of all free ids and all pending ids in one sorted list.
64
+// f.count returns the minimum length required for dst.
65
+func (f *freelist) copyall(dst []pgid) {
66
+	m := make(pgids, 0, f.pending_count())
67
+	for _, txp := range f.pending {
68
+		m = append(m, txp.ids...)
69
+	}
70
+	sort.Sort(m)
71
+	mergepgids(dst, f.ids, m)
72
+}
73
+
74
// allocate returns the starting page id of a contiguous list of pages of a given size.
// If a contiguous block cannot be found then 0 is returned.
//
// On success the block is removed from f.ids and from the cache, and txid is
// recorded in f.allocs as the allocator of the block's first page. It panics
// if f.ids contains page 0 or 1.
func (f *freelist) allocate(txid txid, n int) pgid {
	if len(f.ids) == 0 {
		return 0
	}

	// initial is the first id of the contiguous run currently being scanned;
	// previd is the id seen in the previous iteration.
	var initial, previd pgid
	for i, id := range f.ids {
		if id <= 1 {
			panic(fmt.Sprintf("invalid page allocation: %d", id))
		}

		// Reset initial page if this is not contiguous.
		if previd == 0 || id-previd != 1 {
			initial = id
		}

		// If we found a contiguous block then remove it and return it.
		if (id-initial)+1 == pgid(n) {
			// If we're allocating off the beginning then take the fast path
			// and just adjust the existing slice. This will use extra memory
			// temporarily but the append() in free() will realloc the slice
			// as is necessary.
			if (i + 1) == n {
				f.ids = f.ids[i+1:]
			} else {
				// Shift the tail down over the n ids being removed.
				copy(f.ids[i-n+1:], f.ids[i+1:])
				f.ids = f.ids[:len(f.ids)-n]
			}

			// Remove from the free cache.
			for i := pgid(0); i < pgid(n); i++ {
				delete(f.cache, initial+i)
			}
			f.allocs[initial] = txid
			return initial
		}

		previd = id
	}
	return 0
}
117
+
118
// free releases a page and its overflow for a given transaction id.
// If the page is already free then a panic will occur.
//
// The pages are not made available immediately: they are added to the
// pending list of txid, together with the id of the transaction that
// allocated them, and marked in the cache. It also panics for page 0 or 1.
func (f *freelist) free(txid txid, p *page) {
	if p.id <= 1 {
		panic(fmt.Sprintf("cannot free page 0 or 1: %d", p.id))
	}

	// Free page and all its overflow pages.
	txp := f.pending[txid]
	if txp == nil {
		txp = &txPending{}
		f.pending[txid] = txp
	}
	// Determine which transaction allocated the page. When there is no
	// record and the page is not a freelist page, allocTxid stays 0.
	allocTxid, ok := f.allocs[p.id]
	if ok {
		delete(f.allocs, p.id)
	} else if (p.flags & freelistPageFlag) != 0 {
		// Freelist is always allocated by prior tx.
		allocTxid = txid - 1
	}

	for id := p.id; id <= p.id+pgid(p.overflow); id++ {
		// Verify that page is not already free.
		if f.cache[id] {
			panic(fmt.Sprintf("page %d already freed", id))
		}
		// Add to the freelist and cache.
		txp.ids = append(txp.ids, id)
		txp.alloctx = append(txp.alloctx, allocTxid)
		f.cache[id] = true
	}
}
150
+
151
// release moves all page ids for a transaction id (or older) to the freelist.
//
// The pending entries of those transactions are deleted and their page ids
// are sorted and merged into f.ids.
func (f *freelist) release(txid txid) {
	m := make(pgids, 0)
	for tid, txp := range f.pending {
		if tid <= txid {
			// Move transaction's pending pages to the available freelist.
			// Don't remove from the cache since the page is still free.
			m = append(m, txp.ids...)
			delete(f.pending, tid)
		}
	}
	sort.Sort(m)
	f.ids = pgids(f.ids).merge(m)
}
165
+
166
// releaseRange moves pending pages allocated within an extent [begin,end] to the free list.
//
// Only pending entries whose freeing transaction id lies in [begin,end] are
// considered, and within those only pages whose allocating transaction id
// also lies in [begin,end] are released. It is a no-op when begin > end.
func (f *freelist) releaseRange(begin, end txid) {
	if begin > end {
		return
	}
	var m pgids
	for tid, txp := range f.pending {
		if tid < begin || tid > end {
			continue
		}
		// Don't recompute freed pages if ranges haven't updated.
		if txp.lastReleaseBegin == begin {
			continue
		}
		for i := 0; i < len(txp.ids); i++ {
			if atx := txp.alloctx[i]; atx < begin || atx > end {
				continue
			}
			m = append(m, txp.ids[i])
			// Remove entry i from both parallel slices by moving the last
			// entry into its place, then revisit index i.
			txp.ids[i] = txp.ids[len(txp.ids)-1]
			txp.ids = txp.ids[:len(txp.ids)-1]
			txp.alloctx[i] = txp.alloctx[len(txp.alloctx)-1]
			txp.alloctx = txp.alloctx[:len(txp.alloctx)-1]
			i--
		}
		txp.lastReleaseBegin = begin
		if len(txp.ids) == 0 {
			delete(f.pending, tid)
		}
	}
	sort.Sort(m)
	f.ids = pgids(f.ids).merge(m)
}
199
+
200
// rollback removes the pages from a given pending tx.
//
// Every pending page of txid is removed from the cache. Pages allocated by
// a different, known transaction are restored to f.allocs; pages allocated
// by txid itself are merged into the free list; pages whose allocating
// transaction is unknown (0) are only dropped from the cache.
func (f *freelist) rollback(txid txid) {
	// Remove page ids from cache.
	txp := f.pending[txid]
	if txp == nil {
		return
	}
	var m pgids
	for i, pgid := range txp.ids {
		delete(f.cache, pgid)
		tx := txp.alloctx[i]
		if tx == 0 {
			continue
		}
		if tx != txid {
			// Pending free aborted; restore page back to alloc list.
			f.allocs[pgid] = tx
		} else {
			// Freed page was allocated by this txn; OK to throw away.
			m = append(m, pgid)
		}
	}
	// Remove pages from pending list and mark as free if allocated by txid.
	delete(f.pending, txid)
	sort.Sort(m)
	f.ids = pgids(f.ids).merge(m)
}
227
+
228
// freed returns whether a given page is in the free list. The lookup uses
// f.cache, which covers both free and pending page ids.
func (f *freelist) freed(pgid pgid) bool {
	return f.cache[pgid]
}
232
+
233
// read initializes the freelist from a freelist page.
//
// It panics if p is not flagged as a freelist page. The ids are copied out
// of the page, sorted, and the cache is rebuilt.
func (f *freelist) read(p *page) {
	if (p.flags & freelistPageFlag) == 0 {
		panic(fmt.Sprintf("invalid freelist page: %d, page type is %s", p.id, p.typ()))
	}
	// If the page.count is at the max uint16 value (64k) then it's considered
	// an overflow and the size of the freelist is stored as the first element.
	idx, count := 0, int(p.count)
	if count == 0xFFFF {
		idx = 1
		count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
	}

	// Copy the list of page ids from the freelist.
	if count == 0 {
		f.ids = nil
	} else {
		ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx : idx+count]
		// Copy so that f.ids does not alias the page's memory.
		f.ids = make([]pgid, len(ids))
		copy(f.ids, ids)

		// Make sure they're sorted.
		sort.Sort(pgids(f.ids))
	}

	// Rebuild the page cache.
	f.reindex()
}
261
+
262
// readIDs initializes the freelist from a given list of ids.
// The slice is stored as is (not copied or sorted) and the cache is rebuilt.
func (f *freelist) readIDs(ids []pgid) {
	f.ids = ids
	f.reindex()
}
267
+
268
// write writes the page ids onto a freelist page. All free and pending ids are
// saved to disk since in the event of a program crash, all pending ids will
// become free.
//
// The returned error is always nil in this implementation.
func (f *freelist) write(p *page) error {
	// The old free pgids and the pgids waiting on an open transaction are
	// combined by copyall below.

	// Update the header flag.
	p.flags |= freelistPageFlag

	// The page.count can only hold up to 64k elements so if we overflow that
	// number then we handle it by putting the size in the first element.
	lenids := f.count()
	if lenids == 0 {
		p.count = uint16(lenids)
	} else if lenids < 0xFFFF {
		p.count = uint16(lenids)
		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:])
	} else {
		// 0xFFFF marks the overflow case; the real count goes in element 0
		// and the ids follow from element 1 (see freelist.read).
		p.count = 0xFFFF
		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(lenids)
		f.copyall(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:])
	}

	return nil
}
293
+
294
// reload reads the freelist from a page and filters out pending items.
//
// After reading, any id that is also present in one of the in-memory pending
// lists is removed from f.ids, and the cache is rebuilt.
func (f *freelist) reload(p *page) {
	f.read(p)

	// Build a cache of only pending pages.
	pcache := make(map[pgid]bool)
	for _, txp := range f.pending {
		for _, pendingID := range txp.ids {
			pcache[pendingID] = true
		}
	}

	// Check each page in the freelist and build a new available freelist
	// with any pages not in the pending lists.
	var a []pgid
	for _, id := range f.ids {
		if !pcache[id] {
			a = append(a, id)
		}
	}
	f.ids = a

	// Once the available list is rebuilt then rebuild the free cache so that
	// it includes the available and pending free pages.
	f.reindex()
}
320
+
321
+// reindex rebuilds the free cache based on available and pending free lists.
322
+func (f *freelist) reindex() {
323
+	f.cache = make(map[pgid]bool, len(f.ids))
324
+	for _, id := range f.ids {
325
+		f.cache[id] = true
326
+	}
327
+	for _, txp := range f.pending {
328
+		for _, pendingID := range txp.ids {
329
+			f.cache[pendingID] = true
330
+		}
331
+	}
332
+}
0 333
new file mode 100644
... ...
@@ -0,0 +1,604 @@
0
+package bbolt
1
+
2
+import (
3
+	"bytes"
4
+	"fmt"
5
+	"sort"
6
+	"unsafe"
7
+)
8
+
9
+// node represents an in-memory, deserialized page.
10
+type node struct {
11
+	bucket     *Bucket
12
+	isLeaf     bool
13
+	unbalanced bool
14
+	spilled    bool
15
+	key        []byte
16
+	pgid       pgid
17
+	parent     *node
18
+	children   nodes
19
+	inodes     inodes
20
+}
21
+
22
+// root returns the top-level node this node is attached to.
23
+func (n *node) root() *node {
24
+	if n.parent == nil {
25
+		return n
26
+	}
27
+	return n.parent.root()
28
+}
29
+
30
+// minKeys returns the minimum number of inodes this node should have.
31
+func (n *node) minKeys() int {
32
+	if n.isLeaf {
33
+		return 1
34
+	}
35
+	return 2
36
+}
37
+
38
+// size returns the size of the node after serialization.
39
+func (n *node) size() int {
40
+	sz, elsz := pageHeaderSize, n.pageElementSize()
41
+	for i := 0; i < len(n.inodes); i++ {
42
+		item := &n.inodes[i]
43
+		sz += elsz + len(item.key) + len(item.value)
44
+	}
45
+	return sz
46
+}
47
+
48
+// sizeLessThan returns true if the node is less than a given size.
49
+// This is an optimization to avoid calculating a large node when we only need
50
+// to know if it fits inside a certain page size.
51
+func (n *node) sizeLessThan(v int) bool {
52
+	sz, elsz := pageHeaderSize, n.pageElementSize()
53
+	for i := 0; i < len(n.inodes); i++ {
54
+		item := &n.inodes[i]
55
+		sz += elsz + len(item.key) + len(item.value)
56
+		if sz >= v {
57
+			return false
58
+		}
59
+	}
60
+	return true
61
+}
62
+
63
+// pageElementSize returns the size of each page element based on the type of node.
64
+func (n *node) pageElementSize() int {
65
+	if n.isLeaf {
66
+		return leafPageElementSize
67
+	}
68
+	return branchPageElementSize
69
+}
70
+
71
+// childAt returns the child node at a given index.
72
+func (n *node) childAt(index int) *node {
73
+	if n.isLeaf {
74
+		panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
75
+	}
76
+	return n.bucket.node(n.inodes[index].pgid, n)
77
+}
78
+
79
+// childIndex returns the index of a given child node.
80
+func (n *node) childIndex(child *node) int {
81
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
82
+	return index
83
+}
84
+
85
+// numChildren returns the number of children.
86
+func (n *node) numChildren() int {
87
+	return len(n.inodes)
88
+}
89
+
90
+// nextSibling returns the next node with the same parent.
91
+func (n *node) nextSibling() *node {
92
+	if n.parent == nil {
93
+		return nil
94
+	}
95
+	index := n.parent.childIndex(n)
96
+	if index >= n.parent.numChildren()-1 {
97
+		return nil
98
+	}
99
+	return n.parent.childAt(index + 1)
100
+}
101
+
102
+// prevSibling returns the previous node with the same parent.
103
+func (n *node) prevSibling() *node {
104
+	if n.parent == nil {
105
+		return nil
106
+	}
107
+	index := n.parent.childIndex(n)
108
+	if index == 0 {
109
+		return nil
110
+	}
111
+	return n.parent.childAt(index - 1)
112
+}
113
+
114
+// put inserts a key/value.
115
+func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
116
+	if pgid >= n.bucket.tx.meta.pgid {
117
+		panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
118
+	} else if len(oldKey) <= 0 {
119
+		panic("put: zero-length old key")
120
+	} else if len(newKey) <= 0 {
121
+		panic("put: zero-length new key")
122
+	}
123
+
124
+	// Find insertion index.
125
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
126
+
127
+	// Add capacity and shift nodes if we don't have an exact match and need to insert.
128
+	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
129
+	if !exact {
130
+		n.inodes = append(n.inodes, inode{})
131
+		copy(n.inodes[index+1:], n.inodes[index:])
132
+	}
133
+
134
+	inode := &n.inodes[index]
135
+	inode.flags = flags
136
+	inode.key = newKey
137
+	inode.value = value
138
+	inode.pgid = pgid
139
+	_assert(len(inode.key) > 0, "put: zero-length inode key")
140
+}
141
+
142
+// del removes a key from the node.
143
+func (n *node) del(key []byte) {
144
+	// Find index of key.
145
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
146
+
147
+	// Exit if the key isn't found.
148
+	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
149
+		return
150
+	}
151
+
152
+	// Delete inode from the node.
153
+	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
154
+
155
+	// Mark the node as needing rebalancing.
156
+	n.unbalanced = true
157
+}
158
+
159
+// read initializes the node from a page.
160
+func (n *node) read(p *page) {
161
+	n.pgid = p.id
162
+	n.isLeaf = ((p.flags & leafPageFlag) != 0)
163
+	n.inodes = make(inodes, int(p.count))
164
+
165
+	for i := 0; i < int(p.count); i++ {
166
+		inode := &n.inodes[i]
167
+		if n.isLeaf {
168
+			elem := p.leafPageElement(uint16(i))
169
+			inode.flags = elem.flags
170
+			inode.key = elem.key()
171
+			inode.value = elem.value()
172
+		} else {
173
+			elem := p.branchPageElement(uint16(i))
174
+			inode.pgid = elem.pgid
175
+			inode.key = elem.key()
176
+		}
177
+		_assert(len(inode.key) > 0, "read: zero-length inode key")
178
+	}
179
+
180
+	// Save first key so we can find the node in the parent when we spill.
181
+	if len(n.inodes) > 0 {
182
+		n.key = n.inodes[0].key
183
+		_assert(len(n.key) > 0, "read: zero-length node key")
184
+	} else {
185
+		n.key = nil
186
+	}
187
+}
188
+
189
+// write writes the items onto one or more pages.
190
+func (n *node) write(p *page) {
191
+	// Initialize page.
192
+	if n.isLeaf {
193
+		p.flags |= leafPageFlag
194
+	} else {
195
+		p.flags |= branchPageFlag
196
+	}
197
+
198
+	if len(n.inodes) >= 0xFFFF {
199
+		panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
200
+	}
201
+	p.count = uint16(len(n.inodes))
202
+
203
+	// Stop here if there are no items to write.
204
+	if p.count == 0 {
205
+		return
206
+	}
207
+
208
+	// Loop over each item and write it to the page.
209
+	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
210
+	for i, item := range n.inodes {
211
+		_assert(len(item.key) > 0, "write: zero-length inode key")
212
+
213
+		// Write the page element.
214
+		if n.isLeaf {
215
+			elem := p.leafPageElement(uint16(i))
216
+			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
217
+			elem.flags = item.flags
218
+			elem.ksize = uint32(len(item.key))
219
+			elem.vsize = uint32(len(item.value))
220
+		} else {
221
+			elem := p.branchPageElement(uint16(i))
222
+			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
223
+			elem.ksize = uint32(len(item.key))
224
+			elem.pgid = item.pgid
225
+			_assert(elem.pgid != p.id, "write: circular dependency occurred")
226
+		}
227
+
228
+		// If the length of key+value is larger than the max allocation size
229
+		// then we need to reallocate the byte array pointer.
230
+		//
231
+		// See: https://github.com/boltdb/bolt/pull/335
232
+		klen, vlen := len(item.key), len(item.value)
233
+		if len(b) < klen+vlen {
234
+			b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
235
+		}
236
+
237
+		// Write data for the element to the end of the page.
238
+		copy(b[0:], item.key)
239
+		b = b[klen:]
240
+		copy(b[0:], item.value)
241
+		b = b[vlen:]
242
+	}
243
+
244
+	// DEBUG ONLY: n.dump()
245
+}
246
+
247
+// split breaks up a node into multiple smaller nodes, if appropriate.
248
+// This should only be called from the spill() function.
249
+func (n *node) split(pageSize int) []*node {
250
+	var nodes []*node
251
+
252
+	node := n
253
+	for {
254
+		// Split node into two.
255
+		a, b := node.splitTwo(pageSize)
256
+		nodes = append(nodes, a)
257
+
258
+		// If we can't split then exit the loop.
259
+		if b == nil {
260
+			break
261
+		}
262
+
263
+		// Set node to b so it gets split on the next iteration.
264
+		node = b
265
+	}
266
+
267
+	return nodes
268
+}
269
+
270
+// splitTwo breaks up a node into two smaller nodes, if appropriate.
271
+// This should only be called from the split() function.
272
+func (n *node) splitTwo(pageSize int) (*node, *node) {
273
+	// Ignore the split if the page doesn't have at least enough nodes for
274
+	// two pages or if the nodes can fit in a single page.
275
+	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
276
+		return n, nil
277
+	}
278
+
279
+	// Determine the threshold before starting a new node.
280
+	var fillPercent = n.bucket.FillPercent
281
+	if fillPercent < minFillPercent {
282
+		fillPercent = minFillPercent
283
+	} else if fillPercent > maxFillPercent {
284
+		fillPercent = maxFillPercent
285
+	}
286
+	threshold := int(float64(pageSize) * fillPercent)
287
+
288
+	// Determine split position and sizes of the two pages.
289
+	splitIndex, _ := n.splitIndex(threshold)
290
+
291
+	// Split node into two separate nodes.
292
+	// If there's no parent then we'll need to create one.
293
+	if n.parent == nil {
294
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
295
+	}
296
+
297
+	// Create a new node and add it to the parent.
298
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
299
+	n.parent.children = append(n.parent.children, next)
300
+
301
+	// Split inodes across two nodes.
302
+	next.inodes = n.inodes[splitIndex:]
303
+	n.inodes = n.inodes[:splitIndex]
304
+
305
+	// Update the statistics.
306
+	n.bucket.tx.stats.Split++
307
+
308
+	return n, next
309
+}
310
+
311
+// splitIndex finds the position where a page will fill a given threshold.
312
+// It returns the index as well as the size of the first page.
313
+// This is only be called from split().
314
+func (n *node) splitIndex(threshold int) (index, sz int) {
315
+	sz = pageHeaderSize
316
+
317
+	// Loop until we only have the minimum number of keys required for the second page.
318
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
319
+		index = i
320
+		inode := n.inodes[i]
321
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
322
+
323
+		// If we have at least the minimum number of keys and adding another
324
+		// node would put us over the threshold then exit and return.
325
+		if i >= minKeysPerPage && sz+elsize > threshold {
326
+			break
327
+		}
328
+
329
+		// Add the element size to the total size.
330
+		sz += elsize
331
+	}
332
+
333
+	return
334
+}
335
+
336
+// spill writes the nodes to dirty pages and splits nodes as it goes.
337
+// Returns an error if dirty pages cannot be allocated.
338
+func (n *node) spill() error {
339
+	var tx = n.bucket.tx
340
+	if n.spilled {
341
+		return nil
342
+	}
343
+
344
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
345
+	// the case of split-merge so we cannot use a range loop. We have to check
346
+	// the children size on every loop iteration.
347
+	sort.Sort(n.children)
348
+	for i := 0; i < len(n.children); i++ {
349
+		if err := n.children[i].spill(); err != nil {
350
+			return err
351
+		}
352
+	}
353
+
354
+	// We no longer need the child list because it's only used for spill tracking.
355
+	n.children = nil
356
+
357
+	// Split nodes into appropriate sizes. The first node will always be n.
358
+	var nodes = n.split(tx.db.pageSize)
359
+	for _, node := range nodes {
360
+		// Add node's page to the freelist if it's not new.
361
+		if node.pgid > 0 {
362
+			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
363
+			node.pgid = 0
364
+		}
365
+
366
+		// Allocate contiguous space for the node.
367
+		p, err := tx.allocate((node.size() + tx.db.pageSize - 1) / tx.db.pageSize)
368
+		if err != nil {
369
+			return err
370
+		}
371
+
372
+		// Write the node.
373
+		if p.id >= tx.meta.pgid {
374
+			panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
375
+		}
376
+		node.pgid = p.id
377
+		node.write(p)
378
+		node.spilled = true
379
+
380
+		// Insert into parent inodes.
381
+		if node.parent != nil {
382
+			var key = node.key
383
+			if key == nil {
384
+				key = node.inodes[0].key
385
+			}
386
+
387
+			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
388
+			node.key = node.inodes[0].key
389
+			_assert(len(node.key) > 0, "spill: zero-length node key")
390
+		}
391
+
392
+		// Update the statistics.
393
+		tx.stats.Spill++
394
+	}
395
+
396
+	// If the root node split and created a new root then we need to spill that
397
+	// as well. We'll clear out the children to make sure it doesn't try to respill.
398
+	if n.parent != nil && n.parent.pgid == 0 {
399
+		n.children = nil
400
+		return n.parent.spill()
401
+	}
402
+
403
+	return nil
404
+}
405
+
406
+// rebalance attempts to combine the node with sibling nodes if the node fill
407
+// size is below a threshold or if there are not enough keys.
408
+func (n *node) rebalance() {
409
+	if !n.unbalanced {
410
+		return
411
+	}
412
+	n.unbalanced = false
413
+
414
+	// Update statistics.
415
+	n.bucket.tx.stats.Rebalance++
416
+
417
+	// Ignore if node is above threshold (25%) and has enough keys.
418
+	var threshold = n.bucket.tx.db.pageSize / 4
419
+	if n.size() > threshold && len(n.inodes) > n.minKeys() {
420
+		return
421
+	}
422
+
423
+	// Root node has special handling.
424
+	if n.parent == nil {
425
+		// If root node is a branch and only has one node then collapse it.
426
+		if !n.isLeaf && len(n.inodes) == 1 {
427
+			// Move root's child up.
428
+			child := n.bucket.node(n.inodes[0].pgid, n)
429
+			n.isLeaf = child.isLeaf
430
+			n.inodes = child.inodes[:]
431
+			n.children = child.children
432
+
433
+			// Reparent all child nodes being moved.
434
+			for _, inode := range n.inodes {
435
+				if child, ok := n.bucket.nodes[inode.pgid]; ok {
436
+					child.parent = n
437
+				}
438
+			}
439
+
440
+			// Remove old child.
441
+			child.parent = nil
442
+			delete(n.bucket.nodes, child.pgid)
443
+			child.free()
444
+		}
445
+
446
+		return
447
+	}
448
+
449
+	// If node has no keys then just remove it.
450
+	if n.numChildren() == 0 {
451
+		n.parent.del(n.key)
452
+		n.parent.removeChild(n)
453
+		delete(n.bucket.nodes, n.pgid)
454
+		n.free()
455
+		n.parent.rebalance()
456
+		return
457
+	}
458
+
459
+	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
460
+
461
+	// Destination node is right sibling if idx == 0, otherwise left sibling.
462
+	var target *node
463
+	var useNextSibling = (n.parent.childIndex(n) == 0)
464
+	if useNextSibling {
465
+		target = n.nextSibling()
466
+	} else {
467
+		target = n.prevSibling()
468
+	}
469
+
470
+	// If both this node and the target node are too small then merge them.
471
+	if useNextSibling {
472
+		// Reparent all child nodes being moved.
473
+		for _, inode := range target.inodes {
474
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
475
+				child.parent.removeChild(child)
476
+				child.parent = n
477
+				child.parent.children = append(child.parent.children, child)
478
+			}
479
+		}
480
+
481
+		// Copy over inodes from target and remove target.
482
+		n.inodes = append(n.inodes, target.inodes...)
483
+		n.parent.del(target.key)
484
+		n.parent.removeChild(target)
485
+		delete(n.bucket.nodes, target.pgid)
486
+		target.free()
487
+	} else {
488
+		// Reparent all child nodes being moved.
489
+		for _, inode := range n.inodes {
490
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
491
+				child.parent.removeChild(child)
492
+				child.parent = target
493
+				child.parent.children = append(child.parent.children, child)
494
+			}
495
+		}
496
+
497
+		// Copy over inodes to target and remove node.
498
+		target.inodes = append(target.inodes, n.inodes...)
499
+		n.parent.del(n.key)
500
+		n.parent.removeChild(n)
501
+		delete(n.bucket.nodes, n.pgid)
502
+		n.free()
503
+	}
504
+
505
+	// Either this node or the target node was deleted from the parent so rebalance it.
506
+	n.parent.rebalance()
507
+}
508
+
509
+// removes a node from the list of in-memory children.
510
+// This does not affect the inodes.
511
+func (n *node) removeChild(target *node) {
512
+	for i, child := range n.children {
513
+		if child == target {
514
+			n.children = append(n.children[:i], n.children[i+1:]...)
515
+			return
516
+		}
517
+	}
518
+}
519
+
520
+// dereference causes the node to copy all its inode key/value references to heap memory.
521
+// This is required when the mmap is reallocated so inodes are not pointing to stale data.
522
+func (n *node) dereference() {
523
+	if n.key != nil {
524
+		key := make([]byte, len(n.key))
525
+		copy(key, n.key)
526
+		n.key = key
527
+		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
528
+	}
529
+
530
+	for i := range n.inodes {
531
+		inode := &n.inodes[i]
532
+
533
+		key := make([]byte, len(inode.key))
534
+		copy(key, inode.key)
535
+		inode.key = key
536
+		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
537
+
538
+		value := make([]byte, len(inode.value))
539
+		copy(value, inode.value)
540
+		inode.value = value
541
+	}
542
+
543
+	// Recursively dereference children.
544
+	for _, child := range n.children {
545
+		child.dereference()
546
+	}
547
+
548
+	// Update statistics.
549
+	n.bucket.tx.stats.NodeDeref++
550
+}
551
+
552
+// free adds the node's underlying page to the freelist.
553
+func (n *node) free() {
554
+	if n.pgid != 0 {
555
+		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
556
+		n.pgid = 0
557
+	}
558
+}
559
+
560
+// dump writes the contents of the node to STDERR for debugging purposes.
561
+/*
562
+func (n *node) dump() {
563
+	// Write node header.
564
+	var typ = "branch"
565
+	if n.isLeaf {
566
+		typ = "leaf"
567
+	}
568
+	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
569
+
570
+	// Write out abbreviated version of each item.
571
+	for _, item := range n.inodes {
572
+		if n.isLeaf {
573
+			if item.flags&bucketLeafFlag != 0 {
574
+				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
575
+				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
576
+			} else {
577
+				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
578
+			}
579
+		} else {
580
+			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
581
+		}
582
+	}
583
+	warn("")
584
+}
585
+*/
586
+
587
+type nodes []*node
588
+
589
+func (s nodes) Len() int           { return len(s) }
590
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
591
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
592
+
593
+// inode represents an internal node inside of a node.
594
+// It can be used to point to elements in a page or point
595
+// to an element which hasn't been added to a page yet.
596
+type inode struct {
597
+	flags uint32
598
+	pgid  pgid
599
+	key   []byte
600
+	value []byte
601
+}
602
+
603
+type inodes []inode
0 604
new file mode 100644
... ...
@@ -0,0 +1,197 @@
0
+package bbolt
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"sort"
6
+	"unsafe"
7
+)
8
+
9
+const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
10
+
11
+const minKeysPerPage = 2
12
+
13
+const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
14
+const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))
15
+
16
+const (
17
+	branchPageFlag   = 0x01
18
+	leafPageFlag     = 0x02
19
+	metaPageFlag     = 0x04
20
+	freelistPageFlag = 0x10
21
+)
22
+
23
+const (
24
+	bucketLeafFlag = 0x01
25
+)
26
+
27
+type pgid uint64
28
+
29
+type page struct {
30
+	id       pgid
31
+	flags    uint16
32
+	count    uint16
33
+	overflow uint32
34
+	ptr      uintptr
35
+}
36
+
37
+// typ returns a human readable page type string used for debugging.
38
+func (p *page) typ() string {
39
+	if (p.flags & branchPageFlag) != 0 {
40
+		return "branch"
41
+	} else if (p.flags & leafPageFlag) != 0 {
42
+		return "leaf"
43
+	} else if (p.flags & metaPageFlag) != 0 {
44
+		return "meta"
45
+	} else if (p.flags & freelistPageFlag) != 0 {
46
+		return "freelist"
47
+	}
48
+	return fmt.Sprintf("unknown<%02x>", p.flags)
49
+}
50
+
51
+// meta returns a pointer to the metadata section of the page.
52
+func (p *page) meta() *meta {
53
+	return (*meta)(unsafe.Pointer(&p.ptr))
54
+}
55
+
56
+// leafPageElement retrieves the leaf node by index
57
+func (p *page) leafPageElement(index uint16) *leafPageElement {
58
+	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
59
+	return n
60
+}
61
+
62
+// leafPageElements retrieves a list of leaf nodes.
63
+func (p *page) leafPageElements() []leafPageElement {
64
+	if p.count == 0 {
65
+		return nil
66
+	}
67
+	return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:]
68
+}
69
+
70
+// branchPageElement retrieves the branch node by index
71
+func (p *page) branchPageElement(index uint16) *branchPageElement {
72
+	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
73
+}
74
+
75
+// branchPageElements retrieves a list of branch nodes.
76
+func (p *page) branchPageElements() []branchPageElement {
77
+	if p.count == 0 {
78
+		return nil
79
+	}
80
+	return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:]
81
+}
82
+
83
+// dump writes n bytes of the page to STDERR as hex output.
84
+func (p *page) hexdump(n int) {
85
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
86
+	fmt.Fprintf(os.Stderr, "%x\n", buf)
87
+}
88
+
89
+type pages []*page
90
+
91
+func (s pages) Len() int           { return len(s) }
92
+func (s pages) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
93
+func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
94
+
95
+// branchPageElement represents a node on a branch page.
96
+type branchPageElement struct {
97
+	pos   uint32
98
+	ksize uint32
99
+	pgid  pgid
100
+}
101
+
102
+// key returns a byte slice of the node key.
103
+func (n *branchPageElement) key() []byte {
104
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
105
+	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize]
106
+}
107
+
108
// leafPageElement is the header of one entry on a leaf page.
type leafPageElement struct {
	flags uint32 // entry flags
	pos   uint32 // byte offset from this header to the entry's key
	ksize uint32 // key length in bytes
	vsize uint32 // value length in bytes; the value directly follows the key
}
115
+
116
+// key returns a byte slice of the node key.
117
+func (n *leafPageElement) key() []byte {
118
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
119
+	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos]))[:n.ksize:n.ksize]
120
+}
121
+
122
+// value returns a byte slice of the node value.
123
+func (n *leafPageElement) value() []byte {
124
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
125
+	return (*[maxAllocSize]byte)(unsafe.Pointer(&buf[n.pos+n.ksize]))[:n.vsize:n.vsize]
126
+}
127
+
128
// PageInfo holds human readable information about a page.
type PageInfo struct {
	ID            int    // page id
	Type          string // page type name
	Count         int    // element count
	OverflowCount int    // overflow count; NOTE(review): presumably mirrors page.overflow — confirm
}
135
+
136
+type pgids []pgid
137
+
138
+func (s pgids) Len() int           { return len(s) }
139
+func (s pgids) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
140
+func (s pgids) Less(i, j int) bool { return s[i] < s[j] }
141
+
142
+// merge returns the sorted union of a and b.
143
+func (a pgids) merge(b pgids) pgids {
144
+	// Return the opposite slice if one is nil.
145
+	if len(a) == 0 {
146
+		return b
147
+	}
148
+	if len(b) == 0 {
149
+		return a
150
+	}
151
+	merged := make(pgids, len(a)+len(b))
152
+	mergepgids(merged, a, b)
153
+	return merged
154
+}
155
+
156
+// mergepgids copies the sorted union of a and b into dst.
157
+// If dst is too small, it panics.
158
+func mergepgids(dst, a, b pgids) {
159
+	if len(dst) < len(a)+len(b) {
160
+		panic(fmt.Errorf("mergepgids bad len %d < %d + %d", len(dst), len(a), len(b)))
161
+	}
162
+	// Copy in the opposite slice if one is nil.
163
+	if len(a) == 0 {
164
+		copy(dst, b)
165
+		return
166
+	}
167
+	if len(b) == 0 {
168
+		copy(dst, a)
169
+		return
170
+	}
171
+
172
+	// Merged will hold all elements from both lists.
173
+	merged := dst[:0]
174
+
175
+	// Assign lead to the slice with a lower starting value, follow to the higher value.
176
+	lead, follow := a, b
177
+	if b[0] < a[0] {
178
+		lead, follow = b, a
179
+	}
180
+
181
+	// Continue while there are elements in the lead.
182
+	for len(lead) > 0 {
183
+		// Merge largest prefix of lead that is ahead of follow[0].
184
+		n := sort.Search(len(lead), func(i int) bool { return lead[i] > follow[0] })
185
+		merged = append(merged, lead[:n]...)
186
+		if n >= len(lead) {
187
+			break
188
+		}
189
+
190
+		// Swap lead and follow.
191
+		lead, follow = follow, lead[n:]
192
+	}
193
+
194
+	// Append what's left in follow.
195
+	_ = append(merged, follow...)
196
+}
0 197
new file mode 100644
... ...
@@ -0,0 +1,707 @@
0
+package bbolt
1
+
2
+import (
3
+	"fmt"
4
+	"io"
5
+	"os"
6
+	"sort"
7
+	"strings"
8
+	"time"
9
+	"unsafe"
10
+)
11
+
12
+// txid represents the internal transaction identifier.
13
+type txid uint64
14
+
15
+// Tx represents a read-only or read/write transaction on the database.
16
+// Read-only transactions can be used for retrieving values for keys and creating cursors.
17
+// Read/write transactions can create and remove buckets and create and remove keys.
18
+//
19
+// IMPORTANT: You must commit or rollback transactions when you are done with
20
+// them. Pages can not be reclaimed by the writer until no more transactions
21
+// are using them. A long running read transaction can cause the database to
22
+// quickly grow.
23
+type Tx struct {
24
+	writable       bool
25
+	managed        bool
26
+	db             *DB
27
+	meta           *meta
28
+	root           Bucket
29
+	pages          map[pgid]*page
30
+	stats          TxStats
31
+	commitHandlers []func()
32
+
33
+	// WriteFlag specifies the flag for write-related methods like WriteTo().
34
+	// Tx opens the database file with the specified flag to copy the data.
35
+	//
36
+	// By default, the flag is unset, which works well for mostly in-memory
37
+	// workloads. For databases that are much larger than available RAM,
38
+	// set the flag to syscall.O_DIRECT to avoid trashing the page cache.
39
+	WriteFlag int
40
+}
41
+
42
+// init initializes the transaction.
43
+func (tx *Tx) init(db *DB) {
44
+	tx.db = db
45
+	tx.pages = nil
46
+
47
+	// Copy the meta page since it can be changed by the writer.
48
+	tx.meta = &meta{}
49
+	db.meta().copy(tx.meta)
50
+
51
+	// Copy over the root bucket.
52
+	tx.root = newBucket(tx)
53
+	tx.root.bucket = &bucket{}
54
+	*tx.root.bucket = tx.meta.root
55
+
56
+	// Increment the transaction id and add a page cache for writable transactions.
57
+	if tx.writable {
58
+		tx.pages = make(map[pgid]*page)
59
+		tx.meta.txid += txid(1)
60
+	}
61
+}
62
+
63
+// ID returns the transaction id.
64
+func (tx *Tx) ID() int {
65
+	return int(tx.meta.txid)
66
+}
67
+
68
+// DB returns a reference to the database that created the transaction.
69
+func (tx *Tx) DB() *DB {
70
+	return tx.db
71
+}
72
+
73
+// Size returns current database size in bytes as seen by this transaction.
74
+func (tx *Tx) Size() int64 {
75
+	return int64(tx.meta.pgid) * int64(tx.db.pageSize)
76
+}
77
+
78
+// Writable returns whether the transaction can perform write operations.
79
+func (tx *Tx) Writable() bool {
80
+	return tx.writable
81
+}
82
+
83
+// Cursor creates a cursor associated with the root bucket.
84
+// All items in the cursor will return a nil value because all root bucket keys point to buckets.
85
+// The cursor is only valid as long as the transaction is open.
86
+// Do not use a cursor after the transaction is closed.
87
+func (tx *Tx) Cursor() *Cursor {
88
+	return tx.root.Cursor()
89
+}
90
+
91
+// Stats retrieves a copy of the current transaction statistics.
92
+func (tx *Tx) Stats() TxStats {
93
+	return tx.stats
94
+}
95
+
96
+// Bucket retrieves a bucket by name.
97
+// Returns nil if the bucket does not exist.
98
+// The bucket instance is only valid for the lifetime of the transaction.
99
+func (tx *Tx) Bucket(name []byte) *Bucket {
100
+	return tx.root.Bucket(name)
101
+}
102
+
103
+// CreateBucket creates a new bucket.
104
+// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long.
105
+// The bucket instance is only valid for the lifetime of the transaction.
106
+func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) {
107
+	return tx.root.CreateBucket(name)
108
+}
109
+
110
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist.
111
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
112
+// The bucket instance is only valid for the lifetime of the transaction.
113
+func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) {
114
+	return tx.root.CreateBucketIfNotExists(name)
115
+}
116
+
117
+// DeleteBucket deletes a bucket.
118
+// Returns an error if the bucket cannot be found or if the key represents a non-bucket value.
119
+func (tx *Tx) DeleteBucket(name []byte) error {
120
+	return tx.root.DeleteBucket(name)
121
+}
122
+
123
+// ForEach executes a function for each bucket in the root.
124
+// If the provided function returns an error then the iteration is stopped and
125
+// the error is returned to the caller.
126
+func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error {
127
+	return tx.root.ForEach(func(k, v []byte) error {
128
+		return fn(k, tx.root.Bucket(k))
129
+	})
130
+}
131
+
132
+// OnCommit adds a handler function to be executed after the transaction successfully commits.
133
+func (tx *Tx) OnCommit(fn func()) {
134
+	tx.commitHandlers = append(tx.commitHandlers, fn)
135
+}
136
+
137
+// Commit writes all changes to disk and updates the meta page.
138
+// Returns an error if a disk write error occurs, or if Commit is
139
+// called on a read-only transaction.
140
+func (tx *Tx) Commit() error {
141
+	_assert(!tx.managed, "managed tx commit not allowed")
142
+	if tx.db == nil {
143
+		return ErrTxClosed
144
+	} else if !tx.writable {
145
+		return ErrTxNotWritable
146
+	}
147
+
148
+	// TODO(benbjohnson): Use vectorized I/O to write out dirty pages.
149
+
150
+	// Rebalance nodes which have had deletions.
151
+	var startTime = time.Now()
152
+	tx.root.rebalance()
153
+	if tx.stats.Rebalance > 0 {
154
+		tx.stats.RebalanceTime += time.Since(startTime)
155
+	}
156
+
157
+	// spill data onto dirty pages.
158
+	startTime = time.Now()
159
+	if err := tx.root.spill(); err != nil {
160
+		tx.rollback()
161
+		return err
162
+	}
163
+	tx.stats.SpillTime += time.Since(startTime)
164
+
165
+	// Free the old root bucket.
166
+	tx.meta.root.root = tx.root.root
167
+
168
+	// Free the old freelist because commit writes out a fresh freelist.
169
+	if tx.meta.freelist != pgidNoFreelist {
170
+		tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
171
+	}
172
+
173
+	if !tx.db.NoFreelistSync {
174
+		err := tx.commitFreelist()
175
+		if err != nil {
176
+			return err
177
+		}
178
+	} else {
179
+		tx.meta.freelist = pgidNoFreelist
180
+	}
181
+
182
+	// Write dirty pages to disk.
183
+	startTime = time.Now()
184
+	if err := tx.write(); err != nil {
185
+		tx.rollback()
186
+		return err
187
+	}
188
+
189
+	// If strict mode is enabled then perform a consistency check.
190
+	// Only the first consistency error is reported in the panic.
191
+	if tx.db.StrictMode {
192
+		ch := tx.Check()
193
+		var errs []string
194
+		for {
195
+			err, ok := <-ch
196
+			if !ok {
197
+				break
198
+			}
199
+			errs = append(errs, err.Error())
200
+		}
201
+		if len(errs) > 0 {
202
+			panic("check fail: " + strings.Join(errs, "\n"))
203
+		}
204
+	}
205
+
206
+	// Write meta to disk.
207
+	if err := tx.writeMeta(); err != nil {
208
+		tx.rollback()
209
+		return err
210
+	}
211
+	tx.stats.WriteTime += time.Since(startTime)
212
+
213
+	// Finalize the transaction.
214
+	tx.close()
215
+
216
+	// Execute commit handlers now that the locks have been removed.
217
+	for _, fn := range tx.commitHandlers {
218
+		fn()
219
+	}
220
+
221
+	return nil
222
+}
223
+
224
+func (tx *Tx) commitFreelist() error {
225
+	// Allocate new pages for the new free list. This will overestimate
226
+	// the size of the freelist but not underestimate the size (which would be bad).
227
+	opgid := tx.meta.pgid
228
+	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
229
+	if err != nil {
230
+		tx.rollback()
231
+		return err
232
+	}
233
+	if err := tx.db.freelist.write(p); err != nil {
234
+		tx.rollback()
235
+		return err
236
+	}
237
+	tx.meta.freelist = p.id
238
+	// If the high water mark has moved up then attempt to grow the database.
239
+	if tx.meta.pgid > opgid {
240
+		if err := tx.db.grow(int(tx.meta.pgid+1) * tx.db.pageSize); err != nil {
241
+			tx.rollback()
242
+			return err
243
+		}
244
+	}
245
+
246
+	return nil
247
+}
248
+
249
+// Rollback closes the transaction and ignores all previous updates. Read-only
250
+// transactions must be rolled back and not committed.
251
+func (tx *Tx) Rollback() error {
252
+	_assert(!tx.managed, "managed tx rollback not allowed")
253
+	if tx.db == nil {
254
+		return ErrTxClosed
255
+	}
256
+	tx.rollback()
257
+	return nil
258
+}
259
+
260
+func (tx *Tx) rollback() {
261
+	if tx.db == nil {
262
+		return
263
+	}
264
+	if tx.writable {
265
+		tx.db.freelist.rollback(tx.meta.txid)
266
+		tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist))
267
+	}
268
+	tx.close()
269
+}
270
+
271
+func (tx *Tx) close() {
272
+	if tx.db == nil {
273
+		return
274
+	}
275
+	if tx.writable {
276
+		// Grab freelist stats.
277
+		var freelistFreeN = tx.db.freelist.free_count()
278
+		var freelistPendingN = tx.db.freelist.pending_count()
279
+		var freelistAlloc = tx.db.freelist.size()
280
+
281
+		// Remove transaction ref & writer lock.
282
+		tx.db.rwtx = nil
283
+		tx.db.rwlock.Unlock()
284
+
285
+		// Merge statistics.
286
+		tx.db.statlock.Lock()
287
+		tx.db.stats.FreePageN = freelistFreeN
288
+		tx.db.stats.PendingPageN = freelistPendingN
289
+		tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize
290
+		tx.db.stats.FreelistInuse = freelistAlloc
291
+		tx.db.stats.TxStats.add(&tx.stats)
292
+		tx.db.statlock.Unlock()
293
+	} else {
294
+		tx.db.removeTx(tx)
295
+	}
296
+
297
+	// Clear all references.
298
+	tx.db = nil
299
+	tx.meta = nil
300
+	tx.root = Bucket{tx: tx}
301
+	tx.pages = nil
302
+}
303
+
304
+// Copy writes the entire database to a writer.
305
+// This function exists for backwards compatibility.
306
+//
307
+// Deprecated; Use WriteTo() instead.
308
+func (tx *Tx) Copy(w io.Writer) error {
309
+	_, err := tx.WriteTo(w)
310
+	return err
311
+}
312
+
313
+// WriteTo writes the entire database to a writer.
314
+// If err == nil then exactly tx.Size() bytes will be written into the writer.
315
+func (tx *Tx) WriteTo(w io.Writer) (n int64, err error) {
316
+	// Attempt to open reader with WriteFlag
317
+	f, err := os.OpenFile(tx.db.path, os.O_RDONLY|tx.WriteFlag, 0)
318
+	if err != nil {
319
+		return 0, err
320
+	}
321
+	defer func() {
322
+		if cerr := f.Close(); err == nil {
323
+			err = cerr
324
+		}
325
+	}()
326
+
327
+	// Generate a meta page. We use the same page data for both meta pages.
328
+	buf := make([]byte, tx.db.pageSize)
329
+	page := (*page)(unsafe.Pointer(&buf[0]))
330
+	page.flags = metaPageFlag
331
+	*page.meta() = *tx.meta
332
+
333
+	// Write meta 0.
334
+	page.id = 0
335
+	page.meta().checksum = page.meta().sum64()
336
+	nn, err := w.Write(buf)
337
+	n += int64(nn)
338
+	if err != nil {
339
+		return n, fmt.Errorf("meta 0 copy: %s", err)
340
+	}
341
+
342
+	// Write meta 1 with a lower transaction id.
343
+	page.id = 1
344
+	page.meta().txid -= 1
345
+	page.meta().checksum = page.meta().sum64()
346
+	nn, err = w.Write(buf)
347
+	n += int64(nn)
348
+	if err != nil {
349
+		return n, fmt.Errorf("meta 1 copy: %s", err)
350
+	}
351
+
352
+	// Move past the meta pages in the file.
353
+	if _, err := f.Seek(int64(tx.db.pageSize*2), io.SeekStart); err != nil {
354
+		return n, fmt.Errorf("seek: %s", err)
355
+	}
356
+
357
+	// Copy data pages.
358
+	wn, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2))
359
+	n += wn
360
+	if err != nil {
361
+		return n, err
362
+	}
363
+
364
+	return n, nil
365
+}
366
+
367
+// CopyFile copies the entire database to file at the given path.
368
+// A reader transaction is maintained during the copy so it is safe to continue
369
+// using the database while a copy is in progress.
370
+func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
371
+	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
372
+	if err != nil {
373
+		return err
374
+	}
375
+
376
+	err = tx.Copy(f)
377
+	if err != nil {
378
+		_ = f.Close()
379
+		return err
380
+	}
381
+	return f.Close()
382
+}
383
+
384
+// Check performs several consistency checks on the database for this transaction.
385
+// An error is returned if any inconsistency is found.
386
+//
387
+// It can be safely run concurrently on a writable transaction. However, this
388
+// incurs a high cost for large databases and databases with a lot of subbuckets
389
+// because of caching. This overhead can be removed if running on a read-only
390
+// transaction, however, it is not safe to execute other writer transactions at
391
+// the same time.
392
+func (tx *Tx) Check() <-chan error {
393
+	ch := make(chan error)
394
+	go tx.check(ch)
395
+	return ch
396
+}
397
+
398
+func (tx *Tx) check(ch chan error) {
399
+	// Force loading free list if opened in ReadOnly mode.
400
+	tx.db.loadFreelist()
401
+
402
+	// Check if any pages are double freed.
403
+	freed := make(map[pgid]bool)
404
+	all := make([]pgid, tx.db.freelist.count())
405
+	tx.db.freelist.copyall(all)
406
+	for _, id := range all {
407
+		if freed[id] {
408
+			ch <- fmt.Errorf("page %d: already freed", id)
409
+		}
410
+		freed[id] = true
411
+	}
412
+
413
+	// Track every reachable page.
414
+	reachable := make(map[pgid]*page)
415
+	reachable[0] = tx.page(0) // meta0
416
+	reachable[1] = tx.page(1) // meta1
417
+	if tx.meta.freelist != pgidNoFreelist {
418
+		for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
419
+			reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
420
+		}
421
+	}
422
+
423
+	// Recursively check buckets.
424
+	tx.checkBucket(&tx.root, reachable, freed, ch)
425
+
426
+	// Ensure all pages below high water mark are either reachable or freed.
427
+	for i := pgid(0); i < tx.meta.pgid; i++ {
428
+		_, isReachable := reachable[i]
429
+		if !isReachable && !freed[i] {
430
+			ch <- fmt.Errorf("page %d: unreachable unfreed", int(i))
431
+		}
432
+	}
433
+
434
+	// Close the channel to signal completion.
435
+	close(ch)
436
+}
437
+
438
+func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) {
439
+	// Ignore inline buckets.
440
+	if b.root == 0 {
441
+		return
442
+	}
443
+
444
+	// Check every page used by this bucket.
445
+	b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
446
+		if p.id > tx.meta.pgid {
447
+			ch <- fmt.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid))
448
+		}
449
+
450
+		// Ensure each page is only referenced once.
451
+		for i := pgid(0); i <= pgid(p.overflow); i++ {
452
+			var id = p.id + i
453
+			if _, ok := reachable[id]; ok {
454
+				ch <- fmt.Errorf("page %d: multiple references", int(id))
455
+			}
456
+			reachable[id] = p
457
+		}
458
+
459
+		// We should only encounter un-freed leaf and branch pages.
460
+		if freed[p.id] {
461
+			ch <- fmt.Errorf("page %d: reachable freed", int(p.id))
462
+		} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 {
463
+			ch <- fmt.Errorf("page %d: invalid type: %s", int(p.id), p.typ())
464
+		}
465
+	})
466
+
467
+	// Check each bucket within this bucket.
468
+	_ = b.ForEach(func(k, v []byte) error {
469
+		if child := b.Bucket(k); child != nil {
470
+			tx.checkBucket(child, reachable, freed, ch)
471
+		}
472
+		return nil
473
+	})
474
+}
475
+
476
+// allocate returns a contiguous block of memory starting at a given page.
477
+func (tx *Tx) allocate(count int) (*page, error) {
478
+	p, err := tx.db.allocate(tx.meta.txid, count)
479
+	if err != nil {
480
+		return nil, err
481
+	}
482
+
483
+	// Save to our page cache.
484
+	tx.pages[p.id] = p
485
+
486
+	// Update statistics.
487
+	tx.stats.PageCount += count
488
+	tx.stats.PageAlloc += count * tx.db.pageSize
489
+
490
+	return p, nil
491
+}
492
+
493
+// write writes any dirty pages to disk.
494
+func (tx *Tx) write() error {
495
+	// Sort pages by id.
496
+	pages := make(pages, 0, len(tx.pages))
497
+	for _, p := range tx.pages {
498
+		pages = append(pages, p)
499
+	}
500
+	// Clear out page cache early.
501
+	tx.pages = make(map[pgid]*page)
502
+	sort.Sort(pages)
503
+
504
+	// Write pages to disk in order.
505
+	for _, p := range pages {
506
+		size := (int(p.overflow) + 1) * tx.db.pageSize
507
+		offset := int64(p.id) * int64(tx.db.pageSize)
508
+
509
+		// Write out page in "max allocation" sized chunks.
510
+		ptr := (*[maxAllocSize]byte)(unsafe.Pointer(p))
511
+		for {
512
+			// Limit our write to our max allocation size.
513
+			sz := size
514
+			if sz > maxAllocSize-1 {
515
+				sz = maxAllocSize - 1
516
+			}
517
+
518
+			// Write chunk to disk.
519
+			buf := ptr[:sz]
520
+			if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
521
+				return err
522
+			}
523
+
524
+			// Update statistics.
525
+			tx.stats.Write++
526
+
527
+			// Exit inner for loop if we've written all the chunks.
528
+			size -= sz
529
+			if size == 0 {
530
+				break
531
+			}
532
+
533
+			// Otherwise move offset forward and move pointer to next chunk.
534
+			offset += int64(sz)
535
+			ptr = (*[maxAllocSize]byte)(unsafe.Pointer(&ptr[sz]))
536
+		}
537
+	}
538
+
539
+	// Ignore file sync if flag is set on DB.
540
+	if !tx.db.NoSync || IgnoreNoSync {
541
+		if err := fdatasync(tx.db); err != nil {
542
+			return err
543
+		}
544
+	}
545
+
546
+	// Put small pages back to page pool.
547
+	for _, p := range pages {
548
+		// Ignore page sizes over 1 page.
549
+		// These are allocated using make() instead of the page pool.
550
+		if int(p.overflow) != 0 {
551
+			continue
552
+		}
553
+
554
+		buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:tx.db.pageSize]
555
+
556
+		// See https://go.googlesource.com/go/+/f03c9202c43e0abb130669852082117ca50aa9b1
557
+		for i := range buf {
558
+			buf[i] = 0
559
+		}
560
+		tx.db.pagePool.Put(buf)
561
+	}
562
+
563
+	return nil
564
+}
565
+
566
+// writeMeta writes the meta to the disk.
567
+func (tx *Tx) writeMeta() error {
568
+	// Create a temporary buffer for the meta page.
569
+	buf := make([]byte, tx.db.pageSize)
570
+	p := tx.db.pageInBuffer(buf, 0)
571
+	tx.meta.write(p)
572
+
573
+	// Write the meta page to file.
574
+	if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
575
+		return err
576
+	}
577
+	if !tx.db.NoSync || IgnoreNoSync {
578
+		if err := fdatasync(tx.db); err != nil {
579
+			return err
580
+		}
581
+	}
582
+
583
+	// Update statistics.
584
+	tx.stats.Write++
585
+
586
+	return nil
587
+}
588
+
589
+// page returns a reference to the page with a given id.
590
+// If page has been written to then a temporary buffered page is returned.
591
+func (tx *Tx) page(id pgid) *page {
592
+	// Check the dirty pages first.
593
+	if tx.pages != nil {
594
+		if p, ok := tx.pages[id]; ok {
595
+			return p
596
+		}
597
+	}
598
+
599
+	// Otherwise return directly from the mmap.
600
+	return tx.db.page(id)
601
+}
602
+
603
+// forEachPage iterates over every page within a given page and executes a function.
604
+func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) {
605
+	p := tx.page(pgid)
606
+
607
+	// Execute function.
608
+	fn(p, depth)
609
+
610
+	// Recursively loop over children.
611
+	if (p.flags & branchPageFlag) != 0 {
612
+		for i := 0; i < int(p.count); i++ {
613
+			elem := p.branchPageElement(uint16(i))
614
+			tx.forEachPage(elem.pgid, depth+1, fn)
615
+		}
616
+	}
617
+}
618
+
619
+// Page returns page information for a given page number.
620
+// This is only safe for concurrent use when used by a writable transaction.
621
+func (tx *Tx) Page(id int) (*PageInfo, error) {
622
+	if tx.db == nil {
623
+		return nil, ErrTxClosed
624
+	} else if pgid(id) >= tx.meta.pgid {
625
+		return nil, nil
626
+	}
627
+
628
+	// Build the page info.
629
+	p := tx.db.page(pgid(id))
630
+	info := &PageInfo{
631
+		ID:            id,
632
+		Count:         int(p.count),
633
+		OverflowCount: int(p.overflow),
634
+	}
635
+
636
+	// Determine the type (or if it's free).
637
+	if tx.db.freelist.freed(pgid(id)) {
638
+		info.Type = "free"
639
+	} else {
640
+		info.Type = p.typ()
641
+	}
642
+
643
+	return info, nil
644
+}
645
+
646
// TxStats represents statistics about the actions performed by the transaction.
type TxStats struct {
	// Pages.
	PageCount int // number of page allocations
	PageAlloc int // total bytes allocated

	// Cursors.
	CursorCount int // number of cursors created

	// Nodes.
	NodeCount int // number of node allocations
	NodeDeref int // number of node dereferences

	// Rebalancing.
	Rebalance     int           // number of node rebalances
	RebalanceTime time.Duration // total time spent rebalancing

	// Splitting and spilling.
	Split     int           // number of nodes split
	Spill     int           // number of nodes spilled
	SpillTime time.Duration // total time spent spilling

	// Writing.
	Write     int           // number of writes performed
	WriteTime time.Duration // total time spent writing to disk
}

// add accumulates every counter and duration of other into s.
func (s *TxStats) add(other *TxStats) {
	*s = TxStats{
		PageCount:     s.PageCount + other.PageCount,
		PageAlloc:     s.PageAlloc + other.PageAlloc,
		CursorCount:   s.CursorCount + other.CursorCount,
		NodeCount:     s.NodeCount + other.NodeCount,
		NodeDeref:     s.NodeDeref + other.NodeDeref,
		Rebalance:     s.Rebalance + other.Rebalance,
		RebalanceTime: s.RebalanceTime + other.RebalanceTime,
		Split:         s.Split + other.Split,
		Spill:         s.Spill + other.Spill,
		SpillTime:     s.SpillTime + other.SpillTime,
		Write:         s.Write + other.Write,
		WriteTime:     s.WriteTime + other.WriteTime,
	}
}

// Sub calculates and returns the difference between two sets of transaction stats.
// This is useful when obtaining stats at two different points in time and
// you need the performance counters that occurred within that time span.
// Neither s nor other is modified.
func (s *TxStats) Sub(other *TxStats) TxStats {
	return TxStats{
		PageCount:     s.PageCount - other.PageCount,
		PageAlloc:     s.PageAlloc - other.PageAlloc,
		CursorCount:   s.CursorCount - other.CursorCount,
		NodeCount:     s.NodeCount - other.NodeCount,
		NodeDeref:     s.NodeDeref - other.NodeDeref,
		Rebalance:     s.Rebalance - other.Rebalance,
		RebalanceTime: s.RebalanceTime - other.RebalanceTime,
		Split:         s.Split - other.Split,
		Spill:         s.Spill - other.Spill,
		SpillTime:     s.SpillTime - other.SpillTime,
		Write:         s.Write - other.Write,
		WriteTime:     s.WriteTime - other.WriteTime,
	}
}