Browse code

Vendoring libnetwork and libkv

- Libnetwork brings in :
* Default Gateway as a service for network drivers
* Persistence for local scoped networks using libkv
* BATS based Multi-host Integration-test infra and end-to-end tests
* libnetwork fixes for zookeeper and etcd backend
- Libkv upgrade brings in :
* boltdb support for local kv persistence
* other general bug fixes

Signed-off-by: Madhu Venugopal <madhu@docker.com>

Madhu Venugopal authored on 2015/09/26 02:02:31
Showing 61 changed files
... ...
@@ -20,18 +20,19 @@ clone git github.com/tchap/go-patricia v2.1.0
20 20
 clone git golang.org/x/net 3cffabab72adf04f8e3b01c5baf775361837b5fe https://github.com/golang/net.git
21 21
 
22 22
 #get libnetwork packages
23
-clone git github.com/docker/libnetwork e5fea92a6c8a5968bdb8005bf959c6e23113b689
23
+clone git github.com/docker/libnetwork f5423a097e5da89f9ea206ddf8b93b5ac1f51ee7
24 24
 clone git github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
25 25
 clone git github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b
26 26
 clone git github.com/hashicorp/memberlist 9a1e242e454d2443df330bdd51a436d5a9058fc4
27 27
 clone git github.com/hashicorp/serf 7151adcef72687bf95f451a2e0ba15cb19412bf2
28
-clone git github.com/docker/libkv a0a57ed3755665e9a402a3df315402134eb6625f
28
+clone git github.com/docker/libkv ea7ff6ae76485ab93ac36799d3e13b1905787ffe
29 29
 clone git github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25
30 30
 clone git github.com/vishvananda/netlink 4b5dce31de6d42af5bb9811c6d265472199e0fec
31 31
 clone git github.com/BurntSushi/toml f706d00e3de6abe700c994cdd545a1a4915af060
32 32
 clone git github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374
33 33
 clone git github.com/coreos/go-etcd v2.0.0
34 34
 clone git github.com/hashicorp/consul v0.5.2
35
+clone git github.com/boltdb/bolt v1.0
35 36
 
36 37
 # get graph and distribution packages
37 38
 clone git github.com/docker/distribution ec87e9b6971d831f0eff752ddb54fb64693e51cd # docker/1.8 branch
38 39
new file mode 100644
... ...
@@ -0,0 +1,3 @@
0
+*.prof
1
+*.test
2
+/bin/
0 3
new file mode 100644
... ...
@@ -0,0 +1,20 @@
0
+The MIT License (MIT)
1
+
2
+Copyright (c) 2013 Ben Johnson
3
+
4
+Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+this software and associated documentation files (the "Software"), to deal in
6
+the Software without restriction, including without limitation the rights to
7
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+the Software, and to permit persons to whom the Software is furnished to do so,
9
+subject to the following conditions:
10
+
11
+The above copyright notice and this permission notice shall be included in all
12
+copies or substantial portions of the Software.
13
+
14
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
0 20
new file mode 100644
... ...
@@ -0,0 +1,54 @@
0
+TEST=.
1
+BENCH=.
2
+COVERPROFILE=/tmp/c.out
3
+BRANCH=`git rev-parse --abbrev-ref HEAD`
4
+COMMIT=`git rev-parse --short HEAD`
5
+GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)"
6
+
7
+default: build
8
+
9
+bench:
10
+	go test -v -test.run=NOTHINCONTAINSTHIS -test.bench=$(BENCH)
11
+
12
+# http://cloc.sourceforge.net/
13
+cloc:
14
+	@cloc --not-match-f='Makefile|_test.go' .
15
+
16
+cover: fmt
17
+	go test -coverprofile=$(COVERPROFILE) -test.run=$(TEST) $(COVERFLAG) .
18
+	go tool cover -html=$(COVERPROFILE)
19
+	rm $(COVERPROFILE)
20
+
21
+cpuprofile: fmt
22
+	@go test -c
23
+	@./bolt.test -test.v -test.run=$(TEST) -test.cpuprofile cpu.prof
24
+
25
+# go get github.com/kisielk/errcheck
26
+errcheck:
27
+	@echo "=== errcheck ==="
28
+	@errcheck github.com/boltdb/bolt
29
+
30
+fmt:
31
+	@go fmt ./...
32
+
33
+get:
34
+	@go get -d ./...
35
+
36
+build: get
37
+	@mkdir -p bin
38
+	@go build -ldflags=$(GOLDFLAGS) -a -o bin/bolt ./cmd/bolt
39
+
40
+test: fmt
41
+	@go get github.com/stretchr/testify/assert
42
+	@echo "=== TESTS ==="
43
+	@go test -v -cover -test.run=$(TEST)
44
+	@echo ""
45
+	@echo ""
46
+	@echo "=== CLI ==="
47
+	@go test -v -test.run=$(TEST) ./cmd/bolt
48
+	@echo ""
49
+	@echo ""
50
+	@echo "=== RACE DETECTOR ==="
51
+	@go test -v -race -test.run="TestSimulate_(100op|1000op)"
52
+
53
+.PHONY: bench cloc cover cpuprofile fmt memprofile test
0 54
new file mode 100644
... ...
@@ -0,0 +1,455 @@
0
+Bolt [![Build Status](https://drone.io/github.com/boltdb/bolt/status.png)](https://drone.io/github.com/boltdb/bolt/latest) [![Coverage Status](https://coveralls.io/repos/boltdb/bolt/badge.png?branch=master)](https://coveralls.io/r/boltdb/bolt?branch=master) [![GoDoc](https://godoc.org/github.com/boltdb/bolt?status.png)](https://godoc.org/github.com/boltdb/bolt) ![Version](http://img.shields.io/badge/version-1.0-green.png)
1
+====
2
+
3
+Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas]
4
+[LMDB project][lmdb]. The goal of the project is to provide a simple,
5
+fast, and reliable database for projects that don't require a full database
6
+server such as Postgres or MySQL.
7
+
8
+Since Bolt is meant to be used as such a low-level piece of functionality,
9
+simplicity is key. The API will be small and only focus on getting values
10
+and setting values. That's it.
11
+
12
+[hyc_symas]: https://twitter.com/hyc_symas
13
+[lmdb]: http://symas.com/mdb/
14
+
15
+
16
+## Project Status
17
+
18
+Bolt is stable and the API is fixed. Full unit test coverage and randomized 
19
+black box testing are used to ensure database consistency and thread safety.
20
+Bolt is currently in high-load production environments serving databases as
21
+large as 1TB. Many companies such as Shopify and Heroku use Bolt-backed
22
+services every day.
23
+
24
+
25
+## Getting Started
26
+
27
+### Installing
28
+
29
+To start using Bolt, install Go and run `go get`:
30
+
31
+```sh
32
+$ go get github.com/boltdb/bolt/...
33
+```
34
+
35
+This will retrieve the library and install the `bolt` command line utility into
36
+your `$GOBIN` path.
37
+
38
+
39
+### Opening a database
40
+
41
+The top-level object in Bolt is a `DB`. It is represented as a single file on
42
+your disk and represents a consistent snapshot of your data.
43
+
44
+To open your database, simply use the `bolt.Open()` function:
45
+
46
+```go
47
+package main
48
+
49
+import (
50
+	"log"
51
+
52
+	"github.com/boltdb/bolt"
53
+)
54
+
55
+func main() {
56
+	// Open the my.db data file in your current directory.
57
+	// It will be created if it doesn't exist.
58
+	db, err := bolt.Open("my.db", 0600, nil)
59
+	if err != nil {
60
+		log.Fatal(err)
61
+	}
62
+	defer db.Close()
63
+
64
+	...
65
+}
66
+```
67
+
68
+Please note that Bolt obtains a file lock on the data file so multiple processes
69
+cannot open the same database at the same time. Opening an already open Bolt
70
+database will cause it to hang until the other process closes it. To prevent
71
+an indefinite wait you can pass a timeout option to the `Open()` function:
72
+
73
+```go
74
+db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: 1 * time.Second})
75
+```
76
+
77
+
78
+### Transactions
79
+
80
+Bolt allows only one read-write transaction at a time but allows as many
81
+read-only transactions as you want at a time. Each transaction has a consistent
82
+view of the data as it existed when the transaction started.
83
+
84
+Individual transactions and all objects created from them (e.g. buckets, keys)
85
+are not thread safe. To work with data in multiple goroutines you must start
86
+a transaction for each one or use locking to ensure only one goroutine accesses
87
+a transaction at a time. Creating a transaction from the `DB` is thread safe.
88
+
89
+
90
+#### Read-write transactions
91
+
92
+To start a read-write transaction, you can use the `DB.Update()` function:
93
+
94
+```go
95
+err := db.Update(func(tx *bolt.Tx) error {
96
+	...
97
+	return nil
98
+})
99
+```
100
+
101
+Inside the closure, you have a consistent view of the database. You commit the
102
+transaction by returning `nil` at the end. You can also rollback the transaction
103
+at any point by returning an error. All database operations are allowed inside
104
+a read-write transaction.
105
+
106
+Always check the return error as it will report any disk failures that can cause
107
+your transaction to not complete. If you return an error within your closure
108
+it will be passed through.
109
+
110
+
111
+#### Read-only transactions
112
+
113
+To start a read-only transaction, you can use the `DB.View()` function:
114
+
115
+```go
116
+err := db.View(func(tx *bolt.Tx) error {
117
+	...
118
+	return nil
119
+})
120
+```
121
+
122
+You also get a consistent view of the database within this closure, however, 
123
+no mutating operations are allowed within a read-only transaction. You can only
124
+retrieve buckets, retrieve values, and copy the database within a read-only
125
+transaction.
126
+
127
+
128
+### Using buckets
129
+
130
+Buckets are collections of key/value pairs within the database. All keys in a
131
+bucket must be unique. You can create a bucket using the `Tx.CreateBucket()`
132
+function:
133
+
134
+```go
135
+db.Update(func(tx *bolt.Tx) error {
136
+	b, err := tx.CreateBucket([]byte("MyBucket"))
137
+	if err != nil {
138
+		return fmt.Errorf("create bucket: %s", err)
139
+	}
140
+	return nil
141
+})
142
+```
143
+
144
+You can also create a bucket only if it doesn't exist by using the
145
+`Tx.CreateBucketIfNotExists()` function. It's a common pattern to call this
146
+function for all your top-level buckets after you open your database so you can
147
+guarantee that they exist for future transactions.
148
+
149
+To delete a bucket, simply call the `Tx.DeleteBucket()` function.
150
+
151
+
152
+### Using key/value pairs
153
+
154
+To save a key/value pair to a bucket, use the `Bucket.Put()` function:
155
+
156
+```go
157
+db.Update(func(tx *bolt.Tx) error {
158
+	b := tx.Bucket([]byte("MyBucket"))
159
+	err := b.Put([]byte("answer"), []byte("42"))
160
+	return err
161
+})
162
+```
163
+
164
+This will set the value of the `"answer"` key to `"42"` in the `MyBucket`
165
+bucket. To retrieve this value, we can use the `Bucket.Get()` function:
166
+
167
+```go
168
+db.View(func(tx *bolt.Tx) error {
169
+	b := tx.Bucket([]byte("MyBucket"))
170
+	v := b.Get([]byte("answer"))
171
+	fmt.Printf("The answer is: %s\n", v)
172
+	return nil
173
+})
174
+```
175
+
176
+The `Get()` function does not return an error because its operation is
177
+guaranteed to work (unless there is some kind of system failure). If the key
178
+exists then it will return its byte slice value. If it doesn't exist then it
179
+will return `nil`. It's important to note that you can have a zero-length value
180
+set to a key which is different than the key not existing.
181
+
182
+Use the `Bucket.Delete()` function to delete a key from the bucket.
183
+
184
+
185
+### Iterating over keys
186
+
187
+Bolt stores its keys in byte-sorted order within a bucket. This makes sequential
188
+iteration over these keys extremely fast. To iterate over keys we'll use a
189
+`Cursor`:
190
+
191
+```go
192
+db.View(func(tx *bolt.Tx) error {
193
+	b := tx.Bucket([]byte("MyBucket"))
194
+	c := b.Cursor()
195
+
196
+	for k, v := c.First(); k != nil; k, v = c.Next() {
197
+		fmt.Printf("key=%s, value=%s\n", k, v)
198
+	}
199
+
200
+	return nil
201
+})
202
+```
203
+
204
+The cursor allows you to move to a specific point in the list of keys and move
205
+forward or backward through the keys one at a time.
206
+
207
+The following functions are available on the cursor:
208
+
209
+```
210
+First()  Move to the first key.
211
+Last()   Move to the last key.
212
+Seek()   Move to a specific key.
213
+Next()   Move to the next key.
214
+Prev()   Move to the previous key.
215
+```
216
+
217
+When you have iterated to the end of the cursor then `Next()` will return `nil`.
218
+You must seek to a position using `First()`, `Last()`, or `Seek()` before
219
+calling `Next()` or `Prev()`. If you do not seek to a position then these
220
+functions will return `nil`.
221
+
222
+
223
+#### Prefix scans
224
+
225
+To iterate over a key prefix, you can combine `Seek()` and `bytes.HasPrefix()`:
226
+
227
+```go
228
+db.View(func(tx *bolt.Tx) error {
229
+	c := tx.Bucket([]byte("MyBucket")).Cursor()
230
+
231
+	prefix := []byte("1234")
232
+	for k, v := c.Seek(prefix); bytes.HasPrefix(k, prefix); k, v = c.Next() {
233
+		fmt.Printf("key=%s, value=%s\n", k, v)
234
+	}
235
+
236
+	return nil
237
+})
238
+```
239
+
240
+#### Range scans
241
+
242
+Another common use case is scanning over a range such as a time range. If you
243
+use a sortable time encoding such as RFC3339 then you can query a specific
244
+date range like this:
245
+
246
+```go
247
+db.View(func(tx *bolt.Tx) error {
248
+	// Assume our events bucket has RFC3339 encoded time keys.
249
+	c := tx.Bucket([]byte("Events")).Cursor()
250
+
251
+	// Our time range spans the 90's decade.
252
+	min := []byte("1990-01-01T00:00:00Z")
253
+	max := []byte("2000-01-01T00:00:00Z")
254
+
255
+	// Iterate over the 90's.
256
+	for k, v := c.Seek(min); k != nil && bytes.Compare(k, max) <= 0; k, v = c.Next() {
257
+		fmt.Printf("%s: %s\n", k, v)
258
+	}
259
+
260
+	return nil
261
+})
262
+```
263
+
264
+
265
+#### ForEach()
266
+
267
+You can also use the function `ForEach()` if you know you'll be iterating over
268
+all the keys in a bucket:
269
+
270
+```go
271
+db.View(func(tx *bolt.Tx) error {
272
+	b := tx.Bucket([]byte("MyBucket"))
273
+	b.ForEach(func(k, v []byte) error {
274
+		fmt.Printf("key=%s, value=%s\n", k, v)
275
+		return nil
276
+	})
277
+	return nil
278
+})
279
+```
280
+
281
+
282
+### Nested buckets
283
+
284
+You can also store a bucket in a key to create nested buckets. The API is the
285
+same as the bucket management API on the `DB` object:
286
+
287
+```go
288
+func (*Bucket) CreateBucket(key []byte) (*Bucket, error)
289
+func (*Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error)
290
+func (*Bucket) DeleteBucket(key []byte) error
291
+```
292
+
293
+
294
+### Database backups
295
+
296
+Bolt is a single file so it's easy to backup. You can use the `Tx.Copy()`
297
+function to write a consistent view of the database to a writer. If you call
298
+this from a read-only transaction, it will perform a hot backup and not block
299
+your other database reads and writes. It will also use `O_DIRECT` when available
300
+to prevent page cache thrashing.
301
+
302
+One common use case is to backup over HTTP so you can use tools like `cURL` to
303
+do database backups:
304
+
305
+```go
306
+func BackupHandleFunc(w http.ResponseWriter, req *http.Request) {
307
+	err := db.View(func(tx *bolt.Tx) error {
308
+		w.Header().Set("Content-Type", "application/octet-stream")
309
+		w.Header().Set("Content-Disposition", `attachment; filename="my.db"`)
310
+		w.Header().Set("Content-Length", strconv.Itoa(int(tx.Size())))
311
+		return tx.Copy(w)
312
+	})
313
+	if err != nil {
314
+		http.Error(w, err.Error(), http.StatusInternalServerError)
315
+	}
316
+}
317
+```
318
+
319
+Then you can backup using this command:
320
+
321
+```sh
322
+$ curl http://localhost/backup > my.db
323
+```
324
+
325
+Or you can open your browser to `http://localhost/backup` and it will download
326
+automatically.
327
+
328
+If you want to backup to another file you can use the `Tx.CopyFile()` helper
329
+function.
330
+
331
+
332
+### Statistics
333
+
334
+The database keeps a running count of many of the internal operations it
335
+performs so you can better understand what's going on. By grabbing a snapshot
336
+of these stats at two points in time we can see what operations were performed
337
+in that time range.
338
+
339
+For example, we could start a goroutine to log stats every 10 seconds:
340
+
341
+```go
342
+go func() {
343
+	// Grab the initial stats.
344
+	prev := db.Stats()
345
+
346
+	for {
347
+		// Wait for 10s.
348
+		time.Sleep(10 * time.Second)
349
+
350
+		// Grab the current stats and diff them.
351
+		stats := db.Stats()
352
+		diff := stats.Sub(&prev)
353
+		
354
+		// Encode stats to JSON and print to STDERR.
355
+		json.NewEncoder(os.Stderr).Encode(diff)
356
+
357
+		// Save stats for the next loop.
358
+		prev = stats
359
+	}
360
+}
361
+}()
362
+```
363
+
364
+It's also useful to pipe these stats to a service such as statsd for monitoring
365
+or to provide an HTTP endpoint that will perform a fixed-length sample.
366
+
367
+
368
+## Resources
369
+
370
+For more information on getting started with Bolt, check out the following articles:
371
+
372
+* [Intro to BoltDB: Painless Performant Persistence](http://npf.io/2014/07/intro-to-boltdb-painless-performant-persistence/) by [Nate Finch](https://github.com/natefinch).
373
+
374
+
375
+
376
+## Comparing Bolt to LMDB
377
+
378
+Bolt was originally a port of LMDB so it is architecturally similar. Both use
379
+a B+tree, have ACID semantics with fully serializable transactions, and support
380
+lock-free MVCC using a single writer and multiple readers.
381
+
382
+The two projects have somewhat diverged. LMDB heavily focuses on raw performance
383
+while Bolt has focused on simplicity and ease of use. For example, LMDB allows
384
+several unsafe actions such as direct writes and append writes for the sake of
385
+performance. Bolt opts to disallow actions which can leave the database in a 
386
+corrupted state. The only exception to this in Bolt is `DB.NoSync`.
387
+
388
+
389
+## Caveats & Limitations
390
+
391
+It's important to pick the right tool for the job and Bolt is no exception.
392
+Here are a few things to note when evaluating and using Bolt:
393
+
394
+* Bolt is good for read intensive workloads. Sequential write performance is
395
+  also fast but random writes can be slow. You can add a write-ahead log or
396
+  [transaction coalescer](https://github.com/boltdb/coalescer) in front of Bolt
397
+  to mitigate this issue.
398
+
399
+* Bolt uses a B+tree internally so there can be a lot of random page access.
400
+  SSDs provide a significant performance boost over spinning disks.
401
+
402
+* Try to avoid long running read transactions. Bolt uses copy-on-write so
403
+  old pages cannot be reclaimed while an old transaction is using them.
404
+
405
+* Byte slices returned from Bolt are only valid during a transaction. Once the
406
+  transaction has been committed or rolled back then the memory they point to
407
+  can be reused by a new page or can be unmapped from virtual memory and you'll
408
+  see an `unexpected fault address` panic when accessing it.
409
+
410
+* Be careful when using `Bucket.FillPercent`. Setting a high fill percent for
411
+  buckets that have random inserts will cause your database to have very poor
412
+  page utilization.
413
+
414
+* Use larger buckets in general. Smaller buckets cause poor page utilization
415
+  once they become larger than the page size (typically 4KB).
416
+
417
+* Bulk loading a lot of random writes into a new bucket can be slow as the
418
+  page will not split until the transaction is committed. Randomly inserting
419
+  more than 100,000 key/value pairs into a single new bucket in a single
420
+  transaction is not advised.
421
+
422
+* Bolt uses a memory-mapped file so the underlying operating system handles the
423
+  caching of the data. Typically, the OS will cache as much of the file as it
424
+  can in memory and will release memory as needed to other processes. This means
425
+  that Bolt can show very high memory usage when working with large databases.
426
+  However, this is expected and the OS will release memory as needed. Bolt can
427
+  handle databases much larger than the available physical RAM.
428
+
429
+
430
+## Other Projects Using Bolt
431
+
432
+Below is a list of public, open source projects that use Bolt:
433
+
434
+* [Bazil](https://github.com/bazillion/bazil) - A file system that lets your data reside where it is most convenient for it to reside.
435
+* [DVID](https://github.com/janelia-flyem/dvid) - Added Bolt as optional storage engine and testing it against Basho-tuned leveldb.
436
+* [Skybox Analytics](https://github.com/skybox/skybox) - A standalone funnel analysis tool for web analytics.
437
+* [Scuttlebutt](https://github.com/benbjohnson/scuttlebutt) - Uses Bolt to store and process all Twitter mentions of GitHub projects.
438
+* [Wiki](https://github.com/peterhellberg/wiki) - A tiny wiki using Goji, BoltDB and Blackfriday.
439
+* [ChainStore](https://github.com/nulayer/chainstore) - Simple key-value interface to a variety of storage engines organized as a chain of operations.
440
+* [MetricBase](https://github.com/msiebuhr/MetricBase) - Single-binary version of Graphite.
441
+* [Gitchain](https://github.com/gitchain/gitchain) - Decentralized, peer-to-peer Git repositories aka "Git meets Bitcoin".
442
+* [event-shuttle](https://github.com/sclasen/event-shuttle) - A Unix system service to collect and reliably deliver messages to Kafka.
443
+* [ipxed](https://github.com/kelseyhightower/ipxed) - Web interface and api for ipxed.
444
+* [BoltStore](https://github.com/yosssi/boltstore) - Session store using Bolt.
445
+* [photosite/session](http://godoc.org/bitbucket.org/kardianos/photosite/session) - Sessions for a photo viewing site.
446
+* [LedisDB](https://github.com/siddontang/ledisdb) - A high performance NoSQL, using Bolt as optional storage.
447
+* [ipLocator](https://github.com/AndreasBriese/ipLocator) - A fast ip-geo-location-server using bolt with bloom filters.
448
+* [cayley](https://github.com/google/cayley) - Cayley is an open-source graph database using Bolt as optional backend.
449
+* [bleve](http://www.blevesearch.com/) - A pure Go search engine similar to ElasticSearch that uses Bolt as the default storage backend.
450
+* [tentacool](https://github.com/optiflows/tentacool) - REST api server to manage system stuff (IP, DNS, Gateway...) on a linux server.
451
+* [SkyDB](https://github.com/skydb/sky) - Behavioral analytics database.
452
+
453
+If you are using Bolt in a project please send a pull request to add it to the list.
454
+
0 455
new file mode 100644
... ...
@@ -0,0 +1,4 @@
0
+package bolt
1
+
2
+// maxMapSize represents the largest mmap size supported by Bolt.
3
+const maxMapSize = 0xFFFFFFF // 256MB
0 4
new file mode 100644
... ...
@@ -0,0 +1,4 @@
0
+package bolt
1
+
2
+// maxMapSize represents the largest mmap size supported by Bolt.
3
+const maxMapSize = 0xFFFFFFFFFFFF // 256TB
0 4
new file mode 100644
... ...
@@ -0,0 +1,4 @@
0
+package bolt
1
+
2
+// maxMapSize represents the largest mmap size supported by Bolt.
3
+const maxMapSize = 0xFFFFFFF // 256MB
0 4
new file mode 100644
... ...
@@ -0,0 +1,12 @@
0
+package bolt
1
+
2
+import (
3
+	"syscall"
4
+)
5
+
6
+var odirect = syscall.O_DIRECT
7
+
8
+// fdatasync flushes written data to a file descriptor.
9
+func fdatasync(db *DB) error {
10
+	return syscall.Fdatasync(int(db.file.Fd()))
11
+}
0 12
new file mode 100644
... ...
@@ -0,0 +1,29 @@
0
+package bolt
1
+
2
+import (
3
+	"syscall"
4
+	"unsafe"
5
+)
6
+
7
+const (
8
+	msAsync      = 1 << iota // perform asynchronous writes
9
+	msSync                   // perform synchronous writes
10
+	msInvalidate             // invalidate cached data
11
+)
12
+
13
+var odirect int
14
+
15
+func msync(db *DB) error {
16
+	_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate)
17
+	if errno != 0 {
18
+		return errno
19
+	}
20
+	return nil
21
+}
22
+
23
+func fdatasync(db *DB) error {
24
+	if db.data != nil {
25
+		return msync(db)
26
+	}
27
+	return db.file.Sync()
28
+}
0 29
new file mode 100644
... ...
@@ -0,0 +1,69 @@
0
+// +build !windows,!plan9
1
+
2
+package bolt
3
+
4
+import (
5
+	"os"
6
+	"syscall"
7
+	"time"
8
+	"unsafe"
9
+)
10
+
11
+// flock acquires an advisory lock on a file descriptor.
12
+func flock(f *os.File, timeout time.Duration) error {
13
+	var t time.Time
14
+	for {
15
+		// If we're beyond our timeout then return an error.
16
+		// This can only occur after we've attempted a flock once.
17
+		if t.IsZero() {
18
+			t = time.Now()
19
+		} else if timeout > 0 && time.Since(t) > timeout {
20
+			return ErrTimeout
21
+		}
22
+
23
+		// Otherwise attempt to obtain an exclusive lock.
24
+		err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
25
+		if err == nil {
26
+			return nil
27
+		} else if err != syscall.EWOULDBLOCK {
28
+			return err
29
+		}
30
+
31
+		// Wait for a bit and try again.
32
+		time.Sleep(50 * time.Millisecond)
33
+	}
34
+}
35
+
36
+// funlock releases an advisory lock on a file descriptor.
37
+func funlock(f *os.File) error {
38
+	return syscall.Flock(int(f.Fd()), syscall.LOCK_UN)
39
+}
40
+
41
+// mmap memory maps a DB's data file.
42
+func mmap(db *DB, sz int) error {
43
+	b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED)
44
+	if err != nil {
45
+		return err
46
+	}
47
+
48
+	// Save the original byte slice and convert to a byte array pointer.
49
+	db.dataref = b
50
+	db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
51
+	db.datasz = sz
52
+	return nil
53
+}
54
+
55
+// munmap unmaps a DB's data file from memory.
56
+func munmap(db *DB) error {
57
+	// Ignore the unmap if we have no mapped data.
58
+	if db.dataref == nil {
59
+		return nil
60
+	}
61
+
62
+	// Unmap using the original byte slice.
63
+	err := syscall.Munmap(db.dataref)
64
+	db.dataref = nil
65
+	db.data = nil
66
+	db.datasz = 0
67
+	return err
68
+}
0 69
new file mode 100644
... ...
@@ -0,0 +1,74 @@
0
+package bolt
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"syscall"
6
+	"time"
7
+	"unsafe"
8
+)
9
+
10
+var odirect int
11
+
12
+// fdatasync flushes written data to a file descriptor.
13
+func fdatasync(db *DB) error {
14
+	return db.file.Sync()
15
+}
16
+
17
+// flock acquires an advisory lock on a file descriptor.
18
+func flock(f *os.File, _ time.Duration) error {
19
+	return nil
20
+}
21
+
22
+// funlock releases an advisory lock on a file descriptor.
23
+func funlock(f *os.File) error {
24
+	return nil
25
+}
26
+
27
+// mmap memory maps a DB's data file.
28
+// Based on: https://github.com/edsrzf/mmap-go
29
+func mmap(db *DB, sz int) error {
30
+	// Truncate the database to the size of the mmap.
31
+	if err := db.file.Truncate(int64(sz)); err != nil {
32
+		return fmt.Errorf("truncate: %s", err)
33
+	}
34
+
35
+	// Open a file mapping handle.
36
+	sizelo := uint32(sz >> 32)
37
+	sizehi := uint32(sz) & 0xffffffff
38
+	h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizelo, sizehi, nil)
39
+	if h == 0 {
40
+		return os.NewSyscallError("CreateFileMapping", errno)
41
+	}
42
+
43
+	// Create the memory map.
44
+	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(sz))
45
+	if addr == 0 {
46
+		return os.NewSyscallError("MapViewOfFile", errno)
47
+	}
48
+
49
+	// Close mapping handle.
50
+	if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
51
+		return os.NewSyscallError("CloseHandle", err)
52
+	}
53
+
54
+	// Convert to a byte array.
55
+	db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr)))
56
+	db.datasz = sz
57
+
58
+	return nil
59
+}
60
+
61
+// munmap unmaps a pointer from a file.
62
+// Based on: https://github.com/edsrzf/mmap-go
63
+func munmap(db *DB) error {
64
+	if db.data == nil {
65
+		return nil
66
+	}
67
+
68
+	addr := (uintptr)(unsafe.Pointer(&db.data[0]))
69
+	if err := syscall.UnmapViewOfFile(addr); err != nil {
70
+		return os.NewSyscallError("UnmapViewOfFile", err)
71
+	}
72
+	return nil
73
+}
0 74
new file mode 100644
... ...
@@ -0,0 +1,10 @@
0
+// +build !windows,!plan9,!linux,!openbsd
1
+
2
+package bolt
3
+
4
+var odirect int
5
+
6
+// fdatasync flushes written data to a file descriptor.
7
+func fdatasync(db *DB) error {
8
+	return db.file.Sync()
9
+}
0 10
new file mode 100644
... ...
@@ -0,0 +1,728 @@
0
+package bolt
1
+
2
+import (
3
+	"bytes"
4
+	"fmt"
5
+	"unsafe"
6
+)
7
+
8
+const (
9
+	// MaxKeySize is the maximum length of a key, in bytes.
10
+	MaxKeySize = 32768 // 32 KiB
11
+
12
+	// MaxValueSize is the maximum length of a value, in bytes.
13
+	MaxValueSize = 4294967295 // 2^32 - 1
14
+)
15
+
16
+const (
17
+	maxUint = ^uint(0) // all bits set: largest uint
18
+	minUint = 0
19
+	maxInt  = int(^uint(0) >> 1) // largest int
20
+	minInt  = -maxInt - 1 // smallest int
21
+)
22
+
23
+const bucketHeaderSize = int(unsafe.Sizeof(bucket{})) // size in bytes of the bucket struct
24
+
25
+const (
26
+	minFillPercent = 0.1
27
+	maxFillPercent = 1.0
28
+)
29
+
30
+// DefaultFillPercent is the fraction to which split pages are filled by default.
31
+// It is the initial value of Bucket.FillPercent, which can be changed per bucket.
32
+const DefaultFillPercent = 0.5
33
+
34
+// Bucket represents a collection of key/value pairs inside the database.
35
+type Bucket struct {
36
+	*bucket // embedded on-file header (root page id and sequence)
37
+	tx       *Tx                // the associated transaction
38
+	buckets  map[string]*Bucket // subbucket cache; only allocated for writable transactions
39
+	page     *page              // inline page reference; set when the bucket is opened with root == 0
40
+	rootNode *node              // materialized node for the root page; nil until one is created
41
+	nodes    map[pgid]*node     // node cache; only allocated for writable transactions
42
+
43
+	// Sets the threshold for filling nodes when they split. By default,
44
+	// the bucket will fill to 50% but it can be useful to increase this
45
+	// amount if you know that your write workloads are mostly append-only.
46
+	//
47
+	// This is not persisted across transactions so it must be set in every Tx.
48
+	FillPercent float64
49
+}
50
+
51
+// bucket is the on-file representation of a bucket.
52
+// It is stored as the "value" of a bucket key. If the bucket is small enough,
53
+// then its root page can be stored inline in the "value", directly after this
54
+// header. For inline buckets, "root" is 0.
55
+type bucket struct {
56
+	root     pgid   // page id of the bucket's root-level page; 0 for inline buckets
57
+	sequence uint64 // monotonically incrementing, used by NextSequence()
58
+}
59
+
60
+// newBucket returns a new bucket associated with a transaction.
61
+func newBucket(tx *Tx) Bucket {
62
+	var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
63
+	if tx.writable {
64
+		b.buckets = make(map[string]*Bucket)
65
+		b.nodes = make(map[pgid]*node)
66
+	}
67
+	return b
68
+}
69
+
70
+// Tx returns the tx of the bucket.
71
+func (b *Bucket) Tx() *Tx {
72
+	return b.tx
73
+}
74
+
75
+// Root returns the root of the bucket.
76
+func (b *Bucket) Root() pgid {
77
+	return b.root
78
+}
79
+
80
+// Writable returns whether the bucket is writable.
81
+func (b *Bucket) Writable() bool {
82
+	return b.tx.writable
83
+}
84
+
85
+// Cursor creates a cursor associated with the bucket.
86
+// The cursor is only valid as long as the transaction is open.
87
+// Do not use a cursor after the transaction is closed.
88
+func (b *Bucket) Cursor() *Cursor {
89
+	// Update transaction statistics.
90
+	b.tx.stats.CursorCount++
91
+
92
+	// Allocate and return a cursor.
93
+	return &Cursor{
94
+		bucket: b,
95
+		stack:  make([]elemRef, 0),
96
+	}
97
+}
98
+
99
+// Bucket retrieves a nested bucket by name.
100
+// Returns nil if the bucket does not exist.
101
+func (b *Bucket) Bucket(name []byte) *Bucket {
102
+	if b.buckets != nil {
103
+		if child := b.buckets[string(name)]; child != nil {
104
+			return child
105
+		}
106
+	}
107
+
108
+	// Move cursor to key.
109
+	c := b.Cursor()
110
+	k, v, flags := c.seek(name)
111
+
112
+	// Return nil if the key doesn't exist or it is not a bucket.
113
+	if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
114
+		return nil
115
+	}
116
+
117
+	// Otherwise create a bucket and cache it.
118
+	var child = b.openBucket(v)
119
+	if b.buckets != nil {
120
+		b.buckets[string(name)] = child
121
+	}
122
+
123
+	return child
124
+}
125
+
126
+// Helper method that re-interprets a sub-bucket value
127
+// from a parent into a Bucket
128
+func (b *Bucket) openBucket(value []byte) *Bucket {
129
+	var child = newBucket(b.tx)
130
+
131
+	// If this is a writable transaction then we need to copy the bucket entry.
132
+	// Read-only transactions can point directly at the mmap entry.
133
+	if b.tx.writable {
134
+		child.bucket = &bucket{}
135
+		*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
136
+	} else {
137
+		child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
138
+	}
139
+
140
+	// Save a reference to the inline page if the bucket is inline.
141
+	if child.root == 0 {
142
+		child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
143
+	}
144
+
145
+	return &child
146
+}
147
+
148
+// CreateBucket creates a new bucket at the given key and returns the new bucket.
149
+// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
150
+func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
151
+	if b.tx.db == nil {
152
+		return nil, ErrTxClosed
153
+	} else if !b.tx.writable {
154
+		return nil, ErrTxNotWritable
155
+	} else if len(key) == 0 {
156
+		return nil, ErrBucketNameRequired
157
+	}
158
+
159
+	// Move cursor to correct position.
160
+	c := b.Cursor()
161
+	k, _, flags := c.seek(key)
162
+
163
+	// Return an error if there is an existing key.
164
+	if bytes.Equal(key, k) {
165
+		if (flags & bucketLeafFlag) != 0 {
166
+			return nil, ErrBucketExists
167
+		} else {
168
+			return nil, ErrIncompatibleValue
169
+		}
170
+	}
171
+
172
+	// Create empty, inline bucket.
173
+	var bucket = Bucket{
174
+		bucket:      &bucket{},
175
+		rootNode:    &node{isLeaf: true},
176
+		FillPercent: DefaultFillPercent,
177
+	}
178
+	var value = bucket.write()
179
+
180
+	// Insert into node.
181
+	key = cloneBytes(key)
182
+	c.node().put(key, key, value, 0, bucketLeafFlag)
183
+
184
+	// Since subbuckets are not allowed on inline buckets, we need to
185
+	// dereference the inline page, if it exists. This will cause the bucket
186
+	// to be treated as a regular, non-inline bucket for the rest of the tx.
187
+	b.page = nil
188
+
189
+	return b.Bucket(key), nil
190
+}
191
+
192
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
193
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
194
+func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
195
+	child, err := b.CreateBucket(key)
196
+	if err == ErrBucketExists {
197
+		return b.Bucket(key), nil
198
+	} else if err != nil {
199
+		return nil, err
200
+	}
201
+	return child, nil
202
+}
203
+
204
+// DeleteBucket deletes a bucket at the given key.
205
+// Returns an error if the bucket does not exists, or if the key represents a non-bucket value.
206
+func (b *Bucket) DeleteBucket(key []byte) error {
207
+	if b.tx.db == nil {
208
+		return ErrTxClosed
209
+	} else if !b.Writable() {
210
+		return ErrTxNotWritable
211
+	}
212
+
213
+	// Move cursor to correct position.
214
+	c := b.Cursor()
215
+	k, _, flags := c.seek(key)
216
+
217
+	// Return an error if bucket doesn't exist or is not a bucket.
218
+	if !bytes.Equal(key, k) {
219
+		return ErrBucketNotFound
220
+	} else if (flags & bucketLeafFlag) == 0 {
221
+		return ErrIncompatibleValue
222
+	}
223
+
224
+	// Recursively delete all child buckets.
225
+	child := b.Bucket(key)
226
+	err := child.ForEach(func(k, v []byte) error {
227
+		if v == nil {
228
+			if err := child.DeleteBucket(k); err != nil {
229
+				return fmt.Errorf("delete bucket: %s", err)
230
+			}
231
+		}
232
+		return nil
233
+	})
234
+	if err != nil {
235
+		return err
236
+	}
237
+
238
+	// Remove cached copy.
239
+	delete(b.buckets, string(key))
240
+
241
+	// Release all bucket pages to freelist.
242
+	child.nodes = nil
243
+	child.rootNode = nil
244
+	child.free()
245
+
246
+	// Delete the node if we have a matching key.
247
+	c.node().del(key)
248
+
249
+	return nil
250
+}
251
+
252
+// Get retrieves the value for a key in the bucket.
253
+// Returns a nil value if the key does not exist or if the key is a nested bucket.
254
+func (b *Bucket) Get(key []byte) []byte {
255
+	k, v, flags := b.Cursor().seek(key)
256
+
257
+	// Return nil if this is a bucket.
258
+	if (flags & bucketLeafFlag) != 0 {
259
+		return nil
260
+	}
261
+
262
+	// If our target node isn't the same key as what's passed in then return nil.
263
+	if !bytes.Equal(key, k) {
264
+		return nil
265
+	}
266
+	return v
267
+}
268
+
269
+// Put sets the value for a key in the bucket.
270
+// If the key exist then its previous value will be overwritten.
271
+// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
272
+func (b *Bucket) Put(key []byte, value []byte) error {
273
+	if b.tx.db == nil {
274
+		return ErrTxClosed
275
+	} else if !b.Writable() {
276
+		return ErrTxNotWritable
277
+	} else if len(key) == 0 {
278
+		return ErrKeyRequired
279
+	} else if len(key) > MaxKeySize {
280
+		return ErrKeyTooLarge
281
+	} else if int64(len(value)) > MaxValueSize {
282
+		return ErrValueTooLarge
283
+	}
284
+
285
+	// Move cursor to correct position.
286
+	c := b.Cursor()
287
+	k, _, flags := c.seek(key)
288
+
289
+	// Return an error if there is an existing key with a bucket value.
290
+	if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
291
+		return ErrIncompatibleValue
292
+	}
293
+
294
+	// Insert into node.
295
+	key = cloneBytes(key)
296
+	c.node().put(key, key, value, 0, 0)
297
+
298
+	return nil
299
+}
300
+
301
+// Delete removes a key from the bucket.
302
+// If the key does not exist then nothing is done and a nil error is returned.
303
+// Returns an error if the bucket was created from a read-only transaction.
304
+func (b *Bucket) Delete(key []byte) error {
305
+	if b.tx.db == nil {
306
+		return ErrTxClosed
307
+	} else if !b.Writable() {
308
+		return ErrTxNotWritable
309
+	}
310
+
311
+	// Move cursor to correct position.
312
+	c := b.Cursor()
313
+	_, _, flags := c.seek(key)
314
+
315
+	// Return an error if there is already existing bucket value.
316
+	if (flags & bucketLeafFlag) != 0 {
317
+		return ErrIncompatibleValue
318
+	}
319
+
320
+	// Delete the node if we have a matching key.
321
+	c.node().del(key)
322
+
323
+	return nil
324
+}
325
+
326
+// NextSequence returns an autoincrementing integer for the bucket.
327
+func (b *Bucket) NextSequence() (uint64, error) {
328
+	if b.tx.db == nil {
329
+		return 0, ErrTxClosed
330
+	} else if !b.Writable() {
331
+		return 0, ErrTxNotWritable
332
+	}
333
+
334
+	// Increment and return the sequence.
335
+	b.bucket.sequence++
336
+	return b.bucket.sequence, nil
337
+}
338
+
339
+// ForEach executes a function for each key/value pair in a bucket.
340
+// If the provided function returns an error then the iteration is stopped and
341
+// the error is returned to the caller.
342
+func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
343
+	if b.tx.db == nil {
344
+		return ErrTxClosed
345
+	}
346
+	c := b.Cursor()
347
+	for k, v := c.First(); k != nil; k, v = c.Next() {
348
+		if err := fn(k, v); err != nil {
349
+			return err
350
+		}
351
+	}
352
+	return nil
353
+}
354
+
355
+// Stat returns stats on a bucket.
356
+func (b *Bucket) Stats() BucketStats {
357
+	var s, subStats BucketStats
358
+	pageSize := b.tx.db.pageSize
359
+	s.BucketN += 1
360
+	if b.root == 0 {
361
+		s.InlineBucketN += 1
362
+	}
363
+	b.forEachPage(func(p *page, depth int) {
364
+		if (p.flags & leafPageFlag) != 0 {
365
+			s.KeyN += int(p.count)
366
+
367
+			// used totals the used bytes for the page
368
+			used := pageHeaderSize
369
+
370
+			if p.count != 0 {
371
+				// If page has any elements, add all element headers.
372
+				used += leafPageElementSize * int(p.count-1)
373
+
374
+				// Add all element key, value sizes.
375
+				// The computation takes advantage of the fact that the position
376
+				// of the last element's key/value equals to the total of the sizes
377
+				// of all previous elements' keys and values.
378
+				// It also includes the last element's header.
379
+				lastElement := p.leafPageElement(p.count - 1)
380
+				used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
381
+			}
382
+
383
+			if b.root == 0 {
384
+				// For inlined bucket just update the inline stats
385
+				s.InlineBucketInuse += used
386
+			} else {
387
+				// For non-inlined bucket update all the leaf stats
388
+				s.LeafPageN++
389
+				s.LeafInuse += used
390
+				s.LeafOverflowN += int(p.overflow)
391
+
392
+				// Collect stats from sub-buckets.
393
+				// Do that by iterating over all element headers
394
+				// looking for the ones with the bucketLeafFlag.
395
+				for i := uint16(0); i < p.count; i++ {
396
+					e := p.leafPageElement(i)
397
+					if (e.flags & bucketLeafFlag) != 0 {
398
+						// For any bucket element, open the element value
399
+						// and recursively call Stats on the contained bucket.
400
+						subStats.Add(b.openBucket(e.value()).Stats())
401
+					}
402
+				}
403
+			}
404
+		} else if (p.flags & branchPageFlag) != 0 {
405
+			s.BranchPageN++
406
+			lastElement := p.branchPageElement(p.count - 1)
407
+
408
+			// used totals the used bytes for the page
409
+			// Add header and all element headers.
410
+			used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
411
+
412
+			// Add size of all keys and values.
413
+			// Again, use the fact that last element's position equals to
414
+			// the total of key, value sizes of all previous elements.
415
+			used += int(lastElement.pos + lastElement.ksize)
416
+			s.BranchInuse += used
417
+			s.BranchOverflowN += int(p.overflow)
418
+		}
419
+
420
+		// Keep track of maximum page depth.
421
+		if depth+1 > s.Depth {
422
+			s.Depth = (depth + 1)
423
+		}
424
+	})
425
+
426
+	// Alloc stats can be computed from page counts and pageSize.
427
+	s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
428
+	s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize
429
+
430
+	// Add the max depth of sub-buckets to get total nested depth.
431
+	s.Depth += subStats.Depth
432
+	// Add the stats for all sub-buckets
433
+	s.Add(subStats)
434
+	return s
435
+}
436
+
437
+// forEachPage iterates over every page in a bucket, including inline pages.
438
+func (b *Bucket) forEachPage(fn func(*page, int)) {
439
+	// If we have an inline page then just use that.
440
+	if b.page != nil {
441
+		fn(b.page, 0)
442
+		return
443
+	}
444
+
445
+	// Otherwise traverse the page hierarchy.
446
+	b.tx.forEachPage(b.root, 0, fn)
447
+}
448
+
449
+// forEachPageNode iterates over every page (or node) in a bucket.
450
+// This also includes inline pages.
451
+func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
452
+	// If we have an inline page or root node then just use that.
453
+	if b.page != nil {
454
+		fn(b.page, nil, 0)
455
+		return
456
+	}
457
+	b._forEachPageNode(b.root, 0, fn)
458
+}
459
+
460
+func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
461
+	var p, n = b.pageNode(pgid)
462
+
463
+	// Execute function.
464
+	fn(p, n, depth)
465
+
466
+	// Recursively loop over children.
467
+	if p != nil {
468
+		if (p.flags & branchPageFlag) != 0 {
469
+			for i := 0; i < int(p.count); i++ {
470
+				elem := p.branchPageElement(uint16(i))
471
+				b._forEachPageNode(elem.pgid, depth+1, fn)
472
+			}
473
+		}
474
+	} else {
475
+		if !n.isLeaf {
476
+			for _, inode := range n.inodes {
477
+				b._forEachPageNode(inode.pgid, depth+1, fn)
478
+			}
479
+		}
480
+	}
481
+}
482
+
483
+// spill writes all the nodes for this bucket to dirty pages.
484
+func (b *Bucket) spill() error {
485
+	// Spill all child buckets first.
486
+	for name, child := range b.buckets {
487
+		// If the child bucket is small enough and it has no child buckets then
488
+		// write it inline into the parent bucket's page. Otherwise spill it
489
+		// like a normal bucket and make the parent value a pointer to the page.
490
+		var value []byte
491
+		if child.inlineable() {
492
+			child.free()
493
+			value = child.write()
494
+		} else {
495
+			if err := child.spill(); err != nil {
496
+				return err
497
+			}
498
+
499
+			// Update the child bucket header in this bucket.
500
+			value = make([]byte, unsafe.Sizeof(bucket{}))
501
+			var bucket = (*bucket)(unsafe.Pointer(&value[0]))
502
+			*bucket = *child.bucket
503
+		}
504
+
505
+		// Skip writing the bucket if there are no materialized nodes.
506
+		if child.rootNode == nil {
507
+			continue
508
+		}
509
+
510
+		// Update parent node.
511
+		var c = b.Cursor()
512
+		k, _, flags := c.seek([]byte(name))
513
+		_assert(bytes.Equal([]byte(name), k), "misplaced bucket header: %x -> %x", []byte(name), k)
514
+		_assert(flags&bucketLeafFlag != 0, "unexpected bucket header flag: %x", flags)
515
+		c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
516
+	}
517
+
518
+	// Ignore if there's not a materialized root node.
519
+	if b.rootNode == nil {
520
+		return nil
521
+	}
522
+
523
+	// Spill nodes.
524
+	if err := b.rootNode.spill(); err != nil {
525
+		return err
526
+	}
527
+	b.rootNode = b.rootNode.root()
528
+
529
+	// Update the root node for this bucket.
530
+	_assert(b.rootNode.pgid < b.tx.meta.pgid, "pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid)
531
+	b.root = b.rootNode.pgid
532
+
533
+	return nil
534
+}
535
+
536
+// inlineable returns true if a bucket is small enough to be written inline
537
+// and if it contains no subbuckets. Otherwise returns false.
538
+func (b *Bucket) inlineable() bool {
539
+	var n = b.rootNode
540
+
541
+	// Bucket must only contain a single leaf node.
542
+	if n == nil || !n.isLeaf {
543
+		return false
544
+	}
545
+
546
+	// Bucket is not inlineable if it contains subbuckets or if it goes beyond
547
+	// our threshold for inline bucket size.
548
+	var size = pageHeaderSize
549
+	for _, inode := range n.inodes {
550
+		size += leafPageElementSize + len(inode.key) + len(inode.value)
551
+
552
+		if inode.flags&bucketLeafFlag != 0 {
553
+			return false
554
+		} else if size > b.maxInlineBucketSize() {
555
+			return false
556
+		}
557
+	}
558
+
559
+	return true
560
+}
561
+
562
+// Returns the maximum total size of a bucket to make it a candidate for inlining.
563
+func (b *Bucket) maxInlineBucketSize() int {
564
+	return b.tx.db.pageSize / 4
565
+}
566
+
567
+// write allocates and writes a bucket to a byte slice.
568
+func (b *Bucket) write() []byte {
569
+	// Allocate the appropriate size.
570
+	var n = b.rootNode
571
+	var value = make([]byte, bucketHeaderSize+n.size())
572
+
573
+	// Write a bucket header.
574
+	var bucket = (*bucket)(unsafe.Pointer(&value[0]))
575
+	*bucket = *b.bucket
576
+
577
+	// Convert byte slice to a fake page and write the root node.
578
+	var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
579
+	n.write(p)
580
+
581
+	return value
582
+}
583
+
584
+// rebalance attempts to balance all nodes.
585
+func (b *Bucket) rebalance() {
586
+	for _, n := range b.nodes {
587
+		n.rebalance()
588
+	}
589
+	for _, child := range b.buckets {
590
+		child.rebalance()
591
+	}
592
+}
593
+
594
+// node creates a node from a page and associates it with a given parent.
595
+func (b *Bucket) node(pgid pgid, parent *node) *node {
596
+	_assert(b.nodes != nil, "nodes map expected")
597
+
598
+	// Retrieve node if it's already been created.
599
+	if n := b.nodes[pgid]; n != nil {
600
+		return n
601
+	}
602
+
603
+	// Otherwise create a node and cache it.
604
+	n := &node{bucket: b, parent: parent}
605
+	if parent == nil {
606
+		b.rootNode = n
607
+	} else {
608
+		parent.children = append(parent.children, n)
609
+	}
610
+
611
+	// Use the inline page if this is an inline bucket.
612
+	var p = b.page
613
+	if p == nil {
614
+		p = b.tx.page(pgid)
615
+	}
616
+
617
+	// Read the page into the node and cache it.
618
+	n.read(p)
619
+	b.nodes[pgid] = n
620
+
621
+	// Update statistics.
622
+	b.tx.stats.NodeCount++
623
+
624
+	return n
625
+}
626
+
627
+// free recursively frees all pages in the bucket.
628
+func (b *Bucket) free() {
629
+	if b.root == 0 {
630
+		return
631
+	}
632
+
633
+	var tx = b.tx
634
+	b.forEachPageNode(func(p *page, n *node, _ int) {
635
+		if p != nil {
636
+			tx.db.freelist.free(tx.meta.txid, p)
637
+		} else {
638
+			n.free()
639
+		}
640
+	})
641
+	b.root = 0
642
+}
643
+
644
+// dereference removes all references to the old mmap.
645
+func (b *Bucket) dereference() {
646
+	if b.rootNode != nil {
647
+		b.rootNode.root().dereference()
648
+	}
649
+
650
+	for _, child := range b.buckets {
651
+		child.dereference()
652
+	}
653
+}
654
+
655
+// pageNode returns the in-memory node, if it exists.
656
+// Otherwise returns the underlying page.
657
+func (b *Bucket) pageNode(id pgid) (*page, *node) {
658
+	// Inline buckets have a fake page embedded in their value so treat them
659
+	// differently. We'll return the rootNode (if available) or the fake page.
660
+	if b.root == 0 {
661
+		_assert(id == 0, "inline bucket non-zero page access(2): %d != 0", id)
662
+		if b.rootNode != nil {
663
+			return nil, b.rootNode
664
+		}
665
+		return b.page, nil
666
+	}
667
+
668
+	// Check the node cache for non-inline buckets.
669
+	if b.nodes != nil {
670
+		if n := b.nodes[id]; n != nil {
671
+			return nil, n
672
+		}
673
+	}
674
+
675
+	// Finally lookup the page from the transaction if no node is materialized.
676
+	return b.tx.page(id), nil
677
+}
678
+
679
// BucketStats records statistics about resources used by a bucket.
type BucketStats struct {
	// Page count statistics.
	BranchPageN     int // number of logical branch pages
	BranchOverflowN int // number of physical branch overflow pages
	LeafPageN       int // number of logical leaf pages
	LeafOverflowN   int // number of physical leaf overflow pages

	// Tree statistics.
	KeyN  int // number of keys/value pairs
	Depth int // number of levels in B+tree

	// Page size utilization.
	BranchAlloc int // bytes allocated for physical branch pages
	BranchInuse int // bytes actually used for branch data
	LeafAlloc   int // bytes allocated for physical leaf pages
	LeafInuse   int // bytes actually used for leaf data

	// Bucket statistics
	BucketN           int // total number of buckets including the top bucket
	InlineBucketN     int // total number of inlined buckets
	InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
}

// Add folds other into s. Every counter is summed except Depth, which
// becomes the larger of the two depths.
func (s *BucketStats) Add(other BucketStats) {
	// Depth is a maximum, not a sum.
	if other.Depth > s.Depth {
		s.Depth = other.Depth
	}

	// Branch pages.
	s.BranchPageN += other.BranchPageN
	s.BranchOverflowN += other.BranchOverflowN
	s.BranchAlloc += other.BranchAlloc
	s.BranchInuse += other.BranchInuse

	// Leaf pages.
	s.LeafPageN += other.LeafPageN
	s.LeafOverflowN += other.LeafOverflowN
	s.LeafAlloc += other.LeafAlloc
	s.LeafInuse += other.LeafInuse

	// Keys and buckets.
	s.KeyN += other.KeyN
	s.BucketN += other.BucketN
	s.InlineBucketN += other.InlineBucketN
	s.InlineBucketInuse += other.InlineBucketInuse
}
721
+
722
// cloneBytes returns a copy of a given slice. The result never shares
// storage with v and is always non-nil, even for a nil or empty input.
func cloneBytes(v []byte) []byte {
	return append(make([]byte, 0, len(v)), v...)
}
0 728
new file mode 100644
... ...
@@ -0,0 +1,376 @@
0
+package bolt
1
+
2
+import (
3
+	"bytes"
4
+	"sort"
5
+)
6
+
7
+// Cursor represents an iterator that can traverse over all key/value pairs in a bucket in sorted order.
8
+// Cursors see nested buckets with value == nil.
9
+// Cursors can be obtained from a transaction and are valid as long as the transaction is open.
10
+//
11
+// Changing data while traversing with a cursor may cause it to be invalidated
12
+// and return unexpected keys and/or values. You must reposition your cursor
13
+// after mutating data.
14
+type Cursor struct {
15
+	bucket *Bucket // bucket this cursor was created from
16
+	stack  []elemRef // page/node references from the bucket root down to the current position
17
+}
18
+
19
+// Bucket returns the bucket that this cursor was created from.
20
+func (c *Cursor) Bucket() *Bucket {
21
+	return c.bucket
22
+}
23
+
24
+// First moves the cursor to the first item in the bucket and returns its key and value.
25
+// If the bucket is empty then a nil key and value are returned.
26
+func (c *Cursor) First() (key []byte, value []byte) {
27
+	_assert(c.bucket.tx.db != nil, "tx closed")
28
+	c.stack = c.stack[:0]
29
+	p, n := c.bucket.pageNode(c.bucket.root)
30
+	c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
31
+	c.first()
32
+	k, v, flags := c.keyValue()
33
+	if (flags & uint32(bucketLeafFlag)) != 0 {
34
+		return k, nil
35
+	}
36
+	return k, v
37
+
38
+}
39
+
40
+// Last moves the cursor to the last item in the bucket and returns its key and value.
41
+// If the bucket is empty then a nil key and value are returned.
42
+func (c *Cursor) Last() (key []byte, value []byte) {
43
+	_assert(c.bucket.tx.db != nil, "tx closed")
44
+	c.stack = c.stack[:0]
45
+	p, n := c.bucket.pageNode(c.bucket.root)
46
+	ref := elemRef{page: p, node: n}
47
+	ref.index = ref.count() - 1
48
+	c.stack = append(c.stack, ref)
49
+	c.last()
50
+	k, v, flags := c.keyValue()
51
+	if (flags & uint32(bucketLeafFlag)) != 0 {
52
+		return k, nil
53
+	}
54
+	return k, v
55
+}
56
+
57
+// Next moves the cursor to the next item in the bucket and returns its key and value.
58
+// If the cursor is at the end of the bucket then a nil key and value are returned.
59
+func (c *Cursor) Next() (key []byte, value []byte) {
60
+	_assert(c.bucket.tx.db != nil, "tx closed")
61
+	k, v, flags := c.next()
62
+	if (flags & uint32(bucketLeafFlag)) != 0 {
63
+		return k, nil
64
+	}
65
+	return k, v
66
+}
67
+
68
+// Prev moves the cursor to the previous item in the bucket and returns its key and value.
69
+// If the cursor is at the beginning of the bucket then a nil key and value are returned.
70
+func (c *Cursor) Prev() (key []byte, value []byte) {
71
+	_assert(c.bucket.tx.db != nil, "tx closed")
72
+
73
+	// Attempt to move back one element until we're successful.
74
+	// Move up the stack as we hit the beginning of each page in our stack.
75
+	for i := len(c.stack) - 1; i >= 0; i-- {
76
+		elem := &c.stack[i]
77
+		if elem.index > 0 {
78
+			elem.index--
79
+			break
80
+		}
81
+		c.stack = c.stack[:i]
82
+	}
83
+
84
+	// If we've hit the end then return nil.
85
+	if len(c.stack) == 0 {
86
+		return nil, nil
87
+	}
88
+
89
+	// Move down the stack to find the last element of the last leaf under this branch.
90
+	c.last()
91
+	k, v, flags := c.keyValue()
92
+	if (flags & uint32(bucketLeafFlag)) != 0 {
93
+		return k, nil
94
+	}
95
+	return k, v
96
+}
97
+
98
+// Seek moves the cursor to a given key and returns it.
99
+// If the key does not exist then the next key is used. If no keys
100
+// follow, a nil key is returned.
101
+func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
102
+	k, v, flags := c.seek(seek)
103
+
104
+	// If we ended up after the last element of a page then move to the next one.
105
+	if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() {
106
+		k, v, flags = c.next()
107
+	}
108
+
109
+	if k == nil {
110
+		return nil, nil
111
+	} else if (flags & uint32(bucketLeafFlag)) != 0 {
112
+		return k, nil
113
+	}
114
+	return k, v
115
+}
116
+
117
+// Delete removes the current key/value under the cursor from the bucket.
118
+// Delete fails if current key/value is a bucket or if the transaction is not writable.
119
+func (c *Cursor) Delete() error {
120
+	if c.bucket.tx.db == nil {
121
+		return ErrTxClosed
122
+	} else if !c.bucket.Writable() {
123
+		return ErrTxNotWritable
124
+	}
125
+
126
+	key, _, flags := c.keyValue()
127
+	// Return an error if current value is a bucket.
128
+	if (flags & bucketLeafFlag) != 0 {
129
+		return ErrIncompatibleValue
130
+	}
131
+	c.node().del(key)
132
+
133
+	return nil
134
+}
135
+
136
+// seek moves the cursor to a given key and returns it.
137
+// If the key does not exist then the next key is used.
138
+func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
139
+	_assert(c.bucket.tx.db != nil, "tx closed")
140
+
141
+	// Start from root page/node and traverse to correct page.
142
+	c.stack = c.stack[:0]
143
+	c.search(seek, c.bucket.root)
144
+	ref := &c.stack[len(c.stack)-1]
145
+
146
+	// If the cursor is pointing to the end of page/node then return nil.
147
+	if ref.index >= ref.count() {
148
+		return nil, nil, 0
149
+	}
150
+
151
+	// If this is a bucket then return a nil value.
152
+	return c.keyValue()
153
+}
154
+
155
+// first moves the cursor to the first leaf element under the last page in the stack.
156
+func (c *Cursor) first() {
157
+	for {
158
+		// Exit when we hit a leaf page.
159
+		var ref = &c.stack[len(c.stack)-1]
160
+		if ref.isLeaf() {
161
+			break
162
+		}
163
+
164
+		// Keep adding pages pointing to the first element to the stack.
165
+		var pgid pgid
166
+		if ref.node != nil {
167
+			pgid = ref.node.inodes[ref.index].pgid
168
+		} else {
169
+			pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
170
+		}
171
+		p, n := c.bucket.pageNode(pgid)
172
+		c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
173
+	}
174
+}
175
+
176
+// last moves the cursor to the last leaf element under the last page in the stack.
177
+func (c *Cursor) last() {
178
+	for {
179
+		// Exit when we hit a leaf page.
180
+		ref := &c.stack[len(c.stack)-1]
181
+		if ref.isLeaf() {
182
+			break
183
+		}
184
+
185
+		// Keep adding pages pointing to the last element in the stack.
186
+		var pgid pgid
187
+		if ref.node != nil {
188
+			pgid = ref.node.inodes[ref.index].pgid
189
+		} else {
190
+			pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
191
+		}
192
+		p, n := c.bucket.pageNode(pgid)
193
+
194
+		var nextRef = elemRef{page: p, node: n}
195
+		nextRef.index = nextRef.count() - 1
196
+		c.stack = append(c.stack, nextRef)
197
+	}
198
+}
199
+
200
+// next moves to the next leaf element and returns the key and value.
201
+// If the cursor is at the last leaf element then it stays there and returns nil.
202
+func (c *Cursor) next() (key []byte, value []byte, flags uint32) {
203
+	// Attempt to move over one element until we're successful.
204
+	// Move up the stack as we hit the end of each page in our stack.
205
+	var i int
206
+	for i = len(c.stack) - 1; i >= 0; i-- {
207
+		elem := &c.stack[i]
208
+		if elem.index < elem.count()-1 {
209
+			elem.index++
210
+			break
211
+		}
212
+	}
213
+
214
+	// If we've hit the root page then stop and return. This will leave the
215
+	// cursor on the last element of the last page.
216
+	if i == -1 {
217
+		return nil, nil, 0
218
+	}
219
+
220
+	// Otherwise start from where we left off in the stack and find the
221
+	// first element of the first leaf page.
222
+	c.stack = c.stack[:i+1]
223
+	c.first()
224
+	return c.keyValue()
225
+}
226
+
227
+// search recursively performs a binary search against a given page/node until it finds a given key.
228
+func (c *Cursor) search(key []byte, pgid pgid) {
229
+	p, n := c.bucket.pageNode(pgid)
230
+	if p != nil {
231
+		_assert((p.flags&(branchPageFlag|leafPageFlag)) != 0, "invalid page type: %d: %x", p.id, p.flags)
232
+	}
233
+	e := elemRef{page: p, node: n}
234
+	c.stack = append(c.stack, e)
235
+
236
+	// If we're on a leaf page/node then find the specific node.
237
+	if e.isLeaf() {
238
+		c.nsearch(key)
239
+		return
240
+	}
241
+
242
+	if n != nil {
243
+		c.searchNode(key, n)
244
+		return
245
+	}
246
+	c.searchPage(key, p)
247
+}
248
+
249
+func (c *Cursor) searchNode(key []byte, n *node) {
250
+	var exact bool
251
+	index := sort.Search(len(n.inodes), func(i int) bool {
252
+		// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
253
+		// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
254
+		ret := bytes.Compare(n.inodes[i].key, key)
255
+		if ret == 0 {
256
+			exact = true
257
+		}
258
+		return ret != -1
259
+	})
260
+	if !exact && index > 0 {
261
+		index--
262
+	}
263
+	c.stack[len(c.stack)-1].index = index
264
+
265
+	// Recursively search to the next page.
266
+	c.search(key, n.inodes[index].pgid)
267
+}
268
+
269
+func (c *Cursor) searchPage(key []byte, p *page) {
270
+	// Binary search for the correct range.
271
+	inodes := p.branchPageElements()
272
+
273
+	var exact bool
274
+	index := sort.Search(int(p.count), func(i int) bool {
275
+		// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
276
+		// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
277
+		ret := bytes.Compare(inodes[i].key(), key)
278
+		if ret == 0 {
279
+			exact = true
280
+		}
281
+		return ret != -1
282
+	})
283
+	if !exact && index > 0 {
284
+		index--
285
+	}
286
+	c.stack[len(c.stack)-1].index = index
287
+
288
+	// Recursively search to the next page.
289
+	c.search(key, inodes[index].pgid)
290
+}
291
+
292
+// nsearch searches the leaf node on the top of the stack for a key.
293
+func (c *Cursor) nsearch(key []byte) {
294
+	e := &c.stack[len(c.stack)-1]
295
+	p, n := e.page, e.node
296
+
297
+	// If we have a node then search its inodes.
298
+	if n != nil {
299
+		index := sort.Search(len(n.inodes), func(i int) bool {
300
+			return bytes.Compare(n.inodes[i].key, key) != -1
301
+		})
302
+		e.index = index
303
+		return
304
+	}
305
+
306
+	// If we have a page then search its leaf elements.
307
+	inodes := p.leafPageElements()
308
+	index := sort.Search(int(p.count), func(i int) bool {
309
+		return bytes.Compare(inodes[i].key(), key) != -1
310
+	})
311
+	e.index = index
312
+}
313
+
314
+// keyValue returns the key and value of the current leaf element.
315
+func (c *Cursor) keyValue() ([]byte, []byte, uint32) {
316
+	ref := &c.stack[len(c.stack)-1]
317
+	if ref.count() == 0 || ref.index >= ref.count() {
318
+		return nil, nil, 0
319
+	}
320
+
321
+	// Retrieve value from node.
322
+	if ref.node != nil {
323
+		inode := &ref.node.inodes[ref.index]
324
+		return inode.key, inode.value, inode.flags
325
+	}
326
+
327
+	// Or retrieve value from page.
328
+	elem := ref.page.leafPageElement(uint16(ref.index))
329
+	return elem.key(), elem.value(), elem.flags
330
+}
331
+
332
+// node returns the node that the cursor is currently positioned on.
333
+func (c *Cursor) node() *node {
334
+	_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")
335
+
336
+	// If the top of the stack is a leaf node then just return it.
337
+	if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() {
338
+		return ref.node
339
+	}
340
+
341
+	// Start from root and traverse down the hierarchy.
342
+	var n = c.stack[0].node
343
+	if n == nil {
344
+		n = c.bucket.node(c.stack[0].page.id, nil)
345
+	}
346
+	for _, ref := range c.stack[:len(c.stack)-1] {
347
+		_assert(!n.isLeaf, "expected branch node")
348
+		n = n.childAt(int(ref.index))
349
+	}
350
+	_assert(n.isLeaf, "expected leaf node")
351
+	return n
352
+}
353
+
354
+// elemRef represents a reference to an element on a given page/node.
355
+type elemRef struct {
356
+	page  *page // page holding the element; consulted when node is nil
357
+	node  *node // in-memory node holding the element; takes precedence over page when non-nil
358
+	index int // position of the element within the page/node
359
+}
360
+
361
+// isLeaf returns whether the ref is pointing at a leaf page/node.
362
+func (r *elemRef) isLeaf() bool {
363
+	if r.node != nil {
364
+		return r.node.isLeaf
365
+	}
366
+	return (r.page.flags & leafPageFlag) != 0
367
+}
368
+
369
+// count returns the number of inodes or page elements.
370
+func (r *elemRef) count() int {
371
+	if r.node != nil {
372
+		return len(r.node.inodes)
373
+	}
374
+	return int(r.page.count)
375
+}
0 376
new file mode 100644
... ...
@@ -0,0 +1,689 @@
0
+package bolt
1
+
2
+import (
3
+	"fmt"
4
+	"hash/fnv"
5
+	"os"
6
+	"runtime"
7
+	"runtime/debug"
8
+	"strings"
9
+	"sync"
10
+	"time"
11
+	"unsafe"
12
+)
13
+
14
// minMmapSize is the smallest size that the mmap can be.
const minMmapSize = 1 << 22 // 4MB

// maxMmapStep is the largest step that can be taken when remapping the mmap.
const maxMmapStep = 1 << 30 // 1GB

// version is the data file format version.
const version = 2

// magic is a marker value that indicates that a file is a Bolt DB.
const magic uint32 = 0xED0CDAED

// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file.  This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"
31
+
32
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// Begin (and therefore Update and View) returns ErrDatabaseNotOpen once the
// DB has been closed.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored.  See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	path     string            // path passed to Open; cleared by close
	file     *os.File          // data file handle opened by Open
	dataref  []byte            // NOTE(review): presumably the raw mapping kept for unmapping; set outside this file — confirm
	data     *[maxMapSize]byte // mapped file contents; page() indexes into it
	datasz   int               // allocate() remaps once the required size reaches this value
	meta0    *meta             // meta stored on page 0, refreshed by mmap()
	meta1    *meta             // meta stored on page 1, refreshed by mmap()
	pageSize int               // page size in bytes: OS page size for new files, else read from meta page 0
	opened   bool              // set by Open, cleared by close
	rwtx     *Tx               // read-write transaction most recently started by beginRWTx
	txs      []*Tx             // read-only transactions that are currently open
	freelist *freelist         // loaded from the freelist page in Open; dropped by close
	stats    Stats             // guarded by statlock

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds replaceable I/O hooks; Open points writeAt at file.WriteAt.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}
}
77
+
78
// Path returns the path to the currently open database file.
// The path is cleared when the database is closed.
func (db *DB) Path() string {
	return db.path
}
82
+
83
// GoString returns the Go string representation of the database, which
// consists of the type name and the quoted file path.
func (db *DB) GoString() string {
	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
}
87
+
88
// String returns the string representation of the database in the form
// DB<"path">.
func (db *DB) String() string {
	return fmt.Sprintf("DB<%q>", db.path)
}
92
+
93
+// Open creates and opens a database at the given path.
94
+// If the file does not exist then it will be created automatically.
95
+// Passing in nil options will cause Bolt to open the database with the default options.
96
+func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
97
+	var db = &DB{opened: true}
98
+
99
+	// Set default options if no options are provided.
100
+	if options == nil {
101
+		options = DefaultOptions
102
+	}
103
+
104
+	// Open data file and separate sync handler for metadata writes.
105
+	db.path = path
106
+
107
+	var err error
108
+	if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil {
109
+		_ = db.close()
110
+		return nil, err
111
+	}
112
+
113
+	// Lock file so that other processes using Bolt cannot use the database
114
+	// at the same time. This would cause corruption since the two processes
115
+	// would write meta pages and free pages separately.
116
+	if err := flock(db.file, options.Timeout); err != nil {
117
+		_ = db.close()
118
+		return nil, err
119
+	}
120
+
121
+	// Default values for test hooks
122
+	db.ops.writeAt = db.file.WriteAt
123
+
124
+	// Initialize the database if it doesn't exist.
125
+	if info, err := db.file.Stat(); err != nil {
126
+		return nil, fmt.Errorf("stat error: %s", err)
127
+	} else if info.Size() == 0 {
128
+		// Initialize new files with meta pages.
129
+		if err := db.init(); err != nil {
130
+			return nil, err
131
+		}
132
+	} else {
133
+		// Read the first meta page to determine the page size.
134
+		var buf [0x1000]byte
135
+		if _, err := db.file.ReadAt(buf[:], 0); err == nil {
136
+			m := db.pageInBuffer(buf[:], 0).meta()
137
+			if err := m.validate(); err != nil {
138
+				return nil, fmt.Errorf("meta0 error: %s", err)
139
+			}
140
+			db.pageSize = int(m.pageSize)
141
+		}
142
+	}
143
+
144
+	// Memory map the data file.
145
+	if err := db.mmap(0); err != nil {
146
+		_ = db.close()
147
+		return nil, err
148
+	}
149
+
150
+	// Read in the freelist.
151
+	db.freelist = newFreelist()
152
+	db.freelist.read(db.page(db.meta().freelist))
153
+
154
+	// Mark the database as opened and return.
155
+	return db, nil
156
+}
157
+
158
+// mmap opens the underlying memory-mapped file and initializes the meta references.
159
+// minsz is the minimum size that the new mmap can be.
160
+func (db *DB) mmap(minsz int) error {
161
+	db.mmaplock.Lock()
162
+	defer db.mmaplock.Unlock()
163
+
164
+	// Dereference all mmap references before unmapping.
165
+	if db.rwtx != nil {
166
+		db.rwtx.root.dereference()
167
+	}
168
+
169
+	// Unmap existing data before continuing.
170
+	if err := db.munmap(); err != nil {
171
+		return err
172
+	}
173
+
174
+	info, err := db.file.Stat()
175
+	if err != nil {
176
+		return fmt.Errorf("mmap stat error: %s", err)
177
+	} else if int(info.Size()) < db.pageSize*2 {
178
+		return fmt.Errorf("file size too small")
179
+	}
180
+
181
+	// Ensure the size is at least the minimum size.
182
+	var size = int(info.Size())
183
+	if size < minsz {
184
+		size = minsz
185
+	}
186
+	size = db.mmapSize(size)
187
+
188
+	// Memory-map the data file as a byte slice.
189
+	if err := mmap(db, size); err != nil {
190
+		return err
191
+	}
192
+
193
+	// Save references to the meta pages.
194
+	db.meta0 = db.page(0).meta()
195
+	db.meta1 = db.page(1).meta()
196
+
197
+	// Validate the meta pages.
198
+	if err := db.meta0.validate(); err != nil {
199
+		return fmt.Errorf("meta0 error: %s", err)
200
+	}
201
+	if err := db.meta1.validate(); err != nil {
202
+		return fmt.Errorf("meta1 error: %s", err)
203
+	}
204
+
205
+	return nil
206
+}
207
+
208
+// munmap unmaps the data file from memory.
209
+func (db *DB) munmap() error {
210
+	if err := munmap(db); err != nil {
211
+		return fmt.Errorf("unmap error: " + err.Error())
212
+	}
213
+	return nil
214
+}
215
+
216
+// mmapSize determines the appropriate size for the mmap given the current size
217
+// of the database. The minimum size is 4MB and doubles until it reaches 1GB.
218
+func (db *DB) mmapSize(size int) int {
219
+	if size <= minMmapSize {
220
+		return minMmapSize
221
+	} else if size < maxMmapStep {
222
+		size *= 2
223
+	} else {
224
+		size += maxMmapStep
225
+	}
226
+
227
+	// Ensure that the mmap size is a multiple of the page size.
228
+	if (size % db.pageSize) != 0 {
229
+		size = ((size / db.pageSize) + 1) * db.pageSize
230
+	}
231
+
232
+	return size
233
+}
234
+
235
+// init creates a new database file and initializes its meta pages.
236
+func (db *DB) init() error {
237
+	// Set the page size to the OS page size.
238
+	db.pageSize = os.Getpagesize()
239
+
240
+	// Create two meta pages on a buffer.
241
+	buf := make([]byte, db.pageSize*4)
242
+	for i := 0; i < 2; i++ {
243
+		p := db.pageInBuffer(buf[:], pgid(i))
244
+		p.id = pgid(i)
245
+		p.flags = metaPageFlag
246
+
247
+		// Initialize the meta page.
248
+		m := p.meta()
249
+		m.magic = magic
250
+		m.version = version
251
+		m.pageSize = uint32(db.pageSize)
252
+		m.version = version
253
+		m.freelist = 2
254
+		m.root = bucket{root: 3}
255
+		m.pgid = 4
256
+		m.txid = txid(i)
257
+	}
258
+
259
+	// Write an empty freelist at page 3.
260
+	p := db.pageInBuffer(buf[:], pgid(2))
261
+	p.id = pgid(2)
262
+	p.flags = freelistPageFlag
263
+	p.count = 0
264
+
265
+	// Write an empty leaf page at page 4.
266
+	p = db.pageInBuffer(buf[:], pgid(3))
267
+	p.id = pgid(3)
268
+	p.flags = leafPageFlag
269
+	p.count = 0
270
+
271
+	// Write the buffer to our data file.
272
+	if _, err := db.ops.writeAt(buf, 0); err != nil {
273
+		return err
274
+	}
275
+	if err := fdatasync(db); err != nil {
276
+		return err
277
+	}
278
+
279
+	return nil
280
+}
281
+
282
// Close releases all database resources.
// All transactions must be closed before closing the database.
// The meta lock is held while the internal close runs.
func (db *DB) Close() error {
	db.metalock.Lock()
	defer db.metalock.Unlock()
	return db.close()
}
289
+
290
+func (db *DB) close() error {
291
+	db.opened = false
292
+
293
+	db.freelist = nil
294
+	db.path = ""
295
+
296
+	// Clear ops.
297
+	db.ops.writeAt = nil
298
+
299
+	// Close the mmap.
300
+	if err := db.munmap(); err != nil {
301
+		return err
302
+	}
303
+
304
+	// Close file handles.
305
+	if db.file != nil {
306
+		// Unlock the file.
307
+		_ = funlock(db.file)
308
+
309
+		// Close the file descriptor.
310
+		if err := db.file.Close(); err != nil {
311
+			return fmt.Errorf("db file close: %s", err)
312
+		}
313
+		db.file = nil
314
+	}
315
+
316
+	return nil
317
+}
318
+
319
+// Begin starts a new transaction.
320
+// Multiple read-only transactions can be used concurrently but only one
321
+// write transaction can be used at a time. Starting multiple write transactions
322
+// will cause the calls to block and be serialized until the current write
323
+// transaction finishes.
324
+//
325
+// IMPORTANT: You must close read-only transactions after you are finished or
326
+// else the database will not reclaim old pages.
327
+func (db *DB) Begin(writable bool) (*Tx, error) {
328
+	if writable {
329
+		return db.beginRWTx()
330
+	}
331
+	return db.beginTx()
332
+}
333
+
334
+func (db *DB) beginTx() (*Tx, error) {
335
+	// Lock the meta pages while we initialize the transaction. We obtain
336
+	// the meta lock before the mmap lock because that's the order that the
337
+	// write transaction will obtain them.
338
+	db.metalock.Lock()
339
+
340
+	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
341
+	// obtain a write lock so all transactions must finish before it can be
342
+	// remapped.
343
+	db.mmaplock.RLock()
344
+
345
+	// Exit if the database is not open yet.
346
+	if !db.opened {
347
+		db.mmaplock.RUnlock()
348
+		db.metalock.Unlock()
349
+		return nil, ErrDatabaseNotOpen
350
+	}
351
+
352
+	// Create a transaction associated with the database.
353
+	t := &Tx{}
354
+	t.init(db)
355
+
356
+	// Keep track of transaction until it closes.
357
+	db.txs = append(db.txs, t)
358
+	n := len(db.txs)
359
+
360
+	// Unlock the meta pages.
361
+	db.metalock.Unlock()
362
+
363
+	// Update the transaction stats.
364
+	db.statlock.Lock()
365
+	db.stats.TxN++
366
+	db.stats.OpenTxN = n
367
+	db.statlock.Unlock()
368
+
369
+	return t, nil
370
+}
371
+
372
+func (db *DB) beginRWTx() (*Tx, error) {
373
+	// Obtain writer lock. This is released by the transaction when it closes.
374
+	// This enforces only one writer transaction at a time.
375
+	db.rwlock.Lock()
376
+
377
+	// Once we have the writer lock then we can lock the meta pages so that
378
+	// we can set up the transaction.
379
+	db.metalock.Lock()
380
+	defer db.metalock.Unlock()
381
+
382
+	// Exit if the database is not open yet.
383
+	if !db.opened {
384
+		db.rwlock.Unlock()
385
+		return nil, ErrDatabaseNotOpen
386
+	}
387
+
388
+	// Create a transaction associated with the database.
389
+	t := &Tx{writable: true}
390
+	t.init(db)
391
+	db.rwtx = t
392
+
393
+	// Free any pages associated with closed read-only transactions.
394
+	var minid txid = 0xFFFFFFFFFFFFFFFF
395
+	for _, t := range db.txs {
396
+		if t.meta.txid < minid {
397
+			minid = t.meta.txid
398
+		}
399
+	}
400
+	if minid > 0 {
401
+		db.freelist.release(minid - 1)
402
+	}
403
+
404
+	return t, nil
405
+}
406
+
407
+// removeTx removes a transaction from the database.
408
+func (db *DB) removeTx(tx *Tx) {
409
+	// Release the read lock on the mmap.
410
+	db.mmaplock.RUnlock()
411
+
412
+	// Use the meta lock to restrict access to the DB object.
413
+	db.metalock.Lock()
414
+
415
+	// Remove the transaction.
416
+	for i, t := range db.txs {
417
+		if t == tx {
418
+			db.txs = append(db.txs[:i], db.txs[i+1:]...)
419
+			break
420
+		}
421
+	}
422
+	n := len(db.txs)
423
+
424
+	// Unlock the meta pages.
425
+	db.metalock.Unlock()
426
+
427
+	// Merge statistics.
428
+	db.statlock.Lock()
429
+	db.stats.OpenTxN = n
430
+	db.stats.TxStats.add(&tx.stats)
431
+	db.statlock.Unlock()
432
+}
433
+
434
+// Update executes a function within the context of a read-write managed transaction.
435
+// If no error is returned from the function then the transaction is committed.
436
+// If an error is returned then the entire transaction is rolled back.
437
+// Any error that is returned from the function or returned from the commit is
438
+// returned from the Update() method.
439
+//
440
+// Attempting to manually commit or rollback within the function will cause a panic.
441
+func (db *DB) Update(fn func(*Tx) error) error {
442
+	t, err := db.Begin(true)
443
+	if err != nil {
444
+		return err
445
+	}
446
+
447
+	// Make sure the transaction rolls back in the event of a panic.
448
+	defer func() {
449
+		if t.db != nil {
450
+			t.rollback()
451
+		}
452
+	}()
453
+
454
+	// Mark as a managed tx so that the inner function cannot manually commit.
455
+	t.managed = true
456
+
457
+	// If an error is returned from the function then rollback and return error.
458
+	err = fn(t)
459
+	t.managed = false
460
+	if err != nil {
461
+		_ = t.Rollback()
462
+		return err
463
+	}
464
+
465
+	return t.Commit()
466
+}
467
+
468
+// View executes a function within the context of a managed read-only transaction.
469
+// Any error that is returned from the function is returned from the View() method.
470
+//
471
+// Attempting to manually rollback within the function will cause a panic.
472
+func (db *DB) View(fn func(*Tx) error) error {
473
+	t, err := db.Begin(false)
474
+	if err != nil {
475
+		return err
476
+	}
477
+
478
+	// Make sure the transaction rolls back in the event of a panic.
479
+	defer func() {
480
+		if t.db != nil {
481
+			t.rollback()
482
+		}
483
+	}()
484
+
485
+	// Mark as a managed tx so that the inner function cannot manually rollback.
486
+	t.managed = true
487
+
488
+	// If an error is returned from the function then pass it through.
489
+	err = fn(t)
490
+	t.managed = false
491
+	if err != nil {
492
+		_ = t.Rollback()
493
+		return err
494
+	}
495
+
496
+	if err := t.Rollback(); err != nil {
497
+		return err
498
+	}
499
+
500
+	return nil
501
+}
502
+
503
// Stats retrieves ongoing performance stats for the database.
// This is only updated when a transaction closes.
// A copy is returned, taken while holding the stats read lock.
func (db *DB) Stats() Stats {
	db.statlock.RLock()
	defer db.statlock.RUnlock()
	return db.stats
}
510
+
511
// Info returns the address of the first byte of the mapped data together
// with the page size.
// This is for internal access to the raw data bytes from the C cursor, use
// carefully, or not at all.
func (db *DB) Info() *Info {
	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
}
516
+
517
+// page retrieves a page reference from the mmap based on the current page size.
518
+func (db *DB) page(id pgid) *page {
519
+	pos := id * pgid(db.pageSize)
520
+	return (*page)(unsafe.Pointer(&db.data[pos]))
521
+}
522
+
523
+// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
524
+func (db *DB) pageInBuffer(b []byte, id pgid) *page {
525
+	return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
526
+}
527
+
528
+// meta retrieves the current meta page reference.
529
+func (db *DB) meta() *meta {
530
+	if db.meta0.txid > db.meta1.txid {
531
+		return db.meta0
532
+	}
533
+	return db.meta1
534
+}
535
+
536
+// allocate returns a contiguous block of memory starting at a given page.
537
+func (db *DB) allocate(count int) (*page, error) {
538
+	// Allocate a temporary buffer for the page.
539
+	buf := make([]byte, count*db.pageSize)
540
+	p := (*page)(unsafe.Pointer(&buf[0]))
541
+	p.overflow = uint32(count - 1)
542
+
543
+	// Use pages from the freelist if they are available.
544
+	if p.id = db.freelist.allocate(count); p.id != 0 {
545
+		return p, nil
546
+	}
547
+
548
+	// Resize mmap() if we're at the end.
549
+	p.id = db.rwtx.meta.pgid
550
+	var minsz = int((p.id+pgid(count))+1) * db.pageSize
551
+	if minsz >= db.datasz {
552
+		if err := db.mmap(minsz); err != nil {
553
+			return nil, fmt.Errorf("mmap allocate error: %s", err)
554
+		}
555
+	}
556
+
557
+	// Move the page id high water mark.
558
+	db.rwtx.meta.pgid += pgid(count)
559
+
560
+	return p, nil
561
+}
562
+
563
// Options represents the options that can be set when opening a database.
// A nil *Options passed to Open is replaced by DefaultOptions.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely. This option is only
	// available on Darwin and Linux.
	Timeout time.Duration
}
570
+
571
// DefaultOptions represents the options used if nil options are passed into Open().
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
var DefaultOptions = &Options{
	Timeout: 0,
}
576
+
577
// Stats represents statistics about the database.
// The transaction counters are maintained by beginTx and removeTx.
type Stats struct {
	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions

	TxStats TxStats // global, ongoing stats.
}
591
+
592
+// Sub calculates and returns the difference between two sets of database stats.
593
+// This is useful when obtaining stats at two different points and time and
594
+// you need the performance counters that occurred within that time span.
595
+func (s *Stats) Sub(other *Stats) Stats {
596
+	if other == nil {
597
+		return *s
598
+	}
599
+	var diff Stats
600
+	diff.FreePageN = s.FreePageN
601
+	diff.PendingPageN = s.PendingPageN
602
+	diff.FreeAlloc = s.FreeAlloc
603
+	diff.FreelistInuse = s.FreelistInuse
604
+	diff.TxN = other.TxN - s.TxN
605
+	diff.TxStats = s.TxStats.Sub(&other.TxStats)
606
+	return diff
607
+}
608
+
609
// add merges the transaction statistics of other into s.
// Only TxStats is accumulated; the remaining fields are left untouched.
func (s *Stats) add(other *Stats) {
	s.TxStats.add(&other.TxStats)
}
612
+
613
// Info carries the raw values returned by DB.Info.
type Info struct {
	Data     uintptr // address of the first byte of the mapped data
	PageSize int     // database page size in bytes
}
617
+
618
// meta is the content of a meta page.
// Field order matters: sum64 hashes the raw bytes that precede checksum.
type meta struct {
	magic    uint32 // must equal the package magic constant
	version  uint32 // must equal the package version constant
	pageSize uint32 // page size in bytes the file was created with
	flags    uint32
	root     bucket // root bucket header
	freelist pgid   // page id of the freelist page
	pgid     pgid   // high water mark: the next page id to allocate past the end
	txid     txid   // id of the transaction that wrote this meta
	checksum uint64 // FNV-1a hash of the preceding fields; zero skips verification in validate
}
629
+
630
+// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
631
+func (m *meta) validate() error {
632
+	if m.checksum != 0 && m.checksum != m.sum64() {
633
+		return ErrChecksum
634
+	} else if m.magic != magic {
635
+		return ErrInvalid
636
+	} else if m.version != version {
637
+		return ErrVersionMismatch
638
+	}
639
+	return nil
640
+}
641
+
642
// copy copies one meta object to another.
// All of dest is overwritten with the contents of m.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
646
+
647
+// write writes the meta onto a page.
648
+func (m *meta) write(p *page) {
649
+
650
+	_assert(m.root.root < m.pgid, "root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)
651
+	_assert(m.freelist < m.pgid, "freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)
652
+
653
+	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
654
+	p.id = pgid(m.txid % 2)
655
+	p.flags |= metaPageFlag
656
+
657
+	// Calculate the checksum.
658
+	m.checksum = m.sum64()
659
+
660
+	m.copy(p.meta())
661
+}
662
+
663
+// generates the checksum for the meta.
664
+func (m *meta) sum64() uint64 {
665
+	var h = fnv.New64a()
666
+	_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
667
+	return h.Sum64()
668
+}
669
+
670
// _assert panics with "assertion failed: " followed by the formatted message
// when condition is false, and does nothing otherwise.
func _assert(condition bool, msg string, v ...interface{}) {
	if condition {
		return
	}
	panic(fmt.Sprintf("assertion failed: "+msg, v...))
}
676
+
677
// warn prints its arguments to standard error, followed by a newline.
func warn(v ...interface{}) {
	fmt.Fprintln(os.Stderr, v...)
}
680
+
681
// warnf prints a formatted message to standard error, followed by a newline.
func warnf(msg string, v ...interface{}) {
	fmt.Fprintf(os.Stderr, msg+"\n", v...)
}
684
+
685
// printstack writes the current goroutine's stack trace to standard error,
// omitting the first two lines of the trace.
func printstack() {
	lines := strings.Split(string(debug.Stack()), "\n")
	trimmed := strings.Join(lines[2:], "\n")
	fmt.Fprintln(os.Stderr, trimmed)
}
0 689
new file mode 100644
... ...
@@ -0,0 +1,44 @@
0
+/*
1
+Package bolt implements a low-level key/value store in pure Go. It supports
2
+fully serializable transactions, ACID semantics, and lock-free MVCC with
3
+multiple readers and a single writer. Bolt can be used for projects that
4
+want a simple data store without the need to add large dependencies such as
5
+Postgres or MySQL.
6
+
7
+Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is
8
+optimized for fast read access and does not require recovery in the event of a
9
+system crash. Transactions which have not finished committing will simply be
10
+rolled back in the event of a crash.
11
+
12
+The design of Bolt is based on Howard Chu's LMDB database project.
13
+
14
+Bolt currently works on Windows, Mac OS X, and Linux.
15
+
16
+
17
+Basics
18
+
19
+There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is
20
+a collection of buckets and is represented by a single file on disk. A bucket is
21
+a collection of unique keys that are associated with values.
22
+
23
+Transactions provide either read-only or read-write access to the database.
24
+Read-only transactions can retrieve key/value pairs and can use Cursors to
25
+iterate over the dataset sequentially. Read-write transactions can create and
26
+delete buckets and can insert and remove keys. Only one read-write transaction
27
+is allowed at a time.
28
+
29
+
30
+Caveats
31
+
32
+The database uses a read-only, memory-mapped data file to ensure that
33
+applications cannot corrupt the database; however, this means that keys and
34
+values returned from Bolt cannot be changed. Writing to a read-only byte slice
35
+will cause Go to panic.
36
+
37
+Keys and values retrieved from the database are only valid for the life of
38
+the transaction. When used outside the transaction, these byte slices can
39
+point to different data or can point to invalid memory which will cause a panic.
40
+
41
+
42
+*/
43
+package bolt
0 44
new file mode 100644
... ...
@@ -0,0 +1,66 @@
0
+package bolt
1
+
2
+import "errors"
3
+
4
// These errors can be returned when opening or calling methods on a DB.
var (
	// ErrDatabaseNotOpen is returned when a DB instance is accessed before it
	// is opened or after it is closed.
	ErrDatabaseNotOpen = errors.New("database not open")

	// ErrDatabaseOpen is returned when opening a database that is
	// already open.
	ErrDatabaseOpen = errors.New("database already open")

	// ErrInvalid is returned when a data file is not a Bolt-formatted
	// database, i.e. its magic marker does not match.
	ErrInvalid = errors.New("invalid database")

	// ErrVersionMismatch is returned when the data file was created with a
	// different version of Bolt.
	ErrVersionMismatch = errors.New("version mismatch")

	// ErrChecksum is returned when either meta page checksum does not match.
	ErrChecksum = errors.New("checksum error")

	// ErrTimeout is returned when a database cannot obtain an exclusive lock
	// on the data file after the timeout passed to Open().
	ErrTimeout = errors.New("timeout")
)
28
+
29
// These errors can occur when beginning or committing a Tx.
var (
	// ErrTxNotWritable is returned when performing a write operation on a
	// read-only transaction.
	ErrTxNotWritable = errors.New("tx not writable")

	// ErrTxClosed is returned when committing or rolling back a transaction
	// that has already been committed or rolled back.
	ErrTxClosed = errors.New("tx closed")
)
39
+
40
// These errors can occur when putting or deleting a value or a bucket.
var (
	// ErrBucketNotFound is returned when trying to access a bucket that has
	// not been created yet.
	ErrBucketNotFound = errors.New("bucket not found")

	// ErrBucketExists is returned when creating a bucket that already exists.
	ErrBucketExists = errors.New("bucket already exists")

	// ErrBucketNameRequired is returned when creating a bucket with a blank name.
	ErrBucketNameRequired = errors.New("bucket name required")

	// ErrKeyRequired is returned when inserting a zero-length key.
	ErrKeyRequired = errors.New("key required")

	// ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize.
	ErrKeyTooLarge = errors.New("key too large")

	// ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize.
	ErrValueTooLarge = errors.New("value too large")

	// ErrIncompatibleValue is returned when trying to create or delete a
	// bucket on an existing non-bucket key or when trying to create or delete
	// a non-bucket key on an existing bucket key.
	ErrIncompatibleValue = errors.New("incompatible value")
)
0 66
new file mode 100644
... ...
@@ -0,0 +1,234 @@
0
+package bolt
1
+
2
+import (
3
+	"sort"
4
+	"unsafe"
5
+)
6
+
7
// freelist represents a list of all pages that are available for allocation.
// It also tracks pages that have been freed but are still in use by open transactions.
type freelist struct {
	ids     []pgid          // all free and available free page ids; kept sorted by read() and release().
	pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
	cache   map[pgid]bool   // fast lookup of all free and pending page ids.
}
14
+
15
+// newFreelist returns an empty, initialized freelist.
16
+func newFreelist() *freelist {
17
+	return &freelist{
18
+		pending: make(map[txid][]pgid),
19
+		cache:   make(map[pgid]bool),
20
+	}
21
+}
22
+
23
+// size returns the size of the page after serialization.
24
+func (f *freelist) size() int {
25
+	return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * f.count())
26
+}
27
+
28
// count returns the total number of pages on the freelist, i.e. the free
// pages plus the pending pages.
func (f *freelist) count() int {
	return f.free_count() + f.pending_count()
}
32
+
33
// free_count returns the number of free pages, i.e. the length of the list
// of available ids.
func (f *freelist) free_count() int {
	return len(f.ids)
}
37
+
38
+// pending_count returns count of pending pages
39
+func (f *freelist) pending_count() int {
40
+	var count int
41
+	for _, list := range f.pending {
42
+		count += len(list)
43
+	}
44
+	return count
45
+}
46
+
47
+// all returns a list of all free ids and all pending ids in one sorted list.
48
+func (f *freelist) all() []pgid {
49
+	ids := make([]pgid, len(f.ids))
50
+	copy(ids, f.ids)
51
+
52
+	for _, list := range f.pending {
53
+		ids = append(ids, list...)
54
+	}
55
+
56
+	sort.Sort(pgids(ids))
57
+	return ids
58
+}
59
+
60
+// allocate returns the starting page id of a contiguous list of pages of a given size.
61
+// If a contiguous block cannot be found then 0 is returned.
62
+func (f *freelist) allocate(n int) pgid {
63
+	if len(f.ids) == 0 {
64
+		return 0
65
+	}
66
+
67
+	var initial, previd pgid
68
+	for i, id := range f.ids {
69
+		_assert(id > 1, "invalid page allocation: %d", id)
70
+
71
+		// Reset initial page if this is not contiguous.
72
+		if previd == 0 || id-previd != 1 {
73
+			initial = id
74
+		}
75
+
76
+		// If we found a contiguous block then remove it and return it.
77
+		if (id-initial)+1 == pgid(n) {
78
+			// If we're allocating off the beginning then take the fast path
79
+			// and just adjust the existing slice. This will use extra memory
80
+			// temporarily but the append() in free() will realloc the slice
81
+			// as is necessary.
82
+			if (i + 1) == n {
83
+				f.ids = f.ids[i+1:]
84
+			} else {
85
+				copy(f.ids[i-n+1:], f.ids[i+1:])
86
+				f.ids = f.ids[:len(f.ids)-n]
87
+			}
88
+
89
+			// Remove from the free cache.
90
+			for i := pgid(0); i < pgid(n); i++ {
91
+				delete(f.cache, initial+i)
92
+			}
93
+
94
+			return initial
95
+		}
96
+
97
+		previd = id
98
+	}
99
+	return 0
100
+}
101
+
102
+// free releases a page and its overflow for a given transaction id.
103
+// If the page is already free then a panic will occur.
104
+func (f *freelist) free(txid txid, p *page) {
105
+	_assert(p.id > 1, "cannot free page 0 or 1: %d", p.id)
106
+
107
+	// Free page and all its overflow pages.
108
+	var ids = f.pending[txid]
109
+	for id := p.id; id <= p.id+pgid(p.overflow); id++ {
110
+		// Verify that page is not already free.
111
+		_assert(!f.cache[id], "page %d already freed", id)
112
+
113
+		// Add to the freelist and cache.
114
+		ids = append(ids, id)
115
+		f.cache[id] = true
116
+	}
117
+	f.pending[txid] = ids
118
+}
119
+
120
+// release moves all page ids for a transaction id (or older) to the freelist.
121
+func (f *freelist) release(txid txid) {
122
+	for tid, ids := range f.pending {
123
+		if tid <= txid {
124
+			// Move transaction's pending pages to the available freelist.
125
+			// Don't remove from the cache since the page is still free.
126
+			f.ids = append(f.ids, ids...)
127
+			delete(f.pending, tid)
128
+		}
129
+	}
130
+	sort.Sort(pgids(f.ids))
131
+}
132
+
133
+// rollback removes the pages from a given pending tx.
134
+func (f *freelist) rollback(txid txid) {
135
+	// Remove page ids from cache.
136
+	for _, id := range f.pending[txid] {
137
+		delete(f.cache, id)
138
+	}
139
+
140
+	// Remove pages from pending list.
141
+	delete(f.pending, txid)
142
+}
143
+
144
// freed returns whether a given page is in the free list.
// The cache is consulted, so pending pages count as freed too.
func (f *freelist) freed(pgid pgid) bool {
	return f.cache[pgid]
}
148
+
149
+// read initializes the freelist from a freelist page.
150
+func (f *freelist) read(p *page) {
151
+	// If the page.count is at the max uint16 value (64k) then it's considered
152
+	// an overflow and the size of the freelist is stored as the first element.
153
+	idx, count := 0, int(p.count)
154
+	if count == 0xFFFF {
155
+		idx = 1
156
+		count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
157
+	}
158
+
159
+	// Copy the list of page ids from the freelist.
160
+	ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx:count]
161
+	f.ids = make([]pgid, len(ids))
162
+	copy(f.ids, ids)
163
+
164
+	// Make sure they're sorted.
165
+	sort.Sort(pgids(f.ids))
166
+
167
+	// Rebuild the page cache.
168
+	f.reindex()
169
+}
170
+
171
+// write writes the page ids onto a freelist page. All free and pending ids are
172
+// saved to disk since in the event of a program crash, all pending ids will
173
+// become free.
174
+func (f *freelist) write(p *page) error {
175
+	// Combine the old free pgids and pgids waiting on an open transaction.
176
+	ids := f.all()
177
+
178
+	// Update the header flag.
179
+	p.flags |= freelistPageFlag
180
+
181
+	// The page.count can only hold up to 64k elements so if we overflow that
182
+	// number then we handle it by putting the size in the first element.
183
+	if len(ids) < 0xFFFF {
184
+		p.count = uint16(len(ids))
185
+		copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:], ids)
186
+	} else {
187
+		p.count = 0xFFFF
188
+		((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(len(ids))
189
+		copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:], ids)
190
+	}
191
+
192
+	return nil
193
+}
194
+
195
+// reload reads the freelist from a page and filters out pending items.
196
+func (f *freelist) reload(p *page) {
197
+	f.read(p)
198
+
199
+	// Build a cache of only pending pages.
200
+	pcache := make(map[pgid]bool)
201
+	for _, pendingIDs := range f.pending {
202
+		for _, pendingID := range pendingIDs {
203
+			pcache[pendingID] = true
204
+		}
205
+	}
206
+
207
+	// Check each page in the freelist and build a new available freelist
208
+	// with any pages not in the pending lists.
209
+	var a []pgid
210
+	for _, id := range f.ids {
211
+		if !pcache[id] {
212
+			a = append(a, id)
213
+		}
214
+	}
215
+	f.ids = a
216
+
217
+	// Once the available list is rebuilt then rebuild the free cache so that
218
+	// it includes the available and pending free pages.
219
+	f.reindex()
220
+}
221
+
222
+// reindex rebuilds the free cache based on available and pending free lists.
223
+func (f *freelist) reindex() {
224
+	f.cache = make(map[pgid]bool)
225
+	for _, id := range f.ids {
226
+		f.cache[id] = true
227
+	}
228
+	for _, pendingIDs := range f.pending {
229
+		for _, pendingID := range pendingIDs {
230
+			f.cache[pendingID] = true
231
+		}
232
+	}
233
+}
0 234
new file mode 100644
... ...
@@ -0,0 +1,616 @@
0
+package bolt
1
+
2
+import (
3
+	"bytes"
4
+	"sort"
5
+	"unsafe"
6
+)
7
+
8
+// node represents an in-memory, deserialized page.
9
+type node struct {
10
+	bucket     *Bucket
11
+	isLeaf     bool
12
+	unbalanced bool
13
+	spilled    bool
14
+	key        []byte
15
+	pgid       pgid
16
+	parent     *node
17
+	children   nodes
18
+	inodes     inodes
19
+}
20
+
21
+// root returns the top-level node this node is attached to.
22
+func (n *node) root() *node {
23
+	if n.parent == nil {
24
+		return n
25
+	}
26
+	return n.parent.root()
27
+}
28
+
29
+// minKeys returns the minimum number of inodes this node should have.
30
+func (n *node) minKeys() int {
31
+	if n.isLeaf {
32
+		return 1
33
+	}
34
+	return 2
35
+}
36
+
37
+// size returns the size of the node after serialization.
38
+func (n *node) size() int {
39
+	sz, elsz := pageHeaderSize, n.pageElementSize()
40
+	for i := 0; i < len(n.inodes); i++ {
41
+		item := &n.inodes[i]
42
+		sz += elsz + len(item.key) + len(item.value)
43
+	}
44
+	return sz
45
+}
46
+
47
+// sizeLessThan returns true if the node is less than a given size.
48
+// This is an optimization to avoid calculating a large node when we only need
49
+// to know if it fits inside a certain page size.
50
+func (n *node) sizeLessThan(v int) bool {
51
+	sz, elsz := pageHeaderSize, n.pageElementSize()
52
+	for i := 0; i < len(n.inodes); i++ {
53
+		item := &n.inodes[i]
54
+		sz += elsz + len(item.key) + len(item.value)
55
+		if sz >= v {
56
+			return false
57
+		}
58
+	}
59
+	return true
60
+}
61
+
62
+// pageElementSize returns the size of each page element based on the type of node.
63
+func (n *node) pageElementSize() int {
64
+	if n.isLeaf {
65
+		return leafPageElementSize
66
+	}
67
+	return branchPageElementSize
68
+}
69
+
70
+// childAt returns the child node at a given index.
71
+func (n *node) childAt(index int) *node {
72
+	_assert(!n.isLeaf, "invalid childAt(%d) on a leaf node", index)
73
+	return n.bucket.node(n.inodes[index].pgid, n)
74
+}
75
+
76
+// childIndex returns the index of a given child node.
77
+func (n *node) childIndex(child *node) int {
78
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
79
+	return index
80
+}
81
+
82
+// numChildren returns the number of children.
83
+func (n *node) numChildren() int {
84
+	return len(n.inodes)
85
+}
86
+
87
+// nextSibling returns the next node with the same parent.
88
+func (n *node) nextSibling() *node {
89
+	if n.parent == nil {
90
+		return nil
91
+	}
92
+	index := n.parent.childIndex(n)
93
+	if index >= n.parent.numChildren()-1 {
94
+		return nil
95
+	}
96
+	return n.parent.childAt(index + 1)
97
+}
98
+
99
+// prevSibling returns the previous node with the same parent.
100
+func (n *node) prevSibling() *node {
101
+	if n.parent == nil {
102
+		return nil
103
+	}
104
+	index := n.parent.childIndex(n)
105
+	if index == 0 {
106
+		return nil
107
+	}
108
+	return n.parent.childAt(index - 1)
109
+}
110
+
111
+// put inserts a key/value.
112
+func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
113
+	_assert(pgid < n.bucket.tx.meta.pgid, "pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)
114
+	_assert(len(oldKey) > 0, "put: zero-length old key")
115
+	_assert(len(newKey) > 0, "put: zero-length new key")
116
+
117
+	// Find insertion index.
118
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
119
+
120
+	// Add capacity and shift nodes if we don't have an exact match and need to insert.
121
+	exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
122
+	if !exact {
123
+		n.inodes = append(n.inodes, inode{})
124
+		copy(n.inodes[index+1:], n.inodes[index:])
125
+	}
126
+
127
+	inode := &n.inodes[index]
128
+	inode.flags = flags
129
+	inode.key = newKey
130
+	inode.value = value
131
+	inode.pgid = pgid
132
+	_assert(len(inode.key) > 0, "put: zero-length inode key")
133
+}
134
+
135
+// del removes a key from the node.
136
+func (n *node) del(key []byte) {
137
+	// Find index of key.
138
+	index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
139
+
140
+	// Exit if the key isn't found.
141
+	if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
142
+		return
143
+	}
144
+
145
+	// Delete inode from the node.
146
+	n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
147
+
148
+	// Mark the node as needing rebalancing.
149
+	n.unbalanced = true
150
+}
151
+
152
+// read initializes the node from a page.
153
+func (n *node) read(p *page) {
154
+	n.pgid = p.id
155
+	n.isLeaf = ((p.flags & leafPageFlag) != 0)
156
+	n.inodes = make(inodes, int(p.count))
157
+
158
+	for i := 0; i < int(p.count); i++ {
159
+		inode := &n.inodes[i]
160
+		if n.isLeaf {
161
+			elem := p.leafPageElement(uint16(i))
162
+			inode.flags = elem.flags
163
+			inode.key = elem.key()
164
+			inode.value = elem.value()
165
+		} else {
166
+			elem := p.branchPageElement(uint16(i))
167
+			inode.pgid = elem.pgid
168
+			inode.key = elem.key()
169
+		}
170
+		_assert(len(inode.key) > 0, "read: zero-length inode key")
171
+	}
172
+
173
+	// Save first key so we can find the node in the parent when we spill.
174
+	if len(n.inodes) > 0 {
175
+		n.key = n.inodes[0].key
176
+		_assert(len(n.key) > 0, "read: zero-length node key")
177
+	} else {
178
+		n.key = nil
179
+	}
180
+}
181
+
182
+// write writes the items onto one or more pages.
183
+func (n *node) write(p *page) {
184
+	// Initialize page.
185
+	if n.isLeaf {
186
+		p.flags |= leafPageFlag
187
+	} else {
188
+		p.flags |= branchPageFlag
189
+	}
190
+
191
+	_assert(len(n.inodes) < 0xFFFF, "inode overflow: %d (pgid=%d)", len(n.inodes), p.id)
192
+	p.count = uint16(len(n.inodes))
193
+
194
+	// Loop over each item and write it to the page.
195
+	b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
196
+	for i, item := range n.inodes {
197
+		_assert(len(item.key) > 0, "write: zero-length inode key")
198
+
199
+		// Write the page element.
200
+		if n.isLeaf {
201
+			elem := p.leafPageElement(uint16(i))
202
+			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
203
+			elem.flags = item.flags
204
+			elem.ksize = uint32(len(item.key))
205
+			elem.vsize = uint32(len(item.value))
206
+		} else {
207
+			elem := p.branchPageElement(uint16(i))
208
+			elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
209
+			elem.ksize = uint32(len(item.key))
210
+			elem.pgid = item.pgid
211
+			_assert(elem.pgid != p.id, "write: circular dependency occurred")
212
+		}
213
+
214
+		// Write data for the element to the end of the page.
215
+		copy(b[0:], item.key)
216
+		b = b[len(item.key):]
217
+		copy(b[0:], item.value)
218
+		b = b[len(item.value):]
219
+	}
220
+
221
+	// DEBUG ONLY: n.dump()
222
+}
223
+
224
+// split breaks up a node into multiple smaller nodes, if appropriate.
225
+// This should only be called from the spill() function.
226
+func (n *node) split(pageSize int) []*node {
227
+	var nodes []*node
228
+
229
+	node := n
230
+	for {
231
+		// Split node into two.
232
+		a, b := node.splitTwo(pageSize)
233
+		nodes = append(nodes, a)
234
+
235
+		// If we can't split then exit the loop.
236
+		if b == nil {
237
+			break
238
+		}
239
+
240
+		// Set node to b so it gets split on the next iteration.
241
+		node = b
242
+	}
243
+
244
+	return nodes
245
+}
246
+
247
+// splitTwo breaks up a node into two smaller nodes, if appropriate.
248
+// This should only be called from the split() function.
249
+func (n *node) splitTwo(pageSize int) (*node, *node) {
250
+	// Ignore the split if the page doesn't have at least enough nodes for
251
+	// two pages or if the nodes can fit in a single page.
252
+	if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
253
+		return n, nil
254
+	}
255
+
256
+	// Determine the threshold before starting a new node.
257
+	var fillPercent = n.bucket.FillPercent
258
+	if fillPercent < minFillPercent {
259
+		fillPercent = minFillPercent
260
+	} else if fillPercent > maxFillPercent {
261
+		fillPercent = maxFillPercent
262
+	}
263
+	threshold := int(float64(pageSize) * fillPercent)
264
+
265
+	// Determine split position and sizes of the two pages.
266
+	splitIndex, _ := n.splitIndex(threshold)
267
+
268
+	// Split node into two separate nodes.
269
+	// If there's no parent then we'll need to create one.
270
+	if n.parent == nil {
271
+		n.parent = &node{bucket: n.bucket, children: []*node{n}}
272
+	}
273
+
274
+	// Create a new node and add it to the parent.
275
+	next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
276
+	n.parent.children = append(n.parent.children, next)
277
+
278
+	// Split inodes across two nodes.
279
+	next.inodes = n.inodes[splitIndex:]
280
+	n.inodes = n.inodes[:splitIndex]
281
+
282
+	// Update the statistics.
283
+	n.bucket.tx.stats.Split++
284
+
285
+	return n, next
286
+}
287
+
288
+// splitIndex finds the position where a page will fill a given threshold.
289
+// It returns the index as well as the size of the first page.
290
+// This is only be called from split().
291
+func (n *node) splitIndex(threshold int) (index, sz int) {
292
+	sz = pageHeaderSize
293
+
294
+	// Loop until we only have the minimum number of keys required for the second page.
295
+	for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
296
+		index = i
297
+		inode := n.inodes[i]
298
+		elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
299
+
300
+		// If we have at least the minimum number of keys and adding another
301
+		// node would put us over the threshold then exit and return.
302
+		if i >= minKeysPerPage && sz+elsize > threshold {
303
+			break
304
+		}
305
+
306
+		// Add the element size to the total size.
307
+		sz += elsize
308
+	}
309
+
310
+	return
311
+}
312
+
313
+// spill writes the nodes to dirty pages and splits nodes as it goes.
314
+// Returns an error if dirty pages cannot be allocated.
315
+func (n *node) spill() error {
316
+	var tx = n.bucket.tx
317
+	if n.spilled {
318
+		return nil
319
+	}
320
+
321
+	// Spill child nodes first. Child nodes can materialize sibling nodes in
322
+	// the case of split-merge so we cannot use a range loop. We have to check
323
+	// the children size on every loop iteration.
324
+	sort.Sort(n.children)
325
+	for i := 0; i < len(n.children); i++ {
326
+		if err := n.children[i].spill(); err != nil {
327
+			return err
328
+		}
329
+	}
330
+
331
+	// We no longer need the child list because it's only used for spill tracking.
332
+	n.children = nil
333
+
334
+	// Split nodes into appropriate sizes. The first node will always be n.
335
+	var nodes = n.split(tx.db.pageSize)
336
+	for _, node := range nodes {
337
+		// Add node's page to the freelist if it's not new.
338
+		if node.pgid > 0 {
339
+			tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
340
+			node.pgid = 0
341
+		}
342
+
343
+		// Allocate contiguous space for the node.
344
+		p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
345
+		if err != nil {
346
+			return err
347
+		}
348
+
349
+		// Write the node.
350
+		_assert(p.id < tx.meta.pgid, "pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)
351
+		node.pgid = p.id
352
+		node.write(p)
353
+		node.spilled = true
354
+
355
+		// Insert into parent inodes.
356
+		if node.parent != nil {
357
+			var key = node.key
358
+			if key == nil {
359
+				key = node.inodes[0].key
360
+			}
361
+
362
+			node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
363
+			node.key = node.inodes[0].key
364
+			_assert(len(node.key) > 0, "spill: zero-length node key")
365
+		}
366
+
367
+		// Update the statistics.
368
+		tx.stats.Spill++
369
+	}
370
+
371
+	// If the root node split and created a new root then we need to spill that
372
+	// as well. We'll clear out the children to make sure it doesn't try to respill.
373
+	if n.parent != nil && n.parent.pgid == 0 {
374
+		n.children = nil
375
+		return n.parent.spill()
376
+	}
377
+
378
+	return nil
379
+}
380
+
381
+// rebalance attempts to combine the node with sibling nodes if the node fill
382
+// size is below a threshold or if there are not enough keys.
383
+func (n *node) rebalance() {
384
+	if !n.unbalanced {
385
+		return
386
+	}
387
+	n.unbalanced = false
388
+
389
+	// Update statistics.
390
+	n.bucket.tx.stats.Rebalance++
391
+
392
+	// Ignore if node is above threshold (25%) and has enough keys.
393
+	var threshold = n.bucket.tx.db.pageSize / 4
394
+	if n.size() > threshold && len(n.inodes) > n.minKeys() {
395
+		return
396
+	}
397
+
398
+	// Root node has special handling.
399
+	if n.parent == nil {
400
+		// If root node is a branch and only has one node then collapse it.
401
+		if !n.isLeaf && len(n.inodes) == 1 {
402
+			// Move root's child up.
403
+			child := n.bucket.node(n.inodes[0].pgid, n)
404
+			n.isLeaf = child.isLeaf
405
+			n.inodes = child.inodes[:]
406
+			n.children = child.children
407
+
408
+			// Reparent all child nodes being moved.
409
+			for _, inode := range n.inodes {
410
+				if child, ok := n.bucket.nodes[inode.pgid]; ok {
411
+					child.parent = n
412
+				}
413
+			}
414
+
415
+			// Remove old child.
416
+			child.parent = nil
417
+			delete(n.bucket.nodes, child.pgid)
418
+			child.free()
419
+		}
420
+
421
+		return
422
+	}
423
+
424
+	// If node has no keys then just remove it.
425
+	if n.numChildren() == 0 {
426
+		n.parent.del(n.key)
427
+		n.parent.removeChild(n)
428
+		delete(n.bucket.nodes, n.pgid)
429
+		n.free()
430
+		n.parent.rebalance()
431
+		return
432
+	}
433
+
434
+	_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
435
+
436
+	// Destination node is right sibling if idx == 0, otherwise left sibling.
437
+	var target *node
438
+	var useNextSibling = (n.parent.childIndex(n) == 0)
439
+	if useNextSibling {
440
+		target = n.nextSibling()
441
+	} else {
442
+		target = n.prevSibling()
443
+	}
444
+
445
+	// If target node has extra nodes then just move one over.
446
+	if target.numChildren() > target.minKeys() {
447
+		if useNextSibling {
448
+			// Reparent and move node.
449
+			if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok {
450
+				child.parent.removeChild(child)
451
+				child.parent = n
452
+				child.parent.children = append(child.parent.children, child)
453
+			}
454
+			n.inodes = append(n.inodes, target.inodes[0])
455
+			target.inodes = target.inodes[1:]
456
+
457
+			// Update target key on parent.
458
+			target.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0)
459
+			target.key = target.inodes[0].key
460
+			_assert(len(target.key) > 0, "rebalance(1): zero-length node key")
461
+		} else {
462
+			// Reparent and move node.
463
+			if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok {
464
+				child.parent.removeChild(child)
465
+				child.parent = n
466
+				child.parent.children = append(child.parent.children, child)
467
+			}
468
+			n.inodes = append(n.inodes, inode{})
469
+			copy(n.inodes[1:], n.inodes)
470
+			n.inodes[0] = target.inodes[len(target.inodes)-1]
471
+			target.inodes = target.inodes[:len(target.inodes)-1]
472
+		}
473
+
474
+		// Update parent key for node.
475
+		n.parent.put(n.key, n.inodes[0].key, nil, n.pgid, 0)
476
+		n.key = n.inodes[0].key
477
+		_assert(len(n.key) > 0, "rebalance(2): zero-length node key")
478
+
479
+		return
480
+	}
481
+
482
+	// If both this node and the target node are too small then merge them.
483
+	if useNextSibling {
484
+		// Reparent all child nodes being moved.
485
+		for _, inode := range target.inodes {
486
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
487
+				child.parent.removeChild(child)
488
+				child.parent = n
489
+				child.parent.children = append(child.parent.children, child)
490
+			}
491
+		}
492
+
493
+		// Copy over inodes from target and remove target.
494
+		n.inodes = append(n.inodes, target.inodes...)
495
+		n.parent.del(target.key)
496
+		n.parent.removeChild(target)
497
+		delete(n.bucket.nodes, target.pgid)
498
+		target.free()
499
+	} else {
500
+		// Reparent all child nodes being moved.
501
+		for _, inode := range n.inodes {
502
+			if child, ok := n.bucket.nodes[inode.pgid]; ok {
503
+				child.parent.removeChild(child)
504
+				child.parent = target
505
+				child.parent.children = append(child.parent.children, child)
506
+			}
507
+		}
508
+
509
+		// Copy over inodes to target and remove node.
510
+		target.inodes = append(target.inodes, n.inodes...)
511
+		n.parent.del(n.key)
512
+		n.parent.removeChild(n)
513
+		delete(n.bucket.nodes, n.pgid)
514
+		n.free()
515
+	}
516
+
517
+	// Either this node or the target node was deleted from the parent so rebalance it.
518
+	n.parent.rebalance()
519
+}
520
+
521
+// removes a node from the list of in-memory children.
522
+// This does not affect the inodes.
523
+func (n *node) removeChild(target *node) {
524
+	for i, child := range n.children {
525
+		if child == target {
526
+			n.children = append(n.children[:i], n.children[i+1:]...)
527
+			return
528
+		}
529
+	}
530
+}
531
+
532
+// dereference causes the node to copy all its inode key/value references to heap memory.
533
+// This is required when the mmap is reallocated so inodes are not pointing to stale data.
534
+func (n *node) dereference() {
535
+	if n.key != nil {
536
+		key := make([]byte, len(n.key))
537
+		copy(key, n.key)
538
+		n.key = key
539
+		_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
540
+	}
541
+
542
+	for i := range n.inodes {
543
+		inode := &n.inodes[i]
544
+
545
+		key := make([]byte, len(inode.key))
546
+		copy(key, inode.key)
547
+		inode.key = key
548
+		_assert(len(inode.key) > 0, "dereference: zero-length inode key")
549
+
550
+		value := make([]byte, len(inode.value))
551
+		copy(value, inode.value)
552
+		inode.value = value
553
+	}
554
+
555
+	// Recursively dereference children.
556
+	for _, child := range n.children {
557
+		child.dereference()
558
+	}
559
+
560
+	// Update statistics.
561
+	n.bucket.tx.stats.NodeDeref++
562
+}
563
+
564
+// free adds the node's underlying page to the freelist.
565
+func (n *node) free() {
566
+	if n.pgid != 0 {
567
+		n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
568
+		n.pgid = 0
569
+	}
570
+}
571
+
572
+// dump writes the contents of the node to STDERR for debugging purposes.
573
+/*
574
+func (n *node) dump() {
575
+	// Write node header.
576
+	var typ = "branch"
577
+	if n.isLeaf {
578
+		typ = "leaf"
579
+	}
580
+	warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
581
+
582
+	// Write out abbreviated version of each item.
583
+	for _, item := range n.inodes {
584
+		if n.isLeaf {
585
+			if item.flags&bucketLeafFlag != 0 {
586
+				bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
587
+				warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
588
+			} else {
589
+				warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
590
+			}
591
+		} else {
592
+			warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
593
+		}
594
+	}
595
+	warn("")
596
+}
597
+*/
598
+
599
+type nodes []*node
600
+
601
+func (s nodes) Len() int           { return len(s) }
602
+func (s nodes) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
603
+func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
604
+
605
+// inode represents an internal node inside of a node.
606
+// It can be used to point to elements in a page or point
607
+// to an element which hasn't been added to a page yet.
608
+type inode struct {
609
+	flags uint32
610
+	pgid  pgid
611
+	key   []byte
612
+	value []byte
613
+}
614
+
615
+type inodes []inode
0 616
new file mode 100644
... ...
@@ -0,0 +1,135 @@
0
+package bolt
1
+
2
+import (
3
+	"fmt"
4
+	"os"
5
+	"unsafe"
6
+)
7
+
8
+const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
9
+
10
+const maxAllocSize = 0xFFFFFFF
11
+const minKeysPerPage = 2
12
+
13
+const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
14
+const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))
15
+
16
+const (
17
+	branchPageFlag   = 0x01
18
+	leafPageFlag     = 0x02
19
+	metaPageFlag     = 0x04
20
+	freelistPageFlag = 0x10
21
+)
22
+
23
+const (
24
+	bucketLeafFlag = 0x01
25
+)
26
+
27
// pgid is a page identifier.
type pgid uint64

// page is the header found at the start of a page. The page body begins at
// the address of ptr; field order and sizes define the layout and must not
// be changed.
type page struct {
	id       pgid    // page identifier
	flags    uint16  // page kind bits (branch, leaf, meta, freelist)
	count    uint16  // number of elements on the page
	overflow uint32  // NOTE(review): presumably the number of extra contiguous pages — confirm
	ptr      uintptr // placeholder marking where the page body starts
}
36
+
37
+// typ returns a human readable page type string used for debugging.
38
+func (p *page) typ() string {
39
+	if (p.flags & branchPageFlag) != 0 {
40
+		return "branch"
41
+	} else if (p.flags & leafPageFlag) != 0 {
42
+		return "leaf"
43
+	} else if (p.flags & metaPageFlag) != 0 {
44
+		return "meta"
45
+	} else if (p.flags & freelistPageFlag) != 0 {
46
+		return "freelist"
47
+	}
48
+	return fmt.Sprintf("unknown<%02x>", p.flags)
49
+}
50
+
51
+// meta returns a pointer to the metadata section of the page.
52
+func (p *page) meta() *meta {
53
+	return (*meta)(unsafe.Pointer(&p.ptr))
54
+}
55
+
56
+// leafPageElement retrieves the leaf node by index
57
+func (p *page) leafPageElement(index uint16) *leafPageElement {
58
+	n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
59
+	return n
60
+}
61
+
62
+// leafPageElements retrieves a list of leaf nodes.
63
+func (p *page) leafPageElements() []leafPageElement {
64
+	return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:]
65
+}
66
+
67
+// branchPageElement retrieves the branch node by index
68
+func (p *page) branchPageElement(index uint16) *branchPageElement {
69
+	return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
70
+}
71
+
72
+// branchPageElements retrieves a list of branch nodes.
73
+func (p *page) branchPageElements() []branchPageElement {
74
+	return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:]
75
+}
76
+
77
+// dump writes n bytes of the page to STDERR as hex output.
78
+func (p *page) hexdump(n int) {
79
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
80
+	fmt.Fprintf(os.Stderr, "%x\n", buf)
81
+}
82
+
83
+type pages []*page
84
+
85
+func (s pages) Len() int           { return len(s) }
86
+func (s pages) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
87
+func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
88
+
89
+// branchPageElement represents a node on a branch page.
90
+type branchPageElement struct {
91
+	pos   uint32
92
+	ksize uint32
93
+	pgid  pgid
94
+}
95
+
96
+// key returns a byte slice of the node key.
97
+func (n *branchPageElement) key() []byte {
98
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
99
+	return buf[n.pos : n.pos+n.ksize]
100
+}
101
+
102
+// leafPageElement represents a node on a leaf page.
103
+type leafPageElement struct {
104
+	flags uint32
105
+	pos   uint32
106
+	ksize uint32
107
+	vsize uint32
108
+}
109
+
110
+// key returns a byte slice of the node key.
111
+func (n *leafPageElement) key() []byte {
112
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
113
+	return buf[n.pos : n.pos+n.ksize]
114
+}
115
+
116
+// value returns a byte slice of the node value.
117
+func (n *leafPageElement) value() []byte {
118
+	buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
119
+	return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
120
+}
121
+
122
// PageInfo carries human readable information about a single page.
type PageInfo struct {
	// ID is the page identifier.
	ID int
	// Type is the page kind as text.
	Type string
	// Count is the element count of the page.
	Count int
	// OverflowCount is the page's overflow count.
	OverflowCount int
}
129
+
130
+type pgids []pgid
131
+
132
+func (s pgids) Len() int           { return len(s) }
133
+func (s pgids) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
134
+func (s pgids) Less(i, j int) bool { return s[i] < s[j] }
0 135
new file mode 100644
... ...
@@ -0,0 +1,580 @@
0
+package bolt
1
+
2
+import (
3
+	"fmt"
4
+	"io"
5
+	"os"
6
+	"sort"
7
+	"time"
8
+	"unsafe"
9
+)
10
+
11
+// txid represents the internal transaction identifier.
12
+type txid uint64
13
+
14
+// Tx represents a read-only or read/write transaction on the database.
15
+// Read-only transactions can be used for retrieving values for keys and creating cursors.
16
+// Read/write transactions can create and remove buckets and create and remove keys.
17
+//
18
+// IMPORTANT: You must commit or rollback transactions when you are done with
19
+// them. Pages can not be reclaimed by the writer until no more transactions
20
+// are using them. A long running read transaction can cause the database to
21
+// quickly grow.
22
+type Tx struct {
23
+	writable       bool
24
+	managed        bool
25
+	db             *DB
26
+	meta           *meta
27
+	root           Bucket
28
+	pages          map[pgid]*page
29
+	stats          TxStats
30
+	commitHandlers []func()
31
+}
32
+
33
+// init initializes the transaction.
34
+func (tx *Tx) init(db *DB) {
35
+	tx.db = db
36
+	tx.pages = nil
37
+
38
+	// Copy the meta page since it can be changed by the writer.
39
+	tx.meta = &meta{}
40
+	db.meta().copy(tx.meta)
41
+
42
+	// Copy over the root bucket.
43
+	tx.root = newBucket(tx)
44
+	tx.root.bucket = &bucket{}
45
+	*tx.root.bucket = tx.meta.root
46
+
47
+	// Increment the transaction id and add a page cache for writable transactions.
48
+	if tx.writable {
49
+		tx.pages = make(map[pgid]*page)
50
+		tx.meta.txid += txid(1)
51
+	}
52
+}
53
+
54
+// ID returns the transaction id.
55
+func (tx *Tx) ID() int {
56
+	return int(tx.meta.txid)
57
+}
58
+
59
+// DB returns a reference to the database that created the transaction.
60
+func (tx *Tx) DB() *DB {
61
+	return tx.db
62
+}
63
+
64
+// Size returns current database size in bytes as seen by this transaction.
65
+func (tx *Tx) Size() int64 {
66
+	return int64(tx.meta.pgid) * int64(tx.db.pageSize)
67
+}
68
+
69
+// Writable returns whether the transaction can perform write operations.
70
+func (tx *Tx) Writable() bool {
71
+	return tx.writable
72
+}
73
+
74
+// Cursor creates a cursor associated with the root bucket.
75
+// All items in the cursor will return a nil value because all root bucket keys point to buckets.
76
+// The cursor is only valid as long as the transaction is open.
77
+// Do not use a cursor after the transaction is closed.
78
+func (tx *Tx) Cursor() *Cursor {
79
+	return tx.root.Cursor()
80
+}
81
+
82
+// Stats retrieves a copy of the current transaction statistics.
83
+func (tx *Tx) Stats() TxStats {
84
+	return tx.stats
85
+}
86
+
87
+// Bucket retrieves a bucket by name.
88
+// Returns nil if the bucket does not exist.
89
+func (tx *Tx) Bucket(name []byte) *Bucket {
90
+	return tx.root.Bucket(name)
91
+}
92
+
93
+// CreateBucket creates a new bucket.
94
+// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long.
95
+func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) {
96
+	return tx.root.CreateBucket(name)
97
+}
98
+
99
+// CreateBucketIfNotExists creates a new bucket if it doesn't already exist.
100
+// Returns an error if the bucket name is blank, or if the bucket name is too long.
101
+func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) {
102
+	return tx.root.CreateBucketIfNotExists(name)
103
+}
104
+
105
+// DeleteBucket deletes a bucket.
106
+// Returns an error if the bucket cannot be found or if the key represents a non-bucket value.
107
+func (tx *Tx) DeleteBucket(name []byte) error {
108
+	return tx.root.DeleteBucket(name)
109
+}
110
+
111
+// ForEach executes a function for each bucket in the root.
112
+// If the provided function returns an error then the iteration is stopped and
113
+// the error is returned to the caller.
114
+func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error {
115
+	return tx.root.ForEach(func(k, v []byte) error {
116
+		if err := fn(k, tx.root.Bucket(k)); err != nil {
117
+			return err
118
+		}
119
+		return nil
120
+	})
121
+}
122
+
123
+// OnCommit adds a handler function to be executed after the transaction successfully commits.
124
+func (tx *Tx) OnCommit(fn func()) {
125
+	tx.commitHandlers = append(tx.commitHandlers, fn)
126
+}
127
+
128
+// Commit writes all changes to disk and updates the meta page.
129
+// Returns an error if a disk write error occurs.
130
+func (tx *Tx) Commit() error {
131
+	_assert(!tx.managed, "managed tx commit not allowed")
132
+	if tx.db == nil {
133
+		return ErrTxClosed
134
+	} else if !tx.writable {
135
+		return ErrTxNotWritable
136
+	}
137
+
138
+	// TODO(benbjohnson): Use vectorized I/O to write out dirty pages.
139
+
140
+	// Rebalance nodes which have had deletions.
141
+	var startTime = time.Now()
142
+	tx.root.rebalance()
143
+	if tx.stats.Rebalance > 0 {
144
+		tx.stats.RebalanceTime += time.Since(startTime)
145
+	}
146
+
147
+	// spill data onto dirty pages.
148
+	startTime = time.Now()
149
+	if err := tx.root.spill(); err != nil {
150
+		tx.rollback()
151
+		return err
152
+	}
153
+	tx.stats.SpillTime += time.Since(startTime)
154
+
155
+	// Free the old root bucket.
156
+	tx.meta.root.root = tx.root.root
157
+
158
+	// Free the freelist and allocate new pages for it. This will overestimate
159
+	// the size of the freelist but not underestimate the size (which would be bad).
160
+	tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
161
+	p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
162
+	if err != nil {
163
+		tx.rollback()
164
+		return err
165
+	}
166
+	if err := tx.db.freelist.write(p); err != nil {
167
+		tx.rollback()
168
+		return err
169
+	}
170
+	tx.meta.freelist = p.id
171
+
172
+	// Write dirty pages to disk.
173
+	startTime = time.Now()
174
+	if err := tx.write(); err != nil {
175
+		tx.rollback()
176
+		return err
177
+	}
178
+
179
+	// If strict mode is enabled then perform a consistency check.
180
+	// Only the first consistency error is reported in the panic.
181
+	if tx.db.StrictMode {
182
+		if err, ok := <-tx.Check(); ok {
183
+			panic("check fail: " + err.Error())
184
+		}
185
+	}
186
+
187
+	// Write meta to disk.
188
+	if err := tx.writeMeta(); err != nil {
189
+		tx.rollback()
190
+		return err
191
+	}
192
+	tx.stats.WriteTime += time.Since(startTime)
193
+
194
+	// Finalize the transaction.
195
+	tx.close()
196
+
197
+	// Execute commit handlers now that the locks have been removed.
198
+	for _, fn := range tx.commitHandlers {
199
+		fn()
200
+	}
201
+
202
+	return nil
203
+}
204
+
205
+// Rollback closes the transaction and ignores all previous updates.
206
+func (tx *Tx) Rollback() error {
207
+	_assert(!tx.managed, "managed tx rollback not allowed")
208
+	if tx.db == nil {
209
+		return ErrTxClosed
210
+	}
211
+	tx.rollback()
212
+	return nil
213
+}
214
+
215
+func (tx *Tx) rollback() {
216
+	if tx.db == nil {
217
+		return
218
+	}
219
+	if tx.writable {
220
+		tx.db.freelist.rollback(tx.meta.txid)
221
+		tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist))
222
+	}
223
+	tx.close()
224
+}
225
+
226
+func (tx *Tx) close() {
227
+	if tx.db == nil {
228
+		return
229
+	}
230
+	if tx.writable {
231
+		// Grab freelist stats.
232
+		var freelistFreeN = tx.db.freelist.free_count()
233
+		var freelistPendingN = tx.db.freelist.pending_count()
234
+		var freelistAlloc = tx.db.freelist.size()
235
+
236
+		// Remove writer lock.
237
+		tx.db.rwlock.Unlock()
238
+
239
+		// Merge statistics.
240
+		tx.db.statlock.Lock()
241
+		tx.db.stats.FreePageN = freelistFreeN
242
+		tx.db.stats.PendingPageN = freelistPendingN
243
+		tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize
244
+		tx.db.stats.FreelistInuse = freelistAlloc
245
+		tx.db.stats.TxStats.add(&tx.stats)
246
+		tx.db.statlock.Unlock()
247
+	} else {
248
+		tx.db.removeTx(tx)
249
+	}
250
+	tx.db = nil
251
+}
252
+
253
+// Copy writes the entire database to a writer.
254
+// A reader transaction is maintained during the copy so it is safe to continue
255
+// using the database while a copy is in progress.
256
+// Copy will write exactly tx.Size() bytes into the writer.
257
+func (tx *Tx) Copy(w io.Writer) error {
258
+	var f *os.File
259
+	var err error
260
+
261
+	// Attempt to open reader directly.
262
+	if f, err = os.OpenFile(tx.db.path, os.O_RDONLY|odirect, 0); err != nil {
263
+		// Fallback to a regular open if that doesn't work.
264
+		if f, err = os.OpenFile(tx.db.path, os.O_RDONLY, 0); err != nil {
265
+			return err
266
+		}
267
+	}
268
+
269
+	// Copy the meta pages.
270
+	tx.db.metalock.Lock()
271
+	_, err = io.CopyN(w, f, int64(tx.db.pageSize*2))
272
+	tx.db.metalock.Unlock()
273
+	if err != nil {
274
+		_ = f.Close()
275
+		return fmt.Errorf("meta copy: %s", err)
276
+	}
277
+
278
+	// Copy data pages.
279
+	if _, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2)); err != nil {
280
+		_ = f.Close()
281
+		return err
282
+	}
283
+
284
+	return f.Close()
285
+}
286
+
287
+// CopyFile copies the entire database to file at the given path.
288
+// A reader transaction is maintained during the copy so it is safe to continue
289
+// using the database while a copy is in progress.
290
+func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
291
+	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
292
+	if err != nil {
293
+		return err
294
+	}
295
+
296
+	err = tx.Copy(f)
297
+	if err != nil {
298
+		_ = f.Close()
299
+		return err
300
+	}
301
+	return f.Close()
302
+}
303
+
304
+// Check performs several consistency checks on the database for this transaction.
305
+// An error is returned if any inconsistency is found.
306
+//
307
+// It can be safely run concurrently on a writable transaction. However, this
308
+// incurs a high cost for large databases and databases with a lot of subbuckets
309
+// because of caching. This overhead can be removed if running on a read-only
310
+// transaction, however, it is not safe to execute other writer transactions at
311
+// the same time.
312
+func (tx *Tx) Check() <-chan error {
313
+	ch := make(chan error)
314
+	go tx.check(ch)
315
+	return ch
316
+}
317
+
318
+func (tx *Tx) check(ch chan error) {
319
+	// Check if any pages are double freed.
320
+	freed := make(map[pgid]bool)
321
+	for _, id := range tx.db.freelist.all() {
322
+		if freed[id] {
323
+			ch <- fmt.Errorf("page %d: already freed", id)
324
+		}
325
+		freed[id] = true
326
+	}
327
+
328
+	// Track every reachable page.
329
+	reachable := make(map[pgid]*page)
330
+	reachable[0] = tx.page(0) // meta0
331
+	reachable[1] = tx.page(1) // meta1
332
+	for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
333
+		reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
334
+	}
335
+
336
+	// Recursively check buckets.
337
+	tx.checkBucket(&tx.root, reachable, freed, ch)
338
+
339
+	// Ensure all pages below high water mark are either reachable or freed.
340
+	for i := pgid(0); i < tx.meta.pgid; i++ {
341
+		_, isReachable := reachable[i]
342
+		if !isReachable && !freed[i] {
343
+			ch <- fmt.Errorf("page %d: unreachable unfreed", int(i))
344
+		}
345
+	}
346
+
347
+	// Close the channel to signal completion.
348
+	close(ch)
349
+}
350
+
351
+func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) {
352
+	// Ignore inline buckets.
353
+	if b.root == 0 {
354
+		return
355
+	}
356
+
357
+	// Check every page used by this bucket.
358
+	b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
359
+		if p.id > tx.meta.pgid {
360
+			ch <- fmt.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid))
361
+		}
362
+
363
+		// Ensure each page is only referenced once.
364
+		for i := pgid(0); i <= pgid(p.overflow); i++ {
365
+			var id = p.id + i
366
+			if _, ok := reachable[id]; ok {
367
+				ch <- fmt.Errorf("page %d: multiple references", int(id))
368
+			}
369
+			reachable[id] = p
370
+		}
371
+
372
+		// We should only encounter un-freed leaf and branch pages.
373
+		if freed[p.id] {
374
+			ch <- fmt.Errorf("page %d: reachable freed", int(p.id))
375
+		} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 {
376
+			ch <- fmt.Errorf("page %d: invalid type: %s", int(p.id), p.typ())
377
+		}
378
+	})
379
+
380
+	// Check each bucket within this bucket.
381
+	_ = b.ForEach(func(k, v []byte) error {
382
+		if child := b.Bucket(k); child != nil {
383
+			tx.checkBucket(child, reachable, freed, ch)
384
+		}
385
+		return nil
386
+	})
387
+}
388
+
389
+// allocate returns a contiguous block of memory starting at a given page.
390
+func (tx *Tx) allocate(count int) (*page, error) {
391
+	p, err := tx.db.allocate(count)
392
+	if err != nil {
393
+		return nil, err
394
+	}
395
+
396
+	// Save to our page cache.
397
+	tx.pages[p.id] = p
398
+
399
+	// Update statistics.
400
+	tx.stats.PageCount++
401
+	tx.stats.PageAlloc += count * tx.db.pageSize
402
+
403
+	return p, nil
404
+}
405
+
406
+// write writes any dirty pages to disk.
407
+func (tx *Tx) write() error {
408
+	// Sort pages by id.
409
+	pages := make(pages, 0, len(tx.pages))
410
+	for _, p := range tx.pages {
411
+		pages = append(pages, p)
412
+	}
413
+	sort.Sort(pages)
414
+
415
+	// Write pages to disk in order.
416
+	for _, p := range pages {
417
+		size := (int(p.overflow) + 1) * tx.db.pageSize
418
+		buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:size]
419
+		offset := int64(p.id) * int64(tx.db.pageSize)
420
+		if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
421
+			return err
422
+		}
423
+
424
+		// Update statistics.
425
+		tx.stats.Write++
426
+	}
427
+	if !tx.db.NoSync || IgnoreNoSync {
428
+		if err := fdatasync(tx.db); err != nil {
429
+			return err
430
+		}
431
+	}
432
+
433
+	// Clear out page cache.
434
+	tx.pages = make(map[pgid]*page)
435
+
436
+	return nil
437
+}
438
+
439
+// writeMeta writes the meta to the disk.
440
+func (tx *Tx) writeMeta() error {
441
+	// Create a temporary buffer for the meta page.
442
+	buf := make([]byte, tx.db.pageSize)
443
+	p := tx.db.pageInBuffer(buf, 0)
444
+	tx.meta.write(p)
445
+
446
+	// Write the meta page to file.
447
+	if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
448
+		return err
449
+	}
450
+	if !tx.db.NoSync || IgnoreNoSync {
451
+		if err := fdatasync(tx.db); err != nil {
452
+			return err
453
+		}
454
+	}
455
+
456
+	// Update statistics.
457
+	tx.stats.Write++
458
+
459
+	return nil
460
+}
461
+
462
+// page returns a reference to the page with a given id.
463
+// If page has been written to then a temporary bufferred page is returned.
464
+func (tx *Tx) page(id pgid) *page {
465
+	// Check the dirty pages first.
466
+	if tx.pages != nil {
467
+		if p, ok := tx.pages[id]; ok {
468
+			return p
469
+		}
470
+	}
471
+
472
+	// Otherwise return directly from the mmap.
473
+	return tx.db.page(id)
474
+}
475
+
476
+// forEachPage iterates over every page within a given page and executes a function.
477
+func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) {
478
+	p := tx.page(pgid)
479
+
480
+	// Execute function.
481
+	fn(p, depth)
482
+
483
+	// Recursively loop over children.
484
+	if (p.flags & branchPageFlag) != 0 {
485
+		for i := 0; i < int(p.count); i++ {
486
+			elem := p.branchPageElement(uint16(i))
487
+			tx.forEachPage(elem.pgid, depth+1, fn)
488
+		}
489
+	}
490
+}
491
+
492
+// Page returns page information for a given page number.
493
+// This is only safe for concurrent use when used by a writable transaction.
494
+func (tx *Tx) Page(id int) (*PageInfo, error) {
495
+	if tx.db == nil {
496
+		return nil, ErrTxClosed
497
+	} else if pgid(id) >= tx.meta.pgid {
498
+		return nil, nil
499
+	}
500
+
501
+	// Build the page info.
502
+	p := tx.db.page(pgid(id))
503
+	info := &PageInfo{
504
+		ID:            id,
505
+		Count:         int(p.count),
506
+		OverflowCount: int(p.overflow),
507
+	}
508
+
509
+	// Determine the type (or if it's free).
510
+	if tx.db.freelist.freed(pgid(id)) {
511
+		info.Type = "free"
512
+	} else {
513
+		info.Type = p.typ()
514
+	}
515
+
516
+	return info, nil
517
+}
518
+
519
// TxStats represents statistics about the actions performed by the transaction.
type TxStats struct {
	// Page statistics.
	PageCount int // number of page allocations
	PageAlloc int // total bytes allocated

	// Cursor statistics.
	CursorCount int // number of cursors created

	// Node statistics.
	NodeCount int // number of node allocations
	NodeDeref int // number of node dereferences

	// Rebalance statistics.
	Rebalance     int           // number of node rebalances
	RebalanceTime time.Duration // total time spent rebalancing

	// Split/Spill statistics.
	Split     int           // number of nodes split
	Spill     int           // number of nodes spilled
	SpillTime time.Duration // total time spent spilling

	// Write statistics.
	Write     int           // number of writes performed
	WriteTime time.Duration // total time spent writing to disk
}

// add accumulates every counter of other into s, field by field.
func (s *TxStats) add(other *TxStats) {
	*s = TxStats{
		PageCount:     s.PageCount + other.PageCount,
		PageAlloc:     s.PageAlloc + other.PageAlloc,
		CursorCount:   s.CursorCount + other.CursorCount,
		NodeCount:     s.NodeCount + other.NodeCount,
		NodeDeref:     s.NodeDeref + other.NodeDeref,
		Rebalance:     s.Rebalance + other.Rebalance,
		RebalanceTime: s.RebalanceTime + other.RebalanceTime,
		Split:         s.Split + other.Split,
		Spill:         s.Spill + other.Spill,
		SpillTime:     s.SpillTime + other.SpillTime,
		Write:         s.Write + other.Write,
		WriteTime:     s.WriteTime + other.WriteTime,
	}
}

// Sub calculates and returns the difference between two sets of transaction
// stats (s minus other, field by field). This is useful when stats are
// obtained at two different points in time and you need the performance
// counters that occurred within that time span.
func (s *TxStats) Sub(other *TxStats) TxStats {
	return TxStats{
		PageCount:     s.PageCount - other.PageCount,
		PageAlloc:     s.PageAlloc - other.PageAlloc,
		CursorCount:   s.CursorCount - other.CursorCount,
		NodeCount:     s.NodeCount - other.NodeCount,
		NodeDeref:     s.NodeDeref - other.NodeDeref,
		Rebalance:     s.Rebalance - other.Rebalance,
		RebalanceTime: s.RebalanceTime - other.RebalanceTime,
		Split:         s.Split - other.Split,
		Spill:         s.Spill - other.Spill,
		SpillTime:     s.SpillTime - other.SpillTime,
		Write:         s.Write - other.Write,
		WriteTime:     s.WriteTime - other.WriteTime,
	}
}
0 580
deleted file mode 100644
... ...
@@ -1,191 +0,0 @@
1
-
2
-                                 Apache License
3
-                           Version 2.0, January 2004
4
-                        http://www.apache.org/licenses/
5
-
6
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
-
8
-   1. Definitions.
9
-
10
-      "License" shall mean the terms and conditions for use, reproduction,
11
-      and distribution as defined by Sections 1 through 9 of this document.
12
-
13
-      "Licensor" shall mean the copyright owner or entity authorized by
14
-      the copyright owner that is granting the License.
15
-
16
-      "Legal Entity" shall mean the union of the acting entity and all
17
-      other entities that control, are controlled by, or are under common
18
-      control with that entity. For the purposes of this definition,
19
-      "control" means (i) the power, direct or indirect, to cause the
20
-      direction or management of such entity, whether by contract or
21
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
-      outstanding shares, or (iii) beneficial ownership of such entity.
23
-
24
-      "You" (or "Your") shall mean an individual or Legal Entity
25
-      exercising permissions granted by this License.
26
-
27
-      "Source" form shall mean the preferred form for making modifications,
28
-      including but not limited to software source code, documentation
29
-      source, and configuration files.
30
-
31
-      "Object" form shall mean any form resulting from mechanical
32
-      transformation or translation of a Source form, including but
33
-      not limited to compiled object code, generated documentation,
34
-      and conversions to other media types.
35
-
36
-      "Work" shall mean the work of authorship, whether in Source or
37
-      Object form, made available under the License, as indicated by a
38
-      copyright notice that is included in or attached to the work
39
-      (an example is provided in the Appendix below).
40
-
41
-      "Derivative Works" shall mean any work, whether in Source or Object
42
-      form, that is based on (or derived from) the Work and for which the
43
-      editorial revisions, annotations, elaborations, or other modifications
44
-      represent, as a whole, an original work of authorship. For the purposes
45
-      of this License, Derivative Works shall not include works that remain
46
-      separable from, or merely link (or bind by name) to the interfaces of,
47
-      the Work and Derivative Works thereof.
48
-
49
-      "Contribution" shall mean any work of authorship, including
50
-      the original version of the Work and any modifications or additions
51
-      to that Work or Derivative Works thereof, that is intentionally
52
-      submitted to Licensor for inclusion in the Work by the copyright owner
53
-      or by an individual or Legal Entity authorized to submit on behalf of
54
-      the copyright owner. For the purposes of this definition, "submitted"
55
-      means any form of electronic, verbal, or written communication sent
56
-      to the Licensor or its representatives, including but not limited to
57
-      communication on electronic mailing lists, source code control systems,
58
-      and issue tracking systems that are managed by, or on behalf of, the
59
-      Licensor for the purpose of discussing and improving the Work, but
60
-      excluding communication that is conspicuously marked or otherwise
61
-      designated in writing by the copyright owner as "Not a Contribution."
62
-
63
-      "Contributor" shall mean Licensor and any individual or Legal Entity
64
-      on behalf of whom a Contribution has been received by Licensor and
65
-      subsequently incorporated within the Work.
66
-
67
-   2. Grant of Copyright License. Subject to the terms and conditions of
68
-      this License, each Contributor hereby grants to You a perpetual,
69
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
-      copyright license to reproduce, prepare Derivative Works of,
71
-      publicly display, publicly perform, sublicense, and distribute the
72
-      Work and such Derivative Works in Source or Object form.
73
-
74
-   3. Grant of Patent License. Subject to the terms and conditions of
75
-      this License, each Contributor hereby grants to You a perpetual,
76
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
-      (except as stated in this section) patent license to make, have made,
78
-      use, offer to sell, sell, import, and otherwise transfer the Work,
79
-      where such license applies only to those patent claims licensable
80
-      by such Contributor that are necessarily infringed by their
81
-      Contribution(s) alone or by combination of their Contribution(s)
82
-      with the Work to which such Contribution(s) was submitted. If You
83
-      institute patent litigation against any entity (including a
84
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
85
-      or a Contribution incorporated within the Work constitutes direct
86
-      or contributory patent infringement, then any patent licenses
87
-      granted to You under this License for that Work shall terminate
88
-      as of the date such litigation is filed.
89
-
90
-   4. Redistribution. You may reproduce and distribute copies of the
91
-      Work or Derivative Works thereof in any medium, with or without
92
-      modifications, and in Source or Object form, provided that You
93
-      meet the following conditions:
94
-
95
-      (a) You must give any other recipients of the Work or
96
-          Derivative Works a copy of this License; and
97
-
98
-      (b) You must cause any modified files to carry prominent notices
99
-          stating that You changed the files; and
100
-
101
-      (c) You must retain, in the Source form of any Derivative Works
102
-          that You distribute, all copyright, patent, trademark, and
103
-          attribution notices from the Source form of the Work,
104
-          excluding those notices that do not pertain to any part of
105
-          the Derivative Works; and
106
-
107
-      (d) If the Work includes a "NOTICE" text file as part of its
108
-          distribution, then any Derivative Works that You distribute must
109
-          include a readable copy of the attribution notices contained
110
-          within such NOTICE file, excluding those notices that do not
111
-          pertain to any part of the Derivative Works, in at least one
112
-          of the following places: within a NOTICE text file distributed
113
-          as part of the Derivative Works; within the Source form or
114
-          documentation, if provided along with the Derivative Works; or,
115
-          within a display generated by the Derivative Works, if and
116
-          wherever such third-party notices normally appear. The contents
117
-          of the NOTICE file are for informational purposes only and
118
-          do not modify the License. You may add Your own attribution
119
-          notices within Derivative Works that You distribute, alongside
120
-          or as an addendum to the NOTICE text from the Work, provided
121
-          that such additional attribution notices cannot be construed
122
-          as modifying the License.
123
-
124
-      You may add Your own copyright statement to Your modifications and
125
-      may provide additional or different license terms and conditions
126
-      for use, reproduction, or distribution of Your modifications, or
127
-      for any such Derivative Works as a whole, provided Your use,
128
-      reproduction, and distribution of the Work otherwise complies with
129
-      the conditions stated in this License.
130
-
131
-   5. Submission of Contributions. Unless You explicitly state otherwise,
132
-      any Contribution intentionally submitted for inclusion in the Work
133
-      by You to the Licensor shall be under the terms and conditions of
134
-      this License, without any additional terms or conditions.
135
-      Notwithstanding the above, nothing herein shall supersede or modify
136
-      the terms of any separate license agreement you may have executed
137
-      with Licensor regarding such Contributions.
138
-
139
-   6. Trademarks. This License does not grant permission to use the trade
140
-      names, trademarks, service marks, or product names of the Licensor,
141
-      except as required for reasonable and customary use in describing the
142
-      origin of the Work and reproducing the content of the NOTICE file.
143
-
144
-   7. Disclaimer of Warranty. Unless required by applicable law or
145
-      agreed to in writing, Licensor provides the Work (and each
146
-      Contributor provides its Contributions) on an "AS IS" BASIS,
147
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
-      implied, including, without limitation, any warranties or conditions
149
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
-      PARTICULAR PURPOSE. You are solely responsible for determining the
151
-      appropriateness of using or redistributing the Work and assume any
152
-      risks associated with Your exercise of permissions under this License.
153
-
154
-   8. Limitation of Liability. In no event and under no legal theory,
155
-      whether in tort (including negligence), contract, or otherwise,
156
-      unless required by applicable law (such as deliberate and grossly
157
-      negligent acts) or agreed to in writing, shall any Contributor be
158
-      liable to You for damages, including any direct, indirect, special,
159
-      incidental, or consequential damages of any character arising as a
160
-      result of this License or out of the use or inability to use the
161
-      Work (including but not limited to damages for loss of goodwill,
162
-      work stoppage, computer failure or malfunction, or any and all
163
-      other commercial damages or losses), even if such Contributor
164
-      has been advised of the possibility of such damages.
165
-
166
-   9. Accepting Warranty or Additional Liability. While redistributing
167
-      the Work or Derivative Works thereof, You may choose to offer,
168
-      and charge a fee for, acceptance of support, warranty, indemnity,
169
-      or other liability obligations and/or rights consistent with this
170
-      License. However, in accepting such obligations, You may act only
171
-      on Your own behalf and on Your sole responsibility, not on behalf
172
-      of any other Contributor, and only if You agree to indemnify,
173
-      defend, and hold each Contributor harmless for any liability
174
-      incurred by, or claims asserted against, such Contributor by reason
175
-      of your accepting any such warranty or additional liability.
176
-
177
-   END OF TERMS AND CONDITIONS
178
-
179
-   Copyright 2014-2015 Docker, Inc.
180
-
181
-   Licensed under the Apache License, Version 2.0 (the "License");
182
-   you may not use this file except in compliance with the License.
183
-   You may obtain a copy of the License at
184
-
185
-       http://www.apache.org/licenses/LICENSE-2.0
186
-
187
-   Unless required by applicable law or agreed to in writing, software
188
-   distributed under the License is distributed on an "AS IS" BASIS,
189
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190
-   See the License for the specific language governing permissions and
191
-   limitations under the License.
192 1
new file mode 100644
... ...
@@ -0,0 +1,191 @@
0
+
1
+                                 Apache License
2
+                           Version 2.0, January 2004
3
+                        http://www.apache.org/licenses/
4
+
5
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+   1. Definitions.
8
+
9
+      "License" shall mean the terms and conditions for use, reproduction,
10
+      and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+      "Licensor" shall mean the copyright owner or entity authorized by
13
+      the copyright owner that is granting the License.
14
+
15
+      "Legal Entity" shall mean the union of the acting entity and all
16
+      other entities that control, are controlled by, or are under common
17
+      control with that entity. For the purposes of this definition,
18
+      "control" means (i) the power, direct or indirect, to cause the
19
+      direction or management of such entity, whether by contract or
20
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+      outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+      "You" (or "Your") shall mean an individual or Legal Entity
24
+      exercising permissions granted by this License.
25
+
26
+      "Source" form shall mean the preferred form for making modifications,
27
+      including but not limited to software source code, documentation
28
+      source, and configuration files.
29
+
30
+      "Object" form shall mean any form resulting from mechanical
31
+      transformation or translation of a Source form, including but
32
+      not limited to compiled object code, generated documentation,
33
+      and conversions to other media types.
34
+
35
+      "Work" shall mean the work of authorship, whether in Source or
36
+      Object form, made available under the License, as indicated by a
37
+      copyright notice that is included in or attached to the work
38
+      (an example is provided in the Appendix below).
39
+
40
+      "Derivative Works" shall mean any work, whether in Source or Object
41
+      form, that is based on (or derived from) the Work and for which the
42
+      editorial revisions, annotations, elaborations, or other modifications
43
+      represent, as a whole, an original work of authorship. For the purposes
44
+      of this License, Derivative Works shall not include works that remain
45
+      separable from, or merely link (or bind by name) to the interfaces of,
46
+      the Work and Derivative Works thereof.
47
+
48
+      "Contribution" shall mean any work of authorship, including
49
+      the original version of the Work and any modifications or additions
50
+      to that Work or Derivative Works thereof, that is intentionally
51
+      submitted to Licensor for inclusion in the Work by the copyright owner
52
+      or by an individual or Legal Entity authorized to submit on behalf of
53
+      the copyright owner. For the purposes of this definition, "submitted"
54
+      means any form of electronic, verbal, or written communication sent
55
+      to the Licensor or its representatives, including but not limited to
56
+      communication on electronic mailing lists, source code control systems,
57
+      and issue tracking systems that are managed by, or on behalf of, the
58
+      Licensor for the purpose of discussing and improving the Work, but
59
+      excluding communication that is conspicuously marked or otherwise
60
+      designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+      "Contributor" shall mean Licensor and any individual or Legal Entity
63
+      on behalf of whom a Contribution has been received by Licensor and
64
+      subsequently incorporated within the Work.
65
+
66
+   2. Grant of Copyright License. Subject to the terms and conditions of
67
+      this License, each Contributor hereby grants to You a perpetual,
68
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+      copyright license to reproduce, prepare Derivative Works of,
70
+      publicly display, publicly perform, sublicense, and distribute the
71
+      Work and such Derivative Works in Source or Object form.
72
+
73
+   3. Grant of Patent License. Subject to the terms and conditions of
74
+      this License, each Contributor hereby grants to You a perpetual,
75
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+      (except as stated in this section) patent license to make, have made,
77
+      use, offer to sell, sell, import, and otherwise transfer the Work,
78
+      where such license applies only to those patent claims licensable
79
+      by such Contributor that are necessarily infringed by their
80
+      Contribution(s) alone or by combination of their Contribution(s)
81
+      with the Work to which such Contribution(s) was submitted. If You
82
+      institute patent litigation against any entity (including a
83
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+      or a Contribution incorporated within the Work constitutes direct
85
+      or contributory patent infringement, then any patent licenses
86
+      granted to You under this License for that Work shall terminate
87
+      as of the date such litigation is filed.
88
+
89
+   4. Redistribution. You may reproduce and distribute copies of the
90
+      Work or Derivative Works thereof in any medium, with or without
91
+      modifications, and in Source or Object form, provided that You
92
+      meet the following conditions:
93
+
94
+      (a) You must give any other recipients of the Work or
95
+          Derivative Works a copy of this License; and
96
+
97
+      (b) You must cause any modified files to carry prominent notices
98
+          stating that You changed the files; and
99
+
100
+      (c) You must retain, in the Source form of any Derivative Works
101
+          that You distribute, all copyright, patent, trademark, and
102
+          attribution notices from the Source form of the Work,
103
+          excluding those notices that do not pertain to any part of
104
+          the Derivative Works; and
105
+
106
+      (d) If the Work includes a "NOTICE" text file as part of its
107
+          distribution, then any Derivative Works that You distribute must
108
+          include a readable copy of the attribution notices contained
109
+          within such NOTICE file, excluding those notices that do not
110
+          pertain to any part of the Derivative Works, in at least one
111
+          of the following places: within a NOTICE text file distributed
112
+          as part of the Derivative Works; within the Source form or
113
+          documentation, if provided along with the Derivative Works; or,
114
+          within a display generated by the Derivative Works, if and
115
+          wherever such third-party notices normally appear. The contents
116
+          of the NOTICE file are for informational purposes only and
117
+          do not modify the License. You may add Your own attribution
118
+          notices within Derivative Works that You distribute, alongside
119
+          or as an addendum to the NOTICE text from the Work, provided
120
+          that such additional attribution notices cannot be construed
121
+          as modifying the License.
122
+
123
+      You may add Your own copyright statement to Your modifications and
124
+      may provide additional or different license terms and conditions
125
+      for use, reproduction, or distribution of Your modifications, or
126
+      for any such Derivative Works as a whole, provided Your use,
127
+      reproduction, and distribution of the Work otherwise complies with
128
+      the conditions stated in this License.
129
+
130
+   5. Submission of Contributions. Unless You explicitly state otherwise,
131
+      any Contribution intentionally submitted for inclusion in the Work
132
+      by You to the Licensor shall be under the terms and conditions of
133
+      this License, without any additional terms or conditions.
134
+      Notwithstanding the above, nothing herein shall supersede or modify
135
+      the terms of any separate license agreement you may have executed
136
+      with Licensor regarding such Contributions.
137
+
138
+   6. Trademarks. This License does not grant permission to use the trade
139
+      names, trademarks, service marks, or product names of the Licensor,
140
+      except as required for reasonable and customary use in describing the
141
+      origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+   7. Disclaimer of Warranty. Unless required by applicable law or
144
+      agreed to in writing, Licensor provides the Work (and each
145
+      Contributor provides its Contributions) on an "AS IS" BASIS,
146
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+      implied, including, without limitation, any warranties or conditions
148
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+      PARTICULAR PURPOSE. You are solely responsible for determining the
150
+      appropriateness of using or redistributing the Work and assume any
151
+      risks associated with Your exercise of permissions under this License.
152
+
153
+   8. Limitation of Liability. In no event and under no legal theory,
154
+      whether in tort (including negligence), contract, or otherwise,
155
+      unless required by applicable law (such as deliberate and grossly
156
+      negligent acts) or agreed to in writing, shall any Contributor be
157
+      liable to You for damages, including any direct, indirect, special,
158
+      incidental, or consequential damages of any character arising as a
159
+      result of this License or out of the use or inability to use the
160
+      Work (including but not limited to damages for loss of goodwill,
161
+      work stoppage, computer failure or malfunction, or any and all
162
+      other commercial damages or losses), even if such Contributor
163
+      has been advised of the possibility of such damages.
164
+
165
+   9. Accepting Warranty or Additional Liability. While redistributing
166
+      the Work or Derivative Works thereof, You may choose to offer,
167
+      and charge a fee for, acceptance of support, warranty, indemnity,
168
+      or other liability obligations and/or rights consistent with this
169
+      License. However, in accepting such obligations, You may act only
170
+      on Your own behalf and on Your sole responsibility, not on behalf
171
+      of any other Contributor, and only if You agree to indemnify,
172
+      defend, and hold each Contributor harmless for any liability
173
+      incurred by, or claims asserted against, such Contributor by reason
174
+      of your accepting any such warranty or additional liability.
175
+
176
+   END OF TERMS AND CONDITIONS
177
+
178
+   Copyright 2014-2015 Docker, Inc.
179
+
180
+   Licensed under the Apache License, Version 2.0 (the "License");
181
+   you may not use this file except in compliance with the License.
182
+   You may obtain a copy of the License at
183
+
184
+       http://www.apache.org/licenses/LICENSE-2.0
185
+
186
+   Unless required by applicable law or agreed to in writing, software
187
+   distributed under the License is distributed on an "AS IS" BASIS,
188
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189
+   See the License for the specific language governing permissions and
190
+   limitations under the License.
0 191
new file mode 100644
... ...
@@ -0,0 +1,425 @@
0
+Attribution-ShareAlike 4.0 International
1
+
2
+=======================================================================
3
+
4
+Creative Commons Corporation ("Creative Commons") is not a law firm and
5
+does not provide legal services or legal advice. Distribution of
6
+Creative Commons public licenses does not create a lawyer-client or
7
+other relationship. Creative Commons makes its licenses and related
8
+information available on an "as-is" basis. Creative Commons gives no
9
+warranties regarding its licenses, any material licensed under their
10
+terms and conditions, or any related information. Creative Commons
11
+disclaims all liability for damages resulting from their use to the
12
+fullest extent possible.
13
+
14
+Using Creative Commons Public Licenses
15
+
16
+Creative Commons public licenses provide a standard set of terms and
17
+conditions that creators and other rights holders may use to share
18
+original works of authorship and other material subject to copyright
19
+and certain other rights specified in the public license below. The
20
+following considerations are for informational purposes only, are not
21
+exhaustive, and do not form part of our licenses.
22
+
23
+     Considerations for licensors: Our public licenses are
24
+     intended for use by those authorized to give the public
25
+     permission to use material in ways otherwise restricted by
26
+     copyright and certain other rights. Our licenses are
27
+     irrevocable. Licensors should read and understand the terms
28
+     and conditions of the license they choose before applying it.
29
+     Licensors should also secure all rights necessary before
30
+     applying our licenses so that the public can reuse the
31
+     material as expected. Licensors should clearly mark any
32
+     material not subject to the license. This includes other CC-
33
+     licensed material, or material used under an exception or
34
+     limitation to copyright. More considerations for licensors:
35
+	wiki.creativecommons.org/Considerations_for_licensors
36
+
37
+     Considerations for the public: By using one of our public
38
+     licenses, a licensor grants the public permission to use the
39
+     licensed material under specified terms and conditions. If
40
+     the licensor's permission is not necessary for any reason--for
41
+     example, because of any applicable exception or limitation to
42
+     copyright--then that use is not regulated by the license. Our
43
+     licenses grant only permissions under copyright and certain
44
+     other rights that a licensor has authority to grant. Use of
45
+     the licensed material may still be restricted for other
46
+     reasons, including because others have copyright or other
47
+     rights in the material. A licensor may make special requests,
48
+     such as asking that all changes be marked or described.
49
+     Although not required by our licenses, you are encouraged to
50
+     respect those requests where reasonable. More_considerations
51
+     for the public:
52
+	wiki.creativecommons.org/Considerations_for_licensees
53
+
54
+=======================================================================
55
+
56
+Creative Commons Attribution-ShareAlike 4.0 International Public
57
+License
58
+
59
+By exercising the Licensed Rights (defined below), You accept and agree
60
+to be bound by the terms and conditions of this Creative Commons
61
+Attribution-ShareAlike 4.0 International Public License ("Public
62
+License"). To the extent this Public License may be interpreted as a
63
+contract, You are granted the Licensed Rights in consideration of Your
64
+acceptance of these terms and conditions, and the Licensor grants You
65
+such rights in consideration of benefits the Licensor receives from
66
+making the Licensed Material available under these terms and
67
+conditions.
68
+
69
+
70
+Section 1 -- Definitions.
71
+
72
+  a. Adapted Material means material subject to Copyright and Similar
73
+     Rights that is derived from or based upon the Licensed Material
74
+     and in which the Licensed Material is translated, altered,
75
+     arranged, transformed, or otherwise modified in a manner requiring
76
+     permission under the Copyright and Similar Rights held by the
77
+     Licensor. For purposes of this Public License, where the Licensed
78
+     Material is a musical work, performance, or sound recording,
79
+     Adapted Material is always produced where the Licensed Material is
80
+     synched in timed relation with a moving image.
81
+
82
+  b. Adapter's License means the license You apply to Your Copyright
83
+     and Similar Rights in Your contributions to Adapted Material in
84
+     accordance with the terms and conditions of this Public License.
85
+
86
+  c. BY-SA Compatible License means a license listed at
87
+     creativecommons.org/compatiblelicenses, approved by Creative
88
+     Commons as essentially the equivalent of this Public License.
89
+
90
+  d. Copyright and Similar Rights means copyright and/or similar rights
91
+     closely related to copyright including, without limitation,
92
+     performance, broadcast, sound recording, and Sui Generis Database
93
+     Rights, without regard to how the rights are labeled or
94
+     categorized. For purposes of this Public License, the rights
95
+     specified in Section 2(b)(1)-(2) are not Copyright and Similar
96
+     Rights.
97
+
98
+  e. Effective Technological Measures means those measures that, in the
99
+     absence of proper authority, may not be circumvented under laws
100
+     fulfilling obligations under Article 11 of the WIPO Copyright
101
+     Treaty adopted on December 20, 1996, and/or similar international
102
+     agreements.
103
+
104
+  f. Exceptions and Limitations means fair use, fair dealing, and/or
105
+     any other exception or limitation to Copyright and Similar Rights
106
+     that applies to Your use of the Licensed Material.
107
+
108
+  g. License Elements means the license attributes listed in the name
109
+     of a Creative Commons Public License. The License Elements of this
110
+     Public License are Attribution and ShareAlike.
111
+
112
+  h. Licensed Material means the artistic or literary work, database,
113
+     or other material to which the Licensor applied this Public
114
+     License.
115
+
116
+  i. Licensed Rights means the rights granted to You subject to the
117
+     terms and conditions of this Public License, which are limited to
118
+     all Copyright and Similar Rights that apply to Your use of the
119
+     Licensed Material and that the Licensor has authority to license.
120
+
121
+  j. Licensor means the individual(s) or entity(ies) granting rights
122
+     under this Public License.
123
+
124
+  k. Share means to provide material to the public by any means or
125
+     process that requires permission under the Licensed Rights, such
126
+     as reproduction, public display, public performance, distribution,
127
+     dissemination, communication, or importation, and to make material
128
+     available to the public including in ways that members of the
129
+     public may access the material from a place and at a time
130
+     individually chosen by them.
131
+
132
+  l. Sui Generis Database Rights means rights other than copyright
133
+     resulting from Directive 96/9/EC of the European Parliament and of
134
+     the Council of 11 March 1996 on the legal protection of databases,
135
+     as amended and/or succeeded, as well as other essentially
136
+     equivalent rights anywhere in the world.
137
+
138
+  m. You means the individual or entity exercising the Licensed Rights
139
+     under this Public License. Your has a corresponding meaning.
140
+
141
+
142
+Section 2 -- Scope.
143
+
144
+  a. License grant.
145
+
146
+       1. Subject to the terms and conditions of this Public License,
147
+          the Licensor hereby grants You a worldwide, royalty-free,
148
+          non-sublicensable, non-exclusive, irrevocable license to
149
+          exercise the Licensed Rights in the Licensed Material to:
150
+
151
+            a. reproduce and Share the Licensed Material, in whole or
152
+               in part; and
153
+
154
+            b. produce, reproduce, and Share Adapted Material.
155
+
156
+       2. Exceptions and Limitations. For the avoidance of doubt, where
157
+          Exceptions and Limitations apply to Your use, this Public
158
+          License does not apply, and You do not need to comply with
159
+          its terms and conditions.
160
+
161
+       3. Term. The term of this Public License is specified in Section
162
+          6(a).
163
+
164
+       4. Media and formats; technical modifications allowed. The
165
+          Licensor authorizes You to exercise the Licensed Rights in
166
+          all media and formats whether now known or hereafter created,
167
+          and to make technical modifications necessary to do so. The
168
+          Licensor waives and/or agrees not to assert any right or
169
+          authority to forbid You from making technical modifications
170
+          necessary to exercise the Licensed Rights, including
171
+          technical modifications necessary to circumvent Effective
172
+          Technological Measures. For purposes of this Public License,
173
+          simply making modifications authorized by this Section 2(a)
174
+          (4) never produces Adapted Material.
175
+
176
+       5. Downstream recipients.
177
+
178
+            a. Offer from the Licensor -- Licensed Material. Every
179
+               recipient of the Licensed Material automatically
180
+               receives an offer from the Licensor to exercise the
181
+               Licensed Rights under the terms and conditions of this
182
+               Public License.
183
+
184
+            b. Additional offer from the Licensor -- Adapted Material.
185
+               Every recipient of Adapted Material from You
186
+               automatically receives an offer from the Licensor to
187
+               exercise the Licensed Rights in the Adapted Material
188
+               under the conditions of the Adapter's License You apply.
189
+
190
+            c. No downstream restrictions. You may not offer or impose
191
+               any additional or different terms or conditions on, or
192
+               apply any Effective Technological Measures to, the
193
+               Licensed Material if doing so restricts exercise of the
194
+               Licensed Rights by any recipient of the Licensed
195
+               Material.
196
+
197
+       6. No endorsement. Nothing in this Public License constitutes or
198
+          may be construed as permission to assert or imply that You
199
+          are, or that Your use of the Licensed Material is, connected
200
+          with, or sponsored, endorsed, or granted official status by,
201
+          the Licensor or others designated to receive attribution as
202
+          provided in Section 3(a)(1)(A)(i).
203
+
204
+  b. Other rights.
205
+
206
+       1. Moral rights, such as the right of integrity, are not
207
+          licensed under this Public License, nor are publicity,
208
+          privacy, and/or other similar personality rights; however, to
209
+          the extent possible, the Licensor waives and/or agrees not to
210
+          assert any such rights held by the Licensor to the limited
211
+          extent necessary to allow You to exercise the Licensed
212
+          Rights, but not otherwise.
213
+
214
+       2. Patent and trademark rights are not licensed under this
215
+          Public License.
216
+
217
+       3. To the extent possible, the Licensor waives any right to
218
+          collect royalties from You for the exercise of the Licensed
219
+          Rights, whether directly or through a collecting society
220
+          under any voluntary or waivable statutory or compulsory
221
+          licensing scheme. In all other cases the Licensor expressly
222
+          reserves any right to collect such royalties.
223
+
224
+
225
+Section 3 -- License Conditions.
226
+
227
+Your exercise of the Licensed Rights is expressly made subject to the
228
+following conditions.
229
+
230
+  a. Attribution.
231
+
232
+       1. If You Share the Licensed Material (including in modified
233
+          form), You must:
234
+
235
+            a. retain the following if it is supplied by the Licensor
236
+               with the Licensed Material:
237
+
238
+                 i. identification of the creator(s) of the Licensed
239
+                    Material and any others designated to receive
240
+                    attribution, in any reasonable manner requested by
241
+                    the Licensor (including by pseudonym if
242
+                    designated);
243
+
244
+                ii. a copyright notice;
245
+
246
+               iii. a notice that refers to this Public License;
247
+
248
+                iv. a notice that refers to the disclaimer of
249
+                    warranties;
250
+
251
+                 v. a URI or hyperlink to the Licensed Material to the
252
+                    extent reasonably practicable;
253
+
254
+            b. indicate if You modified the Licensed Material and
255
+               retain an indication of any previous modifications; and
256
+
257
+            c. indicate the Licensed Material is licensed under this
258
+               Public License, and include the text of, or the URI or
259
+               hyperlink to, this Public License.
260
+
261
+       2. You may satisfy the conditions in Section 3(a)(1) in any
262
+          reasonable manner based on the medium, means, and context in
263
+          which You Share the Licensed Material. For example, it may be
264
+          reasonable to satisfy the conditions by providing a URI or
265
+          hyperlink to a resource that includes the required
266
+          information.
267
+
268
+       3. If requested by the Licensor, You must remove any of the
269
+          information required by Section 3(a)(1)(A) to the extent
270
+          reasonably practicable.
271
+
272
+  b. ShareAlike.
273
+
274
+     In addition to the conditions in Section 3(a), if You Share
275
+     Adapted Material You produce, the following conditions also apply.
276
+
277
+       1. The Adapter's License You apply must be a Creative Commons
278
+          license with the same License Elements, this version or
279
+          later, or a BY-SA Compatible License.
280
+
281
+       2. You must include the text of, or the URI or hyperlink to, the
282
+          Adapter's License You apply. You may satisfy this condition
283
+          in any reasonable manner based on the medium, means, and
284
+          context in which You Share Adapted Material.
285
+
286
+       3. You may not offer or impose any additional or different terms
287
+          or conditions on, or apply any Effective Technological
288
+          Measures to, Adapted Material that restrict exercise of the
289
+          rights granted under the Adapter's License You apply.
290
+
291
+
292
+Section 4 -- Sui Generis Database Rights.
293
+
294
+Where the Licensed Rights include Sui Generis Database Rights that
295
+apply to Your use of the Licensed Material:
296
+
297
+  a. for the avoidance of doubt, Section 2(a)(1) grants You the right
298
+     to extract, reuse, reproduce, and Share all or a substantial
299
+     portion of the contents of the database;
300
+
301
+  b. if You include all or a substantial portion of the database
302
+     contents in a database in which You have Sui Generis Database
303
+     Rights, then the database in which You have Sui Generis Database
304
+     Rights (but not its individual contents) is Adapted Material,
305
+
306
+     including for purposes of Section 3(b); and
307
+  c. You must comply with the conditions in Section 3(a) if You Share
308
+     all or a substantial portion of the contents of the database.
309
+
310
+For the avoidance of doubt, this Section 4 supplements and does not
311
+replace Your obligations under this Public License where the Licensed
312
+Rights include other Copyright and Similar Rights.
313
+
314
+
315
+Section 5 -- Disclaimer of Warranties and Limitation of Liability.
316
+
317
+  a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
318
+     EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
319
+     AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
320
+     ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
321
+     IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
322
+     WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
323
+     PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
324
+     ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
325
+     KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
326
+     ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
327
+
328
+  b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
329
+     TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
330
+     NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
331
+     INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
332
+     COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
333
+     USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
334
+     ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
335
+     DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
336
+     IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
337
+
338
+  c. The disclaimer of warranties and limitation of liability provided
339
+     above shall be interpreted in a manner that, to the extent
340
+     possible, most closely approximates an absolute disclaimer and
341
+     waiver of all liability.
342
+
343
+
344
+Section 6 -- Term and Termination.
345
+
346
+  a. This Public License applies for the term of the Copyright and
347
+     Similar Rights licensed here. However, if You fail to comply with
348
+     this Public License, then Your rights under this Public License
349
+     terminate automatically.
350
+
351
+  b. Where Your right to use the Licensed Material has terminated under
352
+     Section 6(a), it reinstates:
353
+
354
+       1. automatically as of the date the violation is cured, provided
355
+          it is cured within 30 days of Your discovery of the
356
+          violation; or
357
+
358
+       2. upon express reinstatement by the Licensor.
359
+
360
+     For the avoidance of doubt, this Section 6(b) does not affect any
361
+     right the Licensor may have to seek remedies for Your violations
362
+     of this Public License.
363
+
364
+  c. For the avoidance of doubt, the Licensor may also offer the
365
+     Licensed Material under separate terms or conditions or stop
366
+     distributing the Licensed Material at any time; however, doing so
367
+     will not terminate this Public License.
368
+
369
+  d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
370
+     License.
371
+
372
+
373
+Section 7 -- Other Terms and Conditions.
374
+
375
+  a. The Licensor shall not be bound by any additional or different
376
+     terms or conditions communicated by You unless expressly agreed.
377
+
378
+  b. Any arrangements, understandings, or agreements regarding the
379
+     Licensed Material not stated herein are separate from and
380
+     independent of the terms and conditions of this Public License.
381
+
382
+
383
+Section 8 -- Interpretation.
384
+
385
+  a. For the avoidance of doubt, this Public License does not, and
386
+     shall not be interpreted to, reduce, limit, restrict, or impose
387
+     conditions on any use of the Licensed Material that could lawfully
388
+     be made without permission under this Public License.
389
+
390
+  b. To the extent possible, if any provision of this Public License is
391
+     deemed unenforceable, it shall be automatically reformed to the
392
+     minimum extent necessary to make it enforceable. If the provision
393
+     cannot be reformed, it shall be severed from this Public License
394
+     without affecting the enforceability of the remaining terms and
395
+     conditions.
396
+
397
+  c. No term or condition of this Public License will be waived and no
398
+     failure to comply consented to unless expressly agreed to by the
399
+     Licensor.
400
+
401
+  d. Nothing in this Public License constitutes or may be interpreted
402
+     as a limitation upon, or waiver of, any privileges and immunities
403
+     that apply to the Licensor or You, including from the legal
404
+     processes of any jurisdiction or authority.
405
+
406
+
407
+=======================================================================
408
+
409
+Creative Commons is not a party to its public licenses.
410
+Notwithstanding, Creative Commons may elect to apply one of its public
411
+licenses to material it publishes and in those instances will be
412
+considered the "Licensor." Except for the limited purpose of indicating
413
+that material is shared under a Creative Commons public license or as
414
+otherwise permitted by the Creative Commons policies published at
415
+creativecommons.org/policies, Creative Commons does not authorize the
416
+use of the trademark "Creative Commons" or any other trademark or logo
417
+of Creative Commons without its prior written consent including,
418
+without limitation, in connection with any unauthorized modifications
419
+to any of its public licenses or any other arrangements,
420
+understandings, or agreements concerning use of licensed material. For
421
+the avoidance of doubt, this paragraph does not form part of the public
422
+licenses.
423
+
424
+Creative Commons may be contacted at creativecommons.org.
... ...
@@ -68,6 +68,10 @@ func main() {
68 68
 
69 69
 You can find other usage examples for `libkv` under the `docker/swarm` or `docker/libnetwork` repositories.
70 70
 
71
+## TLS
72
+
73
+The etcd backend supports etcd servers that require TLS Client Authentication.  Zookeeper and Consul support are planned.  This feature is somewhat experimental and the store.ClientTLSConfig struct may change to accommodate the additional backends.
74
+
71 75
 ## Warning
72 76
 
73 77
 There are a few consistency issues with *etcd*, on the notion of *directory* and *key*. If you want to use the three KV backends in an interchangeable way, you should only put data on leaves (see [Issue 20](https://github.com/docker/libkv/issues/20) for more details). This will be fixed when *etcd* API v3 is made available (API v3 drops the *directory/key* distinction). An official release for *libkv* with a tag is likely to come after this issue is marked as **solved**.
... ...
@@ -113,4 +117,4 @@ Want to hack on libkv? [Docker's contributions guidelines](https://github.com/do
113 113
 
114 114
 ## Copyright and license
115 115
 
116
-Code and documentation copyright 2015 Docker, inc. Code released under the Apache 2.0 license. Docs released under Creative commons.
116
+Copyright © 2014-2015 Docker, Inc. All rights reserved, except as follows. Code is released under the Apache 2.0 license. Documentation is licensed to end users under the Creative Commons Attribution-ShareAlike 4.0 International License under the terms and conditions set forth in the file "LICENSE.docs". You may obtain a duplicate copy of the same license, titled CC-BY-SA-4.0, at http://creativecommons.org/licenses/by-sa/4.0/.
117 117
new file mode 100644
... ...
@@ -0,0 +1,327 @@
0
+package boltdb
1
+
2
+import (
3
+	"bytes"
4
+	"encoding/binary"
5
+	"errors"
6
+	"os"
7
+	"path/filepath"
8
+	"sync/atomic"
9
+
10
+	"github.com/boltdb/bolt"
11
+	"github.com/docker/libkv"
12
+	"github.com/docker/libkv/store"
13
+)
14
+
15
+var (
16
+	// ErrMultipleEndpointsUnsupported is thrown when multiple endpoints specified for
17
+	// BoltDB. Endpoint has to be a local file path
18
+	ErrMultipleEndpointsUnsupported = errors.New("boltdb supports one endpoint and should be a file path")
19
+	// ErrBoltBucketNotFound is thrown when the specified BoltDB bucket doesn't exist in the DB
20
+	ErrBoltBucketNotFound = errors.New("boltdb bucket doesn't exist")
21
+	// ErrBoltBucketOptionMissing is thrown when the boltBucket config option is missing
22
+	ErrBoltBucketOptionMissing = errors.New("boltBucket config option missing")
23
+	// ErrBoltAPIUnsupported is thrown when an API unsupported by the BoltDB backend is called
24
+	ErrBoltAPIUnsupported = errors.New("API not supported by BoltDB backend")
25
+)
26
+
27
+//BoltDB type implements the Store interface
28
+type BoltDB struct {
29
+	client     *bolt.DB
30
+	boltBucket []byte
31
+	dbIndex    uint64
32
+}
33
+
34
+const (
35
+	libkvmetadatalen = 8
36
+)
37
+
38
+// Register registers boltdb to libkv
39
+func Register() {
40
+	libkv.AddStore(store.BOLTDB, New)
41
+}
42
+
43
+// New opens a new BoltDB connection to the specified path and bucket
44
+func New(endpoints []string, options *store.Config) (store.Store, error) {
45
+	if len(endpoints) > 1 {
46
+		return nil, ErrMultipleEndpointsUnsupported
47
+	}
48
+
49
+	if (options == nil) || (len(options.Bucket) == 0) {
50
+		return nil, ErrBoltBucketOptionMissing
51
+	}
52
+
53
+	dir, _ := filepath.Split(endpoints[0])
54
+	if err := os.MkdirAll(dir, 0750); err != nil {
55
+		return nil, err
56
+	}
57
+
58
+	var boltOptions *bolt.Options
59
+	if options != nil {
60
+		boltOptions = &bolt.Options{Timeout: options.ConnectionTimeout}
61
+	}
62
+	db, err := bolt.Open(endpoints[0], 0644, boltOptions)
63
+	if err != nil {
64
+		return nil, err
65
+	}
66
+
67
+	b := &BoltDB{}
68
+
69
+	b.client = db
70
+	b.boltBucket = []byte(options.Bucket)
71
+	return b, nil
72
+}
73
+
74
+// Get the value at "key". BoltDB doesn't provide an inbuilt last modified index with every kv pair. It's implemented
75
+// by an atomic counter maintained by libkv and prepended to the value passed by the client.
76
+func (b *BoltDB) Get(key string) (*store.KVPair, error) {
77
+	var val []byte
78
+
79
+	db := b.client
80
+	err := db.View(func(tx *bolt.Tx) error {
81
+		bucket := tx.Bucket(b.boltBucket)
82
+		if bucket == nil {
83
+			return (ErrBoltBucketNotFound)
84
+		}
85
+
86
+		val = bucket.Get([]byte(key))
87
+
88
+		return nil
89
+	})
90
+
91
+	if len(val) == 0 {
92
+		return nil, store.ErrKeyNotFound
93
+	}
94
+	if err != nil {
95
+		return nil, err
96
+	}
97
+
98
+	dbIndex := binary.LittleEndian.Uint64(val[:libkvmetadatalen])
99
+	val = val[libkvmetadatalen:]
100
+
101
+	return &store.KVPair{Key: key, Value: val, LastIndex: (dbIndex)}, nil
102
+}
103
+
104
+//Put the key, value pair. index number metadata is prepended to the value
105
+func (b *BoltDB) Put(key string, value []byte, opts *store.WriteOptions) error {
106
+	var dbIndex uint64
107
+	db := b.client
108
+	dbval := make([]byte, libkvmetadatalen)
109
+
110
+	err := db.Update(func(tx *bolt.Tx) error {
111
+		bucket, err := tx.CreateBucketIfNotExists(b.boltBucket)
112
+		if err != nil {
113
+			return err
114
+		}
115
+
116
+		dbIndex = atomic.AddUint64(&b.dbIndex, 1)
117
+		binary.LittleEndian.PutUint64(dbval, dbIndex)
118
+		dbval = append(dbval, value...)
119
+
120
+		err = bucket.Put([]byte(key), dbval)
121
+		if err != nil {
122
+			return err
123
+		}
124
+		return nil
125
+	})
126
+	return err
127
+}
128
+
129
+//Delete the value for the given key.
130
+func (b *BoltDB) Delete(key string) error {
131
+	db := b.client
132
+
133
+	err := db.Update(func(tx *bolt.Tx) error {
134
+		bucket := tx.Bucket(b.boltBucket)
135
+		if bucket == nil {
136
+			return (ErrBoltBucketNotFound)
137
+		}
138
+		err := bucket.Delete([]byte(key))
139
+		return err
140
+	})
141
+	return err
142
+}
143
+
144
+// Exists checks if the key exists inside the store
145
+func (b *BoltDB) Exists(key string) (bool, error) {
146
+	var val []byte
147
+
148
+	db := b.client
149
+	err := db.View(func(tx *bolt.Tx) error {
150
+		bucket := tx.Bucket(b.boltBucket)
151
+		if bucket == nil {
152
+			return (ErrBoltBucketNotFound)
153
+		}
154
+
155
+		val = bucket.Get([]byte(key))
156
+
157
+		return nil
158
+	})
159
+
160
+	if len(val) == 0 {
161
+		return false, err
162
+	}
163
+	return true, err
164
+}
165
+
166
+// List returns the range of keys starting with the passed in prefix
167
+func (b *BoltDB) List(keyPrefix string) ([]*store.KVPair, error) {
168
+	kv := []*store.KVPair{}
169
+
170
+	db := b.client
171
+	err := db.View(func(tx *bolt.Tx) error {
172
+		bucket := tx.Bucket(b.boltBucket)
173
+		if bucket == nil {
174
+			return (ErrBoltBucketNotFound)
175
+		}
176
+
177
+		cursor := bucket.Cursor()
178
+		prefix := []byte(keyPrefix)
179
+
180
+		for key, val := cursor.Seek(prefix); bytes.HasPrefix(key, prefix); key, val = cursor.Next() {
181
+
182
+			dbIndex := binary.LittleEndian.Uint64(val[:libkvmetadatalen])
183
+			val = val[libkvmetadatalen:]
184
+
185
+			kv = append(kv, &store.KVPair{
186
+				Key:       string(key),
187
+				Value:     val,
188
+				LastIndex: dbIndex,
189
+			})
190
+		}
191
+		return nil
192
+	})
193
+	if len(kv) == 0 {
194
+		return nil, store.ErrKeyNotFound
195
+	}
196
+	return kv, err
197
+}
198
+
199
+// AtomicDelete deletes the value at "key" only if the key
200
+// has not been modified in the meantime; it returns
201
+// store.ErrKeyModified if the key was modified
202
+func (b *BoltDB) AtomicDelete(key string, previous *store.KVPair) (bool, error) {
203
+	var val []byte
204
+	var dbIndex uint64
205
+
206
+	if previous == nil {
207
+		return false, store.ErrPreviousNotSpecified
208
+	}
209
+	db := b.client
210
+
211
+	err := db.Update(func(tx *bolt.Tx) error {
212
+		bucket := tx.Bucket(b.boltBucket)
213
+		if bucket == nil {
214
+			return ErrBoltBucketNotFound
215
+		}
216
+
217
+		val = bucket.Get([]byte(key))
218
+		dbIndex = binary.LittleEndian.Uint64(val[:libkvmetadatalen])
219
+		if dbIndex != previous.LastIndex {
220
+			return store.ErrKeyModified
221
+		}
222
+		err := bucket.Delete([]byte(key))
223
+		return err
224
+	})
225
+	if err != nil {
226
+		return false, err
227
+	}
228
+	return true, err
229
+}
230
+
231
+// AtomicPut puts a value at "key" if the key has not been
232
+// modified since the last Put; it returns store.ErrKeyModified if the key was modified
233
+func (b *BoltDB) AtomicPut(key string, value []byte, previous *store.KVPair, options *store.WriteOptions) (bool, *store.KVPair, error) {
234
+	var val []byte
235
+	var dbIndex uint64
236
+	dbval := make([]byte, libkvmetadatalen)
237
+
238
+	db := b.client
239
+
240
+	err := db.Update(func(tx *bolt.Tx) error {
241
+		var err error
242
+		bucket := tx.Bucket(b.boltBucket)
243
+		if bucket == nil {
244
+			if previous != nil {
245
+				return ErrBoltBucketNotFound
246
+			}
247
+			bucket, err = tx.CreateBucket(b.boltBucket)
248
+			if err != nil {
249
+				return err
250
+			}
251
+		}
252
+		// AtomicPut is equivalent to Put if previous is nil and the Key
253
+		// doesn't exist in the DB.
254
+		val = bucket.Get([]byte(key))
255
+		if previous == nil && len(val) != 0 {
256
+			return store.ErrKeyModified
257
+		}
258
+		if previous != nil {
259
+			if len(val) == 0 {
260
+				return store.ErrKeyNotFound
261
+			}
262
+			dbIndex = binary.LittleEndian.Uint64(val[:libkvmetadatalen])
263
+			if dbIndex != previous.LastIndex {
264
+				return store.ErrKeyModified
265
+			}
266
+		}
267
+		dbIndex = atomic.AddUint64(&b.dbIndex, 1)
268
+		binary.LittleEndian.PutUint64(dbval, b.dbIndex)
269
+		dbval = append(dbval, value...)
270
+		return (bucket.Put([]byte(key), dbval))
271
+	})
272
+	if err != nil {
273
+		return false, nil, err
274
+	}
275
+
276
+	updated := &store.KVPair{
277
+		Key:       key,
278
+		Value:     value,
279
+		LastIndex: dbIndex,
280
+	}
281
+
282
+	return true, updated, nil
283
+}
284
+
285
+// Close the db connection to the BoltDB
286
+func (b *BoltDB) Close() {
287
+	db := b.client
288
+
289
+	db.Close()
290
+}
291
+
292
+// DeleteTree deletes a range of keys with a given prefix
293
+func (b *BoltDB) DeleteTree(keyPrefix string) error {
294
+	db := b.client
295
+	err := db.Update(func(tx *bolt.Tx) error {
296
+		bucket := tx.Bucket(b.boltBucket)
297
+		if bucket == nil {
298
+			return (ErrBoltBucketNotFound)
299
+		}
300
+
301
+		cursor := bucket.Cursor()
302
+		prefix := []byte(keyPrefix)
303
+
304
+		for key, _ := cursor.Seek(prefix); bytes.HasPrefix(key, prefix); key, _ = cursor.Next() {
305
+			_ = bucket.Delete([]byte(key))
306
+		}
307
+		return nil
308
+	})
309
+
310
+	return err
311
+}
312
+
313
+// NewLock has to be implemented at the library level since it's not supported by BoltDB
314
+func (b *BoltDB) NewLock(key string, options *store.LockOptions) (store.Locker, error) {
315
+	return nil, ErrBoltAPIUnsupported
316
+}
317
+
318
+// Watch has to be implemented at the library level since it's not supported by BoltDB
319
+func (b *BoltDB) Watch(key string, stopCh <-chan struct{}) (<-chan *store.KVPair, error) {
320
+	return nil, ErrBoltAPIUnsupported
321
+}
322
+
323
+// WatchTree has to be implemented at the library level since it's not supported by BoltDB
324
+func (b *BoltDB) WatchTree(directory string, stopCh <-chan struct{}) (<-chan []*store.KVPair, error) {
325
+	return nil, ErrBoltAPIUnsupported
326
+}
... ...
@@ -43,13 +43,28 @@ func Register() {
43 43
 func New(addrs []string, options *store.Config) (store.Store, error) {
44 44
 	s := &Etcd{}
45 45
 
46
-	entries := store.CreateEndpoints(addrs, "http")
47
-	s.client = etcd.NewClient(entries)
46
+	var (
47
+		entries []string
48
+		err     error
49
+	)
50
+
51
+	// Create the etcd client
52
+	if options != nil && options.ClientTLS != nil {
53
+		entries = store.CreateEndpoints(addrs, "https")
54
+		s.client, err = etcd.NewTLSClient(entries, options.ClientTLS.CertFile, options.ClientTLS.KeyFile, options.ClientTLS.CACertFile)
55
+		if err != nil {
56
+			return nil, err
57
+		}
58
+	} else {
59
+		entries = store.CreateEndpoints(addrs, "http")
60
+		s.client = etcd.NewClient(entries)
61
+	}
48 62
 
49 63
 	// Set options
50 64
 	if options != nil {
65
+		// Plain TLS config overrides ClientTLS if specified
51 66
 		if options.TLS != nil {
52
-			s.setTLS(options.TLS)
67
+			s.setTLS(options.TLS, addrs)
53 68
 		}
54 69
 		if options.ConnectionTimeout != 0 {
55 70
 			s.setTimeout(options.ConnectionTimeout)
... ...
@@ -67,16 +82,10 @@ func New(addrs []string, options *store.Config) (store.Store, error) {
67 67
 	return s, nil
68 68
 }
69 69
 
70
-// SetTLS sets the tls configuration given the path
71
-// of certificate files
72
-func (s *Etcd) setTLS(tls *tls.Config) {
73
-	// Change to https scheme
74
-	var addrs []string
75
-	entries := s.client.GetCluster()
76
-	for _, entry := range entries {
77
-		addrs = append(addrs, strings.Replace(entry, "http", "https", -1))
78
-	}
79
-	s.client.SetCluster(addrs)
70
+// SetTLS sets the tls configuration given a tls.Config scheme
71
+func (s *Etcd) setTLS(tls *tls.Config, addrs []string) {
72
+	entries := store.CreateEndpoints(addrs, "https")
73
+	s.client.SetCluster(entries)
80 74
 
81 75
 	// Set transport
82 76
 	t := http.Transport{
... ...
@@ -39,11 +39,20 @@ var (
39 39
 
40 40
 // Config contains the options for a storage client
41 41
 type Config struct {
42
+	ClientTLS         *ClientTLSConfig
42 43
 	TLS               *tls.Config
43 44
 	ConnectionTimeout time.Duration
44 45
 	Bucket            string
45 46
 }
46 47
 
48
+// ClientTLSConfig contains data for a Client TLS configuration in the form
49
+//  the etcd client wants it.  Eventually we'll adapt it for ZK and Consul.
50
+type ClientTLSConfig struct {
51
+	CertFile   string
52
+	KeyFile    string
53
+	CACertFile string
54
+}
55
+
47 56
 // Store represents the backend K/V storage
48 57
 // Each store should support every call listed
49 58
 // here. Or it couldn't be implemented as a K/V
... ...
@@ -10,15 +10,10 @@ cidocker = docker run ${ciargs} ${dockerargs} golang:1.4
10 10
 all: ${build_image}.created build check integration-tests clean
11 11
 
12 12
 integration-tests: ./cmd/dnet/dnet
13
-	@if [ ! -d ./integration-tmp ]; then \
14
-	    mkdir -p ./integration-tmp;	\
15
-	    git clone https://github.com/sstephenson/bats.git ./integration-tmp/bats; \
16
-	    ./integration-tmp/bats/install.sh ./integration-tmp; \
17
-	fi
18
-	@./integration-tmp/bin/bats ./test/integration/dnet
13
+	@./test/integration/dnet/run-integration-tests.sh
19 14
 
20 15
 ./cmd/dnet/dnet:
21
-	make build-local
16
+	make build
22 17
 
23 18
 clean:
24 19
 	@if [ -e ./cmd/dnet/dnet ]; then \
... ...
@@ -76,7 +71,7 @@ run-tests:
76 76
 	done
77 77
 	@echo "Done running tests"
78 78
 
79
-check-local: 	check-format check-code start-services run-tests
79
+check-local:	check-format check-code start-services run-tests
80 80
 
81 81
 install-deps:
82 82
 	apt-get update && apt-get -y install iptables zookeeperd
... ...
@@ -17,61 +17,55 @@ There are many networking solutions available to suit a broad range of use-cases
17 17
 
18 18
 
19 19
 ```go
20
-        // Create a new controller instance
21
-        controller, err := libnetwork.New()
22
-        if err != nil {
23
-                return
24
-        }
25
-
26
-        // Select and configure the network driver
27
-        networkType := "bridge"
28
-
29
-        driverOptions := options.Generic{}
30
-        genericOption := make(map[string]interface{})
31
-        genericOption[netlabel.GenericData] = driverOptions
32
-        err = controller.ConfigureNetworkDriver(networkType, genericOption)
33
-        if err != nil {
34
-                return
35
-        }
36
-
37
-        // Create a network for containers to join.
38
-        // NewNetwork accepts Variadic optional arguments that libnetwork and Drivers can use.
39
-        network, err := controller.NewNetwork(networkType, "network1")
40
-        if err != nil {
41
-                return
42
-        }
43
-
44
-        // For each new container: allocate IP and interfaces. The returned network
45
-        // settings will be used for container infos (inspect and such), as well as
46
-        // iptables rules for port publishing. This info is contained or accessible
47
-        // from the returned endpoint.
48
-        ep, err := network.CreateEndpoint("Endpoint1")
49
-        if err != nil {
50
-                return
51
-        }
52
-
53
-        // Create the sandbox for the containr.
54
-        sbx, err := controller.NewSandbox("container1",
55
-        libnetwork.OptionHostname("test"),
56
-        libnetwork.OptionDomainname("docker.io"))
57
-		
58
-        // A sandbox can join the endpoint via the join api.
59
-        // Join accepts Variadic arguments which libnetwork and Drivers can use.
60
-        err = ep.Join(sbx)
61
-        if err != nil {
62
-                return
63
-        }
64
-
65
-		// libnetwork client can check the endpoint's operational data via the Info() API
66
-		epInfo, err := ep.DriverInfo()
67
-		mapData, ok := epInfo[netlabel.PortMap]
20
+	// Select and configure the network driver
21
+	networkType := "bridge"
22
+
23
+	// Create a new controller instance
24
+	driverOptions := options.Generic{}
25
+	genericOption := make(map[string]interface{})
26
+	genericOption[netlabel.GenericData] = driverOptions
27
+	controller, err := libnetwork.New(config.OptionDriverConfig(networkType, genericOption))
28
+	if err != nil {
29
+		return
30
+	}
31
+
32
+	// Create a network for containers to join.
33
+	// NewNetwork accepts Variadic optional arguments that libnetwork and Drivers can use.
34
+	network, err := controller.NewNetwork(networkType, "network1")
35
+	if err != nil {
36
+		return
37
+	}
38
+
39
+	// For each new container: allocate IP and interfaces. The returned network
40
+	// settings will be used for container infos (inspect and such), as well as
41
+	// iptables rules for port publishing. This info is contained or accessible
42
+	// from the returned endpoint.
43
+	ep, err := network.CreateEndpoint("Endpoint1")
44
+	if err != nil {
45
+		return
46
+	}
47
+
48
+	// Create the sandbox for the container.
49
+	// NewSandbox accepts Variadic optional arguments which libnetwork can use.
50
+	sbx, err := controller.NewSandbox("container1",
51
+		libnetwork.OptionHostname("test"),
52
+		libnetwork.OptionDomainname("docker.io"))
53
+
54
+	// A sandbox can join the endpoint via the join api.
55
+	err = ep.Join(sbx)
56
+	if err != nil {
57
+		return
58
+	}
59
+
60
+	// libnetwork client can check the endpoint's operational data via the DriverInfo() API
61
+	epInfo, err := ep.DriverInfo()
62
+	mapData, ok := epInfo[netlabel.PortMap]
63
+	if ok {
64
+		portMapping, ok := mapData.([]types.PortBinding)
68 65
 		if ok {
69
-			portMapping, ok := mapData.([]types.PortBinding)
70
-			if ok {
71
-				fmt.Printf("Current port mapping for endpoint %s: %v", ep.Name(), portMapping)
72
-			}
66
+			fmt.Printf("Current port mapping for endpoint %s: %v", ep.Name(), portMapping)
73 67
 		}
74
-
68
+	}
75 69
 ```
76 70
 #### Current Status
77 71
 Please watch this space for updates on the progress.
... ...
@@ -87,4 +81,3 @@ Want to hack on libnetwork? [Docker's contributions guidelines](https://github.c
87 87
 
88 88
 ## Copyright and license
89 89
 Code and documentation copyright 2015 Docker, inc. Code released under the Apache 2.0 license. Docs released under Creative commons.
90
-
... ...
@@ -291,9 +291,6 @@ func processCreateDefaults(c libnetwork.NetworkController, nc *networkCreate) {
291 291
 		if _, ok := gData["BridgeName"]; !ok {
292 292
 			gData["BridgeName"] = nc.Name
293 293
 		}
294
-		if _, ok := gData["AllowNonDefaultBridge"]; !ok {
295
-			gData["AllowNonDefaultBridge"] = "true"
296
-		}
297 294
 		nc.Options[netlabel.GenericData] = genericData
298 295
 	}
299 296
 }
... ...
@@ -87,12 +87,12 @@ func (s *sequence) toString() string {
87 87
 }
88 88
 
89 89
 // GetAvailableBit returns the position of the first unset bit in the bitmask represented by this sequence
90
-func (s *sequence) getAvailableBit() (uint32, uint32, error) {
90
+func (s *sequence) getAvailableBit(from uint32) (uint32, uint32, error) {
91 91
 	if s.block == blockMAX || s.count == 0 {
92
-		return invalidPos, invalidPos, fmt.Errorf("no available bit")
92
+		return invalidPos, invalidPos, errNoBitAvailable
93 93
 	}
94
-	bits := uint32(0)
95
-	bitSel := blockFirstBit
94
+	bits := from
95
+	bitSel := blockFirstBit >> from
96 96
 	for bitSel > 0 && s.block&bitSel != 0 {
97 97
 		bitSel >>= 1
98 98
 		bits++
... ...
@@ -186,12 +186,23 @@ func (h *Handle) getCopy() *Handle {
186 186
 	}
187 187
 }
188 188
 
189
+// SetAnyInRange atomically sets the first unset bit in the specified range in the sequence and returns the corresponding ordinal
190
+func (h *Handle) SetAnyInRange(start, end uint32) (uint32, error) {
191
+	if end-start <= 0 || end >= h.bits {
192
+		return invalidPos, fmt.Errorf("invalid bit range [%d, %d]", start, end)
193
+	}
194
+	if h.Unselected() == 0 {
195
+		return invalidPos, errNoBitAvailable
196
+	}
197
+	return h.set(0, start, end, true, false)
198
+}
199
+
189 200
 // SetAny atomically sets the first unset bit in the sequence and returns the corresponding ordinal
190 201
 func (h *Handle) SetAny() (uint32, error) {
191 202
 	if h.Unselected() == 0 {
192 203
 		return invalidPos, errNoBitAvailable
193 204
 	}
194
-	return h.set(0, true, false)
205
+	return h.set(0, 0, h.bits-1, true, false)
195 206
 }
196 207
 
197 208
 // Set atomically sets the corresponding bit in the sequence
... ...
@@ -199,7 +210,7 @@ func (h *Handle) Set(ordinal uint32) error {
199 199
 	if err := h.validateOrdinal(ordinal); err != nil {
200 200
 		return err
201 201
 	}
202
-	_, err := h.set(ordinal, false, false)
202
+	_, err := h.set(ordinal, 0, 0, false, false)
203 203
 	return err
204 204
 }
205 205
 
... ...
@@ -208,7 +219,7 @@ func (h *Handle) Unset(ordinal uint32) error {
208 208
 	if err := h.validateOrdinal(ordinal); err != nil {
209 209
 		return err
210 210
 	}
211
-	_, err := h.set(ordinal, false, true)
211
+	_, err := h.set(ordinal, 0, 0, false, true)
212 212
 	return err
213 213
 }
214 214
 
... ...
@@ -225,7 +236,7 @@ func (h *Handle) IsSet(ordinal uint32) bool {
225 225
 }
226 226
 
227 227
 // set/reset the bit
228
-func (h *Handle) set(ordinal uint32, any bool, release bool) (uint32, error) {
228
+func (h *Handle) set(ordinal, start, end uint32, any bool, release bool) (uint32, error) {
229 229
 	var (
230 230
 		bitPos  uint32
231 231
 		bytePos uint32
... ...
@@ -240,8 +251,11 @@ func (h *Handle) set(ordinal uint32, any bool, release bool) (uint32, error) {
240 240
 			bytePos, bitPos = ordinalToPos(ordinal)
241 241
 		} else {
242 242
 			if any {
243
-				bytePos, bitPos, err = getFirstAvailable(h.head)
243
+				bytePos, bitPos, err = getFirstAvailable(h.head, start)
244 244
 				ret = posToOrdinal(bytePos, bitPos)
245
+				if end < ret {
246
+					err = errNoBitAvailable
247
+				}
245 248
 			} else {
246 249
 				bytePos, bitPos, err = checkIfAvailable(h.head, ordinal)
247 250
 				ret = ordinal
... ...
@@ -285,7 +299,7 @@ func (h *Handle) set(ordinal uint32, any bool, release bool) (uint32, error) {
285 285
 
286 286
 // checks is needed because to cover the case where the number of bits is not a multiple of blockLen
287 287
 func (h *Handle) validateOrdinal(ordinal uint32) error {
288
-	if ordinal > h.bits {
288
+	if ordinal >= h.bits {
289 289
 		return fmt.Errorf("bit does not belong to the sequence")
290 290
 	}
291 291
 	return nil
... ...
@@ -353,16 +367,24 @@ func (h *Handle) String() string {
353 353
 		h.app, h.id, h.dbIndex, h.bits, h.unselected, h.head.toString())
354 354
 }
355 355
 
356
-// getFirstAvailable looks for the first unset bit in passed mask
357
-func getFirstAvailable(head *sequence) (uint32, uint32, error) {
358
-	byteIndex := uint32(0)
359
-	current := head
356
+// getFirstAvailable looks for the first unset bit in passed mask starting from start
357
+func getFirstAvailable(head *sequence, start uint32) (uint32, uint32, error) {
358
+	// Find sequence which contains the start bit
359
+	byteStart, bitStart := ordinalToPos(start)
360
+	current, _, _, inBlockBytePos := findSequence(head, byteStart)
361
+
362
+	// Derive this sequence's offsets
363
+	byteOffset := byteStart - inBlockBytePos
364
+	bitOffset := inBlockBytePos*8 + bitStart
365
+
360 366
 	for current != nil {
361 367
 		if current.block != blockMAX {
362
-			bytePos, bitPos, err := current.getAvailableBit()
363
-			return byteIndex + bytePos, bitPos, err
368
+			bytePos, bitPos, err := current.getAvailableBit(bitOffset)
369
+			return byteOffset + bytePos, bitPos, err
364 370
 		}
365
-		byteIndex += current.count * blockBytes
371
+		// Moving to next block: Reset bit offset.
372
+		bitOffset = 0
373
+		byteOffset += current.count * blockBytes
366 374
 		current = current.next
367 375
 	}
368 376
 	return invalidPos, invalidPos, errNoBitAvailable
... ...
@@ -371,8 +393,7 @@ func getFirstAvailable(head *sequence) (uint32, uint32, error) {
371 371
 // checkIfAvailable checks if the bit correspondent to the specified ordinal is unset
372 372
 // If the ordinal is beyond the sequence limits, a negative response is returned
373 373
 func checkIfAvailable(head *sequence, ordinal uint32) (uint32, uint32, error) {
374
-	bytePos := ordinal / 8
375
-	bitPos := ordinal % 8
374
+	bytePos, bitPos := ordinalToPos(ordinal)
376 375
 
377 376
 	// Find the sequence containing this byte
378 377
 	current, _, _, inBlockBytePos := findSequence(head, bytePos)
... ...
@@ -70,6 +70,16 @@ func (h *Handle) Exists() bool {
70 70
 	return h.dbExists
71 71
 }
72 72
 
73
+// Skip provides a way for a KV Object to avoid persisting it in the KV Store
74
+func (h *Handle) Skip() bool {
75
+	return false
76
+}
77
+
78
+// DataScope method returns the storage scope of the datastore
79
+func (h *Handle) DataScope() datastore.DataScope {
80
+	return datastore.GlobalScope
81
+}
82
+
73 83
 func (h *Handle) watchForChanges() error {
74 84
 	h.Lock()
75 85
 	store := h.store
... ...
@@ -236,21 +236,22 @@ func (cli *NetworkCli) CmdServiceLs(chain string, args ...string) error {
236 236
 	wr := tabwriter.NewWriter(cli.out, 20, 1, 3, ' ', 0)
237 237
 	// unless quiet (-q) is specified, print field titles
238 238
 	if !*quiet {
239
-		fmt.Fprintln(wr, "SERVICE ID\tNAME\tNETWORK\tCONTAINER")
239
+		fmt.Fprintln(wr, "SERVICE ID\tNAME\tNETWORK\tCONTAINER\tSANDBOX")
240 240
 	}
241 241
 
242 242
 	for _, sr := range serviceResources {
243 243
 		ID := sr.ID
244
-		bkID, err := getBackendID(cli, ID)
244
+		bkID, sbID, err := getBackendID(cli, ID)
245 245
 		if err != nil {
246 246
 			return err
247 247
 		}
248 248
 		if !*noTrunc {
249 249
 			ID = stringid.TruncateID(ID)
250 250
 			bkID = stringid.TruncateID(bkID)
251
+			sbID = stringid.TruncateID(sbID)
251 252
 		}
252 253
 		if !*quiet {
253
-			fmt.Fprintf(wr, "%s\t%s\t%s\t%s\n", ID, sr.Name, sr.Network, bkID)
254
+			fmt.Fprintf(wr, "%s\t%s\t%s\t%s\t%s\n", ID, sr.Name, sr.Network, bkID, sbID)
254 255
 		} else {
255 256
 			fmt.Fprintln(wr, ID)
256 257
 		}
... ...
@@ -260,24 +261,26 @@ func (cli *NetworkCli) CmdServiceLs(chain string, args ...string) error {
260 260
 	return nil
261 261
 }
262 262
 
263
-func getBackendID(cli *NetworkCli, servID string) (string, error) {
263
+func getBackendID(cli *NetworkCli, servID string) (string, string, error) {
264 264
 	var (
265 265
 		obj []byte
266 266
 		err error
267 267
 		bk  string
268
+		sb  string
268 269
 	)
269 270
 
270 271
 	if obj, _, err = readBody(cli.call("GET", "/services/"+servID+"/backend", nil, nil)); err == nil {
271 272
 		var sr SandboxResource
272 273
 		if err := json.NewDecoder(bytes.NewReader(obj)).Decode(&sr); err == nil {
273 274
 			bk = sr.ContainerID
275
+			sb = sr.ID
274 276
 		} else {
275 277
 			// Only print a message, don't make the caller cli fail for this
276 278
 			fmt.Fprintf(cli.out, "Failed to retrieve backend list for service %s (%v)\n", servID, err)
277 279
 		}
278 280
 	}
279 281
 
280
-	return bk, err
282
+	return bk, sb, err
281 283
 }
282 284
 
283 285
 // CmdServiceInfo handles service info UI
... ...
@@ -5,14 +5,15 @@ import (
5 5
 
6 6
 	"github.com/BurntSushi/toml"
7 7
 	log "github.com/Sirupsen/logrus"
8
+	"github.com/docker/libkv/store"
8 9
 	"github.com/docker/libnetwork/netlabel"
9 10
 )
10 11
 
11 12
 // Config encapsulates configurations of various Libnetwork components
12 13
 type Config struct {
13
-	Daemon    DaemonCfg
14
-	Cluster   ClusterCfg
15
-	Datastore DatastoreCfg
14
+	Daemon                  DaemonCfg
15
+	Cluster                 ClusterCfg
16
+	GlobalStore, LocalStore DatastoreCfg
16 17
 }
17 18
 
18 19
 // DaemonCfg represents libnetwork core configuration
... ...
@@ -21,6 +22,7 @@ type DaemonCfg struct {
21 21
 	DefaultNetwork string
22 22
 	DefaultDriver  string
23 23
 	Labels         []string
24
+	DriverCfg      map[string]interface{}
24 25
 }
25 26
 
26 27
 // ClusterCfg represents cluster configuration
... ...
@@ -40,6 +42,7 @@ type DatastoreCfg struct {
40 40
 type DatastoreClientCfg struct {
41 41
 	Provider string
42 42
 	Address  string
43
+	Config   *store.Config
43 44
 }
44 45
 
45 46
 // ParseConfig parses the libnetwork configuration file
... ...
@@ -71,6 +74,13 @@ func OptionDefaultDriver(dd string) Option {
71 71
 	}
72 72
 }
73 73
 
74
+// OptionDriverConfig returns an option setter for driver configuration.
75
+func OptionDriverConfig(networkType string, config map[string]interface{}) Option {
76
+	return func(c *Config) {
77
+		c.Daemon.DriverCfg[networkType] = config
78
+	}
79
+}
80
+
74 81
 // OptionLabels function returns an option setter for labels
75 82
 func OptionLabels(labels []string) Option {
76 83
 	return func(c *Config) {
... ...
@@ -86,7 +96,7 @@ func OptionLabels(labels []string) Option {
86 86
 func OptionKVProvider(provider string) Option {
87 87
 	return func(c *Config) {
88 88
 		log.Infof("Option OptionKVProvider: %s", provider)
89
-		c.Datastore.Client.Provider = strings.TrimSpace(provider)
89
+		c.GlobalStore.Client.Provider = strings.TrimSpace(provider)
90 90
 	}
91 91
 }
92 92
 
... ...
@@ -94,7 +104,7 @@ func OptionKVProvider(provider string) Option {
94 94
 func OptionKVProviderURL(url string) Option {
95 95
 	return func(c *Config) {
96 96
 		log.Infof("Option OptionKVProviderURL: %s", url)
97
-		c.Datastore.Client.Address = strings.TrimSpace(url)
97
+		c.GlobalStore.Client.Address = strings.TrimSpace(url)
98 98
 	}
99 99
 }
100 100
 
... ...
@@ -114,3 +124,27 @@ func IsValidName(name string) bool {
114 114
 	}
115 115
 	return true
116 116
 }
117
+
118
+// OptionLocalKVProvider function returns an option setter for kvstore provider
119
+func OptionLocalKVProvider(provider string) Option {
120
+	return func(c *Config) {
121
+		log.Infof("Option OptionLocalKVProvider: %s", provider)
122
+		c.LocalStore.Client.Provider = strings.TrimSpace(provider)
123
+	}
124
+}
125
+
126
+// OptionLocalKVProviderURL function returns an option setter for kvstore url
127
+func OptionLocalKVProviderURL(url string) Option {
128
+	return func(c *Config) {
129
+		log.Infof("Option OptionLocalKVProviderURL: %s", url)
130
+		c.LocalStore.Client.Address = strings.TrimSpace(url)
131
+	}
132
+}
133
+
134
+// OptionLocalKVProviderConfig function returns an option setter for kvstore config
135
+func OptionLocalKVProviderConfig(config *store.Config) Option {
136
+	return func(c *Config) {
137
+		log.Infof("Option OptionLocalKVProviderConfig: %v", config)
138
+		c.LocalStore.Client.Config = config
139
+	}
140
+}
... ...
@@ -2,16 +2,13 @@
2 2
 Package libnetwork provides the basic functionality and extension points to
3 3
 create network namespaces and allocate interfaces for containers to use.
4 4
 
5
-	// Create a new controller instance
6
-	controller, _err := libnetwork.New(nil)
7
-
8
-	// Select and configure the network driver
9 5
 	networkType := "bridge"
10 6
 
7
+	// Create a new controller instance
11 8
 	driverOptions := options.Generic{}
12 9
 	genericOption := make(map[string]interface{})
13 10
 	genericOption[netlabel.GenericData] = driverOptions
14
-	err := controller.ConfigureNetworkDriver(networkType, genericOption)
11
+	controller, err := libnetwork.New(config.OptionDriverConfig(networkType, genericOption))
15 12
 	if err != nil {
16 13
 		return
17 14
 	}
... ...
@@ -32,11 +29,14 @@ create network namespaces and allocate interfaces for containers to use.
32 32
 		return
33 33
 	}
34 34
 
35
-	// A container can join the endpoint by providing the container ID to the join api.
36
-	// Join accepts Variadic arguments which will be made use of by libnetwork and Drivers
37
-	err = ep.Join("container1",
38
-		libnetwork.JoinOptionHostname("test"),
39
-		libnetwork.JoinOptionDomainname("docker.io"))
35
+	// Create the sandbox for the container.
36
+	// NewSandbox accepts Variadic optional arguments which libnetwork can use.
37
+	sbx, err := controller.NewSandbox("container1",
38
+		libnetwork.OptionHostname("test"),
39
+		libnetwork.OptionDomainname("docker.io"))
40
+
41
+	// A sandbox can join the endpoint via the join api.
42
+	err = ep.Join(sbx)
40 43
 	if err != nil {
41 44
 		return
42 45
 	}
... ...
@@ -47,7 +47,6 @@ import (
47 47
 	"container/heap"
48 48
 	"fmt"
49 49
 	"net"
50
-	"strings"
51 50
 	"sync"
52 51
 
53 52
 	log "github.com/Sirupsen/logrus"
... ...
@@ -57,7 +56,6 @@ import (
57 57
 	"github.com/docker/libnetwork/datastore"
58 58
 	"github.com/docker/libnetwork/driverapi"
59 59
 	"github.com/docker/libnetwork/hostdiscovery"
60
-	"github.com/docker/libnetwork/netlabel"
61 60
 	"github.com/docker/libnetwork/osl"
62 61
 	"github.com/docker/libnetwork/types"
63 62
 )
... ...
@@ -68,9 +66,6 @@ type NetworkController interface {
68 68
 	// ID provides an unique identity for the controller
69 69
 	ID() string
70 70
 
71
-	// ConfigureNetworkDriver applies the passed options to the driver instance for the specified network type
72
-	ConfigureNetworkDriver(networkType string, options map[string]interface{}) error
73
-
74 71
 	// Config method returns the bootup configuration for the controller
75 72
 	Config() config.Config
76 73
 
... ...
@@ -125,13 +120,13 @@ type endpointTable map[string]*endpoint
125 125
 type sandboxTable map[string]*sandbox
126 126
 
127 127
 type controller struct {
128
-	id             string
129
-	networks       networkTable
130
-	drivers        driverTable
131
-	sandboxes      sandboxTable
132
-	cfg            *config.Config
133
-	store          datastore.DataStore
134
-	extKeyListener net.Listener
128
+	id                      string
129
+	networks                networkTable
130
+	drivers                 driverTable
131
+	sandboxes               sandboxTable
132
+	cfg                     *config.Config
133
+	globalStore, localStore datastore.DataStore
134
+	extKeyListener          net.Listener
135 135
 	sync.Mutex
136 136
 }
137 137
 
... ...
@@ -139,7 +134,11 @@ type controller struct {
139 139
 func New(cfgOptions ...config.Option) (NetworkController, error) {
140 140
 	var cfg *config.Config
141 141
 	if len(cfgOptions) > 0 {
142
-		cfg = &config.Config{}
142
+		cfg = &config.Config{
143
+			Daemon: config.DaemonCfg{
144
+				DriverCfg: make(map[string]interface{}),
145
+			},
146
+		}
143 147
 		cfg.ProcessOptions(cfgOptions...)
144 148
 	}
145 149
 	c := &controller{
... ...
@@ -153,7 +152,7 @@ func New(cfgOptions ...config.Option) (NetworkController, error) {
153 153
 	}
154 154
 
155 155
 	if cfg != nil {
156
-		if err := c.initDataStore(); err != nil {
156
+		if err := c.initGlobalStore(); err != nil {
157 157
 			// Failing to initalize datastore is a bad situation to be in.
158 158
 			// But it cannot fail creating the Controller
159 159
 			log.Debugf("Failed to Initialize Datastore due to %v. Operating in non-clustered mode", err)
... ...
@@ -163,6 +162,9 @@ func New(cfgOptions ...config.Option) (NetworkController, error) {
163 163
 			// But it cannot fail creating the Controller
164 164
 			log.Debugf("Failed to Initialize Discovery : %v", err)
165 165
 		}
166
+		if err := c.initLocalStore(); err != nil {
167
+			log.Debugf("Failed to Initialize LocalDatastore due to %v.", err)
168
+		}
166 169
 	}
167 170
 
168 171
 	if err := c.startExternalKeyListener(); err != nil {
... ...
@@ -207,16 +209,6 @@ func (c *controller) Config() config.Config {
207 207
 	return *c.cfg
208 208
 }
209 209
 
210
-func (c *controller) ConfigureNetworkDriver(networkType string, options map[string]interface{}) error {
211
-	c.Lock()
212
-	dd, ok := c.drivers[networkType]
213
-	c.Unlock()
214
-	if !ok {
215
-		return NetworkTypeError(networkType)
216
-	}
217
-	return dd.driver.Config(options)
218
-}
219
-
220 210
 func (c *controller) RegisterDriver(networkType string, driver driverapi.Driver, capability driverapi.Capability) error {
221 211
 	if !config.IsValidName(networkType) {
222 212
 		return ErrInvalidName(networkType)
... ...
@@ -228,32 +220,8 @@ func (c *controller) RegisterDriver(networkType string, driver driverapi.Driver,
228 228
 		return driverapi.ErrActiveRegistration(networkType)
229 229
 	}
230 230
 	c.drivers[networkType] = &driverData{driver, capability}
231
-
232
-	if c.cfg == nil {
233
-		c.Unlock()
234
-		return nil
235
-	}
236
-
237
-	opt := make(map[string]interface{})
238
-	for _, label := range c.cfg.Daemon.Labels {
239
-		if strings.HasPrefix(label, netlabel.DriverPrefix+"."+networkType) {
240
-			opt[netlabel.Key(label)] = netlabel.Value(label)
241
-		}
242
-	}
243
-
244
-	if capability.Scope == driverapi.GlobalScope && c.validateDatastoreConfig() {
245
-		opt[netlabel.KVProvider] = c.cfg.Datastore.Client.Provider
246
-		opt[netlabel.KVProviderURL] = c.cfg.Datastore.Client.Address
247
-	}
248
-
249 231
 	c.Unlock()
250 232
 
251
-	if len(opt) != 0 {
252
-		if err := driver.Config(opt); err != nil {
253
-			return err
254
-		}
255
-	}
256
-
257 233
 	return nil
258 234
 }
259 235
 
... ...
@@ -280,6 +248,7 @@ func (c *controller) NewNetwork(networkType, name string, options ...NetworkOpti
280 280
 		id:          stringid.GenerateRandomID(),
281 281
 		ctrlr:       c,
282 282
 		endpoints:   endpointTable{},
283
+		persist:     true,
283 284
 	}
284 285
 
285 286
 	network.processOptions(options...)
... ...
@@ -288,7 +257,7 @@ func (c *controller) NewNetwork(networkType, name string, options ...NetworkOpti
288 288
 		return nil, err
289 289
 	}
290 290
 
291
-	if err := c.updateNetworkToStore(network); err != nil {
291
+	if err := c.updateToStore(network); err != nil {
292 292
 		log.Warnf("couldnt create network %s: %v", network.name, err)
293 293
 		if e := network.Delete(); e != nil {
294 294
 			log.Warnf("couldnt cleanup network %s: %v", network.name, err)
... ...
@@ -317,6 +286,7 @@ func (c *controller) addNetwork(n *network) error {
317 317
 	n.Lock()
318 318
 	n.svcRecords = svcMap{}
319 319
 	n.driver = dd.driver
320
+	n.dataScope = dd.capability.DataScope
320 321
 	d := n.driver
321 322
 	n.Unlock()
322 323
 
... ...
@@ -324,8 +294,10 @@ func (c *controller) addNetwork(n *network) error {
324 324
 	if err := d.CreateNetwork(n.id, n.generic); err != nil {
325 325
 		return err
326 326
 	}
327
-	if err := n.watchEndpoints(); err != nil {
328
-		return err
327
+	if n.isGlobalScoped() {
328
+		if err := n.watchEndpoints(); err != nil {
329
+			return err
330
+		}
329 331
 	}
330 332
 	c.Lock()
331 333
 	c.networks[n.id] = n
... ...
@@ -515,20 +487,10 @@ func (c *controller) loadDriver(networkType string) (*driverData, error) {
515 515
 	return dd, nil
516 516
 }
517 517
 
518
-func (c *controller) isDriverGlobalScoped(networkType string) (bool, error) {
519
-	c.Lock()
520
-	dd, ok := c.drivers[networkType]
521
-	c.Unlock()
522
-	if !ok {
523
-		return false, types.NotFoundErrorf("driver not found for %s", networkType)
524
-	}
525
-	if dd.capability.Scope == driverapi.GlobalScope {
526
-		return true, nil
527
-	}
528
-	return false, nil
529
-}
530
-
531 518
 func (c *controller) Stop() {
519
+	if c.localStore != nil {
520
+		c.localStore.KVStore().Close()
521
+	}
532 522
 	c.stopExternalKeyListener()
533 523
 	osl.GC()
534 524
 }
... ...
@@ -6,6 +6,7 @@ import (
6 6
 
7 7
 	"github.com/docker/libkv"
8 8
 	"github.com/docker/libkv/store"
9
+	"github.com/docker/libkv/store/boltdb"
9 10
 	"github.com/docker/libkv/store/consul"
10 11
 	"github.com/docker/libkv/store/etcd"
11 12
 	"github.com/docker/libkv/store/zookeeper"
... ...
@@ -58,8 +59,22 @@ type KV interface {
58 58
 	// True if the object exists in the datastore, false if it hasn't been stored yet.
59 59
 	// When SetIndex() is called, the object has been stored.
60 60
 	Exists() bool
61
+	// DataScope indicates the storage scope of the KV object
62
+	DataScope() DataScope
63
+	// Skip provides a way for a KV Object to avoid persisting it in the KV Store
64
+	Skip() bool
61 65
 }
62 66
 
67
+// DataScope indicates the storage scope
68
+type DataScope int
69
+
70
+const (
71
+	// LocalScope indicates to store the KV object in local datastore such as boltdb
72
+	LocalScope DataScope = iota
73
+	// GlobalScope indicates to store the KV object in global datastore such as consul/etcd/zookeeper
74
+	GlobalScope
75
+)
76
+
63 77
 const (
64 78
 	// NetworkKeyPrefix is the prefix for network key in the kv store
65 79
 	NetworkKeyPrefix = "network"
... ...
@@ -73,6 +88,7 @@ func init() {
73 73
 	consul.Register()
74 74
 	zookeeper.Register()
75 75
 	etcd.Register()
76
+	boltdb.Register()
76 77
 }
77 78
 
78 79
 //Key provides convenient method to create a Key
... ...
@@ -94,8 +110,11 @@ func ParseKey(key string) ([]string, error) {
94 94
 }
95 95
 
96 96
 // newClient used to connect to KV Store
97
-func newClient(kv string, addrs string) (DataStore, error) {
98
-	store, err := libkv.NewStore(store.Backend(kv), []string{addrs}, &store.Config{})
97
+func newClient(kv string, addrs string, config *store.Config) (DataStore, error) {
98
+	if config == nil {
99
+		config = &store.Config{}
100
+	}
101
+	store, err := libkv.NewStore(store.Backend(kv), []string{addrs}, config)
99 102
 	if err != nil {
100 103
 		return nil, err
101 104
 	}
... ...
@@ -109,7 +128,7 @@ func NewDataStore(cfg *config.DatastoreCfg) (DataStore, error) {
109 109
 		return nil, types.BadRequestErrorf("invalid configuration passed to datastore")
110 110
 	}
111 111
 	// TODO : cfg.Embedded case
112
-	return newClient(cfg.Client.Provider, cfg.Client.Address)
112
+	return newClient(cfg.Client.Provider, cfg.Client.Address, cfg.Client.Config)
113 113
 }
114 114
 
115 115
 // NewCustomDataStore can be used by clients to plugin cusom datatore that adhers to store.Store
116 116
new file mode 100644
... ...
@@ -0,0 +1,159 @@
0
+package libnetwork
1
+
2
+import (
3
+	"fmt"
4
+
5
+	"github.com/docker/libnetwork/netlabel"
6
+	"github.com/docker/libnetwork/options"
7
+	"github.com/docker/libnetwork/types"
8
+)
9
+
10
+const (
11
+	libnGWNetwork = "docker_gwbridge"
12
+	gwEPlen       = 12
13
+)
14
+
15
+/*
16
+   libnetwork creates a bridge network "docker_gwbridge" for providing
17
+   default gateway for the containers if none of the container's endpoints
18
+   have GW set by the driver. ICC is set to false for the GW_bridge network.
19
+
20
+   If a driver can't provide external connectivity it can choose to not set
21
+   the GW IP for the endpoint.
22
+
23
+   endpoint on the GW_bridge network is managed dynamically by libnetwork.
24
+   ie:
25
+   - it's created when an endpoint without GW joins the container
26
+   - it's deleted when an endpoint with GW joins the container
27
+*/
28
+
29
+func (sb *sandbox) setupDefaultGW(srcEp *endpoint) error {
30
+	var createOptions []EndpointOption
31
+	c := srcEp.getNetwork().getController()
32
+
33
+	// check if the container already has a GW endpoint
34
+	if ep := sb.getEndpointInGWNetwork(); ep != nil {
35
+		return nil
36
+	}
37
+
38
+	n, err := c.NetworkByName(libnGWNetwork)
39
+	if err != nil {
40
+		if _, ok := err.(types.NotFoundError); !ok {
41
+			return err
42
+		}
43
+		n, err = c.createGWNetwork()
44
+		if err != nil {
45
+			return err
46
+		}
47
+	}
48
+
49
+	if opt, ok := srcEp.generic[netlabel.PortMap]; ok {
50
+		if pb, ok := opt.([]types.PortBinding); ok {
51
+			createOptions = append(createOptions, CreateOptionPortMapping(pb))
52
+		}
53
+	}
54
+
55
+	if opt, ok := srcEp.generic[netlabel.ExposedPorts]; ok {
56
+		if exp, ok := opt.([]types.TransportPort); ok {
57
+			createOptions = append(createOptions, CreateOptionExposedPorts(exp))
58
+		}
59
+	}
60
+
61
+	eplen := gwEPlen
62
+	if len(sb.containerID) < gwEPlen {
63
+		eplen = len(sb.containerID)
64
+	}
65
+
66
+	newEp, err := n.CreateEndpoint("gateway_"+sb.containerID[0:eplen], createOptions...)
67
+	if err != nil {
68
+		return fmt.Errorf("container %s: endpoint create on GW Network failed: %v", sb.containerID, err)
69
+	}
70
+	epLocal := newEp.(*endpoint)
71
+
72
+	if err := epLocal.sbJoin(sb); err != nil {
73
+		return fmt.Errorf("container %s: endpoint join on GW Network failed: %v", sb.containerID, err)
74
+	}
75
+	return nil
76
+}
77
+
78
+func (sb *sandbox) clearDefaultGW() error {
79
+	var ep *endpoint
80
+
81
+	if ep = sb.getEndpointInGWNetwork(); ep == nil {
82
+		return nil
83
+	}
84
+
85
+	if err := ep.sbLeave(sb); err != nil {
86
+		return fmt.Errorf("container %s: endpoint leaving GW Network failed: %v", sb.containerID, err)
87
+	}
88
+	if err := ep.Delete(); err != nil {
89
+		return fmt.Errorf("container %s: deleting endpoint on GW Network failed: %v", sb.containerID, err)
90
+	}
91
+	return nil
92
+}
93
+
94
+func (c *controller) createGWNetwork() (Network, error) {
95
+	netOption := options.Generic{
96
+		"BridgeName":         libnGWNetwork,
97
+		"EnableICC":          false,
98
+		"EnableIPMasquerade": true,
99
+	}
100
+
101
+	n, err := c.NewNetwork("bridge", libnGWNetwork,
102
+		NetworkOptionGeneric(options.Generic{
103
+			netlabel.GenericData: netOption,
104
+			netlabel.EnableIPv6:  false,
105
+		}))
106
+
107
+	if err != nil {
108
+		return nil, fmt.Errorf("error creating external connectivity network: %v", err)
109
+	}
110
+	return n, err
111
+}
112
+
113
+func (sb *sandbox) needDefaultGW() bool {
114
+	var needGW bool
115
+
116
+	for _, ep := range sb.getConnectedEndpoints() {
117
+		if ep.endpointInGWNetwork() {
118
+			continue
119
+		}
120
+		if ep.getNetwork().Type() == "null" || ep.getNetwork().Type() == "host" {
121
+			continue
122
+		}
123
+		// TODO v6 needs to be handled.
124
+		if len(ep.Gateway()) > 0 {
125
+			return false
126
+		}
127
+		needGW = true
128
+	}
129
+	return needGW
130
+}
131
+
132
+func (sb *sandbox) getEndpointInGWNetwork() *endpoint {
133
+	for _, ep := range sb.getConnectedEndpoints() {
134
+		if ep.getNetwork().name == libnGWNetwork {
135
+			return ep
136
+		}
137
+	}
138
+	return nil
139
+}
140
+
141
+func (ep *endpoint) endpointInGWNetwork() bool {
142
+	if ep.getNetwork().name == libnGWNetwork {
143
+		return true
144
+	}
145
+	return false
146
+}
147
+
148
+func (sb *sandbox) getEPwithoutGateway() *endpoint {
149
+	for _, ep := range sb.getConnectedEndpoints() {
150
+		if ep.getNetwork().Type() == "null" || ep.getNetwork().Type() == "host" {
151
+			continue
152
+		}
153
+		if len(ep.Gateway()) == 0 {
154
+			return ep
155
+		}
156
+	}
157
+	return nil
158
+}
... ...
@@ -1,15 +1,16 @@
1 1
 package driverapi
2 2
 
3
-import "net"
3
+import (
4
+	"net"
5
+
6
+	"github.com/docker/libnetwork/datastore"
7
+)
4 8
 
5 9
 // NetworkPluginEndpointType represents the Endpoint Type used by Plugin system
6 10
 const NetworkPluginEndpointType = "NetworkDriver"
7 11
 
8 12
 // Driver is an interface that every plugin driver needs to implement.
9 13
 type Driver interface {
10
-	// Push driver specific config to the driver
11
-	Config(options map[string]interface{}) error
12
-
13 14
 	// CreateNetwork invokes the driver method to create a network passing
14 15
 	// the network id and network specific config. The config mechanism will
15 16
 	// eventually be replaced with labels which are yet to be introduced.
... ...
@@ -101,17 +102,7 @@ type DriverCallback interface {
101 101
 	RegisterDriver(name string, driver Driver, capability Capability) error
102 102
 }
103 103
 
104
-// Scope indicates the drivers scope capability
105
-type Scope int
106
-
107
-const (
108
-	// LocalScope represents the driver capable of providing networking services for containers in a single host
109
-	LocalScope Scope = iota
110
-	// GlobalScope represents the driver capable of providing networking services for containers across hosts
111
-	GlobalScope
112
-)
113
-
114 104
 // Capability represents the high level capabilities of the drivers which libnetwork can make use of
115 105
 type Capability struct {
116
-	Scope Scope
106
+	DataScope datastore.DataScope
117 107
 }
118 108
new file mode 100644
... ...
@@ -0,0 +1,55 @@
0
+package libnetwork
1
+
2
+import (
3
+	"strings"
4
+
5
+	"github.com/docker/libnetwork/driverapi"
6
+	"github.com/docker/libnetwork/netlabel"
7
+)
8
+
9
+type initializer struct {
10
+	fn    func(driverapi.DriverCallback, map[string]interface{}) error
11
+	ntype string
12
+}
13
+
14
+func initDrivers(c *controller) error {
15
+	for _, i := range getInitializers() {
16
+		if err := i.fn(c, makeDriverConfig(c, i.ntype)); err != nil {
17
+			return err
18
+		}
19
+	}
20
+
21
+	return nil
22
+}
23
+
24
+func makeDriverConfig(c *controller, ntype string) map[string]interface{} {
25
+	if c.cfg == nil {
26
+		return nil
27
+	}
28
+
29
+	config := make(map[string]interface{})
30
+
31
+	if c.validateGlobalStoreConfig() {
32
+		config[netlabel.KVProvider] = c.cfg.GlobalStore.Client.Provider
33
+		config[netlabel.KVProviderURL] = c.cfg.GlobalStore.Client.Address
34
+	}
35
+
36
+	for _, label := range c.cfg.Daemon.Labels {
37
+		if !strings.HasPrefix(netlabel.Key(label), netlabel.DriverPrefix+"."+ntype) {
38
+			continue
39
+		}
40
+
41
+		config[netlabel.Key(label)] = netlabel.Value(label)
42
+	}
43
+
44
+	drvCfg, ok := c.cfg.Daemon.DriverCfg[ntype]
45
+	if !ok {
46
+		return config
47
+	}
48
+
49
+	for k, v := range drvCfg.(map[string]interface{}) {
50
+		config[k] = v
51
+	}
52
+
53
+	return config
54
+}
... ...
@@ -14,6 +14,7 @@ import (
14 14
 	"syscall"
15 15
 
16 16
 	"github.com/Sirupsen/logrus"
17
+	"github.com/docker/libnetwork/datastore"
17 18
 	"github.com/docker/libnetwork/driverapi"
18 19
 	"github.com/docker/libnetwork/ipallocator"
19 20
 	"github.com/docker/libnetwork/iptables"
... ...
@@ -47,18 +48,18 @@ type configuration struct {
47 47
 
48 48
 // networkConfiguration for network specific configuration
49 49
 type networkConfiguration struct {
50
-	BridgeName            string
51
-	AddressIPv4           *net.IPNet
52
-	FixedCIDR             *net.IPNet
53
-	FixedCIDRv6           *net.IPNet
54
-	EnableIPv6            bool
55
-	EnableIPMasquerade    bool
56
-	EnableICC             bool
57
-	Mtu                   int
58
-	DefaultGatewayIPv4    net.IP
59
-	DefaultGatewayIPv6    net.IP
60
-	DefaultBindingIP      net.IP
61
-	AllowNonDefaultBridge bool
50
+	BridgeName         string
51
+	AddressIPv4        *net.IPNet
52
+	FixedCIDR          *net.IPNet
53
+	FixedCIDRv6        *net.IPNet
54
+	EnableIPv6         bool
55
+	EnableIPMasquerade bool
56
+	EnableICC          bool
57
+	Mtu                int
58
+	DefaultGatewayIPv4 net.IP
59
+	DefaultGatewayIPv6 net.IP
60
+	DefaultBindingIP   net.IP
61
+	DefaultBridge      bool
62 62
 }
63 63
 
64 64
 // endpointConfiguration represents the user specified configuration for the sandbox endpoint
... ...
@@ -97,7 +98,6 @@ type bridgeNetwork struct {
97 97
 
98 98
 type driver struct {
99 99
 	config      *configuration
100
-	configured  bool
101 100
 	network     *bridgeNetwork
102 101
 	natChain    *iptables.ChainInfo
103 102
 	filterChain *iptables.ChainInfo
... ...
@@ -106,13 +106,13 @@ type driver struct {
106 106
 }
107 107
 
108 108
 // newDriver constructs a new bridge driver
109
-func newDriver() driverapi.Driver {
109
+func newDriver() *driver {
110 110
 	ipAllocator = ipallocator.New()
111 111
 	return &driver{networks: map[string]*bridgeNetwork{}, config: &configuration{}}
112 112
 }
113 113
 
114 114
 // Init registers a new instance of bridge driver
115
-func Init(dc driverapi.DriverCallback) error {
115
+func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
116 116
 	if _, err := os.Stat("/proc/sys/net/bridge"); err != nil {
117 117
 		if out, err := exec.Command("modprobe", "-va", "bridge", "br_netfilter").CombinedOutput(); err != nil {
118 118
 			logrus.Warnf("Running modprobe bridge br_netfilter failed with message: %s, error: %v", out, err)
... ...
@@ -128,10 +128,15 @@ func Init(dc driverapi.DriverCallback) error {
128 128
 		logrus.Warnf("Failed to remove existing iptables entries in %s : %v", DockerChain, err)
129 129
 	}
130 130
 
131
+	d := newDriver()
132
+	if err := d.configure(config); err != nil {
133
+		return err
134
+	}
135
+
131 136
 	c := driverapi.Capability{
132
-		Scope: driverapi.LocalScope,
137
+		DataScope: datastore.LocalScope,
133 138
 	}
134
-	return dc.RegisterDriver(networkType, newDriver(), c)
139
+	return dc.RegisterDriver(networkType, d, c)
135 140
 }
136 141
 
137 142
 // Validate performs a static validation on the network configuration parameters.
... ...
@@ -244,13 +249,13 @@ func (c *networkConfiguration) fromMap(data map[string]interface{}) error {
244 244
 		}
245 245
 	}
246 246
 
247
-	if i, ok := data["AllowNonDefaultBridge"]; ok && i != nil {
247
+	if i, ok := data["DefaultBridge"]; ok && i != nil {
248 248
 		if s, ok := i.(string); ok {
249
-			if c.AllowNonDefaultBridge, err = strconv.ParseBool(s); err != nil {
250
-				return types.BadRequestErrorf("failed to parse AllowNonDefaultBridge value: %s", err.Error())
249
+			if c.DefaultBridge, err = strconv.ParseBool(s); err != nil {
250
+				return types.BadRequestErrorf("failed to parse DefaultBridge value: %s", err.Error())
251 251
 			}
252 252
 		} else {
253
-			return types.BadRequestErrorf("invalid type for AllowNonDefaultBridge value")
253
+			return types.BadRequestErrorf("invalid type for DefaultBridge value")
254 254
 		}
255 255
 	}
256 256
 
... ...
@@ -426,17 +431,13 @@ func (c *networkConfiguration) conflictsWithNetworks(id string, others []*bridge
426 426
 	return nil
427 427
 }
428 428
 
429
-func (d *driver) Config(option map[string]interface{}) error {
429
+func (d *driver) configure(option map[string]interface{}) error {
430 430
 	var config *configuration
431 431
 	var err error
432 432
 
433 433
 	d.Lock()
434 434
 	defer d.Unlock()
435 435
 
436
-	if d.configured {
437
-		return &ErrConfigExists{}
438
-	}
439
-
440 436
 	genericData, ok := option[netlabel.GenericData]
441 437
 	if !ok || genericData == nil {
442 438
 		return nil
... ...
@@ -469,7 +470,6 @@ func (d *driver) Config(option map[string]interface{}) error {
469 469
 		}
470 470
 	}
471 471
 
472
-	d.configured = true
473 472
 	d.config = config
474 473
 	return nil
475 474
 }
... ...
@@ -516,7 +516,7 @@ func parseNetworkGenericOptions(data interface{}) (*networkConfiguration, error)
516 516
 	return config, err
517 517
 }
518 518
 
519
-func parseNetworkOptions(option options.Generic) (*networkConfiguration, error) {
519
+func parseNetworkOptions(id string, option options.Generic) (*networkConfiguration, error) {
520 520
 	var err error
521 521
 	config := &networkConfiguration{}
522 522
 
... ...
@@ -537,6 +537,9 @@ func parseNetworkOptions(option options.Generic) (*networkConfiguration, error)
537 537
 		return nil, err
538 538
 	}
539 539
 
540
+	if config.BridgeName == "" && config.DefaultBridge == false {
541
+		config.BridgeName = "br-" + id[:12]
542
+	}
540 543
 	return config, nil
541 544
 }
542 545
 
... ...
@@ -567,20 +570,12 @@ func (d *driver) getNetworks() []*bridgeNetwork {
567 567
 
568 568
 // Create a new network using bridge plugin
569 569
 func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
570
-	var (
571
-		err          error
572
-		configLocked bool
573
-	)
570
+	var err error
574 571
 
575 572
 	defer osl.InitOSContext()()
576 573
 
577 574
 	// Sanity checks
578 575
 	d.Lock()
579
-	if !d.configured {
580
-		configLocked = true
581
-		d.configured = true
582
-	}
583
-
584 576
 	if _, ok := d.networks[id]; ok {
585 577
 		d.Unlock()
586 578
 		return types.ForbiddenErrorf("network %s exists", id)
... ...
@@ -588,7 +583,7 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
588 588
 	d.Unlock()
589 589
 
590 590
 	// Parse and validate the config. It should not conflict with existing networks' config
591
-	config, err := parseNetworkOptions(option)
591
+	config, err := parseNetworkOptions(id, option)
592 592
 	if err != nil {
593 593
 		return err
594 594
 	}
... ...
@@ -619,10 +614,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
619 619
 	defer func() {
620 620
 		if err != nil {
621 621
 			d.Lock()
622
-			if configLocked {
623
-				d.configured = false
624
-			}
625
-
626 622
 			delete(d.networks, id)
627 623
 			d.Unlock()
628 624
 		}
... ...
@@ -211,6 +211,17 @@ func (ndbee NonDefaultBridgeExistError) Error() string {
211 211
 // Forbidden denotes the type of this error
212 212
 func (ndbee NonDefaultBridgeExistError) Forbidden() {}
213 213
 
214
+// NonDefaultBridgeNeedsIPError is returned when a non-default
215
+// bridge config is passed but it has no ip configured
216
+type NonDefaultBridgeNeedsIPError string
217
+
218
+func (ndbee NonDefaultBridgeNeedsIPError) Error() string {
219
+	return fmt.Sprintf("bridge device with non default name %s must have a valid IP address", string(ndbee))
220
+}
221
+
222
+// Forbidden denotes the type of this error
223
+func (ndbee NonDefaultBridgeNeedsIPError) Forbidden() {}
224
+
214 225
 // FixedCIDRv4Error is returned when fixed-cidrv4 configuration
215 226
 // failed.
216 227
 type FixedCIDRv4Error struct {
... ...
@@ -15,7 +15,7 @@ func setupDevice(config *networkConfiguration, i *bridgeInterface) error {
15 15
 
16 16
 	// We only attempt to create the bridge when the requested device name is
17 17
 	// the default one.
18
-	if config.BridgeName != DefaultBridgeName && !config.AllowNonDefaultBridge {
18
+	if config.BridgeName != DefaultBridgeName && config.DefaultBridge {
19 19
 		return NonDefaultBridgeExistError(config.BridgeName)
20 20
 	}
21 21
 
... ...
@@ -53,8 +53,8 @@ func setupBridgeIPv4(config *networkConfiguration, i *bridgeInterface) error {
53 53
 
54 54
 	// Do not try to configure IPv4 on a non-default bridge unless you are
55 55
 	// specifically asked to do so.
56
-	if config.BridgeName != DefaultBridgeName && !config.AllowNonDefaultBridge {
57
-		return NonDefaultBridgeExistError(config.BridgeName)
56
+	if config.BridgeName != DefaultBridgeName && config.DefaultBridge {
57
+		return NonDefaultBridgeNeedsIPError(config.BridgeName)
58 58
 	}
59 59
 
60 60
 	bridgeIPv4, err := electBridgeIPv4(config)
... ...
@@ -3,6 +3,7 @@ package host
3 3
 import (
4 4
 	"sync"
5 5
 
6
+	"github.com/docker/libnetwork/datastore"
6 7
 	"github.com/docker/libnetwork/driverapi"
7 8
 	"github.com/docker/libnetwork/types"
8 9
 )
... ...
@@ -15,17 +16,13 @@ type driver struct {
15 15
 }
16 16
 
17 17
 // Init registers a new instance of host driver
18
-func Init(dc driverapi.DriverCallback) error {
18
+func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
19 19
 	c := driverapi.Capability{
20
-		Scope: driverapi.LocalScope,
20
+		DataScope: datastore.LocalScope,
21 21
 	}
22 22
 	return dc.RegisterDriver(networkType, &driver{}, c)
23 23
 }
24 24
 
25
-func (d *driver) Config(option map[string]interface{}) error {
26
-	return nil
27
-}
28
-
29 25
 func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
30 26
 	d.Lock()
31 27
 	defer d.Unlock()
... ...
@@ -3,6 +3,7 @@ package null
3 3
 import (
4 4
 	"sync"
5 5
 
6
+	"github.com/docker/libnetwork/datastore"
6 7
 	"github.com/docker/libnetwork/driverapi"
7 8
 	"github.com/docker/libnetwork/types"
8 9
 )
... ...
@@ -15,17 +16,13 @@ type driver struct {
15 15
 }
16 16
 
17 17
 // Init registers a new instance of null driver
18
-func Init(dc driverapi.DriverCallback) error {
18
+func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
19 19
 	c := driverapi.Capability{
20
-		Scope: driverapi.LocalScope,
20
+		DataScope: datastore.LocalScope,
21 21
 	}
22 22
 	return dc.RegisterDriver(networkType, &driver{}, c)
23 23
 }
24 24
 
25
-func (d *driver) Config(option map[string]interface{}) error {
26
-	return nil
27
-}
28
-
29 25
 func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
30 26
 	d.Lock()
31 27
 	defer d.Unlock()
... ...
@@ -72,11 +72,6 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
72 72
 		}
73 73
 	}
74 74
 
75
-	err = jinfo.SetGateway(bridgeIP.IP)
76
-	if err != nil {
77
-		return err
78
-	}
79
-
80 75
 	d.peerDbAdd(nid, eid, ep.addr.IP, ep.mac,
81 76
 		d.serfInstance.LocalMember().Addr, true)
82 77
 	d.notifyCh <- ovNotify{
... ...
@@ -40,6 +40,10 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
40 40
 		return fmt.Errorf("invalid network id")
41 41
 	}
42 42
 
43
+	if err := d.configure(); err != nil {
44
+		return err
45
+	}
46
+
43 47
 	n := &network{
44 48
 		id:        id,
45 49
 		driver:    d,
... ...
@@ -297,6 +301,10 @@ func (n *network) Exists() bool {
297 297
 	return n.dbExists
298 298
 }
299 299
 
300
+func (n *network) Skip() bool {
301
+	return false
302
+}
303
+
300 304
 func (n *network) SetValue(value []byte) error {
301 305
 	var vni uint32
302 306
 	err := json.Unmarshal(value, &vni)
... ...
@@ -306,6 +314,10 @@ func (n *network) SetValue(value []byte) error {
306 306
 	return err
307 307
 }
308 308
 
309
+func (n *network) DataScope() datastore.DataScope {
310
+	return datastore.GlobalScope
311
+}
312
+
309 313
 func (n *network) writeToStore() error {
310 314
 	return n.driver.store.PutObjectAtomic(n)
311 315
 }
... ...
@@ -6,6 +6,7 @@ import (
6 6
 	"net"
7 7
 	"sync"
8 8
 
9
+	"github.com/docker/libkv/store"
9 10
 	"github.com/docker/libnetwork/config"
10 11
 	"github.com/docker/libnetwork/datastore"
11 12
 	"github.com/docker/libnetwork/driverapi"
... ...
@@ -30,6 +31,7 @@ type driver struct {
30 30
 	exitCh       chan chan struct{}
31 31
 	ifaceName    string
32 32
 	neighIP      string
33
+	config       map[string]interface{}
33 34
 	peerDb       peerNetworkMap
34 35
 	serfInstance *serf.Serf
35 36
 	networks     networkTable
... ...
@@ -67,19 +69,22 @@ func onceInit() {
67 67
 }
68 68
 
69 69
 // Init registers a new instance of overlay driver
70
-func Init(dc driverapi.DriverCallback) error {
70
+func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
71 71
 	once.Do(onceInit)
72 72
 
73 73
 	c := driverapi.Capability{
74
-		Scope: driverapi.GlobalScope,
74
+		DataScope: datastore.GlobalScope,
75 75
 	}
76 76
 
77
-	return dc.RegisterDriver(networkType, &driver{
77
+	d := &driver{
78 78
 		networks: networkTable{},
79 79
 		peerDb: peerNetworkMap{
80 80
 			mp: map[string]peerMap{},
81 81
 		},
82
-	}, c)
82
+		config: config,
83
+	}
84
+
85
+	return dc.RegisterDriver(networkType, d, c)
83 86
 }
84 87
 
85 88
 // Fini cleans up the driver resources
... ...
@@ -95,23 +100,27 @@ func Fini(drv driverapi.Driver) {
95 95
 	}
96 96
 }
97 97
 
98
-func (d *driver) Config(option map[string]interface{}) error {
98
+func (d *driver) configure() error {
99 99
 	var onceDone bool
100 100
 	var err error
101 101
 
102
+	if len(d.config) == 0 {
103
+		return nil
104
+	}
105
+
102 106
 	d.Do(func() {
103 107
 		onceDone = true
104 108
 
105
-		if ifaceName, ok := option[netlabel.OverlayBindInterface]; ok {
109
+		if ifaceName, ok := d.config[netlabel.OverlayBindInterface]; ok {
106 110
 			d.ifaceName = ifaceName.(string)
107 111
 		}
108 112
 
109
-		if neighIP, ok := option[netlabel.OverlayNeighborIP]; ok {
113
+		if neighIP, ok := d.config[netlabel.OverlayNeighborIP]; ok {
110 114
 			d.neighIP = neighIP.(string)
111 115
 		}
112 116
 
113
-		provider, provOk := option[netlabel.KVProvider]
114
-		provURL, urlOk := option[netlabel.KVProviderURL]
117
+		provider, provOk := d.config[netlabel.KVProvider]
118
+		provURL, urlOk := d.config[netlabel.KVProviderURL]
115 119
 
116 120
 		if provOk && urlOk {
117 121
 			cfg := &config.DatastoreCfg{
... ...
@@ -120,6 +129,10 @@ func (d *driver) Config(option map[string]interface{}) error {
120 120
 					Address:  provURL.(string),
121 121
 				},
122 122
 			}
123
+			provConfig, confOk := d.config[netlabel.KVProviderConfig]
124
+			if confOk {
125
+				cfg.Client.Config = provConfig.(*store.Config)
126
+			}
123 127
 			d.store, err = datastore.NewDataStore(cfg)
124 128
 			if err != nil {
125 129
 				err = fmt.Errorf("failed to initialize data store: %v", err)
... ...
@@ -146,10 +159,6 @@ func (d *driver) Config(option map[string]interface{}) error {
146 146
 
147 147
 	})
148 148
 
149
-	if !onceDone {
150
-		return fmt.Errorf("config already applied to driver")
151
-	}
152
-
153 149
 	return err
154 150
 }
155 151
 
... ...
@@ -6,6 +6,7 @@ import (
6 6
 
7 7
 	log "github.com/Sirupsen/logrus"
8 8
 	"github.com/docker/docker/pkg/plugins"
9
+	"github.com/docker/libnetwork/datastore"
9 10
 	"github.com/docker/libnetwork/driverapi"
10 11
 	"github.com/docker/libnetwork/drivers/remote/api"
11 12
 	"github.com/docker/libnetwork/types"
... ...
@@ -26,7 +27,7 @@ func newDriver(name string, client *plugins.Client) driverapi.Driver {
26 26
 
27 27
 // Init makes sure a remote driver is registered when a network driver
28 28
 // plugin is activated.
29
-func Init(dc driverapi.DriverCallback) error {
29
+func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
30 30
 	plugins.Handle(driverapi.NetworkPluginEndpointType, func(name string, client *plugins.Client) {
31 31
 		// negotiate driver capability with client
32 32
 		d := newDriver(name, client)
... ...
@@ -52,9 +53,9 @@ func (d *driver) getCapabilities() (*driverapi.Capability, error) {
52 52
 	c := &driverapi.Capability{}
53 53
 	switch capResp.Scope {
54 54
 	case "global":
55
-		c.Scope = driverapi.GlobalScope
55
+		c.DataScope = datastore.GlobalScope
56 56
 	case "local":
57
-		c.Scope = driverapi.LocalScope
57
+		c.DataScope = datastore.LocalScope
58 58
 	default:
59 59
 		return nil, fmt.Errorf("invalid capability: expecting 'local' or 'global', got %s", capResp.Scope)
60 60
 	}
... ...
@@ -1,6 +1,9 @@
1 1
 package windows
2 2
 
3
-import "github.com/docker/libnetwork/driverapi"
3
+import (
4
+	"github.com/docker/libnetwork/datastore"
5
+	"github.com/docker/libnetwork/driverapi"
6
+)
4 7
 
5 8
 const networkType = "windows"
6 9
 
... ...
@@ -9,17 +12,13 @@ const networkType = "windows"
9 9
 type driver struct{}
10 10
 
11 11
 // Init registers a new instance of windows driver
12
-func Init(dc driverapi.DriverCallback) error {
12
+func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
13 13
 	c := driverapi.Capability{
14
-		Scope: driverapi.LocalScope,
14
+		DataScope: datastore.LocalScope,
15 15
 	}
16 16
 	return dc.RegisterDriver(networkType, &driver{}, c)
17 17
 }
18 18
 
19
-func (d *driver) Config(option map[string]interface{}) error {
20
-	return nil
21
-}
22
-
23 19
 func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
24 20
 	return nil
25 21
 }
... ...
@@ -1,19 +1,13 @@
1 1
 package libnetwork
2 2
 
3 3
 import (
4
-	"github.com/docker/libnetwork/driverapi"
5 4
 	"github.com/docker/libnetwork/drivers/null"
6 5
 	"github.com/docker/libnetwork/drivers/remote"
7 6
 )
8 7
 
9
-func initDrivers(dc driverapi.DriverCallback) error {
10
-	for _, fn := range [](func(driverapi.DriverCallback) error){
11
-		null.Init,
12
-		remote.Init,
13
-	} {
14
-		if err := fn(dc); err != nil {
15
-			return err
16
-		}
8
+func getInitializers() []initializer {
9
+	return []initializer{
10
+		{null.Init, "null"},
11
+		{remote.Init, "remote"},
17 12
 	}
18
-	return nil
19 13
 }
... ...
@@ -1,25 +1,19 @@
1 1
 package libnetwork
2 2
 
3 3
 import (
4
-	"github.com/docker/libnetwork/driverapi"
5 4
 	"github.com/docker/libnetwork/drivers/bridge"
6 5
 	"github.com/docker/libnetwork/drivers/host"
7 6
 	"github.com/docker/libnetwork/drivers/null"
8
-	o "github.com/docker/libnetwork/drivers/overlay"
7
+	"github.com/docker/libnetwork/drivers/overlay"
9 8
 	"github.com/docker/libnetwork/drivers/remote"
10 9
 )
11 10
 
12
-func initDrivers(dc driverapi.DriverCallback) error {
13
-	for _, fn := range [](func(driverapi.DriverCallback) error){
14
-		bridge.Init,
15
-		host.Init,
16
-		null.Init,
17
-		remote.Init,
18
-		o.Init,
19
-	} {
20
-		if err := fn(dc); err != nil {
21
-			return err
22
-		}
11
+func getInitializers() []initializer {
12
+	return []initializer{
13
+		{bridge.Init, "bridge"},
14
+		{host.Init, "host"},
15
+		{null.Init, "null"},
16
+		{remote.Init, "remote"},
17
+		{overlay.Init, "overlay"},
23 18
 	}
24
-	return nil
25 19
 }
... ...
@@ -1,17 +1,9 @@
1 1
 package libnetwork
2 2
 
3
-import (
4
-	"github.com/docker/libnetwork/driverapi"
5
-	"github.com/docker/libnetwork/drivers/windows"
6
-)
3
+import "github.com/docker/libnetwork/drivers/windows"
7 4
 
8
-func initDrivers(dc driverapi.DriverCallback) error {
9
-	for _, fn := range [](func(driverapi.DriverCallback) error){
10
-		windows.Init,
11
-	} {
12
-		if err := fn(dc); err != nil {
13
-			return err
14
-		}
5
+func getInitializers() []initializer {
6
+	return []initializer{
7
+		{windows.Init, "windows"},
15 8
 	}
16
-	return nil
17 9
 }
... ...
@@ -5,6 +5,7 @@ import (
5 5
 	"encoding/json"
6 6
 	"fmt"
7 7
 	"net"
8
+	"strings"
8 9
 	"sync"
9 10
 
10 11
 	log "github.com/Sirupsen/logrus"
... ...
@@ -130,15 +131,15 @@ func (ep *endpoint) KeyPrefix() []string {
130 130
 	return []string{datastore.EndpointKeyPrefix, ep.getNetwork().id}
131 131
 }
132 132
 
133
-func (ep *endpoint) networkIDFromKey(key []string) (string, error) {
134
-	// endpoint Key structure : endpoint/network-id/endpoint-id
135
-	// it's an invalid key if the key doesn't have all the 3 key elements above
136
-	if key == nil || len(key) < 3 || key[0] != datastore.EndpointKeyPrefix {
133
+func (ep *endpoint) networkIDFromKey(key string) (string, error) {
134
+	// endpoint Key structure : docker/libnetwork/endpoint/${network-id}/${endpoint-id}
135
+	// it's an invalid key if the key doesn't have all the 5 key elements above
136
+	keyElements := strings.Split(key, "/")
137
+	if !strings.HasPrefix(key, datastore.Key(datastore.EndpointKeyPrefix)) || len(keyElements) < 5 {
137 138
 		return "", fmt.Errorf("invalid endpoint key : %v", key)
138 139
 	}
139
-
140
-	// network-id is placed at index=1. pls refer to endpoint.Key() method
141
-	return key[1], nil
140
+	// network-id is placed at index=3. pls refer to endpoint.Key() method
141
+	return strings.Split(key, "/")[3], nil
142 142
 }
143 143
 
144 144
 func (ep *endpoint) Value() []byte {
... ...
@@ -172,6 +173,10 @@ func (ep *endpoint) Exists() bool {
172 172
 	return ep.dbExists
173 173
 }
174 174
 
175
+func (ep *endpoint) Skip() bool {
176
+	return ep.getNetwork().Skip()
177
+}
178
+
175 179
 func (ep *endpoint) processOptions(options ...EndpointOption) {
176 180
 	ep.Lock()
177 181
 	defer ep.Unlock()
... ...
@@ -183,53 +188,30 @@ func (ep *endpoint) processOptions(options ...EndpointOption) {
183 183
 	}
184 184
 }
185 185
 
186
-// joinLeaveStart waits to ensure there are no joins or leaves in progress and
187
-// marks this join/leave in progress without race
188
-func (ep *endpoint) joinLeaveStart() {
189
-	ep.Lock()
190
-	defer ep.Unlock()
191
-
192
-	for ep.joinLeaveDone != nil {
193
-		joinLeaveDone := ep.joinLeaveDone
194
-		ep.Unlock()
195
-
196
-		select {
197
-		case <-joinLeaveDone:
198
-		}
186
+func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
199 187
 
200
-		ep.Lock()
188
+	if sbox == nil {
189
+		return types.BadRequestErrorf("endpoint cannot be joined by nil container")
201 190
 	}
202 191
 
203
-	ep.joinLeaveDone = make(chan struct{})
204
-}
192
+	sb, ok := sbox.(*sandbox)
193
+	if !ok {
194
+		return types.BadRequestErrorf("not a valid Sandbox interface")
195
+	}
205 196
 
206
-// joinLeaveEnd marks the end of this join/leave operation and
207
-// signals the same without race to other join and leave waiters
208
-func (ep *endpoint) joinLeaveEnd() {
209
-	ep.Lock()
210
-	defer ep.Unlock()
197
+	sb.joinLeaveStart()
198
+	defer sb.joinLeaveEnd()
211 199
 
212
-	if ep.joinLeaveDone != nil {
213
-		close(ep.joinLeaveDone)
214
-		ep.joinLeaveDone = nil
215
-	}
200
+	return ep.sbJoin(sbox, options...)
216 201
 }
217 202
 
218
-func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
203
+func (ep *endpoint) sbJoin(sbox Sandbox, options ...EndpointOption) error {
219 204
 	var err error
220
-
221
-	if sbox == nil {
222
-		return types.BadRequestErrorf("endpoint cannot be joined by nil container")
223
-	}
224
-
225 205
 	sb, ok := sbox.(*sandbox)
226 206
 	if !ok {
227 207
 		return types.BadRequestErrorf("not a valid Sandbox interface")
228 208
 	}
229 209
 
230
-	ep.joinLeaveStart()
231
-	defer ep.joinLeaveEnd()
232
-
233 210
 	ep.Lock()
234 211
 	if ep.sandboxID != "" {
235 212
 		ep.Unlock()
... ...
@@ -281,8 +263,10 @@ func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
281 281
 		return err
282 282
 	}
283 283
 
284
-	if err = network.ctrlr.updateEndpointToStore(ep); err != nil {
285
-		return err
284
+	if !ep.isLocalScoped() {
285
+		if err = network.ctrlr.updateToStore(ep); err != nil {
286
+			return err
287
+		}
286 288
 	}
287 289
 
288 290
 	sb.Lock()
... ...
@@ -304,7 +288,11 @@ func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
304 304
 	if err = sb.populateNetworkResources(ep); err != nil {
305 305
 		return err
306 306
 	}
307
-	return nil
307
+
308
+	if sb.needDefaultGW() {
309
+		return sb.setupDefaultGW(ep)
310
+	}
311
+	return sb.clearDefaultGW()
308 312
 }
309 313
 
310 314
 func (ep *endpoint) hasInterface(iName string) bool {
... ...
@@ -315,9 +303,6 @@ func (ep *endpoint) hasInterface(iName string) bool {
315 315
 }
316 316
 
317 317
 func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
318
-	ep.joinLeaveStart()
319
-	defer ep.joinLeaveEnd()
320
-
321 318
 	if sbox == nil || sbox.ID() == "" || sbox.Key() == "" {
322 319
 		return types.BadRequestErrorf("invalid Sandbox passed to enpoint leave: %v", sbox)
323 320
 	}
... ...
@@ -327,6 +312,18 @@ func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
327 327
 		return types.BadRequestErrorf("not a valid Sandbox interface")
328 328
 	}
329 329
 
330
+	sb.joinLeaveStart()
331
+	defer sb.joinLeaveEnd()
332
+
333
+	return ep.sbLeave(sbox, options...)
334
+}
335
+
336
+func (ep *endpoint) sbLeave(sbox Sandbox, options ...EndpointOption) error {
337
+	sb, ok := sbox.(*sandbox)
338
+	if !ok {
339
+		return types.BadRequestErrorf("not a valid Sandbox interface")
340
+	}
341
+
330 342
 	ep.Lock()
331 343
 	sid := ep.sandboxID
332 344
 	ep.Unlock()
... ...
@@ -350,18 +347,31 @@ func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
350 350
 	d := n.driver
351 351
 	n.Unlock()
352 352
 
353
-	if err := c.updateEndpointToStore(ep); err != nil {
354
-		ep.Lock()
355
-		ep.sandboxID = sid
356
-		ep.Unlock()
357
-		return err
353
+	if !ep.isLocalScoped() {
354
+		if err := c.updateToStore(ep); err != nil {
355
+			ep.Lock()
356
+			ep.sandboxID = sid
357
+			ep.Unlock()
358
+			return err
359
+		}
358 360
 	}
359 361
 
360 362
 	if err := d.Leave(n.id, ep.id); err != nil {
361 363
 		return err
362 364
 	}
363 365
 
364
-	return sb.clearNetworkResources(ep)
366
+	if err := sb.clearNetworkResources(ep); err != nil {
367
+		return err
368
+	}
369
+
370
+	if sb.needDefaultGW() {
371
+		ep := sb.getEPwithoutGateway()
372
+		if ep == nil {
373
+			return fmt.Errorf("endpoint without GW expected, but not found")
374
+		}
375
+		return sb.setupDefaultGW(ep)
376
+	}
377
+	return sb.clearDefaultGW()
365 378
 }
366 379
 
367 380
 func (ep *endpoint) Delete() error {
... ...
@@ -379,27 +389,31 @@ func (ep *endpoint) Delete() error {
379 379
 	n.Unlock()
380 380
 	ep.Unlock()
381 381
 
382
-	if err = ctrlr.deleteEndpointFromStore(ep); err != nil {
383
-		return err
382
+	if !ep.isLocalScoped() {
383
+		if err = ctrlr.deleteFromStore(ep); err != nil {
384
+			return err
385
+		}
384 386
 	}
385 387
 	defer func() {
386 388
 		if err != nil {
387
-			ep.SetIndex(0)
388
-			if e := ctrlr.updateEndpointToStore(ep); e != nil {
389
-				log.Warnf("failed to recreate endpoint in store %s : %v", name, err)
389
+			ep.dbExists = false
390
+			if !ep.isLocalScoped() {
391
+				if e := ctrlr.updateToStore(ep); e != nil {
392
+					log.Warnf("failed to recreate endpoint in store %s : %v", name, e)
393
+				}
390 394
 			}
391 395
 		}
392 396
 	}()
393 397
 
394 398
 	// Update the endpoint count in network and update it in the datastore
395 399
 	n.DecEndpointCnt()
396
-	if err = ctrlr.updateNetworkToStore(n); err != nil {
400
+	if err = ctrlr.updateToStore(n); err != nil {
397 401
 		return err
398 402
 	}
399 403
 	defer func() {
400 404
 		if err != nil {
401 405
 			n.IncEndpointCnt()
402
-			if e := ctrlr.updateNetworkToStore(n); e != nil {
406
+			if e := ctrlr.updateToStore(n); e != nil {
403 407
 				log.Warnf("failed to update network %s : %v", n.name, e)
404 408
 			}
405 409
 		}
... ...
@@ -525,3 +539,13 @@ func JoinOptionPriority(ep Endpoint, prio int) EndpointOption {
525 525
 		sb.epPriority[ep.id] = prio
526 526
 	}
527 527
 }
528
+
529
+func (ep *endpoint) DataScope() datastore.DataScope {
530
+	ep.Lock()
531
+	defer ep.Unlock()
532
+	return ep.network.dataScope
533
+}
534
+
535
+func (ep *endpoint) isLocalScoped() bool {
536
+	return ep.DataScope() == datastore.LocalScope
537
+}
... ...
@@ -30,6 +30,9 @@ const (
30 30
 	// KVProviderURL constant represents the KV provider URL
31 31
 	KVProviderURL = DriverPrefix + ".kv_provider_url"
32 32
 
33
+	// KVProviderConfig constant represents the KV provider Config
34
+	KVProviderConfig = DriverPrefix + ".kv_provider_config"
35
+
33 36
 	// OverlayBindInterface constant represents overlay driver bind interface
34 37
 	OverlayBindInterface = DriverPrefix + ".overlay.bind_interface"
35 38
 
... ...
@@ -68,7 +68,9 @@ type network struct {
68 68
 	dbIndex     uint64
69 69
 	svcRecords  svcMap
70 70
 	dbExists    bool
71
+	persist     bool
71 72
 	stopWatchCh chan struct{}
73
+	dataScope   datastore.DataScope
72 74
 	sync.Mutex
73 75
 }
74 76
 
... ...
@@ -140,6 +142,18 @@ func (n *network) Exists() bool {
140 140
 	return n.dbExists
141 141
 }
142 142
 
143
+func (n *network) Skip() bool {
144
+	n.Lock()
145
+	defer n.Unlock()
146
+	return !n.persist
147
+}
148
+
149
+func (n *network) DataScope() datastore.DataScope {
150
+	n.Lock()
151
+	defer n.Unlock()
152
+	return n.dataScope
153
+}
154
+
143 155
 func (n *network) EndpointCnt() uint64 {
144 156
 	n.Lock()
145 157
 	defer n.Unlock()
... ...
@@ -167,6 +181,7 @@ func (n *network) MarshalJSON() ([]byte, error) {
167 167
 	netMap["endpointCnt"] = n.endpointCnt
168 168
 	netMap["enableIPv6"] = n.enableIPv6
169 169
 	netMap["generic"] = n.generic
170
+	netMap["persist"] = n.persist
170 171
 	return json.Marshal(netMap)
171 172
 }
172 173
 
... ...
@@ -184,6 +199,9 @@ func (n *network) UnmarshalJSON(b []byte) (err error) {
184 184
 	if netMap["generic"] != nil {
185 185
 		n.generic = netMap["generic"].(map[string]interface{})
186 186
 	}
187
+	if netMap["persist"] != nil {
188
+		n.persist = netMap["persist"].(bool)
189
+	}
187 190
 	return nil
188 191
 }
189 192
 
... ...
@@ -203,6 +221,13 @@ func NetworkOptionGeneric(generic map[string]interface{}) NetworkOption {
203 203
 	}
204 204
 }
205 205
 
206
+// NetworkOptionPersist returns an option setter to set persistence policy for a network
207
+func NetworkOptionPersist(persist bool) NetworkOption {
208
+	return func(n *network) {
209
+		n.persist = persist
210
+	}
211
+}
212
+
206 213
 func (n *network) processOptions(options ...NetworkOption) {
207 214
 	for _, opt := range options {
208 215
 		if opt != nil {
... ...
@@ -233,13 +258,22 @@ func (n *network) Delete() error {
233 233
 
234 234
 	// deleteNetworkFromStore performs an atomic delete operation and the network.endpointCnt field will help
235 235
 	// prevent any possible race between endpoint join and network delete
236
-	if err = ctrlr.deleteNetworkFromStore(n); err != nil {
236
+	if err = ctrlr.deleteFromStore(n); err != nil {
237 237
 		if err == datastore.ErrKeyModified {
238 238
 			return types.InternalErrorf("operation in progress. delete failed for network %s. Please try again.")
239 239
 		}
240 240
 		return err
241 241
 	}
242 242
 
243
+	defer func() {
244
+		if err != nil {
245
+			n.dbExists = false
246
+			if e := ctrlr.updateToStore(n); e != nil {
247
+				log.Warnf("failed to recreate network in store %s : %v", n.name, e)
248
+			}
249
+		}
250
+	}()
251
+
243 252
 	if err = n.deleteNetwork(); err != nil {
244 253
 		return err
245 254
 	}
... ...
@@ -315,13 +349,13 @@ func (n *network) CreateEndpoint(name string, options ...EndpointOption) (Endpoi
315 315
 	n.Unlock()
316 316
 
317 317
 	n.IncEndpointCnt()
318
-	if err = ctrlr.updateNetworkToStore(n); err != nil {
318
+	if err = ctrlr.updateToStore(n); err != nil {
319 319
 		return nil, err
320 320
 	}
321 321
 	defer func() {
322 322
 		if err != nil {
323 323
 			n.DecEndpointCnt()
324
-			if err = ctrlr.updateNetworkToStore(n); err != nil {
324
+			if err = ctrlr.updateToStore(n); err != nil {
325 325
 				log.Warnf("endpoint count cleanup failed when updating network for %s : %v", name, err)
326 326
 			}
327 327
 		}
... ...
@@ -337,8 +371,10 @@ func (n *network) CreateEndpoint(name string, options ...EndpointOption) (Endpoi
337 337
 		}
338 338
 	}()
339 339
 
340
-	if err = ctrlr.updateEndpointToStore(ep); err != nil {
341
-		return nil, err
340
+	if !ep.isLocalScoped() {
341
+		if err = ctrlr.updateToStore(ep); err != nil {
342
+			return nil, err
343
+		}
342 344
 	}
343 345
 
344 346
 	return ep, nil
... ...
@@ -398,11 +434,8 @@ func (n *network) EndpointByID(id string) (Endpoint, error) {
398 398
 	return nil, ErrNoSuchEndpoint(id)
399 399
 }
400 400
 
401
-func (n *network) isGlobalScoped() (bool, error) {
402
-	n.Lock()
403
-	c := n.ctrlr
404
-	n.Unlock()
405
-	return c.isDriverGlobalScoped(n.networkType)
401
+func (n *network) isGlobalScoped() bool {
402
+	return n.DataScope() == datastore.GlobalScope
406 403
 }
407 404
 
408 405
 func (n *network) updateSvcRecord(ep *endpoint, isAdd bool) {
... ...
@@ -54,16 +54,15 @@ func (sb *sandbox) processOptions(options ...SandboxOption) {
54 54
 type epHeap []*endpoint
55 55
 
56 56
 type sandbox struct {
57
-	id          string
58
-	containerID string
59
-	config      containerConfig
60
-	osSbox      osl.Sandbox
61
-	controller  *controller
62
-	refCnt      int
63
-	endpoints   epHeap
64
-	epPriority  map[string]int
65
-	//hostsPath      string
66
-	//resolvConfPath string
57
+	id            string
58
+	containerID   string
59
+	config        containerConfig
60
+	osSbox        osl.Sandbox
61
+	controller    *controller
62
+	refCnt        int
63
+	hostsOnce     sync.Once
64
+	endpoints     epHeap
65
+	epPriority    map[string]int
67 66
 	joinLeaveDone chan struct{}
68 67
 	sync.Mutex
69 68
 }
... ...
@@ -149,6 +148,11 @@ func (sb *sandbox) Delete() error {
149 149
 
150 150
 	// Detach from all endpoints
151 151
 	for _, ep := range sb.getConnectedEndpoints() {
152
+		// endpoint in the Gateway network will be cleaned up
153
+		// when when sandbox no longer needs external connectivity
154
+		if ep.endpointInGWNetwork() {
155
+			continue
156
+		}
152 157
 		if err := ep.Leave(sb); err != nil {
153 158
 			log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
154 159
 		}
... ...
@@ -342,15 +346,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
342 342
 		}
343 343
 	}
344 344
 
345
-	sb.Lock()
346
-	highEp := sb.endpoints[0]
347
-	sb.Unlock()
348
-	if ep == highEp {
349
-		if err := sb.updateGateway(ep); err != nil {
350
-			return err
345
+	for _, gwep := range sb.getConnectedEndpoints() {
346
+		if len(gwep.Gateway()) > 0 {
347
+			if gwep != ep {
348
+				return nil
349
+			}
350
+			if err := sb.updateGateway(gwep); err != nil {
351
+				return err
352
+			}
351 353
 		}
352 354
 	}
353
-
354 355
 	return nil
355 356
 }
356 357
 
... ...
@@ -389,26 +394,33 @@ func (sb *sandbox) clearNetworkResources(ep *endpoint) error {
389 389
 		return nil
390 390
 	}
391 391
 
392
-	highEpBefore := sb.endpoints[0]
393 392
 	var (
394
-		i int
395
-		e *endpoint
393
+		gwepBefore, gwepAfter *endpoint
394
+		index                 = -1
396 395
 	)
397
-	for i, e = range sb.endpoints {
396
+	for i, e := range sb.endpoints {
398 397
 		if e == ep {
398
+			index = i
399
+		}
400
+		if len(e.Gateway()) > 0 && gwepBefore == nil {
401
+			gwepBefore = e
402
+		}
403
+		if index != -1 && gwepBefore != nil {
399 404
 			break
400 405
 		}
401 406
 	}
402
-	heap.Remove(&sb.endpoints, i)
403
-	var highEpAfter *endpoint
404
-	if len(sb.endpoints) > 0 {
405
-		highEpAfter = sb.endpoints[0]
407
+	heap.Remove(&sb.endpoints, index)
408
+	for _, e := range sb.endpoints {
409
+		if len(e.Gateway()) > 0 {
410
+			gwepAfter = e
411
+			break
412
+		}
406 413
 	}
407 414
 	delete(sb.epPriority, ep.ID())
408 415
 	sb.Unlock()
409 416
 
410
-	if highEpBefore != highEpAfter {
411
-		sb.updateGateway(highEpAfter)
417
+	if gwepAfter != nil && gwepBefore != gwepAfter {
418
+		sb.updateGateway(gwepAfter)
412 419
 	}
413 420
 
414 421
 	return nil
... ...
@@ -447,22 +459,47 @@ func (sb *sandbox) buildHostsFile() error {
447 447
 }
448 448
 
449 449
 func (sb *sandbox) updateHostsFile(ifaceIP string, svcRecords []etchosts.Record) error {
450
+	var err error
451
+
450 452
 	if sb.config.originHostsPath != "" {
451 453
 		return nil
452 454
 	}
453 455
 
454
-	// Rebuild the hosts file accounting for the passed interface IP and service records
455
-	extraContent := make([]etchosts.Record, 0, len(sb.config.extraHosts)+len(svcRecords))
456
+	max := func(a, b int) int {
457
+		if a < b {
458
+			return b
459
+		}
456 460
 
457
-	for _, extraHost := range sb.config.extraHosts {
458
-		extraContent = append(extraContent, etchosts.Record{Hosts: extraHost.name, IP: extraHost.IP})
461
+		return a
462
+	}
463
+
464
+	extraContent := make([]etchosts.Record, 0,
465
+		max(len(sb.config.extraHosts), len(svcRecords)))
466
+
467
+	sb.hostsOnce.Do(func() {
468
+		// Rebuild the hosts file accounting for the passed
469
+		// interface IP and service records
470
+
471
+		for _, extraHost := range sb.config.extraHosts {
472
+			extraContent = append(extraContent,
473
+				etchosts.Record{Hosts: extraHost.name, IP: extraHost.IP})
474
+		}
475
+
476
+		err = etchosts.Build(sb.config.hostsPath, ifaceIP,
477
+			sb.config.hostName, sb.config.domainName, extraContent)
478
+	})
479
+
480
+	if err != nil {
481
+		return err
459 482
 	}
460 483
 
484
+	extraContent = extraContent[:0]
461 485
 	for _, svc := range svcRecords {
462 486
 		extraContent = append(extraContent, svc)
463 487
 	}
464 488
 
465
-	return etchosts.Build(sb.config.hostsPath, ifaceIP, sb.config.hostName, sb.config.domainName, extraContent)
489
+	sb.addHostsEntries(extraContent)
490
+	return nil
466 491
 }
467 492
 
468 493
 func (sb *sandbox) addHostsEntries(recs []etchosts.Record) {
... ...
@@ -629,6 +666,38 @@ func (sb *sandbox) updateDNS(ipv6Enabled bool) error {
629 629
 	return os.Rename(tmpResolvFile.Name(), sb.config.resolvConfPath)
630 630
 }
631 631
 
632
+// joinLeaveStart waits to ensure there are no joins or leaves in progress and
633
+// marks this join/leave in progress without race
634
+func (sb *sandbox) joinLeaveStart() {
635
+	sb.Lock()
636
+	defer sb.Unlock()
637
+
638
+	for sb.joinLeaveDone != nil {
639
+		joinLeaveDone := sb.joinLeaveDone
640
+		sb.Unlock()
641
+
642
+		select {
643
+		case <-joinLeaveDone:
644
+		}
645
+
646
+		sb.Lock()
647
+	}
648
+
649
+	sb.joinLeaveDone = make(chan struct{})
650
+}
651
+
652
+// joinLeaveEnd marks the end of this join/leave operation and
653
+// signals the same without race to other join and leave waiters
654
+func (sb *sandbox) joinLeaveEnd() {
655
+	sb.Lock()
656
+	defer sb.Unlock()
657
+
658
+	if sb.joinLeaveDone != nil {
659
+		close(sb.joinLeaveDone)
660
+		sb.joinLeaveDone = nil
661
+	}
662
+}
663
+
632 664
 // OptionHostname function returns an option setter for hostname option to
633 665
 // be passed to NewSandbox method.
634 666
 func OptionHostname(name string) SandboxOption {
... ...
@@ -748,6 +817,17 @@ func (eh epHeap) Less(i, j int) bool {
748 748
 	ci, _ := eh[i].getSandbox()
749 749
 	cj, _ := eh[j].getSandbox()
750 750
 
751
+	epi := eh[i]
752
+	epj := eh[j]
753
+
754
+	if epi.endpointInGWNetwork() {
755
+		return false
756
+	}
757
+
758
+	if epj.endpointInGWNetwork() {
759
+		return true
760
+	}
761
+
751 762
 	cip, ok := ci.epPriority[eh[i].ID()]
752 763
 	if !ok {
753 764
 		cip = 0
... ...
@@ -138,7 +138,7 @@ func (c *controller) acceptClientConnections(sock string, l net.Listener) {
138 138
 		conn, err := l.Accept()
139 139
 		if err != nil {
140 140
 			if _, err1 := os.Stat(sock); os.IsNotExist(err1) {
141
-				logrus.Warnf("Unix socket %s doesnt exist. cannot accept client connections", sock)
141
+				logrus.Debugf("Unix socket %s doesnt exist. cannot accept client connections", sock)
142 142
 				return
143 143
 			}
144 144
 			logrus.Errorf("Error accepting connection %v", err)
... ...
@@ -3,99 +3,98 @@ package libnetwork
3 3
 import (
4 4
 	"encoding/json"
5 5
 	"fmt"
6
+	"time"
6 7
 
7 8
 	log "github.com/Sirupsen/logrus"
8 9
 	"github.com/docker/libkv/store"
10
+	"github.com/docker/libnetwork/config"
9 11
 	"github.com/docker/libnetwork/datastore"
10 12
 )
11 13
 
12
-func (c *controller) validateDatastoreConfig() bool {
13
-	return c.cfg != nil && c.cfg.Datastore.Client.Provider != "" && c.cfg.Datastore.Client.Address != ""
14
+var (
15
+	defaultBoltTimeout      = 3 * time.Second
16
+	defaultLocalStoreConfig = config.DatastoreCfg{
17
+		Embedded: true,
18
+		Client: config.DatastoreClientCfg{
19
+			Provider: "boltdb",
20
+			Address:  defaultPrefix + "/boltdb.db",
21
+			Config: &store.Config{
22
+				Bucket:            "libnetwork",
23
+				ConnectionTimeout: defaultBoltTimeout,
24
+			},
25
+		},
26
+	}
27
+)
28
+
29
+func (c *controller) validateGlobalStoreConfig() bool {
30
+	return c.cfg != nil && c.cfg.GlobalStore.Client.Provider != "" && c.cfg.GlobalStore.Client.Address != ""
14 31
 }
15 32
 
16
-func (c *controller) initDataStore() error {
33
+func (c *controller) initGlobalStore() error {
17 34
 	c.Lock()
18 35
 	cfg := c.cfg
19 36
 	c.Unlock()
20
-	if !c.validateDatastoreConfig() {
21
-		return fmt.Errorf("datastore initialization requires a valid configuration")
37
+	if !c.validateGlobalStoreConfig() {
38
+		return fmt.Errorf("globalstore initialization requires a valid configuration")
22 39
 	}
23 40
 
24
-	store, err := datastore.NewDataStore(&cfg.Datastore)
41
+	store, err := datastore.NewDataStore(&cfg.GlobalStore)
25 42
 	if err != nil {
26 43
 		return err
27 44
 	}
28 45
 	c.Lock()
29
-	c.store = store
46
+	c.globalStore = store
30 47
 	c.Unlock()
31 48
 
32
-	nws, err := c.getNetworksFromStore()
49
+	nws, err := c.getNetworksFromStore(true)
33 50
 	if err == nil {
34 51
 		c.processNetworkUpdate(nws, nil)
35 52
 	} else if err != datastore.ErrKeyNotFound {
36
-		log.Warnf("failed to read networks from datastore during init : %v", err)
53
+		log.Warnf("failed to read networks from globalstore during init : %v", err)
37 54
 	}
38 55
 	return c.watchNetworks()
39 56
 }
40 57
 
41
-func (c *controller) getNetworksFromStore() ([]*store.KVPair, error) {
58
+func (c *controller) initLocalStore() error {
42 59
 	c.Lock()
43
-	cs := c.store
60
+	cfg := c.cfg
44 61
 	c.Unlock()
45
-	return cs.KVStore().List(datastore.Key(datastore.NetworkKeyPrefix))
46
-}
47
-
48
-func (c *controller) newNetworkFromStore(n *network) error {
49
-	n.Lock()
50
-	n.ctrlr = c
51
-	n.endpoints = endpointTable{}
52
-	n.Unlock()
53
-
54
-	return c.addNetwork(n)
55
-}
56
-
57
-func (c *controller) updateNetworkToStore(n *network) error {
58
-	global, err := n.isGlobalScoped()
59
-	if err != nil || !global {
62
+	localStore, err := datastore.NewDataStore(c.getLocalStoreConfig(cfg))
63
+	if err != nil {
60 64
 		return err
61 65
 	}
62 66
 	c.Lock()
63
-	cs := c.store
67
+	c.localStore = localStore
64 68
 	c.Unlock()
65
-	if cs == nil {
66
-		log.Debugf("datastore not initialized. Network %s is not added to the store", n.Name())
67
-		return nil
68
-	}
69 69
 
70
-	return cs.PutObjectAtomic(n)
70
+	nws, err := c.getNetworksFromStore(false)
71
+	if err == nil {
72
+		c.processNetworkUpdate(nws, nil)
73
+	} else if err != datastore.ErrKeyNotFound {
74
+		log.Warnf("failed to read networks from localstore during init : %v", err)
75
+	}
76
+	return nil
71 77
 }
72 78
 
73
-func (c *controller) deleteNetworkFromStore(n *network) error {
74
-	global, err := n.isGlobalScoped()
75
-	if err != nil || !global {
76
-		return err
77
-	}
79
+func (c *controller) getNetworksFromStore(global bool) ([]*store.KVPair, error) {
80
+	var cs datastore.DataStore
78 81
 	c.Lock()
79
-	cs := c.store
80
-	c.Unlock()
81
-	if cs == nil {
82
-		log.Debugf("datastore not initialized. Network %s is not deleted from datastore", n.Name())
83
-		return nil
82
+	if global {
83
+		cs = c.globalStore
84
+	} else {
85
+		cs = c.localStore
84 86
 	}
85
-
86
-	if err := cs.DeleteObjectAtomic(n); err != nil {
87
-		return err
88
-	}
89
-
90
-	return nil
87
+	c.Unlock()
88
+	return cs.KVStore().List(datastore.Key(datastore.NetworkKeyPrefix))
91 89
 }
92 90
 
93
-func (c *controller) getNetworkFromStore(nid string) (*network, error) {
94
-	n := network{id: nid}
95
-	if err := c.store.GetObject(datastore.Key(n.Key()...), &n); err != nil {
96
-		return nil, err
97
-	}
98
-	return &n, nil
91
+func (c *controller) newNetworkFromStore(n *network) error {
92
+	n.Lock()
93
+	n.ctrlr = c
94
+	n.endpoints = endpointTable{}
95
+	n.Unlock()
96
+
97
+	return c.addNetwork(n)
99 98
 }
100 99
 
101 100
 func (c *controller) newEndpointFromStore(key string, ep *endpoint) error {
... ...
@@ -113,52 +112,30 @@ func (c *controller) newEndpointFromStore(key string, ep *endpoint) error {
113 113
 	return err
114 114
 }
115 115
 
116
-func (c *controller) updateEndpointToStore(ep *endpoint) error {
117
-	ep.Lock()
118
-	n := ep.network
119
-	name := ep.name
120
-	ep.Unlock()
121
-	global, err := n.isGlobalScoped()
122
-	if err != nil || !global {
123
-		return err
116
+func (c *controller) updateToStore(kvObject datastore.KV) error {
117
+	if kvObject.Skip() {
118
+		return nil
124 119
 	}
125
-	c.Lock()
126
-	cs := c.store
127
-	c.Unlock()
120
+	cs := c.getDataStore(kvObject.DataScope())
128 121
 	if cs == nil {
129
-		log.Debugf("datastore not initialized. endpoint %s is not added to the store", name)
122
+		log.Debugf("datastore not initialized. kv object %s is not added to the store", datastore.Key(kvObject.Key()...))
130 123
 		return nil
131 124
 	}
132 125
 
133
-	return cs.PutObjectAtomic(ep)
134
-}
135
-
136
-func (c *controller) getEndpointFromStore(eid string) (*endpoint, error) {
137
-	ep := endpoint{id: eid}
138
-	if err := c.store.GetObject(datastore.Key(ep.Key()...), &ep); err != nil {
139
-		return nil, err
140
-	}
141
-	return &ep, nil
126
+	return cs.PutObjectAtomic(kvObject)
142 127
 }
143 128
 
144
-func (c *controller) deleteEndpointFromStore(ep *endpoint) error {
145
-	ep.Lock()
146
-	n := ep.network
147
-	ep.Unlock()
148
-	global, err := n.isGlobalScoped()
149
-	if err != nil || !global {
150
-		return err
129
+func (c *controller) deleteFromStore(kvObject datastore.KV) error {
130
+	if kvObject.Skip() {
131
+		return nil
151 132
 	}
152
-
153
-	c.Lock()
154
-	cs := c.store
155
-	c.Unlock()
133
+	cs := c.getDataStore(kvObject.DataScope())
156 134
 	if cs == nil {
157
-		log.Debugf("datastore not initialized. endpoint %s is not deleted from datastore", ep.Name())
135
+		log.Debugf("datastore not initialized. kv object %s is not deleted from datastore", datastore.Key(kvObject.Key()...))
158 136
 		return nil
159 137
 	}
160 138
 
161
-	if err := cs.DeleteObjectAtomic(ep); err != nil {
139
+	if err := cs.DeleteObjectAtomic(kvObject); err != nil {
162 140
 		return err
163 141
 	}
164 142
 
... ...
@@ -166,12 +143,12 @@ func (c *controller) deleteEndpointFromStore(ep *endpoint) error {
166 166
 }
167 167
 
168 168
 func (c *controller) watchNetworks() error {
169
-	if !c.validateDatastoreConfig() {
169
+	if !c.validateGlobalStoreConfig() {
170 170
 		return nil
171 171
 	}
172 172
 
173 173
 	c.Lock()
174
-	cs := c.store
174
+	cs := c.globalStore
175 175
 	c.Unlock()
176 176
 
177 177
 	networkKey := datastore.Key(datastore.NetworkKeyPrefix)
... ...
@@ -191,8 +168,7 @@ func (c *controller) watchNetworks() error {
191 191
 				lview := c.networks
192 192
 				c.Unlock()
193 193
 				for k, v := range lview {
194
-					global, _ := v.isGlobalScoped()
195
-					if global {
194
+					if v.isGlobalScoped() {
196 195
 						tmpview[k] = v
197 196
 					}
198 197
 				}
... ...
@@ -207,7 +183,7 @@ func (c *controller) watchNetworks() error {
207 207
 						continue
208 208
 					}
209 209
 					tmp := network{}
210
-					if err := c.store.GetObject(datastore.Key(existing.Key()...), &tmp); err != datastore.ErrKeyNotFound {
210
+					if err := c.globalStore.GetObject(datastore.Key(existing.Key()...), &tmp); err != datastore.ErrKeyNotFound {
211 211
 						continue
212 212
 					}
213 213
 					if err := existing.deleteNetwork(); err != nil {
... ...
@@ -221,12 +197,12 @@ func (c *controller) watchNetworks() error {
221 221
 }
222 222
 
223 223
 func (n *network) watchEndpoints() error {
224
-	if !n.ctrlr.validateDatastoreConfig() {
224
+	if n.Skip() || !n.ctrlr.validateGlobalStoreConfig() {
225 225
 		return nil
226 226
 	}
227 227
 
228 228
 	n.Lock()
229
-	cs := n.ctrlr.store
229
+	cs := n.ctrlr.globalStore
230 230
 	tmp := endpoint{network: n}
231 231
 	n.stopWatchCh = make(chan struct{})
232 232
 	stopCh := n.stopWatchCh
... ...
@@ -251,28 +227,11 @@ func (n *network) watchEndpoints() error {
251 251
 				lview := n.endpoints
252 252
 				n.Unlock()
253 253
 				for k, v := range lview {
254
-					global, _ := v.network.isGlobalScoped()
255
-					if global {
254
+					if v.network.isGlobalScoped() {
256 255
 						tmpview[k] = v
257 256
 					}
258 257
 				}
259
-				for _, epe := range eps {
260
-					var ep endpoint
261
-					err := json.Unmarshal(epe.Value, &ep)
262
-					if err != nil {
263
-						log.Error(err)
264
-						continue
265
-					}
266
-					delete(tmpview, ep.id)
267
-					ep.SetIndex(epe.LastIndex)
268
-					ep.network = n
269
-					if n.ctrlr.processEndpointUpdate(&ep) {
270
-						err = n.ctrlr.newEndpointFromStore(epe.Key, &ep)
271
-						if err != nil {
272
-							log.Error(err)
273
-						}
274
-					}
275
-				}
258
+				n.ctrlr.processEndpointsUpdate(eps, &tmpview)
276 259
 				// Delete processing
277 260
 				for k := range tmpview {
278 261
 					n.Lock()
... ...
@@ -381,3 +340,53 @@ func ensureKeys(key string, cs datastore.DataStore) error {
381 381
 	}
382 382
 	return cs.KVStore().Put(key, []byte{}, nil)
383 383
 }
384
+
385
+func (c *controller) getLocalStoreConfig(cfg *config.Config) *config.DatastoreCfg {
386
+	if cfg != nil && cfg.LocalStore.Client.Provider != "" && cfg.LocalStore.Client.Address != "" {
387
+		return &cfg.LocalStore
388
+	}
389
+	return &defaultLocalStoreConfig
390
+}
391
+
392
+func (c *controller) getDataStore(dataScope datastore.DataScope) (dataStore datastore.DataStore) {
393
+	c.Lock()
394
+	if dataScope == datastore.GlobalScope {
395
+		dataStore = c.globalStore
396
+	} else if dataScope == datastore.LocalScope {
397
+		dataStore = c.localStore
398
+	}
399
+	c.Unlock()
400
+	return
401
+}
402
+
403
+func (c *controller) processEndpointsUpdate(eps []*store.KVPair, prune *endpointTable) {
404
+	for _, epe := range eps {
405
+		var ep endpoint
406
+		err := json.Unmarshal(epe.Value, &ep)
407
+		if err != nil {
408
+			log.Error(err)
409
+			continue
410
+		}
411
+		if prune != nil {
412
+			delete(*prune, ep.id)
413
+		}
414
+		ep.SetIndex(epe.LastIndex)
415
+		if nid, err := ep.networkIDFromKey(epe.Key); err != nil {
416
+			log.Error(err)
417
+			continue
418
+		} else {
419
+			if n, err := c.NetworkByID(nid); err != nil {
420
+				log.Error(err)
421
+				continue
422
+			} else {
423
+				ep.network = n.(*network)
424
+			}
425
+		}
426
+		if c.processEndpointUpdate(&ep) {
427
+			err = c.newEndpointFromStore(epe.Key, &ep)
428
+			if err != nil {
429
+				log.Error(err)
430
+			}
431
+		}
432
+	}
433
+}