Browse code

image: add identity cache backend walk and prune expired support

Signed-off-by: CrazyMax <1951866+crazy-max@users.noreply.github.com>

CrazyMax authored on 2026/02/28 01:26:01
Showing 3 changed files
... ...
@@ -18,6 +18,8 @@ type Entry struct {
18 18
 type Backend interface {
19 19
 	Load(ctx context.Context, cacheKey string, now time.Time) (Entry, bool, error)
20 20
 	Store(ctx context.Context, cacheKey string, entry Entry, now time.Time) error
21
+	Walk(ctx context.Context, now time.Time, fn func(cacheKey string, entry Entry) error) error
22
+	PruneExpired(ctx context.Context, now time.Time) error
21 23
 	Close() error
22 24
 }
23 25
 
... ...
@@ -36,6 +38,14 @@ func (nopBackend) Store(context.Context, string, Entry, time.Time) error {
36 36
 	return nil
37 37
 }
38 38
 
39
+func (nopBackend) Walk(context.Context, time.Time, func(string, Entry) error) error {
40
+	return nil
41
+}
42
+
43
+func (nopBackend) PruneExpired(context.Context, time.Time) error {
44
+	return nil
45
+}
46
+
39 47
 func (nopBackend) Close() error {
40 48
 	return nil
41 49
 }
... ...
@@ -2,34 +2,23 @@ package identitycache
2 2
 
3 3
 import (
4 4
 	"context"
5
-	"crypto/rand"
6 5
 	"encoding/json"
7
-	"math/big"
6
+	"fmt"
8 7
 	"os"
9 8
 	"path/filepath"
10 9
 	"sync"
11 10
 	"time"
12 11
 
13
-	boltdb "github.com/moby/buildkit/util/db"
14
-	"github.com/moby/buildkit/util/db/boltutil"
12
+	"github.com/containerd/log"
15 13
 	bolt "go.etcd.io/bbolt"
16 14
 )
17 15
 
18 16
 var bboltCacheBucket = []byte("image-identity-cache-v1")
19 17
 
20
-const (
21
-	// 15m matches the shortest cache TTL (imageIdentityErrorCacheTTL), so
22
-	// prune won't lag far behind shortest-lived entries.
23
-	pruneIntervalMin    = 15 * time.Minute
24
-	pruneIntervalSpread = 15 * time.Minute
25
-)
26
-
27 18
 type boltBackend struct {
28
-	db        boltdb.DB
19
+	db        *bolt.DB
29 20
 	closeOnce sync.Once
30 21
 	closeErr  error
31
-	stopPrune chan struct{}
32
-	pruneDone chan struct{}
33 22
 }
34 23
 
35 24
 // NewBoltDBBackend creates a bbolt-backed persistent cache backend.
... ...
@@ -41,7 +30,7 @@ func NewBoltDBBackend(root string) (Backend, error) {
41 41
 	if err := os.MkdirAll(cacheDir, 0o700); err != nil {
42 42
 		return nil, err
43 43
 	}
44
-	db, err := boltutil.SafeOpen(filepath.Join(cacheDir, "identity-cache.db"), 0o600, nil)
44
+	db, err := safeOpen(filepath.Join(cacheDir, "identity-cache.db"), 0o600, nil)
45 45
 	if err != nil {
46 46
 		return nil, err
47 47
 	}
... ...
@@ -53,7 +42,6 @@ func NewBoltDBBackend(root string) (Backend, error) {
53 53
 		_ = db.Close()
54 54
 		return nil, err
55 55
 	}
56
-	b.startPrune()
57 56
 	return b, nil
58 57
 }
59 58
 
... ...
@@ -85,9 +73,6 @@ func (b *boltBackend) Load(_ context.Context, cacheKey string, now time.Time) (E
85 85
 		return Entry{}, false, nil
86 86
 	}
87 87
 	if now.After(entry.ExpiresAt) {
88
-		if err := b.delete(cacheKey); err != nil {
89
-			return Entry{}, false, err
90
-		}
91 88
 		return Entry{}, false, nil
92 89
 	}
93 90
 	return entry, true, nil
... ...
@@ -107,17 +92,39 @@ func (b *boltBackend) Store(_ context.Context, cacheKey string, entry Entry, _ t
107 107
 	})
108 108
 }
109 109
 
110
+func (b *boltBackend) Walk(ctx context.Context, _ time.Time, fn func(cacheKey string, entry Entry) error) error {
111
+	return b.db.View(func(tx *bolt.Tx) error {
112
+		bucket := tx.Bucket(bboltCacheBucket)
113
+		if bucket == nil {
114
+			return nil
115
+		}
116
+		cursor := bucket.Cursor()
117
+		for key, value := cursor.First(); key != nil; key, value = cursor.Next() {
118
+			if err := ctx.Err(); err != nil {
119
+				return err
120
+			}
121
+
122
+			var entry Entry
123
+			if err := json.Unmarshal(value, &entry); err != nil {
124
+				continue
125
+			}
126
+			if err := fn(string(key), entry); err != nil {
127
+				return err
128
+			}
129
+		}
130
+		return nil
131
+	})
132
+}
133
+
134
+func (b *boltBackend) PruneExpired(_ context.Context, now time.Time) error {
135
+	return b.pruneExpiredEntries(now)
136
+}
137
+
110 138
 func (b *boltBackend) Close() error {
111 139
 	if b == nil || b.db == nil {
112 140
 		return nil
113 141
 	}
114 142
 	b.closeOnce.Do(func() {
115
-		if b.stopPrune != nil {
116
-			close(b.stopPrune)
117
-		}
118
-		if b.pruneDone != nil {
119
-			<-b.pruneDone
120
-		}
121 143
 		b.closeErr = b.db.Close()
122 144
 	})
123 145
 	return b.closeErr
... ...
@@ -133,33 +140,6 @@ func (b *boltBackend) delete(cacheKey string) error {
133 133
 	})
134 134
 }
135 135
 
136
-func (b *boltBackend) startPrune() {
137
-	b.stopPrune = make(chan struct{})
138
-	b.pruneDone = make(chan struct{})
139
-	go func() {
140
-		defer close(b.pruneDone)
141
-		timer := time.NewTimer(nextPruneDelay())
142
-		defer timer.Stop()
143
-		for {
144
-			select {
145
-			case <-b.stopPrune:
146
-				return
147
-			case <-timer.C:
148
-				_ = b.pruneExpiredEntries(time.Now().UTC())
149
-				timer.Reset(nextPruneDelay())
150
-			}
151
-		}
152
-	}()
153
-}
154
-
155
-func nextPruneDelay() time.Duration {
156
-	n, err := rand.Int(rand.Reader, big.NewInt(int64(pruneIntervalSpread)))
157
-	if err != nil {
158
-		return pruneIntervalMin
159
-	}
160
-	return pruneIntervalMin + time.Duration(n.Int64())
161
-}
162
-
163 136
 func (b *boltBackend) pruneExpiredEntries(now time.Time) error {
164 137
 	return b.db.Update(func(tx *bolt.Tx) error {
165 138
 		bucket := tx.Bucket(bboltCacheBucket)
... ...
@@ -178,3 +158,45 @@ func (b *boltBackend) pruneExpiredEntries(now time.Time) error {
178 178
 		return nil
179 179
 	})
180 180
 }
181
+
182
+// safeOpen opens a bolt database with automatic recovery from corruption.
183
+// If the database file is corrupted, it backs up the corrupted file and creates
184
+// a new empty database.
185
+func safeOpen(dbPath string, mode os.FileMode, opts *bolt.Options) (db *bolt.DB, err error) {
186
+	defer func() {
187
+		if r := recover(); r != nil {
188
+			err = fmt.Errorf("%v", r)
189
+		}
190
+		// If we fail opening the DB, but can read a non-empty file, try resetting it.
191
+		if err != nil && fileHasContent(dbPath) {
192
+			db, err = fallbackOpen(dbPath, mode, opts, err)
193
+		}
194
+	}()
195
+	return openDB(dbPath, mode, opts)
196
+}
197
+
198
+func openDB(dbPath string, mode os.FileMode, opts *bolt.Options) (*bolt.DB, error) {
199
+	bdb, err := bolt.Open(dbPath, mode, opts)
200
+	if err != nil {
201
+		return nil, err
202
+	}
203
+	return bdb, nil
204
+}
205
+
206
+func fallbackOpen(dbPath string, mode os.FileMode, opts *bolt.Options, openErr error) (*bolt.DB, error) {
207
+	backupPath := dbPath + "." + fmt.Sprintf("%d", time.Now().UnixNano()) + ".bak"
208
+	log.L.Errorf("failed to open moby image identity cache database %s, resetting to empty. "+
209
+		"Old database is backed up to %s. This usually means dockerd crashed or was terminated abruptly, leaving the cache DB corrupted. "+
210
+		"If this keeps happening, please report at https://github.com/moby/moby/issues. original error: %v",
211
+		dbPath, backupPath, openErr)
212
+	if err := os.Rename(dbPath, backupPath); err != nil {
213
+		return nil, fmt.Errorf("failed to rename database file %s to %s: %w", dbPath, backupPath, err)
214
+	}
215
+	// Second open should create a new database; failure here is permanent.
216
+	return openDB(dbPath, mode, opts)
217
+}
218
+
219
+func fileHasContent(dbPath string) bool {
220
+	st, err := os.Stat(dbPath)
221
+	return err == nil && st.Size() > 0
222
+}
181 223
new file mode 100644
... ...
@@ -0,0 +1,76 @@
0
+package identitycache
1
+
2
+import (
3
+	"sort"
4
+	"testing"
5
+	"time"
6
+
7
+	"gotest.tools/v3/assert"
8
+	is "gotest.tools/v3/assert/cmp"
9
+)
10
+
11
+func TestBoltBackendWalkIncludesExpiredEntriesUntilPrune(t *testing.T) {
12
+	ctx := t.Context()
13
+	now := time.Now().UTC()
14
+
15
+	backend, err := NewBoltDBBackend(t.TempDir())
16
+	assert.NilError(t, err)
17
+	defer backend.Close()
18
+
19
+	boltbk := backend.(*boltBackend)
20
+
21
+	assert.NilError(t, boltbk.Store(ctx, "fresh", Entry{
22
+		CachedAt:  now.Add(-time.Minute),
23
+		ExpiresAt: now.Add(time.Hour),
24
+	}, now))
25
+	assert.NilError(t, boltbk.Store(ctx, "expired", Entry{
26
+		CachedAt:  now.Add(-2 * time.Hour),
27
+		ExpiresAt: now.Add(-time.Minute),
28
+	}, now))
29
+
30
+	var keys []string
31
+	assert.NilError(t, boltbk.Walk(ctx, now, func(cacheKey string, _ Entry) error {
32
+		keys = append(keys, cacheKey)
33
+		return nil
34
+	}))
35
+	sort.Strings(keys)
36
+	assert.Check(t, is.DeepEqual(keys, []string{"expired", "fresh"}))
37
+
38
+	assert.NilError(t, boltbk.PruneExpired(ctx, now))
39
+
40
+	keys = nil
41
+	assert.NilError(t, boltbk.Walk(ctx, now, func(cacheKey string, _ Entry) error {
42
+		keys = append(keys, cacheKey)
43
+		return nil
44
+	}))
45
+	assert.Check(t, is.DeepEqual(keys, []string{"fresh"}))
46
+}
47
+
48
+func TestBoltBackendLoadExpiredReturnsMissWithoutDelete(t *testing.T) {
49
+	ctx := t.Context()
50
+	now := time.Now().UTC()
51
+
52
+	backend, err := NewBoltDBBackend(t.TempDir())
53
+	assert.NilError(t, err)
54
+	defer backend.Close()
55
+
56
+	boltbk := backend.(*boltBackend)
57
+
58
+	assert.NilError(t, boltbk.Store(ctx, "expired", Entry{
59
+		CachedAt:  now.Add(-2 * time.Hour),
60
+		ExpiresAt: now.Add(-time.Minute),
61
+	}, now))
62
+
63
+	_, ok, err := boltbk.Load(ctx, "expired", now)
64
+	assert.NilError(t, err)
65
+	assert.Check(t, !ok)
66
+
67
+	seen := false
68
+	assert.NilError(t, boltbk.Walk(ctx, now, func(cacheKey string, _ Entry) error {
69
+		if cacheKey == "expired" {
70
+			seen = true
71
+		}
72
+		return nil
73
+	}))
74
+	assert.Check(t, seen)
75
+}