mirror of https://github.com/restic/restic.git
prune: correctly account for duplicates in max-unused check
The size comparison for `--max-unused` only accounted for unused but not for duplicate data. For repositories with a large amount of duplicates this can result in a situation where no data gets pruned even though the amount of unused data is much higher than specified.
This commit is contained in:
parent
ed3922ac82
commit
9331461a13
|
@ -0,0 +1,10 @@
Bugfix: Correctly account for duplicate data in `prune --max-unused`

`prune --max-unused size` did not correctly account for duplicate data.
If a repository contained a large amount of duplicate data, this could
previously result in pruning too little data.

This issue is now resolved.

https://github.com/restic/restic/pull/5212
https://forum.restic.net/t/restic-not-obeying-max-unused-parameter-on-prune/8879
|
@ -478,7 +478,8 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository,
|
||||||
maxUnusedSizeAfter := opts.MaxUnusedBytes(stats.Size.Used)
|
maxUnusedSizeAfter := opts.MaxUnusedBytes(stats.Size.Used)
|
||||||
|
|
||||||
for _, p := range repackCandidates {
|
for _, p := range repackCandidates {
|
||||||
reachedUnusedSizeAfter := (stats.Size.Unused-stats.Size.Remove-stats.Size.Repackrm < maxUnusedSizeAfter)
|
remainingUnusedSize := stats.Size.Duplicate + stats.Size.Unused - stats.Size.Remove - stats.Size.Repackrm
|
||||||
|
reachedUnusedSizeAfter := remainingUnusedSize < maxUnusedSizeAfter
|
||||||
reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes
|
reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes
|
||||||
packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize)
|
packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize)
|
||||||
|
|
||||||
|
|
|
@ -112,3 +112,82 @@ func TestPrune(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestPruneMaxUnusedDuplicate checks that MaxUnused correctly accounts for duplicates.
|
||||||
|
//
|
||||||
|
// Create a repository containing blobs a to d that are stored in packs as follows:
|
||||||
|
// - a, d
|
||||||
|
// - b, d
|
||||||
|
// - c, d
|
||||||
|
// All blobs should be kept during prune, but the duplicates should be gone afterwards.
|
||||||
|
// The special construction ensures that each pack contains a used, non-duplicate blob.
|
||||||
|
// This ensures that special cases that delete completely duplicate packs files do not
|
||||||
|
// apply.
|
||||||
|
func TestPruneMaxUnusedDuplicate(t *testing.T) {
|
||||||
|
seed := time.Now().UnixNano()
|
||||||
|
random := rand.New(rand.NewSource(seed))
|
||||||
|
t.Logf("rand initialized with seed %d", seed)
|
||||||
|
|
||||||
|
repo, _, _ := repository.TestRepositoryWithVersion(t, 0)
|
||||||
|
// large blobs to prevent repacking due to too small packsize
|
||||||
|
const blobSize = 1024 * 1024
|
||||||
|
|
||||||
|
bufs := [][]byte{}
|
||||||
|
for i := 0; i < 4; i++ {
|
||||||
|
// use uniform length for simpler control via MaxUnusedBytes
|
||||||
|
buf := make([]byte, blobSize)
|
||||||
|
random.Read(buf)
|
||||||
|
bufs = append(bufs, buf)
|
||||||
|
}
|
||||||
|
keep := restic.NewBlobSet()
|
||||||
|
|
||||||
|
for _, blobs := range [][][]byte{
|
||||||
|
{bufs[0], bufs[3]},
|
||||||
|
{bufs[1], bufs[3]},
|
||||||
|
{bufs[2], bufs[3]},
|
||||||
|
} {
|
||||||
|
var wg errgroup.Group
|
||||||
|
repo.StartPackUploader(context.TODO(), &wg)
|
||||||
|
|
||||||
|
for _, blob := range blobs {
|
||||||
|
id, _, _, err := repo.SaveBlob(context.TODO(), restic.DataBlob, blob, restic.ID{}, true)
|
||||||
|
keep.Insert(restic.BlobHandle{Type: restic.DataBlob, ID: id})
|
||||||
|
rtest.OK(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
rtest.OK(t, repo.Flush(context.Background()))
|
||||||
|
}
|
||||||
|
|
||||||
|
opts := repository.PruneOptions{
|
||||||
|
MaxRepackBytes: math.MaxUint64,
|
||||||
|
// non-zero number of unused bytes, that is nevertheless smaller than a single blob
|
||||||
|
// setting this to zero would bypass the unused/duplicate size accounting that should
|
||||||
|
// be tested here
|
||||||
|
MaxUnusedBytes: func(used uint64) (unused uint64) { return blobSize / 2 },
|
||||||
|
}
|
||||||
|
|
||||||
|
plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet) error {
|
||||||
|
for blob := range keep {
|
||||||
|
usedBlobs.Insert(blob)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}, &progress.NoopPrinter{})
|
||||||
|
rtest.OK(t, err)
|
||||||
|
|
||||||
|
rtest.OK(t, plan.Execute(context.TODO(), &progress.NoopPrinter{}))
|
||||||
|
|
||||||
|
rsize := plan.Stats().Size
|
||||||
|
remainingUnusedSize := rsize.Duplicate + rsize.Unused - rsize.Remove - rsize.Repackrm
|
||||||
|
maxUnusedSize := opts.MaxUnusedBytes(rsize.Used)
|
||||||
|
rtest.Assert(t, remainingUnusedSize <= maxUnusedSize, "too much unused data remains got %v, expected less than %v", remainingUnusedSize, maxUnusedSize)
|
||||||
|
|
||||||
|
// divide by blobSize to ignore pack file overhead
|
||||||
|
rtest.Equals(t, rsize.Used/blobSize, uint64(4))
|
||||||
|
rtest.Equals(t, rsize.Duplicate/blobSize, uint64(2))
|
||||||
|
rtest.Equals(t, rsize.Unused, uint64(0))
|
||||||
|
rtest.Equals(t, rsize.Remove, uint64(0))
|
||||||
|
rtest.Equals(t, rsize.Repack/blobSize, uint64(4))
|
||||||
|
rtest.Equals(t, rsize.Repackrm/blobSize, uint64(2))
|
||||||
|
rtest.Equals(t, rsize.Unref, uint64(0))
|
||||||
|
rtest.Equals(t, rsize.Uncompressed, uint64(0))
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue