From 9331461a1397a56f2958036b4d0f2497c1dcdd13 Mon Sep 17 00:00:00 2001 From: Michael Eischer Date: Sat, 18 Jan 2025 22:15:35 +0100 Subject: [PATCH] prune: correctly account for duplicates in max-unused check The size comparison for `--max-unused` only accounted for unused but not for duplicate data. For repositories with a large amount of duplicates this can result in a situation where no data gets pruned even though the amount of unused data is much higher than specified. --- changelog/unreleased/pull-5212 | 10 ++++ internal/repository/prune.go | 3 +- internal/repository/prune_test.go | 79 +++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 changelog/unreleased/pull-5212 diff --git a/changelog/unreleased/pull-5212 b/changelog/unreleased/pull-5212 new file mode 100644 index 000000000..5acd74c00 --- /dev/null +++ b/changelog/unreleased/pull-5212 @@ -0,0 +1,10 @@ +Bugfix: Correctly account for duplicate data in `prune --max-unused` + +`prune --max-unused size` did not correctly account for duplicate data. +If a repository contained a large amount of duplicate data, this could +previously result in pruning too little data. + +This issue is now resolved. + +https://github.com/restic/restic/pull/5212 +https://forum.restic.net/t/restic-not-obeying-max-unused-parameter-on-prune/8879 diff --git a/internal/repository/prune.go b/internal/repository/prune.go index 3803b6f33..1f5832239 100644 --- a/internal/repository/prune.go +++ b/internal/repository/prune.go @@ -478,7 +478,8 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, maxUnusedSizeAfter := opts.MaxUnusedBytes(stats.Size.Used) for _, p := range repackCandidates { - reachedUnusedSizeAfter := (stats.Size.Unused-stats.Size.Remove-stats.Size.Repackrm < maxUnusedSizeAfter) + remainingUnusedSize := stats.Size.Duplicate + stats.Size.Unused - stats.Size.Remove - stats.Size.Repackrm + reachedUnusedSizeAfter := remainingUnusedSize < maxUnusedSizeAfter reachedRepackSize := stats.Size.Repack+p.unusedSize+p.usedSize >= opts.MaxRepackBytes packIsLargeEnough := p.unusedSize+p.usedSize >= uint64(targetPackSize) diff --git a/internal/repository/prune_test.go b/internal/repository/prune_test.go index cc569aa43..3234622f4 100644 --- a/internal/repository/prune_test.go +++ b/internal/repository/prune_test.go @@ -112,3 +112,82 @@ func TestPrune(t *testing.T) { }) } } + +// TestPruneMaxUnusedDuplicate checks that MaxUnused correctly accounts for duplicates. +// +// Create a repository containing blobs a to d that are stored in packs as follows: +// - a, d +// - b, d +// - c, d +// All blobs should be kept during prune, but the duplicates should be gone afterwards. +// The special construction ensures that each pack contains a used, non-duplicate blob. +// This ensures that special cases that delete completely duplicate packs files do not +// apply. +func TestPruneMaxUnusedDuplicate(t *testing.T) { + seed := time.Now().UnixNano() + random := rand.New(rand.NewSource(seed)) + t.Logf("rand initialized with seed %d", seed) + + repo, _, _ := repository.TestRepositoryWithVersion(t, 0) + // large blobs to prevent repacking due to too small packsize + const blobSize = 1024 * 1024 + + bufs := [][]byte{} + for i := 0; i < 4; i++ { + // use uniform length for simpler control via MaxUnusedBytes + buf := make([]byte, blobSize) + random.Read(buf) + bufs = append(bufs, buf) + } + keep := restic.NewBlobSet() + + for _, blobs := range [][][]byte{ + {bufs[0], bufs[3]}, + {bufs[1], bufs[3]}, + {bufs[2], bufs[3]}, + } { + var wg errgroup.Group + repo.StartPackUploader(context.TODO(), &wg) + + for _, blob := range blobs { + id, _, _, err := repo.SaveBlob(context.TODO(), restic.DataBlob, blob, restic.ID{}, true) + keep.Insert(restic.BlobHandle{Type: restic.DataBlob, ID: id}) + rtest.OK(t, err) + } + + rtest.OK(t, repo.Flush(context.Background())) + } + + opts := repository.PruneOptions{ + MaxRepackBytes: math.MaxUint64, + // non-zero number of unused bytes, that is nevertheless smaller than a single blob + // setting this to zero would bypass the unused/duplicate size accounting that should + // be tested here + MaxUnusedBytes: func(used uint64) (unused uint64) { return blobSize / 2 }, + } + + plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet) error { + for blob := range keep { + usedBlobs.Insert(blob) + } + return nil + }, &progress.NoopPrinter{}) + rtest.OK(t, err) + + rtest.OK(t, plan.Execute(context.TODO(), &progress.NoopPrinter{})) + + rsize := plan.Stats().Size + remainingUnusedSize := rsize.Duplicate + rsize.Unused - rsize.Remove - rsize.Repackrm + maxUnusedSize := opts.MaxUnusedBytes(rsize.Used) + rtest.Assert(t, remainingUnusedSize <= maxUnusedSize, "too much unused data remains got %v, expected less than %v", remainingUnusedSize, maxUnusedSize) + + // divide by blobSize to ignore pack file overhead + rtest.Equals(t, rsize.Used/blobSize, uint64(4)) + rtest.Equals(t, rsize.Duplicate/blobSize, uint64(2)) + rtest.Equals(t, rsize.Unused, uint64(0)) + rtest.Equals(t, rsize.Remove, uint64(0)) + rtest.Equals(t, rsize.Repack/blobSize, uint64(4)) + rtest.Equals(t, rsize.Repackrm/blobSize, uint64(2)) + rtest.Equals(t, rsize.Unref, uint64(0)) + rtest.Equals(t, rsize.Uncompressed, uint64(0)) +}