fix: scope garbage collection to Docker-only blobs (#109)

collect_all_blobs scanned all seven registry prefixes for keys
containing /blobs/ or /tarballs/, but collect_referenced_digests only
reads Docker manifests. Non-Docker artifacts (notably npm tarballs)
were collected but never marked as referenced, so running gc without
--dry-run would delete them.

Restrict blob collection to docker/ until per-registry reference
resolution exists for other registry types.

// ticktockbent
This commit is contained in:
Wes
2026-04-06 13:28:21 -04:00
committed by GitHub
parent 98c9c33ea0
commit 9d3c248ea5

View File

@@ -1,10 +1,14 @@
//! Garbage Collection for orphaned blobs
//! Garbage Collection for orphaned Docker blobs
//!
//! Mark-and-sweep approach:
//! 1. List all blobs across registries
//! 2. Parse all manifests to find referenced blobs
//! 1. List all Docker blobs
//! 2. Parse Docker manifests to find referenced blobs
//! 3. Blobs not referenced by any manifest = orphans
//! 4. Delete orphans (with --dry-run support)
//!
//! Currently Docker-only. Other registries (npm, maven, cargo, pypi, go,
//! raw) are excluded because no reference resolver exists for their
//! metadata formats.
use std::collections::HashSet;
@@ -72,15 +76,15 @@ pub async fn run_gc(storage: &Storage, dry_run: bool) -> GcResult {
async fn collect_all_blobs(storage: &Storage) -> Vec<String> {
let mut blobs = Vec::new();
// Collect blobs from all registry types, not just Docker
for prefix in &[
"docker/", "maven/", "npm/", "cargo/", "pypi/", "raw/", "go/",
] {
let keys = storage.list(prefix).await;
for key in keys {
if key.contains("/blobs/") || key.contains("/tarballs/") {
blobs.push(key);
}
// Only collect Docker blobs. Other registries (npm, maven, cargo, pypi,
// go, raw) use storage key schemes that collect_referenced_digests does
// not understand, so their artifacts would appear as orphans and be
// deleted. Extending GC to non-Docker registries requires per-registry
// reference resolution.
let keys = storage.list("docker/").await;
for key in keys {
if key.contains("/blobs/") {
blobs.push(key);
}
}
blobs
@@ -304,18 +308,103 @@ mod tests {
}
#[tokio::test]
async fn test_gc_multi_registry_blobs() {
async fn test_gc_ignores_non_docker_registries() {
let dir = tempfile::tempdir().unwrap();
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
// npm tarball (not referenced by Docker manifests => orphan candidate)
// Non-Docker artifacts must not be collected by GC, because
// collect_referenced_digests only understands Docker manifests.
// Without this guard, these would all appear as orphans and be deleted.
storage
.put("npm/lodash/tarballs/lodash-4.17.21.tgz", b"tarball-data")
.await
.unwrap();
storage
.put("maven/com/example/lib/1.0/lib-1.0.jar", b"jar-data")
.await
.unwrap();
storage
.put("cargo/serde/1.0.0/serde-1.0.0.crate", b"crate-data")
.await
.unwrap();
let result = run_gc(&storage, true).await;
assert_eq!(result.total_blobs, 0);
assert_eq!(result.orphaned_blobs, 0);
}
#[tokio::test]
async fn test_gc_does_not_delete_npm_tarballs() {
let dir = tempfile::tempdir().unwrap();
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
// Regression test: npm tarballs were previously collected because
// their keys contain "/tarballs/", but no reference resolver existed
// for npm metadata, so they were all treated as orphans.
storage
.put("npm/lodash/tarballs/lodash-4.17.21.tgz", b"tarball-data")
.await
.unwrap();
let result = run_gc(&storage, true).await;
// npm tarballs contain "tarballs/" which matches the filter
assert_eq!(result.total_blobs, 1);
let result = run_gc(&storage, false).await;
assert_eq!(result.deleted_blobs, 0);
// Verify tarball still exists
assert!(storage
.get("npm/lodash/tarballs/lodash-4.17.21.tgz")
.await
.is_ok());
}
#[tokio::test]
async fn test_gc_deletes_docker_orphan_but_preserves_npm() {
let dir = tempfile::tempdir().unwrap();
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
// Docker manifest referencing one blob
let manifest = serde_json::json!({
"config": {"digest": "sha256:configabc"},
"layers": []
});
storage
.put(
"docker/test/manifests/latest.json",
manifest.to_string().as_bytes(),
)
.await
.unwrap();
storage
.put("docker/test/blobs/sha256:configabc", b"config")
.await
.unwrap();
// Orphan Docker blob
storage
.put("docker/test/blobs/sha256:orphan1", b"orphan")
.await
.unwrap();
// npm tarball that must survive
storage
.put("npm/lodash/tarballs/lodash-4.17.21.tgz", b"tarball-data")
.await
.unwrap();
let result = run_gc(&storage, false).await;
assert_eq!(result.total_blobs, 2); // only Docker blobs counted
assert_eq!(result.orphaned_blobs, 1);
assert_eq!(result.deleted_blobs, 1);
// Docker orphan gone
assert!(storage
.get("docker/test/blobs/sha256:orphan1")
.await
.is_err());
// Docker referenced blob still exists
assert!(storage
.get("docker/test/blobs/sha256:configabc")
.await
.is_ok());
// npm tarball untouched
assert!(storage
.get("npm/lodash/tarballs/lodash-4.17.21.tgz")
.await
.is_ok());
}
}