mirror of
https://github.com/getnora-io/nora.git
synced 2026-04-12 17:20:33 +00:00
collect_all_blobs scanned all seven registry prefixes for keys containing /blobs/ or /tarballs/, but collect_referenced_digests only reads Docker manifests. Non-Docker artifacts (notably npm tarballs) were collected but never marked as referenced, so running gc without --dry-run would delete them. Restrict blob collection to docker/ until per-registry reference resolution exists for other registry types. // ticktockbent
411 lines
13 KiB
Rust
411 lines
13 KiB
Rust
//! Garbage Collection for orphaned Docker blobs
|
|
//!
|
|
//! Mark-and-sweep approach:
|
|
//! 1. List all Docker blobs
|
|
//! 2. Parse Docker manifests to find referenced blobs
|
|
//! 3. Blobs not referenced by any manifest = orphans
|
|
//! 4. Delete orphans (with --dry-run support)
|
|
//!
|
|
//! Currently Docker-only. Other registries (npm, maven, cargo, pypi, go,
|
|
//! raw) are excluded because no reference resolver exists for their
|
|
//! metadata formats.
|
|
|
|
use std::collections::HashSet;
|
|
|
|
use tracing::info;
|
|
|
|
use crate::storage::Storage;
|
|
|
|
pub struct GcResult {
|
|
pub total_blobs: usize,
|
|
pub referenced_blobs: usize,
|
|
pub orphaned_blobs: usize,
|
|
pub deleted_blobs: usize,
|
|
pub orphan_keys: Vec<String>,
|
|
}
|
|
|
|
pub async fn run_gc(storage: &Storage, dry_run: bool) -> GcResult {
|
|
info!("Starting garbage collection (dry_run={})", dry_run);
|
|
|
|
// 1. Collect all blob keys
|
|
let all_blobs = collect_all_blobs(storage).await;
|
|
info!("Found {} total blobs", all_blobs.len());
|
|
|
|
// 2. Collect all referenced digests from manifests
|
|
let referenced = collect_referenced_digests(storage).await;
|
|
info!(
|
|
"Found {} referenced digests from manifests",
|
|
referenced.len()
|
|
);
|
|
|
|
// 3. Find orphans
|
|
let mut orphan_keys: Vec<String> = Vec::new();
|
|
for key in &all_blobs {
|
|
if let Some(digest) = key.rsplit('/').next() {
|
|
if !referenced.contains(digest) {
|
|
orphan_keys.push(key.clone());
|
|
}
|
|
}
|
|
}
|
|
|
|
info!("Found {} orphaned blobs", orphan_keys.len());
|
|
|
|
let mut deleted = 0;
|
|
if !dry_run {
|
|
for key in &orphan_keys {
|
|
if storage.delete(key).await.is_ok() {
|
|
deleted += 1;
|
|
info!("Deleted: {}", key);
|
|
}
|
|
}
|
|
info!("Deleted {} orphaned blobs", deleted);
|
|
} else {
|
|
for key in &orphan_keys {
|
|
info!("[dry-run] Would delete: {}", key);
|
|
}
|
|
}
|
|
|
|
GcResult {
|
|
total_blobs: all_blobs.len(),
|
|
referenced_blobs: referenced.len(),
|
|
orphaned_blobs: orphan_keys.len(),
|
|
deleted_blobs: deleted,
|
|
orphan_keys,
|
|
}
|
|
}
|
|
|
|
async fn collect_all_blobs(storage: &Storage) -> Vec<String> {
|
|
let mut blobs = Vec::new();
|
|
// Only collect Docker blobs. Other registries (npm, maven, cargo, pypi,
|
|
// go, raw) use storage key schemes that collect_referenced_digests does
|
|
// not understand, so their artifacts would appear as orphans and be
|
|
// deleted. Extending GC to non-Docker registries requires per-registry
|
|
// reference resolution.
|
|
let keys = storage.list("docker/").await;
|
|
for key in keys {
|
|
if key.contains("/blobs/") {
|
|
blobs.push(key);
|
|
}
|
|
}
|
|
blobs
|
|
}
|
|
|
|
async fn collect_referenced_digests(storage: &Storage) -> HashSet<String> {
|
|
let mut referenced = HashSet::new();
|
|
|
|
let all_keys = storage.list("docker/").await;
|
|
for key in &all_keys {
|
|
if !key.contains("/manifests/") || !key.ends_with(".json") || key.ends_with(".meta.json") {
|
|
continue;
|
|
}
|
|
|
|
if let Ok(data) = storage.get(key).await {
|
|
if let Ok(json) = serde_json::from_slice::<serde_json::Value>(&data) {
|
|
if let Some(config) = json.get("config") {
|
|
if let Some(digest) = config.get("digest").and_then(|v| v.as_str()) {
|
|
referenced.insert(digest.to_string());
|
|
}
|
|
}
|
|
|
|
if let Some(layers) = json.get("layers").and_then(|v| v.as_array()) {
|
|
for layer in layers {
|
|
if let Some(digest) = layer.get("digest").and_then(|v| v.as_str()) {
|
|
referenced.insert(digest.to_string());
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(manifests) = json.get("manifests").and_then(|v| v.as_array()) {
|
|
for m in manifests {
|
|
if let Some(digest) = m.get("digest").and_then(|v| v.as_str()) {
|
|
referenced.insert(digest.to_string());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
referenced
|
|
}
|
|
|
|
#[cfg(test)]
|
|
#[allow(clippy::unwrap_used)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
#[test]
|
|
fn test_gc_result_defaults() {
|
|
let result = GcResult {
|
|
total_blobs: 0,
|
|
referenced_blobs: 0,
|
|
orphaned_blobs: 0,
|
|
deleted_blobs: 0,
|
|
orphan_keys: vec![],
|
|
};
|
|
assert_eq!(result.total_blobs, 0);
|
|
assert!(result.orphan_keys.is_empty());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_empty_storage() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
let result = run_gc(&storage, true).await;
|
|
assert_eq!(result.total_blobs, 0);
|
|
assert_eq!(result.referenced_blobs, 0);
|
|
assert_eq!(result.orphaned_blobs, 0);
|
|
assert_eq!(result.deleted_blobs, 0);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_no_orphans() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
// Create a manifest that references a blob
|
|
let manifest = serde_json::json!({
|
|
"config": {"digest": "sha256:configabc"},
|
|
"layers": [{"digest": "sha256:layer111", "size": 100}]
|
|
});
|
|
storage
|
|
.put(
|
|
"docker/test/manifests/latest.json",
|
|
manifest.to_string().as_bytes(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:configabc", b"config-data")
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:layer111", b"layer-data")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, true).await;
|
|
assert_eq!(result.total_blobs, 2);
|
|
assert_eq!(result.orphaned_blobs, 0);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_finds_orphans_dry_run() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
// Create a manifest referencing only one blob
|
|
let manifest = serde_json::json!({
|
|
"config": {"digest": "sha256:configabc"},
|
|
"layers": [{"digest": "sha256:layer111", "size": 100}]
|
|
});
|
|
storage
|
|
.put(
|
|
"docker/test/manifests/latest.json",
|
|
manifest.to_string().as_bytes(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:configabc", b"config-data")
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:layer111", b"layer-data")
|
|
.await
|
|
.unwrap();
|
|
// Orphan blob (not referenced)
|
|
storage
|
|
.put("docker/test/blobs/sha256:orphan999", b"orphan-data")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, true).await;
|
|
assert_eq!(result.total_blobs, 3);
|
|
assert_eq!(result.orphaned_blobs, 1);
|
|
assert_eq!(result.deleted_blobs, 0); // dry run
|
|
assert!(result.orphan_keys[0].contains("orphan999"));
|
|
|
|
// Verify orphan still exists (dry run)
|
|
assert!(storage
|
|
.get("docker/test/blobs/sha256:orphan999")
|
|
.await
|
|
.is_ok());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_deletes_orphans() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
let manifest = serde_json::json!({
|
|
"config": {"digest": "sha256:configabc"},
|
|
"layers": []
|
|
});
|
|
storage
|
|
.put(
|
|
"docker/test/manifests/latest.json",
|
|
manifest.to_string().as_bytes(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:configabc", b"config")
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:orphan1", b"orphan")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, false).await;
|
|
assert_eq!(result.orphaned_blobs, 1);
|
|
assert_eq!(result.deleted_blobs, 1);
|
|
|
|
// Verify orphan is gone
|
|
assert!(storage
|
|
.get("docker/test/blobs/sha256:orphan1")
|
|
.await
|
|
.is_err());
|
|
// Referenced blob still exists
|
|
assert!(storage
|
|
.get("docker/test/blobs/sha256:configabc")
|
|
.await
|
|
.is_ok());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_manifest_list_references() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
// Multi-arch manifest list
|
|
let manifest = serde_json::json!({
|
|
"manifests": [
|
|
{"digest": "sha256:platformA", "size": 100},
|
|
{"digest": "sha256:platformB", "size": 200}
|
|
]
|
|
});
|
|
storage
|
|
.put(
|
|
"docker/multi/manifests/latest.json",
|
|
manifest.to_string().as_bytes(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/multi/blobs/sha256:platformA", b"arch-a")
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/multi/blobs/sha256:platformB", b"arch-b")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, true).await;
|
|
assert_eq!(result.orphaned_blobs, 0);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_ignores_non_docker_registries() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
// Non-Docker artifacts must not be collected by GC, because
|
|
// collect_referenced_digests only understands Docker manifests.
|
|
// Without this guard, these would all appear as orphans and be deleted.
|
|
storage
|
|
.put("npm/lodash/tarballs/lodash-4.17.21.tgz", b"tarball-data")
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("maven/com/example/lib/1.0/lib-1.0.jar", b"jar-data")
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("cargo/serde/1.0.0/serde-1.0.0.crate", b"crate-data")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, true).await;
|
|
assert_eq!(result.total_blobs, 0);
|
|
assert_eq!(result.orphaned_blobs, 0);
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_does_not_delete_npm_tarballs() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
// Regression test: npm tarballs were previously collected because
|
|
// their keys contain "/tarballs/", but no reference resolver existed
|
|
// for npm metadata, so they were all treated as orphans.
|
|
storage
|
|
.put("npm/lodash/tarballs/lodash-4.17.21.tgz", b"tarball-data")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, false).await;
|
|
assert_eq!(result.deleted_blobs, 0);
|
|
// Verify tarball still exists
|
|
assert!(storage
|
|
.get("npm/lodash/tarballs/lodash-4.17.21.tgz")
|
|
.await
|
|
.is_ok());
|
|
}
|
|
|
|
#[tokio::test]
|
|
async fn test_gc_deletes_docker_orphan_but_preserves_npm() {
|
|
let dir = tempfile::tempdir().unwrap();
|
|
let storage = Storage::new_local(dir.path().join("data").to_str().unwrap());
|
|
|
|
// Docker manifest referencing one blob
|
|
let manifest = serde_json::json!({
|
|
"config": {"digest": "sha256:configabc"},
|
|
"layers": []
|
|
});
|
|
storage
|
|
.put(
|
|
"docker/test/manifests/latest.json",
|
|
manifest.to_string().as_bytes(),
|
|
)
|
|
.await
|
|
.unwrap();
|
|
storage
|
|
.put("docker/test/blobs/sha256:configabc", b"config")
|
|
.await
|
|
.unwrap();
|
|
// Orphan Docker blob
|
|
storage
|
|
.put("docker/test/blobs/sha256:orphan1", b"orphan")
|
|
.await
|
|
.unwrap();
|
|
// npm tarball that must survive
|
|
storage
|
|
.put("npm/lodash/tarballs/lodash-4.17.21.tgz", b"tarball-data")
|
|
.await
|
|
.unwrap();
|
|
|
|
let result = run_gc(&storage, false).await;
|
|
assert_eq!(result.total_blobs, 2); // only Docker blobs counted
|
|
assert_eq!(result.orphaned_blobs, 1);
|
|
assert_eq!(result.deleted_blobs, 1);
|
|
// Docker orphan gone
|
|
assert!(storage
|
|
.get("docker/test/blobs/sha256:orphan1")
|
|
.await
|
|
.is_err());
|
|
// Docker referenced blob still exists
|
|
assert!(storage
|
|
.get("docker/test/blobs/sha256:configabc")
|
|
.await
|
|
.is_ok());
|
|
// npm tarball untouched
|
|
assert!(storage
|
|
.get("npm/lodash/tarballs/lodash-4.17.21.tgz")
|
|
.await
|
|
.is_ok());
|
|
}
|
|
}
|