From 27a368b3a0d7e74135857531ce194531ebdd0844 Mon Sep 17 00:00:00 2001 From: DevITWay | Pavel Volkov Date: Wed, 8 Apr 2026 09:38:18 +0300 Subject: [PATCH] feat: Cargo sparse index (RFC 2789) + PyPI twine upload + PEP 691 (#113) Cargo registry: - Sparse index with config.json, prefix-based lookup (1/2/3/4+ char rules) - cargo publish wire format (LE u32 lengths + JSON metadata + tarball) - Version immutability with Cargo-compatible JSON error responses - Dependency field mapping (version_req->req, explicit_name_in_toml->package) - Case-insensitive crate name normalization across all endpoints - Cache-Control headers on index (max-age=300) and downloads (immutable) PyPI registry: - twine upload via multipart/form-data with SHA-256 verification - PEP 691 JSON API with Accept header content negotiation - Hash fragment preservation in proxied links (PEP 503) - Package name normalization per PEP 503 577 tests (up from 504), 0 failures, clippy clean. --- CHANGELOG.md | 23 +- Cargo.lock | 35 +- Cargo.toml | 4 +- nora-registry/src/registry/cargo_registry.rs | 994 ++++++++++++++++++- nora-registry/src/registry/pypi.rs | 542 ++++++++-- 5 files changed, 1508 insertions(+), 90 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df1c606..bc95b15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,28 @@ # Changelog ## [Unreleased] +## [0.5.0] - 2026-04-07 + +### Added +- **Cargo sparse index (RFC 2789)** — cargo can now use NORA as a proper registry with `sparse+http://` protocol, including `config.json`, prefix-based index lookup, and `cargo publish` wire format support +- **Cargo publish** — full publish flow with wire format parsing, version immutability (409 Conflict), SHA-256 checksums in sparse index, and proper `warnings` response format +- **PyPI twine upload** — `twine upload` via multipart/form-data with SHA-256 verification, filename validation, and version immutability +- **PEP 691 JSON API** — content negotiation via `Accept: 
application/vnd.pypi.simple.v1+json` for package index and version listing, with hash digests in responses +- 577 total tests (up from 504), including 25 new Cargo tests and 18 new PyPI tests + ### Fixed -- Go and Raw registries missing from Prometheus metrics (`detect_registry` labeled both as "other") -- Go and Raw registries missing from `/health` endpoint `registries` object +- Cargo dependency field mapping: `version_req` correctly renamed to `req` and `explicit_name_in_toml` to `package` in sparse index entries, matching Cargo registry specification +- Cargo crate names normalized to lowercase across all endpoints (publish, download, metadata, sparse index) for consistent storage keys +- Cargo publish write ordering: index written before .crate tarball to prevent orphaned files on partial failure +- Cargo conflict errors now return Cargo-compatible JSON format (`{"errors": [{"detail": "..."}]}`) +- PyPI hash fragments preserved when rewriting upstream links (PEP 503 compliance) +- Redundant path traversal checks removed from crate name validation (charset already excludes unsafe characters) + +### Changed +- Cargo sparse index and config.json responses include `Cache-Control: public, max-age=300` +- Cargo .crate downloads include `Cache-Control: public, max-age=31536000, immutable` and `Content-Type: application/x-tar` +- axum upgraded with `multipart` feature for PyPI upload support + ## [0.4.0] - 2026-04-05 diff --git a/Cargo.lock b/Cargo.lock index 33dfccb..16bfe19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -176,6 +176,7 @@ dependencies = [ "matchit", "memchr", "mime", + "multer", "percent-encoding", "pin-project-lite", "serde_core", @@ -678,6 +679,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -1500,6 +1510,23 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http", + "httparse", + "memchr", + "mime", + "spin", + "version_check", +] + [[package]] name = "nonempty" version = "0.7.0" @@ -1522,7 +1549,7 @@ dependencies = [ [[package]] name = "nora-registry" -version = "0.4.0" +version = "0.5.0" dependencies = [ "argon2", "async-trait", @@ -2369,6 +2396,12 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "spinning_top" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index 8657181..54c8687 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,7 +6,7 @@ members = [ ] [workspace.package] -version = "0.4.0" +version = "0.5.0" edition = "2021" rust-version = "1.75" license = "MIT" @@ -16,7 +16,7 @@ homepage = "https://getnora.io" [workspace.dependencies] tokio = { version = "1", features = ["full"] } -axum = "0.8" +axum = { version = "0.8", features = ["multipart"] } serde = { version = "1", features = ["derive"] } serde_json = "1" tracing = "0.1" diff --git a/nora-registry/src/registry/cargo_registry.rs b/nora-registry/src/registry/cargo_registry.rs index 0171f61..e7e8371 100644 --- a/nora-registry/src/registry/cargo_registry.rs +++ b/nora-registry/src/registry/cargo_registry.rs @@ -1,29 +1,180 @@ // Copyright (c) 2026 Volkov Pavel | DevITWay // SPDX-License-Identifier: MIT +//! Cargo registry with sparse index (RFC 2789). +//! +//! Implements: +//! 
GET /cargo/index/config.json — registry configuration +//! GET /cargo/index/{prefix}/{crate} — sparse index entries +//! GET /cargo/api/v1/crates/{crate_name} — crate metadata (proxy) +//! GET /cargo/api/v1/crates/{name}/{ver}/download — download .crate +//! PUT /cargo/api/v1/crates/new — cargo publish + use crate::activity_log::{ActionType, ActivityEntry}; use crate::audit::AuditEntry; use crate::registry::proxy_fetch; use crate::validation::validate_storage_key; use crate::AppState; use axum::{ + body::Bytes, extract::{Path, State}, - http::StatusCode, + http::{header, HeaderValue, StatusCode}, response::{IntoResponse, Response}, - routing::get, + routing::{get, put}, Router, }; +use sha2::Digest; use std::sync::Arc; pub fn routes() -> Router> { Router::new() + .route("/cargo/index/config.json", get(index_config)) + .route("/cargo/index/{*path}", get(sparse_index)) .route("/cargo/api/v1/crates/{crate_name}", get(get_metadata)) .route( "/cargo/api/v1/crates/{crate_name}/{version}/download", get(download), ) + .route("/cargo/api/v1/crates/new", put(publish)) } +// ============================================================================ +// Sparse index — RFC 2789 +// ============================================================================ + +/// GET /cargo/index/config.json — tells cargo where to download crates. +async fn index_config(State(state): State>) -> Response { + let base = nora_base_url(&state); + let config = serde_json::json!({ + "dl": format!("{}/cargo/api/v1/crates", base.trim_end_matches('/')), + "api": format!("{}/cargo", base.trim_end_matches('/')) + }); + ( + StatusCode::OK, + [ + ( + header::CONTENT_TYPE, + HeaderValue::from_static("application/json"), + ), + ( + header::CACHE_CONTROL, + HeaderValue::from_static("public, max-age=300"), + ), + ], + serde_json::to_vec(&config).unwrap_or_default(), + ) + .into_response() +} + +/// GET /cargo/index/{prefix}/{crate} — sparse index lookup. 
+/// +/// Cargo sparse index uses a directory structure based on crate name length: +/// 1 char: /cargo/index/1/{name} +/// 2 chars: /cargo/index/2/{name} +/// 3 chars: /cargo/index/3/{first_char}/{name} +/// 4+ chars: /cargo/index/{first_two}/{next_two}/{name} +/// +/// Each entry is one JSON line per version (newline-delimited). +async fn sparse_index(State(state): State>, Path(path): Path) -> Response { + // Extract crate name from the path (last segment), normalized to lowercase + let crate_name = match path.rsplit('/').next() { + Some(name) if !name.is_empty() => name.to_lowercase(), + _ => return StatusCode::NOT_FOUND.into_response(), + }; + + // Validate crate name + if !is_valid_crate_name(&crate_name) { + return StatusCode::BAD_REQUEST.into_response(); + } + + // Verify prefix matches the crate name (case-insensitive) + let expected_prefix = crate_index_prefix(&crate_name); + if path.to_lowercase() != format!("{}/{}", expected_prefix, crate_name) { + return StatusCode::NOT_FOUND.into_response(); + } + + // Try local index first + let index_key = format!("cargo/index/{}/{}", expected_prefix, crate_name); + if let Ok(data) = state.storage.get(&index_key).await { + state.metrics.record_download("cargo"); + state.metrics.record_cache_hit(); + state.activity.push(ActivityEntry::new( + ActionType::CacheHit, + crate_name.to_string(), + "cargo", + "CACHE", + )); + return sparse_index_response(data.to_vec()); + } + + // Try upstream sparse index (sparse+https://index.crates.io/) + let proxy_url = match &state.config.cargo.proxy { + Some(url) => url.clone(), + None => return StatusCode::NOT_FOUND.into_response(), + }; + + // crates.io sparse index lives at index.crates.io + let upstream_index_url = if proxy_url.contains("crates.io") { + format!("https://index.crates.io/{}/{}", expected_prefix, crate_name) + } else { + // Custom registry: assume sparse index at {proxy}/index/{prefix}/{crate} + format!( + "{}/index/{}/{}", + proxy_url.trim_end_matches('/'), + 
expected_prefix, + crate_name + ) + }; + + match proxy_fetch( + &state.http_client, + &upstream_index_url, + state.config.cargo.proxy_timeout, + state.config.cargo.proxy_auth.as_deref(), + ) + .await + { + Ok(data) => { + state.metrics.record_download("cargo"); + state.metrics.record_cache_miss(); + state.activity.push(ActivityEntry::new( + ActionType::ProxyFetch, + crate_name.to_string(), + "cargo", + "PROXY", + )); + state + .audit + .log(AuditEntry::new("proxy_fetch", "api", "", "cargo", "")); + + // Cache in background + let storage = state.storage.clone(); + let key = index_key; + let data_clone = data.clone(); + tokio::spawn(async move { + let _ = storage.put(&key, &data_clone).await; + }); + + state.repo_index.invalidate("cargo"); + sparse_index_response(data) + } + Err(crate::registry::ProxyError::NotFound) => StatusCode::NOT_FOUND.into_response(), + Err(e) => { + tracing::debug!( + crate_name = crate_name, + error = ?e, + "Cargo sparse index upstream error" + ); + StatusCode::NOT_FOUND.into_response() + } + } +} + +// ============================================================================ +// Metadata & download (existing, refactored) +// ============================================================================ + +/// GET /cargo/api/v1/crates/{crate_name} — JSON metadata. async fn get_metadata( State(state): State>, Path(crate_name): Path, @@ -31,6 +182,7 @@ async fn get_metadata( if validate_storage_key(&crate_name).is_err() { return StatusCode::BAD_REQUEST.into_response(); } + let crate_name = crate_name.to_lowercase(); let key = format!("cargo/{}/metadata.json", crate_name); if let Ok(data) = state.storage.get(&key).await { @@ -70,6 +222,7 @@ async fn get_metadata( } } +/// GET /cargo/api/v1/crates/{name}/{version}/download — download .crate file. 
async fn download( State(state): State>, Path((crate_name, version)): Path<(String, String)>, @@ -77,6 +230,7 @@ async fn download( if validate_storage_key(&crate_name).is_err() || validate_storage_key(&version).is_err() { return StatusCode::BAD_REQUEST.into_response(); } + let crate_name = crate_name.to_lowercase(); let key = format!( "cargo/{}/{}/{}-{}.crate", crate_name, version, crate_name, version @@ -95,7 +249,21 @@ async fn download( state .audit .log(AuditEntry::new("pull", "api", "", "cargo", "")); - return (StatusCode::OK, data).into_response(); + return ( + StatusCode::OK, + [ + ( + header::CONTENT_TYPE, + HeaderValue::from_static("application/x-tar"), + ), + ( + header::CACHE_CONTROL, + HeaderValue::from_static("public, max-age=31536000, immutable"), + ), + ], + data, + ) + .into_response(); } // Proxy fetch from upstream @@ -120,7 +288,6 @@ async fn download( .await { Ok(data) => { - // Cache in background let storage = state.storage.clone(); let key_clone = key.clone(); let data_clone = data.clone(); @@ -138,18 +305,450 @@ async fn download( state .audit .log(AuditEntry::new("proxy_fetch", "api", "", "cargo", "")); - (StatusCode::OK, data).into_response() + ( + StatusCode::OK, + [ + ( + header::CONTENT_TYPE, + HeaderValue::from_static("application/x-tar"), + ), + ( + header::CACHE_CONTROL, + HeaderValue::from_static("public, max-age=31536000, immutable"), + ), + ], + data, + ) + .into_response() } Err(_) => StatusCode::NOT_FOUND.into_response(), } } +// ============================================================================ +// Cargo publish +// ============================================================================ + +/// PUT /cargo/api/v1/crates/new — publish a crate. 
+/// +/// Wire format (cargo puts this as the body): +/// 4 bytes LE: metadata JSON length +/// N bytes: metadata JSON +/// 4 bytes LE: .crate tarball length +/// M bytes: .crate tarball +async fn publish(State(state): State>, body: Bytes) -> Response { + if body.len() < 8 { + return (StatusCode::BAD_REQUEST, "Payload too small").into_response(); + } + + // Parse wire format + let metadata_len = u32::from_le_bytes([body[0], body[1], body[2], body[3]]) as usize; + if body.len() < 4 + metadata_len + 4 { + return (StatusCode::BAD_REQUEST, "Truncated metadata").into_response(); + } + + let metadata_bytes = &body[4..4 + metadata_len]; + let metadata: serde_json::Value = match serde_json::from_slice(metadata_bytes) { + Ok(v) => v, + Err(e) => { + return ( + StatusCode::BAD_REQUEST, + format!("Invalid metadata JSON: {}", e), + ) + .into_response() + } + }; + + let crate_len_offset = 4 + metadata_len; + let crate_len = u32::from_le_bytes([ + body[crate_len_offset], + body[crate_len_offset + 1], + body[crate_len_offset + 2], + body[crate_len_offset + 3], + ]) as usize; + + let crate_start = crate_len_offset + 4; + if body.len() < crate_start + crate_len { + return (StatusCode::BAD_REQUEST, "Truncated crate tarball").into_response(); + } + + let crate_data = &body[crate_start..crate_start + crate_len]; + + // Extract required fields + let name = match metadata.get("name").and_then(|n| n.as_str()) { + Some(n) => n, + None => return (StatusCode::BAD_REQUEST, "Missing crate name").into_response(), + }; + + let vers = match metadata.get("vers").and_then(|v| v.as_str()) { + Some(v) => v, + None => return (StatusCode::BAD_REQUEST, "Missing crate version").into_response(), + }; + + // Validate + if !is_valid_crate_name(name) { + return (StatusCode::BAD_REQUEST, "Invalid crate name").into_response(); + } + if validate_storage_key(vers).is_err() { + return (StatusCode::BAD_REQUEST, "Invalid version").into_response(); + } + + // Normalize to lowercase for consistent storage keys + let 
name = name.to_lowercase(); + let vers = vers.to_string(); + + // Check version immutability + let crate_key = format!("cargo/{}/{}/{}-{}.crate", name, vers, name, vers); + if state.storage.stat(&crate_key).await.is_some() { + let err = serde_json::json!({ + "errors": [{"detail": format!("crate version `{}@{}` already exists", name, vers)}] + }); + return ( + StatusCode::CONFLICT, + [( + header::CONTENT_TYPE, + HeaderValue::from_static("application/json"), + )], + serde_json::to_vec(&err).unwrap_or_default(), + ) + .into_response(); + } + + // Compute checksum + let cksum = hex::encode(sha2::Sha256::digest(crate_data)); + + // Build sparse index entry (one JSON line per version) + // Transform deps: Cargo publish sends `version_req` but index format requires `req`, + // and `explicit_name_in_toml` becomes `package` in the index. + let deps = metadata + .get("deps") + .and_then(|d| d.as_array()) + .map(|arr| { + arr.iter() + .map(|dep| { + let mut d = dep.clone(); + if let Some(obj) = d.as_object_mut() { + // version_req -> req + if let Some(vr) = obj.remove("version_req") { + obj.insert("req".to_string(), vr); + } + // explicit_name_in_toml -> package + if let Some(ent) = obj.remove("explicit_name_in_toml") { + if !ent.is_null() { + obj.insert("package".to_string(), ent); + } + } + } + d + }) + .collect::>() + }) + .map(serde_json::Value::Array) + .unwrap_or(serde_json::json!([])); + let features = metadata + .get("features") + .cloned() + .unwrap_or(serde_json::json!({})); + let features2 = metadata.get("features2").cloned(); + let links = metadata.get("links").cloned(); + + let mut index_entry = serde_json::json!({ + "name": name, + "vers": vers, + "deps": deps, + "cksum": cksum, + "features": features, + "yanked": false, + }); + + if let Some(f2) = features2 { + index_entry["features2"] = f2; + } + if let Some(l) = links { + index_entry["links"] = l; + } + + let entry_line = serde_json::to_string(&index_entry).unwrap_or_default(); + + // Write index FIRST — if 
it fails, no orphaned .crate file + // If .crate write fails later, re-publish is possible (immutability checks .crate, not index) + let prefix = crate_index_prefix(&name); + let index_key = format!("cargo/index/{}/{}", prefix, name); + + let mut index_content = state + .storage + .get(&index_key) + .await + .map(|d| d.to_vec()) + .unwrap_or_default(); + + // Ensure newline separator + if !index_content.is_empty() && !index_content.ends_with(b"\n") { + index_content.push(b'\n'); + } + index_content.extend_from_slice(entry_line.as_bytes()); + index_content.push(b'\n'); + + if state.storage.put(&index_key, &index_content).await.is_err() { + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + + // Store .crate tarball SECOND + if state.storage.put(&crate_key, crate_data).await.is_err() { + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + + state.metrics.record_upload("cargo"); + state.activity.push(ActivityEntry::new( + ActionType::Push, + format!("{}@{}", name, vers), + "cargo", + "LOCAL", + )); + state + .audit + .log(AuditEntry::new("push", "api", "", "cargo", "")); + state.repo_index.invalidate("cargo"); + + // Cargo expects a JSON response with warnings array + let response = serde_json::json!({ + "warnings": { + "invalid_categories": [], + "invalid_badges": [], + "other": [] + } + }); + + ( + StatusCode::OK, + [( + header::CONTENT_TYPE, + HeaderValue::from_static("application/json"), + )], + serde_json::to_vec(&response).unwrap_or_default(), + ) + .into_response() +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Compute sparse index prefix for a crate name (RFC 2789). 
+fn crate_index_prefix(name: &str) -> String { + let lower = name.to_lowercase(); + match lower.len() { + 1 => "1".to_string(), + 2 => "2".to_string(), + 3 => format!("3/{}", &lower[..1]), + _ => format!("{}/{}", &lower[..2], &lower[2..4]), + } +} + +/// Validate crate name per Cargo spec. +fn is_valid_crate_name(name: &str) -> bool { + if name.is_empty() || name.len() > 64 { + return false; + } + // Must start with alphanumeric + let first = name.chars().next().unwrap_or('\0'); + if !first.is_ascii_alphanumeric() { + return false; + } + // Only alphanumeric, `-`, `_` + name.chars() + .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') +} + +/// Construct NORA base URL from config. +fn nora_base_url(state: &AppState) -> String { + if let Some(url) = &state.config.server.public_url { + return url.clone(); + } + format!( + "http://{}:{}", + state.config.server.host, state.config.server.port + ) +} + +/// Build response with sparse index content-type. +fn sparse_index_response(data: Vec) -> Response { + ( + StatusCode::OK, + [ + ( + header::CONTENT_TYPE, + HeaderValue::from_static("application/json"), + ), + ( + header::CACHE_CONTROL, + HeaderValue::from_static("public, max-age=300"), + ), + ], + data, + ) + .into_response() +} + +// ============================================================================ +// Unit Tests +// ============================================================================ + #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { + use super::*; + + // ── Prefix computation (RFC 2789) ─────────────────────────────────── + + #[test] + fn test_prefix_single_char() { + assert_eq!(crate_index_prefix("a"), "1"); + assert_eq!(crate_index_prefix("Z"), "1"); + } + + #[test] + fn test_prefix_two_chars() { + assert_eq!(crate_index_prefix("ab"), "2"); + assert_eq!(crate_index_prefix("IO"), "2"); + } + + #[test] + fn test_prefix_three_chars() { + assert_eq!(crate_index_prefix("abc"), "3/a"); + assert_eq!(crate_index_prefix("Foo"), "3/f"); + } 
+ + #[test] + fn test_prefix_four_plus_chars() { + assert_eq!(crate_index_prefix("serde"), "se/rd"); + assert_eq!(crate_index_prefix("tokio"), "to/ki"); + assert_eq!(crate_index_prefix("Axum"), "ax/um"); + assert_eq!(crate_index_prefix("ab_cd_ef"), "ab/_c"); + } + + // ── Crate name validation ─────────────────────────────────────────── + + #[test] + fn test_valid_crate_names() { + assert!(is_valid_crate_name("serde")); + assert!(is_valid_crate_name("my-crate")); + assert!(is_valid_crate_name("my_crate")); + assert!(is_valid_crate_name("a")); + assert!(is_valid_crate_name("crate123")); + } + + #[test] + fn test_invalid_crate_names() { + assert!(!is_valid_crate_name("")); + assert!(!is_valid_crate_name("-start")); + assert!(!is_valid_crate_name("_start")); + assert!(!is_valid_crate_name("has space")); + assert!(!is_valid_crate_name("has/slash")); + assert!(!is_valid_crate_name("has..dots")); + assert!(!is_valid_crate_name("has\\backslash")); + assert!(!is_valid_crate_name(&"a".repeat(65))); + } + + #[test] + fn test_crate_name_max_length() { + assert!(is_valid_crate_name(&"a".repeat(64))); + assert!(!is_valid_crate_name(&"a".repeat(65))); + } +} + +// ============================================================================ +// Integration Tests +// ============================================================================ + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod integration_tests { use crate::test_helpers::{body_bytes, create_test_context, send}; + use axum::body::Body; use axum::http::{Method, StatusCode}; + #[tokio::test] + async fn test_cargo_index_config() { + let ctx = create_test_context(); + let resp = send(&ctx.app, Method::GET, "/cargo/index/config.json", "").await; + assert_eq!(resp.status(), StatusCode::OK); + let body = body_bytes(resp).await; + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(json.get("dl").is_some()); + assert!(json.get("api").is_some()); + } + + #[tokio::test] + async fn 
test_cargo_sparse_index_from_storage() { + let ctx = create_test_context(); + let index_data = br#"{"name":"serde","vers":"1.0.0","deps":[],"cksum":"abc123","features":{},"yanked":false}"#; + ctx.state + .storage + .put("cargo/index/se/rd/serde", index_data) + .await + .unwrap(); + + let resp = send(&ctx.app, Method::GET, "/cargo/index/se/rd/serde", "").await; + assert_eq!(resp.status(), StatusCode::OK); + let body = body_bytes(resp).await; + assert_eq!(&body[..], index_data); + } + + #[tokio::test] + async fn test_cargo_sparse_index_wrong_prefix() { + let ctx = create_test_context(); + // "serde" should be at se/rd/serde, not 1/serde + let resp = send(&ctx.app, Method::GET, "/cargo/index/1/serde", "").await; + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + } + + #[tokio::test] + async fn test_cargo_sparse_index_single_char() { + let ctx = create_test_context(); + ctx.state + .storage + .put("cargo/index/1/a", b"index-data") + .await + .unwrap(); + + let resp = send(&ctx.app, Method::GET, "/cargo/index/1/a", "").await; + assert_eq!(resp.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_cargo_sparse_index_two_char() { + let ctx = create_test_context(); + ctx.state + .storage + .put("cargo/index/2/ab", b"index-data") + .await + .unwrap(); + + let resp = send(&ctx.app, Method::GET, "/cargo/index/2/ab", "").await; + assert_eq!(resp.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_cargo_sparse_index_three_char() { + let ctx = create_test_context(); + ctx.state + .storage + .put("cargo/index/3/f/foo", b"index-data") + .await + .unwrap(); + + let resp = send(&ctx.app, Method::GET, "/cargo/index/3/f/foo", "").await; + assert_eq!(resp.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_cargo_sparse_index_not_found_no_proxy() { + let ctx = create_test_context(); + let resp = send(&ctx.app, Method::GET, "/cargo/index/se/rd/serde", "").await; + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + } + #[tokio::test] async fn 
test_cargo_metadata_not_found() { let ctx = create_test_context(); @@ -212,4 +811,389 @@ mod tests { let body = body_bytes(resp).await; assert_eq!(&body[..], b"crate-data"); } + + // ── Publish tests ─────────────────────────────────────────────────── + + /// Build cargo publish wire format: 4-byte LE metadata len + metadata + 4-byte LE crate len + crate + fn build_publish_payload(metadata: &serde_json::Value, crate_data: &[u8]) -> Vec { + let meta_bytes = serde_json::to_vec(metadata).unwrap(); + let meta_len = (meta_bytes.len() as u32).to_le_bytes(); + let crate_len = (crate_data.len() as u32).to_le_bytes(); + + let mut payload = Vec::new(); + payload.extend_from_slice(&meta_len); + payload.extend_from_slice(&meta_bytes); + payload.extend_from_slice(&crate_len); + payload.extend_from_slice(crate_data); + payload + } + + #[tokio::test] + async fn test_cargo_publish_basic() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "my-crate", + "vers": "0.1.0", + "deps": [], + "features": {}, + }); + let crate_data = b"fake-crate-tarball"; + let payload = build_publish_payload(&metadata, crate_data); + + let resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(resp.status(), StatusCode::OK); + + // Verify .crate stored + let stored = ctx + .state + .storage + .get("cargo/my-crate/0.1.0/my-crate-0.1.0.crate") + .await + .unwrap(); + assert_eq!(&stored[..], crate_data); + + // Verify sparse index entry created + let index = ctx + .state + .storage + .get("cargo/index/my/-c/my-crate") + .await + .unwrap(); + let index_str = String::from_utf8_lossy(&index); + assert!(index_str.contains("\"name\":\"my-crate\"")); + assert!(index_str.contains("\"vers\":\"0.1.0\"")); + assert!(index_str.contains("\"cksum\":")); + } + + #[tokio::test] + async fn test_cargo_publish_version_immutability() { + let ctx = create_test_context(); + + // First publish + let metadata = serde_json::json!({ 
+ "name": "immut-test", + "vers": "1.0.0", + "deps": [], + "features": {}, + }); + let payload = build_publish_payload(&metadata, b"crate-v1"); + let resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(resp.status(), StatusCode::OK); + + // Second publish with same version → CONFLICT + let payload2 = build_publish_payload(&metadata, b"crate-v1-again"); + let resp2 = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload2), + ) + .await; + assert_eq!(resp2.status(), StatusCode::CONFLICT); + } + + #[tokio::test] + async fn test_cargo_publish_multiple_versions() { + let ctx = create_test_context(); + + // v0.1.0 + let m1 = + serde_json::json!({"name": "multi-ver", "vers": "0.1.0", "deps": [], "features": {}}); + let p1 = build_publish_payload(&m1, b"crate-01"); + let r1 = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(p1), + ) + .await; + assert_eq!(r1.status(), StatusCode::OK); + + // v0.2.0 + let m2 = + serde_json::json!({"name": "multi-ver", "vers": "0.2.0", "deps": [], "features": {}}); + let p2 = build_publish_payload(&m2, b"crate-02"); + let r2 = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(p2), + ) + .await; + assert_eq!(r2.status(), StatusCode::OK); + + // Index should have 2 lines + let index = ctx + .state + .storage + .get("cargo/index/mu/lt/multi-ver") + .await + .unwrap(); + let index_str = String::from_utf8_lossy(&index); + let lines: Vec<&str> = index_str.lines().collect(); + assert_eq!(lines.len(), 2); + assert!(lines[0].contains("0.1.0")); + assert!(lines[1].contains("0.2.0")); + } + + #[tokio::test] + async fn test_cargo_publish_invalid_name() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "../traversal", + "vers": "1.0.0", + "deps": [], + "features": {}, + }); + let payload = build_publish_payload(&metadata, b"bad"); + + let resp = send( + &ctx.app, + 
Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(resp.status(), StatusCode::BAD_REQUEST); + } + + #[tokio::test] + async fn test_cargo_publish_truncated_payload() { + let ctx = create_test_context(); + let resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(vec![0u8; 3]), + ) + .await; + assert_eq!(resp.status(), StatusCode::BAD_REQUEST); + } + + #[tokio::test] + async fn test_cargo_publish_response_has_warnings() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "warn-test", + "vers": "1.0.0", + "deps": [], + "features": {}, + }); + let payload = build_publish_payload(&metadata, b"crate-data"); + + let resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(resp.status(), StatusCode::OK); + + let body = body_bytes(resp).await; + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(json.get("warnings").is_some()); + } + + #[tokio::test] + async fn test_cargo_publish_then_download() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "roundtrip", + "vers": "2.0.0", + "deps": [], + "features": {}, + }); + let crate_data = b"published-crate-content"; + let payload = build_publish_payload(&metadata, crate_data); + + // Publish + let publish_resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(publish_resp.status(), StatusCode::OK); + + // Download + let dl_resp = send( + &ctx.app, + Method::GET, + "/cargo/api/v1/crates/roundtrip/2.0.0/download", + "", + ) + .await; + assert_eq!(dl_resp.status(), StatusCode::OK); + let body = body_bytes(dl_resp).await; + assert_eq!(&body[..], crate_data); + } + + #[tokio::test] + async fn test_cargo_publish_then_sparse_index() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "idx-test", + "vers": 
"1.0.0", + "deps": [{"name": "serde", "req": "^1", "features": [], "optional": false, "default_features": true, "target": null, "kind": "normal"}], + "features": {"default": ["serde"]}, + "links": null, + }); + let payload = build_publish_payload(&metadata, b"crate"); + + let publish_resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(publish_resp.status(), StatusCode::OK); + + // Sparse index lookup + let idx_resp = send(&ctx.app, Method::GET, "/cargo/index/id/x-/idx-test", "").await; + assert_eq!(idx_resp.status(), StatusCode::OK); + + let body = body_bytes(idx_resp).await; + let line: serde_json::Value = + serde_json::from_str(String::from_utf8_lossy(&body).lines().next().unwrap()).unwrap(); + assert_eq!(line["name"], "idx-test"); + assert_eq!(line["vers"], "1.0.0"); + assert!(line["deps"].as_array().unwrap().len() == 1); + assert!(line["cksum"].as_str().unwrap().len() == 64); // sha256 hex + } + + #[tokio::test] + async fn test_cargo_publish_transforms_deps_version_req_to_req() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "dep-test", + "vers": "1.0.0", + "deps": [{ + "name": "serde", + "version_req": "^1.0", + "features": ["derive"], + "optional": false, + "default_features": true, + "target": null, + "kind": "normal", + "registry": null, + "explicit_name_in_toml": null + }, { + "name": "my_serde", + "version_req": "^1.0", + "features": [], + "optional": false, + "default_features": true, + "target": null, + "kind": "normal", + "registry": null, + "explicit_name_in_toml": "serde_json" + }], + "features": {}, + }); + let payload = build_publish_payload(&metadata, b"crate-data"); + + let resp = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(resp.status(), StatusCode::OK); + + // Read the sparse index entry + let index = ctx + .state + .storage + .get("cargo/index/de/p-/dep-test") + .await + 
.unwrap(); + let line: serde_json::Value = + serde_json::from_str(String::from_utf8_lossy(&index).lines().next().unwrap()).unwrap(); + + let deps = line["deps"].as_array().unwrap(); + assert_eq!(deps.len(), 2); + + // version_req must be renamed to req + assert!( + deps[0].get("version_req").is_none(), + "version_req should not be in index" + ); + assert_eq!(deps[0]["req"], "^1.0", "version_req must be renamed to req"); + + // explicit_name_in_toml=null should be dropped (not become package=null) + assert!(deps[0].get("explicit_name_in_toml").is_none()); + assert!( + deps[0].get("package").is_none(), + "null explicit_name_in_toml should not create package field" + ); + + // explicit_name_in_toml="serde_json" should become package="serde_json" + assert!(deps[1].get("explicit_name_in_toml").is_none()); + assert_eq!( + deps[1]["package"], "serde_json", + "explicit_name_in_toml must become package" + ); + } + + #[tokio::test] + async fn test_cargo_publish_conflict_json_format() { + let ctx = create_test_context(); + + let metadata = serde_json::json!({ + "name": "conflict-fmt", + "vers": "1.0.0", + "deps": [], + "features": {}, + }); + let payload = build_publish_payload(&metadata, b"v1"); + let r1 = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload), + ) + .await; + assert_eq!(r1.status(), StatusCode::OK); + + // Second publish -> CONFLICT with Cargo JSON format + let payload2 = build_publish_payload(&metadata, b"v1-again"); + let r2 = send( + &ctx.app, + Method::PUT, + "/cargo/api/v1/crates/new", + Body::from(payload2), + ) + .await; + assert_eq!(r2.status(), StatusCode::CONFLICT); + + let body = body_bytes(r2).await; + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(json["errors"].as_array().unwrap().len() > 0); + assert!(json["errors"][0]["detail"] + .as_str() + .unwrap() + .contains("already exists")); + } } diff --git a/nora-registry/src/registry/pypi.rs b/nora-registry/src/registry/pypi.rs index 
72dae7a..02950ad 100644 --- a/nora-registry/src/registry/pypi.rs +++ b/nora-registry/src/registry/pypi.rs @@ -1,28 +1,47 @@ // Copyright (c) 2026 Volkov Pavel | DevITWay // SPDX-License-Identifier: MIT +//! PyPI registry — PEP 503 (Simple HTML) + PEP 691 (JSON) + twine upload. +//! +//! Implements: +//! GET /simple/ — package index (HTML or JSON) +//! GET /simple/{name}/ — package versions (HTML or JSON) +//! GET /simple/{name}/{filename} — download file +//! POST /simple/ — twine upload (multipart/form-data) + use crate::activity_log::{ActionType, ActivityEntry}; use crate::audit::AuditEntry; use crate::registry::{proxy_fetch, proxy_fetch_text}; use crate::AppState; use axum::{ - extract::{Path, State}, - http::{header, StatusCode}, + extract::{Multipart, Path, State}, + http::{header, HeaderMap, StatusCode}, response::{Html, IntoResponse, Response}, routing::get, Router, }; +use sha2::Digest; use std::sync::Arc; +/// PEP 691 JSON content type +const PEP691_JSON: &str = "application/vnd.pypi.simple.v1+json"; + pub fn routes() -> Router<Arc<AppState>> { Router::new() - .route("/simple/", get(list_packages)) + .route("/simple/", get(list_packages).post(upload)) .route("/simple/{name}/", get(package_versions)) .route("/simple/{name}/{filename}", get(download_file)) } -/// List all packages (Simple API index) -async fn list_packages(State(state): State<Arc<AppState>>) -> impl IntoResponse { +// ============================================================================ +// Package index +// ============================================================================ + +/// GET /simple/ — list all packages (PEP 503 HTML or PEP 691 JSON). +async fn list_packages( + State(state): State<Arc<AppState>>, + headers: HeaderMap, +) -> impl IntoResponse { let keys = state.storage.list("pypi/").await; let mut packages = std::collections::HashSet::new(); @@ -34,52 +53,77 @@ } } - let mut html = String::from( - "\nSimple Index

Simple Index

\n", - ); let mut pkg_list: Vec<_> = packages.into_iter().collect(); pkg_list.sort(); - for pkg in pkg_list { - html.push_str(&format!("{}
\n", pkg, pkg)); + if wants_json(&headers) { + // PEP 691 JSON response + let projects: Vec = pkg_list + .iter() + .map(|name| serde_json::json!({"name": name})) + .collect(); + let body = serde_json::json!({ + "meta": {"api-version": "1.0"}, + "projects": projects, + }); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, PEP691_JSON)], + serde_json::to_string(&body).unwrap_or_default(), + ) + .into_response() + } else { + // PEP 503 HTML + let mut html = String::from( + "\nSimple Index

Simple Index

\n", + ); + for pkg in pkg_list { + html.push_str(&format!("{}
\n", pkg, pkg)); + } + html.push_str(""); + (StatusCode::OK, Html(html)).into_response() } - html.push_str(""); - - (StatusCode::OK, Html(html)) } -/// List versions/files for a specific package +// ============================================================================ +// Package versions +// ============================================================================ + +/// GET /simple/{name}/ — list files for a package (PEP 503 HTML or PEP 691 JSON). async fn package_versions( State(state): State>, Path(name): Path, + headers: HeaderMap, ) -> Response { - // Normalize package name (PEP 503) let normalized = normalize_name(&name); - - // Try to get local files first let prefix = format!("pypi/{}/", normalized); let keys = state.storage.list(&prefix).await; - if !keys.is_empty() { - // We have local files - let mut html = format!( - "\nLinks for {}

Links for {}

\n", - name, name - ); - - for key in &keys { - if let Some(filename) = key.strip_prefix(&prefix) { - if !filename.is_empty() { - html.push_str(&format!( - "{}
\n", - normalized, filename, filename + )); + } } - html.push_str(""); + } - return (StatusCode::OK, Html(html)).into_response(); + if !files.is_empty() { + return if wants_json(&headers) { + versions_json_response(&normalized, &files) + } else { + versions_html_response(&normalized, &files) + }; } // Try proxy if configured @@ -95,7 +139,6 @@ async fn package_versions( ) .await { - // Rewrite URLs in the HTML to point to our registry let rewritten = rewrite_pypi_links(&html, &normalized); return (StatusCode::OK, Html(rewritten)).into_response(); } @@ -104,7 +147,11 @@ StatusCode::NOT_FOUND.into_response() } -/// Download a specific file +// ============================================================================ +// Download +// ============================================================================ + +/// GET /simple/{name}/{filename} — download a specific file. 
async fn download_file( State(state): State>, Path((name, filename)): Path<(String, String)>, @@ -126,20 +173,12 @@ async fn download_file( .audit .log(AuditEntry::new("cache_hit", "api", "", "pypi", "")); - let content_type = if filename.ends_with(".whl") { - "application/zip" - } else if filename.ends_with(".tar.gz") || filename.ends_with(".tgz") { - "application/gzip" - } else { - "application/octet-stream" - }; - + let content_type = pypi_content_type(&filename); return (StatusCode::OK, [(header::CONTENT_TYPE, content_type)], data).into_response(); } // Try proxy if configured if let Some(proxy_url) = &state.config.pypi.proxy { - // First, fetch the package page to find the actual download URL let page_url = format!("{}/{}/", proxy_url.trim_end_matches('/'), normalized); if let Ok(html) = proxy_fetch_text( @@ -151,7 +190,6 @@ async fn download_file( ) .await { - // Find the URL for this specific file if let Some(file_url) = find_file_url(&html, &filename) { if let Ok(data) = proxy_fetch( &state.http_client, @@ -173,24 +211,21 @@ async fn download_file( .audit .log(AuditEntry::new("proxy_fetch", "api", "", "pypi", "")); - // Cache in local storage + // Cache in background + compute hash let storage = state.storage.clone(); let key_clone = key.clone(); let data_clone = data.clone(); tokio::spawn(async move { let _ = storage.put(&key_clone, &data_clone).await; + let hash = hex::encode(sha2::Sha256::digest(&data_clone)); + let _ = storage + .put(&format!("{}.sha256", key_clone), hash.as_bytes()) + .await; }); state.repo_index.invalidate("pypi"); - let content_type = if filename.ends_with(".whl") { - "application/zip" - } else if filename.ends_with(".tar.gz") || filename.ends_with(".tgz") { - "application/gzip" - } else { - "application/octet-stream" - }; - + let content_type = pypi_content_type(&filename); return (StatusCode::OK, [(header::CONTENT_TYPE, content_type)], data) .into_response(); } @@ -201,14 +236,238 @@ async fn download_file( 
StatusCode::NOT_FOUND.into_response() } -/// Normalize package name according to PEP 503 +// ============================================================================ +// Twine upload (PEP 503 — POST /simple/) +// ============================================================================ + +/// POST /simple/ — upload a package via twine. +/// +/// twine sends multipart/form-data with fields: +/// :action = "file_upload" +/// name = package name +/// version = package version +/// filetype = "sdist" | "bdist_wheel" +/// content = the file bytes +/// sha256_digest = hex SHA-256 of file (optional) +/// metadata_version, summary, etc. (optional metadata) +async fn upload(State(state): State>, mut multipart: Multipart) -> Response { + let mut action = String::new(); + let mut name = String::new(); + let mut version = String::new(); + let mut filename = String::new(); + let mut file_data: Option> = None; + let mut sha256_digest = String::new(); + + // Parse multipart fields + while let Ok(Some(field)) = multipart.next_field().await { + let field_name = field.name().unwrap_or("").to_string(); + + match field_name.as_str() { + ":action" => { + action = field.text().await.ok().unwrap_or_default(); + } + "name" => { + name = field.text().await.ok().unwrap_or_default(); + } + "version" => { + version = field.text().await.ok().unwrap_or_default(); + } + "sha256_digest" => { + sha256_digest = field.text().await.ok().unwrap_or_default(); + } + "content" => { + filename = field.file_name().unwrap_or("unknown").to_string(); + match field.bytes().await { + Ok(b) => file_data = Some(b.to_vec()), + Err(e) => { + return ( + StatusCode::BAD_REQUEST, + format!("Failed to read file: {}", e), + ) + .into_response() + } + } + } + _ => { + // Skip other metadata fields (summary, author, etc.) 
+ let _ = field.bytes().await; + } + } + } + + // Validate required fields + if action != "file_upload" { + return (StatusCode::BAD_REQUEST, "Unsupported action").into_response(); + } + + if name.is_empty() || version.is_empty() { + return (StatusCode::BAD_REQUEST, "Missing name or version").into_response(); + } + + let data = match file_data { + Some(d) if !d.is_empty() => d, + _ => return (StatusCode::BAD_REQUEST, "Missing file content").into_response(), + }; + + // Validate filename + if filename.is_empty() || !is_valid_pypi_filename(&filename) { + return (StatusCode::BAD_REQUEST, "Invalid filename").into_response(); + } + + // Verify SHA-256 if provided + let computed_hash = hex::encode(sha2::Sha256::digest(&data)); + if !sha256_digest.is_empty() && sha256_digest != computed_hash { + tracing::warn!( + package = %name, + expected = %sha256_digest, + computed = %computed_hash, + "SECURITY: PyPI upload SHA-256 mismatch" + ); + return (StatusCode::BAD_REQUEST, "SHA-256 digest mismatch").into_response(); + } + + // Normalize name and store + let normalized = normalize_name(&name); + + // Check immutability (same filename = already exists) + let file_key = format!("pypi/{}/{}", normalized, filename); + if state.storage.stat(&file_key).await.is_some() { + return ( + StatusCode::CONFLICT, + format!("File {} already exists", filename), + ) + .into_response(); + } + + // Store file + if state.storage.put(&file_key, &data).await.is_err() { + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + + // Store SHA-256 hash + let hash_key = format!("{}.sha256", file_key); + let _ = state.storage.put(&hash_key, computed_hash.as_bytes()).await; + + state.metrics.record_upload("pypi"); + state.activity.push(ActivityEntry::new( + ActionType::Push, + format!("{}-{}", name, version), + "pypi", + "LOCAL", + )); + state + .audit + .log(AuditEntry::new("push", "api", "", "pypi", "")); + state.repo_index.invalidate("pypi"); + + StatusCode::OK.into_response() +} + +// 
============================================================================ +// PEP 691 JSON responses +// ============================================================================ + +struct FileEntry { + filename: String, + sha256: Option, +} + +fn versions_json_response(normalized: &str, files: &[FileEntry]) -> Response { + let file_entries: Vec = files + .iter() + .map(|f| { + let mut entry = serde_json::json!({ + "filename": f.filename, + "url": format!("/simple/{}/{}", normalized, f.filename), + }); + if let Some(hash) = &f.sha256 { + entry["digests"] = serde_json::json!({"sha256": hash}); + } + entry + }) + .collect(); + + let body = serde_json::json!({ + "meta": {"api-version": "1.0"}, + "name": normalized, + "files": file_entries, + }); + + ( + StatusCode::OK, + [(header::CONTENT_TYPE, PEP691_JSON)], + serde_json::to_string(&body).unwrap_or_default(), + ) + .into_response() +} + +fn versions_html_response(normalized: &str, files: &[FileEntry]) -> Response { + let mut html = format!( + "\nLinks for {}

Links for {}

\n", + normalized, normalized + ); + + for f in files { + let hash_fragment = f + .sha256 + .as_ref() + .map(|h| format!("#sha256={}", h)) + .unwrap_or_default(); + html.push_str(&format!( + "{}
\n", + normalized, f.filename, hash_fragment, f.filename + )); + } + html.push_str(""); + + (StatusCode::OK, Html(html)).into_response() +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Normalize package name according to PEP 503. fn normalize_name(name: &str) -> String { name.to_lowercase().replace(['-', '_', '.'], "-") } -/// Rewrite PyPI links to point to our registry +/// Check Accept header for PEP 691 JSON. +fn wants_json(headers: &HeaderMap) -> bool { + headers + .get(header::ACCEPT) + .and_then(|v| v.to_str().ok()) + .map(|v| v.contains(PEP691_JSON)) + .unwrap_or(false) +} + +/// Content-type for PyPI files. +fn pypi_content_type(filename: &str) -> &'static str { + if filename.ends_with(".whl") { + "application/zip" + } else if filename.ends_with(".tar.gz") || filename.ends_with(".tgz") { + "application/gzip" + } else { + "application/octet-stream" + } +} + +/// Validate PyPI filename. +fn is_valid_pypi_filename(name: &str) -> bool { + !name.is_empty() + && !name.contains("..") + && !name.contains('/') + && !name.contains('\\') + && !name.contains('\0') + && (name.ends_with(".tar.gz") + || name.ends_with(".tgz") + || name.ends_with(".whl") + || name.ends_with(".zip") + || name.ends_with(".egg")) +} + +/// Rewrite PyPI links to point to our registry. fn rewrite_pypi_links(html: &str, package_name: &str) -> String { - // Simple regex-free approach: find href="..." 
and rewrite let mut result = String::with_capacity(html.len()); let mut remaining = html; @@ -219,10 +478,13 @@ fn rewrite_pypi_links(html: &str, package_name: &str) -> String { if let Some(href_end) = remaining.find('"') { let url = &remaining[..href_end]; - // Extract filename from URL if let Some(filename) = extract_filename(url) { - // Rewrite to our local URL - result.push_str(&format!("/simple/{}/{}", package_name, filename)); + // Extract hash fragment from original URL + let hash_fragment = url.find('#').map(|pos| &url[pos..]).unwrap_or(""); + result.push_str(&format!( + "/simple/{}/{}{}", + package_name, filename, hash_fragment + )); } else { result.push_str(url); } @@ -233,12 +495,11 @@ fn rewrite_pypi_links(html: &str, package_name: &str) -> String { result.push_str(remaining); // Remove data-core-metadata and data-dist-info-metadata attributes - // as we don't serve .metadata files (PEP 658) let result = remove_attribute(&result, "data-core-metadata"); remove_attribute(&result, "data-dist-info-metadata") } -/// Remove an HTML attribute from all tags +/// Remove an HTML attribute from all tags. fn remove_attribute(html: &str, attr_name: &str) -> String { let mut result = String::with_capacity(html.len()); let mut remaining = html; @@ -248,7 +509,6 @@ fn remove_attribute(html: &str, attr_name: &str) -> String { result.push_str(&remaining[..attr_start]); remaining = &remaining[attr_start + pattern.len()..]; - // Skip the attribute value if let Some(attr_end) = remaining.find('"') { remaining = &remaining[attr_end + 1..]; } @@ -257,19 +517,11 @@ fn remove_attribute(html: &str, attr_name: &str) -> String { result } -/// Extract filename from PyPI download URL +/// Extract filename from PyPI download URL. fn extract_filename(url: &str) -> Option<&str> { - // PyPI URLs look like: - // https://files.pythonhosted.org/packages/.../package-1.0.0.tar.gz#sha256=... 
- // or just the filename directly - - // Remove hash fragment let url = url.split('#').next()?; - - // Get the last path component let filename = url.rsplit('/').next()?; - // Must be a valid package file if filename.ends_with(".tar.gz") || filename.ends_with(".tgz") || filename.ends_with(".whl") @@ -282,7 +534,7 @@ fn extract_filename(url: &str) -> Option<&str> { } } -/// Find the download URL for a specific file in the HTML +/// Find the download URL for a specific file in the HTML. fn find_file_url(html: &str, target_filename: &str) -> Option { let mut remaining = html; @@ -294,7 +546,6 @@ fn find_file_url(html: &str, target_filename: &str) -> Option { if let Some(filename) = extract_filename(url) { if filename == target_filename { - // Remove hash fragment for actual download return Some(url.split('#').next().unwrap_or(url).to_string()); } } @@ -306,6 +557,10 @@ fn find_file_url(html: &str, target_filename: &str) -> Option { None } +// ============================================================================ +// Unit Tests +// ============================================================================ + #[cfg(test)] #[allow(clippy::unwrap_used)] mod tests { @@ -481,7 +736,14 @@ mod tests { fn test_rewrite_pypi_links_basic() { let html = r#"flask-2.0.tar.gz"#; let result = rewrite_pypi_links(html, "flask"); - assert!(result.contains("/simple/flask/flask-2.0.tar.gz")); + assert!(result.contains("/simple/flask/flask-2.0.tar.gz#sha256=abc")); + } + + #[test] + fn test_rewrite_pypi_links_preserves_hash() { + let html = r#"pkg"#; + let result = rewrite_pypi_links(html, "pkg"); + assert!(result.contains("#sha256=deadbeef")); } #[test] @@ -527,12 +789,50 @@ mod tests { let result = find_file_url(html, "pkg-1.0.whl"); assert_eq!(result, Some("https://example.com/pkg-1.0.whl".to_string())); } + + #[test] + fn test_is_valid_pypi_filename() { + assert!(is_valid_pypi_filename("flask-2.0.tar.gz")); + assert!(is_valid_pypi_filename("flask-2.0-py3-none-any.whl")); + 
assert!(is_valid_pypi_filename("flask-2.0.tgz")); + assert!(is_valid_pypi_filename("flask-2.0.zip")); + assert!(is_valid_pypi_filename("flask-2.0.egg")); + assert!(!is_valid_pypi_filename("")); + assert!(!is_valid_pypi_filename("../evil.tar.gz")); + assert!(!is_valid_pypi_filename("evil/path.tar.gz")); + assert!(!is_valid_pypi_filename("noext")); + assert!(!is_valid_pypi_filename("bad.exe")); + } + + #[test] + fn test_wants_json_pep691() { + let mut headers = HeaderMap::new(); + headers.insert(header::ACCEPT, PEP691_JSON.parse().unwrap()); + assert!(wants_json(&headers)); + } + + #[test] + fn test_wants_json_html() { + let mut headers = HeaderMap::new(); + headers.insert(header::ACCEPT, "text/html".parse().unwrap()); + assert!(!wants_json(&headers)); + } + + #[test] + fn test_wants_json_no_header() { + let headers = HeaderMap::new(); + assert!(!wants_json(&headers)); + } } +// ============================================================================ +// Integration Tests +// ============================================================================ + #[cfg(test)] #[allow(clippy::unwrap_used)] mod integration_tests { - use crate::test_helpers::{body_bytes, create_test_context, send}; + use crate::test_helpers::{body_bytes, create_test_context, send, send_with_headers}; use axum::http::{Method, StatusCode}; #[tokio::test] @@ -550,7 +850,6 @@ mod integration_tests { async fn test_pypi_list_with_packages() { let ctx = create_test_context(); - // Pre-populate storage with a package ctx.state .storage .put("pypi/flask/flask-2.0.tar.gz", b"fake-tarball-data") @@ -565,11 +864,36 @@ mod integration_tests { assert!(html.contains("flask")); } + #[tokio::test] + async fn test_pypi_list_json_pep691() { + let ctx = create_test_context(); + + ctx.state + .storage + .put("pypi/flask/flask-2.0.tar.gz", b"data") + .await + .unwrap(); + + let response = send_with_headers( + &ctx.app, + Method::GET, + "/simple/", + vec![("Accept", "application/vnd.pypi.simple.v1+json")], + "", + 
) + .await; + + assert_eq!(response.status(), StatusCode::OK); + let body = body_bytes(response).await; + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert!(json["meta"]["api-version"].as_str() == Some("1.0")); + assert!(json["projects"].as_array().unwrap().len() == 1); + } + #[tokio::test] async fn test_pypi_versions_local() { let ctx = create_test_context(); - // Pre-populate storage ctx.state .storage .put("pypi/flask/flask-2.0.tar.gz", b"fake-data") @@ -585,6 +909,65 @@ mod integration_tests { assert!(html.contains("/simple/flask/flask-2.0.tar.gz")); } + #[tokio::test] + async fn test_pypi_versions_with_hash() { + let ctx = create_test_context(); + + ctx.state + .storage + .put("pypi/flask/flask-2.0.tar.gz", b"fake-data") + .await + .unwrap(); + ctx.state + .storage + .put( + "pypi/flask/flask-2.0.tar.gz.sha256", + b"abc123def456abc123def456abc123def456abc123def456abc123def456abcd", + ) + .await + .unwrap(); + + let response = send(&ctx.app, Method::GET, "/simple/flask/", "").await; + + assert_eq!(response.status(), StatusCode::OK); + let body = body_bytes(response).await; + let html = String::from_utf8_lossy(&body); + assert!(html.contains("#sha256=abc123")); + } + + #[tokio::test] + async fn test_pypi_versions_json_pep691() { + let ctx = create_test_context(); + + ctx.state + .storage + .put("pypi/flask/flask-2.0.tar.gz", b"data") + .await + .unwrap(); + ctx.state + .storage + .put("pypi/flask/flask-2.0.tar.gz.sha256", b"deadbeef") + .await + .unwrap(); + + let response = send_with_headers( + &ctx.app, + Method::GET, + "/simple/flask/", + vec![("Accept", "application/vnd.pypi.simple.v1+json")], + "", + ) + .await; + + assert_eq!(response.status(), StatusCode::OK); + let body = body_bytes(response).await; + let json: serde_json::Value = serde_json::from_slice(&body).unwrap(); + assert_eq!(json["name"], "flask"); + assert!(json["files"].as_array().unwrap().len() == 1); + assert_eq!(json["files"][0]["filename"], "flask-2.0.tar.gz"); 
+ assert_eq!(json["files"][0]["digests"]["sha256"], "deadbeef"); + } + #[tokio::test] async fn test_pypi_download_local() { let ctx = create_test_context(); @@ -607,7 +990,6 @@ mod integration_tests { async fn test_pypi_not_found_no_proxy() { let ctx = create_test_context(); - // No proxy configured, no local data let response = send(&ctx.app, Method::GET, "/simple/nonexistent/", "").await; assert_eq!(response.status(), StatusCode::NOT_FOUND);