diff --git a/src/cargo/ops/vendor.rs b/src/cargo/ops/vendor.rs index b3fa4a132df..78e8b068af0 100644 --- a/src/cargo/ops/vendor.rs +++ b/src/cargo/ops/vendor.rs @@ -3,14 +3,18 @@ use crate::core::SourceId; use crate::core::{GitReference, Package, Workspace}; use crate::ops; use crate::sources::path::PathSource; -use crate::sources::PathEntry; +use crate::sources::RegistrySource; use crate::sources::SourceConfigMap; use crate::sources::CRATES_IO_REGISTRY; use crate::util::cache_lock::CacheLockMode; use crate::util::{try_canonicalize, CargoResult, GlobalContext}; + use anyhow::{bail, Context as _}; use cargo_util::{paths, Sha256}; +use cargo_util_schemas::core::SourceKind; use serde::Serialize; +use walkdir::WalkDir; + use std::collections::HashSet; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::ffi::OsStr; @@ -86,9 +90,16 @@ struct SourceReplacementCache<'gctx> { } impl SourceReplacementCache<'_> { - fn new(gctx: &GlobalContext) -> CargoResult<SourceReplacementCache<'_>> { + fn new( + gctx: &GlobalContext, + respect_source_config: bool, + ) -> CargoResult<SourceReplacementCache<'_>> { Ok(SourceReplacementCache { - map: SourceConfigMap::new(gctx)?, + map: if respect_source_config { + SourceConfigMap::new(gctx) + } else { + SourceConfigMap::empty(gctx) + }?, cache: Default::default(), }) } @@ -111,14 +122,14 @@ fn sync( opts: &VendorOptions<'_>, ) -> CargoResult<VendorConfig> { let dry_run = false; - let canonical_destination = try_canonicalize(opts.destination); - let canonical_destination = canonical_destination.as_deref().unwrap_or(opts.destination); - let dest_dir_already_exists = canonical_destination.exists(); + let vendor_dir = try_canonicalize(opts.destination); + let vendor_dir = vendor_dir.as_deref().unwrap_or(opts.destination); + let vendor_dir_already_exists = vendor_dir.exists(); - paths::create_dir_all(&canonical_destination)?; + paths::create_dir_all(&vendor_dir)?; let mut to_remove = HashSet::new(); if !opts.no_delete { - for entry in canonical_destination.read_dir()? 
{ + for entry in vendor_dir.read_dir()? { let entry = entry?; if !entry .file_name() @@ -130,19 +141,13 @@ fn sync( } } - let mut source_replacement_cache = SourceReplacementCache::new(gctx)?; - - // First up attempt to work around rust-lang/cargo#5956. Apparently build - // artifacts sprout up in Cargo's global cache for whatever reason, although - // it's unsure what tool is causing these issues at this time. For now we - // apply a heavy-hammer approach which is to delete Cargo's unpacked version - // of each crate to start off with. After we do this we'll re-resolve and - // redownload again, which should trigger Cargo to re-extract all the - // crates. - // - // Note that errors are largely ignored here as this is a best-effort - // attempt. If anything fails here we basically just move on to the next - // crate to work with. + let mut source_replacement_cache = + SourceReplacementCache::new(gctx, opts.respect_source_config)?; + + let mut checksums = HashMap::new(); + let mut ids = BTreeMap::new(); + + // Let's download all crates and start storing internal tables about them. for ws in workspaces { let (packages, resolve) = ops::resolve_ws(ws, dry_run) .with_context(|| format!("failed to load lockfile for {}", ws.root().display()))?; @@ -152,14 +157,11 @@ fn sync( .with_context(|| format!("failed to download packages for {}", ws.root().display()))?; for pkg in resolve.iter() { - let sid = if opts.respect_source_config { - source_replacement_cache.get(pkg.source_id())? - } else { - pkg.source_id() - }; + let sid = source_replacement_cache.get(pkg.source_id())?; - // Don't delete actual source code! + // Don't vendor path crates since they're already in the repository if sid.is_path() { + // And don't delete actual source code! 
if let Ok(path) = sid.url().to_file_path() { if let Ok(path) = try_canonicalize(path) { to_remove.remove(&path); @@ -167,39 +169,7 @@ fn sync( } continue; } - if sid.is_git() { - continue; - } - // Only delete sources that are safe to delete, i.e. they are caches. - if sid.is_registry() { - if let Ok(pkg) = packages.get_one(pkg) { - drop(fs::remove_dir_all(pkg.root())); - } - continue; - } - } - } - - let mut checksums = HashMap::new(); - let mut ids = BTreeMap::new(); - - // Next up let's actually download all crates and start storing internal - // tables about them. - for ws in workspaces { - let (packages, resolve) = ops::resolve_ws(ws, dry_run) - .with_context(|| format!("failed to load lockfile for {}", ws.root().display()))?; - - packages - .get_many(resolve.iter()) - .with_context(|| format!("failed to download packages for {}", ws.root().display()))?; - - for pkg in resolve.iter() { - // No need to vendor path crates since they're already in the - // repository - if pkg.source_id().is_path() { - continue; - } ids.insert( pkg, packages @@ -247,7 +217,7 @@ fn sync( }; sources.insert(id.source_id()); - let dst = canonical_destination.join(&dst_name); + let dst = vendor_dir.join(&dst_name); to_remove.remove(&dst); let cksum = dst.join(".cargo-checksum.json"); // Registries are the only immutable sources, @@ -263,16 +233,89 @@ fn sync( )?; let _ = fs::remove_dir_all(&dst); - let pathsource = PathSource::new(src, id.source_id(), gctx); - let paths = pathsource.list_files(pkg)?; - let mut map = BTreeMap::new(); - cp_sources(pkg, src, &paths, &dst, &mut map, &mut tmp_buf, gctx) - .with_context(|| format!("failed to copy over vendored sources for: {}", id))?; + + let mut file_cksums = BTreeMap::new(); + + // Need this mapping anyway because we will directly consult registry sources, + // otherwise builtin source replacement (sparse registry) won't be respected. 
+ let sid = source_replacement_cache.get(id.source_id())?; + + if sid.is_registry() { + // To keep the unpacked source from registry in a pristine state, + // we'll do a direct extraction into the vendor directory. + let registry = match sid.kind() { + SourceKind::Registry | SourceKind::SparseRegistry => { + RegistrySource::remote(sid, &Default::default(), gctx)? + } + SourceKind::LocalRegistry => { + let path = sid.url().to_file_path().expect("local path"); + RegistrySource::local(sid, &path, &Default::default(), gctx) + } + _ => unreachable!("not registry source: {sid}"), + }; + + let walkdir = |root| { + WalkDir::new(root) + .into_iter() + // It is safe to skip errors, + // since we'll hit them during copying/reading later anyway. + .filter_map(|e| e.ok()) + // There should be no symlinks in tarballs on crates.io, + // but that might not hold for local registries. + // Hence be conservative here and include symlinks. + .filter(|e| e.file_type().is_file() || e.file_type().is_symlink()) + }; + let mut compute_file_cksums = |root| { + for e in walkdir(root) { + let path = e.path(); + let relative = path.strip_prefix(&dst).unwrap(); + let cksum = Sha256::new() + .update_path(path) + .map(Sha256::finish_hex) + .with_context(|| format!("failed to checksum `{}`", path.display()))?; + file_cksums.insert(relative.to_str().unwrap().replace("\\", "/"), cksum); + } + Ok::<_, anyhow::Error>(()) + }; + if dir_has_version_suffix { + registry.unpack_package_in(id, &vendor_dir, &vendor_this)?; + compute_file_cksums(&dst)?; + } else { + // Due to the extra sanity check in registry unpack + // (it ensures the tarball contains only one top-level directory named `pkg-version`), + // we can only unpack into a directory with the version suffix, + // and then move it to the directory without the suffix. 
+ let staging_dir = tempfile::Builder::new() + .prefix(".vendor-staging") + .tempdir_in(vendor_dir)?; + let unpacked_src = + registry.unpack_package_in(id, staging_dir.path(), &vendor_this)?; + if let Err(e) = fs::rename(&unpacked_src, &dst) { + // This fallback is mainly for Windows 10 versions earlier than 1607, + // where the destination of `fs::rename` can't be a directory. Copy from the + // staging directory instead; remove once the minimal supported Windows version gets bumped. + tracing::warn!("failed to `mv {unpacked_src:?} {dst:?}`: {e}"); + let paths: Vec<_> = walkdir(&unpacked_src).map(|e| e.into_path()).collect(); + cp_sources(pkg, &unpacked_src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx) + .with_context(|| format!("failed to copy vendored sources for {id}"))?; + } else { + compute_file_cksums(&dst)?; + } + } + } else { + let paths = PathSource::new(src, sid, gctx) + .list_files(pkg)? + .into_iter() + .map(|p| p.into_path_buf()) + .collect::<Vec<_>>(); + cp_sources(pkg, src, &paths, &dst, &mut file_cksums, &mut tmp_buf, gctx) + .with_context(|| format!("failed to copy vendored sources for {id}"))?; + } // Finally, emit the metadata about this package let json = serde_json::json!({ "package": checksums.get(id), - "files": map, + "files": file_cksums, }); paths::write(&cksum, json.to_string())?; @@ -347,9 +390,9 @@ fn sync( directory: opts.destination.to_string_lossy().replace("\\", "/"), }, ); - } else if !dest_dir_already_exists { + } else if !vendor_dir_already_exists { // Nothing to vendor. Remove the destination dir we've just created. 
- paths::remove_dir(canonical_destination)?; + paths::remove_dir(vendor_dir)?; } Ok(VendorConfig { source: config }) @@ -358,36 +401,18 @@ fn sync( fn cp_sources( pkg: &Package, src: &Path, - paths: &[PathEntry], + paths: &[PathBuf], dst: &Path, cksums: &mut BTreeMap<String, String>, tmp_buf: &mut [u8], gctx: &GlobalContext, ) -> CargoResult<()> { for p in paths { - let p = p.as_ref(); let relative = p.strip_prefix(&src).unwrap(); - match relative.to_str() { - // Skip git config files as they're not relevant to builds most of - // the time and if we respect them (e.g. in git) then it'll - // probably mess with the checksums when a vendor dir is checked - // into someone else's source control - Some(".gitattributes" | ".gitignore" | ".git") => continue, - - // Temporary Cargo files - Some(".cargo-ok") => continue, - - // Skip patch-style orig/rej files. Published crates on crates.io - // have `Cargo.toml.orig` which we don't want to use here and - // otherwise these are rarely used as part of the build process. - Some(filename) => { - if filename.ends_with(".orig") || filename.ends_with(".rej") { - continue; - } - } - _ => {} - }; + if !vendor_this(relative) { + continue; + } // Join pathname components individually to make sure that the joined // path uses the correct directory separators everywhere, since @@ -417,7 +442,7 @@ fn cp_sources( &dst, &mut dst_opts, &mut contents.as_bytes(), - "Generated Cargo.toml", + Path::new("Generated Cargo.toml"), tmp_buf, )? } else { @@ -430,13 +455,7 @@ fn cp_sources( .with_context(|| format!("failed to stat {:?}", p))?; dst_opts.mode(src_metadata.mode()); } - copy_and_checksum( - &dst, - &mut dst_opts, - &mut src, - &p.display().to_string(), - tmp_buf, - )? + copy_and_checksum(&dst, &mut dst_opts, &mut src, &p, tmp_buf)? 
}; cksums.insert(relative.to_str().unwrap().replace("\\", "/"), cksum); @@ -562,7 +581,7 @@ fn copy_and_checksum<T: Read>( dst_path: &Path, dst_opts: &mut OpenOptions, contents: &mut T, - contents_path: &str, + contents_path: &Path, buf: &mut [u8], ) -> CargoResult<String> { let mut dst = dst_opts @@ -584,3 +603,25 @@ fn copy_and_checksum<T: Read>( .with_context(|| format!("failed to write to {:?}", dst_path))?; } } + +/// Filters files we want to vendor. +/// +/// `relative` is a path relative to the package root. +fn vendor_this(relative: &Path) -> bool { + match relative.to_str() { + // Skip git config files as they're not relevant to builds most of + // the time and if we respect them (e.g. in git) then it'll + // probably mess with the checksums when a vendor dir is checked + // into someone else's source control + Some(".gitattributes" | ".gitignore" | ".git") => false, + + // Temporary Cargo files + Some(".cargo-ok") => false, + + // Skip patch-style orig/rej files. Published crates on crates.io + // have `Cargo.toml.orig` which we don't want to use here and + // otherwise these are rarely used as part of the build process. + Some(p) if p.ends_with(".orig") || p.ends_with(".rej") => false, + _ => true, + } +} diff --git a/src/cargo/sources/registry/mod.rs b/src/cargo/sources/registry/mod.rs index bf10f81fc20..9f45e9e60ae 100644 --- a/src/cargo/sources/registry/mod.rs +++ b/src/cargo/sources/registry/mod.rs @@ -248,6 +248,8 @@ pub struct RegistrySource<'gctx> { source_id: SourceId, /// The path where crate files are extracted (`$CARGO_HOME/registry/src/$REG-HASH`). src_path: Filesystem, + /// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`). + cache_path: Filesystem, /// Local reference to [`GlobalContext`] for convenience. gctx: &'gctx GlobalContext, /// Abstraction for interfacing to the different registry kinds. 
@@ -532,6 +534,7 @@ impl<'gctx> RegistrySource<'gctx> { RegistrySource { name: name.into(), src_path: gctx.registry_source_path().join(name), + cache_path: gctx.registry_cache_path().join(name), gctx, source_id, index: index::RegistryIndex::new(source_id, ops.index_path(), gctx), @@ -630,60 +633,8 @@ impl<'gctx> RegistrySource<'gctx> { Err(e) => anyhow::bail!("unable to read .cargo-ok file at {path:?}: {e}"), } dst.create_dir()?; - let mut tar = { - let size_limit = max_unpack_size(self.gctx, tarball.metadata()?.len()); - let gz = GzDecoder::new(tarball); - let gz = LimitErrorReader::new(gz, size_limit); - let mut tar = Archive::new(gz); - set_mask(&mut tar); - tar - }; - let mut bytes_written = 0; - let prefix = unpack_dir.file_name().unwrap(); - let parent = unpack_dir.parent().unwrap(); - for entry in tar.entries()? { - let mut entry = entry.context("failed to iterate over archive")?; - let entry_path = entry - .path() - .context("failed to read entry path")? - .into_owned(); - // We're going to unpack this tarball into the global source - // directory, but we want to make sure that it doesn't accidentally - // (or maliciously) overwrite source code from other crates. Cargo - // itself should never generate a tarball that hits this error, and - // crates.io should also block uploads with these sorts of tarballs, - // but be extra sure by adding a check here as well. - if !entry_path.starts_with(prefix) { - anyhow::bail!( - "invalid tarball downloaded, contains \ - a file at {:?} which isn't under {:?}", - entry_path, - prefix - ) - } - // Prevent unpacking the lockfile from the crate itself. 
- if entry_path - .file_name() - .map_or(false, |p| p == PACKAGE_SOURCE_LOCK) - { - continue; - } - // Unpacking failed - bytes_written += entry.size(); - let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from); - if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) { - result = result.with_context(|| { - format!( - "`{}` appears to contain a reserved Windows path, \ - it cannot be extracted on Windows", - entry_path.display() - ) - }); - } - result - .with_context(|| format!("failed to unpack entry at `{}`", entry_path.display()))?; - } + let bytes_written = unpack(self.gctx, tarball, unpack_dir, &|_| true)?; // Now that we've finished unpacking, create and write to the lock file to indicate that // unpacking was successful. @@ -708,6 +659,29 @@ impl<'gctx> RegistrySource<'gctx> { Ok(unpack_dir.to_path_buf()) } + /// Unpacks the `.crate` tarball of the package in a given directory. + /// + /// Returns the path to the directory the tarball was unpacked into, + /// which is always `<unpack_dir>/<name>-<version>`. + /// + /// This assumes that the associated tarball already exists in the registry cache. + pub fn unpack_package_in( + &self, + pkg: &PackageId, + unpack_dir: &Path, + include: &dyn Fn(&Path) -> bool, + ) -> CargoResult<PathBuf> { + let path = self.cache_path.join(pkg.tarball_name()); + let path = self + .gctx + .assert_package_cache_locked(CacheLockMode::Shared, &path); + let dst = unpack_dir.join(format!("{}-{}", pkg.name(), pkg.version())); + let tarball = + File::open(path).with_context(|| format!("failed to open {}", path.display()))?; + unpack(self.gctx, &tarball, &dst, include)?; + Ok(dst) + } + /// Turns the downloaded `.crate` tarball file into a [`Package`]. /// /// This unconditionally sets checksum for the returned package, so it @@ -1046,3 +1020,70 @@ fn set_mask<R: Read>(tar: &mut Archive<R>) { #[cfg(unix)] tar.set_mask(crate::util::get_umask()); } + +/// Unpack a tarball with zip bomb and overwrite protections. 
+fn unpack( + gctx: &GlobalContext, + tarball: &File, + unpack_dir: &Path, + include: &dyn Fn(&Path) -> bool, +) -> CargoResult { + let mut tar = { + let size_limit = max_unpack_size(gctx, tarball.metadata()?.len()); + let gz = GzDecoder::new(tarball); + let gz = LimitErrorReader::new(gz, size_limit); + let mut tar = Archive::new(gz); + set_mask(&mut tar); + tar + }; + let mut bytes_written = 0; + let prefix = unpack_dir.file_name().unwrap(); + let parent = unpack_dir.parent().unwrap(); + for entry in tar.entries()? { + let mut entry = entry.context("failed to iterate over archive")?; + let entry_path = entry + .path() + .context("failed to read entry path")? + .into_owned(); + + if let Ok(path) = entry_path.strip_prefix(prefix) { + if !include(path) { + continue; + } + } else { + // We're going to unpack this tarball into the global source + // directory, but we want to make sure that it doesn't accidentally + // (or maliciously) overwrite source code from other crates. Cargo + // itself should never generate a tarball that hits this error, and + // crates.io should also block uploads with these sorts of tarballs, + // but be extra sure by adding a check here as well. + anyhow::bail!( + "invalid tarball downloaded, contains \ + a file at {entry_path:?} which isn't under {prefix:?}", + ) + } + + // Prevent unpacking the lockfile from the crate itself. 
+ if entry_path + .file_name() + .map_or(false, |p| p == PACKAGE_SOURCE_LOCK) + { + continue; + } + // Count the entry's size before unpacking it; failures get extra context below. + bytes_written += entry.size(); + let mut result = entry.unpack_in(parent).map_err(anyhow::Error::from); + if cfg!(windows) && restricted_names::is_windows_reserved_path(&entry_path) { + result = result.with_context(|| { + format!( + "`{}` appears to contain a reserved Windows path, \ + it cannot be extracted on Windows", + entry_path.display() + ) + }); + } + result.with_context(|| format!("failed to unpack entry at `{}`", entry_path.display()))?; + } + + Ok(bytes_written) +} diff --git a/tests/testsuite/vendor.rs b/tests/testsuite/vendor.rs index db194026b64..353cfa21b85 100644 --- a/tests/testsuite/vendor.rs +++ b/tests/testsuite/vendor.rs @@ -213,12 +213,13 @@ fn package_exclude() { p.cargo("vendor --respect-source-config").run(); let csum = p.read_file("vendor/bar/.cargo-checksum.json"); + // Everything is included because `cargo-vendor` + // does a direct extraction from the tarball. + // (Some files, like `.git` or `.cargo-ok`, are still excluded though.) assert!(csum.contains(".include")); - assert!(!csum.contains(".exclude")); - assert!(!csum.contains(".dotdir/exclude")); - // Gitignore doesn't re-include a file in an excluded parent directory, - // even if negating it explicitly. - assert!(!csum.contains(".dotdir/include")); + assert!(csum.contains(".exclude")); + assert!(csum.contains(".dotdir/exclude")); + assert!(csum.contains(".dotdir/include")); } #[cargo_test]