use anyhow::{Context, Result, bail}; use log::*; mod nom_parsing; use nom_parsing::{parse_file, read_file_contents}; mod checker; mod detector; #[macro_use] mod messages; use messages::{explains, explains_all}; mod args; mod linkcheck; mod utils; use clap::{CommandFactory, Parser}; use clap_complete::CompleteEnv; use args::{Args, Shell}; mod recode; use recode::{wrong_line_endings2crlf, wrong_line_endings2lf}; use checker::{Issue, check_file}; use detector::{DetectResult, LineEnding, detect}; use linkcheck::LinkCheck; use std::fmt; use std::fmt::Display; use std::sync::LazyLock; use std::str; use utils::*; use scoped_threadpool::Pool; use serde::{Deserialize, Serialize}; use std::borrow::Cow; use std::os::unix::fs::MetadataExt; use tempfile::Builder; use colored::Colorize; use regex::Regex; use std::fs::File; use std::fs::Metadata; use std::io::BufReader; use std::io::prelude::*; use std::os::unix::fs::FileTypeExt; use std::os::unix::fs::PermissionsExt; use std::{fs, process}; use std::ffi::OsStr; use std::io::Read; use std::path::Path; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; use rustc_hash::FxHashMap as HashMap; use rustc_hash::FxHashSet as HashSet; use std::time::SystemTime; use std::fmt::Arguments; use std::sync::mpsc::{Sender, channel}; #[cfg(unix)] use walkdir::{DirEntry, WalkDir}; #[derive(Hash, Clone, Copy, Eq, PartialEq)] struct FileSize(u64); #[derive(Hash, Clone, Copy, Eq, PartialEq)] struct Devno(u64); #[derive(Hash, Clone, Copy, Eq, PartialEq)] struct Inode(u64); impl fmt::Display for FileSize { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { self.0.fmt(f) } } fn is_future_mtime(now: SystemTime, mtime: SystemTime) -> bool { mtime > now //mtime > now + Duration::new(1800, 0) } fn format_message(message: &Arguments, no_color: bool) -> Cow<'static, str> { let msg_str = format!("{message}"); if msg_str.starts_with(' ') { return msg_str.into(); } let (left, right) = msg_str.split_once(' ').unwrap(); if no_color { msg_str.into() } else { let colored_msg = match &left.chars().next().unwrap() { 'E' | 'F' => format!("{} {}", left.bright_red().bold(), right), 'W' => format!("{} {}", left.bright_red(), right), 'I' => format!("{} {}", left.bright_yellow().bold(), right), _ => msg_str, }; colored_msg.into() } } #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct TdsException { pub pkg: String, pub tpkg: String, } #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct PathExceptions { pub tds_path_exceptions: Vec, } fn get_config_file_name() -> Result> { if let Some(config_file) = &ARGS.config_file { return Ok(Some(config_file.clone())); } let home_dir = match home::home_dir() { Some(path) => path.display().to_string(), None => panic!("Impossible to get your home dir!"), }; let config_files = [".ctan/pkgcheck.yml", ".config/ctan/pkgcheck.yml"]; for f in config_files { let config_file_abs_path = format!("{home_dir}/{f}"); if Path::new(&config_file_abs_path).exists() { return Ok(Some(config_file_abs_path)); } } Ok(None) } fn read_yaml_config() -> Result> { let mut pkg_replacements: HashMap = HashMap::default(); for (p, q) in [ ("armtex", "armenian"), ("babel-base", "babel"), ("l3backend-dev", "latex-dev/l3backend"), ("l3kernel-dev", "latex-dev/l3kernel"), ("latex-amsmath", "latex"), ("latex-amsmath-dev", "latex-dev"), ("latex-base", "latex"), ("latex-base-dev", "latex-dev"), ("latex-cyrillic", "cyrillic"), ("latex-firstaid", "latex/firstaid"), ("latex-firstaid-dev", "latex-dev/firstaid"), ("latex-graphics", "latex"), ("latex-graphics-dev", "latex-dev"), ("latex-lab", "latex"), ("latex-lab-dev", "latex-dev"), ("latex-tools", "latex"), ("latex-tools-dev", "latex-dev"), ("vntex-nonfree", "vntex"), ] { pkg_replacements.insert(p.to_string(), q.to_string()); } if let Some(config_filename) = get_config_file_name()? { i0008!(config_filename); let data = fs::read_to_string(&config_filename) .with_context(|| format!("Config file {} could not be read", &config_filename))?; let path_exceptions = serde_yaml::from_str::(&data).with_context(|| { format!( "Problem with YAML content of config file {}", &config_filename ) })?; for play in &path_exceptions.tds_path_exceptions { // check if package name is already in pkg_replacements hash let old_val = pkg_replacements.get(&play.pkg); if let Some(ov) = old_val && ARGS.verbose { if ov == &play.tpkg { w0009!(play.pkg, play.tpkg); } else { i0009!(play.pkg, ov, play.tpkg); } } pkg_replacements.insert(play.pkg.clone(), play.tpkg.clone()); } } Ok(pkg_replacements) } fn setup_logger(no_color: bool) -> Result<(), fern::InitError> { fern::Dispatch::new() .format(move |out, message, _record| { let msg_txt = format_message(message, no_color); out.finish(format_args!("{msg_txt}")) }) .level(log::LevelFilter::Info) .level_for("lopdf", log::LevelFilter::Error) .chain(std::io::stdout()) .apply()?; Ok(()) } type HashSender = Sender<(FileSize, PathBuf, Vec)>; // SizesHashMap contains // - file sizes // - and a vector of file names having that size type SizesHashMap = HashMap>; /// key: generated file /// value: the generator. i.e. the .ins or .dtx file type GeneratedHashMap = HashMap; type FileNamesHashMap = HashMap; const BLOCKSIZE: usize = 4096; fn hash_file_inner(path: &Path) -> Result> { let mut buf = [0u8; BLOCKSIZE]; let mut fp = File::open(path)?; let mut digest = blake3::Hasher::new(); loop { match fp.read(&mut buf)? { 0 => break, n => { digest.update(&buf[..n]); } } } Ok(digest.finalize().as_bytes().to_vec()) } fn hash_file(fsize: FileSize, path: PathBuf, tx: &HashSender) -> Result<()> { let hash = hash_file_inner(&path).with_context(|| format!("{}", &path.display()))?; tx.send((fsize, path, hash)).unwrap(); Ok(()) } // returns false if an error occurred fn fix_inconsistent_le(fname: &str) -> bool { i0004!(fname); match wrong_line_endings2lf(fname) { Ok(_) => { i0007!(fname, "LF"); true } Err(e) => { e0027!(fname, e); false } } } // returns false if an error occurred fn make_crlf(fname: &str) -> bool { i0004!(fname); match wrong_line_endings2crlf(fname) { Ok(()) => { i0007!(fname, "CRLF"); true } Err(e) => { e0027!(fname, e); false } } } fn check_readme(dir_entry: &str, is_readme: &ReadmeKind, ft: &DetectResult) -> bool { let msg_name = if let ReadmeKind::Symlink(s) = is_readme { format!("{} (symlinked from {})", dir_entry, &s) } else { dir_entry.to_string() }; // let cr = check_file(Path::new(&msg_name), ft); match ft { DetectResult::Archive | DetectResult::Zip | DetectResult::Elf => { e0003!(msg_name); return false; } DetectResult::Bom(b) => { e0029!(msg_name, b); return false; } DetectResult::Text(_le) => match File::open(dir_entry) { Ok(f) => { if !check_readme_inner(&msg_name, &f) { return false; } } Err(e) => { e0027!(msg_name, e); return false; } }, _ => (), } true } fn check_readme_inner(fname: &str, f: &std::fs::File) -> bool { let reader = BufReader::new(f); let lines = reader.split(b'\n').map(|l| l.unwrap()); let mut result = true; for (lineno, line) in lines.enumerate() { if let Err(e) = String::from_utf8(line.clone()) { e0021!(fname, lineno + 1, e); result = false; } } result } fn is_readme(entry: &str) -> bool { matches!(entry, "README" | "README.txt" | "README.md") } fn get_devno(meta: &Metadata) -> Devno { Devno(meta.dev()) } fn _get_devno(entry: &DirEntry) -> Devno { let meta = fs::metadata(entry.path().to_str().unwrap()); match meta { Ok(m) => Devno(m.dev()), _ => Devno(0), } } // In the past we took care to avoid visiting a single inode twice, which takes care of (false positive) hardlinks. // Now we want to know if there is a hardlink in the package directory #[cfg(unix)] fn check_inode(set: &mut HashMap<(Devno, Inode), Vec>, filename: &str, meta: &Metadata) { set.entry((get_devno(meta), Inode(meta.ino()))) .or_default() .push(filename.to_string()); } #[cfg(not(unix))] fn check_inode(_: &mut HashSet, _: &Metadata) -> bool { true } static ARGS: LazyLock = LazyLock::new(Args::parse); static ERROR_OCCURRED: AtomicBool = AtomicBool::new(false); static WARNING_OCCURRED: AtomicBool = AtomicBool::new(false); //Get the current time static NOW: LazyLock = LazyLock::new(SystemTime::now); #[derive(Debug, Clone, PartialEq, Eq)] pub enum DPath { Both(PathBuf), Tds(PathBuf), } impl fmt::Display for DPath { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DPath::Both(p) => write!(f, "{}", p.display()), DPath::Tds(p) => write!(f, "{}", p.display()), } } } #[derive(Default)] pub struct DupPath { len: usize, plen: usize, dupes: Vec, } impl DupPath { pub fn new() -> DupPath { DupPath { len: 0, plen: 0, dupes: Vec::new(), } } pub fn push(&mut self, pb: PathBuf) { let pname = pb.to_string_lossy(); self.len += 1; if pname.ends_with(".tfm") { self.dupes.push(DPath::Tds(pb.clone())); } else { self.plen += 1; self.dupes.push(DPath::Both(pb.clone())); } } } type DupHashes = HashMap<(FileSize, Vec), DupPath>; fn main() -> Result<()> { CompleteEnv::with_factory(Args::command).complete(); let _ = setup_logger(ARGS.no_colors); match &ARGS.explain { None => (), Some(e) => { explains(e); process::exit(0); } } if let Some(shell) = &ARGS.generate_completion { match shell { Shell::Nushell => print!("{}", args::nushell_completion()), other => println!("{}", other.usage("pkgcheck")), } process::exit(0); } if ARGS.explain_all { explains_all(); process::exit(0); } if ARGS.show_tmp_endings { show_tmp_endings(); process::exit(0); } // read yaml config file if one is given explicitly or implicitly let pkg_replace: HashMap = read_yaml_config()?; let pkg_dir = match &ARGS.pkg_dir { None => { bail!("Specify a directory to check (use option -d)"); } Some(d) => { // make sure the given directory ends with a '/' (slash) let ds: String = if d.ends_with('/') { d.clone() } else { let d_s = d.clone(); d_s + "/" }; exists_dir(&ds).with_context(|| format!("Specified package directory {}", &ds))?; ds } }; let tds_zip = &ARGS.tds_zip; // let's check if the specified TDS archive // - does exist? // - is a zip archive? if let Some(tds_zip) = tds_zip { exists_file(tds_zip).with_context(|| format!("Specified TDS zip archive {}", &tds_zip))?; let pkg_name = get_package_name_from_tds_archive_name(tds_zip)?; let p = Path::new(tds_zip); let result = detect(p)?; let cr = check_file(p, &result)?; if let Some(Issue::MimeMismatch { .. }) = &cr.mime_issue { // println!("{}: expected {expected}, got {got}", tds_zip); bail!("TDS archive {tds_zip} is not a zip archive") } if let Some(hashes) = check_package(&pkg_dir, &Some(tds_zip))? { check_tds_archive(tds_zip, &hashes, &pkg_replace, &pkg_name)?; } } else { let _ = check_package(&pkg_dir, &None)?; } if ARGS.correct_perms || ARGS.correct_le { process::exit(0); } if ERROR_OCCURRED.load(Ordering::Relaxed) || (WARNING_OCCURRED.load(Ordering::Relaxed) && !ARGS.warnings_no_errors) { process::exit(1); } process::exit(0); } fn print_duplicates(hashes: &DupHashes) { let mut total_dupes = 0; let mut total_files = 0; let mut total_size = 0; let mut header_printed = false; for (k, paths) in hashes { let (sz, _hash) = k; if paths.plen <= 1 { total_files += 1; total_size += sz.0; continue; } else if !header_printed { w0002!(); header_printed = true; } total_files += paths.plen; total_size += sz.0 * (paths.plen - 1) as u64; total_dupes += (paths.plen - 1) as u64; info!("Size: {}", sz.0); for p in &paths.dupes { if let DPath::Both(p) = p { let ps = p.as_path().to_str().unwrap(); info!(" >>> {ps}"); } } //eprintln!(); } if ARGS.verbose && total_dupes > 0 { info!("Duplicate statistics"); info!(" Found {total_files} duplicate files"); info!(" Size of duplicate files: {total_size}"); } } //#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)] pub enum FType { Regular, Directory, Symlink, BlockDevice, CharDevice, Fifo, Socket, Error(String), } fn get_filetype(entry: &DirEntry) -> FType { match entry.metadata() { Ok(mt) => { let ft = mt.file_type(); if ft.is_symlink() { return FType::Symlink; } if ft.is_dir() { return FType::Directory; } if ft.is_block_device() { return FType::BlockDevice; } if ft.is_char_device() { return FType::CharDevice; } if ft.is_fifo() { return FType::Fifo; } if ft.is_socket() { return FType::Socket; } FType::Regular } Err(e) => FType::Error(format!("{e}")), } } /// entry: relative full path name /// generated: store found file names which are generated fn check_generated_files(entry: &str, generated: &mut GeneratedHashMap) { match read_file_contents(Path::new(entry)) { Ok(content) => { let file_path = Path::new(entry); for fname in parse_file(file_path, &content) { // If the filename in the generate statement contains a path component // we ignore it so that a generated file will be reported even if it is // in a different place in the package directory which sometimes // happens in uploaded packages let fname_opt = utils::filename(&fname); if fname_opt.is_none() { continue; } let filename = fname_opt.unwrap(); // As we request a README in the top level directory of // a package we ignore if a README was generated by an // .ins or .dtx file // CAVEAT: If this happens in a subdirectory it could be an error!!!! if is_readme(filename) { continue; } // Ignore generated pdf, html, and css files if fname.ends_with(".pdf") || fname.ends_with(".html") || fname.ends_with(".css") { continue; } generated .entry(filename.to_string()) .or_insert_with(|| entry.to_string()); //generated.entry(target).or_insert_with(|| entry.to_string()); } } Err(e) => error!("Could not read {entry}: {e}"), } } fn x_bit_set(p: u32) -> bool { let p1 = p & 0o7777; p1 & 0o111 != 0 } fn get_package_name_from_tds_archive_name(tds_zip: &str) -> Result { if tds_zip.len() < 8 + 1 || !tds_zip.ends_with(".tds.zip") { bail!("Bad file name {tds_zip} for the TDS zip archive"); } let mut pname = String::from(utils::basename(tds_zip)); let plen = pname.len(); pname.truncate(plen - 8); Ok(pname) } fn check_tds_archive( tds_zip: &str, hashes: &DupHashes, pkg_replace: &HashMap, pkg_name: &str, ) -> Result<()> { i0003!(tds_zip); let mut lcnames: HashMap> = HashMap::default(); let dir_entry = Path::new(tds_zip); let p = get_perms(dir_entry)?; if !owner_has(p, 4) || !others_have(p, 4) || x_bit_set(p) { e0024!(tds_zip, perms_to_string(p)); if ARGS.correct_perms { i0005!(&tds_zip); set_perms(tds_zip, 0o664)?; } } let ut = Utils::new(utils::CheckType::Tds); let tmp_dir = Builder::new() .prefix("pkgcheck") .tempdir() .with_context(|| "creating tempdir")?; let tmp_dir_offset = tmp_dir.path().to_str().unwrap().len() + 1; let tmp_dir_str = tmp_dir.path().to_str().unwrap(); // unzip the TDS zip archive into a temporary directory ut.unzip(tds_zip, tmp_dir_str) // was E0033 error message .with_context(|| format!("TDS zip archive {tds_zip}"))?; // in order to compare the package files with the content of the // tds zip archive we need to checksum the files in the tds zip // archive. let mut sizes: SizesHashMap = HashMap::default(); let mut pool = Pool::new(num_cpus::get() as u32 + 1); { // Processing a single file entry, with the "sizes" hashmap collecting // same-size files. Entries are either Found::One or Found::Multiple, // so that we can submit the first file's path as a hashing job when the // first duplicate is found. Hashing each file is submitted as a job to // the pool. let mut process = |fsize, dir_entry: &DirEntry| { let path = dir_entry.path().to_path_buf(); let sizeref = &mut sizes; sizeref.entry(fsize).or_default().push(path); }; let mut map_files_found = false; let mut map_dvips_found = false; // those top level directories are the directories found in the // texmf-dist/ directory of a texlive installation let tds_toplevel_dirs: HashSet = [ "asymptote", "bibtex", "chktex", "context", "doc", "dvipdfmx", "dvips", "fonts", "hbf2gf", "makeindex", "metafont", "metapost", "mft", "omega", "pbibtex", "psutils", "scripts", "source", "tex", "tex4ht", "texconfig", "texdoc", "texdoctk", "ttf2pk", "web2c", "xdvi", "xindy", ] .iter() .map(|&s| s.to_string()) .collect(); // set to True if the TDS zip archive contains a top level directory doc/ let mut doc_found = false; // we track the number of toplevel directories which must at least be 2 let mut number_of_toplevel_dirs = 0; let re: Regex = Regex::new(r"fonts[/]map[/]dvips[/]").unwrap(); for dir_entry in WalkDir::new(tmp_dir.path().to_str().unwrap()).follow_links(false) { match dir_entry { Ok(dir_entry) => { let dir_entry_str = if let Some(d) = dir_entry.path().to_str() { d } else { e0031!(dir_entry.path().to_string_lossy()); continue; }; // this is the file_name without the directory part // unwrap() is ok here as we covered potential UTF-8 related errors // above in the definition of dir_entry_str let file_name = dir_entry.file_name().to_str().unwrap().to_string(); let meta = match dir_entry.metadata() { Ok(meta) => meta, Err(e) => { e0027!(dir_entry.path().display(), e); continue; } }; // let mtime = meta.modified().unwrap(); // if is_future_mtime(*NOW, mtime) { // let diff = mtime.duration_since(*NOW).unwrap(); // println!( // "{} has an mtime in the future by {} seconds", // &file_name, // diff.as_secs() // ); // } let ft = get_filetype(&dir_entry); if let FType::Error(e) = ft { e0023!(e); continue; } // this is the path name without the temporary part // from unpacking the TDS zip archive let dir_entry_display = if dir_entry.depth() == 0 { &dir_entry_str[tmp_dir_offset - 1..] } else { &dir_entry_str[tmp_dir_offset..] }; let filetype = match ft { FType::Directory => FileKind::Directory, FType::Regular => FileKind::File, FType::Symlink => { e0043!(dir_entry_display); continue; } _ => panic!( "Unexpected file type for {} in zip archive", dir_entry_display ), }; register_duplicate_filename(&mut lcnames, dir_entry_display, filetype); ut.check_for_temporary_file(dir_entry_display); // In the top level directory of a TDS zip archive // ... no files are allowed // ... only specific directories are allowed if dir_entry.depth() == 1 { if ft == FType::Regular { e0034!(dir_entry_display); continue; } if tds_toplevel_dirs.contains(&file_name) { number_of_toplevel_dirs += 1; if &file_name == "doc" { doc_found = true; } } else { e0020!(&file_name); } continue; } if ft == FType::Directory { ut.check_for_empty_directory(dir_entry_str, dir_entry_display); ut.check_for_hidden_directory(&file_name, dir_entry_display); ut.is_unwanted_directory(&file_name, dir_entry_str); continue; } // The LaTeX team provides the file `.tex` as a file with an empty name // in order to make `\input\relax` work (explained by David Carlisle) // Therefore, we don't call check_for_hidden_file() in this case match (pkg_name, dir_entry_display) { ("latex-tools", "tex/latex/tools/.tex") | ("latex-tools-dev", "tex/latex-dev/tools/.tex") => (), (_, _) => ut.check_for_hidden_file(&file_name, dir_entry_display), } let fsize = meta.len(); process(FileSize(fsize), &dir_entry); ut.check_filesize(fsize, dir_entry_display); // if we encounter a .dtx or .ins file we check // that it is in a subdirectory of either source/ or doc/ if (dir_entry_str.ends_with(".dtx") || dir_entry_str.ends_with(".ins")) && !(dir_entry_display.starts_with("source/") || dir_entry_display.starts_with("doc/")) { e0036!(dir_entry_display); continue; } // if the path doesn't contain a man page... if !dir_entry_str.contains("/man/") && !dir_entry_str.contains(pkg_name) { if let Some(real_name) = pkg_replace.get(pkg_name) { let pkg_name_s = format!("/{real_name}/"); if !dir_entry_str.contains(&pkg_name_s) { e0028!(pkg_name_s, dir_entry_display); } } else { e0028!(pkg_name, dir_entry_display); } } if dir_entry_str.ends_with(".map") { map_files_found = true; if re.is_match(dir_entry_str) { map_dvips_found = true; } } } Err(e) => { error!("{e}"); } } } if !doc_found { e0039!(); } if number_of_toplevel_dirs < 2 { e0040!(); } if map_files_found && !map_dvips_found { e0041!(); } }; let mut tds_hashes: HashMap<(FileSize, Vec), Vec> = HashMap::default(); pool.scoped(|scope| { let (tx, rx) = channel(); let hashref = &mut tds_hashes; scope.execute(move || { for (size, path, hash) in rx { hashref.entry((size, hash)).or_default().push(path); } }); for size in sizes.keys() { for p in &sizes[size] { let txc = tx.clone(); scope.execute(move || { hash_file(*size, p.clone(), &txc) .unwrap_or_else(|_| panic!("error hashing file {}", p.display())); }); } } }); // now check if each package file is in the tds archive for (k, paths) in hashes { if !tds_hashes.contains_key(k) { let p = &paths.dupes[0]; e0026!(p); } } print_casefolding_tds(&lcnames); Ok(()) } fn get_extension_from_filename(filename: &str) -> Option<&str> { Path::new(filename).extension().and_then(OsStr::to_str) } fn found_unwanted_filetype(fname: &str, ft: &FType) -> bool { match ft { FType::Socket => { e0013!(fname); true } FType::Fifo => { e0014!(fname); true } FType::BlockDevice => { e0015!(fname); true } FType::CharDevice => { e0016!(fname); true } FType::Error(e) => { e0023!(e); true } _ => false, } } // Very important // The permissions getting back from a file or directory have // #define S_IFREG 0100000 which means `regular file` // which is defined in `/usr/include/linux/stat.h` // // This means that, e.g. instead of 0o644 we have to use 0o100644 // fn check_and_correct_perms4(dir_entry: &str, p: u32) -> Result<()> { if !check_perms4(p) { e0002!(dir_entry, perms_to_string(p)); if ARGS.correct_perms { i0005!(&dir_entry); set_perms(dir_entry, 0o644)?; } } Ok(()) } fn check_and_correct_perms5(dir_entry: &str, p: u32) -> Result<()> { if !check_perms5(p) { e0002!(dir_entry, perms_to_string(p)); if ARGS.correct_perms { i0005!(&dir_entry); set_perms(dir_entry, 0o755)?; } } Ok(()) } // Sets permissions for a file or directory // Sample invocation: set_perms("somfile", 0o644); fn set_perms(entry: &str, p: u32) -> Result<()> { let f = File::open(entry)?; let attr = f.metadata()?; let mut perms = attr.permissions(); let ps = &format!("{:o}", perms.mode()); perms.set_mode(p); let ps1 = &format!("{p:o}"); f.set_permissions(perms)?; info!("mode of '{entry}' changed from {ps} to {ps1} "); Ok(()) } #[derive(Debug, Clone, PartialEq)] enum FileKind { File, Directory, Symlink(String), } impl Display for FileKind { fn fmt(&self, f: &mut ::std::fmt::Formatter) -> Result<(), ::std::fmt::Error> { match *self { FileKind::File => f.write_str("file"), FileKind::Directory => f.write_str("directory"), FileKind::Symlink(_) => f.write_str("symlink"), } } } #[derive(Debug, Clone, PartialEq)] enum ReadmeKind { No, Yes, Symlink(String), } fn register_duplicate_filename( lcnames: &mut HashMap>, dir_entry: &str, fk: FileKind, ) { let lc_dir_entry_str = dir_entry.to_lowercase(); if let Some(_dir_name) = filename(dir_entry) { lcnames .entry(PathBuf::from(lc_dir_entry_str)) .or_default() .push((PathBuf::from(&dir_entry), fk)); } } fn check_package(root: &str, tds_zip: &Option<&str>) -> Result> { let mut lcnames: HashMap> = HashMap::default(); let mut doublenames: HashMap> = HashMap::default(); let mut inodes = HashMap::default(); let ut = Utils::new(utils::CheckType::Package); i0002!(root); // This hash contains all package file names. // // PathBuf: the full path starting at the directory specified at the command line // Metadata: the meta data of the file // String: the file name without any directory part // ReadmeKind: is it a certain README, file or symlink? // A special case of a README file is a file with has a different name but // was pointed to by a symlink. Example: README --> README.rst let mut file_names: FileNamesHashMap = HashMap::default(); let mut readme_found = false; let root_absolute = PathBuf::from(root) .canonicalize() .unwrap() .to_string_lossy() .to_string(); for dir_entry in WalkDir::new(root).follow_links(false) { match dir_entry { Ok(dir_entry) => { let Some(dir_entry_str) = dir_entry.path().to_str() else { // invalid UTF-8 character(s) in filename e0031!(dir_entry.path().to_string_lossy()); continue; }; let meta = match dir_entry.metadata() { Ok(meta) => meta, Err(e) => { // insufficient permission to read directory e0023!(e); continue; } }; check_inode(&mut inodes, dir_entry_str, &meta); // this is the file_name without the directory part // unwrap() is ok here as we covered potential UTF-8 related errors // above in the definition of dir_entry_str let file_name = dir_entry.file_name().to_str().unwrap().to_string(); let mtime = meta.modified().unwrap(); if is_future_mtime(*NOW, mtime) { let diff = mtime.duration_since(*NOW).unwrap(); w0011!(&file_name, diff.as_secs(), &utils::format_duration(&diff)); } // we check for weird stuff like socket files aso. let ft = get_filetype(&dir_entry); if found_unwanted_filetype(dir_entry_str, &ft) { continue; } ut.filename_has_bad_chars(&dir_entry, dir_entry_str); // 1. dealing with symlinks if ft == FType::Symlink { match get_symlink(&dir_entry) { // broken symlink Ok(None) => { e0010!(&dir_entry_str); continue; } Err(e) => { e0027!(&dir_entry_str, e); continue; } Ok(Some(p)) => { let pd: String = p.canonicalize().unwrap().to_string_lossy().to_string(); // symlink pointing to outside of the package directory tree if !pd.starts_with(&root_absolute) { e0030!(&dir_entry_str, p.display()); continue; } if let Some(_dir_name) = filename(dir_entry_str) { register_duplicate_filename( &mut lcnames, dir_entry_str, FileKind::Symlink(pd.clone()), ); } if is_readme(&file_name) { readme_found = true; file_names.insert( p, ( meta, file_name, ReadmeKind::Symlink(dir_entry_str.to_string()), ), ); } continue; } } } let p = get_perms(dir_entry.path())?; // 2. dealing with directories if ft == FType::Directory { if let Some(_dir_name) = filename(dir_entry_str) { register_duplicate_filename( &mut lcnames, dir_entry_str, FileKind::Directory, ); } if !owner_has(p, 5) || !others_have(p, 5) { e0011!(&dir_entry_str, perms_to_string(p)); if ARGS.correct_perms { i0005!(&dir_entry_str); set_perms(dir_entry_str, 0o775)?; } } ut.check_for_empty_directory(dir_entry_str, dir_entry_str); ut.check_for_hidden_directory(&file_name, dir_entry_str); ut.is_unwanted_directory(&file_name, dir_entry_str); continue; } // 3. dealing with regular files ut.check_for_hidden_file(&file_name, dir_entry_str); if !ARGS.ignore_tmpfiles() { ut.check_for_temporary_file(dir_entry_str); } if let Some(file_name) = filename(dir_entry_str) { let doubleref = &mut doublenames; doubleref .entry(PathBuf::from(file_name)) .or_default() .push(PathBuf::from(&dir_entry_str)); } if is_readme(&file_name) { // We want to deal with README files only if they are // in the root directory of the package. let f = format!( "{}{}{}", root, // we have to pay attention if `root` ends already with '/' if root.ends_with('/') { "" } else { "/" }, &file_name ); if dir_entry_str == f { readme_found = true; file_names.insert( dir_entry.path().to_path_buf(), (meta, file_name.clone(), ReadmeKind::Yes), ); } else { file_names.entry(dir_entry.path().to_path_buf()).or_insert(( meta, file_name.clone(), ReadmeKind::No, )); } } else { file_names.entry(dir_entry.path().to_path_buf()).or_insert(( meta, file_name.clone(), ReadmeKind::No, )); } register_duplicate_filename(&mut lcnames, dir_entry_str, FileKind::File); } Err(e) => { error!("{e}"); } } } if !readme_found { e0009!(); } let lc = LinkCheck::new(4, false); let mut sizes: SizesHashMap = HashMap::default(); let mut generated: GeneratedHashMap = HashMap::default(); // Processing a single file entry, with the "sizes" hashmap collecting // same-size files. Entries are either Found::One or Found::Multiple, // so that we can submit the first file's path as a hashing job when the // first duplicate is found. Hashing each file is submitted as a job to // the pool. let mut process = |fsize, path: &PathBuf| { let sizeref = &mut sizes; let path = path.clone(); sizeref.entry(fsize).or_default().push(path); }; for (path, (meta, _file_name, is_readme)) in &file_names { let Some(dir_entry_str) = path.to_str() else { e0031!(&path.to_string_lossy()); continue; }; let fsize = meta.len(); ut.check_filesize(fsize, dir_entry_str); let perms = get_perms(path)?; let p = Path::new(dir_entry_str); let ft = detect(p)?; if ReadmeKind::No != *is_readme { if !check_readme(dir_entry_str, is_readme, &ft) { continue; } if ARGS.urlcheck { lc.check_urls(dir_entry_str); } } let cr = check_file(Path::new(dir_entry_str), &ft)?; // Here we check mime mismatches // Example: a `.png` file which is not an image if let Some(Issue::MimeMismatch { expected, got }) = &cr.mime_issue { if !matches!(&ft, DetectResult::Zerofile) { e0032!(dir_entry_str, expected, got); } } // Here we check permission issues if let Some(Issue::PermMismatch { expected, got }) = &cr.perm_issue { match &ft { DetectResult::Text(_) => { if is_windows_batchfile(dir_entry_str) { e0002!(dir_entry_str, perms_to_string(*got)); if ARGS.correct_perms { i0005!(&dir_entry_str); set_perms(dir_entry_str, 0o755)?; } } else { check_and_correct_perms4(dir_entry_str, perms)?; } } DetectResult::Script(_, _) => { e0002!(dir_entry_str, perms_to_string(*got)); if ARGS.correct_perms { i0005!(&dir_entry_str); set_perms(dir_entry_str, 0o755)?; } } DetectResult::Bom(_b) => { check_and_correct_perms4(dir_entry_str, *got)?; } DetectResult::Elf => { if !dir_entry_str.ends_with(".dll") { check_and_correct_perms5(dir_entry_str, perms)?; } } DetectResult::Pdf | DetectResult::Archive | DetectResult::Zip => { check_and_correct_perms4(dir_entry_str, perms)?; } DetectResult::Zerofile => check_and_correct_perms4(dir_entry_str, *got)?, dr => { eprintln!( "Unexpected error: {} permission mismatch — expected {expected}, got {got:03o}", dr ); } } } if matches!(ft, DetectResult::Text(_)) { let fext = get_extension_from_filename(dir_entry_str); if fext == Some("ins") || fext == Some("dtx") { check_generated_files(dir_entry_str, &mut generated); } } match ft { DetectResult::Text(LineEnding::CrLf) => { if !is_windows_batchfile(dir_entry_str) { e0012!(&dir_entry_str); if ARGS.correct_le { fix_inconsistent_le(dir_entry_str); } } } DetectResult::Text(LineEnding::Cr) => { e0037!(&dir_entry_str); if ARGS.correct_le { fix_inconsistent_le(dir_entry_str); } } DetectResult::Text(LineEnding::Mixed(cr, lf, crlf)) => { e0038!(&dir_entry_str, cr, lf, crlf); if ARGS.correct_le { if is_windows_batchfile(dir_entry_str) { make_crlf(dir_entry_str); } else { fix_inconsistent_le(dir_entry_str); } } } DetectResult::Text(LineEnding::Lf) => { if is_windows_batchfile(dir_entry_str) { w0008!(&dir_entry_str); } } DetectResult::Bom(b) => { w0004!(&dir_entry_str, b); } DetectResult::Pdf => { is_pdf_ok(dir_entry_str); } DetectResult::Archive | DetectResult::Zip => { if dir_entry_str.ends_with(".tds.zip") { e0035!(&dir_entry_str); } else { w0001!(&dir_entry_str); } // check_and_correct_perms4(dir_entry_str, perms)?; } _ => continue, } if !(ARGS.ignore_dupes() && tds_zip.is_none()) { process(FileSize(fsize), path); } } print_casefolding(&lcnames); print_generated(&doublenames, &generated); print_hardlinks(&inodes); if !ARGS.ignore_same_named() { print_doublenames(&doublenames); } if ARGS.ignore_dupes() && tds_zip.is_none() { return Ok(None); } // Set up thread pool for the task to hash a file. Number of CPUs + 1 has been // found to be a good pool size, likely since the walker thread should be // doing mostly IO. let mut pool = Pool::new(num_cpus::get() as u32 + 1); let mut hashes: HashMap<(FileSize, Vec), DupPath> = HashMap::default(); pool.scoped(|scope| { let (tx, rx) = channel(); let hashref = &mut hashes; scope.execute(move || { for (size, path, hash) in rx { hashref.entry((size, hash)).or_default().push(path); } }); for size in sizes.keys() { let paths = &sizes[size]; if paths.len() == 1 && tds_zip.is_none() { continue; } for p in &sizes[size] { let txc = tx.clone(); scope.execute(move || { hash_file(*size, p.clone(), &txc) .unwrap_or_else(|_| panic!("error hashing file {}", p.display())); }); } } }); if !ARGS.ignore_dupes() { print_duplicates(&hashes); } Ok(Some(hashes)) } /// Print all hardlinks /// If we have more than a single file in the Vec then we have a hardlink fn print_hardlinks(hashes: &HashMap<(Devno, Inode), Vec>) { for ((_devid, inode), eles) in hashes { if eles.len() > 1 { w0010!(inode.0); for hfile in eles { info!(" >>> {}", &hfile); } } } } fn print_casefolding_tds(hashes: &HashMap>) { for (k, eles) in hashes { // println!("pcf_tds: {:?}, {:?}", k, &eles); if eles.len() == 1 { continue; } e0042!(k.display()); for (p, ty) in eles { info!(" >>> {} ({})", p.display(), ty); } } } /// We don't want to have file names in a directory which are the same when /// converted to lower case fn print_casefolding(hashes: &HashMap>) { for (k, eles) in hashes { //println!("pcf: {:?}, {:?}", k, &eles); if eles.len() == 1 { continue; } e0025!(k.display()); for (p, ty) in eles { info!(" >>> {} ({})", p.display(), ty); } } } fn print_generated(doublenames: &HashMap>, generated: &GeneratedHashMap) { // `k` is the file which is generated by `generator` for (k, generator) in generated { let path = PathBuf::from(k); if doublenames.contains_key(&path) { if k.ends_with(".ins") || k.ends_with(".pdf") { //println!("key {}, gen {}", k, gen); continue; } let v = &doublenames[&path]; for fname in v { e0019!(fname.to_str().unwrap(), generator.as_str()); } } } } fn print_doublenames(hashes: &HashMap>) { for (k, paths) in hashes { if paths.len() == 1 { continue; } let ks = k.to_str().unwrap(); if ks == "README" || ks == "README.txt" || ks == "README.md" || ks == "Makefile" || ks == "Makefile.am" || ks == "Makefile.in" || ks == "makefile" { continue; } w0003!(k.to_str().unwrap()); // println!(":: {}", k.display()); for p in paths { info!(" >>> {}", p.display()); } } } fn show_tmp_endings() { i0006!(); for (t, c) in temp_file_endings() { info!("{t:23} {c}"); } }