use std::{
    borrow::Cow,
    collections::{BTreeMap, BTreeSet, HashMap, HashSet},
    fs, io,
    path::{Path, PathBuf},
};

use gimli::{constants, DW_TAG_compile_unit};
use object::{Object, ObjectSection, ReadCache};
use serde::{Deserialize, Serialize};
use sha2::{
    digest::generic_array::{typenum::U32, GenericArray},
    Digest, Sha256,
};
use typed_arena::Arena;

/// A SHA-256 digest, (de)serialized as a hex string.
#[derive(Serialize, Deserialize, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
#[serde(transparent)]
pub struct Sha256Hash {
    #[serde(with = "hex")]
    inner: [u8; 32],
}

impl From<GenericArray<u8, U32>> for Sha256Hash {
    fn from(value: GenericArray<u8, U32>) -> Self {
        Self {
            inner: value.into(),
        }
    }
}

/// An index of known files, addressable by content hash or by path.
#[derive(Serialize, Deserialize)]
pub struct FileStore {
    pub files: Vec<FileStoreEntry>,
    pub filenames: HashMap<PathBuf, usize>,
    pub hashes: BTreeMap<Sha256Hash, usize>,
}

#[derive(Serialize, Deserialize)]
pub struct FileStoreEntry {
    pub index: usize,
    pub hash: Sha256Hash,
    pub format: FileFormat,
    pub input_names: HashSet<PathBuf>,
    pub output_names: HashSet<PathBuf>,
}

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileFormat {
    ELF,
    Other,
}

impl FileStore {
    pub fn new(inputs: Vec<PathBuf>) -> anyhow::Result<Self> {
        let mut result = Self {
            files: vec![],
            filenames: HashMap::new(),
            hashes: BTreeMap::new(),
        };
        for input in inputs {
            result.ingest_input(input)?;
        }
        Ok(result)
    }

    /// Register the minimal set of information associated with a file. Returns whether the file
    /// was already known.
    pub fn insert(&mut self, path: PathBuf, hash: Sha256Hash) -> bool {
        if let Some(idx) = self.hashes.get_mut(&hash) {
            self.files.get_mut(*idx).unwrap().output_names.insert(path);
            true
        } else {
            let index = self.files.len();
            self.files.push(FileStoreEntry {
                index,
                hash,
                format: FileFormat::Other,
                input_names: HashSet::new(),
                output_names: HashSet::from([path.clone()]),
            });
            self.filenames.insert(path, index);
            self.hashes.insert(hash, index);
            false
        }
    }

    pub fn update_format(&mut self, path: &Path, format: FileFormat) {
        if !self.filenames.contains_key(path) {
            panic!(
                "update_format called with unknown path {}",
                path.to_string_lossy()
            )
        }
        let idx = *self.filenames.get(path).unwrap();
        self.files.get_mut(idx).unwrap().format = format;
    }

    /// Hash a file (or every file under a directory) and record it, following any references
    /// found in its contents.
    fn ingest_input(&mut self, filename: PathBuf) -> anyhow::Result<()> {
        let stat = fs::metadata(&filename)?;
        if stat.is_dir() {
            for entry in walkdir::WalkDir::new(&filename)
                .into_iter()
                .filter_map(|e| e.ok())
            {
                if entry.file_type().is_file() {
                    let fp = fs::File::open(entry.path())?;
                    self.ingest_input_content(entry.path().to_owned(), fp)?;
                }
            }
        } else {
            let fp = fs::File::open(&filename)?;
            self.ingest_input_content(filename, fp)?;
        }
        Ok(())
    }

    /// Hash `content`, detect its format, and record any files it references.
    fn ingest_input_content(
        &mut self,
        filename: PathBuf,
        mut content: (impl io::Read + io::Seek),
    ) -> anyhow::Result<()> {
        let mut h = Sha256::new();
        log::debug!("Hashing {}", filename.to_string_lossy());
        io::copy(&mut content, &mut h)?;
        let hash = h.finalize().into();
        let index = match self.hashes.entry(hash) {
            std::collections::btree_map::Entry::Vacant(e) => {
                let index = self.files.len();
                e.insert(index);
                let (format, refs) = parse_format(&mut content)?;
                self.files.push(FileStoreEntry {
                    index,
                    hash,
                    format,
                    input_names: [filename.clone()].into(),
                    output_names: HashSet::new(),
                });
                for (reference_path, _reference_hash) in refs {
                    // lazy...
                    self.ingest_input(reference_path)?;
                }
                index
            }
            std::collections::btree_map::Entry::Occupied(e) => {
                self.files
                    .get_mut(*e.get())
                    .unwrap()
                    .input_names
                    .insert(filename.clone());
                *e.get()
            }
        };
        self.filenames.insert(filename, index);
        Ok(())
    }

    /*
    pub fn ingest_output_local(&mut self, filename: PathBuf) -> anyhow::Result<()> {
        let stat = fs::metadata(&filename)?;
        if stat.is_dir() {
            return Ok(());
        }
        let fp = fs::File::open(&filename)?;
        self.ingest_output(filename, fp)?;
        Ok(())
    }

    pub fn ingest_output(
        &mut self,
        filename: PathBuf,
        mut content: (impl io::Read + io::Seek),
    ) -> anyhow::Result<usize> {
        let mut h = Sha256::new();
        io::copy(&mut content, &mut h)?;
        let hash = h.finalize().into();
        let index = match self.hashes.entry(hash) {
            std::collections::btree_map::Entry::Vacant(e) => {
                let index = self.files.len();
                e.insert(index);
                let (format, refs) = self.parse_format(&mut content)?;
                self.files.push(FileStoreEntry {
                    index,
                    format,
                    hash,
                    input_names: HashSet::new(),
                    output_names: [filename.clone()].into(),
                });
                index
            }
            std::collections::btree_map::Entry::Occupied(e) => {
                self.files
                    .get_mut(*e.get())
                    .unwrap()
                    .output_names
                    .insert(filename.clone());
                *e.get()
            }
        };
        self.filenames.insert(filename, index);
        Ok(index)
    }
    */
}

/// Load the raw bytes of a DWARF section from `file`, allocating decompressed data in
/// `arena_data`. Missing sections are returned as empty slices.
fn load_file_section<'input, 'arena, Endian: gimli::Endianity, R: object::ReadRef<'input>>(
    id: gimli::SectionId,
    file: &object::File<'input, R>,
    endian: Endian,
    arena_data: &'arena Arena<Cow<'input, [u8]>>,
) -> Result<gimli::EndianSlice<'arena, Endian>, ()> {
    // TODO: Unify with dwarfdump.rs in gimli.
    let name = id.name();
    match file.section_by_name(name) {
        Some(section) => match section.uncompressed_data().unwrap() {
            Cow::Borrowed(b) => Ok(gimli::EndianSlice::new(b, endian)),
            Cow::Owned(b) => Ok(gimli::EndianSlice::new(arena_data.alloc(b.into()), endian)),
        },
        None => Ok(gimli::EndianSlice::new(&[][..], endian)),
    }
}

/// Read until `buf` is full or EOF is reached, returning the number of bytes read.
fn read_exact_or_end(fp: &mut impl io::Read, buf: &mut [u8]) -> anyhow::Result<usize> {
    let mut read_so_far = 0;
    while read_so_far < buf.len() {
        let n = fp.read(&mut buf[read_so_far..])?;
        if n == 0 {
            break;
        }
        read_so_far += n;
    }
    Ok(read_so_far)
}

/// Sniff a file's format from its magic bytes. For ELF files, walk the DWARF compile units and
/// return the source paths they were built from, paired with the current hash of each path that
/// still exists on disk.
pub fn parse_format(
    fp: &mut (impl io::Read + io::Seek),
) -> anyhow::Result<(FileFormat, BTreeSet<(PathBuf, Sha256Hash)>)> {
    fp.seek(io::SeekFrom::Start(0))?;
    let mut buf = [0; 4];
    let count = read_exact_or_end(fp, &mut buf)?;
    let buf = &buf[..count];
    Ok(match buf {
        [0x7f, b'E', b'L', b'F', ..] => {
            let read_cache = ReadCache::new(fp);
            let elf = object::File::parse(&read_cache)?;
            let endian = if elf.is_little_endian() {
                gimli::RunTimeEndian::Little
            } else {
                gimli::RunTimeEndian::Big
            };
            let arena_data = Arena::new();
            let mut load_section = |id: gimli::SectionId| -> Result<_, _> {
                load_file_section(id, &elf, endian, &arena_data)
            };
            let dwarf = gimli::Dwarf::load(&mut load_section).unwrap();
            let mut units = dwarf.units();
            let mut inputs = vec![];
            while let Ok(Some(unit)) = units.next() {
                let abbrev = dwarf.abbreviations(&unit)?;
                let mut entries = unit.entries(&abbrev);
                while let Some((_, entry)) = entries.next_dfs()? {
                    if entry.tag() == DW_TAG_compile_unit {
                        let mut basename = None;
                        let mut dirname = None;
                        if let Some(name) = entry.attr(constants::DW_AT_name)?.map(|a| a.value()) {
                            if let Ok(name) = dwarf.attr_string(&dwarf.unit(unit)?, name) {
                                basename = Some(PathBuf::from(name.to_string()?));
                            }
                        }
                        if let Some(name) =
                            entry.attr(constants::DW_AT_comp_dir)?.map(|a| a.value())
                        {
                            if let Ok(name) = dwarf.attr_string(&dwarf.unit(unit)?, name) {
                                dirname = Some(PathBuf::from(name.to_string()?));
                            }
                        }
                        if let (Some(dirname), Some(basename)) = (dirname, basename) {
                            inputs.push(dirname.join(basename));
                        }
                    }
                }
            }
            let references = inputs
                .into_iter()
                .map(|filename| -> anyhow::Result<_> {
                    // TODO: this needs to try suffixes of the filename against the filepath table
                    // to see if it was moved between compilation and ingestion. but how...
                    let metadata = match fs::metadata(&filename) {
                        Ok(m) => m,
                        Err(_) => return Ok(None),
                    };
                    if !metadata.is_file() {
                        return Ok(None);
                    }
                    let mut fp = fs::File::open(&filename)?;
                    let mut h = Sha256::new();
                    log::debug!("Hashing {}", filename.to_string_lossy());
                    io::copy(&mut fp, &mut h)?;
                    let result = Ok(Some((filename, h.finalize().into())));
                    result
                })
                .collect::<Result<Vec<_>, _>>()?
                .into_iter()
                .filter_map(|x| x)
                .collect();
            (FileFormat::ELF, references)
        }
        _ => (FileFormat::Other, BTreeSet::new()),
    })
}
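
// A minimal usage sketch, not part of the original module: it exercises only the `FileStore`
// API defined above (`new`, `insert`, `update_format`) with hypothetical output paths and an
// in-memory hash, and assumes nothing beyond this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn insert_reports_known_hashes() {
        // Start from an empty store (no inputs to ingest).
        let mut store = FileStore::new(vec![]).unwrap();

        // Hash some in-memory bytes; `Sha256Hash` is built via the `From<GenericArray<_, _>>` impl.
        let hash = Sha256Hash::from(Sha256::digest(b"example contents"));

        // First insertion registers the file and reports it as previously unknown.
        assert!(!store.insert(PathBuf::from("out/example.bin"), hash));
        // Re-inserting the same hash under another name reports it as already known.
        assert!(store.insert(PathBuf::from("out/copy-of-example.bin"), hash));

        // The format can be updated for a path registered above.
        store.update_format(Path::new("out/example.bin"), FileFormat::ELF);
        assert_eq!(store.files[0].format, FileFormat::ELF);
    }
}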