// (scrape metadata: 320 lines, 11 KiB, Rust)
use std::{
|
|
borrow::Cow,
|
|
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
|
|
fs, io,
|
|
path::{PathBuf, Path},
|
|
};
|
|
|
|
use gimli::{constants, DW_TAG_compile_unit};
|
|
use object::{Object, ObjectSection, ReadCache};
|
|
use serde::{Deserialize, Serialize};
|
|
use sha2::{
|
|
digest::generic_array::{typenum::U32, GenericArray},
|
|
Digest, Sha256,
|
|
};
|
|
use typed_arena::Arena;
|
|
|
|
/// A SHA-256 digest stored as a fixed 32-byte array.
///
/// `#[serde(transparent)]` plus `#[serde(with = "hex")]` make this serialize
/// as a bare hex string. `Ord` is derived so it can key a `BTreeMap`
/// (see `FileStore::hashes`).
#[derive(Serialize, Deserialize, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
#[serde(transparent)]
pub struct Sha256Hash {
    // Raw digest bytes; hex-encoded in the serialized form.
    #[serde(with = "hex")]
    inner: [u8; 32],
}
|
|
|
|
impl From<GenericArray<u8, U32>> for Sha256Hash {
|
|
fn from(value: GenericArray<u8, U32>) -> Self {
|
|
Self {
|
|
inner: value.into(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Content-addressed index of files: each distinct file content (by SHA-256)
/// appears once in `files`, with lookup tables keyed by path and by hash.
#[derive(Serialize, Deserialize)]
pub struct FileStore {
    // All known entries; an entry's `index` field is its position here.
    pub files: Vec<FileStoreEntry>,
    // Path -> index into `files`. A path maps to the most recently
    // ingested content seen under that path.
    pub filenames: HashMap<PathBuf, usize>,
    // Content hash -> index into `files`. BTreeMap keeps iteration (and
    // serialized output) ordered by hash.
    pub hashes: BTreeMap<Sha256Hash, usize>,
}
|
|
|
|
/// Metadata for one distinct file content (identified by hash) in a `FileStore`.
#[derive(Serialize, Deserialize)]
pub struct FileStoreEntry {
    // Position of this entry in `FileStore::files`.
    pub index: usize,
    // SHA-256 of the file contents.
    pub hash: Sha256Hash,
    // Format detected from magic bytes (see `parse_format`).
    pub format: FileFormat,
    // Paths under which this content was seen during input ingestion.
    pub input_names: HashSet<PathBuf>,
    // Paths under which this content was registered as an output
    // (via `FileStore::insert`).
    pub output_names: HashSet<PathBuf>,
}
|
|
|
|
/// File format detected from a file's magic bytes (see `parse_format`).
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileFormat {
    /// ELF object/executable (magic `\x7f E L F`); may carry DWARF
    /// references to source files.
    ELF,
    /// Anything not recognized as ELF.
    Other,
}
|
|
|
|
impl FileStore {
|
|
pub fn new(inputs: Vec<PathBuf>) -> anyhow::Result<Self> {
|
|
let mut result = Self {
|
|
files: vec![],
|
|
filenames: HashMap::new(),
|
|
hashes: BTreeMap::new(),
|
|
};
|
|
|
|
for input in inputs {
|
|
result.ingest_input(input)?;
|
|
}
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
/// Register the minimal set of information associated with a file. Returns whether the file
|
|
/// was already known.
|
|
pub fn insert(&mut self, path: PathBuf, hash: Sha256Hash) -> bool {
|
|
if let Some(idx) = self.hashes.get_mut(&hash) {
|
|
self.files.get_mut(*idx).unwrap().output_names.insert(path);
|
|
true
|
|
} else {
|
|
let index = self.files.len();
|
|
self.files.push(FileStoreEntry {
|
|
index,
|
|
hash,
|
|
format: FileFormat::Other,
|
|
input_names: HashSet::new(),
|
|
output_names: HashSet::from([path.clone()]),
|
|
});
|
|
self.filenames.insert(path, index);
|
|
self.hashes.insert(hash, index);
|
|
false
|
|
}
|
|
}
|
|
|
|
pub fn update_format(&mut self, path: &Path, format: FileFormat) {
|
|
if !self.filenames.contains_key(path) {
|
|
panic!("update_format called with unknown path {}", path.to_string_lossy())
|
|
}
|
|
let idx = *self.filenames.get(path).unwrap();
|
|
self.files.get_mut(idx).unwrap().format = format;
|
|
}
|
|
|
|
fn ingest_input(&mut self, filename: PathBuf) -> anyhow::Result<()> {
|
|
let stat = fs::metadata(&filename)?;
|
|
if stat.is_dir() {
|
|
for entry in walkdir::WalkDir::new(&filename)
|
|
.into_iter()
|
|
.filter_map(|e| e.ok())
|
|
{
|
|
if entry.file_type().is_file() {
|
|
let fp = fs::File::open(entry.path())?;
|
|
self.ingest_input_content(entry.path().to_owned(), fp)?;
|
|
}
|
|
}
|
|
} else {
|
|
let fp = fs::File::open(&filename)?;
|
|
self.ingest_input_content(filename, fp)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn ingest_input_content(
|
|
&mut self,
|
|
filename: PathBuf,
|
|
mut content: (impl io::Read + io::Seek),
|
|
) -> anyhow::Result<()> {
|
|
let mut h = Sha256::new();
|
|
log::debug!("Hashing {}", filename.to_string_lossy());
|
|
io::copy(&mut content, &mut h)?;
|
|
let hash = h.finalize().into();
|
|
|
|
let index = match self.hashes.entry(hash) {
|
|
std::collections::btree_map::Entry::Vacant(e) => {
|
|
let index = self.files.len();
|
|
e.insert(index);
|
|
let (format, refs) = parse_format(&mut content)?;
|
|
self.files.push(FileStoreEntry {
|
|
index,
|
|
hash,
|
|
format,
|
|
input_names: [filename.clone()].into(),
|
|
output_names: HashSet::new(),
|
|
});
|
|
for (reference_path, _reference_hash) in refs { // lazy...
|
|
self.ingest_input(reference_path)?;
|
|
}
|
|
index
|
|
}
|
|
std::collections::btree_map::Entry::Occupied(e) => {
|
|
self.files
|
|
.get_mut(*e.get())
|
|
.unwrap()
|
|
.input_names
|
|
.insert(filename.clone());
|
|
*e.get()
|
|
}
|
|
};
|
|
|
|
if index == self.files.len() {}
|
|
|
|
self.filenames.insert(filename, index);
|
|
Ok(())
|
|
}
|
|
|
|
/*
|
|
pub fn ingest_output_local(&mut self, filename: PathBuf) -> anyhow::Result<()> {
|
|
let stat = fs::metadata(&filename)?;
|
|
if stat.is_dir() {
|
|
return Ok(());
|
|
}
|
|
|
|
let fp = fs::File::open(&filename)?;
|
|
self.ingest_output(filename, fp)?;
|
|
Ok(())
|
|
}
|
|
|
|
pub fn ingest_output(
|
|
&mut self,
|
|
filename: PathBuf,
|
|
mut content: (impl io::Read + io::Seek),
|
|
) -> anyhow::Result<usize> {
|
|
let mut h = Sha256::new();
|
|
io::copy(&mut content, &mut h)?;
|
|
let hash = h.finalize().into();
|
|
|
|
let index = match self.hashes.entry(hash) {
|
|
std::collections::btree_map::Entry::Vacant(e) => {
|
|
let index = self.files.len();
|
|
e.insert(index);
|
|
let (format, refs) = self.parse_format(&mut content)?;
|
|
self.files.push(FileStoreEntry {
|
|
index,
|
|
format,
|
|
hash,
|
|
input_names: HashSet::new(),
|
|
output_names: [filename.clone()].into(),
|
|
});
|
|
index
|
|
}
|
|
std::collections::btree_map::Entry::Occupied(e) => {
|
|
self.files
|
|
.get_mut(*e.get())
|
|
.unwrap()
|
|
.output_names
|
|
.insert(filename.clone());
|
|
*e.get()
|
|
}
|
|
};
|
|
|
|
self.filenames.insert(filename, index);
|
|
|
|
Ok(index)
|
|
}
|
|
*/
|
|
}
|
|
|
|
/// Load one DWARF section from `file` as a `gimli::EndianSlice`.
///
/// A section that is absent from the file yields an empty slice. Section data
/// that had to be decompressed (returned as `Cow::Owned`) is moved into
/// `arena_data` so the returned slice can borrow it for `'arena`.
///
/// NOTE(review): `uncompressed_data().unwrap()` panics on a malformed section
/// rather than reporting an error — as written, this function only ever
/// returns `Ok`.
fn load_file_section<'input, 'arena, Endian: gimli::Endianity, R: object::ReadRef<'input>>(
    id: gimli::SectionId,
    file: &object::File<'input, R>,
    endian: Endian,
    arena_data: &'arena Arena<Cow<'input, [u8]>>,
) -> Result<gimli::EndianSlice<'arena, Endian>, ()> {
    // TODO: Unify with dwarfdump.rs in gimli.
    let name = id.name();
    match file.section_by_name(name) {
        Some(section) => match section.uncompressed_data().unwrap() {
            // Borrowed data already lives for 'input; use it directly.
            Cow::Borrowed(b) => Ok(gimli::EndianSlice::new(b, endian)),
            // Owned (decompressed) data must outlive this call: park it in the arena.
            Cow::Owned(b) => Ok(gimli::EndianSlice::new(arena_data.alloc(b.into()), endian)),
        },
        None => Ok(gimli::EndianSlice::new(&[][..], endian)),
    }
}
|
|
|
|
fn read_exact_or_end(fp: &mut impl io::Read, buf: &mut [u8]) -> anyhow::Result<usize> {
|
|
let mut read_so_far = 0;
|
|
while read_so_far < buf.len() {
|
|
let n = fp.read(&mut buf[read_so_far..])?;
|
|
if n == 0 {
|
|
break;
|
|
}
|
|
read_so_far += n;
|
|
}
|
|
Ok(read_so_far)
|
|
}
|
|
|
|
pub fn parse_format(fp: &mut (impl io::Read + io::Seek)) -> anyhow::Result<(FileFormat, BTreeSet<(PathBuf, Sha256Hash)>)> {
|
|
fp.seek(io::SeekFrom::Start(0))?;
|
|
let mut buf = [0; 4];
|
|
let count = read_exact_or_end(fp, &mut buf)?;
|
|
let buf = &buf[..count];
|
|
|
|
Ok(match buf {
|
|
[0x7f, b'E', b'L', b'F', ..] => {
|
|
let read_cache = ReadCache::new(fp);
|
|
let elf = object::File::parse(&read_cache)?;
|
|
let endian = if elf.is_little_endian() {
|
|
gimli::RunTimeEndian::Little
|
|
} else {
|
|
gimli::RunTimeEndian::Big
|
|
};
|
|
let arena_data = Arena::new();
|
|
let mut load_section = |id: gimli::SectionId| -> Result<_, _> {
|
|
load_file_section(id, &elf, endian, &arena_data)
|
|
};
|
|
let dwarf = gimli::Dwarf::load(&mut load_section).unwrap();
|
|
let mut units = dwarf.units();
|
|
let mut inputs = vec![];
|
|
while let Ok(Some(unit)) = units.next() {
|
|
let abbrev = dwarf.abbreviations(&unit)?;
|
|
let mut entries = unit.entries(&abbrev);
|
|
while let Some((_, entry)) = entries.next_dfs()? {
|
|
if entry.tag() == DW_TAG_compile_unit {
|
|
let mut basename = None;
|
|
let mut dirname = None;
|
|
if let Some(name) =
|
|
entry.attr(constants::DW_AT_name)?.map(|a| a.value())
|
|
{
|
|
if let Ok(name) = dwarf.attr_string(&dwarf.unit(unit)?, name) {
|
|
basename = Some(PathBuf::from(name.to_string()?));
|
|
}
|
|
}
|
|
if let Some(name) =
|
|
entry.attr(constants::DW_AT_comp_dir)?.map(|a| a.value())
|
|
{
|
|
if let Ok(name) = dwarf.attr_string(&dwarf.unit(unit)?, name) {
|
|
dirname = Some(PathBuf::from(name.to_string()?));
|
|
}
|
|
}
|
|
if let (Some(dirname), Some(basename)) = (dirname, basename) {
|
|
inputs.push(dirname.join(basename));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let references = inputs
|
|
.into_iter()
|
|
.map(|filename| -> anyhow::Result<_> {
|
|
// TODO: this needs to try suffixes of the filename against the filepath table to see if it
|
|
// was moved between compilation and ingestion. but how...
|
|
let metadata = match fs::metadata(&filename) {
|
|
Ok(m) => m,
|
|
Err(_) => return Ok(None),
|
|
};
|
|
if !metadata.is_file() {
|
|
return Ok(None);
|
|
}
|
|
|
|
let mut fp = fs::File::open(&filename)?;
|
|
let mut h = Sha256::new();
|
|
log::debug!("Hashing {}", filename.to_string_lossy());
|
|
io::copy(&mut fp, &mut h)?;
|
|
let result = Ok(Some((filename, h.finalize().into())));
|
|
result
|
|
})
|
|
.collect::<Result<Vec<_>, _>>()?
|
|
.into_iter()
|
|
.filter_map(|x| x)
|
|
.collect();
|
|
(FileFormat::ELF, references)
|
|
}
|
|
_ => (FileFormat::Other, BTreeSet::new()),
|
|
})
|
|
}
|