// ontology/src/filestore.rs

use std::{
borrow::Cow,
collections::{BTreeMap, BTreeSet, HashMap, HashSet},
fs, io,
path::{PathBuf, Path},
};
use gimli::{constants, DW_TAG_compile_unit};
use object::{Object, ObjectSection, ReadCache};
use serde::{Deserialize, Serialize};
use sha2::{
digest::generic_array::{typenum::U32, GenericArray},
Digest, Sha256,
};
use typed_arena::Arena;
/// A SHA-256 digest. Serialized transparently as its inner bytes, with the
/// bytes themselves rendered as a hex string (`#[serde(with = "hex")]`).
///
/// `Ord` is derived so the hash can key the `BTreeMap` in [`FileStore`].
#[derive(Serialize, Deserialize, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
#[serde(transparent)]
pub struct Sha256Hash {
    /// Raw 32-byte digest output.
    #[serde(with = "hex")]
    inner: [u8; 32],
}
impl From<GenericArray<u8, U32>> for Sha256Hash {
fn from(value: GenericArray<u8, U32>) -> Self {
Self {
inner: value.into(),
}
}
}
/// Content-addressed index of files, keyed both by path and by SHA-256 hash.
///
/// The three fields are parallel views of the same set of entries: `files`
/// owns the data, while `filenames` and `hashes` map back into it by index.
#[derive(Serialize, Deserialize)]
pub struct FileStore {
    /// All known files; `FileStoreEntry::index` mirrors the position here.
    pub files: Vec<FileStoreEntry>,
    /// Maps each registered path to an index into `files`.
    pub filenames: HashMap<PathBuf, usize>,
    /// Maps content hash to an index into `files`; deduplicates identical content.
    pub hashes: BTreeMap<Sha256Hash, usize>,
}
/// One unique piece of file content tracked by a [`FileStore`], together with
/// every name it has been seen under.
#[derive(Serialize, Deserialize)]
pub struct FileStoreEntry {
    /// Position of this entry in `FileStore::files`.
    pub index: usize,
    /// SHA-256 of the file content (the deduplication key).
    pub hash: Sha256Hash,
    /// Detected file format (ELF vs. everything else).
    pub format: FileFormat,
    /// Paths this content was ingested from as an input.
    pub input_names: HashSet<PathBuf>,
    /// Paths this content was registered under as an output.
    pub output_names: HashSet<PathBuf>,
}
/// Coarse file-format classification produced by `parse_format`.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileFormat {
    /// File starts with the ELF magic (`\x7fELF`); DWARF references may be extracted.
    ELF,
    /// Any other content (also the placeholder before format detection runs).
    Other,
}
impl FileStore {
    /// Build a store by ingesting every path in `inputs`; directories are
    /// walked recursively.
    pub fn new(inputs: Vec<PathBuf>) -> anyhow::Result<Self> {
        let mut result = Self {
            files: vec![],
            filenames: HashMap::new(),
            hashes: BTreeMap::new(),
        };
        for input in inputs {
            result.ingest_input(input)?;
        }
        Ok(result)
    }

    /// Register the minimal set of information associated with a file. Returns whether the file
    /// was already known.
    pub fn insert(&mut self, path: PathBuf, hash: Sha256Hash) -> bool {
        if let Some(&idx) = self.hashes.get(&hash) {
            // NOTE(review): on this already-known path the new `path` is only
            // recorded in `output_names`, not in `filenames`, unlike every
            // other registration path — confirm that asymmetry is intended.
            self.files[idx].output_names.insert(path);
            true
        } else {
            let index = self.files.len();
            self.files.push(FileStoreEntry {
                index,
                hash,
                // Format is unknown at this point; see `update_format`.
                format: FileFormat::Other,
                input_names: HashSet::new(),
                output_names: HashSet::from([path.clone()]),
            });
            self.filenames.insert(path, index);
            self.hashes.insert(hash, index);
            false
        }
    }

    /// Record the detected `format` for an already-registered `path`.
    ///
    /// # Panics
    /// Panics if `path` was never registered with this store.
    pub fn update_format(&mut self, path: &Path, format: FileFormat) {
        // Single lookup instead of the previous contains_key + get pair.
        match self.filenames.get(path) {
            Some(&idx) => self.files[idx].format = format,
            None => panic!(
                "update_format called with unknown path {}",
                path.to_string_lossy()
            ),
        }
    }

    /// Ingest one path: a regular file is hashed and registered directly; a
    /// directory is walked recursively and every regular file inside it is
    /// ingested. Unreadable directory entries are skipped (best effort).
    fn ingest_input(&mut self, filename: PathBuf) -> anyhow::Result<()> {
        let stat = fs::metadata(&filename)?;
        if stat.is_dir() {
            for entry in walkdir::WalkDir::new(&filename)
                .into_iter()
                .filter_map(|e| e.ok())
            {
                if entry.file_type().is_file() {
                    let fp = fs::File::open(entry.path())?;
                    self.ingest_input_content(entry.path().to_owned(), fp)?;
                }
            }
        } else {
            let fp = fs::File::open(&filename)?;
            self.ingest_input_content(filename, fp)?;
        }
        Ok(())
    }

    /// Hash `content`, register it under `filename`, and — for content not
    /// seen before — detect its format and recursively ingest any source
    /// files referenced by its DWARF debug info.
    fn ingest_input_content(
        &mut self,
        filename: PathBuf,
        mut content: (impl io::Read + io::Seek),
    ) -> anyhow::Result<()> {
        let mut h = Sha256::new();
        log::debug!("Hashing {}", filename.to_string_lossy());
        io::copy(&mut content, &mut h)?;
        let hash = h.finalize().into();
        let index = match self.hashes.entry(hash) {
            std::collections::btree_map::Entry::Vacant(e) => {
                let index = self.files.len();
                e.insert(index);
                // `parse_format` rewinds `content`, so hashing first is fine.
                let (format, refs) = parse_format(&mut content)?;
                self.files.push(FileStoreEntry {
                    index,
                    hash,
                    format,
                    input_names: [filename.clone()].into(),
                    output_names: HashSet::new(),
                });
                // Recursively ingest each referenced source file; the hash
                // returned by parse_format is recomputed during ingestion.
                for (reference_path, _reference_hash) in refs {
                    self.ingest_input(reference_path)?;
                }
                index
            }
            std::collections::btree_map::Entry::Occupied(e) => {
                // Content already known: just record the additional name.
                self.files
                    .get_mut(*e.get())
                    .unwrap()
                    .input_names
                    .insert(filename.clone());
                *e.get()
            }
        };
        self.filenames.insert(filename, index);
        Ok(())
    }
}
/// Load the named DWARF section of `file` as a `gimli::EndianSlice`.
///
/// Borrowed section bytes are returned directly; decompressed (owned) bytes
/// are moved into `arena_data` so they outlive this call. A section that is
/// absent from the file yields an empty slice.
///
/// # Errors
/// Returns `Err(())` if the section exists but its data cannot be read or
/// decompressed. (Previously this path called `unwrap()` and panicked, which
/// defeated the purpose of the `Result` return type.)
fn load_file_section<'input, 'arena, Endian: gimli::Endianity, R: object::ReadRef<'input>>(
    id: gimli::SectionId,
    file: &object::File<'input, R>,
    endian: Endian,
    arena_data: &'arena Arena<Cow<'input, [u8]>>,
) -> Result<gimli::EndianSlice<'arena, Endian>, ()> {
    // TODO: Unify with dwarfdump.rs in gimli.
    let name = id.name();
    match file.section_by_name(name) {
        Some(section) => match section.uncompressed_data().map_err(|_| ())? {
            Cow::Borrowed(b) => Ok(gimli::EndianSlice::new(b, endian)),
            Cow::Owned(b) => Ok(gimli::EndianSlice::new(arena_data.alloc(b.into()), endian)),
        },
        None => Ok(gimli::EndianSlice::new(&[][..], endian)),
    }
}
/// Fill `buf` from `fp`, retrying short reads, until the buffer is full or
/// the reader reports end-of-input. Returns the number of bytes written,
/// which is less than `buf.len()` only when EOF arrived first.
fn read_exact_or_end(fp: &mut impl io::Read, buf: &mut [u8]) -> anyhow::Result<usize> {
    let mut filled = 0;
    loop {
        if filled == buf.len() {
            return Ok(filled);
        }
        match fp.read(&mut buf[filled..])? {
            // A zero-byte read signals end-of-input.
            0 => return Ok(filled),
            n => filled += n,
        }
    }
}
/// Detect the format of the seekable stream `fp`.
///
/// For ELF files, additionally walks the DWARF debug info, collects the
/// source paths named by each compile unit (`DW_AT_comp_dir` joined with
/// `DW_AT_name`), and returns those that currently exist on disk as regular
/// files, each paired with its freshly computed SHA-256 hash. Anything
/// without the ELF magic yields `FileFormat::Other` and an empty set.
///
/// Always rewinds `fp` to the start before reading.
pub fn parse_format(fp: &mut (impl io::Read + io::Seek)) -> anyhow::Result<(FileFormat, BTreeSet<(PathBuf, Sha256Hash)>)> {
    fp.seek(io::SeekFrom::Start(0))?;
    // Only the first four bytes are needed to recognize the ELF magic.
    let mut buf = [0; 4];
    let count = read_exact_or_end(fp, &mut buf)?;
    let buf = &buf[..count];
    Ok(match buf {
        // 0x7f 'E' 'L' 'F' — the ELF magic number.
        [0x7f, b'E', b'L', b'F', ..] => {
            let read_cache = ReadCache::new(fp);
            let elf = object::File::parse(&read_cache)?;
            let endian = if elf.is_little_endian() {
                gimli::RunTimeEndian::Little
            } else {
                gimli::RunTimeEndian::Big
            };
            // Keeps decompressed section data alive while the DWARF is walked.
            let arena_data = Arena::new();
            let mut load_section = |id: gimli::SectionId| -> Result<_, _> {
                load_file_section(id, &elf, endian, &arena_data)
            };
            let dwarf = gimli::Dwarf::load(&mut load_section).unwrap();
            let mut units = dwarf.units();
            let mut inputs = vec![];
            // NOTE(review): `while let Ok(...)` stops at the first unit-header
            // parse error, silently dropping any later units — confirm that
            // best-effort behavior is intended.
            while let Ok(Some(unit)) = units.next() {
                let abbrev = dwarf.abbreviations(&unit)?;
                let mut entries = unit.entries(&abbrev);
                while let Some((_, entry)) = entries.next_dfs()? {
                    if entry.tag() == DW_TAG_compile_unit {
                        let mut basename = None;
                        let mut dirname = None;
                        // DW_AT_name: the compile unit's primary source file.
                        if let Some(name) =
                            entry.attr(constants::DW_AT_name)?.map(|a| a.value())
                        {
                            if let Ok(name) = dwarf.attr_string(&dwarf.unit(unit)?, name) {
                                basename = Some(PathBuf::from(name.to_string()?));
                            }
                        }
                        // DW_AT_comp_dir: the directory compilation ran from.
                        if let Some(name) =
                            entry.attr(constants::DW_AT_comp_dir)?.map(|a| a.value())
                        {
                            if let Ok(name) = dwarf.attr_string(&dwarf.unit(unit)?, name) {
                                dirname = Some(PathBuf::from(name.to_string()?));
                            }
                        }
                        // Only record a path when both components resolved.
                        if let (Some(dirname), Some(basename)) = (dirname, basename) {
                            inputs.push(dirname.join(basename));
                        }
                    }
                }
            }
            // Hash each referenced path that still exists as a regular file;
            // missing or non-file paths are filtered out (Ok(None)) rather
            // than treated as errors, since sources may have moved or gone.
            let references = inputs
                .into_iter()
                .map(|filename| -> anyhow::Result<_> {
                    // TODO: this needs to try suffixes of the filename against the filepath table to see if it
                    // was moved between compilation and ingestion. but how...
                    let metadata = match fs::metadata(&filename) {
                        Ok(m) => m,
                        Err(_) => return Ok(None),
                    };
                    if !metadata.is_file() {
                        return Ok(None);
                    }
                    let mut fp = fs::File::open(&filename)?;
                    let mut h = Sha256::new();
                    log::debug!("Hashing {}", filename.to_string_lossy());
                    io::copy(&mut fp, &mut h)?;
                    let result = Ok(Some((filename, h.finalize().into())));
                    result
                })
                .collect::<Result<Vec<_>, _>>()?
                .into_iter()
                .filter_map(|x| x)
                .collect();
            (FileFormat::ELF, references)
        }
        // Unrecognized (or shorter-than-4-byte) content.
        _ => (FileFormat::Other, BTreeSet::new()),
    })
}