From 4b94442999278f92d4c7d4484dac97383ebe8971 Mon Sep 17 00:00:00 2001 From: Audrey Dutcher Date: Mon, 15 Apr 2024 13:07:19 -0700 Subject: [PATCH] Start adding filestore --- Cargo.lock | 72 +++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/filestore.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 3 +- src/tracer.rs | 7 +++- 5 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 src/filestore.rs diff --git a/Cargo.lock b/Cargo.lock index 07a95ae..d57aa70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,6 +77,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -135,6 +144,35 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "cpufeatures" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "env_filter" version = "0.1.0" @@ -158,6 +196,16 @@ dependencies = [ "log", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "heck" version = "0.5.0" @@ -228,6 +276,7 @@ dependencies = [ "nix", "serde", "serde_json", + "sha2", ] [[package]] @@ -314,6 +363,17 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "strsim" version = "0.11.1" @@ -331,6 +391,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unicode-ident" version = "1.0.12" @@ -343,6 +409,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index d54387a..f8e4092 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,4 @@ env_logger = "0.11" serde = { version = "1", features = ["derive"] } serde_json = "1.0" clap = { version = "4.5.4", features = ["derive"] } +sha2 = { version = "0.10" } diff --git a/src/filestore.rs b/src/filestore.rs new file mode 100644 index 0000000..efd0e1a --- /dev/null +++ b/src/filestore.rs @@ -0,0 +1,87 @@ +use std::{path::PathBuf, collections::{HashMap, BTreeMap, HashSet}, fs, io}; + +use sha2::{Sha256, Digest, digest::{generic_array::GenericArray, typenum::U32}}; + +type Sha256Hash = GenericArray; + +pub struct FileStore { + pub files: Vec, + pub input_mapping: HashMap, + pub hashes: BTreeMap, +} + +pub struct FileStoreEntry { + pub index: usize, + pub hash: Sha256Hash, + pub input_names: HashSet, + pub output_names: HashSet, +} + +impl FileStore { + pub fn new(inputs: Vec) -> anyhow::Result { + let mut result = Self { + files: vec![], + input_mapping: HashMap::new(), + hashes: BTreeMap::new(), + }; + + for input in inputs { + let mut fp = fs::File::open(&input)?; + + let mut h = Sha256::new(); + io::copy(&mut fp, &mut h)?; + let hash = h.finalize(); + + let index = match result.hashes.entry(hash) { + std::collections::btree_map::Entry::Vacant(e) => { + let index = result.files.len(); + result.files.push(FileStoreEntry { + index, + hash, + input_names: HashSet::new(), + output_names: [input.clone()].into(), + }); + e.insert(index); + index + } + std::collections::btree_map::Entry::Occupied(e) => { + result.files.get_mut(*e.get()).unwrap().output_names.insert(input.clone()); + *e.get() + } + }; + + result.input_mapping.insert(input, index); + } + + Ok(result) + } + + pub fn ingest_output_local(&mut self, filename: PathBuf) -> anyhow::Result<()> { + let fp = fs::File::open(&filename)?; + self.ingest_output(filename, fp) + } + + pub fn ingest_output(&mut self, filename: PathBuf, mut content: impl io::Read) -> anyhow::Result<()> { + let mut h = Sha256::new(); + io::copy(&mut content, &mut h)?; + let hash = h.finalize(); + + match self.hashes.entry(hash) { + std::collections::btree_map::Entry::Vacant(e) => { + let index = self.files.len(); + self.files.push(FileStoreEntry { + index, + hash, + input_names: HashSet::new(), + output_names: [filename].into(), + }); + e.insert(index); + } + std::collections::btree_map::Entry::Occupied(e) => { + self.files.get_mut(*e.get()).unwrap().output_names.insert(filename); + } + } + + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index f890ef6..6cef3ba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,5 @@ mod tracer; +mod filestore; use std::path::PathBuf; @@ -37,7 +38,7 @@ fn main() { } else { Box::new(std::io::stdout()) }; - let mut t = tracer::Tracer::new().unwrap(); + let mut t = tracer::Tracer::new(input).unwrap(); t.start_root_process(cmd).unwrap(); if output.is_none() { diff --git a/src/tracer.rs b/src/tracer.rs index 850c51c..ab3c711 100644 --- a/src/tracer.rs +++ b/src/tracer.rs @@ -24,6 +24,8 @@ use nix::{ use serde::{Deserialize, Serialize}; +use crate::filestore::FileStore; + #[derive(Copy, Clone, Serialize, Deserialize, Eq, PartialEq, Debug, Hash)] pub struct Pid(i32); @@ -413,6 +415,7 @@ pub struct Tracer { pub store: ProcessStateStore, pub log: Vec, pub start_time: Instant, + pub files: FileStore, } fn ptrace_syscall(pid: Pid, sig: Option) -> Result<(), Errno> { @@ -438,11 +441,13 @@ impl Tracer { self.log(Identifier { pid, machine: 0 }, event); } - pub fn new() -> anyhow::Result { + pub fn new(input: Vec) -> anyhow::Result { + let files = FileStore::new(input)?; Ok(Self { store: ProcessStateStore::new(), log: vec![], start_time: Instant::now(), + files, }) }