From f56e0ec20799ddb7948dab988153d9c7796cc974 Mon Sep 17 00:00:00 2001 From: annieversary Date: Fri, 9 Sep 2022 15:05:35 +0200 Subject: [PATCH] add regex scraper --- Cargo.toml | 4 +++- src/scraping.rs | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5dee8d3..453caa8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,10 +7,12 @@ edition = "2021" default = [] inventory = ["dep:inventory"] -scraping = ["dep:scraper"] +scraping = ["dep:scraper", "dep:lazy_static", "dep:regex"] [dependencies] inventory = { version = "0.3", optional = true } +lazy_static = { version = "1.4.0", optional = true } +regex = { version = "1.6.0", optional = true } scraper = { version = "0.13.0", optional = true } tracing = "0.1.35" diff --git a/src/scraping.rs b/src/scraping.rs index 1269770..13d356e 100644 --- a/src/scraping.rs +++ b/src/scraping.rs @@ -1,6 +1,10 @@ +use regex::Regex; use scraper::{ElementRef, Html}; -/// Gets all classes from an html +/// Gets all classes from an HTML document, using the scraper crate +/// +/// If you have a templated HTML source and it's failing to get some classes, you might want +/// to use `get_classes_regex` instead. /// /// ``` /// # use zephyr::{*, scraping::*}; @@ -31,12 +35,32 @@ pub fn get_classes(html: &str) -> Vec { classes } +lazy_static::lazy_static! { + static ref STYLE_REGEX: Regex = + Regex::new(r#"(?:class|className)=(?:["']\W+\s*(?:\w+)\()?["']([^'"]+)['"]"#).unwrap(); +} + +/// Gets all classes from an HTML document, using a regex +/// +/// It's less accurate than `get_classes`, meaning it will find more false positives. +/// Use this if you have a templated HTML source and don't mind generating more classes than there actually are. 
+pub fn get_classes_regex(html: &str) -> Vec<&str> {
+    // Group 1 of `STYLE_REGEX` is the quoted attribute value. It is returned
+    // verbatim and borrowed from `html`, so an attribute listing several
+    // classes yields a single space-separated entry.
+    STYLE_REGEX
+        .captures_iter(html)
+        .filter_map(|found| found.get(1))
+        .map(|value| value.as_str())
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn test_parse() {
+    fn test_parse_scraper() {
         let c = get_classes(
             "

Hello, world!

Hello, world!

", @@ -44,4 +68,14 @@ mod tests { assert_eq!(c, vec!["hey hello", "hiii", "hey hello", "hiii",]); } + + #[test] + fn test_parse_regex() { + let c = get_classes_regex( + "

Hello, world!

+

Hello, world!

", + ); + + assert_eq!(c, vec!["hey hello", "hiii", "hey hello", "hiii",]); + } }