add regex scraper

This commit is contained in:
annieversary 2022-09-09 15:05:35 +02:00
parent 51cdf392c6
commit f56e0ec207
2 changed files with 39 additions and 3 deletions

View File

@ -7,10 +7,12 @@ edition = "2021"
default = []
inventory = ["dep:inventory"]
scraping = ["dep:scraper"]
scraping = ["dep:scraper", "dep:lazy_static", "dep:regex"]
[dependencies]
inventory = { version = "0.3", optional = true }
lazy_static = { version = "1.4.0", optional = true }
regex = { version = "1.6.0", optional = true }
scraper = { version = "0.13.0", optional = true }
tracing = "0.1.35"

View File

@ -1,6 +1,10 @@
use regex::Regex;
use scraper::{ElementRef, Html};
/// Gets all classes from an html
/// Gets all classes from an HTML document, using the scraper crate
///
/// If you have a templated html source and it's failing to get some classes, you might want
/// to use `get_classes_regex` instead.
///
/// ```
/// # use zephyr::{*, scraping::*};
@ -31,12 +35,32 @@ pub fn get_classes(html: &str) -> Vec<String> {
classes
}
// Shared, lazily-initialised regex used to pull class attribute values out of raw markup.
//
// Pattern, piece by piece:
// - `(?:class|className)=` matches the attribute name followed by `=`.
// - `(?:["']\W+\s*(?:\w+)\()?` optionally matches a quote, one or more non-word
//   characters, optional whitespace, and a word followed by `(`.
//   NOTE(review): presumably this skips a template expression/function call that wraps
//   the value — confirm against the templates this is meant to support.
// - `["']([^'"]+)['"]` matches a quoted run of non-quote characters; capture group 1 is
//   the text between the quotes.
//
// The opening and closing quote characters are not required to be the same.
// The `unwrap` can only panic if the literal pattern itself fails to compile.
lazy_static::lazy_static! {
static ref STYLE_REGEX: Regex =
Regex::new(r#"(?:class|className)=(?:["']\W+\s*(?:\w+)\()?["']([^'"]+)['"]"#).unwrap();
}
/// Collects class attribute values from `html` by regex matching instead of HTML parsing.
///
/// Every match of `STYLE_REGEX` contributes its first capture group, i.e. the raw text
/// found between the quotes of a `class=`/`className=` attribute. Each returned entry is
/// the whole attribute value (it may hold several space-separated class names) and is a
/// slice borrowed from `html`, in the order the matches occur.
///
/// This is less accurate than `get_classes`: it will report more false positives.
/// Prefer it for templated sources where generating a few extra classes is acceptable.
pub fn get_classes_regex(html: &str) -> Vec<&str> {
    STYLE_REGEX
        .captures_iter(html)
        // Group 1 holds the quoted attribute value; matches without it are skipped.
        .filter_map(|found| found.get(1))
        .map(|value| value.as_str())
        .collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse() {
fn test_parse_scraper() {
let c = get_classes(
"<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>",
@ -44,4 +68,14 @@ mod tests {
assert_eq!(c, vec!["hey hello", "hiii", "hey hello", "hiii",]);
}
#[test]
fn test_parse_regex() {
    // Two identical lines of markup: each attribute value should be reported once per
    // occurrence, in document order, without being split into individual class names.
    let markup = "<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>";
    let expected = vec!["hey hello", "hiii", "hey hello", "hiii"];

    assert_eq!(get_classes_regex(markup), expected);
}
}