add regex scraper
This commit is contained in:
parent
51cdf392c6
commit
f56e0ec207
|
@ -7,10 +7,12 @@ edition = "2021"
|
|||
default = []
|
||||
|
||||
inventory = ["dep:inventory"]
|
||||
scraping = ["dep:scraper"]
|
||||
scraping = ["dep:scraper", "dep:lazy_static", "dep:regex"]
|
||||
|
||||
[dependencies]
|
||||
inventory = { version = "0.3", optional = true }
|
||||
lazy_static = { version = "1.4.0", optional = true }
|
||||
regex = { version = "1.6.0", optional = true }
|
||||
scraper = { version = "0.13.0", optional = true }
|
||||
tracing = "0.1.35"
|
||||
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
use regex::Regex;
|
||||
use scraper::{ElementRef, Html};
|
||||
|
||||
/// Gets all classes from an html
|
||||
/// Gets all classes from an HTML document, using the scraper crate
|
||||
///
|
||||
/// If you have a templated html source and it's failing to get some classes, you might want
|
||||
/// to use `get_classes_regex` instead.
|
||||
///
|
||||
/// ```
|
||||
/// # use zephyr::{*, scraping::*};
|
||||
|
@ -31,12 +35,32 @@ pub fn get_classes(html: &str) -> Vec<String> {
|
|||
classes
|
||||
}
|
||||
|
||||
lazy_static::lazy_static! {
    /// Matches a `class=` or `className=` attribute and captures its quoted value in group 1.
    ///
    /// Between the `=` and the captured value, the pattern optionally skips an opening quote,
    /// a run of non-word characters, optional whitespace and a `name(` call prefix. The captured
    /// value itself is one or more characters that are neither `'` nor `"`, wrapped in quotes.
    static ref STYLE_REGEX: Regex =
        Regex::new(r#"(?:class|className)=(?:["']\W+\s*(?:\w+)\()?["']([^'"]+)['"]"#)
            // The pattern is a compile-time constant, so a failure here is a programming error.
            .expect("STYLE_REGEX is a constant pattern and must compile");
}
|
||||
|
||||
/// Gets all classes from an html, using a regex
|
||||
///
|
||||
/// It's less accurate than `get_classes`, meaning it will find more false positives.
|
||||
/// Use this if you have a templated html source and don't mind generating more classes than there actually are.
|
||||
pub fn get_classes_regex(html: &str) -> Vec<&str> {
|
||||
let mut classes = vec![];
|
||||
for capture in STYLE_REGEX.captures_iter(html) {
|
||||
if let Some(group) = capture.get(1) {
|
||||
classes.push(group.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
classes
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse() {
|
||||
fn test_parse_scraper() {
|
||||
let c = get_classes(
|
||||
"<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
|
||||
<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>",
|
||||
|
@ -44,4 +68,14 @@ mod tests {
|
|||
|
||||
assert_eq!(c, vec!["hey hello", "hiii", "hey hello", "hiii",]);
|
||||
}
|
||||
|
||||
/// The regex-based extractor returns each class attribute value, in document order.
#[test]
fn test_parse_regex() {
    // Two identical lines, each carrying two class attributes.
    let html = "<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
            <h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>";
    let expected = vec!["hey hello", "hiii", "hey hello", "hiii"];

    let found = get_classes_regex(html);

    assert_eq!(found, expected);
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue