add regex scraper
This commit is contained in:
parent
51cdf392c6
commit
f56e0ec207
|
@ -7,10 +7,12 @@ edition = "2021"
|
||||||
default = []
|
default = []
|
||||||
|
|
||||||
inventory = ["dep:inventory"]
|
inventory = ["dep:inventory"]
|
||||||
scraping = ["dep:scraper"]
|
scraping = ["dep:scraper", "dep:lazy_static", "dep:regex"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
inventory = { version = "0.3", optional = true }
|
inventory = { version = "0.3", optional = true }
|
||||||
|
lazy_static = { version = "1.4.0", optional = true }
|
||||||
|
regex = { version = "1.6.0", optional = true }
|
||||||
scraper = { version = "0.13.0", optional = true }
|
scraper = { version = "0.13.0", optional = true }
|
||||||
tracing = "0.1.35"
|
tracing = "0.1.35"
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,10 @@
|
||||||
|
use regex::Regex;
|
||||||
use scraper::{ElementRef, Html};
|
use scraper::{ElementRef, Html};
|
||||||
|
|
||||||
/// Gets all classes from an html
|
/// Gets all classes from an HTML document, using the scraper crate
|
||||||
|
///
|
||||||
|
/// If you have a templated html source and it's failing to get some classes, you might want
|
||||||
|
/// to use `get_classes_regex` instead.
|
||||||
///
|
///
|
||||||
/// ```
|
/// ```
|
||||||
/// # use zephyr::{*, scraping::*};
|
/// # use zephyr::{*, scraping::*};
|
||||||
|
@ -31,12 +35,32 @@ pub fn get_classes(html: &str) -> Vec<String> {
|
||||||
classes
|
classes
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lazy_static::lazy_static! {
    /// Pattern used by `get_classes_regex` to locate class attribute values.
    ///
    /// Reading the pattern left to right:
    /// - `(?:class|className)=` — the attribute name followed by `=`;
    /// - `(?:["']\W+\s*(?:\w+)\()?` — an optional prefix made of a quote, one or more
    ///   non-word characters, optional whitespace, a word and an opening parenthesis
    ///   (presumably a templating helper call wrapped around the value — TODO confirm
    ///   which template syntaxes this is meant to cover);
    /// - `["']([^'"]+)['"]` — a quote, the captured value (group 1, any run of characters
    ///   that are not quotes) and a closing quote.
    ///
    /// Panics on first use if the pattern fails to compile.
    static ref STYLE_REGEX: Regex = {
        let pattern = r#"(?:class|className)=(?:["']\W+\s*(?:\w+)\()?["']([^'"]+)['"]"#;
        Regex::new(pattern).unwrap()
    };
}
|
||||||
|
|
||||||
|
/// Gets all classes from an html, using a regex
|
||||||
|
///
|
||||||
|
/// It's less accurate than `get_classes`, meaning it will find more false positives.
|
||||||
|
/// Use this if you have a templated html source and don't mind generating more classes than there actually are.
|
||||||
|
pub fn get_classes_regex(html: &str) -> Vec<&str> {
|
||||||
|
let mut classes = vec![];
|
||||||
|
for capture in STYLE_REGEX.captures_iter(html) {
|
||||||
|
if let Some(group) = capture.get(1) {
|
||||||
|
classes.push(group.as_str())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
classes
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parse() {
|
fn test_parse_scraper() {
|
||||||
let c = get_classes(
|
let c = get_classes(
|
||||||
"<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
|
"<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
|
||||||
<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>",
|
<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>",
|
||||||
|
@ -44,4 +68,14 @@ mod tests {
|
||||||
|
|
||||||
assert_eq!(c, vec!["hey hello", "hiii", "hey hello", "hiii",]);
|
assert_eq!(c, vec!["hey hello", "hiii", "hey hello", "hiii",]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_parse_regex() {
    // Two identical lines of markup: every attribute value must be reported once per
    // occurrence, in document order, with multi-class values left unsplit.
    let markup = "<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>
<h1 class=\"hey hello\">Hello, <i class=\"hiii\">world!</i></h1>";
    let expected = vec!["hey hello", "hiii", "hey hello", "hiii"];

    let found = get_classes_regex(markup);

    assert_eq!(found, expected);
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue