fix gallerybox selector, make robust to missing titles
This commit is contained in:
parent
afc0ba3d13
commit
fa1c07e059
29
src/wiki.rs
29
src/wiki.rs
|
@ -8,17 +8,17 @@ use tracing::{event, instrument, Level};
|
|||
use url::Url;
|
||||
|
||||
// Match strings that look like ISO 7010 diagram filenames
|
||||
fn filter_filename(filename: &str) -> bool {
|
||||
let re = Regex::new("ISO.7010.[EWMPF][0-9]{3}.*").unwrap();
|
||||
re.is_match(filename)
|
||||
fn extract_filename(filename: &str) -> Option<&str> {
|
||||
let re = Regex::new("(File:ISO.7010.[EWMPF][0-9]{3}.*)").unwrap();
|
||||
Some(re.find(filename)?.as_str())
|
||||
}
|
||||
|
||||
// Scrape all images from the Wikipedia page, returning a vec of (title, filename) pairs
|
||||
pub fn scrape_web() -> StdError<Vec<(String, String)>> {
|
||||
event!(Level::INFO, "Scraping the wikipedia page for things");
|
||||
// Parse CSS selectors to scrape elements
|
||||
let gallerybox_sel = scraper::Selector::parse(".mw-body-content li.gallerybox")
|
||||
.map_err(|e| format!("{:?}", e))?;
|
||||
let gallerybox_sel =
|
||||
scraper::Selector::parse("li.gallerybox").map_err(|e| format!("{:?}", e))?;
|
||||
let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?;
|
||||
let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?;
|
||||
|
||||
|
@ -32,32 +32,23 @@ pub fn scrape_web() -> StdError<Vec<(String, String)>> {
|
|||
let page = scraper::Html::parse_document(txt.as_str());
|
||||
return Ok(page
|
||||
.select(&gallerybox_sel)
|
||||
.map(|a| {
|
||||
.filter_map(|a| {
|
||||
let link = a
|
||||
.select(&link_sel)
|
||||
.next()
|
||||
.unwrap()
|
||||
.value()
|
||||
.attr("href")
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
.unwrap();
|
||||
let title = a
|
||||
.select(&title_sel)
|
||||
.next()
|
||||
.unwrap()
|
||||
.next()?
|
||||
.text()
|
||||
.collect::<String>()
|
||||
.trim()
|
||||
.to_owned();
|
||||
(title, link)
|
||||
})
|
||||
// Filter for filenames that look like ISO diagrams
|
||||
.filter(|tup| filter_filename(&tup.1))
|
||||
// Extract the file name only (e.g. `File:ISO_7010_X000.svg`)
|
||||
.filter_map(|(title, link)| {
|
||||
link.split('/')
|
||||
.next_back()
|
||||
.map(|end| (title, end.to_owned()))
|
||||
// Extract the filename, filtering out any files that don't look interesting
|
||||
Some((title, extract_filename(link)?.to_owned()))
|
||||
})
|
||||
.collect::<Vec<_>>());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue