fix gallerybox selector, make robust to missing titles

This commit is contained in:
Sam W 2023-03-31 00:58:51 +01:00
parent afc0ba3d13
commit fa1c07e059
1 changed file with 11 additions and 20 deletions

View File

@ -8,17 +8,17 @@ use tracing::{event, instrument, Level};
use url::Url; use url::Url;
// Filter a filename string for filenames // Filter a filename string for filenames
fn filter_filename(filename: &str) -> bool { fn extract_filename(filename: &str) -> Option<&str> {
let re = Regex::new("ISO.7010.[EWMPF][0-9]{3}.*").unwrap(); let re = Regex::new("(File:ISO.7010.[EWMPF][0-9]{3}.*)").unwrap();
re.is_match(filename) Some(re.find(filename)?.as_str())
} }
// Scrape all images from the wikipedia page, returning a vec of title, filename pairs // Scrape all images from the wikipedia page, returning a vec of title, filename pairs
pub fn scrape_web() -> StdError<Vec<(String, String)>> { pub fn scrape_web() -> StdError<Vec<(String, String)>> {
event!(Level::INFO, "Scraping the wikipedia page for things"); event!(Level::INFO, "Scraping the wikipedia page for things");
// Parse CSS selectors to scrape elements // Parse CSS selectors to scrape elements
let gallerybox_sel = scraper::Selector::parse(".mw-body-content li.gallerybox") let gallerybox_sel =
.map_err(|e| format!("{:?}", e))?; scraper::Selector::parse("li.gallerybox").map_err(|e| format!("{:?}", e))?;
let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?; let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?;
let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?; let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?;
@ -32,32 +32,23 @@ pub fn scrape_web() -> StdError<Vec<(String, String)>> {
let page = scraper::Html::parse_document(txt.as_str()); let page = scraper::Html::parse_document(txt.as_str());
return Ok(page return Ok(page
.select(&gallerybox_sel) .select(&gallerybox_sel)
.map(|a| { .filter_map(|a| {
let link = a let link = a
.select(&link_sel) .select(&link_sel)
.next() .next()
.unwrap() .unwrap()
.value() .value()
.attr("href") .attr("href")
.unwrap() .unwrap();
.to_owned();
let title = a let title = a
.select(&title_sel) .select(&title_sel)
.next() .next()?
.unwrap()
.text() .text()
.collect::<String>() .collect::<String>()
.trim() .trim()
.to_owned(); .to_owned();
(title, link) // Extract the filename, filtering out any files that don't look interesting
}) Some((title, extract_filename(link)?.to_owned()))
// Filter for filenames that look like ISO diagrams
.filter(|tup| filter_filename(&tup.1))
// Extract the file name only (.e.g `File:ISO_7010_X000.svg`)
.filter_map(|(title, link)| {
link.split('/')
.next_back()
.map(|end| (title, end.to_owned()))
}) })
.collect::<Vec<_>>()); .collect::<Vec<_>>());
} }