diff --git a/src/wiki.rs b/src/wiki.rs index 46c6603..a2b1b59 100644 --- a/src/wiki.rs +++ b/src/wiki.rs @@ -8,17 +8,17 @@ use tracing::{event, instrument, Level}; use url::Url; // Filter a filename string for filenames -fn filter_filename(filename: &str) -> bool { - let re = Regex::new("ISO.7010.[EWMPF][0-9]{3}.*").unwrap(); - re.is_match(filename) +fn extract_filename(filename: &str) -> Option<&str> { + let re = Regex::new("(File:ISO.7010.[EWMPF][0-9]{3}.*)").unwrap(); + Some(re.find(filename)?.as_str()) } // Scrape all images from the wikipedia page, returning a vec of title, filename pairs pub fn scrape_web() -> StdError> { event!(Level::INFO, "Scraping the wikipedia page for things"); // Parse CSS selectors to scrape elements - let gallerybox_sel = scraper::Selector::parse(".mw-body-content li.gallerybox") - .map_err(|e| format!("{:?}", e))?; + let gallerybox_sel = + scraper::Selector::parse("li.gallerybox").map_err(|e| format!("{:?}", e))?; let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?; let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?; @@ -32,32 +32,23 @@ pub fn scrape_web() -> StdError> { let page = scraper::Html::parse_document(txt.as_str()); return Ok(page .select(&gallerybox_sel) - .map(|a| { + .filter_map(|a| { let link = a .select(&link_sel) .next() .unwrap() .value() .attr("href") - .unwrap() - .to_owned(); + .unwrap(); let title = a .select(&title_sel) - .next() - .unwrap() + .next()? .text() .collect::() .trim() .to_owned(); - (title, link) - }) - // Filter for filenames that look like ISO diagrams - .filter(|tup| filter_filename(&tup.1)) - // Extract the file name only (.e.g `File:ISO_7010_X000.svg`) - .filter_map(|(title, link)| { - link.split('/') - .next_back() - .map(|end| (title, end.to_owned())) + // Extract the filename, filtering out any files that don't look interesting + Some((title, extract_filename(link)?.to_owned())) }) .collect::>()); } @@ -113,7 +104,7 @@ impl Display for FileMeta { f, "FileMeta{{url: {}, name: {}, html_url: {}, author: {}, attribution_required: {}, license_short_name: {}, license_url: {}}}", self.url, self.name, self.html_url, self.author, self.attribution_required, self.license_short_name, - self.license_url.clone().map_or("None".to_owned(), |u| u.to_string()) // Ew. + self.license_url.clone().map_or("None".to_owned(), |u| u.to_string()) // Ew. ) } }