fix gallerybox selector, make robust to missing titles

This commit is contained in:
Sam W 2023-03-31 00:58:51 +01:00
parent afc0ba3d13
commit fa1c07e059
1 changed file with 11 additions and 20 deletions

View File

@ -8,17 +8,17 @@ use tracing::{event, instrument, Level};
use url::Url;
// Match ISO 7010 file names (e.g. `File:ISO_7010_E001.svg`) inside a link string
fn filter_filename(filename: &str) -> bool {
let re = Regex::new("ISO.7010.[EWMPF][0-9]{3}.*").unwrap();
re.is_match(filename)
/// Pulls the ISO 7010 file name out of `filename`.
///
/// Returns the text matched by the pattern: it starts at `File:ISO` and, because of
/// the trailing `.*`, runs to the end of the line. Returns `None` when the input
/// contains no match.
///
/// # Panics
///
/// Panics if the hard-coded pattern fails to compile.
fn extract_filename(filename: &str) -> Option<&str> {
    // NOTE: the pattern is recompiled on every call.
    let pattern = Regex::new("(File:ISO.7010.[EWMPF][0-9]{3}.*)").unwrap();
    pattern.find(filename).map(|found| found.as_str())
}
// Scrape all images from the Wikipedia page, returning a vec of (title, filename) pairs
pub fn scrape_web() -> StdError<Vec<(String, String)>> {
event!(Level::INFO, "Scraping the wikipedia page for things");
// Parse CSS selectors to scrape elements
let gallerybox_sel = scraper::Selector::parse(".mw-body-content li.gallerybox")
.map_err(|e| format!("{:?}", e))?;
let gallerybox_sel =
scraper::Selector::parse("li.gallerybox").map_err(|e| format!("{:?}", e))?;
let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?;
let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?;
@ -32,32 +32,23 @@ pub fn scrape_web() -> StdError<Vec<(String, String)>> {
let page = scraper::Html::parse_document(txt.as_str());
return Ok(page
.select(&gallerybox_sel)
.map(|a| {
.filter_map(|a| {
let link = a
.select(&link_sel)
.next()
.unwrap()
.value()
.attr("href")
.unwrap()
.to_owned();
.unwrap();
let title = a
.select(&title_sel)
.next()
.unwrap()
.next()?
.text()
.collect::<String>()
.trim()
.to_owned();
(title, link)
})
// Filter for filenames that look like ISO diagrams
.filter(|tup| filter_filename(&tup.1))
// Extract the file name only (e.g. `File:ISO_7010_X000.svg`)
.filter_map(|(title, link)| {
link.split('/')
.next_back()
.map(|end| (title, end.to_owned()))
// Extract the filename, filtering out any files that don't look interesting
Some((title, extract_filename(link)?.to_owned()))
})
.collect::<Vec<_>>());
}
@ -113,7 +104,7 @@ impl Display for FileMeta {
f,
"FileMeta{{url: {}, name: {}, html_url: {}, author: {}, attribution_required: {}, license_short_name: {}, license_url: {}}}",
self.url, self.name, self.html_url, self.author, self.attribution_required, self.license_short_name,
self.license_url.clone().map_or("None".to_owned(), |u| u.to_string()) // Ew.
self.license_url.clone().map_or("None".to_owned(), |u| u.to_string()) // Ew.
)
}
}