fix gallerybox selector, make robust to missing titles
This commit is contained in:
parent
afc0ba3d13
commit
fa1c07e059
29
src/wiki.rs
29
src/wiki.rs
|
@ -8,17 +8,17 @@ use tracing::{event, instrument, Level};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
// Filter a filename string for filenames
|
// Filter a filename string for filenames
|
||||||
fn filter_filename(filename: &str) -> bool {
|
fn extract_filename(filename: &str) -> Option<&str> {
|
||||||
let re = Regex::new("ISO.7010.[EWMPF][0-9]{3}.*").unwrap();
|
let re = Regex::new("(File:ISO.7010.[EWMPF][0-9]{3}.*)").unwrap();
|
||||||
re.is_match(filename)
|
Some(re.find(filename)?.as_str())
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scrape all images from the wikipedia page, returning a vec of title, filename pairs
|
// Scrape all images from the wikipedia page, returning a vec of title, filename pairs
|
||||||
pub fn scrape_web() -> StdError<Vec<(String, String)>> {
|
pub fn scrape_web() -> StdError<Vec<(String, String)>> {
|
||||||
event!(Level::INFO, "Scraping the wikipedia page for things");
|
event!(Level::INFO, "Scraping the wikipedia page for things");
|
||||||
// Parse CSS selectors to scrape elements
|
// Parse CSS selectors to scrape elements
|
||||||
let gallerybox_sel = scraper::Selector::parse(".mw-body-content li.gallerybox")
|
let gallerybox_sel =
|
||||||
.map_err(|e| format!("{:?}", e))?;
|
scraper::Selector::parse("li.gallerybox").map_err(|e| format!("{:?}", e))?;
|
||||||
let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?;
|
let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?;
|
||||||
let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?;
|
let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?;
|
||||||
|
|
||||||
|
@ -32,32 +32,23 @@ pub fn scrape_web() -> StdError<Vec<(String, String)>> {
|
||||||
let page = scraper::Html::parse_document(txt.as_str());
|
let page = scraper::Html::parse_document(txt.as_str());
|
||||||
return Ok(page
|
return Ok(page
|
||||||
.select(&gallerybox_sel)
|
.select(&gallerybox_sel)
|
||||||
.map(|a| {
|
.filter_map(|a| {
|
||||||
let link = a
|
let link = a
|
||||||
.select(&link_sel)
|
.select(&link_sel)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.value()
|
.value()
|
||||||
.attr("href")
|
.attr("href")
|
||||||
.unwrap()
|
.unwrap();
|
||||||
.to_owned();
|
|
||||||
let title = a
|
let title = a
|
||||||
.select(&title_sel)
|
.select(&title_sel)
|
||||||
.next()
|
.next()?
|
||||||
.unwrap()
|
|
||||||
.text()
|
.text()
|
||||||
.collect::<String>()
|
.collect::<String>()
|
||||||
.trim()
|
.trim()
|
||||||
.to_owned();
|
.to_owned();
|
||||||
(title, link)
|
// Extract the filename, filtering out any files that don't look interesting
|
||||||
})
|
Some((title, extract_filename(link)?.to_owned()))
|
||||||
// Filter for filenames that look like ISO diagrams
|
|
||||||
.filter(|tup| filter_filename(&tup.1))
|
|
||||||
// Extract the file name only (e.g. `File:ISO_7010_X000.svg`)
|
|
||||||
.filter_map(|(title, link)| {
|
|
||||||
link.split('/')
|
|
||||||
.next_back()
|
|
||||||
.map(|end| (title, end.to_owned()))
|
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>());
|
.collect::<Vec<_>>());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue