fix gallerybox selector, make robust to missing titles

2023-03-31 00:58:51 +01:00 · 2023-03-31 00:58:51 +01:00 · fa1c07e059
commit fa1c07e059
parent afc0ba3d13
1 changed files with 11 additions and 20 deletions
--- a/src/wiki.rs
+++ b/src/wiki.rs
@ -8,17 +8,17 @@ use tracing::{event, instrument, Level};
 use url::Url;
 // Match a filename string against the ISO 7010 filename pattern
-fn filter_filename(filename: &str) -> bool {
+fn extract_filename(filename: &str) -> Option<&str> {
-    let re = Regex::new("ISO.7010.[EWMPF][0-9]{3}.*").unwrap();
+    let re = Regex::new("(File:ISO.7010.[EWMPF][0-9]{3}.*)").unwrap();
-    re.is_match(filename)
+    Some(re.find(filename)?.as_str())
 }
 // Scrape all images from the wikipedia page, returning a vec of title, filename pairs
 pub fn scrape_web() -> StdError<Vec<(String, String)>> {
    event!(Level::INFO, "Scraping the wikipedia page for things");
    // Parse CSS selectors to scrape elements
-    let gallerybox_sel = scraper::Selector::parse(".mw-body-content li.gallerybox")
+    let gallerybox_sel =
-        .map_err(|e| format!("{:?}", e))?;
+        scraper::Selector::parse("li.gallerybox").map_err(|e| format!("{:?}", e))?;
    let link_sel = scraper::Selector::parse("a.image").map_err(|e| format!("{:?}", e))?;
    let title_sel = scraper::Selector::parse(".gallerytext p").map_err(|e| format!("{:?}", e))?;
@ -32,32 +32,23 @@ pub fn scrape_web() -> StdError<Vec<(String, String)>> {
    let page = scraper::Html::parse_document(txt.as_str());
    return Ok(page
        .select(&gallerybox_sel)
-        .map(|a| {
+        .filter_map(|a| {
            let link = a
                .select(&link_sel)
                .next()
                .unwrap()
                .value()
                .attr("href")
-                .unwrap()
+                .unwrap();
                .to_owned();
            let title = a
                .select(&title_sel)
-                .next()
+                .next()?
                .unwrap()
                .text()
                .collect::<String>()
                .trim()
                .to_owned();
-            (title, link)
+            // Extract the filename, filtering out any files that don't look interesting
-        })
+            Some((title, extract_filename(link)?.to_owned()))
        // Filter for filenames that look like ISO diagrams
        .filter(|tup| filter_filename(&tup.1))
        // Extract the file name only (e.g. `File:ISO_7010_X000.svg`)
        .filter_map(|(title, link)| {
            link.split('/')
                .next_back()
                .map(|end| (title, end.to_owned()))
        })
        .collect::<Vec<_>>());
 }