math reports and clippy lints

KWARC · Mar 25, 2024 · a74f2f1 · a74f2f1
1 parent 283be4a
commit a74f2f1
Show file tree

Hide file tree

Showing 11 changed files with 212 additions and 25 deletions.
diff --git a/examples/citation_ngrams.rs b/examples/citation_ngrams.rs
@@ -34,7 +34,7 @@ fn main() -> Result<(), Box<dyn Error>> {
 
   let mut input_args = env::args();
   let _ = input_args.next(); // skip process name
-  while let Some(file_path) = input_args.next() {
+  for file_path in input_args {
     eprintln!("-- opening {:?}", file_path);
     let file = File::open(file_path)?;
     let reader = BufReader::new(file);

diff --git a/examples/corpus_math_count.rs b/examples/corpus_math_count.rs
@@ -0,0 +1,137 @@
+//! Count the total number of <math> elements,
+//! and their Content MathML annotations
+//! in a directory of HTML documents
+//!
+//! example use for arXMLiv:
+//!    `cargo run --release --example corpus_math_count /data/datasets/dataset-arXMLiv-2022`
+//!
+//! This script extracts the raw data from a "blind" descent over each `<math>` element, and may
+//! require additional cutoffs and post-processing over uncurated corpora.
+//! You can find an example of post-processing done for the data of arXMLiv here:
+//! <https://gist.github.com/dginev/e50a632d31be05bb87d64cc1800f6fd4#file-apply_cutoffs-pl>
+#![allow(clippy::unused_io_amount)]
+
+use std::collections::HashMap;
+use std::env;
+use std::fs::File;
+use std::io::{BufWriter, Error};
+use std::time::Instant;
+
+use libxml::xpath::Context;
+use llamapun::parallel_data::Corpus;
+
+static BUFFER_CAPACITY: usize = 10_485_760; // 10 MiB (10 * 1024 * 1024 bytes), capacity of each CSV BufWriter below
+
+pub fn main() -> Result<(), Error> { // counts <math> and Content MathML annotation elements per document, then writes two CSV reports
+  let start = Instant::now();
+  // Positional arguments: corpus path, <math> CSV path, annotation CSV path (each with a default),
+  let mut input_args = env::args();
+  let _ = input_args.next(); // skip process name
+  let corpus_path = match input_args.next() {
+    Some(path) => path,
+    None => "tests/resources/".to_string(),
+  };
+  let node_statistics_filepath = match input_args.next() {
+    Some(path) => path,
+    None => "corpus_math_count.csv".to_string(),
+  };
+  let content_statistics_filepath = match input_args.next() {
+    Some(path) => path,
+    None => "corpus_content_count.csv".to_string(),
+  };
+
+  let extension_filter = input_args.next(); // optional fourth argument; when None the corpus walk selects .html/.xhtml files
+
+  let node_statistics_file = File::create(node_statistics_filepath)?;
+  let content_statistics_file = File::create(content_statistics_filepath)?;
+
+  let mut corpus = Corpus::new(corpus_path);
+  corpus.extension = extension_filter;
+
+  let mut total = 0; // grand-total accumulator, reused (after a reset) for both reports
+  let (math_catalog, content_math_catalog) = corpus.catalogs_with_parallel_walk(|document| {
+    let mut math_count_hash = HashMap::new();
+    let mut content_count_hash = HashMap::new();
+    // Per-document catalogs: each maps this document's element count (as a string) to 1, so the merged catalogs give documents-per-count.
+    let mut xpath_context = Context::new(&document.dom).unwrap(); // panics if no XPath context can be created for the document
+    let math_count = xpath_context
+      .findvalue("count(//*[local-name()='math'])", None)
+      .unwrap();
+    math_count_hash.insert(math_count, 1);
+
+    let content_count = xpath_context
+      .findvalue(
+        "count(//*[local-name()='annotation-xml' and @encoding='MathML-Content'])",
+        None,
+      ).unwrap();
+    content_count_hash.insert(content_count, 1);
+
+    (math_count_hash, content_count_hash)
+  });
+
+  let duration_sec = start.elapsed().as_millis(); // NOTE: despite the name, this value is in milliseconds
+  eprintln!("---");
+  eprintln!("Math counting finished in {:?}ms", duration_sec);
+
+  // Report on Math: one CSV row per distinct <math> count, ordered by descending number of documents.
+  let mut catalog_vec: Vec<(&String, &u64)> = math_catalog.iter().collect();
+  catalog_vec.sort_by(|a, b| b.1.cmp(a.1));
+
+  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);
+  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
+  csv_writer.write_record(["math elements", "documents in corpus"])?;
+
+  for (key, val) in catalog_vec {
+    total += key.parse::<u64>().unwrap() * val; // count × documents having that count; panics if the key does not parse as u64
+    csv_writer.write_record([key, &val.to_string()])?;
+  }
+  eprintln!(" Grand total of <math> in dataset: ");
+  eprintln!(" --- ");
+  eprintln!(" {} ", total);
+  eprintln!(" --- ");
+  // Flush the buffered CSV rows of the first report, propagating any I/O error
+  csv_writer.flush()?;
+
+  // Report on Content Math: same CSV layout, built from the annotation-xml catalog.
+  total = 0; // reset the accumulator before the second report
+  let mut catalog_vec: Vec<(&String, &u64)> = content_math_catalog.iter().collect();
+  catalog_vec.sort_by(|a, b| b.1.cmp(a.1));
+
+  let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, content_statistics_file);
+  let mut csv_writer = csv::Writer::from_writer(buffered_writer);
+  csv_writer.write_record(["annotation-xml elements", "documents in corpus"])?;
+
+  for (key, val) in catalog_vec {
+    total += key.parse::<u64>().unwrap() * val; // count × documents having that count; panics if the key does not parse as u64
+    csv_writer.write_record([key, &val.to_string()])?;
+  }
+  eprintln!(" Grand total of Content MathML <annotation-xml> in dataset: ");
+  eprintln!(" --- ");
+  eprintln!(" {} ", total);
+  eprintln!(" --- ");
+  // Flush the second report; the flush result is returned as main's result
+  csv_writer.flush()
+
+}
+
+// Example output from arXMLiv 2022:
+// Math counting finished in 14030571ms
+// Grand total of <math> in dataset:
+// ---
+// 970414519
+// ---
+// Grand total of Content MathML <annotation-xml> in dataset:
+// ---
+// 953308908
+// ---
+
+// Example output from ar5iv 2024:
+// Math counting finished in 22121404ms
+// Grand total of <math> in dataset:
+// ---
+// 1059794660
+// ---
+// Grand total of Content MathML <annotation-xml> in dataset:
+// ---
+// 1038882200
+// ---
diff --git a/examples/corpus_mathml_stats.rs b/examples/corpus_mathml_stats.rs
@@ -22,7 +22,6 @@ use std::collections::{HashMap, HashSet};
 use std::env;
 use std::fs::File;
 use std::io::{BufWriter, Error};
-use std::thread;
 use std::time::Instant;
 
 use libxml::readonly::RoNode;
@@ -71,8 +70,7 @@ pub fn main() -> Result<(), Error> {
 
   let catalog = corpus.catalog_with_parallel_walk(|document| {
     println!(
-      "Thread: {:?}, doc: {:?}",
-      thread::current().name(),
+      "doc: {:?}",
       document.path
     );
 
@@ -103,10 +101,10 @@ pub fn main() -> Result<(), Error> {
 
   let buffered_writer = BufWriter::with_capacity(BUFFER_CAPACITY, node_statistics_file);
   let mut csv_writer = csv::Writer::from_writer(buffered_writer);
-  csv_writer.write_record(&["name@attr[value]", "frequency"])?;
+  csv_writer.write_record(["name@attr[value]", "frequency"])?;
 
   for (key, val) in catalog_vec {
-    csv_writer.write_record(&[key, &val.to_string()])?;
+    csv_writer.write_record([key, &val.to_string()])?;
   }
   // Close the writer
   csv_writer.flush()

diff --git a/examples/corpus_node_model.rs b/examples/corpus_node_model.rs
@@ -14,7 +14,7 @@ use std::time::Instant;
 use libxml::readonly::RoNode;
 use llamapun::parallel_data::Corpus;
 
-static NEWLINE: &'static [u8] = b"\n";
+static NEWLINE: &[u8] = b"\n";
 static BUFFER_CAPACITY: usize = 10_485_760;
 
 pub fn main() -> Result<(), Error> {

diff --git a/examples/corpus_statement_paragraphs_model.rs b/examples/corpus_statement_paragraphs_model.rs
@@ -258,7 +258,7 @@ fn extract_document_statements(
       }
     }
     // Discard paragraphs outside of a reasonable [4,1024] word count range
-    if word_count < 4 || word_count > 1024 {
+    if !(4..=1024).contains(&word_count) {
       overflow_count += 1;
       invalid_paragraph = true;
     }
@@ -294,7 +294,7 @@ fn extract_document_statements(
 /// give a sha256 hash, assemble a filename based on it
 fn hash_file_path(directory: &str, content: &str) -> String {
   let mut hasher = Sha256::new();
-  hasher.input_str(&content);
+  hasher.input_str(content);
   let hash = hasher.result_str();
   directory.to_string() + "/" + &hash + ".txt"
 }

diff --git a/examples/pattern_example.rs b/examples/pattern_example.rs
@@ -38,9 +38,9 @@ fn math_node_to_string(node: RoNode) -> String {
 }
 
 /// helper function
-fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
+fn math_node_to_string_actual(node: RoNode, string: &mut String) {
   match node.get_name().as_ref() {
-    "semantics" => math_node_to_string_children(node, &mut string),
+    "semantics" => math_node_to_string_children(node, string),
     "annotation" | "annotation-xml" => {},
     "text" => {
       if node.is_text_node() {
@@ -51,7 +51,7 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
       string.push('<');
       string.push_str(default);
       string.push('>');
-      math_node_to_string_children(node, &mut string);
+      math_node_to_string_children(node, string);
       string.push('<');
       string.push('/');
       string.push_str(default);
@@ -61,13 +61,13 @@ fn math_node_to_string_actual(node: RoNode, mut string: &mut String) {
 }
 
 /// helper function
-fn math_node_to_string_children(node: RoNode, mut string: &mut String) {
+fn math_node_to_string_children(node: RoNode, string: &mut String) {
   let mut cur = node.get_first_child();
   loop {
     if cur.is_none() {
       break;
     }
-    math_node_to_string_actual(cur.unwrap(), &mut string);
+    math_node_to_string_actual(cur.unwrap(), string);
     cur = cur.unwrap().get_next_sibling();
   }
 }

diff --git a/examples/word_tokenization.rs b/examples/word_tokenization.rs
@@ -50,7 +50,7 @@ fn main() {
   let inorder_dictionary = dictionary.sorted();
   let mut inorder_frequency: Vec<(usize, usize)> = Vec::new();
   for entry in &inorder_dictionary {
-    let frequency = unigrams.get(&entry.0);
+    let frequency = unigrams.get(entry.0);
     inorder_frequency.push((entry.1, frequency));
   }
   plot_simple(

diff --git a/src/ngrams.rs b/src/ngrams.rs
@@ -124,15 +124,15 @@ impl Ngrams {
         if words_since_anchor_seen == self.window_size && side == AnchorSide::Right {
           // it has been too long since we saw an anchor, add to the current buffer, record and
           // reset
-          self.record_words(continuous_buffer.drain(..).collect());
+          self.record_words(std::mem::take(&mut continuous_buffer));
           context_window.clear();
           side = AnchorSide::Left;
         }
       }
     }
     // Any remaining content should be added
     continuous_buffer.extend(context_window.asc_iter().copied());
-    self.record_words(continuous_buffer.drain(..).collect());
+    self.record_words(std::mem::take(&mut continuous_buffer));
   }
 
   /// Take an arbitrarily long vector of words, and record all (overlapping) ngrams obtainable from

diff --git a/src/parallel_data/corpus.rs b/src/parallel_data/corpus.rs
@@ -48,7 +48,7 @@ impl Corpus {
     }
   }
 
-  /// Get a parallel iterator over the documents
+  /// Get a parallel iterator over the documents, returning a single report catalog
   pub fn catalog_with_parallel_walk<F>(&self, closure: F) -> HashMap<String, u64>
   where F: Fn(Document) -> HashMap<String, u64> + Send + Sync {
     ParWalkDir::new(self.path.clone())
@@ -95,4 +95,56 @@ impl Corpus {
         map1
       })
   }
+
+  /// Walk the corpus in parallel, reducing per-document results into a pair of report catalogs.
+  /// Visits files ending in `self.extension` (`.html`/`.xhtml` when unset), calls `closure` once
+  /// per document, and merges the returned maps pairwise by summing values key by key.
+  /// Panics if `Document::new` fails for a selected file.
+  pub fn catalogs_with_parallel_walk<F>(&self, closure: F) -> (HashMap<String, u64>, HashMap<String, u64>)
+  where F: Fn(Document) -> (HashMap<String, u64>, HashMap<String, u64>) + Send + Sync {
+    ParWalkDir::new(self.path.clone())
+      .num_threads(rayon::current_num_threads())
+      .skip_hidden(true)
+      .sort(false)
+      .into_iter()
+      .filter_map(|each| {
+        if let Ok(entry) = each {
+          let file_name = entry.file_name.to_str().unwrap_or("");
+          let selected = if let Some(ref extension) = self.extension {
+            file_name.ends_with(extension)
+          } else {
+            file_name.ends_with(".html") || file_name.ends_with(".xhtml")
+          };
+          if selected {
+            let path = entry.path().to_str().unwrap_or("").to_owned();
+            if !path.is_empty() {
+              return Some(path);
+            }
+          }
+        }
+        // all other cases (walk errors, unselected names, non-UTF-8 paths) are skipped
+        None
+      })
+      .enumerate()
+      .par_bridge()
+      .map(|(index, path)| {
+        let document = Document::new(path, self).unwrap();
+        if index % 1000 == 0 && index > 0 {
+          println!(
+            "-- catalogs_with_parallel_walk now processing document {:?}",
+            1 + index
+          );
+        }
+        closure(document)
+      })
+      .reduce(|| (HashMap::new(), HashMap::new()), |(mut map11, mut map12), (map21, map22)| {
+        for (k, v) in map21 {
+          *map11.entry(k).or_default() += v;
+        }
+        for (k, v) in map22 {
+          *map12.entry(k).or_default() += v;
+        }
+        (map11, map12)
+      })
+  }
 }
diff --git a/src/patterns/rules.rs b/src/patterns/rules.rs
@@ -1064,7 +1064,7 @@ impl PatternFile {
       match cur.get_name().as_ref() {
         "meta" => {
           if meta_opt.is_some() {
-            return Err("pattern_file has multiple meta nodes".to_string()).map_err(err_map);
+            return Err("pattern_file has multiple meta nodes".to_string());
           }
           meta_opt =
             Some(MetaDescription::load_from_node(cur, file_name.to_string()).map_err(err_map)?);
@@ -1085,7 +1085,7 @@ impl PatternFile {
           pctx.add_sequence_rule(cur).map_err(err_map)?;
         },
         x => {
-          return Err(format!("Unexpected node \"{x}\" in pattern_file")).map_err(err_map);
+          return Err(format!("Unexpected node \"{x}\" in pattern_file"));
         },
       }
     }

diff --git a/tests/dnm_test.rs b/tests/dnm_test.rs
@@ -83,19 +83,19 @@ fn test_xml_node_to_plaintext() {
   let mut node = doc.get_root_readonly().unwrap();
   match node.get_first_child() {
     Some(n) => node = n,
-    None => assert!(false), //DOM generation failed
+    None => unreachable!(), //DOM generation failed
   }
   while node.get_name() != "body" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   node = node.get_first_child().unwrap();
   while node.get_name() != "h1" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   //Node content should have been processed
@@ -106,15 +106,15 @@ fn test_xml_node_to_plaintext() {
   while node.get_name() != "h2" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   //node was skipped in dnm generation
   assert_eq!(dnm.get_range_of_node(node).unwrap().get_plaintext(), "");
   while node.get_name() != "a" {
     match node.get_next_sibling() {
       Some(n) => node = n,
-      None => assert!(false),
+      None => unreachable!(),
     }
   }
   //node content should have been replaced by "[link]"