From 1db0c92fddb96eb79510bf899f431fa52b9e49f6 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Tue, 28 Nov 2023 10:43:37 -0500 Subject: [PATCH 1/8] generate command pulls from solr, transforms doc --- .../commands/generate_textcorpus.py | 365 ++++++++++++++++++ 1 file changed, 365 insertions(+) create mode 100644 ppa/archive/management/commands/generate_textcorpus.py diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py new file mode 100644 index 000000000..e184ad4ca --- /dev/null +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -0,0 +1,365 @@ +""" +**generate_textcorpus** is a custom manage command to generate a plain +text corpus from Solr. It should be run *after* content has been indexed +into Solr via the **index** manage command. + +The Corpus is serialized in the Matrix Market format +(https://math.nist.gov/MatrixMarket/formats.html) +with a `.mm` extension, using the Gensim Topic Modelling library +(https://radimrehurek.com/gensim) +Typically, an index corresponding to the .mm file is also saved by Gensim, +with a `.mm.index` extension. + +A dictionary corresponding to token IDs is also saved by default, +using a `.mm.dict` extension. +By default, this is a pickled Gensim Dictionary object. +If the `--dictionary-as-text` flag is specified, then the dictionary is saved +as a utf8-encoded and newline-separated +file, where line number N contains the token with token_id N-1. +Saving the dictionary can be skipped by using the `--no-dictionary` option. + +Additional document-level metadata found in the Solr Index is also saved by +default, with `.mm.metadata` extension. +This is a csv file with a header row and one row per unique document found in +the Solr index. +Saving the metadata can be skipped by using the `--no-metadata` option. + +By default, *all* documents found in the Solr index are serialized. +This can be controlled using -`-doc-limit`, +which denotes the maximum no. of documents to serialize. This is especially +useful for development, or for sanity-testing your Solr installation. + +For corpus generation, the following pre-processing options are available via +the `--preprocess` flag:: + + # Lower-cases words + 'lower' + # Strips HTML tags + 'strip_tags' + # Strips punctuation + 'strip_punctuation' + # Collapses multiple whitespaces into one + 'strip_multiple_whitespaces' + # Strips numeric characters + 'strip_numeric' + # Removes stopwords - Note that the set of default stopwords used by Gensim + # is from Stone, Denis, Kwantes (2010). + 'remove_stopwords' + # Strip short words. The lower limit on word length is 3. + 'strip_short' + # Use Porter Stemmer for word-normalization. + 'stem_text' + +IMPORTANT - NO preprocessing filters are applied by default, but you will +typically at least want to use `lower`. +Multiple preprocessing filters can be applied (in order) by specifying multiple +`--preprocess` flags. 
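+
+For reference, each `--preprocess` name maps onto a Gensim preprocessing
+function, and the selected filters are applied in the order given. As a rough
+illustration (assuming Gensim is installed), the effect of
+`--preprocess lower --preprocess strip_tags` on a string can be sketched with
+Gensim's `preprocess_string`::
+
+    from gensim.parsing.preprocessing import preprocess_string, strip_tags
+
+    # filters run in the order given; the result is a list of tokens
+    preprocess_string("The <i>Art</i> of Poetry", [lambda x: x.lower(), strip_tags])
+    # ['the', 'art', 'of', 'poetry']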
+ +Example usage:: + + # Save all files to the 'data' folder, with bare-minimum preprocessing + python manage.py generate_corpus --path data --preprocess lower + --preprocess strip_tags + + # Restrict corpus to 1000 documents + python manage.py generate_corpus --path data --doc-limit 1000 + --preprocess lower --preprocess strip_tags + + # Don't generate dictionary; don't generate metadata + python manage.py generate_corpus --path data --doc-limit 1000 + --preprocess lower --no-dictionary --no-metadata + +""" + +import csv +import logging +import os.path +from collections import OrderedDict +from os import makedirs +from pprint import pprint,pformat + +from django.core.management.base import BaseCommand +from gensim import corpora +from gensim.corpora.dictionary import Dictionary +from gensim.parsing.preprocessing import ( + preprocess_string, + remove_stopwords, + stem_text, + strip_multiple_whitespaces, + strip_numeric, + strip_punctuation, + strip_short, + strip_tags, +) +from parasolr.django import SolrQuerySet +from progressbar import NullBar, ProgressBar + +logger = logging.getLogger(__name__) + +PREPROCESS_FUNCTIONS = OrderedDict( + [ + ("strip_short", strip_short), + ("strip_multiple_whitespaces", strip_multiple_whitespaces), + ("strip_punctuation", strip_punctuation), + ("strip_tags", strip_tags), + ("strip_numeric", strip_numeric), + ("lower", lambda x: x.lower()), + ("remove_stopwords", remove_stopwords), + ("stem_text", stem_text), + ] +) + + +class SolrCorpus: + """Custom class to generate a text corpus from Solr""" + + # Class attributes that rarely, if ever, need to change + DOC_ID_FIELD = "source_id" # Solr field name for document identifier + DOC_CONTENT_FIELD = "content" # Solr field name for document content + PAGE_ORDER_FIELD = "order" # Solr field name for page ordering + OUTPUT_DOC_FIELDS = dict( + id = '', + work_cluster = 'cluster_id_s', + work_group = 'group_id_s', + work_source = 'source_id', + page_orig = 'label', + page_digital = 'order', + page_text = 'content', + ) + SOURCE_PAGE_ID = 'id' + + def __init__(self, name, doc_limit=-1, preprocess_fns=None, pbar=True): + """ + A class encapsulating a Solr Client specification, that yields + Bag-of-Word vectors on iteration, and thus acts as a Gensim Corpus. + + :param name: A string name of this corpus. + Used as a string prefix for generated files. + :param client: A SolrClient.SolrClient object used to interface with + Solr + :param collection: A string representing the Solr collection name. + :param doc_limit: Max no. of documents to process. The default of -1 + means we process ALL documents found. + :param preprocess_fns: A list of single-argument functions to use as + preprocessors. + See the module gensim.parsing.preprocessing for some typical + preprocessing functions. + :param pbar: A boolean indicating whether to display a progress bar + during corpus generation. + """ + self.name = name + self.doc_limit = doc_limit + + if preprocess_fns is not None: + if "ALL" in preprocess_fns: + self.preprocess_fns = PREPROCESS_FUNCTIONS.values() + else: + self.preprocess_fns = [PREPROCESS_FUNCTIONS[k] for k in preprocess_fns] + else: + self.preprocess_fns = [] + + self.dictionary = Dictionary() + + # doc_id -> dict of k->v mappings + # NOTE: We cannot use 'metadata' as GenSim mangles this attribute! 
+ self._metadata = {} + + # list of strings, populated on first doc retrieval + self.metadata_field_names = None + + # facet on document id to get counts of pages by work + results = SolrQuerySet().facet(SolrCorpus.DOC_ID_FIELD, limit=self.doc_limit) + + """ + An OrderedDict of doc_id => page count mapping + An OrderedDict is important here in case we want to save document-level + metadata, in which case rows of metadata would be in the same order as + the BoW-vectors returned by this object's iterator. + """ + self.page_counts = results.get_facets().facet_fields["source_id"] + self.doc_ids = self.page_counts.keys() + self.doc_count = len(self.doc_ids) + if pbar: + self.pbar = ProgressBar( + redirect_stderr=True, max_value=self.doc_count, max_error=False + ) + else: + self.pbar = NullBar() + + def __iter__(self): + for doc_id in self.doc_ids: + if doc_id not in self.page_counts: + logger.warning( + "Unknown page count for doc {}. Skipping.".format(doc_id) + ) + continue + + result = ( + SolrQuerySet() + .search(**{SolrCorpus.DOC_ID_FIELD: doc_id}) + .order_by(SolrCorpus.PAGE_ORDER_FIELD) + ) + # populate the result cache with number of rows specified + docs = result.get_results(rows=self.page_counts[doc_id]) + + metadata_docs = [d for d in docs if d["item_type"] == "work"] + + n_metadata_docs = len(metadata_docs) + if n_metadata_docs > 0: + if n_metadata_docs > 1: + logger.warning( + "Multiple metadata records found for doc ID" + "{}. Using the first.".format(doc_id) + ) + + metadata_doc = metadata_docs[0] + self._metadata[doc_id] = {k: v for k, v in metadata_doc.items()} + if self.metadata_field_names is None: + self.metadata_field_names = list(metadata_doc.keys()) + else: + logger.warning("No metadata record found for doc ID {}.".format(doc_id)) + + # filter out pages that have no content; + # combine all pages into one string + + metadocs=[] + for i,doc in enumerate(docs): + if not SolrCorpus.DOC_CONTENT_FIELD in doc: + metadocs.append(doc) + else: + yield self._transform_doc(doc) + self.pbar.update(self.pbar.value + 1) + + def _transform_doc(self, doc): + odoc = OrderedDict({ + key_new:doc.get(key_orig,'') + for key_new,key_orig in ( + SolrCorpus.OUTPUT_DOC_FIELDS.items() + ) + }) + odoc[SolrCorpus.SOURCE_PAGE_ID] = f'{odoc["work_source"]}_{odoc["page_orig"]}' + return odoc + + def _save_dictionary(self, filepath, as_text=False): + """ + Save dictionary at a specified path, either as a picked Gensim + Dictionary object, or a .txt file + :param filepath: File path for saved dictionary + :param as_text: Whether to save as a plaintext file, where the + 0-indexed line number denotes the token id. 
+ :return: None + """ + if as_text: + with open(filepath, "w", encoding="utf8") as f: + f.writelines( + [self.dictionary[i] + "\n" for i in range(len(self.dictionary))] + ) + else: + self.dictionary.save(filepath) + + def _save_metadata(self, filepath): + if self.metadata_field_names is None: + raise RuntimeError("Unable to determine metadata field names!") + + with open(filepath, "w", encoding="utf8", newline="") as f: + writer = csv.writer(f) + writer.writerow(self.metadata_field_names) # header row + + for doc_id in self.doc_ids: + metadata = self._metadata[doc_id] + writer.writerow( + [ + metadata.get(field_name) + for field_name in self.metadata_field_names + ] + ) + + def save(self, path, save_dict=True, save_dict_as_text=False, save_metadata=False): + """Save the generated corpus text and metadata to files on disk""" + path_texts = os.path.join(path,'texts') + path_metadata = os.path.join(path,'metadata.csv') + if not os.path.isdir(path_texts): makedirs(path_texts) + + + + for i,d in enumerate(self): + pprint(d) + print() + if i>2000: break + + + # # There's no way to completely turn off the progress ticker for Gensim + # # serialize - we simply set it's frequency to one more than the no. of + # # documents we have, so it will effectively be shut off. + # corpora.MmCorpus.serialize(corpus_path, self, progress_cnt=self.doc_count + 1) + + # if save_dict: + # self._save_dictionary(corpus_path + ".dict", as_text=save_dict_as_text) + # if save_metadata: + # self._save_metadata(corpus_path + ".metadata") + + +class Command(BaseCommand): + """Custom manage command to generate a token corpus from text indexed in Solr""" + + def add_arguments(self, parser): + parser.add_argument( + "--path", required=True, help="Directory path to save corpus file(s)." + ) + parser.add_argument( + "--name", + default="corpus", + help="Name prefix to use for all saved corpus file(s).", + ) + + parser.add_argument( + "--doc-limit", + type=int, + default=-1, + help="Limit on the number of documents for corpus generation." + "The default of -1 considers ALL documents.", + ) + parser.add_argument( + "--no-dictionary", + action="store_true", + help="Do not save corpus dictionary.", + ) + parser.add_argument( + "--dictionary-as-text", + action="store_true", + help="If saving dictionary, save as a plaintext file.", + ) + parser.add_argument( + "--no-metadata", + action="store_true", + default=False, + help="Do not save corpus metadata.", + ) + parser.add_argument( + "--no-progress", + action="store_true", + help="Do not display progress bar to track the status of the" "command.", + ) + parser.add_argument( + "--preprocess", + action="append", + choices=list(PREPROCESS_FUNCTIONS.keys()) + ["ALL"], + help="Pre-processing filter(s) to apply. Multiple filters can be" + "applied (in order) by adding multiple --preprocess flags." 
+ "Use ALL to apply all pre-processing filters.", + ) + + def handle(self, *args, **options): + corpus = SolrCorpus( + name=options["name"], + doc_limit=options["doc_limit"], + preprocess_fns=options["preprocess"], + pbar=not options["no_progress"], + ) + + corpus.save( + options["path"], + save_dict=not options["no_dictionary"], + save_dict_as_text=options["dictionary_as_text"], + save_metadata=not options["no_metadata"], + ) From b34b6d6fa2a95a6b287ae0fc4925b2fce7bd91f1 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Tue, 28 Nov 2023 13:50:37 -0500 Subject: [PATCH 2/8] adding Wouter's preprocessing code Co-authored-by: Wouter Haverals --- dev-requirements.txt | 5 +- .../commands/generate_textcorpus.py | 675 +++++++++++++++--- 2 files changed, 565 insertions(+), 115 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 3ce12c8af..d6d40a18e 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -4,4 +4,7 @@ pytest-django>=4.5.2 pytest-cov django-debug-toolbar sphinx -pre-commit \ No newline at end of file +pre-commit +wordfreq +nltk +tqdm \ No newline at end of file diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py index e184ad4ca..60d0905e4 100644 --- a/ppa/archive/management/commands/generate_textcorpus.py +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -3,26 +3,9 @@ text corpus from Solr. It should be run *after* content has been indexed into Solr via the **index** manage command. -The Corpus is serialized in the Matrix Market format -(https://math.nist.gov/MatrixMarket/formats.html) -with a `.mm` extension, using the Gensim Topic Modelling library -(https://radimrehurek.com/gensim) -Typically, an index corresponding to the .mm file is also saved by Gensim, -with a `.mm.index` extension. - -A dictionary corresponding to token IDs is also saved by default, -using a `.mm.dict` extension. -By default, this is a pickled Gensim Dictionary object. -If the `--dictionary-as-text` flag is specified, then the dictionary is saved -as a utf8-encoded and newline-separated -file, where line number N contains the token with token_id N-1. -Saving the dictionary can be skipped by using the `--no-dictionary` option. - -Additional document-level metadata found in the Solr Index is also saved by -default, with `.mm.metadata` extension. -This is a csv file with a header row and one row per unique document found in -the Solr index. -Saving the metadata can be skipped by using the `--no-metadata` option. +Vineet Bansal authored the parts of the code below that iterate over the Solr- +indexed corpus. Wouter Haverals authored the functions that preprocess the +returned page texts: cleaning OCR, rejoining lines, and other useful processes. By default, *all* documents found in the Solr index are serialized. 
This can be controlled using -`-doc-limit`, @@ -73,6 +56,11 @@ import csv import logging +logging.getLogger().handlers.clear() +logger = logging.getLogger(__name__) +logger.setLevel(logging.ERROR) + + import os.path from collections import OrderedDict from os import makedirs @@ -81,31 +69,467 @@ from django.core.management.base import BaseCommand from gensim import corpora from gensim.corpora.dictionary import Dictionary -from gensim.parsing.preprocessing import ( - preprocess_string, - remove_stopwords, - stem_text, - strip_multiple_whitespaces, - strip_numeric, - strip_punctuation, - strip_short, - strip_tags, -) from parasolr.django import SolrQuerySet from progressbar import NullBar, ProgressBar -logger = logging.getLogger(__name__) + + +### PREPROCESSING CODE ### + +PATH_HERE = os.path.abspath(os.path.dirname(__file__)) +PATH_REPO = os.path.abspath(os.path.join( + PATH_HERE, + '..', # management + '..', # archive + '..', # ppa + '..' # ppa-django +)) +PATH_REPO_DATA = os.path.join(PATH_REPO, 'data') +PATH_OCR_RULESETS = os.path.join(PATH_REPO_DATA, 'ocr_cleanup_rulesets') + + +# imports... +import os,sys,warnings,random +warnings.filterwarnings('ignore') +from functools import cache +from tqdm import tqdm +from sqlitedict import SqliteDict +import orjson,zlib +import pandas as pd +from intspan import intspan +import jsonlines +import multiprocessing as mp +mp.set_start_method('fork') + +## ocr correction imports +import re +import wordfreq +import os +import json +import nltk +from nltk.tokenize import word_tokenize +import pandas as pd +from tqdm import tqdm +tqdm.pandas() +from collections import defaultdict +from difflib import SequenceMatcher +from functools import cached_property +from collections import Counter +import gzip +# nltk.download('punkt') + + + +def tokenize_agnostic(txt): + return re.findall(r"[\w']+|[.,!?; -—–'\n]", txt) + +def untokenize_agnostic(l): + return ''.join(l) + +def remove_trailing_punctuation(word): + """ + remove trailing punctuation and spaces + don't remove the dash '-', as this might interfere with the function to repair broken words! + Question: should we also remove punct at the beginning of the token...? Not doing that now. + + # small test + word = "...example.,...! " + clean_word = remove_trailing_punctuation(word) + print(clean_word) + """ + return re.sub(r'[\.,\?!"\')(:;`]+\s*$', '', word) + + + + + + +# process a list of word pairs, where each pair consists of an 'incorrect' word with a historic long 's' (ſ) and its 'correct' modern equivalent +# the script then replaces the historic long 's' (ſ) words with 'f', generates new word pairs +# ONLY if the newly generated f-word does NOT exist in the English language, we retain the word!! 
For this, we use language stats provided by wordfreq +# the resulting pairs are then written to the outfile, while pairs that exists -- with high frequency in English -- are written to a separate disregard_file +# i think this is clever, so i named the function accordingly :-) + +def generate_clever_f_s_hack(source_file, output_file, disregard_file, skip_words=None, frequency_threshold=1e-6): + if skip_words is None: + skip_words = {'ſlip'} # add specific words to skip here -- dunno if this is still useful, the file will capture most of these words + + unique_pairs = set() # set to keep track of unique (incorrect f-word, correct s-word) pairs + + with open(source_file, 'r') as infile, open(output_file, 'w') as outfile, open(disregard_file, 'w') as disregard: + # skip the title line of the infile + next(infile) + + for line in infile: + parts = line.strip().split('\t') + if len(parts) < 3: + continue + + incorrect, correct = parts[:2] + # e.g.: + # incorrect correct + # moſt most + # muſt must + # ſo so + # ſome some + # ſee see etc. + + # strip leading/trailing spaces + incorrect = incorrect.strip() + correct = correct.strip() + + # remove trailing punctuation + incorrect = remove_trailing_punctuation(incorrect) + correct = remove_trailing_punctuation(correct) + + # replace 'ſ' with 'f' in the incorrect word + f_incorrect = incorrect.replace('ſ', 'f') + # e.g.: + # incorrect correct + # moft most + # muft must + # fo so + # fome some + # fee see etc. + + # skip if the incorrect word is in skip_words or already in pairs + if f_incorrect in skip_words or (f_incorrect, correct) in unique_pairs: + continue + + # check the frequency of the word + word_frequency = wordfreq.word_frequency(f_incorrect.lower(), 'en') + + # skip if the word exists and its frequency is above the threshold + if word_frequency > frequency_threshold: + disregard.write(f"{f_incorrect}\t{correct}\n") + #print(f'Word that exist with the f-spelling and we don\'t want to include: {f_incorrect}') + # e.g. + # Words that exist with the f-spelling and we don't want to include: fame + # Words that exist with the f-spelling and we don't want to include: found etc. + continue + + # check if the generated word exists in English + if word_frequency <= frequency_threshold: + outfile.write(f"{f_incorrect}\t{correct}\n") + unique_pairs.add((f_incorrect, correct)) + # e.g. + # moft most + # muft must + # fo so + # fome some etc. 
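+
+# for reference, the keep/disregard decision above comes down to the wordfreq
+# lookup: per the examples logged above, a real word such as "fame" scores well
+# above the 1e-6 threshold and lands in disregard_file, while an OCR artefact
+# such as "moft" scores at or below it and is written to output_file
+# (illustrative check, assuming wordfreq's English frequency data):
+#
+#   wordfreq.word_frequency('fame', 'en') > 1e-6   # True  -> disregarded
+#   wordfreq.word_frequency('moft', 'en') > 1e-6   # False -> kept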
+ +# apply +# generate_clever_f_s_hack( +# source_file=os.path.join(PATH_OCR_RULESETS, "all_long_s_corrections_log.txt"), +# output_file=os.path.join(PATH_OCR_RULESETS, "clever_f_ſ_hack.txt"), +# disregard_file=os.path.join(PATH_OCR_RULESETS, "disregard_fſs_replacements.txt") +# ) + + + + + +@cache +def load_correction_rules(file_path = os.path.join(PATH_OCR_RULESETS, 'CorrectionRules.txt')): + correction_rules = {} + with open(file_path, 'r') as file: + for line in file: + parts = line.strip().split('\t') + if len(parts) >= 2: + incorrect, correct = parts[:2] + correction_rules[incorrect] = correct + return correction_rules + + +def correct_ocr_errors(text, correction_rules): + corrections = 0 + for incorrect, correct in correction_rules.items(): + if incorrect in text: + text = text.replace(incorrect, correct) + corrections += 1 + return text, corrections + +def rejoin_linebreaks(text, specific_linebreak_corrections): + """ + function to addresses the issue of words that are split between two lines due to a line break, typically indicated by a hyphen + the function rejoins such words + """ + corrections = 0 + parts = text.split('-\n') + corrected_text = parts[0] + for part in parts[1:]: + corrected_text_words = corrected_text.split() + part_words = part.split() + + if corrected_text_words and part_words: # check if both lists are not empty + last_word_before_break = corrected_text_words[-1] + first_word_after_break = part_words[0] + + # form the broken word and the corrected word + broken_word = last_word_before_break + '-\n' + first_word_after_break + corrected_word = last_word_before_break + first_word_after_break + + # log the correction (gets later written to the txt file) + # specific_linebreak_corrections[broken_word + " \t " + corrected_word] += 1 + specific_linebreak_corrections.append((broken_word,corrected_word)) + + corrected_text += part + corrections += 1 + else: + # if either part is empty or doesn't contain words, simply append a hyphen + corrected_text += '-' + part + + return corrected_text, corrections + +def replace_historic_long_s(text, long_s_corrections): + """ + function to replaces the historic long 's' (ſ) with the regular 's' + + :text: text to be processed + :long_s_corrections: dictionary to log specific corrections and their counts + :return: tuple of processed text with long 's' replaced, and the number of corrections made + """ + corrected_text = text.replace('ſ', 's') + corrections = 0 + if corrected_text != text: + words_with_long_s = set(text.split()) - set(corrected_text.split()) + for word in words_with_long_s: + corrected_word = word.replace('ſ', 's') + long_s_corrections.append((word,corrected_word)) + corrections += 1 + return corrected_text, corrections + +@cache +def load_f_s_hack_corrections(file_path = os.path.join(PATH_OCR_RULESETS, "clever_f_ſ_hack.txt")): + """ + little helper script to load the f-->s words (from generate_clever_f_s_hack) into a dict, for convenient lookup + """ + correction_rules = {} + with open(file_path, 'r') as file: + for line in file: + parts = line.strip().split() + if len(parts) >= 2: + incorrect, correct = parts[:2] + correction_rules[incorrect] = correct + return correction_rules + +def process_headers(pages, remove_headers=True, similarity_threshold=80): + """ + function to identifies and optionally removes running headers + inspired by Ted Underwood's GREAT headerfinder script: https://github.com/tedunderwood/DataMunging/blob/master/runningheaders/HeaderFinder.py + some changes made: + - flexibility to remove 
headers or just identify them (just by setting the boolean value) + - we don't explicitly handle roman numerals, the line comparison logic (combining str.isalpha and a threshold for fuzzy matching) should take care of it + + :pages: list of dicts, each representing a page with 'page_text' + :remove_headers: bool, if set to True --> removes identified headers, otherwise just identifies them and wirtes them to the log + :similarity_threshold: int, threshold for fuzzy matching to consider lines as similar (default 80 seems to work well) + :return: list of pages with headers + """ + identified_headers = [] + headers_set = set() + + def get_substantial_lines(page_text): + """ + helper function: if the processed line contains less than 5 characters, or if the line consists solely of digits + it is considered insubstantial and is skipped + """ + lines = page_text.split('\n') + substantial_lines = [] + for line in lines: + if len(line.strip()) < 5 or line.strip().isdigit(): + continue + substantial_lines.append(line) + if len(substantial_lines) == 2: + break + return substantial_lines + + numpages = len(pages) + iterr = range(numpages) + iterr = tqdm(iterr, total=numpages, position=1, desc='Preprocessing headers',disable=True) + for i in iterr: + page = pages[i] + if not 'corrections' in page: page['corrections']={} + if not 'headers' in page['corrections']: page['corrections']['headers']=[] + current_page_text = pages[i]['page_text'] + current_substantial_lines = get_substantial_lines(current_page_text) + + header_found = False + + # determine the range of pages to compare with + start_index = max(0, i - 2) + end_index = min(len(pages), i + 3) + if i == len(pages) - 1: # Special handling for the last page + start_index = max(0, i - 2) # Compare with pages before + + for j in range(start_index, end_index): + if i == j: + continue + + comparison_page_text = pages[j]['page_text'] + comparison_substantial_lines = get_substantial_lines(comparison_page_text) + + for current_line in current_substantial_lines: + for comparison_line in comparison_substantial_lines: + # line comparison logic, considering possible page numbers + cleaned_current_line = ''.join(filter(str.isalpha, current_line)) + cleaned_comparison_line = ''.join(filter(str.isalpha, comparison_line)) + + s = SequenceMatcher(None, cleaned_current_line, cleaned_comparison_line) + similarity = s.ratio() * 100 + + if similarity > similarity_threshold: + header_key = (i, current_line) + if header_key not in headers_set: + identified_headers.append(header_key) + headers_set.add(header_key) + if remove_headers: + header_found = True + break + + if header_found: + correx=(current_line,'') + if correx not in set(page['corrections']['headers']): + page['corrections']['headers'].append(correx) + lines_of_page = current_page_text.split('\n') + for idx, line in enumerate(lines_of_page): + if line.strip() == current_line.strip(): + page['page_text_clean'] = '\n'.join(lines_of_page[idx+1:]) + + break + break + + return pages + + + +def cleanup_str(txt, use_nltk_tokenizer=False, **page_attrs): + """ + Most of the cleanup occurs here. 
Can be called with a string or a string with page attributes + """ + page_text = txt + # dicts to store specific corrections and their counts + specific_ocr_corrections = [] + specific_linebreak_corrections = [] + specific_long_s_corrections = [] + correction_rules = load_correction_rules() + clever_f_s_hack_rules = load_f_s_hack_corrections() + + # add a dictionary for specific f ſ hack corrections + specific_f_s_hack_corrections = [] + + # counters for corrections + linebreak_corrections = 0 + ocr_corrections = 0 + long_s_corrections = 0 + f_s_word_replacements = 0 + + # rejoin line breaks before tokenization and log corrections + page_text, corrections = rejoin_linebreaks(page_text, specific_linebreak_corrections) + linebreak_corrections += corrections + + # apply correction for long 's' + corrected_text, corrections = replace_historic_long_s(page_text, specific_long_s_corrections) + long_s_corrections += corrections + page_text = corrected_text + + # tokenization + tokens = word_tokenize(page_text) if use_nltk_tokenizer else tokenize_agnostic(page_text) + + # apply OCR corrections on tokens and log corrections + corrected_tokens = [] + for token in tokens: + if token in correction_rules: + corrected_token = correction_rules[token] + ocr_corrections += 1 + specific_ocr_corrections.append((token,corrected_token)) + else: + corrected_token = token + corrected_tokens.append(corrected_token) + + # apply f-ſ-s hack corrections on tokens and log corrections + for i, token in enumerate(corrected_tokens): + if token in clever_f_s_hack_rules: + corrected_token = clever_f_s_hack_rules[token] + f_s_word_replacements += 1 + specific_f_s_hack_corrections.append((token,corrected_token)) + corrected_tokens[i] = corrected_token + + token_count = len(corrected_tokens) + + # convert corrected tokens back to text for further processing + corrected_text = untokenize(corrected_tokens) if use_nltk_tokenizer else untokenize_agnostic(corrected_tokens) + + corrected_tokens_real = [x for x in corrected_tokens if any(y.isalpha() for y in x)] + + # create output dictionary + def as_counts(l): + return l + # return dict(Counter(l)) + + return { + 'page_text':page_text, + **page_attrs, + 'page_text_clean':corrected_text, + # 'page_num_tokens':token_count, + 'page_tokens':corrected_tokens_real, + 'corrections': { + 'headers':as_counts(page_attrs.get('corrections',{}).get('headers',[])), + 'ocr':as_counts(specific_ocr_corrections), + 'linebreaks':as_counts(specific_linebreak_corrections), + 'long_s':as_counts(specific_long_s_corrections), + 'f_s':as_counts(specific_f_s_hack_corrections), + } + } + + + +def cleanup_page(page_d): + """ + Cleanup a page dictionary + """ + txt=page_d.get('page_text_clean', page_d.get('page_text','')) + odx=cleanup_str(txt, **page_d) + return odx + +def cleanup_pages(pages_ld): + """ + Cleanup a list or dataframe of pages + """ + logger.debug('processing headers') + pages_ld = process_headers(pages_ld, remove_headers=True) # ideally, we want to set this later when calling the function + + logger.debug('processing headers') + pages_ld = [cleanup_page(page_d) for page_d in tqdm(pages_ld,position=1,desc='Cleaning up pages',disable=True)] + return pages_ld + + + + + + + + + + + + + + + + + PREPROCESS_FUNCTIONS = OrderedDict( [ - ("strip_short", strip_short), - ("strip_multiple_whitespaces", strip_multiple_whitespaces), - ("strip_punctuation", strip_punctuation), - ("strip_tags", strip_tags), - ("strip_numeric", strip_numeric), - ("lower", lambda x: x.lower()), - ("remove_stopwords", 
remove_stopwords), - ("stem_text", stem_text), + # ("strip_short", strip_short), + # ("strip_multiple_whitespaces", strip_multiple_whitespaces), + # ("strip_punctuation", strip_punctuation), + # ("strip_tags", strip_tags), + # ("strip_numeric", strip_numeric), + # ("lower", lambda x: x.lower()), + # ("remove_stopwords", remove_stopwords), + # ("stem_text", stem_text), ] ) @@ -118,15 +542,15 @@ class SolrCorpus: DOC_CONTENT_FIELD = "content" # Solr field name for document content PAGE_ORDER_FIELD = "order" # Solr field name for page ordering OUTPUT_DOC_FIELDS = dict( - id = '', work_cluster = 'cluster_id_s', work_group = 'group_id_s', work_source = 'source_id', + page_id = '', page_orig = 'label', page_digital = 'order', page_text = 'content', ) - SOURCE_PAGE_ID = 'id' + SOURCE_PAGE_ID = 'page_id' def __init__(self, name, doc_limit=-1, preprocess_fns=None, pbar=True): """ @@ -179,66 +603,23 @@ def __init__(self, name, doc_limit=-1, preprocess_fns=None, pbar=True): self.page_counts = results.get_facets().facet_fields["source_id"] self.doc_ids = self.page_counts.keys() self.doc_count = len(self.doc_ids) - if pbar: - self.pbar = ProgressBar( - redirect_stderr=True, max_value=self.doc_count, max_error=False - ) - else: - self.pbar = NullBar() + # if pbar: + # self.pbar = ProgressBar( + # redirect_stderr=True, max_value=self.doc_count, max_error=False + # ) + # else: + # self.pbar = NullBar() def __iter__(self): - for doc_id in self.doc_ids: + for doc_id in random.sample(self.doc_ids, len(self.doc_ids)): + logger.debug(f'proceeding to doc id {doc_id}') if doc_id not in self.page_counts: logger.warning( "Unknown page count for doc {}. Skipping.".format(doc_id) ) continue - - result = ( - SolrQuerySet() - .search(**{SolrCorpus.DOC_ID_FIELD: doc_id}) - .order_by(SolrCorpus.PAGE_ORDER_FIELD) - ) - # populate the result cache with number of rows specified - docs = result.get_results(rows=self.page_counts[doc_id]) - - metadata_docs = [d for d in docs if d["item_type"] == "work"] - - n_metadata_docs = len(metadata_docs) - if n_metadata_docs > 0: - if n_metadata_docs > 1: - logger.warning( - "Multiple metadata records found for doc ID" - "{}. 
Using the first.".format(doc_id) - ) - - metadata_doc = metadata_docs[0] - self._metadata[doc_id] = {k: v for k, v in metadata_doc.items()} - if self.metadata_field_names is None: - self.metadata_field_names = list(metadata_doc.keys()) - else: - logger.warning("No metadata record found for doc ID {}.".format(doc_id)) - - # filter out pages that have no content; - # combine all pages into one string - - metadocs=[] - for i,doc in enumerate(docs): - if not SolrCorpus.DOC_CONTENT_FIELD in doc: - metadocs.append(doc) - else: - yield self._transform_doc(doc) - self.pbar.update(self.pbar.value + 1) - - def _transform_doc(self, doc): - odoc = OrderedDict({ - key_new:doc.get(key_orig,'') - for key_new,key_orig in ( - SolrCorpus.OUTPUT_DOC_FIELDS.items() - ) - }) - odoc[SolrCorpus.SOURCE_PAGE_ID] = f'{odoc["work_source"]}_{odoc["page_orig"]}' - return odoc + logger.debug('querying solr') + yield doc_id def _save_dictionary(self, filepath, as_text=False): """ @@ -279,25 +660,89 @@ def save(self, path, save_dict=True, save_dict_as_text=False, save_metadata=Fals path_texts = os.path.join(path,'texts') path_metadata = os.path.join(path,'metadata.csv') if not os.path.isdir(path_texts): makedirs(path_texts) - - - - for i,d in enumerate(self): - pprint(d) - print() - if i>2000: break - - - # # There's no way to completely turn off the progress ticker for Gensim - # # serialize - we simply set it's frequency to one more than the no. of - # # documents we have, so it will effectively be shut off. - # corpora.MmCorpus.serialize(corpus_path, self, progress_cnt=self.doc_count + 1) - - # if save_dict: - # self._save_dictionary(corpus_path + ".dict", as_text=save_dict_as_text) - # if save_metadata: - # self._save_metadata(corpus_path + ".metadata") - + num_cpu=mp.cpu_count() // 2 if mp.cpu_count()>1 else 1 + pool = mp.Pool(num_cpu) + tasks = [] + + def iter_tasks(): + for i,obj in enumerate(self): + yield (path_texts, self.page_counts, obj) + + for obj in iter_tasks(): + tasks.append(pool.apply_async(_do_save, (obj,))) + + # close the process pool + filenames = [] + for task in tqdm(tasks,position=0,desc=f'Saving corpus [{num_cpu}x]'): + res = task.get() + filenames.append(res) + + return filenames + + +def _iter_group_pages(doc_id, page_counts): + result = ( + SolrQuerySet() + .search(**{SolrCorpus.DOC_ID_FIELD: doc_id}) + .order_by(SolrCorpus.PAGE_ORDER_FIELD) + ) + # populate the result cache with number of rows specified + docs = result.get_results(rows=page_counts[doc_id]) + logger.debug(f'found {len(docs)} documents') + + metadata_docs = [d for d in docs if d["item_type"] == "work"] + logger.debug(f'found {len(metadata_docs)} metadata documents') + + page_docs = [d for d in docs if d["item_type"] == "page"] + logger.debug(f'found {len(page_docs)} page documents') + + logger.debug(f'sorting page documents') + page_docs.sort(key = lambda d: (d['source_id'], d['order'])) + work_page_docs = defaultdict(list) + for pdoc in page_docs: + work_page_docs[pdoc['group_id_s']].append(pdoc) + + # filter out pages that have no content; + # combine all pages into one string + logger.debug(f'iterating over {len(work_page_docs)} groups') + for group_id,source_pages in work_page_docs.items(): + logger.debug(f'proceeding to group {group_id} within document id {doc_id}') + logger.debug(f'reformatting page document dictionaries') + pages_ld = [ + _transform_doc(doc) + for doc in source_pages + ] + assert all([ + (doc['work_group'] == group_id) + for doc in pages_ld + ]) + + yield group_id,pages_ld + + + +def 
_transform_doc(doc): + odoc = OrderedDict({ + key_new:doc.get(key_orig,'') + for key_new,key_orig in ( + SolrCorpus.OUTPUT_DOC_FIELDS.items() + ) + }) + odoc[SolrCorpus.SOURCE_PAGE_ID] = f'{odoc["work_source"]}_{odoc["page_orig"]}' + return odoc + +def _do_save(obj): + path_texts, page_counts, doc_id = obj + filenames = [] + for group_id,pages_ld in _iter_group_pages(doc_id, page_counts): + logger.debug(f'applying cleanup preprocessing') + pages_ld = cleanup_pages(pages_ld) + filename = os.path.join(path_texts, group_id.replace('/','|')+'.json') + with open(filename,'w') as of: + json.dump(pages_ld, of, indent=4, sort_keys=True) + filenames.append(filename) + return filenames + class Command(BaseCommand): """Custom manage command to generate a token corpus from text indexed in Solr""" @@ -357,9 +802,11 @@ def handle(self, *args, **options): pbar=not options["no_progress"], ) - corpus.save( + filenames_saved = corpus.save( options["path"], save_dict=not options["no_dictionary"], save_dict_as_text=options["dictionary_as_text"], save_metadata=not options["no_metadata"], ) + + print(f'Successfully saved {len(filenames_saved)} json files') From cdc7bf220e77c1e14a1310c3fd8164a930a59438 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Thu, 30 Nov 2023 16:35:10 -0500 Subject: [PATCH 3/8] generate and tests --- .../commands/generate_textcorpus.py | 896 +++--------------- ppa/archive/tests/test_generate_textcorpus.py | 103 ++ 2 files changed, 246 insertions(+), 753 deletions(-) create mode 100644 ppa/archive/tests/test_generate_textcorpus.py diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py index 60d0905e4..3e13dc22c 100644 --- a/ppa/archive/management/commands/generate_textcorpus.py +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -2,760 +2,188 @@ **generate_textcorpus** is a custom manage command to generate a plain text corpus from Solr. It should be run *after* content has been indexed into Solr via the **index** manage command. - -Vineet Bansal authored the parts of the code below that iterate over the Solr- -indexed corpus. Wouter Haverals authored the functions that preprocess the -returned page texts: cleaning OCR, rejoining lines, and other useful processes. - -By default, *all* documents found in the Solr index are serialized. -This can be controlled using -`-doc-limit`, -which denotes the maximum no. of documents to serialize. This is especially -useful for development, or for sanity-testing your Solr installation. - -For corpus generation, the following pre-processing options are available via -the `--preprocess` flag:: - - # Lower-cases words - 'lower' - # Strips HTML tags - 'strip_tags' - # Strips punctuation - 'strip_punctuation' - # Collapses multiple whitespaces into one - 'strip_multiple_whitespaces' - # Strips numeric characters - 'strip_numeric' - # Removes stopwords - Note that the set of default stopwords used by Gensim - # is from Stone, Denis, Kwantes (2010). - 'remove_stopwords' - # Strip short words. The lower limit on word length is 3. - 'strip_short' - # Use Porter Stemmer for word-normalization. - 'stem_text' - -IMPORTANT - NO preprocessing filters are applied by default, but you will -typically at least want to use `lower`. -Multiple preprocessing filters can be applied (in order) by specifying multiple -`--preprocess` flags. 
- -Example usage:: - - # Save all files to the 'data' folder, with bare-minimum preprocessing - python manage.py generate_corpus --path data --preprocess lower - --preprocess strip_tags - - # Restrict corpus to 1000 documents - python manage.py generate_corpus --path data --doc-limit 1000 - --preprocess lower --preprocess strip_tags - - # Don't generate dictionary; don't generate metadata - python manage.py generate_corpus --path data --doc-limit 1000 - --preprocess lower --no-dictionary --no-metadata - """ -import csv -import logging -logging.getLogger().handlers.clear() -logger = logging.getLogger(__name__) -logger.setLevel(logging.ERROR) - - -import os.path -from collections import OrderedDict -from os import makedirs -from pprint import pprint,pformat - +import os +import orjson from django.core.management.base import BaseCommand -from gensim import corpora -from gensim.corpora.dictionary import Dictionary from parasolr.django import SolrQuerySet -from progressbar import NullBar, ProgressBar - - - -### PREPROCESSING CODE ### - -PATH_HERE = os.path.abspath(os.path.dirname(__file__)) -PATH_REPO = os.path.abspath(os.path.join( - PATH_HERE, - '..', # management - '..', # archive - '..', # ppa - '..' # ppa-django -)) -PATH_REPO_DATA = os.path.join(PATH_REPO, 'data') -PATH_OCR_RULESETS = os.path.join(PATH_REPO_DATA, 'ocr_cleanup_rulesets') - - -# imports... -import os,sys,warnings,random -warnings.filterwarnings('ignore') -from functools import cache +from collections import defaultdict, OrderedDict +import logging from tqdm import tqdm -from sqlitedict import SqliteDict -import orjson,zlib import pandas as pd -from intspan import intspan -import jsonlines -import multiprocessing as mp -mp.set_start_method('fork') - -## ocr correction imports -import re -import wordfreq -import os -import json -import nltk -from nltk.tokenize import word_tokenize -import pandas as pd -from tqdm import tqdm -tqdm.pandas() -from collections import defaultdict -from difflib import SequenceMatcher -from functools import cached_property -from collections import Counter -import gzip -# nltk.download('punkt') - - +from typing import Tuple +from contextlib import contextmanager +import logging -def tokenize_agnostic(txt): - return re.findall(r"[\w']+|[.,!?; -—–'\n]", txt) +class SolrCorpus: + """Custom class to generate a text corpus from Solr""" -def untokenize_agnostic(l): - return ''.join(l) + # Class attributes that rarely, if ever, need to change + DOC_ID_FIELD = "group_id_s" # Solr field name for document identifier (not source_id, which is same across excerpts) + PAGE_ORDER_FIELD = "order" # Solr field name for page ordering + OUTPUT_DOC_FIELDS = dict( + page_num_orig = 'label', + page_num_digi = 'order', + page_text = 'content', + ) + PAGE_ID_FIELD = 'page_id' + WORK_ID_FIELD = 'work_id' + PAGE_NUM_FIELD = 'page_num_orig' + PAGE_SORT_FIELD = 'page_num_digi' -def remove_trailing_punctuation(word): - """ - remove trailing punctuation and spaces - don't remove the dash '-', as this might interfere with the function to repair broken words! - Question: should we also remove punct at the beginning of the token...? Not doing that now. - - # small test - word = "...example.,...! 
" - clean_word = remove_trailing_punctuation(word) - print(clean_word) - """ - return re.sub(r'[\.,\?!"\')(:;`]+\s*$', '', word) - - - - - - -# process a list of word pairs, where each pair consists of an 'incorrect' word with a historic long 's' (ſ) and its 'correct' modern equivalent -# the script then replaces the historic long 's' (ſ) words with 'f', generates new word pairs -# ONLY if the newly generated f-word does NOT exist in the English language, we retain the word!! For this, we use language stats provided by wordfreq -# the resulting pairs are then written to the outfile, while pairs that exists -- with high frequency in English -- are written to a separate disregard_file -# i think this is clever, so i named the function accordingly :-) - -def generate_clever_f_s_hack(source_file, output_file, disregard_file, skip_words=None, frequency_threshold=1e-6): - if skip_words is None: - skip_words = {'ſlip'} # add specific words to skip here -- dunno if this is still useful, the file will capture most of these words - - unique_pairs = set() # set to keep track of unique (incorrect f-word, correct s-word) pairs - - with open(source_file, 'r') as infile, open(output_file, 'w') as outfile, open(disregard_file, 'w') as disregard: - # skip the title line of the infile - next(infile) - - for line in infile: - parts = line.strip().split('\t') - if len(parts) < 3: - continue - - incorrect, correct = parts[:2] - # e.g.: - # incorrect correct - # moſt most - # muſt must - # ſo so - # ſome some - # ſee see etc. - - # strip leading/trailing spaces - incorrect = incorrect.strip() - correct = correct.strip() - - # remove trailing punctuation - incorrect = remove_trailing_punctuation(incorrect) - correct = remove_trailing_punctuation(correct) - - # replace 'ſ' with 'f' in the incorrect word - f_incorrect = incorrect.replace('ſ', 'f') - # e.g.: - # incorrect correct - # moft most - # muft must - # fo so - # fome some - # fee see etc. - - # skip if the incorrect word is in skip_words or already in pairs - if f_incorrect in skip_words or (f_incorrect, correct) in unique_pairs: - continue - - # check the frequency of the word - word_frequency = wordfreq.word_frequency(f_incorrect.lower(), 'en') - - # skip if the word exists and its frequency is above the threshold - if word_frequency > frequency_threshold: - disregard.write(f"{f_incorrect}\t{correct}\n") - #print(f'Word that exist with the f-spelling and we don\'t want to include: {f_incorrect}') - # e.g. - # Words that exist with the f-spelling and we don't want to include: fame - # Words that exist with the f-spelling and we don't want to include: found etc. - continue - - # check if the generated word exists in English - if word_frequency <= frequency_threshold: - outfile.write(f"{f_incorrect}\t{correct}\n") - unique_pairs.add((f_incorrect, correct)) - # e.g. - # moft most - # muft must - # fo so - # fome some etc. 
- -# apply -# generate_clever_f_s_hack( -# source_file=os.path.join(PATH_OCR_RULESETS, "all_long_s_corrections_log.txt"), -# output_file=os.path.join(PATH_OCR_RULESETS, "clever_f_ſ_hack.txt"), -# disregard_file=os.path.join(PATH_OCR_RULESETS, "disregard_fſs_replacements.txt") -# ) - - - - - -@cache -def load_correction_rules(file_path = os.path.join(PATH_OCR_RULESETS, 'CorrectionRules.txt')): - correction_rules = {} - with open(file_path, 'r') as file: - for line in file: - parts = line.strip().split('\t') - if len(parts) >= 2: - incorrect, correct = parts[:2] - correction_rules[incorrect] = correct - return correction_rules - - -def correct_ocr_errors(text, correction_rules): - corrections = 0 - for incorrect, correct in correction_rules.items(): - if incorrect in text: - text = text.replace(incorrect, correct) - corrections += 1 - return text, corrections - -def rejoin_linebreaks(text, specific_linebreak_corrections): - """ - function to addresses the issue of words that are split between two lines due to a line break, typically indicated by a hyphen - the function rejoins such words - """ - corrections = 0 - parts = text.split('-\n') - corrected_text = parts[0] - for part in parts[1:]: - corrected_text_words = corrected_text.split() - part_words = part.split() - - if corrected_text_words and part_words: # check if both lists are not empty - last_word_before_break = corrected_text_words[-1] - first_word_after_break = part_words[0] - - # form the broken word and the corrected word - broken_word = last_word_before_break + '-\n' + first_word_after_break - corrected_word = last_word_before_break + first_word_after_break - - # log the correction (gets later written to the txt file) - # specific_linebreak_corrections[broken_word + " \t " + corrected_word] += 1 - specific_linebreak_corrections.append((broken_word,corrected_word)) - - corrected_text += part - corrections += 1 - else: - # if either part is empty or doesn't contain words, simply append a hyphen - corrected_text += '-' + part - - return corrected_text, corrections - -def replace_historic_long_s(text, long_s_corrections): - """ - function to replaces the historic long 's' (ſ) with the regular 's' - - :text: text to be processed - :long_s_corrections: dictionary to log specific corrections and their counts - :return: tuple of processed text with long 's' replaced, and the number of corrections made - """ - corrected_text = text.replace('ſ', 's') - corrections = 0 - if corrected_text != text: - words_with_long_s = set(text.split()) - set(corrected_text.split()) - for word in words_with_long_s: - corrected_word = word.replace('ſ', 's') - long_s_corrections.append((word,corrected_word)) - corrections += 1 - return corrected_text, corrections - -@cache -def load_f_s_hack_corrections(file_path = os.path.join(PATH_OCR_RULESETS, "clever_f_ſ_hack.txt")): - """ - little helper script to load the f-->s words (from generate_clever_f_s_hack) into a dict, for convenient lookup - """ - correction_rules = {} - with open(file_path, 'r') as file: - for line in file: - parts = line.strip().split() - if len(parts) >= 2: - incorrect, correct = parts[:2] - correction_rules[incorrect] = correct - return correction_rules - -def process_headers(pages, remove_headers=True, similarity_threshold=80): - """ - function to identifies and optionally removes running headers - inspired by Ted Underwood's GREAT headerfinder script: https://github.com/tedunderwood/DataMunging/blob/master/runningheaders/HeaderFinder.py - some changes made: - - flexibility to remove 
headers or just identify them (just by setting the boolean value) - - we don't explicitly handle roman numerals, the line comparison logic (combining str.isalpha and a threshold for fuzzy matching) should take care of it - - :pages: list of dicts, each representing a page with 'page_text' - :remove_headers: bool, if set to True --> removes identified headers, otherwise just identifies them and wirtes them to the log - :similarity_threshold: int, threshold for fuzzy matching to consider lines as similar (default 80 seems to work well) - :return: list of pages with headers - """ - identified_headers = [] - headers_set = set() - - def get_substantial_lines(page_text): - """ - helper function: if the processed line contains less than 5 characters, or if the line consists solely of digits - it is considered insubstantial and is skipped + def __init__(self, path, doc_limit=-1): """ - lines = page_text.split('\n') - substantial_lines = [] - for line in lines: - if len(line.strip()) < 5 or line.strip().isdigit(): - continue - substantial_lines.append(line) - if len(substantial_lines) == 2: - break - return substantial_lines - - numpages = len(pages) - iterr = range(numpages) - iterr = tqdm(iterr, total=numpages, position=1, desc='Preprocessing headers',disable=True) - for i in iterr: - page = pages[i] - if not 'corrections' in page: page['corrections']={} - if not 'headers' in page['corrections']: page['corrections']['headers']=[] - current_page_text = pages[i]['page_text'] - current_substantial_lines = get_substantial_lines(current_page_text) - - header_found = False - - # determine the range of pages to compare with - start_index = max(0, i - 2) - end_index = min(len(pages), i + 3) - if i == len(pages) - 1: # Special handling for the last page - start_index = max(0, i - 2) # Compare with pages before - - for j in range(start_index, end_index): - if i == j: - continue - - comparison_page_text = pages[j]['page_text'] - comparison_substantial_lines = get_substantial_lines(comparison_page_text) - - for current_line in current_substantial_lines: - for comparison_line in comparison_substantial_lines: - # line comparison logic, considering possible page numbers - cleaned_current_line = ''.join(filter(str.isalpha, current_line)) - cleaned_comparison_line = ''.join(filter(str.isalpha, comparison_line)) - - s = SequenceMatcher(None, cleaned_current_line, cleaned_comparison_line) - similarity = s.ratio() * 100 - - if similarity > similarity_threshold: - header_key = (i, current_line) - if header_key not in headers_set: - identified_headers.append(header_key) - headers_set.add(header_key) - if remove_headers: - header_found = True - break - - if header_found: - correx=(current_line,'') - if correx not in set(page['corrections']['headers']): - page['corrections']['headers'].append(correx) - lines_of_page = current_page_text.split('\n') - for idx, line in enumerate(lines_of_page): - if line.strip() == current_line.strip(): - page['page_text_clean'] = '\n'.join(lines_of_page[idx+1:]) - - break - break - - return pages - - - -def cleanup_str(txt, use_nltk_tokenizer=False, **page_attrs): - """ - Most of the cleanup occurs here. 
Can be called with a string or a string with page attributes - """ - page_text = txt - # dicts to store specific corrections and their counts - specific_ocr_corrections = [] - specific_linebreak_corrections = [] - specific_long_s_corrections = [] - correction_rules = load_correction_rules() - clever_f_s_hack_rules = load_f_s_hack_corrections() - - # add a dictionary for specific f ſ hack corrections - specific_f_s_hack_corrections = [] - - # counters for corrections - linebreak_corrections = 0 - ocr_corrections = 0 - long_s_corrections = 0 - f_s_word_replacements = 0 - - # rejoin line breaks before tokenization and log corrections - page_text, corrections = rejoin_linebreaks(page_text, specific_linebreak_corrections) - linebreak_corrections += corrections - - # apply correction for long 's' - corrected_text, corrections = replace_historic_long_s(page_text, specific_long_s_corrections) - long_s_corrections += corrections - page_text = corrected_text - - # tokenization - tokens = word_tokenize(page_text) if use_nltk_tokenizer else tokenize_agnostic(page_text) - - # apply OCR corrections on tokens and log corrections - corrected_tokens = [] - for token in tokens: - if token in correction_rules: - corrected_token = correction_rules[token] - ocr_corrections += 1 - specific_ocr_corrections.append((token,corrected_token)) - else: - corrected_token = token - corrected_tokens.append(corrected_token) - - # apply f-ſ-s hack corrections on tokens and log corrections - for i, token in enumerate(corrected_tokens): - if token in clever_f_s_hack_rules: - corrected_token = clever_f_s_hack_rules[token] - f_s_word_replacements += 1 - specific_f_s_hack_corrections.append((token,corrected_token)) - corrected_tokens[i] = corrected_token - - token_count = len(corrected_tokens) - - # convert corrected tokens back to text for further processing - corrected_text = untokenize(corrected_tokens) if use_nltk_tokenizer else untokenize_agnostic(corrected_tokens) - - corrected_tokens_real = [x for x in corrected_tokens if any(y.isalpha() for y in x)] - - # create output dictionary - def as_counts(l): - return l - # return dict(Counter(l)) - - return { - 'page_text':page_text, - **page_attrs, - 'page_text_clean':corrected_text, - # 'page_num_tokens':token_count, - 'page_tokens':corrected_tokens_real, - 'corrections': { - 'headers':as_counts(page_attrs.get('corrections',{}).get('headers',[])), - 'ocr':as_counts(specific_ocr_corrections), - 'linebreaks':as_counts(specific_linebreak_corrections), - 'long_s':as_counts(specific_long_s_corrections), - 'f_s':as_counts(specific_f_s_hack_corrections), - } - } - + A class encapsulating a Solr Client specification, that yields + Bag-of-Word vectors on iteration, and thus acts as a Gensim Corpus. + :param path: A string to a path for the corpus output. + :param doc_limit: Max no. of documents to process. The default of -1 + means we process ALL documents found. 
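+
+        For reference, save() writes one JSON file of page dictionaries per
+        work plus a work-level metadata CSV, roughly laid out as::
+
+            <path>/texts/<work_id>.json   # list of dicts with keys page_id,
+                                          #   page_num_orig, page_num_digi, page_text
+            <path>/metadata.csv           # one row per work, indexed by work_id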
+ """ + # root path for corpus + self.path = path -def cleanup_page(page_d): - """ - Cleanup a page dictionary - """ - txt=page_d.get('page_text_clean', page_d.get('page_text','')) - odx=cleanup_str(txt, **page_d) - return odx + # limit docs queried + self.doc_limit = doc_limit -def cleanup_pages(pages_ld): - """ - Cleanup a list or dataframe of pages - """ - logger.debug('processing headers') - pages_ld = process_headers(pages_ld, remove_headers=True) # ideally, we want to set this later when calling the function + # subsequent paths + self.path_texts = os.path.join(self.path,'texts') + self.path_metadata = os.path.join(self.path,'metadata.csv') + + # query to get initial results + results = SolrQuerySet().facet(self.DOC_ID_FIELD, limit=self.doc_limit) + # store page counts and doc ids + self.page_counts = results.get_facets().facet_fields[self.DOC_ID_FIELD] + self.doc_ids = self.page_counts.keys() + self.doc_count = len(self.doc_ids) - logger.debug('processing headers') - pages_ld = [cleanup_page(page_d) for page_d in tqdm(pages_ld,position=1,desc='Cleaning up pages',disable=True)] - return pages_ld - - - - - - - - - - + @staticmethod + def _get_id(doc_id:str) -> str: + """Method to make a file-safe version of a document ID""" + return doc_id.replace('/','|') + def _get_meta_pages(self, doc_id:str) -> Tuple[dict,list]: + """Get metadata (dictionary) and pages (list of dictionaries) for a given document""" + + # get file safe work_id + work_id = self._get_id(doc_id) + + # query + result = ( + SolrQuerySet() + .search(**{self.DOC_ID_FIELD: doc_id}) + .order_by(self.PAGE_ORDER_FIELD) + ) + # populate the result cache with number of rows specified + docs = [ + doc + for doc in result.get_results(rows=self.page_counts[doc_id]) + if doc[self.DOC_ID_FIELD]==doc_id + ] + # find the metadata doc + metadata_docs = [d for d in docs if d["item_type"] == "work"] + print('?',metadata_docs) + assert len(metadata_docs)==1 + meta = {self.WORK_ID_FIELD:work_id, **metadata_docs[0]} + # find the pages docs + page_docs = [d for d in docs if d["item_type"] == "page"] + # transform into new dictionary with keys in `self.PAGE_ID_FIELD` and `self.OUTPUT_DOC_FIELDS` + pages = [self._transform_doc(doc,meta) for doc in page_docs] + # make sure sorted by numeric page num (i.e. 
"digital") + pages.sort(key=lambda page: page[self.PAGE_SORT_FIELD]) + return meta, pages + def _transform_doc(self,doc:dict,meta:dict) -> dict: + """Reformat document dictionary""" -PREPROCESS_FUNCTIONS = OrderedDict( - [ - # ("strip_short", strip_short), - # ("strip_multiple_whitespaces", strip_multiple_whitespaces), - # ("strip_punctuation", strip_punctuation), - # ("strip_tags", strip_tags), - # ("strip_numeric", strip_numeric), - # ("lower", lambda x: x.lower()), - # ("remove_stopwords", remove_stopwords), - # ("stem_text", stem_text), - ] -) + # get new dictionary + odoc={ + key_new:doc.get(key_orig,'') + for key_new,key_orig in ( + self.OUTPUT_DOC_FIELDS.items() + ) + } + # return with page id + return { + self.PAGE_ID_FIELD:f'{meta[self.WORK_ID_FIELD]}_{odoc[self.PAGE_NUM_FIELD]}', + **odoc + } + + def _save_doc(self,doc_id:str) -> Tuple[str,dict]: + """Save document pages as json and return filename along with document's metadata""" + + # get metadata and pages for this doc + meta,pages = self._get_meta_pages(doc_id) + + # if pages, save json + if pages: + filename = os.path.join(self.path_texts, meta[self.WORK_ID_FIELD]+'.json') + os.makedirs(self.path_texts,exist_ok=True) + with open(filename,'wb') as of: + of.write(orjson.dumps(pages,option=orjson.OPT_INDENT_2)) + + # otherwise, returned filename is blank to indicate no file saved + else: + filename='' + + return filename,meta -class SolrCorpus: - """Custom class to generate a text corpus from Solr""" - # Class attributes that rarely, if ever, need to change - DOC_ID_FIELD = "source_id" # Solr field name for document identifier - DOC_CONTENT_FIELD = "content" # Solr field name for document content - PAGE_ORDER_FIELD = "order" # Solr field name for page ordering - OUTPUT_DOC_FIELDS = dict( - work_cluster = 'cluster_id_s', - work_group = 'group_id_s', - work_source = 'source_id', - page_id = '', - page_orig = 'label', - page_digital = 'order', - page_text = 'content', - ) - SOURCE_PAGE_ID = 'page_id' + def save(self): + """Save the generated corpus text and metadata to files on disk""" - def __init__(self, name, doc_limit=-1, preprocess_fns=None, pbar=True): - """ - A class encapsulating a Solr Client specification, that yields - Bag-of-Word vectors on iteration, and thus acts as a Gensim Corpus. + # save docs and gather metadata + metadata=[] + pdesc='Saved text to' + pbar=tqdm(total=self.doc_count, desc=f'{pdesc}: ...') + for doc_id in self.doc_ids: + # get saved filename and found metadata for this document + fn,meta = self._save_doc(doc_id) - :param name: A string name of this corpus. - Used as a string prefix for generated files. - :param client: A SolrClient.SolrClient object used to interface with - Solr - :param collection: A string representing the Solr collection name. - :param doc_limit: Max no. of documents to process. The default of -1 - means we process ALL documents found. - :param preprocess_fns: A list of single-argument functions to use as - preprocessors. - See the module gensim.parsing.preprocessing for some typical - preprocessing functions. - :param pbar: A boolean indicating whether to display a progress bar - during corpus generation. 
- """ - self.name = name - self.doc_limit = doc_limit + # if we saved, update progress bar desc + if fn: pbar.set_description(f'{pdesc}: {fn}') - if preprocess_fns is not None: - if "ALL" in preprocess_fns: - self.preprocess_fns = PREPROCESS_FUNCTIONS.values() - else: - self.preprocess_fns = [PREPROCESS_FUNCTIONS[k] for k in preprocess_fns] - else: - self.preprocess_fns = [] + # tick + pbar.update() - self.dictionary = Dictionary() + # add this doc's meta to metadata + metadata.append(meta) + pbar.close() - # doc_id -> dict of k->v mappings - # NOTE: We cannot use 'metadata' as GenSim mangles this attribute! - self._metadata = {} + # save metadata csv + dfmeta=pd.DataFrame(metadata).set_index(self.WORK_ID_FIELD).fillna('') + dfmeta.to_csv(self.path_metadata) + print(f'Saved metadata to: {self.path_metadata}') - # list of strings, populated on first doc retrieval - self.metadata_field_names = None - # facet on document id to get counts of pages by work - results = SolrQuerySet().facet(SolrCorpus.DOC_ID_FIELD, limit=self.doc_limit) - """ - An OrderedDict of doc_id => page count mapping - An OrderedDict is important here in case we want to save document-level - metadata, in which case rows of metadata would be in the same order as - the BoW-vectors returned by this object's iterator. - """ - self.page_counts = results.get_facets().facet_fields["source_id"] - self.doc_ids = self.page_counts.keys() - self.doc_count = len(self.doc_ids) - # if pbar: - # self.pbar = ProgressBar( - # redirect_stderr=True, max_value=self.doc_count, max_error=False - # ) - # else: - # self.pbar = NullBar() - - def __iter__(self): - for doc_id in random.sample(self.doc_ids, len(self.doc_ids)): - logger.debug(f'proceeding to doc id {doc_id}') - if doc_id not in self.page_counts: - logger.warning( - "Unknown page count for doc {}. Skipping.".format(doc_id) - ) - continue - logger.debug('querying solr') - yield doc_id - - def _save_dictionary(self, filepath, as_text=False): - """ - Save dictionary at a specified path, either as a picked Gensim - Dictionary object, or a .txt file - :param filepath: File path for saved dictionary - :param as_text: Whether to save as a plaintext file, where the - 0-indexed line number denotes the token id. 
- :return: None - """ - if as_text: - with open(filepath, "w", encoding="utf8") as f: - f.writelines( - [self.dictionary[i] + "\n" for i in range(len(self.dictionary))] - ) - else: - self.dictionary.save(filepath) - - def _save_metadata(self, filepath): - if self.metadata_field_names is None: - raise RuntimeError("Unable to determine metadata field names!") - - with open(filepath, "w", encoding="utf8", newline="") as f: - writer = csv.writer(f) - writer.writerow(self.metadata_field_names) # header row - - for doc_id in self.doc_ids: - metadata = self._metadata[doc_id] - writer.writerow( - [ - metadata.get(field_name) - for field_name in self.metadata_field_names - ] - ) - - def save(self, path, save_dict=True, save_dict_as_text=False, save_metadata=False): - """Save the generated corpus text and metadata to files on disk""" - path_texts = os.path.join(path,'texts') - path_metadata = os.path.join(path,'metadata.csv') - if not os.path.isdir(path_texts): makedirs(path_texts) - num_cpu=mp.cpu_count() // 2 if mp.cpu_count()>1 else 1 - pool = mp.Pool(num_cpu) - tasks = [] - - def iter_tasks(): - for i,obj in enumerate(self): - yield (path_texts, self.page_counts, obj) - - for obj in iter_tasks(): - tasks.append(pool.apply_async(_do_save, (obj,))) - - # close the process pool - filenames = [] - for task in tqdm(tasks,position=0,desc=f'Saving corpus [{num_cpu}x]'): - res = task.get() - filenames.append(res) - - return filenames -def _iter_group_pages(doc_id, page_counts): - result = ( - SolrQuerySet() - .search(**{SolrCorpus.DOC_ID_FIELD: doc_id}) - .order_by(SolrCorpus.PAGE_ORDER_FIELD) - ) - # populate the result cache with number of rows specified - docs = result.get_results(rows=page_counts[doc_id]) - logger.debug(f'found {len(docs)} documents') - - metadata_docs = [d for d in docs if d["item_type"] == "work"] - logger.debug(f'found {len(metadata_docs)} metadata documents') - - page_docs = [d for d in docs if d["item_type"] == "page"] - logger.debug(f'found {len(page_docs)} page documents') - - logger.debug(f'sorting page documents') - page_docs.sort(key = lambda d: (d['source_id'], d['order'])) - work_page_docs = defaultdict(list) - for pdoc in page_docs: - work_page_docs[pdoc['group_id_s']].append(pdoc) - - # filter out pages that have no content; - # combine all pages into one string - logger.debug(f'iterating over {len(work_page_docs)} groups') - for group_id,source_pages in work_page_docs.items(): - logger.debug(f'proceeding to group {group_id} within document id {doc_id}') - logger.debug(f'reformatting page document dictionaries') - pages_ld = [ - _transform_doc(doc) - for doc in source_pages - ] - assert all([ - (doc['work_group'] == group_id) - for doc in pages_ld - ]) - - yield group_id,pages_ld +@contextmanager +def logging_disabled(highest_level=logging.CRITICAL): + """Quick way to suppress solr logs as we iterate. 
Taken from https://gist.github.com/simon-weber/7853144""" + previous_level = logging.root.manager.disable + logging.disable(highest_level) + try: yield + finally: logging.disable(previous_level) -def _transform_doc(doc): - odoc = OrderedDict({ - key_new:doc.get(key_orig,'') - for key_new,key_orig in ( - SolrCorpus.OUTPUT_DOC_FIELDS.items() - ) - }) - odoc[SolrCorpus.SOURCE_PAGE_ID] = f'{odoc["work_source"]}_{odoc["page_orig"]}' - return odoc - -def _do_save(obj): - path_texts, page_counts, doc_id = obj - filenames = [] - for group_id,pages_ld in _iter_group_pages(doc_id, page_counts): - logger.debug(f'applying cleanup preprocessing') - pages_ld = cleanup_pages(pages_ld) - filename = os.path.join(path_texts, group_id.replace('/','|')+'.json') - with open(filename,'w') as of: - json.dump(pages_ld, of, indent=4, sort_keys=True) - filenames.append(filename) - return filenames - class Command(BaseCommand): - """Custom manage command to generate a token corpus from text indexed in Solr""" + """Custom manage command to generate a text corpus from text indexed in Solr""" def add_arguments(self, parser): parser.add_argument( "--path", required=True, help="Directory path to save corpus file(s)." ) - parser.add_argument( - "--name", - default="corpus", - help="Name prefix to use for all saved corpus file(s).", - ) parser.add_argument( "--doc-limit", @@ -764,49 +192,11 @@ def add_arguments(self, parser): help="Limit on the number of documents for corpus generation." "The default of -1 considers ALL documents.", ) - parser.add_argument( - "--no-dictionary", - action="store_true", - help="Do not save corpus dictionary.", - ) - parser.add_argument( - "--dictionary-as-text", - action="store_true", - help="If saving dictionary, save as a plaintext file.", - ) - parser.add_argument( - "--no-metadata", - action="store_true", - default=False, - help="Do not save corpus metadata.", - ) - parser.add_argument( - "--no-progress", - action="store_true", - help="Do not display progress bar to track the status of the" "command.", - ) - parser.add_argument( - "--preprocess", - action="append", - choices=list(PREPROCESS_FUNCTIONS.keys()) + ["ALL"], - help="Pre-processing filter(s) to apply. Multiple filters can be" - "applied (in order) by adding multiple --preprocess flags." 
- "Use ALL to apply all pre-processing filters.", - ) - - def handle(self, *args, **options): - corpus = SolrCorpus( - name=options["name"], - doc_limit=options["doc_limit"], - preprocess_fns=options["preprocess"], - pbar=not options["no_progress"], - ) - filenames_saved = corpus.save( - options["path"], - save_dict=not options["no_dictionary"], - save_dict_as_text=options["dictionary_as_text"], - save_metadata=not options["no_metadata"], - ) - print(f'Successfully saved {len(filenames_saved)} json files') + def handle(self, *args, **options): + with logging_disabled(): + SolrCorpus( + path=options["path"], + doc_limit=options["doc_limit"], + ).save() \ No newline at end of file diff --git a/ppa/archive/tests/test_generate_textcorpus.py b/ppa/archive/tests/test_generate_textcorpus.py new file mode 100644 index 000000000..62645af97 --- /dev/null +++ b/ppa/archive/tests/test_generate_textcorpus.py @@ -0,0 +1,103 @@ +from unittest.mock import patch +import json +import pytest +from django.core.management import call_command +from django.core.management.base import CommandError +import pandas as pd +import os + +# mock results for acet query used to get document IDs and page counts +mock_solr_facets = {"group_id_s": {"doc_1": 2, "doc_2": 1}} + +# mock result for solr document data +mock_solr_docs = [ + # The first record has item_type='work' and contains metadata for the + # document + {"item_type": "work", "pub_year": 1863, "group_id_s":"doc_1"}, + # If multiple metadata rows are found, the first one (above) is used + # Subsequent records have item_type='page', page-order specified by + # 'order', with content in 'content' + { + "item_type": "page", + "order": 1, + "content": "Four score and seven years ago our fathers brought forth" + " on this continent, a new nation, ", + "group_id_s":"doc_1", + "label":'i' + }, + { + "item_type": "page", + "order": 2, + "content": "conceived in Liberty, and dedicated to the proposition" + " that all men are created equal.", + "group_id_s":"doc_1", + "label":'ii' + }, + + + + {"item_type": "work", "pub_year": "unknown","group_id_s":"doc_2"}, + { + "item_type": "page", + "order": 3, + "content": "!!!!!", + "group_id_s":"doc_2", + "label":"2" + }, +] + + +@pytest.fixture +def patched_solr_queryset(mock_solr_queryset): + # local fixture that uses parasolr queryset mock + # and patches in test docs & facets + mock_qs = mock_solr_queryset() + with patch( + "ppa.archive.management.commands.generate_textcorpus.SolrQuerySet", new=mock_qs + ) as mock_queryset_cls: + mock_qs = mock_queryset_cls.return_value + mock_qs.get_results.return_value = mock_solr_docs + mock_qs.get_facets.return_value.facet_fields = mock_solr_facets + + yield mock_qs + + +def test_save(tmpdir, patched_solr_queryset): + call_command("generate_textcorpus", "--path", tmpdir.dirpath()) + metadata_file = tmpdir.dirpath("metadata.csv") + assert metadata_file.check() + dfmeta = pd.read_csv(metadata_file) + assert len(dfmeta) == 2 + + tdir=tmpdir.dirpath('texts') + fns=os.listdir(tdir) + assert len(fns) == 2 + + print(fns) + fn1=os.path.join(tdir,fns[0]) + fn2=os.path.join(tdir,fns[1]) + with open(fn1) as f: ld1=json.load(f) + with open(fn2) as f: ld2=json.load(f) + + assert len(ld1)==2 + assert len(ld2)==1 + + assert all(all(bool(v) for k,v in d.items()) for d in ld1) + assert all(all(bool(v) for k,v in d.items()) for d in ld2) + + + + + + +def test_invalid_preprocess_flags(tmpdir, patched_solr_queryset): + # Flags that are not supported + with pytest.raises(CommandError): + call_command( + 
"generate_textcorpus", "--path", tmpdir.dirpath(), "--doc-limit","one" + ) + + with pytest.raises(CommandError): + call_command( + "generate_textcorpus", "--woops","huh" + ) From cbfa9ad0d3ce53fb80f982f5c5f9d7eaa78f8ef7 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Thu, 30 Nov 2023 16:36:54 -0500 Subject: [PATCH 4/8] fix to reqs --- dev-requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index d6d40a18e..7dd1a4c41 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -5,6 +5,6 @@ pytest-cov django-debug-toolbar sphinx pre-commit -wordfreq -nltk +pandas +orjson tqdm \ No newline at end of file From f5dc287261aef4746b1c6a89ebd1139c4ee6ba35 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Thu, 30 Nov 2023 16:37:44 -0500 Subject: [PATCH 5/8] comment --- ppa/archive/management/commands/generate_textcorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py index 3e13dc22c..6cdba7aef 100644 --- a/ppa/archive/management/commands/generate_textcorpus.py +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -5,7 +5,7 @@ """ import os -import orjson +import orjson # a faster json implementation from django.core.management.base import BaseCommand from parasolr.django import SolrQuerySet from collections import defaultdict, OrderedDict From ff7dcb71e4c6d73db7d0eecc9690797e47dcc89f Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Thu, 30 Nov 2023 16:40:40 -0500 Subject: [PATCH 6/8] comment desc --- ppa/archive/management/commands/generate_textcorpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py index 6cdba7aef..922d3b46a 100644 --- a/ppa/archive/management/commands/generate_textcorpus.py +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -34,8 +34,8 @@ class SolrCorpus: def __init__(self, path, doc_limit=-1): """ - A class encapsulating a Solr Client specification, that yields - Bag-of-Word vectors on iteration, and thus acts as a Gensim Corpus. + A class encapsulating a Solr Client specification which yields + metadata and page data for PPA documents. :param path: A string to a path for the corpus output. :param doc_limit: Max no. of documents to process. 
The default of -1 From e133c9cf459ddaba461f1d7e6a567c78c227547c Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Fri, 1 Dec 2023 06:57:43 -0500 Subject: [PATCH 7/8] cleaning up --- ppa/archive/management/commands/generate_textcorpus.py | 1 - ppa/archive/tests/test_generate_textcorpus.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py index 922d3b46a..02d57ed49 100644 --- a/ppa/archive/management/commands/generate_textcorpus.py +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -85,7 +85,6 @@ def _get_meta_pages(self, doc_id:str) -> Tuple[dict,list]: # find the metadata doc metadata_docs = [d for d in docs if d["item_type"] == "work"] - print('?',metadata_docs) assert len(metadata_docs)==1 meta = {self.WORK_ID_FIELD:work_id, **metadata_docs[0]} diff --git a/ppa/archive/tests/test_generate_textcorpus.py b/ppa/archive/tests/test_generate_textcorpus.py index 62645af97..0f91915d9 100644 --- a/ppa/archive/tests/test_generate_textcorpus.py +++ b/ppa/archive/tests/test_generate_textcorpus.py @@ -14,6 +14,7 @@ # The first record has item_type='work' and contains metadata for the # document {"item_type": "work", "pub_year": 1863, "group_id_s":"doc_1"}, + {"item_type": "work", "pub_year": "unknown","group_id_s":"doc_2"}, # If multiple metadata rows are found, the first one (above) is used # Subsequent records have item_type='page', page-order specified by # 'order', with content in 'content' @@ -36,7 +37,6 @@ - {"item_type": "work", "pub_year": "unknown","group_id_s":"doc_2"}, { "item_type": "page", "order": 3, @@ -73,7 +73,6 @@ def test_save(tmpdir, patched_solr_queryset): fns=os.listdir(tdir) assert len(fns) == 2 - print(fns) fn1=os.path.join(tdir,fns[0]) fn2=os.path.join(tdir,fns[1]) with open(fn1) as f: ld1=json.load(f) From 1702b2bb2054ff1c848aa9f726a63172c3ae5701 Mon Sep 17 00:00:00 2001 From: Ryan Heuser Date: Fri, 1 Dec 2023 07:35:10 -0500 Subject: [PATCH 8/8] switching from pandas/csv to json for metadata --- dev-requirements.txt | 2 -- .../management/commands/generate_textcorpus.py | 13 ++++++------- ppa/archive/tests/test_generate_textcorpus.py | 7 ++++--- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 7dd1a4c41..2f9733f27 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -5,6 +5,4 @@ pytest-cov django-debug-toolbar sphinx pre-commit -pandas -orjson tqdm \ No newline at end of file diff --git a/ppa/archive/management/commands/generate_textcorpus.py b/ppa/archive/management/commands/generate_textcorpus.py index 02d57ed49..cd9126f0b 100644 --- a/ppa/archive/management/commands/generate_textcorpus.py +++ b/ppa/archive/management/commands/generate_textcorpus.py @@ -5,13 +5,12 @@ """ import os -import orjson # a faster json implementation +import json from django.core.management.base import BaseCommand from parasolr.django import SolrQuerySet from collections import defaultdict, OrderedDict import logging from tqdm import tqdm -import pandas as pd from typing import Tuple from contextlib import contextmanager import logging @@ -49,7 +48,7 @@ def __init__(self, path, doc_limit=-1): # subsequent paths self.path_texts = os.path.join(self.path,'texts') - self.path_metadata = os.path.join(self.path,'metadata.csv') + self.path_metadata = os.path.join(self.path,'metadata.json') # query to get initial results results = SolrQuerySet().facet(self.DOC_ID_FIELD, limit=self.doc_limit) 
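
For orientation, the facet call shown in the context line above is what seeds corpus generation: it returns a mapping from each document identifier to its page count, which `SolrCorpus` stores as `page_counts` and uses both to enumerate `doc_ids` and to size the per-document page queries. A minimal illustrative sketch of that shape follows; the field name comes from `DOC_ID_FIELD`, the calls mirror the code above, and the example values are modelled on the test fixtures rather than real index data::

    from parasolr.django import SolrQuerySet

    # facet on the document id field ("source_id" per DOC_ID_FIELD);
    # limit=-1 mirrors the default --doc-limit of "all documents"
    results = SolrQuerySet().facet("source_id", limit=-1)
    page_counts = results.get_facets().facet_fields["source_id"]
    # page_counts maps doc id -> page count, e.g. {"doc_1": 2, "doc_2": 1}
    # (example values modelled on the test fixtures, not real index data)
    doc_ids = list(page_counts.keys())   # documents to export
    doc_count = len(doc_ids)             # used for the progress bar total
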
@@ -125,8 +124,8 @@ def _save_doc(self,doc_id:str) -> Tuple[str,dict]: if pages: filename = os.path.join(self.path_texts, meta[self.WORK_ID_FIELD]+'.json') os.makedirs(self.path_texts,exist_ok=True) - with open(filename,'wb') as of: - of.write(orjson.dumps(pages,option=orjson.OPT_INDENT_2)) + with open(filename,'w') as of: + json.dump(pages, of, indent=2) # otherwise, returned filename is blank to indicate no file saved else: @@ -157,8 +156,8 @@ def save(self): pbar.close() # save metadata csv - dfmeta=pd.DataFrame(metadata).set_index(self.WORK_ID_FIELD).fillna('') - dfmeta.to_csv(self.path_metadata) + with open(self.path_metadata,'w') as of: + json.dump(metadata, of, indent=2) print(f'Saved metadata to: {self.path_metadata}') diff --git a/ppa/archive/tests/test_generate_textcorpus.py b/ppa/archive/tests/test_generate_textcorpus.py index 0f91915d9..c134ee0ae 100644 --- a/ppa/archive/tests/test_generate_textcorpus.py +++ b/ppa/archive/tests/test_generate_textcorpus.py @@ -3,7 +3,6 @@ import pytest from django.core.management import call_command from django.core.management.base import CommandError -import pandas as pd import os # mock results for acet query used to get document IDs and page counts @@ -66,8 +65,10 @@ def test_save(tmpdir, patched_solr_queryset): call_command("generate_textcorpus", "--path", tmpdir.dirpath()) metadata_file = tmpdir.dirpath("metadata.csv") assert metadata_file.check() - dfmeta = pd.read_csv(metadata_file) - assert len(dfmeta) == 2 + + with open(metadata_file) as f: + meta=json.load(f) + assert len(meta) == 2 tdir=tmpdir.dirpath('texts') fns=os.listdir(tdir)
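
For reference, once this series lands the command writes one `texts/<work_id>.json` file of page dictionaries per work plus a top-level `metadata.json`. A minimal sketch of reading that output is below; `corpus_dir` stands in for whatever directory was passed via `--path`, and the `page_text` key is an assumption based on `OUTPUT_DOC_FIELDS`, so it should be checked against the final field mapping in `generate_textcorpus.py`::

    import json
    import os

    corpus_dir = "data"  # hypothetical value passed to --path

    # document-level metadata: a list of dicts after the csv -> json switch
    with open(os.path.join(corpus_dir, "metadata.json")) as f:
        metadata = json.load(f)

    # one json file of page dicts per work, written by SolrCorpus._save_doc()
    texts_dir = os.path.join(corpus_dir, "texts")
    for filename in os.listdir(texts_dir):
        with open(os.path.join(texts_dir, filename)) as f:
            pages = json.load(f)  # list of page dicts for one work
        for page in pages:
            # "page_text" is assumed from OUTPUT_DOC_FIELDS; verify against
            # the final field mapping before relying on it
            text = page.get("page_text", "")
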