SYS-1732: SpaCy NLP - director name extraction proof of concept (#4)

* spaCy name extraction * fix formatting in Dockerfile and requirements * ast -> json, loop -> list comprehension
UCLALibrary · Jan 24, 2025 · b85eaa9 · b85eaa9
1 parent d465132
commit b85eaa9
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 1 deletion.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.13-slim-bookworm
+FROM python:3.12-slim-bookworm
 
 # Install git, required to install the alma_api_client package from github.
 RUN apt-get update && \

diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,7 @@
 git+https://github.com/UCLALibrary/alma-api-client
 # For Excel conversion
 pandas==2.2.3
+# for NLP
+spacy==3.7.5
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
+en_core_web_md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl
diff --git a/spacy_experiment.py b/spacy_experiment.py
@@ -0,0 +1,81 @@
+import json
+import spacy
+import csv
+
+# for type hinting
+from spacy.language import Language
+
+
+def find_names(data: list, model: Language) -> dict:
+    output_dict = {}
+    for i in range(len(data)):
+        subfield_c = data[i]
+        current_names = []
+        current_other_entities = []
+        doc = model(subfield_c)
+        for ent in doc.ents:
+            if ent.label_ == "PERSON":
+                current_names.append(ent.text)
+            else:
+                current_other_entities.append(ent.text)
+        # add index to subfield_c so keys are unique
+        dict_key = f"{subfield_c} ({i})"
+        output_dict[dict_key] = {
+            "names": current_names,
+            "other_entities": current_other_entities,
+        }
+
+    return output_dict
+
+
+def evaluate_model(data: list, model: Language) -> None:
+    entity_dict = find_names(data, model)
+    # get total count of names and other entities
+    total_names = 0
+    total_other_entities = 0
+    for key in entity_dict:
+        total_names += len(entity_dict[key]["names"])
+        total_other_entities += len(entity_dict[key]["other_entities"])
+
+    print(f"total subfields: {len(entity_dict)}")
+    print(f"total names: {total_names}")
+    print(f"total other entities: {total_other_entities}")
+    write_output_csv(f"output_{model.meta['name']}.csv", entity_dict)
+
+
+def write_output_csv(file_name: str, output_dict: dict) -> None:
+    with open(file_name, encoding="utf-8", mode="w") as file:
+        writer = csv.writer(file)
+        writer.writerow(["subfield_c", "names", "other_entities"])
+        for key in output_dict:
+            # remove index from key (index was added to make keys unique)
+            subfield_c = key.split(" (")[0]
+            writer.writerow(
+                [
+                    subfield_c,
+                    output_dict[key]["names"],
+                    output_dict[key]["other_entities"],
+                ]
+            )
+    print(f"output written to {file_name}")
+
+
+def main():
+    with open("f245c_directors.txt", encoding="utf-8") as file:
+        data = json.load(file)
+
+    # concatenate each list of strings into a single string
+    data = [" ".join(row) for row in data]
+
+    small_model = spacy.load("en_core_web_sm")
+    medium_model = spacy.load("en_core_web_md")
+
+    print("small model:")
+    evaluate_model(data, small_model)
+    print()
+    print("medium model:")
+    evaluate_model(data, medium_model)
+
+
+if __name__ == "__main__":
+    main()