Parse: title, idno, material, dates & places
Xennis committed Apr 5, 2020
1 parent b88978e commit 5bf6868
Showing 31 changed files with 1,284 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .editorconfig
@@ -0,0 +1,12 @@
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
max_line_length = 132

[*.py]
indent_size = 4
indent_style = space

[Makefile]
indent_style = tab
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
# Virtual environment
.env/

# Python
*.pyc
10 changes: 10 additions & 0 deletions .travis.yml
@@ -0,0 +1,10 @@
language: python
python:
  - "3.7"
install:
  - pip install --quiet --requirement requirements.txt
script:
  - black --line-length 132 *.py
  - python -m unittest discover -p '*_test.py'
branches:
  only: [master]
10 changes: 10 additions & 0 deletions Makefile
@@ -0,0 +1,10 @@
format:
	black --line-length 132 *.py

check: format-check unittest

format-check:
	black --check --line-length 132 *.py

unittest:
	python -m unittest discover -p '*_test.py'
25 changes: 25 additions & 0 deletions README.md
@@ -1 +1,26 @@
# epidoc-parser

[![Build Status](https://travis-ci.org/Xennis/epidoc-parser.svg?branch=master)](https://travis-ci.org/Xennis/epidoc-parser)

## Development

Requirements:
* Python 3.7 is installed

Create a virtual environment, activate it, and install the dependencies:
```sh
python3.7 -m venv .env
source .env/bin/activate
pip install --requirement requirements.txt
```

Run the tests:
```sh
make unittest
```

## LICENSE

### Test data

The test data in this project is from the project [idp.data](https://github.com/papyri/idp.data) by [Papyri.info](http://papyri.info). This data is made available under a [Creative Commons Attribution 3.0 License](http://creativecommons.org/licenses/by/3.0/), with copyright and attribution to the respective projects.
88 changes: 88 additions & 0 deletions epidoc.py
@@ -0,0 +1,88 @@
from bs4 import BeautifulSoup

from history import ParseHistory
from normalize import normalize


class EpiDocHeader:

    title = None
    idno = {}
    material = None
    dates = []
    places = []

    @classmethod
    def create(cls, title, idno, material=None, dates=None, places=None):
        h = cls()
        h.title = title
        h.idno = idno
        h.material = material
        if dates is not None:
            h.dates = dates
        if places is not None:
            h.places = places
        return h

    def __eq__(self, other):
        if not isinstance(other, EpiDocHeader):
            return False
        return (
            self.title == other.title
            and self.idno == other.idno
            and self.material == other.material
            and self.dates == other.dates
            and self.places == other.places
        )

    def __repr__(self):
        return f"title={self.title},idno={self.idno},material={self.material},date={self.dates},places={self.places}"


class EpiDoc:

    header = None

    @classmethod
    def create(cls, header):
        d = cls()
        d.header = header
        return d

    def __eq__(self, other):
        if not isinstance(other, EpiDoc):
            return False
        return self.header == other.header

    def __repr__(self):
        return f"header={self.header}"


def load(fp):
    return loads(fp.read())


def loads(s):
    soup = BeautifulSoup(s, features="lxml")

    header = EpiDocHeader()

    filedesc = soup.teiheader.filedesc
    header.title = filedesc.titlestmt.title.getText()
    idnos = {}
    for idno in filedesc.publicationstmt.find_all("idno"):
        typ = normalize(idno.attrs.get("type"))
        value = normalize(idno.getText())
        idnos[typ] = value
    header.idno = idnos

    msdesc = filedesc.sourcedesc.msdesc
    if msdesc:
        header.material = normalize(msdesc.physdesc.objectdesc.support.material.getText())
        history = msdesc.history
        header.dates = ParseHistory.dates(history)
        header.places = ParseHistory.places(history)

    doc = EpiDoc()
    doc.header = header
    return doc
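
For orientation, a minimal usage sketch of the `load`/`loads` API defined above. The TEI fragment is invented for illustration and is not one of the test files; `idno` types and values also pass through `normalize` (not shown in this commit), so the printed dictionary is indicative only.

```python
from epidoc import loads

# Invented TEI/EpiDoc fragment (not from idp.data). BeautifulSoup's lxml parser
# lower-cases tag names, which is why loads() can address <teiHeader>/<fileDesc>
# as soup.teiheader.filedesc.
xml = """
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt><title>example.1</title></titleStmt>
      <publicationStmt>
        <idno type="filename">example.1</idno>
        <idno type="tm">12345</idno>
      </publicationStmt>
      <sourceDesc></sourceDesc>
    </fileDesc>
  </teiHeader>
</TEI>
"""

doc = loads(xml)
print(doc.header.title)  # example.1
print(doc.header.idno)   # e.g. {'filename': 'example.1', 'tm': '12345'}
```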
144 changes: 144 additions & 0 deletions epidoc_test.py
@@ -0,0 +1,144 @@
import os
import unittest

from epidoc import EpiDoc, EpiDocHeader, load

TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")


class TestLoad(unittest.TestCase):
    def test_all(self):
        tests = [
            (
                os.path.join("ddb", "chla.3.198.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="chla.3.198",
                        idno={
                            "ddb-hybrid": "chla;3;198",
                            "ddb-perseus-style": "0279;3;198",
                            "filename": "chla.3.198",
                            "hgv": "114844",
                            "tm": "114844",
                        },
                    )
                ),
            ),
            (
                os.path.join("ddb", "p.coles.16.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="p.coles.16",
                        idno={"ddb-hybrid": "p.coles;;16", "filename": "p.coles.16", "hgv": "697551", "tm": "697551",},
                    )
                ),
            ),
            (
                os.path.join("dlcp", "26761.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Sb. 16 13045",
                        idno={"dclp": "26761", "dclp-hybrid": "sb;16;13045", "filename": "26761", "ldab": "5148", "tm": "26761",},
                        material="papyrus",
                        dates=[{"text": "100 - 299", "notbefore": "0100", "notafter": "0299",}],
                        places={
                            "text": "Found: Egypt; written: Egypt",
                            "found": [{"text": "Egypt", "type": "ancient", "subtype": "region",}],
                            "composed": [{"text": "Egypt", "type": "ancient", "subtype": "region",}],
                        },
                    )
                ),
            ),
            (
                os.path.join("dlcp", "135858.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="TM 135858",
                        idno={
                            "dclp": "135858",
                            "dclp-hybrid": "tm;;135858",
                            "filename": "135858",
                            "ldab": "135858",
                            "tm": "135858",
                        },
                        material="parchment",
                        dates=[{"text": "550 - 649", "notbefore": "0550", "notafter": "0649",}],
                        places={
                            "text": "Found: Naqlun (Arsinoites, Egypt); written: Naqlun (Arsinoites, Egypt)",
                            "found": [
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Egypt", "type": "ancient", "subtype": "region",},
                                {"text": "Naqlun", "type": "ancient", "ref": ["https://www.trismegistos.org/place/1418"],},
                            ],
                            "composed": [
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Egypt", "type": "ancient", "subtype": "region",},
                                {"text": "Naqlun", "type": "ancient", "ref": ["https://www.trismegistos.org/place/1418"],},
                            ],
                        },
                    )
                ),
            ),
            (
                os.path.join("hgv", "13003.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Taxing - list",
                        idno={
                            "filename": "13003",
                            "tm": "13003",
                            "ddb-perseus-style": "0198;2;371v",
                            "ddb-filename": "p.ryl.2.371v",
                            "ddb-hybrid": "p.ryl;2;371v",
                        },
                        material="papyrus",
                        dates=[{"text": "134 - 135", "notbefore": "0134", "notafter": "0135",}],
                        places={
                            "text": "Philopator alias Theogenus (Arsinoites)",
                            "located": [
                                {
                                    "text": "Philopator alias Theogenus",
                                    "type": "ancient",
                                    "ref": ["http://pleiades.stoa.org/places/741563", "http://www.trismegistos.org/place/1776",],
                                },
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Ägypten", "type": "ancient", "subtype": "region",},
                            ],
                        },
                    )
                ),
            ),
            (
                os.path.join("hgv", "74005.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Ordre de paiement",
                        idno={
                            "filename": "74005",
                            "tm": "74005",
                            "ddb-perseus-style": "0022;4;452",
                            "ddb-filename": "o.douch.4.452",
                            "ddb-hybrid": "o.douch;4;452",
                        },
                        material="ostrakon",
                        dates=[{"text": "IV - Anfang V", "notbefore": "0301", "notafter": "0425", "precision": "low",}],
                        places={
                            "text": "Kysis (Oasis Magna)",
                            "located": [
                                {
                                    "text": "Kysis",
                                    "type": "ancient",
                                    "ref": ["http://pleiades.stoa.org/places/776191", "http://www.trismegistos.org/place/2761",],
                                },
                                {"text": "Oasis Magna", "type": "ancient", "subtype": "region",},
                            ],
                        },
                    )
                ),
            ),
        ]

        for (filename, want) in tests:
            with open(os.path.join(TESTDATA_DIR, "full", filename)) as f:
                actual = load(f)
            self.assertEqual(want, actual, msg=filename)
42 changes: 42 additions & 0 deletions history.py
@@ -0,0 +1,42 @@
from normalize import normalize, normalized_get_text


class ParseHistory:
    @staticmethod
    def dates(history):
        result = []
        for elem in history.origin.findAll("origdate"):
            date = {"text": normalized_get_text(elem)}
            for name, value in elem.attrs.items():
                date[normalize(name)] = normalize(value)
            result.append(date)
        return result

    @staticmethod
    def places(history):
        result = {}
        origin_place = history.origin.origplace
        result["text"] = origin_place.getText().strip()
        for name, value in origin_place.attrs.items():
            result[normalize(name)] = normalize(value)

        for elem in history.findAll("provenance"):
            typ = elem.attrs.get("type")
            assert typ not in result
            result[typ] = ParseHistory.provenance(elem)

        return result

    @staticmethod
    def provenance(provenance):
        result = []
        # Note: For some it's provenance.p.placename
        for elem in provenance.findAll("placename"):
            place = {"text": normalized_get_text(elem)}
            for name, value in elem.attrs.items():
                place[normalize(name)] = normalize(value)

            if "ref" in place:
                place["ref"] = [normalize(ref) for ref in place["ref"].split(" ")]
            result.append(place)
        return result
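
To make the shape of the parsed history data concrete, here is a small sketch that runs `ParseHistory` over an invented `<history>` fragment. The expected dictionaries follow the same shape as the dates/places in the test file above, though exact values also pass through `normalize`/`normalized_get_text` (not shown in this commit).

```python
from bs4 import BeautifulSoup

from history import ParseHistory

# Invented <history> fragment for illustration; real input comes from idp.data.
fragment = """
<history>
  <origin>
    <origDate notBefore="0100" notAfter="0299">100 - 299</origDate>
    <origPlace>Egypt</origPlace>
  </origin>
</history>
"""

history = BeautifulSoup(fragment, features="lxml").history

print(ParseHistory.dates(history))
# e.g. [{'text': '100 - 299', 'notbefore': '0100', 'notafter': '0299'}]
print(ParseHistory.places(history))
# e.g. {'text': 'Egypt'}
```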