Parse: title, idno, material, dates & places
Xennis committed Apr 5, 2020
1 parent b88978e commit 5bf6868
Showing 31 changed files with 1,284 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .editorconfig
@@ -0,0 +1,12 @@
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
max_line_length = 132

[*.py]
indent_size = 4
indent_style = space

[Makefile]
indent_style = tab
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
# Virtual environment
.env/

# Python
*.pyc
10 changes: 10 additions & 0 deletions .travis.yml
@@ -0,0 +1,10 @@
language: python
python:
  - "3.7"
install:
  - pip install --quiet --requirement requirements.txt
script:
  - black --line-length 132 *.py
  - python -m unittest discover -p '*_test.py'
branches:
  only: [master]
10 changes: 10 additions & 0 deletions Makefile
@@ -0,0 +1,10 @@
format:
	black --line-length 132 *.py

check: format-check unittest

format-check:
	black --check --line-length 132 *.py

unittest:
	python -m unittest discover -p '*_test.py'
25 changes: 25 additions & 0 deletions README.md
@@ -1 +1,26 @@
# epidoc-parser

[![Build Status](https://travis-ci.org/Xennis/epidoc-parser.svg?branch=master)](https://travis-ci.org/Xennis/epidoc-parser)

## Development

Requirements:
* Python 3.7 is installed

Create a virtual environment, activate it, and install the dependencies:
```sh
python3.7 -m venv .env
source .env/bin/activate
pip install --requirement requirements.txt
```

Run the tests:
```sh
make unittest
```

## LICENSE

### Test data

The test data in this project is from the project [idp.data](https://github.com/papyri/idp.data) by [Papyri.info](http://papyri.info). This data is made available under a [Creative Commons Attribution 3.0 License](http://creativecommons.org/licenses/by/3.0/), with copyright and attribution to the respective projects.
88 changes: 88 additions & 0 deletions epidoc.py
@@ -0,0 +1,88 @@
from bs4 import BeautifulSoup

from history import ParseHistory
from normalize import normalize


class EpiDocHeader:

    title = None
    idno = {}
    material = None
    dates = []
    places = []

    @classmethod
    def create(cls, title, idno, material=None, dates=None, places=None):
        h = cls()
        h.title = title
        h.idno = idno
        h.material = material
        if dates is not None:
            h.dates = dates
        if places is not None:
            h.places = places
        return h

    def __eq__(self, other):
        if not isinstance(other, EpiDocHeader):
            return False
        return (
            self.title == other.title
            and self.idno == other.idno
            and self.material == other.material
            and self.dates == other.dates
            and self.places == other.places
        )

    def __repr__(self):
        return f"title={self.title},idno={self.idno},material={self.material},date={self.dates},places={self.places}"


class EpiDoc:

    header = None

    @classmethod
    def create(cls, header):
        d = cls()
        d.header = header
        return d

    def __eq__(self, other):
        if not isinstance(other, EpiDoc):
            return False
        return self.header == other.header

    def __repr__(self):
        return f"header={self.header}"


def load(fp):
    return loads(fp.read())


def loads(s):
    soup = BeautifulSoup(s, features="lxml")

    header = EpiDocHeader()

    filedesc = soup.teiheader.filedesc
    header.title = filedesc.titlestmt.title.getText()
    idnos = {}
    for idno in filedesc.publicationstmt.find_all("idno"):
        typ = normalize(idno.attrs.get("type"))
        value = normalize(idno.getText())
        idnos[typ] = value
    header.idno = idnos

    msdesc = filedesc.sourcedesc.msdesc
    if msdesc:
        header.material = normalize(msdesc.physdesc.objectdesc.support.material.getText())
        history = msdesc.history
        header.dates = ParseHistory.dates(history)
        header.places = ParseHistory.places(history)

    doc = EpiDoc()
    doc.header = header
    return doc
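
For orientation, a minimal usage sketch of the `load`/`loads` API defined above. The TEI fragment is invented for illustration and is not one of the test files; `idno` types and values also pass through `normalize` (not shown in this commit), so the printed dictionary is indicative only.

```python
from epidoc import loads

# Invented TEI/EpiDoc fragment (not from idp.data). BeautifulSoup's lxml parser
# lower-cases tag names, which is why loads() can address <teiHeader>/<fileDesc>
# as soup.teiheader.filedesc.
xml = """
<TEI>
  <teiHeader>
    <fileDesc>
      <titleStmt><title>example.1</title></titleStmt>
      <publicationStmt>
        <idno type="filename">example.1</idno>
        <idno type="tm">12345</idno>
      </publicationStmt>
      <sourceDesc></sourceDesc>
    </fileDesc>
  </teiHeader>
</TEI>
"""

doc = loads(xml)
print(doc.header.title)  # example.1
print(doc.header.idno)   # e.g. {'filename': 'example.1', 'tm': '12345'}
```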
144 changes: 144 additions & 0 deletions epidoc_test.py
@@ -0,0 +1,144 @@
import os
import unittest

from epidoc import EpiDoc, EpiDocHeader, load

TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")


class TestLoad(unittest.TestCase):
    def test_all(self):
        tests = [
            (
                os.path.join("ddb", "chla.3.198.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="chla.3.198",
                        idno={
                            "ddb-hybrid": "chla;3;198",
                            "ddb-perseus-style": "0279;3;198",
                            "filename": "chla.3.198",
                            "hgv": "114844",
                            "tm": "114844",
                        },
                    )
                ),
            ),
            (
                os.path.join("ddb", "p.coles.16.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="p.coles.16",
                        idno={"ddb-hybrid": "p.coles;;16", "filename": "p.coles.16", "hgv": "697551", "tm": "697551",},
                    )
                ),
            ),
            (
                os.path.join("dlcp", "26761.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Sb. 16 13045",
                        idno={"dclp": "26761", "dclp-hybrid": "sb;16;13045", "filename": "26761", "ldab": "5148", "tm": "26761",},
                        material="papyrus",
                        dates=[{"text": "100 - 299", "notbefore": "0100", "notafter": "0299",}],
                        places={
                            "text": "Found: Egypt; written: Egypt",
                            "found": [{"text": "Egypt", "type": "ancient", "subtype": "region",}],
                            "composed": [{"text": "Egypt", "type": "ancient", "subtype": "region",}],
                        },
                    )
                ),
            ),
            (
                os.path.join("dlcp", "135858.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="TM 135858",
                        idno={
                            "dclp": "135858",
                            "dclp-hybrid": "tm;;135858",
                            "filename": "135858",
                            "ldab": "135858",
                            "tm": "135858",
                        },
                        material="parchment",
                        dates=[{"text": "550 - 649", "notbefore": "0550", "notafter": "0649",}],
                        places={
                            "text": "Found: Naqlun (Arsinoites, Egypt); written: Naqlun (Arsinoites, Egypt)",
                            "found": [
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Egypt", "type": "ancient", "subtype": "region",},
                                {"text": "Naqlun", "type": "ancient", "ref": ["https://www.trismegistos.org/place/1418"],},
                            ],
                            "composed": [
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Egypt", "type": "ancient", "subtype": "region",},
                                {"text": "Naqlun", "type": "ancient", "ref": ["https://www.trismegistos.org/place/1418"],},
                            ],
                        },
                    )
                ),
            ),
            (
                os.path.join("hgv", "13003.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Taxing - list",
                        idno={
                            "filename": "13003",
                            "tm": "13003",
                            "ddb-perseus-style": "0198;2;371v",
                            "ddb-filename": "p.ryl.2.371v",
                            "ddb-hybrid": "p.ryl;2;371v",
                        },
                        material="papyrus",
                        dates=[{"text": "134 - 135", "notbefore": "0134", "notafter": "0135",}],
                        places={
                            "text": "Philopator alias Theogenus (Arsinoites)",
                            "located": [
                                {
                                    "text": "Philopator alias Theogenus",
                                    "type": "ancient",
                                    "ref": ["http://pleiades.stoa.org/places/741563", "http://www.trismegistos.org/place/1776",],
                                },
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Ägypten", "type": "ancient", "subtype": "region",},
                            ],
                        },
                    )
                ),
            ),
            (
                os.path.join("hgv", "74005.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Ordre de paiement",
                        idno={
                            "filename": "74005",
                            "tm": "74005",
                            "ddb-perseus-style": "0022;4;452",
                            "ddb-filename": "o.douch.4.452",
                            "ddb-hybrid": "o.douch;4;452",
                        },
                        material="ostrakon",
                        dates=[{"text": "IV - Anfang V", "notbefore": "0301", "notafter": "0425", "precision": "low",}],
                        places={
                            "text": "Kysis (Oasis Magna)",
                            "located": [
                                {
                                    "text": "Kysis",
                                    "type": "ancient",
                                    "ref": ["http://pleiades.stoa.org/places/776191", "http://www.trismegistos.org/place/2761",],
                                },
                                {"text": "Oasis Magna", "type": "ancient", "subtype": "region",},
                            ],
                        },
                    )
                ),
            ),
        ]

        for (filename, want) in tests:
            with open(os.path.join(TESTDATA_DIR, "full", filename)) as f:
                actual = load(f)
            self.assertEqual(want, actual, msg=filename)
42 changes: 42 additions & 0 deletions history.py
@@ -0,0 +1,42 @@
from normalize import normalize, normalized_get_text


class ParseHistory:
    @staticmethod
    def dates(history):
        result = []
        for elem in history.origin.findAll("origdate"):
            date = {"text": normalized_get_text(elem)}
            for name, value in elem.attrs.items():
                date[normalize(name)] = normalize(value)
            result.append(date)
        return result

    @staticmethod
    def places(history):
        result = {}
        origin_place = history.origin.origplace
        result["text"] = origin_place.getText().strip()
        for name, value in origin_place.attrs.items():
            result[normalize(name)] = normalize(value)

        for elem in history.findAll("provenance"):
            typ = elem.attrs.get("type")
            assert typ not in result
            result[typ] = ParseHistory.provenance(elem)

        return result

    @staticmethod
    def provenance(provenance):
        result = []
        # Note: For some it's provenance.p.placename
        for elem in provenance.findAll("placename"):
            place = {"text": normalized_get_text(elem)}
            for name, value in elem.attrs.items():
                place[normalize(name)] = normalize(value)

            if "ref" in place:
                place["ref"] = [normalize(ref) for ref in place["ref"].split(" ")]
            result.append(place)
        return result
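
To make the shape of the parsed history data concrete, here is a small sketch that runs `ParseHistory` over an invented `<history>` fragment. The expected dictionaries follow the same shape as the dates/places in the test file above, though exact values also pass through `normalize`/`normalized_get_text` (not shown in this commit).

```python
from bs4 import BeautifulSoup

from history import ParseHistory

# Invented <history> fragment for illustration; real input comes from idp.data.
fragment = """
<history>
  <origin>
    <origDate notBefore="0100" notAfter="0299">100 - 299</origDate>
    <origPlace>Egypt</origPlace>
  </origin>
</history>
"""

history = BeautifulSoup(fragment, features="lxml").history

print(ParseHistory.dates(history))
# e.g. [{'text': '100 - 299', 'notbefore': '0100', 'notafter': '0299'}]
print(ParseHistory.places(history))
# e.g. {'text': 'Egypt'}
```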