Parse: title, idno, material, dates & places
Showing 31 changed files with 1,284 additions and 0 deletions.
.editorconfig (new file, 12 lines)
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
max_line_length = 132

[*.py]
indent_size = 4
indent_style = space

[Makefile]
indent_style = tab
.gitignore (new file, 5 lines)
# Virtual environment
.env/

# Python
*.pyc
.travis.yml (new file, 10 lines)
language: python
python:
  - "3.7"
install:
  - pip install --quiet --requirement requirements.txt
script:
  - black --line-length 132 *.py
  - python -m unittest discover -p '*_test.py'
branches:
  only: [master]
Makefile (new file, 10 lines)
format:
	black --line-length 132 *.py

check: format-check unittest

format-check:
	black --check --line-length 132 *.py

unittest:
	python -m unittest discover -p '*_test.py'
README.md (updated from 1 line to 26 lines)
# epidoc-parser

[Build Status](https://travis-ci.org/Xennis/epidoc-parser)

## Development

Requirements:
* Python 3.7

Create a virtual environment, activate it, and install the dependencies:
```sh
python3.7 -m venv .env
source .env/bin/activate
pip install --requirement requirements.txt
```

Run the tests:
```sh
make unittest
```

## LICENSE

### Test data

The test data in this project comes from the [idp.data](https://github.com/papyri/idp.data) project by [Papyri.info](http://papyri.info). It is made available under a [Creative Commons Attribution 3.0 License](http://creativecommons.org/licenses/by/3.0/), with copyright and attribution to the respective projects.
epidoc.py (new file, 88 lines)
from bs4 import BeautifulSoup

from history import ParseHistory
from normalize import normalize


class EpiDocHeader:

    title = None
    idno = {}
    material = None
    dates = []
    places = []

    @classmethod
    def create(cls, title, idno, material=None, dates=None, places=None):
        h = cls()
        h.title = title
        h.idno = idno
        h.material = material
        if dates is not None:
            h.dates = dates
        if places is not None:
            h.places = places
        return h

    def __eq__(self, other):
        if not isinstance(other, EpiDocHeader):
            return False
        return (
            self.title == other.title
            and self.idno == other.idno
            and self.material == other.material
            and self.dates == other.dates
            and self.places == other.places
        )

    def __repr__(self):
        return f"title={self.title},idno={self.idno},material={self.material},date={self.dates},places={self.places}"


class EpiDoc:

    header = None

    @classmethod
    def create(cls, header):
        d = cls()
        d.header = header
        return d

    def __eq__(self, other):
        if not isinstance(other, EpiDoc):
            return False
        return self.header == other.header

    def __repr__(self):
        return f"header={self.header}"


def load(fp):
    return loads(fp.read())


def loads(s):
    soup = BeautifulSoup(s, features="lxml")

    header = EpiDocHeader()

    filedesc = soup.teiheader.filedesc
    header.title = filedesc.titlestmt.title.getText()
    idnos = {}
    for idno in filedesc.publicationstmt.find_all("idno"):
        typ = normalize(idno.attrs.get("type"))
        value = normalize(idno.getText())
        idnos[typ] = value
    header.idno = idnos

    msdesc = filedesc.sourcedesc.msdesc
    if msdesc:
        header.material = normalize(msdesc.physdesc.objectdesc.support.material.getText())
        history = msdesc.history
        header.dates = ParseHistory.dates(history)
        header.places = ParseHistory.places(history)

    doc = EpiDoc()
    doc.header = header
    return doc
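For orientation, here is a minimal usage sketch of the load()/loads() API introduced above. The sketch is not part of the commit: it assumes it is run from the repository root so that epidoc.py and the bundled test data under testdata/full/ are reachable, and the values in the comments mirror the expectations in the test module that follows.

```python
# Usage sketch (assumed setup: run from the repository root of this commit).
from epidoc import load, loads

with open("testdata/full/hgv/74005.xml") as f:
    doc = load(f)

print(doc.header.title)     # "Ordre de paiement"
print(doc.header.idno)      # {"filename": "74005", "tm": "74005", "ddb-hybrid": "o.douch;4;452", ...}
print(doc.header.material)  # "ostrakon"
print(doc.header.dates)     # [{"text": "IV - Anfang V", "notbefore": "0301", "notafter": "0425", "precision": "low"}]
print(doc.header.places)    # {"text": "Kysis (Oasis Magna)", "located": [...]}

# loads() accepts the same document as a string instead of a file object,
# and the __eq__ methods above make the comparison structural.
with open("testdata/full/hgv/74005.xml") as f:
    assert loads(f.read()) == doc
```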
epidoc_test.py (new file, 144 lines)
import os
import unittest

from epidoc import EpiDoc, EpiDocHeader, load

TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "testdata")


class TestLoad(unittest.TestCase):
    def test_all(self):
        tests = [
            (
                os.path.join("ddb", "chla.3.198.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="chla.3.198",
                        idno={
                            "ddb-hybrid": "chla;3;198",
                            "ddb-perseus-style": "0279;3;198",
                            "filename": "chla.3.198",
                            "hgv": "114844",
                            "tm": "114844",
                        },
                    )
                ),
            ),
            (
                os.path.join("ddb", "p.coles.16.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="p.coles.16",
                        idno={"ddb-hybrid": "p.coles;;16", "filename": "p.coles.16", "hgv": "697551", "tm": "697551",},
                    )
                ),
            ),
            (
                os.path.join("dlcp", "26761.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Sb. 16 13045",
                        idno={"dclp": "26761", "dclp-hybrid": "sb;16;13045", "filename": "26761", "ldab": "5148", "tm": "26761",},
                        material="papyrus",
                        dates=[{"text": "100 - 299", "notbefore": "0100", "notafter": "0299",}],
                        places={
                            "text": "Found: Egypt; written: Egypt",
                            "found": [{"text": "Egypt", "type": "ancient", "subtype": "region",}],
                            "composed": [{"text": "Egypt", "type": "ancient", "subtype": "region",}],
                        },
                    )
                ),
            ),
            (
                os.path.join("dlcp", "135858.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="TM 135858",
                        idno={
                            "dclp": "135858",
                            "dclp-hybrid": "tm;;135858",
                            "filename": "135858",
                            "ldab": "135858",
                            "tm": "135858",
                        },
                        material="parchment",
                        dates=[{"text": "550 - 649", "notbefore": "0550", "notafter": "0649",}],
                        places={
                            "text": "Found: Naqlun (Arsinoites, Egypt); written: Naqlun (Arsinoites, Egypt)",
                            "found": [
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Egypt", "type": "ancient", "subtype": "region",},
                                {"text": "Naqlun", "type": "ancient", "ref": ["https://www.trismegistos.org/place/1418"],},
                            ],
                            "composed": [
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Egypt", "type": "ancient", "subtype": "region",},
                                {"text": "Naqlun", "type": "ancient", "ref": ["https://www.trismegistos.org/place/1418"],},
                            ],
                        },
                    )
                ),
            ),
            (
                os.path.join("hgv", "13003.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Taxing - list",
                        idno={
                            "filename": "13003",
                            "tm": "13003",
                            "ddb-perseus-style": "0198;2;371v",
                            "ddb-filename": "p.ryl.2.371v",
                            "ddb-hybrid": "p.ryl;2;371v",
                        },
                        material="papyrus",
                        dates=[{"text": "134 - 135", "notbefore": "0134", "notafter": "0135",}],
                        places={
                            "text": "Philopator alias Theogenus (Arsinoites)",
                            "located": [
                                {
                                    "text": "Philopator alias Theogenus",
                                    "type": "ancient",
                                    "ref": ["http://pleiades.stoa.org/places/741563", "http://www.trismegistos.org/place/1776",],
                                },
                                {"text": "Arsinoites", "type": "ancient", "subtype": "nome",},
                                {"text": "Ägypten", "type": "ancient", "subtype": "region",},
                            ],
                        },
                    )
                ),
            ),
            (
                os.path.join("hgv", "74005.xml"),
                EpiDoc.create(
                    header=EpiDocHeader.create(
                        title="Ordre de paiement",
                        idno={
                            "filename": "74005",
                            "tm": "74005",
                            "ddb-perseus-style": "0022;4;452",
                            "ddb-filename": "o.douch.4.452",
                            "ddb-hybrid": "o.douch;4;452",
                        },
                        material="ostrakon",
                        dates=[{"text": "IV - Anfang V", "notbefore": "0301", "notafter": "0425", "precision": "low",}],
                        places={
                            "text": "Kysis (Oasis Magna)",
                            "located": [
                                {
                                    "text": "Kysis",
                                    "type": "ancient",
                                    "ref": ["http://pleiades.stoa.org/places/776191", "http://www.trismegistos.org/place/2761",],
                                },
                                {"text": "Oasis Magna", "type": "ancient", "subtype": "region",},
                            ],
                        },
                    )
                ),
            ),
        ]

        for (filename, want) in tests:
            with open(os.path.join(TESTDATA_DIR, "full", filename)) as f:
                actual = load(f)
                self.assertEqual(want, actual, msg=filename)
history.py (new file, 42 lines)
from normalize import normalize, normalized_get_text


class ParseHistory:
    @staticmethod
    def dates(history):
        result = []
        for elem in history.origin.findAll("origdate"):
            date = {"text": normalized_get_text(elem)}
            for name, value in elem.attrs.items():
                date[normalize(name)] = normalize(value)
            result.append(date)
        return result

    @staticmethod
    def places(history):
        result = {}
        origin_place = history.origin.origplace
        result["text"] = origin_place.getText().strip()
        for name, value in origin_place.attrs.items():
            result[normalize(name)] = normalize(value)

        for elem in history.findAll("provenance"):
            typ = elem.attrs.get("type")
            assert typ not in result
            result[typ] = ParseHistory.provenance(elem)

        return result

    @staticmethod
    def provenance(provenance):
        result = []
        # Note: For some it's provenance.p.placename
        for elem in provenance.findAll("placename"):
            place = {"text": normalized_get_text(elem)}
            for name, value in elem.attrs.items():
                place[normalize(name)] = normalize(value)

            if "ref" in place:
                place["ref"] = [normalize(ref) for ref in place["ref"].split(" ")]
            result.append(place)
        return result
(The remaining changed files did not load and are not shown here.)