diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb
new file mode 100644
index 0000000..ff821e8
--- /dev/null
+++ b/examples/pgp_dates.ipynb
@@ -0,0 +1,4650 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "2d231f1e-3944-4579-b868-504f7fb2d543",
+ "metadata": {},
+ "source": [
+ "# Princeton Geniza Project\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "\n",
+ "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n",
+ "\n",
+ "documents = pd.read_csv(pgp_documents_csv)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Total documents: 35,091\n",
+ "Documents with dates: 4,380\n",
+ " date on document: 4,064\n",
+ " inferred dating: 321\n"
+ ]
+ }
+ ],
+ "source": [
+ "# compute each date-presence mask once, then slice out the subsets of dated documents\n",
+ "has_doc_date = documents[\"doc_date_standard\"].notna()\n",
+ "has_inferred_date = documents[\"inferred_date_standard\"].notna()\n",
+ "\n",
+ "# any kind of date: written on the document or inferred\n",
+ "docs_with_dates = documents[has_doc_date | has_inferred_date]\n",
+ "# copy, since a new column ('undate') is assigned to this frame in a later cell\n",
+ "docs_with_docdate = documents[has_doc_date].copy()\n",
+ "docs_with_inferreddate = documents[has_inferred_date]\n",
+ "\n",
+ "print(f\"\"\"\n",
+ "Total documents: {len(documents):,}\n",
+ "Documents with dates: {len(docs_with_dates):,}\n",
+ " date on document: {len(docs_with_docdate):,}\n",
+ " inferred dating: {len(docs_with_inferreddate):,}\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "94d6340b-10d0-461b-b745-378ffa1ffcec",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ "
\n",
+ " \n",
+ " 36 | \n",
+ " 491 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1131 | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " 506 | \n",
+ " Elul 1428 | \n",
+ " Seleucid | \n",
+ " 1117-08-01/1117-08-29 | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " 516 | \n",
+ " First decade of Ḥeshvan 1442 | \n",
+ " Seleucid | \n",
+ " 1130 | \n",
+ "
\n",
+ " \n",
+ " 61 | \n",
+ " 524 | \n",
+ " Thursday, 12 Sivan 4795 | \n",
+ " Anno Mundi | \n",
+ " 1035-05-22 | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " 525 | \n",
+ " Shawwāl 425 | \n",
+ " Hijrī | \n",
+ " 1034-08-29/1034-09-07 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "5 449 1570 Seleucid \n",
+ "16 463 19 Adar 1427 Seleucid \n",
+ "23 472 1337 Seleucid \n",
+ "36 491 NaN NaN \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid \n",
+ "43 502 Tevet 1548 Seleucid \n",
+ "47 506 Elul 1428 Seleucid \n",
+ "55 516 First decade of Ḥeshvan 1442 Seleucid \n",
+ "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n",
+ "62 525 Shawwāl 425 Hijrī \n",
+ "\n",
+ " doc_date_standard \n",
+ "5 1259 \n",
+ "16 1116-03-05 \n",
+ "23 1025-08-28/1026-09-14 \n",
+ "36 1131 \n",
+ "41 1188-12-07 \n",
+ "43 1236-11-30/1236-12-28 \n",
+ "47 1117-08-01/1117-08-29 \n",
+ "55 1130 \n",
+ "61 1035-05-22 \n",
+ "62 1034-08-29/1034-09-07 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# preview the original, calendar, and standardized date fields for the first ten dated documents\n",
+ "date_fields = ['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']\n",
+ "docs_with_docdate[date_fields].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parse error on 1217-02-20/1217-02-29: Error trying to process rule \"date\":\n",
+ "\n",
+ "Day out of range in datetime string \"1217-02-29\"\n",
+ "Parse error on 1747-02-29: Error trying to process rule \"date\":\n",
+ "\n",
+ "Day out of range in datetime string \"1747-02-29\"\n"
+ ]
+ }
+ ],
+ "source": [
+ "from lark.visitors import VisitError\n",
+ "\n",
+ "# first, how far can we get with the standard dates? can we parse as edtf and sort, render?\n",
+ "from undate import Undate \n",
+ "\n",
+ "def parse_standard_date(value):\n",
+ "    \"\"\"Parse a standardized date string as EDTF using undate.\n",
+ "\n",
+ "    Returns whatever ``Undate.parse`` produces. If lark raises a\n",
+ "    ``VisitError`` while parsing, the error is printed and None is\n",
+ "    returned, so a single bad value does not stop the whole column.\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        parsed = Undate.parse(value, \"EDTF\")\n",
+ "    except VisitError as parse_err:\n",
+ "        print(f\"Parse error on {value}: {parse_err}\")\n",
+ "        return None\n",
+ "    return parsed\n",
+ "\n",
+ "\n",
+ "# ignore gregorian/julian thing for now\n",
+ "# from pgp code:\n",
+ "# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n",
+ "# cut off between gregorian/julian dates, in julian days\n",
+ "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n",
+ "\n",
+ "docs_with_docdate['undate'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "f49e82a4-b05b-4395-998f-0c9e75729e9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " last_modified | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 3190 | \n",
+ " 3957 | \n",
+ " middle decade of Adar 1528 | \n",
+ " Seleucid | \n",
+ " 1217-02-20/1217-02-29 | \n",
+ " 2023-02-09 07:22:14.481118+00:00 | \n",
+ "
\n",
+ " \n",
+ " 34445 | \n",
+ " 40006 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1747-02-29 | \n",
+ " 2024-08-07 18:24:19.425288+00:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "3190 3957 middle decade of Adar 1528 Seleucid \n",
+ "34445 40006 NaN NaN \n",
+ "\n",
+ " doc_date_standard last_modified \n",
+ "3190 1217-02-20/1217-02-29 2023-02-09 07:22:14.481118+00:00 \n",
+ "34445 1747-02-29 2024-08-07 18:24:19.425288+00:00 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# which records have a standardized date that could not be parsed?\n",
+ "# (a failure here is probably a data error in the original record)\n",
+ "unparsed = docs_with_docdate[\"undate\"].isna()\n",
+ "unparsed_fields = ['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']\n",
+ "docs_with_docdate.loc[unparsed, unparsed_fields]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "2d502575-a2b4-4fce-9f59-6932275dfac2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Seleucid 1581\n",
+ "Anno Mundi 1128\n",
+ "Hijrī 859\n",
+ "Kharājī 8\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many dated documents use each original calendar?\n",
+ "docs_with_docdate[\"doc_date_calendar\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "04e4ffb2-13e7-49cc-913b-2104b61aef16",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 61 | \n",
+ " 524 | \n",
+ " Thursday, 12 Sivan 4795 | \n",
+ " Anno Mundi | \n",
+ " 1035-05-22 | \n",
+ "
\n",
+ " \n",
+ " 90 | \n",
+ " 561 | \n",
+ " 10 Nisan 4716 | \n",
+ " Anno Mundi | \n",
+ " 0956-03-24 | \n",
+ "
\n",
+ " \n",
+ " 111 | \n",
+ " 582 | \n",
+ " Thursday, 6 Adar 4996 | \n",
+ " Anno Mundi | \n",
+ " 1236-02-14 | \n",
+ "
\n",
+ " \n",
+ " 119 | \n",
+ " 591 | \n",
+ " Sunday, 29 Tammuz 4898 | \n",
+ " Anno Mundi | \n",
+ " 1138-07-10 | \n",
+ "
\n",
+ " \n",
+ " 131 | \n",
+ " 603 | \n",
+ " 4805/4806 | \n",
+ " Anno Mundi | \n",
+ " 1044-08-27/1045-09-13 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 34831 | \n",
+ " 40401 | \n",
+ " 5408 | \n",
+ " Anno Mundi | \n",
+ " 1647-09-30/1648-09-16 | \n",
+ "
\n",
+ " \n",
+ " 34994 | \n",
+ " 40566 | \n",
+ " 5594 | \n",
+ " Anno Mundi | \n",
+ " 1833-09-14/1834-10-03 | \n",
+ "
\n",
+ " \n",
+ " 35052 | \n",
+ " 40624 | \n",
+ " 21 Nisan 5376 | \n",
+ " Anno Mundi | \n",
+ " 1616-04-08 | \n",
+ "
\n",
+ " \n",
+ " 35063 | \n",
+ " 40635 | \n",
+ " 5555 | \n",
+ " Anno Mundi | \n",
+ " 1794-09-25/1795-09-13 | \n",
+ "
\n",
+ " \n",
+ " 35070 | \n",
+ " 40642 | \n",
+ " 5516 | \n",
+ " Anno Mundi | \n",
+ " 1755-09-06/1756-09-24 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1128 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard\n",
+ "61 524 Thursday, 12 Sivan 4795 Anno Mundi 1035-05-22\n",
+ "90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n",
+ "111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n",
+ "119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n",
+ "131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n",
+ "... ... ... ... ...\n",
+ "34831 40401 5408 Anno Mundi 1647-09-30/1648-09-16\n",
+ "34994 40566 5594 Anno Mundi 1833-09-14/1834-10-03\n",
+ "35052 40624 21 Nisan 5376 Anno Mundi 1616-04-08\n",
+ "35063 40635 5555 Anno Mundi 1794-09-25/1795-09-13\n",
+ "35070 40642 5516 Anno Mundi 1755-09-06/1756-09-24\n",
+ "\n",
+ "[1128 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# example hebrew dates: every document whose date uses the Anno Mundi calendar\n",
+ "anno_mundi_docs = docs_with_docdate.doc_date_calendar == \"Anno Mundi\"\n",
+ "docs_with_docdate.loc[anno_mundi_docs, ['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_89288/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
+ " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 702 | \n",
+ " 1223 | \n",
+ " Wednesday, 9 Tammuz 4912 AM | \n",
+ " Anno Mundi | \n",
+ " 1152-06-13 | \n",
+ "
\n",
+ " \n",
+ " 16702 | \n",
+ " 19975 | \n",
+ " Sunday, 10 Kislev 5583 AM | \n",
+ " Anno Mundi | \n",
+ " 1822-11-24 | \n",
+ "
\n",
+ " \n",
+ " 25421 | \n",
+ " 30550 | \n",
+ " Tammuz 5537 AM | \n",
+ " Anno Mundi | \n",
+ " 1777-07-06/1777-08-03 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n",
+ "16702 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n",
+ "25421 30550 Tammuz 5537 AM Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "702 1152-06-13 \n",
+ "16702 1822-11-24 \n",
+ "25421 1777-07-06/1777-08-03 "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Hebrew-calendar (Anno Mundi) documents that have an original date string.\n",
+ "# Both conditions are combined into a single mask: chaining a second,\n",
+ "# full-length boolean filter onto the already-filtered frame makes pandas\n",
+ "# reindex the key and emit a UserWarning.\n",
+ "is_anno_mundi = docs_with_docdate.doc_date_calendar == \"Anno Mundi\"\n",
+ "has_original_date = docs_with_docdate.doc_date_original.notna()\n",
+ "hebrew_dates = docs_with_docdate[is_anno_mundi & has_original_date]\n",
+ "\n",
+ "# how many original dates end with \"AM\"?\n",
+ "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "cd1a751a-5299-418f-a3f8-050ab0384354",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1556 | \n",
+ " 2163 | \n",
+ " first third of Tammuz 500[.] | \n",
+ " Anno Mundi | \n",
+ " 1244/1249 | \n",
+ "
\n",
+ " \n",
+ " 1567 | \n",
+ " 2175 | \n",
+ " End of Sivan 152[.] | \n",
+ " Seleucid | \n",
+ " 1209/1218 | \n",
+ "
\n",
+ " \n",
+ " 1753 | \n",
+ " 2460 | \n",
+ " 13[..] | \n",
+ " Seleucid | \n",
+ " 988/1088 | \n",
+ "
\n",
+ " \n",
+ " 2018 | \n",
+ " 2745 | \n",
+ " 1[.] Kislev 48[..] | \n",
+ " Anno Mundi | \n",
+ " 1039-11-30/1138-11-24 | \n",
+ "
\n",
+ " \n",
+ " 3044 | \n",
+ " 3805 | \n",
+ " 13[..] | \n",
+ " Seleucid | \n",
+ " 988/1087 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 30595 | \n",
+ " 35955 | \n",
+ " 12 Muḥarram 52[.] | \n",
+ " Hijrī | \n",
+ " 1126/1134 | \n",
+ "
\n",
+ " \n",
+ " 31232 | \n",
+ " 36738 | \n",
+ " 54[.] | \n",
+ " Hijrī | \n",
+ " 1145/1154 | \n",
+ "
\n",
+ " \n",
+ " 32554 | \n",
+ " 38077 | \n",
+ " 14[...] | \n",
+ " Seleucid | \n",
+ " 1088-09-19/1188-09-23 | \n",
+ "
\n",
+ " \n",
+ " 34660 | \n",
+ " 40226 | \n",
+ " 49[.] | \n",
+ " Hijrī | \n",
+ " 1096-12-19/1106-09-01 | \n",
+ "
\n",
+ " \n",
+ " 34768 | \n",
+ " 40335 | \n",
+ " [4]82[.] | \n",
+ " Anno Mundi | \n",
+ " 1059-09-11/1069-09-18 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
67 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "1567 2175 End of Sivan 152[.] Seleucid \n",
+ "1753 2460 13[..] Seleucid \n",
+ "2018 2745 1[.] Kislev 48[..] Anno Mundi \n",
+ "3044 3805 13[..] Seleucid \n",
+ "... ... ... ... \n",
+ "30595 35955 12 Muḥarram 52[.] Hijrī \n",
+ "31232 36738 54[.] Hijrī \n",
+ "32554 38077 14[...] Seleucid \n",
+ "34660 40226 49[.] Hijrī \n",
+ "34768 40335 [4]82[.] Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "1556 1244/1249 \n",
+ "1567 1209/1218 \n",
+ "1753 988/1088 \n",
+ "2018 1039-11-30/1138-11-24 \n",
+ "3044 988/1087 \n",
+ "... ... \n",
+ "30595 1126/1134 \n",
+ "31232 1145/1154 \n",
+ "32554 1088-09-19/1188-09-23 \n",
+ "34660 1096-12-19/1106-09-01 \n",
+ "34768 1059-09-11/1069-09-18 \n",
+ "\n",
+ "[67 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many original dates contain a literal period (e.g. bracketed placeholders such as 13[..])?\n",
+ "original_date = docs_with_docdate.doc_date_original\n",
+ "contains_period = original_date.notna() & original_date.str.contains(\"\\\\.\")\n",
+ "docs_with_docdate[contains_period][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "9fa8d2ba-6612-4de5-8741-dea177f99412",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 635 | \n",
+ " 1154 | \n",
+ " Last decade of Kislev 5004 | \n",
+ " Anno Mundi | \n",
+ " 1243-12 | \n",
+ "
\n",
+ " \n",
+ " 1172 | \n",
+ " 1750 | \n",
+ " 11th Tammuz 4767 | \n",
+ " Anno Mundi | \n",
+ " 1007 | \n",
+ "
\n",
+ " \n",
+ " 1173 | \n",
+ " 1751 | \n",
+ " Monday, 27th Ṭevet 4797 | \n",
+ " Anno Mundi | \n",
+ " 1037-01-23 | \n",
+ "
\n",
+ " \n",
+ " 1556 | \n",
+ " 2163 | \n",
+ " first third of Tammuz 500[.] | \n",
+ " Anno Mundi | \n",
+ " 1244/1249 | \n",
+ "
\n",
+ " \n",
+ " 5142 | \n",
+ " 6795 | \n",
+ " last decade of Tishrei 4991 | \n",
+ " Anno Mundi | \n",
+ " 1230-09-29/1230-10-08 | \n",
+ "
\n",
+ " \n",
+ " 5223 | \n",
+ " 6892 | \n",
+ " last decade of Iyyar 4906 | \n",
+ " Anno Mundi | \n",
+ " 1146-05-04/1146-05-13 | \n",
+ "
\n",
+ " \n",
+ " 5664 | \n",
+ " 7409 | \n",
+ " last third of Ḥeshvan 4965 | \n",
+ " Anno Mundi | \n",
+ " 1204-10-17/1204-10-25 | \n",
+ "
\n",
+ " \n",
+ " 5812 | \n",
+ " 7581 | \n",
+ " middle third of Adar 4876 | \n",
+ " Anno Mundi | \n",
+ " 1116-05 | \n",
+ "
\n",
+ " \n",
+ " 7024 | \n",
+ " 9068 | \n",
+ " Last decade of Ṭevet 4898 | \n",
+ " Anno Mundi | \n",
+ " 1138-01 | \n",
+ "
\n",
+ " \n",
+ " 8639 | \n",
+ " 11215 | \n",
+ " Middle third of Av 4889 | \n",
+ " Anno Mundi | \n",
+ " 1129-07-29/1129-08-07 | \n",
+ "
\n",
+ " \n",
+ " 8818 | \n",
+ " 11418 | \n",
+ " Middle third of Sivan 4895 | \n",
+ " Anno Mundi | \n",
+ " 1135 | \n",
+ "
\n",
+ " \n",
+ " 8890 | \n",
+ " 11493 | \n",
+ " first decade of Kislev 5439 | \n",
+ " Anno Mundi | \n",
+ " 1678-11-16/1678-11-25 | \n",
+ "
\n",
+ " \n",
+ " 13598 | \n",
+ " 16487 | \n",
+ " last decade of Shevaṭ [5]141 | \n",
+ " Anno Mundi | \n",
+ " 1381-01-16/1381-01-25 | \n",
+ "
\n",
+ " \n",
+ " 23094 | \n",
+ " 27468 | \n",
+ " 23rd Shewat, 5414 | \n",
+ " Anno Mundi | \n",
+ " 1654 | \n",
+ "
\n",
+ " \n",
+ " 23103 | \n",
+ " 27477 | \n",
+ " 8th Kislev 5448 | \n",
+ " Anno Mundi | \n",
+ " 1687 | \n",
+ "
\n",
+ " \n",
+ " 23108 | \n",
+ " 27482 | \n",
+ " Friday 14th Adar I, 5463 | \n",
+ " Anno Mundi | \n",
+ " 1703 | \n",
+ "
\n",
+ " \n",
+ " 23109 | \n",
+ " 27483 | \n",
+ " Monday, 8th Adar I, 5398 | \n",
+ " Anno Mundi | \n",
+ " 1638 | \n",
+ "
\n",
+ " \n",
+ " 23110 | \n",
+ " 27484 | \n",
+ " Friday 20th Shevat 5405 | \n",
+ " Anno Mundi | \n",
+ " 1645 | \n",
+ "
\n",
+ " \n",
+ " 23113 | \n",
+ " 27487 | \n",
+ " Thursday 15th Shevat 5450 | \n",
+ " Anno Mundi | \n",
+ " 1690 | \n",
+ "
\n",
+ " \n",
+ " 23116 | \n",
+ " 27490 | \n",
+ " Thursday 19th Elul 5428 | \n",
+ " Anno Mundi | \n",
+ " 1668 | \n",
+ "
\n",
+ " \n",
+ " 23119 | \n",
+ " 27493 | \n",
+ " Sunday 21st Kislev 5460 | \n",
+ " Anno Mundi | \n",
+ " 1699 | \n",
+ "
\n",
+ " \n",
+ " 23123 | \n",
+ " 27497 | \n",
+ " Sunday 17th Sivan 5423 | \n",
+ " Anno Mundi | \n",
+ " 1663 | \n",
+ "
\n",
+ " \n",
+ " 23124 | \n",
+ " 27498 | \n",
+ " Sunday 25th Tevet 5409 | \n",
+ " Anno Mundi | \n",
+ " 1648 | \n",
+ "
\n",
+ " \n",
+ " 23137 | \n",
+ " 27511 | \n",
+ " Wednesday 28th Tevet 5399 | \n",
+ " Anno Mundi | \n",
+ " 1640 | \n",
+ "
\n",
+ " \n",
+ " 23141 | \n",
+ " 27515 | \n",
+ " Monday 15th Iyyar 5414 | \n",
+ " Anno Mundi | \n",
+ " 1654 | \n",
+ "
\n",
+ " \n",
+ " 23143 | \n",
+ " 27517 | \n",
+ " Sunday 1st Kislev 5545 | \n",
+ " Anno Mundi | \n",
+ " 1783 | \n",
+ "
\n",
+ " \n",
+ " 23172 | \n",
+ " 27546 | \n",
+ " Thursday 13th Nisan 5544 | \n",
+ " Anno Mundi | \n",
+ " 1784 | \n",
+ "
\n",
+ " \n",
+ " 23211 | \n",
+ " 27587 | \n",
+ " Monday 10th Sivan 5553 | \n",
+ " Anno Mundi | \n",
+ " 1793 | \n",
+ "
\n",
+ " \n",
+ " 23212 | \n",
+ " 27588 | \n",
+ " Monday 12th Sivan 5602 | \n",
+ " Anno Mundi | \n",
+ " 1842 | \n",
+ "
\n",
+ " \n",
+ " 23320 | \n",
+ " 27697 | \n",
+ " 10th Tamuz 5552 | \n",
+ " Anno Mundi | \n",
+ " 1792 | \n",
+ "
\n",
+ " \n",
+ " 23339 | \n",
+ " 27721 | \n",
+ " Thursday 25th Adar 5405 | \n",
+ " Anno Mundi | \n",
+ " 1645 | \n",
+ "
\n",
+ " \n",
+ " 23539 | \n",
+ " 27930 | \n",
+ " Monday 19th Sivan 5410 | \n",
+ " Anno Mundi | \n",
+ " 1650 | \n",
+ "
\n",
+ " \n",
+ " 23569 | \n",
+ " 27962 | \n",
+ " Sunday 16th Shevat 5415 | \n",
+ " Anno Mundi | \n",
+ " 1655 | \n",
+ "
\n",
+ " \n",
+ " 23574 | \n",
+ " 27967 | \n",
+ " Thursday 7th Kislev, 5431 | \n",
+ " Anno Mundi | \n",
+ " 1671 | \n",
+ "
\n",
+ " \n",
+ " 23635 | \n",
+ " 28028 | \n",
+ " 28th Tevet 5425 | \n",
+ " Anno Mundi | \n",
+ " 1665 | \n",
+ "
\n",
+ " \n",
+ " 23637 | \n",
+ " 28030 | \n",
+ " 1st Heshvan 5510 | \n",
+ " Anno Mundi | \n",
+ " 1750 | \n",
+ "
\n",
+ " \n",
+ " 23649 | \n",
+ " 28042 | \n",
+ " Sunday 16th Nissan 5438 | \n",
+ " Anno Mundi | \n",
+ " 1677 | \n",
+ "
\n",
+ " \n",
+ " 23652 | \n",
+ " 28045 | \n",
+ " Thursday 13th Tishrei 5459 | \n",
+ " Anno Mundi | \n",
+ " 1699 | \n",
+ "
\n",
+ " \n",
+ " 23667 | \n",
+ " 28060 | \n",
+ " Sunday 7th Heshvan 5425 | \n",
+ " Anno Mundi | \n",
+ " 1665 | \n",
+ "
\n",
+ " \n",
+ " 23679 | \n",
+ " 28072 | \n",
+ " Tuesday 5th Kislev, 5404 | \n",
+ " Anno Mundi | \n",
+ " 1644 | \n",
+ "
\n",
+ " \n",
+ " 23680 | \n",
+ " 28073 | \n",
+ " Friday 1st Nisan, 5405 | \n",
+ " Anno Mundi | \n",
+ " 1645 | \n",
+ "
\n",
+ " \n",
+ " 23689 | \n",
+ " 28085 | \n",
+ " Wednesday 23rd Iyyar 5410 | \n",
+ " Anno Mundi | \n",
+ " 1650 | \n",
+ "
\n",
+ " \n",
+ " 25361 | \n",
+ " 30489 | \n",
+ " first decade of Kislev 5454 | \n",
+ " Anno Mundi | \n",
+ " 1693-11-29/1693-12-08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "635 1154 Last decade of Kislev 5004 Anno Mundi \n",
+ "1172 1750 11th Tammuz 4767 Anno Mundi \n",
+ "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n",
+ "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "5142 6795 last decade of Tishrei 4991 Anno Mundi \n",
+ "5223 6892 last decade of Iyyar 4906 Anno Mundi \n",
+ "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n",
+ "5812 7581 middle third of Adar 4876 Anno Mundi \n",
+ "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n",
+ "8639 11215 Middle third of Av 4889 Anno Mundi \n",
+ "8818 11418 Middle third of Sivan 4895 Anno Mundi \n",
+ "8890 11493 first decade of Kislev 5439 Anno Mundi \n",
+ "13598 16487 last decade of Shevaṭ [5]141 Anno Mundi \n",
+ "23094 27468 23rd Shewat, 5414 Anno Mundi \n",
+ "23103 27477 8th Kislev 5448 Anno Mundi \n",
+ "23108 27482 Friday 14th Adar I, 5463 Anno Mundi \n",
+ "23109 27483 Monday, 8th Adar I, 5398 Anno Mundi \n",
+ "23110 27484 Friday 20th Shevat 5405 Anno Mundi \n",
+ "23113 27487 Thursday 15th Shevat 5450 Anno Mundi \n",
+ "23116 27490 Thursday 19th Elul 5428 Anno Mundi \n",
+ "23119 27493 Sunday 21st Kislev 5460 Anno Mundi \n",
+ "23123 27497 Sunday 17th Sivan 5423 Anno Mundi \n",
+ "23124 27498 Sunday 25th Tevet 5409 Anno Mundi \n",
+ "23137 27511 Wednesday 28th Tevet 5399 Anno Mundi \n",
+ "23141 27515 Monday 15th Iyyar 5414 Anno Mundi \n",
+ "23143 27517 Sunday 1st Kislev 5545 Anno Mundi \n",
+ "23172 27546 Thursday 13th Nisan 5544 Anno Mundi \n",
+ "23211 27587 Monday 10th Sivan 5553 Anno Mundi \n",
+ "23212 27588 Monday 12th Sivan 5602 Anno Mundi \n",
+ "23320 27697 10th Tamuz 5552 Anno Mundi \n",
+ "23339 27721 Thursday 25th Adar 5405 Anno Mundi \n",
+ "23539 27930 Monday 19th Sivan 5410 Anno Mundi \n",
+ "23569 27962 Sunday 16th Shevat 5415 Anno Mundi \n",
+ "23574 27967 Thursday 7th Kislev, 5431 Anno Mundi \n",
+ "23635 28028 28th Tevet 5425 Anno Mundi \n",
+ "23637 28030 1st Heshvan 5510 Anno Mundi \n",
+ "23649 28042 Sunday 16th Nissan 5438 Anno Mundi \n",
+ "23652 28045 Thursday 13th Tishrei 5459 Anno Mundi \n",
+ "23667 28060 Sunday 7th Heshvan 5425 Anno Mundi \n",
+ "23679 28072 Tuesday 5th Kislev, 5404 Anno Mundi \n",
+ "23680 28073 Friday 1st Nisan, 5405 Anno Mundi \n",
+ "23689 28085 Wednesday 23rd Iyyar 5410 Anno Mundi \n",
+ "25361 30489 first decade of Kislev 5454 Anno Mundi \n",
+ "\n",
+ " doc_date_standard \n",
+ "635 1243-12 \n",
+ "1172 1007 \n",
+ "1173 1037-01-23 \n",
+ "1556 1244/1249 \n",
+ "5142 1230-09-29/1230-10-08 \n",
+ "5223 1146-05-04/1146-05-13 \n",
+ "5664 1204-10-17/1204-10-25 \n",
+ "5812 1116-05 \n",
+ "7024 1138-01 \n",
+ "8639 1129-07-29/1129-08-07 \n",
+ "8818 1135 \n",
+ "8890 1678-11-16/1678-11-25 \n",
+ "13598 1381-01-16/1381-01-25 \n",
+ "23094 1654 \n",
+ "23103 1687 \n",
+ "23108 1703 \n",
+ "23109 1638 \n",
+ "23110 1645 \n",
+ "23113 1690 \n",
+ "23116 1668 \n",
+ "23119 1699 \n",
+ "23123 1663 \n",
+ "23124 1648 \n",
+ "23137 1640 \n",
+ "23141 1654 \n",
+ "23143 1783 \n",
+ "23172 1784 \n",
+ "23211 1793 \n",
+ "23212 1842 \n",
+ "23320 1792 \n",
+ "23339 1645 \n",
+ "23539 1650 \n",
+ "23569 1655 \n",
+ "23574 1671 \n",
+ "23635 1665 \n",
+ "23637 1750 \n",
+ "23649 1677 \n",
+ "23652 1699 \n",
+ "23667 1665 \n",
+ "23679 1644 \n",
+ "23680 1645 \n",
+ "23689 1650 \n",
+ "25361 1693-11-29/1693-12-08 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many use ordinals instead of numerals?\n",
+ "# NOTE: this is plain substring matching on \"st\", \"rd\" and \"th\", so it also\n",
+ "# picks up non-ordinal text such as \"Last decade\" or \"first third\", and it\n",
+ "# does not look for \"nd\" at all.\n",
+ "looks_ordinal = hebrew_dates.doc_date_original.str.contains(\"st|rd|th\")\n",
+ "hebrew_dates[looks_ordinal][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "5b6d5811-fe81-471d-bd29-896cec4c98ff",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "11th Tammuz 4767: 11 Tammuz 4767\n",
+ "27th Tevet: 27 Tevet\n",
+ "8th Kislev: 8 Kislev\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "# check that ordinal suffixes can be stripped while keeping the day number\n",
+ "ordinal_suffix = re.compile(r\"(\\d+)(st|nd|rd|th)\")\n",
+ "for val in ('11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"):\n",
+ "    # keep only the digits captured by the first group\n",
+ "    print(f\"{val}: {ordinal_suffix.sub(r'\\1', val)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "798da8f2-2332-48c2-aeec-214474e9d49c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Parse error on PGPID 603 4805/4806 (Hebrew): Could not parse '4805/4806' as a Hebrew date\n",
+ "Parse error on PGPID 613 Tishrei–Ṭevet 1495 (Seleucid): Could not parse 'Tishrei–Ṭevet 1495' as a Hebrew date\n",
+ "Parse error on PGPID 658 Ḥannuka 1548 (Seleucid): Could not parse 'Ḥannuka 1548' as a Hebrew date\n",
+ "Parse error on PGPID 860 Marheshvan 1460 (Seleucid): Could not parse 'Marheshvan 1460' as a Hebrew date\n",
+ "Parse error on PGPID 997 Second third Av 1414 (Seleucid): Could not parse 'Second third Av 1414' as a Hebrew date\n",
+ "Parse error on PGPID 1098 Early Elul 1476 (Seleucid): Could not parse 'Early Elul 1476' as a Hebrew date\n",
+ "Parse error on PGPID 1111 Second third Tammuz 1529 (Seleucid): Could not parse 'Second third Tammuz 1529' as a Hebrew date\n",
+ "Parse error on PGPID 1139 Passover 1537 (Seleucid): Could not parse 'Passover 1537' as a Hebrew date\n",
+ "Parse error on PGPID 1140 Sivan, 1564 (Seleucid): Could not parse 'Sivan, 1564' as a Hebrew date\n",
+ "Parse error on PGPID 1339 426–30 (Islamic): Could not parse '426–30' as an Islamic date\n",
+ "Parse error on PGPID 1368 Shevaṭ 1471 (Seleucid): Could not parse 'Shevaṭ 1471' as a Hebrew date\n",
+ "Parse error on PGPID 1864 Tishrei, 15?? (Seleucid): Could not parse 'Tishrei, 15??' as a Hebrew date\n",
+ "Parse error on PGPID 2129 ten days Tammuz 1410 (Seleucid): Could not parse 'ten days Tammuz 1410' as a Hebrew date\n",
+ "Parse error on PGPID 2134 day Shevaṭ 1447 (Seleucid): Could not parse 'day Shevaṭ 1447' as a Hebrew date\n",
+ "Parse error on PGPID 2142 Elul 7 1425 (Seleucid): Could not parse 'Elul 7 1425' as a Hebrew date\n",
+ "ignoring missing digits for now first third of Tammuz 500[.]\n",
+ "ignoring missing digits for now End of Sivan 152[.]\n",
+ "Parse error on PGPID 2410 1461-03-15 (Seleucid): Could not parse '1461-03-15' as a Hebrew date\n",
+ "parsed 1461-03-15 with ISO8601 format and calendar Seleucid, result is 1461-03-15 (1150-06-18/1150-06-18)\n",
+ "ignoring missing digits for now 13[..]\n",
+ "Parse error on PGPID 2601 Adar (Seleucid): Unexpected end-of-input. Expected one of: \n",
+ "\t* __ANON_0\n",
+ "\n",
+ "Parse error on PGPID 2664 ten days Shevaṭ 1418 (Seleucid): Could not parse 'ten days Shevaṭ 1418' as a Hebrew date\n",
+ "Parse error on PGPID 2687 month 1560 (Seleucid): Could not parse 'month 1560' as a Hebrew date\n",
+ "ignoring missing digits for now 1[.] Kislev 48[..]\n",
+ "Parse error on PGPID 2958 Iyyār 1459 (Seleucid): Could not parse 'Iyyār 1459' as a Hebrew date\n",
+ "Parse error on PGPID 2985 Adar II, 1446 (Seleucid): Could not parse 'Adar II, 1446' as a Hebrew date\n",
+ "Parse error on PGPID 3021 1526 4975 (Seleucid): Could not parse '1526 4975' as a Hebrew date\n",
+ "Parse error on PGPID 3209 Tishrei 1370-9 (Seleucid): Could not parse 'Tishrei 1370-9' as a Hebrew date\n",
+ "Could not parse Tishrei 1370-9 as ISO date: invalid literal for int() with base 10: 'Tishrei 1370'\n",
+ "Parse error on PGPID 3307 Sivan 18 1329 (Seleucid): Could not parse 'Sivan 18 1329' as a Hebrew date\n",
+ "Parse error on PGPID 3430 Sunday, 1406 (Seleucid): Could not parse 'Sunday, 1406' as a Hebrew date\n",
+ "Parse error on PGPID 3524 1471 Shevaṭ 1471 (Seleucid): Could not parse '1471 Shevaṭ 1471' as a Hebrew date\n",
+ "Parse error on PGPID 3603 shortly before Elul 1437 (Seleucid): Could not parse 'shortly before Elul 1437' as a Hebrew date\n",
+ "Parse error on PGPID 3637 23 Ḥeshvan 521 (Islamic): Could not parse '23 Ḥeshvan 521' as an Islamic date\n",
+ "ignoring missing digits for now 13[..]\n",
+ "Parse error on PGPID 4011 ten days Tishrei 1458 (Seleucid): Could not parse 'ten days Tishrei 1458' as a Hebrew date\n",
+ "ignoring missing digits for now Av [14]9[.]\n",
+ "ignoring missing digits for now 13[..]\n",
+ "ignoring missing digits for now 144[.]\n",
+ "Parse error on PGPID 4209 ten days Shevat 1419 (Seleucid): Could not parse 'ten days Shevat 1419' as a Hebrew date\n",
+ "ignoring missing digits for now Adar 143[.]\n",
+ "ignoring missing digits for now Tuesday, 27 [...] 1431\n",
+ "Parse error on PGPID 4241 Tuesday, 27 000 1431 (Seleucid): Could not parse 'Tuesday, 27 000 1431' as a Hebrew date\n",
+ "Parse error on PGPID 4253 26 Sivan 14?? (Seleucid): Could not parse '26 Sivan 14??' as a Hebrew date\n",
+ "Parse error on PGPID 4609 18 Marheshvan 1539 (Seleucid): Could not parse '18 Marheshvan 1539' as a Hebrew date\n",
+ "Parse error on PGPID 4756 Monday evening, 30 Kislev 1460 (Seleucid): Could not parse 'Monday evening, 30 Kislev 1460' as a Hebrew date\n",
+ "Parse error on PGPID 5319 Pesaḥ 1529 (Seleucid): Could not parse 'Pesaḥ 1529' as a Hebrew date\n",
+ "Parse error on PGPID 5386 3 Nissan 1409 (Seleucid): Could not parse '3 Nissan 1409' as a Hebrew date\n",
+ "Parse error on PGPID 5387 3 Nissan 1409 (Seleucid): Could not parse '3 Nissan 1409' as a Hebrew date\n",
+ "Parse error on PGPID 5636 ten days Elul 1440 (Seleucid): Could not parse 'ten days Elul 1440' as a Hebrew date\n",
+ "Parse error on PGPID 5902 10 Elul 444 (Islamic): Could not parse '10 Elul 444' as an Islamic date\n",
+ "Parse error on PGPID 6000 447/449 (Islamic): Could not parse '447/449' as an Islamic date\n",
+ "Parse error on PGPID 6037 449/450 (Islamic): Could not parse '449/450' as an Islamic date\n",
+ "Parse error on PGPID 6058 Tishrei 444 (Islamic): Could not parse 'Tishrei 444' as an Islamic date\n",
+ "Parse error on PGPID 6235 467-04 (Islamic): Could not parse '467-04' as an Islamic date\n",
+ "parsed 467-04 with ISO8601 format and calendar Islamic, result is 0467-04 (1074-11-30/1074-12-28)\n",
+ "Parse error on PGPID 6249 Tuesday, 17 SIvan 1475 (Seleucid): Could not parse 'Tuesday, 17 SIvan 1475' as a Hebrew date\n",
+ "Parse error on PGPID 6292 1452-03-08 (Seleucid): Could not parse '1452-03-08' as a Hebrew date\n",
+ "parsed 1452-03-08 with ISO8601 format and calendar Seleucid, result is 1452-03-08 (1141-05-23/1141-05-23)\n",
+ "Parse error on PGPID 6325 20 day the Omer 1545 (Seleucid): Could not parse '20 day the Omer 1545' as a Hebrew date\n",
+ "ignoring missing digits for now 2[.] Ḥeshvan 1352\n",
+ "Parse error on PGPID 6542 Nisan 1497 – Tishrei 1498 (Seleucid): Could not parse 'Nisan 1497 – Tishrei 1498' as a Hebrew date\n",
+ "Parse error on PGPID 6581 Iyyar-Tammuz 1475 (Seleucid): Could not parse 'Iyyar-Tammuz 1475' as a Hebrew date\n",
+ "Could not parse Iyyar-Tammuz 1475 as ISO date: invalid literal for int() with base 10: 'Iyyar'\n",
+ "Parse error on PGPID 6582 1543-08 (Seleucid): Could not parse '1543-08' as a Hebrew date\n",
+ "parsed 1543-08 with ISO8601 format and calendar Seleucid, result is 1543-08 (1231-10-06/1231-11-03)\n",
+ "Parse error on PGPID 6628 Iyar 1510–29 (Seleucid): Could not parse 'Iyar 1510–29' as a Hebrew date\n",
+ "Parse error on PGPID 6803 1357–59 (Seleucid): Could not parse '1357–59' as a Hebrew date\n",
+ "Parse error on PGPID 6827 day Dhū l-Qaʿda 544 (Islamic): Could not parse 'day Dhū l-Qaʿda 544' as an Islamic date\n",
+ "Could not parse day Dhū l-Qaʿda 544 as ISO date: invalid literal for int() with base 10: 'day Dhū l'\n",
+ "Parse error on PGPID 6834 Approximately 15 weeks prior to Ḥeshvan 1451 (Seleucid): Could not parse 'Approximately 15 weeks prior to Ḥeshvan 1451' as a Hebrew date\n",
+ "ignoring missing digits for now 2[.] Ṭevet 4874\n",
+ "Parse error on PGPID 6913 Ḥeshvan–Kislev 1339 (Seleucid): Could not parse 'Ḥeshvan–Kislev 1339' as a Hebrew date\n",
+ "Parse error on PGPID 6925 Wednesday, 24 month year (Seleucid): Could not parse 'Wednesday, 24 month year' as a Hebrew date\n",
+ "Parse error on PGPID 6982 5–10 Adar II 1535 (Seleucid): Could not parse '5–10 Adar II 1535' as a Hebrew date\n",
+ "Parse error on PGPID 7029 Thursday, 20 Iyar 4758 or 4768 (Hebrew): Could not parse 'Thursday, 20 Iyar 4758 or 4768' as a Hebrew date\n",
+ "Parse error on PGPID 7032 Monday, Shevaṭ 1461 (Seleucid): Could not parse 'Monday, Shevaṭ 1461' as a Hebrew date\n",
+ "ignoring missing digits for now Ḥeshvan 136[.]\n",
+ "Parse error on PGPID 7098 4761–68 (Hebrew): Could not parse '4761–68' as a Hebrew date\n",
+ "ignoring missing digits for now 48[..]\n",
+ "Parse error on PGPID 7150 Nisan–Iyyar 1341 (Seleucid): Could not parse 'Nisan–Iyyar 1341' as a Hebrew date\n",
+ "ignoring missing digits for now Thursday, 13 Nisan 15[..]\n",
+ "Parse error on PGPID 7299 Ṭevet, 1424 (Seleucid): Could not parse 'Ṭevet, 1424' as a Hebrew date\n",
+ "ignoring missing digits for now 2[.] Kislev 5327\n",
+ "Parse error on PGPID 7415 4 Elul 1420–29 (Seleucid): Could not parse '4 Elul 1420–29' as a Hebrew date\n",
+ "Parse error on PGPID 7427 439–40 (Islamic): Could not parse '439–40' as an Islamic date\n",
+ "ignoring missing digits for now 13[..]\n",
+ "Parse error on PGPID 7580 ten days Kislev 1428 (Seleucid): Could not parse 'ten days Kislev 1428' as a Hebrew date\n",
+ "Parse error on PGPID 7612 Ca. 27 Iyyar 1547 (Seleucid): Could not parse 'Ca. 27 Iyyar 1547' as a Hebrew date\n",
+ "Parse error on PGPID 7735 Adar II–Nisan 1339 (Seleucid): Could not parse 'Adar II–Nisan 1339' as a Hebrew date\n",
+ "Parse error on PGPID 7744 1335-10-14 (Seleucid): Could not parse '1335-10-14' as a Hebrew date\n",
+ "parsed 1335-10-14 with ISO8601 format and calendar Seleucid, result is 1335-10-14 (1024-01-05/1024-01-05)\n",
+ "Parse error on PGPID 7745 1337-01-27 (Seleucid): Could not parse '1337-01-27' as a Hebrew date\n",
+ "parsed 1337-01-27 with ISO8601 format and calendar Seleucid, result is 1337-01-27 (1026-04-23/1026-04-23)\n",
+ "Parse error on PGPID 7746 1341-03-18 (Seleucid): Could not parse '1341-03-18' as a Hebrew date\n",
+ "parsed 1341-03-18 with ISO8601 format and calendar Seleucid, result is 1341-03-18 (1030-05-28/1030-05-28)\n",
+ "Parse error on PGPID 7747 1371-02-20 (Seleucid): Could not parse '1371-02-20' as a Hebrew date\n",
+ "parsed 1371-02-20 with ISO8601 format and calendar Seleucid, result is 1371-02-20 (1060-04-29/1060-04-29)\n",
+ "Parse error on PGPID 7748 1377-07-04 (Seleucid): Could not parse '1377-07-04' as a Hebrew date\n",
+ "parsed 1377-07-04 with ISO8601 format and calendar Seleucid, result is 1377-07-04 (1065-09-12/1065-09-12)\n",
+ "Parse error on PGPID 7749 1430-09-16 (Seleucid): Could not parse '1430-09-16' as a Hebrew date\n",
+ "parsed 1430-09-16 with ISO8601 format and calendar Seleucid, result is 1430-09-16 (1118-12-09/1118-12-09)\n",
+ "Parse error on PGPID 7750 1399-04-16 (Seleucid): Could not parse '1399-04-16' as a Hebrew date\n",
+ "parsed 1399-04-16 with ISO8601 format and calendar Seleucid, result is 1399-04-16 (1088-07-14/1088-07-14)\n",
+ "Parse error on PGPID 7752 1395-09-02 (Seleucid): Could not parse '1395-09-02' as a Hebrew date\n",
+ "parsed 1395-09-02 with ISO8601 format and calendar Seleucid, result is 1395-09-02 (1083-11-21/1083-11-21)\n",
+ "Parse error on PGPID 7753 1399-04-16 (Seleucid): Could not parse '1399-04-16' as a Hebrew date\n",
+ "parsed 1399-04-16 with ISO8601 format and calendar Seleucid, result is 1399-04-16 (1088-07-14/1088-07-14)\n",
+ "Parse error on PGPID 7755 1415-05-12 (Seleucid): Could not parse '1415-05-12' as a Hebrew date\n",
+ "parsed 1415-05-12 with ISO8601 format and calendar Seleucid, result is 1415-05-12 (1104-08-12/1104-08-12)\n",
+ "Parse error on PGPID 7757 1148-07-17 (Seleucid): Could not parse '1148-07-17' as a Hebrew date\n",
+ "parsed 1148-07-17 with ISO8601 format and calendar Seleucid, result is 1148-07-17 (0836-10-06/0836-10-06)\n",
+ "Parse error on PGPID 7759 1440-09-15 (Seleucid): Could not parse '1440-09-15' as a Hebrew date\n",
+ "parsed 1440-09-15 with ISO8601 format and calendar Seleucid, result is 1440-09-15 (1128-11-16/1128-11-16)\n",
+ "Parse error on PGPID 7760 1441-09-02 (Seleucid): Could not parse '1441-09-02' as a Hebrew date\n",
+ "parsed 1441-09-02 with ISO8601 format and calendar Seleucid, result is 1441-09-02 (1129-11-23/1129-11-23)\n",
+ "Parse error on PGPID 7762 1443-04-29 (Seleucid): Could not parse '1443-04-29' as a Hebrew date\n",
+ "parsed 1443-04-29 with ISO8601 format and calendar Seleucid, result is 1443-04-29 (1132-07-21/1132-07-21)\n",
+ "Parse error on PGPID 7763 1446-12-03 (Seleucid): Could not parse '1446-12-03' as a Hebrew date\n",
+ "parsed 1446-12-03 with ISO8601 format and calendar Seleucid, result is 1446-12-03 (1135-02-25/1135-02-25)\n",
+ "Parse error on PGPID 7765 1457-06-12 (Seleucid): Could not parse '1457-06-12' as a Hebrew date\n",
+ "parsed 1457-06-12 with ISO8601 format and calendar Seleucid, result is 1457-06-12 (1146-08-29/1146-08-29)\n",
+ "Parse error on PGPID 7766 1461-10-22 (Seleucid): Could not parse '1461-10-22' as a Hebrew date\n",
+ "parsed 1461-10-22 with ISO8601 format and calendar Seleucid, result is 1461-10-22 (1149-12-30/1149-12-30)\n",
+ "Parse error on PGPID 7767 1461-03-15 (Seleucid): Could not parse '1461-03-15' as a Hebrew date\n",
+ "parsed 1461-03-15 with ISO8601 format and calendar Seleucid, result is 1461-03-15 (1150-06-18/1150-06-18)\n",
+ "Parse error on PGPID 7768 1464-07-19 (Seleucid): Could not parse '1464-07-19' as a Hebrew date\n",
+ "parsed 1464-07-19 with ISO8601 format and calendar Seleucid, result is 1464-07-19 (1152-09-26/1152-09-26)\n",
+ "Parse error on PGPID 7769 1466-11-19 (Seleucid): Could not parse '1466-11-19' as a Hebrew date\n",
+ "parsed 1466-11-19 with ISO8601 format and calendar Seleucid, result is 1466-11-19 (1155-01-31/1155-01-31)\n",
+ "Parse error on PGPID 7770 1486-02-12 (Seleucid): Could not parse '1486-02-12' as a Hebrew date\n",
+ "parsed 1486-02-12 with ISO8601 format and calendar Seleucid, result is 1486-02-12 (1175-05-12/1175-05-12)\n",
+ "Parse error on PGPID 7771 1473-02-05 (Seleucid): Could not parse '1473-02-05' as a Hebrew date\n",
+ "parsed 1473-02-05 with ISO8601 format and calendar Seleucid, result is 1473-02-05 (1162-04-28/1162-04-28)\n",
+ "Parse error on PGPID 7772 1476-11-05 (Seleucid): Could not parse '1476-11-05' as a Hebrew date\n",
+ "parsed 1476-11-05 with ISO8601 format and calendar Seleucid, result is 1476-11-05 (1165-01-27/1165-01-27)\n",
+ "Parse error on PGPID 7773 1477-11-25 (Seleucid): Could not parse '1477-11-25' as a Hebrew date\n",
+ "parsed 1477-11-25 with ISO8601 format and calendar Seleucid, result is 1477-11-25 (1166-02-05/1166-02-05)\n",
+ "Parse error on PGPID 7774 1489-12-20 (Seleucid): Could not parse '1489-12-20' as a Hebrew date\n",
+ "parsed 1489-12-20 with ISO8601 format and calendar Seleucid, result is 1489-12-20 (1178-02-16/1178-02-16)\n",
+ "Parse error on PGPID 7776 1492-12-20 (Seleucid): Could not parse '1492-12-20' as a Hebrew date\n",
+ "parsed 1492-12-20 with ISO8601 format and calendar Seleucid, result is 1492-12-20 (1181-03-16/1181-03-16)\n",
+ "Parse error on PGPID 7777 1524-09-05 (Seleucid): Could not parse '1524-09-05' as a Hebrew date\n",
+ "parsed 1524-09-05 with ISO8601 format and calendar Seleucid, result is 1524-09-05 (1212-11-08/1212-11-08)\n",
+ "Parse error on PGPID 7780 1537-07-10 (Seleucid): Could not parse '1537-07-10' as a Hebrew date\n",
+ "parsed 1537-07-10 with ISO8601 format and calendar Seleucid, result is 1537-07-10 (1225-09-20/1225-09-20)\n",
+ "Parse error on PGPID 7781 1590-05-18 (Seleucid): Could not parse '1590-05-18' as a Hebrew date\n",
+ "parsed 1590-05-18 with ISO8601 format and calendar Seleucid, result is 1590-05-18 (1279-08-05/1279-08-05)\n",
+ "Parse error on PGPID 7926 second half Adar II 1366 (Seleucid): Could not parse 'second half Adar II 1366' as a Hebrew date\n",
+ "Parse error on PGPID 7968 Second third Elul, 1482 (Seleucid): Could not parse 'Second third Elul, 1482' as a Hebrew date\n",
+ "Parse error on PGPID 8072 Tammuz/Av 1466 (Seleucid): Could not parse 'Tammuz/Av 1466' as a Hebrew date\n",
+ "ignoring missing digits for now 13 Nisan 13[..]\n",
+ "Parse error on PGPID 8258 Elul 1141 – Tishrei 1442 (Seleucid): Could not parse 'Elul 1141 – Tishrei 1442' as a Hebrew date\n",
+ "Parse error on PGPID 8271 10 days Shawwāl 480 (Islamic): Could not parse '10 days Shawwāl 480' as an Islamic date\n",
+ "Parse error on PGPID 8517 Iyyār 1461 (Seleucid): Could not parse 'Iyyār 1461' as a Hebrew date\n",
+ "Parse error on PGPID 8665 Monday, 24 Iyyār 1403 (Seleucid): Could not parse 'Monday, 24 Iyyār 1403' as a Hebrew date\n",
+ "Parse error on PGPID 8778 Tammuz 1549–Tammuz 1550 (Seleucid): Could not parse 'Tammuz 1549–Tammuz 1550' as a Hebrew date\n",
+ "ignoring missing digits for now 153[.]\n",
+ "Parse error on PGPID 8925 Rabi al-Awwal 617 (Islamic): Could not parse 'Rabi al-Awwal 617' as an Islamic date\n",
+ "Could not parse Rabi al-Awwal 617 as ISO date: invalid literal for int() with base 10: 'Rabi al'\n",
+ "ignoring missing digits for now 139[.]\n",
+ "Parse error on PGPID 9198 Iyyar 571 (Islamic): Could not parse 'Iyyar 571' as an Islamic date\n",
+ "Parse error on PGPID 9235 516/521 (Islamic): Could not parse '516/521' as an Islamic date\n",
+ "Parse error on PGPID 9253 12 Nissan 1409 (Seleucid): Could not parse '12 Nissan 1409' as a Hebrew date\n",
+ "Parse error on PGPID 9504 Tishrei-Ḥeshvan 1472 (Seleucid): Could not parse 'Tishrei-Ḥeshvan 1472' as a Hebrew date\n",
+ "Could not parse Tishrei-Ḥeshvan 1472 as ISO date: invalid literal for int() with base 10: 'Tishrei'\n",
+ "Parse error on PGPID 9522 24 Adar Sheni 1280 (Seleucid): Could not parse '24 Adar Sheni 1280' as a Hebrew date\n",
+ "Parse error on PGPID 9682 ten days Av 1427 (Seleucid): Could not parse 'ten days Av 1427' as a Hebrew date\n",
+ "Parse error on PGPID 9915 427–29 (Islamic): Could not parse '427–29' as an Islamic date\n",
+ "Parse error on PGPID 10446 11 Rabīʿ II 488/20 Rabīʿ II 488 (Islamic): Could not parse '11 Rabīʿ II 488/20 Rabīʿ II 488' as an Islamic date\n",
+ "ignoring missing digits for now 135[.]\n",
+ "Parse error on PGPID 11091 Tishrei 7 1421 (Seleucid): Could not parse 'Tishrei 7 1421' as a Hebrew date\n",
+ "ignoring missing digits for now 13[..]\n",
+ "Parse error on PGPID 11249 909-03-14 (Islamic): Could not parse '909-03-14' as an Islamic date\n",
+ "parsed 909-03-14 with ISO8601 format and calendar Islamic, result is 0909-03-14 (1503-09-16/1503-09-16)\n",
+ "Parse error on PGPID 11250 909-03 (Islamic): Could not parse '909-03' as an Islamic date\n",
+ "parsed 909-03 with ISO8601 format and calendar Islamic, result is 0909-03 (1503-09-03/1503-10-02)\n",
+ "Parse error on PGPID 11336 Wednesday, Shevaṭ 1402 (Seleucid): Could not parse 'Wednesday, Shevaṭ 1402' as a Hebrew date\n",
+ "ignoring missing digits for now 14 Sivan 156[.]\n",
+ "Parse error on PGPID 11591 Av – Elul 1527 (Seleucid): Could not parse 'Av – Elul 1527' as a Hebrew date\n",
+ "Parse error on PGPID 12008 Rabīʿ al-Awwal 1259 (Islamic): Could not parse 'Rabīʿ al-Awwal 1259' as an Islamic date\n",
+ "Could not parse Rabīʿ al-Awwal 1259 as ISO date: invalid literal for int() with base 10: 'Rabīʿ al'\n",
+ "Parse error on PGPID 12664 403-12-25 (Islamic): Could not parse '403-12-25' as an Islamic date\n",
+ "parsed 403-12-25 with ISO8601 format and calendar Islamic, result is 0403-12-25 (1013-07-13/1013-07-13)\n",
+ "ignoring missing digits for now 17[..]\n",
+ "ignoring missing digits for now Ramaḍān 4[..]\n",
+ "ignoring missing digits for now Shevaṭ 47[..]\n",
+ "Parse error on PGPID 16232 1469/1471 (Seleucid): Could not parse '1469/1471' as a Hebrew date\n",
+ "Parse error on PGPID 16247 495/524 (Islamic): Could not parse '495/524' as an Islamic date\n",
+ "ignoring missing digits for now Elul 143[.]\n",
+ "ignoring missing digits for now Kislev 152[.]\n",
+ "Parse error on PGPID 16451 before Sukkot 1684 (Seleucid): Could not parse 'before Sukkot 1684' as a Hebrew date\n",
+ "ignoring missing digits for now 48[..]\n",
+ "ignoring missing digits for now 14[..]\n",
+ "ignoring missing digits for now 15[..]\n",
+ "ignoring missing digits for now 14[..]\n",
+ "Parse error on PGPID 16717 Rabīʿ II 873 (Islamic): Could not parse 'Rabīʿ II 873' as an Islamic date\n",
+ "Parse error on PGPID 16725 621–29 (Islamic): Could not parse '621–29' as an Islamic date\n",
+ "Parse error on PGPID 16743 half 420 (Islamic): Could not parse 'half 420' as an Islamic date\n",
+ "Parse error on PGPID 17119 Shabbat Badmibar 1651 (Seleucid): Could not parse 'Shabbat Badmibar 1651' as a Hebrew date\n",
+ "Parse error on PGPID 18203 18 Shaʿbān – 15 Ramaḍān 934 (Islamic): Could not parse '18 Shaʿbān – 15 Ramaḍān 934' as an Islamic date\n",
+ "Parse error on PGPID 18566 10-20 Shaʿbān 448 (Islamic): Could not parse '10-20 Shaʿbān 448' as an Islamic date\n",
+ "Could not parse 10-20 Shaʿbān 448 as ISO date: invalid literal for int() with base 10: '20 Shaʿbān 448'\n",
+ "Parse error on PGPID 18576 548/549 (Islamic): Could not parse '548/549' as an Islamic date\n",
+ "Parse error on PGPID 18588 ten days Av 1425 (Seleucid): Could not parse 'ten days Av 1425' as a Hebrew date\n",
+ "ignoring missing digits for now [4]84[.]\n",
+ "Parse error on PGPID 19110 Adar 1408 – Kislev 1409 (Seleucid): Could not parse 'Adar 1408 – Kislev 1409' as a Hebrew date\n",
+ "ignoring missing digits for now Kislev 13[..]\n",
+ "Parse error on PGPID 19137 A Monday in the Tammuz 1434 (Seleucid): Could not parse 'A Monday in the Tammuz 1434' as a Hebrew date\n",
+ "ignoring missing digits for now 145[.]\n",
+ "Parse error on PGPID 19319 20-29 Elul 5500 (Hebrew): Could not parse '20-29 Elul 5500' as a Hebrew date\n",
+ "Could not parse 20-29 Elul 5500 as ISO date: invalid literal for int() with base 10: '29 Elul 5500'\n",
+ "ignoring missing digits for now 194[.]\n",
+ "Parse error on PGPID 20077 1459–62 (Seleucid): Could not parse '1459–62' as a Hebrew date\n",
+ "Parse error on PGPID 20404 5452–57 (Hebrew): Could not parse '5452–57' as a Hebrew date\n",
+ "Parse error on PGPID 20406 439–40 (Islamic): Could not parse '439–40' as an Islamic date\n",
+ "Parse error on PGPID 20647 16 Dhu l-Qa’da 550 (Islamic): Could not parse '16 Dhu l-Qa’da 550' as an Islamic date\n",
+ "Could not parse 16 Dhu l-Qa’da 550 as ISO date: invalid literal for int() with base 10: '16 Dhu l'\n",
+ "ignoring missing digits for now 6[..]\n",
+ "Parse error on PGPID 20715 536–537 (Islamic): Could not parse '536–537' as an Islamic date\n",
+ "Parse error on PGPID 20784 409/410 (Islamic): Could not parse '409/410' as an Islamic date\n",
+ "ignoring missing digits for now 154[.]\n",
+ "Parse error on PGPID 21102 522–73 (Islamic): Could not parse '522–73' as an Islamic date\n",
+ "ignoring missing digits for now first third of Ramaḍān 54[.]\n",
+ "ignoring missing digits for now 47[.]\n",
+ "ignoring missing digits for now Sivan 53[..]\n",
+ "Parse error on PGPID 22157 השע\"ד (Hebrew): Could not parse 'השע\"ד' as a Hebrew date\n",
+ "Parse error on PGPID 22446 1463/4 (Seleucid): Could not parse '1463/4' as a Hebrew date\n",
+ "ignoring missing digits for now Adar II 17[..]\n",
+ "Parse error on PGPID 22863 אתלא (Seleucid): Could not parse 'אתלא' as a Hebrew date\n",
+ "ignoring missing digits for now 64[.]\n",
+ "Parse error on PGPID 23880 608–09 (Islamic): Could not parse '608–09' as an Islamic date\n",
+ "Parse error on PGPID 23926 3–4 Dhū l-Ḥijja 600 (Islamic): Could not parse '3–4 Dhū l-Ḥijja 600' as an Islamic date\n",
+ "Could not parse 3–4 Dhū l-Ḥijja 600 as ISO date: invalid literal for int() with base 10: '3–4 Dhū l'\n",
+ "ignoring missing digits for now Sivan 142[.]\n",
+ "Parse error on PGPID 24668 Sunday, 16 Rabiʿ I 403 (Islamic): Could not parse 'Sunday, 16 Rabiʿ I 403' as an Islamic date\n",
+ "ignoring missing digits for now 145[.]\n",
+ "Parse error on PGPID 24843 503–04 (Islamic): Could not parse '503–04' as an Islamic date\n",
+ "ignoring missing digits for now Rabīʿ I 49[.]\n",
+ "Parse error on PGPID 24872 Rabīʿ I 490 (Islamic): Could not parse 'Rabīʿ I 490' as an Islamic date\n",
+ "Parse error on PGPID 25106 Rabī ʿ 1312 (Islamic): Could not parse 'Rabī ʿ 1312' as an Islamic date\n",
+ "Parse error on PGPID 25233 19 Rabiʿ II 566 (Islamic): Could not parse '19 Rabiʿ II 566' as an Islamic date\n",
+ "Parse error on PGPID 25298 Nisan 1463 – Adar 1464 (Seleucid): Could not parse 'Nisan 1463 – Adar 1464' as a Hebrew date\n",
+ "ignoring missing digits for now 52[..]\n",
+ "ignoring missing digits for now Tuesday, 2[.] Shevaṭ 1341\n",
+ "ignoring missing digits for now 157[.]\n",
+ "Parse error on PGPID 25917 14–28 Ramaḍān 433 (Islamic): Could not parse '14–28 Ramaḍān 433' as an Islamic date\n",
+ "ignoring missing digits for now Ṭevet 143[.]\n",
+ "Parse error on PGPID 26405 26 Adar II 1600 — 26 Elul 1601 (Seleucid): Could not parse '26 Adar II 1600 — 26 Elul 1601' as a Hebrew date\n",
+ "ignoring missing digits for now Elul 498[.]\n",
+ "Parse error on PGPID 26553 1361–62 (Seleucid): Could not parse '1361–62' as a Hebrew date\n",
+ "ignoring missing digits for now Ḥeshvan 13[..]\n",
+ "Parse error on PGPID 27238 תר\"ב (Hebrew): Could not parse 'תר\"ב' as a Hebrew date\n",
+ "Parse error on PGPID 27254 התר\"ן (Hebrew): Could not parse 'התר\"ן' as a Hebrew date\n",
+ "Parse error on PGPID 27468 23 Shewat, 5414 (Hebrew): Could not parse '23 Shewat, 5414' as a Hebrew date\n",
+ "Parse error on PGPID 27474 Tuesday 3 Nissan 5443 (Hebrew): Could not parse 'Tuesday 3 Nissan 5443' as a Hebrew date\n",
+ "Parse error on PGPID 27697 10 Tamuz 5552 (Hebrew): Could not parse '10 Tamuz 5552' as a Hebrew date\n",
+ "ignoring missing digits for now 1[.] Sivan 1420\n",
+ "Parse error on PGPID 27913 Rabīʿ I or Rabīʿ II 915 (Islamic): Could not parse 'Rabīʿ I or Rabīʿ II 915' as an Islamic date\n",
+ "Parse error on PGPID 27938 Adar II 5334–Sivan 5535 (Hebrew): Could not parse 'Adar II 5334–Sivan 5535' as a Hebrew date\n",
+ "Parse error on PGPID 27978 Monday, Iyyar 2, 5405 (Hebrew): Could not parse 'Monday, Iyyar 2, 5405' as a Hebrew date\n",
+ "Parse error on PGPID 28042 Sunday 16 Nissan 5438 (Hebrew): Could not parse 'Sunday 16 Nissan 5438' as a Hebrew date\n",
+ "Parse error on PGPID 28293 1-10 Shevaṭ 5577 (Hebrew): Could not parse '1-10 Shevaṭ 5577' as a Hebrew date\n",
+ "Could not parse 1-10 Shevaṭ 5577 as ISO date: invalid literal for int() with base 10: '10 Shevaṭ 5577'\n",
+ "Parse error on PGPID 28487 1219–20 (Islamic): Could not parse '1219–20' as an Islamic date\n",
+ "Parse error on PGPID 28919 Av 5579/Av 5582 (Hebrew): Could not parse 'Av 5579/Av 5582' as a Hebrew date\n",
+ "Parse error on PGPID 28997 Adar II, 1451 (Seleucid): Could not parse 'Adar II, 1451' as a Hebrew date\n",
+ "Parse error on PGPID 29029 25 Dhū l-Qa‘da 403 (Islamic): Could not parse '25 Dhū l-Qa‘da 403' as an Islamic date\n",
+ "Could not parse 25 Dhū l-Qa‘da 403 as ISO date: invalid literal for int() with base 10: '25 Dhū l'\n",
+ "ignoring missing digits for now 41[.]\n",
+ "Parse error on PGPID 30314 Av 5493 – Tishrei 5494 (Hebrew): Could not parse 'Av 5493 – Tishrei 5494' as a Hebrew date\n",
+ "Parse error on PGPID 30553 5442–43 (Hebrew): Could not parse '5442–43' as a Hebrew date\n",
+ "Parse error on PGPID 30944 859-04-13 (Islamic): Could not parse '859-04-13' as an Islamic date\n",
+ "parsed 859-04-13 with ISO8601 format and calendar Islamic, result is 0859-04-13 (1455-04-11/1455-04-11)\n",
+ "Parse error on PGPID 30990 634-06-14 (Islamic): Could not parse '634-06-14' as an Islamic date\n",
+ "parsed 634-06-14 with ISO8601 format and calendar Islamic, result is 0634-06-14 (1237-02-19/1237-02-19)\n",
+ "Parse error on PGPID 31058 723-01-27 (Islamic): Could not parse '723-01-27' as an Islamic date\n",
+ "parsed 723-01-27 with ISO8601 format and calendar Islamic, result is 0723-01-27 (1323-02-13/1323-02-13)\n",
+ "Parse error on PGPID 31080 1247-05-18 (Islamic): Could not parse '1247-05-18' as an Islamic date\n",
+ "parsed 1247-05-18 with ISO8601 format and calendar Islamic, result is 1247-05-18 (1831-10-25/1831-10-25)\n",
+ "ignoring missing digits for now 123[.]\n",
+ "Parse error on PGPID 31879 1207-08 (Islamic): Could not parse '1207-08' as an Islamic date\n",
+ "parsed 1207-08 with ISO8601 format and calendar Islamic, result is 1207-08 (1793-03-14/1793-04-11)\n",
+ "Parse error on PGPID 32721 531/533 (Islamic): Could not parse '531/533' as an Islamic date\n",
+ "Parse error on PGPID 32884 Muḥarram 1246 - Ṣafar 1246 (Islamic): Could not parse 'Muḥarram 1246 - Ṣafar 1246' as an Islamic date\n",
+ "Could not parse Muḥarram 1246 - Ṣafar 1246 as ISO date: invalid literal for int() with base 10: 'Muḥarram 1246 '\n",
+ "Parse error on PGPID 32997 Jumāda II 602-603 (Islamic): Could not parse 'Jumāda II 602-603' as an Islamic date\n",
+ "Could not parse Jumāda II 602-603 as ISO date: invalid literal for int() with base 10: 'Jumāda II 602'\n",
+ "ignoring missing digits for now 73[.]\n",
+ "Parse error on PGPID 34019 408–410 (Islamic): Could not parse '408–410' as an Islamic date\n",
+ "Parse error on PGPID 34221 Before 9 Av 1412 (Seleucid): Could not parse 'Before 9 Av 1412' as a Hebrew date\n",
+ "Parse error on PGPID 34349 19 Ramaḍān 504/29 Ramaḍān 504 (Islamic): Could not parse '19 Ramaḍān 504/29 Ramaḍān 504' as an Islamic date\n",
+ "ignoring missing digits for now 146[.]\n",
+ "Parse error on PGPID 34598 4982–83 (Hebrew): Could not parse '4982–83' as a Hebrew date\n",
+ "Parse error on PGPID 34740 29 June 1922 (Hebrew): Could not parse '29 June 1922' as a Hebrew date\n",
+ "Parse error on PGPID 34880 Adar I and Adar II 5453 (Hebrew): Could not parse 'Adar I and Adar II 5453' as a Hebrew date\n",
+ "Parse error on PGPID 35014 Nisan–Sivan 5585 (Hebrew): Could not parse 'Nisan–Sivan 5585' as a Hebrew date\n",
+ "Parse error on PGPID 35018 5008-5010 (Hebrew): Could not parse '5008-5010' as a Hebrew date\n",
+ "Could not parse 5008-5010 as ISO date: bad month number 5010; must be 1-12\n",
+ "Parse error on PGPID 35020 4962/4964 (Hebrew): Could not parse '4962/4964' as a Hebrew date\n",
+ "Parse error on PGPID 35022 4790/4791 (Hebrew): Could not parse '4790/4791' as a Hebrew date\n",
+ "Parse error on PGPID 35024 4728–31 (Hebrew): Could not parse '4728–31' as a Hebrew date\n",
+ "ignoring missing digits for now 42[.]\n",
+ "Parse error on PGPID 35117 tenth Ṣafar 728 (Islamic): Could not parse 'tenth Ṣafar 728' as an Islamic date\n",
+ "Parse error on PGPID 35181 1390-10-23 (Seleucid): Could not parse '1390-10-23' as a Hebrew date\n",
+ "parsed 1390-10-23 with ISO8601 format and calendar Seleucid, result is 1390-10-23 (1079-01-06/1079-01-06)\n",
+ "Parse error on PGPID 35272 411/427 (Islamic): Could not parse '411/427' as an Islamic date\n",
+ "ignoring missing digits for now 41[.]\n",
+ "Parse error on PGPID 35464 Nisan/Iyyar 1529 (Seleucid): Could not parse 'Nisan/Iyyar 1529' as a Hebrew date\n",
+ "Parse error on PGPID 35476 Dhū l-Qiʿda 614 (Islamic): Could not parse 'Dhū l-Qiʿda 614' as an Islamic date\n",
+ "Could not parse Dhū l-Qiʿda 614 as ISO date: invalid literal for int() with base 10: 'Dhū l'\n",
+ "Parse error on PGPID 35744 Rabīʿ I 565 (Islamic): Could not parse 'Rabīʿ I 565' as an Islamic date\n",
+ "Parse error on PGPID 35834 550-1-19 (Islamic): Could not parse '550-1-19' as an Islamic date\n",
+ "parsed 550-1-19 with ISO8601 format and calendar Islamic, result is 0550-01-19 (1155-04-01/1155-04-01)\n",
+ "ignoring missing digits for now 12 Muḥarram 52[.]\n",
+ "Parse error on PGPID 36005 502/503 (Islamic): Could not parse '502/503' as an Islamic date\n",
+ "Parse error on PGPID 36210 Kislev/Tevet 5460 (Hebrew): Could not parse 'Kislev/Tevet 5460' as a Hebrew date\n",
+ "Parse error on PGPID 36234 660-05 (Islamic): Could not parse '660-05' as an Islamic date\n",
+ "parsed 660-05 with ISO8601 format and calendar Islamic, result is 0660-05 (1262-03-31/1262-04-29)\n",
+ "ignoring missing digits for now 54[.]\n",
+ "Parse error on PGPID 36771 545/547 (Islamic): Could not parse '545/547' as an Islamic date\n",
+ "Parse error on PGPID 37068 Dhū l-Qaʿda 579 / Muḥarram 580 (Islamic): Could not parse 'Dhū l-Qaʿda 579 / Muḥarram 580' as an Islamic date\n",
+ "Parse error on PGPID 37875 525–26 (Islamic): Could not parse '525–26' as an Islamic date\n",
+ "ignoring missing digits for now 14[...]\n",
+ "Parse error on PGPID 38380 Dhū al-Qada 508 (Islamic): Could not parse 'Dhū al-Qada 508' as an Islamic date\n",
+ "Could not parse Dhū al-Qada 508 as ISO date: invalid literal for int() with base 10: 'Dhū al'\n",
+ "Parse error on PGPID 38508 Dhū l-Qaʿda 947 – Shawwāl 947 (Islamic): Could not parse 'Dhū l-Qaʿda 947 – Shawwāl 947' as an Islamic date\n",
+ "Could not parse Dhū l-Qaʿda 947 – Shawwāl 947 as ISO date: invalid literal for int() with base 10: 'Dhū l'\n",
+ "Parse error on PGPID 39016 Rabiʿ II 448 (Islamic): Could not parse 'Rabiʿ II 448' as an Islamic date\n",
+ "Parse error on PGPID 40044 Thursday 6 Nissan 1409 (Seleucid): Could not parse 'Thursday 6 Nissan 1409' as a Hebrew date\n",
+ "Parse error on PGPID 40046 Monday 3 Nissan 1410 (Seleucid): Could not parse 'Monday 3 Nissan 1410' as a Hebrew date\n",
+ "Parse error on PGPID 40063 Sunday Elul 1 1409 (Seleucid): Could not parse 'Sunday Elul 1 1409' as a Hebrew date\n",
+ "Parse error on PGPID 40140 4 Rabīʿ II 1106 (Islamic): Could not parse '4 Rabīʿ II 1106' as an Islamic date\n",
+ "ignoring missing digits for now 49[.]\n",
+ "ignoring missing digits for now [4]82[.]\n",
+ "Parse error on PGPID 40646 19TH Rabīʿ II 991 (Islamic): Could not parse '19TH Rabīʿ II 991' as an Islamic date\n",
+ "Parse error on PGPID 40662 20 Rabīʿ al-Awwal 1070 (Islamic): Could not parse '20 Rabīʿ al-Awwal 1070' as an Islamic date\n",
+ "Could not parse 20 Rabīʿ al-Awwal 1070 as ISO date: invalid literal for int() with base 10: '20 Rabīʿ al'\n"
+ ]
+ }
+ ],
+ "source": [
+ "# parse hijri, anno mundi, and seleucid dates as undates\n",
+ "\n",
+ "import re\n",
+ "from lark.exceptions import UnexpectedEOF\n",
+ "\n",
+ "def parse_original_date(row):\n",
+ " # print(f\"PGPID {row.pgpid} {row.doc_date_original} ({row.doc_date_calendar})\")\n",
+ " undate_calendar = None\n",
+ " if row.doc_date_calendar == \"Anno Mundi\":\n",
+ " undate_calendar = \"Hebrew\"\n",
+ " elif row.doc_date_calendar == \"Hijrī\":\n",
+ " undate_calendar = \"Islamic\"\n",
+ " elif row.doc_date_calendar == \"Seleucid\":\n",
+ " # handle seleucid as hebrew with offset (adapt from pgp code)\n",
+ " undate_calendar = \"Seleucid\"\n",
+ "\n",
+ " \n",
+ " if undate_calendar:\n",
+ " value = row.doc_date_original\n",
+ "\n",
+    "        # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]\n",
+    "        # the parser doesn't support this notation (although Undate itself does), so substitute zeros for now\n",
+ " if '[.' in value:\n",
+ " print(f\"ignoring missing digits for now {value}\")\n",
+ " value = value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\") \n",
+ " \n",
+    "        # some dates have inferred numbers, e.g. Friday, [25] Nisan [4810] or 8 Elul (4)811\n",
+ " # for now, just strip out brackets before parsing \n",
+ " # (in future, could potentially infer uncertainty based on these)\n",
+ " value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')\n",
+ "\n",
+ " # also remove unsupported modifiers:\n",
+    "        # e.g. Late Tevet 4903, Last decade of Kislev 5004, first third of ...\n",
+    "        # some dates also include the word 'of' (e.g. day of month), which is removed too\n",
+ " modifiers = [\"Late \", \"(first|middle|last)( third|half|decade|tenth)? (of )?\", \"(Beginning|end) of \", \"last day\", \"First 10 days\", \" of\", \"spring\", \"decade \", \"night, \"]\n",
+ " for mod in modifiers:\n",
+ " value = re.sub(mod, \"\", value, flags=re.I)\n",
+ "\n",
+ " # there are a handful of misspelled wednesdays...\n",
+ " value = value.replace(\"Wedensday\", \"Wednesday\")\n",
+ " # and a Thrusday\n",
+ " value = value.replace(\"Thrusday\", \"Thursday\")\n",
+ "\n",
+    "        # three Hebrew calendar dates end with the text \"AM\", and at least one date ends with \"AH\"; strip the suffix\n",
+ " if value.endswith(\" AM\") or value.endswith(\" AH\"):\n",
+ " value = value[:-3]\n",
+ " if value.endswith(\".\"): # strip off trailing period\n",
+ " value = value[:-1]\n",
+ " # \n",
+ "\n",
+ " # about 62 have ordinals; strip them out\n",
+ " value = re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", value)\n",
+ " \n",
+ " try:\n",
+ " return Undate.parse(value, undate_calendar)\n",
+ " except (VisitError, ValueError, UnexpectedEOF) as err:\n",
+ " print(f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\")\n",
+ "\n",
+    "            # there are a handful of cases in PGP where calendars are mixed,\n",
+    "            # e.g. Hebrew month names used with the Hijri calendar; these are not handled yet\n",
+ "\n",
+ " # some dates are entered in ISO format for another calendar; can we parse and set calendar?\n",
+ " if \"-\" in value and not \"/\" in value: # exclude intervals for now\n",
+ " try:\n",
+ " parsed = Undate.parse(value, \"ISO8601\")\n",
+ " if parsed:\n",
+ " parsed = parsed.as_calendar(undate_calendar)\n",
+ " print(f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\")\n",
+ " return parsed\n",
+ " except ValueError as err:\n",
+ " print(f\"Could not parse {value} as ISO date: {err}\")\n",
+ "\n",
+ "docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "623eb160-ab6c-44ba-b3f4-6770c2c7bd86",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "original dates parsed: 3401\n",
+ "original dates unparsed: 167 (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: 95.32%\n"
+ ]
+ }
+ ],
+ "source": [
+    "# how many anno mundi / hijri / seleucid dates were parsed / not parsed?\n",
+ "\n",
+ "orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()\n",
+ "\n",
+ "orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] \n",
+ "\n",
+ "total_parsed = len(orig_dates_parsed)\n",
+ "total_unparsed = len(orig_dates_unparsed)\n",
+ "print(f\"\"\"original dates parsed: {total_parsed}\n",
+ "original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "42945787-6788-422d-9a04-f983ec6b31af",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ " 1259 | \n",
+ " 1570 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ " 1116-03-05 | \n",
+ " 1427-12-19 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1337 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ " 1188-12-07 | \n",
+ " 1500-09-15 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ " 1548-10 | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5 449 1570 Seleucid 1259 \n",
+ "16 463 19 Adar 1427 Seleucid 1116-03-05 \n",
+ "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n",
+ "43 502 Tevet 1548 Seleucid 1236-11-30/1236-12-28 \n",
+ "\n",
+ " undate undate_orig orig_date_precision \n",
+ "5 1259 1570 year \n",
+ "16 1116-03-05 1427-12-19 day \n",
+ "23 1025-08-28/1026-09-14 1337 year \n",
+ "41 1188-12-07 1500-09-15 day \n",
+ "43 1236-11-30/1236-12-28 1548-10 month "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what is the date granularity of the dates we were able to parse?\n",
+ "\n",
+ "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n",
+ "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_date_precision\n",
+ "day 1566\n",
+ "month 1013\n",
+ "year 822\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# this is skewed because of the kinds of dates we're not able to parse or modifiers we're omitting entirely\n",
+ "orig_dates_parsed.orig_date_precision.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "5d3a55b0-ed36-47ba-b022-848bb128b449",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5 | \n",
+ " 449 | \n",
+ " 1570 | \n",
+ " Seleucid | \n",
+ " 1259 | \n",
+ " 1259 | \n",
+ " 1570 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 463 | \n",
+ " 19 Adar 1427 | \n",
+ " Seleucid | \n",
+ " 1116-03-05 | \n",
+ " 1116-03-05 | \n",
+ " 1427-12-19 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 472 | \n",
+ " 1337 | \n",
+ " Seleucid | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1025-08-28/1026-09-14 | \n",
+ " 1337 | \n",
+ " year | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 499 | \n",
+ " Wednesday, 15 Kislev 1500 | \n",
+ " Seleucid | \n",
+ " 1188-12-07 | \n",
+ " 1188-12-07 | \n",
+ " 1500-09-15 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 502 | \n",
+ " Tevet 1548 | \n",
+ " Seleucid | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ " 1236-11-30/1236-12-28 | \n",
+ " 1548-10 | \n",
+ " month | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5 449 1570 Seleucid 1259 \n",
+ "16 463 19 Adar 1427 Seleucid 1116-03-05 \n",
+ "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n",
+ "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n",
+ "43 502 Tevet 1548 Seleucid 1236-11-30/1236-12-28 \n",
+ "\n",
+ " undate undate_orig orig_date_precision \n",
+ "5 1259 1570 year \n",
+ "16 1116-03-05 1427-12-19 day \n",
+ "23 1025-08-28/1026-09-14 1337 year \n",
+ "41 1188-12-07 1500-09-15 day \n",
+ "43 1236-11-30/1236-12-28 1548-10 month "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check on the seleucid date parsing\n",
+ "\n",
+ "orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "8907d1fc-b87f-4173-8759-74c07fa70dca",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " original: 1570 undate:1570 pgp standard 1259 earliest:1258-09-07 latest:1259-09-26\n",
+ " original: 19 Adar 1427 undate:1427-12-19 pgp standard 1116-03-05 earliest:1116-03-12 latest:1116-03-12\n",
+ " original: 1337 undate:1337 pgp standard 1025-08-28/1026-09-14 earliest:1025-09-03 latest:1026-09-20\n",
+ " original: Wednesday, 15 Kislev 1500 undate:1500-09-15 pgp standard 1188-12-07 earliest:1188-12-14 latest:1188-12-14\n",
+ " original: Tevet 1548 undate:1548-10 pgp standard 1236-11-30/1236-12-28 earliest:1236-12-07 latest:1237-01-04\n",
+ " original: Elul 1428 undate:1428-06 pgp standard 1117-08-01/1117-08-29 earliest:1117-08-08 latest:1117-09-05\n",
+ " original: First decade of Ḥeshvan 1442 undate:1442-08 pgp standard 1130 earliest:1130-10-13 latest:1130-11-10\n",
+ " original: Ḥeshvan 1453 undate:1453-08 pgp standard 1141 earliest:1141-10-11 latest:1141-11-08\n",
+ " original: Sunday, 21 Kislev 1355 undate:1355-09-21 pgp standard 1043-11-26 earliest:1043-12-02 latest:1043-12-02\n",
+ " original: Monday, 16 Tammuz 1540 undate:1540-04-16 pgp standard 1229-07-09 earliest:1229-07-16 latest:1229-07-16\n"
+ ]
+ }
+ ],
+ "source": [
+ "for row in orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'][:10].itertuples():\n",
+ " print(f\" original: {row.doc_date_original} undate:{row.undate_orig} pgp standard {row.doc_date_standard} earliest:{row.undate_orig.earliest} latest:{row.undate_orig.latest}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "48142841-e030-4009-af11-6cbc936fd7bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# check calendar agreement, how many were wrong?\n",
+ "\n",
+ "calendar_mapping = {\n",
+ " \"hebrew\": \"Anno Mundi\",\n",
+ " \"islamic\": \"Hijrī\",\n",
+ " \"seleucid\": \"Seleucid\"\n",
+ "}\n",
+ "\n",
+ "orig_dates_parsed['undate_calendar'] = orig_dates_parsed.undate_orig.apply(lambda x: calendar_mapping.get(x.calendar, x.calendar))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "0719fcbe-8a87-4fe9-a0b8-66a4b19f5d39",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " undate | \n",
+ " undate_calendar | \n",
+ " doc_date_calendar | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 35063 | \n",
+ " 1794-09-25/1795-09-13 | \n",
+ " Anno Mundi | \n",
+ " Anno Mundi | \n",
+ "
\n",
+ " \n",
+ " 35070 | \n",
+ " 1755-09-06/1756-09-24 | \n",
+ " Anno Mundi | \n",
+ " Anno Mundi | \n",
+ "
\n",
+ " \n",
+ " 35071 | \n",
+ " 1519-10-09 | \n",
+ " Hijrī | \n",
+ " Hijrī | \n",
+ "
\n",
+ " \n",
+ " 35072 | \n",
+ " 1563-04-05 | \n",
+ " Hijrī | \n",
+ " Hijrī | \n",
+ "
\n",
+ " \n",
+ " 35073 | \n",
+ " 1563-04-25 | \n",
+ " Hijrī | \n",
+ " Hijrī | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " undate undate_calendar doc_date_calendar\n",
+ "35063 1794-09-25/1795-09-13 Anno Mundi Anno Mundi\n",
+ "35070 1755-09-06/1756-09-24 Anno Mundi Anno Mundi\n",
+ "35071 1519-10-09 Hijrī Hijrī\n",
+ "35072 1563-04-05 Hijrī Hijrī\n",
+ "35073 1563-04-25 Hijrī Hijrī"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# which records appear to have mismatched original calendars? (i.e. can be parsed by the opposite parser)\n",
+ "# only 4! \n",
+ "# PGPIDs 3637, 5902, 6058, 9198\n",
+ "\n",
+ "# however, looking at the PGP records indicates sometimes the authors mixed hebrew and arabic months\n",
+ "# from description of PGPID 3637: [It is unusual but not unheard of to combine Hebrew months with the Hijrī calendar.] \n",
+ "\n",
+ "orig_dates_parsed[['undate', 'undate_calendar', 'doc_date_calendar']].tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "a104d772-6c2c-4711-91ec-8cf1f108ae23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# can we sort by parsed original dates? \n",
+ "# doesn't work currently because of overlapping dates / different granularity\n",
+ "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# set earliest/latest for graphing\n",
+ "\n",
+ "# IMPORTANT: we have to cast type to something pandas/altair supports\n",
+ "\n",
+ "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n",
+ "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n",
+ "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orig_date_earliest | \n",
+ " orig_date_latest | \n",
+ " orig_date_mid | \n",
+ " pgpid | \n",
+ " doc_date_calendar | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5 | \n",
+ " 1258-09-07 | \n",
+ " 1259-09-26 | \n",
+ " 1259-03-18 | \n",
+ " 449 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ " 1116-03-12 | \n",
+ " 463 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 1025-09-03 | \n",
+ " 1026-09-20 | \n",
+ " 1026-03-13 | \n",
+ " 472 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 41 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ " 1188-12-14 | \n",
+ " 499 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 43 | \n",
+ " 1236-12-07 | \n",
+ " 1237-01-04 | \n",
+ " 1236-12-21 | \n",
+ " 502 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " 1117-08-08 | \n",
+ " 1117-09-05 | \n",
+ " 1117-08-22 | \n",
+ " 506 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " 1130-10-13 | \n",
+ " 1130-11-10 | \n",
+ " 1130-10-27 | \n",
+ " 516 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ " 61 | \n",
+ " 1035-05-28 | \n",
+ " 1035-05-28 | \n",
+ " 1035-05-28 | \n",
+ " 524 | \n",
+ " Anno Mundi | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " 1034-08-25 | \n",
+ " 1034-09-22 | \n",
+ " 1034-09-08 | \n",
+ " 525 | \n",
+ " Hijrī | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " 1141-10-11 | \n",
+ " 1141-11-08 | \n",
+ " 1141-10-25 | \n",
+ " 537 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orig_date_earliest orig_date_latest orig_date_mid pgpid doc_date_calendar\n",
+ "5 1258-09-07 1259-09-26 1259-03-18 449 Seleucid\n",
+ "16 1116-03-12 1116-03-12 1116-03-12 463 Seleucid\n",
+ "23 1025-09-03 1026-09-20 1026-03-13 472 Seleucid\n",
+ "41 1188-12-14 1188-12-14 1188-12-14 499 Seleucid\n",
+ "43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n",
+ "47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n",
+ "55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n",
+ "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n",
+ "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī\n",
+ "73 1141-10-11 1141-11-08 1141-10-25 537 Seleucid"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "a8cc1025-0334-44b9-90e6-13ddb30fec31",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_date_earliest datetime64[s]\n",
+ "orig_date_latest datetime64[s]\n",
+ "orig_date_mid datetime64[s]\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid']].dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "b0299c8d-a113-4918-bd04-57c00b233d21",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " orig_date_earliest | \n",
+ " orig_date_latest | \n",
+ " orig_date_mid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " orig_date_precision | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [pgpid, orig_date_earliest, orig_date_latest, orig_date_mid, doc_date_original, doc_date_calendar, doc_date_standard, orig_date_precision]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import datetime\n",
+ "\n",
+ "# list any parsed dates that start after the beginning of 2100; are these data errors?\n",
+ "\n",
+ "orig_dates_parsed[orig_dates_parsed.orig_date_earliest > Undate(2100).earliest][['pgpid', 'orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'orig_date_precision']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import altair as alt\n",
+ "\n",
+ "# exclude dates after 2100\n",
+ "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'pgpid', 'doc_date_calendar']][orig_dates_parsed.orig_date_earliest < Undate(2100).earliest]\n",
+ "\n",
+ "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n",
+ " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n",
+ " x2='orig_date_latest:T',\n",
+ " y=alt.Y('count(pgpid)', title='Count of Documents')\n",
+ ").properties(width=1200, height=200)\n",
+ "\n",
+ "earliest_chart = bar_chart.mark_point(opacity=0.2, color=\"green\", interpolate=\"monotone\").encode(\n",
+ " x=alt.X('orig_date_earliest:T', title=\"Date (earliest)\"), # axis=alt.Axis(format=\"r\")),\n",
+ " y=alt.Y('count(pgpid)', title='Count of Documents')\n",
+ ").properties(width=1200, height=200)\n",
+ "\n",
+ "latest_chart = bar_chart.mark_point(opacity=0.2, color=\"blue\", interpolate=\"monotone\").encode(\n",
+ " x=alt.X('orig_date_latest:T', title=\"Date (latest)\"), # axis=alt.Axis(format=\"r\")),\n",
+ " y=alt.Y('count(pgpid)', title='Count of Documents')\n",
+ ").properties(width=1200, height=200)\n",
+ "\n",
+ "# (bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\")\n",
+ "(bar_chart & (latest_chart + earliest_chart)).interactive()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "b1fce94f-8f52-4b56-b88f-3575c3ebf2b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "# plot points for all the documents with dates, using jitter to scatter them vertically\n",
+ "\n",
+ "jitter_plot = alt.Chart(graphable_data).mark_circle(size=8, opacity=0.5).encode(\n",
+ " x=\"orig_date_earliest:T\", # maybe could eventually use jitter to plot between earliest/latest\n",
+ " y=alt.Y(\"jitter:Q\", title=\"\").axis(None),\n",
+ " color=alt.Color('doc_date_calendar:N', title=\"Calendar\") #.legend(None)\n",
+ ").transform_calculate(\n",
+ " # Generate Gaussian jitter with a Box-Muller transform\n",
+ " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n",
+ ").properties(width=1200, height=200)\n",
+ "\n",
+ "\n",
+ "jitter_plot \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 225,
+ "id": "f8439e45-e9b0-4eba-8b43-6a87cb8d3b9a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.VConcatChart(...)"
+ ]
+ },
+ "execution_count": 225,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(bar_chart & jitter_plot).interactive()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "951d92ea-4689-481c-8590-324b782a7a1c",
+ "metadata": {},
+ "source": [
+ "## compare weekdays"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "3122a874-bb17-429f-993f-4bf7a76c1a36",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 1835 | \n",
+ " 2550 | \n",
+ " Monday night, 5 Av 1443 | \n",
+ " Seleucid | \n",
+ " 1132 | \n",
+ " 1132 | \n",
+ " 1443-05-05 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 29309 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 29930 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 34016 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 34474 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ " 34475 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1835 2550 Monday night, 5 Av 1443 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29309 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29930 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34016 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34474 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34475 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate undate_orig orig_date_precision \n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1835 1132 1132 1443-05-05 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29309 1266/1267 1266/1267 1578-10-20 day \n",
+ "29930 1214/1215 1214/1215 1526-10-13 day \n",
+ "34016 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34474 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34475 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ "[104 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']]\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "993c4f4a-4364-42ad-8927-145458f0e538",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_date_precision\n",
+ "day 104\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates.orig_date_precision.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "3e4ea50c-b11c-433b-b6f9-691098b057d3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 1835 | \n",
+ " 2550 | \n",
+ " Monday night, 5 Av 1443 | \n",
+ " Seleucid | \n",
+ " 1132 | \n",
+ " 1132 | \n",
+ " 1443-05-05 | \n",
+ " day | \n",
+ " 1 | \n",
+ " Tuesday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 29309 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 29930 | \n",
+ " 35264 | \n",
+ " Wednesday 13 Ṭevet 1526 | \n",
+ " Seleucid | \n",
+ " 1214/1215 | \n",
+ " 1214/1215 | \n",
+ " 1526-10-13 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " 34016 | \n",
+ " 39564 | \n",
+ " Monday 16 Tevet 1339 | \n",
+ " Seleucid | \n",
+ " 1027-12-18 | \n",
+ " 1027-12-18 | \n",
+ " 1339-10-16 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 34474 | \n",
+ " 40035 | \n",
+ " Monday 1st Iyyar 1437 | \n",
+ " Seleucid | \n",
+ " 1126-04-26 | \n",
+ " 1126-04-26 | \n",
+ " 1437-02-01 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 34475 | \n",
+ " 40036 | \n",
+ " Friday 15 of Adar 1443 | \n",
+ " Seleucid | \n",
+ " 1132-03-04 | \n",
+ " 1132-03-04 | \n",
+ " 1443-12-15 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
104 rows × 10 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1835 2550 Monday night, 5 Av 1443 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "... ... ... ... \n",
+ "29309 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29930 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "34016 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34474 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34475 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "\n",
+ " doc_date_standard undate undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1835 1132 1132 1443-05-05 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "... ... ... ... ... \n",
+ "29309 1266/1267 1266/1267 1578-10-20 day \n",
+ "29930 1214/1215 1214/1215 1526-10-13 day \n",
+ "34016 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34474 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34475 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "\n",
+ " undate_weekday undate_weekday_name orig_weekday \n",
+ "851 3 Thursday Thursday \n",
+ "1835 1 Tuesday Tuesday \n",
+ "1929 0 Monday Monday \n",
+ "2013 2 Wednesday Wednesday \n",
+ "3257 3 Thursday Thursday \n",
+ "... ... ... ... \n",
+ "29309 0 Monday Monday \n",
+ "29930 2 Wednesday Wednesday \n",
+ "34016 0 Monday Monday \n",
+ "34474 0 Monday Monday \n",
+ "34475 4 Friday Friday \n",
+ "\n",
+ "[104 rows x 10 columns]"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n",
+ "\n",
+ "# get numeric weekday\n",
+ "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n",
+ "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n",
+ "# extract weekday from date label\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n",
+ "# correct misspellings\n",
+ "misspelled_days = {\n",
+ " \"Wedensday\": \"Wednesday\",\n",
+ " \"Thrusday\": \"Thursday\",\n",
+ "}\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n",
+ "\n",
+ "# shift night to next day, e.g. Wednesday night should be Thursday (Hebrew and Islamic calendar days begin at sunset)\n",
+ "# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n",
+ "def next_day(weekday):\n",
+ " return days[(days.index(weekday) +1) % 7]\n",
+ "\n",
+ "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n",
+ "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]\n",
+ "\n",
+ "weekday_dates"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 1835 | \n",
+ " 2550 | \n",
+ " Monday night, 5 Av 1443 | \n",
+ " Seleucid | \n",
+ " 1132 | \n",
+ " 1132 | \n",
+ " 1443-05-05 | \n",
+ " day | \n",
+ " 1 | \n",
+ " Tuesday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 5511 | \n",
+ " 7237 | \n",
+ " Tuesday night, 22 Kislev 1435 | \n",
+ " Seleucid | \n",
+ " 1123-12-12 | \n",
+ " 1123-12-12 | \n",
+ " 1435-09-22 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " 5857 | \n",
+ " 7642 | \n",
+ " Thursday night, 23 Tammuz 1538 | \n",
+ " Seleucid | \n",
+ " 1227-07-09 | \n",
+ " 1227-07-09 | \n",
+ " 1538-04-23 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ " 6419 | \n",
+ " 8332 | \n",
+ " Friday night, 20 Iyar 4957 | \n",
+ " Anno Mundi | \n",
+ " 1197-05 | \n",
+ " 1197-05 | \n",
+ " 4957-02-20 | \n",
+ " day | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Saturday | \n",
+ "
\n",
+ " \n",
+ " 29309 | \n",
+ " 34623 | \n",
+ " Sunday night, 20 Ṭevet 1578 | \n",
+ " Seleucid | \n",
+ " 1266/1267 | \n",
+ " 1266/1267 | \n",
+ " 1578-10-20 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1835 2550 Monday night, 5 Av 1443 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n",
+ "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n",
+ "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n",
+ "29309 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "\n",
+ " doc_date_standard undate undate_orig orig_date_precision \\\n",
+ "851 1270 1270 1581-03-28 day \n",
+ "1835 1132 1132 1443-05-05 day \n",
+ "1929 1133 1133 1444-09-25 day \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "5511 1123-12-12 1123-12-12 1435-09-22 day \n",
+ "5854 1127 1127 1438-10-29 day \n",
+ "5857 1227-07-09 1227-07-09 1538-04-23 day \n",
+ "6419 1197-05 1197-05 4957-02-20 day \n",
+ "29309 1266/1267 1266/1267 1578-10-20 day \n",
+ "\n",
+ " undate_weekday undate_weekday_name orig_weekday \n",
+ "851 3 Thursday Thursday \n",
+ "1835 1 Tuesday Tuesday \n",
+ "1929 0 Monday Monday \n",
+ "3257 3 Thursday Thursday \n",
+ "5511 2 Wednesday Wednesday \n",
+ "5854 4 Friday Tuesday \n",
+ "5857 4 Friday Friday \n",
+ "6419 5 Saturday Saturday \n",
+ "29309 0 Monday Monday "
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "44 matches, 60 mismatches (42.31%)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5271 | \n",
+ " 6947 | \n",
+ " Monday 3 Iyyar 1740 | \n",
+ " Seleucid | \n",
+ " 1429-04-07 | \n",
+ " 1429-04-07 | \n",
+ " 1740-02-03 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " 8649 | \n",
+ " 11227 | \n",
+ " Monday 24 Jumādā I 517 | \n",
+ " Hijrī | \n",
+ " 1123-07-20 | \n",
+ " 1123-07-20 | \n",
+ " 0517-05-24 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 16400 | \n",
+ " 19649 | \n",
+ " Thursday 26 Iyyar 5306 | \n",
+ " Anno Mundi | \n",
+ " 1546-04-28 | \n",
+ " 1546-04-28 | \n",
+ " 5306-02-26 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 17728 | \n",
+ " 21094 | \n",
+ " Saturday 20 Rajab 550 | \n",
+ " Hijrī | \n",
+ " 1155-09-19 | \n",
+ " 1155-09-19 | \n",
+ " 0550-07-20 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Saturday | \n",
+ "
\n",
+ " \n",
+ " 23105 | \n",
+ " 27479 | \n",
+ " Tuesday 11 Tammuz 5525 | \n",
+ " Anno Mundi | \n",
+ " 1765-06-30 | \n",
+ " 1765-06-30 | \n",
+ " 5525-04-11 | \n",
+ " day | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " 23110 | \n",
+ " 27484 | \n",
+ " Friday 20th Shevat 5405 | \n",
+ " Anno Mundi | \n",
+ " 1645 | \n",
+ " 1645 | \n",
+ " 5405-11-20 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Friday | \n",
+ "
\n",
+ " \n",
+ " 23111 | \n",
+ " 27485 | \n",
+ " Sunday 22 Adar 5590 | \n",
+ " Anno Mundi | \n",
+ " 1830-03-17 | \n",
+ " 1830-03-17 | \n",
+ " 5590-12-22 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " 23113 | \n",
+ " 27487 | \n",
+ " Thursday 15th Shevat 5450 | \n",
+ " Anno Mundi | \n",
+ " 1690 | \n",
+ " 1690 | \n",
+ " 5450-11-15 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 23115 | \n",
+ " 27489 | \n",
+ " Sunday 6 Nisan 5528 | \n",
+ " Anno Mundi | \n",
+ " 1768-03-24 | \n",
+ " 1768-03-24 | \n",
+ " 5528-01-06 | \n",
+ " day | \n",
+ " 3 | \n",
+ " Thursday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " 23116 | \n",
+ " 27490 | \n",
+ " Thursday 19th Elul 5428 | \n",
+ " Anno Mundi | \n",
+ " 1668 | \n",
+ " 1668 | \n",
+ " 5428-06-19 | \n",
+ " day | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 23117 | \n",
+ " 27491 | \n",
+ " Tuesday 1 Kislev 5507 | \n",
+ " Anno Mundi | \n",
+ " 1746-11-14 | \n",
+ " 1746-11-14 | \n",
+ " 5507-09-01 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ " 23122 | \n",
+ " 27496 | \n",
+ " Sunday 28 Elul 5511 | \n",
+ " Anno Mundi | \n",
+ " 1751-09-18 | \n",
+ " 1751-09-18 | \n",
+ " 5511-06-28 | \n",
+ " day | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " 23123 | \n",
+ " 27497 | \n",
+ " Sunday 17th Sivan 5423 | \n",
+ " Anno Mundi | \n",
+ " 1663 | \n",
+ " 1663 | \n",
+ " 5423-03-17 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " 23124 | \n",
+ " 27498 | \n",
+ " Sunday 25th Tevet 5409 | \n",
+ " Anno Mundi | \n",
+ " 1648 | \n",
+ " 1648 | \n",
+ " 5409-10-25 | \n",
+ " day | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " 23126 | \n",
+ " 27500 | \n",
+ " Thursday 4 Sivan 5516 | \n",
+ " Anno Mundi | \n",
+ " 1756-06-02 | \n",
+ " 1756-06-02 | \n",
+ " 5516-03-04 | \n",
+ " day | \n",
+ " 2 | \n",
+ " Wednesday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ " 23133 | \n",
+ " 27507 | \n",
+ " Sunday 25 Sivan 5556 | \n",
+ " Anno Mundi | \n",
+ " 1796-07-01 | \n",
+ " 1796-07-01 | \n",
+ " 5556-03-25 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Sunday | \n",
+ "
\n",
+ " \n",
+ " 23137 | \n",
+ " 27511 | \n",
+ " Wednesday 28th Tevet 5399 | \n",
+ " Anno Mundi | \n",
+ " 1640 | \n",
+ " 1640 | \n",
+ " 5399-10-28 | \n",
+ " day | \n",
+ " 1 | \n",
+ " Tuesday | \n",
+ " Wednesday | \n",
+ "
\n",
+ " \n",
+ " 23141 | \n",
+ " 27515 | \n",
+ " Monday 15th Iyyar 5414 | \n",
+ " Anno Mundi | \n",
+ " 1654 | \n",
+ " 1654 | \n",
+ " 5414-02-15 | \n",
+ " day | \n",
+ " 5 | \n",
+ " Saturday | \n",
+ " Monday | \n",
+ "
\n",
+ " \n",
+ " 23142 | \n",
+ " 27516 | \n",
+ " Thursday 24 Nisan 5481 | \n",
+ " Anno Mundi | \n",
+ " 1721-04-21 | \n",
+ " 1721-04-21 | \n",
+ " 5481-01-24 | \n",
+ " day | \n",
+ " 0 | \n",
+ " Monday | \n",
+ " Thursday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
+ "8649 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n",
+ "16400 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n",
+ "17728 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n",
+ "23105 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n",
+ "23110 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n",
+ "23111 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n",
+ "23113 27487 Thursday 15th Shevat 5450 Anno Mundi 1690 \n",
+ "23115 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n",
+ "23116 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n",
+ "23117 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n",
+ "23122 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n",
+ "23123 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n",
+ "23124 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n",
+ "23126 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n",
+ "23133 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n",
+ "23137 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n",
+ "23141 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n",
+ "23142 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n",
+ "\n",
+ " undate undate_orig orig_date_precision undate_weekday \\\n",
+ "5271 1429-04-07 1740-02-03 day 3 \n",
+ "5854 1127 1438-10-29 day 4 \n",
+ "8649 1123-07-20 0517-05-24 day 4 \n",
+ "16400 1546-04-28 5306-02-26 day 2 \n",
+ "17728 1155-09-19 0550-07-20 day 0 \n",
+ "23105 1765-06-30 5525-04-11 day 6 \n",
+ "23110 1645 5405-11-20 day 3 \n",
+ "23111 1830-03-17 5590-12-22 day 2 \n",
+ "23113 1690 5450-11-15 day 2 \n",
+ "23115 1768-03-24 5528-01-06 day 3 \n",
+ "23116 1668 5428-06-19 day 6 \n",
+ "23117 1746-11-14 5507-09-01 day 0 \n",
+ "23122 1751-09-18 5511-06-28 day 5 \n",
+ "23123 1663 5423-03-17 day 4 \n",
+ "23124 1648 5409-10-25 day 5 \n",
+ "23126 1756-06-02 5516-03-04 day 2 \n",
+ "23133 1796-07-01 5556-03-25 day 4 \n",
+ "23137 1640 5399-10-28 day 1 \n",
+ "23141 1654 5414-02-15 day 5 \n",
+ "23142 1721-04-21 5481-01-24 day 0 \n",
+ "\n",
+ " undate_weekday_name orig_weekday \n",
+ "5271 Thursday Monday \n",
+ "5854 Friday Tuesday \n",
+ "8649 Friday Monday \n",
+ "16400 Wednesday Thursday \n",
+ "17728 Monday Saturday \n",
+ "23105 Sunday Tuesday \n",
+ "23110 Thursday Friday \n",
+ "23111 Wednesday Sunday \n",
+ "23113 Wednesday Thursday \n",
+ "23115 Thursday Sunday \n",
+ "23116 Sunday Thursday \n",
+ "23117 Monday Tuesday \n",
+ "23122 Saturday Sunday \n",
+ "23123 Friday Sunday \n",
+ "23124 Saturday Sunday \n",
+ "23126 Wednesday Thursday \n",
+ "23133 Friday Sunday \n",
+ "23137 Tuesday Wednesday \n",
+ "23141 Saturday Monday \n",
+ "23142 Monday Thursday "
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many match?\n",
+ "matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n",
+ "\n",
+ "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n",
+ "\n",
+ "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n",
+ "mismatches.head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "d6476907-1628-4d68-ab1f-43c95e123707",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Anno Mundi 55\n",
+ "Seleucid 3\n",
+ "Hijrī 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mismatches.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "18b71d18-5d5b-4f92-8801-499bcf412efe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "orig_weekday\n",
+ "Wednesday 17\n",
+ "Sunday 12\n",
+ "Monday 10\n",
+ "Thursday 9\n",
+ "Tuesday 7\n",
+ "Friday 4\n",
+ "Saturday 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mismatches.orig_weekday.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1 mismatches that include text 'night'\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " orig_date_precision | \n",
+ " undate_weekday | \n",
+ " undate_weekday_name | \n",
+ " orig_weekday | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 5854 | \n",
+ " 7637 | \n",
+ " Monday night, 29 Ṭevet 1438 | \n",
+ " Seleucid | \n",
+ " 1127 | \n",
+ " 1127 | \n",
+ " 1438-10-29 | \n",
+ " day | \n",
+ " 4 | \n",
+ " Friday | \n",
+ " Tuesday | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
+ "\n",
+ " undate undate_orig orig_date_precision undate_weekday \\\n",
+ "5854 1127 1438-10-29 day 4 \n",
+ "\n",
+ " undate_weekday_name orig_weekday \n",
+ "5854 Friday Tuesday "
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many mismatches are due to night?\n",
+ "night_mismatches = mismatches[mismatches.doc_date_original.str.contains(\" night\")]\n",
+ "print(f\"{len(night_mismatches)} mismatches that include text 'night'\")\n",
+ "night_mismatches"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "ece780b8-2eb2-4cbc-9195-27def665f7fa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# plot frequency by day, just for fun\n",
+ "\n",
+ "# get numeric weekday\n",
+ "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n",
+ "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "\n",
+ "# restrict to dates with day precision; the rest are just using earliest day\n",
+ "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n",
+ "\n",
+ "\n",
+ "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Color('count(pgpid)', title='# of documents')\n",
+ ").properties(title='document frequency by weekday')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "6b2f24de-18ce-4f40-b300-e8cc334a338c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "undate_weekday_name\n",
+ "Monday 300\n",
+ "Thursday 280\n",
+ "Tuesday 233\n",
+ "Sunday 223\n",
+ "Wednesday 223\n",
+ "Friday 211\n",
+ "Saturday 96\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_days.undate_weekday_name.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pgpid | \n",
+ " doc_date_original | \n",
+ " doc_date_calendar | \n",
+ " doc_date_standard | \n",
+ " undate | \n",
+ " undate_orig | \n",
+ " century | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 851 | \n",
+ " 1377 | \n",
+ " Wednesday night, 28 Sivan 1581 | \n",
+ " Seleucid | \n",
+ " 1270 | \n",
+ " 1270 | \n",
+ " 1581-03-28 | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ " 1835 | \n",
+ " 2550 | \n",
+ " Monday night, 5 Av 1443 | \n",
+ " Seleucid | \n",
+ " 1132 | \n",
+ " 1132 | \n",
+ " 1443-05-05 | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ " 1929 | \n",
+ " 2649 | \n",
+ " Sunday night, 25 Kislev 1444 | \n",
+ " Seleucid | \n",
+ " 1133 | \n",
+ " 1133 | \n",
+ " 1444-09-25 | \n",
+ " 1100s | \n",
+ "
\n",
+ " \n",
+ " 2013 | \n",
+ " 2739 | \n",
+ " Wednesday 29th Elul 1354 | \n",
+ " Seleucid | \n",
+ " 1043-09-07 | \n",
+ " 1043-09-07 | \n",
+ " 1354-06-29 | \n",
+ " 1000s | \n",
+ "
\n",
+ " \n",
+ " 3257 | \n",
+ " 4026 | \n",
+ " Wednesday night, 29 Tishrei 1541 | \n",
+ " Seleucid | \n",
+ " 1229-09-18 | \n",
+ " 1229-09-18 | \n",
+ " 1541-07-29 | \n",
+ " 1200s | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pgpid doc_date_original doc_date_calendar \\\n",
+ "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1835 2550 Monday night, 5 Av 1443 Seleucid \n",
+ "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "\n",
+ " doc_date_standard undate undate_orig century \n",
+ "851 1270 1270 1581-03-28 1200s \n",
+ "1835 1132 1132 1443-05-05 1100s \n",
+ "1929 1133 1133 1444-09-25 1100s \n",
+ "2013 1043-09-07 1043-09-07 1354-06-29 1000s \n",
+ "3257 1229-09-18 1229-09-18 1541-07-29 1200s "
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get rough century (gregorian calendar)\n",
+ "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: f\"{(\"%04d\" % x.earliest.year)[:2]}00s\")\n",
+ "\n",
+ "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'century']].head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "2c07d56f-552a-4d2c-9c18-0b78f056ccf6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "century\n",
+ "1700s 48\n",
+ "1600s 19\n",
+ "1100s 11\n",
+ "1800s 9\n",
+ "1200s 6\n",
+ "1000s 5\n",
+ "1500s 4\n",
+ "0900s 1\n",
+ "1400s 1\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates.century.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "eb99871e-d9a5-4211-9bd2-5a9acfe8face",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.Chart(...)"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Y('century'),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").properties(title='document frequency by weekday and century')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "08a58fcf-2b08-441b-9dc8-385bafeb88e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# what about heat map by month?\n",
+ "\n",
+ "\n",
+ "# get numeric month\n",
+ "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n",
+ "# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "\n",
+ "has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n",
+ "#orig_dates_months = [\n",
+ "\n",
+ "\n",
+ "alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_month', title='month'),\n",
+ " alt.Color('count(pgpid)', title='# of documents')\n",
+ ").facet(\n",
+ " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='Document frequency by month and calendar')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "a7a16c53-6f01-4457-9458-4fcf80a35c51",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Seleucid 1183\n",
+ "Anno Mundi 888\n",
+ "Hijrī 508\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "has_month.doc_date_calendar.value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "65bce74e-67b7-48df-9f7f-a6f264af4f11",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(1566, 39)"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "orig_dates_days[orig_dates_days.undate_weekday_name.notna()].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "ac940883-e00e-4dde-8339-95a1b733f6f3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_89288/2470126649.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# weekday frequency by month?\n",
+ "\n",
+ "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n",
+ "\n",
+ "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " alt.Y('undate_month', title=\"month\"),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").facet(\n",
+ " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='Document frequency by weekday and month (1,557 documents)')\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "35f1ff65-f726-4817-8312-a08198956343",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ ""
+ ],
+ "text/plain": [
+ "alt.FacetChart(...)"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "# orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n",
+ "\n",
+ "# alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n",
+ "# alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ "# alt.Y('undate_month', title=\"month\"),\n",
+ "# alt.Color('count(pgpid)')\n",
+ "# ).facet(\n",
+ "# column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ "# ).properties(title='document frequency by weekday and month')\n",
+ "\n",
+ "\n",
+ "\n",
+ "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
+ " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
+ " # alt.Y('doc_date_calendar'),\n",
+ " alt.Color('count(pgpid)')\n",
+ ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
+ ").properties(title='document frequency by weekday')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "65897b9d-2399-434a-9a6c-e08f58510848",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "doc_date_calendar\n",
+ "Anno Mundi 82\n",
+ "Seleucid 20\n",
+ "Hijrī 2\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "weekday_dates.doc_date_calendar.value_counts()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py
index a43a270..5836b2f 100644
--- a/src/undate/converters/calendars/__init__.py
+++ b/src/undate/converters/calendars/__init__.py
@@ -1,5 +1,11 @@
from undate.converters.calendars.gregorian import GregorianDateConverter
from undate.converters.calendars.hebrew import HebrewDateConverter
from undate.converters.calendars.islamic import IslamicDateConverter
+from undate.converters.calendars.seleucid import SeleucidDateConverter
-__all__ = ["GregorianDateConverter", "HebrewDateConverter", "IslamicDateConverter"]
+__all__ = [
+ "GregorianDateConverter",
+ "HebrewDateConverter",
+ "IslamicDateConverter",
+ "SeleucidDateConverter",
+]
diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark
index b55ec3f..6f4244c 100644
--- a/src/undate/converters/calendars/hebrew/hebrew.lark
+++ b/src/undate/converters/calendars/hebrew/hebrew.lark
@@ -3,7 +3,7 @@
// only support day month year format for now
// parser requires numeric day and year to be distinguished based on order
-hebrew_date: day month year | month year | year
+hebrew_date: weekday? day month comma? year | month year | year
// TODO: handle date ranges?
@@ -27,10 +27,14 @@ month: month_1
| month_10
| month_11
| month_12
- | month_13
+ | month_13
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/
+comma: ","
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma?
+
+
// months, in order; from convertdate list
// with variants from Princeton Geniza Project
// support matching with and without accents
@@ -43,11 +47,13 @@ month_5: "Av"
month_6: "Elul"
// Tishrei or Tishri
month_7: /Tishre?i/
-month_8: "Heshvan"
+// Heshvan, Ḥeshvan, Marḥeshvan, Marheshvan (with or without accents)
+month_8: /(Mar)?[ḤHḥh]eshvan/
month_9: "Kislev"
// Tevet or Teveth
month_10: /[ṬT]eveth?/
-month_11: "Shevat"
+// Shevat or Shevaṭ
+month_11: /Sheva[tṭ]/
// Adar I or Adar
month_12: /Adar( I)?/
// Adar II or Adar Bet
diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py
index 48e8b20..8880434 100644
--- a/src/undate/converters/calendars/hebrew/transformer.py
+++ b/src/undate/converters/calendars/hebrew/transformer.py
@@ -13,6 +13,8 @@ class HebrewDateTransformer(Transformer):
"""Transform a Hebrew date parse tree and return an Undate or
UndateInterval."""
+ calendar = Calendar.HEBREW
+
def hebrew_date(self, items):
parts = {}
for child in items:
@@ -22,9 +24,9 @@ def hebrew_date(self, items):
value = int(child.children[0])
parts[str(child.data)] = value
- # initialize and return an undate with islamic year, month, day and
- # islamic calendar
- return HebrewUndate(**parts)
+ # initialize and return an undate with year, month, day and
+ # configured calendar (hebrew by default)
+ return Undate(**parts, calendar=self.calendar)
# year translation is not needed since we want a tree with name year
# this is equivalent to a no-op
diff --git a/src/undate/converters/calendars/islamic/islamic.lark b/src/undate/converters/calendars/islamic/islamic.lark
index 3ad59a5..1e4940b 100644
--- a/src/undate/converters/calendars/islamic/islamic.lark
+++ b/src/undate/converters/calendars/islamic/islamic.lark
@@ -3,7 +3,7 @@
// only support day month year format for now
// parser requires numeric day and year to be distinguished based on order
-islamic_date: day month year | month year | year
+islamic_date: weekday? day month year | month year | year
// TODO: handle date ranges?
@@ -13,6 +13,7 @@ islamic_date: day month year | month year | year
year: /\d+/
+
// months
month: month_1
| month_2
@@ -29,6 +30,10 @@ month: month_1
// months have 29 or 30 days; we do not expect leading zeroes
day: /[1-9]/ | /[12][0-9]/ | /30/
+
+comma: ","
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma?
+
// months, in order; from convertdate list
// with variants from Princeton Geniza Project
// support matching with and without accents
@@ -42,7 +47,7 @@ month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/
// Jumādā al-ʾAwwal or Jumādā I
month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/
// Jumādā ath-Thāniya or Jumādā II
-month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/
+month_6: /Jum[āa][dḍ][āa] (ath-Th[āa]niyah|II)/
month_7: "Rajab"
// Shaʿbān
month_8: /Sha[ʿ']b[āa]n/
diff --git a/src/undate/converters/calendars/seleucid.py b/src/undate/converters/calendars/seleucid.py
new file mode 100644
index 0000000..bddf867
--- /dev/null
+++ b/src/undate/converters/calendars/seleucid.py
@@ -0,0 +1,24 @@
+from undate.converters.calendars.hebrew import HebrewDateConverter
+from undate.undate import Calendar
+
+
+class SeleucidDateConverter(HebrewDateConverter):
+    """Seleucid calendar converter; Hebrew calendar logic with a year offset."""
+
+    #: offset for Seleucid calendar: Seleucid year + 3449 = Anno Mundi year
+    SELEUCID_OFFSET = 3449
+    #: converter name: Seleucid
+    name: str = "Seleucid"
+    calendar_name: str = "Seleucid"
+
+    def __init__(self):
+        super().__init__()
+        # use Seleucid calendar on parsed undates so Seleucid to_gregorian is used
+        self.transformer.calendar = Calendar.SELEUCID
+
+    def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
+        """Convert a Seleucid date, specified by year, month, and day,
+        to the Gregorian equivalent date. Uses hebrew calendar conversion
+        logic with :attr:`SELEUCID_OFFSET`. Returns a tuple of year, month, day.
+        """
+        return super().to_gregorian(year + self.SELEUCID_OFFSET, month, day)
diff --git a/src/undate/date.py b/src/undate/date.py
index 27f6efa..81cd035 100644
--- a/src/undate/date.py
+++ b/src/undate/date.py
@@ -104,6 +104,27 @@ def day(self) -> Optional[int]:
return int(str(self.astype("datetime64[D]")).split("-")[-1])
return None
+    @property
+    def weekday(self) -> Optional[int]:
+        """Equivalent to :meth:`datetime.date.weekday`; returns day of week as an
+        integer where Monday is 0 and Sunday is 6. Only supported for dates
+        with date unit in days; returns None for any other date unit.
+        """
+        # only return a weekday if date unit is in days
+        if self.dtype == "datetime64[D]":
+            # calculate based on difference between current day and week start
+            # numpy datetime weeks start on thursdays - presumably since
+            # unix epoch day zero was a thursday...
+
+            # implementation inspired in part by https://stackoverflow.com/a/54264187
+
+            thursday_week = self.astype("datetime64[W]")
+            days_from_thursday = (self - thursday_week).astype(int)
+            # if monday is 0, thursday is 3; cast numpy integer to a native int
+            return int((days_from_thursday + 3) % 7)
+
+        return None
+
def __sub__(self, other):
# modify to conditionally return a timedelta object instead of a
# Date object with dtype timedelta64[D] (default behavior)
diff --git a/src/undate/undate.py b/src/undate/undate.py
index be4454a..bc627ab 100644
--- a/src/undate/undate.py
+++ b/src/undate/undate.py
@@ -29,6 +29,7 @@ class Calendar(StrEnum):
GREGORIAN = auto()
HEBREW = auto()
ISLAMIC = auto()
+ SELEUCID = auto()
@staticmethod
def get_converter(calendar):
@@ -96,7 +97,6 @@ def __init__(
if calendar is not None:
self.set_calendar(calendar)
self.calendar_converter = Calendar.get_converter(self.calendar)
-
self.calculate_earliest_latest(year, month, day)
if converter is None:
@@ -192,6 +192,9 @@ def calculate_earliest_latest(self, year, month, day):
)
def set_calendar(self, calendar: Union[str, Calendar]):
+ """Find calendar by name if passed as string and set on the object.
+ Only intended for use at initialization time; use :meth:`as_calendar`
+ to change calendar."""
if calendar is not None:
# if not passed as a Calendar instance, do a lookup
if not isinstance(calendar, Calendar):
@@ -202,6 +205,19 @@ def set_calendar(self, calendar: Union[str, Calendar]):
raise ValueError(f"Calendar `{calendar}` is not supported") from err
self.calendar = calendar
+    def as_calendar(self, calendar: Union[str, Calendar]):
+        """Reinterpret this date in a different calendar.
+
+        Builds a new :class:`Undate` from the year, month, day, and label
+        used to initialize the current object, with the requested calendar.
+        This does NOT do calendar conversion: the same numeric year, month,
+        and day values are simply read according to the new calendar.
+        """
+        # reuse the values this object was originally created with
+        initial = self.initial_values
+        fields = {part: initial.get(part) for part in ("year", "month", "day")}
+        return Undate(**fields, label=self.label, calendar=calendar)
+
def __str__(self) -> str:
# if any portion of the date is partially known, construct
# pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits
@@ -319,8 +335,12 @@ def __lt__(self, other: object) -> bool:
# (e.g., single date within the same year)
# comparison for those cases is not currently supported
elif other in self or self in other:
+ # sort by precision, most precise first
+ by_precision = sorted(
+ [self, other], key=lambda x: x.precision, reverse=True
+ )
raise NotImplementedError(
- "Can't compare when one date falls within the other"
+ f"Can't compare when one date ({by_precision[0]}) falls within the other ({by_precision[1]})"
)
# NOTE: unsupported comparisons are supposed to return NotImplemented
# However, doing that in this case results in a confusing TypeError!
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
index 6e4a5e6..7dcca83 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
@@ -26,6 +26,12 @@ def test_hebrew_undate():
("5362", HebrewUndate(5362), DatePrecision.YEAR),
# add when we support parsing ranges:
# Adar I and Adar II 5453 : (1693 CE)
+ # support weekdays included in text
+ ("Thursday, 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
+ # with or without comma
+ ("Thursday 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
+ # NOTE: current parsing ignores whitespace entirely, so this also parses; is that desired?
+ ("Thursday12Sivan4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY),
]
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
index 951a9f8..04ff53b 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
@@ -28,6 +28,7 @@ def test_islamic_undate():
# examples from ISMI data (reformatted to day month year)
# Rabi 1 = month 3
("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY),
+ ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH),
("884", IslamicUndate(884), DatePrecision.YEAR),
# Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)),
# add when we support parsing ranges:
diff --git a/tests/test_date.py b/tests/test_date.py
index 5ff017d..8b13472 100644
--- a/tests/test_date.py
+++ b/tests/test_date.py
@@ -1,3 +1,5 @@
+import datetime
+
import numpy as np
from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta
@@ -51,6 +53,26 @@ def test_properties_day(self):
assert Date(2010, 5).day is None
assert Date(2021, 6, 15).day == 15
+ def test_weekday(self):
+ # Thursday; numbering is checked against datetime.date.weekday() (Monday == 0)
+ assert Date(2025, 1, 2).weekday == 3
+ assert Date(2025, 1, 2).weekday == datetime.date(2025, 1, 2).weekday()
+ # Friday
+ assert Date(2025, 1, 3).weekday == 4
+ assert Date(2025, 1, 3).weekday == datetime.date(2025, 1, 3).weekday()
+ # Saturday
+ assert Date(2025, 1, 4).weekday == 5
+ assert Date(2025, 1, 4).weekday == datetime.date(2025, 1, 4).weekday()
+ # Sunday (highest value, 6)
+ assert Date(2025, 1, 5).weekday == 6
+ assert Date(2025, 1, 5).weekday == datetime.date(2025, 1, 5).weekday()
+ # Monday (numbering wraps back to 0)
+ assert Date(2025, 1, 6).weekday == 0
+ assert Date(2025, 1, 6).weekday == datetime.date(2025, 1, 6).weekday()
+ # Tuesday
+ assert Date(2025, 1, 7).weekday == 1
+ assert Date(2025, 1, 7).weekday == datetime.date(2025, 1, 7).weekday()
+
def test_substract(self):
# date - date = timedelta
date_difference = Date(2024, 1, 2) - Date(2024, 1, 1)
diff --git a/tests/test_undate.py b/tests/test_undate.py
index 18e03b0..d4b3794 100644
--- a/tests/test_undate.py
+++ b/tests/test_undate.py
@@ -298,11 +298,17 @@ def test_lt_notimplemented(self):
# how to compare mixed precision where dates overlap?
# if the second date falls *within* earliest/latest,
# then it is not clearly less; not implemented?
- with pytest.raises(NotImplementedError, match="date falls within the other"):
+ with pytest.raises(
+ NotImplementedError,
+ match="one date \\(2022-05\\) falls within the other \\(2022\\)",
+ ):
assert Undate(2022) < Undate(2022, 5)
# same if we attempt to compare in the other direction
- with pytest.raises(NotImplementedError, match="date falls within the other"):
+ with pytest.raises(
+ NotImplementedError,
+ match="one date \\(2022-05\\) falls within the other \\(2022\\)",
+ ):
assert Undate(2022, 5) < Undate(2022)
testdata_contains = [