diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb new file mode 100644 index 0000000..ff821e8 --- /dev/null +++ b/examples/pgp_dates.ipynb @@ -0,0 +1,4650 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2d231f1e-3944-4579-b868-504f7fb2d543", + "metadata": {}, + "source": [ + "# Princeton Geniza Project\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "\n", + "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n", + "\n", + "documents = pd.read_csv(pgp_documents_csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Total documents: 35,091\n", + "Documents with dates: 4,380\n", + " date on document: 4,064\n", + " inferred dating: 321\n" + ] + } + ], + "source": [ + "# limit to documents with dates\n", + "docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]\n", + "docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()\n", + "docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]\n", + "\n", + "print(f\"\"\"\n", + "Total documents: {len(documents):,}\n", + "Documents with dates: {len(docs_with_dates):,}\n", + " date on document: {len(docs_with_docdate):,}\n", + " inferred dating: {len(docs_with_inferreddate):,}\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "94d6340b-10d0-461b-b745-378ffa1ffcec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
54491570Seleucid1259
1646319 Adar 1427Seleucid1116-03-05
234721337Seleucid1025-08-28/1026-09-14
36491NaNNaN1131
41499Wednesday, 15 Kislev 1500Seleucid1188-12-07
43502Tevet 1548Seleucid1236-11-30/1236-12-28
47506Elul 1428Seleucid1117-08-01/1117-08-29
55516First decade of Ḥeshvan 1442Seleucid1130
61524Thursday, 12 Sivan 4795Anno Mundi1035-05-22
62525Shawwāl 425Hijrī1034-08-29/1034-09-07
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "5 449 1570 Seleucid \n", + "16 463 19 Adar 1427 Seleucid \n", + "23 472 1337 Seleucid \n", + "36 491 NaN NaN \n", + "41 499 Wednesday, 15 Kislev 1500 Seleucid \n", + "43 502 Tevet 1548 Seleucid \n", + "47 506 Elul 1428 Seleucid \n", + "55 516 First decade of Ḥeshvan 1442 Seleucid \n", + "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n", + "62 525 Shawwāl 425 Hijrī \n", + "\n", + " doc_date_standard \n", + "5 1259 \n", + "16 1116-03-05 \n", + "23 1025-08-28/1026-09-14 \n", + "36 1131 \n", + "41 1188-12-07 \n", + "43 1236-11-30/1236-12-28 \n", + "47 1117-08-01/1117-08-29 \n", + "55 1130 \n", + "61 1035-05-22 \n", + "62 1034-08-29/1034-09-07 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parse error on 1217-02-20/1217-02-29: Error trying to process rule \"date\":\n", + "\n", + "Day out of range in datetime string \"1217-02-29\"\n", + "Parse error on 1747-02-29: Error trying to process rule \"date\":\n", + "\n", + "Day out of range in datetime string \"1747-02-29\"\n" + ] + } + ], + "source": [ + "from lark.visitors import VisitError\n", + "\n", + "# first, how far can we get with the standard dates? 
can we parse as edtf and sort, render?\n", + "from undate import Undate \n", + "\n", + "def parse_standard_date(value):\n", + " try:\n", + " return Undate.parse(value, \"EDTF\")\n", + " except VisitError as err:\n", + " print(f\"Parse error on {value}: {err}\")\n", + " \n", + "\n", + "# ignore gregorian/julian thing for now\n", + "# from pgp code:\n", + "# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n", + "# cut off between gregorian/julian dates, in julian days\n", + "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n", + "\n", + "docs_with_docdate['undate'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f49e82a4-b05b-4395-998f-0c9e75729e9f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardlast_modified
31903957middle decade of Adar 1528Seleucid1217-02-20/1217-02-292023-02-09 07:22:14.481118+00:00
3444540006NaNNaN1747-02-292024-08-07 18:24:19.425288+00:00
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "3190 3957 middle decade of Adar 1528 Seleucid \n", + "34445 40006 NaN NaN \n", + "\n", + " doc_date_standard last_modified \n", + "3190 1217-02-20/1217-02-29 2023-02-09 07:22:14.481118+00:00 \n", + "34445 1747-02-29 2024-08-07 18:24:19.425288+00:00 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what are the records with standardized dates that couldn't be parsed?\n", + "\n", + "# this is probably a data error in the original\n", + "\n", + "docs_with_docdate[docs_with_docdate.undate.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2d502575-a2b4-4fce-9f59-6932275dfac2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Seleucid 1581\n", + "Anno Mundi 1128\n", + "Hijrī 859\n", + "Kharājī 8\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs_with_docdate.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "04e4ffb2-13e7-49cc-913b-2104b61aef16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
61524Thursday, 12 Sivan 4795Anno Mundi1035-05-22
9056110 Nisan 4716Anno Mundi0956-03-24
111582Thursday, 6 Adar 4996Anno Mundi1236-02-14
119591Sunday, 29 Tammuz 4898Anno Mundi1138-07-10
1316034805/4806Anno Mundi1044-08-27/1045-09-13
...............
34831404015408Anno Mundi1647-09-30/1648-09-16
34994405665594Anno Mundi1833-09-14/1834-10-03
350524062421 Nisan 5376Anno Mundi1616-04-08
35063406355555Anno Mundi1794-09-25/1795-09-13
35070406425516Anno Mundi1755-09-06/1756-09-24
\n", + "

1128 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard\n", + "61 524 Thursday, 12 Sivan 4795 Anno Mundi 1035-05-22\n", + "90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n", + "111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n", + "119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n", + "131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n", + "... ... ... ... ...\n", + "34831 40401 5408 Anno Mundi 1647-09-30/1648-09-16\n", + "34994 40566 5594 Anno Mundi 1833-09-14/1834-10-03\n", + "35052 40624 21 Nisan 5376 Anno Mundi 1616-04-08\n", + "35063 40635 5555 Anno Mundi 1794-09-25/1795-09-13\n", + "35070 40642 5516 Anno Mundi 1755-09-06/1756-09-24\n", + "\n", + "[1128 rows x 4 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# example hebrew dates\n", + "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_89288/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", + " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
7021223Wednesday, 9 Tammuz 4912 AMAnno Mundi1152-06-13
1670219975Sunday, 10 Kislev 5583 AMAnno Mundi1822-11-24
2542130550Tammuz 5537 AMAnno Mundi1777-07-06/1777-08-03
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n", + "16702 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n", + "25421 30550 Tammuz 5537 AM Anno Mundi \n", + "\n", + " doc_date_standard \n", + "702 1152-06-13 \n", + "16702 1822-11-24 \n", + "25421 1777-07-06/1777-08-03 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many end with AM ?\n", + "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n", + "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cd1a751a-5299-418f-a3f8-050ab0384354", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
15562163first third of Tammuz 500[.]Anno Mundi1244/1249
15672175End of Sivan 152[.]Seleucid1209/1218
1753246013[..]Seleucid988/1088
201827451[.] Kislev 48[..]Anno Mundi1039-11-30/1138-11-24
3044380513[..]Seleucid988/1087
...............
305953595512 Muḥarram 52[.]Hijrī1126/1134
312323673854[.]Hijrī1145/1154
325543807714[...]Seleucid1088-09-19/1188-09-23
346604022649[.]Hijrī1096-12-19/1106-09-01
3476840335[4]82[.]Anno Mundi1059-09-11/1069-09-18
\n", + "

67 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "1556 2163 first third of Tammuz 500[.] Anno Mundi \n", + "1567 2175 End of Sivan 152[.] Seleucid \n", + "1753 2460 13[..] Seleucid \n", + "2018 2745 1[.] Kislev 48[..] Anno Mundi \n", + "3044 3805 13[..] Seleucid \n", + "... ... ... ... \n", + "30595 35955 12 Muḥarram 52[.] Hijrī \n", + "31232 36738 54[.] Hijrī \n", + "32554 38077 14[...] Seleucid \n", + "34660 40226 49[.] Hijrī \n", + "34768 40335 [4]82[.] Anno Mundi \n", + "\n", + " doc_date_standard \n", + "1556 1244/1249 \n", + "1567 1209/1218 \n", + "1753 988/1088 \n", + "2018 1039-11-30/1138-11-24 \n", + "3044 988/1087 \n", + "... ... \n", + "30595 1126/1134 \n", + "31232 1145/1154 \n", + "32554 1088-09-19/1188-09-23 \n", + "34660 1096-12-19/1106-09-01 \n", + "34768 1059-09-11/1069-09-18 \n", + "\n", + "[67 rows x 4 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many include periods\n", + "docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9fa8d2ba-6612-4de5-8741-dea177f99412", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standard
6351154Last decade of Kislev 5004Anno Mundi1243-12
1172175011th Tammuz 4767Anno Mundi1007
11731751Monday, 27th Ṭevet 4797Anno Mundi1037-01-23
15562163first third of Tammuz 500[.]Anno Mundi1244/1249
51426795last decade of Tishrei 4991Anno Mundi1230-09-29/1230-10-08
52236892last decade of Iyyar 4906Anno Mundi1146-05-04/1146-05-13
56647409last third of Ḥeshvan 4965Anno Mundi1204-10-17/1204-10-25
58127581middle third of Adar 4876Anno Mundi1116-05
70249068Last decade of Ṭevet 4898Anno Mundi1138-01
863911215Middle third of Av 4889Anno Mundi1129-07-29/1129-08-07
881811418Middle third of Sivan 4895Anno Mundi1135
889011493first decade of Kislev 5439Anno Mundi1678-11-16/1678-11-25
1359816487last decade of Shevaṭ [5]141Anno Mundi1381-01-16/1381-01-25
230942746823rd Shewat, 5414Anno Mundi1654
23103274778th Kislev 5448Anno Mundi1687
2310827482Friday 14th Adar I, 5463Anno Mundi1703
2310927483Monday, 8th Adar I, 5398Anno Mundi1638
2311027484Friday 20th Shevat 5405Anno Mundi1645
2311327487Thursday 15th Shevat 5450Anno Mundi1690
2311627490Thursday 19th Elul 5428Anno Mundi1668
2311927493Sunday 21st Kislev 5460Anno Mundi1699
2312327497Sunday 17th Sivan 5423Anno Mundi1663
2312427498Sunday 25th Tevet 5409Anno Mundi1648
2313727511Wednesday 28th Tevet 5399Anno Mundi1640
2314127515Monday 15th Iyyar 5414Anno Mundi1654
2314327517Sunday 1st Kislev 5545Anno Mundi1783
2317227546Thursday 13th Nisan 5544Anno Mundi1784
2321127587Monday 10th Sivan 5553Anno Mundi1793
2321227588Monday 12th Sivan 5602Anno Mundi1842
233202769710th Tamuz 5552Anno Mundi1792
2333927721Thursday 25th Adar 5405Anno Mundi1645
2353927930Monday 19th Sivan 5410Anno Mundi1650
2356927962Sunday 16th Shevat 5415Anno Mundi1655
2357427967Thursday 7th Kislev, 5431Anno Mundi1671
236352802828th Tevet 5425Anno Mundi1665
23637280301st Heshvan 5510Anno Mundi1750
2364928042Sunday 16th Nissan 5438Anno Mundi1677
2365228045Thursday 13th Tishrei 5459Anno Mundi1699
2366728060Sunday 7th Heshvan 5425Anno Mundi1665
2367928072Tuesday 5th Kislev, 5404Anno Mundi1644
2368028073Friday 1st Nisan, 5405Anno Mundi1645
2368928085Wednesday 23rd Iyyar 5410Anno Mundi1650
2536130489first decade of Kislev 5454Anno Mundi1693-11-29/1693-12-08
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "635 1154 Last decade of Kislev 5004 Anno Mundi \n", + "1172 1750 11th Tammuz 4767 Anno Mundi \n", + "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n", + "1556 2163 first third of Tammuz 500[.] Anno Mundi \n", + "5142 6795 last decade of Tishrei 4991 Anno Mundi \n", + "5223 6892 last decade of Iyyar 4906 Anno Mundi \n", + "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n", + "5812 7581 middle third of Adar 4876 Anno Mundi \n", + "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n", + "8639 11215 Middle third of Av 4889 Anno Mundi \n", + "8818 11418 Middle third of Sivan 4895 Anno Mundi \n", + "8890 11493 first decade of Kislev 5439 Anno Mundi \n", + "13598 16487 last decade of Shevaṭ [5]141 Anno Mundi \n", + "23094 27468 23rd Shewat, 5414 Anno Mundi \n", + "23103 27477 8th Kislev 5448 Anno Mundi \n", + "23108 27482 Friday 14th Adar I, 5463 Anno Mundi \n", + "23109 27483 Monday, 8th Adar I, 5398 Anno Mundi \n", + "23110 27484 Friday 20th Shevat 5405 Anno Mundi \n", + "23113 27487 Thursday 15th Shevat 5450 Anno Mundi \n", + "23116 27490 Thursday 19th Elul 5428 Anno Mundi \n", + "23119 27493 Sunday 21st Kislev 5460 Anno Mundi \n", + "23123 27497 Sunday 17th Sivan 5423 Anno Mundi \n", + "23124 27498 Sunday 25th Tevet 5409 Anno Mundi \n", + "23137 27511 Wednesday 28th Tevet 5399 Anno Mundi \n", + "23141 27515 Monday 15th Iyyar 5414 Anno Mundi \n", + "23143 27517 Sunday 1st Kislev 5545 Anno Mundi \n", + "23172 27546 Thursday 13th Nisan 5544 Anno Mundi \n", + "23211 27587 Monday 10th Sivan 5553 Anno Mundi \n", + "23212 27588 Monday 12th Sivan 5602 Anno Mundi \n", + "23320 27697 10th Tamuz 5552 Anno Mundi \n", + "23339 27721 Thursday 25th Adar 5405 Anno Mundi \n", + "23539 27930 Monday 19th Sivan 5410 Anno Mundi \n", + "23569 27962 Sunday 16th Shevat 5415 Anno Mundi \n", + "23574 27967 Thursday 7th Kislev, 5431 Anno Mundi \n", + "23635 28028 28th Tevet 5425 Anno Mundi \n", + "23637 28030 1st 
Heshvan 5510 Anno Mundi \n", + "23649 28042 Sunday 16th Nissan 5438 Anno Mundi \n", + "23652 28045 Thursday 13th Tishrei 5459 Anno Mundi \n", + "23667 28060 Sunday 7th Heshvan 5425 Anno Mundi \n", + "23679 28072 Tuesday 5th Kislev, 5404 Anno Mundi \n", + "23680 28073 Friday 1st Nisan, 5405 Anno Mundi \n", + "23689 28085 Wednesday 23rd Iyyar 5410 Anno Mundi \n", + "25361 30489 first decade of Kislev 5454 Anno Mundi \n", + "\n", + " doc_date_standard \n", + "635 1243-12 \n", + "1172 1007 \n", + "1173 1037-01-23 \n", + "1556 1244/1249 \n", + "5142 1230-09-29/1230-10-08 \n", + "5223 1146-05-04/1146-05-13 \n", + "5664 1204-10-17/1204-10-25 \n", + "5812 1116-05 \n", + "7024 1138-01 \n", + "8639 1129-07-29/1129-08-07 \n", + "8818 1135 \n", + "8890 1678-11-16/1678-11-25 \n", + "13598 1381-01-16/1381-01-25 \n", + "23094 1654 \n", + "23103 1687 \n", + "23108 1703 \n", + "23109 1638 \n", + "23110 1645 \n", + "23113 1690 \n", + "23116 1668 \n", + "23119 1699 \n", + "23123 1663 \n", + "23124 1648 \n", + "23137 1640 \n", + "23141 1654 \n", + "23143 1783 \n", + "23172 1784 \n", + "23211 1793 \n", + "23212 1842 \n", + "23320 1792 \n", + "23339 1645 \n", + "23539 1650 \n", + "23569 1655 \n", + "23574 1671 \n", + "23635 1665 \n", + "23637 1750 \n", + "23649 1677 \n", + "23652 1699 \n", + "23667 1665 \n", + "23679 1644 \n", + "23680 1645 \n", + "23689 1650 \n", + "25361 1693-11-29/1693-12-08 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many use ordinals instead of numerals?\n", + "hebrew_dates[hebrew_dates.doc_date_original.str.contains(\"st\") | hebrew_dates.doc_date_original.str.contains(\"rd\") | hebrew_dates.doc_date_original.str.contains(\"th\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5b6d5811-fe81-471d-bd29-896cec4c98ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "11th Tammuz 4767: 11 Tammuz 4767\n", + "27th Tevet: 27 Tevet\n", + "8th Kislev: 8 Kislev\n" + ] + } + ], + "source": [ + "import re\n", + "\n", + "# test removing ordinals without removing the numbers\n", + "for val in ['11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"]:\n", + " cleand_val = re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", val)\n", + " print(f\"{val}: { cleand_val}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "798da8f2-2332-48c2-aeec-214474e9d49c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Parse error on PGPID 603 4805/4806 (Hebrew): Could not parse '4805/4806' as a Hebrew date\n", + "Parse error on PGPID 613 Tishrei–Ṭevet 1495 (Seleucid): Could not parse 'Tishrei–Ṭevet 1495' as a Hebrew date\n", + "Parse error on PGPID 658 Ḥannuka 1548 (Seleucid): Could not parse 'Ḥannuka 1548' as a Hebrew date\n", + "Parse error on PGPID 860 Marheshvan 1460 (Seleucid): Could not parse 'Marheshvan 1460' as a Hebrew date\n", + "Parse error on PGPID 997 Second third Av 1414 (Seleucid): Could not parse 'Second third Av 1414' as a Hebrew date\n", + "Parse error on PGPID 1098 Early Elul 1476 (Seleucid): Could not parse 'Early Elul 1476' as a Hebrew date\n", + "Parse error on PGPID 1111 Second third Tammuz 1529 (Seleucid): Could not parse 'Second third Tammuz 1529' as a Hebrew date\n", + "Parse error on PGPID 1139 Passover 1537 (Seleucid): Could not parse 'Passover 1537' as a Hebrew date\n", + "Parse error on PGPID 1140 Sivan, 1564 (Seleucid): Could not parse 'Sivan, 1564' as a Hebrew date\n", + "Parse error on PGPID 1339 426–30 (Islamic): Could not parse '426–30' as an Islamic date\n", + "Parse error on PGPID 1368 Shevaṭ 1471 (Seleucid): Could not parse 'Shevaṭ 1471' as a Hebrew date\n", + "Parse error on PGPID 1864 Tishrei, 15?? (Seleucid): Could not parse 'Tishrei, 15??' 
as a Hebrew date\n", + "Parse error on PGPID 2129 ten days Tammuz 1410 (Seleucid): Could not parse 'ten days Tammuz 1410' as a Hebrew date\n", + "Parse error on PGPID 2134 day Shevaṭ 1447 (Seleucid): Could not parse 'day Shevaṭ 1447' as a Hebrew date\n", + "Parse error on PGPID 2142 Elul 7 1425 (Seleucid): Could not parse 'Elul 7 1425' as a Hebrew date\n", + "ignoring missing digits for now first third of Tammuz 500[.]\n", + "ignoring missing digits for now End of Sivan 152[.]\n", + "Parse error on PGPID 2410 1461-03-15 (Seleucid): Could not parse '1461-03-15' as a Hebrew date\n", + "parsed 1461-03-15 with ISO8601 format and calendar Seleucid, result is 1461-03-15 (1150-06-18/1150-06-18)\n", + "ignoring missing digits for now 13[..]\n", + "Parse error on PGPID 2601 Adar (Seleucid): Unexpected end-of-input. Expected one of: \n", + "\t* __ANON_0\n", + "\n", + "Parse error on PGPID 2664 ten days Shevaṭ 1418 (Seleucid): Could not parse 'ten days Shevaṭ 1418' as a Hebrew date\n", + "Parse error on PGPID 2687 month 1560 (Seleucid): Could not parse 'month 1560' as a Hebrew date\n", + "ignoring missing digits for now 1[.] 
Kislev 48[..]\n", + "Parse error on PGPID 2958 Iyyār 1459 (Seleucid): Could not parse 'Iyyār 1459' as a Hebrew date\n", + "Parse error on PGPID 2985 Adar II, 1446 (Seleucid): Could not parse 'Adar II, 1446' as a Hebrew date\n", + "Parse error on PGPID 3021 1526 4975 (Seleucid): Could not parse '1526 4975' as a Hebrew date\n", + "Parse error on PGPID 3209 Tishrei 1370-9 (Seleucid): Could not parse 'Tishrei 1370-9' as a Hebrew date\n", + "Could not parse Tishrei 1370-9 as ISO date: invalid literal for int() with base 10: 'Tishrei 1370'\n", + "Parse error on PGPID 3307 Sivan 18 1329 (Seleucid): Could not parse 'Sivan 18 1329' as a Hebrew date\n", + "Parse error on PGPID 3430 Sunday, 1406 (Seleucid): Could not parse 'Sunday, 1406' as a Hebrew date\n", + "Parse error on PGPID 3524 1471 Shevaṭ 1471 (Seleucid): Could not parse '1471 Shevaṭ 1471' as a Hebrew date\n", + "Parse error on PGPID 3603 shortly before Elul 1437 (Seleucid): Could not parse 'shortly before Elul 1437' as a Hebrew date\n", + "Parse error on PGPID 3637 23 Ḥeshvan 521 (Islamic): Could not parse '23 Ḥeshvan 521' as an Islamic date\n", + "ignoring missing digits for now 13[..]\n", + "Parse error on PGPID 4011 ten days Tishrei 1458 (Seleucid): Could not parse 'ten days Tishrei 1458' as a Hebrew date\n", + "ignoring missing digits for now Av [14]9[.]\n", + "ignoring missing digits for now 13[..]\n", + "ignoring missing digits for now 144[.]\n", + "Parse error on PGPID 4209 ten days Shevat 1419 (Seleucid): Could not parse 'ten days Shevat 1419' as a Hebrew date\n", + "ignoring missing digits for now Adar 143[.]\n", + "ignoring missing digits for now Tuesday, 27 [...] 1431\n", + "Parse error on PGPID 4241 Tuesday, 27 000 1431 (Seleucid): Could not parse 'Tuesday, 27 000 1431' as a Hebrew date\n", + "Parse error on PGPID 4253 26 Sivan 14?? (Seleucid): Could not parse '26 Sivan 14??' 
as a Hebrew date\n", + "Parse error on PGPID 4609 18 Marheshvan 1539 (Seleucid): Could not parse '18 Marheshvan 1539' as a Hebrew date\n", + "Parse error on PGPID 4756 Monday evening, 30 Kislev 1460 (Seleucid): Could not parse 'Monday evening, 30 Kislev 1460' as a Hebrew date\n", + "Parse error on PGPID 5319 Pesaḥ 1529 (Seleucid): Could not parse 'Pesaḥ 1529' as a Hebrew date\n", + "Parse error on PGPID 5386 3 Nissan 1409 (Seleucid): Could not parse '3 Nissan 1409' as a Hebrew date\n", + "Parse error on PGPID 5387 3 Nissan 1409 (Seleucid): Could not parse '3 Nissan 1409' as a Hebrew date\n", + "Parse error on PGPID 5636 ten days Elul 1440 (Seleucid): Could not parse 'ten days Elul 1440' as a Hebrew date\n", + "Parse error on PGPID 5902 10 Elul 444 (Islamic): Could not parse '10 Elul 444' as an Islamic date\n", + "Parse error on PGPID 6000 447/449 (Islamic): Could not parse '447/449' as an Islamic date\n", + "Parse error on PGPID 6037 449/450 (Islamic): Could not parse '449/450' as an Islamic date\n", + "Parse error on PGPID 6058 Tishrei 444 (Islamic): Could not parse 'Tishrei 444' as an Islamic date\n", + "Parse error on PGPID 6235 467-04 (Islamic): Could not parse '467-04' as an Islamic date\n", + "parsed 467-04 with ISO8601 format and calendar Islamic, result is 0467-04 (1074-11-30/1074-12-28)\n", + "Parse error on PGPID 6249 Tuesday, 17 SIvan 1475 (Seleucid): Could not parse 'Tuesday, 17 SIvan 1475' as a Hebrew date\n", + "Parse error on PGPID 6292 1452-03-08 (Seleucid): Could not parse '1452-03-08' as a Hebrew date\n", + "parsed 1452-03-08 with ISO8601 format and calendar Seleucid, result is 1452-03-08 (1141-05-23/1141-05-23)\n", + "Parse error on PGPID 6325 20 day the Omer 1545 (Seleucid): Could not parse '20 day the Omer 1545' as a Hebrew date\n", + "ignoring missing digits for now 2[.] 
Ḥeshvan 1352\n", + "Parse error on PGPID 6542 Nisan 1497 – Tishrei 1498 (Seleucid): Could not parse 'Nisan 1497 – Tishrei 1498' as a Hebrew date\n", + "Parse error on PGPID 6581 Iyyar-Tammuz 1475 (Seleucid): Could not parse 'Iyyar-Tammuz 1475' as a Hebrew date\n", + "Could not parse Iyyar-Tammuz 1475 as ISO date: invalid literal for int() with base 10: 'Iyyar'\n", + "Parse error on PGPID 6582 1543-08 (Seleucid): Could not parse '1543-08' as a Hebrew date\n", + "parsed 1543-08 with ISO8601 format and calendar Seleucid, result is 1543-08 (1231-10-06/1231-11-03)\n", + "Parse error on PGPID 6628 Iyar 1510–29 (Seleucid): Could not parse 'Iyar 1510–29' as a Hebrew date\n", + "Parse error on PGPID 6803 1357–59 (Seleucid): Could not parse '1357–59' as a Hebrew date\n", + "Parse error on PGPID 6827 day Dhū l-Qaʿda 544 (Islamic): Could not parse 'day Dhū l-Qaʿda 544' as an Islamic date\n", + "Could not parse day Dhū l-Qaʿda 544 as ISO date: invalid literal for int() with base 10: 'day Dhū l'\n", + "Parse error on PGPID 6834 Approximately 15 weeks prior to Ḥeshvan 1451 (Seleucid): Could not parse 'Approximately 15 weeks prior to Ḥeshvan 1451' as a Hebrew date\n", + "ignoring missing digits for now 2[.] 
Ṭevet 4874\n", + "Parse error on PGPID 6913 Ḥeshvan–Kislev 1339 (Seleucid): Could not parse 'Ḥeshvan–Kislev 1339' as a Hebrew date\n", + "Parse error on PGPID 6925 Wednesday, 24 month year (Seleucid): Could not parse 'Wednesday, 24 month year' as a Hebrew date\n", + "Parse error on PGPID 6982 5–10 Adar II 1535 (Seleucid): Could not parse '5–10 Adar II 1535' as a Hebrew date\n", + "Parse error on PGPID 7029 Thursday, 20 Iyar 4758 or 4768 (Hebrew): Could not parse 'Thursday, 20 Iyar 4758 or 4768' as a Hebrew date\n", + "Parse error on PGPID 7032 Monday, Shevaṭ 1461 (Seleucid): Could not parse 'Monday, Shevaṭ 1461' as a Hebrew date\n", + "ignoring missing digits for now Ḥeshvan 136[.]\n", + "Parse error on PGPID 7098 4761–68 (Hebrew): Could not parse '4761–68' as a Hebrew date\n", + "ignoring missing digits for now 48[..]\n", + "Parse error on PGPID 7150 Nisan–Iyyar 1341 (Seleucid): Could not parse 'Nisan–Iyyar 1341' as a Hebrew date\n", + "ignoring missing digits for now Thursday, 13 Nisan 15[..]\n", + "Parse error on PGPID 7299 Ṭevet, 1424 (Seleucid): Could not parse 'Ṭevet, 1424' as a Hebrew date\n", + "ignoring missing digits for now 2[.] Kislev 5327\n", + "Parse error on PGPID 7415 4 Elul 1420–29 (Seleucid): Could not parse '4 Elul 1420–29' as a Hebrew date\n", + "Parse error on PGPID 7427 439–40 (Islamic): Could not parse '439–40' as an Islamic date\n", + "ignoring missing digits for now 13[..]\n", + "Parse error on PGPID 7580 ten days Kislev 1428 (Seleucid): Could not parse 'ten days Kislev 1428' as a Hebrew date\n", + "Parse error on PGPID 7612 Ca. 27 Iyyar 1547 (Seleucid): Could not parse 'Ca. 
27 Iyyar 1547' as a Hebrew date\n", + "Parse error on PGPID 7735 Adar II–Nisan 1339 (Seleucid): Could not parse 'Adar II–Nisan 1339' as a Hebrew date\n", + "Parse error on PGPID 7744 1335-10-14 (Seleucid): Could not parse '1335-10-14' as a Hebrew date\n", + "parsed 1335-10-14 with ISO8601 format and calendar Seleucid, result is 1335-10-14 (1024-01-05/1024-01-05)\n", + "Parse error on PGPID 7745 1337-01-27 (Seleucid): Could not parse '1337-01-27' as a Hebrew date\n", + "parsed 1337-01-27 with ISO8601 format and calendar Seleucid, result is 1337-01-27 (1026-04-23/1026-04-23)\n", + "Parse error on PGPID 7746 1341-03-18 (Seleucid): Could not parse '1341-03-18' as a Hebrew date\n", + "parsed 1341-03-18 with ISO8601 format and calendar Seleucid, result is 1341-03-18 (1030-05-28/1030-05-28)\n", + "Parse error on PGPID 7747 1371-02-20 (Seleucid): Could not parse '1371-02-20' as a Hebrew date\n", + "parsed 1371-02-20 with ISO8601 format and calendar Seleucid, result is 1371-02-20 (1060-04-29/1060-04-29)\n", + "Parse error on PGPID 7748 1377-07-04 (Seleucid): Could not parse '1377-07-04' as a Hebrew date\n", + "parsed 1377-07-04 with ISO8601 format and calendar Seleucid, result is 1377-07-04 (1065-09-12/1065-09-12)\n", + "Parse error on PGPID 7749 1430-09-16 (Seleucid): Could not parse '1430-09-16' as a Hebrew date\n", + "parsed 1430-09-16 with ISO8601 format and calendar Seleucid, result is 1430-09-16 (1118-12-09/1118-12-09)\n", + "Parse error on PGPID 7750 1399-04-16 (Seleucid): Could not parse '1399-04-16' as a Hebrew date\n", + "parsed 1399-04-16 with ISO8601 format and calendar Seleucid, result is 1399-04-16 (1088-07-14/1088-07-14)\n", + "Parse error on PGPID 7752 1395-09-02 (Seleucid): Could not parse '1395-09-02' as a Hebrew date\n", + "parsed 1395-09-02 with ISO8601 format and calendar Seleucid, result is 1395-09-02 (1083-11-21/1083-11-21)\n", + "Parse error on PGPID 7753 1399-04-16 (Seleucid): Could not parse '1399-04-16' as a Hebrew date\n", + "parsed 1399-04-16 
with ISO8601 format and calendar Seleucid, result is 1399-04-16 (1088-07-14/1088-07-14)\n", + "Parse error on PGPID 7755 1415-05-12 (Seleucid): Could not parse '1415-05-12' as a Hebrew date\n", + "parsed 1415-05-12 with ISO8601 format and calendar Seleucid, result is 1415-05-12 (1104-08-12/1104-08-12)\n", + "Parse error on PGPID 7757 1148-07-17 (Seleucid): Could not parse '1148-07-17' as a Hebrew date\n", + "parsed 1148-07-17 with ISO8601 format and calendar Seleucid, result is 1148-07-17 (0836-10-06/0836-10-06)\n", + "Parse error on PGPID 7759 1440-09-15 (Seleucid): Could not parse '1440-09-15' as a Hebrew date\n", + "parsed 1440-09-15 with ISO8601 format and calendar Seleucid, result is 1440-09-15 (1128-11-16/1128-11-16)\n", + "Parse error on PGPID 7760 1441-09-02 (Seleucid): Could not parse '1441-09-02' as a Hebrew date\n", + "parsed 1441-09-02 with ISO8601 format and calendar Seleucid, result is 1441-09-02 (1129-11-23/1129-11-23)\n", + "Parse error on PGPID 7762 1443-04-29 (Seleucid): Could not parse '1443-04-29' as a Hebrew date\n", + "parsed 1443-04-29 with ISO8601 format and calendar Seleucid, result is 1443-04-29 (1132-07-21/1132-07-21)\n", + "Parse error on PGPID 7763 1446-12-03 (Seleucid): Could not parse '1446-12-03' as a Hebrew date\n", + "parsed 1446-12-03 with ISO8601 format and calendar Seleucid, result is 1446-12-03 (1135-02-25/1135-02-25)\n", + "Parse error on PGPID 7765 1457-06-12 (Seleucid): Could not parse '1457-06-12' as a Hebrew date\n", + "parsed 1457-06-12 with ISO8601 format and calendar Seleucid, result is 1457-06-12 (1146-08-29/1146-08-29)\n", + "Parse error on PGPID 7766 1461-10-22 (Seleucid): Could not parse '1461-10-22' as a Hebrew date\n", + "parsed 1461-10-22 with ISO8601 format and calendar Seleucid, result is 1461-10-22 (1149-12-30/1149-12-30)\n", + "Parse error on PGPID 7767 1461-03-15 (Seleucid): Could not parse '1461-03-15' as a Hebrew date\n", + "parsed 1461-03-15 with ISO8601 format and calendar Seleucid, result is 1461-03-15 
(1150-06-18/1150-06-18)\n", + "Parse error on PGPID 7768 1464-07-19 (Seleucid): Could not parse '1464-07-19' as a Hebrew date\n", + "parsed 1464-07-19 with ISO8601 format and calendar Seleucid, result is 1464-07-19 (1152-09-26/1152-09-26)\n", + "Parse error on PGPID 7769 1466-11-19 (Seleucid): Could not parse '1466-11-19' as a Hebrew date\n", + "parsed 1466-11-19 with ISO8601 format and calendar Seleucid, result is 1466-11-19 (1155-01-31/1155-01-31)\n", + "Parse error on PGPID 7770 1486-02-12 (Seleucid): Could not parse '1486-02-12' as a Hebrew date\n", + "parsed 1486-02-12 with ISO8601 format and calendar Seleucid, result is 1486-02-12 (1175-05-12/1175-05-12)\n", + "Parse error on PGPID 7771 1473-02-05 (Seleucid): Could not parse '1473-02-05' as a Hebrew date\n", + "parsed 1473-02-05 with ISO8601 format and calendar Seleucid, result is 1473-02-05 (1162-04-28/1162-04-28)\n", + "Parse error on PGPID 7772 1476-11-05 (Seleucid): Could not parse '1476-11-05' as a Hebrew date\n", + "parsed 1476-11-05 with ISO8601 format and calendar Seleucid, result is 1476-11-05 (1165-01-27/1165-01-27)\n", + "Parse error on PGPID 7773 1477-11-25 (Seleucid): Could not parse '1477-11-25' as a Hebrew date\n", + "parsed 1477-11-25 with ISO8601 format and calendar Seleucid, result is 1477-11-25 (1166-02-05/1166-02-05)\n", + "Parse error on PGPID 7774 1489-12-20 (Seleucid): Could not parse '1489-12-20' as a Hebrew date\n", + "parsed 1489-12-20 with ISO8601 format and calendar Seleucid, result is 1489-12-20 (1178-02-16/1178-02-16)\n", + "Parse error on PGPID 7776 1492-12-20 (Seleucid): Could not parse '1492-12-20' as a Hebrew date\n", + "parsed 1492-12-20 with ISO8601 format and calendar Seleucid, result is 1492-12-20 (1181-03-16/1181-03-16)\n", + "Parse error on PGPID 7777 1524-09-05 (Seleucid): Could not parse '1524-09-05' as a Hebrew date\n", + "parsed 1524-09-05 with ISO8601 format and calendar Seleucid, result is 1524-09-05 (1212-11-08/1212-11-08)\n", + "Parse error on PGPID 7780 
1537-07-10 (Seleucid): Could not parse '1537-07-10' as a Hebrew date\n", + "parsed 1537-07-10 with ISO8601 format and calendar Seleucid, result is 1537-07-10 (1225-09-20/1225-09-20)\n", + "Parse error on PGPID 7781 1590-05-18 (Seleucid): Could not parse '1590-05-18' as a Hebrew date\n", + "parsed 1590-05-18 with ISO8601 format and calendar Seleucid, result is 1590-05-18 (1279-08-05/1279-08-05)\n", + "Parse error on PGPID 7926 second half Adar II 1366 (Seleucid): Could not parse 'second half Adar II 1366' as a Hebrew date\n", + "Parse error on PGPID 7968 Second third Elul, 1482 (Seleucid): Could not parse 'Second third Elul, 1482' as a Hebrew date\n", + "Parse error on PGPID 8072 Tammuz/Av 1466 (Seleucid): Could not parse 'Tammuz/Av 1466' as a Hebrew date\n", + "ignoring missing digits for now 13 Nisan 13[..]\n", + "Parse error on PGPID 8258 Elul 1141 – Tishrei 1442 (Seleucid): Could not parse 'Elul 1141 – Tishrei 1442' as a Hebrew date\n", + "Parse error on PGPID 8271 10 days Shawwāl 480 (Islamic): Could not parse '10 days Shawwāl 480' as an Islamic date\n", + "Parse error on PGPID 8517 Iyyār 1461 (Seleucid): Could not parse 'Iyyār 1461' as a Hebrew date\n", + "Parse error on PGPID 8665 Monday, 24 Iyyār 1403 (Seleucid): Could not parse 'Monday, 24 Iyyār 1403' as a Hebrew date\n", + "Parse error on PGPID 8778 Tammuz 1549–Tammuz 1550 (Seleucid): Could not parse 'Tammuz 1549–Tammuz 1550' as a Hebrew date\n", + "ignoring missing digits for now 153[.]\n", + "Parse error on PGPID 8925 Rabi al-Awwal 617 (Islamic): Could not parse 'Rabi al-Awwal 617' as an Islamic date\n", + "Could not parse Rabi al-Awwal 617 as ISO date: invalid literal for int() with base 10: 'Rabi al'\n", + "ignoring missing digits for now 139[.]\n", + "Parse error on PGPID 9198 Iyyar 571 (Islamic): Could not parse 'Iyyar 571' as an Islamic date\n", + "Parse error on PGPID 9235 516/521 (Islamic): Could not parse '516/521' as an Islamic date\n", + "Parse error on PGPID 9253 12 Nissan 1409 (Seleucid): 
Could not parse '12 Nissan 1409' as a Hebrew date\n", + "Parse error on PGPID 9504 Tishrei-Ḥeshvan 1472 (Seleucid): Could not parse 'Tishrei-Ḥeshvan 1472' as a Hebrew date\n", + "Could not parse Tishrei-Ḥeshvan 1472 as ISO date: invalid literal for int() with base 10: 'Tishrei'\n", + "Parse error on PGPID 9522 24 Adar Sheni 1280 (Seleucid): Could not parse '24 Adar Sheni 1280' as a Hebrew date\n", + "Parse error on PGPID 9682 ten days Av 1427 (Seleucid): Could not parse 'ten days Av 1427' as a Hebrew date\n", + "Parse error on PGPID 9915 427–29 (Islamic): Could not parse '427–29' as an Islamic date\n", + "Parse error on PGPID 10446 11 Rabīʿ II 488/20 Rabīʿ II 488 (Islamic): Could not parse '11 Rabīʿ II 488/20 Rabīʿ II 488' as an Islamic date\n", + "ignoring missing digits for now 135[.]\n", + "Parse error on PGPID 11091 Tishrei 7 1421 (Seleucid): Could not parse 'Tishrei 7 1421' as a Hebrew date\n", + "ignoring missing digits for now 13[..]\n", + "Parse error on PGPID 11249 909-03-14 (Islamic): Could not parse '909-03-14' as an Islamic date\n", + "parsed 909-03-14 with ISO8601 format and calendar Islamic, result is 0909-03-14 (1503-09-16/1503-09-16)\n", + "Parse error on PGPID 11250 909-03 (Islamic): Could not parse '909-03' as an Islamic date\n", + "parsed 909-03 with ISO8601 format and calendar Islamic, result is 0909-03 (1503-09-03/1503-10-02)\n", + "Parse error on PGPID 11336 Wednesday, Shevaṭ 1402 (Seleucid): Could not parse 'Wednesday, Shevaṭ 1402' as a Hebrew date\n", + "ignoring missing digits for now 14 Sivan 156[.]\n", + "Parse error on PGPID 11591 Av – Elul 1527 (Seleucid): Could not parse 'Av – Elul 1527' as a Hebrew date\n", + "Parse error on PGPID 12008 Rabīʿ al-Awwal 1259 (Islamic): Could not parse 'Rabīʿ al-Awwal 1259' as an Islamic date\n", + "Could not parse Rabīʿ al-Awwal 1259 as ISO date: invalid literal for int() with base 10: 'Rabīʿ al'\n", + "Parse error on PGPID 12664 403-12-25 (Islamic): Could not parse '403-12-25' as an Islamic date\n", + 
"parsed 403-12-25 with ISO8601 format and calendar Islamic, result is 0403-12-25 (1013-07-13/1013-07-13)\n", + "ignoring missing digits for now 17[..]\n", + "ignoring missing digits for now Ramaḍān 4[..]\n", + "ignoring missing digits for now Shevaṭ 47[..]\n", + "Parse error on PGPID 16232 1469/1471 (Seleucid): Could not parse '1469/1471' as a Hebrew date\n", + "Parse error on PGPID 16247 495/524 (Islamic): Could not parse '495/524' as an Islamic date\n", + "ignoring missing digits for now Elul 143[.]\n", + "ignoring missing digits for now Kislev 152[.]\n", + "Parse error on PGPID 16451 before Sukkot 1684 (Seleucid): Could not parse 'before Sukkot 1684' as a Hebrew date\n", + "ignoring missing digits for now 48[..]\n", + "ignoring missing digits for now 14[..]\n", + "ignoring missing digits for now 15[..]\n", + "ignoring missing digits for now 14[..]\n", + "Parse error on PGPID 16717 Rabīʿ II 873 (Islamic): Could not parse 'Rabīʿ II 873' as an Islamic date\n", + "Parse error on PGPID 16725 621–29 (Islamic): Could not parse '621–29' as an Islamic date\n", + "Parse error on PGPID 16743 half 420 (Islamic): Could not parse 'half 420' as an Islamic date\n", + "Parse error on PGPID 17119 Shabbat Badmibar 1651 (Seleucid): Could not parse 'Shabbat Badmibar 1651' as a Hebrew date\n", + "Parse error on PGPID 18203 18 Shaʿbān – 15 Ramaḍān 934 (Islamic): Could not parse '18 Shaʿbān – 15 Ramaḍān 934' as an Islamic date\n", + "Parse error on PGPID 18566 10-20 Shaʿbān 448 (Islamic): Could not parse '10-20 Shaʿbān 448' as an Islamic date\n", + "Could not parse 10-20 Shaʿbān 448 as ISO date: invalid literal for int() with base 10: '20 Shaʿbān 448'\n", + "Parse error on PGPID 18576 548/549 (Islamic): Could not parse '548/549' as an Islamic date\n", + "Parse error on PGPID 18588 ten days Av 1425 (Seleucid): Could not parse 'ten days Av 1425' as a Hebrew date\n", + "ignoring missing digits for now [4]84[.]\n", + "Parse error on PGPID 19110 Adar 1408 – Kislev 1409 (Seleucid): Could not 
parse 'Adar 1408 – Kislev 1409' as a Hebrew date\n", + "ignoring missing digits for now Kislev 13[..]\n", + "Parse error on PGPID 19137 A Monday in the Tammuz 1434 (Seleucid): Could not parse 'A Monday in the Tammuz 1434' as a Hebrew date\n", + "ignoring missing digits for now 145[.]\n", + "Parse error on PGPID 19319 20-29 Elul 5500 (Hebrew): Could not parse '20-29 Elul 5500' as a Hebrew date\n", + "Could not parse 20-29 Elul 5500 as ISO date: invalid literal for int() with base 10: '29 Elul 5500'\n", + "ignoring missing digits for now 194[.]\n", + "Parse error on PGPID 20077 1459–62 (Seleucid): Could not parse '1459–62' as a Hebrew date\n", + "Parse error on PGPID 20404 5452–57 (Hebrew): Could not parse '5452–57' as a Hebrew date\n", + "Parse error on PGPID 20406 439–40 (Islamic): Could not parse '439–40' as an Islamic date\n", + "Parse error on PGPID 20647 16 Dhu l-Qa’da 550 (Islamic): Could not parse '16 Dhu l-Qa’da 550' as an Islamic date\n", + "Could not parse 16 Dhu l-Qa’da 550 as ISO date: invalid literal for int() with base 10: '16 Dhu l'\n", + "ignoring missing digits for now 6[..]\n", + "Parse error on PGPID 20715 536–537 (Islamic): Could not parse '536–537' as an Islamic date\n", + "Parse error on PGPID 20784 409/410 (Islamic): Could not parse '409/410' as an Islamic date\n", + "ignoring missing digits for now 154[.]\n", + "Parse error on PGPID 21102 522–73 (Islamic): Could not parse '522–73' as an Islamic date\n", + "ignoring missing digits for now first third of Ramaḍān 54[.]\n", + "ignoring missing digits for now 47[.]\n", + "ignoring missing digits for now Sivan 53[..]\n", + "Parse error on PGPID 22157 השע\"ד (Hebrew): Could not parse 'השע\"ד' as a Hebrew date\n", + "Parse error on PGPID 22446 1463/4 (Seleucid): Could not parse '1463/4' as a Hebrew date\n", + "ignoring missing digits for now Adar II 17[..]\n", + "Parse error on PGPID 22863 אתלא (Seleucid): Could not parse 'אתלא' as a Hebrew date\n", + "ignoring missing digits for now 64[.]\n", + 
"Parse error on PGPID 23880 608–09 (Islamic): Could not parse '608–09' as an Islamic date\n", + "Parse error on PGPID 23926 3–4 Dhū l-Ḥijja 600 (Islamic): Could not parse '3–4 Dhū l-Ḥijja 600' as an Islamic date\n", + "Could not parse 3–4 Dhū l-Ḥijja 600 as ISO date: invalid literal for int() with base 10: '3–4 Dhū l'\n", + "ignoring missing digits for now Sivan 142[.]\n", + "Parse error on PGPID 24668 Sunday, 16 Rabiʿ I 403 (Islamic): Could not parse 'Sunday, 16 Rabiʿ I 403' as an Islamic date\n", + "ignoring missing digits for now 145[.]\n", + "Parse error on PGPID 24843 503–04 (Islamic): Could not parse '503–04' as an Islamic date\n", + "ignoring missing digits for now Rabīʿ I 49[.]\n", + "Parse error on PGPID 24872 Rabīʿ I 490 (Islamic): Could not parse 'Rabīʿ I 490' as an Islamic date\n", + "Parse error on PGPID 25106 Rabī ʿ 1312 (Islamic): Could not parse 'Rabī ʿ 1312' as an Islamic date\n", + "Parse error on PGPID 25233 19 Rabiʿ II 566 (Islamic): Could not parse '19 Rabiʿ II 566' as an Islamic date\n", + "Parse error on PGPID 25298 Nisan 1463 – Adar 1464 (Seleucid): Could not parse 'Nisan 1463 – Adar 1464' as a Hebrew date\n", + "ignoring missing digits for now 52[..]\n", + "ignoring missing digits for now Tuesday, 2[.] 
Shevaṭ 1341\n", + "ignoring missing digits for now 157[.]\n", + "Parse error on PGPID 25917 14–28 Ramaḍān 433 (Islamic): Could not parse '14–28 Ramaḍān 433' as an Islamic date\n", + "ignoring missing digits for now Ṭevet 143[.]\n", + "Parse error on PGPID 26405 26 Adar II 1600 — 26 Elul 1601 (Seleucid): Could not parse '26 Adar II 1600 — 26 Elul 1601' as a Hebrew date\n", + "ignoring missing digits for now Elul 498[.]\n", + "Parse error on PGPID 26553 1361–62 (Seleucid): Could not parse '1361–62' as a Hebrew date\n", + "ignoring missing digits for now Ḥeshvan 13[..]\n", + "Parse error on PGPID 27238 תר\"ב (Hebrew): Could not parse 'תר\"ב' as a Hebrew date\n", + "Parse error on PGPID 27254 התר\"ן (Hebrew): Could not parse 'התר\"ן' as a Hebrew date\n", + "Parse error on PGPID 27468 23 Shewat, 5414 (Hebrew): Could not parse '23 Shewat, 5414' as a Hebrew date\n", + "Parse error on PGPID 27474 Tuesday 3 Nissan 5443 (Hebrew): Could not parse 'Tuesday 3 Nissan 5443' as a Hebrew date\n", + "Parse error on PGPID 27697 10 Tamuz 5552 (Hebrew): Could not parse '10 Tamuz 5552' as a Hebrew date\n", + "ignoring missing digits for now 1[.] 
Sivan 1420\n", + "Parse error on PGPID 27913 Rabīʿ I or Rabīʿ II 915 (Islamic): Could not parse 'Rabīʿ I or Rabīʿ II 915' as an Islamic date\n", + "Parse error on PGPID 27938 Adar II 5334–Sivan 5535 (Hebrew): Could not parse 'Adar II 5334–Sivan 5535' as a Hebrew date\n", + "Parse error on PGPID 27978 Monday, Iyyar 2, 5405 (Hebrew): Could not parse 'Monday, Iyyar 2, 5405' as a Hebrew date\n", + "Parse error on PGPID 28042 Sunday 16 Nissan 5438 (Hebrew): Could not parse 'Sunday 16 Nissan 5438' as a Hebrew date\n", + "Parse error on PGPID 28293 1-10 Shevaṭ 5577 (Hebrew): Could not parse '1-10 Shevaṭ 5577' as a Hebrew date\n", + "Could not parse 1-10 Shevaṭ 5577 as ISO date: invalid literal for int() with base 10: '10 Shevaṭ 5577'\n", + "Parse error on PGPID 28487 1219–20 (Islamic): Could not parse '1219–20' as an Islamic date\n", + "Parse error on PGPID 28919 Av 5579/Av 5582 (Hebrew): Could not parse 'Av 5579/Av 5582' as a Hebrew date\n", + "Parse error on PGPID 28997 Adar II, 1451 (Seleucid): Could not parse 'Adar II, 1451' as a Hebrew date\n", + "Parse error on PGPID 29029 25 Dhū l-Qa‘da 403 (Islamic): Could not parse '25 Dhū l-Qa‘da 403' as an Islamic date\n", + "Could not parse 25 Dhū l-Qa‘da 403 as ISO date: invalid literal for int() with base 10: '25 Dhū l'\n", + "ignoring missing digits for now 41[.]\n", + "Parse error on PGPID 30314 Av 5493 – Tishrei 5494 (Hebrew): Could not parse 'Av 5493 – Tishrei 5494' as a Hebrew date\n", + "Parse error on PGPID 30553 5442–43 (Hebrew): Could not parse '5442–43' as a Hebrew date\n", + "Parse error on PGPID 30944 859-04-13 (Islamic): Could not parse '859-04-13' as an Islamic date\n", + "parsed 859-04-13 with ISO8601 format and calendar Islamic, result is 0859-04-13 (1455-04-11/1455-04-11)\n", + "Parse error on PGPID 30990 634-06-14 (Islamic): Could not parse '634-06-14' as an Islamic date\n", + "parsed 634-06-14 with ISO8601 format and calendar Islamic, result is 0634-06-14 (1237-02-19/1237-02-19)\n", + "Parse error on PGPID 
31058 723-01-27 (Islamic): Could not parse '723-01-27' as an Islamic date\n", + "parsed 723-01-27 with ISO8601 format and calendar Islamic, result is 0723-01-27 (1323-02-13/1323-02-13)\n", + "Parse error on PGPID 31080 1247-05-18 (Islamic): Could not parse '1247-05-18' as an Islamic date\n", + "parsed 1247-05-18 with ISO8601 format and calendar Islamic, result is 1247-05-18 (1831-10-25/1831-10-25)\n", + "ignoring missing digits for now 123[.]\n", + "Parse error on PGPID 31879 1207-08 (Islamic): Could not parse '1207-08' as an Islamic date\n", + "parsed 1207-08 with ISO8601 format and calendar Islamic, result is 1207-08 (1793-03-14/1793-04-11)\n", + "Parse error on PGPID 32721 531/533 (Islamic): Could not parse '531/533' as an Islamic date\n", + "Parse error on PGPID 32884 Muḥarram 1246 - Ṣafar 1246 (Islamic): Could not parse 'Muḥarram 1246 - Ṣafar 1246' as an Islamic date\n", + "Could not parse Muḥarram 1246 - Ṣafar 1246 as ISO date: invalid literal for int() with base 10: 'Muḥarram 1246 '\n", + "Parse error on PGPID 32997 Jumāda II 602-603 (Islamic): Could not parse 'Jumāda II 602-603' as an Islamic date\n", + "Could not parse Jumāda II 602-603 as ISO date: invalid literal for int() with base 10: 'Jumāda II 602'\n", + "ignoring missing digits for now 73[.]\n", + "Parse error on PGPID 34019 408–410 (Islamic): Could not parse '408–410' as an Islamic date\n", + "Parse error on PGPID 34221 Before 9 Av 1412 (Seleucid): Could not parse 'Before 9 Av 1412' as a Hebrew date\n", + "Parse error on PGPID 34349 19 Ramaḍān 504/29 Ramaḍān 504 (Islamic): Could not parse '19 Ramaḍān 504/29 Ramaḍān 504' as an Islamic date\n", + "ignoring missing digits for now 146[.]\n", + "Parse error on PGPID 34598 4982–83 (Hebrew): Could not parse '4982–83' as a Hebrew date\n", + "Parse error on PGPID 34740 29 June 1922 (Hebrew): Could not parse '29 June 1922' as a Hebrew date\n", + "Parse error on PGPID 34880 Adar I and Adar II 5453 (Hebrew): Could not parse 'Adar I and Adar II 5453' as a 
Hebrew date\n", + "Parse error on PGPID 35014 Nisan–Sivan 5585 (Hebrew): Could not parse 'Nisan–Sivan 5585' as a Hebrew date\n", + "Parse error on PGPID 35018 5008-5010 (Hebrew): Could not parse '5008-5010' as a Hebrew date\n", + "Could not parse 5008-5010 as ISO date: bad month number 5010; must be 1-12\n", + "Parse error on PGPID 35020 4962/4964 (Hebrew): Could not parse '4962/4964' as a Hebrew date\n", + "Parse error on PGPID 35022 4790/4791 (Hebrew): Could not parse '4790/4791' as a Hebrew date\n", + "Parse error on PGPID 35024 4728–31 (Hebrew): Could not parse '4728–31' as a Hebrew date\n", + "ignoring missing digits for now 42[.]\n", + "Parse error on PGPID 35117 tenth Ṣafar 728 (Islamic): Could not parse 'tenth Ṣafar 728' as an Islamic date\n", + "Parse error on PGPID 35181 1390-10-23 (Seleucid): Could not parse '1390-10-23' as a Hebrew date\n", + "parsed 1390-10-23 with ISO8601 format and calendar Seleucid, result is 1390-10-23 (1079-01-06/1079-01-06)\n", + "Parse error on PGPID 35272 411/427 (Islamic): Could not parse '411/427' as an Islamic date\n", + "ignoring missing digits for now 41[.]\n", + "Parse error on PGPID 35464 Nisan/Iyyar 1529 (Seleucid): Could not parse 'Nisan/Iyyar 1529' as a Hebrew date\n", + "Parse error on PGPID 35476 Dhū l-Qiʿda 614 (Islamic): Could not parse 'Dhū l-Qiʿda 614' as an Islamic date\n", + "Could not parse Dhū l-Qiʿda 614 as ISO date: invalid literal for int() with base 10: 'Dhū l'\n", + "Parse error on PGPID 35744 Rabīʿ I 565 (Islamic): Could not parse 'Rabīʿ I 565' as an Islamic date\n", + "Parse error on PGPID 35834 550-1-19 (Islamic): Could not parse '550-1-19' as an Islamic date\n", + "parsed 550-1-19 with ISO8601 format and calendar Islamic, result is 0550-01-19 (1155-04-01/1155-04-01)\n", + "ignoring missing digits for now 12 Muḥarram 52[.]\n", + "Parse error on PGPID 36005 502/503 (Islamic): Could not parse '502/503' as an Islamic date\n", + "Parse error on PGPID 36210 Kislev/Tevet 5460 (Hebrew): Could not parse 
'Kislev/Tevet 5460' as a Hebrew date\n", + "Parse error on PGPID 36234 660-05 (Islamic): Could not parse '660-05' as an Islamic date\n", + "parsed 660-05 with ISO8601 format and calendar Islamic, result is 0660-05 (1262-03-31/1262-04-29)\n", + "ignoring missing digits for now 54[.]\n", + "Parse error on PGPID 36771 545/547 (Islamic): Could not parse '545/547' as an Islamic date\n", + "Parse error on PGPID 37068 Dhū l-Qaʿda 579 / Muḥarram 580 (Islamic): Could not parse 'Dhū l-Qaʿda 579 / Muḥarram 580' as an Islamic date\n", + "Parse error on PGPID 37875 525–26 (Islamic): Could not parse '525–26' as an Islamic date\n", + "ignoring missing digits for now 14[...]\n", + "Parse error on PGPID 38380 Dhū al-Qada 508 (Islamic): Could not parse 'Dhū al-Qada 508' as an Islamic date\n", + "Could not parse Dhū al-Qada 508 as ISO date: invalid literal for int() with base 10: 'Dhū al'\n", + "Parse error on PGPID 38508 Dhū l-Qaʿda 947 – Shawwāl 947 (Islamic): Could not parse 'Dhū l-Qaʿda 947 – Shawwāl 947' as an Islamic date\n", + "Could not parse Dhū l-Qaʿda 947 – Shawwāl 947 as ISO date: invalid literal for int() with base 10: 'Dhū l'\n", + "Parse error on PGPID 39016 Rabiʿ II 448 (Islamic): Could not parse 'Rabiʿ II 448' as an Islamic date\n", + "Parse error on PGPID 40044 Thursday 6 Nissan 1409 (Seleucid): Could not parse 'Thursday 6 Nissan 1409' as a Hebrew date\n", + "Parse error on PGPID 40046 Monday 3 Nissan 1410 (Seleucid): Could not parse 'Monday 3 Nissan 1410' as a Hebrew date\n", + "Parse error on PGPID 40063 Sunday Elul 1 1409 (Seleucid): Could not parse 'Sunday Elul 1 1409' as a Hebrew date\n", + "Parse error on PGPID 40140 4 Rabīʿ II 1106 (Islamic): Could not parse '4 Rabīʿ II 1106' as an Islamic date\n", + "ignoring missing digits for now 49[.]\n", + "ignoring missing digits for now [4]82[.]\n", + "Parse error on PGPID 40646 19TH Rabīʿ II 991 (Islamic): Could not parse '19TH Rabīʿ II 991' as an Islamic date\n", + "Parse error on PGPID 40662 20 Rabīʿ al-Awwal 1070 
# %%
# parse hijri, anno mundi, and seleucid dates as undates

import re

from lark.exceptions import UnexpectedEOF

# NOTE(review): `Undate` and `VisitError` are used below but are not imported
# in this cell; presumably an earlier notebook cell brings them into scope
# -- confirm.

# PGP calendar label -> calendar name passed to Undate.parse
_UNDATE_CALENDARS = {
    "Anno Mundi": "Hebrew",
    "Hijrī": "Islamic",
    # handle seleucid as hebrew with offset (adapt from pgp code)
    "Seleucid": "Seleucid",
}

# one or more unknown digits, e.g. [.] or [..]
_MISSING_DIGITS = re.compile(r"\[(\.+)\]")

# brackets/parentheses that mark inferred numbers
_BRACKETS = str.maketrans("", "", "[]()")

# Unsupported modifiers, removed case-insensitively and in this order:
# Late Tevet 4903, Last decade of Kislev 5004, first third of ...
# The specific phrases ("last day", "first 10 days") must come before the
# generic first/middle/last pattern, which would otherwise consume their
# leading word and leave an unparseable "day ..." / "10 days ..." behind.
_MODIFIER_PATTERNS = [
    re.compile(pattern, flags=re.I)
    for pattern in (
        r"\bLate ",
        r"\blast day\b",
        r"\b(first|middle|last) (10|ten) days\b",
        # the separating space is part of the optional group so that
        # "half", "decade" and "tenth" are matched as well as "third"
        r"\b(first|middle|last)( (third|half|decade|tenth))? (of )?",
        r"(Beginning|end) of ",
        # some dates include of, e.g. day of month
        " of",
        "spring",
        "decade ",
        "night, ",
    )
]

# ordinal suffixes on day numbers (1st, 2nd, 19TH, ...)
_ORDINAL_SUFFIX = re.compile(r"(\d+)(st|nd|rd|th)\b", flags=re.I)


def _clean_original_date(value):
    """Strip notation the date parser does not support from a date string.

    :param value: original date text, e.g. ``"Last decade of Kislev 5004"``
    :returns: the simplified string handed to the parser
    """
    # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]
    # ... parser doesn't support this, but undate DOES
    if "[." in value:
        print(f"ignoring missing digits for now {value}")
        # each unknown digit becomes a zero placeholder
        value = _MISSING_DIGITS.sub(lambda m: "0" * len(m.group(1)), value)

    # some dates have inferred numbers, e.g. Friday, [25] Nisan [4810] or
    # 8 Elul (4)811; for now, just strip out brackets before parsing
    # (in future, could potentially infer uncertainty based on these)
    value = value.translate(_BRACKETS)

    # remove unsupported modifiers
    for pattern in _MODIFIER_PATTERNS:
        value = pattern.sub("", value)
    # removing a leading modifier can leave stray whitespace behind
    value = value.strip()

    # there are a handful of misspelled wednesdays... and a Thrusday
    value = value.replace("Wedensday", "Wednesday")
    value = value.replace("Thrusday", "Thursday")

    # three Hebrew calendar dates include text "AM" at end; at least one AH date
    if value.endswith((" AM", " AH")):
        value = value[:-3]
    if value.endswith("."):  # strip off trailing period
        value = value[:-1]

    # about 62 have ordinals; strip them out
    return _ORDINAL_SUFFIX.sub(r"\1", value)


def parse_original_date(row):
    """Parse a row's original-calendar date.

    :param row: object with ``pgpid``, ``doc_date_original`` and
        ``doc_date_calendar`` attributes (here, a dataframe row)
    :returns: the result of ``Undate.parse`` for the cleaned date text, or
        ``None`` when the calendar is not Anno Mundi / Hijrī / Seleucid,
        the date is not a string, or the text could not be parsed
    """
    # print(f"PGPID {row.pgpid} {row.doc_date_original} ({row.doc_date_calendar})")
    undate_calendar = _UNDATE_CALENDARS.get(row.doc_date_calendar)
    if undate_calendar is None:
        return None

    value = row.doc_date_original
    # a missing original date (e.g. NaN) cannot be cleaned or parsed
    if not isinstance(value, str):
        return None
    value = _clean_original_date(value)

    try:
        return Undate.parse(value, undate_calendar)
    except (VisitError, ValueError, UnexpectedEOF) as err:
        print(f"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}")

    # there are a handful of cases in PGP where calendars are mixed,
    # i.e. hebrew months used for hijri calendar

    # some dates are entered in ISO format for another calendar; can we parse and set calendar?
    if "-" in value and "/" not in value:  # exclude intervals for now
        try:
            parsed = Undate.parse(value, "ISO8601")
            if parsed:
                parsed = parsed.as_calendar(undate_calendar)
                print(f"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})")
                return parsed
        except ValueError as err:
            print(f"Could not parse {value} as ISO date: {err}")

    return None


docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)

# %%
# how many hebrew/islamic dates were parsed / not parsed?

orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()

orig_dates_unparsed = docs_with_docdate[
    docs_with_docdate.doc_date_original.notna()
    & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid'])
    & docs_with_docdate.undate_orig.isna()
]

total_parsed = len(orig_dates_parsed)
total_unparsed = len(orig_dates_unparsed)
print(f"""original dates parsed: {total_parsed}
original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)
proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%""")
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precision
54491570Seleucid125912591570year
1646319 Adar 1427Seleucid1116-03-051116-03-051427-12-19day
234721337Seleucid1025-08-28/1026-09-141025-08-28/1026-09-141337year
41499Wednesday, 15 Kislev 1500Seleucid1188-12-071188-12-071500-09-15day
43502Tevet 1548Seleucid1236-11-30/1236-12-281236-11-30/1236-12-281548-10month
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5 449 1570 Seleucid 1259 \n", + "16 463 19 Adar 1427 Seleucid 1116-03-05 \n", + "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n", + "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n", + "43 502 Tevet 1548 Seleucid 1236-11-30/1236-12-28 \n", + "\n", + " undate undate_orig orig_date_precision \n", + "5 1259 1570 year \n", + "16 1116-03-05 1427-12-19 day \n", + "23 1025-08-28/1026-09-14 1337 year \n", + "41 1188-12-07 1500-09-15 day \n", + "43 1236-11-30/1236-12-28 1548-10 month " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what is the date granularity of the dates we were able to parse?\n", + "\n", + "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n", + "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orig_date_precision\n", + "day 1566\n", + "month 1013\n", + "year 822\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# this is skewed because of the kinds of dates we're not able to parse or modifiers we're omitting entirely\n", + "orig_dates_parsed.orig_date_precision.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5d3a55b0-ed36-47ba-b022-848bb128b449", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precision
54491570Seleucid125912591570year
1646319 Adar 1427Seleucid1116-03-051116-03-051427-12-19day
234721337Seleucid1025-08-28/1026-09-141025-08-28/1026-09-141337year
41499Wednesday, 15 Kislev 1500Seleucid1188-12-071188-12-071500-09-15day
43502Tevet 1548Seleucid1236-11-30/1236-12-281236-11-30/1236-12-281548-10month
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5 449 1570 Seleucid 1259 \n", + "16 463 19 Adar 1427 Seleucid 1116-03-05 \n", + "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n", + "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n", + "43 502 Tevet 1548 Seleucid 1236-11-30/1236-12-28 \n", + "\n", + " undate undate_orig orig_date_precision \n", + "5 1259 1570 year \n", + "16 1116-03-05 1427-12-19 day \n", + "23 1025-08-28/1026-09-14 1337 year \n", + "41 1188-12-07 1500-09-15 day \n", + "43 1236-11-30/1236-12-28 1548-10 month " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check on the seleucid date parsing\n", + "\n", + "orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "8907d1fc-b87f-4173-8759-74c07fa70dca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " original: 1570 undate:1570 pgp standard 1259 earliest:1258-09-07 latest:1259-09-26\n", + " original: 19 Adar 1427 undate:1427-12-19 pgp standard 1116-03-05 earliest:1116-03-12 latest:1116-03-12\n", + " original: 1337 undate:1337 pgp standard 1025-08-28/1026-09-14 earliest:1025-09-03 latest:1026-09-20\n", + " original: Wednesday, 15 Kislev 1500 undate:1500-09-15 pgp standard 1188-12-07 earliest:1188-12-14 latest:1188-12-14\n", + " original: Tevet 1548 undate:1548-10 pgp standard 1236-11-30/1236-12-28 earliest:1236-12-07 latest:1237-01-04\n", + " original: Elul 1428 undate:1428-06 pgp standard 1117-08-01/1117-08-29 earliest:1117-08-08 latest:1117-09-05\n", + " original: First decade of Ḥeshvan 1442 undate:1442-08 pgp standard 1130 earliest:1130-10-13 latest:1130-11-10\n", + " original: Ḥeshvan 1453 undate:1453-08 pgp standard 
1141 earliest:1141-10-11 latest:1141-11-08\n", + " original: Sunday, 21 Kislev 1355 undate:1355-09-21 pgp standard 1043-11-26 earliest:1043-12-02 latest:1043-12-02\n", + " original: Monday, 16 Tammuz 1540 undate:1540-04-16 pgp standard 1229-07-09 earliest:1229-07-16 latest:1229-07-16\n" + ] + } + ], + "source": [ + "for row in orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'][:10].itertuples():\n", + " print(f\" original: {row.doc_date_original} undate:{row.undate_orig} pgp standard {row.doc_date_standard} earliest:{row.undate_orig.earliest} latest:{row.undate_orig.latest}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "48142841-e030-4009-af11-6cbc936fd7bf", + "metadata": {}, + "outputs": [], + "source": [ + "# check calendar agreement, how many were wrong?\n", + "\n", + "calendar_mapping = {\n", + " \"hebrew\": \"Anno Mundi\",\n", + " \"islamic\": \"Hijrī\",\n", + " \"seleucid\": \"Seleucid\"\n", + "}\n", + "\n", + "orig_dates_parsed['undate_calendar'] = orig_dates_parsed.undate_orig.apply(lambda x: calendar_mapping.get(x.calendar, x.calendar))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "0719fcbe-8a87-4fe9-a0b8-66a4b19f5d39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
undateundate_calendardoc_date_calendar
350631794-09-25/1795-09-13Anno MundiAnno Mundi
350701755-09-06/1756-09-24Anno MundiAnno Mundi
350711519-10-09HijrīHijrī
350721563-04-05HijrīHijrī
350731563-04-25HijrīHijrī
\n", + "
" + ], + "text/plain": [ + " undate undate_calendar doc_date_calendar\n", + "35063 1794-09-25/1795-09-13 Anno Mundi Anno Mundi\n", + "35070 1755-09-06/1756-09-24 Anno Mundi Anno Mundi\n", + "35071 1519-10-09 Hijrī Hijrī\n", + "35072 1563-04-05 Hijrī Hijrī\n", + "35073 1563-04-25 Hijrī Hijrī" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# which records appear to have mismatched original calendars? (i.e. can be parsed by the opposite parser)\n", + "# only 4! \n", + "# PGPIDs 3637, 5902, 6058, 9198\n", + "\n", + "# however, looking at the PGP records indicates sometimes the authors mixed hebrew and arabic months\n", + "# from description of PGPID 3637: [It is unusual but not unheard of to combine Hebrew months with the Hijrī calendar.] \n", + "\n", + "orig_dates_parsed[['undate', 'undate_calendar', 'doc_date_calendar']].tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a104d772-6c2c-4711-91ec-8cf1f108ae23", + "metadata": {}, + "outputs": [], + "source": [ + "# can we sort by parsed original dates? 
\n", + "# doesn't work currently because of overlapping dates / different granularity\n", + "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629", + "metadata": {}, + "outputs": [], + "source": [ + "# set earliest/latest for graphing\n", + "\n", + "# IMPORTANT: we have to cast type to something pandas/altair supports\n", + "\n", + "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n", + "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n", + "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orig_date_earliestorig_date_latestorig_date_midpgpiddoc_date_calendar
51258-09-071259-09-261259-03-18449Seleucid
161116-03-121116-03-121116-03-12463Seleucid
231025-09-031026-09-201026-03-13472Seleucid
411188-12-141188-12-141188-12-14499Seleucid
431236-12-071237-01-041236-12-21502Seleucid
471117-08-081117-09-051117-08-22506Seleucid
551130-10-131130-11-101130-10-27516Seleucid
611035-05-281035-05-281035-05-28524Anno Mundi
621034-08-251034-09-221034-09-08525Hijrī
731141-10-111141-11-081141-10-25537Seleucid
\n", + "
" + ], + "text/plain": [ + " orig_date_earliest orig_date_latest orig_date_mid pgpid doc_date_calendar\n", + "5 1258-09-07 1259-09-26 1259-03-18 449 Seleucid\n", + "16 1116-03-12 1116-03-12 1116-03-12 463 Seleucid\n", + "23 1025-09-03 1026-09-20 1026-03-13 472 Seleucid\n", + "41 1188-12-14 1188-12-14 1188-12-14 499 Seleucid\n", + "43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n", + "47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n", + "55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n", + "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n", + "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī\n", + "73 1141-10-11 1141-11-08 1141-10-25 537 Seleucid" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a8cc1025-0334-44b9-90e6-13ddb30fec31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orig_date_earliest datetime64[s]\n", + "orig_date_latest datetime64[s]\n", + "orig_date_mid datetime64[s]\n", + "dtype: object" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid']].dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b0299c8d-a113-4918-bd04-57c00b233d21", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpidorig_date_earliestorig_date_latestorig_date_middoc_date_originaldoc_date_calendardoc_date_standardorig_date_precision
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [pgpid, orig_date_earliest, orig_date_latest, orig_date_mid, doc_date_original, doc_date_calendar, doc_date_standard, orig_date_precision]\n", + "Index: []" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import datetime\n", + "\n", + "# are these data errors?\n", + "\n", + "orig_dates_parsed[orig_dates_parsed.orig_date_earliest > Undate(2100).earliest][['pgpid', 'orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'orig_date_precision']]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import altair as alt\n", + "\n", + "# exclude dates after 2100\n", + "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'pgpid', 'doc_date_calendar']][orig_dates_parsed.orig_date_earliest < Undate(2100).earliest]\n", + "\n", + "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n", + " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n", + " x2='orig_date_latest:T',\n", + " y=alt.Y('count(pgpid)', title='Count of Documents')\n", + ").properties(width=1200, height=200)\n", + "\n", + "earliest_chart = bar_chart.mark_point(opacity=0.2, color=\"green\", interpolate=\"monotone\").encode(\n", + " x=alt.X('orig_date_earliest:T', title=\"Date (earliest)\"), # axis=alt.Axis(format=\"r\")),\n", + " y=alt.Y('count(pgpid)', title='Count of Documents')\n", + ").properties(width=1200, height=200)\n", + "\n", + "latest_chart = bar_chart.mark_point(opacity=0.2, color=\"blue\", interpolate=\"monotone\").encode(\n", + " x=alt.X('orig_date_latest:T', title=\"Date (latest)\"), # axis=alt.Axis(format=\"r\")),\n", + " y=alt.Y('count(pgpid)', title='Count of Documents')\n", + ").properties(width=1200, height=200)\n", + "\n", + "# (bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\")\n", + "(bar_chart & (latest_chart + earliest_chart)).interactive()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b1fce94f-8f52-4b56-b88f-3575c3ebf2b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# plot points for all the documents with date, using jitter to scatter them vertically\n", + "\n", + "jitter_plot = alt.Chart(graphable_data).mark_circle(size=8, opacity=0.5).encode(\n", + " x=\"orig_date_earliest:T\", # maybe could eventually use jitter to plot between earliest/latest\n", + " y=alt.Y(\"jitter:Q\", title=\"\").axis(None),\n", + " color=alt.Color('doc_date_calendar:N', title=\"Calendar\") #.legend(None)\n", + ").transform_calculate(\n", + " # Generate Gaussian jitter with a Box-Muller transform\n", + " jitter=\"sqrt(-2*log(random()))*cos(2*PI*random())\"\n", + ").properties(width=1200, height=200)\n", + "\n", + "\n", + "jitter_plot \n" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "id": "f8439e45-e9b0-4eba-8b43-6a87cb8d3b9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 225, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(bar_chart & jitter_plot).interactive()" + ] + }, + { + "cell_type": "markdown", + "id": "951d92ea-4689-481c-8590-324b782a7a1c", + "metadata": {}, + "source": [ + "## compare weekdays" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "3122a874-bb17-429f-993f-4bf7a76c1a36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precision
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28day
18352550Monday night, 5 Av 1443Seleucid113211321443-05-05day
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25day
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-29day
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29day
........................
2930934623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20day
2993035264Wednesday 13 Ṭevet 1526Seleucid1214/12151214/12151526-10-13day
3401639564Monday 16 Tevet 1339Seleucid1027-12-181027-12-181339-10-16day
3447440035Monday 1st Iyyar 1437Seleucid1126-04-261126-04-261437-02-01day
3447540036Friday 15 of Adar 1443Seleucid1132-03-041132-03-041443-12-15day
\n", + "

104 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1835 2550 Monday night, 5 Av 1443 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "... ... ... ... \n", + "29309 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29930 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "34016 39564 Monday 16 Tevet 1339 Seleucid \n", + "34474 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34475 40036 Friday 15 of Adar 1443 Seleucid \n", + "\n", + " doc_date_standard undate undate_orig orig_date_precision \n", + "851 1270 1270 1581-03-28 day \n", + "1835 1132 1132 1443-05-05 day \n", + "1929 1133 1133 1444-09-25 day \n", + "2013 1043-09-07 1043-09-07 1354-06-29 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "... ... ... ... ... \n", + "29309 1266/1267 1266/1267 1578-10-20 day \n", + "29930 1214/1215 1214/1215 1526-10-13 day \n", + "34016 1027-12-18 1027-12-18 1339-10-16 day \n", + "34474 1126-04-26 1126-04-26 1437-02-01 day \n", + "34475 1132-03-04 1132-03-04 1443-12-15 day \n", + "\n", + "[104 rows x 7 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'orig_date_precision']]\n", + "weekday_dates" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "993c4f4a-4364-42ad-8927-145458f0e538", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orig_date_precision\n", + "day 104\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates.orig_date_precision.value_counts()" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "3e4ea50c-b11c-433b-b6f9-691098b057d3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precisionundate_weekdayundate_weekday_nameorig_weekday
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28day3ThursdayThursday
18352550Monday night, 5 Av 1443Seleucid113211321443-05-05day1TuesdayTuesday
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25day0MondayMonday
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-29day2WednesdayWednesday
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29day3ThursdayThursday
.................................
2930934623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20day0MondayMonday
2993035264Wednesday 13 Ṭevet 1526Seleucid1214/12151214/12151526-10-13day2WednesdayWednesday
3401639564Monday 16 Tevet 1339Seleucid1027-12-181027-12-181339-10-16day0MondayMonday
3447440035Monday 1st Iyyar 1437Seleucid1126-04-261126-04-261437-02-01day0MondayMonday
3447540036Friday 15 of Adar 1443Seleucid1132-03-041132-03-041443-12-15day4FridayFriday
\n", + "

104 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1835 2550 Monday night, 5 Av 1443 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "... ... ... ... \n", + "29309 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29930 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "34016 39564 Monday 16 Tevet 1339 Seleucid \n", + "34474 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34475 40036 Friday 15 of Adar 1443 Seleucid \n", + "\n", + " doc_date_standard undate undate_orig orig_date_precision \\\n", + "851 1270 1270 1581-03-28 day \n", + "1835 1132 1132 1443-05-05 day \n", + "1929 1133 1133 1444-09-25 day \n", + "2013 1043-09-07 1043-09-07 1354-06-29 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "... ... ... ... ... \n", + "29309 1266/1267 1266/1267 1578-10-20 day \n", + "29930 1214/1215 1214/1215 1526-10-13 day \n", + "34016 1027-12-18 1027-12-18 1339-10-16 day \n", + "34474 1126-04-26 1126-04-26 1437-02-01 day \n", + "34475 1132-03-04 1132-03-04 1443-12-15 day \n", + "\n", + " undate_weekday undate_weekday_name orig_weekday \n", + "851 3 Thursday Thursday \n", + "1835 1 Tuesday Tuesday \n", + "1929 0 Monday Monday \n", + "2013 2 Wednesday Wednesday \n", + "3257 3 Thursday Thursday \n", + "... ... ... ... 
\n", + "29309 0 Monday Monday \n", + "29930 2 Wednesday Wednesday \n", + "34016 0 Monday Monday \n", + "34474 0 Monday Monday \n", + "34475 4 Friday Friday \n", + "\n", + "[104 rows x 10 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n", + "\n", + "# get numeric weekday\n", + "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n", + "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n", + "# extract weekday from date label\n", + "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n", + "# correct misspellings\n", + "misspelled_days = {\n", + " \"Wedensday\": \"Wednesday\",\n", + " \"Thrusday\": \"Thursday\",\n", + "}\n", + "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n", + "\n", + "# shift night to next day, e.g. Wednesday night should be Thursday\n", + "# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n", + "def next_day(weekday):\n", + " return days[(days.index(weekday) +1) % 7]\n", + "\n", + "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n", + "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]\n", + "\n", + "weekday_dates" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precisionundate_weekdayundate_weekday_nameorig_weekday
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28day3ThursdayThursday
18352550Monday night, 5 Av 1443Seleucid113211321443-05-05day1TuesdayTuesday
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25day0MondayMonday
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29day3ThursdayThursday
55117237Tuesday night, 22 Kislev 1435Seleucid1123-12-121123-12-121435-09-22day2WednesdayWednesday
58547637Monday night, 29 Ṭevet 1438Seleucid112711271438-10-29day4FridayTuesday
58577642Thursday night, 23 Tammuz 1538Seleucid1227-07-091227-07-091538-04-23day4FridayFriday
64198332Friday night, 20 Iyar 4957Anno Mundi1197-051197-054957-02-20day5SaturdaySaturday
2930934623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20day0MondayMonday
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1835 2550 Monday night, 5 Av 1443 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n", + "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n", + "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n", + "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n", + "29309 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "\n", + " doc_date_standard undate undate_orig orig_date_precision \\\n", + "851 1270 1270 1581-03-28 day \n", + "1835 1132 1132 1443-05-05 day \n", + "1929 1133 1133 1444-09-25 day \n", + "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "5511 1123-12-12 1123-12-12 1435-09-22 day \n", + "5854 1127 1127 1438-10-29 day \n", + "5857 1227-07-09 1227-07-09 1538-04-23 day \n", + "6419 1197-05 1197-05 4957-02-20 day \n", + "29309 1266/1267 1266/1267 1578-10-20 day \n", + "\n", + " undate_weekday undate_weekday_name orig_weekday \n", + "851 3 Thursday Thursday \n", + "1835 1 Tuesday Tuesday \n", + "1929 0 Monday Monday \n", + "3257 3 Thursday Thursday \n", + "5511 2 Wednesday Wednesday \n", + "5854 4 Friday Tuesday \n", + "5857 4 Friday Friday \n", + "6419 5 Saturday Saturday \n", + "29309 0 Monday Monday " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates[weekday_dates.doc_date_original.str.contains(\" night\")]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "44 matches, 60 mismatches (42.31%)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precisionundate_weekdayundate_weekday_nameorig_weekday
52716947Monday 3 Iyyar 1740Seleucid1429-04-071429-04-071740-02-03day3ThursdayMonday
58547637Monday night, 29 Ṭevet 1438Seleucid112711271438-10-29day4FridayTuesday
864911227Monday 24 Jumādā I 517Hijrī1123-07-201123-07-200517-05-24day4FridayMonday
1640019649Thursday 26 Iyyar 5306Anno Mundi1546-04-281546-04-285306-02-26day2WednesdayThursday
1772821094Saturday 20 Rajab 550Hijrī1155-09-191155-09-190550-07-20day0MondaySaturday
2310527479Tuesday 11 Tammuz 5525Anno Mundi1765-06-301765-06-305525-04-11day6SundayTuesday
2311027484Friday 20th Shevat 5405Anno Mundi164516455405-11-20day3ThursdayFriday
2311127485Sunday 22 Adar 5590Anno Mundi1830-03-171830-03-175590-12-22day2WednesdaySunday
2311327487Thursday 15th Shevat 5450Anno Mundi169016905450-11-15day2WednesdayThursday
2311527489Sunday 6 Nisan 5528Anno Mundi1768-03-241768-03-245528-01-06day3ThursdaySunday
2311627490Thursday 19th Elul 5428Anno Mundi166816685428-06-19day6SundayThursday
2311727491Tuesday 1 Kislev 5507Anno Mundi1746-11-141746-11-145507-09-01day0MondayTuesday
2312227496Sunday 28 Elul 5511Anno Mundi1751-09-181751-09-185511-06-28day5SaturdaySunday
2312327497Sunday 17th Sivan 5423Anno Mundi166316635423-03-17day4FridaySunday
2312427498Sunday 25th Tevet 5409Anno Mundi164816485409-10-25day5SaturdaySunday
2312627500Thursday 4 Sivan 5516Anno Mundi1756-06-021756-06-025516-03-04day2WednesdayThursday
2313327507Sunday 25 Sivan 5556Anno Mundi1796-07-011796-07-015556-03-25day4FridaySunday
2313727511Wednesday 28th Tevet 5399Anno Mundi164016405399-10-28day1TuesdayWednesday
2314127515Monday 15th Iyyar 5414Anno Mundi165416545414-02-15day5SaturdayMonday
2314227516Thursday 24 Nisan 5481Anno Mundi1721-04-211721-04-215481-01-24day0MondayThursday
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n", + "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n", + "8649 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n", + "16400 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n", + "17728 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n", + "23105 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n", + "23110 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n", + "23111 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n", + "23113 27487 Thursday 15th Shevat 5450 Anno Mundi 1690 \n", + "23115 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n", + "23116 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n", + "23117 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n", + "23122 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n", + "23123 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n", + "23124 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n", + "23126 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n", + "23133 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n", + "23137 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n", + "23141 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n", + "23142 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n", + "\n", + " undate undate_orig orig_date_precision undate_weekday \\\n", + "5271 1429-04-07 1740-02-03 day 3 \n", + "5854 1127 1438-10-29 day 4 \n", + "8649 1123-07-20 0517-05-24 day 4 \n", + "16400 1546-04-28 5306-02-26 day 2 \n", + "17728 1155-09-19 0550-07-20 day 0 \n", + "23105 1765-06-30 5525-04-11 day 6 \n", + "23110 1645 5405-11-20 day 3 \n", + "23111 1830-03-17 5590-12-22 day 2 \n", + "23113 1690 5450-11-15 day 2 \n", + "23115 1768-03-24 5528-01-06 day 3 \n", + "23116 1668 5428-06-19 day 6 \n", + "23117 1746-11-14 5507-09-01 day 0 \n", + "23122 1751-09-18 5511-06-28 day 5 \n", + "23123 1663 5423-03-17 day 4 \n", + "23124 1648 5409-10-25 day 5 \n", + "23126 
1756-06-02 5516-03-04 day 2 \n", + "23133 1796-07-01 5556-03-25 day 4 \n", + "23137 1640 5399-10-28 day 1 \n", + "23141 1654 5414-02-15 day 5 \n", + "23142 1721-04-21 5481-01-24 day 0 \n", + "\n", + " undate_weekday_name orig_weekday \n", + "5271 Thursday Monday \n", + "5854 Friday Tuesday \n", + "8649 Friday Monday \n", + "16400 Wednesday Thursday \n", + "17728 Monday Saturday \n", + "23105 Sunday Tuesday \n", + "23110 Thursday Friday \n", + "23111 Wednesday Sunday \n", + "23113 Wednesday Thursday \n", + "23115 Thursday Sunday \n", + "23116 Sunday Thursday \n", + "23117 Monday Tuesday \n", + "23122 Saturday Sunday \n", + "23123 Friday Sunday \n", + "23124 Saturday Sunday \n", + "23126 Wednesday Thursday \n", + "23133 Friday Sunday \n", + "23137 Tuesday Wednesday \n", + "23141 Saturday Monday \n", + "23142 Monday Thursday " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many match?\n", + "matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n", + "\n", + "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n", + "\n", + "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n", + "mismatches.head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "d6476907-1628-4d68-ab1f-43c95e123707", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Anno Mundi 55\n", + "Seleucid 3\n", + "Hijrī 2\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mismatches.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "18b71d18-5d5b-4f92-8801-499bcf412efe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orig_weekday\n", + "Wednesday 17\n", + 
"Sunday 12\n", + "Monday 10\n", + "Thursday 9\n", + "Tuesday 7\n", + "Friday 4\n", + "Saturday 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mismatches.orig_weekday.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 mismatches that include text 'night'\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origorig_date_precisionundate_weekdayundate_weekday_nameorig_weekday
58547637Monday night, 29 Ṭevet 1438Seleucid112711271438-10-29day4FridayTuesday
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n", + "\n", + " undate undate_orig orig_date_precision undate_weekday \\\n", + "5854 1127 1438-10-29 day 4 \n", + "\n", + " undate_weekday_name orig_weekday \n", + "5854 Friday Tuesday " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many mismatches are due to night?\n", + "night_mismatches = mismatches[mismatches.doc_date_original.str.contains(\" night\")]\n", + "print(f\"{len(night_mismatches)} mismatches that include text 'night'\")\n", + "night_mismatches" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ece780b8-2eb2-4cbc-9195-27def665f7fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# plot frequency by day, just for fun\n", + "\n", + "# get numeric weekday\n", + "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n", + "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n", + "\n", + "# restrict to dates with day precision; the rest are just using earliest day\n", + "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n", + "\n", + "\n", + "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " alt.Color('count(pgpid)', title='# of documents')\n", + ").properties(title='document frequency by weekday')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "6b2f24de-18ce-4f40-b300-e8cc334a338c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "undate_weekday_name\n", + "Monday 300\n", + "Thursday 280\n", + "Tuesday 233\n", + "Sunday 223\n", + "Wednesday 223\n", + "Friday 211\n", + "Saturday 96\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_days.undate_weekday_name.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundateundate_origcentury
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-281200s
18352550Monday night, 5 Av 1443Seleucid113211321443-05-051100s
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-251100s
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-291000s
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-291200s
\n", + "
" + ], + "text/plain": [ + " pgpid doc_date_original doc_date_calendar \\\n", + "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1835 2550 Monday night, 5 Av 1443 Seleucid \n", + "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "\n", + " doc_date_standard undate undate_orig century \n", + "851 1270 1270 1581-03-28 1200s \n", + "1835 1132 1132 1443-05-05 1100s \n", + "1929 1133 1133 1444-09-25 1100s \n", + "2013 1043-09-07 1043-09-07 1354-06-29 1000s \n", + "3257 1229-09-18 1229-09-18 1541-07-29 1200s " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get rough century (gregorian calendar)\n", + "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: f\"{(\"%04d\" % x.earliest.year)[:2]}00s\")\n", + "\n", + "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate', 'undate_orig', 'century']].head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "2c07d56f-552a-4d2c-9c18-0b78f056ccf6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "century\n", + "1700s 48\n", + "1600s 19\n", + "1100s 11\n", + "1800s 9\n", + "1200s 6\n", + "1000s 5\n", + "1500s 4\n", + "0900s 1\n", + "1400s 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates.century.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "eb99871e-d9a5-4211-9bd2-5a9acfe8face", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " alt.Y('century'),\n", + " alt.Color('count(pgpid)')\n", + ").properties(title='document frequency by weekday and century')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "08a58fcf-2b08-441b-9dc8-385bafeb88e6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what about heat map by month?\n", + "\n", + "\n", + "# get numeric month\n", + "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n", + "# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n", + "\n", + "has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n", + "#orig_dates_months = [\n", + "\n", + "\n", + "alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n", + " alt.X('undate_month', title='month'),\n", + " alt.Color('count(pgpid)', title='# of documents')\n", + ").facet(\n", + " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", + ").properties(title='Document frequency by month and calendar')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "a7a16c53-6f01-4457-9458-4fcf80a35c51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Seleucid 1183\n", + "Anno Mundi 888\n", + "Hijrī 508\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "has_month.doc_date_calendar.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "65bce74e-67b7-48df-9f7f-a6f264af4f11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1566, 39)" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orig_dates_days[orig_dates_days.undate_weekday_name.notna()].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "ac940883-e00e-4dde-8339-95a1b733f6f3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_89288/2470126649.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# weekday frequency by month?\n", + "\n", + "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n", + "\n", + "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " alt.Y('undate_month', title=\"month\"),\n", + " alt.Color('count(pgpid)')\n", + ").facet(\n", + " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", + ").properties(title='Document frequency by weekday and month (1,557 documents)')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "35f1ff65-f726-4817-8312-a08198956343", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.FacetChart(...)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n", + "\n", + "# alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n", + "# alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + "# alt.Y('undate_month', title=\"month\"),\n", + "# alt.Color('count(pgpid)')\n", + "# ).facet(\n", + "# column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", + "# ).properties(title='document frequency by weekday and month')\n", + "\n", + "\n", + "\n", + "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n", + " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", + " # alt.Y('doc_date_calendar'),\n", + " alt.Color('count(pgpid)')\n", + ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", + ").properties(title='document frequency by weekday')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "65897b9d-2399-434a-9a6c-e08f58510848", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doc_date_calendar\n", + "Anno Mundi 82\n", + "Seleucid 20\n", + "Hijrī 2\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weekday_dates.doc_date_calendar.value_counts()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } 
+ }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py index a43a270..5836b2f 100644 --- a/src/undate/converters/calendars/__init__.py +++ b/src/undate/converters/calendars/__init__.py @@ -1,5 +1,11 @@ from undate.converters.calendars.gregorian import GregorianDateConverter from undate.converters.calendars.hebrew import HebrewDateConverter from undate.converters.calendars.islamic import IslamicDateConverter +from undate.converters.calendars.seleucid import SeleucidDateConverter -__all__ = ["GregorianDateConverter", "HebrewDateConverter", "IslamicDateConverter"] +__all__ = [ + "GregorianDateConverter", + "HebrewDateConverter", + "IslamicDateConverter", + "SeleucidDateConverter", +] diff --git a/src/undate/converters/calendars/hebrew/hebrew.lark b/src/undate/converters/calendars/hebrew/hebrew.lark index b55ec3f..6f4244c 100644 --- a/src/undate/converters/calendars/hebrew/hebrew.lark +++ b/src/undate/converters/calendars/hebrew/hebrew.lark @@ -3,7 +3,7 @@ // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -hebrew_date: day month year | month year | year +hebrew_date: weekday? day month comma? year | month year | year // TODO: handle date ranges? @@ -27,10 +27,14 @@ month: month_1 | month_10 | month_11 | month_12 - | month_13 + | month_13 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ +comma: "," +weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? 
+ + // months, in order; from convertdate list // with variants from Princeton Geniza Project // support matching with and without accents @@ -43,11 +47,13 @@ month_5: "Av" month_6: "Elul" // Tishrei or Tishri month_7: /Tishre?i/ -month_8: "Heshvan" +// Heshvan, Ḥeshvan, Marḥeshvan +month_8: /(Mar)?[ḤHḥ]eshvan/ month_9: "Kislev" // Tevet or Teveth month_10: /[ṬT]eveth?/ -month_11: "Shevat" +// Shevat or Shevaṭ +month_11: /Sheva[tṭ]/ // Adar I or Adar month_12: /Adar( I)?/ // Adar II or Adar Bet diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py index 48e8b20..8880434 100644 --- a/src/undate/converters/calendars/hebrew/transformer.py +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -13,6 +13,8 @@ class HebrewDateTransformer(Transformer): """Transform a Hebrew date parse tree and return an Undate or UndateInterval.""" + calendar = Calendar.HEBREW + def hebrew_date(self, items): parts = {} for child in items: @@ -22,9 +24,9 @@ def hebrew_date(self, items): value = int(child.children[0]) parts[str(child.data)] = value - # initialize and return an undate with islamic year, month, day and - # islamic calendar - return HebrewUndate(**parts) + # initialize and return an undate with year, month, day and + # configured calendar (hebrew by default) + return Undate(**parts, calendar=self.calendar) # year translation is not needed since we want a tree with name year # this is equivalent to a no-op diff --git a/src/undate/converters/calendars/islamic/islamic.lark b/src/undate/converters/calendars/islamic/islamic.lark index 3ad59a5..1e4940b 100644 --- a/src/undate/converters/calendars/islamic/islamic.lark +++ b/src/undate/converters/calendars/islamic/islamic.lark @@ -3,7 +3,7 @@ // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -islamic_date: day month year | month year | year +islamic_date: weekday? 
day month year | month year | year // TODO: handle date ranges? @@ -13,6 +13,7 @@ islamic_date: day month year | month year | year year: /\d+/ + // months month: month_1 | month_2 @@ -29,6 +30,10 @@ month: month_1 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ + +comma: "," +weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? + // months, in order; from convertdate list // with variants from Princeton Geniza Project // support matching with and without accents @@ -42,7 +47,7 @@ month_4: /Rab[īi][ʿ'] (ath-Th[āa]n[īi]|II)/ // Jumādā al-ʾAwwal or Jumādā I month_5: /Jum[āa]d[āa] (al-[ʾ`]Awwal|I)/ // Jumādā ath-Thāniya or Jumādā II -month_6: /Jum[āa]d[āa] (ath-Th[āa]niyah|II)/ +month_6: /Jum[āa][dḍ][āa] (ath-Th[āa]niyah|II)/ month_7: "Rajab" // Shaʿbān month_8: /Sha[ʿ']b[āa]n/ diff --git a/src/undate/converters/calendars/seleucid.py b/src/undate/converters/calendars/seleucid.py new file mode 100644 index 0000000..bddf867 --- /dev/null +++ b/src/undate/converters/calendars/seleucid.py @@ -0,0 +1,24 @@ +from undate.converters.calendars import HebrewDateConverter +from undate.undate import Calendar + + +class SeleucidDateConverter(HebrewDateConverter): + #: offset for Seleucid calendar: Seleucid year + 3449 = Anno Mundi year + SELEUCID_OFFSET = 3449 + + #: converter name: Seleucid + name: str = "Seleucid" + calendar_name: str = "Seleucid" + + def __init__(self): + super().__init__() + # override hebrew calendar to initialize undates with seleucid + # calendar; this triggers Seleucid calendar to_gregorian method use + self.transformer.calendar = Calendar.SELEUCID + + def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: + """Convert a Seleucid date, specified by year, month, and day, + to the Gregorian equivalent date. Uses hebrew calendar conversion + logic with :attr:`SELEUCID_OFFSET`. Returns a tuple of year, month, day. 
+ """ + return super().to_gregorian(year + self.SELEUCID_OFFSET, month, day) diff --git a/src/undate/date.py b/src/undate/date.py index 27f6efa..81cd035 100644 --- a/src/undate/date.py +++ b/src/undate/date.py @@ -104,6 +104,27 @@ def day(self) -> Optional[int]: return int(str(self.astype("datetime64[D]")).split("-")[-1]) return None + @property + def weekday(self) -> Optional[int]: + """Equivalent to :meth:`datetime.date.weekday`; returns day of week as an + integer where Monday is 0 and Sunday is 6. Only supported for dates + with date unit in days. + """ + # only return a weekday if date unit is in days + if self.dtype == "datetime64[D]": + # calculate based on difference between current day and week start + # numpy datetime weeks start on thursdays - presumably since + # unix epoch day zero was a thursday... + + # implementation inspired in part by https://stackoverflow.com/a/54264187 + + thursday_week = self.astype("datetime64[W]") + days_from_thursday = (self - thursday_week).astype(int) + # if monday is 0, thursday is 3 + return (days_from_thursday + 3) % 7 + + return None + def __sub__(self, other): # modify to conditionally return a timedelta object instead of a # Date object with dtype timedelta64[D] (default behavior) diff --git a/src/undate/undate.py b/src/undate/undate.py index be4454a..bc627ab 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -29,6 +29,7 @@ class Calendar(StrEnum): GREGORIAN = auto() HEBREW = auto() ISLAMIC = auto() + SELEUCID = auto() @staticmethod def get_converter(calendar): @@ -96,7 +97,6 @@ def __init__( if calendar is not None: self.set_calendar(calendar) self.calendar_converter = Calendar.get_converter(self.calendar) - self.calculate_earliest_latest(year, month, day) if converter is None: @@ -192,6 +192,9 @@ def calculate_earliest_latest(self, year, month, day): ) def set_calendar(self, calendar: Union[str, Calendar]): + """Find calendar by name if passed as string and set on the object. 
+ Only intended for use at initialization time; use :meth:`as_calendar` + to change calendar.""" if calendar is not None: # if not passed as a Calendar instance, do a lookup if not isinstance(calendar, Calendar): @@ -202,6 +205,19 @@ def set_calendar(self, calendar: Union[str, Calendar]): raise ValueError(f"Calendar `{calendar}` is not supported") from err self.calendar = calendar + def as_calendar(self, calendar: Union[str, Calendar]): + """Return a new :class:`Undate` object with the same year, month, day, and labels + used to initialize the current object, but with a different calendar. Note that this + does NOT do calendar conversion, but reinterprets current numeric year, month, day values + according to the new calendar.""" + return Undate( + year=self.initial_values.get("year"), + month=self.initial_values.get("month"), + day=self.initial_values.get("day"), + label=self.label, + calendar=calendar, + ) + def __str__(self) -> str: # if any portion of the date is partially known, construct # pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits @@ -319,8 +335,12 @@ def __lt__(self, other: object) -> bool: # (e.g., single date within the same year) # comparison for those cases is not currently supported elif other in self or self in other: + # sort by precision, most precise first + by_precision = sorted( + [self, other], key=lambda x: x.precision, reverse=True + ) raise NotImplementedError( - "Can't compare when one date falls within the other" + f"Can't compare when one date ({by_precision[0]}) falls within the other ({by_precision[1]})" ) # NOTE: unsupported comparisons are supposed to return NotImplemented # However, doing that in this case results in a confusing TypeError! 
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py index 6e4a5e6..7dcca83 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py @@ -26,6 +26,12 @@ def test_hebrew_undate(): ("5362", HebrewUndate(5362), DatePrecision.YEAR), # add when we support parsing ranges: # Adar I and Adar II 5453 : (1693 CE) + # support weekdays included in text + ("Thursday, 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY), + # with or without comma + ("Thursday 12 Sivan 4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY), + # huh, current parsing completely ignores whitespace; do we want that? + ("Thursday12Sivan4795", HebrewUndate(4795, 3, 12), DatePrecision.DAY), ] diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py index 951a9f8..04ff53b 100644 --- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py +++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py @@ -28,6 +28,7 @@ def test_islamic_undate(): # examples from ISMI data (reformatted to day month year) # Rabi 1 = month 3 ("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY), + ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH), ("884", IslamicUndate(884), DatePrecision.YEAR), # Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), # add when we support parsing ranges: diff --git a/tests/test_date.py b/tests/test_date.py index 5ff017d..8b13472 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np from undate.date import ONE_YEAR, Date, DatePrecision, Timedelta @@ -51,6 +53,26 @@ def test_properties_day(self): assert Date(2010, 
5).day is None assert Date(2021, 6, 15).day == 15 + def test_weekday(self): + # thursday + assert Date(2025, 1, 2).weekday == 3 + assert Date(2025, 1, 2).weekday == datetime.date(2025, 1, 2).weekday() + # friday + assert Date(2025, 1, 3).weekday == 4 + assert Date(2025, 1, 3).weekday == datetime.date(2025, 1, 3).weekday() + # saturday + assert Date(2025, 1, 4).weekday == 5 + assert Date(2025, 1, 4).weekday == datetime.date(2025, 1, 4).weekday() + # sunday + assert Date(2025, 1, 5).weekday == 6 + assert Date(2025, 1, 5).weekday == datetime.date(2025, 1, 5).weekday() + # monday + assert Date(2025, 1, 6).weekday == 0 + assert Date(2025, 1, 6).weekday == datetime.date(2025, 1, 6).weekday() + # tuesday + assert Date(2025, 1, 7).weekday == 1 + assert Date(2025, 1, 7).weekday == datetime.date(2025, 1, 7).weekday() + def test_substract(self): # date - date = timedelta date_difference = Date(2024, 1, 2) - Date(2024, 1, 1) diff --git a/tests/test_undate.py b/tests/test_undate.py index 18e03b0..d4b3794 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -298,11 +298,17 @@ def test_lt_notimplemented(self): # how to compare mixed precision where dates overlap? # if the second date falls *within* earliest/latest, # then it is not clearly less; not implemented? - with pytest.raises(NotImplementedError, match="date falls within the other"): + with pytest.raises( + NotImplementedError, + match="one date \\(2022-05\\) falls within the other \\(2022\\)", + ): assert Undate(2022) < Undate(2022, 5) # same if we attempt to compare in the other direction - with pytest.raises(NotImplementedError, match="date falls within the other"): + with pytest.raises( + NotImplementedError, + match="one date \\(2022-05\\) falls within the other \\(2022\\)", + ): assert Undate(2022, 5) < Undate(2022) testdata_contains = [