From ab378da69ea29b9fe45d3dee0e0b3b9f76f77359 Mon Sep 17 00:00:00 2001 From: Rebecca Sutton Koeser Date: Mon, 23 Oct 2023 10:12:27 -0400 Subject: [PATCH] support partially known dates (#36) * Support initializing undate with partially known year or month Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> * Handle some cases for initializing undate partially known days Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> * Add notes about next steps Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> * Clean up known year logic; disable failing tests * Refactor logic for calculating missing month digits * Refactor logic for calculating missing day digits * Refactor shared partially known month/day logic for min/max values * Update duration logic for partially known dates with known granularity * Update string methods for partially known dates * Adjust partial date duration calculation for dates that wrap years * Make duration logic more consistent * Add jupyter notebook comparing partial date duration logic with S&co * Pin sphinx to pre 7.0 for compatibility with rtd theme --------- Co-authored-by: Cole Crawford <16374762+ColeDCrawford@users.noreply.github.com> --- .../shxco_partial_date_durations.ipynb | 5667 +++++++++++++++++ setup.cfg | 3 +- src/undate/dateformat/iso8601.py | 9 +- src/undate/undate.py | 249 +- tests/test_undate.py | 138 +- 5 files changed, 6024 insertions(+), 42 deletions(-) create mode 100644 examples/notebooks/shxco_partial_date_durations.ipynb diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb new file mode 100644 index 0000000..11d6662 --- /dev/null +++ b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -0,0 +1,5667 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + 
"display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# undate partial date duration check\n", + "compare undate interval duration calculation for date rnages between partial dates with Shakespeare and Company Project events dataset" + ], + "metadata": { + "id": "s_holu9LI6q1" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TbokQJlu4G7Y", + "outputId": "d30849fd-811c-492d-ed37-a66ea4dd9088" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting git+https://github.com/dh-tech/undate-python.git@story/3-partially-known-dates\n", + " Cloning https://github.com/dh-tech/undate-python.git (to revision story/3-partially-known-dates) to /tmp/pip-req-build-aoklox4b\n", + " Running command git clone --filter=blob:none --quiet https://github.com/dh-tech/undate-python.git /tmp/pip-req-build-aoklox4b\n", + " Running command git checkout -b story/3-partially-known-dates --track origin/story/3-partially-known-dates\n", + " Switched to a new branch 'story/3-partially-known-dates'\n", + " Branch 'story/3-partially-known-dates' set up to track remote branch 'story/3-partially-known-dates' from 'origin'.\n", + " Resolved https://github.com/dh-tech/undate-python.git to commit 615cb0d9af313128a562107305d1b7c2eddd9535\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from undate==0.2.0.dev0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->undate==0.2.0.dev0) (1.16.0)\n", + "Building wheels for collected packages: undate\n", + " Building wheel for undate (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for undate: filename=undate-0.2.0.dev0-py3-none-any.whl size=15200 sha256=ca2e25447c84ad830f1e7ac31a43d67701390d594fff44637a565c3de5bb6134\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-ctzxaxcn/wheels/b0/dd/8f/69e3af2abd0249334bdcc1836876d45b86b0e1183e79b71123\n", + "Successfully built undate\n", + "Installing collected packages: undate\n", + "Successfully installed undate-0.2.0.dev0\n" + ] + } + ], + "source": [ + "# install from feature branch for now, until merged\n", + "%pip install git+https://github.com/dh-tech/undate-python.git@story/3-partially-known-dates" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# load most recent version of S&co events dataset\n", + "\n", + "# dataspace link on S&co website is currently broken\n", + "#events_df = pd.read_csv(\"https://dataspace.princeton.edu/bitstream/88435/dsp019306t2441/2/SCoData_events_v1.2_2022-01.csv\")\n", + "# other dataset link resulting in an error; incomplete download?\n", + "# events_df = pd.read_csv(\"https://dataspace.princeton.edu/bitstream/88435/dsp019306t2441/1\")\n", + "events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n", + "events_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 579 + }, + "id": "Q7KZRmj_4ySW", + "outputId": "ee3cacd7-c347-437a-ee8e-91a4086d6e88" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": 
"stderr", + "text": [ + ":9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "0 Generic 1920 NaN \n", + "1 Subscription 1921 NaN \n", + "2 Borrow 1922 1922-08-23 \n", + "3 Generic 1922 NaN \n", + "4 Subscription 1922 NaN \n", + "\n", + " member_uris member_names \\\n", + "0 https://shakespeareandco.princeton.edu/members... Raymonde Linossier \n", + "1 https://shakespeareandco.princeton.edu/members... Mme Garreta \n", + "2 https://shakespeareandco.princeton.edu/members... Mr. Rhys \n", + "3 https://shakespeareandco.princeton.edu/members... Ernest Walsh \n", + "4 https://shakespeareandco.princeton.edu/members... Mr. Lincoln \n", + "\n", + " member_sort_names subscription_price_paid subscription_deposit \\\n", + "0 Linossier, Raymonde NaN NaN \n", + "1 Garreta, Mme NaN NaN \n", + "2 Rhys, Mr. NaN NaN \n", + "3 Walsh, Ernest NaN NaN \n", + "4 Lincoln, Mr. NaN 7.0 \n", + "\n", + " subscription_duration subscription_duration_days ... \\\n", + "0 NaN NaN ... \n", + "1 NaN NaN ... \n", + "2 NaN NaN ... \n", + "3 NaN NaN ... \n", + "4 NaN NaN ... \n", + "\n", + " item_uri item_title \\\n", + "0 https://shakespeareandco.princeton.edu/books/b... Pigs Is Pigs \n", + "1 NaN NaN \n", + "2 https://shakespeareandco.princeton.edu/books/c... Typhoon \n", + "3 https://shakespeareandco.princeton.edu/books/b... 
The Pretty Lady \n", + "4 NaN NaN \n", + "\n", + " item_volume item_authors item_year item_notes \\\n", + "0 NaN Butler, Ellis Parker 1906.0 NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN Conrad, Joseph 1902.0 NaN \n", + "3 NaN Bennett, Arnold 1918.0 NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "0 Lending Library Card Sylvia Beach, Raymonde Linossier Lending Libra... \n", + "1 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "2 Lending Library Card Sylvia Beach, Rhys Lending Library Card, Box 4... \n", + "3 Lending Library Card Sylvia Beach, Ernest Walsh Lending Library Car... \n", + "4 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest \\\n", + "0 https://figgy.princeton.edu/concern/scanned_re... \n", + "1 NaN \n", + "2 https://figgy.princeton.edu/concern/scanned_re... \n", + "3 https://figgy.princeton.edu/concern/scanned_re... \n", + "4 NaN \n", + "\n", + " source_image \n", + "0 https://iiif.princeton.edu/loris/figgy_prod/00... \n", + "1 NaN \n", + "2 https://iiif.princeton.edu/loris/figgy_prod/67... \n", + "3 https://iiif.princeton.edu/loris/figgy_prod/af... \n", + "4 NaN \n", + "\n", + "[5 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
0Generic1920NaNhttps://shakespeareandco.princeton.edu/members...Raymonde LinossierLinossier, RaymondeNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...Pigs Is PigsNaNButler, Ellis Parker1906.0NaNLending Library CardSylvia Beach, Raymonde Linossier Lending Libra...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/00...
1Subscription1921NaNhttps://shakespeareandco.princeton.edu/members...Mme GarretaGarreta, MmeNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
2Borrow19221922-08-23https://shakespeareandco.princeton.edu/members...Mr. RhysRhys, Mr.NaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/c...TyphoonNaNConrad, Joseph1902.0NaNLending Library CardSylvia Beach, Rhys Lending Library Card, Box 4...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/67...
3Generic1922NaNhttps://shakespeareandco.princeton.edu/members...Ernest WalshWalsh, ErnestNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...The Pretty LadyNaNBennett, Arnold1918.0NaNLending Library CardSylvia Beach, Ernest Walsh Lending Library Car...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/af...
4Subscription1922NaNhttps://shakespeareandco.princeton.edu/members...Mr. LincolnLincoln, Mr.NaN7.0NaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## method to calculate durations\n", + "\n", + "define a method to initialize undate interval from start and end date string in ISO format as used in S&co datasets\n", + "\n", + "\n", + "**Note:** that there's an off-by-one discrepancy between how we currently calculate duration in undate and in the Shakespeare and Company Project code: the S&co code counts the first day in the range but not the last (could also be thought of as counting half of start and end dates)." + ], + "metadata": { + "id": "0Y6CsfIAJoqi" + } + }, + { + "cell_type": "code", + "source": [ + "from undate.undate import UndateInterval\n", + "from undate.dateformat.iso8601 import ISO8601DateFormat\n", + "\n", + "def undate_duration(start_date, end_date):\n", + " isoformat = ISO8601DateFormat()\n", + "\n", + " unstart = isoformat.parse(start_date)\n", + " unend = isoformat.parse(end_date)\n", + " interval = UndateInterval(earliest=unstart, latest=unend)\n", + "\n", + " # subtract one here for simplicity of comparison,\n", + " # to reconcile difference between how duration logic\n", + "\n", + " return interval.duration().days - 1" + ], + "metadata": { + "id": "y_MqgrQW64uI" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## subscription events\n", + "\n", + "compare subscription events with known duration" + ], + "metadata": { + "id": "JBVWMB7lJbYB" + } + }, + { + "cell_type": "code", + "source": [ + "# identify subscription eventss with duration information\n", + "subs_duration = events_df[events_df.subscription_duration_days.notna()]\n", + "subs_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "c8iPHU5K58cz", + "outputId": "c0cc72ef-ed0b-4a30-d7b5-ea21ef0582c7" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + 
"text/plain": [ + " event_type start_date end_date \\\n", + "28 Subscription 1927 1928 \n", + "70 Subscription 1931 1932 \n", + "233 Subscription 1921-07 1921-08 \n", + "234 Subscription 1921-09 1922-02 \n", + "260 Subscription 1923-06 1923-10 \n", + "\n", + " member_uris \\\n", + "28 https://shakespeareandco.princeton.edu/members... \n", + "70 https://shakespeareandco.princeton.edu/members... \n", + "233 https://shakespeareandco.princeton.edu/members... \n", + "234 https://shakespeareandco.princeton.edu/members... \n", + "260 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "28 Arthur Elliott Felkin \n", + "70 Geraldine Deknatel;William Deknatel \n", + "233 Mrs. G. S. Madam \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell \n", + "260 Victor Llona \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "28 Felkin, Arthur Elliott NaN \n", + "70 Deknatel, Geraldine;Deknatel, William NaN \n", + "233 Madam, Mrs. G. S. NaN \n", + "234 Moderwell, Anne;Moderwell, Hiram NaN \n", + "260 Llona, Victor NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "28 NaN 1 year 365.0 \n", + "70 NaN 1 year 365.0 \n", + "233 NaN 1 month 31.0 \n", + "234 NaN 5 months 153.0 \n", + "260 NaN 4 months 122.0 \n", + "\n", + " ... item_uri item_title item_volume item_authors item_year item_notes \\\n", + "28 ... NaN NaN NaN NaN NaN NaN \n", + "70 ... NaN NaN NaN NaN NaN NaN \n", + "233 ... NaN NaN NaN NaN NaN NaN \n", + "234 ... NaN NaN NaN NaN NaN NaN \n", + "260 ... NaN NaN NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "28 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "70 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "233 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "234 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... 
\n", + "260 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest source_image \n", + "28 NaN NaN \n", + "70 NaN NaN \n", + "233 NaN NaN \n", + "234 NaN NaN \n", + "260 NaN NaN \n", + "\n", + "[5 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
28Subscription19271928https://shakespeareandco.princeton.edu/members...Arthur Elliott FelkinFelkin, Arthur ElliottNaNNaN1 year365.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
70Subscription19311932https://shakespeareandco.princeton.edu/members...Geraldine Deknatel;William DeknatelDeknatel, Geraldine;Deknatel, WilliamNaNNaN1 year365.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
233Subscription1921-071921-08https://shakespeareandco.princeton.edu/members...Mrs. G. S. MadamMadam, Mrs. G. S.NaNNaN1 month31.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
234Subscription1921-091922-02https://shakespeareandco.princeton.edu/members...Anne Moderwell;Hiram Moderwell / H. K. ModerwellModerwell, Anne;Moderwell, HiramNaNNaN5 months153.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
260Subscription1923-061923-10https://shakespeareandco.princeton.edu/members...Victor LlonaLlona, VictorNaNNaN4 months122.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what do the subscription duration day values look like?\n", + "subs_duration.subscription_duration_days.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9i0aN7iQ6voY", + "outputId": "fe1ac93f-5571-4bd3-e4c1-06e90cf33f5c" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "31.0 2997\n", + "30.0 1975\n", + "92.0 936\n", + "91.0 397\n", + "365.0 337\n", + " ... \n", + "69.0 1\n", + "36.0 1\n", + "73.0 1\n", + "574.0 1\n", + "171.0 1\n", + "Name: subscription_duration_days, Length: 133, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration.subscription_duration_days.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aGqi4LRp60tV", + "outputId": "fbd61c94-41ab-40a7-87c2-cf0548c75d5a" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "count 9146.000000\n", + "mean 72.142685\n", + "std 81.559368\n", + "min 1.000000\n", + "25% 30.000000\n", + "50% 31.000000\n", + "75% 91.000000\n", + "max 574.000000\n", + "Name: subscription_duration_days, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# do we have unknown start/end date values?\n", + "subs_duration[subs_duration.start_date.isna()]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "pUsAb16MKqvb", + "outputId": "27f3b8e7-c5a5-4297-eb7e-e37e81945dda" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Empty DataFrame\n", + "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, 
subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", + "Index: []\n", + "\n", + "[0 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
\n", + "

0 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration[subs_duration.end_date.isna()]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "0odaog0eK0CN", + "outputId": "1e8814ff-0043-4969-b1d1-7574c3e82008" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "13168 Subscription 1932-10-06 NaN \n", + "13686 Subscription 1933-03-02 NaN \n", + "\n", + " member_uris \\\n", + "13168 https://shakespeareandco.princeton.edu/members... \n", + "13686 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "13168 Jean (Bakewell) Connolly / Mrs. Cyril Connolly \n", + "13686 Stanislas Pascal Franchot \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "13168 Connolly, Jean NaN \n", + "13686 Franchot, Stanislas Pascal NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "13168 100.0 NaN 31.0 \n", + "13686 50.0 NaN 31.0 \n", + "\n", + " ... item_uri item_title item_volume item_authors item_year \\\n", + "13168 ... NaN NaN NaN NaN NaN \n", + "13686 ... NaN NaN NaN NaN NaN \n", + "\n", + " item_notes source_type \\\n", + "13168 NaN Logbook \n", + "13686 NaN Logbook;Lending Library Card \n", + "\n", + " source_citation \\\n", + "13168 Sylvia Beach, Logbooks 1919–1941, Sylvia Beach... \n", + "13686 Sylvia Beach, Logbooks 1919–1941, Sylvia Beach... \n", + "\n", + " source_manifest \\\n", + "13168 NaN \n", + "13686 ;https://figgy.princeton.edu/concern/scanned_r... \n", + "\n", + " source_image \n", + "13168 NaN \n", + "13686 ;https://iiif.princeton.edu/loris/figgy_prod/7... \n", + "\n", + "[2 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
13168Subscription1932-10-06NaNhttps://shakespeareandco.princeton.edu/members...Jean (Bakewell) Connolly / Mrs. Cyril ConnollyConnolly, JeanNaN100.0NaN31.0...NaNNaNNaNNaNNaNNaNLogbookSylvia Beach, Logbooks 1919–1941, Sylvia Beach...NaNNaN
13686Subscription1933-03-02NaNhttps://shakespeareandco.princeton.edu/members...Stanislas Pascal FranchotFranchot, Stanislas PascalNaN50.0NaN31.0...NaNNaNNaNNaNNaNNaNLogbook;Lending Library CardSylvia Beach, Logbooks 1919–1941, Sylvia Beach...;https://figgy.princeton.edu/concern/scanned_r...;https://iiif.princeton.edu/loris/figgy_prod/7...
\n", + "

2 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# omit events with unknown end date since we can't recalculate duration\n", + "# (duration in the dataset is based on the subscription duration)\n", + "subs_duration = subs_duration[subs_duration.end_date.notna()]" + ], + "metadata": { + "id": "jwvN9-CgLQRx" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# add a new field for duration as calculated by undate\n", + "subs_duration[\"undate_duration\"] = subs_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "subs_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "Z-CVWd3z7Jb6", + "outputId": "d52d57d4-9803-4bfa-9708-bdf149c7098b" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "28 Subscription 1927 1928 \n", + "70 Subscription 1931 1932 \n", + "233 Subscription 1921-07 1921-08 \n", + "234 Subscription 1921-09 1922-02 \n", + "260 Subscription 1923-06 1923-10 \n", + "\n", + " member_uris \\\n", + "28 https://shakespeareandco.princeton.edu/members... \n", + "70 https://shakespeareandco.princeton.edu/members... \n", + "233 https://shakespeareandco.princeton.edu/members... \n", + "234 https://shakespeareandco.princeton.edu/members... \n", + "260 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "28 Arthur Elliott Felkin \n", + "70 Geraldine Deknatel;William Deknatel \n", + "233 Mrs. G. S. Madam \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell \n", + "260 Victor Llona \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "28 Felkin, Arthur Elliott NaN \n", + "70 Deknatel, Geraldine;Deknatel, William NaN \n", + "233 Madam, Mrs. G. S. 
NaN \n", + "234 Moderwell, Anne;Moderwell, Hiram NaN \n", + "260 Llona, Victor NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "28 NaN 1 year 365.0 \n", + "70 NaN 1 year 365.0 \n", + "233 NaN 1 month 31.0 \n", + "234 NaN 5 months 153.0 \n", + "260 NaN 4 months 122.0 \n", + "\n", + " ... item_title item_volume item_authors item_year item_notes \\\n", + "28 ... NaN NaN NaN NaN NaN \n", + "70 ... NaN NaN NaN NaN NaN \n", + "233 ... NaN NaN NaN NaN NaN \n", + "234 ... NaN NaN NaN NaN NaN \n", + "260 ... NaN NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "28 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "70 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "233 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "234 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "260 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest source_image undate_duration \n", + "28 NaN NaN 730 \n", + "70 NaN NaN 730 \n", + "233 NaN NaN 61 \n", + "234 NaN NaN 180 \n", + "260 NaN NaN 152 \n", + "\n", + "[5 rows x 29 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_imageundate_duration
28Subscription19271928https://shakespeareandco.princeton.edu/members...Arthur Elliott FelkinFelkin, Arthur ElliottNaNNaN1 year365.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN730
70Subscription19311932https://shakespeareandco.princeton.edu/members...Geraldine Deknatel;William DeknatelDeknatel, Geraldine;Deknatel, WilliamNaNNaN1 year365.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN730
233Subscription1921-071921-08https://shakespeareandco.princeton.edu/members...Mrs. G. S. MadamMadam, Mrs. G. S.NaNNaN1 month31.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN61
234Subscription1921-091922-02https://shakespeareandco.princeton.edu/members...Anne Moderwell;Hiram Moderwell / H. K. ModerwellModerwell, Anne;Moderwell, HiramNaNNaN5 months153.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN180
260Subscription1923-061923-10https://shakespeareandco.princeton.edu/members...Victor LlonaLlona, VictorNaNNaN4 months122.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN152
\n", + "

5 rows × 29 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# compare undate duration with dataset duration\n", + "# limit to fields we care about\n", + "subs_duration = subs_duration[['start_date', 'end_date', 'subscription_duration', 'subscription_duration_days', 'undate_duration']]\n", + "subs_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "fVf6M2E2LgnH", + "outputId": "87e6585a-670d-466e-d206-caabaaa48df9" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "35114 1941-11-24 1941-12-24 1 month \n", + "35115 1941-11-24 1941-12-24 1 month \n", + "35116 1941-12-04 1942-01-04 1 month \n", + "35118 1941-12-08 1942-03-08 3 months \n", + "35119 1941-12-09 1942-01-09 1 month \n", + "\n", + " subscription_duration_days undate_duration \n", + "28 365.0 730 \n", + "70 365.0 730 \n", + "233 31.0 61 \n", + "234 153.0 180 \n", + "260 122.0 152 \n", + "... ... ... \n", + "35114 30.0 30 \n", + "35115 30.0 30 \n", + "35116 31.0 31 \n", + "35118 90.0 90 \n", + "35119 31.0 31 \n", + "\n", + "[9144 rows x 5 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_duration
28192719281 year365.0730
70193119321 year365.0730
2331921-071921-081 month31.061
2341921-091922-025 months153.0180
2601923-061923-104 months122.0152
..................
351141941-11-241941-12-241 month30.030
351151941-11-241941-12-241 month30.030
351161941-12-041942-01-041 month31.031
351181941-12-081942-03-083 months90.090
351191941-12-091942-01-091 month31.031
\n", + "

9144 rows × 5 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what's the difference between the two?\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "drnCqTtsL835", + "outputId": "dc042b74-295a-436c-9c70-c6014d986cf7" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "35114 1941-11-24 1941-12-24 1 month \n", + "35115 1941-11-24 1941-12-24 1 month \n", + "35116 1941-12-04 1942-01-04 1 month \n", + "35118 1941-12-08 1942-03-08 3 months \n", + "35119 1941-12-09 1942-01-09 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "233 31.0 61 30.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "... ... ... ... \n", + "35114 30.0 30 0.0 \n", + "35115 30.0 30 0.0 \n", + "35116 31.0 31 0.0 \n", + "35118 90.0 90 0.0 \n", + "35119 31.0 31 0.0 \n", + "\n", + "[9144 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2331921-071921-081 month31.06130.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
.....................
351141941-11-241941-12-241 month30.0300.0
351151941-11-241941-12-241 month30.0300.0
351161941-12-041942-01-041 month31.0310.0
351181941-12-081942-03-083 months90.0900.0
351191941-12-091942-01-091 month31.0310.0
\n", + "

9144 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration['duration_diff'].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z3i984igMNjm", + "outputId": "c8a3580e-a36a-4756-d427-286ba8e5cf91" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0.0 9065\n", + " 30.0 30\n", + " 29.0 21\n", + " 1.0 10\n", + "-1.0 9\n", + " 28.0 4\n", + " 365.0 2\n", + " 27.0 1\n", + " 2.0 1\n", + "-3.0 1\n", + "Name: duration_diff, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### investigate discrepancies" + ], + "metadata": { + "id": "Uu9kmAA_gm5o" + } + }, + { + "cell_type": "code", + "source": [ + "# investigate the ones with larger differences\n", + "subset_subdurations = subs_duration[subs_duration.duration_diff != 0]\n", + "subset_subdurations" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "gdenGvR1MkUG", + "outputId": "589b6b49-3f9c-42d5-e01f-326401007878" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "33878 1933-12-01 1934-01-01 1 month \n", + "33880 1933-12-02 1934-01-02 1 month \n", + "33902 1934-01-02 1934-07-02 6 months \n", + "33936 1934-06-02 1934-12-02 6 months \n", + "34892 1940-11-30 1940-12-30 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "233 31.0 61 30.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "... ... ... ... 
\n", + "33878 30.0 31 1.0 \n", + "33880 30.0 31 1.0 \n", + "33902 182.0 181 -1.0 \n", + "33936 184.0 183 -1.0 \n", + "34892 31.0 30 -1.0 \n", + "\n", + "[79 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2331921-071921-081 month31.06130.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
.....................
338781933-12-011934-01-011 month30.0311.0
338801933-12-021934-01-021 month30.0311.0
339021934-01-021934-07-026 months182.0181-1.0
339361934-06-021934-12-026 months184.0183-1.0
348921940-11-301940-12-301 month31.030-1.0
\n", + "

79 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# too many to look at once, can we segment by subscription duration?\n", + "subset_subdurations.subscription_duration.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9_w1Cwl2N81d", + "outputId": "c0733942-16cd-42bf-c9a3-abbf250e44f5" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 month 38\n", + "3 months 12\n", + "2 months 7\n", + "6 months 6\n", + "4 months 5\n", + "5 months 3\n", + "1 year 2\n", + "7 months 2\n", + "8 months 2\n", + "11 months 1\n", + "10 months 1\n", + "Name: subscription_duration, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# lots of one-month subscriptions, what do the discrepancies look like?\n", + "subset_subdurations[subset_subdurations.subscription_duration == '1 month']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "snv1qguUOHPB", + "outputId": "dce76078-236b-48ee-9607-5d702cf4ee04" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "233 1921-07 1921-08 1 month \n", + "261 1923-08 1923-09 1 month \n", + "271 1924-02 1924-03 1 month \n", + "313 1926-11 1926-12 1 month \n", + "354 1928-02 1928-03 1 month \n", + "356 1928-02 1928-03 1 month \n", + "393 1929-08 1929-09 1 month \n", + "394 1929-08 1929-09 1 month \n", + "430 1930-05 1930-06 1 month \n", + "444 1930-11 1930-12 1 month \n", + "462 1931-05 1931-06 1 month \n", + "464 1931-06 1931-07 1 month \n", + "466 1931-07 1931-08 1 month \n", + "468 1931-08 1931-09 1 month \n", + "472 1931-09 1931-10 1 month \n", + "477 1931-10 1931-11 1 month \n", + "478 1931-10 1931-11 1 month \n", + "483 1931-11 1931-12 
1 month \n", + "484 1931-11 1931-12 1 month \n", + "487 1931-12 1932-01 1 month \n", + "492 1932-01 1932-02 1 month \n", + "500 1932-02 1932-03 1 month \n", + "501 1932-02 1932-03 1 month \n", + "504 1932-03 1932-04 1 month \n", + "516 1932-04 1932-05 1 month \n", + "517 1932-05 1932-06 1 month \n", + "7064 1926-09-15 1926-10-15 1 month \n", + "31089 1923-11-22 1923-12-22 1 month \n", + "31511 1924-11-08 1924-12-08 1 month \n", + "31722 1925-05-09 1925-06-09 1 month \n", + "32269 1926-06-10 1926-07-10 1 month \n", + "32444 1926-10-07 1926-11-07 1 month \n", + "33401 1929-05-18 1929-06-18 1 month \n", + "33665 1932-12-15 1933-01-15 1 month \n", + "33709 1933-02-03 1933-03-03 1 month \n", + "33878 1933-12-01 1934-01-01 1 month \n", + "33880 1933-12-02 1934-01-02 1 month \n", + "34892 1940-11-30 1940-12-30 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 30.0 \n", + "261 31.0 60 29.0 \n", + "271 29.0 59 30.0 \n", + "313 30.0 60 30.0 \n", + "354 29.0 59 30.0 \n", + "356 29.0 59 30.0 \n", + "393 31.0 60 29.0 \n", + "394 31.0 60 29.0 \n", + "430 31.0 60 29.0 \n", + "444 30.0 60 30.0 \n", + "462 31.0 60 29.0 \n", + "464 30.0 60 30.0 \n", + "466 31.0 61 30.0 \n", + "468 31.0 60 29.0 \n", + "472 30.0 60 30.0 \n", + "477 31.0 60 29.0 \n", + "478 31.0 60 29.0 \n", + "483 30.0 60 30.0 \n", + "484 30.0 60 30.0 \n", + "487 31.0 61 30.0 \n", + "492 31.0 59 28.0 \n", + "500 29.0 59 30.0 \n", + "501 29.0 59 30.0 \n", + "504 31.0 60 29.0 \n", + "516 30.0 60 30.0 \n", + "517 31.0 60 29.0 \n", + "7064 31.0 30 -1.0 \n", + "31089 31.0 30 -1.0 \n", + "31511 31.0 30 -1.0 \n", + "31722 30.0 31 1.0 \n", + "32269 31.0 30 -1.0 \n", + "32444 30.0 31 1.0 \n", + "33401 30.0 31 1.0 \n", + "33665 30.0 31 1.0 \n", + "33709 31.0 28 -3.0 \n", + "33878 30.0 31 1.0 \n", + "33880 30.0 31 1.0 \n", + "34892 31.0 30 -1.0 " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
2331921-071921-081 month31.06130.0
2611923-081923-091 month31.06029.0
2711924-021924-031 month29.05930.0
3131926-111926-121 month30.06030.0
3541928-021928-031 month29.05930.0
3561928-021928-031 month29.05930.0
3931929-081929-091 month31.06029.0
3941929-081929-091 month31.06029.0
4301930-051930-061 month31.06029.0
4441930-111930-121 month30.06030.0
4621931-051931-061 month31.06029.0
4641931-061931-071 month30.06030.0
4661931-071931-081 month31.06130.0
4681931-081931-091 month31.06029.0
4721931-091931-101 month30.06030.0
4771931-101931-111 month31.06029.0
4781931-101931-111 month31.06029.0
4831931-111931-121 month30.06030.0
4841931-111931-121 month30.06030.0
4871931-121932-011 month31.06130.0
4921932-011932-021 month31.05928.0
5001932-021932-031 month29.05930.0
5011932-021932-031 month29.05930.0
5041932-031932-041 month31.06029.0
5161932-041932-051 month30.06030.0
5171932-051932-061 month31.06029.0
70641926-09-151926-10-151 month31.030-1.0
310891923-11-221923-12-221 month31.030-1.0
315111924-11-081924-12-081 month31.030-1.0
317221925-05-091925-06-091 month30.0311.0
322691926-06-101926-07-101 month31.030-1.0
324441926-10-071926-11-071 month30.0311.0
334011929-05-181929-06-181 month30.0311.0
336651932-12-151933-01-151 month30.0311.0
337091933-02-031933-03-031 month31.028-3.0
338781933-12-011934-01-011 month30.0311.0
338801933-12-021934-01-021 month30.0311.0
348921940-11-301940-12-301 month31.030-1.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The first set of these are calculated differently because they are partial dates; undate logic calculates based on earliest possible date through last possible date, but we have additional information in these cases that is project-specific and undate can't take into account, i.e. subscription duration is one month starting sometime in a known year or month.\n", + "\n", + "The handful towards the end that are off by one in either direction (+/-) are a little more concerning... (potential bug in S&co code? or value calculated based on known semantic duration?)" + ], + "metadata": { + "id": "Rm4jqlA4hq9E" + } + }, + { + "cell_type": "code", + "source": [ + "# durations other than one month\n", + "subset_subdurations[subset_subdurations.subscription_duration != '1 month']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "TEL7qdNhOXHL", + "outputId": "50e051d5-18ae-4f24-a229-fc02fb610ed8" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "272 1924-02 1924-04 2 months \n", + "293 1926-03 1926-10 7 months \n", + "321 1927-03 1928-02 11 months \n", + "331 1927-07 1927-10 3 months \n", + "337 1927-10 1928-06 8 months \n", + "349 1928-01 1928-04 3 months \n", + "388 1929-06 1930-04 10 months \n", + "408 1930-01 1930-04 3 months \n", + "409 1930-01 1930-04 3 months \n", + "412 1930-01 1930-09 8 months \n", + "415 1930-02 1930-06 4 months \n", + "431 1930-05 1930-07 2 months \n", + "437 1930-09 1930-12 3 months \n", + "454 1930-12 1931-03 3 months \n", + "459 1931-03 1931-05 2 months \n", + "465 1931-07 1931-10 3 months \n", + "471 1931-09 1931-12 3 months \n", 
+ "475 1931-09 1931-12 3 months \n", + "476 1931-10 1932-03 5 months \n", + "480 1931-10 1932-02 4 months \n", + "485 1931-11 1932-06 7 months \n", + "486 1931-12 1932-05 5 months \n", + "489 1931-12 1932-02 2 months \n", + "490 1931-12 1932-04 4 months \n", + "496 1932-01 1932-03 2 months \n", + "502 1932-02 1932-06 4 months \n", + "506 1932-03 1932-05 2 months \n", + "507 1932-03 1932-05 2 months \n", + "709 1919-12-02 1920-06-02 6 months \n", + "7560 1927-01-11 1927-04-11 3 months \n", + "31480 1924-10-17 1925-04-17 6 months \n", + "31917 1925-10-21 1926-01-21 3 months \n", + "32613 1927-03-14 1927-06-14 3 months \n", + "32671 1927-06-14 1927-12-14 6 months \n", + "32869 1927-12-14 1928-06-14 6 months \n", + "33902 1934-01-02 1934-07-02 6 months \n", + "33936 1934-06-02 1934-12-02 6 months \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "272 60.0 89 29.0 \n", + "293 214.0 244 30.0 \n", + "321 337.0 365 28.0 \n", + "331 92.0 122 30.0 \n", + "337 244.0 273 29.0 \n", + "349 91.0 120 29.0 \n", + "388 304.0 333 29.0 \n", + "408 90.0 119 29.0 \n", + "409 90.0 119 29.0 \n", + "412 243.0 272 29.0 \n", + "415 120.0 149 29.0 \n", + "431 61.0 91 30.0 \n", + "437 91.0 121 30.0 \n", + "454 90.0 120 30.0 \n", + "459 61.0 91 30.0 \n", + "465 92.0 122 30.0 \n", + "471 91.0 121 30.0 \n", + "475 91.0 121 30.0 \n", + "476 152.0 182 30.0 \n", + "480 123.0 151 28.0 \n", + "485 213.0 242 29.0 \n", + "486 152.0 182 30.0 \n", + "489 62.0 90 28.0 \n", + "490 122.0 151 29.0 \n", + "496 60.0 90 30.0 \n", + "502 121.0 150 29.0 \n", + "506 61.0 91 30.0 \n", + "507 61.0 91 30.0 \n", + "709 182.0 183 1.0 \n", + "7560 91.0 90 -1.0 \n", + "31480 181.0 182 1.0 \n", + "31917 91.0 92 1.0 \n", + "32613 90.0 92 2.0 \n", + "32671 184.0 183 -1.0 \n", + "32869 182.0 183 1.0 \n", + "33902 182.0 181 -1.0 \n", + "33936 184.0 183 -1.0 " + ], + "text/html": [ + "\n", + 
"\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
2721924-021924-042 months60.08929.0
2931926-031926-107 months214.024430.0
3211927-031928-0211 months337.036528.0
3311927-071927-103 months92.012230.0
3371927-101928-068 months244.027329.0
3491928-011928-043 months91.012029.0
3881929-061930-0410 months304.033329.0
4081930-011930-043 months90.011929.0
4091930-011930-043 months90.011929.0
4121930-011930-098 months243.027229.0
4151930-021930-064 months120.014929.0
4311930-051930-072 months61.09130.0
4371930-091930-123 months91.012130.0
4541930-121931-033 months90.012030.0
4591931-031931-052 months61.09130.0
4651931-071931-103 months92.012230.0
4711931-091931-123 months91.012130.0
4751931-091931-123 months91.012130.0
4761931-101932-035 months152.018230.0
4801931-101932-024 months123.015128.0
4851931-111932-067 months213.024229.0
4861931-121932-055 months152.018230.0
4891931-121932-022 months62.09028.0
4901931-121932-044 months122.015129.0
4961932-011932-032 months60.09030.0
5021932-021932-064 months121.015029.0
5061932-031932-052 months61.09130.0
5071932-031932-052 months61.09130.0
7091919-12-021920-06-026 months182.01831.0
75601927-01-111927-04-113 months91.090-1.0
314801924-10-171925-04-176 months181.01821.0
319171925-10-211926-01-213 months91.0921.0
326131927-03-141927-06-143 months90.0922.0
326711927-06-141927-12-146 months184.0183-1.0
328691927-12-141928-06-146 months182.01831.0
339021934-01-021934-07-026 months182.0181-1.0
339361934-06-021934-12-026 months184.0183-1.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## borrow events\n", + "\n", + "compare borrow events with known duration" + ], + "metadata": { + "id": "2tk6N7SXKKCu" + } + }, + { + "cell_type": "code", + "source": [ + "borrow_duration = events_df[events_df.borrow_duration_days.notna()]\n", + "# limit to fields we care about for this check\n", + "borrow_duration = borrow_duration[['start_date', 'end_date', 'borrow_duration_days']]\n", + "borrow_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "fA1Nedmz6cyF", + "outputId": "5230d5ad-fec4-4353-a0d2-9676d1aa776d" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days\n", + "602 --01-07 --01-13 6.0\n", + "603 --01-12 --01-20 8.0\n", + "604 --01-16 --02-16 31.0\n", + "605 --01-19 --01-24 5.0\n", + "606 --01-20 --01-28 8.0" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_days
602--01-07--01-136.0
603--01-12--01-208.0
604--01-16--02-1631.0
605--01-19--01-245.0
606--01-20--01-288.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "borrow_duration.tail()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "KPOBIRsTUKM9", + "outputId": "4a251445-e7c7-4250-82df-ece0bc9a3d56" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days\n", + "29903 1961-06-30 1961-10-04 96.0\n", + "29904 1961-06-30 1961-10-04 96.0\n", + "29905 1961-06-30 1961-10-04 96.0\n", + "29907 1961-10-04 1962-03-21 168.0\n", + "29908 1961-10-04 1962-03-21 168.0" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_days
299031961-06-301961-10-0496.0
299041961-06-301961-10-0496.0
299051961-06-301961-10-0496.0
299071961-10-041962-03-21168.0
299081961-10-041962-03-21168.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# add a new field for duration as calculated by undate\n", + "borrow_duration[\"undate_duration\"] = borrow_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "borrow_duration.head(10)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "39nEPZva8jDo", + "outputId": "6cff4de2-c188-43ad-dc75-684c4d461029" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days undate_duration\n", + "602 --01-07 --01-13 6.0 6\n", + "603 --01-12 --01-20 8.0 8\n", + "604 --01-16 --02-16 31.0 31\n", + "605 --01-19 --01-24 5.0 5\n", + "606 --01-20 --01-28 8.0 8\n", + "607 --01-24 --03-20 55.0 55\n", + "608 --01-24 --03-20 55.0 55\n", + "609 --01-24 --03-20 55.0 55\n", + "610 --01-24 --05-30 126.0 126\n", + "611 --01-24 --05-30 126.0 126" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_duration
602--01-07--01-136.06
603--01-12--01-208.08
604--01-16--02-1631.031
605--01-19--01-245.05
606--01-20--01-288.08
607--01-24--03-2055.055
608--01-24--03-2055.055
609--01-24--03-2055.055
610--01-24--05-30126.0126
611--01-24--05-30126.0126
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what's the difference between the two?\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "rL5S47wPWfd-", + "outputId": "127af40e-0037-4f99-d590-9cc2466a206b" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days undate_duration \\\n", + "602 --01-07 --01-13 6.0 6 \n", + "603 --01-12 --01-20 8.0 8 \n", + "604 --01-16 --02-16 31.0 31 \n", + "605 --01-19 --01-24 5.0 5 \n", + "606 --01-20 --01-28 8.0 8 \n", + "... ... ... ... ... \n", + "29903 1961-06-30 1961-10-04 96.0 96 \n", + "29904 1961-06-30 1961-10-04 96.0 96 \n", + "29905 1961-06-30 1961-10-04 96.0 96 \n", + "29907 1961-10-04 1962-03-21 168.0 168 \n", + "29908 1961-10-04 1962-03-21 168.0 168 \n", + "\n", + " duration_diff \n", + "602 0.0 \n", + "603 0.0 \n", + "604 0.0 \n", + "605 0.0 \n", + "606 0.0 \n", + "... ... \n", + "29903 0.0 \n", + "29904 0.0 \n", + "29905 0.0 \n", + "29907 0.0 \n", + "29908 0.0 \n", + "\n", + "[19728 rows x 5 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_durationduration_diff
602--01-07--01-136.060.0
603--01-12--01-208.080.0
604--01-16--02-1631.0310.0
605--01-19--01-245.050.0
606--01-20--01-288.080.0
..................
299031961-06-301961-10-0496.0960.0
299041961-06-301961-10-0496.0960.0
299051961-06-301961-10-0496.0960.0
299071961-10-041962-03-21168.01680.0
299081961-10-041962-03-21168.01680.0
\n", + "

19728 rows × 5 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
 \n" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what do the duration differences look like?\n", + "borrow_duration.duration_diff.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DQumLSXZW7r6", + "outputId": "fc5196d6-9d9a-430e-ecb2-c142676c3614" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.0 19728\n", + "Name: duration_diff, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Woohoo, everything matches! 🎉\n", + "\n", + "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in the undate duration method when the start and end dates have unknown years and dates wrap to the following year (e.g., December to January), which has now been corrected.\n", + "\n", + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + ], + "metadata": { + "id": "r0TUYWzSXIil" + } + }, + { + "cell_type": "code", + "source": [ + "borrow_duration[borrow_duration.duration_diff != 0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 + }, + "id": "-Bq76gtDWljg", + "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Empty DataFrame\n", + "Columns: [start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", + "Index: []" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + } + ] +} \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 8d04412..217c8ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -58,9 +58,10 @@ test = pytest-ordering pytest-cov docs = - sphinx + sphinx<7.0.0 sphinx_rtd_theme m2r2 +# pin sphinx because 7.0 currently not compatible with rtd theme [options.packages.find] where = src diff --git a/src/undate/dateformat/iso8601.py b/src/undate/dateformat/iso8601.py index 22aff07..eb7e8eb 100644 --- a/src/undate/dateformat/iso8601.py +++ b/src/undate/dateformat/iso8601.py @@ -48,17 +48,16 @@ def to_string(self, undate: Undate) -> str: date_parts: List[Union[str, None]] = [] # for each part of the date that is known, generate the string format # then combine - for date_portion, known in undate.known_values.items(): - if known: + # TODO: should error if we have year and day but no month + for date_portion, iso_format in self.iso_format.items(): + if undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year # results in leading zero in some environments # and not others; force year to always be 4 digits if date_portion == "year": date_parts.append("%04d" % undate.earliest.year) else: - date_parts.append( - undate.earliest.strftime(self.iso_format[date_portion]) - ) + date_parts.append(undate.earliest.strftime(iso_format)) elif date_portion == "year": # if not known but this is year, add '-' for --MM-DD unknown year format date_parts.append("-") diff --git a/src/undate/undate.py b/src/undate/undate.py index 566f869..e816adb 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,5 +1,7 @@ import datetime from calendar import monthrange +from enum import Enum, auto +import re # Pre 3.10 requires Union for multiple types, e.g. 
Union[int, None] instead of int | None from typing import Optional, Dict, Union @@ -9,14 +11,29 @@ from undate.dateformat.base import BaseDateFormat -# duration of a single day +#: duration of a single day ONE_DAY = datetime.timedelta(days=1) +class DatePrecision(Enum): + """date precision, to indicate date precision independent from how much + of the date is known.""" + + #: year + YEAR = auto() + #: month + MONTH = auto() + #: day + DAY = auto() + + class Undate: """Simple object for representing uncertain, fuzzy or partially unknown dates""" - DEFAULT_FORMAT = "ISO8601" + DEFAULT_FORMAT: str = "ISO8601" + + #: symbol for unknown digits within a date value + MISSING_DIGIT: str = "X" earliest: Union[datetime.date, None] = None latest: Union[datetime.date, None] = None @@ -24,42 +41,104 @@ class Undate: #: Labels are not taken into account when comparing undate objects. label: Union[str, None] = None formatter: Union[BaseDateFormat, None] = None + #: precision of the date (day, month, year, etc.) 
+ precision: DatePrecision = None + + #: known non-leap year + NON_LEAP_YEAR: int = 2022 def __init__( self, - year: Optional[int] = None, - month: Optional[int] = None, - day: Optional[int] = None, + year: Optional[Union[int, str]] = None, + month: Optional[Union[int, str]] = None, + day: Optional[Union[int, str]] = None, formatter: Optional[BaseDateFormat] = None, label: Optional[str] = None, ): - # TODO: support initializing for unknown values in each of these - # e.g., maybe values could be string or int; if string with - # unknown digits, calculate min/max for unknowns + # keep track of initial values and which values are known + self.initial_values: Dict[str, Union[int, str]] = { + "year": year, + "month": month, + "day": day, + } + if day: + self.precision = DatePrecision.DAY + elif month: + self.precision = DatePrecision.MONTH + elif year: + self.precision = DatePrecision.YEAR + + # TODO: refactor partial date min/max calculations + + if year is not None: + try: + year = int(year) + # update initial value since it is used to determine + # whether or not year is known + self.initial_values["year"] = year + min_year = max_year = year + except ValueError: + # year is a string that can't be converted to int + min_year = int(year.replace(self.MISSING_DIGIT, "0")) + max_year = int(year.replace(self.MISSING_DIGIT, "9")) + else: + min_year = datetime.MINYEAR + max_year = datetime.MAXYEAR + + # if month is passed in as a string but completely unknown, + # treat as none + # TODO: we should preserve this information somehow; + # difference between just a year and an unknown month within a year + # maybe in terms of granularity / size ? 
+ if month == "XX": + month = None + + min_month = 1 + max_month = 12 + if month is not None: + try: + # treat as an integer if we can + month = int(month) + # update initial value + self.initial_values["month"] = month + min_month = max_month = month + except ValueError: + # if not, calculate min/max for missing digits + min_month, max_month = self._missing_digit_minmax( + month, min_month, max_month + ) + + # similar to month above — unknown day, but day-level granularity + if day == "XX": + day = None + + if isinstance(day, int) or isinstance(day, str) and day.isnumeric(): + day = int(day) + # update initial value - fully known day + self.initial_values["day"] = day + min_day = max_day = day + else: + # if we have no day or partial day, calculate min / max + min_day = 1 + # if we know year and month (or max month), calculate exactly + if year and month: + _, max_day = monthrange(year, max_month) + elif year is None and month: + # If we don't have year and month, + # calculate based on a known non-leap year + # (better than just setting 31, but still not great) + _, max_day = monthrange(self.NON_LEAP_YEAR, max_month) + else: + max_day: int = 31 + + # if day is partially specified, narrow min/max further + if day is not None: + min_day, max_day = self._missing_digit_minmax(day, min_day, max_day) # for unknowns, assume smallest possible value for earliest and # largest valid for latest - self.earliest = datetime.date(year or datetime.MINYEAR, month or 1, day or 1) - # if day is unknown but we have year and month, calculate max day - if day is None and year and month: - _, maxday = monthrange(year, month) - elif day is None and year is None and month: - # TODO: what to do if we don't have year and month? 
- # This will produce bad data if the year is a leap year and the month is February - # 2022 chosen below as it is not a not leap year - # Better than just setting 31, but still not great - _, maxday = monthrange(2022, month) - else: - maxday: int = 31 - self.latest = datetime.date( - year or datetime.MAXYEAR, month or 12, day or maxday - ) - # keep track of which values are known - self.known_values: Dict[str, bool] = { - "year": year is not None, - "month": month is not None, - "day": day is not None, - } + self.earliest = datetime.date(min_year, min_month, min_day) + self.latest = datetime.date(max_year, max_month, max_day) if not formatter: # TODO subclass definitions not available unless they are imported where Undate() is called @@ -69,6 +148,24 @@ def __init__( self.label = label def __str__(self) -> str: + # if any portion of the date is partially known, construct + # pseudo ISO8601 format here, since ISO8601 doesn't support unknown digits + # (temporary, should switch to default format that can handle it, e.g. 
EDTF) + if any(self.is_partially_known(part) for part in ["year", "month", "day"]): + # initial values could be either string or int + year = self.initial_values["year"] + month = self.initial_values["month"] + day = self.initial_values["day"] + # if integer, convert to string with correct number of digits + # replace unknown year with - for --MM or --MM-DD format + parts = [ + f"{year:04d}" if isinstance(year, int) else year or "-", + f"{month:02d}" if isinstance(month, int) else month, + f"{day:02d}" if isinstance(day, int) else day, + ] + # combine, skipping any values that are None + return "-".join([str(p) for p in parts if p != None]) + return self.formatter.to_string(self) def __repr__(self) -> str: @@ -82,18 +179,95 @@ def __eq__(self, other: "Undate") -> bool: return ( self.earliest == other.earliest and self.latest == other.latest - and self.known_values == other.known_values + # NOTE: assumes that partially known values can only be written + # in one format (i.e. X for missing digits). + # If we support other formats, will need to normalize to common + # internal format for comparison + and self.initial_values == other.initial_values ) @property def known_year(self) -> bool: - return self.known_values["year"] + return self.is_known("year") + + def is_known(self, part: str) -> bool: + """Check if a part of the date (year, month, day) is known. + Returns False if unknown or only partially known.""" + # TODO: should we use constants or enum for values? + + # if we have an integer, then consider the date known + # if we have a string, then it is only partially known; return false + return isinstance(self.initial_values[part], int) + + def is_partially_known(self, part: str) -> bool: + return isinstance(self.initial_values[part], str) def duration(self) -> datetime.timedelta: - # what is the duration of this date? - # subtract earliest from latest, and add a day to count the starting day + """What is the duration of this date? 
+ Calculate based on earliest and latest date within range, + taking into account the precision of the date even if not all + parts of the date are known.""" + + # if precision is a single day, duration is one day + # no matter when it is or what else is known + if self.precision == DatePrecision.DAY: + return ONE_DAY + + # if precision is month and year is unknown, + # calculate month duration within a single year (not min/max) + if self.precision == DatePrecision.MONTH: + latest = self.latest + if not self.known_year: + # if year is unknown, calculate month duration in + # a single year + latest = datetime.date( + self.earliest.year, self.latest.month, self.latest.day + ) + delta = latest - self.earliest + ONE_DAY + # month duration can't ever be more than 31 days + # (could we ever know if it's smaller?) + + # if granularity == month but not known month, duration = 31 + if delta.days > 31: + return datetime.timedelta(days=31) + return delta + + # otherwise, calculate based on earliest/latest range + + # subtract earliest from latest and add a day to count start day return self.latest - self.earliest + ONE_DAY + def _missing_digit_minmax( + self, value: str, min_val: int, max_val: int + ) -> (int, int): + # given a possible range, calculate min/max values for a string + # with a missing digit + + # assuming two digit only (i.e., month or day) + possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)] + # ensure input value has two digits + value = "%02s" % value + # generate regex where missing digit matches anything + val_pattern = re.compile(value.replace(self.MISSING_DIGIT, ".")) + # identify all possible matches, then get min and max + matches = [val for val in possible_values if val_pattern.match(val)] + min_match = min(matches) + max_match = max(matches) + + # split input string into a list so we can update individually + min_val = list(value) + max_val = list(value) + for i, digit in enumerate(value): + # replace the corresponding digit with our 
min and max + if digit == self.MISSING_DIGIT: + min_val[i] = min_match[i] + max_val[i] = max_match[i] + + # combine the lists of digits back together and convert to int + min_val = int("".join(min_val)) + max_val = int("".join(max_val)) + return (min_val, max_val) + class UndateInterval: """A date range between two uncertain dates. @@ -148,13 +322,18 @@ def duration(self) -> datetime.timedelta: elif not self.latest.known_year and not self.earliest.known_year: # under what circumstances can we assume that if both years # are unknown the dates are in the same year or sequential? - duration = self.latest.earliest - self.earliest.earliest + ONE_DAY + duration = self.latest.earliest - self.earliest.earliest # if we get a negative, we've wrapped from end of one year - # to the beginning of the next + # to the beginning of the next; + # recalculate assuming second date is in the subsequent year if duration.days < 0: end = self.latest.earliest + relativedelta(years=1) duration = end - self.earliest.earliest + # add the additional day *after* checking for a negative + # or after recalculating with adjusted year + duration += ONE_DAY + return duration else: diff --git a/tests/test_undate.py b/tests/test_undate.py index 9217ea4..d02c3d2 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -12,6 +12,17 @@ def test_str(self): assert str(Undate(2022)) == "2022" assert str(Undate(month=11, day=7)) == "--11-07" + def test_partially_known_str(self): + assert str(Undate("19XX")) == "19XX" + assert str(Undate(2022, "1X")) == "2022-1X" + assert str(Undate(2022, 11, "2X")) == "2022-11-2X" + assert str(Undate(month="1X", day=7)) == "--1X-07" + + # TODO: should not allow initializing year/day without month; + # should we infer unknown month? or raise an exception? 
+ # assert str(Undate(2022, day="2X")) == "2022-XX-2X" # currently returns 2022-2X + # assert str(Undate(2022, day=7)) == "2022-XX-07" # currently returns 2022-07 + def test_repr(self): assert repr(Undate(2022, 11, 7)) == "" assert ( @@ -19,6 +30,92 @@ def test_repr(self): == "" ) + def test_init_str(self): + assert Undate("2000").earliest.year == 2000 + # single or double digit string month should be ok + assert Undate("2000", "2").earliest.month == 2 + assert Undate("2000", "02").earliest.month == 2 + + def test_init_partially_known_year(self): + uncertain1900s = Undate("19XX") + assert uncertain1900s.earliest.year == 1900 + assert uncertain1900s.latest.year == 1999 + + uncertain1x = Undate("1X05") + assert uncertain1x.earliest.year == 1005 + assert uncertain1x.latest.year == 1905 + + uncertain18x7 = Undate("18X7") + assert uncertain18x7.earliest.year == 1807 + assert uncertain18x7.latest.year == 1897 + + def test_init_partially_known_month(self): + uncertain_fall = Undate(1900, "1X") + assert uncertain_fall.earliest.month == 10 + assert uncertain_fall.latest.month == 12 + + uncertain_notfall = Undate(1900, "0X") + assert uncertain_notfall.earliest.month == 1 + assert uncertain_notfall.latest.month == 9 + + # unlikely case, but now possible to calculate + assert Undate(1900, "X1").earliest.month == 1 + assert Undate(1900, "X1").latest.month == 11 + + # treat as unknown but allow + unknown_month = Undate(1900, "XX") + assert unknown_month.earliest.month == 1 + assert unknown_month.latest.month == 12 + assert str(unknown_month) == "1900-XX" + + def test_init_partially_known_day(self): + uncertain_day = Undate(1900, 1, "XX") # treat as None + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 31 + + uncertain_day = Undate(1900, 1, "1X") + assert uncertain_day.earliest.day == 10 + assert uncertain_day.latest.day == 19 + + uncertain_day = Undate(1900, 1, "0X") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 9 + 
uncertain_day = Undate(1900, 1, "2X") + assert uncertain_day.earliest.day == 20 + assert uncertain_day.latest.day == 29 + uncertain_day = Undate(1900, 1, "3X") + assert uncertain_day.earliest.day == 30 + assert uncertain_day.latest.day == 31 + + uncertain_day = Undate(1900, 1, "X5") + assert uncertain_day.earliest.day == 5 + assert uncertain_day.latest.day == 25 + + uncertain_day = Undate(1900, 1, "X1") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 31 + + # month with only 30 days + uncertain_day = Undate(1900, 6, "X1") + assert uncertain_day.earliest.day == 1 + assert uncertain_day.latest.day == 21 # doesn't go to 31 + uncertain_day = Undate(1900, 6, "3X") + assert uncertain_day.earliest.day == 30 + assert uncertain_day.latest.day == 30 + + # special cases + # february! 28 days usually + uncertain_day = Undate(1900, 2, "2X") + assert uncertain_day.earliest.day == 20 + assert uncertain_day.latest.day == 28 + # february in a leap year + uncertain_day = Undate(2024, 2, "2X") + assert uncertain_day.latest.day == 29 + + def test_init_invalid(self): + with pytest.raises(ValueError): + Undate("19xx") + def test_invalid_date(self): # invalid month should raise an error with pytest.raises(ValueError): @@ -56,9 +153,40 @@ def test_duration(self): leapyear_duration = Undate(2024).duration() assert leapyear_duration.days == 366 + def test_partiallyknown_duration(self): + # day in unknown month/year + assert Undate(day=5).duration().days == 1 + assert Undate(year=1900, month=11, day="2X").duration().days == 1 + + # month in unknown year + assert Undate(month=6).duration().days == 30 + # partially known month + assert Undate(year=1900, month="1X").duration().days == 31 + # what about february? 
+ # could vary with leap years, but assume non-leapyear + assert Undate(month=2).duration().days == 28 + def test_known_year(self): assert Undate(2022).known_year is True assert Undate(month=2, day=5).known_year is False + # partially known year is not known + assert Undate("19XX").known_year is False + # fully known string year should be known + assert Undate("1900").known_year is True + + def test_is_known_month(self): + assert Undate(2022).is_known("month") is False + assert Undate(2022, 2).is_known("month") is True + assert Undate(2022, "5").is_known("month") is True + assert Undate(2022, "1X").is_known("month") is False + assert Undate(2022, "XX").is_known("month") is False + + def test_is_known_day(self): + assert Undate(1984).is_known("day") is False + assert Undate(month=1, day=3).is_known("day") is True + assert Undate(month=1, day="5").is_known("day") is True + assert Undate(month=1, day="X5").is_known("day") is False + assert Undate(month=1, day="XX").is_known("day") is False class TestUndateInterval: @@ -136,4 +264,12 @@ def test_duration(self): month_noyear_duration = UndateInterval( Undate(None, 12, 1), Undate(None, 1, 1) ).duration() - assert month_noyear_duration.days == 31 + assert month_noyear_duration.days == 32 + # this seems wrong, but we currently count both start and end dates + + # real case from Shakespeare and Company Project data; + # second date is a year minus one day in the future + month_noyear_duration = UndateInterval( + Undate(None, 6, 7), Undate(None, 6, 6) + ).duration() + assert month_noyear_duration.days == 365