From c9e767313be0e812389baa88b77201f98e9406d5 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Fri, 14 Jul 2023 09:16:54 -0400 Subject: [PATCH] Add jupyter notebook comparing partial date duration logic with S&co --- .../shxco_partial_date_durations.ipynb | 5667 +++++++++++++++++ 1 file changed, 5667 insertions(+) create mode 100644 examples/notebooks/shxco_partial_date_durations.ipynb diff --git a/examples/notebooks/shxco_partial_date_durations.ipynb b/examples/notebooks/shxco_partial_date_durations.ipynb new file mode 100644 index 0000000..11d6662 --- /dev/null +++ b/examples/notebooks/shxco_partial_date_durations.ipynb @@ -0,0 +1,5667 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# undate partial date duration check\n", + "compare undate interval duration calculation for date ranges between partial dates with Shakespeare and Company Project events dataset" + ], + "metadata": { + "id": "s_holu9LI6q1" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TbokQJlu4G7Y", + "outputId": "d30849fd-811c-492d-ed37-a66ea4dd9088" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting git+https://github.com/dh-tech/undate-python.git@story/3-partially-known-dates\n", + " Cloning https://github.com/dh-tech/undate-python.git (to revision story/3-partially-known-dates) to /tmp/pip-req-build-aoklox4b\n", + " Running command git clone --filter=blob:none --quiet https://github.com/dh-tech/undate-python.git /tmp/pip-req-build-aoklox4b\n", + " Running command git checkout -b story/3-partially-known-dates --track origin/story/3-partially-known-dates\n", + " Switched to a new branch 
'story/3-partially-known-dates'\n", + " Branch 'story/3-partially-known-dates' set up to track remote branch 'story/3-partially-known-dates' from 'origin'.\n", + " Resolved https://github.com/dh-tech/undate-python.git to commit 615cb0d9af313128a562107305d1b7c2eddd9535\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from undate==0.2.0.dev0) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->undate==0.2.0.dev0) (1.16.0)\n", + "Building wheels for collected packages: undate\n", + " Building wheel for undate (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for undate: filename=undate-0.2.0.dev0-py3-none-any.whl size=15200 sha256=ca2e25447c84ad830f1e7ac31a43d67701390d594fff44637a565c3de5bb6134\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-ctzxaxcn/wheels/b0/dd/8f/69e3af2abd0249334bdcc1836876d45b86b0e1183e79b71123\n", + "Successfully built undate\n", + "Installing collected packages: undate\n", + "Successfully installed undate-0.2.0.dev0\n" + ] + } + ], + "source": [ + "# install from feature branch for now, until merged\n", + "%pip install git+https://github.com/dh-tech/undate-python.git@story/3-partially-known-dates" + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# load most recent version of S&co events dataset\n", + "\n", + "# dataspace link on S&co website is currently broken\n", + "#events_df = pd.read_csv(\"https://dataspace.princeton.edu/bitstream/88435/dsp019306t2441/2/SCoData_events_v1.2_2022-01.csv\")\n", + "# other dataset link resulting in an error; incomplete download?\n", + "# events_df = 
pd.read_csv(\"https://dataspace.princeton.edu/bitstream/88435/dsp019306t2441/1\")\n", + "events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n", + "events_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 579 + }, + "id": "Q7KZRmj_4ySW", + "outputId": "ee3cacd7-c347-437a-ee8e-91a4086d6e88" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " events_df = pd.read_csv(\"https://github.com/rlskoeser/shxco-missingdata-specreading/raw/main/data/source-data/SCoData_events_v1.2_2022-01.csv\")\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "0 Generic 1920 NaN \n", + "1 Subscription 1921 NaN \n", + "2 Borrow 1922 1922-08-23 \n", + "3 Generic 1922 NaN \n", + "4 Subscription 1922 NaN \n", + "\n", + " member_uris member_names \\\n", + "0 https://shakespeareandco.princeton.edu/members... Raymonde Linossier \n", + "1 https://shakespeareandco.princeton.edu/members... Mme Garreta \n", + "2 https://shakespeareandco.princeton.edu/members... Mr. Rhys \n", + "3 https://shakespeareandco.princeton.edu/members... Ernest Walsh \n", + "4 https://shakespeareandco.princeton.edu/members... Mr. Lincoln \n", + "\n", + " member_sort_names subscription_price_paid subscription_deposit \\\n", + "0 Linossier, Raymonde NaN NaN \n", + "1 Garreta, Mme NaN NaN \n", + "2 Rhys, Mr. NaN NaN \n", + "3 Walsh, Ernest NaN NaN \n", + "4 Lincoln, Mr. NaN 7.0 \n", + "\n", + " subscription_duration subscription_duration_days ... \\\n", + "0 NaN NaN ... \n", + "1 NaN NaN ... \n", + "2 NaN NaN ... \n", + "3 NaN NaN ... \n", + "4 NaN NaN ... 
\n", + "\n", + " item_uri item_title \\\n", + "0 https://shakespeareandco.princeton.edu/books/b... Pigs Is Pigs \n", + "1 NaN NaN \n", + "2 https://shakespeareandco.princeton.edu/books/c... Typhoon \n", + "3 https://shakespeareandco.princeton.edu/books/b... The Pretty Lady \n", + "4 NaN NaN \n", + "\n", + " item_volume item_authors item_year item_notes \\\n", + "0 NaN Butler, Ellis Parker 1906.0 NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN Conrad, Joseph 1902.0 NaN \n", + "3 NaN Bennett, Arnold 1918.0 NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "0 Lending Library Card Sylvia Beach, Raymonde Linossier Lending Libra... \n", + "1 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "2 Lending Library Card Sylvia Beach, Rhys Lending Library Card, Box 4... \n", + "3 Lending Library Card Sylvia Beach, Ernest Walsh Lending Library Car... \n", + "4 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest \\\n", + "0 https://figgy.princeton.edu/concern/scanned_re... \n", + "1 NaN \n", + "2 https://figgy.princeton.edu/concern/scanned_re... \n", + "3 https://figgy.princeton.edu/concern/scanned_re... \n", + "4 NaN \n", + "\n", + " source_image \n", + "0 https://iiif.princeton.edu/loris/figgy_prod/00... \n", + "1 NaN \n", + "2 https://iiif.princeton.edu/loris/figgy_prod/67... \n", + "3 https://iiif.princeton.edu/loris/figgy_prod/af... \n", + "4 NaN \n", + "\n", + "[5 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
0Generic1920NaNhttps://shakespeareandco.princeton.edu/members...Raymonde LinossierLinossier, RaymondeNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...Pigs Is PigsNaNButler, Ellis Parker1906.0NaNLending Library CardSylvia Beach, Raymonde Linossier Lending Libra...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/00...
1Subscription1921NaNhttps://shakespeareandco.princeton.edu/members...Mme GarretaGarreta, MmeNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
2Borrow19221922-08-23https://shakespeareandco.princeton.edu/members...Mr. RhysRhys, Mr.NaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/c...TyphoonNaNConrad, Joseph1902.0NaNLending Library CardSylvia Beach, Rhys Lending Library Card, Box 4...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/67...
3Generic1922NaNhttps://shakespeareandco.princeton.edu/members...Ernest WalshWalsh, ErnestNaNNaNNaNNaN...https://shakespeareandco.princeton.edu/books/b...The Pretty LadyNaNBennett, Arnold1918.0NaNLending Library CardSylvia Beach, Ernest Walsh Lending Library Car...https://figgy.princeton.edu/concern/scanned_re...https://iiif.princeton.edu/loris/figgy_prod/af...
4Subscription1922NaNhttps://shakespeareandco.princeton.edu/members...Mr. LincolnLincoln, Mr.NaN7.0NaNNaN...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 2 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## method to calculate durations\n", + "\n", + "define a method to initialize undate interval from start and end date string in ISO format as used in S&co datasets\n", + "\n", + "\n", + "**Note:** there's an off-by-one discrepancy between how we currently calculate duration in undate and in the Shakespeare and Company Project code: the S&co code counts the first day in the range but not the last (could also be thought of as counting half of start and end dates)." + ], + "metadata": { + "id": "0Y6CsfIAJoqi" + } + }, + { + "cell_type": "code", + "source": [ + "from undate.undate import UndateInterval\n", + "from undate.dateformat.iso8601 import ISO8601DateFormat\n", + "\n", + "def undate_duration(start_date, end_date):\n", + " isoformat = ISO8601DateFormat()\n", + "\n", + " unstart = isoformat.parse(start_date)\n", + " unend = isoformat.parse(end_date)\n", + " interval = UndateInterval(earliest=unstart, latest=unend)\n", + "\n", + " # subtract one here for simplicity of comparison,\n", + " # to reconcile the off-by-one difference between undate and S&co duration logic\n", + "\n", + " return interval.duration().days - 1" + ], + "metadata": { + "id": "y_MqgrQW64uI" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## subscription events\n", + "\n", + "compare subscription events with known duration" + ], + "metadata": { + "id": "JBVWMB7lJbYB" + } + }, + { + "cell_type": "code", + "source": [ + "# identify subscription events with duration information\n", + "subs_duration = events_df[events_df.subscription_duration_days.notna()]\n", + "subs_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "c8iPHU5K58cz", + "outputId": "c0cc72ef-ed0b-4a30-d7b5-ea21ef0582c7" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { +
"text/plain": [ + " event_type start_date end_date \\\n", + "28 Subscription 1927 1928 \n", + "70 Subscription 1931 1932 \n", + "233 Subscription 1921-07 1921-08 \n", + "234 Subscription 1921-09 1922-02 \n", + "260 Subscription 1923-06 1923-10 \n", + "\n", + " member_uris \\\n", + "28 https://shakespeareandco.princeton.edu/members... \n", + "70 https://shakespeareandco.princeton.edu/members... \n", + "233 https://shakespeareandco.princeton.edu/members... \n", + "234 https://shakespeareandco.princeton.edu/members... \n", + "260 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "28 Arthur Elliott Felkin \n", + "70 Geraldine Deknatel;William Deknatel \n", + "233 Mrs. G. S. Madam \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell \n", + "260 Victor Llona \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "28 Felkin, Arthur Elliott NaN \n", + "70 Deknatel, Geraldine;Deknatel, William NaN \n", + "233 Madam, Mrs. G. S. NaN \n", + "234 Moderwell, Anne;Moderwell, Hiram NaN \n", + "260 Llona, Victor NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "28 NaN 1 year 365.0 \n", + "70 NaN 1 year 365.0 \n", + "233 NaN 1 month 31.0 \n", + "234 NaN 5 months 153.0 \n", + "260 NaN 4 months 122.0 \n", + "\n", + " ... item_uri item_title item_volume item_authors item_year item_notes \\\n", + "28 ... NaN NaN NaN NaN NaN NaN \n", + "70 ... NaN NaN NaN NaN NaN NaN \n", + "233 ... NaN NaN NaN NaN NaN NaN \n", + "234 ... NaN NaN NaN NaN NaN NaN \n", + "260 ... NaN NaN NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "28 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "70 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "233 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "234 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... 
\n", + "260 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest source_image \n", + "28 NaN NaN \n", + "70 NaN NaN \n", + "233 NaN NaN \n", + "234 NaN NaN \n", + "260 NaN NaN \n", + "\n", + "[5 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
28Subscription19271928https://shakespeareandco.princeton.edu/members...Arthur Elliott FelkinFelkin, Arthur ElliottNaNNaN1 year365.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
70Subscription19311932https://shakespeareandco.princeton.edu/members...Geraldine Deknatel;William DeknatelDeknatel, Geraldine;Deknatel, WilliamNaNNaN1 year365.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
233Subscription1921-071921-08https://shakespeareandco.princeton.edu/members...Mrs. G. S. MadamMadam, Mrs. G. S.NaNNaN1 month31.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
234Subscription1921-091922-02https://shakespeareandco.princeton.edu/members...Anne Moderwell;Hiram Moderwell / H. K. ModerwellModerwell, Anne;Moderwell, HiramNaNNaN5 months153.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
260Subscription1923-061923-10https://shakespeareandco.princeton.edu/members...Victor LlonaLlona, VictorNaNNaN4 months122.0...NaNNaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN
\n", + "

5 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what do the subscription duration day values look like?\n", + "subs_duration.subscription_duration_days.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9i0aN7iQ6voY", + "outputId": "fe1ac93f-5571-4bd3-e4c1-06e90cf33f5c" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "31.0 2997\n", + "30.0 1975\n", + "92.0 936\n", + "91.0 397\n", + "365.0 337\n", + " ... \n", + "69.0 1\n", + "36.0 1\n", + "73.0 1\n", + "574.0 1\n", + "171.0 1\n", + "Name: subscription_duration_days, Length: 133, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 5 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration.subscription_duration_days.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aGqi4LRp60tV", + "outputId": "fbd61c94-41ab-40a7-87c2-cf0548c75d5a" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "count 9146.000000\n", + "mean 72.142685\n", + "std 81.559368\n", + "min 1.000000\n", + "25% 30.000000\n", + "50% 31.000000\n", + "75% 91.000000\n", + "max 574.000000\n", + "Name: subscription_duration_days, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# do we have unknown start/end date values?\n", + "subs_duration[subs_duration.start_date.isna()]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "pUsAb16MKqvb", + "outputId": "27f3b8e7-c5a5-4297-eb7e-e37e81945dda" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Empty DataFrame\n", + "Columns: [event_type, start_date, end_date, member_uris, member_names, member_sort_names, 
subscription_price_paid, subscription_deposit, subscription_duration, subscription_duration_days, subscription_volumes, subscription_category, subscription_purchase_date, reimbursement_refund, borrow_status, borrow_duration_days, purchase_price, currency, item_uri, item_title, item_volume, item_authors, item_year, item_notes, source_type, source_citation, source_manifest, source_image]\n", + "Index: []\n", + "\n", + "[0 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
\n", + "

0 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration[subs_duration.end_date.isna()]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 310 + }, + "id": "0odaog0eK0CN", + "outputId": "1e8814ff-0043-4969-b1d1-7574c3e82008" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "13168 Subscription 1932-10-06 NaN \n", + "13686 Subscription 1933-03-02 NaN \n", + "\n", + " member_uris \\\n", + "13168 https://shakespeareandco.princeton.edu/members... \n", + "13686 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "13168 Jean (Bakewell) Connolly / Mrs. Cyril Connolly \n", + "13686 Stanislas Pascal Franchot \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "13168 Connolly, Jean NaN \n", + "13686 Franchot, Stanislas Pascal NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "13168 100.0 NaN 31.0 \n", + "13686 50.0 NaN 31.0 \n", + "\n", + " ... item_uri item_title item_volume item_authors item_year \\\n", + "13168 ... NaN NaN NaN NaN NaN \n", + "13686 ... NaN NaN NaN NaN NaN \n", + "\n", + " item_notes source_type \\\n", + "13168 NaN Logbook \n", + "13686 NaN Logbook;Lending Library Card \n", + "\n", + " source_citation \\\n", + "13168 Sylvia Beach, Logbooks 1919–1941, Sylvia Beach... \n", + "13686 Sylvia Beach, Logbooks 1919–1941, Sylvia Beach... \n", + "\n", + " source_manifest \\\n", + "13168 NaN \n", + "13686 ;https://figgy.princeton.edu/concern/scanned_r... \n", + "\n", + " source_image \n", + "13168 NaN \n", + "13686 ;https://iiif.princeton.edu/loris/figgy_prod/7... \n", + "\n", + "[2 rows x 28 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_uriitem_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_image
13168Subscription1932-10-06NaNhttps://shakespeareandco.princeton.edu/members...Jean (Bakewell) Connolly / Mrs. Cyril ConnollyConnolly, JeanNaN100.0NaN31.0...NaNNaNNaNNaNNaNNaNLogbookSylvia Beach, Logbooks 1919–1941, Sylvia Beach...NaNNaN
13686Subscription1933-03-02NaNhttps://shakespeareandco.princeton.edu/members...Stanislas Pascal FranchotFranchot, Stanislas PascalNaN50.0NaN31.0...NaNNaNNaNNaNNaNNaNLogbook;Lending Library CardSylvia Beach, Logbooks 1919–1941, Sylvia Beach...;https://figgy.princeton.edu/concern/scanned_r...;https://iiif.princeton.edu/loris/figgy_prod/7...
\n", + "

2 rows × 28 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# omit events with unknown end date since we can't recalculate duration\n", + "# (duration in the dataset is based on the subscription duration)\n", + "subs_duration = subs_duration[subs_duration.end_date.notna()]" + ], + "metadata": { + "id": "jwvN9-CgLQRx" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# add a new field for duration as calculated by undate\n", + "subs_duration[\"undate_duration\"] = subs_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "subs_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 560 + }, + "id": "Z-CVWd3z7Jb6", + "outputId": "d52d57d4-9803-4bfa-9708-bdf149c7098b" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " event_type start_date end_date \\\n", + "28 Subscription 1927 1928 \n", + "70 Subscription 1931 1932 \n", + "233 Subscription 1921-07 1921-08 \n", + "234 Subscription 1921-09 1922-02 \n", + "260 Subscription 1923-06 1923-10 \n", + "\n", + " member_uris \\\n", + "28 https://shakespeareandco.princeton.edu/members... \n", + "70 https://shakespeareandco.princeton.edu/members... \n", + "233 https://shakespeareandco.princeton.edu/members... \n", + "234 https://shakespeareandco.princeton.edu/members... \n", + "260 https://shakespeareandco.princeton.edu/members... \n", + "\n", + " member_names \\\n", + "28 Arthur Elliott Felkin \n", + "70 Geraldine Deknatel;William Deknatel \n", + "233 Mrs. G. S. Madam \n", + "234 Anne Moderwell;Hiram Moderwell / H. K. Moderwell \n", + "260 Victor Llona \n", + "\n", + " member_sort_names subscription_price_paid \\\n", + "28 Felkin, Arthur Elliott NaN \n", + "70 Deknatel, Geraldine;Deknatel, William NaN \n", + "233 Madam, Mrs. G. S. 
NaN \n", + "234 Moderwell, Anne;Moderwell, Hiram NaN \n", + "260 Llona, Victor NaN \n", + "\n", + " subscription_deposit subscription_duration subscription_duration_days \\\n", + "28 NaN 1 year 365.0 \n", + "70 NaN 1 year 365.0 \n", + "233 NaN 1 month 31.0 \n", + "234 NaN 5 months 153.0 \n", + "260 NaN 4 months 122.0 \n", + "\n", + " ... item_title item_volume item_authors item_year item_notes \\\n", + "28 ... NaN NaN NaN NaN NaN \n", + "70 ... NaN NaN NaN NaN NaN \n", + "233 ... NaN NaN NaN NaN NaN \n", + "234 ... NaN NaN NaN NaN NaN \n", + "260 ... NaN NaN NaN NaN NaN \n", + "\n", + " source_type source_citation \\\n", + "28 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "70 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "233 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "234 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "260 Address Book Sylvia Beach, Address Book 1919–1935, box 69, ... \n", + "\n", + " source_manifest source_image undate_duration \n", + "28 NaN NaN 730 \n", + "70 NaN NaN 730 \n", + "233 NaN NaN 61 \n", + "234 NaN NaN 180 \n", + "260 NaN NaN 152 \n", + "\n", + "[5 rows x 29 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_typestart_dateend_datemember_urismember_namesmember_sort_namessubscription_price_paidsubscription_depositsubscription_durationsubscription_duration_days...item_titleitem_volumeitem_authorsitem_yearitem_notessource_typesource_citationsource_manifestsource_imageundate_duration
28Subscription19271928https://shakespeareandco.princeton.edu/members...Arthur Elliott FelkinFelkin, Arthur ElliottNaNNaN1 year365.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN730
70Subscription19311932https://shakespeareandco.princeton.edu/members...Geraldine Deknatel;William DeknatelDeknatel, Geraldine;Deknatel, WilliamNaNNaN1 year365.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN730
233Subscription1921-071921-08https://shakespeareandco.princeton.edu/members...Mrs. G. S. MadamMadam, Mrs. G. S.NaNNaN1 month31.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN61
234Subscription1921-091922-02https://shakespeareandco.princeton.edu/members...Anne Moderwell;Hiram Moderwell / H. K. ModerwellModerwell, Anne;Moderwell, HiramNaNNaN5 months153.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN180
260Subscription1923-061923-10https://shakespeareandco.princeton.edu/members...Victor LlonaLlona, VictorNaNNaN4 months122.0...NaNNaNNaNNaNNaNAddress BookSylvia Beach, Address Book 1919–1935, box 69, ...NaNNaN152
\n", + "

5 rows × 29 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# compare undate duration with dataset duration\n", + "# limit to fields we care about\n", + "subs_duration = subs_duration[['start_date', 'end_date', 'subscription_duration', 'subscription_duration_days', 'undate_duration']]\n", + "subs_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "fVf6M2E2LgnH", + "outputId": "87e6585a-670d-466e-d206-caabaaa48df9" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "35114 1941-11-24 1941-12-24 1 month \n", + "35115 1941-11-24 1941-12-24 1 month \n", + "35116 1941-12-04 1942-01-04 1 month \n", + "35118 1941-12-08 1942-03-08 3 months \n", + "35119 1941-12-09 1942-01-09 1 month \n", + "\n", + " subscription_duration_days undate_duration \n", + "28 365.0 730 \n", + "70 365.0 730 \n", + "233 31.0 61 \n", + "234 153.0 180 \n", + "260 122.0 152 \n", + "... ... ... \n", + "35114 30.0 30 \n", + "35115 30.0 30 \n", + "35116 31.0 31 \n", + "35118 90.0 90 \n", + "35119 31.0 31 \n", + "\n", + "[9144 rows x 5 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_duration
28192719281 year365.0730
70193119321 year365.0730
2331921-071921-081 month31.061
2341921-091922-025 months153.0180
2601923-061923-104 months122.0152
..................
351141941-11-241941-12-241 month30.030
351151941-11-241941-12-241 month30.030
351161941-12-041942-01-041 month31.031
351181941-12-081942-03-083 months90.090
351191941-12-091942-01-091 month31.031
\n", + "

9144 rows × 5 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what's the difference between the two?\n", + "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration - row.subscription_duration_days, axis=1)\n", + "subs_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "drnCqTtsL835", + "outputId": "dc042b74-295a-436c-9c70-c6014d986cf7" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "35114 1941-11-24 1941-12-24 1 month \n", + "35115 1941-11-24 1941-12-24 1 month \n", + "35116 1941-12-04 1942-01-04 1 month \n", + "35118 1941-12-08 1942-03-08 3 months \n", + "35119 1941-12-09 1942-01-09 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "233 31.0 61 30.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "... ... ... ... \n", + "35114 30.0 30 0.0 \n", + "35115 30.0 30 0.0 \n", + "35116 31.0 31 0.0 \n", + "35118 90.0 90 0.0 \n", + "35119 31.0 31 0.0 \n", + "\n", + "[9144 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2331921-071921-081 month31.06130.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
.....................
351141941-11-241941-12-241 month30.0300.0
351151941-11-241941-12-241 month30.0300.0
351161941-12-041942-01-041 month31.0310.0
351181941-12-081942-03-083 months90.0900.0
351191941-12-091942-01-091 month31.0310.0
\n", + "

9144 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "source": [ + "subs_duration['duration_diff'].value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "z3i984igMNjm", + "outputId": "c8a3580e-a36a-4756-d427-286ba8e5cf91" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 0.0 9065\n", + " 30.0 30\n", + " 29.0 21\n", + " 1.0 10\n", + "-1.0 9\n", + " 28.0 4\n", + " 365.0 2\n", + " 27.0 1\n", + " 2.0 1\n", + "-3.0 1\n", + "Name: duration_diff, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### investigate discrepancies" + ], + "metadata": { + "id": "Uu9kmAA_gm5o" + } + }, + { + "cell_type": "code", + "source": [ + "# investigate the ones with nonzero differences\n", + "subset_subdurations = subs_duration[subs_duration.duration_diff != 0]\n", + "subset_subdurations" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "gdenGvR1MkUG", + "outputId": "589b6b49-3f9c-42d5-e01f-326401007878" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "233 1921-07 1921-08 1 month \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "... ... ... ... \n", + "33878 1933-12-01 1934-01-01 1 month \n", + "33880 1933-12-02 1934-01-02 1 month \n", + "33902 1934-01-02 1934-07-02 6 months \n", + "33936 1934-06-02 1934-12-02 6 months \n", + "34892 1940-11-30 1940-12-30 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "233 31.0 61 30.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "... ... ... ... 
\n", + "33878 30.0 31 1.0 \n", + "33880 30.0 31 1.0 \n", + "33902 182.0 181 -1.0 \n", + "33936 184.0 183 -1.0 \n", + "34892 31.0 30 -1.0 \n", + "\n", + "[79 rows x 6 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2331921-071921-081 month31.06130.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
.....................
338781933-12-011934-01-011 month30.0311.0
338801933-12-021934-01-021 month30.0311.0
339021934-01-021934-07-026 months182.0181-1.0
339361934-06-021934-12-026 months184.0183-1.0
348921940-11-301940-12-301 month31.030-1.0
\n", + "

79 rows × 6 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# too many to look at once, can we segment by subscription duration?\n", + "subset_subdurations.subscription_duration.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9_w1Cwl2N81d", + "outputId": "c0733942-16cd-42bf-c9a3-abbf250e44f5" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "1 month 38\n", + "3 months 12\n", + "2 months 7\n", + "6 months 6\n", + "4 months 5\n", + "5 months 3\n", + "1 year 2\n", + "7 months 2\n", + "8 months 2\n", + "11 months 1\n", + "10 months 1\n", + "Name: subscription_duration, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# lots of one-month subscriptions, what do the discrepancies look like?\n", + "subset_subdurations[subset_subdurations.subscription_duration == '1 month']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "snv1qguUOHPB", + "outputId": "dce76078-236b-48ee-9607-5d702cf4ee04" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "233 1921-07 1921-08 1 month \n", + "261 1923-08 1923-09 1 month \n", + "271 1924-02 1924-03 1 month \n", + "313 1926-11 1926-12 1 month \n", + "354 1928-02 1928-03 1 month \n", + "356 1928-02 1928-03 1 month \n", + "393 1929-08 1929-09 1 month \n", + "394 1929-08 1929-09 1 month \n", + "430 1930-05 1930-06 1 month \n", + "444 1930-11 1930-12 1 month \n", + "462 1931-05 1931-06 1 month \n", + "464 1931-06 1931-07 1 month \n", + "466 1931-07 1931-08 1 month \n", + "468 1931-08 1931-09 1 month \n", + "472 1931-09 1931-10 1 month \n", + "477 1931-10 1931-11 1 month \n", + "478 1931-10 1931-11 1 month \n", + "483 1931-11 1931-12 
1 month \n", + "484 1931-11 1931-12 1 month \n", + "487 1931-12 1932-01 1 month \n", + "492 1932-01 1932-02 1 month \n", + "500 1932-02 1932-03 1 month \n", + "501 1932-02 1932-03 1 month \n", + "504 1932-03 1932-04 1 month \n", + "516 1932-04 1932-05 1 month \n", + "517 1932-05 1932-06 1 month \n", + "7064 1926-09-15 1926-10-15 1 month \n", + "31089 1923-11-22 1923-12-22 1 month \n", + "31511 1924-11-08 1924-12-08 1 month \n", + "31722 1925-05-09 1925-06-09 1 month \n", + "32269 1926-06-10 1926-07-10 1 month \n", + "32444 1926-10-07 1926-11-07 1 month \n", + "33401 1929-05-18 1929-06-18 1 month \n", + "33665 1932-12-15 1933-01-15 1 month \n", + "33709 1933-02-03 1933-03-03 1 month \n", + "33878 1933-12-01 1934-01-01 1 month \n", + "33880 1933-12-02 1934-01-02 1 month \n", + "34892 1940-11-30 1940-12-30 1 month \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "233 31.0 61 30.0 \n", + "261 31.0 60 29.0 \n", + "271 29.0 59 30.0 \n", + "313 30.0 60 30.0 \n", + "354 29.0 59 30.0 \n", + "356 29.0 59 30.0 \n", + "393 31.0 60 29.0 \n", + "394 31.0 60 29.0 \n", + "430 31.0 60 29.0 \n", + "444 30.0 60 30.0 \n", + "462 31.0 60 29.0 \n", + "464 30.0 60 30.0 \n", + "466 31.0 61 30.0 \n", + "468 31.0 60 29.0 \n", + "472 30.0 60 30.0 \n", + "477 31.0 60 29.0 \n", + "478 31.0 60 29.0 \n", + "483 30.0 60 30.0 \n", + "484 30.0 60 30.0 \n", + "487 31.0 61 30.0 \n", + "492 31.0 59 28.0 \n", + "500 29.0 59 30.0 \n", + "501 29.0 59 30.0 \n", + "504 31.0 60 29.0 \n", + "516 30.0 60 30.0 \n", + "517 31.0 60 29.0 \n", + "7064 31.0 30 -1.0 \n", + "31089 31.0 30 -1.0 \n", + "31511 31.0 30 -1.0 \n", + "31722 30.0 31 1.0 \n", + "32269 31.0 30 -1.0 \n", + "32444 30.0 31 1.0 \n", + "33401 30.0 31 1.0 \n", + "33665 30.0 31 1.0 \n", + "33709 31.0 28 -3.0 \n", + "33878 30.0 31 1.0 \n", + "33880 30.0 31 1.0 \n", + "34892 31.0 30 -1.0 " + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
2331921-071921-081 month31.06130.0
2611923-081923-091 month31.06029.0
2711924-021924-031 month29.05930.0
3131926-111926-121 month30.06030.0
3541928-021928-031 month29.05930.0
3561928-021928-031 month29.05930.0
3931929-081929-091 month31.06029.0
3941929-081929-091 month31.06029.0
4301930-051930-061 month31.06029.0
4441930-111930-121 month30.06030.0
4621931-051931-061 month31.06029.0
4641931-061931-071 month30.06030.0
4661931-071931-081 month31.06130.0
4681931-081931-091 month31.06029.0
4721931-091931-101 month30.06030.0
4771931-101931-111 month31.06029.0
4781931-101931-111 month31.06029.0
4831931-111931-121 month30.06030.0
4841931-111931-121 month30.06030.0
4871931-121932-011 month31.06130.0
4921932-011932-021 month31.05928.0
5001932-021932-031 month29.05930.0
5011932-021932-031 month29.05930.0
5041932-031932-041 month31.06029.0
5161932-041932-051 month30.06030.0
5171932-051932-061 month31.06029.0
70641926-09-151926-10-151 month31.030-1.0
310891923-11-221923-12-221 month31.030-1.0
315111924-11-081924-12-081 month31.030-1.0
317221925-05-091925-06-091 month30.0311.0
322691926-06-101926-07-101 month31.030-1.0
324441926-10-071926-11-071 month30.0311.0
334011929-05-181929-06-181 month30.0311.0
336651932-12-151933-01-151 month30.0311.0
337091933-02-031933-03-031 month31.028-3.0
338781933-12-011934-01-011 month30.0311.0
338801933-12-021934-01-021 month30.0311.0
348921940-11-301940-12-301 month31.030-1.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The first set of these is calculated differently because the dates are partial; undate calculates from the earliest possible date through the latest possible date, but in these cases we have additional, project-specific information that undate can't take into account, i.e. the subscription duration is one month starting sometime in a known year or month.\n", + "\n", + "The handful towards the end that are off by one in either direction (+/-), plus one that is off by three (1933-02-03 to 1933-03-03), are a little more concerning... (potential bug in S&co code? or value calculated based on known semantic duration?)" + ], + "metadata": { + "id": "Rm4jqlA4hq9E" + } + }, + { + "cell_type": "code", + "source": [ + "# durations other than one month\n", + "subset_subdurations[subset_subdurations.subscription_duration != '1 month']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "TEL7qdNhOXHL", + "outputId": "50e051d5-18ae-4f24-a229-fc02fb610ed8" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date subscription_duration \\\n", + "28 1927 1928 1 year \n", + "70 1931 1932 1 year \n", + "234 1921-09 1922-02 5 months \n", + "260 1923-06 1923-10 4 months \n", + "272 1924-02 1924-04 2 months \n", + "293 1926-03 1926-10 7 months \n", + "321 1927-03 1928-02 11 months \n", + "331 1927-07 1927-10 3 months \n", + "337 1927-10 1928-06 8 months \n", + "349 1928-01 1928-04 3 months \n", + "388 1929-06 1930-04 10 months \n", + "408 1930-01 1930-04 3 months \n", + "409 1930-01 1930-04 3 months \n", + "412 1930-01 1930-09 8 months \n", + "415 1930-02 1930-06 4 months \n", + "431 1930-05 1930-07 2 months \n", + "437 1930-09 1930-12 3 months \n", + "454 1930-12 1931-03 3 months \n", + "459 1931-03 1931-05 2 months \n", + "465 1931-07 1931-10 3 months \n", + "471 1931-09 1931-12 3 months \n", 
+ "475 1931-09 1931-12 3 months \n", + "476 1931-10 1932-03 5 months \n", + "480 1931-10 1932-02 4 months \n", + "485 1931-11 1932-06 7 months \n", + "486 1931-12 1932-05 5 months \n", + "489 1931-12 1932-02 2 months \n", + "490 1931-12 1932-04 4 months \n", + "496 1932-01 1932-03 2 months \n", + "502 1932-02 1932-06 4 months \n", + "506 1932-03 1932-05 2 months \n", + "507 1932-03 1932-05 2 months \n", + "709 1919-12-02 1920-06-02 6 months \n", + "7560 1927-01-11 1927-04-11 3 months \n", + "31480 1924-10-17 1925-04-17 6 months \n", + "31917 1925-10-21 1926-01-21 3 months \n", + "32613 1927-03-14 1927-06-14 3 months \n", + "32671 1927-06-14 1927-12-14 6 months \n", + "32869 1927-12-14 1928-06-14 6 months \n", + "33902 1934-01-02 1934-07-02 6 months \n", + "33936 1934-06-02 1934-12-02 6 months \n", + "\n", + " subscription_duration_days undate_duration duration_diff \n", + "28 365.0 730 365.0 \n", + "70 365.0 730 365.0 \n", + "234 153.0 180 27.0 \n", + "260 122.0 152 30.0 \n", + "272 60.0 89 29.0 \n", + "293 214.0 244 30.0 \n", + "321 337.0 365 28.0 \n", + "331 92.0 122 30.0 \n", + "337 244.0 273 29.0 \n", + "349 91.0 120 29.0 \n", + "388 304.0 333 29.0 \n", + "408 90.0 119 29.0 \n", + "409 90.0 119 29.0 \n", + "412 243.0 272 29.0 \n", + "415 120.0 149 29.0 \n", + "431 61.0 91 30.0 \n", + "437 91.0 121 30.0 \n", + "454 90.0 120 30.0 \n", + "459 61.0 91 30.0 \n", + "465 92.0 122 30.0 \n", + "471 91.0 121 30.0 \n", + "475 91.0 121 30.0 \n", + "476 152.0 182 30.0 \n", + "480 123.0 151 28.0 \n", + "485 213.0 242 29.0 \n", + "486 152.0 182 30.0 \n", + "489 62.0 90 28.0 \n", + "490 122.0 151 29.0 \n", + "496 60.0 90 30.0 \n", + "502 121.0 150 29.0 \n", + "506 61.0 91 30.0 \n", + "507 61.0 91 30.0 \n", + "709 182.0 183 1.0 \n", + "7560 91.0 90 -1.0 \n", + "31480 181.0 182 1.0 \n", + "31917 91.0 92 1.0 \n", + "32613 90.0 92 2.0 \n", + "32671 184.0 183 -1.0 \n", + "32869 182.0 183 1.0 \n", + "33902 182.0 181 -1.0 \n", + "33936 184.0 183 -1.0 " + ], + "text/html": [ + "\n", + 
"\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_datesubscription_durationsubscription_duration_daysundate_durationduration_diff
28192719281 year365.0730365.0
70193119321 year365.0730365.0
2341921-091922-025 months153.018027.0
2601923-061923-104 months122.015230.0
2721924-021924-042 months60.08929.0
2931926-031926-107 months214.024430.0
3211927-031928-0211 months337.036528.0
3311927-071927-103 months92.012230.0
3371927-101928-068 months244.027329.0
3491928-011928-043 months91.012029.0
3881929-061930-0410 months304.033329.0
4081930-011930-043 months90.011929.0
4091930-011930-043 months90.011929.0
4121930-011930-098 months243.027229.0
4151930-021930-064 months120.014929.0
4311930-051930-072 months61.09130.0
4371930-091930-123 months91.012130.0
4541930-121931-033 months90.012030.0
4591931-031931-052 months61.09130.0
4651931-071931-103 months92.012230.0
4711931-091931-123 months91.012130.0
4751931-091931-123 months91.012130.0
4761931-101932-035 months152.018230.0
4801931-101932-024 months123.015128.0
4851931-111932-067 months213.024229.0
4861931-121932-055 months152.018230.0
4891931-121932-022 months62.09028.0
4901931-121932-044 months122.015129.0
4961932-011932-032 months60.09030.0
5021932-021932-064 months121.015029.0
5061932-031932-052 months61.09130.0
5071932-031932-052 months61.09130.0
7091919-12-021920-06-026 months182.01831.0
75601927-01-111927-04-113 months91.090-1.0
314801924-10-171925-04-176 months181.01821.0
319171925-10-211926-01-213 months91.0921.0
326131927-03-141927-06-143 months90.0922.0
326711927-06-141927-12-146 months184.0183-1.0
328691927-12-141928-06-146 months182.01831.0
339021934-01-021934-07-026 months182.0181-1.0
339361934-06-021934-12-026 months184.0183-1.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## borrow events\n", + "\n", + "compare borrow events with known duration" + ], + "metadata": { + "id": "2tk6N7SXKKCu" + } + }, + { + "cell_type": "code", + "source": [ + "borrow_duration = events_df[events_df.borrow_duration_days.notna()]\n", + "# limit to fields we care about for this check\n", + "borrow_duration = borrow_duration[['start_date', 'end_date', 'borrow_duration_days']]\n", + "borrow_duration.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "fA1Nedmz6cyF", + "outputId": "5230d5ad-fec4-4353-a0d2-9676d1aa776d" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days\n", + "602 --01-07 --01-13 6.0\n", + "603 --01-12 --01-20 8.0\n", + "604 --01-16 --02-16 31.0\n", + "605 --01-19 --01-24 5.0\n", + "606 --01-20 --01-28 8.0" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_days
602--01-07--01-136.0
603--01-12--01-208.0
604--01-16--02-1631.0
605--01-19--01-245.0
606--01-20--01-288.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "source": [ + "borrow_duration.tail()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "KPOBIRsTUKM9", + "outputId": "4a251445-e7c7-4250-82df-ece0bc9a3d56" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days\n", + "29903 1961-06-30 1961-10-04 96.0\n", + "29904 1961-06-30 1961-10-04 96.0\n", + "29905 1961-06-30 1961-10-04 96.0\n", + "29907 1961-10-04 1962-03-21 168.0\n", + "29908 1961-10-04 1962-03-21 168.0" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_days
299031961-06-301961-10-0496.0
299041961-06-301961-10-0496.0
299051961-06-301961-10-0496.0
299071961-10-041962-03-21168.0
299081961-10-041962-03-21168.0
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# add a new field for duration as calculated by undate\n", + "borrow_duration[\"undate_duration\"] = borrow_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "borrow_duration.head(10)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "39nEPZva8jDo", + "outputId": "6cff4de2-c188-43ad-dc75-684c4d461029" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days undate_duration\n", + "602 --01-07 --01-13 6.0 6\n", + "603 --01-12 --01-20 8.0 8\n", + "604 --01-16 --02-16 31.0 31\n", + "605 --01-19 --01-24 5.0 5\n", + "606 --01-20 --01-28 8.0 8\n", + "607 --01-24 --03-20 55.0 55\n", + "608 --01-24 --03-20 55.0 55\n", + "609 --01-24 --03-20 55.0 55\n", + "610 --01-24 --05-30 126.0 126\n", + "611 --01-24 --05-30 126.0 126" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_duration
602--01-07--01-136.06
603--01-12--01-208.08
604--01-16--02-1631.031
605--01-19--01-245.05
606--01-20--01-288.08
607--01-24--03-2055.055
608--01-24--03-2055.055
609--01-24--03-2055.055
610--01-24--05-30126.0126
611--01-24--05-30126.0126
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what's the difference between the two?\n", + "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration - row.borrow_duration_days, axis=1)\n", + "borrow_duration" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "rL5S47wPWfd-", + "outputId": "127af40e-0037-4f99-d590-9cc2466a206b" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " start_date end_date borrow_duration_days undate_duration \\\n", + "602 --01-07 --01-13 6.0 6 \n", + "603 --01-12 --01-20 8.0 8 \n", + "604 --01-16 --02-16 31.0 31 \n", + "605 --01-19 --01-24 5.0 5 \n", + "606 --01-20 --01-28 8.0 8 \n", + "... ... ... ... ... \n", + "29903 1961-06-30 1961-10-04 96.0 96 \n", + "29904 1961-06-30 1961-10-04 96.0 96 \n", + "29905 1961-06-30 1961-10-04 96.0 96 \n", + "29907 1961-10-04 1962-03-21 168.0 168 \n", + "29908 1961-10-04 1962-03-21 168.0 168 \n", + "\n", + " duration_diff \n", + "602 0.0 \n", + "603 0.0 \n", + "604 0.0 \n", + "605 0.0 \n", + "606 0.0 \n", + "... ... \n", + "29903 0.0 \n", + "29904 0.0 \n", + "29905 0.0 \n", + "29907 0.0 \n", + "29908 0.0 \n", + "\n", + "[19728 rows x 5 columns]" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_durationduration_diff
602--01-07--01-136.060.0
603--01-12--01-208.080.0
604--01-16--02-1631.0310.0
605--01-19--01-245.050.0
606--01-20--01-288.080.0
..................
299031961-06-301961-10-0496.0960.0
299041961-06-301961-10-0496.0960.0
299051961-06-301961-10-0496.0960.0
299071961-10-041962-03-21168.01680.0
299081961-10-041962-03-21168.01680.0
\n", + "

19728 rows × 5 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 22 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# what do the duration differences look like?\n", + "borrow_duration.duration_diff.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DQumLSXZW7r6", + "outputId": "fc5196d6-9d9a-430e-ecb2-c142676c3614" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.0 19728\n", + "Name: duration_diff, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Woohoo, everything matches! 🎉\n", + "\n", + "In a previous run, there were two borrow events where the calculation did not match; this was due to an error in the undate duration method when the start and end dates have unknown years and dates wrap to the following year (e.g., December to January), which has now been corrected.\n", + "\n", + "**Note:** One of those events has a range (--06-07/--06-06) that looks like a data error in S&co, but the data matches what is [written on the lending card](https://shakespeareandco.princeton.edu/members/davet-yvonne/cards/cf96d38f-e651-491c-a575-131ea32ce425/#)." + ], + "metadata": { + "id": "r0TUYWzSXIil" + } + }, + { + "cell_type": "code", + "source": [ + "borrow_duration[borrow_duration.duration_diff != 0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 + }, + "id": "-Bq76gtDWljg", + "outputId": "f1ee526d-b938-4cbf-e93c-c6c91c077ae7" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Empty DataFrame\n", + "Columns: [start_date, end_date, borrow_duration_days, undate_duration, duration_diff]\n", + "Index: []" + ], + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
start_dateend_dateborrow_duration_daysundate_durationduration_diff
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + } + ] +} \ No newline at end of file