diff --git a/day-two/10-putting-it-together.ipynb b/day-two/10-putting-it-together.ipynb deleted file mode 100644 index be039bc..0000000 --- a/day-two/10-putting-it-together.ipynb +++ /dev/null @@ -1,1725 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Workflow\n", - "\n", - "- Download JSON representations of a collection, specifically [Selected Digitized Books](https://www.loc.gov/collections/selected-digitized-books/)\n", - "- Use pagination (with `next` field) to get the first 5 pages of books\n", - " - Build a big list of item URLs \n", - "- Download individual item JSON from the big list of URLs\n", - " - Save JSON to disk\n", - "- Extract individual information from JSON and add to Pandas dataframe\n", - " - Page numbers from the `medium` field\n", - " - requires some string cleaning\n", - "- Visualize a histogram of page length of items in the collection\n", - "- Save the dataframe as CSV/XLSX to disk" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import json\n", - "from pathlib import Path\n", - "\n", - "import pandas as pd\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# Directory for saving files\n", - "DATA_DIR = \"json-data/\"\n", - "Path(DATA_DIR).mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Depth parameter\n", - "PAGE_LIMIT = 5\n", - "\n", - "\n", - "# HTTP Parameters\n", - "BASE_URL = \"https://loc.gov\"\n", - "ENDPOINT = \"/collections/selected-digitized-books\"\n", - "FORMAT = \"json\"\n", - 
"RESULTS_PER_PAGE = 50" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fetch Collection Index" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fetching https://loc.gov/collections/selected-digitized-books/?fo=json&c=50&sp=1\n", - "Fetching https://loc.gov/collections/selected-digitized-books/?fo=json&c=50&sp=2\n", - "Fetching https://loc.gov/collections/selected-digitized-books/?fo=json&c=50&sp=3\n", - "Fetching https://loc.gov/collections/selected-digitized-books/?fo=json&c=50&sp=4\n", - "Fetching https://loc.gov/collections/selected-digitized-books/?fo=json&c=50&sp=5\n" - ] - } - ], - "source": [ - "results_pile = []\n", - "\n", - "for page_num in range(1,PAGE_LIMIT+1):\n", - " \n", - " URL = BASE_URL + ENDPOINT + \"/?fo={FORMAT}&c={RESULTS}&sp={PAGE}\".format(FORMAT=FORMAT,\n", - " RESULTS=RESULTS_PER_PAGE,\n", - " PAGE=page_num)\n", - " print(\"Fetching\", URL)\n", - " response = requests.get(URL)\n", - " collection_index = response.json()\n", - " \n", - " results_pile.\n", - " \n", - " file_name = DATA_DIR + \"/index_\" + str(page_num) + \".json\"\n", - " with open(file_name, 'w') as f:\n", - " json.dump(collection_index, f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Fetching Individual Items" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[PosixPath('json-data/index_1.json'),\n", - " PosixPath('json-data/index_2.json'),\n", - " PosixPath('json-data/index_3.json'),\n", - " PosixPath('json-data/index_4.json'),\n", - " PosixPath('json-data/index_5.json')]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "collection_indexes = list(Path(DATA_DIR).glob(\"index_*.json\"))\n", - "collection_indexes" - ] - }, - { - "cell_type": "code", - 
"execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "results_pile = [json.loads(index.read_text())['results'] for index in collection_indexes]" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "250" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(results_pile)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* We can't actually use a list comprehension here because it loads" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "results_pile = []\n", - "for file in collection_indexes:\n", - " index = json.loads(file.read_text())\n", - " results_pile.extend(index['results'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "250" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(results_pile)" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "def fetch_item(url):\n", - " url = url + \"?fo=json\"\n", - " response = requests.get(url)\n", - " \n", - " item_json = response.json()\n", - "\n", - " lccn = item_json['item'][\"library_of_congress_control_number\"]\n", - " \n", - " filename = DATA_DIR + \"item_\" + lccn + \".json\"\n", - " with open(filename, \"w\") as f:\n", - " json.dump(item_json, f)\n", - " \n", - " return item_json" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "item_pile = [fetch_item(result['id']) for result in results_pile]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Data Files\n", - "\n", - "* " - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "PosixPath('json-data/item_ltf90006684.json')" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_files = list(Path(DATA_DIR).glob(\"item_*.json\"))\n", - "item_files[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [], - "source": [ - "def open_item(path):\n", - "    \"\"\"Read one saved item JSON file and return the parsed object.\"\"\"\n", - "    with open(path, 'r') as f:\n", - "        return json.load(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "250" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_pile = [open_item(path) for path in item_files]\n", - "len(item_pile)" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "keys_of_interest = [\n", - "    \"library_of_congress_control_number\",\n", - "    \"date\",\n", - "    \"title\",\n", - "    \"medium\",\n", - "    \"created_published\",\n", - "    \"id\",\n", - "]\n", - "\n", - "\n", - "def get_fields(item):\n", - "    \"\"\"Return the keys_of_interest entries of item['item'] as a flat dict.\n", - "\n", - "    A key the item record lacks maps to None, so one incomplete record\n", - "    does not abort building the whole table.\n", - "    \"\"\"\n", - "    record = item['item']\n", - "    return {key: record.get(key) for key in keys_of_interest}" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
created_publisheddateidlibrary_of_congress_control_numbermediumtitle
0[Boston, Printed and published by Lincoln & Ed...1827http://www.loc.gov/item/ltf90006684/ltf90006684[252 p.]Conversations on natural philosophy, in which ...
1[New York, Printed by Clayton & Van Norden, 18...1825http://www.loc.gov/item/ltf90008069/ltf90008069[32 p.]An examination of Mr. Dufief's philosophical n...
2[Georgetown Heights, D.C., 1860.]1860http://www.loc.gov/item/ltf90008097/ltf90008097[4 p.]Philosophical views.
3[Rochester, New York State Pub. Co., 1900 [c18...1900http://www.loc.gov/item/ltf90002975/ltf90002975[116 p.]Hypnotism as it is; a book for everybody.
4[New York, S.R. Wells, 1875.]1875http://www.loc.gov/item/ltf90003010/ltf90003010[176 p.]New illustrated self-instructor in phrenology ...
5[Philadelphia, Gaut & Volkmar, 1860.]1860http://www.loc.gov/item/ltf90007944/ltf90007944[24 p.]Psukikos; philosophic observations on the rela...
6[Cincinnati, Gazette Co. Print, 1875.]1875http://www.loc.gov/item/ltf90007945/ltf90007945[28 p.]The mind; an introductory lecture, delivered N...
7[Chicago, 1867.]1867http://www.loc.gov/item/ltf90007948/ltf90007948[8 p.]Scientific explanation of the creed crusher, o...
8[Boston, W. White, 1871.]1871http://www.loc.gov/item/ltf90007953/ltf90007953[40 p.]What is spiritualism? And shall spiritualists ...
9[Chicago, J. Walker, 1868.]1868http://www.loc.gov/item/ltf90007976/ltf90007976[407 p.]The trance, and correlative phenomena.
10[Boston, 1896.]1896http://www.loc.gov/item/ltf90003958/ltf90003958[169 p.]Posthumous memoirs of Helena Petrovna Blavatsky,
11[New York, A.S. Barnes, 1856.]1856http://www.loc.gov/item/ltf90004222/ltf90004222[301 p.]Improvement of the mind.
12[Chicago, 1885.]1885http://www.loc.gov/item/ltf90004535/ltf90004535[48 p.]Mental gymnastics; or, Lessons on memory.
13[San Francisco, P. Elder [c1908]]1908http://www.loc.gov/item/ltf90004926/ltf90004926[1 v. (unpaged)]The perfectly good cynic's calendar, with astr...
14[New York, Funk & Wagnalls Co., 1892.]1892http://www.loc.gov/item/ltf90025142/ltf90025142[146 p.]An essay on the duties of man, addressed to wo...
15[Harrisburg, Pennsylvania Pub. Co. [1883]]1883http://www.loc.gov/item/ltf90025206/ltf90025206[443 p.]Our manners at home and abroad; a complete man...
16[Boston, Massachusetts Sabbath School Society,...1845http://www.loc.gov/item/ltf90003159/ltf90003159[54 p.]David and Jonathan; or, Considerations relatin...
17[Boston, 1852.]1852http://www.loc.gov/item/ltf90003160/ltf90003160[19 p.]Charter and by-laws, with lists of officers an...
18[Washington, 1832.]1832http://www.loc.gov/item/ltf90003161/ltf90003161[20 p.]Essay on moral and religious education in dome...
19[Honolulu, Pub. Bureau of Hongwanji Mission, 1...1918http://www.loc.gov/item/ltf90001296/ltf90001296[29, 103 p.]Democracy according to the Buddhist viewpoint.
20[Cincinnati, Published by L. Swormstedt and J....1848http://www.loc.gov/item/ltf90017676/ltf90017676[341 p.]The analogy of religion, natural and revealed ...
21[New York, T. Whittaker, 1897 [c1896]]1897http://www.loc.gov/item/ltf90017732/ltf90017732[87 p.]Some modern substitutes for Christianity; a co...
22[New York, 1890.]1890http://www.loc.gov/item/ltf90005739/ltf90005739[31 p.]Christ, the pupil of Buddha; a comparative study.
23[Chicago, Chicago Medical Book Co., 1899.]1899http://www.loc.gov/item/ltf90005769/ltf90005769[215 p.]Sex worship; an exposition of the Phallic orig...
24[Pittsburgh, Printed by Shryock & Hacke, 1850.]1850http://www.loc.gov/item/ltf90005775/ltf90005775[24 p.]Address delivered at the opening of the sessio...
25[New York, R. Worthington, 1879.]1879http://www.loc.gov/item/ltf90006298/ltf90006298[112 p.]The Jews, their customs and ceremonies, with a...
26[Philadelphia, O. Klonower, 1922.]1922http://www.loc.gov/item/ltf90006357/ltf90006357[58 p.]Intermarriage and other discourses; delivered ...
27[New York, 1903.]1903http://www.loc.gov/item/ltf90007565/ltf90007565[35 p.]Inaugural address, delivered November 20, 1902.
28[[Point Loma, Calif., Woman's Theosophical Pro...1907http://www.loc.gov/item/ltf90007757/ltf90007757[50 p. cm.]Katherine Tingley, humanity's friend; a visit ...
29[Boston, Press of T.R. Marvin, 1856.]1856http://www.loc.gov/item/ltf90024727/ltf90024727[30 p.]The Church and the college; a discourse delive...
.....................
70[Columbus, Ohio, Lutheran Book Concern, 1894.]1894http://www.loc.gov/item/ltf90008396/ltf90008396[153 p.]Before the altar; or, A series of annotated pr...
71[Louisville, Ky., Baptist Book Concern, 1892.]1892http://www.loc.gov/item/ltf90008411/ltf90008411[86 p.]Centennial celebration of modern missions,
72[New York, A.D.F. Randolph, 1864.]1864http://www.loc.gov/item/ltf90008432/ltf90008432[103 p.]Texts and hymns for the youngest; a book to le...
73[Troy, N.Y., N. Tuttle, printer, 1841.]1841http://www.loc.gov/item/ltf90008458/ltf90008458[23 p.]Christianity, a philosophy of principles; an a...
74[Richmond, H.K. Ellyson, printer, 1847.]1847http://www.loc.gov/item/ltf90008459/ltf90008459[24 p.]Hints of the best method of originating and co...
75[Baltimore, C. Harvey, printers, 1875.]1875http://www.loc.gov/item/ltf90008463/ltf90008463[12 p. 23 cm.]Letter to a son on Christian belief,
76[New York, R. Carter, 1857.]1857http://www.loc.gov/item/ltf90008493/ltf90008493[188 p.]Hymns for infant minds.
77[Dayton, Reformed Pub. Co., 1883.]1883http://www.loc.gov/item/ltf90008505/ltf90008505[648 p.]A treasury of family reading, pertaining to Go...
78[Hartford, Case, Lockwood & Brainard Co., prin...1883http://www.loc.gov/item/ltf90008532/ltf90008532[396 p.]Sermons and other papers.
79[Baltimore, J. Murphy, 1871 [c1870]]1871http://www.loc.gov/item/ltf90019634/ltf90019634[320 p.]Memoirs of a guardian angel.
80[Blair, Neb., Danish Lutheran Pub. House, 1917.]1917http://www.loc.gov/item/ltf90009561/ltf90009561[176 p.]Dansk luthersk mission i Amerika i tiden før 1...
81[Philadelphia, Presbyterian Board of Publicati...1852http://www.loc.gov/item/ltf90009671/ltf90009671[198 p.]A manual on the Christian Sabbath.
82[New York, R. Carter, 1844.]1844http://www.loc.gov/item/ltf90009637/ltf90009637[178 p.]The harp on the willows, Remembering Zion, Far...
83[New York, A.D.F. Randolph [pref. 1876]]1876http://www.loc.gov/item/ltf90009640/ltf90009640[139 p.]My King; or, Daily thoughts for the King's chi...
84[Boston, American Board, 1883.]1883http://www.loc.gov/item/ltf90009650/ltf90009650[88 p.]Story of the Morning Star, the children's miss...
85[New York, Hodder & Stoughton [introd. 1913]]1913http://www.loc.gov/item/ltf90009680/ltf90009680[228 p.]Out of the abyss; the autobiography of one who...
86[Philadelphia [1858]]1858http://www.loc.gov/item/ltf90009683/ltf90009683[263 p.]The sailor's companion; or, Book of devotions ...
87[Chicago, Missions-Vännens expedition [föror...1910http://www.loc.gov/item/ltf90009288/ltf90009288[292 p.]Passionspredikningar; betraktelser öfver de ol...
88[Charleston, S. C. Printed by B. Jenkins, 1847.]1847http://www.loc.gov/item/ltf90009727/ltf90009727[24 p.]The rule and measure of Christian charity.
89[Albany, Munsell & Rowland, 1858.]1858http://www.loc.gov/item/ltf90009729/ltf90009729[183 p.]Proclamations for Thanksgiving issued by the C...
90[New York, 1886.]1886http://www.loc.gov/item/ltf90009740/ltf90009740[23 p.]History of the American Missionary Association...
91[Dayton, Christian Pub. Association, 1881.]1881http://www.loc.gov/item/ltf90009806/ltf90009806[434 p.]Gospel sermons by Christian ministers.
92[Boston, Cummings, Hilliard, 1825.]1825http://www.loc.gov/item/ltf90009828/ltf90009828[252 p.]A family prayer-book; containing forms of morn...
93[Brooklyn, N.Y. c1890.]1890http://www.loc.gov/item/ltf90009829/ltf90009829[223 p.]De konungsliga nådegåfvorna.
94[New York, Sheldon, 1879.]1879http://www.loc.gov/item/ltf90009841/ltf90009841[324 p.]Born of water and spirit; a series of essays c...
95[New York, W.R. Jenkins, 1895 [c1983]]1895http://www.loc.gov/item/ltf90019579/ltf90019579[286 p.]Angelus Domini, an anthology in art and verse ...
96[Boston, Silver, Burdett, 1888.]1888http://www.loc.gov/item/ltf90019584/ltf90019584[163 p.]Through death to life; discourses on St. Paul'...
97[Boston, J.H. Earle, 1890.]1890http://www.loc.gov/item/ltf90019585/ltf90019585[290 p.]The Sunday question; or, The Lord's Day, its s...
98[New York, American Tract Society [c1894]]1894http://www.loc.gov/item/ltf90019589/ltf90019589[229 p.]Woman in missions; papers and addresses presen...
99[Philadelphia, J. E. Potter [c1880]]1880http://www.loc.gov/item/ltf90019608/ltf90019608[215 p.]Rev. Mr. Dashwell, the new minister at Hampton,
\n", - "

100 rows × 6 columns

\n", - "
" - ], - "text/plain": [ - " created_published date \\\n", - "0 [Boston, Printed and published by Lincoln & Ed... 1827 \n", - "1 [New York, Printed by Clayton & Van Norden, 18... 1825 \n", - "2 [Georgetown Heights, D.C., 1860.] 1860 \n", - "3 [Rochester, New York State Pub. Co., 1900 [c18... 1900 \n", - "4 [New York, S.R. Wells, 1875.] 1875 \n", - "5 [Philadelphia, Gaut & Volkmar, 1860.] 1860 \n", - "6 [Cincinnati, Gazette Co. Print, 1875.] 1875 \n", - "7 [Chicago, 1867.] 1867 \n", - "8 [Boston, W. White, 1871.] 1871 \n", - "9 [Chicago, J. Walker, 1868.] 1868 \n", - "10 [Boston, 1896.] 1896 \n", - "11 [New York, A.S. Barnes, 1856.] 1856 \n", - "12 [Chicago, 1885.] 1885 \n", - "13 [San Francisco, P. Elder [c1908]] 1908 \n", - "14 [New York, Funk & Wagnalls Co., 1892.] 1892 \n", - "15 [Harrisburg, Pennsylvania Pub. Co. [1883]] 1883 \n", - "16 [Boston, Massachusetts Sabbath School Society,... 1845 \n", - "17 [Boston, 1852.] 1852 \n", - "18 [Washington, 1832.] 1832 \n", - "19 [Honolulu, Pub. Bureau of Hongwanji Mission, 1... 1918 \n", - "20 [Cincinnati, Published by L. Swormstedt and J.... 1848 \n", - "21 [New York, T. Whittaker, 1897 [c1896]] 1897 \n", - "22 [New York, 1890.] 1890 \n", - "23 [Chicago, Chicago Medical Book Co., 1899.] 1899 \n", - "24 [Pittsburgh, Printed by Shryock & Hacke, 1850.] 1850 \n", - "25 [New York, R. Worthington, 1879.] 1879 \n", - "26 [Philadelphia, O. Klonower, 1922.] 1922 \n", - "27 [New York, 1903.] 1903 \n", - "28 [[Point Loma, Calif., Woman's Theosophical Pro... 1907 \n", - "29 [Boston, Press of T.R. Marvin, 1856.] 1856 \n", - ".. ... ... \n", - "70 [Columbus, Ohio, Lutheran Book Concern, 1894.] 1894 \n", - "71 [Louisville, Ky., Baptist Book Concern, 1892.] 1892 \n", - "72 [New York, A.D.F. Randolph, 1864.] 1864 \n", - "73 [Troy, N.Y., N. Tuttle, printer, 1841.] 1841 \n", - "74 [Richmond, H.K. Ellyson, printer, 1847.] 1847 \n", - "75 [Baltimore, C. Harvey, printers, 1875.] 1875 \n", - "76 [New York, R. Carter, 1857.] 
1857 \n", - "77 [Dayton, Reformed Pub. Co., 1883.] 1883 \n", - "78 [Hartford, Case, Lockwood & Brainard Co., prin... 1883 \n", - "79 [Baltimore, J. Murphy, 1871 [c1870]] 1871 \n", - "80 [Blair, Neb., Danish Lutheran Pub. House, 1917.] 1917 \n", - "81 [Philadelphia, Presbyterian Board of Publicati... 1852 \n", - "82 [New York, R. Carter, 1844.] 1844 \n", - "83 [New York, A.D.F. Randolph [pref. 1876]] 1876 \n", - "84 [Boston, American Board, 1883.] 1883 \n", - "85 [New York, Hodder & Stoughton [introd. 1913]] 1913 \n", - "86 [Philadelphia [1858]] 1858 \n", - "87 [Chicago, Missions-Vännens expedition [föror... 1910 \n", - "88 [Charleston, S. C. Printed by B. Jenkins, 1847.] 1847 \n", - "89 [Albany, Munsell & Rowland, 1858.] 1858 \n", - "90 [New York, 1886.] 1886 \n", - "91 [Dayton, Christian Pub. Association, 1881.] 1881 \n", - "92 [Boston, Cummings, Hilliard, 1825.] 1825 \n", - "93 [Brooklyn, N.Y. c1890.] 1890 \n", - "94 [New York, Sheldon, 1879.] 1879 \n", - "95 [New York, W.R. Jenkins, 1895 [c1983]] 1895 \n", - "96 [Boston, Silver, Burdett, 1888.] 1888 \n", - "97 [Boston, J.H. Earle, 1890.] 1890 \n", - "98 [New York, American Tract Society [c1894]] 1894 \n", - "99 [Philadelphia, J. E. 
Potter [c1880]] 1880 \n", - "\n", - " id library_of_congress_control_number \\\n", - "0 http://www.loc.gov/item/ltf90006684/ ltf90006684 \n", - "1 http://www.loc.gov/item/ltf90008069/ ltf90008069 \n", - "2 http://www.loc.gov/item/ltf90008097/ ltf90008097 \n", - "3 http://www.loc.gov/item/ltf90002975/ ltf90002975 \n", - "4 http://www.loc.gov/item/ltf90003010/ ltf90003010 \n", - "5 http://www.loc.gov/item/ltf90007944/ ltf90007944 \n", - "6 http://www.loc.gov/item/ltf90007945/ ltf90007945 \n", - "7 http://www.loc.gov/item/ltf90007948/ ltf90007948 \n", - "8 http://www.loc.gov/item/ltf90007953/ ltf90007953 \n", - "9 http://www.loc.gov/item/ltf90007976/ ltf90007976 \n", - "10 http://www.loc.gov/item/ltf90003958/ ltf90003958 \n", - "11 http://www.loc.gov/item/ltf90004222/ ltf90004222 \n", - "12 http://www.loc.gov/item/ltf90004535/ ltf90004535 \n", - "13 http://www.loc.gov/item/ltf90004926/ ltf90004926 \n", - "14 http://www.loc.gov/item/ltf90025142/ ltf90025142 \n", - "15 http://www.loc.gov/item/ltf90025206/ ltf90025206 \n", - "16 http://www.loc.gov/item/ltf90003159/ ltf90003159 \n", - "17 http://www.loc.gov/item/ltf90003160/ ltf90003160 \n", - "18 http://www.loc.gov/item/ltf90003161/ ltf90003161 \n", - "19 http://www.loc.gov/item/ltf90001296/ ltf90001296 \n", - "20 http://www.loc.gov/item/ltf90017676/ ltf90017676 \n", - "21 http://www.loc.gov/item/ltf90017732/ ltf90017732 \n", - "22 http://www.loc.gov/item/ltf90005739/ ltf90005739 \n", - "23 http://www.loc.gov/item/ltf90005769/ ltf90005769 \n", - "24 http://www.loc.gov/item/ltf90005775/ ltf90005775 \n", - "25 http://www.loc.gov/item/ltf90006298/ ltf90006298 \n", - "26 http://www.loc.gov/item/ltf90006357/ ltf90006357 \n", - "27 http://www.loc.gov/item/ltf90007565/ ltf90007565 \n", - "28 http://www.loc.gov/item/ltf90007757/ ltf90007757 \n", - "29 http://www.loc.gov/item/ltf90024727/ ltf90024727 \n", - ".. ... ... 
\n", - "70 http://www.loc.gov/item/ltf90008396/ ltf90008396 \n", - "71 http://www.loc.gov/item/ltf90008411/ ltf90008411 \n", - "72 http://www.loc.gov/item/ltf90008432/ ltf90008432 \n", - "73 http://www.loc.gov/item/ltf90008458/ ltf90008458 \n", - "74 http://www.loc.gov/item/ltf90008459/ ltf90008459 \n", - "75 http://www.loc.gov/item/ltf90008463/ ltf90008463 \n", - "76 http://www.loc.gov/item/ltf90008493/ ltf90008493 \n", - "77 http://www.loc.gov/item/ltf90008505/ ltf90008505 \n", - "78 http://www.loc.gov/item/ltf90008532/ ltf90008532 \n", - "79 http://www.loc.gov/item/ltf90019634/ ltf90019634 \n", - "80 http://www.loc.gov/item/ltf90009561/ ltf90009561 \n", - "81 http://www.loc.gov/item/ltf90009671/ ltf90009671 \n", - "82 http://www.loc.gov/item/ltf90009637/ ltf90009637 \n", - "83 http://www.loc.gov/item/ltf90009640/ ltf90009640 \n", - "84 http://www.loc.gov/item/ltf90009650/ ltf90009650 \n", - "85 http://www.loc.gov/item/ltf90009680/ ltf90009680 \n", - "86 http://www.loc.gov/item/ltf90009683/ ltf90009683 \n", - "87 http://www.loc.gov/item/ltf90009288/ ltf90009288 \n", - "88 http://www.loc.gov/item/ltf90009727/ ltf90009727 \n", - "89 http://www.loc.gov/item/ltf90009729/ ltf90009729 \n", - "90 http://www.loc.gov/item/ltf90009740/ ltf90009740 \n", - "91 http://www.loc.gov/item/ltf90009806/ ltf90009806 \n", - "92 http://www.loc.gov/item/ltf90009828/ ltf90009828 \n", - "93 http://www.loc.gov/item/ltf90009829/ ltf90009829 \n", - "94 http://www.loc.gov/item/ltf90009841/ ltf90009841 \n", - "95 http://www.loc.gov/item/ltf90019579/ ltf90019579 \n", - "96 http://www.loc.gov/item/ltf90019584/ ltf90019584 \n", - "97 http://www.loc.gov/item/ltf90019585/ ltf90019585 \n", - "98 http://www.loc.gov/item/ltf90019589/ ltf90019589 \n", - "99 http://www.loc.gov/item/ltf90019608/ ltf90019608 \n", - "\n", - " medium title \n", - "0 [252 p.] Conversations on natural philosophy, in which ... \n", - "1 [32 p.] An examination of Mr. Dufief's philosophical n... \n", - "2 [4 p.] 
Philosophical views. \n", - "3 [116 p.] Hypnotism as it is; a book for everybody. \n", - "4 [176 p.] New illustrated self-instructor in phrenology ... \n", - "5 [24 p.] Psukikos; philosophic observations on the rela... \n", - "6 [28 p.] The mind; an introductory lecture, delivered N... \n", - "7 [8 p.] Scientific explanation of the creed crusher, o... \n", - "8 [40 p.] What is spiritualism? And shall spiritualists ... \n", - "9 [407 p.] The trance, and correlative phenomena. \n", - "10 [169 p.] Posthumous memoirs of Helena Petrovna Blavatsky, \n", - "11 [301 p.] Improvement of the mind. \n", - "12 [48 p.] Mental gymnastics; or, Lessons on memory. \n", - "13 [1 v. (unpaged)] The perfectly good cynic's calendar, with astr... \n", - "14 [146 p.] An essay on the duties of man, addressed to wo... \n", - "15 [443 p.] Our manners at home and abroad; a complete man... \n", - "16 [54 p.] David and Jonathan; or, Considerations relatin... \n", - "17 [19 p.] Charter and by-laws, with lists of officers an... \n", - "18 [20 p.] Essay on moral and religious education in dome... \n", - "19 [29, 103 p.] Democracy according to the Buddhist viewpoint. \n", - "20 [341 p.] The analogy of religion, natural and revealed ... \n", - "21 [87 p.] Some modern substitutes for Christianity; a co... \n", - "22 [31 p.] Christ, the pupil of Buddha; a comparative study. \n", - "23 [215 p.] Sex worship; an exposition of the Phallic orig... \n", - "24 [24 p.] Address delivered at the opening of the sessio... \n", - "25 [112 p.] The Jews, their customs and ceremonies, with a... \n", - "26 [58 p.] Intermarriage and other discourses; delivered ... \n", - "27 [35 p.] Inaugural address, delivered November 20, 1902. \n", - "28 [50 p. cm.] Katherine Tingley, humanity's friend; a visit ... \n", - "29 [30 p.] The Church and the college; a discourse delive... \n", - ".. ... ... \n", - "70 [153 p.] Before the altar; or, A series of annotated pr... \n", - "71 [86 p.] 
Centennial celebration of modern missions, \n", - "72 [103 p.] Texts and hymns for the youngest; a book to le... \n", - "73 [23 p.] Christianity, a philosophy of principles; an a... \n", - "74 [24 p.] Hints of the best method of originating and co... \n", - "75 [12 p. 23 cm.] Letter to a son on Christian belief, \n", - "76 [188 p.] Hymns for infant minds. \n", - "77 [648 p.] A treasury of family reading, pertaining to Go... \n", - "78 [396 p.] Sermons and other papers. \n", - "79 [320 p.] Memoirs of a guardian angel. \n", - "80 [176 p.] Dansk luthersk mission i Amerika i tiden før 1... \n", - "81 [198 p.] A manual on the Christian Sabbath. \n", - "82 [178 p.] The harp on the willows, Remembering Zion, Far... \n", - "83 [139 p.] My King; or, Daily thoughts for the King's chi... \n", - "84 [88 p.] Story of the Morning Star, the children's miss... \n", - "85 [228 p.] Out of the abyss; the autobiography of one who... \n", - "86 [263 p.] The sailor's companion; or, Book of devotions ... \n", - "87 [292 p.] Passionspredikningar; betraktelser öfver de ol... \n", - "88 [24 p.] The rule and measure of Christian charity. \n", - "89 [183 p.] Proclamations for Thanksgiving issued by the C... \n", - "90 [23 p.] History of the American Missionary Association... \n", - "91 [434 p.] Gospel sermons by Christian ministers. \n", - "92 [252 p.] A family prayer-book; containing forms of morn... \n", - "93 [223 p.] De konungsliga nådegåfvorna. \n", - "94 [324 p.] Born of water and spirit; a series of essays c... \n", - "95 [286 p.] Angelus Domini, an anthology in art and verse ... \n", - "96 [163 p.] Through death to life; discourses on St. Paul'... \n", - "97 [290 p.] The Sunday question; or, The Lord's Day, its s... \n", - "98 [229 p.] Woman in missions; papers and addresses presen... \n", - "99 [215 p.] Rev. Mr. 
Dashwell, the new minister at Hampton, \n", - "\n", - "[100 rows x 6 columns]" - ] - }, - "execution_count": 116, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = pd.DataFrame([get_fields(item) for item in item_pile])\n", - "data.head(100)" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 252.0\n", - "1 32.0\n", - "2 4.0\n", - "3 116.0\n", - "4 176.0\n", - "Name: medium, dtype: float64" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "book_lengths = pd.to_numeric(data['medium'].str.get(0).str.split().str.get(0),\n", - " errors='coerce')\n", - "book_lengths.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 242.000000\n", - "mean 157.438017\n", - "std 161.110173\n", - "min 1.000000\n", - "25% 36.000000\n", - "50% 106.000000\n", - "75% 225.250000\n", - "max 1154.000000\n", - "Name: medium, dtype: float64" - ] - }, - "execution_count": 118, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "book_lengths.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAlcAAAFpCAYAAACxow3EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAFf9JREFUeJzt3X+MZWd5H/DvU0wg8Ua2KXTkLqhLFERF7QLxiBBRVbOQEAdHgUgoAiFqF1ebP0JLK6TKBFUQpZEclR9tUZvKjSlORdmkFGJkkxDX9QZFapzuEpc1ONSGbBKvjB1qe8NSlMTJ0z/mmIzXs8yv9+zsnf18pKu5573nnvPM47O7X59z73uquwMAwBh/bbcLAADYS4QrAICBhCsAgIGEKwCAgYQrAICBhCsAgIGEKwCAgYQrAICBhCsAgIGEKwCAgS46lzt77nOf2wcOHJh1H9/4xjdy8cUXz7qPC5n+zkt/56W/89LfeenvvNbr77Fjx77W3c/b6rbOabg6cOBAjh49Ous+jhw5kpWVlVn3cSHT33np77z0d176Oy/9ndd6/a2qP9jOtlwWBAAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAY6KLdLuBcOHDD7U8bO3HjNbtQCQCw1zlzBQAwkHAFADCQcAUAMJBwBQAwkHAFADDQhuGqqp5dVb9TVf+7qr5QVT8zjb+wqu6uqgeq6per6jvmLxcA4Py2mTNXf5rk1d390iQvS3J1Vb0yyc8n+WB3f2+Sx5JcP1+ZAACLYcNw1atOT4vPnB6d5NVJPj6N35LkDbNUCACwQDb1mauqekZV3ZPkkSR3JPlykse7+4lplQeT7J+nRACAxVHdvfmVqy5N8skk/yLJR6ZLgqmqFyT5te6+Yp33HEpyKEmWlpauOnz48Ii6z+r06dPZt2/fU8aOnzz1tPWu3H/JrHXsVev1l3H0d176Oy/9nZf+zmu9/h48ePBYdy9vdVtbuv1Ndz9eVXcl+YEkl1bVRdPZq+cnOXmW99yU5KYkWV5e7pWVla3WuCVHjhzJmfu4br3b37xl3jr2qvX6yzj6Oy/9nZf+zkt/5zWyv5v5tuDzpjNWqarvTPJDSe5LcleSN06rXZvk1iEVAQAssM2cubo8yS1V9YyshrFf6e7bquqLSQ5X1b9M8rtJbp6xTgCAhbBhuOruzyd5+TrjX0nyijmKAgBYVGZoBwAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhow3BVVS+oqruq6otV9YWqesc0/t6qOllV90yP181fLgDA+e2iTazzRJJ3dvfnquq7kxyrqjum1z7Y3e+brzwAgMWyYbjq7oeSPDQ9/3pV3Zdk/9yFAQAsoi195qqqDiR5eZK7p6G3V9Xnq+rDVXXZ4NoAABZOdffmVqzal+Q3k/xcd3+iqpaSfC1JJ/nZJJd399vWed+hJIeSZGlp6arDhw+Pqn1dp0+fzr59+54ydvzkqaetd+X+S2atY69ar7+Mo7/z0t956e+89Hde6/X34MGDx7p7eavb2lS4qqpnJrktyWe6+wPrvH4gyW3dfcW3287y8nIfPXp0qzVuyZEjR7KysvKUsQM33P609U7ceM2sdexV6/WXcfR3Xvo7L/2dl/7Oa73+VtW2wtVmvi1YSW5Oct/aYFVVl69Z7ceT3LvVnQMA7DWb+bbgq5K8Ncnxqrp
nGvvpJG+uqpdl9bLgiSQ/OUuFAAALZDPfFvytJLXOS58eXw4AwGIzQzsAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQMIVAMBAwhUAwEDCFQDAQBuGq6p6QVXdVVVfrKovVNU7pvHnVNUdVXX/9POy+csFADi/bebM1RNJ3tndL0nyyiQ/VVUvSXJDkju7+0VJ7pyWAQAuaBuGq+5+qLs/Nz3/epL7kuxP8vokt0yr3ZLkDXMVCQCwKLb0mauqOpDk5UnuTrLU3Q9NL301ydLQygAAFlB19+ZWrNqX5DeT/Fx3f6KqHu/uS9e8/lh3P+1zV1V1KMmhJFlaWrrq8OHDYyo/i9OnT2ffvn1PGTt+8tTT1rty/yWz1rFXrddfxtHfeenvvPR3Xvo7r/X6e/DgwWPdvbzVbW0qXFXVM5PcluQz3f2BaexLSVa6+6GqujzJke5+8bfbzvLych89enSrNW7JkSNHsrKy8pSxAzfc/rT1Ttx4zax17FXr9Zdx9Hde+jsv/Z2X/s5rvf5W1bbC1Wa+LVhJbk5y35PBavKpJNdOz69NcutWdw4AsNdctIl1XpXkrUmOV9U909hPJ7kxya9U1fVJ/iDJT8xTIgDA4tgwXHX3byWps7z8mrHlAAAsNjO0AwAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMJFwBAAwkXAEADCRcAQAMdNFuF7AIDtxw+9PGTtx4zS5UAgCc75y5AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGGjPzdB+/OSpXLfOjOoAAOeCM1cAAAMJVwAAAwlXAAADCVcAAAMJVwAAAwlXAAADbRiuqurDVfVIVd27Zuy9VXWyqu6ZHq+bt0wAgMWwmTNXH0ly9TrjH+zul02PT48tCwBgMW0Yrrr7s0kePQe1AAAsvOrujVeqOpDktu6+Ylp+b5LrkvxJkqNJ3tndj53lvYeSHEqSpaWlqw4fPjyg7LN75NFTefibG6935f5LNr3N4ydP7ej9e8np06ezb9++3S5jz9LfeenvvPR3Xvo7r/X6e/DgwWPdvbzVbW03XC0l+VqSTvKzSS7v7rdttJ3l5eU+evToVmvckg999Na8//jGd/U5ceM1m97mgXVup7OV9+8lR44cycrKym6XsWfp77z0d176Oy/9ndd6/a2qbYWrbX1bsLsf7u6/6O6/TPIfk7xiO9sBANhrthWuquryNYs/nuTes60LAHAh2fD6WVV9LMlKkudW1YNJ3pNkpapeltXLgieS/OSMNQIALIwNw1V3v3md4ZtnqAUAYOGZoR0AYCDhCgBgIOEKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgIOEKAGCgi3a7gN1y4Ibb1x0/ceM157gSAGAvceYKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgoAt2hvZzZb2Z4M0CDwB7lzNXAAADCVcAAAMJVwAAAwlXAAADCVcAAAMJVwAAAwlXAAADCVcAAAMJVwAAA5mh/QzrzajO2ZmBHgCeypkrAICBhCsAgIGEKwCAgYQrAICBhCsAgIE2DFdV9eGqeqSq7l0z9pyquqOq7p9+XjZ
vmQAAi2EzZ64+kuTqM8ZuSHJnd78oyZ3TMgDABW/DcNXdn03y6BnDr09yy/T8liRvGFwXAMBC2u5nrpa6+6Hp+VeTLA2qBwBgoVV3b7xS1YEkt3X3FdPy49196ZrXH+vudT93VVWHkhxKkqWlpasOHz48oOyze+TRU3n4m7PuIkly5f5LNrXe8ZOntv3e89Hp06ezb9++by3vtd9vt53ZX8bS33np77z0d17r9ffgwYPHunt5q9va7u1vHq6qy7v7oaq6PMkjZ1uxu29KclOSLC8v98rKyjZ3uTkf+uitef/x+e/qc+ItK5ta77r1bg+zyfeej44cOZK1/w332u+3287sL2Pp77z0d176O6+R/d3uZcFPJbl2en5tkluHVAMAsOA2MxXDx5L8zyQvrqoHq+r6JDcm+aGquj/JD07LAAAXvA2vn3X3m8/y0msG1wIAsPDM0A4AMJBwBQAwkHAFADCQcAUAMJBwBQAwkHAFADCQcAUAMJBwBQAwkHAFADCQcAUAMJBwBQAwkHAFADCQcAUAMJBwBQAwkHAFADDQRbtdwKI6cMPtTxs7ceM1u1AJAHA+ceYKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgIOEKAGAg4QoAYCDhCgBgIDO0D7TerO27ZbMzyG+lZjPQA8DGnLkCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYSLgCABhIuAIAGEi4AgAYyAztu2Czs6efi/0CAGM5cwUAMJBwBQAwkHAFADCQcAUAMJBwBQAw0I6+LVhVJ5J8PclfJHmiu5dHFAUAsKhGTMVwsLu/NmA7AAALz2VBAICBdhquOslvVNWxqjo0oiAAgEVW3b39N1ft7+6TVfU3ktyR5B9392fPWOdQkkNJsrS0dNXhw4d3Uu+GHnn0VB7+5qy7mMWV+y952tjxk6dmf+9WXLn/kpw+fTr79u3b8n42W+N6661nJ+89n53ZX8bS33np77z0d17r9ffgwYPHtvN58h2Fq6dsqOq9SU539/vOts7y8nIfPXp0yP7O5kMfvTXvP754d/VZ7/Y3m71dzU7euxUnbrwmR44cycrKypb3s9kaN3sboN26hdDczuwvY+nvvPR3Xvo7r/X6W1XbClfbvixYVRdX1Xc/+TzJa5Pcu93tAQDsBTs5xbOU5JNV9eR2/kt3//qQqgAAFtS2w1V3fyXJSwfWAgCw8EzFAAAwkHAFADCQcAUAMJBwBQAwkHAFADDQ4s22uUftZNLPOSYMPZ9s9vcbPSnpevbCRKUAzMuZKwCAgYQrAICBhCsAgIGEKwCAgYQrAICBhCsAgIGEKwCAgYQrAICBhCsAgIHM0M55Za/PNn8u7GSmegB2zpkrAICBhCsAgIGEKwCAgYQrAICBhCsAgIGEKwCAgYQrAICBhCsAgIGEKwCAgczQzqYduOH2vPPKJ3LdNmZR362Z10fPVn6232P0DOibrftc9HUr+zATPIAzVwAAQwlXAAADCVcAAAMJVwAAAwlXAAADCVcAAAMJVwAAAwlXAAADCVcAAAOZoZ0LzrmaLX47+9nuDPjbqWWOGd93MiP+uZhN/yNXX7zt7W12H2apZ24X4nG3aL+zM1cAAAMJVwAAAwlXAAADCVcAAAMJVwAAA+0oXFXV1VX1pap6oKpuGFUUAMCi2na4qqpnJPl3SX4kyUuSvLmqXjKqMACARbSTM1evSPJAd3+lu/8syeEkrx9TFgDAYtpJuNqf5I/WLD84jQEAXLCqu7f3xqo3Jrm6u//RtPzWJN/f3W8/Y71DSQ5Niy9O8qXtl7spz03ytZn3cSHT33np77z0d176Oy/9ndd6/f1b3f28rW5oJ7e/OZnkBWuWnz+NPUV335Tkph3sZ0uq6mh3L5+r/V1o9Hde+jsv/Z2X/s5Lf+c1sr87uSz4v5K8qKpeWFXfkeRNST41oigAgEW17TNX3f1EVb09yWeSPCPJh7v7C8MqAwBYQDu5LJju/nSSTw+
qZZRzdgnyAqW/89LfeenvvPR3Xvo7r2H93fYH2gEAeDq3vwEAGGjPhCu34tm5qnpBVd1VVV+sqi9U1Tum8edU1R1Vdf/087JpvKrq3049/3xVfd/u/gaLoaqeUVW/W1W3TcsvrKq7pz7+8vQFkVTVs6blB6bXD+xm3Yugqi6tqo9X1e9V1X1V9QOO33Gq6p9NfzfcW1Ufq6pnO363r6o+XFWPVNW9a8a2fLxW1bXT+vdX1bW78bucj87S3381/f3w+ar6ZFVduua1d039/VJV/fCa8S3niz0RrtyKZ5gnkryzu1+S5JVJfmrq4w1J7uzuFyW5c1pOVvv9oulxKMkvnPuSF9I7kty3Zvnnk3ywu783yWNJrp/Gr0/y2DT+wWk9vr1/k+TXu/tvJ3lpVvvs+B2gqvYn+SdJlrv7iqx+kelNcfzuxEeSXH3G2JaO16p6TpL3JPn+rN455T1PBjLW7e8dSa7o7r+b5P8keVeSTP/WvSnJ35ne8++n/xHeVr7YE+EqbsUzRHc/1N2fm55/Pav/MO3Pai9vmVa7JckbpuevT/JLveq3k1xaVZef47IXSlU9P8k1SX5xWq4kr07y8WmVM/v7ZN8/nuQ10/qso6ouSfL3k9ycJN39Z939eBy/I12U5Dur6qIk35XkoTh+t627P5vk0TOGt3q8/nCSO7r70e5+LKvh4cxAcUFar7/d/Rvd/cS0+NtZnaMzWe3v4e7+0+7+/SQPZDVbbCtf7JVw5VY8g02n8F+e5O4kS9390PTSV5MsTc/1fev+dZJ/nuQvp+W/nuTxNX/Y1/bwW/2dXj81rc/6Xpjkj5P8p+my6y9W1cVx/A7R3SeTvC/JH2Y1VJ1KciyO39G2erw6jrfvbUl+bXo+tL97JVwxUFXtS/LfkvzT7v6Tta/16tdLfcV0G6rqR5M80t3HdruWPeqiJN+X5Be6++VJvpG/uqSSxPG7E9OlptdnNcT+zSQXxxmSWTle51NV787qR2E+Osf290q42tSteNhYVT0zq8Hqo939iWn44Scvl0w/H5nG9X1rXpXkx6rqRFZPLb86q58RunS6zJI8tYff6u/0+iVJ/u+5LHjBPJjkwe6+e1r+eFbDluN3jB9M8vvd/cfd/edJPpHVY9rxO9ZWj1fH8RZV1XVJfjTJW/qv5qMa2t+9Eq7cimeA6fMQNye5r7s/sOalTyV58hso1ya5dc34P5i+xfLKJKfWnM7mDN39ru5+fncfyOox+j+6+y1J7kryxmm1M/v7ZN/fOK3v/2LPoru/muSPqurF09Brknwxjt9R/jDJK6vqu6a/K57sr+N3rK0er59J8tqqumw6u/jaaYx1VNXVWf1oxo919/9b89Knkrxp+pbrC7P6xYHfyXbzRXfviUeS12X1k/9fTvLu3a5nER9J/l5WT0F/Psk90+N1Wf2cxJ1J7k/y35M8Z1q/svotii8nOZ7VbxHt+u+xCI8kK0lum55/z/SH+IEk/zXJs6bxZ0/LD0yvf89u132+P5K8LMnR6Rj+1SSXOX6H9vdnkvxeknuT/Ockz3L87qifH8vq59f+PKtnXq/fzvGa1c8OPTA9/uFu/17ny+Ms/X0gq5+hevLfuP+wZv13T/39UpIfWTO+5XxhhnYAgIH2ymVBAIDzgnAFADCQcAUAMJBwBQAwkHAFADCQcAUAMJBwBQAwkHAFADDQ/wdcxTSbNUkKhAAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "book_lengths.hist(bins=100, figsize=(10,6));" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'created_published': ['Boston, Printed and published by Lincoln & Edmands, 1827.'],\n", - " 'date': '1827',\n", - " 'id': 'http://www.loc.gov/item/ltf90006684/',\n", - " 'library_of_congress_control_number': 'ltf90006684',\n", - " 'medium': ['252 p.'],\n", - " 'title': 'Conversations on natural philosophy, in which the elements of that science are familiarly explained, and adapted to the comprehension of young pupils.'}" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "503" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "response.status_code" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - 
"metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'item'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'item'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'library_of_congress_control_number'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m: 'item'" - ] - } - ], - "source": [ - "data['item']['library_of_congress_control_number']" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "with open('json-data/index.json') as f:\n", - " collection_index = json.load(f)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'access_restricted': False,\n", - " 'aka': ['http://hdl.loc.gov/loc.gdc/scd0001.00218180267',\n", - " 'http://www.loc.gov/resource/scd0001.00218180267/',\n", - " 'http://lccn.loc.gov/ltf90006684',\n", - " 'http://www.loc.gov/item/ltf90006684/',\n", - " 'http://www.loc.gov/resource/dcmsiabooks.conversationsonn00marc_1/'],\n", - " 'campaigns': [],\n", - " 'contributor': ['marcet, (jane haldimand)'],\n", - " 'date': '1827',\n", - " 'dates': ['1827-01-01T00:00:00Z'],\n", - " 'description': ['Also available in digital form.'],\n", - " 'digitized': True,\n", - " 'extract_timestamp': '2018-08-17T23:46:46.123Z',\n", - " 'group': ['catalog', 'dcmsiabooks2', 'main-catalog', 'cts-ia-books2'],\n", - " 'hassegments': True,\n", - " 'id': 'http://www.loc.gov/item/ltf90006684/',\n", - " 'image_url': 
['//tile.loc.gov/image-services/iiif/service:gdc:dcmsiabooks:co:nv:er:sa:ti:on:so:nn:00:ma:rc:_1:conversationsonn00marc_1:conversationsonn00marc_1_0009/full/pct:12.5/0/default.jpg',\n", - " '//tile.loc.gov/image-services/iiif/service:gdc:dcmsiabooks:co:nv:er:sa:ti:on:so:nn:00:ma:rc:_1:conversationsonn00marc_1:conversationsonn00marc_1_0009/full/pct:12.5/0/default.jpg#h=414&w=216',\n", - " '//tile.loc.gov/image-services/iiif/service:gdc:dcmsiabooks:co:nv:er:sa:ti:on:so:nn:00:ma:rc:_1:conversationsonn00marc_1:conversationsonn00marc_1_0009/full/pct:25/0/default.jpg#h=828&w=432',\n", - " '//tile.loc.gov/image-services/iiif/service:gdc:dcmsiabooks:co:nv:er:sa:ti:on:so:nn:00:ma:rc:_1:conversationsonn00marc_1:conversationsonn00marc_1_0009/full/pct:50/0/default.jpg#h=1656&w=865',\n", - " '//tile.loc.gov/image-services/iiif/service:gdc:dcmsiabooks:co:nv:er:sa:ti:on:so:nn:00:ma:rc:_1:conversationsonn00marc_1:conversationsonn00marc_1_0009/full/pct:100/0/default.jpg#h=3312&w=1731'],\n", - " 'index': 1,\n", - " 'language': ['english'],\n", - " 'mime_type': ['application/xml',\n", - " 'application/epub',\n", - " 'application/pdf',\n", - " 'image/jpeg',\n", - " 'image/jp2'],\n", - " 'online_format': ['online text', 'epub', 'pdf', 'image'],\n", - " 'original_format': ['book'],\n", - " 'other_title': [],\n", - " 'partof': ['general collections', 'catalog', 'selected digitized books'],\n", - " 'resources': [{'search': 'https://www.loc.gov/collections/selected-digitized-books/?fa=segmentof:dcmsiabooks.conversationsonn00marc_1/&fo=json&sb=shelf-id&st=gallery',\n", - " 'segments': 324,\n", - " 'url': 'https://www.loc.gov/resource/dcmsiabooks.conversationsonn00marc_1/'}],\n", - " 'segments': [{'count': 324,\n", - " 'link': 'https://www.loc.gov/collections/selected-digitized-books/?fa=segmentof:dcmsiabooks.conversationsonn00marc_1/&fo=json&sb=shelf-id&st=gallery',\n", - " 'url': 'https://www.loc.gov/resource/dcmsiabooks.conversationsonn00marc_1/'}],\n", - " 'shelf_id': '4B 2042',\n", - " 
'site': ['catalog'],\n", - " 'timestamp': '2018-08-18T08:39:31.979Z',\n", - " 'title': 'Conversations on natural philosophy, in which the elements of that science are familiarly explained, and adapted to the comprehension of young pupils.',\n", - " 'url': 'https://www.loc.gov/item/ltf90006684/'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "collection_index['results'][0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "response = requests.get(collection_index['results'][0]['id']+\"/?fo=json\")\n", - "book = response.json()" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'book' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbook\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'book' is not defined" - ] - } - ], - "source": [ - "book" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "http://www.loc.gov/item/ltf90006684/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90008069/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90008097/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90002975/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90003010/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90007944/\n", - "got the record!\n", - 
"http://www.loc.gov/item/ltf90007945/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90007948/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90007953/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90007976/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90003958/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90004222/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90004535/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90004926/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90025142/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90025206/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90003159/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90003160/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90003161/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90001296/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90017676/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90017732/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90005739/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90005769/\n", - "got the record!\n", - "http://www.loc.gov/item/ltf90005775/\n", - "got the record!\n" - ] - } - ], - "source": [ - "for book in collection_index['results']:\n", - " print(book['id'])\n", - " response = requests.get(book['id']+\"?fo=json\")\n", - " while response.status_code != 200:\n", - " print(\"bad response, trying again in 60s\")\n", - " sleep(60)\n", - " response = requests.get(book['id']+\"?fo=json\")\n", - " \n", - " print(\"got the record!\")\n", - " data = response.json()\n", - " lccn = data['item']['library_of_congress_control_number']\n", - " with open(\"json-data/{}.json\".format(lccn), \"w\") as f:\n", - " json.dump(data, f)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['252 
p.']\n", - "['32 p.']\n", - "['4 p.']\n", - "['116 p.']\n", - "['176 p.']\n", - "['24 p.']\n", - "['28 p.']\n", - "['8 p.']\n", - "['40 p.']\n", - "['407 p.']\n", - "['169 p.']\n", - "['301 p.']\n", - "['48 p.']\n", - "['1 v. (unpaged)']\n", - "['146 p.']\n", - "['443 p.']\n", - "['54 p.']\n", - "['19 p.']\n", - "['20 p.']\n", - "['29, 103 p.']\n", - "['341 p.']\n", - "['87 p.']\n", - "['31 p.']\n", - "['215 p.']\n", - "['24 p.']\n" - ] - } - ], - "source": [ - "for book in collection_index['results']:\n", - " #print(book['id'])\n", - " response = requests.get(book['id']+\"?fo=json\")\n", - " while response.status_code != 200:\n", - " #print(\"bad response, trying again in 60s\")\n", - " #sleep(1)\n", - " response = requests.get(book['id']+\"?fo=json\")\n", - " \n", - " #print(\"got the record!\")\n", - " data = response.json()\n", - " lccn = data['item']['library_of_congress_control_number']\n", - " medium = data['item']['medium']\n", - " print(medium)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/day-two/5-data-cleaning.ipynb b/day-two/5-data-cleaning.ipynb deleted file mode 100644 index 6c857ef..0000000 --- a/day-two/5-data-cleaning.ipynb +++ /dev/null @@ -1,1205 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data Cleaning\n", - "\n", - "* Anywhere from 50% to 80% of data science is data cleaning\n", - " * of course I hear 70% of statistics are made up on the spot\n", - "* Dealing with dirty data is a fact of life when 
doing data intensive research\n", - "* Especially if you are collecting or creating the data yourself\n", - "* Fortunately, Pandas is excellent at data cleaning and once you get the hang of it you might even enjoy it!\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load the necessary libraries\n", - "import pandas as pd\n", - "import numpy as np\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Missing Values \n", - "\n", - "* One of the challenges you may face when working with messy data is *missing* or **null** values \n", - "* There are multiple conventions for representing null values when doing data science in Python\n", - "* There is a Pythonic way using the `None` object\n", - "* There is a Numpy/Pandas-y way using `NaN`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### None - Pythonic Missing Data\n", - "\n", - "* None is the standard way of representing nothing in plain python\n", - "* It is useful, but it is also a complex data structure\n", - "* It can be used in numeric and programmatic contexts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a numpy array of numbers and a null value represented by None\n", - "some_numbers = np.array([1,None,3,4])\n", - "some_numbers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Because numpy arrays (and pandas series/columns) all have to be the same data type, it will default to the most expressive and most inefficient data type for the array\n", - " * Note: Pandas will automatically convert `None` to `NaN` so we use `np.array` here\n", - "* This means any operations running over the array/column/series are going to run slower than they could if the data type was numeric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ -
"# create a list of objects and a list of integers\n", - "# compute their sum and time how long it takes\n", - "for dtype in ['object','int']:\n", - " print(\"data type = \", dtype)\n", - " %timeit np.arange(1E6, dtype=dtype).sum()\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Notice the integer array was ***a lot*** faster than the object array\n", - "* Also, the vectorized math operations don't like `None`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "some_numbers.sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### NaN - Numpy/Pandas-y Missing Numeric data\n", - "\n", - "* The Numpy third-party library has a mechanism for representing missing numeric values\n", - "* Under the hood, NaNs are standards-compliant floating point numbers \n", - " * Note for R users: There is no `Null` only `NaN`\n", - "* This means you can use them with other numeric arrays for fast computations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a numeric Pandas Series with a missing value\n", - "nanny = pd.Series([1, np.nan, 3, 4])\n", - "nanny.dtype" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now we can use all the fast and easy computations in Pandas without worrying about missing values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compute the sum of all the numbers in the Series\n", - "nanny.sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Operating on Null Values\n", - "\n", - "* There are four functions in Pandas that are useful for working with missing data\n", - "* The examples below operate on Series, but they can work on Dataframes as well\n", - "\n", - "\n", - "### Null value functions\n", - "\n", - "* `isna()`
- Generate a boolean mask of the missing values (can also use `isnull()`)\n", - "* `notna()` - Do the opposite of `isna()` (can also use `notnull()`)\n", - "* `dropna()` - Create a filtered copy of the data with no null values\n", - "* `fillna(value)` - Create a copy of the data with null values filled in" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# display the Series\n", - "nanny" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# what values are null\n", - "nanny.isna()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# what values are not null\n", - "nanny.notna()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* These masks can be used to filter the data and create a view of missing or not missing " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# not super useful in a Series, but handy with Dataframes\n", - "nanny[nanny.isna()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Rather than creating a view, we can create *copies* of the data with the null values removed or filled in" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Just get rid of all the null values\n", - "no_null_nanny = nanny.dropna()\n", - "no_null_nanny" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fill in the null values with zero\n", - "fill_null_nanny = nanny.fillna(0)\n", - "fill_null_nanny" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# fill in the null values with a different value\n", - "fill_null_nanny = nanny.fillna(999)\n", - "fill_null_nanny" - ] - },
- { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# The original nanny Series remains untouched #noreboot\n", - "# Fran Drescher frowns with disappointment \n", - "nanny" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* These functions work with dataframes as well\n", - "* But you will need to pay closer attention to what it is doing " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_nanny = pd.DataFrame([[1, np.nan, 2],\n", - " [2, 3, 5],\n", - " [np.nan, 4, 6]])\n", - "df_nanny" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Dropping null values with `dropna()` removes the entire axis (row or column) and returns a new copy of the dataframe\n", - "* You can specify dropping rows or columns with the axis parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# dropna gets rid of rows by default\n", - "df_nanny.dropna() # axis=\"rows\" or axis=0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use the axis=\"columns\" or axis=1 to drop columns\n", - "df_nanny.dropna(axis=\"columns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* There are a couple other parameters that let you specify other behaviors\n", - "* Like only dropping rows/columns with all null values or setting a threshold" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with null values in real data\n", - "\n", - "* Here is an example of some real data, the diabetes data from week 2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import data file into a Pandas dataframe\n", - "df = pd.read_csv(\"../2 - data python two/diabetes.csv\")\n", - "\n", -
"# Display the first 5 rows of the data\n", - "df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display the metadata about the data, making sure to display null values\n", - "df.info() " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* If we look closely at this information we can see there are a few null values in this dataset\n", - "* There are 403 rows, but some columns have less than 403 non-null values\n", - "* Now let's check which values in the dataset are missing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a boolean mask where True indicates a null value\n", - "df.isna().head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Gak! Too much data, how can we just get a quick count of the null values?\n", - "* What if we combined `isnull()` with the `sum()` function?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use the sum function to count the True values in the boolean mask\n", - "df.isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* If we wanted to look at a specific column we can do the same operation \n", - "* These functions work with Series as well as DataFrames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# How many null values in the chol column\n", - "df[\"chol\"].isnull().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now let's deal with missing values\n", - "* Solution 1: Remove rows with empty values\n", - "* If there are only a few null values and you know that deleting values will not cause adverse effects on your result, remove them from your DataFrame\n", - "* Make sure to save the new dataframe to a new 
variable!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Display missing value counts\n", - "print(\"Missing values before dropping rows: \")\n", - "print(df.isnull().sum())\n", - "\n", - "\n", - "# Display new dataset\n", - "mod_df = df.dropna() # make a copy of the dataframe with null values removed\n", - "print(\"Missing values after dropping rows: \")\n", - "print(mod_df.isnull().sum())\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### EXERCISE\n", - "\n", - "A reviewer on your article that you submitted to the most prestigious journal in your field, loves your analysis but doesn't like the fact you dropped rows with missing cholesterol values. You can't drop them and you can't just put in zero, so you need to identify a technique to deal with those missing values; some kind of *interpolation* that *fills in* a new value in place of the null values. Hopefully it won't drastically change the interpretation!\n", - "\n", - "1. Create a new `filler_value` by deriving a number (mean, median or something else) from the column of cholesterol values (`df['chol']`)\n", - "2. 
Use the `fillna()` function to fill in the missing values of the cholesterol column\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Put your code here\n", - "\n", - "## Create a filler value\n", - "filler_value = ???\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Solution\n", - "\n", - "* One quick and easy way is to fill in missing values with the mean value of a given column" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Find the mean\n", - "filler_value = df[\"chol\"].mean()\n", - "filler_value" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Fill missing values with a mean (average) value of a given column\n", - "# Note the inplace=True parameter - that means that we are overwriting the data\n", - "# in the existing dataset\n", - "df[\"chol\"].fillna(filler_value, inplace=True)\n", - "df.isnull().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* No more null values in the `chol` column" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Vectorized String Operations\n", - "\n", - "* If you are dealing with textual or categorical data, you often have to clean strings\n", - "* Pandas has a set of *Vectorized String Operations* that are much faster and easier than the Python equivalents \n", - "* Especially handling bad data!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = ['peter', 'Paul', 'MARY', 'gUIDO']\n", - "\n", - "for s in data:\n", - " print(s.capitalize())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* But like above, this breaks very easily with missing values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = ['peter', 'Paul', None, 'MARY', 'gUIDO']\n", - "\n", - "for s in data:\n", - " print(s.capitalize())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* The Pandas library has *vectorized string operations* that handle missing data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert our list into a Series\n", - "names = pd.Series(data)\n", - "names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use the string vector function to capitalize everything\n", - "names.str.capitalize()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
[ - "* Look ma! No errors!\n", - "* Pandas includes a bunch of methods for doing things to strings.\n", - "\n", - "| Functions |. |. |. |\n", - "|-------------|------------------|------------------|------------------|\n", - "|``len()`` | ``lower()`` | ``translate()`` | ``islower()`` | \n", - "|``ljust()`` | ``upper()`` | ``startswith()`` | ``isupper()`` | \n", - "|``rjust()`` | ``find()`` | ``endswith()`` | ``isnumeric()`` | \n", - "|``center()`` | ``rfind()`` | ``isalnum()`` | ``isdecimal()`` | \n", - "|``zfill()`` | ``index()`` | ``isalpha()`` | ``split()`` | \n", - "|``strip()`` | ``rindex()`` | ``isdigit()`` | ``rsplit()`` | \n", - "|``rstrip()`` | ``capitalize()`` | ``isspace()`` | ``partition()`` | \n", - "|``lstrip()`` | ``swapcase()`` | ``istitle()`` | ``rpartition()`` |\n", - "\n", - "### Exercise\n", - "\n", - "* In the cells below, try three of the string operations listed above on the Pandas Series `monte`\n", - "* Remember, you can hit tab to autocomplete and shift-tab to see documentation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',\n", - " 'Eric Idle', 'Terry Jones', 'Michael Palin'])\n", - "monte" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# First\n", - "monte.str.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Second\n", - "monte.str.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Third\n", - "monte.str.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### String Vector Operations with Real Data\n", - "\n", - "* Let's try some string vector operations using real data!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# open the chipotle data and look at the first 5 rows\n", - "orders = pd.read_csv(\"../4 - data management one/chipotle.tsv\", sep=\"\\t\")\n", - "orders.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have loaded the data into a dataframe from a local tab-separated file." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# get the rows and columns of the dataframe\n", - "orders.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* We see there are 4,622 rows and 5 columns.\n", - "* Let's take a look at the row at position 4 (the fifth row) to see what textual information we have:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# display the row at position 4 of the DataFrame\n", - "orders.iloc[4]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* We can use Vectorized String Operations to explore the textual data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Summarize the length of the choice_description string\n", - "orders['choice_description'].str.len().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# which row has the longest ingredients string\n", - "orders['choice_description'].str.len().idxmax()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# use iloc to fetch that specific row from the dataframe\n", - "orders.iloc[3659]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use iloc to fetch the max row automatically\n", - "orders.iloc[orders['choice_description'].str.len().idxmax()]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# only look at the description string\n", - "orders.iloc[orders['choice_description'].str.len().idxmax()]['choice_description']" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "* WOW! That is a lot of ingredients! It looks like that string is semi-structured, I wonder if we can do something with it...\n", - "* We could start by doing some string matching" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# How many orders contain salsa\n", - "orders['choice_description'].str.contains('Salsa').sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Note, you can use dot notation with column names\n", - "* This is useful because then you can use autocomplete with the string vector functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# How many orders contain salsa\n", - "orders.choice_description.str.contains('Salsa').sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# How many Burritos\n", - "orders.item_name.str.contains(\"Burrito\").sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# How many burritos...capitalization matters!\n", - "orders.item_name.str.contains(\"burrito\").sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Let's find the burrito with the most items in it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# build a boolean mask of the rows whose item name contains Burrito\n", - "burrito_mask = orders.item_name.str.contains(\"Burrito\")\n", - "burrito_mask" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the id of the burrito with the longest description\n", - "max_burrito_id = orders[burrito_mask][\"choice_description\"].str.len().idxmax()\n", - "max_burrito_id" - ] - }, - { - "cell_type": "code", - 
"execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the description column of the row with the max_burrito_id\n", - "orders.iloc[max_burrito_id][\"choice_description\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* That is a LOADED BURRITO!\n", - "* This data is interesting, but not very useful because it is one big string\n", - "* But we can probably do more with that `choice_description` column\n", - "* Let's pretend [it doesn't look like Python code](https://stackoverflow.com/questions/33281450/right-way-to-use-eval-statement-in-pandas-dataframe-map-function) and instead treat it as a comma separated list\n", - "* What string function could we use?\n", - "\n", - "| Functions |. |. |. |\n", - "|-------------|------------------|------------------|------------------|\n", - "|``len()`` | ``lower()`` | ``translate()`` | ``islower()`` | \n", - "|``ljust()`` | ``upper()`` | ``startswith()`` | ``isupper()`` | \n", - "|``rjust()`` | ``find()`` | ``endswith()`` | ``isnumeric()`` | \n", - "|``center()`` | ``rfind()`` | ``isalnum()`` | ``isdecimal()`` | \n", - "|``zfill()`` | ``index()`` | ``isalpha()`` | ``split()`` | \n", - "|``strip()`` | ``rindex()`` | ``isdigit()`` | ``rsplit()`` | \n", - "|``rstrip()`` | ``capitalize()`` | ``isspace()`` | ``partition()`` | \n", - "|``lstrip()`` | ``swapcase()`` | ``istitle()`` | ``rpartition()`` |\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use the split function to break up the different \n", - "orders.choice_description.str.split(\",\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* But what about those pesky brackets! Let's get rid of them!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove the left brackets\n", - "orders.choice_description.str.replace(\"[\",\"\" )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove the left and right brackets\n", - "orders.choice_description.str.replace(\"[\",\"\" ).str.replace(\"]\",\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove the left and right brackets and split on commas\n", - "orders.choice_description.str.replace(\"[\",\"\" ).str.replace(\"]\",\"\").str.split(\",\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Wait what!? The brackets are back!(*@&#^$\n", - "* Yes, but now they indicate Python lists instead of `[` and `]` characters (confusing yes I know)\n", - "* How can we grab items from those lists of ingredients?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove the left and right brackets and split on commas and grab the first element\n", - "orders.choice_description.str.replace(\"[\",\"\" ).str.replace(\"]\",\"\").str.split(\",\").str[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove the left and right brackets and split on commas and grab the last element\n", - "orders.choice_description.str.replace(\"[\",\"\" ).str.replace(\"]\",\"\").str.split(\",\").str[-1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# remove the left and right brackets and split on commas and grab the first 3 elements\n", - "orders.choice_description.str.replace(\"[\",\"\" ).str.replace(\"]\",\"\").str.split(\",\").str[0:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Put the split descriptions into a new Series\n", - "split_description = orders.choice_description.str.replace(\"[\",\"\" ).str.replace(\"]\",\"\").str.split(\",\")\n", - "split_description" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# look at the element at position 4604 of the split_description series\n", - "split_description.iloc[4604]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Every item in the series is a list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Count how many items are in each description list\n", - "split_description.str.len()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "split_description.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/day-two/6-data-wrangling.ipynb b/day-two/6-data-wrangling.ipynb deleted file mode 100644 index 22aa76a..0000000 --- a/day-two/6-data-wrangling.ipynb +++ /dev/null @@ -1,844 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data Wrangling\n", - "\n", - "* Merging Datasets together\n", - "* Pivoting data\n", - "* Grouping \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Merging Data\n", - "\n", - "* Bringing disparate datasets together is one of the more powerful features of Pandas\n", - "* Like with Python lists, you can `append()` and `concat()` Pandas `Series` and `Dataframes`\n", - "* The `concat` is a module function, you call it directly from the pandas module (usually called `pd`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# concatinate two series together\n", - "ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])\n", - "ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])\n", - "pd.concat([ser1, ser2]) #note the Seres are passed as a list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# order matters\n", - "pd.concat([ser2, ser1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "# concatinate two dataframes\n", - "df1 = pd.DataFrame({\"A\":[\"A1\", \"A2\"],\n", - " \"B\":[\"B1\",\"B2\"]},index=[1,2])\n", - "df2 = pd.DataFrame({\"A\":[\"A3\", \"A4\"],\n", - " \"B\":[\"B3\",\"B4\"]},index=[3,4])\n", - "pd.concat([df1,df2])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Pandas will automatically line up matching indexes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# concatinate dataframes horizontally\n", - "df1 = pd.DataFrame({\"A\":[\"A1\", \"A2\"],\n", - " \"B\":[\"B1\",\"B2\"]},index=[1,2])\n", - "df2 = pd.DataFrame({\"C\":[\"C1\", \"C2\"],\n", - " \"D\":[\"D1\",\"D2\"]},index=[1,2])\n", - "pd.concat([df1,df2], axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* And pandas will gracefully handle mis-alignment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# What happens when indexes don't line up\n", - "df1 = pd.DataFrame({\"A\":[\"A1\", \"A2\"],\n", - " \"B\":[\"B1\",\"B2\"]},index=[1,2])\n", - "df2 = pd.DataFrame({\"A\":[\"A3\", \"A4\"],\n", - " \"B\":[\"B3\",\"B4\"]},index=[3,4])\n", - "pd.concat([df1,df2], axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* The `append` function is a method of a Series/Dataframe and returns a new object" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# append df2 to df1\n", - "df1.append(df2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merging and Joining\n", - "\n", - "* While `concat()` is useful it lacks the power to do complex data merging\n", - "* For example, I have two tables of different data but one shared column\n", - "* This is where the `merge()` function becomes useful because it lets you *join* datasets\n", - "* 
The concept of \"join\" has lots of theory and is a richly developed method for *joining* data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### One-to-one joins" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# create two dataframes with one shared column\n", - "df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue', \"Nancy\"],\n", - " 'group': ['Accounting', 'Engineering', 'Engineering', 'HR', \"Librarian\"]})\n", - "df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],\n", - " 'hire_date': [2004, 2008, 2012, 2014]})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# display df1\n", - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# display df2\n", - "df2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge df1 and df2 into a new dataframe df3\n", - "df3 = pd.merge(df1, df2)\n", - "df3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* The new dataframe `df3` now has all of the data from df1 and df2\n", - "* The `merge` function automatically connected the two tables on the \"employee\" column\n", - "* But what happens when your data don't line up?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Many-to-one joins\n", - "\n", - "* Sometimes there isn't a one to one relationship between rows in two datasets\n", - "* A *many-to-one* join lets you combine these datasets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# make another dataframe about the supervisor for each group\n", - "df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],\n", - " 'supervisor': ['Carly', 'Guido', 'Steve']})\n", - "df4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Merge df3 from above with the supervisor info in df4\n", - "pd.merge(df3,df4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Notice how the information about Guido, the manager for Engineering, is repeated.\n", - "* Pandas automatically fills in these values to maintain the tabular, 2 dimensional structure of the data\n", - "* While this might seem like duplicated data, it makes it easier to quickly look up Jake and Lisa's supervisor without consulting multiple tables" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Many-to-many joins\n", - "\n", - "* Let's combine the employee information with skills information\n", - "* Notice there isn't a one to one or even a one to many relationship between these tables\n", - "* Each group can have multiple skills, so **what do you think will happen?**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use the employee table specified above\n", - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a new dataframe with skills 
information\n", - "df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',\n", - " 'Engineering', 'Engineering', 'HR', 'HR', 'Librarian'],\n", - " 'skills': ['math', 'spreadsheets', 'coding', 'linux',\n", - " 'spreadsheets', 'organization', 'nunchucks']})\n", - "df5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pd.merge(df1, df5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Amazing, Pandas merge capabilities are very useful when column names match up\n", - "* But what do you do if the names of your columns don't match?\n", - "* You could change column names...\n", - "* But that is crazy! Just use the `left_on` and `right_on` parameters to the `merge()` function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use the employee table specified above\n", - "df1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df2 = df2.rename({\"employee\":\"name\"}, axis=\"columns\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# lets try and merge them without specifying what to merge on\n", - "pd.merge(df1, df2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Gak, error! Pandas can't figure out how to combine them\n", - "* What are the column names I should specify?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Now lets specify the column name \n", - "pd.merge(df1, df2, left_on=\"employee\", right_on=\"name\" )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Notice we now have a redundant employee/name column, this is a by-product of merging different columns\n", - "* If you want to get rid of it you can use the `drop` method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# drop the name column, axis=1 means axis='columns', which is confusing\n", - "pd.merge(df1, df2, left_on=\"employee\", right_on=\"name\" ).drop(\"name\", axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This is just a taste of merging and joining data\n", - "* We will cover more of this in the SQL and Relational Databases sessions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pivoting Data\n", - "\n", - "* Sometimes you get what is called \"long\" or \"stacked\" data (streaming values from an instrument or periodic observational data)\n", - "* Data in this shape can be difficult to analyze" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load the CSV file\n", - "data = pd.read_csv(\"../4 - data management one/community-center-attendance.csv\",\n", - " index_col=\"_id\")\n", - " \n", - "# look at the first ten rows of the data\n", - "data.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# How many rows we got?\n", - "data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* These data are looooooong\n", - "* Each row represents a community center in Pittsburgh reporting how many people visited the center\n", - "* Given this shape it is possible to do some 
calculations, but it might make more sense to *pivot* the data so that each column is a community center and each row is a day" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Use the pivot function to make column values into columns\n", - "pivoted_data = data.pivot_table(index=\"date\", # these values will be rows\n", - " columns=\"center_name\", # these values will be columns\n", - " values=\"attendance_count\" # these values will populate the table\n", - " )\n", - "pivoted_data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now we can easily find out things about our favorite community centers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Total number of people who have visited Magee\n", - "pivoted_data['Magee Community Center'].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Average attendance per day at Magee\n", - "pivoted_data['Magee Community Center'].mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Transpose\n", - "\n", - "* Pandas has a handy function for *transposing* dataframes\n", - "* It just rotates the table making the columns rows and the rows columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pivoted_data.T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now the Column and row indexes are swapped" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Grouping Data\n", - "\n", - "\n", - "* A common pattern in data analysis is splitting data by a key and then performing some math on all of the values with that key and finally combining it all back together\n", - "* This is commonly known in data circles as *split, apply, combine*\n" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a dataframe to illustrate GroupBy\n", - "df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],\n", - " 'data': range(6),\n", - " 'counts':[45,234,6,2,1324,345], \n", - " 'things':['dog', 'cat', 'cat', 'dog', 'cat', 'cat']}\n", - " )\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Dataframes have a method, groupby(), that takes a column name to be the grouping key\n", - "df.groupby('key')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for group in df.groupby('key'):\n", - " print(\"Group for key:\", group[0])\n", - " print(\"Data:\", group[1])\n", - " print(\"Data Type:\", type(group[1]))\n", - " print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Cool, we can see that we have *split* our data into three groups\n", - "* Now, we need to tell Pandas what function to *apply* to each group\n", - "* We need to specify what kind of aggregation, transformation, or computation to perform on the group" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Tell pandas to add up all of the values for each key\n", - "df.groupby('key').sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Under the hood Pandas is creating a bunch of new Dataframes based on the grouping column values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# you can save the group object and run different aggregations\n", - "grouped_dataframe = df.groupby('key')\n", - "grouped_dataframe.sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "grouped_dataframe.mean()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "* The following table summarizes some other built-in Pandas aggregations:\n", - "\n", - "| Aggregation | Description |\n", - "|--------------------------|---------------------------------|\n", - "| ``count()`` | Total number of items |\n", - "| ``size()`` | Total number of items w/ NaNs |\n", - "| ``first()``, ``last()`` | First and last item |\n", - "| ``mean()``, ``median()`` | Mean and median |\n", - "| ``min()``, ``max()`` | Minimum and maximum |\n", - "| ``std()``, ``var()`` | Standard deviation and variance |\n", - "| ``mad()`` | Mean absolute deviation |\n", - "| ``prod()`` | Product of all items |\n", - "| ``sum()`` | Sum of all items |\n", - "\n", - "* These are all methods of ``DataFrame`` and ``Series`` objects.\n", - "\n", - "* You can also do multiple levels of grouping" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.groupby(['things','key']).count()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* What you are seeing is what is called a [Multilevel Index](https://pandas.pydata.org/pandas-docs/stable/advanced.html)\n", - "* Sadly, we don't have time to cover that topic, but this chapter on [Hierarchical Indexing](https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html) in the [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/) is a great introduction to the topic." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Split, Apply, Combine with real data\n", - "\n", - "* Lets grab a dataset from the WPRDC, the [Allegheny County Jail Daily Census](https://data.wprdc.org/dataset/allegheny-county-jail-daily-census)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Grab three months of data\n", - "january17_jail_census = pd.read_csv(\"https://data.wprdc.org/datastore/dump/3b5d9c45-b5f4-4e05-9cf1-127642ad1d17\")\n", - "feburary17_jail_census = pd.read_csv(\"https://data.wprdc.org/datastore/dump/cb8dc876-6285-43a8-9db3-90b84eedb46f\")\n", - "march17_jail_census = pd.read_csv(\"https://data.wprdc.org/datastore/dump/68645668-3f89-4831-b1de-de1e77e52dd3\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "january17_jail_census.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Use the concat function to combine all three into one dataframe\n", - "* Remember `concat` is a general pandas function so we call it with `pd.concat`\n", - "* It takes as an argument a list of things to combine" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# make a list of dataframes to combine\n", - "dataframes = [january17_jail_census, \n", - " feburary17_jail_census, \n", - " march17_jail_census]\n", - "\n", - "# give the concat function the list\n", - "jail_census = pd.concat(dataframes)\n", - "jail_census" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now we can do some calculations on the data\n", - "* Note, because this is a daily census it includes many of the same people, so this mean isn't statistically *meaningful* \n", - "* We can still use these data for demonstration purposes " - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Compute the average age at booking by gender\n", - "jail_census.groupby('Gender')['Age at Booking'].mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# compute the average age at booking by race then gender \n", - "jail_census.groupby(['Race', 'Gender'])['Age at Booking'].mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If we look at the [data dictionary](https://data.wprdc.org/dataset/allegheny-county-jail-daily-census/resource/f0550174-16b0-4f6e-88dc-fa917e74b56c) we can see the following mapping for race categories\n", - "```\n", - "Race of Inmate\n", - "A-ASIAN OR PACIFIC ISLANDER\n", - "B-BLACK OR AFRICAN AMERICAN\n", - "H-HISPANIC \n", - "I-AMERICAN INDIAN OR ALASKAN NATIVE\n", - "U-UNKNOWN\n", - "W-WHITE\n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Get the statistical summary of age at booking by gender\n", - "jail_census.groupby('Gender')['Age at Booking'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# Compute the difference between Age at Booking and current age\n", - "age_difference = jail_census['Current Age'] - jail_census['Age at Booking']\n", - "age_difference.value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This uses a mathematical operation (subtraction) to compute the age difference for each row\n", - "* Then we use `value_counts` to count the age differences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jail_census.groupby('Date').count()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jail_census['year'] = jail_census['Date'].str.split(\"-\").str[0]\n", - "jail_census['month'] = jail_census['Date'].str.split(\"-\").str[1]\n", - "jail_census['day'] = jail_census['Date'].str.split(\"-\").str[2]\n", - "\n", - "jail_census.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jail_census.groupby('month').count()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "jail_census.groupby('day').count()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This is a really awkward way of dealing with time\n", - "* We shouldn't have to make a separate column for year, month, day\n", - "* There must be a better way to do this time stuff...\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/day-two/7-working-with-time.ipynb b/day-two/7-working-with-time.ipynb deleted file mode 100644 index 62f7faa..0000000 --- a/day-two/7-working-with-time.ipynb +++ /dev/null @@ -1,1193 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Working with Time\n", - "\n", - "* One of the most powerful features of Pandas is its time series functionality\n", - "* Dates and time are a Python and Pandas data type (like integers and strings)\n", - "* By using the `datetime` data types you can do advanced, time-centric analysis\n", - "* One thing to remember about computers is they are *very* specific\n", - " * *Time stamps* - a 
specific moment in time (July 4th, 2017 at 7:52am and 34 seconds)\n", - " * *Time intervals* - a length of time with start and end points (The year 2017)\n", - " * *Time duration* - a specific length of time (a year, a month, a day)\n", - " \n", - " \n", - "* Python has data structures for working with time as part of the standard library" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Datetime in pure Python\n", - "import datetime\n", - "\n", - "date = datetime.datetime(year=2017, month=6, day=13)\n", - "date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# A more specific datetime in pure Python\n", - "import datetime\n", - "\n", - "date = datetime.datetime(year=2017, month=6, day=13, hour=4, minute=45, second=3, microsecond=13)\n", - "date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "type(date)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# what is that date's month?\n", - "date.month" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# what is that date's day?\n", - "date.day" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* The `dateutil` package (a third-party library installed alongside Pandas) includes a handy utility for parsing date strings into `datetime` objects called `dateutil.parser`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import the parser module from the dateutil library to parse human date strings\n", - "from dateutil import parser\n", - "date = parser.parse(\"4th of July, 2017\")\n", - "date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the month\n", - "date.month" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise\n", - "\n", - "Try some different date strings, see how smart Python can be." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_date = parser.parse(\"\")\n", - "my_date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "stringy = \"January 1st 2900 at 3:15pm 45seconds\"\n", - "parser.parse(stringy)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Formatting Dates and Times" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* You can use [*string format codes*](https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior) for printing dates and time in different formats (especially useful for making human readable dates)\n", - "* Pass a format string to the `strftime()` method to print out a pretty date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the weekday \n", - 
"date.strftime(\"%A\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "date.strftime(\"%B\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "date.strftime(\"%A, %B %d %f\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "date.strftime(\"The ear of ar ord %Y\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Try some of the different string format codes and see what happens\n", - "date.strftime(\"%P\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import locale" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "locale.getlocale()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Try combining a few of them together with punctuation too\n", - "date.strftime(\"%Y-%m-%d\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with time in Pandas\n", - "\n", - "* Just like how Pandas has its own datatypes for numbers, Pandas has its own dates and times (to support more granularity)\n", - "* If you have a lot of dates, it is often useful to use the Pandas functions over the native Python functions\n", - "* Pandas is most powerful when you index by time using the `DatetimeIndex`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a Series with a DateTime index\n", - "index = pd.DatetimeIndex(['2014-03-04', '2014-08-04',\n", - " '2015-04-04', '2015-09-04',\n", - " '2016-01-01', '2016-02-16'])\n", - "data = pd.Series([10, 13, 24, 34, 34, 25], index=index)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now that the index is made of DateTimes we can index using date strings\n", - "* Note, this only works on strings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# grab the value for a specific day\n", - "data[\"2015-04-04\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# grab a slice between two dates\n", - "data['2014-08-01':'2016-01']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# give me everything from 2015\n", - "data['2015']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Pandas has some functions to make parsing dates easy too" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use the to_datetime function instead of the parser function\n", - "date = pd.to_datetime(\"4th of July, 2017\")\n", - "date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use string format codes to get the weekday\n", - "date.strftime(\"%A\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# give me today's date\n", - "today = pd.to_datetime(\"today\")\n", - "today" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* That is the day, but also the exact time... 
\n", - "* Timestamps must always be a specific moment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with Time on Real Data\n", - "* Let's look at the [311 data for the city of Pittsburgh](https://data.wprdc.org/dataset/311-data) from the WPRDC\n", - "* Did you know, you can give the URL directly to Pandas!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load the 311 data directly from the WPRDC\n", - "pgh_311_data = pd.read_csv(\"https://data.wprdc.org/datastore/dump/76fda9d0-69be-4dd5-8108-0de7907fc5a4\")\n", - "pgh_311_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Inspect the dataframe and Pandas automatic data type detection\n", - "pgh_311_data.info()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Ok, now we have the data, but we need it to be indexed by date\n", - "* **What column has the date information?**\n", - "* **What format do you think that column is currently in?**\n", - "* **What function might we use to convert that column into dates?**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pgh_311_data['CREATED_ON'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# convert the \"CREATED_ON\" column to dates\n", - "pd.to_datetime(pgh_311_data['CREATED_ON']).head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* We can convert the \"CREATED_ON\" column to Pandas `datetime` objects\n", - "* Now we have to set that to the dataframe's index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# set the index of pgh_311_data to be the parsed dates in the \"CREATED_ON\" column\n", - "pgh_311_data.index = pd.to_datetime(pgh_311_data['CREATED_ON'])\n", - "pgh_311_data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Do'h, now we have CREATED_ON twice, that isn't very tidy\n", - "* We can also skip this extra conversion step entirely by specifying the index column and date parsing in `read_csv()` function call." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load the 311 data directly from the WPRDC and parse dates directly\n", - "pgh_311_data = pd.read_csv(\"https://data.wprdc.org/datastore/dump/76fda9d0-69be-4dd5-8108-0de7907fc5a4\",\n", - " index_col=\"CREATED_ON\", \n", - " parse_dates=True)\n", - "pgh_311_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pgh_311_data.info()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now that the dataframe has been indexed by time we can select 311 complaints by time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Select 311 complaints on January 1st, 2016\n", - "pgh_311_data['2016-01-01']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Select the times just around the new years celebration\n", - "pgh_311_data[\"2015-12-31 20:00:00\":\"2016-01-01 02:00:00\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Someone clearly had a very rowdy new years " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise\n", - "\n", - "* Using the timeseries index selection, select the complaints made today \n", - "* Next, try and write your code so it will work on any day you execute it\n", - " * *hint*: try the `pd.to_datetime('today')` function \n", - " * *Another hint*: Remember the DateTime gives you the exact time \n", - " * *Yet another hint*: Datetime indexing only works with string representations " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write your code here\n", - "pgh_311_data[]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pgh_311_data['2018-10-24'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a Pandas datetime for today\n", - "today = pd.to_datetime(\"today\")\n", - "formatted_today_string = today.strftime(\"%Y-%m-%d\")\n", - "print(today)\n", - "print(formatted_today_string)\n", - "\n", - "# use Pandas date string indexing to retrieve all rows for this today's date\n", - "todays_311s = pgh_311_data[formatted_today_string]\n", - "todays_311s.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Grouping time with the `resample` method\n", - "\n", - "* The `resample()` method is like `groupby()` but for time. 
\n", - "* First you *split* your data into groupings of time\n", - "* Then *apply* an aggregation function to perform calculations \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# count the number of complaints per week\n", - "pgh_311_data.resample(\"W\").size()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compute the mean of complaints per quarter...note this doesn't make sense, but works anyway\n", - "pgh_311_data.resample(\"Q\").mean()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Ok, these data are *begging* to be visualized, so I'm going to give you a teaser of next week " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Tell matplotlib to render plots inline\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a graph of the monthly complaint counts\n", - "pgh_311_data['REQUEST_ID'].resample(\"M\").count().plot();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Try the code above, but re-sampling based upon different date periods. 
The strings for specifying an offset are located [here](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) and below:\n", - "\n", - "|Alias|Description|\n", - "|-----|-----------|\n", - "|B|business day frequency|\n", - "|C|custom business day frequency|\n", - "|D|calendar day frequency|\n", - "|W|weekly frequency|\n", - "|M|month end frequency|\n", - "|SM|semi-month end frequency (15th and end of month)|\n", - "|BM|business month end frequency|\n", - "|CBM|custom business month end frequency|\n", - "|MS|month start frequency|\n", - "|SMS|semi-month start frequency (1st and 15th)|\n", - "|BMS|business month start frequency|\n", - "|CBMS|custom business month start frequency|\n", - "|Q|quarter end frequency|\n", - "|BQ|business quarter end frequency|\n", - "|QS|quarter start frequency|\n", - "|BQS|business quarter start frequency|\n", - "|A, Y|year end frequency|\n", - "|BA, BY|business year end frequency|\n", - "|AS, YS|year start frequency|\n", - "|BAS, BYS|business year start frequency|\n", - "|BH|business hour frequency|\n", - "|H|hourly frequency|\n", - "|T, min|minutely frequency|\n", - "|S|secondly frequency|\n", - "|L, ms|milliseconds|\n", - "|U, us|microseconds|\n", - "|N|nanoseconds|" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Try a different resampling here\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Try yet another resampling here\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exploring the 311 Data\n", - "\n", - "* Now we can use what we have learned to do some exploratory data analysis on the 311 data\n", - "* First, lets use the `sample()` method to grab 10 random rows so we can get a feel for the data\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Sample 10 random rows from the dataframe\n", - "pgh_311_data.sample(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Where are the requests generally coming from?\n", - "pgh_311_data['REQUEST_ORIGIN'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# What neighborhood complains the most\n", - "pgh_311_data.groupby('NEIGHBORHOOD').size().sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Note, for just counting the groupby and value_counts are equivalent\n", - "# There is more than one way to skin the cat (or panda)\n", - 
"pgh_311_data['NEIGHBORHOOD'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Count the different types of requests\n", - "pgh_311_data.groupby(\"REQUEST_TYPE\").size().sort_values(ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This categorical data is far too granular. \n", - "* Fortunately, if we look at the [311 Data](https://data.wprdc.org/dataset/311-data) we can see there is a [311 Issue and Category Codebook](https://data.wprdc.org/dataset/311-data/resource/7794b313-33be-4a8b-bf80-41751a59b84a). \n", - "\n", - "* What we need to do is download the CSV from Google Sheets directly into a Pandas dataframe, which requires some fancy URL-ing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# I googled \"pandas dataframe from google sheets\"\n", - "# and found a solution on Stackoverflow\n", - "# https://stackoverflow.com/a/35246041\n", - "issue_category_mapping = pd.read_csv('https://docs.google.com/spreadsheets/d/' + \n", - " '1DTDBhwXj1xQG1GCBKPqivlzHQaLh2HLd0SjN1XBPUw0' +\n", - " '/export?gid=0&format=csv')\n", - "issue_category_mapping.head(5) # Same result as @TomAugspurger" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Exercise\n", - "\n", - "* We have two dataframes with one shared column. 
The `REQUEST_TYPE` column in `pgh_311_data` maps to the `Issue` column in `issue_category_mapping`\n", - "* Use the `pd.merge()` function to create a new dataframe that combines the `pgh_311_data` with the `issue_category_mapping` so we can count the number of complaints per Category instead of Request type.\n", - " * *HINT*: You will need to specify the `left_on` and `right_on` parameters\n", - "* Once you have created a new dataframe, do a `groupby` on the Category column and count the number of complaints\n", - " * *HINT*: Use the `size()` aggregation function and then `sort_values()` if you want it nicely ordered." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Your code here\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create a new merged dataframe\n", - "merged_311_data = pd.merge(pgh_311_data, \n", - " issue_category_mapping,\n", - " left_on=\"REQUEST_TYPE\",\n", - " right_on=\"Issue\")\n", - "\n", - "merged_311_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get rid of redundant 
columns\n", - "merged_311_data.drop(['Definition','Department', 'Issue'], \n", - " axis=1, \n", - " inplace=True)\n", - "merged_311_data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Group by the Category and count the size of each group then sort by count (largest first)\n", - "merged_311_data.groupby(\"Category\").size().sort_values(ascending=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Where are most 311 requests from the Greenfield neighborhood originating?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Select only rows where NEIGHBORHOOD equals \"Greenfield\" and then count how many complaints came from each source\n", - "query_mask = merged_311_data['NEIGHBORHOOD'] == 'Greenfield'\n", - "merged_311_data[query_mask].groupby('REQUEST_ORIGIN').size().sort_values(ascending=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.7", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/day-two/8-apis-and-json.ipynb b/day-two/8-apis-and-json.ipynb deleted file mode 100644 index f770d93..0000000 --- a/day-two/8-apis-and-json.ipynb +++ /dev/null @@ -1,391 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Application Programming Interfaces (APIs)\n", - "\n", - "![Diagram of an API, Image from Software AG](https://www.softwareag.com/pl/images/SAG_API_Portal_KD_1_Large_tcm403-160297.png)\n", - "\n", - "* API or application programming interface is not a new term, but it has taken on a new significance with the Web\n", - "* Now we talk about \"Web APIs\" or \"Web Services\" which are APIs that use web technologies and standards ;)\n", - "* If UIs are interfaces for the user to access a system, APIs are software interfaces for systems to access other systems (with the help of a Developer).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## What might an API look like?\n", - "\n", - "* Imagine having an API that takes a time zone string and returns the current time for that time zone\n", - "* This API would take a string like \"America/Los_Angeles\" and return \"2019-02-28T20:09:45-07:00\"\n", - "* One design of the API might look like this:\n", - " * http://api.example.com/timezone?tz=America+Los_Angeles\n", - "\n", - "```json\n", - "{\n", - " \"time\": \"2019-02-28T20:09:45-07:00\",\n", - " \"zone\": \"America/Los_Angeles\"\n", - "}\n", - "```\n", - "\n", - "* This API has an *endpoint*, `/timezone`, that expects a *query parameter*, `tz={Timezone location}`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## APIs Support Multiple Interfaces\n", - "\n", - "* With an API you don't need to build multiple interfaces to applications and platforms\n", - "* You build a generic interface and the platforms conform to the API specification and data formats\n", - "\n", - "![A web page that accesses an API. Image from Express In Action](https://cdn.glitch.com/47219279-662d-49cf-9388-c11e70fac7be%2FScreenshot%202019-02-28%2013.06.03.png?1551377204363)\n", - "\n", - "* By separating the content from the representation we can support multiple modes of access\n", - "\n", - "![A command line application accessing an API. 
Image from Express In Action](https://cdn.glitch.com/47219279-662d-49cf-9388-c11e70fac7be%2FScreenshot%202019-02-28%2013.06.31.png?1551377204295)\n", - "\n", - "* APIs can use different data formats like XML or proprietary formats\n", - "* These days many web-based APIs, including the LC API, use a data format called JSON" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## What is JSON\n", - "\n", - "* JSON stands for [*JavaScript Object Notation*](https://www.json.org/)\n", - "* It is a text-based format that is valid JavaScript code\n", - "* It is \"self-describing\" and easy to read and understand\n", - "* A lightweight format for storing and transferring data\n", - "* While it uses JavaScript syntax, it is supported by nearly every programming language." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## JSON Data Types\n", - "\n", - "* Strings - Strings must be enclosed in double quotes. `{\"name\":\"Bob\"}`\n", - "* Numbers - Numbers must be integer or floating point numbers. `5, 5.6`\n", - "* Objects - Values can be JSON objects. `{\"employee\":{ \"name\":\"John\", \"age\":30, \"city\":\"New York\" }}` (look like anything you know?)\n", - "* Arrays - Arrays must be an ordered list of any values. `{\"employees\":[ \"John\", \"Anna\", \"Peter\" ]}`\n", - "* Boolean - Must be a true or false value. `{\"sale\":true}`\n", - "* Null - Values can also be null. 
`{\"middlename\":null}`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with JSON in Python\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# import the JSON module so we can load and save JSON data\n", - "import json\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reading JSON\n", - "\n", - "* In Python you parse JSON text into Python data structures using the `json.loads()` and `json.load()` functions.\n", - "* " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n[\\n{\"first\":\"Matt\",\"last\":\"Burton\",\"city\":\"Pittsburgh\"},\\n{\"name\":\"John\", \"age\":30, \"city\":\"New York\"},\\n{\"first\": \"Jessica\", \"city\":\"Chicago\"}\\n]\\n'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# create a string that represents some JSON data\n", - "json_string = \"\"\"\n", - "[\n", - "{\"first\":\"Matt\",\"last\":\"Burton\",\"city\":\"Pittsburgh\"},\n", - "{\"name\":\"John\", \"age\":30, \"city\":\"New York\"},\n", - "{\"first\": \"Jessica\", \"city\":\"Chicago\"}\n", - "]\n", - "\"\"\"\n", - "json_string" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This is a Python string, but it can be parsed into a list of dictionaries using `json.loads()`" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'city': 'Pittsburgh', 'first': 'Matt', 'last': 'Burton'},\n", - " {'age': 30, 'city': 'New York', 'name': 'John'},\n", - " {'city': 'Chicago', 'first': 'Jessica'}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "json_data = 
json.loads(json_string)\n", - "json_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Now the data has been loaded into Python and we can access it " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'city': 'Pittsburgh', 'first': 'Matt', 'last': 'Burton'}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get the first item \n", - "json_data[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'New York'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# get the city field of the second item\n", - "json_data[1]['city']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* Parsing JSON can be very picky and cryptic\n", - "* For example, why doesn't this work?" 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "JSONDecodeError", - "evalue": "Expecting property name enclosed in double quotes: line 5 column 22 (char 124)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m ]\n\u001b[1;32m 8\u001b[0m \"\"\"\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbad_json_string\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/miniconda3/lib/python3.7/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 348\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 349\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 350\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/lib/python3.7/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 351\u001b[0m \"\"\"\n\u001b[1;32m 352\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 353\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting property name enclosed in double quotes: line 5 column 22 (char 124)" - ] - } - ], - "source": [ - "# create a string that represents some JSON data\n", - "bad_json_string = \"\"\"\n", - "[\n", - "{\"first\":\"Matt\",\"last\":\"Burton\",\"city\":\"Pittsburgh\"},\n", - "{\"name\":\"John\", \"age\":30, \"city\":\"New York\"},\n", - "{\"first\": \"Jessica\", city:\"Chicago\"}\n", - "]\n", - "\"\"\"\n", - "json.loads(bad_json_string)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reading JSON from files\n", - "\n", - "* If you have a JSON file you can read it from disk using `json.load()`\n", - " * Example file comes from [wikipedia](https://en.wikipedia.org/wiki/JSON)\n", - "* " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'address': {'city': 'New York',\n", - " 'postalCode': '10021-3100',\n", - " 'state': 'NY',\n", - " 'streetAddress': '21 2nd Street'},\n", - " 'age': 27,\n", - " 'children': [],\n", - " 'firstName': 'John',\n", - " 'isAlive': True,\n", - " 
'lastName': 'Smith',\n", - " 'phoneNumbers': [{'number': '212 555-1234', 'type': 'home'},\n", - " {'number': '646 555-4567', 'type': 'office'},\n", - " {'number': '123 456-7890', 'type': 'mobile'}],\n", - " 'spouse': None}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# open a file handle in read-only mode\n", - "with open(\"test.json\", \"r\") as f:\n", - " # pass the file handle to the json parser\n", - " json_from_file = json.load(f)\n", - " \n", - "json_from_file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This has been parsed into a Python dictionary\n", - "* The JSON keys are now Python dictionary keys" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'John'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "json_from_file['firstName']" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'John Smith'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_name = json_from_file[\"firstName\"] + \" \" + json_from_file[\"lastName\"]\n", - "full_name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/day-two/9-data-from-the-web.ipynb b/day-two/9-data-from-the-web.ipynb deleted file mode 100644 
index 1f465a5..0000000 --- a/day-two/9-data-from-the-web.ipynb +++ /dev/null @@ -1,593 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fetching data from the web\n", - "\n", - "- [About the loc.gov JSON API](https://libraryofcongress.github.io/data-exploration/)\n", - "\n", - "\n", - "- collection\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "from time import sleep" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* What we want to do is programmatically download the information from a particular collection\n", - "* We are interested in the [Selected Digitized Books](https://www.loc.gov/collections/selected-digitized-books/) collection\n", - "* When we visit the link we can see the HTML representation of that collection\n", - "\n", - "* What we want to do is access this information using Python and manipulate it\n", - "* The first step is to retrieve a representation that is easier for the computer to work with\n", - " * While we could parse the HTML, this would be a lot of effort (save this for another workshop)\n", - "* Fortunately the LC provides a convenient API for obtaining a more *computationally tractable* representation of the pages\n", - " * In JSON\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Retrieving Data from the Web\n", - "\n", - "* We don't always have to download data to our local machines before loading it into Python\n", - "* If the data are openly available on the web we can retrieve them programmatically\n", - " * We can even log into systems with access control, but that is a more complicated topic\n", - "* Getting remote data requires the use of *web protocols* to request it from a remote server\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### What is HTTP\n", - "\n", - "* HTTP is 
the *HyperText Transfer Protocol* and is the lingua franca of the web\n", - " > HTTP is a protocol which allows the fetching of resources, such as HTML documents. It is the foundation of any data exchange on the Web and a client-server protocol, which means requests are initiated by the recipient, usually the Web browser. A complete document is reconstructed from the different sub-documents fetched, for instance text, layout description, images, videos, scripts, and more. - [MDN Web Docs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Overview)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![HTTP Flow](images/http-flow.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Elements of HTTP\n", - "\n", - "* Request Methods - Verbs\n", - " * GET - Requests a representation of a specific resource. Retrieve only.\n", - " * POST - Submit an entity to a specified resource, often causing a change in state on the server.\n", - " * PUT - Replace the current representation of the specified resource with the request payload.\n", - " * DELETE - Remove the specified resource from the server.\n", - " * HEAD - Same as GET, but without the response body.\n", - "* User Agent - Information about the application making the request\n", - "* Headers - Metadata about the request\n", - "* Body - Data sent or received\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### HTTP Status Codes\n", - "\n", - "* HTTP has five categories of status code\n", - " * 1xx: informational – used for development\n", - " * 2xx: Successful response\n", - " * 3xx: Redirection\n", - " * 4xx: Client Error\n", - " * 5xx: Server Error\n", - "* Frequently used codes:\n", - " * 200 - success\n", - " * 301 and 302 - Moved permanently or temporarily\n", - " * 400 - bad request\n", - " * 401 - unauthorized\n", - " * 403 - forbidden\n", - " * 404 - not found\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "### HTTP Request & Response\n", - "\n", - "![HTTP Request and Response](images/http-request-response.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Working with HTTP in Python\n", - "\n", - "* Because Python comes with *batteries included*, there is an [http client module](https://docs.python.org/3/library/http.client.html) as part of the standard library\n", - " * It is fine in a pinch, but there is a better 3rd party library\n", - "* The [Requests](https://2.python-requests.org/en/master/) library by [Kenneth Reitz](https://www.kennethreitz.org/)\n", - " * It is *HTTP for humans*\n", - "* Requests is the most popular library for fetching data from the web\n", - "* It is very powerful, but we will only touch on a little bit of it today." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# load the requests library\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# put the address of the page we want to load into a variable\n", - "URL = \"http://loc.gov\"\n", - "\n", - "# make an HTTP GET request to the specified URL\n", - "# Save the response in a variable\n", - "response = requests.get(URL)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "200" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Inspect the response status code\n", - "response.status_code" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This means our HTTP request was successful \n", - "* Requests makes it easy to inspect various bits of information related to our HTTP transaction" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Date': 'Fri, 17 
May 2019 20:36:20 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-Frame-Options': 'allow-from https://unitedstateslibraryofcongress.marketing.adobe.com', 'Expires': 'Fri, 17 May 2019 17:53:13 GMT', 'Cache-Control': 'no-transform, max-age=3600', 'Age': '0', 'Expect-CT': 'max-age=604800, report-uri=\"https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct\"', 'Server': 'cloudflare', 'CF-RAY': '4d886205dd1fa409-PIT', 'Content-Encoding': 'gzip'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Display the HTTP headers we got from the server\n", - "response.headers" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'text/html; charset=UTF-8'" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Look at the content type of the resource we got back from the server\n", - "response.headers['Content-Type']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* This means we got an HTML document back from loc.gov\n", - "* You can access the response body in the `response.text` or `response.content` fields\n", - " * Be careful, They can be really long!" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "