From ad3ba658f39d0204d6c1dd7dbd1cdb4909ab6574 Mon Sep 17 00:00:00 2001 From: kannab Date: Wed, 9 Sep 2015 23:08:14 -0700 Subject: [PATCH 1/3] Commited with some comments --- .../02-DataScrapingQuizzes-checkpoint.ipynb | 2331 +++++++++++++++++ Lectures/02-DataScrapingQuizzes.ipynb | 9 +- 2 files changed, 2339 insertions(+), 1 deletion(-) create mode 100755 Lectures/.ipynb_checkpoints/02-DataScrapingQuizzes-checkpoint.ipynb diff --git a/Lectures/.ipynb_checkpoints/02-DataScrapingQuizzes-checkpoint.ipynb b/Lectures/.ipynb_checkpoints/02-DataScrapingQuizzes-checkpoint.ipynb new file mode 100755 index 0000000..6eeda04 --- /dev/null +++ b/Lectures/.ipynb_checkpoints/02-DataScrapingQuizzes-checkpoint.ipynb @@ -0,0 +1,2331 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "## all imports\n", + "from IPython.display import HTML\n", + "import numpy as np\n", + "import urllib2\n", + "import bs4 #this is beautiful soup\n", + "import time\n", + "import operator\n", + "import socket\n", + "import cPickle\n", + "import re # regular expressions\n", + "\n", + "from pandas import Series\n", + "import pandas as pd\n", + "from pandas import DataFrame\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import seaborn as sns\n", + "sns.set_context(\"talk\")\n", + "sns.set_style(\"white\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "THis is a test comment" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "API registrations\n", + "=================\n", + "\n", + "If you would like to run all the examples in this notebook, you need to register for the following APIs:\n", + "\n", + "* Rotten Tomatoes\n", + "\n", + "http://developer.rottentomatoes.com/member/register\n", + "\n", + "* 
Twitter\n", + "\n", + "https://apps.twitter.com/app/new\n", + "\n", + "* Twitter instructions\n", + "\n", + "https://twittercommunity.com/t/how-to-get-my-api-key/7033" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "CS109\n", + "=====\n", + "\n", + "Verena Kaynig-Fittkau, Joe Blitzstein, Hanspeter Pfister\n", + "\n", + "* vkaynig@seas.harvard.edu\n", + "* staff@cs109.org" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Announcements\n", + "==============\n", + "\n", + "* Over 400 sign ups on github!\n", + "* If you are still missing, fill out the survey, time is running out!\n", + "* Make sure you are on Piazza!\n", + "\n", + "\n", + "* More [git help](https://www.youtube.com/channel/UC0-KaiZFXBlGOFN71YsEV8g/videos)\n", + "\n", + "\n", + "* HW0 is due on Thursday \n", + "* HW1 is coming out on Thursday\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Todays lecture:\n", + "===============\n", + "\n", + "* introduction to pandas\n", + " - read a table\n", + " - do some plots\n", + "\n", + "* all about data scraping\n", + "* ***What is it? 
***\n", + "* How to do it:\n", + " - from a website\n", + " - with an API" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "IPython Notebooks:\n", + "===================\n", + "\n", + "![IPython](images/ipython.png \"IPython\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "IPython Notebooks:\n", + "===================\n", + "\n", + "* These slides are an IPython notebook!\n", + "* https://github.com/damianavila/live_reveal" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print \"Hello CS109\"\n", + "\n", + "print \"I love IPython\"\n", + "\n", + "# Ipython notebook have tab completion!\n", + "# and inbuild help\n", + " \n", + "a = np.zeros(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "General advice about programming\n", + "==================================\n", + "\n", + "* You will find nearly everything on google\n", + "* Try: length of a list in python\n", + "* A programmer is someone who can turn stack overflow snippets into running code\n", + "* Use tab completion\n", + "* Make your variable names meaningful\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "How to load a table\n", + "===================\n", + "\n", + "* we use Pandas for this\n", + "* Pandas can do a __lot__ more\n", + "* more about it later" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "The MovieLens data\n", + "===================\n", + "\n", + "http://grouplens.org/datasets/movielens/\n", + "\n", + "![Grouplens](images/grouplens.jpg \"Grouplens\")\n", + "\n", + "Example inspired by [Greg 
Reda](http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Read the user data\n", + "==================" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "# pass in column names for each CSV\n", + "u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']\n", + "\n", + "users = pd.read_csv(\n", + " 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user', \n", + " sep='|', names=u_cols)\n", + "\n", + "users.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Read the ratings\n", + "============" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n", + "ratings = pd.read_csv(\n", + " 'http://files.grouplens.org/datasets/movielens/ml-100k/u.data', \n", + " sep='\\t', names=r_cols)\n", + "\n", + "ratings.head() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Now data about the movies\n", + "=========================" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# the movies file contains columns indicating the movie's genres\n", + "# let's only load the first five columns of the file with usecols\n", + "m_cols = ['movie_id', 'title', 'release_date', \n", + " 'video_release_date', 'imdb_url']\n", + "\n", + "movies = pd.read_csv(\n", + " 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item', \n", + " sep='|', names=m_cols, usecols=range(5))\n", + "\n", + "movies.head()" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Get information about data\n", + "=======================" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print movies.dtypes\n", + "print\n", + "print movies.describe()\n", + "# *** Why only those two columns? ***" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Selecting data\n", + "==============\n", + "\n", + "* DataFrame => group of Series with shared index\n", + "* single DataFrame column => Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "users.head()\n", + "users['occupation'].head()\n", + "## *** Where did the nice design go? ***\n", + "columns_you_want = ['occupation', 'sex'] \n", + "users[columns_you_want].head()\n", + "\n", + "print users.head()\n", + "\n", + "print users.iloc[3]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Filtering data\n", + "==============\n", + "\n", + "Select users older than 25" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "oldUsers = users[users.age > 25]\n", + "oldUsers.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Quiz:\n", + "=====\n", + "\n", + "* show users aged 40 and male\n", + "\n", + "* show the mean age of female programmers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# users aged 40 AND male\n", + "# your code here" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## users who are female and programmers\n", + "# your code here\n", + "\n", + "## show statistic summary or compute mean\n", + "# your code here" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Split-apply-combine\n", + "===================\n", + "\n", + "* splitting the data into groups based on some criteria\n", + "* applying a function to each group independently\n", + "* combining the results into a data structure" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Split-apply-combine\n", + "===================\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Find Diligent Users\n", + "===================\n", + "\n", + "* split data per user ID\n", + "* count ratings\n", + "* combine result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "print ratings.head()\n", + "## split data\n", + "grouped_data = ratings.groupby('user_id')\n", + "#grouped_data = ratings['movie_id'].groupby(ratings['user_id'])\n", + "\n", + "## count and combine\n", + "ratings_per_user = grouped_data.count()\n", + "\n", + "ratings_per_user.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Quiz\n", + "====\n", + "\n", + "* get the average rating per movie\n", + "* advanced: get the movie titles with the highest average rating" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": 
"slide" + } + }, + "outputs": [], + "source": [ + "## split data\n", + "# your code here\n", + "\n", + "## average and combine\n", + "# your code here\n", + "\n", + "# get the maximum rating\n", + "# your code here\n", + "\n", + "# get movie ids with that rating\n", + "# your code here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "print \"Good movie ids:\"\n", + "print #your code here\n", + "print\n", + "\n", + "print \"Best movie titles\"\n", + "print # your code here\n", + "print" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# get number of ratings per movie\n", + "# your code here\n", + "\n", + "print \"Number of ratings per movie\"\n", + "print # your code here" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Passing a Function\n", + "==================\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "average_ratings = grouped_data.apply(lambda f: f.mean())\n", + "average_ratings.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Quiz\n", + "====\n", + "\n", + "* get the average rating per user\n", + "* advanced: list all occupations and if they are male or female dominant" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# get the average rating per user\n", + "# your code here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + 
"slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# list all occupations and if they are male or female dominant\n", + "# your code here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "print 'number of male users: '\n", + "print sum(users['sex'] == 'M')\n", + "\n", + "print 'number of female users: '\n", + "print sum(users['sex'] == 'F')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Pandas \"wrapup\"\n", + "==========\n", + "\n", + "- create data frames\n", + "- get sub-frames\n", + "- filter data \n", + "- use group-by\n", + "- apply a user defined function\n", + "\n", + "\n", + "![cute panda](images/cute_panda.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Python data scraping\n", + "====================\n", + "\n", + "* Why scrape the web?\n", + " - vast source of information\n", + " - automate tasks\n", + " - keep up with sites\n", + " - fun!\n", + "\n", + "** Can you think of examples ? **" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Read and Tweet!\n", + "=================\n", + "\n", + "![ReadTweet](http://developer.nytimes.com/files/readtweet.jpg)\n", + "\n", + "* by Justin Blinder\n", + "* http://projects.justinblinder.com/We-Read-We-Tweet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "“We Read, We Tweet” geographically visualizes the dissemination of New York Times articles through Twitter. Each line connects the location of a tweet to the contextual location of the New York Times article it referenced. The lines are generated in a sequence based on the time in which a tweet occurs. 
The project explores digital news distribution in a temporal and spatial context through the social space of Twitter." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Twitter Sentiments\n", + "=================\n", + "\n", + "![TwitterSentiments](http://www.csc.ncsu.edu/faculty/healey/tweet_viz/figs/tweet-viz-ex.png\n", + " \"Twitter Sentiments\")\n", + "\n", + "* by Healey and Ramaswamy\n", + "* http://www.csc.ncsu.edu/faculty/healey/tweet_viz/tweet_app/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Type a keyword into the input field, then click the Query button. Recent tweets that contain your keyword are pulled from Twitter and visualized in the Sentiment tab as circles. Hover your mouse over a tweet or click on it to see its text." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Python data scraping\n", + "====================\n", + "\n", + "* copyrights and permission:\n", + " - be careful and polite\n", + " - give credit\n", + " - care about media law\n", + " - don't be evil (no spam, overloading sites, etc.)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Robots.txt\n", + "==========\n", + "\n", + "![Robots.txt](images/robots_txt.jpg \"Robots.txt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Robots.txt\n", + "==========\n", + "\n", + "* specified by web site owner\n", + "* gives instructions to web robots (aka your script)\n", + "* is located at the top-level directory of the web server\n", + "\n", + "http://www.example.com/robots.txt\n", + "\n", + "If you want you can also have a look at\n", + "\n", + "http://google.com/robots.txt" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Robots.txt\n", + "==========\n", + "\n", + "*** What does this one do? ***" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "\n", + "User-agent: Google\n", + "Disallow:\n", + "\n", + "User-agent: *\n", + "Disallow: /" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Things to consider:\n", + "-------------------\n", + "\n", + "* can be just ignored\n", + "* can be a security risk - *** Why? ***" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Scraping with Python:\n", + "=====================\n", + "\n", + "* scraping is all about HTML tags\n", + "* bad news: \n", + " - need to learn about tags\n", + " - websites can be ugly" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "HTML\n", + "=====\n", + "\n", + "* HyperText Markup Language\n", + "\n", + "* standard for creating webpages\n", + "\n", + "* HTML tags \n", + " - have angle brackets\n", + " - typically come in pairs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "This is an example for a minimal webpage defined in HTML tags. The root tag is `` and then you have the `` tag. This part of the page typically includes the title of the page and might also have other meta information like the author or keywords that are important for search engines. The `` tag marks the actual content of the page. You can play around with the `

` tag trying different header levels. They range from 1 to 6. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "htmlString = \"\"\"\n", + "\n", + " \n", + " This is a title\n", + " \n", + " \n", + "

Test

\n", + "

Hello world!

\n", + " \n", + "\"\"\"\n", + "\n", + "htmlOutput = HTML(htmlString)\n", + "htmlOutput" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Useful Tags\n", + "===========\n", + "\n", + "* heading\n", + "`

...
`\n", + "\n", + "* paragraph\n", + "`

` \n", + "\n", + "* line break\n", + "`
` \n", + "\n", + "* link with attribute\n", + "\n", + "`An example link`\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Scraping with Python:\n", + "=====================\n", + "\n", + "* example of a beautifully simple webpage:\n", + "\n", + "http://www.crummy.com/software/BeautifulSoup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Scraping with Python:\n", + "=====================\n", + "\n", + "* good news: \n", + " - some browsers help\n", + " - look for: inspect element\n", + " - need only basic html\n", + " \n", + "** Try 'Ctrl-Shift I' in Chrome **\n", + "\n", + "** Try 'Command-Option I' in Safari **\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Scraping with Python\n", + "==================\n", + "\n", + "* different useful libraries:\n", + " - urllib\n", + " - beautifulsoup\n", + " - pattern\n", + " - soupy\n", + " - LXML\n", + " - ...\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The following cell just defines a url as a string and then reads the data from that url using the `urllib` library. If you uncomment the print command you see that we got the whole HTML content of the page into the string variable source." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "url = 'http://www.crummy.com/software/BeautifulSoup'\n", + "source = urllib2.urlopen(url).read()\n", + "print source" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Quiz :\n", + "======\n", + "\n", + "* Is the word 'Alice' mentioned on the beautiful soup homepage?\n", + "* How often does the word 'Soup' occur on the site?\n", + " - hint: use `.count()`\n", + "* At what index does the substring 'alien video games' occur?\n", + " - hint: use `.find()`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## is 'Alice' in source?\n", + "\n", + "## count occurrences of 'Soup'\n", + "\n", + "## find index of 'alien video games'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Beautiful Soup\n", + "==============\n", + "\n", + "* designed to make your life easier\n", + "* many good functions for parsing html code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Some examples\n", + "=============\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "## get bs4 object\n", + "soup = bs4.BeautifulSoup(source)\n", + " \n", + "## compare the two print statements\n", + "#print soup\n", + "#print soup.prettify()\n", + "\n", + "## show how to find all a tags\n", + "soup.findAll('a')\n", + "\n", + "## ***Why does this not work? 
***\n", + "#soup.findAll('Soup')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Some examples\n", + "=============" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "## get attribute value from an element:\n", + "## find tag: this only returns the first occurrence, not all tags in the string\n", + "first_tag = soup.find('a')\n", + "\n", + "## get attribute `href`\n", + "first_tag.get('href')\n", + "\n", + "## get all links in the page\n", + "link_list = [l.get('href') for l in soup.findAll('a')]\n", + "link_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## filter all external links\n", + "# create an empty list to collect the valid links\n", + "external_links = []\n", + "\n", + "# write a loop to filter the links\n", + "# if it starts with 'http' we are happy\n", + "for l in link_list:\n", + " if l[:4] == 'http':\n", + " external_links.append(l)\n", + "\n", + "# this throws an error! It says something about 'NoneType'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# lets investigate. Have a close look at the link_list:\n", + "link_list\n", + "\n", + "# Seems that there are None elements!\n", + "# Let's verify\n", + "#print sum([l is None for l in link_list])\n", + "\n", + "# So there are two elements in the list that are None!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Let's filter those objects out in the for loop\n", + "external_links = []\n", + "\n", + "# write a loop to filter the links\n", + "# if it is not None and starts with 'http' we are happy\n", + "for l in link_list:\n", + " if l is not None and l[:4] == 'http':\n", + " external_links.append(l)\n", + " \n", + "external_links" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Note: The above `if` condition works because of short-circuit evaluation in Python. An `and` expression is already `False` if its first part is `False`, so the second part is never evaluated. Thus a `None` entry in the list never gets asked about its first four characters. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# and we can put this in a list comprehension as well, it almost reads like \n", + "# a sentence.\n", + "\n", + "[l for l in link_list if l is not None and l.startswith('http')]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Parsing the Tree\n", + "================\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# redefining `s` without any line breaks\n", + "s = \"\"\"This is a title

Test

Hello world!

\"\"\"\n", + "## get bs4 object\n", + "tree = bs4.BeautifulSoup(s)\n", + "\n", + "## get html root node\n", + "root_node = tree.html\n", + "\n", + "## get head from root using contents\n", + "head = root_node.contents[0]\n", + "\n", + "## get body from root\n", + "body = root_node.contents[1]\n", + "\n", + "## could directly access body\n", + "tree.body" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Quiz:\n", + "=====\n", + "\n", + "* Find the `h3` tag by parsing the tree starting at `body`\n", + "* Create a list of all __Hall of Fame__ entries listed on the Beautiful Soup webpage\n", + " - hint: it is the only unordered list in the page (tag `ul`)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## get h3 tag from body\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## use ul as entry point\n", + "\n", + "\n", + "## get hall of fame list from entry point\n", + "## skip the first entry \n", + "\n", + "## reformat into a list containing strings\n", + "## it is ok to have a list of lists" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "`tmp` now is actually a list of lists containing the hall of fame entries. 
\n", + "Here is some advanced Python on how to print really just one entry per list item.\n", + "\n", + "The cool things about this are: \n", + "* The use of `\"\"` to just access the `join` function of strings.\n", + "* The `join` function itself\n", + "* that you can actually have two nested for loops in a list comprehension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "test = [\"\".join(str(a) for a in sublist) for sublist in tmp]\n", + "print '\\n'.join(test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Advanced Example\n", + "===============\n", + "\n", + "Idea by [Jesse Steinweg-Woods](https://jessesw.com/Data-Science-Skills/)\n", + "--------------------------------------------------------------------------------" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Scraping data science skills\n", + "=============================\n", + "\n", + "- What skills are in demand for data scientists?\n", + "- Should we have a lecture on Spark or only on MapReduce?\n", + "\n", + "We want to scrape the information from job advertisements for data scientists from indeed.com\n", + "Let's scrape and find out!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Fixed url for job postings containing data scientist\n", + "url = 'http://www.indeed.com/jobs?q=data+scientist&l='\n", + "# read the website\n", + "source = urllib2.urlopen(url).read()\n", + "# parse html code\n", + "bs_tree = bs4.BeautifulSoup(source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# see how many job postings we found\n", + "job_count_string = bs_tree.find(id = 'searchCount').contents[0]\n", + "job_count_string = job_count_string.split()[-1]\n", + "print(\"Search yielded %s hits.\" % (job_count_string))\n", + "\n", + "# note that job_count so far is still a string, \n", + "# not an integer, and the , separator prevents \n", + "# us from just casting it to int\n", + "\n", + "job_count_digits = [int(d) for d in job_count_string if d.isdigit()]\n", + "job_count = np.sum([digit*(10**exponent) for digit, exponent in \n", + " zip(job_count_digits[::-1], range(len(job_count_digits)))])\n", + "\n", + "print job_count" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# The website is only listing 10 results per page, \n", + "# so we need to scrape them page after page\n", + "num_pages = int(np.ceil(job_count/10.0))\n", + "\n", + "base_url = 'http://www.indeed.com'\n", + "job_links = []\n", + "for i in range(1): #do range(num_pages) if you want them all\n", + " if i%10==0:\n", + " print num_pages-i\n", + " url = 'http://www.indeed.com/jobs?q=data+scientist&start=' + str(i*10)\n", + " html_page = urllib2.urlopen(url).read() \n", + " bs_tree = bs4.BeautifulSoup(html_page)\n", + " job_link_area = 
bs_tree.find(id = 'resultsCol')\n", + " job_postings = job_link_area.findAll(\"div\")\n", + " job_postings = [jp for jp in job_postings if not jp.get('class') is None \n", + " and ''.join(jp.get('class')) ==\"rowresult\"]\n", + " job_ids = [jp.get('data-jk') for jp in job_postings]\n", + " \n", + " # go after each link\n", + " for id in job_ids:\n", + " job_links.append(base_url + '/rc/clk?jk=' + id)\n", + "\n", + " time.sleep(1)\n", + "\n", + "print \"We found a lot of jobs: \", len(job_links)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Some precautions to enable us to restart our search\n", + "=========================" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "-" + } + }, + "outputs": [], + "source": [ + "# Save the scraped links\n", + "#with open('data/scraped_links.pkl', 'wb') as f:\n", + "# cPickle.dump(job_links, f)\n", + " \n", + "# Read canned scraped links\n", + "with open('data/scraped_links.pkl', 'r') as f:\n", + " job_links = cPickle.load(f) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "skill_set = {'mapreduce': 0, 'spark': 0}\n", + "\n", + "## write initialization into a file, so we can restart later\n", + "#with open('scraped_links_restart.pkl', 'wb') as f:\n", + "# cPickle.dump((skill_set, 0),f) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Python Dictionaries\n", + "==================\n", + "\n", + "* built-in data type\n", + "* uses key: value pairs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "a = {'a': 1, 'b':2}\n", + "print 
a\n", + "\n", + "#show keys\n", + "print a.keys()\n", + "\n", + "#show values\n", + "print a.values()\n", + "\n", + "#show for loop over all entries\n", + "# option 1 using zip\n", + "# this works also for iterating over any\n", + "# other two lists\n", + "for k,v in zip(a.keys(), a.values()):\n", + " print k,v\n", + "\n", + "# option 2 using the dictionary `iteritems()` function\n", + "for k,v in a.iteritems():\n", + " print k,v" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# This code below does the trick, but could be optimized for speed if necessary\n", + "# e.g. skills are typically listed at the end of the webpage\n", + "# might not need to split/join the whole webpage, as we already know\n", + "# which words we are looking for \n", + "# and could stop after the first occurrence of each word\n", + "\n", + "with open('data/scraped_links_restart.pkl', 'r') as f:\n", + " skill_set, index = cPickle.load(f)\n", + " print \"How many websites still to go? 
\", len(job_links) - index\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "counter = 0\n", + "\n", + "for link in job_links[index:]:\n", + " counter +=1 \n", + " \n", + " try:\n", + " html_page = urllib2.urlopen(link).read()\n", + " except urllib2.HTTPError:\n", + " print \"HTTPError:\"\n", + " continue\n", + " except urllib2.URLError:\n", + " print \"URLError:\"\n", + " continue\n", + " except socket.error as error:\n", + " print \"Connection closed\"\n", + " continue\n", + "\n", + " html_text = re.sub(\"[^a-z.+3]\",\" \", html_page.lower()) # replace all but the listed characters\n", + " \n", + " for key in skill_set.keys():\n", + " if key in html_text: \n", + " skill_set[key] +=1\n", + " \n", + " if counter % 5 == 0:\n", + " print len(job_links) - counter - index\n", + " print skill_set\n", + " with open('scraped_links_restart.pkl','wb') as f:\n", + " cPickle.dump((skill_set, index+counter),f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "print skill_set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "pseries = pd.Series(skill_set)\n", + "pseries.sort(ascending=False)\n", + "\n", + "pseries.plot(kind = 'bar')\n", + "## set the title to Score Comparison\n", + "plt.title('Data Science Skills')\n", + "## set the x label\n", + "plt.xlabel('Skills')\n", + "## set the y label\n", + "plt.ylabel('Count')\n", + "## show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Another Example\n", + "================\n", + "\n", + "Designed by Katharine 
Jarmul\n", + "----------------------------\n", + "\n", + "https://github.com/kjam/python-web-scraping-tutorial\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Scraping Happy Hours\n", + "====================\n", + "\n", + "Scrape the happy hour list of LA for personal preferences\n", + "\n", + "http://www.downtownla.com/3_10_happyHours.asp?action=ALL\n", + "\n", + "This example is part of her talk about data scraping at PyCon2014. She is a really good speaker and I enjoyed watching her talk. Check it out: http://www.youtube.com/watch?v=p1iX0uxM1w8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "stuff_i_like = ['burger', 'sushi', 'sweet potato fries', 'BBQ','beer']\n", + "found_happy_hours = []\n", + "my_happy_hours = []\n", + "# First, I'm going to identify the areas of the page I want to look at\n", + "url = 'http://www.downtownla.com/3_10_happyHours.asp?action=ALL'\n", + "source = urllib2.urlopen(url).read()\n", + "tables = bs4.BeautifulSoup(source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Then, I'm going to sort out the *exact* parts of the page\n", + "# that match what I'm looking for...\n", + "for t in tables.findAll('p', {'class': 'calendar_EventTitle'}):\n", + " text = t.text\n", + " for s in t.findNextSiblings():\n", + " text += '\\n' + s.text\n", + " found_happy_hours.append(text)\n", + "\n", + "print \"The scraper found %d happy hours!\" % len(found_happy_hours)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Now I'm going to loop through the 
food I like\n", + "# and see if any of the happy hour descriptions match\n", + "for food in stuff_i_like:\n", + " for hh in found_happy_hours:\n", + " # checking for text AND making sure I don't have duplicates\n", + " if food in hh and hh not in my_happy_hours:\n", + " print \"YAY! I found some %s!\" % food\n", + " my_happy_hours.append(hh)\n", + "\n", + "print \"I think you might like %d of them, yipeeeee!\" % len(my_happy_hours)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# Now, let's make a mail message we can read:\n", + "message = 'Hey Katharine,\\n\\n\\n'\n", + "message += 'OMG, I found some stuff for you in Downtown, take a look.\\n\\n'\n", + "message += '==============================\\n'.join(my_happy_hours)\n", + "message = message.encode('utf-8')\n", + "# To read more about encoding:\n", + "# http://diveintopython.org/xml_processing/unicode.html\n", + "message = message.replace('\\t', '').replace('\\r', '')\n", + "message += '\\n\\nXOXO,\\n Your Py Script'\n", + "\n", + "#print message" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Getting Data with an API\n", + "=========================\n", + "\n", + "* API: application programming interface\n", + "* some sites try to make your life easier\n", + "* Twitter, New York Times, ImDB, rotten Tomatoes, Yelp, ..." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Rotten Tomatoes\n", + "===============\n", + "\n", + "![The Wizard of Oz](images/wiz_oz.png \"The wizard of Oz\")\n", + "\n", + "http://www.rottentomatoes.com/top/\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "API keys\n", + "=========\n", + "\n", + "* required for data access\n", + "* identifies application (you)\n", + "* monitors usage\n", + "* limits rates" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Rotten Tomatoes Key\n", + "===================\n", + "\n", + "http://developer.rottentomatoes.com/member/register" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import json\n", + "import requests\n", + "\n", + "api_key = rottenTomatoes_key()\n", + "\n", + "url = 'http://api.rottentomatoes.com/api/public/v1.0/lists/dvds/top_rentals.json?apikey=' + api_key\n", + "data = urllib2.urlopen(url).read()\n", + "#print data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "JSON\n", + "======\n", + "\n", + "* JavaScript Object Notation\n", + "* human readable\n", + "* transmit attribute-value pairs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "a = {'a': 1, 'b':2}\n", + "s = json.dumps(a)\n", + "a2 = json.loads(s)\n", + "\n", + "## a is a dictionary\n", + "print a\n", + "## vs s is a string containing a in JSON encoding\n", + "print s\n", + "## reading back the keys are now in unicode\n", + "print a2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, 
+ "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## create dictionary from JSON \n", + "dataDict = json.loads(data)\n", + "\n", + "## explore dictionary\n", + "print dataDict.keys()\n", + "\n", + "## there is a key named `movies` containing a list of movies as a value\n", + "movies = dataDict['movies']\n", + "\n", + "## each element of the list `movies` is a dictionary\n", + "print movies[0].keys()\n", + "\n", + "## one of the keys is called `ratings`\n", + "## the value is yet another dictionary\n", + "print movies[0]['ratings'].keys()\n", + "\n", + "## so we made it all the way to find the critics score\n", + "print movies[0]['ratings']['critics_score']\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Quiz\n", + "=====\n", + "\n", + "* build a list with critics scores\n", + "* build a list with audience scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "# critics scores list\n", + "\n", + "\n", + "# audience scores list\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The following code shows how to create a pandas data frame with the data we gathered from the webpage.\n", + "Beware of the `set_index()` function in pandas. Per default it does not change the actual data frame! You need to either reassign the output or set the `inplace` argument to `True`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## create pandas data frame with critics and audience score\n", + "scores = pd.DataFrame(data=[critics_scores, audience_scores]).transpose()\n", + "scores.columns = ['critics', 'audience']\n", + "\n", + "## also create a list with all movie titles\n", + "movie_titles = [m['title'] for m in movies]\n", + "\n", + "## set index of dataFrame BEWARE of inplace!\n", + "scores.set_index([movie_titles])\n", + "\n", + "## the line above does not change scores!\n", + "## You need to either reassign\n", + "\n", + "scores = scores.set_index([movie_titles])\n", + "\n", + "## or set the inplace argument to True\n", + "scores.set_index([movie_titles], inplace=True)\n", + "scores.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## create a bar plot with the data\n", + "## notice that we are using the data frame itself and call its plot function\n", + "scores.plot(kind = 'bar')\n", + "\n", + "## set the title to Score Comparison\n", + "plt.title('Score Comparison')\n", + "\n", + "## set the x label\n", + "plt.xlabel('Movies')\n", + "\n", + "## set the y label\n", + "plt.ylabel('Scores')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "Twitter Example:\n", + "================\n", + "\n", + "* API a bit more complicated\n", + "* libraries make life easier\n", + "* python-twitter\n", + "\n", + "https://github.com/bear/python-twitter\n", + "\n", + "What we are going to 
do is scrape Joe's twitter account, and then filter it for the interesting tweets. Defining interesting as tweets that have been re-tweeted at least 10 times. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import twitter\n", + "\n", + "## define the necessary keys\n", + "cKey = twitterAPI_key()\n", + "cSecret = twitterAPI_secret()\n", + "aKey = twitterAPI_access_token_key()\n", + "aSecret = twitterAPI_access_token_secret()\n", + "\n", + "## create the api object with the twitter-python library\n", + "api = twitter.Api(consumer_key=cKey, consumer_secret=cSecret, access_token_key=aKey, access_token_secret=aSecret)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## get the user timeline with screen_name = 'stat110'\n", + "twitter_statuses = api.GetUserTimeline(screen_name = 'stat110')\n", + "\n", + "## create a data frame\n", + "## first get a list of panda Series or dict\n", + "pdSeriesList = [pd.Series(t.AsDict()) for t in twitter_statuses]\n", + "\n", + "## then create the data frame\n", + "data = pd.DataFrame(pdSeriesList)\n", + "\n", + "data.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "## filter tweets with enough retweet_count\n", + "maybe_interesting = data[data.retweet_count>20]\n", + "\n", + "## get the text of these tweets\n", + "tweet_text = maybe_interesting.text\n", + "\n", + "## print them out\n", + "text = tweet_text.values\n", + "\n", + "for t in text:\n", + " print '######'\n", + " print t" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Extracting 
columns:\n", + "===================\n", + "\n", + "__Warning:__ The returned column `tweet_text` is a `view` on the data\n", + " \n", + "* it is not a copy\n", + "* you change the Series => you change the DataFrame\n", + "\n", + "Below is another example of such a view:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "## create a view for favorite_count on maybe_interesting\n", + "view = maybe_interesting['favorite_count']\n", + "print '-----------------'\n", + "print \"This is view:\"\n", + "print view\n", + "## change a value\n", + "view[8] = 9999\n", + "\n", + "## look at original frame\n", + "print '-----------------'\n", + "print \"This is view after changing view[8]\"\n", + "print view\n", + "\n", + "print '-----------------'\n", + "print \"This is maybe_interesting after changing view[8]\"\n", + "print \"It changed too!\"\n", + "print maybe_interesting['favorite_count']\n", + "\n", + "## to avoid this you can use copy\n", + "independent_data = maybe_interesting['favorite_count'].copy()\n", + "independent_data[10] = 999\n", + "print '-----------------'\n", + "print \"This is independent_data after changed at 10:\"\n", + "print independent_data\n", + "print '-----------------'\n", + "print \"This is maybe_interesting after changing independent_data:\"\n", + "print \"It did not change because we only changed a copy of it\"\n", + "print maybe_interesting['favorite_count']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "What we covered today:\n", + "============\n", + "\n", + "* Pandas data frames\n", + "* Guidelines for friendly scraping\n", + "* Scraping html sites\n", + "* Scraping with Api's\n", + "* Basic data cleanup\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + 
"source": [ + "Further material\n", + "================\n", + "\n", + "* I highly recommend Katharine Jarmul's scraping tutorials\n", + "* For example [this one](https://www.youtube.com/watch?v=p1iX0uxM1w8)\n", + "* Pandas has extensive [documentation](http://pandas.pydata.org/pandas-docs/stable/)\n", + "* Especially the [ten minutes to pandas chapter](http://pandas.pydata.org/pandas-docs/stable/10min.html)\n", + "\n", + "* [Greg Reda](http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/) did a lot more pandas examples for the movie lens data set" + ] + } + ], + "metadata": { + "celltoolbar": "Slideshow", + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Lectures/02-DataScrapingQuizzes.ipynb b/Lectures/02-DataScrapingQuizzes.ipynb index e591011..6eeda04 100755 --- a/Lectures/02-DataScrapingQuizzes.ipynb +++ b/Lectures/02-DataScrapingQuizzes.ipynb @@ -35,6 +35,13 @@ "sns.set_style(\"white\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "THis is a test comment" + ] + }, { "cell_type": "markdown", "metadata": { @@ -2316,7 +2323,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.9" + "version": "2.7.10" } }, "nbformat": 4, From 2a372285600513559cd082af9a88bd36220f260e Mon Sep 17 00:00:00 2001 From: kannab Date: Wed, 9 Sep 2015 23:11:22 -0700 Subject: [PATCH 2/3] added hello.md --- hello.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 hello.md diff --git a/hello.md b/hello.md new file mode 100644 index 0000000..4851bd9 --- /dev/null +++ b/hello.md @@ -0,0 +1 @@ +# Hello world rahuldave From 
ca6f6242c61f175935dc2fdfd6635794658201c8 Mon Sep 17 00:00:00 2001 From: kannab Date: Sun, 18 Oct 2015 19:03:10 -0700 Subject: [PATCH 3/3] commited by BK @ 8:11 AM --- Lectures/02-DataScrapingQuizzes.ipynb | 592 +- Lectures/Lecture4/PandasAndSQL.ipynb | 292 +- Lectures/Lecture4/PandasAndSQL_original.ipynb | 11846 ++++++++++++++++ Lectures/Lecture4/cancont.db | Bin 0 -> 23552 bytes Lectures/Lecture4/candidates_nohead.txt | 17 + 5 files changed, 12601 insertions(+), 146 deletions(-) create mode 100644 Lectures/Lecture4/PandasAndSQL_original.ipynb create mode 100644 Lectures/Lecture4/cancont.db create mode 100644 Lectures/Lecture4/candidates_nohead.txt diff --git a/Lectures/02-DataScrapingQuizzes.ipynb b/Lectures/02-DataScrapingQuizzes.ipynb index 6eeda04..49cf346 100755 --- a/Lectures/02-DataScrapingQuizzes.ipynb +++ b/Lectures/02-DataScrapingQuizzes.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": false, "slideshow": { @@ -162,11 +162,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello CS109\n", + "I love IPython\n" + ] + } + ], "source": [ "print \"Hello CS109\"\n", "\n", @@ -244,14 +253,88 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "collapsed": false, "slideshow": { "slide_type": "-" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexoccupationzip_code
0124Mtechnician85711
1253Fother94043
2323Mwriter32067
3424Mtechnician43537
4533Fother15213
\n", + "
" + ], + "text/plain": [ + " user_id age sex occupation zip_code\n", + "0 1 24 M technician 85711\n", + "1 2 53 F other 94043\n", + "2 3 23 M writer 32067\n", + "3 4 24 M technician 43537\n", + "4 5 33 F other 15213" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# pass in column names for each CSV\n", "u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']\n", @@ -260,7 +343,9 @@ " 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user', \n", " sep='|', names=u_cols)\n", "\n", - "users.head()" + "users.head()\n", + "\n", + "users." ] }, { @@ -277,11 +362,79 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idmovie_idratingunix_timestamp
01962423881250949
11863023891717742
2223771878887116
3244512880606923
41663461886397596
\n", + "
" + ], + "text/plain": [ + " user_id movie_id rating unix_timestamp\n", + "0 196 242 3 881250949\n", + "1 186 302 3 891717742\n", + "2 22 377 1 878887116\n", + "3 244 51 2 880606923\n", + "4 166 346 1 886397596" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n", "ratings = pd.read_csv(\n", @@ -305,11 +458,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# the movies file contains columns indicating the movie's genres\n", "# let's only load the first five columns of the file with usecols\n", @@ -320,7 +484,9 @@ " 'http://files.grouplens.org/datasets/movielens/ml-100k/u.item', \n", " sep='|', names=m_cols, usecols=range(5))\n", "\n", - "movies.head()" + "movies.head()\n", + "\n", + "type(movies)" ] }, { @@ -339,14 +505,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, "outputs": [], "source": [ "print movies.dtypes\n", "print\n", "print movies.describe()\n", - "# *** Why only those two columns? ***" + "# *** Why only those two columns? 
*** \n", + "\n", + "#Only those columns are number based " ] }, { @@ -366,24 +534,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " user_id age sex occupation zip_code\n", + "0 1 24 M technician 85711\n", + "1 2 53 F other 94043\n", + "2 3 23 M writer 32067\n", + "3 4 24 M technician 43537\n", + "4 5 33 F other 15213\n", + "user_id 4\n", + "age 24\n", + "sex M\n", + "occupation technician\n", + "zip_code 43537\n", + "Name: 3, dtype: object\n" + ] + } + ], "source": [ "users.head()\n", "users['occupation'].head()\n", - "## *** Where did the nice design go? ***\n", + "# *** Where did the nice design go? ***\n", "columns_you_want = ['occupation', 'sex'] \n", "users[columns_you_want].head()\n", "\n", "print users.head()\n", "\n", - "print users.iloc[3]" + "print users.iloc[3] # iloc ables you to locate a row" ] }, { @@ -402,12 +589,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "collapsed": false }, - "outputs": [], - "source": [ + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idagesexoccupationzip_code
1253Fother94043
4533Fother15213
5642Mexecutive98101
6757Madministrator91344
7836Madministrator05201
\n", + "
" + ], + "text/plain": [ + " user_id age sex occupation zip_code\n", + "1 2 53 F other 94043\n", + "4 5 33 F other 15213\n", + "5 6 42 M executive 98101\n", + "6 7 57 M administrator 91344\n", + "7 8 36 M administrator 05201" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.age > 25\n", "oldUsers = users[users.age > 25]\n", "oldUsers.head()" ] @@ -430,17 +692,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "33.8131868132\n", + "33.8131868132\n" + ] + } + ], "source": [ "# users aged 40 AND male\n", - "# your code here" + "# your code here\n", + "\n", + "myUsers = users[(users.age == 40) & (users.sex == 'M')]\n", + "\n", + "FUsers = users[users.sex == 'F']\n", + "\n", + "print FUsers.age.mean()\n", + "print FUsers['age'].mean()" ] }, { @@ -509,19 +787,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " user_id movie_id rating unix_timestamp\n", + "0 196 242 3 881250949\n", + "1 186 302 3 891717742\n", + "2 22 377 1 878887116\n", + "3 244 51 2 880606923\n", + "4 166 346 1 886397596\n" + ] + }, + { + "data": { + "text/plain": [ + "user_id\n", + "1 272\n", + "2 62\n", + "3 54\n", + "4 24\n", + "5 175\n", + "Name: movie_id, dtype: int64" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "print ratings.head()\n", "## split data\n", "grouped_data = ratings.groupby('user_id')\n", - "#grouped_data = ratings['movie_id'].groupby(ratings['user_id'])\n", + "\n", + "#print grouped_data.head(5)\n", + "grouped_data = 
ratings['movie_id'].groupby(ratings.user_id)\n", "\n", "## count and combine\n", "ratings_per_user = grouped_data.count()\n", @@ -544,6 +853,93 @@ "* advanced: get the movie titles with the highest average rating" ] }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 3\n", + "1 3\n", + "2 1\n", + "3 2\n", + "4 1\n", + "5 4\n", + "6 2\n", + "7 5\n", + "8 3\n", + "9 3\n", + "10 2\n", + "11 5\n", + "12 5\n", + "13 3\n", + "14 3\n", + "15 3\n", + "16 5\n", + "17 2\n", + "18 4\n", + "19 2\n", + "20 4\n", + "21 4\n", + "22 4\n", + "23 2\n", + "24 4\n", + "25 2\n", + "26 5\n", + "27 2\n", + "28 4\n", + "29 5\n", + " ..\n", + "97401 1\n", + "97438 1\n", + "97592 3\n", + "97623 3\n", + "97649 3\n", + "97792 3\n", + "97801 1\n", + "97860 2\n", + "98086 4\n", + "98106 1\n", + "98323 2\n", + "98345 2\n", + "98427 3\n", + "98468 2\n", + "98615 3\n", + "98640 3\n", + "98649 4\n", + "98687 3\n", + "98703 5\n", + "98706 3\n", + "98732 3\n", + "98767 1\n", + "98819 2\n", + "98828 5\n", + "98949 5\n", + "98955 3\n", + "99177 5\n", + "99614 3\n", + "99749 2\n", + "99953 4\n", + "Name: rating, dtype: int64\n" + ] + } + ], + "source": [ + "\n", + "grouped_data = ratings.rating.groupby(ratings.movie_id)\n", + "\n", + "print grouped_data.head(5)\n", + "\n", + "#avg_rtgs = grouped_data.mean()\n", + "\n", + "#print avg_rtgs.max()\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -620,14 +1016,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 89, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "movie_id\n", + "1 3.878319\n", + "2 3.206107\n", + "3 3.033333\n", + "4 3.550239\n", + "5 3.302326\n", + "6 3.576923\n", + "7 3.798469\n", + "8 3.995434\n", + "9 3.896321\n", + "10 3.831461\n", + "dtype: float64" + ] + }, + "execution_count": 89, + "metadata": {}, + 
"output_type": "execute_result" + } + ], "source": [ "average_ratings = grouped_data.apply(lambda f: f.mean())\n", - "average_ratings.head()" + "average_ratings.head(10)" ] }, { @@ -1056,18 +1474,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 100, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9\n" + ] + } + ], "source": [ - "url = 'http://www.crummy.com/software/BeautifulSoup'\n", + "url = 'http://www.sanclementemedspaservices.com/'\n", "source = urllib2.urlopen(url).read()\n", - "print source" + "#print source\n", + "\n", + "print source.count('skin')" ] }, { @@ -1135,11 +1563,105 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 101, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Home,\n", + " Services ,\n", + " BOTOX,\n", + " Juvederm,\n", + " Juvederm Voluma,\n", + " Radiesse,\n", + " Sculptra,\n", + " Anti-Wrinkle Fillers,\n", + " Laser Skin Resurfacing,\n", + " Laser Hair Removal,\n", + " Laser Vein Removal,\n", + " Facial Rejuvenation,\n", + " Chemical Peel,\n", + " IPL Photorejuvenation,\n", + " Microdermabrasion,\n", + " HydraFacial,\n", + " About,\n", + " Photo Gallery,\n", + " Blog,\n", + " Meet Our Doctor,\n", + " Meet Our Staff,\n", + " Contact,\n", + " \\n\\n
\\n
\\n\\n\"Seaside\\n
\\n
\\n
\\n

Seaside Skin Care

\\n
\\n
,\n", + " 949-234-7809,\n", + " \\n\\n
\\n
,\n", + " \\n Home\\n ,\n", + " Services ,\n", + " BOTOX,\n", + " Juvederm,\n", + " Juvederm Voluma,\n", + " Radiesse,\n", + " Sculptra,\n", + " Anti-Wrinkle Fillers,\n", + " Laser Skin Resurfacing,\n", + " Laser Hair Removal,\n", + " Laser Vein Removal,\n", + " Facial Rejuvenation,\n", + " Chemical Peel,\n", + " IPL Photorejuvenation,\n", + " Microdermabrasion,\n", + " HydraFacial,\n", + " \\n About\\n ,\n", + " \\n Photo Gallery\\n ,\n", + " \\n Blog\\n ,\n", + " \\n Meet Our Doctor\\n ,\n", + " \\n Meet Our Staff\\n ,\n", + " \\n Contact\\n ,\n", + " \\n
\\n
\\n
\\n\"An\\n
\\n
\\n
\\n
\\n\\n\\n

Our Doctor - Dr. Jones M.D.

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"An\\n
\\n
\\n
\\n
\\n\\n\\n

Our Team

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"On\\n
\\n
\\n
\\n
\\n\\n\\n

Our Location - Downtown San Clemente

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"An\\n
\\n
\\n
\\n
\\n\\n\\n

Sandy Hoff - Registered Nurse, Injection & Laser Specialist

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"An\\n
\\n
\\n
\\n
\\n\\n\\n

Lilah Morgan - Medical Esthetician

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"On\\n
\\n
\\n
\\n
\\n\\n\\n

Our Reception Area

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"On\\n
\\n
\\n
\\n
\\n\\n\\n

Our Esthetician Room

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"On\\n
\\n
\\n
\\n
\\n\\n\\n

Our Waiting Area

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"On\\n
\\n
\\n
\\n
\\n\\n\\n

Our Deck

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"A\\n
\\n
\\n
\\n
\\n\\n\\n

NeoCutis Products For Flawless Skin

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"An\\n
\\n
\\n
\\n
\\n\\n\\n

Our Esthetician Maureen's Favorite Product, LATISSE!

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n
\\n
\\n
\\n\"An\\n
\\n
\\n
\\n
\\n\\n\\n

Our Top Selling Lytera Brightening System

\\n
View Image
\\n
\\n
\\n
\\n
,\n", + " \\n\"An\\n\\n\\n,\n", + " \\n\"An\\n\\n\\n,\n", + " \\n\"On\\n\\n\\n,\n", + " \\n\"An\\n\\n\\n,\n", + " \\n\"An\\n\\n\\n,\n", + " \\n\"On\\n\\n\\n,\n", + " \\n\"On\\n\\n\\n,\n", + " \\n\"On\\n\\n\\n,\n", + " \\n\"On\\n\\n\\n,\n", + " \\n\"A\\n\\n\\n,\n", + " \\n\"An\\n\\n\\n,\n", + " \\n\"An\\n\\n\\n,\n", + " ,\n", + " Read MoreR,\n", + " Read MoreR,\n", + " Read MoreR,\n", + " Read MoreR,\n", + " \\n\\n
\\n
,\n", + " \\n\\n\\t \\tRead More Reviews t,\n", + " Print Offer,\n", + " 949-234-7809,\n", + " Get Directions,\n", + " Privacy Policy,\n", + " Site Map,\n", + " G]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "## get bs4 object\n", "soup = bs4.BeautifulSoup(source)\n", diff --git a/Lectures/Lecture4/PandasAndSQL.ipynb b/Lectures/Lecture4/PandasAndSQL.ipynb index be2c58a..007f971 100644 --- a/Lectures/Lecture4/PandasAndSQL.ipynb +++ b/Lectures/Lecture4/PandasAndSQL.ipynb @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "metadata": { "collapsed": true }, @@ -125,6 +125,50 @@ "\"\"\"" ] }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "DROP TABLE IF EXISTS \"candidates\";\n", + "DROP TABLE IF EXISTS \"contributors\";\n", + "CREATE TABLE \"candidates\" (\n", + " \"id\" INTEGER PRIMARY KEY NOT NULL ,\n", + " \"first_name\" VARCHAR,\n", + " \"last_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"party\" VARCHAR NOT NULL\n", + ");\n", + "CREATE TABLE \"contributors\" (\n", + " \"id\" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,\n", + " \"last_name\" VARCHAR,\n", + " \"first_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"street_1\" VARCHAR,\n", + " \"street_2\" VARCHAR,\n", + " \"city\" VARCHAR,\n", + " \"state\" VARCHAR,\n", + " \"zip\" VARCHAR,\n", + " \"amount\" INTEGER,\n", + " \"date\" DATETIME,\n", + " \"candidate_id\" INTEGER NOT NULL,\n", + " FOREIGN KEY(candidate_id) REFERENCES candidates(id)\n", + ");\n", + "\n" + ] + } + ], + "source": [ + "print ourschema" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -156,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 23, "metadata": { "collapsed": false }, @@ -179,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": 
4, + "execution_count": 35, "metadata": { "collapsed": true }, @@ -202,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 36, "metadata": { "collapsed": false }, @@ -384,7 +428,7 @@ "16 41 Fred Thompson D. R" ] }, - "execution_count": 99, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -396,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 37, "metadata": { "collapsed": false }, @@ -512,7 +556,7 @@ "4 NaN Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" ] }, - "execution_count": 6, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -524,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 38, "metadata": { "collapsed": false }, @@ -634,7 +678,7 @@ "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" ] }, - "execution_count": 7, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -660,7 +704,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 39, "metadata": { "collapsed": false }, @@ -678,7 +722,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 40, "metadata": { "collapsed": false }, @@ -689,7 +733,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 41, "metadata": { "collapsed": true }, @@ -700,7 +744,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 42, "metadata": { "collapsed": false }, @@ -711,7 +755,7 @@ "(175, 11)" ] }, - "execution_count": 11, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -722,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 43, "metadata": { "collapsed": false }, @@ -736,7 +780,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 44, "metadata": { "collapsed": false }, @@ -763,7 +807,7 @@ " (41, u'Fred', 
u'Thompson', u'D.', u'R')]" ] }, - "execution_count": 13, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -774,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 51, "metadata": { "collapsed": true }, @@ -789,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 52, "metadata": { "collapsed": false }, @@ -800,7 +844,7 @@ "[]" ] }, - "execution_count": 15, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -825,7 +869,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 53, "metadata": { "collapsed": false }, @@ -924,7 +968,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 54, "metadata": { "collapsed": true }, @@ -937,7 +981,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 55, "metadata": { "collapsed": false }, @@ -964,7 +1008,7 @@ " (41, u'Fred', u'Thompson', u'D.', u'R')]" ] }, - "execution_count": 18, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -975,7 +1019,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 57, "metadata": { "collapsed": true }, @@ -1004,7 +1048,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 58, "metadata": { "collapsed": false }, @@ -1017,7 +1061,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 59, "metadata": { "collapsed": false }, @@ -1044,7 +1088,7 @@ " (41, u'Fred', u'Thompson', u'D.', u'R')]" ] }, - "execution_count": 21, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1142,7 +1186,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 60, "metadata": { "collapsed": false }, @@ -1252,7 +1296,7 @@ "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" ] }, - "execution_count": 22, + "execution_count": 60, "metadata": {}, "output_type": 
"execute_result" } @@ -1261,6 +1305,28 @@ "dfcwci.head()" ] }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.core.frame.DataFrame" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(dfcwci)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1270,7 +1336,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 64, "metadata": { "collapsed": false }, @@ -1365,7 +1431,7 @@ "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35" ] }, - "execution_count": 23, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1376,7 +1442,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 65, "metadata": { "collapsed": false }, @@ -1471,7 +1537,7 @@ "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35" ] }, - "execution_count": 24, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -1482,7 +1548,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 68, "metadata": { "collapsed": false }, @@ -1504,7 +1570,7 @@ " u'candidate_id']" ] }, - "execution_count": 25, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } @@ -1516,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 69, "metadata": { "collapsed": true }, @@ -1531,7 +1597,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 74, "metadata": { "collapsed": false }, @@ -1540,7 +1606,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "[(28, u'Buckheit', u'Bruce', None, u'8904 KAREN DR', None, u'FAIRFAX', u'VA', u'220312731', 100, u'2007-09-19', 20, u'junk'), (78, u'Ranganath', u'Anoop', None, u'2507 Willard Drive', None, u'Charlottesville', u'VA', u'22903', -100, 
u'2008-04-21', 32, u'junk'), (89, u'Perreault', u'Louise', None, u'503 Brockridge Hunt Drive', None, u'Hampton', u'VA', u'23666', -34.08, u'2008-04-21', 32, u'junk'), (146, u'ABDELLA', u'THOMAS', u'M.', u'4231 MONUMENT WALL WAY #340', None, u'FAIRFAX', u'VA', u'220308440', 50, u'2007-09-30', 35, u'junk')]\n" + "[(28, u'Buckheit', u'Bruce', None, u'8904 KAREN DR', None, u'FAIRFAX', u'VA', u'220312731', 100, u'2007-09-19', 20), (78, u'Ranganath', u'Anoop', None, u'2507 Willard Drive', None, u'Charlottesville', u'VA', u'22903', -100, u'2008-04-21', 32), (89, u'Perreault', u'Louise', None, u'503 Brockridge Hunt Drive', None, u'Hampton', u'VA', u'23666', -34.08, u'2008-04-21', 32), (146, u'ABDELLA', u'THOMAS', u'M.', u'4231 MONUMENT WALL WAY #340', None, u'FAIRFAX', u'VA', u'220308440', 50, u'2007-09-30', 35)]\n" ] }, { @@ -1638,7 +1704,7 @@ "3 146 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 None FAIRFAX VA 220308440 50.00 2007-09-30 35" ] }, - "execution_count": 145, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } @@ -1651,7 +1717,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 75, "metadata": { "collapsed": false }, @@ -1703,7 +1769,7 @@ "0 126 BOURNE TRAVIS None LAGE KAART 77 None BRASSCHATT None 2930 -500 2008-11-20 35" ] }, - "execution_count": 28, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } @@ -1776,7 +1842,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 79, "metadata": { "collapsed": false }, @@ -1787,7 +1853,7 @@ "(174, 12)" ] }, - "execution_count": 30, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1799,7 +1865,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 80, "metadata": { "collapsed": false }, @@ -1810,7 +1876,7 @@ "(174, 11)" ] }, - "execution_count": 31, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -1821,7 +1887,7 @@ }, { "cell_type": "code", - 
"execution_count": 32, + "execution_count": 82, "metadata": { "collapsed": false }, @@ -1985,7 +2051,7 @@ "7 146 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 None FAIRFAX VA 220308440 50.00 2007-09-30 35" ] }, - "execution_count": 32, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -2163,7 +2229,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 83, "metadata": { "collapsed": false }, @@ -2359,7 +2425,7 @@ "9 105 Aaron Shirley None 101 Cherry Ave None Havana FL 323331311 50 2008-02-29 34" ] }, - "execution_count": 34, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -2371,7 +2437,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 84, "metadata": { "collapsed": false }, @@ -2556,7 +2622,7 @@ "104 Aaron Shirley NaN 101 Cherry Ave NaN Havana FL 323331311 50 2008-02-29 34" ] }, - "execution_count": 35, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -2566,15 +2632,19 @@ ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": 86, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ "###SORT" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 87, "metadata": { "collapsed": false }, @@ -2759,7 +2829,7 @@ "55 BUSH ERIC NaN P.O. BOX 61046 NaN DENVER CO 802061046 -2300 2008-03-06 22" ] }, - "execution_count": 36, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -2770,7 +2840,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 88, "metadata": { "collapsed": false }, @@ -2955,7 +3025,7 @@ "174 ABRAHAM SALEM A. P.O. 
BOX 7 NaN CANADIAN TX 790140007 1300 2008-01-30 37" ] }, - "execution_count": 37, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -3389,7 +3459,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 89, "metadata": { "collapsed": false }, @@ -3475,7 +3545,7 @@ "9 John D. 1300" ] }, - "execution_count": 40, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } @@ -3486,7 +3556,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 90, "metadata": { "collapsed": false }, @@ -3572,7 +3642,7 @@ "9 John D. 1300" ] }, - "execution_count": 41, + "execution_count": 90, "metadata": {}, "output_type": "execute_result" } @@ -3841,7 +3911,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 92, "metadata": { "collapsed": false }, @@ -4037,7 +4107,7 @@ "9 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1300 2007-06-29 16 Allen, John D." ] }, - "execution_count": 46, + "execution_count": 92, "metadata": {}, "output_type": "execute_result" } @@ -4049,7 +4119,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 93, "metadata": { "collapsed": false }, @@ -4256,7 +4326,7 @@ "9 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1300 2007-06-29 16 Allen, John D. Allen:John D." 
] }, - "execution_count": 47, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } @@ -4403,7 +4473,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 94, "metadata": { "collapsed": false }, @@ -4419,7 +4489,7 @@ "Name: name, dtype: object" ] }, - "execution_count": 49, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -4430,7 +4500,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 95, "metadata": { "collapsed": false }, @@ -4441,7 +4511,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 96, "metadata": { "collapsed": false }, @@ -4557,7 +4627,7 @@ "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35 junk" ] }, - "execution_count": 51, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -4577,7 +4647,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 97, "metadata": { "collapsed": false }, @@ -4585,10 +4655,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 52, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } @@ -4600,7 +4670,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 99, "metadata": { "collapsed": false }, @@ -4623,7 +4693,7 @@ " (12, u'name', u'', 0, None, 0)]" ] }, - "execution_count": 53, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -4634,7 +4704,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 100, "metadata": { "collapsed": false }, @@ -4819,7 +4889,7 @@ " (u'ABRAHAM, SALEM', 175)]" ] }, - "execution_count": 54, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -4832,7 +4902,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 101, "metadata": { "collapsed": false }, @@ -4845,7 +4915,7 @@ }, { "cell_type": "code", - "execution_count": 56, + 
"execution_count": 102, "metadata": { "collapsed": true }, @@ -4856,7 +4926,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 103, "metadata": { "collapsed": false }, @@ -5063,7 +5133,7 @@ "9 10 Allen John D. None 1052 Cannon Mill Drive None North Augusta SC 29860 1300 2007-06-29 16 Allen, John D." ] }, - "execution_count": 57, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -5082,7 +5152,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 104, "metadata": { "collapsed": false }, @@ -5095,7 +5165,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 105, "metadata": { "collapsed": false }, @@ -5217,7 +5287,7 @@ "4 146 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 None FAIRFAX VA 220308440 50.00 2007-09-30 35 junk" ] }, - "execution_count": 62, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -5238,7 +5308,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 106, "metadata": { "collapsed": false }, @@ -5250,7 +5320,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ALTER TABLE contributors DROP COLUMN name;\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ALTER TABLE contributors DROP COLUMN name;\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mOperationalError\u001b[0m: near \"DROP\": syntax error" ] } @@ -5270,7 +5340,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 107, "metadata": { "collapsed": false }, @@ -5288,7 +5358,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 110, "metadata": { "collapsed": false }, @@ -5371,7 +5441,7 @@ "max 9.951532e+08 4600.000000 37.000000" ] }, - "execution_count": 66, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -5382,7 +5452,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 111, "metadata": { "collapsed": false }, @@ -5393,7 +5463,7 @@ "4600.0" ] }, - "execution_count": 67, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } @@ -5404,7 +5474,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 112, "metadata": { "collapsed": false }, @@ -5454,7 +5524,7 @@ "30 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 4600 2007-08-14 20" ] }, - "execution_count": 68, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -5465,7 +5535,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 113, "metadata": { "collapsed": false }, @@ -5526,7 +5596,7 @@ "0 31 Buckel Linda 
None PO Box 683130 None Park City UT 840683130 4600 2007-08-14 20 Buckel, Linda" ] }, - "execution_count": 69, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -5539,7 +5609,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 114, "metadata": { "collapsed": false }, @@ -5559,7 +5629,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 115, "metadata": { "collapsed": false }, @@ -5579,7 +5649,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 116, "metadata": { "collapsed": false }, @@ -5644,18 +5714,18 @@ "159 ABATE MARIA ELENA 1291 NIGHTINGALE AVENUE NaN MIAMI SPRINGS FL 331663832 2600 2008-01-25 37" ] }, - "execution_count": 72, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dfcwci[dfcwci.amount > dfcwci.amount.max() - 2300]" + " dfcwci[dfcwci.amount > dfcwci.amount.max() - 2300]" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 117, "metadata": { "collapsed": false }, @@ -5723,7 +5793,7 @@ "1 160 ABATE MARIA ELENA 1291 NIGHTINGALE AVENUE None MIAMI SPRINGS FL 331663832 2600 2008-01-25 37" ] }, - "execution_count": 73, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } @@ -5749,7 +5819,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 118, "metadata": { "collapsed": false }, @@ -6028,7 +6098,7 @@ "WA 2941290251 -500.00 90" ] }, - "execution_count": 74, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } @@ -6039,7 +6109,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 122, "metadata": { "collapsed": false }, @@ -6086,7 +6156,7 @@ "Name: amount, dtype: float64" ] }, - "execution_count": 75, + "execution_count": 122, "metadata": {}, "output_type": "execute_result" } @@ -6097,7 +6167,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 120, "metadata": { "collapsed": 
false }, @@ -6111,7 +6181,7 @@ " 'AK', 'LA', 'AZ'], dtype=object)" ] }, - "execution_count": 76, + "execution_count": 120, "metadata": {}, "output_type": "execute_result" } @@ -6122,7 +6192,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 121, "metadata": { "collapsed": false }, @@ -6364,7 +6434,7 @@ "35 WA -500.00" ] }, - "execution_count": 77, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } diff --git a/Lectures/Lecture4/PandasAndSQL_original.ipynb b/Lectures/Lecture4/PandasAndSQL_original.ipynb new file mode 100644 index 0000000..be2c58a --- /dev/null +++ b/Lectures/Lecture4/PandasAndSQL_original.ipynb @@ -0,0 +1,11846 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Pandas, SQL, and the Grammar of Data\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Table of Contents\n", + "* [Pandas, SQL, and the Grammar of Data](#Pandas,-SQL,-and-the-Grammar-of-Data)\n", + "\t* [Populating the Database](#Populating-the-Database)\n", + "\t\t* [SQLITE](#SQLITE)\n", + "\t\t* [Init](#Init)\n", + "\t\t\t* [Populating with Pandas!!](#Populating-with-Pandas!!)\n", + "\t\t\t* [Or populate with SQL INSERT](#Or-populate-with-SQL-INSERT)\n", + "\t\t\t* [Bulk insert](#Bulk-insert)\n", + "\t* [Single Table Verbs](#Single-Table-Verbs)\n", + "\t\t* [QUERY](#QUERY)\n", + "\t\t* [SORT](#SORT)\n", + "\t\t* [SELECT-COLUMNS](#SELECT-COLUMNS)\n", + "\t\t* [SELECT-DISTINCT](#SELECT-DISTINCT)\n", + "\t\t* [ASSIGN](#ASSIGN)\n", + "\t\t* [AGGREGATE](#AGGREGATE)\n", + "\t\t* [GROUP-AGG](#GROUP-AGG)\n", + "\t\t* [DELETE](#DELETE)\n", + "\t\t* [LIMIT](#LIMIT)\n", + "\t* [Indexes](#Indexes)\n", + "\t* [Relationships: JOINs are Cartesian Products.](#Relationships:-JOINs-are-Cartesian-Products.)\n", + "\t\t* [Simple subselect](#Simple-subselect)\n", + "\t\t* [implicit join](#implicit-join)\n", + "\t\t* [Explicit INNER JOIN](#Explicit-INNER-JOIN)\n", + "\t\t* [Outer 
JOIN](#Outer-JOIN)\n", + "\t\t\t* [left outer (contributors on candidates)](#left-outer-%28contributors-on-candidates%29)\n", + "\t\t\t* [right outer (contributors on candidates) = left outer (candidates on contributors)](#right-outer-%28contributors-on-candidates%29-=-left-outer-%28candidates-on-contributors%29)\n", + "\t\t\t* [full outer](#full-outer)\n", + "\t* [Pandas /SQL](#Pandas-/SQL)\n", + "\t* [Useful Links](#Useful-Links)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The %... is an iPython thing, and is not part of the Python language.\n", + "# In this case we're just telling the plotting library to draw things on\n", + "# the notebook, instead of on a separate window.\n", + "%matplotlib inline\n", + "# See all the \"as ...\" contructs? They're just aliasing the package names.\n", + "# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().\n", + "import numpy as np\n", + "import scipy as sp\n", + "import matplotlib as mpl\n", + "import matplotlib.cm as cm\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import time\n", + "pd.set_option('display.width', 500)\n", + "pd.set_option('display.max_columns', 100)\n", + "pd.set_option('display.notebook_repr_html', True)\n", + "import seaborn as sns\n", + "sns.set_style(\"whitegrid\")\n", + "sns.set_context(\"poster\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Populating the Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lets start with Relational Databases, so called because they contain \"relations\" (tables), which are SETS of \"tuples\" (rows) which map \"attributes\" to atomic values.\n", + "\n", + "The available attributes are constrained by a \"header\" tuple of attributes which set the type. We do this below here." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "ourschema=\"\"\"\n", + "DROP TABLE IF EXISTS \"candidates\";\n", + "DROP TABLE IF EXISTS \"contributors\";\n", + "CREATE TABLE \"candidates\" (\n", + " \"id\" INTEGER PRIMARY KEY NOT NULL ,\n", + " \"first_name\" VARCHAR,\n", + " \"last_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"party\" VARCHAR NOT NULL\n", + ");\n", + "CREATE TABLE \"contributors\" (\n", + " \"id\" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,\n", + " \"last_name\" VARCHAR,\n", + " \"first_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"street_1\" VARCHAR,\n", + " \"street_2\" VARCHAR,\n", + " \"city\" VARCHAR,\n", + " \"state\" VARCHAR,\n", + " \"zip\" VARCHAR,\n", + " \"amount\" INTEGER,\n", + " \"date\" DATETIME,\n", + " \"candidate_id\" INTEGER NOT NULL,\n", + " FOREIGN KEY(candidate_id) REFERENCES candidates(id)\n", + ");\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SQLITE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use sqlite here (and recommend Postgres for production purposes). Still sqlite is great for on-disk large databases which wont fit into memory. \n", + "\n", + "Its also built into Python, but to use the [command line tool](https://www.sqlite.org/cli.html), I recommend you install it: https://www.sqlite.org/download.html. I also recommend you download and install the sqlite browser: http://sqlitebrowser.org .\n", + "\n", + "Python implements a standard database API over all databases. Its called [DBAPI2](http://cewing.github.io/training.codefellows/lectures/day21/intro_to_dbapi2.html). It works across many SQL databases.\n", + "\n", + "There is an even higher level API available, called [SQLAlchemy](http://www.sqlalchemy.org). While we wont use it here, I thoroughly recommend it, either in its direct relational form, or ORM form. 
Many things in Pandas use it to interface with databases. Here we'll get away with things by using SQLITE." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Sqlite is a text or memory based database. Connect and get a DBAPI2 connection." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sqlite3 import dbapi2 as sq3\n", + "import os\n", + "PATHSTART=\".\"\n", + "def get_db(dbfile):\n", + " sqlite_db = sq3.connect(os.path.join(PATHSTART, dbfile))\n", + " return sqlite_db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop tables if they exist and create them." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def init_db(dbfile, schema):\n", + " \"\"\"Creates the database tables.\"\"\"\n", + " db = get_db(dbfile)\n", + " db.cursor().executescript(schema)\n", + " db.commit()\n", + " return db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use Pandas to read in the data" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_namemiddle_nameparty
033JosephBidenNaND
136SamuelBrownbackNaNR
234HillaryClintonR.D
339ChristopherDoddJ.D
426JohnEdwardsNaND
522RudolphGiulianiNaNR
624MikeGravelNaND
716MikeHuckabeeNaNR
830DuncanHunterNaNR
931DennisKucinichNaND
1037JohnMcCainNaNR
1120BarackObamaNaND
1232RonPaulNaNR
1329BillRichardsonNaND
1435MittRomneyNaNR
1538TomTancredoNaNR
1641FredThompsonD.R
\n", + "
" + ], + "text/plain": [ + " id first_name last_name middle_name party\n", + "0 33 Joseph Biden NaN D\n", + "1 36 Samuel Brownback NaN R\n", + "2 34 Hillary Clinton R. D\n", + "3 39 Christopher Dodd J. D\n", + "4 26 John Edwards NaN D\n", + "5 22 Rudolph Giuliani NaN R\n", + "6 24 Mike Gravel NaN D\n", + "7 16 Mike Huckabee NaN R\n", + "8 30 Duncan Hunter NaN R\n", + "9 31 Dennis Kucinich NaN D\n", + "10 37 John McCain NaN R\n", + "11 20 Barack Obama NaN D\n", + "12 32 Ron Paul NaN R\n", + "13 29 Bill Richardson NaN D\n", + "14 35 Mitt Romney NaN R\n", + "15 38 Tom Tancredo NaN R\n", + "16 41 Fred Thompson D. R" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcand=pd.read_csv(\"./candidates.txt\", sep='|')\n", + "dfcand" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0NaNAgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
1NaNAhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616
2NaNAhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816
3NaNAhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116
4NaNAkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 NaN Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "1 NaN Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16\n", + "2 NaN Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16\n", + "3 NaN Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16\n", + "4 NaN Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci=pd.read_csv(\"./contributors_with_candidate_id.txt\", sep=\"|\")\n", + "dfcwci.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
1AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616
2AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816
3AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "1 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16\n", + "2 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16\n", + "3 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "del dfcwci['id']\n", + "dfcwci.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Init" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initializing the database" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "db=init_db(\"cancont.db\", ourschema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####Populating with Pandas!!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dfcand.to_sql(\"candidates\", db, if_exists=\"append\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "dfcwci.to_sql(\"contributors\", db, if_exists=\"append\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(175, 11)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sel=\"\"\"\n", + "SELECT * FROM candidates;\n", + "\"\"\"\n", + "c=db.cursor().execute(sel)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(16, u'Mike', u'Huckabee', None, u'R'),\n", + " (20, u'Barack', u'Obama', None, u'D'),\n", + " (22, u'Rudolph', u'Giuliani', None, u'R'),\n", + " (24, u'Mike', u'Gravel', None, u'D'),\n", + " (26, u'John', u'Edwards', None, u'D'),\n", + " (29, u'Bill', u'Richardson', None, u'D'),\n", + " (30, u'Duncan', u'Hunter', None, u'R'),\n", + " (31, u'Dennis', u'Kucinich', None, u'D'),\n", + " (32, u'Ron', u'Paul', None, u'R'),\n", + " (33, u'Joseph', u'Biden', None, u'D'),\n", + " (34, u'Hillary', u'Clinton', u'R.', u'D'),\n", + " (35, u'Mitt', u'Romney', None, u'R'),\n", + " (36, u'Samuel', u'Brownback', None, u'R'),\n", + " (37, u'John', u'McCain', None, u'R'),\n", + " (38, u'Tom', u'Tancredo', None, u'R'),\n", + " (39, u'Christopher', u'Dodd', u'J.', u'D'),\n", + " (41, u'Fred', u'Thompson', u'D.', u'R')]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "c.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rem=\"\"\"\n", + "DELETE FROM candidates;\n", + "\"\"\"\n", + "c=db.cursor().execute(rem)\n", + "db.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "c.fetchall()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####Or populate with SQL INSERT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![xkcd-sqlinj](http://imgs.xkcd.com/comics/exploits_of_a_mom.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "33 Joseph Biden D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(33, 'Joseph', 'Biden', '', 'D')\n", + "36 Samuel Brownback R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(36, 'Samuel', 'Brownback', '', 'R')\n", + "34 Hillary Clinton R. D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(34, 'Hillary', 'Clinton', 'R.', 'D')\n", + "39 Christopher Dodd J. 
D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(39, 'Christopher', 'Dodd', 'J.', 'D')\n", + "26 John Edwards D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(26, 'John', 'Edwards', '', 'D')\n", + "22 Rudolph Giuliani R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(22, 'Rudolph', 'Giuliani', '', 'R')\n", + "24 Mike Gravel D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(24, 'Mike', 'Gravel', '', 'D')\n", + "16 Mike Huckabee R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(16, 'Mike', 'Huckabee', '', 'R')\n", + "30 Duncan Hunter R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(30, 'Duncan', 'Hunter', '', 'R')\n", + "31 Dennis Kucinich D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(31, 'Dennis', 'Kucinich', '', 'D')\n", + "37 John McCain R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(37, 'John', 'McCain', '', 'R')\n", + "20 Barack Obama D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(20, 'Barack', 'Obama', '', 'D')\n", + "32 Ron Paul R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(32, 'Ron', 'Paul', '', 'R')\n", + "29 Bill Richardson D\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(29, 'Bill', 'Richardson', '', 'D')\n", + "35 Mitt Romney R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES 
(?,?,?,?,?);\n", + "(35, 'Mitt', 'Romney', '', 'R')\n", + "38 Tom Tancredo R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(38, 'Tom', 'Tancredo', '', 'R')\n", + "41 Fred Thompson D. R\n", + "\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) VALUES (?,?,?,?,?);\n", + "(41, 'Fred', 'Thompson', 'D.', 'R')\n" + ] + } + ], + "source": [ + "ins=\"\"\"\n", + "INSERT INTO candidates (id, first_name, last_name, middle_name, party) \\\n", + " VALUES (?,?,?,?,?);\n", + "\"\"\"\n", + "with open(\"candidates.txt\") as fd:\n", + " slines =[l.strip().split('|') for l in fd.readlines()]\n", + " for line in slines[1:]:\n", + " theid, first_name, last_name, middle_name, party = line\n", + " print theid, first_name, last_name, middle_name, party\n", + " valstoinsert = (int(theid), first_name, last_name, middle_name, party)\n", + " print ins, valstoinsert\n", + " db.cursor().execute(ins, valstoinsert)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def make_query(sel):\n", + " c=db.cursor().execute(sel)\n", + " return c.fetchall()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(16, u'Mike', u'Huckabee', u'', u'R'),\n", + " (20, u'Barack', u'Obama', u'', u'D'),\n", + " (22, u'Rudolph', u'Giuliani', u'', u'R'),\n", + " (24, u'Mike', u'Gravel', u'', u'D'),\n", + " (26, u'John', u'Edwards', u'', u'D'),\n", + " (29, u'Bill', u'Richardson', u'', u'D'),\n", + " (30, u'Duncan', u'Hunter', u'', u'R'),\n", + " (31, u'Dennis', u'Kucinich', u'', u'D'),\n", + " (32, u'Ron', u'Paul', u'', u'R'),\n", + " (33, u'Joseph', u'Biden', u'', u'D'),\n", + " (34, u'Hillary', u'Clinton', u'R.', u'D'),\n", + " (35, u'Mitt', u'Romney', u'', u'R'),\n", + " (36, u'Samuel', u'Brownback', u'', u'R'),\n", + 
" (37, u'John', u'McCain', u'', u'R'),\n", + " (38, u'Tom', u'Tancredo', u'', u'R'),\n", + " (39, u'Christopher', u'Dodd', u'J.', u'D'),\n", + " (41, u'Fred', u'Thompson', u'D.', u'R')]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "make_query(\"SELECT * FROM candidates;\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rem=\"\"\"\n", + "DELETE FROM candidates;\n", + "\"\"\"\n", + "c=db.cursor().execute(rem)\n", + "db.commit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####Bulk insert" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You almost always have to do this from the command line. Its typically faster, but also different foe every database" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%%bash\n", + "tail -n +2 candidates.txt > candidates_nohead.txt\n", + "echo \".import candidates_nohead.txt candidates\" | sqlite3 cancont.db" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(16, u'Mike', u'Huckabee', u'', u'R'),\n", + " (20, u'Barack', u'Obama', u'', u'D'),\n", + " (22, u'Rudolph', u'Giuliani', u'', u'R'),\n", + " (24, u'Mike', u'Gravel', u'', u'D'),\n", + " (26, u'John', u'Edwards', u'', u'D'),\n", + " (29, u'Bill', u'Richardson', u'', u'D'),\n", + " (30, u'Duncan', u'Hunter', u'', u'R'),\n", + " (31, u'Dennis', u'Kucinich', u'', u'D'),\n", + " (32, u'Ron', u'Paul', u'', u'R'),\n", + " (33, u'Joseph', u'Biden', u'', u'D'),\n", + " (34, u'Hillary', u'Clinton', u'R.', u'D'),\n", + " (35, u'Mitt', u'Romney', u'', u'R'),\n", + " (36, u'Samuel', u'Brownback', u'', u'R'),\n", + " (37, u'John', u'McCain', u'', u'R'),\n", + " (38, u'Tom', u'Tancredo', u'', 
u'R'),\n", + " (39, u'Christopher', u'Dodd', u'J.', u'D'),\n", + " (41, u'Fred', u'Thompson', u'D.', u'R')]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "make_query(\"SELECT * FROM candidates;\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Single Table Verbs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us now focus on core data manipulation commands. The reason to do this is that they are *universal across systems, and by identifying them, we can quickly ask how to do these* when we encounter a new system." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "See https://gist.github.com/TomAugspurger/6e052140eaa5fdb6e8c0/ which has a comparison of r/dplyr and pandas. I stole and modified this table from there:\n", + "\n", + "``dplyr`` has a small set of nicely defined verbs. I've listed their closest pandas verbs.\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VERBdplyrpandasSQL
QUERY/SELECTIONfilter() (and slice())query() (and loc[], iloc[])SELECT WHERE
SORTarrange()sort()ORDER BY
SELECT-COLUMNS/PROJECTIONselect() (and rename())[](__getitem__) (and rename())SELECT COLUMN
SELECT-DISTINCTdistinct()unique(),drop_duplicates()SELECT DISTINCT COLUMN
ASSIGNmutate() (and transmute())assignALTER/UPDATE
AGGREGATEsummarise()describe(), mean(), max()None, AVG(),MAX()
SAMPLEsample_n() and sample_frac()sample()implementation dep, use RAND()
GROUP-AGGgroup_by/summarizegroupby/agg, count, meanGROUP BY
DELETE?drop/maskingDELETE/WHERE
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
1AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616
2AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816
3AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "1 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16\n", + "2 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16\n", + "3 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###QUERY" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
27BuckheitBruceNaN8904 KAREN DRNaNFAIRFAXVA220312731100.002007-09-1920
77RanganathAnoopNaN2507 Willard DriveNaNCharlottesvilleVA22903-100.002008-04-2132
88PerreaultLouiseNaN503 Brockridge Hunt DriveNaNHamptonVA23666-34.082008-04-2132
145ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NaNFAIRFAXVA22030844050.002007-09-3035
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "27 Buckheit Bruce NaN 8904 KAREN DR NaN FAIRFAX VA 220312731 100.00 2007-09-19 20\n", + "77 Ranganath Anoop NaN 2507 Willard Drive NaN Charlottesville VA 22903 -100.00 2008-04-21 32\n", + "88 Perreault Louise NaN 503 Brockridge Hunt Drive NaN Hampton VA 23666 -34.08 2008-04-21 32\n", + "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.query(\"state=='VA' & amount < 400\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
27BuckheitBruceNaN8904 KAREN DRNaNFAIRFAXVA220312731100.002007-09-1920
77RanganathAnoopNaN2507 Willard DriveNaNCharlottesvilleVA22903-100.002008-04-2132
88PerreaultLouiseNaN503 Brockridge Hunt DriveNaNHamptonVA23666-34.082008-04-2132
145ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NaNFAIRFAXVA22030844050.002007-09-3035
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "27 Buckheit Bruce NaN 8904 KAREN DR NaN FAIRFAX VA 220312731 100.00 2007-09-19 20\n", + "77 Ranganath Anoop NaN 2507 Willard Drive NaN Charlottesville VA 22903 -100.00 2008-04-21 32\n", + "88 Perreault Louise NaN 503 Brockridge Hunt Drive NaN Hampton VA 23666 -34.08 2008-04-21 32\n", + "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[(dfcwci.state=='VA') & (dfcwci.amount < 400)]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[u'id',\n", + " u'last_name',\n", + " u'first_name',\n", + " u'middle_name',\n", + " u'street_1',\n", + " u'street_2',\n", + " u'city',\n", + " u'state',\n", + " u'zip',\n", + " u'amount',\n", + " u'date',\n", + " u'candidate_id']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cont_cols = [e[1] for e in make_query(\"PRAGMA table_info(contributors);\")]\n", + "cont_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def make_frame(list_of_tuples, legend=cont_cols):\n", + " framelist=[]\n", + " for i, cname in enumerate(legend):\n", + " framelist.append((cname,[e[i] for e in list_of_tuples]))\n", + " return pd.DataFrame.from_items(framelist)" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(28, u'Buckheit', u'Bruce', None, u'8904 KAREN DR', None, u'FAIRFAX', u'VA', u'220312731', 100, u'2007-09-19', 20, u'junk'), (78, u'Ranganath', u'Anoop', None, u'2507 Willard 
Drive', None, u'Charlottesville', u'VA', u'22903', -100, u'2008-04-21', 32, u'junk'), (89, u'Perreault', u'Louise', None, u'503 Brockridge Hunt Drive', None, u'Hampton', u'VA', u'23666', -34.08, u'2008-04-21', 32, u'junk'), (146, u'ABDELLA', u'THOMAS', u'M.', u'4231 MONUMENT WALL WAY #340', None, u'FAIRFAX', u'VA', u'220308440', 50, u'2007-09-30', 35, u'junk')]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
028BuckheitBruceNone8904 KAREN DRNoneFAIRFAXVA220312731100.002007-09-1920
178RanganathAnoopNone2507 Willard DriveNoneCharlottesvilleVA22903-100.002008-04-2132
289PerreaultLouiseNone503 Brockridge Hunt DriveNoneHamptonVA23666-34.082008-04-2132
3146ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NoneFAIRFAXVA22030844050.002007-09-3035
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 28 Buckheit Bruce None 8904 KAREN DR None FAIRFAX VA 220312731 100.00 2007-09-19 20\n", + "1 78 Ranganath Anoop None 2507 Willard Drive None Charlottesville VA 22903 -100.00 2008-04-21 32\n", + "2 89 Perreault Louise None 503 Brockridge Hunt Drive None Hampton VA 23666 -34.08 2008-04-21 32\n", + "3 146 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 None FAIRFAX VA 220308440 50.00 2007-09-30 35" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors WHERE state='VA' AND amount < 400;\")\n", + "print out\n", + "make_frame(out)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0126BOURNETRAVISNoneLAGE KAART 77NoneBRASSCHATTNone2930-5002008-11-2035
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 126 BOURNE TRAVIS None LAGE KAART 77 None BRASSCHATT None 2930 -500 2008-11-20 35" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors WHERE state IS NULL;\")\n", + "make_frame(out)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
125BOURNETRAVISNaNLAGE KAART 77NaNBRASSCHATTNaN2930-5002008-11-2035
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "125 BOURNE TRAVIS NaN LAGE KAART 77 NaN BRASSCHATT NaN 2930 -500 2008-11-20 35" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[dfcwci.state.isnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(174, 12)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors WHERE state IS NOT NULL;\")\n", + "make_frame(out).shape" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(174, 11)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[dfcwci.state.notnull()].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
01AgeeStevenNone549 Laurel Branch RoadNoneFloydVA24091500.002007-06-3016
128BuckheitBruceNone8904 KAREN DRNoneFAIRFAXVA220312731100.002007-09-1920
263BURKESUZANNEM.3401 EVANSTONNoneSEATTLEWA981038677-700.002008-03-0522
378RanganathAnoopNone2507 Willard DriveNoneCharlottesvilleVA22903-100.002008-04-2132
489PerreaultLouiseNone503 Brockridge Hunt DriveNoneHamptonVA23666-34.082008-04-2132
5101AaronsonRebeccaNone2000 Village Green Dr Apt 12NoneMill CreekWA980125787100.002008-02-0834
6107AaronsonRebeccaNone2000 Village Green Dr Apt 12NoneMill CreekWA980125787100.002008-02-1434
7146ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NoneFAIRFAXVA22030844050.002007-09-3035
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 1 Agee Steven None 549 Laurel Branch Road None Floyd VA 24091 500.00 2007-06-30 16\n", + "1 28 Buckheit Bruce None 8904 KAREN DR None FAIRFAX VA 220312731 100.00 2007-09-19 20\n", + "2 63 BURKE SUZANNE M. 3401 EVANSTON None SEATTLE WA 981038677 -700.00 2008-03-05 22\n", + "3 78 Ranganath Anoop None 2507 Willard Drive None Charlottesville VA 22903 -100.00 2008-04-21 32\n", + "4 89 Perreault Louise None 503 Brockridge Hunt Drive None Hampton VA 23666 -34.08 2008-04-21 32\n", + "5 101 Aaronson Rebecca None 2000 Village Green Dr Apt 12 None Mill Creek WA 980125787 100.00 2008-02-08 34\n", + "6 107 Aaronson Rebecca None 2000 Village Green Dr Apt 12 None Mill Creek WA 980125787 100.00 2008-02-14 34\n", + "7 146 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 None FAIRFAX VA 220308440 50.00 2007-09-30 35" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors WHERE state IN ('VA','WA');\")\n", + "make_frame(out).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA24091500.002007-06-3016
27BuckheitBruceNaN8904 KAREN DRNaNFAIRFAXVA220312731100.002007-09-1920
62BURKESUZANNEM.3401 EVANSTONNaNSEATTLEWA981038677-700.002008-03-0522
77RanganathAnoopNaN2507 Willard DriveNaNCharlottesvilleVA22903-100.002008-04-2132
88PerreaultLouiseNaN503 Brockridge Hunt DriveNaNHamptonVA23666-34.082008-04-2132
100AaronsonRebeccaNaN2000 Village Green Dr Apt 12NaNMill CreekWA980125787100.002008-02-0834
106AaronsonRebeccaNaN2000 Village Green Dr Apt 12NaNMill CreekWA980125787100.002008-02-1434
145ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NaNFAIRFAXVA22030844050.002007-09-3035
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500.00 2007-06-30 16\n", + "27 Buckheit Bruce NaN 8904 KAREN DR NaN FAIRFAX VA 220312731 100.00 2007-09-19 20\n", + "62 BURKE SUZANNE M. 3401 EVANSTON NaN SEATTLE WA 981038677 -700.00 2008-03-05 22\n", + "77 Ranganath Anoop NaN 2507 Willard Drive NaN Charlottesville VA 22903 -100.00 2008-04-21 32\n", + "88 Perreault Louise NaN 503 Brockridge Hunt Drive NaN Hampton VA 23666 -34.08 2008-04-21 32\n", + "100 Aaronson Rebecca NaN 2000 Village Green Dr Apt 12 NaN Mill Creek WA 980125787 100.00 2008-02-08 34\n", + "106 Aaronson Rebecca NaN 2000 Village Green Dr Apt 12 NaN Mill Creek WA 980125787 100.00 2008-02-14 34\n", + "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[dfcwci.state.isin(['VA','WA'])].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
03AhrensDonNone4034 Rennellwood WayNonePleasantonCA94566502007-06-1816
119ArdleWilliamNone412 Dakota AvenueNoneSpringfieldOH45504502007-06-2816
226BucklerSteveNone24351 Armada DrNoneDana PointCA926291306502007-07-3020
327BucklerSteveNone24351 Armada DrNoneDana PointCA926291306252007-08-1620
435BuckBarbaraNone1780 NE 138th StNoneNorth MiamiFL331811316502007-09-1320
536BuckBarbaraNone1780 NE 138th StNoneNorth MiamiFL331811316502007-07-1920
639BuchanekElizabethNone7917 Kentbury DrNoneBethesdaMD208144615502007-09-3020
750HarrisonRyanNone2247 3rd StNoneLa VerneCA917504918252007-07-2620
8102AaronsElaineNone481 Buck Island Rd Apt 17AAPT 17AWest YarmouthMA26733300252008-02-2634
9105AaronShirleyNone101 Cherry AveNoneHavanaFL323331311502008-02-2934
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 3 Ahrens Don None 4034 Rennellwood Way None Pleasanton CA 94566 50 2007-06-18 16\n", + "1 19 Ardle William None 412 Dakota Avenue None Springfield OH 45504 50 2007-06-28 16\n", + "2 26 Buckler Steve None 24351 Armada Dr None Dana Point CA 926291306 50 2007-07-30 20\n", + "3 27 Buckler Steve None 24351 Armada Dr None Dana Point CA 926291306 25 2007-08-16 20\n", + "4 35 Buck Barbara None 1780 NE 138th St None North Miami FL 331811316 50 2007-09-13 20\n", + "5 36 Buck Barbara None 1780 NE 138th St None North Miami FL 331811316 50 2007-07-19 20\n", + "6 39 Buchanek Elizabeth None 7917 Kentbury Dr None Bethesda MD 208144615 50 2007-09-30 20\n", + "7 50 Harrison Ryan None 2247 3rd St None La Verne CA 917504918 25 2007-07-26 20\n", + "8 102 Aarons Elaine None 481 Buck Island Rd Apt 17A APT 17A West Yarmouth MA 26733300 25 2008-02-26 34\n", + "9 105 Aaron Shirley None 101 Cherry Ave None Havana FL 323331311 50 2008-02-29 34" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors WHERE amount BETWEEN 10 AND 50;\")\n", + "make_frame(out).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
2AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816
18ArdleWilliamNaN412 Dakota AvenueNaNSpringfieldOH45504502007-06-2816
25BucklerSteveNaN24351 Armada DrNaNDana PointCA926291306502007-07-3020
26BucklerSteveNaN24351 Armada DrNaNDana PointCA926291306252007-08-1620
34BuckBarbaraNaN1780 NE 138th StNaNNorth MiamiFL331811316502007-09-1320
35BuckBarbaraNaN1780 NE 138th StNaNNorth MiamiFL331811316502007-07-1920
38BuchanekElizabethNaN7917 Kentbury DrNaNBethesdaMD208144615502007-09-3020
49HarrisonRyanNaN2247 3rd StNaNLa VerneCA917504918252007-07-2620
101AaronsElaineNaN481 Buck Island Rd Apt 17AAPT 17AWest YarmouthMA26733300252008-02-2634
104AaronShirleyNaN101 Cherry AveNaNHavanaFL323331311502008-02-2934
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "2 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16\n", + "18 Ardle William NaN 412 Dakota Avenue NaN Springfield OH 45504 50 2007-06-28 16\n", + "25 Buckler Steve NaN 24351 Armada Dr NaN Dana Point CA 926291306 50 2007-07-30 20\n", + "26 Buckler Steve NaN 24351 Armada Dr NaN Dana Point CA 926291306 25 2007-08-16 20\n", + "34 Buck Barbara NaN 1780 NE 138th St NaN North Miami FL 331811316 50 2007-09-13 20\n", + "35 Buck Barbara NaN 1780 NE 138th St NaN North Miami FL 331811316 50 2007-07-19 20\n", + "38 Buchanek Elizabeth NaN 7917 Kentbury Dr NaN Bethesda MD 208144615 50 2007-09-30 20\n", + "49 Harrison Ryan NaN 2247 3rd St NaN La Verne CA 917504918 25 2007-07-26 20\n", + "101 Aarons Elaine NaN 481 Buck Island Rd Apt 17A APT 17A West Yarmouth MA 26733300 25 2008-02-26 34\n", + "104 Aaron Shirley NaN 101 Cherry Ave NaN Havana FL 323331311 50 2008-02-29 34" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.query(\"10 <= amount <= 50\").head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###SORT" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
90KazorChristopherM707 Spindletree aveNaNNapervilleIL60565-25922008-04-2132
72BRUNOJOHNNaN10136 WINDERMERE CHASE BLVD.NaNGOTHAFL347344707-23002008-03-0622
64BURKEDONALDJ.12 LOMPOCNaNRANCHO SANTA MARGACA926881817-23002008-03-1122
73BRUNOIRENENaN10136 WINDERMERE CHASE BLVD.NaNGOTHAFL347344707-23002008-03-0622
74BROWNTIMOTHYJ.26826 MARLOWE COURTNaNSTEVENSON RANCHCA913811020-23002008-03-0622
58BURTONGLENNM.4404 CHARLESTON COURTNaNTAMPAFL336092620-23002008-03-0522
57BURTONSTEVENG.9938 DEER CREEK DRIVENaNTAMPAFL33647-23002008-03-0522
84UihleinRichardNaN1396 N Waukegan RdNaNLake ForestIL600451147-23002008-04-2132
56BURTONSUSANNaN9338 DEER CREEK DRIVENaNTAMPAFL336472286-23002008-03-0522
55BUSHERICNaNP.O. BOX 61046NaNDENVERCO802061046-23002008-03-0622
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "90 Kazor Christopher M 707 Spindletree ave NaN Naperville IL 60565 -2592 2008-04-21 32\n", + "72 BRUNO JOHN NaN 10136 WINDERMERE CHASE BLVD. NaN GOTHA FL 347344707 -2300 2008-03-06 22\n", + "64 BURKE DONALD J. 12 LOMPOC NaN RANCHO SANTA MARGA CA 926881817 -2300 2008-03-11 22\n", + "73 BRUNO IRENE NaN 10136 WINDERMERE CHASE BLVD. NaN GOTHA FL 347344707 -2300 2008-03-06 22\n", + "74 BROWN TIMOTHY J. 26826 MARLOWE COURT NaN STEVENSON RANCH CA 913811020 -2300 2008-03-06 22\n", + "58 BURTON GLENN M. 4404 CHARLESTON COURT NaN TAMPA FL 336092620 -2300 2008-03-05 22\n", + "57 BURTON STEVEN G. 9938 DEER CREEK DRIVE NaN TAMPA FL 33647 -2300 2008-03-05 22\n", + "84 Uihlein Richard NaN 1396 N Waukegan Rd NaN Lake Forest IL 600451147 -2300 2008-04-21 32\n", + "56 BURTON SUSAN NaN 9338 DEER CREEK DRIVE NaN TAMPA FL 336472286 -2300 2008-03-05 22\n", + "55 BUSH ERIC NaN P.O. BOX 61046 NaN DENVER CO 802061046 -2300 2008-03-06 22" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.sort(\"amount\").head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
30BuckelLindaNaNPO Box 683130NaNPark CityUT84068313046002007-08-1420
159ABATEMARIAELENA1291 NIGHTINGALE AVENUENaNMIAMI SPRINGSFL33166383226002008-01-2537
15AnthonyJohnNaN211 Long Island DriveNaNHot SpringsAR7191323002007-06-1216
33BuckBlaineM45 Eaton AveNaNCamdenME4843175223002007-09-3020
28BuckelLindaNaNPO Box 683130NaNPark CityUT84068313023002007-08-1420
21BakerDavidNaN2550 Adamsbrooke DriveNaNConwayAR7203423002007-04-1116
13AltesR.D.NaN8600 Moody RoadNaNFort SmithAR7290323002007-06-2116
135ABRAMOWITZNIRANaN411 HARBOR ROADNaNSOUTHPORTCT6890137623002007-09-1435
5AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816
174ABRAHAMSALEMA.P.O. BOX 7NaNCANADIANTX79014000713002008-01-3037
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "30 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 4600 2007-08-14 20\n", + "159 ABATE MARIA ELENA 1291 NIGHTINGALE AVENUE NaN MIAMI SPRINGS FL 331663832 2600 2008-01-25 37\n", + "15 Anthony John NaN 211 Long Island Drive NaN Hot Springs AR 71913 2300 2007-06-12 16\n", + "33 Buck Blaine M 45 Eaton Ave NaN Camden ME 48431752 2300 2007-09-30 20\n", + "28 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 2300 2007-08-14 20\n", + "21 Baker David NaN 2550 Adamsbrooke Drive NaN Conway AR 72034 2300 2007-04-11 16\n", + "13 Altes R.D. NaN 8600 Moody Road NaN Fort Smith AR 72903 2300 2007-06-21 16\n", + "135 ABRAMOWITZ NIRA NaN 411 HARBOR ROAD NaN SOUTHPORT CT 68901376 2300 2007-09-14 35\n", + "5 Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16\n", + "174 ABRAHAM SALEM A. P.O. BOX 7 NaN CANADIAN TX 790140007 1300 2008-01-30 37" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.sort(\"amount\", ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
091KazorChristopherM707 Spindletree aveNoneNapervilleIL60565-25922008-04-2132
130BuckelLindaNonePO Box 683130NonePark CityUT840683130-23002007-08-1420
252BYINGTONMARGARETE.2633 MIDDLEBORO LANE N.E.NoneGRAND RAPIDSMI495061254-23002008-03-0322
353BYERSBOBA.13170 TELFAIR AVENUENoneSYLMARCA913423573-23002008-03-0722
455BUSHKRYSTIENoneP.O. BOX 61046NoneDENVERCO802061046-23002008-03-0622
556BUSHERICNoneP.O. BOX 61046NoneDENVERCO802061046-23002008-03-0622
657BURTONSUSANNone9338 DEER CREEK DRIVENoneTAMPAFL336472286-23002008-03-0522
758BURTONSTEVENG.9938 DEER CREEK DRIVENoneTAMPAFL33647-23002008-03-0522
859BURTONGLENNM.4404 CHARLESTON COURTNoneTAMPAFL336092620-23002008-03-0522
965BURKEDONALDJ.12 LOMPOCNoneRANCHO SANTA MARGACA926881817-23002008-03-1122
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 91 Kazor Christopher M 707 Spindletree ave None Naperville IL 60565 -2592 2008-04-21 32\n", + "1 30 Buckel Linda None PO Box 683130 None Park City UT 840683130 -2300 2007-08-14 20\n", + "2 52 BYINGTON MARGARET E. 2633 MIDDLEBORO LANE N.E. None GRAND RAPIDS MI 495061254 -2300 2008-03-03 22\n", + "3 53 BYERS BOB A. 13170 TELFAIR AVENUE None SYLMAR CA 913423573 -2300 2008-03-07 22\n", + "4 55 BUSH KRYSTIE None P.O. BOX 61046 None DENVER CO 802061046 -2300 2008-03-06 22\n", + "5 56 BUSH ERIC None P.O. BOX 61046 None DENVER CO 802061046 -2300 2008-03-06 22\n", + "6 57 BURTON SUSAN None 9338 DEER CREEK DRIVE None TAMPA FL 336472286 -2300 2008-03-05 22\n", + "7 58 BURTON STEVEN G. 9938 DEER CREEK DRIVE None TAMPA FL 33647 -2300 2008-03-05 22\n", + "8 59 BURTON GLENN M. 4404 CHARLESTON COURT None TAMPA FL 336092620 -2300 2008-03-05 22\n", + "9 65 BURKE DONALD J. 12 LOMPOC None RANCHO SANTA MARGA CA 926881817 -2300 2008-03-11 22" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors ORDER BY amount;\")\n", + "make_frame(out).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
031BuckelLindaNonePO Box 683130NonePark CityUT84068313046002007-08-1420
1160ABATEMARIAELENA1291 NIGHTINGALE AVENUENoneMIAMI SPRINGSFL33166383226002008-01-2537
214AltesR.D.None8600 Moody RoadNoneFort SmithAR7290323002007-06-2116
316AnthonyJohnNone211 Long Island DriveNoneHot SpringsAR7191323002007-06-1216
422BakerDavidNone2550 Adamsbrooke DriveNoneConwayAR7203423002007-04-1116
529BuckelLindaNonePO Box 683130NonePark CityUT84068313023002007-08-1420
634BuckBlaineM45 Eaton AveNoneCamdenME4843175223002007-09-3020
7136ABRAMOWITZNIRANone411 HARBOR ROADNoneSOUTHPORTCT6890137623002007-09-1435
86AkinMikeNone181 Baywood LaneNoneMonticelloAR7165515002007-05-1816
910AllenJohn D.None1052 Cannon Mill DriveNoneNorth AugustaSC2986013002007-06-2916
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 31 Buckel Linda None PO Box 683130 None Park City UT 840683130 4600 2007-08-14 20\n", + "1 160 ABATE MARIA ELENA 1291 NIGHTINGALE AVENUE None MIAMI SPRINGS FL 331663832 2600 2008-01-25 37\n", + "2 14 Altes R.D. None 8600 Moody Road None Fort Smith AR 72903 2300 2007-06-21 16\n", + "3 16 Anthony John None 211 Long Island Drive None Hot Springs AR 71913 2300 2007-06-12 16\n", + "4 22 Baker David None 2550 Adamsbrooke Drive None Conway AR 72034 2300 2007-04-11 16\n", + "5 29 Buckel Linda None PO Box 683130 None Park City UT 840683130 2300 2007-08-14 20\n", + "6 34 Buck Blaine M 45 Eaton Ave None Camden ME 48431752 2300 2007-09-30 20\n", + "7 136 ABRAMOWITZ NIRA None 411 HARBOR ROAD None SOUTHPORT CT 68901376 2300 2007-09-14 35\n", + "8 6 Akin Mike None 181 Baywood Lane None Monticello AR 71655 1500 2007-05-18 16\n", + "9 10 Allen John D. None 1052 Cannon Mill Drive None North Augusta SC 29860 1300 2007-06-29 16" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors ORDER BY amount DESC;\")\n", + "make_frame(out).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###SELECT-COLUMNS" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_nameamount
0Steven500
1Don250
2Don50
3Don100
4Charles100
5Mike1500
6Rebecca500
7Brittni250
8John D.1000
9John D.1300
\n", + "
" + ], + "text/plain": [ + " first_name amount\n", + "0 Steven 500\n", + "1 Don 250\n", + "2 Don 50\n", + "3 Don 100\n", + "4 Charles 100\n", + "5 Mike 1500\n", + "6 Rebecca 500\n", + "7 Brittni 250\n", + "8 John D. 1000\n", + "9 John D. 1300" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[['first_name', 'amount']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_nameamount
0Steven500
1Don250
2Don50
3Don100
4Charles100
5Mike1500
6Rebecca500
7Brittni250
8John D.1000
9John D.1300
\n", + "
" + ], + "text/plain": [ + " first_name amount\n", + "0 Steven 500\n", + "1 Don 250\n", + "2 Don 50\n", + "3 Don 100\n", + "4 Charles 100\n", + "5 Mike 1500\n", + "6 Rebecca 500\n", + "7 Brittni 250\n", + "8 John D. 1000\n", + "9 John D. 1300" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT first_name, amount FROM contributors;\")\n", + "make_frame(out,['first_name', 'amount']).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###SELECT-DISTINCT" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "last_name 175\n", + "first_name 175\n", + "dtype: int64" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[['last_name','first_name']].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "last_name 126\n", + "first_name 126\n", + "dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[['last_name','first_name']].drop_duplicates().count()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_name
0AgeeSteven
1AhrensDon
4AkinCharles
5AkinMike
6AkinRebecca
7AldridgeBrittni
8AllenJohn D.
10AllisonJohn W.
11AllisonRebecca
13AltesR.D.
\n", + "
" + ], + "text/plain": [ + " last_name first_name\n", + "0 Agee Steven\n", + "1 Ahrens Don\n", + "4 Akin Charles\n", + "5 Akin Mike\n", + "6 Akin Rebecca\n", + "7 Aldridge Brittni\n", + "8 Allen John D.\n", + "10 Allison John W.\n", + "11 Allison Rebecca\n", + "13 Altes R.D." + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[['last_name','first_name']].drop_duplicates().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_name
0AgeeSteven
1AhrensDon
2AkinCharles
3AkinMike
4AkinRebecca
5AldridgeBrittni
6AllenJohn D.
7AllisonJohn W.
8AllisonRebecca
9AltesR.D.
\n", + "
" + ], + "text/plain": [ + " last_name first_name\n", + "0 Agee Steven\n", + "1 Ahrens Don\n", + "2 Akin Charles\n", + "3 Akin Mike\n", + "4 Akin Rebecca\n", + "5 Aldridge Brittni\n", + "6 Allen John D.\n", + "7 Allison John W.\n", + "8 Allison Rebecca\n", + "9 Altes R.D." + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT DISTINCT last_name, first_name FROM contributors;\")\n", + "make_frame(out,['last_name', 'first_name']).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###ASSIGN" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idname
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016Agee, Steven
1AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616Ahrens, Don
2AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816Ahrens, Don
3AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116Ahrens, Don
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616Akin, Charles
5AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816Akin, Mike
6AkinRebeccaNaN181 Baywood LaneNaNMonticelloAR716555002007-05-1816Akin, Rebecca
7AldridgeBrittniNaN808 Capitol Square Place, SWNaNWashingtonDC200242502007-06-0616Aldridge, Brittni
8AllenJohn D.NaN1052 Cannon Mill DriveNaNNorth AugustaSC2986010002007-06-1116Allen, John D.
9AllenJohn D.NaN1052 Cannon Mill DriveNaNNorth AugustaSC2986013002007-06-2916Allen, John D.
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id name\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16 Agee, Steven\n", + "1 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16 Ahrens, Don\n", + "2 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16 Ahrens, Don\n", + "3 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16 Ahrens, Don\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16 Akin, Charles\n", + "5 Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16 Akin, Mike\n", + "6 Akin Rebecca NaN 181 Baywood Lane NaN Monticello AR 71655 500 2007-05-18 16 Akin, Rebecca\n", + "7 Aldridge Brittni NaN 808 Capitol Square Place, SW NaN Washington DC 20024 250 2007-06-06 16 Aldridge, Brittni\n", + "8 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1000 2007-06-11 16 Allen, John D.\n", + "9 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1300 2007-06-29 16 Allen, John D." + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci['name']=dfcwci['last_name']+\", \"+dfcwci['first_name']\n", + "dfcwci.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idnameucname
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016Agee, StevenAgee:Steven
1AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616Ahrens, DonAhrens:Don
2AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816Ahrens, DonAhrens:Don
3AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116Ahrens, DonAhrens:Don
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616Akin, CharlesAkin:Charles
5AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816Akin, MikeAkin:Mike
6AkinRebeccaNaN181 Baywood LaneNaNMonticelloAR716555002007-05-1816Akin, RebeccaAkin:Rebecca
7AldridgeBrittniNaN808 Capitol Square Place, SWNaNWashingtonDC200242502007-06-0616Aldridge, BrittniAldridge:Brittni
8AllenJohn D.NaN1052 Cannon Mill DriveNaNNorth AugustaSC2986010002007-06-1116Allen, John D.Allen:John D.
9AllenJohn D.NaN1052 Cannon Mill DriveNaNNorth AugustaSC2986013002007-06-2916Allen, John D.Allen:John D.
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id name ucname\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16 Agee, Steven Agee:Steven\n", + "1 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16 Ahrens, Don Ahrens:Don\n", + "2 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16 Ahrens, Don Ahrens:Don\n", + "3 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16 Ahrens, Don Ahrens:Don\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16 Akin, Charles Akin:Charles\n", + "5 Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16 Akin, Mike Akin:Mike\n", + "6 Akin Rebecca NaN 181 Baywood Lane NaN Monticello AR 71655 500 2007-05-18 16 Akin, Rebecca Akin:Rebecca\n", + "7 Aldridge Brittni NaN 808 Capitol Square Place, SW NaN Washington DC 20024 250 2007-06-06 16 Aldridge, Brittni Aldridge:Brittni\n", + "8 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1000 2007-06-11 16 Allen, John D. Allen:John D.\n", + "9 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1300 2007-06-29 16 Allen, John D. Allen:John D." + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.assign(ucname=dfcwci.last_name+\":\"+dfcwci.first_name).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Will the above command actually change `dfcwci`?\n", + "\n", + "####What if we wanted to change an existing assignment?" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idname
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA24091500.002007-06-3016Agee, Steven
27BuckheitBruceNaN8904 KAREN DRNaNFAIRFAXVA220312731100.002007-09-1920Buckheit, Bruce
77RanganathAnoopNaN2507 Willard DriveNaNCharlottesvilleVA22903-100.002008-04-2132Ranganath, Anoop
88PerreaultLouiseNaN503 Brockridge Hunt DriveNaNHamptonVA23666-34.082008-04-2132Perreault, Louise
145ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NaNFAIRFAXVA22030844050.002007-09-3035ABDELLA, THOMAS
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id name\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500.00 2007-06-30 16 Agee, Steven\n", + "27 Buckheit Bruce NaN 8904 KAREN DR NaN FAIRFAX VA 220312731 100.00 2007-09-19 20 Buckheit, Bruce\n", + "77 Ranganath Anoop NaN 2507 Willard Drive NaN Charlottesville VA 22903 -100.00 2008-04-21 32 Ranganath, Anoop\n", + "88 Perreault Louise NaN 503 Brockridge Hunt Drive NaN Hampton VA 23666 -34.08 2008-04-21 32 Perreault, Louise\n", + "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35 ABDELLA, THOMAS" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[dfcwci.state=='VA']" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Agee, Steven\n", + "27 Buckheit, Bruce\n", + "77 Ranganath, Anoop\n", + "88 Perreault, Louise\n", + "145 ABDELLA, THOMAS\n", + "Name: name, dtype: object" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.loc[dfcwci.state=='VA', 'name']" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dfcwci.loc[dfcwci.state=='VA', 'name']=\"junk\"" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idname
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA24091500.002007-06-3016junk
27BuckheitBruceNaN8904 KAREN DRNaNFAIRFAXVA220312731100.002007-09-1920junk
77RanganathAnoopNaN2507 Willard DriveNaNCharlottesvilleVA22903-100.002008-04-2132junk
88PerreaultLouiseNaN503 Brockridge Hunt DriveNaNHamptonVA23666-34.082008-04-2132junk
145ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NaNFAIRFAXVA22030844050.002007-09-3035junk
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id name\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500.00 2007-06-30 16 junk\n", + "27 Buckheit Bruce NaN 8904 KAREN DR NaN FAIRFAX VA 220312731 100.00 2007-09-19 20 junk\n", + "77 Ranganath Anoop NaN 2507 Willard Drive NaN Charlottesville VA 22903 -100.00 2008-04-21 32 junk\n", + "88 Perreault Louise NaN 503 Brockridge Hunt Drive NaN Hampton VA 23666 -34.08 2008-04-21 32 junk\n", + "145 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 NaN FAIRFAX VA 220308440 50.00 2007-09-30 35 junk" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.query(\"state=='VA'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Let us see the entire process in SQL" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "alt=\"ALTER TABLE contributors ADD COLUMN name;\"\n", + "db.cursor().execute(alt)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0, u'id', u'INTEGER', 1, None, 1),\n", + " (1, u'last_name', u'VARCHAR', 0, None, 0),\n", + " (2, u'first_name', u'VARCHAR', 0, None, 0),\n", + " (3, u'middle_name', u'VARCHAR', 0, None, 0),\n", + " (4, u'street_1', u'VARCHAR', 0, None, 0),\n", + " (5, u'street_2', u'VARCHAR', 0, None, 0),\n", + " (6, u'city', u'VARCHAR', 0, None, 0),\n", + " (7, u'state', u'VARCHAR', 0, None, 0),\n", + " (8, u'zip', u'VARCHAR', 0, None, 0),\n", + " (9, u'amount', u'INTEGER', 0, None, 0),\n", + " (10, u'date', u'DATETIME', 0, None, 0),\n", + " (11, u'candidate_id', u'INTEGER', 1, None, 0),\n", + 
" (12, u'name', u'', 0, None, 0)]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "make_query(\"PRAGMA table_info(contributors);\")" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(u'Agee, Steven', 1),\n", + " (u'Ahrens, Don', 2),\n", + " (u'Ahrens, Don', 3),\n", + " (u'Ahrens, Don', 4),\n", + " (u'Akin, Charles', 5),\n", + " (u'Akin, Mike', 6),\n", + " (u'Akin, Rebecca', 7),\n", + " (u'Aldridge, Brittni', 8),\n", + " (u'Allen, John D.', 9),\n", + " (u'Allen, John D.', 10),\n", + " (u'Allison, John W.', 11),\n", + " (u'Allison, Rebecca', 12),\n", + " (u'Allison, Rebecca', 13),\n", + " (u'Altes, R.D.', 14),\n", + " (u'Andres, Dale', 15),\n", + " (u'Anthony, John', 16),\n", + " (u'Arbogast, Robert', 17),\n", + " (u'Arbogast, Robert', 18),\n", + " (u'Ardle, William', 19),\n", + " (u'Atiq, Omar', 20),\n", + " (u'Atiq, Omar', 21),\n", + " (u'Baker, David', 22),\n", + " (u'Bancroft, David', 23),\n", + " (u'Banks, Charles', 24),\n", + " (u'Barbee, John', 25),\n", + " (u'Buckler, Steve', 26),\n", + " (u'Buckler, Steve', 27),\n", + " (u'Buckheit, Bruce', 28),\n", + " (u'Buckel, Linda', 29),\n", + " (u'Buckel, Linda', 30),\n", + " (u'Buckel, Linda', 31),\n", + " (u'Buck, Thomas', 32),\n", + " (u'Buck, Jay', 33),\n", + " (u'Buck, Blaine', 34),\n", + " (u'Buck, Barbara', 35),\n", + " (u'Buck, Barbara', 36),\n", + " (u'Buchman, Mark M', 37),\n", + " (u'Bucher, Ida', 38),\n", + " (u'Buchanek, Elizabeth', 39),\n", + " (u'Buchanan, John', 40),\n", + " (u'Buchanan, John', 41),\n", + " (u'Buchanan, John', 42),\n", + " (u'Buchanan, John', 43),\n", + " (u'Buchanan, John', 44),\n", + " (u'Buchanan, John', 45),\n", + " (u'Buchanan, John', 46),\n", + " (u'Buchanan, John', 47),\n", + " (u'Buchanan, John', 48),\n", + " (u'Buchanan, John', 49),\n", + " (u'Harrison, Ryan', 50),\n", + " (u'BYNUM, HERBERT', 51),\n", + " 
(u'BYINGTON, MARGARET', 52),\n", + " (u'BYERS, BOB', 53),\n", + " (u'BYERS, AUDREY', 54),\n", + " (u'BUSH, KRYSTIE', 55),\n", + " (u'BUSH, ERIC', 56),\n", + " (u'BURTON, SUSAN', 57),\n", + " (u'BURTON, STEVEN', 58),\n", + " (u'BURTON, GLENN', 59),\n", + " (u'BURKHARDT, CRAIG', 60),\n", + " (u'BURKHARDT, CRAIG', 61),\n", + " (u'BURKHARDT, BARBARA', 62),\n", + " (u'BURKE, SUZANNE', 63),\n", + " (u'BURKE, GAIL', 64),\n", + " (u'BURKE, DONALD', 65),\n", + " (u'BURGERT, RONALD', 66),\n", + " (u'BULL, BARTLE', 67),\n", + " (u'BULL, BARTLE', 68),\n", + " (u'BUKOWSKI, DANIEL', 69),\n", + " (u'BUISSON, MARGARET', 70),\n", + " (u'BUCKLEY, WALTER', 71),\n", + " (u'BUCKLEY, MARJORIE', 72),\n", + " (u'BRUNO, JOHN', 73),\n", + " (u'BRUNO, IRENE', 74),\n", + " (u'BROWN, TIMOTHY', 75),\n", + " (u'Schuff, Bryan', 76),\n", + " (u'Hobbs, James', 77),\n", + " (u'Ranganath, Anoop', 78),\n", + " (u'Nystrom, Michael', 79),\n", + " (u'Muse, Nina', 80),\n", + " (u'Waddell, James', 81),\n", + " (u'Brucks, William', 82),\n", + " (u'Kuehn, David', 83),\n", + " (u'Verster, Jeanette', 84),\n", + " (u'Uihlein, Richard', 85),\n", + " (u'Eskenberry, Robert', 86),\n", + " (u'Froehling, Alan', 87),\n", + " (u'Duryea, Marcia', 88),\n", + " (u'Perreault, Louise', 89),\n", + " (u'Rozenfeld, Timur', 90),\n", + " (u'Kazor, Christopher', 91),\n", + " (u'Lehner, Thomas', 92),\n", + " (u'Plummer, Joseph', 93),\n", + " (u'Raught, Philip', 94),\n", + " (u'Ferrara, Judith', 95),\n", + " (u'Johnson, Cathleen', 96),\n", + " (u'Sanford, Bradley', 97),\n", + " (u'Gaarder, Bruce', 98),\n", + " (u'Choe, Hyeokchan', 99),\n", + " (u'Jacobs, Richard', 100),\n", + " (u'Aaronson, Rebecca', 101),\n", + " (u'Aarons, Elaine', 102),\n", + " (u'Aarons, Elaine', 103),\n", + " (u'Aarons, Elaine', 104),\n", + " (u'Aaron, Shirley', 105),\n", + " (u'Aaron, Shirley', 106),\n", + " (u'Aaronson, Rebecca', 107),\n", + " (u'Aaron, Shirley', 108),\n", + " (u'Aaron, Shirley', 109),\n", + " (u'Aaron, Shirley', 110),\n", + " (u'Reid, 
Elizabeth', 111),\n", + " (u'Reich, Thomas', 112),\n", + " (u'Aaron, Shirley', 113),\n", + " (u'Aaron, Shirley', 114),\n", + " (u'Aaron, Sharron', 115),\n", + " (u'Aaron, Patricia', 116),\n", + " (u'Aaron, Patricia', 117),\n", + " (u'Aaron, Jim', 118),\n", + " (u'Aaron, Jim', 119),\n", + " (u'Aaron, Carole', 120),\n", + " (u'Aaron, Carole', 121),\n", + " (u'Aaron, Carole', 122),\n", + " (u'Aaron, Barbara', 123),\n", + " (u'Aanonsen, Lin', 124),\n", + " (u'Aanonsen, Lin', 125),\n", + " (u'BOURNE, TRAVIS', 126),\n", + " (u'SECRIST, BRIAN', 127),\n", + " (u'TOLLESTRUP, TRAVIS', 128),\n", + " (u'ACCORD, DEAN', 129),\n", + " (u'ABTS, HENRY', 130),\n", + " (u'ABSHIER, LANNY', 131),\n", + " (u'ABSHIER, DIANA', 132),\n", + " (u'ABREU, KEVIN', 133),\n", + " (u'ABREU, KEVIN', 134),\n", + " (u'ABREU, KEVIN', 135),\n", + " (u'ABRAMOWITZ, NIRA', 136),\n", + " (u'ABRAMS, MICHAEL', 137),\n", + " (u'ABRAMOWITZ, KEN', 138),\n", + " (u'ABOUBAKARE, NASAR', 139),\n", + " (u'ABEGG, PATRICIA', 140),\n", + " (u'ABEGG, PATRICIA', 141),\n", + " (u'ABEGG, PATRICIA', 142),\n", + " (u'ABEGG, PATRICIA', 143),\n", + " (u'ABEGG, PATRICIA', 144),\n", + " (u'ABEGG, PATRICIA', 145),\n", + " (u'ABDELLA, THOMAS', 146),\n", + " (u'ABBOTT, WELDON', 147),\n", + " (u'ABBOTT, WELDON', 148),\n", + " (u'ABBOTT, GERALD', 149),\n", + " (u'ABBOTT, GERALD', 150),\n", + " (u'ABEDIN, ZAINUL', 151),\n", + " (u'ABBOTT, SYBIL', 152),\n", + " (u'ABBOTT, SYBIL', 153),\n", + " (u'ABBOTT, RONALD', 154),\n", + " (u'ABBOTT, RONALD', 155),\n", + " (u'ABBOTT, ROBERT', 156),\n", + " (u'ABBOTT, MIKE', 157),\n", + " (u'ABBOT, DAVID', 158),\n", + " (u'ABBO, PAULINE', 159),\n", + " (u'ABATE, MARIA', 160),\n", + " (u'ABAIR, PETER', 161),\n", + " (u'ABACHERLI, SHIRLEY', 162),\n", + " (u'AARONS, CHARLES', 163),\n", + " (u'AARONS, CHARLES', 164),\n", + " (u'AARONS, CHARLES', 165),\n", + " (u'ABEL, JOHN', 166),\n", + " (u'ABEL, MARLING', 167),\n", + " (u'ABEL, RUDOLPH', 168),\n", + " (u'ABELE, RODNEY', 169),\n", + " (u'ABERCROMBIE, 
DENIS', 170),\n", + " (u'ABESHAUS, MERRILL', 171),\n", + " (u'ABRAHAM, GEORGE', 172),\n", + " (u'ABRAHAMSON, PETER', 173),\n", + " (u'ABRAHAM, SALEM', 174),\n", + " (u'ABRAHAM, SALEM', 175)]" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out = make_query(\"SELECT id, last_name,first_name from contributors;\")\n", + "out2 = [(e[1]+\", \"+e[2],e[0]) for e in out]\n", + "out2" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "alt2=\"UPDATE contributors SET name = ? WHERE id = ?;\"\n", + "for ele in out2:\n", + " db.cursor().execute(alt2, ele)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "db.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idname
01AgeeStevenNone549 Laurel Branch RoadNoneFloydVA240915002007-06-3016Agee, Steven
12AhrensDonNone4034 Rennellwood WayNonePleasantonCA945662502007-05-1616Ahrens, Don
23AhrensDonNone4034 Rennellwood WayNonePleasantonCA94566502007-06-1816Ahrens, Don
34AhrensDonNone4034 Rennellwood WayNonePleasantonCA945661002007-06-2116Ahrens, Don
45AkinCharlesNone10187 Sugar Creek RoadNoneBentonvilleAR727121002007-06-1616Akin, Charles
56AkinMikeNone181 Baywood LaneNoneMonticelloAR7165515002007-05-1816Akin, Mike
67AkinRebeccaNone181 Baywood LaneNoneMonticelloAR716555002007-05-1816Akin, Rebecca
78AldridgeBrittniNone808 Capitol Square Place, SWNoneWashingtonDC200242502007-06-0616Aldridge, Brittni
89AllenJohn D.None1052 Cannon Mill DriveNoneNorth AugustaSC2986010002007-06-1116Allen, John D.
910AllenJohn D.None1052 Cannon Mill DriveNoneNorth AugustaSC2986013002007-06-2916Allen, John D.
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id name\n", + "0 1 Agee Steven None 549 Laurel Branch Road None Floyd VA 24091 500 2007-06-30 16 Agee, Steven\n", + "1 2 Ahrens Don None 4034 Rennellwood Way None Pleasanton CA 94566 250 2007-05-16 16 Ahrens, Don\n", + "2 3 Ahrens Don None 4034 Rennellwood Way None Pleasanton CA 94566 50 2007-06-18 16 Ahrens, Don\n", + "3 4 Ahrens Don None 4034 Rennellwood Way None Pleasanton CA 94566 100 2007-06-21 16 Ahrens, Don\n", + "4 5 Akin Charles None 10187 Sugar Creek Road None Bentonville AR 72712 100 2007-06-16 16 Akin, Charles\n", + "5 6 Akin Mike None 181 Baywood Lane None Monticello AR 71655 1500 2007-05-18 16 Akin, Mike\n", + "6 7 Akin Rebecca None 181 Baywood Lane None Monticello AR 71655 500 2007-05-18 16 Akin, Rebecca\n", + "7 8 Aldridge Brittni None 808 Capitol Square Place, SW None Washington DC 20024 250 2007-06-06 16 Aldridge, Brittni\n", + "8 9 Allen John D. None 1052 Cannon Mill Drive None North Augusta SC 29860 1000 2007-06-11 16 Allen, John D.\n", + "9 10 Allen John D. None 1052 Cannon Mill Drive None North Augusta SC 29860 1300 2007-06-29 16 Allen, John D." + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * from contributors;\")\n", + "make_frame(out,cont_cols+[\"name\"]).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And let's now do an assignment to an existing column" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "upd=\"UPDATE contributors SET name = 'junk' WHERE state = 'VA';\"\n", + "db.cursor().execute(upd)\n", + "db.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idname
01AgeeStevenNone549 Laurel Branch RoadNoneFloydVA24091500.002007-06-3016junk
128BuckheitBruceNone8904 KAREN DRNoneFAIRFAXVA220312731100.002007-09-1920junk
278RanganathAnoopNone2507 Willard DriveNoneCharlottesvilleVA22903-100.002008-04-2132junk
389PerreaultLouiseNone503 Brockridge Hunt DriveNoneHamptonVA23666-34.082008-04-2132junk
4146ABDELLATHOMASM.4231 MONUMENT WALL WAY #340NoneFAIRFAXVA22030844050.002007-09-3035junk
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id name\n", + "0 1 Agee Steven None 549 Laurel Branch Road None Floyd VA 24091 500.00 2007-06-30 16 junk\n", + "1 28 Buckheit Bruce None 8904 KAREN DR None FAIRFAX VA 220312731 100.00 2007-09-19 20 junk\n", + "2 78 Ranganath Anoop None 2507 Willard Drive None Charlottesville VA 22903 -100.00 2008-04-21 32 junk\n", + "3 89 Perreault Louise None 503 Brockridge Hunt Drive None Hampton VA 23666 -34.08 2008-04-21 32 junk\n", + "4 146 ABDELLA THOMAS M. 4231 MONUMENT WALL WAY #340 None FAIRFAX VA 220308440 50.00 2007-09-30 35 junk" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * from contributors where state='VA';\")\n", + "make_frame(out,cont_cols+[\"name\"]).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####No DROP COLUMN in SQLITE\n", + "\n", + "It's available in other databases. Here you must just re-create your database, or know about this gotcha from the start."
+ ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "OperationalError", + "evalue": "near \"DROP\": syntax error", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOperationalError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0malt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ALTER TABLE contributors DROP COLUMN name;\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcursor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mOperationalError\u001b[0m: near \"DROP\": syntax error" + ] + } + ], + "source": [ + "alt=\"ALTER TABLE contributors DROP COLUMN name;\"\n", + "db.cursor().execute(alt)\n", + "db.commit()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's much simpler in Pandas, of course" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###AGGREGATE" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zipamountcandidate_id
count1.750000e+02175.000000175.000000
mean3.780014e+083.41811428.000000
std3.628278e+081028.4189997.823484
min2.474000e+03-2592.00000016.000000
25%9.336700e+04-175.00000020.000000
50%3.233313e+08100.00000032.000000
75%7.816946e+08300.00000035.000000
max9.951532e+084600.00000037.000000
\n", + "
" + ], + "text/plain": [ + " zip amount candidate_id\n", + "count 1.750000e+02 175.000000 175.000000\n", + "mean 3.780014e+08 3.418114 28.000000\n", + "std 3.628278e+08 1028.418999 7.823484\n", + "min 2.474000e+03 -2592.000000 16.000000\n", + "25% 9.336700e+04 -175.000000 20.000000\n", + "50% 3.233313e+08 100.000000 32.000000\n", + "75% 7.816946e+08 300.000000 35.000000\n", + "max 9.951532e+08 4600.000000 37.000000" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "4600.0" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.amount.max()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
30BuckelLindaNaNPO Box 683130NaNPark CityUT84068313046002007-08-1420
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "30 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 4600 2007-08-14 20" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[dfcwci.amount==dfcwci.amount.max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(31, u'Buckel', u'Linda', None, u'PO Box 683130', None, u'Park City', u'UT', u'840683130', 4600, u'2007-08-14', 20, u'Buckel, Linda', 4600)]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_idmaxamt
031BuckelLindaNonePO Box 683130NonePark CityUT84068313046002007-08-1420Buckel, Linda
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id maxamt\n", + "0 31 Buckel Linda None PO Box 683130 None Park City UT 840683130 4600 2007-08-14 20 Buckel, Linda" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT *, MAX(amount) AS maxamt FROM contributors;\")\n", + "print out\n", + "make_frame(out, cont_cols+['maxamt'])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(175,)]\n" + ] + } + ], + "source": [ + "out=make_query(\"SELECT COUNT(amount) AS AMOUNTCOUNT FROM contributors;\")\n", + "print out" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(3.418114285714276,)]\n" + ] + } + ], + "source": [ + "out=make_query(\"SELECT AVG(amount) FROM contributors;\")\n", + "print out" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
30BuckelLindaNaNPO Box 683130NaNPark CityUT84068313046002007-08-1420
159ABATEMARIAELENA1291 NIGHTINGALE AVENUENaNMIAMI SPRINGSFL33166383226002008-01-2537
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "30 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 4600 2007-08-14 20\n", + "159 ABATE MARIA ELENA 1291 NIGHTINGALE AVENUE NaN MIAMI SPRINGS FL 331663832 2600 2008-01-25 37" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[dfcwci.amount > dfcwci.amount.max() - 2300]" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
031BuckelLindaNonePO Box 683130NonePark CityUT84068313046002007-08-1420
1160ABATEMARIAELENA1291 NIGHTINGALE AVENUENoneMIAMI SPRINGSFL33166383226002008-01-2537
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 31 Buckel Linda None PO Box 683130 None Park City UT 840683130 4600 2007-08-14 20\n", + "1 160 ABATE MARIA ELENA 1291 NIGHTINGALE AVENUE None MIAMI SPRINGS FL 331663832 2600 2008-01-25 37" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors WHERE amount > (select (MAX(amount) - 2300) FROM contributors);\")\n", + "make_frame(out)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also `MIN`, `SUM`, `AVG`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###GROUP-AGG" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
zipamountcandidate_id
state
AK29854596211210.00111
AR86479014200.00192
AZ860011121120.0037
CA14736360720-5013.73600
CO2405477834-5823.00111
CT689013762300.0035
DC800341853-1549.91102
FL8970626520-4050.00803
IA50266250.0016
ID83648-261.0032
IL3042068689-5586.80175
KS66215-330.0032
KY402597029-200.0022
LA14060433271300.0074
MA123026638-83.00208
MD416287617300.0055
ME1656471702520.00122
MI2426973485-1265.00164
MN1102338918322.00100
MO64111100.0020
NC27502500.0016
NH32564424-24.6032
NJ70254993-817.4564
NV3575889763725.00144
NY606129991-6474.50233
OH176071450.0080
OK2202499044800.00102
PA540499020-2146.00145
RI58065892200.0070
SC2962147892400.0069
TN37188-25.0032
TX62214522451985.24302
UT92511533945050.00340
VA440691831515.92135
WA2941290251-500.0090
\n", + "
" + ], + "text/plain": [ + " zip amount candidate_id\n", + "state \n", + "AK 2985459621 1210.00 111\n", + "AR 864790 14200.00 192\n", + "AZ 860011121 120.00 37\n", + "CA 14736360720 -5013.73 600\n", + "CO 2405477834 -5823.00 111\n", + "CT 68901376 2300.00 35\n", + "DC 800341853 -1549.91 102\n", + "FL 8970626520 -4050.00 803\n", + "IA 50266 250.00 16\n", + "ID 83648 -261.00 32\n", + "IL 3042068689 -5586.80 175\n", + "KS 66215 -330.00 32\n", + "KY 402597029 -200.00 22\n", + "LA 1406043327 1300.00 74\n", + "MA 123026638 -83.00 208\n", + "MD 416287617 300.00 55\n", + "ME 165647170 2520.00 122\n", + "MI 2426973485 -1265.00 164\n", + "MN 1102338918 322.00 100\n", + "MO 64111 100.00 20\n", + "NC 27502 500.00 16\n", + "NH 32564424 -24.60 32\n", + "NJ 70254993 -817.45 64\n", + "NV 3575889763 725.00 144\n", + "NY 606129991 -6474.50 233\n", + "OH 176071 450.00 80\n", + "OK 2202499044 800.00 102\n", + "PA 540499020 -2146.00 145\n", + "RI 58065892 200.00 70\n", + "SC 296214789 2400.00 69\n", + "TN 37188 -25.00 32\n", + "TX 6221452245 1985.24 302\n", + "UT 9251153394 5050.00 340\n", + "VA 440691831 515.92 135\n", + "WA 2941290251 -500.00 90" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.groupby(\"state\").sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "AK 403.333333\n", + "AR 1183.333333\n", + "AZ 120.000000\n", + "CA -217.988261\n", + "CO -1455.750000\n", + "CT 2300.000000\n", + "DC -309.982000\n", + "FL -135.000000\n", + "IA 250.000000\n", + "ID -261.000000\n", + "IL -931.133333\n", + "KS -330.000000\n", + "KY -200.000000\n", + "LA 650.000000\n", + "MA -13.833333\n", + "MD 150.000000\n", + "ME 630.000000\n", + "MI -253.000000\n", + "MN 107.333333\n", + "MO 100.000000\n", + "NC 500.000000\n", + "NH -24.600000\n", + "NJ -408.725000\n", + "NV 181.250000\n", + "NY -809.312500\n", + 
"OH 112.500000\n", + "OK 266.666667\n", + "PA -429.200000\n", + "RI 100.000000\n", + "SC 800.000000\n", + "TN -25.000000\n", + "TX 220.582222\n", + "UT 459.090909\n", + "VA 103.184000\n", + "WA -166.666667\n", + "Name: amount, dtype: float64" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.groupby(\"state\")['amount'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['VA', 'CA', 'AR', 'DC', 'SC', 'IA', 'OH', 'NC', 'UT', 'MO', 'IL',\n", + " 'ME', 'FL', 'MD', 'MI', 'CO', 'WA', 'NY', 'TX', 'KY', 'PA', 'TN',\n", + " 'MA', 'MN', 'KS', 'NJ', 'NH', 'ID', 'OK', nan, 'NV', 'CT', 'RI',\n", + " 'AK', 'LA', 'AZ'], dtype=object)" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.state.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statesum
0None-500.00
1AK1210.00
2AR14200.00
3AZ120.00
4CA-5013.73
5CO-5823.00
6CT2300.00
7DC-1549.91
8FL-4050.00
9IA250.00
10ID-261.00
11IL-5586.80
12KS-330.00
13KY-200.00
14LA1300.00
15MA-83.00
16MD300.00
17ME2520.00
18MI-1265.00
19MN322.00
20MO100.00
21NC500.00
22NH-24.60
23NJ-817.45
24NV725.00
25NY-6474.50
26OH450.00
27OK800.00
28PA-2146.00
29RI200.00
30SC2400.00
31TN-25.00
32TX1985.24
33UT5050.00
34VA515.92
35WA-500.00
\n", + "
" + ], + "text/plain": [ + " state sum\n", + "0 None -500.00\n", + "1 AK 1210.00\n", + "2 AR 14200.00\n", + "3 AZ 120.00\n", + "4 CA -5013.73\n", + "5 CO -5823.00\n", + "6 CT 2300.00\n", + "7 DC -1549.91\n", + "8 FL -4050.00\n", + "9 IA 250.00\n", + "10 ID -261.00\n", + "11 IL -5586.80\n", + "12 KS -330.00\n", + "13 KY -200.00\n", + "14 LA 1300.00\n", + "15 MA -83.00\n", + "16 MD 300.00\n", + "17 ME 2520.00\n", + "18 MI -1265.00\n", + "19 MN 322.00\n", + "20 MO 100.00\n", + "21 NC 500.00\n", + "22 NH -24.60\n", + "23 NJ -817.45\n", + "24 NV 725.00\n", + "25 NY -6474.50\n", + "26 OH 450.00\n", + "27 OK 800.00\n", + "28 PA -2146.00\n", + "29 RI 200.00\n", + "30 SC 2400.00\n", + "31 TN -25.00\n", + "32 TX 1985.24\n", + "33 UT 5050.00\n", + "34 VA 515.92\n", + "35 WA -500.00" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT state,SUM(amount) FROM contributors GROUP BY state;\")\n", + "make_frame(out, legend=['state','sum'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###DELETE" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
1AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616
2AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816
3AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "1 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16\n", + "2 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16\n", + "3 Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In-place drops" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
last_name
AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945662502007-05-1616
AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA94566502007-06-1816
AhrensDonNaN4034 Rennellwood WayNaNPleasantonCA945661002007-06-2116
AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
\n", + "
" + ], + "text/plain": [ + " first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "last_name \n", + "Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 250 2007-05-16 16\n", + "Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 50 2007-06-18 16\n", + "Ahrens Don NaN 4034 Rennellwood Way NaN Pleasanton CA 94566 100 2007-06-21 16\n", + "Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2=dfcwci.copy()\n", + "df2.set_index('last_name', inplace=True)\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
first_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
last_name
AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816
AkinRebeccaNaN181 Baywood LaneNaNMonticelloAR716555002007-05-1816
AldridgeBrittniNaN808 Capitol Square Place, SWNaNWashingtonDC200242502007-06-0616
\n", + "
" + ], + "text/plain": [ + " first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "last_name \n", + "Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16\n", + "Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16\n", + "Akin Rebecca NaN 181 Baywood Lane NaN Monticello AR 71655 500 2007-05-18 16\n", + "Aldridge Brittni NaN 808 Capitol Square Place, SW NaN Washington DC 20024 250 2007-06-06 16" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.drop(['Ahrens'], inplace=True)\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
1AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
2AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816
3AkinRebeccaNaN181 Baywood LaneNaNMonticelloAR716555002007-05-1816
4AldridgeBrittniNaN808 Capitol Square Place, SWNaNWashingtonDC200242502007-06-0616
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "1 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16\n", + "2 Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16\n", + "3 Akin Rebecca NaN 181 Baywood Lane NaN Monticello AR 71655 500 2007-05-18 16\n", + "4 Aldridge Brittni NaN 808 Capitol Square Place, SW NaN Washington DC 20024 250 2007-06-06 16" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.reset_index(inplace=True)\n", + "df2.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The recommended way to do it is to create a new dataframe. This might be impractical if things are very large." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
5AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816
6AkinRebeccaNaN181 Baywood LaneNaNMonticelloAR716555002007-05-1816
7AldridgeBrittniNaN808 Capitol Square Place, SWNaNWashingtonDC200242502007-06-0616
8AllenJohn D.NaN1052 Cannon Mill DriveNaNNorth AugustaSC2986010002007-06-1116
9AllenJohn D.NaN1052 Cannon Mill DriveNaNNorth AugustaSC2986013002007-06-2916
10AllisonJohn W.NaNP.O. Box 1089NaNConwayAR7203310002007-05-1816
11AllisonRebeccaNaN3206 Summit CourtNaNLittle RockAR7222710002007-04-2516
12AllisonRebeccaNaN3206 Summit CourtNaNLittle RockAR722272002007-06-1216
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16\n", + "5 Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16\n", + "6 Akin Rebecca NaN 181 Baywood Lane NaN Monticello AR 71655 500 2007-05-18 16\n", + "7 Aldridge Brittni NaN 808 Capitol Square Place, SW NaN Washington DC 20024 250 2007-06-06 16\n", + "8 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1000 2007-06-11 16\n", + "9 Allen John D. NaN 1052 Cannon Mill Drive NaN North Augusta SC 29860 1300 2007-06-29 16\n", + "10 Allison John W. NaN P.O. Box 1089 NaN Conway AR 72033 1000 2007-05-18 16\n", + "11 Allison Rebecca NaN 3206 Summit Court NaN Little Rock AR 72227 1000 2007-04-25 16\n", + "12 Allison Rebecca NaN 3206 Summit Court NaN Little Rock AR 72227 200 2007-06-12 16" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci=dfcwci[dfcwci.last_name!='Ahrens']\n", + "dfcwci.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drow=\"DELETE FROM contributors WHERE last_name=\\\"Ahrens\\\";\"\n", + "db.cursor().execute(drow)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
01AgeeStevenNone549 Laurel Branch RoadNoneFloydVA240915002007-06-3016
15AkinCharlesNone10187 Sugar Creek RoadNoneBentonvilleAR727121002007-06-1616
26AkinMikeNone181 Baywood LaneNoneMonticelloAR7165515002007-05-1816
37AkinRebeccaNone181 Baywood LaneNoneMonticelloAR716555002007-05-1816
48AldridgeBrittniNone808 Capitol Square Place, SWNoneWashingtonDC200242502007-06-0616
59AllenJohn D.None1052 Cannon Mill DriveNoneNorth AugustaSC2986010002007-06-1116
610AllenJohn D.None1052 Cannon Mill DriveNoneNorth AugustaSC2986013002007-06-2916
711AllisonJohn W.NoneP.O. Box 1089NoneConwayAR7203310002007-05-1816
812AllisonRebeccaNone3206 Summit CourtNoneLittle RockAR7222710002007-04-2516
913AllisonRebeccaNone3206 Summit CourtNoneLittle RockAR722272002007-06-1216
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 1 Agee Steven None 549 Laurel Branch Road None Floyd VA 24091 500 2007-06-30 16\n", + "1 5 Akin Charles None 10187 Sugar Creek Road None Bentonville AR 72712 100 2007-06-16 16\n", + "2 6 Akin Mike None 181 Baywood Lane None Monticello AR 71655 1500 2007-05-18 16\n", + "3 7 Akin Rebecca None 181 Baywood Lane None Monticello AR 71655 500 2007-05-18 16\n", + "4 8 Aldridge Brittni None 808 Capitol Square Place, SW None Washington DC 20024 250 2007-06-06 16\n", + "5 9 Allen John D. None 1052 Cannon Mill Drive None North Augusta SC 29860 1000 2007-06-11 16\n", + "6 10 Allen John D. None 1052 Cannon Mill Drive None North Augusta SC 29860 1300 2007-06-29 16\n", + "7 11 Allison John W. None P.O. Box 1089 None Conway AR 72033 1000 2007-05-18 16\n", + "8 12 Allison Rebecca None 3206 Summit Court None Little Rock AR 72227 1000 2007-04-25 16\n", + "9 13 Allison Rebecca None 3206 Summit Court None Little Rock AR 72227 200 2007-06-12 16" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.commit()\n", + "out=make_query(\"SELECT * FROM contributors;\")\n", + "make_frame(out).head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###LIMIT" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
01AgeeStevenNone549 Laurel Branch RoadNoneFloydVA240915002007-06-3016
15AkinCharlesNone10187 Sugar Creek RoadNoneBentonvilleAR727121002007-06-1616
26AkinMikeNone181 Baywood LaneNoneMonticelloAR7165515002007-05-1816
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 1 Agee Steven None 549 Laurel Branch Road None Floyd VA 24091 500 2007-06-30 16\n", + "1 5 Akin Charles None 10187 Sugar Creek Road None Bentonville AR 72712 100 2007-06-16 16\n", + "2 6 Akin Mike None 181 Baywood Lane None Monticello AR 71655 1500 2007-05-18 16" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out=make_query(\"SELECT * FROM contributors LIMIT 3;\")\n", + "make_frame(out).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
0AgeeStevenNaN549 Laurel Branch RoadNaNFloydVA240915002007-06-3016
4AkinCharlesNaN10187 Sugar Creek RoadNaNBentonvilleAR727121002007-06-1616
5AkinMikeNaN181 Baywood LaneNaNMonticelloAR7165515002007-05-1816
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 Agee Steven NaN 549 Laurel Branch Road NaN Floyd VA 24091 500 2007-06-30 16\n", + "4 Akin Charles NaN 10187 Sugar Creek Road NaN Bentonville AR 72712 100 2007-06-16 16\n", + "5 Akin Mike NaN 181 Baywood Lane NaN Monticello AR 71655 1500 2007-05-18 16" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci[0:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Indexes" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "crind=\"CREATE INDEX amount_ix ON contributors(amount);\"\n", + "db.cursor().execute(crind)\n", + "db.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CREATE TABLE \"candidates\" (\n", + " \"id\" INTEGER PRIMARY KEY NOT NULL ,\n", + " \"first_name\" VARCHAR,\n", + " \"last_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"party\" VARCHAR NOT NULL\n", + ");\n", + "CREATE TABLE \"contributors\" (\n", + " \"id\" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,\n", + " \"last_name\" VARCHAR,\n", + " \"first_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"street_1\" VARCHAR,\n", + " \"street_2\" VARCHAR,\n", + " \"city\" VARCHAR,\n", + " \"state\" VARCHAR,\n", + " \"zip\" VARCHAR,\n", + " \"amount\" INTEGER,\n", + " \"date\" DATETIME,\n", + " \"candidate_id\" INTEGER NOT NULL, name,\n", + " FOREIGN KEY(candidate_id) REFERENCES candidates(id)\n", + ");\n", + "CREATE INDEX amount_ix ON contributors(amount);\n" + ] + } + ], + "source": [ + "%%bash\n", + "echo \".schema\" | sqlite3 cancont.db" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "crind=\"DROP INDEX amount_ix;\"\n", + "db.cursor().execute(crind)\n", + "db.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CREATE TABLE \"candidates\" (\n", + " \"id\" INTEGER PRIMARY KEY NOT NULL ,\n", + " \"first_name\" VARCHAR,\n", + " \"last_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"party\" VARCHAR NOT NULL\n", + ");\n", + "CREATE TABLE \"contributors\" (\n", + " \"id\" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,\n", + " \"last_name\" VARCHAR,\n", + " \"first_name\" VARCHAR,\n", + " \"middle_name\" VARCHAR,\n", + " \"street_1\" VARCHAR,\n", + " \"street_2\" VARCHAR,\n", + " \"city\" VARCHAR,\n", + " \"state\" VARCHAR,\n", + " \"zip\" VARCHAR,\n", + " \"amount\" INTEGER,\n", + " \"date\" DATETIME,\n", + " \"candidate_id\" INTEGER NOT NULL, name,\n", + " FOREIGN KEY(candidate_id) REFERENCES candidates(id)\n", + ");\n" + ] + } + ], + "source": [ + "%%bash\n", + "echo \".schema\" | sqlite3 cancont.db" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Relationships: JOINs are Cartesian Products." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Simple subselect" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_namemiddle_nameparty
033JosephBidenNaND
136SamuelBrownbackNaNR
234HillaryClintonR.D
339ChristopherDoddJ.D
426JohnEdwardsNaND
\n", + "
" + ], + "text/plain": [ + " id first_name last_name middle_name party\n", + "0 33 Joseph Biden NaN D\n", + "1 36 Samuel Brownback NaN R\n", + "2 34 Hillary Clinton R. D\n", + "3 39 Christopher Dodd J. D\n", + "4 26 John Edwards NaN D" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcand.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obamaid=dfcand.query(\"last_name=='Obama'\")['id'].values[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
25BucklerSteveNaN24351 Armada DrNaNDana PointCA926291306502007-07-3020
26BucklerSteveNaN24351 Armada DrNaNDana PointCA926291306252007-08-1620
27BuckheitBruceNaN8904 KAREN DRNaNFAIRFAXVA2203127311002007-09-1920
28BuckelLindaNaNPO Box 683130NaNPark CityUT84068313023002007-08-1420
29BuckelLindaNaNPO Box 683130NaNPark CityUT840683130-23002007-08-1420
\n", + "
" + ], + "text/plain": [ + " last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "25 Buckler Steve NaN 24351 Armada Dr NaN Dana Point CA 926291306 50 2007-07-30 20\n", + "26 Buckler Steve NaN 24351 Armada Dr NaN Dana Point CA 926291306 25 2007-08-16 20\n", + "27 Buckheit Bruce NaN 8904 KAREN DR NaN FAIRFAX VA 220312731 100 2007-09-19 20\n", + "28 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 2300 2007-08-14 20\n", + "29 Buckel Linda NaN PO Box 683130 NaN Park City UT 840683130 -2300 2007-08-14 20" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "obamacontrib=dfcwci.query(\"candidate_id==%i\" % obamaid)\n", + "obamacontrib.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlast_namefirst_namemiddle_namestreet_1street_2citystatezipamountdatecandidate_id
026BucklerSteveNone24351 Armada DrNoneDana PointCA926291306502007-07-3020
127BucklerSteveNone24351 Armada DrNoneDana PointCA926291306252007-08-1620
228BuckheitBruceNone8904 KAREN DRNoneFAIRFAXVA2203127311002007-09-1920
329BuckelLindaNonePO Box 683130NonePark CityUT84068313023002007-08-1420
430BuckelLindaNonePO Box 683130NonePark CityUT840683130-23002007-08-1420
\n", + "
" + ], + "text/plain": [ + " id last_name first_name middle_name street_1 street_2 city state zip amount date candidate_id\n", + "0 26 Buckler Steve None 24351 Armada Dr None Dana Point CA 926291306 50 2007-07-30 20\n", + "1 27 Buckler Steve None 24351 Armada Dr None Dana Point CA 926291306 25 2007-08-16 20\n", + "2 28 Buckheit Bruce None 8904 KAREN DR None FAIRFAX VA 220312731 100 2007-09-19 20\n", + "3 29 Buckel Linda None PO Box 683130 None Park City UT 840683130 2300 2007-08-14 20\n", + "4 30 Buckel Linda None PO Box 683130 None Park City UT 840683130 -2300 2007-08-14 20" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "russiandollsel=\"\"\"\n", + "SELECT * FROM contributors WHERE \n", + " candidate_id = (SELECT id from candidates WHERE last_name = 'Obama');\n", + "\"\"\"\n", + "out=make_query(russiandollsel)\n", + "make_frame(out).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###implicit join" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contributors.last_namecontributors.first_namecontributors.amountcandidates.last_name
0BucklerSteve50Obama
1BucklerSteve25Obama
2BuckheitBruce100Obama
3BuckelLinda2300Obama
4BuckelLinda-2300Obama
\n", + "
" + ], + "text/plain": [ + " contributors.last_name contributors.first_name contributors.amount candidates.last_name\n", + "0 Buckler Steve 50 Obama\n", + "1 Buckler Steve 25 Obama\n", + "2 Buckheit Bruce 100 Obama\n", + "3 Buckel Linda 2300 Obama\n", + "4 Buckel Linda -2300 Obama" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "implicitjoinsel=\"\"\"\n", + "SELECT \n", + " contributors.last_name, contributors.first_name, contributors.amount, candidates.last_name \n", + "FROM \n", + " contributors, candidates \n", + "WHERE contributors.candidate_id = candidates.id\n", + "AND candidates.last_name = 'Obama';\n", + "\"\"\"\n", + "out=make_query(implicitjoinsel)\n", + "make_frame(out, legend=[\"contributors.last_name\", \n", + " \"contributors.first_name\", \"contributors.amount\", \"candidates.last_name\"]).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's expand to not just include Obama" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contributors.last_namecontributors.first_namecontributors.amountcandidates.last_name
0AgeeSteven500Huckabee
1AkinCharles100Huckabee
2AkinMike1500Huckabee
3AkinRebecca500Huckabee
4AldridgeBrittni250Huckabee
\n", + "
" + ], + "text/plain": [ + " contributors.last_name contributors.first_name contributors.amount candidates.last_name\n", + "0 Agee Steven 500 Huckabee\n", + "1 Akin Charles 100 Huckabee\n", + "2 Akin Mike 1500 Huckabee\n", + "3 Akin Rebecca 500 Huckabee\n", + "4 Aldridge Brittni 250 Huckabee" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "implicitjoinsel=\"\"\"\n", + "SELECT \n", + " contributors.last_name, contributors.first_name, contributors.amount, candidates.last_name \n", + "FROM \n", + " contributors, candidates \n", + "WHERE contributors.candidate_id = candidates.id;\n", + "\"\"\"\n", + "out=make_query(implicitjoinsel)\n", + "make_frame(out, legend=[\"contributors.last_name\", \n", + " \"contributors.first_name\", \"contributors.amount\", \"candidates.last_name\"]).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Explicit INNER JOIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![inner join](http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_inner.png)\n", + "\n", + "(from http://pandas.pydata.org/pandas-docs/stable/merging.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_name_xfirst_name_xcandidate_ididlast_name_y
0AgeeSteven1616Huckabee
1AkinCharles1616Huckabee
2AkinMike1616Huckabee
3AkinRebecca1616Huckabee
4AldridgeBrittni1616Huckabee
5AllenJohn D.1616Huckabee
6AllenJohn D.1616Huckabee
7AllisonJohn W.1616Huckabee
8AllisonRebecca1616Huckabee
9AllisonRebecca1616Huckabee
10AltesR.D.1616Huckabee
11AndresDale1616Huckabee
12AnthonyJohn1616Huckabee
13ArbogastRobert1616Huckabee
14ArbogastRobert1616Huckabee
15ArdleWilliam1616Huckabee
16AtiqOmar1616Huckabee
17AtiqOmar1616Huckabee
18BakerDavid1616Huckabee
19BancroftDavid1616Huckabee
20BanksCharles1616Huckabee
21BarbeeJohn1616Huckabee
22BucklerSteve2020Obama
23BucklerSteve2020Obama
24BuckheitBruce2020Obama
25BuckelLinda2020Obama
26BuckelLinda2020Obama
27BuckelLinda2020Obama
28BuckThomas2020Obama
29BuckJay2020Obama
..................
142ABDELLATHOMAS3535Romney
143ABBOTTWELDON3535Romney
144ABBOTTWELDON3535Romney
145ABBOTTGERALD3535Romney
146ABBOTTGERALD3535Romney
147ABEDINZAINUL3737McCain
148ABBOTTSYBIL3737McCain
149ABBOTTSYBIL3737McCain
150ABBOTTRONALD3737McCain
151ABBOTTRONALD3737McCain
152ABBOTTROBERT3737McCain
153ABBOTTMIKE3737McCain
154ABBOTDAVID3737McCain
155ABBOPAULINE3737McCain
156ABATEMARIA3737McCain
157ABAIRPETER3737McCain
158ABACHERLISHIRLEY3737McCain
159AARONSCHARLES3737McCain
160AARONSCHARLES3737McCain
161AARONSCHARLES3737McCain
162ABELJOHN3737McCain
163ABELMARLING3737McCain
164ABELRUDOLPH3737McCain
165ABELERODNEY3737McCain
166ABERCROMBIEDENIS3737McCain
167ABESHAUSMERRILL3737McCain
168ABRAHAMGEORGE3737McCain
169ABRAHAMSONPETER3737McCain
170ABRAHAMSALEM3737McCain
171ABRAHAMSALEM3737McCain
\n", + "

172 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " last_name_x first_name_x candidate_id id last_name_y\n", + "0 Agee Steven 16 16 Huckabee\n", + "1 Akin Charles 16 16 Huckabee\n", + "2 Akin Mike 16 16 Huckabee\n", + "3 Akin Rebecca 16 16 Huckabee\n", + "4 Aldridge Brittni 16 16 Huckabee\n", + "5 Allen John D. 16 16 Huckabee\n", + "6 Allen John D. 16 16 Huckabee\n", + "7 Allison John W. 16 16 Huckabee\n", + "8 Allison Rebecca 16 16 Huckabee\n", + "9 Allison Rebecca 16 16 Huckabee\n", + "10 Altes R.D. 16 16 Huckabee\n", + "11 Andres Dale 16 16 Huckabee\n", + "12 Anthony John 16 16 Huckabee\n", + "13 Arbogast Robert 16 16 Huckabee\n", + "14 Arbogast Robert 16 16 Huckabee\n", + "15 Ardle William 16 16 Huckabee\n", + "16 Atiq Omar 16 16 Huckabee\n", + "17 Atiq Omar 16 16 Huckabee\n", + "18 Baker David 16 16 Huckabee\n", + "19 Bancroft David 16 16 Huckabee\n", + "20 Banks Charles 16 16 Huckabee\n", + "21 Barbee John 16 16 Huckabee\n", + "22 Buckler Steve 20 20 Obama\n", + "23 Buckler Steve 20 20 Obama\n", + "24 Buckheit Bruce 20 20 Obama\n", + "25 Buckel Linda 20 20 Obama\n", + "26 Buckel Linda 20 20 Obama\n", + "27 Buckel Linda 20 20 Obama\n", + "28 Buck Thomas 20 20 Obama\n", + "29 Buck Jay 20 20 Obama\n", + ".. ... ... ... .. 
...\n", + "142 ABDELLA THOMAS 35 35 Romney\n", + "143 ABBOTT WELDON 35 35 Romney\n", + "144 ABBOTT WELDON 35 35 Romney\n", + "145 ABBOTT GERALD 35 35 Romney\n", + "146 ABBOTT GERALD 35 35 Romney\n", + "147 ABEDIN ZAINUL 37 37 McCain\n", + "148 ABBOTT SYBIL 37 37 McCain\n", + "149 ABBOTT SYBIL 37 37 McCain\n", + "150 ABBOTT RONALD 37 37 McCain\n", + "151 ABBOTT RONALD 37 37 McCain\n", + "152 ABBOTT ROBERT 37 37 McCain\n", + "153 ABBOTT MIKE 37 37 McCain\n", + "154 ABBOT DAVID 37 37 McCain\n", + "155 ABBO PAULINE 37 37 McCain\n", + "156 ABATE MARIA 37 37 McCain\n", + "157 ABAIR PETER 37 37 McCain\n", + "158 ABACHERLI SHIRLEY 37 37 McCain\n", + "159 AARONS CHARLES 37 37 McCain\n", + "160 AARONS CHARLES 37 37 McCain\n", + "161 AARONS CHARLES 37 37 McCain\n", + "162 ABEL JOHN 37 37 McCain\n", + "163 ABEL MARLING 37 37 McCain\n", + "164 ABEL RUDOLPH 37 37 McCain\n", + "165 ABELE RODNEY 37 37 McCain\n", + "166 ABERCROMBIE DENIS 37 37 McCain\n", + "167 ABESHAUS MERRILL 37 37 McCain\n", + "168 ABRAHAM GEORGE 37 37 McCain\n", + "169 ABRAHAMSON PETER 37 37 McCain\n", + "170 ABRAHAM SALEM 37 37 McCain\n", + "171 ABRAHAM SALEM 37 37 McCain\n", + "\n", + "[172 rows x 5 columns]" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cols_wanted=['last_name_x', 'first_name_x', 'candidate_id', 'id', 'last_name_y']\n", + "dfcwci.merge(dfcand, left_on=\"candidate_id\", right_on=\"id\")[cols_wanted]" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contributors.last_namecontributors.first_namecandidates.last_name
0AgeeStevenHuckabee
1AkinCharlesHuckabee
2AkinMikeHuckabee
3AkinRebeccaHuckabee
4AldridgeBrittniHuckabee
\n", + "
" + ], + "text/plain": [ + " contributors.last_name contributors.first_name candidates.last_name\n", + "0 Agee Steven Huckabee\n", + "1 Akin Charles Huckabee\n", + "2 Akin Mike Huckabee\n", + "3 Akin Rebecca Huckabee\n", + "4 Aldridge Brittni Huckabee" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explicitjoinsel=\"\"\"\n", + "SELECT \n", + " contributors.last_name, contributors.first_name, candidates.last_name \n", + "FROM \n", + " contributors JOIN candidates \n", + "ON contributors.candidate_id = candidates.id;\n", + "\"\"\"\n", + "out=make_query(explicitjoinsel)\n", + "make_frame(out, legend=[\"contributors.last_name\", \n", + " \"contributors.first_name\", \"candidates.last_name\"]).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count(contributors.id)contributors.first_namecandidates.last_name
025LinClinton
125TIMOTHYGiuliani
222JohnHuckabee
325SALEMMcCain
425RyanObama
525RichardPaul
625GERALDRomney
\n", + "
" + ], + "text/plain": [ + " count(contributors.id) contributors.first_name candidates.last_name\n", + "0 25 Lin Clinton\n", + "1 25 TIMOTHY Giuliani\n", + "2 22 John Huckabee\n", + "3 25 SALEM McCain\n", + "4 25 Ryan Obama\n", + "5 25 Richard Paul\n", + "6 25 GERALD Romney" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explicitjoinsel=\"\"\"\n", + "SELECT \n", + " COUNT(contributors.id), contributors.first_name, candidates.last_name \n", + "FROM \n", + " contributors JOIN candidates \n", + "ON contributors.candidate_id = candidates.id\n", + "\n", + "GROUP BY candidates.last_name;\n", + "\"\"\"\n", + "out=make_query(explicitjoinsel)\n", + "make_frame(out, legend=[\"count(contributors.id)\", \n", + " \"contributors.first_name\", \"candidates.last_name\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "17" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(make_query(\"SELECT DISTINCT id, last_name FROM candidates;\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###Outer JOIN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####left outer (contributors on candidates)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![left outer](http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_left.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_name_xfirst_name_xcandidate_ididlast_name_y
0AgeeSteven1616Huckabee
1AkinCharles1616Huckabee
2AkinMike1616Huckabee
3AkinRebecca1616Huckabee
4AldridgeBrittni1616Huckabee
5AllenJohn D.1616Huckabee
6AllenJohn D.1616Huckabee
7AllisonJohn W.1616Huckabee
8AllisonRebecca1616Huckabee
9AllisonRebecca1616Huckabee
10AltesR.D.1616Huckabee
11AndresDale1616Huckabee
12AnthonyJohn1616Huckabee
13ArbogastRobert1616Huckabee
14ArbogastRobert1616Huckabee
15ArdleWilliam1616Huckabee
16AtiqOmar1616Huckabee
17AtiqOmar1616Huckabee
18BakerDavid1616Huckabee
19BancroftDavid1616Huckabee
20BanksCharles1616Huckabee
21BarbeeJohn1616Huckabee
22BucklerSteve2020Obama
23BucklerSteve2020Obama
24BuckheitBruce2020Obama
25BuckelLinda2020Obama
26BuckelLinda2020Obama
27BuckelLinda2020Obama
28BuckThomas2020Obama
29BuckJay2020Obama
..................
142ABDELLATHOMAS3535Romney
143ABBOTTWELDON3535Romney
144ABBOTTWELDON3535Romney
145ABBOTTGERALD3535Romney
146ABBOTTGERALD3535Romney
147ABEDINZAINUL3737McCain
148ABBOTTSYBIL3737McCain
149ABBOTTSYBIL3737McCain
150ABBOTTRONALD3737McCain
151ABBOTTRONALD3737McCain
152ABBOTTROBERT3737McCain
153ABBOTTMIKE3737McCain
154ABBOTDAVID3737McCain
155ABBOPAULINE3737McCain
156ABATEMARIA3737McCain
157ABAIRPETER3737McCain
158ABACHERLISHIRLEY3737McCain
159AARONSCHARLES3737McCain
160AARONSCHARLES3737McCain
161AARONSCHARLES3737McCain
162ABELJOHN3737McCain
163ABELMARLING3737McCain
164ABELRUDOLPH3737McCain
165ABELERODNEY3737McCain
166ABERCROMBIEDENIS3737McCain
167ABESHAUSMERRILL3737McCain
168ABRAHAMGEORGE3737McCain
169ABRAHAMSONPETER3737McCain
170ABRAHAMSALEM3737McCain
171ABRAHAMSALEM3737McCain
\n", + "

172 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " last_name_x first_name_x candidate_id id last_name_y\n", + "0 Agee Steven 16 16 Huckabee\n", + "1 Akin Charles 16 16 Huckabee\n", + "2 Akin Mike 16 16 Huckabee\n", + "3 Akin Rebecca 16 16 Huckabee\n", + "4 Aldridge Brittni 16 16 Huckabee\n", + "5 Allen John D. 16 16 Huckabee\n", + "6 Allen John D. 16 16 Huckabee\n", + "7 Allison John W. 16 16 Huckabee\n", + "8 Allison Rebecca 16 16 Huckabee\n", + "9 Allison Rebecca 16 16 Huckabee\n", + "10 Altes R.D. 16 16 Huckabee\n", + "11 Andres Dale 16 16 Huckabee\n", + "12 Anthony John 16 16 Huckabee\n", + "13 Arbogast Robert 16 16 Huckabee\n", + "14 Arbogast Robert 16 16 Huckabee\n", + "15 Ardle William 16 16 Huckabee\n", + "16 Atiq Omar 16 16 Huckabee\n", + "17 Atiq Omar 16 16 Huckabee\n", + "18 Baker David 16 16 Huckabee\n", + "19 Bancroft David 16 16 Huckabee\n", + "20 Banks Charles 16 16 Huckabee\n", + "21 Barbee John 16 16 Huckabee\n", + "22 Buckler Steve 20 20 Obama\n", + "23 Buckler Steve 20 20 Obama\n", + "24 Buckheit Bruce 20 20 Obama\n", + "25 Buckel Linda 20 20 Obama\n", + "26 Buckel Linda 20 20 Obama\n", + "27 Buckel Linda 20 20 Obama\n", + "28 Buck Thomas 20 20 Obama\n", + "29 Buck Jay 20 20 Obama\n", + ".. ... ... ... .. 
...\n", + "142 ABDELLA THOMAS 35 35 Romney\n", + "143 ABBOTT WELDON 35 35 Romney\n", + "144 ABBOTT WELDON 35 35 Romney\n", + "145 ABBOTT GERALD 35 35 Romney\n", + "146 ABBOTT GERALD 35 35 Romney\n", + "147 ABEDIN ZAINUL 37 37 McCain\n", + "148 ABBOTT SYBIL 37 37 McCain\n", + "149 ABBOTT SYBIL 37 37 McCain\n", + "150 ABBOTT RONALD 37 37 McCain\n", + "151 ABBOTT RONALD 37 37 McCain\n", + "152 ABBOTT ROBERT 37 37 McCain\n", + "153 ABBOTT MIKE 37 37 McCain\n", + "154 ABBOT DAVID 37 37 McCain\n", + "155 ABBO PAULINE 37 37 McCain\n", + "156 ABATE MARIA 37 37 McCain\n", + "157 ABAIR PETER 37 37 McCain\n", + "158 ABACHERLI SHIRLEY 37 37 McCain\n", + "159 AARONS CHARLES 37 37 McCain\n", + "160 AARONS CHARLES 37 37 McCain\n", + "161 AARONS CHARLES 37 37 McCain\n", + "162 ABEL JOHN 37 37 McCain\n", + "163 ABEL MARLING 37 37 McCain\n", + "164 ABEL RUDOLPH 37 37 McCain\n", + "165 ABELE RODNEY 37 37 McCain\n", + "166 ABERCROMBIE DENIS 37 37 McCain\n", + "167 ABESHAUS MERRILL 37 37 McCain\n", + "168 ABRAHAM GEORGE 37 37 McCain\n", + "169 ABRAHAMSON PETER 37 37 McCain\n", + "170 ABRAHAM SALEM 37 37 McCain\n", + "171 ABRAHAM SALEM 37 37 McCain\n", + "\n", + "[172 rows x 5 columns]" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.merge(dfcand, left_on=\"candidate_id\", right_on=\"id\", how=\"left\")[cols_wanted]" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count(contributors.id)contributors.first_namecontributors.candidate_idcandidates.idcandidates.last_name
025LinClinton3434
125TIMOTHYGiuliani2222
222JohnHuckabee1616
325SALEMMcCain3737
425RyanObama2020
525RichardPaul3232
625GERALDRomney3535
\n", + "
" + ], + "text/plain": [ + " count(contributors.id) contributors.first_name contributors.candidate_id candidates.id candidates.last_name\n", + "0 25 Lin Clinton 34 34\n", + "1 25 TIMOTHY Giuliani 22 22\n", + "2 22 John Huckabee 16 16\n", + "3 25 SALEM McCain 37 37\n", + "4 25 Ryan Obama 20 20\n", + "5 25 Richard Paul 32 32\n", + "6 25 GERALD Romney 35 35" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explicitjoinsel=\"\"\"\n", + "SELECT \n", + " COUNT(contributors.id), contributors.first_name, candidates.last_name,\n", + " contributors.candidate_id, candidates.id\n", + "FROM \n", + " contributors LEFT OUTER JOIN candidates \n", + "ON contributors.candidate_id = candidates.id\n", + "\n", + "GROUP BY candidates.last_name;\n", + "\"\"\"\n", + "out=make_query(explicitjoinsel)\n", + "make_frame(out, legend=[\"count(contributors.id)\", \"contributors.first_name\", \n", + " \"contributors.candidate_id\", \"candidates.id\", \"candidates.last_name\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####right outer (contributors on candidates) = left outer (candidates on contributors)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We do it thus as sqlite has no support for right outer or plain outer. If it did we could write:\n", + "\n", + "```sql\n", + "SELECT \n", + " COUNT(contributors.id), contributors.first_name, candidates.last_name \n", + "FROM \n", + " contributors RIGHT OUTER JOIN candidates \n", + "ON contributors.candidate_id = candidates.id\n", + "\n", + "GROUP BY candidates.last_name;\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![right outer](http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_right.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_name_xfirst_name_xcandidate_ididlast_name_y
0AgeeSteven1616Huckabee
1AkinCharles1616Huckabee
2AkinMike1616Huckabee
3AkinRebecca1616Huckabee
4AldridgeBrittni1616Huckabee
5AllenJohn D.1616Huckabee
6AllenJohn D.1616Huckabee
7AllisonJohn W.1616Huckabee
8AllisonRebecca1616Huckabee
9AllisonRebecca1616Huckabee
10AltesR.D.1616Huckabee
11AndresDale1616Huckabee
12AnthonyJohn1616Huckabee
13ArbogastRobert1616Huckabee
14ArbogastRobert1616Huckabee
15ArdleWilliam1616Huckabee
16AtiqOmar1616Huckabee
17AtiqOmar1616Huckabee
18BakerDavid1616Huckabee
19BancroftDavid1616Huckabee
20BanksCharles1616Huckabee
21BarbeeJohn1616Huckabee
22BucklerSteve2020Obama
23BucklerSteve2020Obama
24BuckheitBruce2020Obama
25BuckelLinda2020Obama
26BuckelLinda2020Obama
27BuckelLinda2020Obama
28BuckThomas2020Obama
29BuckJay2020Obama
..................
152ABBOTTROBERT3737McCain
153ABBOTTMIKE3737McCain
154ABBOTDAVID3737McCain
155ABBOPAULINE3737McCain
156ABATEMARIA3737McCain
157ABAIRPETER3737McCain
158ABACHERLISHIRLEY3737McCain
159AARONSCHARLES3737McCain
160AARONSCHARLES3737McCain
161AARONSCHARLES3737McCain
162ABELJOHN3737McCain
163ABELMARLING3737McCain
164ABELRUDOLPH3737McCain
165ABELERODNEY3737McCain
166ABERCROMBIEDENIS3737McCain
167ABESHAUSMERRILL3737McCain
168ABRAHAMGEORGE3737McCain
169ABRAHAMSONPETER3737McCain
170ABRAHAMSALEM3737McCain
171ABRAHAMSALEM3737McCain
172NaNNaNNaN33Biden
173NaNNaNNaN36Brownback
174NaNNaNNaN39Dodd
175NaNNaNNaN26Edwards
176NaNNaNNaN24Gravel
177NaNNaNNaN30Hunter
178NaNNaNNaN31Kucinich
179NaNNaNNaN29Richardson
180NaNNaNNaN38Tancredo
181NaNNaNNaN41Thompson
\n", + "

182 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " last_name_x first_name_x candidate_id id last_name_y\n", + "0 Agee Steven 16 16 Huckabee\n", + "1 Akin Charles 16 16 Huckabee\n", + "2 Akin Mike 16 16 Huckabee\n", + "3 Akin Rebecca 16 16 Huckabee\n", + "4 Aldridge Brittni 16 16 Huckabee\n", + "5 Allen John D. 16 16 Huckabee\n", + "6 Allen John D. 16 16 Huckabee\n", + "7 Allison John W. 16 16 Huckabee\n", + "8 Allison Rebecca 16 16 Huckabee\n", + "9 Allison Rebecca 16 16 Huckabee\n", + "10 Altes R.D. 16 16 Huckabee\n", + "11 Andres Dale 16 16 Huckabee\n", + "12 Anthony John 16 16 Huckabee\n", + "13 Arbogast Robert 16 16 Huckabee\n", + "14 Arbogast Robert 16 16 Huckabee\n", + "15 Ardle William 16 16 Huckabee\n", + "16 Atiq Omar 16 16 Huckabee\n", + "17 Atiq Omar 16 16 Huckabee\n", + "18 Baker David 16 16 Huckabee\n", + "19 Bancroft David 16 16 Huckabee\n", + "20 Banks Charles 16 16 Huckabee\n", + "21 Barbee John 16 16 Huckabee\n", + "22 Buckler Steve 20 20 Obama\n", + "23 Buckler Steve 20 20 Obama\n", + "24 Buckheit Bruce 20 20 Obama\n", + "25 Buckel Linda 20 20 Obama\n", + "26 Buckel Linda 20 20 Obama\n", + "27 Buckel Linda 20 20 Obama\n", + "28 Buck Thomas 20 20 Obama\n", + "29 Buck Jay 20 20 Obama\n", + ".. ... ... ... .. 
...\n", + "152 ABBOTT ROBERT 37 37 McCain\n", + "153 ABBOTT MIKE 37 37 McCain\n", + "154 ABBOT DAVID 37 37 McCain\n", + "155 ABBO PAULINE 37 37 McCain\n", + "156 ABATE MARIA 37 37 McCain\n", + "157 ABAIR PETER 37 37 McCain\n", + "158 ABACHERLI SHIRLEY 37 37 McCain\n", + "159 AARONS CHARLES 37 37 McCain\n", + "160 AARONS CHARLES 37 37 McCain\n", + "161 AARONS CHARLES 37 37 McCain\n", + "162 ABEL JOHN 37 37 McCain\n", + "163 ABEL MARLING 37 37 McCain\n", + "164 ABEL RUDOLPH 37 37 McCain\n", + "165 ABELE RODNEY 37 37 McCain\n", + "166 ABERCROMBIE DENIS 37 37 McCain\n", + "167 ABESHAUS MERRILL 37 37 McCain\n", + "168 ABRAHAM GEORGE 37 37 McCain\n", + "169 ABRAHAMSON PETER 37 37 McCain\n", + "170 ABRAHAM SALEM 37 37 McCain\n", + "171 ABRAHAM SALEM 37 37 McCain\n", + "172 NaN NaN NaN 33 Biden\n", + "173 NaN NaN NaN 36 Brownback\n", + "174 NaN NaN NaN 39 Dodd\n", + "175 NaN NaN NaN 26 Edwards\n", + "176 NaN NaN NaN 24 Gravel\n", + "177 NaN NaN NaN 30 Hunter\n", + "178 NaN NaN NaN 31 Kucinich\n", + "179 NaN NaN NaN 29 Richardson\n", + "180 NaN NaN NaN 38 Tancredo\n", + "181 NaN NaN NaN 41 Thompson\n", + "\n", + "[182 rows x 5 columns]" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.merge(dfcand, left_on=\"candidate_id\", right_on=\"id\", how=\"right\")[cols_wanted]" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count(contributors.id)contributors.first_namecontributors.candidate_idcandidates.idcandidates.last_name
00NoneBidenNaN33
10NoneBrownbackNaN36
225ThomasClinton3434
30NoneDoddNaN39
40NoneEdwardsNaN26
525WALTERGiuliani2222
60NoneGravelNaN24
722WilliamHuckabee1616
80NoneHunterNaN30
90NoneKucinichNaN31
1025ZAINULMcCain3737
1125ThomasObama2020
1225WilliamPaul3232
130NoneRichardsonNaN29
1425WELDONRomney3535
150NoneTancredoNaN38
160NoneThompsonNaN41
\n", + "
" + ], + "text/plain": [ + " count(contributors.id) contributors.first_name contributors.candidate_id candidates.id candidates.last_name\n", + "0 0 None Biden NaN 33\n", + "1 0 None Brownback NaN 36\n", + "2 25 Thomas Clinton 34 34\n", + "3 0 None Dodd NaN 39\n", + "4 0 None Edwards NaN 26\n", + "5 25 WALTER Giuliani 22 22\n", + "6 0 None Gravel NaN 24\n", + "7 22 William Huckabee 16 16\n", + "8 0 None Hunter NaN 30\n", + "9 0 None Kucinich NaN 31\n", + "10 25 ZAINUL McCain 37 37\n", + "11 25 Thomas Obama 20 20\n", + "12 25 William Paul 32 32\n", + "13 0 None Richardson NaN 29\n", + "14 25 WELDON Romney 35 35\n", + "15 0 None Tancredo NaN 38\n", + "16 0 None Thompson NaN 41" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explicitjoinsel=\"\"\"\n", + "SELECT \n", + " COUNT(contributors.id), contributors.first_name, candidates.last_name, \n", + " contributors.candidate_id, candidates.id\n", + "FROM \n", + " candidates LEFT OUTER JOIN contributors \n", + "ON contributors.candidate_id = candidates.id\n", + "\n", + "GROUP BY candidates.last_name;\n", + "\"\"\"\n", + "out=make_query(explicitjoinsel)\n", + "make_frame(out, legend=[\"count(contributors.id)\", \"contributors.first_name\", \n", + " \"contributors.candidate_id\", \"candidates.id\", \"candidates.last_name\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "####full outer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "also not supported by sqlite\n", + "\n", + "```sql\n", + "SELECT \n", + " COUNT(contributors.id), contributors.first_name, candidates.last_name \n", + "FROM \n", + " contributors FULL OUTER JOIN candidates \n", + "ON contributors.candidate_id = candidates.id\n", + "\n", + "GROUP BY candidates.last_name;\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"![outer](http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_outer.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_name_xfirst_name_xcandidate_ididlast_name_y
0AgeeSteven1616Huckabee
1AkinCharles1616Huckabee
2AkinMike1616Huckabee
3AkinRebecca1616Huckabee
4AldridgeBrittni1616Huckabee
5AllenJohn D.1616Huckabee
6AllenJohn D.1616Huckabee
7AllisonJohn W.1616Huckabee
8AllisonRebecca1616Huckabee
9AllisonRebecca1616Huckabee
10AltesR.D.1616Huckabee
11AndresDale1616Huckabee
12AnthonyJohn1616Huckabee
13ArbogastRobert1616Huckabee
14ArbogastRobert1616Huckabee
15ArdleWilliam1616Huckabee
16AtiqOmar1616Huckabee
17AtiqOmar1616Huckabee
18BakerDavid1616Huckabee
19BancroftDavid1616Huckabee
20BanksCharles1616Huckabee
21BarbeeJohn1616Huckabee
22BucklerSteve2020Obama
23BucklerSteve2020Obama
24BuckheitBruce2020Obama
25BuckelLinda2020Obama
26BuckelLinda2020Obama
27BuckelLinda2020Obama
28BuckThomas2020Obama
29BuckJay2020Obama
..................
152ABBOTTROBERT3737McCain
153ABBOTTMIKE3737McCain
154ABBOTDAVID3737McCain
155ABBOPAULINE3737McCain
156ABATEMARIA3737McCain
157ABAIRPETER3737McCain
158ABACHERLISHIRLEY3737McCain
159AARONSCHARLES3737McCain
160AARONSCHARLES3737McCain
161AARONSCHARLES3737McCain
162ABELJOHN3737McCain
163ABELMARLING3737McCain
164ABELRUDOLPH3737McCain
165ABELERODNEY3737McCain
166ABERCROMBIEDENIS3737McCain
167ABESHAUSMERRILL3737McCain
168ABRAHAMGEORGE3737McCain
169ABRAHAMSONPETER3737McCain
170ABRAHAMSALEM3737McCain
171ABRAHAMSALEM3737McCain
172NaNNaNNaN33Biden
173NaNNaNNaN36Brownback
174NaNNaNNaN39Dodd
175NaNNaNNaN26Edwards
176NaNNaNNaN24Gravel
177NaNNaNNaN30Hunter
178NaNNaNNaN31Kucinich
179NaNNaNNaN29Richardson
180NaNNaNNaN38Tancredo
181NaNNaNNaN41Thompson
\n", + "

182 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " last_name_x first_name_x candidate_id id last_name_y\n", + "0 Agee Steven 16 16 Huckabee\n", + "1 Akin Charles 16 16 Huckabee\n", + "2 Akin Mike 16 16 Huckabee\n", + "3 Akin Rebecca 16 16 Huckabee\n", + "4 Aldridge Brittni 16 16 Huckabee\n", + "5 Allen John D. 16 16 Huckabee\n", + "6 Allen John D. 16 16 Huckabee\n", + "7 Allison John W. 16 16 Huckabee\n", + "8 Allison Rebecca 16 16 Huckabee\n", + "9 Allison Rebecca 16 16 Huckabee\n", + "10 Altes R.D. 16 16 Huckabee\n", + "11 Andres Dale 16 16 Huckabee\n", + "12 Anthony John 16 16 Huckabee\n", + "13 Arbogast Robert 16 16 Huckabee\n", + "14 Arbogast Robert 16 16 Huckabee\n", + "15 Ardle William 16 16 Huckabee\n", + "16 Atiq Omar 16 16 Huckabee\n", + "17 Atiq Omar 16 16 Huckabee\n", + "18 Baker David 16 16 Huckabee\n", + "19 Bancroft David 16 16 Huckabee\n", + "20 Banks Charles 16 16 Huckabee\n", + "21 Barbee John 16 16 Huckabee\n", + "22 Buckler Steve 20 20 Obama\n", + "23 Buckler Steve 20 20 Obama\n", + "24 Buckheit Bruce 20 20 Obama\n", + "25 Buckel Linda 20 20 Obama\n", + "26 Buckel Linda 20 20 Obama\n", + "27 Buckel Linda 20 20 Obama\n", + "28 Buck Thomas 20 20 Obama\n", + "29 Buck Jay 20 20 Obama\n", + ".. ... ... ... .. 
...\n", + "152 ABBOTT ROBERT 37 37 McCain\n", + "153 ABBOTT MIKE 37 37 McCain\n", + "154 ABBOT DAVID 37 37 McCain\n", + "155 ABBO PAULINE 37 37 McCain\n", + "156 ABATE MARIA 37 37 McCain\n", + "157 ABAIR PETER 37 37 McCain\n", + "158 ABACHERLI SHIRLEY 37 37 McCain\n", + "159 AARONS CHARLES 37 37 McCain\n", + "160 AARONS CHARLES 37 37 McCain\n", + "161 AARONS CHARLES 37 37 McCain\n", + "162 ABEL JOHN 37 37 McCain\n", + "163 ABEL MARLING 37 37 McCain\n", + "164 ABEL RUDOLPH 37 37 McCain\n", + "165 ABELE RODNEY 37 37 McCain\n", + "166 ABERCROMBIE DENIS 37 37 McCain\n", + "167 ABESHAUS MERRILL 37 37 McCain\n", + "168 ABRAHAM GEORGE 37 37 McCain\n", + "169 ABRAHAMSON PETER 37 37 McCain\n", + "170 ABRAHAM SALEM 37 37 McCain\n", + "171 ABRAHAM SALEM 37 37 McCain\n", + "172 NaN NaN NaN 33 Biden\n", + "173 NaN NaN NaN 36 Brownback\n", + "174 NaN NaN NaN 39 Dodd\n", + "175 NaN NaN NaN 26 Edwards\n", + "176 NaN NaN NaN 24 Gravel\n", + "177 NaN NaN NaN 30 Hunter\n", + "178 NaN NaN NaN 31 Kucinich\n", + "179 NaN NaN NaN 29 Richardson\n", + "180 NaN NaN NaN 38 Tancredo\n", + "181 NaN NaN NaN 41 Thompson\n", + "\n", + "[182 rows x 5 columns]" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfcwci.merge(dfcand, left_on=\"candidate_id\", right_on=\"id\", how=\"outer\")[cols_wanted]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When to use which?\n", + "\n", + "See this:\n", + "\n", + "http://blog.codinghorror.com/a-visual-explanation-of-sql-joins/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Pandas /SQL" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_namemiddle_nameparty
020BarackObamaD
124MikeGravelD
226JohnEdwardsD
329BillRichardsonD
431DennisKucinichD
533JosephBidenD
634HillaryClintonR.D
739ChristopherDoddJ.D
\n", + "
" + ], + "text/plain": [ + " id first_name last_name middle_name party\n", + "0 20 Barack Obama D\n", + "1 24 Mike Gravel D\n", + "2 26 John Edwards D\n", + "3 29 Bill Richardson D\n", + "4 31 Dennis Kucinich D\n", + "5 33 Joseph Biden D\n", + "6 34 Hillary Clinton R. D\n", + "7 39 Christopher Dodd J. D" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_sql(\"SELECT * FROM candidates WHERE party= 'D';\", db)" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
last_namefirst_namelast_name
0AgeeStevenHuckabee
1AkinCharlesHuckabee
2AkinMikeHuckabee
3AkinRebeccaHuckabee
4AldridgeBrittniHuckabee
5AllenJohn D.Huckabee
6AllenJohn D.Huckabee
7AllisonJohn W.Huckabee
8AllisonRebeccaHuckabee
9AllisonRebeccaHuckabee
10AltesR.D.Huckabee
11AndresDaleHuckabee
12AnthonyJohnHuckabee
13ArbogastRobertHuckabee
14ArbogastRobertHuckabee
15ArdleWilliamHuckabee
16AtiqOmarHuckabee
17AtiqOmarHuckabee
18BakerDavidHuckabee
19BancroftDavidHuckabee
20BanksCharlesHuckabee
21BarbeeJohnHuckabee
22BucklerSteveObama
23BucklerSteveObama
24BuckheitBruceObama
25BuckelLindaObama
26BuckelLindaObama
27BuckelLindaObama
28BuckThomasObama
29BuckJayObama
............
142ABDELLATHOMASRomney
143ABBOTTWELDONRomney
144ABBOTTWELDONRomney
145ABBOTTGERALDRomney
146ABBOTTGERALDRomney
147ABEDINZAINULMcCain
148ABBOTTSYBILMcCain
149ABBOTTSYBILMcCain
150ABBOTTRONALDMcCain
151ABBOTTRONALDMcCain
152ABBOTTROBERTMcCain
153ABBOTTMIKEMcCain
154ABBOTDAVIDMcCain
155ABBOPAULINEMcCain
156ABATEMARIAMcCain
157ABAIRPETERMcCain
158ABACHERLISHIRLEYMcCain
159AARONSCHARLESMcCain
160AARONSCHARLESMcCain
161AARONSCHARLESMcCain
162ABELJOHNMcCain
163ABELMARLINGMcCain
164ABELRUDOLPHMcCain
165ABELERODNEYMcCain
166ABERCROMBIEDENISMcCain
167ABESHAUSMERRILLMcCain
168ABRAHAMGEORGEMcCain
169ABRAHAMSONPETERMcCain
170ABRAHAMSALEMMcCain
171ABRAHAMSALEMMcCain
\n", + "

172 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " last_name first_name last_name\n", + "0 Agee Steven Huckabee\n", + "1 Akin Charles Huckabee\n", + "2 Akin Mike Huckabee\n", + "3 Akin Rebecca Huckabee\n", + "4 Aldridge Brittni Huckabee\n", + "5 Allen John D. Huckabee\n", + "6 Allen John D. Huckabee\n", + "7 Allison John W. Huckabee\n", + "8 Allison Rebecca Huckabee\n", + "9 Allison Rebecca Huckabee\n", + "10 Altes R.D. Huckabee\n", + "11 Andres Dale Huckabee\n", + "12 Anthony John Huckabee\n", + "13 Arbogast Robert Huckabee\n", + "14 Arbogast Robert Huckabee\n", + "15 Ardle William Huckabee\n", + "16 Atiq Omar Huckabee\n", + "17 Atiq Omar Huckabee\n", + "18 Baker David Huckabee\n", + "19 Bancroft David Huckabee\n", + "20 Banks Charles Huckabee\n", + "21 Barbee John Huckabee\n", + "22 Buckler Steve Obama\n", + "23 Buckler Steve Obama\n", + "24 Buckheit Bruce Obama\n", + "25 Buckel Linda Obama\n", + "26 Buckel Linda Obama\n", + "27 Buckel Linda Obama\n", + "28 Buck Thomas Obama\n", + "29 Buck Jay Obama\n", + ".. ... ... 
...\n", + "142 ABDELLA THOMAS Romney\n", + "143 ABBOTT WELDON Romney\n", + "144 ABBOTT WELDON Romney\n", + "145 ABBOTT GERALD Romney\n", + "146 ABBOTT GERALD Romney\n", + "147 ABEDIN ZAINUL McCain\n", + "148 ABBOTT SYBIL McCain\n", + "149 ABBOTT SYBIL McCain\n", + "150 ABBOTT RONALD McCain\n", + "151 ABBOTT RONALD McCain\n", + "152 ABBOTT ROBERT McCain\n", + "153 ABBOTT MIKE McCain\n", + "154 ABBOT DAVID McCain\n", + "155 ABBO PAULINE McCain\n", + "156 ABATE MARIA McCain\n", + "157 ABAIR PETER McCain\n", + "158 ABACHERLI SHIRLEY McCain\n", + "159 AARONS CHARLES McCain\n", + "160 AARONS CHARLES McCain\n", + "161 AARONS CHARLES McCain\n", + "162 ABEL JOHN McCain\n", + "163 ABEL MARLING McCain\n", + "164 ABEL RUDOLPH McCain\n", + "165 ABELE RODNEY McCain\n", + "166 ABERCROMBIE DENIS McCain\n", + "167 ABESHAUS MERRILL McCain\n", + "168 ABRAHAM GEORGE McCain\n", + "169 ABRAHAMSON PETER McCain\n", + "170 ABRAHAM SALEM McCain\n", + "171 ABRAHAM SALEM McCain\n", + "\n", + "[172 rows x 3 columns]" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_sql(implicitjoinsel, db)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is very useful if the database is big and out of memory. Sqlite3 is the only db2api database supported. For any other database you should use `SQLAlchemy`. 
See, for eg: https://plot.ly/ipython-notebooks/big-data-analytics-with-pandas-and-sqlite/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "db.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##Useful Links" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- http://sebastianraschka.com/Articles/sqlite3_database.html and http://sebastianraschka.com/Articles/2014_sqlite_in_python_tutorial.html#unique_indexes\n", + "- https://github.com/tthibo/SQL-Tutorial\n", + "- chrisalbon.com\n", + "\n", + "And especially for R users:\n", + "\n", + "- https://cran.r-project.org/web/packages/dplyr/vignettes/introduction.html\n", + "- https://gist.github.com/TomAugspurger/6e052140eaa5fdb6e8c0/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Lectures/Lecture4/cancont.db b/Lectures/Lecture4/cancont.db new file mode 100644 index 0000000000000000000000000000000000000000..f5fda102d9e394b60f65be33270ee0e312bb3ef9 GIT binary patch literal 23552 zcmeHvZE#!JdEN!?u69?ultf8fn#;&K6eDwKk`Xh&)-%aW)%>yBNz)nIZQ@BYZ8B-o zIPs58+ce3#cAxj0djXJI%ey4q>2wwnAqv2G&ewb1kLP_4+?R^Aj?eBiTiafT>C$6T zBqF`Ym?TNh;b#;-;o~%Zc`W=7{*4U${=I&EPMZBMj}75nIdV>tzZm@n{`r6X@u}?N zPmEr^9QkyiR!9^O2gasFR>NV$>vQb zc;im3^*!F%u2rjb{~a5(J1yVuyrLci3tra_zP?fG>>Yfq-SIjPjpp^*&cV-l+s)lZ zr#I=dVhS#Xp=$%&vSE;uWjOH&sI%hjpVZKZ&_xxb1Y35jSmSdH( zmdko?w4bk4r%z6IoyVRSotTLH<)+xhu(x*qgV6&!*!#%$-@kB}?E7xa_uR`z?AVUi z8rXuKbex=Cl243Yxe{69Q)<6j$I*SI?Z3L~H#YnO|9x!Vr{{@zJrN)1_Lc<21J${+=^(0WVDq 
zozB$ib*Hwm>9wlV{uD#&k)hL0v$5pu)}6EGPe@~<_?Xx7Hn!|_Z`;caUlwQ!&CNz> zBkR=~&d5c444vluMx)kV+})@(fSEh@LA;E9bGr>FlsuH4-*V2Kp>fk-w*36=#+JA4 z`_AwO!Vzxy^DXb5U(Y@J>~r|&_|R$U-m2cGf#*Ij zbwZjP89treY}MMG=FXUp6Pv!-?gd{b7vl;?QZ= z+urr-nO5_DV;xg;&Q6FKFErc!&Ss`o^&7b%`G+Jq`j!-ZOa2Afk@eunWwpMN~~!9VXm-d}-ZSb;~Le+lP*qdR*_niw5UO5?*9 zBNLIUX2vn|X34e7OIF2l7GkQRD{M8!9NW#>7IQ0(WmU5ILe`wO3q?aQ5=l){=toJ- zD(bA3xZKB>VH^cBF{JU);R)%&@WtpvB>aF`v`S`dDQ3r5#=gdqS+i{B3TC--Etyu- zO84ri!B*4nu321Bl;q>*!|~`!IG!2iQiqjp?9j0Ce+^;PZ_9ro{}cJY!Snw``48mZlYdA4E%_hGUz2}B z{)+rr`PbxMmEW2i9wkj!uh-yHIsUr8vEk{OVla1id%MB6{VNfW^stF2n~uAgaP ztsAwJlENr=YMo}Cxv%beEuSsby$v76V|CSQZ`K-j;Tz_%*aa;v{du?* zPvH5a6t`dZ}q$QzMw zM5m*ljQ*j#Ab%cj_8-e5V^VrCatiSlj>;$(!x8hY@4FrUp5I8s(-4#0mS1O?7Q(Yl zzC&~M=3aHh)Z$874dum{)s-_J-)(GFqNjQrgyoogH|x>gp-_w=e#RNmbcSJjUAd2n zZGVrbgWEL-`q)uG2Zd<>^!npiOwv0-+^3Ex@6>#T1qY^l0@(fYI6-f|_X8;)sa@p> zKGslA@Ub3BQ3%MO>E@HC2DRE%vwlo!Ed-i@Fmes~%2THXb^J6;*ddhsK9$TNW>{b; zUO9Oj9qh9r`#W%-MgTA_g_dqMFv9Q6)CG-#QLn~^qF+Uje?^jiTKT{C)dmXbmMLd{^AsG-ySz@b4Gp;>wGb`bPVcNCywQu3k*+z^6UxkCB6SpdnN zR1>TmV_6XIj#*^pid9~=+~oqOd`;Eni&i;@cnelQg`c3RMx3iH5hSV@JR!|6fpxR{ zEOj)j!o*yFu0cmMH_-iNF*g^}Q)!m5%GO+=5=5X&j=fUI5o7HXw6vmY>3C>qQnPAe zGF%&haI-sdZ1Cxa;WxYIIs6OKG~du*^s81eXP4brGMQwS>0;>iawX3yR%r=|4i>i% zk5*XGT(nrWP+429q~fZAr8Clj?!;Q9dovMeo85EA+f|@Z)3e&at{}8eOXFk1S!w)w z_biSK|CLX0#LVQZV$rPR?UL!1VsTAZS;;Oh^DNA&SuCgn+5eN$Pf7Uuc4RH`3z0vIE<}Ga`p2>@ z|GfOg{R-d_|6A*xzty3!O4D$ni7dHddM#pFkvrR{d2uzx%BxIIz-@Lr_Ey~kt?RKY zQcd<^62~kB4_f0<0!FFLYQ`ngJAOor2&)^6>E>>Kz&N{y88As9AFrxmy!+70_key5 z{_q%IjtTzuLTy`9lPNam)mn%}F#i@}3%E3;LOg{vfVNCVWr3yfJAK+!krBDL%d|=1}*SY?h&zDWw?p-QC8kyR}Zq($h+kUYQF=r}fZ8D|_#7 zfPrbyX8Y3(0AD{GU~f|tJ)Exq^gaQ=al`{cLZ|tZLI1BK1E3*>gJf*f?$kDzdCzB) z%y7L1n8^u~9l}<9p z+uLq7s?>pt$fE5GL+h0{`oLnl-V<HI$> z{jwzea^#7KANkkOndqC*|14Yb=cG^JPw=?jg{L}7d@Bqpx+fCRT3)nP3guEv)s+NH zzXP+z$b7;yl&vMpbntHuoMs=&HCdP(!z!QCmW6N4(P7W5M%yI>5-iGMGye7jA5bq==`v~C_Ug;t<8ygw^XfTJ% zW;Sa(xtwK|v$0f4)xm{l3T05_%qirsNRDmhil)m}ta;NVUrkkMtF$mogGJIO1qk9Q 
zz^rr;uZ@gC#b4o409#UV^H$keTZ*xe4A;_W6gCtw5w?Pb!fp_BJfS7=PGGeKG8)e? z-uzYMoZ`mNQp?uQFHEsgXAg^WsIw>iZ3rh0A9M@(OE|p2 z;Tw|v1oofD|9>YUM{Y-cE&8G88`0lG4ZzM;G`%T0xmE!WZm3OiK;=^ir<2=Le@n~8F8pOo0QxUq&Qh5wm|j4b)BmJhw$DK>X&Z9k!)vlggK&bKj2 z^$HR&?``kEbi+l|4a0a~p8V*}&;I7gfj+sHcpc!QFbJ22M=+6Dr}?_yxZ~HWmD=`h zE0JV*zqRhSI*jNnr@6jfYqZ5w7E4nAS5-+*{bp4MrpNVhC{ zuQyu*+2T@C0dcldYoI=eDuI{>{?gm=Tf7^}g(Zx{|NKt_J#Yjf5_%cu<(qsvrUU&o zUGz5_ev4A`UfYdnSaNJOm^q>eSG{WcKJlY=J|5Rn>UUEE(4rwm&YN!BBKMpV!52uF zmg*pp{g%j2C&2e*>fV;mD8bE~Y8ks7ta&-FYY7AJ@dqhdpM47ypiqcQdG5-~H1^Xl z7JPlC9dGyUW@l-$R0QrgV3k5eEpOpuN7^y1Th#stHX|-~Q&n&_z4AkG$pD zD>QVhIwT0Ieq|0q12$aBQ1T%`aXWP8@@Y}S3hVS$r|NG^bSt~FS(!OWcw!(a`QzXF4sB70=S zQx9g}{NR^f`lB}ox(cwcxdGacd;|7>N|HV!{aM6~{L|=JaDU&DrsU7dcjYfg9}XOV z`sSb#fPyAxw{6wo#Q5-BSq9Y(tkA~E#cbiIPeOPzOdzg%Sb=dI8u0OSXyAy^I)YZDB}~T^w$ucq+hLP&{b4UnNk2&al`o!u z{de;NGY>F`58^1-*3ri({NJlWEap9Ikl)I*b~i!?IIg4;T!UejEZ^MrnK_p!aOOeEro(3L<2MP&kDh6NB?evgt9sdc{9y<`ujt*{lGNbj!C=Alm-(1v2e zHJ%{h?EI>kCdViNS$ue#DF_Srzx~GLc;t!#YT-ju5IHDU^}EL}#r=3FaiBNs;}X2o zz;H&6d}Q1j4_~H9FTe*EXVBfOAw9B(^a%7dbkH6d#JqRUYk2UBG#oS)(RDAz>IZE> zLlOVGlH^KXh@6e=M!p_RM*mjyzsqYl{eJ@%^3jiSm0b6pFmh44mV^8gCnnta%NZ25 zX5fz(txPPgrv)cMv=Xn$C|j#+&2|=Hx{;fL+Dhu7bn2i)o~70}SiJp)1<@QEri2cy zz-(rD(Oz{I3puk4M{psALj*^Wtvc`%d-&#(<&Y<8aZ?wE^3{!>5+V|GAVA>bau{!p ztKTq$yP8=pxNw?LyNejdstgo1M|3&Sp zhV_&R&Qm!s1K_~55lBuEx@Hvo4?;fd@szuRV_kWDQnht0n@=@;rLu$#hf)AU+6)1w}{bPvaLTaIy%e`q5+RrLPv>p_e-!LodXk z2;xcYjRKkze<89*oJ)YS@&Z3RQ&f1z4-aOG?8$Y6*;XkA;#_6ciixeq_Tzd4ls>rv zsK!ef$>jge^$tAWibV_CEZW|F5x66gdRZ8aNg6+Ri9!+75m^jfE9ZiJFqdEqIAfoN zN*J&ns$nQ<{7-Q%$^K(AdYur^4HG@UbV@28IZ4u@g+vE3H7FO+`6Ag5fbqdklUN4S;Y8eI?AUTAa&DV&PK-z>)9kLzryxLhKWClX{x zGJ$-7w+;!?;^<;vd-=GjJ^Un&a4maOlN!W#51CZK|9w@GzKY!crz8J9dNul2qyJI< zsQfeV2oCr70cff3r%W->bwt@JRQhP8%p-38I~4DT<2CNW8|rMDjb?KP$2G}T$$dp! z#%p=9RAAEVbbJwaQ^p60xxl<~2btpdn>IFh9P!@8VWJ#o%X_#!)7&mmeYsyZ)4It? 
zeXzR}m(Yb9vqiKBar>x#NXEbOln_gf7_^2*AljFgu)iZPNho5%m?uh5+~r!sTWD%& zq$?}URud6hV-H10@ag+;UflSN0aR)=!>HZrOBn4acQ6QV8Zmhl#V%aq7m=BQG`c$| z)F4}d3Lz9~Z6ZeVTSP^t747#2K!c`%xwz&)9gm|Tc3OnJ6!2`dX+3M+b~YAd4)nB| z#gH2wL$5@+@H3<(zWeb3V9^vli)dVNxou0+5@3NZFYbb4$a(i_RW+V~U&n*Uw2oa9 z%Ukpt{%hW%YZ#iE_|97cz`P|6Cbhj;;jYk?z)!gfDvIX0;A7(0@=}agX{-P4 zFpU~nC4^CMwR)T@-?gAImLh2GEt60{>?XNNnO&|SLP!0SxOAZEX@iv!=Iw6zkU;EQ zk=P8xd{}F8MinB%`qq#45kT~!v6okfIX*p2CLhb9TJ0^r0m7)Y2eJsf(UPj94F!Ri z$R^P$ZG%F>9EqBSD*W!mz|!6vwd{WTP`OAZ9h`&U+NX#+@axENRJ}lcj1;04Wyx!8@eCnlm=*Eb z@SkEfoX1q-=S4%F9;tfEj8vqghrSWLDOe_WR0gXMKeGX0=Vx=#YqY&K2QS$M@;WL- zsXRHMB{*(KQQevV>^UTpgfgA1nlsJw>px%wDgx zcPJ@GE=K~xe(^_togV*2EDU-{Q7;624=jc(x$rQq-z;2QUDCvQ(=Sb(thJh~X^WfiAI7ap%s)ev%0 zYF$fXR}o7zH^BRl+>jQ7w18C7!;B76BmC3=pO%HlKwN!et@#8!?2(ACZ!Q_c3KQI-AYL9w`n@RavpwxEr$m+?UQb;r;Hk;Pj!KB~;1p!Kd=7oYQxalH8<{bc7Q9CD2ri8Lcp4w7YQ{_GknlJAxLn{br*I z&oJlJeWXa?(xE<;+1^%Lgelzr&G~Kg1n;0-FyRy%Mpy%FptS3Z03gq|?jQHgiH#RF z!$;xUg)!+PK@c!&w$__>y*BROl1Hjy&Ty&e3j?qoyLe9Ec8DcX!6+5yVG(kJtUU$p_(l1#_TUIQmGz;a5PBYUFfQKs~ME zD=ffHt|A)%mq$b7sRY?36&6RdLxz~Hp!!_|B%`CF9|@|>=uMQ+ny3e|z+Rd-CMqja zq5^ax3XX%?rc%Ky6_|@J3n|aLJP2J;0D4%2r=l7vXp9v> z^PAlt0`eHfE28qCsu{&k*g6vxDh3P}7DjlLIV-yu3oBl57gVQ0XA$bh>|1aPUFi!oiM9)S2=)aI(mcAhWW9hf0 zPvg(P;~Hj;lsV6g(aaHsY}Nf1boxDCLtUiGOni$_)q}yx!SOEnV{$@9x)|aokhAF#JMLo|0J;*)kYDo<6sbVAWAi zTgX9+Wo*ah773{}D>jeZOO83_QXz-xUl?v^YAq2T$g%3r77}0=&kf%+Ao^4|CMvn7 zGHaIOn#(yD49!TSfSE&9*Iq)Y6g{QW2mqyr9HJS= zuJxy&9`hVRIBvlCqWmDZl~tW6!^(l1HH$j?WtcRbgReDQV{1l;l6h(x9=dGNa8?JgL^W9Wje4hy=U?uzWY zbcreIgI&CfYn`{dpz@AyR1~5d9M$bE(T_*JZ5m8GHLI%!yQp<>!F+;37F=kBQOB2R z*~=M|ySb)oIy_58my$|_`DLpp7#&=qW@(eF9;9 zyn7OTos;w_QSXDxvyNG^R|}Qvi&hz%)4QLIVxL76b@q#XbQRaj2CIJ3$f2e9tx3M4 zDorc|qgyWJm%F(8KXAcJboR&_W+=F{+?*Mt=rA2EjKqQ)$LIk5VsJ_AMi*aWFhL1A z5J*_L+5P~^1;<2w8Ek+9Z}+O73u3zGxFsKecQX$ZUSSqIL}57JIhu|=6u;M;zl)Y66%S#f!ijb=@gV<9|3_eVXSwN z4TR@sh294`DO!%5D_d(iDgs#vWxNGc!osB`0?rM&x8UWXi2_Z~S16O|_rzHyz(5^y zAgql-Y3r%L4WM1aH_**nHY&e*-XS85xVR~Qql=8(z~{LTKB|FWKxr(v8BI7cZg~g_ 
zaPV8)M2B~}lu8^QP6wz+|oqhFn#X<+;j_HsGAeVcMccy+=@9j zXI`f?1_~iG^|f9%21HH3K-2}$^)7PJRAB&J6wr-U(OJ&f#U&8pNZQ(2kW-@Gcapw# z35VTuZFGUUUoEbwxSbIAD4YXZ%^Y|(Njft-X z2