Skip to content

Commit 7a776af

Browse files
author
Justin Littman
committed
Added tweeters_by_date
1 parent 782f4b3 commit 7a776af

File tree

1 file changed

+263
-0
lines changed

1 file changed

+263
-0
lines changed
Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"toc": true
7+
},
8+
"source": [
9+
"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n",
10+
"<div class=\"toc\" style=\"margin-top: 1em;\"><ul class=\"toc-item\"></ul></div>"
11+
]
12+
},
13+
{
14+
"cell_type": "markdown",
15+
"metadata": {},
16+
"source": [
17+
"From a set of tweets, create a dataframe indexed by user id, with one column per day holding the number of tweets that user posted on that day.\n",
18+
"\n",
19+
"Before running, set the paths of the input files (line-oriented JSON, one tweet per line) and of the output CSV file."
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 1,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"import pandas as pd\n",
29+
"import numpy as np\n",
30+
"import json\n",
31+
"from dateutil.parser import parse as date_parse\n",
32+
"\n",
33+
"df = pd.DataFrame()\n",
34+
"\n",
35+
"# Optional limit on the number of tweets to load; set to None (or 0) to load all tweets.\n",
36+
"limit = 10000\n",
37+
"\n",
38+
"count = 0\n",
39+
"\n",
40+
"# Change to the paths of the line-oriented JSON files to load.\n",
41+
"for filepath in ('/Users/justinlittman/Downloads/tweets-001.jsonl',):\n",
42+
" with open(filepath) as file:\n",
43+
" for line in file:\n",
44+
" if limit and limit <= count:\n",
45+
" break\n",
46+
" tweet = json.loads(line.rstrip('\\n'))\n",
47+
" user_id = tweet['user']['id_str']\n",
48+
" created_at_day = date_parse(tweet['created_at']).strftime(\"%Y-%m-%d\")\n",
49+
" try:\n",
50+
" # Get row\n",
51+
" row = df.loc[user_id]\n",
52+
" # Set column\n",
53+
" row[created_at_day] = row[created_at_day] + 1 if pd.notna(row[created_at_day]) else 1\n",
54+
" except KeyError:\n",
55+
" # Add row\n",
56+
" df = df.append(pd.DataFrame([{created_at_day: 1}], index=[user_id]))\n",
57+
" count += 1\n",
58+
" \n",
59+
"df = df.fillna(0)\n"
60+
]
61+
},
62+
{
63+
"cell_type": "code",
64+
"execution_count": 2,
65+
"metadata": {},
66+
"outputs": [
67+
{
68+
"data": {
69+
"text/html": [
70+
"<div>\n",
71+
"<style scoped>\n",
72+
" .dataframe tbody tr th:only-of-type {\n",
73+
" vertical-align: middle;\n",
74+
" }\n",
75+
"\n",
76+
" .dataframe tbody tr th {\n",
77+
" vertical-align: top;\n",
78+
" }\n",
79+
"\n",
80+
" .dataframe thead th {\n",
81+
" text-align: right;\n",
82+
" }\n",
83+
"</style>\n",
84+
"<table border=\"1\" class=\"dataframe\">\n",
85+
" <thead>\n",
86+
" <tr style=\"text-align: right;\">\n",
87+
" <th></th>\n",
88+
" <th>2017-08-26</th>\n",
89+
" </tr>\n",
90+
" </thead>\n",
91+
" <tbody>\n",
92+
" <tr>\n",
93+
" <th>count</th>\n",
94+
" <td>8812.000000</td>\n",
95+
" </tr>\n",
96+
" <tr>\n",
97+
" <th>mean</th>\n",
98+
" <td>1.134816</td>\n",
99+
" </tr>\n",
100+
" <tr>\n",
101+
" <th>std</th>\n",
102+
" <td>0.604887</td>\n",
103+
" </tr>\n",
104+
" <tr>\n",
105+
" <th>min</th>\n",
106+
" <td>1.000000</td>\n",
107+
" </tr>\n",
108+
" <tr>\n",
109+
" <th>25%</th>\n",
110+
" <td>1.000000</td>\n",
111+
" </tr>\n",
112+
" <tr>\n",
113+
" <th>50%</th>\n",
114+
" <td>1.000000</td>\n",
115+
" </tr>\n",
116+
" <tr>\n",
117+
" <th>75%</th>\n",
118+
" <td>1.000000</td>\n",
119+
" </tr>\n",
120+
" <tr>\n",
121+
" <th>max</th>\n",
122+
" <td>19.000000</td>\n",
123+
" </tr>\n",
124+
" </tbody>\n",
125+
"</table>\n",
126+
"</div>"
127+
],
128+
"text/plain": [
129+
" 2017-08-26\n",
130+
"count 8812.000000\n",
131+
"mean 1.134816\n",
132+
"std 0.604887\n",
133+
"min 1.000000\n",
134+
"25% 1.000000\n",
135+
"50% 1.000000\n",
136+
"75% 1.000000\n",
137+
"max 19.000000"
138+
]
139+
},
140+
"execution_count": 2,
141+
"metadata": {},
142+
"output_type": "execute_result"
143+
}
144+
],
145+
"source": [
146+
"df.describe()"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 3,
152+
"metadata": {},
153+
"outputs": [
154+
{
155+
"data": {
156+
"text/html": [
157+
"<div>\n",
158+
"<style scoped>\n",
159+
" .dataframe tbody tr th:only-of-type {\n",
160+
" vertical-align: middle;\n",
161+
" }\n",
162+
"\n",
163+
" .dataframe tbody tr th {\n",
164+
" vertical-align: top;\n",
165+
" }\n",
166+
"\n",
167+
" .dataframe thead th {\n",
168+
" text-align: right;\n",
169+
" }\n",
170+
"</style>\n",
171+
"<table border=\"1\" class=\"dataframe\">\n",
172+
" <thead>\n",
173+
" <tr style=\"text-align: right;\">\n",
174+
" <th></th>\n",
175+
" <th>2017-08-26</th>\n",
176+
" </tr>\n",
177+
" </thead>\n",
178+
" <tbody>\n",
179+
" <tr>\n",
180+
" <th>2804679657</th>\n",
181+
" <td>1</td>\n",
182+
" </tr>\n",
183+
" <tr>\n",
184+
" <th>35943534</th>\n",
185+
" <td>1</td>\n",
186+
" </tr>\n",
187+
" <tr>\n",
188+
" <th>458065150</th>\n",
189+
" <td>1</td>\n",
190+
" </tr>\n",
191+
" <tr>\n",
192+
" <th>30017277</th>\n",
193+
" <td>1</td>\n",
194+
" </tr>\n",
195+
" <tr>\n",
196+
" <th>827910186031599618</th>\n",
197+
" <td>1</td>\n",
198+
" </tr>\n",
199+
" </tbody>\n",
200+
"</table>\n",
201+
"</div>"
202+
],
203+
"text/plain": [
204+
" 2017-08-26\n",
205+
"2804679657 1\n",
206+
"35943534 1\n",
207+
"458065150 1\n",
208+
"30017277 1\n",
209+
"827910186031599618 1"
210+
]
211+
},
212+
"execution_count": 3,
213+
"metadata": {},
214+
"output_type": "execute_result"
215+
}
216+
],
217+
"source": [
218+
"df.head()"
219+
]
220+
},
221+
{
222+
"cell_type": "code",
223+
"execution_count": 4,
224+
"metadata": {},
225+
"outputs": [],
226+
"source": [
227+
"# Change to the path of the destination CSV file.\n",
228+
"df.to_csv('/Users/justinlittman/Downloads/tweeters_by_date.csv')"
229+
]
230+
}
231+
],
232+
"metadata": {
233+
"kernelspec": {
234+
"display_name": "Python 3",
235+
"language": "python",
236+
"name": "python3"
237+
},
238+
"language_info": {
239+
"codemirror_mode": {
240+
"name": "ipython",
241+
"version": 3
242+
},
243+
"file_extension": ".py",
244+
"mimetype": "text/x-python",
245+
"name": "python",
246+
"nbconvert_exporter": "python",
247+
"pygments_lexer": "ipython3",
248+
"version": "3.6.3"
249+
},
250+
"toc": {
251+
"nav_menu": {},
252+
"number_sections": true,
253+
"sideBar": true,
254+
"skip_h1_title": false,
255+
"toc_cell": true,
256+
"toc_position": {},
257+
"toc_section_display": "block",
258+
"toc_window_display": true
259+
}
260+
},
261+
"nbformat": 4,
262+
"nbformat_minor": 2
263+
}

0 commit comments

Comments
 (0)