# DatStream.py -- forked from qalhata/Python-Scripts-Repo-on-Data-Science
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 30 21:27:31 2017
@author: Shabaka
"""
import matplotlib.pyplot as plt
import pandas as pd
# Tally how often each first-column value occurs in the first 1000 data
# rows of the file (the header line is skipped and not counted).

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Skip the column names
    file.readline()

    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Process only the first 1000 rows
    for _ in range(1000):

        raw_line = file.readline()

        # readline() returns '' at end of file; stop there so a file with
        # fewer than 1000 rows does not add a bogus '' key to the counts
        if not raw_line:
            break

        # Split the current line into a list: line
        # NOTE: a plain split(',') does not honour CSV quoting, so a quoted
        # first field that itself contains a comma would be cut short.
        line = raw_line.split(',')

        # Get the value for the first column: first_col
        first_col = line[0]

        # Increment the count, starting from 0 for values not seen before
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

# Print the resulting dictionary
print(counts_dict)
# ''''''''''''''''' Write Generator to Load Data Chunks ''''''' #
# Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily.

    Args:
        file_object: An open file-like object that provides ``readline()``.

    Yields:
        Each line exactly as ``readline()`` returns it (any trailing newline
        is kept), in file order. Iteration ends the first time
        ``readline()`` returns an empty value, which marks end of file.
    """
    # Loop indefinitely until the end of the file
    while True:

        # Read a line from the file: data
        data = file_object.readline()

        # Break if this is the end of the file. A blank line inside the
        # file is still '\n' (truthy), so only true EOF stops the loop.
        if not data:
            break

        # Yield the line of data
        yield data
# Preview the start of the file by pulling lines from the lazy generator.

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)

    # Print the first three lines of the file. next() is given a default so
    # a file shorter than three lines ends the preview quietly instead of
    # raising StopIteration.
    for _ in range(3):
        preview_line = next(gen_file, None)
        if preview_line is None:
            break
        print(preview_line)
# ''''''''''''''' Load Data in Chunks with Generator ''''''''''' #
# Count occurrences of each first-column value across the whole file,
# streaming it one line at a time through read_large_file().

# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file
with open('world_dev_ind.csv') as file:

    # Iterate over the generator from read_large_file()
    # NOTE(review): the header line is not skipped here, so its first field
    # is counted as well -- confirm whether that is intended.
    for line in read_large_file(file):

        row = line.split(',')
        first_col = row[0]

        # Increment the count, starting from 0 for values not seen before
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

# Print
print(counts_dict)
# ''''' Iterator to load data in chunks ''''''''''' #
# pandas is already imported at the top of the file as pd
# Build a chunked reader over the CSV (chunksize=10): df_reader
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)

# Pull the first two chunks from the reader and display each one
for _ in range(2):
    print(next(df_reader))
# ''''''''''''' Iterator to Load Data in Chunks '''''''''''#
# Take the first 1000-row chunk of the population data, derive the urban
# head-count for country code 'CEB', and plot it against the year.

# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Get the first dataframe chunk: df_urb_pop
df_urb_pop = next(urb_pop_reader)

# Check out the head of the dataframe
print(df_urb_pop.head())

# Check out specific country: df_pop_ceb
# .copy() makes this an independent frame, so adding a column below does
# not write into a slice of df_urb_pop (avoids SettingWithCopyWarning).
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

# Zip dataframe columns of interest: pops
pops = zip(df_pop_ceb['Total Population'],
           df_pop_ceb['Urban population (% of total)'])

# Turn zip object into list: pops_list
pops_list = list(pops)

# Print pops_list
print(pops_list)

# Use list comp to create new dataframe column 'Total Urban Population'.
# The urban column is a percentage of the total, so divide by 100 to turn
# the product into a head-count.
df_pop_ceb['Total Urban Population'] = [
    int(total * pct_urban / 100) for total, pct_urban in pops_list
]

# Plot urban population data
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()