-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbabynames-mb.py
123 lines (98 loc) · 3.75 KB
/
babynames-mb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python
# Copyright 2010 Google Inc.
# Licensed under the Apache License, Version 2.0
# http://www.apache.org/licenses/LICENSE-2.0
# Google's Python Class
# http://code.google.com/edu/languages/google-python-class/
"""Baby Names exercise
Define the extract_names() function below and change main()
to call it.
For writing regex, it's nice to include a copy of the target
text for inspiration.
Here's what the html looks like in the baby.html files:
...
<h3 align="center">Popularity in 1990</h3>
....
<tr align="right"><td>1</td><td>Michael</td><td>Jessica</td>
<tr align="right"><td>2</td><td>Christopher</td><td>Ashley</td>
<tr align="right"><td>3</td><td>Matthew</td><td>Brittany</td>
...
Suggested milestones for incremental development:
-Extract the year and print it
-Extract the names and rank numbers and just print them
-Get the names data into a dict and print it
-Build the [year, 'name rank', ... ] list and print it
-Fix main() to use the extract_names list
"""
# Point to local directory containing html file
import sys, os
# NOTE(review): this runs at import time and hard-codes one developer's
# directory. os.chdir raises OSError when the path does not exist, and any
# relative filename given on the command line is resolved against this
# directory afterwards -- confirm whether this should remain in the script.
os.chdir('/Users/michaelboles/Michael/Coding/2019/Google-Python/babynames_html')
def extract_names(filename):
    """Extract the year and the ranked baby names from a baby.html file.

    Args:
        filename: Path of the baby.html file to read.

    Returns:
        A list starting with the year string followed by the 'name rank'
        strings in alphabetical order, e.g.
        ['2006', 'Aaliyah 91', 'Aaron 57', 'Abagail 895', ...].

    If no "Popularity in yyyy" heading is found, a message is written to
    stderr and the process exits with status 1 (SystemExit).
    """
    import re

    # Read the whole page; the context manager closes the handle even if
    # read() raises.
    with open(filename) as html_file:
        text = html_file.read()

    # The year appears in the heading as "Popularity in yyyy".
    year_match = re.search(r'Popularity\sin\s(\d\d\d\d)', text)
    if not year_match:
        sys.stderr.write('Couldn\'t find the year\n')
        sys.exit(1)
    year = year_match.group(1)

    # Each table row holds: <td>rank</td><td>boy name</td><td>girl name</td>
    rank_names = re.findall(r'<td>(\d+)</td><td>(\w+)</td><td>(\w+)</td>', text)

    # Map name -> rank. A name that shows up more than once (e.g. in both
    # the boy and girl columns) keeps the rank from its first occurrence.
    name_rank_dict = {}
    for rank, boy_name, girl_name in rank_names:
        for name in (boy_name, girl_name):
            name_rank_dict.setdefault(name, int(rank))

    # Year first, then the 'name rank' entries sorted alphabetically by name.
    names = [year]
    names.extend(
        '{} {}'.format(name, name_rank_dict[name])
        for name in sorted(name_rank_dict)
    )
    return names
#names = extract_names('baby1990.html')
# command line parsing code
def main():
    """Command-line entry point.

    Usage: [--summaryfile] file [file ...]

    For every filename given, calls extract_names() and joins the result
    with newlines. Without --summaryfile the text is printed; with it, the
    text is written to '<filename>_summary.txt' instead.

    Prints the usage line and exits with status 1 when no filename is
    supplied (including when --summaryfile is the only argument).
    """
    # Command-line arguments without the script name itself.
    args = sys.argv[1:]

    # Notice the summary flag and drop it from the list of filenames.
    summary = bool(args) and args[0] == '--summaryfile'
    if summary:
        args = args[1:]

    # At least one html filename is required, with or without the flag.
    if not args:
        print('usage: [--summaryfile] file [file ...]')
        sys.exit(1)

    for filename in args:
        names = extract_names(filename)
        text = '\n'.join(names)
        if summary:
            # The context manager closes the file even if the write fails.
            with open(filename + '_summary.txt', 'w') as output_summary:
                output_summary.write(text + '\n')
        else:
            print(text)


if __name__ == '__main__':
    main()