-
Notifications
You must be signed in to change notification settings - Fork 0
/
playerscsv.py
45 lines (36 loc) · 1.3 KB
/
playerscsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# This script builds nflplayers.csv from the alphabetical player HTML pages
# (playersA.html ... playersZ.html) stored in the players/ subdirectory.
from bs4 import BeautifulSoup
import csv
import string
import time
import re
# Get string of uppercase alphabet
alphabet_upper = string.ascii_uppercase
LOCAL_FILE = "players/"
# Collects every distinct position string seen while scanning the pages.
POSITION_SET = set()
# First parenthesised group in a player's entry text; compiled once here
# instead of being looked up again for every player.
_POSITION_PATTERN = re.compile(r"\(([^)]+)\)")

# Creates csv file. newline='' is what the csv module asks for so the writer
# controls line endings itself (otherwise Windows output gets blank rows), and
# the with-block guarantees the file is flushed and closed even on error.
with open('nflplayers.csv', 'w', newline='') as csv_file:
    playerscsv = csv.writer(csv_file)
    playerscsv.writerow(['Name', 'Position', 'link', 'Active'])

    # For every letter of the alphabet, goes through downloaded
    # players[A-Z].html pages and extracts player info to csv.
    for letter in alphabet_upper:
        page_path = LOCAL_FILE + "players" + letter + ".html"
        # BeautifulSoup consumes the whole file while constructing the tree,
        # so the handle can be released as soon as parsing is done.
        with open(page_path, "r") as f:
            soup = BeautifulSoup(f, "lxml")

        section = soup.find(id='div_players')
        # Each <p> inside the players section is handled as one player entry.
        for player in section.find_all('p'):
            link = "pro-football-reference.com" + player.a["href"]
            name = player.a.get_text().strip()

            # An entry with no "(...)" group has no position listed; test for
            # that explicitly rather than catching every possible exception.
            match = _POSITION_PATTERN.search(player.get_text())
            if match is None:
                position = "Unassigned"
            else:
                position = match.group().strip('()')
            POSITION_SET.add(position)

            # The presence of a <b> tag in the entry is what marks the player
            # as active in the output.
            active = "yes" if player.b is not None else "no"

            playerscsv.writerow([name, position, link, active])