# stats_puller.py
# A series of functions to pull the needed files from the Smogon stats repo automatically.
# These are pulled when the GUI runs a refresh task.
# Apparently Windows Defender hates multi-line strings, so the module docstring is written as comments.
import os
import re
import sys
from typing import Optional

import pandas as pd
import requests

BASE_PATH = "https://www.smogon.com/stats/"

def resource_path(relative_path):
    """
    Get the absolute path to a resource; works both for dev and for PyInstaller.
    Because this file lives within data/, the data/ in these resource paths can be ignored.
    """
    base_path = getattr(sys, "_MEIPASS", os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(base_path, relative_path)
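
# For illustration: in a dev checkout this resolves against the directory
# containing this file, e.g. resource_path("data/Smogon_Stats/chaos") ->
# "<this file's dir>/data/Smogon_Stats/chaos", while in a PyInstaller build it
# resolves inside the sys._MEIPASS extraction directory instead.
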

def read_stats_page():
    """
    Parses the Smogon stats index page to list all available subfolders.
    Parsing depends on the structure of the stats page and may need to be updated if that page is reformatted.
    :return: DataFrame with columns "Link" and "Upload Date"
    """
    stats_page = pd.read_fwf(BASE_PATH, skiprows=3)
    stats_page.columns = [
        "Link",
        "Upload Date",
        "Unnamed: 1",
        "Unnamed: 2",
        "Unnamed: 3",
    ]
    stats_page = stats_page[["Link", "Upload Date"]]
    stats_page["Upload Date"] = pd.to_datetime(stats_page["Upload Date"])
    stats_page = stats_page.dropna()
    return stats_page
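
# Illustrative result (values hypothetical): each "Link" cell holds the raw HTML
# anchor of a month folder as scraped from the index page, e.g.
# '<a href="2024-06/">2024-06/</a>', alongside its parsed "Upload Date" timestamp.
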

def determine_available_formats(
    stats_page,
    months_back: int = 12,
    chaos_options: Optional[pd.DataFrame] = None,
    save_to_pickle: bool = False,
):
    """
    Check what formats are available in the recent monthly uploads.
    If no chaos_options are provided, it checks the last months_back months (12 by default).
    If chaos_options are provided, it checks only the months uploaded since the newest entry in chaos_options.
    :param stats_page: DataFrame with columns "Link" and "Upload Date"
    :param months_back: Number of months to check back for available formats
    :param chaos_options: DataFrame with columns "Link", "Upload Date", "Size", "Date Link", "Generation", "ELO Floor", "Tier"
    :param save_to_pickle: Save the DataFrame to a pickle file
    :return: Tuple of (updated chaos_options DataFrame, number of new months checked)
    """
    if chaos_options is None:
        more_recent_months = months_back
        chaos_options = pd.DataFrame()
        check_range = range(-1 * more_recent_months, 0)
    else:
        # Add a 1-day buffer b/c the stats page includes the time of day and chaos_options doesn't
        more_recent_months = (
            stats_page["Upload Date"]
            > chaos_options["Upload Date"].max() + pd.Timedelta(days=1)
        ).sum()
        if more_recent_months == 0:
            return chaos_options, more_recent_months
        check_range = range(-1 * more_recent_months, 0)
    for recent_run_index in check_range:
        mr_link = stats_page.iloc[recent_run_index]["Link"]
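        # Each "Link" cell is raw anchor markup from the index page, e.g. the
        # hypothetical '<a href="2024-06/">2024-06/</a>'; splitting on "<" and ">"
        # leaves the folder name ("2024-06/") at index 2.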
        mr_link_path = re.split(r"[<>]", mr_link)[2]
        chaos_subset = pd.read_csv(
            BASE_PATH + mr_link_path + "chaos/", skiprows=4, names=["Text"]
        )
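        # The chaos/ directory listing is read as raw lines and tokenized on HTML
        # brackets and whitespace below; positions 3, 5, 6, and 7 are assumed to
        # hold the file link, upload date, upload time, and size in the current
        # layout of that listing.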
        chaos_subset = pd.DataFrame(
            chaos_subset["Text"].apply(lambda x: re.split(r"[<>\s]+", x)).tolist()
        ).dropna()
        chaos_subset = chaos_subset.rename(
            columns={3: "Link", 5: "Upload Date", 6: "Upload Time", 7: "Size"}
        )[["Link", "Upload Date", "Size"]]
        chaos_subset["Date Link"] = mr_link_path
        # Match the literal ".gz" suffix rather than treating the dot as a regex wildcard
        chaos_subset = chaos_subset[chaos_subset["Link"].str.contains(".gz", regex=False)]
        # .gz files weren't available until June 2024. Skip if no .gz files are present
        if len(chaos_subset) == 0:
            continue
        # TODO: This will break when Generation 10 is added.
        # How it breaks will depend on how the naming is handled (i.e. Gen01 through Gen10 or Gen1 through Gen10).
        # A regex over all numeric values might be better, though it would need to handle the VGC formats, which include a year.
        chaos_subset["Generation"] = chaos_subset["Link"].apply(lambda x: int(x[3]))
        chaos_subset = chaos_subset[chaos_subset["Generation"] <= 4]
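        # File names follow the pattern "gen<N><tier>-<elo>.json.gz" (e.g. the
        # hypothetical "gen1ou-1500.json.gz"): splitting on "-" and "." puts the
        # ELO floor at index 1, and slicing the 4-character "gen<N>" prefix off
        # index 0 leaves the tier name.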
chaos_subset["ELO Floor"] = chaos_subset["Link"].apply(
lambda x: re.split(r"[-.]", x)[1]
)
chaos_subset["Tier"] = chaos_subset["Link"].apply(
lambda x: re.split(r"[-.]", x)[0][4:]
)
chaos_subset["Upload Date"] = pd.to_datetime(chaos_subset["Upload Date"])
chaos_subset["Size"] = chaos_subset["Size"].astype(float)
chaos_options = pd.concat([chaos_options, chaos_subset])
    # Remove outdated information from formats that have multiple entries
    chaos_options = chaos_options.sort_values(
        by=["Upload Date", "Generation", "Tier", "ELO Floor"],
        ascending=True,
        ignore_index=True,
    )
    chaos_options = chaos_options.drop_duplicates(
        subset=["Generation", "ELO Floor", "Tier"], keep="last"
    )
    if save_to_pickle:
        chaos_options.to_pickle(
            resource_path("data/Smogon_Stats/available_formats.pkl.gz")
        )
    return chaos_options, more_recent_months
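
# A sketch of the incremental refresh flow (see the runnable demo at the bottom
# of this file): a cached pickle can be loaded and passed back in so only the
# months newer than its latest entry are fetched, e.g.
#   cached = pd.read_pickle(resource_path("data/Smogon_Stats/available_formats.pkl.gz"))
#   formats, n_new = determine_available_formats(read_stats_page(), chaos_options=cached)
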

def download_files(options: pd.DataFrame, generation, tier, elo_floor):
    """
    Among all the available formats, select the one requested and download its chaos .json.gz and leads .txt.gz files
    :param options: DataFrame with columns "Link", "Upload Date", "Size", "Date Link", "Generation", "ELO Floor", "Tier"
    :param generation: Generation number to match against the "Generation" column
    :param tier: Tier name to match against the "Tier" column
    :param elo_floor: ELO floor to match against the "ELO Floor" column
    """
    options_subset = options[
        (options["Generation"] == generation)
        & (options["Tier"] == tier)
        & (options["ELO Floor"] == elo_floor)
    ].iloc[0]
    download_chaos(options_subset["Date Link"], options_subset["Link"])
    download_leads(options_subset["Date Link"], options_subset["Link"])

def download_chaos(date_link: str, link: str):
    """Download the chaos .json.gz file for a given month/format"""
    url = BASE_PATH + date_link + "chaos/" + link
    local_filename = resource_path(f"data/Smogon_Stats/chaos/{link}")
    os.makedirs(os.path.dirname(local_filename), exist_ok=True)
    # Time out rather than hanging forever, and fail loudly on HTTP errors
    # instead of writing an error page to disk
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(local_filename, "wb") as f:
        f.write(response.content)

def download_leads(date_link: str, link: str):
    """Download the leads .txt.gz file for a given month/format"""
    # Compute the replacement once; nesting double quotes inside a double-quoted
    # f-string is a SyntaxError before Python 3.12
    leads_link = link.replace("json", "txt")
    url = BASE_PATH + date_link + "leads/" + leads_link
    local_filename = resource_path(f"data/Smogon_Stats/leads/{leads_link}")
    os.makedirs(os.path.dirname(local_filename), exist_ok=True)
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(local_filename, "wb") as f:
        f.write(response.content)

def clear_downloaded_files(folders=("chaos", "leads")):
    """Clear all downloaded files"""
    for folder in folders:
        try:
            for file in os.listdir(resource_path(f"data/Smogon_Stats/{folder}")):
                os.remove(resource_path(f"data/Smogon_Stats/{folder}/{file}"))
        except FileNotFoundError:
            pass
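

# A minimal end-to-end sketch of how the GUI's refresh task might chain these
# helpers. The format values below (generation 1, tier "ou", ELO floor "1500")
# are purely illustrative and assume that combination exists in the table; real
# callers pass whatever the user selected.
if __name__ == "__main__":
    page = read_stats_page()
    formats, new_months = determine_available_formats(page, save_to_pickle=True)
    print(f"Found {len(formats)} formats ({new_months} new month(s) checked)")
    if not formats.empty:
        download_files(formats, generation=1, tier="ou", elo_floor="1500")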