export sprint sessions' data to .pkl
also modify other parsing scripts a bit to accommodate the field name changes in 985afd6
harningle committed Sep 12, 2024
1 parent 985afd6 commit bcaf915
Showing 7 changed files with 219 additions and 14 deletions.
parse_pit_stop_summary.py: 2 changes (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ def to_json(df: pd.DataFrame) -> list[dict]:
lambda x: SessionEntry(
year=year,
round=round_no,
type=session_type,
session=session_type,
car_number=x
)
)
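The only change here is the `SessionEntry` keyword rename (`type=` → `session=`) that follows the field renamed in 985afd6. A minimal sketch of the new call shape, with purely illustrative values (the `'R'` session code and the 2023 Brazil numbers are assumptions, mirroring the hard-coding used in the other scripts):

    from models.foreign_key import SessionEntry

    entry = SessionEntry(year=2023, round=20, session='R', car_number=1)  # keyword was type= before 985afd6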
parse_quali_laps.py: 10 changes (5 additions, 5 deletions)
@@ -6,7 +6,7 @@
import pandas as pd

from models.foreign_key import SessionEntry
from models.quali_lap import Lap, LapData
from models.lap import QualiLap, LapData


def parse_quali_final_classification(file: str | os.PathLike) -> pd.DataFrame:
@@ -291,11 +291,11 @@ def to_json(df: pd.DataFrame):
for q in [1, 2, 3]:
temp = df[df['Q'] == q]
temp['lap'] = temp.apply(
lambda x: Lap(
lambda x: QualiLap(
number=x['lap_no'],
time=x['lap_time'],
is_deleted=x['lap_time_deleted'],
is_fastest_lap=x['is_fastest_lap']
is_entry_fastest_lap=x['is_fastest_lap']
),
axis=1
)
@@ -304,7 +304,7 @@ def to_json(df: pd.DataFrame):
lambda x: SessionEntry(
year=year,
round=round_no,
type=f'Q{q}',
session=f'Q{q}',
car_number=x
)
)
@@ -316,7 +316,7 @@ def to_json(df: pd.DataFrame):
# Should add a smoke test here: 20-ish cars in Q1, 15 in Q2, 10 in Q3
with open('quali_lap_times.pkl', 'wb') as f:
pickle.dump(lap_data, f)
pass
return lap_data


if __name__ == '__main__':
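With the model moved to `models.lap` and renamed, a single qualifying-lap record built by the loop above now looks roughly like this (illustrative values; the lap time is assumed to be a `datetime.timedelta`, matching what the sprint scripts below construct):

    import datetime
    from models.lap import QualiLap

    lap = QualiLap(
        number=12,
        time=datetime.timedelta(minutes=1, seconds=10, milliseconds=727),
        is_deleted=False,
        is_entry_fastest_lap=True   # renamed from is_fastest_lap
    )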
parse_race_fastest_laps.py: 2 changes (1 addition, 1 deletion)
@@ -84,7 +84,7 @@ def to_json(df: pd.DataFrame) -> list[dict]:
foreign_keys=SessionEntry(
year=year,
round=round_no,
type=session_type,
session=session_type,
car_number=x['driver_no']
),
objects=FastestLap(
parse_race_history_chart.py: 9 changes (6 additions, 3 deletions)
@@ -132,6 +132,8 @@ def to_timedelta(s: str) -> datetime.timedelta:
"""
Convert a time string to a timedelta object, e.g. "1:32.190" -->
datetime.timedelta(seconds=92, microseconds=190000)
# TODO: move this to `utils.py`?
"""
# Parse by ":" and "."
n_colons = s.count(':')
@@ -174,14 +176,15 @@ def to_json(df: pd.DataFrame) -> list[dict]:
df['time'] = df['time'].apply(to_timedelta)

# Convert to json
df['lap'] = df.apply(lambda x: Lap(lap_number=x['lap'], position=x['position'], time=x['time']),
axis=1)
df['lap'] = df.apply(
lambda x: Lap(number=x['lap'], position=x['position'], time=x['time']), axis=1
)
df = df.groupby('driver_no')[['lap']].agg(list).reset_index()
df['session_entry'] = df['driver_no'].map(
lambda x: SessionEntry(
year=year,
round=round_no,
type=session_type,
session=session_type,
car_number=x
)
)
parse_sprint_laps.py: 163 changes (162 additions, 1 deletion)
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
import datetime
import os
import pickle
import re
import warnings

import fitz
import pandas as pd

from models.foreign_key import SessionEntry
from models.quali_lap import Lap, LapData
from models.lap import Lap, LapData


def parse_sprint_lap_analysis_page(page: fitz.Page) -> pd.DataFrame:
@@ -54,7 +57,10 @@ def parse_sprint_lap_analysis_page(page: fitz.Page) -> pd.DataFrame:
for j in driver_tabs:
j['driver'] = name
j['car_no'] = car_no
assert (j.Col1 == '').all() # TODO: check this
del j['Col1']
j.rename(columns={'LAP': 'lap', 'TIME': 'time'}, inplace=True)
j.lap = j.lap.astype(int)
df.append(j)
driver_tabs = []
h = tab.header.bbox[1] # Top of the table
@@ -66,6 +72,10 @@ def parse_sprint_lap_analysis_page(page: fitz.Page) -> pd.DataFrame:
for j in driver_tabs:
j['driver'] = name
j['car_no'] = car_no
assert (j.Col1 == '').all() # TODO: check this
del j['Col1']
j.rename(columns={'LAP': 'lap', 'TIME': 'time'}, inplace=True)
j.lap = j.lap.astype(int)
df.append(j)

return pd.concat(df, ignore_index=True)
@@ -78,3 +88,154 @@ def parse_sprint_lap_analysis(file: str | os.PathLike) -> pd.DataFrame:
for page in doc:
df.append(parse_sprint_lap_analysis_page(page))
return pd.concat(df, ignore_index=True)


def parse_sprint_history_chart_page(page: fitz.Page) -> pd.DataFrame:
"""
Get the table(s) from a given page in the "Sprint History Chart" PDF. There are multiple tables on
a page, each of which corresponds to a lap No. We concatenate all tables into one single dataframe.
See `notebook/demo.ipynb` for the detailed explanation of the table structure.
:param page: A `fitz.Page` object
:return: A dataframe of [driver No., lap No., gap to leader, lap time]
TODO: probably use better type hint using pandera later
TODO: merge this with race lap parsing script
"""

# Get the position of "Lap x"
t = page.search_for('Sprint History Chart')[0].y1
b = page.search_for('TIME')[0].y1
headers = page.search_for('Lap', clip=(0, t, W, b))

# Iterate through the tables for each lap
tables = []
for i, lap in enumerate(headers):
"""
The left boundary of the table is the leftmost of the "Lap x" text, and the right boundary
is the leftmost of the next "Lap x" text. If it's the last lap, i.e. no next table, then
the right boundary can be determined by left boundary plus table width, which is roughly
one-fifth of the page width. We add 5% extra buffer to the right boundary
"""
l = lap.x0
r = headers[i + 1].x0 if i + 1 < len(headers) else (l + W / 5) * 1.05
temp = page.find_tables(clip=fitz.Rect(l, t, r, H),
strategy='lines',
add_lines=[((l, 0), (l, H))])[0].to_pandas()

# Three columns: "LAP x", "GAP", "TIME". "LAP x" is the column for driver No. So add a new
# column for lap No. with value "x", and rename the columns
lap_no = int(temp.columns[0].split(' ')[1])
temp.columns = ['driver_no', 'gap', 'time']
temp['lap'] = lap_no
temp = temp[temp['driver_no'] != ''] # Sometimes we will get one additional empty row

# The row order/index is meaningful: it's the order/positions of the cars
# TODO: is this true for all cases? E.g. retirements?
temp.reset_index(drop=False, names=['position'], inplace=True)
temp['position'] += 1 # 1-indexed
tables.append(temp)
return pd.concat(tables, ignore_index=True)
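

# Illustration only (not part of the parser): the boundary arithmetic used in the loop
# above, with made-up numbers. Each table is bounded on the left by its own "Lap x"
# header and on the right by the next header; the last table gets roughly the left
# boundary plus one-fifth of the page width, with a 5% buffer.
def _demo_boundaries(header_x0s: list[float], page_width: float) -> list[tuple[float, float]]:
    bounds = []
    for i, l in enumerate(header_x0s):
        r = header_x0s[i + 1] if i + 1 < len(header_x0s) else (l + page_width / 5) * 1.05
        bounds.append((l, r))
    return bounds

# _demo_boundaries([30, 150, 270], 600) --> [(30, 150), (150, 270), (270, 409.5)]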


def parse_sprint_history_chart(file: str | os.PathLike[str]) -> pd.DataFrame:
"""
Parse "Sprint History Chart" PDF
:param file: Path to PDF file
:return: The output dataframe will be [driver No., lap No., gap to leader, lap time]
"""
# Get page width and height
doc = fitz.open(file)
page = doc[0]
global W, H
W = page.bound()[2]
H = page.bound()[3]

# Parse all pages
df = pd.concat([parse_sprint_history_chart_page(page) for page in doc], ignore_index=True)

# Clean up
# TODO: check notes in `parse_race_history_chart.py`
df['lap'] = df['lap'] - df['gap'].apply(
lambda x: int(re.findall(r'\d+', x)[0]) if 'LAP' in x else 0
)
df.reset_index(drop=False, inplace=True)
df.sort_values(by=['driver_no', 'lap', 'index'], inplace=True)
df.loc[(df['driver_no'] == df['driver_no'].shift(-1)) & (df['lap'] == df['lap'].shift(-1)),
'lap'] -= 1
df.loc[(df['driver_no'] == df['driver_no'].shift(1)) & (df['lap'] == df['lap'].shift(1) + 2),
'lap'] -= 1
del df['index']
return df


def to_timedelta(s: str) -> datetime.timedelta:
"""
Convert a time string to a timedelta object, e.g. "1:32.190" -->
datetime.timedelta(seconds=92, microseconds=190000)
# TODO: move this to `utils.py`?
"""
# Parse by ":" and "."
n_colons = s.count(':')
h, m, sec, ms = 0, 0, 0, 0
match n_colons:
case 1: # "1:32.190"
m, sec = s.split(':')
sec, ms = sec.split('.')
case 2: # "1:32:19.190"
warnings.warn(f'''got an unusual time: {s}. Assuming it's "hh:mm:ss.ms"''')
h, m, sec = s.split(':')
sec, ms = sec.split('.')
case 0: # "19.190"
warnings.warn(f'''got an unusual time: {s}. Assuming it's "ss.ms"''')
sec, ms = s.split('.')
case _: # Weird case
raise ValueError(f'''got an unexpected time: {s}''')

# Check if the time is valid
assert 0 <= int(h) < 24, f'''hour should be in [0, 24), got {h} in {s}'''
assert 0 <= int(m) < 60, f'''minute should be in [0, 60), got {m} in {s}'''
assert 0 <= int(sec) < 60, f'''second should be in [0, 60), got {sec} in {s}'''
assert 0 <= int(ms) < 1000, f'''millisecond should be in [0, 1000), got {ms} in {s}'''

t = datetime.timedelta(hours=int(h), minutes=int(m), seconds=int(sec), milliseconds=int(ms))
if t == datetime.timedelta(0):
raise ValueError(f'''got an invalid time: {s}''')
return t
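
# Illustrative expectations for the helper above (the first value is the docstring
# example; the others are made up; the "hh:mm:ss.ms" and bare "ss.ms" branches also
# emit a warning before parsing):
#   to_timedelta('1:32.190')    == datetime.timedelta(minutes=1, seconds=32, milliseconds=190)
#   to_timedelta('1:32:19.190') == datetime.timedelta(hours=1, minutes=32, seconds=19, milliseconds=190)
#   to_timedelta('19.190')      == datetime.timedelta(seconds=19, milliseconds=190)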


def to_json(df: pd.DataFrame) -> list[dict]:
"""Convert the parsed lap time df. to a json obj. See jolpica/jolpica-f1#7"""

# Hard code 2023 Brazil for now
year = 2023
round_no = 20
session_type = 'SR'

# Convert lap time strings to timedelta
df['time'] = df['time'].apply(to_timedelta)

# Convert to json
df['lap'] = df.apply(
lambda x: Lap(number=x['lap'], position=x['position'], time=x['time']), axis=1
)
df = df.groupby('driver_no')[['lap']].agg(list).reset_index()
df['session_entry'] = df['driver_no'].map(
lambda x: SessionEntry(
year=year,
round=round_no,
session=session_type,
car_number=x
)
)
del df['driver_no']
lap_data = df.apply(
lambda x: LapData(foreign_keys=x['session_entry'], objects=x['lap']).model_dump(),
axis=1
).tolist()
with open('sprint_laps.pkl', 'wb') as f:
pickle.dump(lap_data, f)
return lap_data
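
Taken together, the new functions would be driven roughly like this (a sketch only; the `if __name__ == '__main__'` wiring is not shown in the diff, and the PDF name matches the binary added in this commit):

    df = parse_sprint_history_chart('sprint_history_chart.pdf')
    lap_data = to_json(df)   # also writes sprint_laps.pkl as a side effect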
parse_sprint_shootout_laps.py: 47 changes (44 additions, 3 deletions)
@@ -7,6 +7,7 @@
import pandas as pd

from models.foreign_key import SessionEntry
from models.lap import QualiLap, LapData


def parse_sprint_shootout_final_classification(file: str | os.PathLike) -> pd.DataFrame:
@@ -229,9 +230,9 @@ def parse_sprint_shootout(final_path: str | os.PathLike, lap_times_path: str | o
# but made into SQ3? Shouldn't be SQ1 and box and SQ2 and box, so at least two pit stops?
# TODO: should check here if all drivers are merged. All unmerged ones should be not classified
del df['NO']
df['Q'] = 1
df.loc[df['lap_no'] > df['SQ1_LAPS'], 'Q'] = 2
df.loc[df['lap_no'] > df['SQ2_LAPS'], 'Q'] = 3
df['SQ'] = 1
df.loc[df['lap_no'] > df['SQ1_LAPS'], 'SQ'] = 2
df.loc[df['lap_no'] > df['SQ2_LAPS'], 'SQ'] = 3
# TODO: should check the lap before the first Q2 and Q3 lap is pit lap. Or is it? Crashed?
del df['SQ1_LAPS'], df['SQ2_LAPS']
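# e.g. with hypothetical counts SQ1_LAPS = 5 and SQ2_LAPS = 9, lap_no 1-5 maps to SQ1,
# 6-9 to SQ2, and 10+ to SQ3 (this reading assumes the *_LAPS columns are cumulative)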

@@ -275,3 +276,43 @@ def parse_sprint_shootout(final_path: str | os.PathLike, lap_times_path: str | o
df['pit'] = (df['pit'] == 'P').astype(bool)
return df


def to_json(df: pd.DataFrame):
"""Convert the parsed lap time df. to a json obj. See jolpica/jolpica-f1#7"""
# Hard code 2023 Brazil for now
year = 2023
round_no = 20

# Convert to json
lap_data = []
df = df[df['lap_time'].str.count(':') == 1] # TODO: check this. We always lost the first lap?
df['lap_time'] = df['lap_time'].apply(parse_date)
for q in [1, 2, 3]:
temp = df[df['SQ'] == q]
temp['lap'] = temp.apply(
lambda x: QualiLap(
number=x['lap_no'],
time=x['lap_time'],
is_deleted=x['lap_time_deleted'],
is_entry_fastest_lap=x['is_fastest_lap']
),
axis=1
)
temp = temp.groupby('car_no')[['lap']].agg(list).reset_index()
temp['session_entry'] = temp['car_no'].map(
lambda x: SessionEntry(
year=year,
round=round_no,
session=f'SQ{q}',
car_number=x
)
)
lap_data.extend(temp.apply(
lambda x: LapData(foreign_keys=x['session_entry'], objects=x['lap']).model_dump(),
axis=1
).tolist())

# Should add a smoke test here: 20-ish cars in Q1, 15 in Q2, 10 in Q3
with open('sprint_quali_lap_times.pkl', 'wb') as f:
pickle.dump(lap_data, f)
return lap_data
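
As with the sprint race laps, the shootout path would be exercised roughly like this (a sketch; both PDF file names are placeholders, since the `__main__` block is not part of this diff):

    df = parse_sprint_shootout('sprint_shootout_final_classification.pdf',
                               'sprint_shootout_lap_times.pdf')
    lap_data = to_json(df)   # also writes sprint_quali_lap_times.pkl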
Binary file added sprint_history_chart.pdf
