export sprint sessions' data to .pkl
also modify other parsing scripts a bit to accommodate the field name changes in 985afd6
harningle committed Sep 12, 2024
1 parent 985afd6 commit bcaf915
Showing 7 changed files with 219 additions and 14 deletions.
parse_pit_stop_summary.py: 2 changes (1 addition, 1 deletion)
@@ -64,7 +64,7 @@ def to_json(df: pd.DataFrame) -> list[dict]:
lambda x: SessionEntry(
year=year,
round=round_no,
type=session_type,
session=session_type,
car_number=x
)
)
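The only change here is the `SessionEntry` keyword rename (`type=` → `session=`) that follows the field renamed in 985afd6. A minimal sketch of the new call shape, with purely illustrative values (the `'R'` session code and the 2023 Brazil numbers are assumptions, mirroring the hard-coding used in the other scripts):

    from models.foreign_key import SessionEntry

    entry = SessionEntry(year=2023, round=20, session='R', car_number=1)  # keyword was type= before 985afd6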
parse_quali_laps.py: 10 changes (5 additions, 5 deletions)
@@ -6,7 +6,7 @@
import pandas as pd

from models.foreign_key import SessionEntry
from models.quali_lap import Lap, LapData
from models.lap import QualiLap, LapData


def parse_quali_final_classification(file: str | os.PathLike) -> pd.DataFrame:
@@ -291,11 +291,11 @@ def to_json(df: pd.DataFrame):
for q in [1, 2, 3]:
temp = df[df['Q'] == q]
temp['lap'] = temp.apply(
lambda x: Lap(
lambda x: QualiLap(
number=x['lap_no'],
time=x['lap_time'],
is_deleted=x['lap_time_deleted'],
is_fastest_lap=x['is_fastest_lap']
is_entry_fastest_lap=x['is_fastest_lap']
),
axis=1
)
@@ -304,7 +304,7 @@ def to_json(df: pd.DataFrame):
lambda x: SessionEntry(
year=year,
round=round_no,
type=f'Q{q}',
session=f'Q{q}',
car_number=x
)
)
@@ -316,7 +316,7 @@ def to_json(df: pd.DataFrame):
# Should add a smoke test here: 20-ish cars in Q1, 15 in Q2, 10 in Q3
with open('quali_lap_times.pkl', 'wb') as f:
pickle.dump(lap_data, f)
pass
return lap_data


if __name__ == '__main__':
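With the model moved to `models.lap` and renamed, a single qualifying-lap record built by the loop above now looks roughly like this (illustrative values; the lap time is assumed to be a `datetime.timedelta`, matching what the sprint scripts below construct):

    import datetime
    from models.lap import QualiLap

    lap = QualiLap(
        number=12,
        time=datetime.timedelta(minutes=1, seconds=10, milliseconds=727),
        is_deleted=False,
        is_entry_fastest_lap=True   # renamed from is_fastest_lap
    )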
parse_race_fastest_laps.py: 2 changes (1 addition, 1 deletion)
@@ -84,7 +84,7 @@ def to_json(df: pd.DataFrame) -> list[dict]:
foreign_keys=SessionEntry(
year=year,
round=round_no,
type=session_type,
session=session_type,
car_number=x['driver_no']
),
objects=FastestLap(
parse_race_history_chart.py: 9 changes (6 additions, 3 deletions)
@@ -132,6 +132,8 @@ def to_timedelta(s: str) -> datetime.timedelta:
"""
Convert a time string to a timedelta object, e.g. "1:32.190" -->
datetime.timedelta(seconds=92, microseconds=190000)
# TODO: move this to `utils.py`?
"""
# Parse by ":" and "."
n_colons = s.count(':')
@@ -174,14 +176,15 @@ def to_json(df: pd.DataFrame) -> list[dict]:
df['time'] = df['time'].apply(to_timedelta)

# Convert to json
df['lap'] = df.apply(lambda x: Lap(lap_number=x['lap'], position=x['position'], time=x['time']),
axis=1)
df['lap'] = df.apply(
lambda x: Lap(number=x['lap'], position=x['position'], time=x['time']), axis=1
)
df = df.groupby('driver_no')[['lap']].agg(list).reset_index()
df['session_entry'] = df['driver_no'].map(
lambda x: SessionEntry(
year=year,
round=round_no,
type=session_type,
session=session_type,
car_number=x
)
)
parse_sprint_laps.py: 163 changes (162 additions, 1 deletion)
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
import datetime
import os
import pickle
import re
import warnings

import fitz
import pandas as pd

from models.foreign_key import SessionEntry
from models.quali_lap import Lap, LapData
from models.lap import Lap, LapData


def parse_sprint_lap_analysis_page(page: fitz.Page) -> pd.DataFrame:
@@ -54,7 +57,10 @@ def parse_sprint_lap_analysis_page(page: fitz.Page) -> pd.DataFrame:
for j in driver_tabs:
j['driver'] = name
j['car_no'] = car_no
assert (j.Col1 == '').all() # TODO: check this
del j['Col1']
j.rename(columns={'LAP': 'lap', 'TIME': 'time'}, inplace=True)
j.lap = j.lap.astype(int)
df.append(j)
driver_tabs = []
h = tab.header.bbox[1] # Top of the table
@@ -66,6 +72,10 @@ def parse_sprint_lap_analysis_page(page: fitz.Page) -> pd.DataFrame:
for j in driver_tabs:
j['driver'] = name
j['car_no'] = car_no
assert (j.Col1 == '').all() # TODO: check this
del j['Col1']
j.rename(columns={'LAP': 'lap', 'TIME': 'time'}, inplace=True)
j.lap = j.lap.astype(int)
df.append(j)

return pd.concat(df, ignore_index=True)
@@ -78,3 +88,154 @@ def parse_sprint_lap_analysis(file: str | os.PathLike) -> pd.DataFrame:
for page in doc:
df.append(parse_sprint_lap_analysis_page(page))
return pd.concat(df, ignore_index=True)


def parse_sprint_history_chart_page(page: fitz.Page) -> pd.DataFrame:
"""
Get the table(s) from a given page in the "Sprint History Chart" PDF. There are multiple tables on
a page, each of which corresponds to a lap No. We concatenate all tables into one single dataframe.
See `notebook/demo.ipynb` for the detailed explanation of the table structure.
:param page: A `fitz.Page` object
:return: A dataframe of [driver No., lap No., gap to leader, lap time]
TODO: probably use better type hint using pandera later
TODO: merge this with race lap parsing script
"""

# Get the position of "Lap x"
t = page.search_for('Sprint History Chart')[0].y1
b = page.search_for('TIME')[0].y1
headers = page.search_for('Lap', clip=(0, t, W, b))

# Iterate through the tables for each lap
tables = []
for i, lap in enumerate(headers):
"""
The left boundary of the table is the leftmost of the "Lap x" text, and the right boundary
is the leftmost of the next "Lap x" text. If it's the last lap, i.e. no next table, then
the right boundary can be determined by left boundary plus table width, which is roughly
one-fifth of the page width. We add 5% extra buffer to the right boundary
"""
l = lap.x0
r = headers[i + 1].x0 if i + 1 < len(headers) else (l + W / 5) * 1.05
temp = page.find_tables(clip=fitz.Rect(l, t, r, H),
strategy='lines',
add_lines=[((l, 0), (l, H))])[0].to_pandas()

# Three columns: "LAP x", "GAP", "TIME". "LAP x" is the column for driver No. So add a new
# column for lap No. with value "x", and rename the columns
lap_no = int(temp.columns[0].split(' ')[1])
temp.columns = ['driver_no', 'gap', 'time']
temp['lap'] = lap_no
temp = temp[temp['driver_no'] != ''] # Sometimes we will get one additional empty row

# The row order/index is meaningful: it's the order/positions of the cars
# TODO: is this true for all cases? E.g. retirements?
temp.reset_index(drop=False, names=['position'], inplace=True)
temp['position'] += 1 # 1-indexed
tables.append(temp)
return pd.concat(tables, ignore_index=True)
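

# Illustration only (not part of the parser): the boundary arithmetic used in the loop
# above, with made-up numbers. Each table is bounded on the left by its own "Lap x"
# header and on the right by the next header; the last table gets roughly the left
# boundary plus one-fifth of the page width, with a 5% buffer.
def _demo_boundaries(header_x0s: list[float], page_width: float) -> list[tuple[float, float]]:
    bounds = []
    for i, l in enumerate(header_x0s):
        r = header_x0s[i + 1] if i + 1 < len(header_x0s) else (l + page_width / 5) * 1.05
        bounds.append((l, r))
    return bounds

# _demo_boundaries([30, 150, 270], 600) --> [(30, 150), (150, 270), (270, 409.5)]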


def parse_sprint_history_chart(file: str | os.PathLike[str]) -> pd.DataFrame:
"""
Parse "Sprint History Chart" PDF
:param file: Path to PDF file
:return: The output dataframe will be [driver No., lap No., gap to leader, lap time]
"""
# Get page width and height
doc = fitz.open(file)
page = doc[0]
global W, H
W = page.bound()[2]
H = page.bound()[3]

# Parse all pages
df = pd.concat([parse_sprint_history_chart_page(page) for page in doc], ignore_index=True)

# Clean up
# TODO: check notes in `parse_race_history_chart.py`
df['lap'] = df['lap'] - df['gap'].apply(
lambda x: int(re.findall(r'\d+', x)[0]) if 'LAP' in x else 0
)
df.reset_index(drop=False, inplace=True)
df.sort_values(by=['driver_no', 'lap', 'index'], inplace=True)
df.loc[(df['driver_no'] == df['driver_no'].shift(-1)) & (df['lap'] == df['lap'].shift(-1)),
'lap'] -= 1
df.loc[(df['driver_no'] == df['driver_no'].shift(1)) & (df['lap'] == df['lap'].shift(1) + 2),
'lap'] -= 1
del df['index']
return df


def to_timedelta(s: str) -> datetime.timedelta:
"""
Convert a time string to a timedelta object, e.g. "1:32.190" -->
datetime.timedelta(seconds=92, microseconds=190000)
# TODO: move this to `utils.py`?
"""
# Parse by ":" and "."
n_colons = s.count(':')
h, m, sec, ms = 0, 0, 0, 0
match n_colons:
case 1: # "1:32.190"
m, sec = s.split(':')
sec, ms = sec.split('.')
case 2: # "1:32:19.190"
warnings.warn(f'''got an unusual time: {s}. Assuming it's "hh:mm:ss.ms"''')
h, m, sec = s.split(':')
sec, ms = sec.split('.')
case 0: # "19.190"
warnings.warn(f'''got an unusual time: {s}. Assuming it's "ss.ms"''')
sec, ms = s.split('.')
case _: # Weird case
raise ValueError(f'''got an unexpected time: {s}''')

# Check if the time is valid
assert 0 <= int(h) < 24, f'''hour should be in [0, 24), got {h} in {s}'''
assert 0 <= int(m) < 60, f'''minute should be in [0, 60), got {m} in {s}'''
assert 0 <= int(sec) < 60, f'''second should be in [0, 60), got {sec} in {s}'''
assert 0 <= int(ms) < 1000, f'''millisecond should be in [0, 1000), got {ms} in {s}'''

t = datetime.timedelta(hours=int(h), minutes=int(m), seconds=int(sec), milliseconds=int(ms))
if t == datetime.timedelta(0):
raise ValueError(f'''got an invalid time: {s}''')
return t
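
# Illustrative expectations for the helper above (the first value is the docstring
# example; the others are made up; the "hh:mm:ss.ms" and bare "ss.ms" branches also
# emit a warning before parsing):
#   to_timedelta('1:32.190')    == datetime.timedelta(minutes=1, seconds=32, milliseconds=190)
#   to_timedelta('1:32:19.190') == datetime.timedelta(hours=1, minutes=32, seconds=19, milliseconds=190)
#   to_timedelta('19.190')      == datetime.timedelta(seconds=19, milliseconds=190)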


def to_json(df: pd.DataFrame) -> list[dict]:
"""Convert the parsed lap time df. to a json obj. See jolpica/jolpica-f1#7"""

# Hard code 2023 Brazil for now
year = 2023
round_no = 20
session_type = 'SR'

# Convert lap time strings to timedelta
df['time'] = df['time'].apply(to_timedelta)

# Convert to json
df['lap'] = df.apply(
lambda x: Lap(number=x['lap'], position=x['position'], time=x['time']), axis=1
)
df = df.groupby('driver_no')[['lap']].agg(list).reset_index()
df['session_entry'] = df['driver_no'].map(
lambda x: SessionEntry(
year=year,
round=round_no,
session=session_type,
car_number=x
)
)
del df['driver_no']
lap_data = df.apply(
lambda x: LapData(foreign_keys=x['session_entry'], objects=x['lap']).model_dump(),
axis=1
).tolist()
with open('sprint_laps.pkl', 'wb') as f:
pickle.dump(lap_data, f)
return lap_data
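
Taken together, the new functions would be driven roughly like this (a sketch only; the `if __name__ == '__main__'` wiring is not shown in the diff, and the PDF name matches the binary added in this commit):

    df = parse_sprint_history_chart('sprint_history_chart.pdf')
    lap_data = to_json(df)   # also writes sprint_laps.pkl as a side effect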
parse_sprint_shootout_laps.py: 47 changes (44 additions, 3 deletions)
@@ -7,6 +7,7 @@
import pandas as pd

from models.foreign_key import SessionEntry
from models.lap import QualiLap, LapData


def parse_sprint_shootout_final_classification(file: str | os.PathLike) -> pd.DataFrame:
@@ -229,9 +230,9 @@ def parse_sprint_shootout(final_path: str | os.PathLike, lap_times_path: str | o
# but made into SQ3? Shouldn't be SQ1 and box and SQ2 and box, so at least two pit stops?
# TODO: should check here if all drivers are merged. All unmerged ones should be not classified
del df['NO']
df['Q'] = 1
df.loc[df['lap_no'] > df['SQ1_LAPS'], 'Q'] = 2
df.loc[df['lap_no'] > df['SQ2_LAPS'], 'Q'] = 3
df['SQ'] = 1
df.loc[df['lap_no'] > df['SQ1_LAPS'], 'SQ'] = 2
df.loc[df['lap_no'] > df['SQ2_LAPS'], 'SQ'] = 3
# TODO: should check the lap before the first Q2 and Q3 lap is pit lap. Or is it? Crashed?
del df['SQ1_LAPS'], df['SQ2_LAPS']
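# e.g. with hypothetical counts SQ1_LAPS = 5 and SQ2_LAPS = 9, lap_no 1-5 maps to SQ1,
# 6-9 to SQ2, and 10+ to SQ3 (this reading assumes the *_LAPS columns are cumulative)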

@@ -275,3 +276,43 @@ def parse_sprint_shootout(final_path: str | os.PathLike, lap_times_path: str | o
df['pit'] = (df['pit'] == 'P').astype(bool)
return df


def to_json(df: pd.DataFrame):
"""Convert the parsed lap time df. to a json obj. See jolpica/jolpica-f1#7"""
# Hard code 2023 Brazil for now
year = 2023
round_no = 20

# Convert to json
lap_data = []
df = df[df['lap_time'].str.count(':') == 1] # TODO: check this. We always lost the first lap?
df['lap_time'] = df['lap_time'].apply(parse_date)
for q in [1, 2, 3]:
temp = df[df['SQ'] == q]
temp['lap'] = temp.apply(
lambda x: QualiLap(
number=x['lap_no'],
time=x['lap_time'],
is_deleted=x['lap_time_deleted'],
is_entry_fastest_lap=x['is_fastest_lap']
),
axis=1
)
temp = temp.groupby('car_no')[['lap']].agg(list).reset_index()
temp['session_entry'] = temp['car_no'].map(
lambda x: SessionEntry(
year=year,
round=round_no,
session=f'SQ{q}',
car_number=x
)
)
lap_data.extend(temp.apply(
lambda x: LapData(foreign_keys=x['session_entry'], objects=x['lap']).model_dump(),
axis=1
).tolist())

# Should add a smoke test here: 20-ish cars in Q1, 15 in Q2, 10 in Q3
with open('sprint_quali_lap_times.pkl', 'wb') as f:
pickle.dump(lap_data, f)
return lap_data
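
As with the sprint race laps, the shootout path would be exercised roughly like this (a sketch; both PDF file names are placeholders, since the `__main__` block is not part of this diff):

    df = parse_sprint_shootout('sprint_shootout_final_classification.pdf',
                               'sprint_shootout_lap_times.pdf')
    lap_data = to_json(df)   # also writes sprint_quali_lap_times.pkl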
Binary file added sprint_history_chart.pdf
