run_adloc_v2.py works
zhuwq0 committed Oct 24, 2024
1 parent 270f1fd commit 0208470
Showing 5 changed files with 261 additions and 130 deletions.
166 changes: 166 additions & 0 deletions scripts/merge_adloc_picks.py
@@ -0,0 +1,166 @@
# %%
import json
import multiprocessing as mp
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta, timezone
from threading import Lock, Thread

import fsspec
import numpy as np
import pandas as pd
import pyproj
from obspy import read_inventory
from obspy.clients.fdsn import Client
from sklearn.cluster import DBSCAN
from tqdm import tqdm
from args import parse_args
from glob import glob
import matplotlib.pyplot as plt
from utils.plotting import plotting_ransac

# %%
if __name__ == "__main__":

args = parse_args()
root_path = args.root_path
region = args.region
iter = args.iter

data_path = f"{region}/adloc"
result_path = f"{region}/adloc"
figure_path = f"{region}/adloc/figures"
if not os.path.exists(figure_path):
os.makedirs(figure_path)

# %%
# protocol = "gs"
# token_json = f"{os.environ['HOME']}/.config/gcloud/application_default_credentials.json"
# with open(token_json, "r") as fp:
# token = json.load(fp)
# fs = fsspec.filesystem(protocol, token=token)

# %%
event_csvs = sorted(glob(f"{root_path}/{data_path}/????/????.???.events_sst_{iter}.csv"))

# %%
events = []
picks = []
stations = []
for event_csv in tqdm(event_csvs, desc="Load event csvs"):
pick_csv = event_csv.replace(f"events_sst_{iter}.csv", f"picks_sst_{iter}.csv")
station_csv = event_csv.replace(f"events_sst_{iter}.csv", f"stations_sst_{iter}.csv")

year, jday = event_csv.split("/")[-1].split(".")[:2]
events_ = pd.read_csv(event_csv, dtype=str)
picks_ = pd.read_csv(pick_csv, dtype=str)
stations_ = pd.read_csv(station_csv)
events_["year"] = year
events_["jday"] = jday
picks_["year"] = year
picks_["jday"] = jday
stations_["year"] = year
stations_["jday"] = jday
events.append(events_)
picks.append(picks_)
stations.append(stations_)

events = pd.concat(events, ignore_index=True)
picks = pd.concat(picks, ignore_index=True)
stations = pd.concat(stations, ignore_index=True)

station_terms = (
stations.groupby(["station_id"])
.apply(
lambda x: pd.Series(
{
"station_term_time_p": (
(x.station_term_time_p * x.num_pick_p).sum() / x.num_pick_p.sum()
if x.num_pick_p.sum() > 0
else 0
),
"station_term_time_s": (
(x.station_term_time_s * x.num_pick_s).sum() / x.num_pick_s.sum()
if x.num_pick_s.sum() > 0
else 0
),
"station_term_amplitude": (
(x.station_term_amplitude * x.num_pick).sum() / x.num_pick.sum() if x.num_pick.sum() > 0 else 0
),
}
)
)
.reset_index()
)
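# The per-day station terms above are combined into a single term per station
# using a pick-count-weighted mean, e.g. sum(term_p * num_pick_p) / sum(num_pick_p),
# so days with more picks contribute proportionally more.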
if iter > 0:
stations_prev = pd.read_csv(f"{root_path}/{result_path}/adloc_stations_sst_{iter-1}.csv")
stations_prev.set_index("station_id", inplace=True)

station_terms["station_term_time_p"] += (
station_terms["station_id"].map(stations_prev["station_term_time_p"]).fillna(0)
)
station_terms["station_term_time_s"] += (
station_terms["station_id"].map(stations_prev["station_term_time_s"]).fillna(0)
)
station_terms["station_term_amplitude"] += (
station_terms["station_id"].map(stations_prev["station_term_amplitude"]).fillna(0)
)
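# Station terms are cumulative across SST iterations: the terms estimated at this
# iteration are added to those saved at iteration `iter - 1`, so the stations CSV
# written below always holds the running total.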

stations = stations.groupby(["station_id"]).first().reset_index()
stations.drop(["station_term_time_p", "station_term_time_s", "station_term_amplitude"], axis=1, inplace=True)
stations = stations.merge(station_terms, on="station_id")

events["dummy_id"] = events["year"] + "." + events["jday"] + "." + events["event_index"]
picks["dummy_id"] = picks["year"] + "." + picks["jday"] + "." + picks["event_index"]

events["event_index"] = np.arange(len(events))
picks = picks.drop("event_index", axis=1)
picks = picks.merge(events[["dummy_id", "event_index"]], on="dummy_id")
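# `dummy_id` (year.jday.event_index) disambiguates the per-day event indices before
# the global renumbering above; picks inherit the new global `event_index` through
# the merge. Note the default inner join drops picks whose event did not survive
# relocation.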

events.drop(["year", "jday", "dummy_id"], axis=1, inplace=True)
picks.drop(["year", "jday", "dummy_id"], axis=1, inplace=True)
stations.drop(["year", "jday"], axis=1, inplace=True)

events.to_csv(f"{root_path}/{result_path}/adloc_events_sst_{iter}.csv", index=False)
picks.to_csv(f"{root_path}/{result_path}/adloc_picks_sst_{iter}.csv", index=False)
stations.to_csv(f"{root_path}/{result_path}/adloc_stations_sst_{iter}.csv", index=False)

# %%

events = pd.read_csv(f"{root_path}/{result_path}/adloc_events_sst_{iter}.csv")
picks = pd.read_csv(f"{root_path}/{result_path}/adloc_picks_sst_{iter}.csv")
stations = pd.read_csv(f"{root_path}/{result_path}/adloc_stations_sst_{iter}.csv")

fig, ax = plt.subplots(3, 3, figsize=(12, 10))
ax[0, 0].scatter(events["longitude"], events["latitude"], c=events["depth_km"], s=1, cmap="viridis_r")
ax[0, 0].set_title(f"Events {len(events)}")
ax[0, 1].scatter(events["longitude"], events["depth_km"], c=events["depth_km"], s=1, cmap="viridis_r")
ax[0, 1].invert_yaxis()
ax[0, 1].set_title(f"Events depth")
ax[0, 2].scatter(events["latitude"], events["depth_km"], c=events["depth_km"], s=1, cmap="viridis_r")
ax[0, 2].invert_yaxis()
ax[0, 2].set_title(f"Events latitude")
ax[1, 0].scatter(
stations["longitude"], stations["latitude"], c=stations["station_term_time_p"], marker="^", cmap="viridis_r"
)
ax[1, 0].set_title(f"Station term time P {stations['station_term_time_p'].mean():.2f} s")
ax[1, 1].scatter(
stations["longitude"], stations["latitude"], c=stations["station_term_time_s"], marker="^", cmap="viridis_r"
)
ax[1, 1].set_title(f"Station term time S {stations['station_term_time_s'].mean():.2f} s")
ax[1, 2].scatter(
stations["longitude"], stations["latitude"], c=stations["station_term_amplitude"], marker="^", cmap="viridis_r"
)
ax[1, 2].set_title(f"Station term amplitude {stations['station_term_amplitude'].mean():.2f} m")
ax[2, 0].hist(events["adloc_residual_time"], bins=30, edgecolor="white")
ax[2, 0].set_title(f"Event residual time")
ax[2, 1].hist(events["adloc_residual_amplitude"], bins=30, edgecolor="white")
ax[2, 1].set_title(f"Event residual amplitude")
idx = picks["adloc_mask"] == 1
ax[2, 2].hist(picks.loc[idx, "adloc_residual_time"], bins=30, edgecolor="white")
ax[2, 2].set_title(f"Pick residual time")
# ax[2, 2].hist(picks["adloc_residual_amplitude"], bins=30, edgecolor="white")
# ax[2, 2].set_title(f"Pick residual amplitude")
plt.tight_layout()
plt.savefig(f"{root_path}/{figure_path}/adloc_summary_{iter}.png")
plt.close()
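For reference, the core of the merge above is the pick-count-weighted station term. A minimal self-contained sketch of the same aggregation in vectorized form (column names are taken from the script; the station id and numbers are made up):

import pandas as pd

# Two days of results for one station (toy numbers).
stations = pd.DataFrame(
    {
        "station_id": ["NC.ABC..HH", "NC.ABC..HH"],
        "station_term_time_p": [0.10, 0.30],  # daily P station terms (s)
        "num_pick_p": [10, 30],  # daily P pick counts
    }
)

# Equivalent to the groupby/apply in merge_adloc_picks.py:
# (0.10 * 10 + 0.30 * 30) / (10 + 30) = 0.25 s
weighted = (stations["station_term_time_p"] * stations["num_pick_p"]).groupby(stations["station_id"]).sum()
term_p = weighted / stations.groupby("station_id")["num_pick_p"].sum()
print(term_p)  # NC.ABC..HH    0.25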
24 changes: 1 addition & 23 deletions scripts/merge_gamma_picks.py
@@ -18,28 +18,6 @@
from glob import glob


-def load_data(year, jday, data_path, root_path, bucket, protocol, token):
-
-fs = fsspec.filesystem(protocol, token=token)
-adloc_events_csv = f"{data_path}/{year:04d}/adloc_events_{jday:03d}.csv"
-adloc_picks_csv = f"{data_path}/{year:04d}/adloc_picks_{jday:03d}.csv"
-if protocol == "file":
-events = pd.read_csv(f"{root_path}/{adloc_events_csv}", parse_dates=["time"])
-picks = pd.read_csv(f"{root_path}/{adloc_picks_csv}", parse_dates=["phase_time"])
-else:
-with fs.open(f"{bucket}/{adloc_events_csv}", "r") as fp:
-events = pd.read_csv(fp, parse_dates=["time"])
-with fs.open(f"{bucket}/{adloc_picks_csv}", "r") as fp:
-picks = pd.read_csv(fp, parse_dates=["phase_time"])
-
-events["year"] = year
-events["jday"] = jday
-picks["year"] = year
-picks["jday"] = jday
-
-return events, picks
-
-
# %%
if __name__ == "__main__":

@@ -83,7 +61,7 @@ def load_data(year, jday, data_path, root_path, bucket, protocol, token):

events["event_index"] = np.arange(len(events))
picks = picks.drop("event_index", axis=1)
-picks = picks.merge(events[["dummy_id", "event_index"]], on="dummy_id")
+picks = picks.merge(events[["dummy_id", "event_index"]], on="dummy_id", how="left")

events.drop(["year", "jday", "dummy_id"], axis=1, inplace=True)
picks.drop(["year", "jday", "dummy_id"], axis=1, inplace=True)
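The how="left" added above keeps picks whose dummy_id no longer matches any event (their event_index becomes NaN) instead of silently dropping them, which is what the default inner join does. A toy illustration with made-up ids:

import pandas as pd

picks = pd.DataFrame({"dummy_id": ["a", "b"], "phase_type": ["P", "S"]})
events = pd.DataFrame({"dummy_id": ["a"], "event_index": [0]})

inner = picks.merge(events, on="dummy_id")  # 1 row: pick "b" is dropped
left = picks.merge(events, on="dummy_id", how="left")  # 2 rows: pick "b" keeps a NaN event_index
print(len(inner), len(left))  # 1 2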
44 changes: 16 additions & 28 deletions scripts/run_adloc.py
@@ -26,7 +26,6 @@ def run_adloc(
config: Dict,
node_rank: int = 0,
num_nodes: int = 1,
-picks_csv: str = None,
protocol: str = "file",
bucket: str = "",
token: Dict = None,
@@ -69,13 +68,6 @@
config["maxdepth"] = config["maxdepth"] if "maxdepth" in config else 60.0
config["use_amplitude"] = True

-# ## Eikonal for 1D velocity model
-zz = [0.0, 5.5, 16.0, 32.0]
-vp = [5.5, 5.5, 6.7, 7.8]
-vp_vs_ratio = 1.73
-vs = [v / vp_vs_ratio for v in vp]
-h = 0.3
-
# %%
## Automatic region; you can also specify a region
# lon0 = stations["longitude"].median()
@@ -119,6 +111,17 @@
# %%
config["eikonal"] = None

+# ## Eikonal for 1D velocity model
+zz = [0.0, 5.5, 16.0, 32.0]
+vp = [5.5, 5.5, 6.7, 7.8]
+vp_vs_ratio = 1.73
+vs = [v / vp_vs_ratio for v in vp]
+# Northern California (Gil7)
+# zz = [0.0, 1.0, 3.0, 4.0, 5.0, 17.0, 25.0, 62.0]
+# vp = [3.2, 3.2, 4.5, 4.8, 5.51, 6.21, 6.89, 7.83]
+# vs = [1.5, 1.5, 2.4, 2.78, 3.18, 3.40, 3.98, 4.52]
+h = 0.3
+
if os.path.exists(f"{root_path}/{region}/obspy/velocity.csv"):
velocity = pd.read_csv(f"{root_path}/{region}/obspy/velocity.csv")
zz = velocity["z_km"].values
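A note on the relocated block above: zz, vp, and vs describe a layered 1-D velocity model (layer-top depths in km, velocities in km/s), the commented-out Gil7 values are a Northern California alternative, and h appears to be the grid spacing in km for the eikonal traveltime solver. A minimal sketch of sampling such a layered model onto a regular depth grid (plain numpy; this mirrors how piecewise-constant layers are typically rasterized, not ADLoc's internal code):

import numpy as np

zz = [0.0, 5.5, 16.0, 32.0]  # layer-top depths (km)
vp = [5.5, 5.5, 6.7, 7.8]  # P velocities (km/s)
h = 0.3  # grid spacing (km)

z_grid = np.arange(0.0, 32.0 + h, h)
layer = np.searchsorted(zz, z_grid, side="right") - 1  # deepest layer top at or above each depth
vp_grid = np.asarray(vp)[layer]
print(vp_grid[0], vp_grid[-1])  # 5.5 7.8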
@@ -153,17 +156,6 @@
(None, None), # t
)

-# %%
-plt.figure()
-plt.scatter(stations["x_km"], stations["y_km"], c=stations["depth_km"], cmap="viridis_r", s=100, marker="^")
-plt.colorbar(label="Depth (km)")
-plt.xlabel("X (km)")
-plt.ylabel("Y (km)")
-plt.xlim(config["xlim_km"])
-plt.ylim(config["ylim_km"])
-plt.title("Stations")
-plt.savefig(os.path.join(figure_path, "stations.png"), bbox_inches="tight", dpi=300)
-
# %%
mapping_phase_type_int = {"P": 0, "S": 1}
config["vel"] = {mapping_phase_type_int[k]: v for k, v in config["vel"].items()}
@@ -207,9 +199,8 @@
station_term_amp = (
picks[picks["mask"] == 1.0].groupby("idx_sta").agg({"residual_amplitude": "median"}).reset_index()
)
stations["station_term_amplitude"] += (
stations["idx_sta"].map(station_term_amp.set_index("idx_sta")["residual_amplitude"]).fillna(0)
)
station_term_amp.set_index("idx_sta", inplace=True)
stations["station_term_amplitude"] += stations["idx_sta"].map(station_term_amp["residual_amplitude"]).fillna(0)

## Same P and S station term
# station_term_time = picks[picks["mask"] == 1.0].groupby("idx_sta").agg({"residual_time": "mean"}).reset_index()
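For the amplitude hunk just above: the station amplitude term is the median amplitude residual over picks that survived filtering (mask == 1.0), accumulated onto the existing term; the refactor only hoists set_index("idx_sta") out of the map call. A sketch with toy numbers (column names from the script):

import pandas as pd

picks = pd.DataFrame({"idx_sta": [0, 0, 1], "residual_amplitude": [0.1, 0.3, -0.2], "mask": [1.0, 1.0, 0.0]})
stations = pd.DataFrame({"idx_sta": [0, 1], "station_term_amplitude": [0.0, 0.0]})

station_term_amp = picks[picks["mask"] == 1.0].groupby("idx_sta").agg({"residual_amplitude": "median"}).reset_index()
station_term_amp.set_index("idx_sta", inplace=True)
stations["station_term_amplitude"] += stations["idx_sta"].map(station_term_amp["residual_amplitude"]).fillna(0)
print(stations)  # station 0: median(0.1, 0.3) = 0.2; station 1: all picks masked out -> 0.0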
@@ -224,15 +215,12 @@
station_term_time = (
picks[picks["mask"] == 1.0].groupby(["idx_sta", "phase_type"]).agg({"residual_time": "mean"}).reset_index()
)
+station_term_time.set_index("idx_sta", inplace=True)
stations["station_term_time_p"] += (
-stations["idx_sta"]
-.map(station_term_time[station_term_time["phase_type"] == 0].set_index("idx_sta")["residual_time"])
-.fillna(0)
+stations["idx_sta"].map(station_term_time[station_term_time["phase_type"] == 0]["residual_time"]).fillna(0)
)
stations["station_term_time_s"] += (
-stations["idx_sta"]
-.map(station_term_time[station_term_time["phase_type"] == 1].set_index("idx_sta")["residual_time"])
-.fillna(0)
+stations["idx_sta"].map(station_term_time[station_term_time["phase_type"] == 1]["residual_time"]).fillna(0)
)

plotting_ransac(stations, figure_path, config, picks, events_init, events, suffix=f"_ransac_sst_{iter}")
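The time-term hunk above is the same pattern per phase: mean time residual of kept picks, grouped by station and phase_type (mapped to integers earlier, P=0 and S=1), with set_index("idx_sta") likewise hoisted out of the two map calls. A toy sketch of the P-term update (made-up numbers):

import pandas as pd

picks = pd.DataFrame(
    {"idx_sta": [0, 0, 1], "phase_type": [0, 0, 1], "residual_time": [0.2, 0.4, -0.1], "mask": [1.0, 1.0, 1.0]}
)
stations = pd.DataFrame({"idx_sta": [0, 1], "station_term_time_p": [0.0, 0.0]})

station_term_time = (
    picks[picks["mask"] == 1.0].groupby(["idx_sta", "phase_type"]).agg({"residual_time": "mean"}).reset_index()
)
station_term_time.set_index("idx_sta", inplace=True)
stations["station_term_time_p"] += (
    stations["idx_sta"].map(station_term_time[station_term_time["phase_type"] == 0]["residual_time"]).fillna(0)
)
print(stations)  # station 0 gets (0.2 + 0.4) / 2 = 0.3; station 1 has no P picks -> 0.0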
