
Commit

update adloc_dtcc
zhuwq0 committed Sep 12, 2024
1 parent 4e3d280 commit a31c424
Showing 16 changed files with 1,176 additions and 54 deletions.
22 changes: 13 additions & 9 deletions examples/california/run_adloc.py
@@ -257,13 +257,15 @@ def run_adloc(
config["min_score"] = 0.5
config["min_p_picks"] = 0 # for filtering
config["min_s_picks"] = 0 # for filtering
stations["station_term_time"] = 0.0 # no station term
stations["station_term_time"] = 0.0 # no station term
# picks, events = invert_location_iter(picks, stations, config, estimator, events_init=events_init, iter=iter)
if iter == 0:
picks, events = invert_location(picks, stations, config, estimator, events_init=events_init, iter=iter)
else:
# picks, events = invert_location(picks, stations, config, estimator, events_init=events_init, iter=iter)
picks, events = invert_location(picks[picks['mask']==1], stations, config, estimator, events_init=events_init, iter=iter)
picks, events = invert_location(
picks[picks["mask"] == 1], stations, config, estimator, events_init=events_init, iter=iter
)
# station_term = picks[picks["mask"] == 1.0].groupby("idx_sta").agg({"residual_time": "mean"}).reset_index()
station_term_time = (
picks[picks["mask"] == 1.0].groupby("idx_sta").agg({"residual_time": "mean"}).reset_index()
@@ -279,16 +281,16 @@ def run_adloc(
)
## Separate P and S station term
# station_term = (
# picks[picks["mask"] == 1.0].groupby(["idx_sta", "phase_type"]).agg({"residual_s": "mean"}).reset_index()
# picks[picks["mask"] == 1.0].groupby(["idx_sta", "phase_type"]).agg({"residual": "mean"}).reset_index()
# )
# stations["station_term_p"] = (
# stations["idx_sta"]
# .map(station_term[station_term["phase_type"] == 0].set_index("idx_sta")["residual_s"])
# .map(station_term[station_term["phase_type"] == 0].set_index("idx_sta")["residual"])
# .fillna(0)
# )
# stations["station_term_s"] = (
# stations["idx_sta"]
# .map(station_term[station_term["phase_type"] == 1].set_index("idx_sta")["residual_s"])
# .map(station_term[station_term["phase_type"] == 1].set_index("idx_sta")["residual"])
# .fillna(0)
# )

@@ -326,17 +328,19 @@ def run_adloc(
events.drop(["idx_eve", "x_km", "y_km", "z_km"], axis=1, inplace=True, errors="ignore")
events.sort_values(["time"], inplace=True)

picks.rename({"mask": "adloc_mask", "residual_s": "adloc_residual_s"}, axis=1, inplace=True)
picks.rename({"mask": "adloc_mask", "residual_time": "adloc_residual_time"}, axis=1, inplace=True)
if "residual_amplitude" in picks.columns:
picks.rename({"residual_amplitude": "adloc_residual_amplitude"}, axis=1, inplace=True)
picks["phase_type"] = picks["phase_type"].map({0: "P", 1: "S"})
picks.drop(["idx_eve", "idx_sta"], axis=1, inplace=True, errors="ignore")
picks.sort_values(["phase_time"], inplace=True)

# stations.drop(["idx_sta", "x_km", "y_km", "z_km"], axis=1, inplace=True, errors="ignore")
# stations.rename({"station_term": "adloc_station_term_s"}, axis=1, inplace=True)

# picks.to_csv(os.path.join(result_path, "ransac_picks_sst.csv"), index=False)
# events.to_csv(os.path.join(result_path, "ransac_events_sst.csv"), index=False)
# stations.to_csv(os.path.join(result_path, "ransac_stations_sst.csv"), index=False)
# picks.to_csv(os.path.join(result_path, "ransac_picks.csv"), index=False)
# events.to_csv(os.path.join(result_path, "ransac_events.csv"), index=False)
# stations.to_csv(os.path.join(result_path, "ransac_stations.csv"), index=False)
picks.to_csv(f"{root_path}/{adloc_picks_csv}", index=False)
events.to_csv(f"{root_path}/{adloc_events_csv}", index=False)
# stations.to_json(f"{root_path}/{result_path}/adloc_stations_{jday:03d}.json", orient="index")
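The station-term update in this file's diff follows a common pandas pattern: average the travel-time residuals of the picks kept by RANSAC (mask == 1) per station, then map that per-station mean back onto the station table. A minimal, self-contained sketch of that pattern on toy data (column names follow the diff; the toy frames and values are purely illustrative):

import pandas as pd

# toy inputs with only the columns the pattern uses
picks = pd.DataFrame(
    {"idx_sta": [0, 0, 1], "mask": [1.0, 1.0, 1.0], "residual_time": [0.10, 0.30, -0.05]}
)
stations = pd.DataFrame({"idx_sta": [0, 1, 2]})

# mean travel-time residual per station, using only picks kept by RANSAC
station_term_time = (
    picks[picks["mask"] == 1.0].groupby("idx_sta").agg({"residual_time": "mean"}).reset_index()
)

# map the term back onto the station table; stations with no picks default to 0
stations["station_term_time"] = (
    stations["idx_sta"].map(station_term_time.set_index("idx_sta")["residual_time"]).fillna(0)
)
print(stations)  # station terms: 0.2, -0.05, 0.0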
31 changes: 31 additions & 0 deletions scripts/config.json
@@ -56,6 +56,22 @@
"SCEDC"
]
},
"Lamont2024": {
"longitude0": -119.097,
"latitude0": 35.109,
"maxradius_degree": 1.0,
"minlatitude": 34.809,
"maxlatitude": 35.409,
"minlongitude": -119.397,
"maxlongitude": -118.897,
"mindepth": 0,
"maxdepth": 30,
"starttime": "2024-08-07T00:00:00",
"endtime": "2024-08-30T00:00:00",
"provider": [
"SCEDC"
]
},
"Hawaii": {
"minlatitude": 18.39,
"maxlatitude": 20.39,
@@ -164,6 +180,21 @@
"provider": [
"NCEDC"
]
},
"Forge": {
"starttime": "2024-04-01T00:00:00",
"endtime": "2024-05-01T00:00:00",
"minlatitude": 38.28,
"maxlatitude": 38.68,
"minlongitude": -113.06,
"maxlongitude": -112.66,
"mindepth": 0,
"maxdepth": 10,
"network": "*",
"channel": "HH*,BH*,EH*,HN*,DP*",
"provider": [
"IRIS"
]
}
}
}
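For reference, a minimal sketch (not part of this commit) of reading one of the new region entries from scripts/config.json. The keys wrapping the region blocks are not visible in this diff, so the "regions" lookup below is a hypothetical placeholder; adjust it to the file's actual top-level layout:

import json

with open("scripts/config.json") as f:
    cfg = json.load(f)

# "regions" is a hypothetical key standing in for whatever level holds the entries above
forge = cfg["regions"]["Forge"]
print(forge["starttime"], forge["endtime"], forge["provider"])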
6 changes: 3 additions & 3 deletions scripts/cut_templates_cc.py
@@ -340,13 +340,13 @@ def cut_templates(root_path, region, config)
)

# %%
stations = pd.read_csv(f"{root_path}/{data_path}/ransac_stations_sst.csv")
stations = pd.read_csv(f"{root_path}/{data_path}/ransac_stations.csv")
stations.sort_values(by=["latitude", "longitude"], inplace=True)
print(f"{len(stations) = }")
print(stations.iloc[:5])

# %%
events = pd.read_csv(f"{root_path}/{data_path}/ransac_events_sst.csv", parse_dates=["time"])
events = pd.read_csv(f"{root_path}/{data_path}/ransac_events.csv", parse_dates=["time"])
events.rename(columns={"time": "event_time"}, inplace=True)
events["event_time"] = pd.to_datetime(events["event_time"], utc=True)
reference_t0 = events["event_time"].min()
@@ -396,7 +396,7 @@ def cut_templates(root_path, region, config)
eikonal = init_eikonal2d(eikonal)

# %%
picks = pd.read_csv(f"{root_path}/{region}/adloc/ransac_picks_sst.csv")
picks = pd.read_csv(f"{root_path}/{region}/adloc/ransac_picks.csv")
picks = picks[picks["adloc_mask"] == 1]
picks["phase_time"] = pd.to_datetime(picks["phase_time"], utc=True)
min_phase_score = picks["phase_score"].min()
224 changes: 224 additions & 0 deletions scripts/generate_pairs.py
@@ -0,0 +1,224 @@
# %%
import argparse
import json
import multiprocessing as mp
import os
import pickle
from contextlib import nullcontext

import numpy as np
import pandas as pd
from args import parse_args
from pyproj import Proj
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm


# %%
def pairing_picks(event_pairs, picks, config):
picks = picks[["idx_eve", "idx_sta", "phase_type", "phase_score", "phase_time"]].copy()
merged = pd.merge(
event_pairs,
picks,
left_on="idx_eve1",
right_on="idx_eve",
)
merged = pd.merge(
merged,
picks,
left_on=["idx_eve2", "idx_sta", "phase_type"],
right_on=["idx_eve", "idx_sta", "phase_type"],
suffixes=("_1", "_2"),
)
merged = merged.rename(columns={"phase_time_1": "phase_time1", "phase_time_2": "phase_time2"})
merged["phase_score"] = (merged["phase_score_1"] + merged["phase_score_2"]) / 2.0

merged["travel_time1"] = (merged["phase_time1"] - merged["event_time1"]).dt.total_seconds()
merged["travel_time2"] = (merged["phase_time2"] - merged["event_time2"]).dt.total_seconds()
merged["phase_dtime"] = merged["travel_time1"] - merged["travel_time2"]

# filtering
# merged = merged.sort_values("phase_score", ascending=False)
merged = (
merged.groupby(["idx_eve1", "idx_eve2"], group_keys=False)
.apply(lambda x: (x.nlargest(config["MAX_OBS"], "phase_score") if len(x) > config["MIN_OBS"] else None))
.reset_index(drop=True)
)

return merged[["idx_eve1", "idx_eve2", "idx_sta", "phase_type", "phase_score", "phase_dtime"]]
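
# Note (illustrative commentary, not part of the committed file): pairing_picks expects
# event_pairs with columns [idx_eve1, idx_eve2, event_time1, event_time2] and picks with
# [idx_eve, idx_sta, phase_type, phase_score, phase_time]. For every station/phase observed
# by both events of a pair it returns the differential travel time
# phase_dtime = travel_time1 - travel_time2, keeping at most MAX_OBS of the highest-scoring
# observations per pair and dropping pairs with MIN_OBS observations or fewer.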


# %%
if __name__ == "__main__":

args = parse_args()
root_path = args.root_path
region = args.region

data_path = f"{root_path}/{region}/adloc"
result_path = f"{root_path}/{region}/adloc_dd"
if not os.path.exists(result_path):
os.makedirs(result_path)

# %%
pick_file = os.path.join(data_path, "ransac_picks.csv")
event_file = os.path.join(data_path, "ransac_events.csv")
station_file = os.path.join(data_path, "ransac_stations.csv")

# %%
MAX_PAIR_DIST = 10 # km
MAX_NEIGHBORS = 50
MIN_NEIGHBORS = 8
MIN_OBS = 8
MAX_OBS = 100
config = {}
config["MAX_PAIR_DIST"] = MAX_PAIR_DIST
config["MAX_NEIGHBORS"] = MAX_NEIGHBORS
config["MIN_NEIGHBORS"] = MIN_NEIGHBORS
config["MIN_OBS"] = MIN_OBS
config["MAX_OBS"] = MAX_OBS
mapping_phase_type_int = {"P": 0, "S": 1}

# %%
stations = pd.read_csv(station_file)
picks = pd.read_csv(pick_file, parse_dates=["phase_time"])
events = pd.read_csv(event_file, parse_dates=["time"])

picks = picks[picks["event_index"] != -1]
# check phase_type is P/S or 0/1
if set(picks["phase_type"].unique()).issubset(set(mapping_phase_type_int.keys())): # P/S
picks["phase_type"] = picks["phase_type"].map(mapping_phase_type_int)

# %%
if "idx_eve" in events.columns:
events = events.drop("idx_eve", axis=1)
if "idx_sta" in stations.columns:
stations = stations.drop("idx_sta", axis=1)
if "idx_eve" in picks.columns:
picks = picks.drop("idx_eve", axis=1)
if "idx_sta" in picks.columns:
picks = picks.drop("idx_sta", axis=1)

# %%
# reindex in case the index does not start from 0 or is not continuous
stations = stations[stations["station_id"].isin(picks["station_id"].unique())]
events = events[events["event_index"].isin(picks["event_index"].unique())]
stations["idx_sta"] = np.arange(len(stations))
events["idx_eve"] = np.arange(len(events))

picks = picks.merge(events[["event_index", "idx_eve"]], on="event_index")
picks = picks.merge(stations[["station_id", "idx_sta"]], on="station_id")

# %%
lon0 = stations["longitude"].median()
lat0 = stations["latitude"].median()
proj = Proj(f"+proj=sterea +lon_0={lon0} +lat_0={lat0} +units=km")

stations[["x_km", "y_km"]] = stations.apply(
lambda x: pd.Series(proj(longitude=x.longitude, latitude=x.latitude)), axis=1
)
stations["depth_km"] = -stations["elevation_m"] / 1000
stations["z_km"] = stations["depth_km"]

events[["x_km", "y_km"]] = events.apply(
lambda x: pd.Series(proj(longitude=x.longitude, latitude=x.latitude)), axis=1
)
events["z_km"] = events["depth_km"]

picks = picks.merge(events[["idx_eve", "time"]], on="idx_eve")
picks["travel_time"] = (picks["phase_time"] - picks["time"]).dt.total_seconds()
picks.drop("time", axis=1, inplace=True)

# %%
picks_by_event = picks.groupby("idx_eve")

# Option 1:
neigh = NearestNeighbors(radius=MAX_PAIR_DIST, n_jobs=-1)
neigh.fit(events[["x_km", "y_km", "z_km"]].values)
pairs = set()
neigh_ind = neigh.radius_neighbors(sort_results=True)[1]
for i, neighs in enumerate(tqdm(neigh_ind, desc="Generating pairs")):
if len(neighs) < MIN_NEIGHBORS:
continue
for j in neighs[:MAX_NEIGHBORS]:
if i < j:
pairs.add((i, j))
pairs = list(pairs)
event_pairs = pd.DataFrame(list(pairs), columns=["idx_eve1", "idx_eve2"])
print(f"Number of events: {len(events)}")
print(f"Number of event pairs: {len(event_pairs)}")
event_pairs["event_time1"] = events["time"].iloc[event_pairs["idx_eve1"]].values
event_pairs["event_time2"] = events["time"].iloc[event_pairs["idx_eve2"]].values

# Option 2:
# neigh = NearestNeighbors(radius=MAX_PAIR_DIST, n_jobs=-1)
# neigh.fit(events[["x_km", "y_km", "z_km"]].values)
# pairs = set()
# neigh_ind = neigh.radius_neighbors()[1]
# for i, neighs in enumerate(tqdm(neigh_ind, desc="Generating pairs")):
# if len(neighs) < MIN_NEIGHBORS:
# continue
# neighs = neighs[np.argsort(events.loc[neighs, "num_picks"])] ## TODO: check if useful
# for j in neighs[:MAX_NEIGHBORS]:
# if i > j:
# pairs.add((j, i))
# else:
# pairs.add((i, j))
# pairs = list(pairs)

# %%
chunk_size = 10_000
num_chunk = max(1, len(event_pairs) // chunk_size)  # guard against fewer pairs than one chunk
pbar = tqdm(total=num_chunk, desc="Pairing picks")
results = []
jobs = []
ctx = mp.get_context("spawn")
ncpu = min(num_chunk, min(32, mp.cpu_count()))
picks = picks.set_index("idx_eve")
with ctx.Pool(processes=ncpu) as pool:
for i in np.array_split(np.arange(len(event_pairs)), num_chunk):
event_pairs_ = event_pairs.iloc[i]
idx = np.unique(event_pairs_[["idx_eve1", "idx_eve2"]].values.flatten())
picks_ = picks.loc[idx].reset_index()
job = pool.apply_async(pairing_picks, args=(event_pairs_, picks_, config), callback=lambda x: pbar.update())
jobs.append(job)
pool.close()
pool.join()
for job in jobs:
results.append(job.get())

event_pairs = pd.concat(results, ignore_index=True)
event_pairs = event_pairs.drop_duplicates()

print(f"Number of pick pairs: {len(event_pairs)}")

dtypes = np.dtype(
[
("event_index1", np.int32),
("event_index2", np.int32),
("station_index", np.int32),
("phase_type", np.int32),
("phase_score", np.float32),
("phase_dtime", np.float32),
]
)
pairs_array = np.memmap(
os.path.join(result_path, "pair_dt.dat"),
mode="w+",
shape=(len(event_pairs),),
dtype=dtypes,
)
pairs_array["event_index1"] = event_pairs["idx_eve1"].values
pairs_array["event_index2"] = event_pairs["idx_eve2"].values
pairs_array["station_index"] = event_pairs["idx_sta"].values
pairs_array["phase_type"] = event_pairs["phase_type"].values
pairs_array["phase_score"] = event_pairs["phase_score"].values
pairs_array["phase_dtime"] = event_pairs["phase_dtime"].values
with open(os.path.join(result_path, "pair_dtypes.pkl"), "wb") as f:
pickle.dump(dtypes, f)

events.to_csv(os.path.join(result_path, "pair_events.csv"), index=False)
stations.to_csv(os.path.join(result_path, "pair_stations.csv"), index=False)
picks.to_csv(os.path.join(result_path, "pair_picks.csv"), index=False)

# %%
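A minimal read-back sketch (not part of this commit) for the pair_dt.dat memmap written above, assuming the same output layout; the result_path below is a hypothetical example. With mode="r" and no explicit shape, numpy maps the whole file as a 1-D structured array using the pickled dtype:

import os
import pickle

import numpy as np

result_path = "local/demo/adloc_dd"  # hypothetical; use the same result_path as above

with open(os.path.join(result_path, "pair_dtypes.pkl"), "rb") as f:
    dtypes = pickle.load(f)

pairs = np.memmap(os.path.join(result_path, "pair_dt.dat"), mode="r", dtype=dtypes)
print(len(pairs), pairs["phase_dtime"][:5])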
(Diffs for the remaining 12 changed files are not shown.)

