Commit 9915917

add qtm to japan
1 parent acf2fd4 commit 9915917

6 files changed (+278, -11 lines)

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# %%
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# %%
root_path = "local"
region = "hinet"
data_path = f"{root_path}/{region}/cctorch"
result_path = f"{root_path}/{region}/qtm"
if not os.path.exists(result_path):
    os.makedirs(result_path)

# Load the datasets
events = pd.read_csv(f"{data_path}/cctorch_events.csv")
picks = pd.read_csv(f"{data_path}/cctorch_picks.csv")
pairs = pd.read_csv(f"{data_path}/ccpairs/CC_002.csv")
print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}")

# basic filtering
events = events[(events["num_picks"] > 12) & (events["adloc_score"] > 0.9)]
picks = picks[picks["idx_eve"].isin(events["idx_eve"])]
pairs = pairs[pairs["idx_eve1"].isin(events["idx_eve"]) & pairs["idx_eve2"].isin(events["idx_eve"])]
print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}")
# %%
# Step 1: Compute the median CC value for each event pair, keep pairs with CC > 0.9, and build a distance matrix
mean_cc = pairs.groupby(["idx_eve1", "idx_eve2"])["cc"].median().reset_index()
neigh_cc = mean_cc[mean_cc["cc"] > 0.9].copy()
neigh_cc["distance"] = 1 - neigh_cc["cc"]

# Build the distance matrix over all events that appear in at least one high-CC pair
all_events = np.union1d(neigh_cc["idx_eve1"], neigh_cc["idx_eve2"])
distance_matrix = pd.DataFrame(np.ones((len(all_events), len(all_events))), index=all_events, columns=all_events)

# Populate the distance matrix with valid distances from neigh_cc
for _, row in neigh_cc.iterrows():
    distance_matrix.loc[row["idx_eve1"], row["idx_eve2"]] = row["distance"]

# Symmetrize the matrix
distance_matrix = np.minimum(distance_matrix, distance_matrix.T)

# Set the diagonal to 0 (distance of an event to itself)
np.fill_diagonal(distance_matrix.values, 0)
# %%
# Step 2: Apply DBSCAN
dbscan = DBSCAN(metric="precomputed", eps=0.1, min_samples=2)
clusters = dbscan.fit_predict(distance_matrix)

# %%
# Step 3: Map events to clusters and find neighbors
cluster_dict = dict(zip(distance_matrix.index, clusters))
neighbors = defaultdict(list)

for idx, cluster_id in cluster_dict.items():
    if cluster_id == -1:  # Ignore noise
        continue
    # Count the number of neighbors (events with CC > 0.9)
    subset = neigh_cc[(neigh_cc["idx_eve1"] == idx) | (neigh_cc["idx_eve2"] == idx)]
    num_neighbors = subset["cc"].count()
    neighbors[cluster_id].append((idx, num_neighbors))

# For each cluster, select the event with the largest number of neighbors
selected_events = {cluster: max(event_list, key=lambda x: x[1])[0] for cluster, event_list in neighbors.items()}
# %%
# Step 4: Map the filtered `events` and `picks` based on the `selected_events`
# We will first create a mapping of the key events to their respective clusters
event_to_key_event = {}
for cluster, key_event in selected_events.items():
    for idx, _ in neighbors[cluster]:
        event_to_key_event[idx] = key_event

# %%
# Step 5: Filter Events by `idx_eve`, keeping the one with the largest `num_picks`
# Map `idx_eve` to the key event (to map neighbors to key events)
# events["mapped_idx_eve"] = events["idx_eve"].map(event_to_key_event)
events["mapped_idx_eve"] = events["idx_eve"].map(lambda x: event_to_key_event.get(x, x))

# %%
# Now filter events by mapped `idx_eve` (key events), keeping the one with the largest `num_picks`
filtered_events = events.loc[events.groupby("mapped_idx_eve")["num_picks"].idxmax()]

# Step 6: Filter Picks by `(idx_eve, idx_sta, phase_type)`, keeping the one with the largest `phase_score`
# Map `idx_eve` in picks to the key event (to map neighbors to key events)
# picks["mapped_idx_eve"] = picks["idx_eve"].map(event_to_key_event)
picks["mapped_idx_eve"] = picks["idx_eve"].map(lambda x: event_to_key_event.get(x, x))

# Now filter picks by mapped `idx_eve`, `idx_sta`, `phase_type`, keeping the one with the largest `phase_score`
filtered_picks = picks.loc[picks.groupby(["mapped_idx_eve", "idx_sta", "phase_type"])["phase_score"].idxmax()]

print(f"Filtered Events: {filtered_events.shape}, Filtered Picks: {filtered_picks.shape}")

# Save the results to files
filtered_events.to_csv(f"{result_path}/qtm_events.csv", index=False)
filtered_picks.to_csv(f"{result_path}/qtm_picks.csv", index=False)
# %%
plt.figure(figsize=(10, 10))
plt.scatter(events["longitude"], events["latitude"], s=1, c="blue", label="All Events")
plt.scatter(
    filtered_events["longitude"],
    filtered_events["latitude"],
    s=1,
    c="red",
    marker="x",
    label="Filtered Events",
)
plt.legend()
plt.savefig(f"{result_path}/filtered_events.png")
# %%
plt.figure(figsize=(10, 10))
plt.hist(events["adloc_score"], bins=100)
# %%
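For reference, a minimal sketch (not part of the commit) of the clustering idea used above: with distance defined as 1 - CC, DBSCAN with metric="precomputed" and eps=0.1 groups events whose median CC exceeds 0.9, and anything else is labeled noise. The toy distance values below are hypothetical.

# Toy example with three events: events 0 and 1 are near-duplicates (CC = 0.95, distance 0.05),
# event 2 matches neither, so DBSCAN leaves it as noise (-1).
import numpy as np
from sklearn.cluster import DBSCAN

toy_distance = np.array(
    [
        [0.00, 0.05, 1.00],
        [0.05, 0.00, 1.00],
        [1.00, 1.00, 0.00],
    ]
)
labels = DBSCAN(metric="precomputed", eps=0.1, min_samples=2).fit_predict(toy_distance)
print(labels)  # expected: [ 0  0 -1], i.e. events 0 and 1 share a cluster, event 2 is noise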

examples/japan/plot_catalog.py

Lines changed: 10 additions & 0 deletions
@@ -121,6 +121,8 @@
 catalog_ct_hypodd = catalog_ct_hypodd[catalog_ct_hypodd["DEPTH"] != "*********"]
 catalog_ct_hypodd["DEPTH"] = catalog_ct_hypodd["DEPTH"].astype(float)

+catalog_ct_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_ct.csv", index=False)
+
 plt.figure()
 plt.scatter(catalog_ct_hypodd["LON"], catalog_ct_hypodd["LAT"], s=2)
 plt.show()

@@ -170,6 +172,8 @@
 catalog_cc_hypodd = catalog_cc_hypodd[catalog_cc_hypodd["DEPTH"] != "*********"]
 catalog_cc_hypodd["DEPTH"] = catalog_cc_hypodd["DEPTH"].astype(float)

+catalog_cc_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_cc.csv", index=False)
+
 plt.figure()
 plt.scatter(catalog_cc_hypodd["LON"], catalog_cc_hypodd["LAT"], s=2)
 plt.show()

@@ -214,8 +218,12 @@
 growclust_ct_catalog["time"] = growclust_ct_catalog["time"].apply(
     lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f")
 )
+
 growclust_ct_catalog = growclust_ct_catalog[growclust_ct_catalog["nbranch"] > 1]

+growclust_ct_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_ct.csv", index=False)
+
+
 # %%
 growclust_file = f"{root_path}/{region}/growclust/growclust_cc_catalog.txt"
 growclust_cc_exist = False

@@ -258,6 +266,8 @@
 )
 growclust_cc_catalog = growclust_cc_catalog[growclust_cc_catalog["nbranch"] > 1]

+growclust_cc_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_cc.csv", index=False)
+

 # %% Debug
 # def load_Shelly2020():
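The four to_csv calls added above give each relocation workflow (HypoDD CT/CC, GrowClust CT/CC) a plain-CSV copy of its catalog. As a quick check, not part of the commit and using only the paths written above, the exports can be reloaded with pandas:

import pandas as pd

root_path, region = "local", "hinet"
catalog_files = {
    "hypodd_ct": f"{root_path}/{region}/hypodd/hypodd_ct.csv",
    "hypodd_cc": f"{root_path}/{region}/hypodd/hypodd_cc.csv",
    "growclust_ct": f"{root_path}/{region}/growclust/growclust_ct.csv",
    "growclust_cc": f"{root_path}/{region}/growclust/growclust_cc.csv",
}
for name, path in catalog_files.items():
    catalog = pd.read_csv(path)
    print(f"{name}: {len(catalog)} events")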

examples/japan/run_qtm.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# %%
import argparse
import json
import os
from datetime import datetime
from glob import glob
from itertools import product

import numpy as np
import pandas as pd
import scipy
import torch
from tqdm import tqdm

# %%
root_path = "local"
region = "hinet"

result_path = f"{region}/qtm"
if not os.path.exists(f"{root_path}/{result_path}"):
    os.makedirs(f"{root_path}/{result_path}")

# %%
with open(f"{root_path}/{region}/config.json", "r") as fp:
    config = json.load(fp)

# %% Get mseed list
# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????/???/??/*.mseed"))
# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????-???/??/*.sac"), reverse=True)
mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/2024-???/??/*.sac"), reverse=True)
subdir = 3  # year-jday/hour/station_id.mseed
mseeds = pd.DataFrame(mseed_list, columns=["fname"])
mseeds["mseed_id"] = mseeds["fname"].apply(lambda x: "/".join(x.replace(".sac", "").split("/")[-subdir:]))
mseeds["station_id"] = mseeds["fname"].apply(lambda x: x.replace(".sac", "").split("/")[-1])
# remove .E/.N/.Z or .EB/.NB/.ZB
mseeds["mseed_id"] = mseeds["mseed_id"].apply(lambda x: ".".join(x.split(".")[:-1]))
mseeds["station_id"] = mseeds["station_id"].apply(lambda x: "." + ".".join(x.split(".")[:-1]) + "..")
mseeds["begin_time"] = mseeds["fname"].apply(
    lambda x: datetime.strptime(
        # f"{x.split('/')[-subdir]}-{x.split('/')[-subdir+1]}T{x.split('/')[-subdir+2]}", "%Y-%jT%H"
        f"{x.split('/')[-subdir]}T{x.split('/')[-subdir+1]}",
        "%Y-%jT%H",
    ).strftime("%Y-%m-%dT%H:%M:%S.%f")
)
mseeds = (
    mseeds.groupby("mseed_id")
    .agg(
        {
            "station_id": lambda x: ",".join(x.unique()),
            "begin_time": lambda x: ",".join(x.unique()),
            "fname": lambda x: ",".join(sorted(x)),
        }
    )
    .reset_index()
)
mseeds["idx_mseed"] = np.arange(len(mseeds))
mseeds.to_csv(f"{root_path}/{region}/qtm/mseed_list.csv", index=False)
with open(f"{root_path}/{region}/qtm/mseed_list.txt", "w") as fp:
    fp.write("\n".join(mseeds["fname"]))

print(f"Number of mseed files: {len(mseeds)}")

# %%
# with open(f"{root_path}/{region}/qtm/event_phase_station_id.txt", "r") as fp:
#     event_phase_station_id = fp.read().splitlines()
# picks = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_picks.csv")
picks = pd.read_csv(f"{root_path}/{region}/qtm/qtm_picks.csv")
picks["phase_time"] = pd.to_datetime(picks["phase_time"], format="mixed")
picks["phase_time"] = picks["phase_time"].dt.tz_localize(None)
picks = picks[
    (picks["phase_time"] >= pd.to_datetime("2024-01-01T00:00:00"))
    # & (picks["phase_time"] < pd.to_datetime("2024-01-02T00:00:00"))
]
stations = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_stations.csv")
picks = picks.merge(stations[["idx_sta", "station_id"]], on="idx_sta")
print(picks.iloc[:10])
print(f"Number of picks: {len(picks)}")

# %%
# events = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_events.csv")
events = pd.read_csv(f"{root_path}/{region}/qtm/qtm_events.csv")
events["event_time"] = pd.to_datetime(events["event_time"], format="mixed")
events["event_time"] = events["event_time"].dt.tz_localize(None)
events = events[
    (events["event_time"] >= pd.to_datetime("2024-01-01T00:00:00"))
    # & (events["event_time"] < pd.to_datetime("2024-01-02T00:00:00"))
]
print(f"Number of events: {len(events)}")

# %% Generate event mseed pairs
pairs = []
unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique())
print(f"Number of unique station ids: {len(unique_station_ids)}")

# %%
with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp:
    mseeds = mseeds.set_index("idx_mseed")
    picks = picks.groupby("station_id")
    for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
        station_id = row["station_id"]
        if station_id not in unique_station_ids:
            continue
        for idx_pick in picks.get_group(station_id)["idx_pick"].values:
            fp.write(f"{idx_mseed},{idx_pick}\n")

## based on GPU memory
batch = 16
block_size1 = 1
block_size2 = 100_000  # ~7GB

# %%
base_cmd = (
    f"../../CCTorch/run.py --mode=TM --pair_list={root_path}/{region}/qtm/pairs.txt "
    f"--data_list1={root_path}/{region}/qtm/mseed_list.txt --data_format1=mseed "
    f"--data_list2={root_path}/{region}/cctorch/cctorch_picks.csv --data_path2={root_path}/{region}/cctorch/template.dat --data_format2=memmap "
    f"--config={root_path}/{region}/cctorch/config.json --batch_size={batch} --block_size1={block_size1} --block_size2={block_size2} --normalize --reduce_c --result_path={root_path}/{region}/qtm/ccpairs"
)

# %%
num_gpu = torch.cuda.device_count()
if num_gpu == 0:
    if os.uname().sysname == "Darwin":
        cmd = f"python {base_cmd} --device=mps"
    else:
        cmd = f"python {base_cmd} --device=cpu"
elif num_gpu == 1:
    cmd = f"python {base_cmd}"
else:
    cmd = f"torchrun --standalone --nproc_per_node {num_gpu} {base_cmd}"

# %%
print(cmd)
os.system(cmd)
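Each line of pairs.txt couples one waveform file (idx_mseed) with one template pick (idx_pick) for CCTorch to scan. A small sanity check, not part of the commit, that reads the pair list back and attaches the corresponding waveform and pick metadata; column names follow the files written above:

import pandas as pd

root_path, region = "local", "hinet"
pairs = pd.read_csv(f"{root_path}/{region}/qtm/pairs.txt", names=["idx_mseed", "idx_pick"])
mseed_list = pd.read_csv(f"{root_path}/{region}/qtm/mseed_list.csv")
picks = pd.read_csv(f"{root_path}/{region}/qtm/qtm_picks.csv")

# Join the first few pairs with waveform ids and pick metadata for a quick spot check
sample = (
    pairs.head(10)
    .merge(mseed_list[["idx_mseed", "mseed_id"]], on="idx_mseed")
    .merge(picks[["idx_pick", "idx_eve", "phase_type", "phase_time"]], on="idx_pick")
)
print(f"{len(pairs)} pairs written")
print(sample)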

scripts/run_growclust_cc.py

Lines changed: 5 additions & 3 deletions
@@ -18,7 +18,7 @@
 # %%
 # stations_json = f"{region}/results/data/stations.json"
 # stations = pd.read_json(f"{root_path}/{stations_json}", orient="index")
-station_csv = f"{region}/adloc/ransac_stations.csv"
+station_csv = f"{region}/cctorch/cctorch_stations.csv"
 stations = pd.read_csv(f"{root_path}/{station_csv}")
 stations.set_index("station_id", inplace=True)

@@ -35,12 +35,14 @@

 # %%
 # events_csv = f"{region}/results/phase_association/events.csv"
-events_csv = f"{region}/adloc/ransac_events.csv"
+# events_csv = f"{region}/adloc/ransac_events.csv"
+events_csv = f"{region}/cctorch/cctorch_events.csv"
 # event_file = f"{region}/cctorch/events.csv"
 events = pd.read_csv(f"{root_path}/{events_csv}")
 # event_df = event_df[event_df["gamma_score"] > 10]
 # event_index = [f"{x:06d}" for x in event_df["event_index"]]
-events["time"] = pd.to_datetime(events["time"])
+# events["time"] = pd.to_datetime(events["time"])
+events["time"] = pd.to_datetime(events["event_time"])
 if "magnitude" not in events.columns:
     events["magnitude"] = 0.0

scripts/run_growclust_cc.sh

Lines changed: 2 additions & 2 deletions
@@ -77,9 +77,9 @@ TT/tt.sg
 ***** GrowClust Algorithm Parameters *****
 ******************************************
 * rmin delmax rmsmax
-0.6 120 1.0
+0.1 120 1.0
 * rpsavgmin, rmincut ngoodmin iponly
-0 0.6 8 0
+0 0.1 8 0
 *
 ******************************************
 ************ Output files ****************

scripts/run_qtm.py

Lines changed: 7 additions & 6 deletions
@@ -70,15 +70,16 @@ def parse_args():
 # %% Generate event mseed pairs
 pairs = []
 unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique())
-
-for station_id in unique_station_ids:
-    mseed_index = mseeds.loc[mseeds["station_id"] == station_id, "idx_mseed"]
-    pick_index = picks.loc[picks["station_id"] == station_id, "idx_pick"]
-    pairs.extend(product(mseed_index, pick_index))
+print(f"Number of unique station ids: {len(unique_station_ids)}")

 # %%
 with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp:
-    fp.write("\n".join([f"{x[0]},{x[1]}" for x in pairs]))
+    mseeds = mseeds.set_index("idx_mseed")
+    picks = picks.groupby("station_id")
+    for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
+        station_id = row["station_id"]
+        for idx_pick in picks.get_group(station_id)["idx_pick"].values:
+            fp.write(f"{idx_mseed},{idx_pick}\n")


 ## based on GPU memory