diff --git a/examples/japan/filter_similar_pairs.py b/examples/japan/filter_similar_pairs.py new file mode 100644 index 0000000..2e0b088 --- /dev/null +++ b/examples/japan/filter_similar_pairs.py @@ -0,0 +1,121 @@ +# %% +import os +from collections import defaultdict + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from sklearn.cluster import DBSCAN + +# %% +root_path = "local" +region = "hinet" +data_path = f"{root_path}/{region}/cctorch" +result_path = f"{root_path}/{region}/qtm" +if not os.path.exists(result_path): + os.makedirs(result_path) + +# Load the datasets +events = pd.read_csv(f"{data_path}/cctorch_events.csv") +picks = pd.read_csv(f"{data_path}/cctorch_picks.csv") +pairs = pd.read_csv(f"{data_path}/ccpairs/CC_002.csv") +print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}") + +# basic filtering +events = events[(events["num_picks"] > 12) & (events["adloc_score"] > 0.9)] +picks = picks[picks["idx_eve"].isin(events["idx_eve"])] +pairs = pairs[pairs["idx_eve1"].isin(events["idx_eve"]) & pairs["idx_eve2"].isin(events["idx_eve"])] +print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}") + +# %% +# Step 1: Calculate mean CC values, filter for CC > 0.9, and create distance matrix +mean_cc = pairs.groupby(["idx_eve1", "idx_eve2"])["cc"].median().reset_index() +neigh_cc = mean_cc[mean_cc["cc"] > 0.9].copy() +neigh_cc["distance"] = 1 - neigh_cc["cc"] + +# Ensure distance matrix includes all events (even those without high CC values) +all_events = np.union1d(neigh_cc["idx_eve1"], neigh_cc["idx_eve2"]) +distance_matrix = pd.DataFrame(np.ones((len(all_events), len(all_events))), index=all_events, columns=all_events) + +# Populate the distance matrix with valid distances from neigh_cc +for _, row in neigh_cc.iterrows(): + distance_matrix.loc[row["idx_eve1"], row["idx_eve2"]] = row["distance"] + +# Symmetrize the matrix +distance_matrix = np.minimum(distance_matrix, distance_matrix.T) + +# Set diagonal to 0 (distance of event to itself) +np.fill_diagonal(distance_matrix.values, 0) + +# %% +# Step 2: Apply DBSCAN +dbscan = DBSCAN(metric="precomputed", eps=0.1, min_samples=2) +clusters = dbscan.fit_predict(distance_matrix) + +# %% +# Step 3: Map events to clusters and find neighbors +cluster_dict = dict(zip(distance_matrix.index, clusters)) +neighbors = defaultdict(list) + +for idx, cluster_id in cluster_dict.items(): + if cluster_id == -1: # Ignore noise + continue + # Count the number of neighbors (events with CC > 0.9) + subset = neigh_cc[(neigh_cc["idx_eve1"] == idx) | (neigh_cc["idx_eve2"] == idx)] + num_neighbors = subset["cc"].count() + neighbors[cluster_id].append((idx, num_neighbors)) + +# For each cluster, select the event with the largest number of neighbors +selected_events = {cluster: max(event_list, key=lambda x: x[1])[0] for cluster, event_list in neighbors.items()} + +# %% +# Step 4: Map the filtered `events` and `picks` based on the `selected_events` +# We will first create a mapping of the key events to their respective clusters +event_to_key_event = {} +for cluster, key_event in selected_events.items(): + for idx, _ in neighbors[cluster]: + event_to_key_event[idx] = key_event + +# %% +# Step 5: Filter Events by `idx_eve`, keeping the one with the largest `num_picks` +# Map `idx_eve` to the key event (to map neighbors to key events) +# events["mapped_idx_eve"] = events["idx_eve"].map(event_to_key_event) +events["mapped_idx_eve"] = events["idx_eve"].map(lambda x: event_to_key_event.get(x, x)) + +# %% +# Now 
filter events by mapped `idx_eve` (key events), keeping the one with the largest `num_picks` +filtered_events = events.loc[events.groupby("mapped_idx_eve")["num_picks"].idxmax()] + +# Step 6: Filter Picks by `(idx_eve, idx_sta, phase_type)`, keeping the one with the largest `phase_score` +# Map `idx_eve` in picks to the key event (to map neighbors to key events) +# picks["mapped_idx_eve"] = picks["idx_eve"].map(event_to_key_event) +picks["mapped_idx_eve"] = picks["idx_eve"].map(lambda x: event_to_key_event.get(x, x)) + +# Now filter picks by mapped `idx_eve`, `idx_sta`, `phase_type`, keeping the one with the largest `phase_score` +filtered_picks = picks.loc[picks.groupby(["mapped_idx_eve", "idx_sta", "phase_type"])["phase_score"].idxmax()] + + +print(f"Filtered Events: {filtered_events.shape}, Filtered Picks: {filtered_picks.shape}") + +# Save the results to files +filtered_events.to_csv(f"{result_path}/qtm_events.csv", index=False) +filtered_picks.to_csv(f"{result_path}/qtm_picks.csv", index=False) + + +# %% +plt.figure(figsize=(10, 10)) +plt.scatter(events["longitude"], events["latitude"], s=1, c="blue", label="All Events") +plt.scatter( + filtered_events["longitude"], + filtered_events["latitude"], + s=1, + c="red", + marker="x", + label="Filtered Events", +) +plt.legend() +plt.savefig(f"{result_path}/filtered_events.png") +# %% +plt.figure(figsize=(10, 10)) +plt.hist(events["adloc_score"], bins=100) +# %% diff --git a/examples/japan/plot_catalog.py b/examples/japan/plot_catalog.py index ed442ce..7be21cf 100644 --- a/examples/japan/plot_catalog.py +++ b/examples/japan/plot_catalog.py @@ -121,6 +121,8 @@ catalog_ct_hypodd = catalog_ct_hypodd[catalog_ct_hypodd["DEPTH"] != "*********"] catalog_ct_hypodd["DEPTH"] = catalog_ct_hypodd["DEPTH"].astype(float) + catalog_ct_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_ct.csv", index=False) + plt.figure() plt.scatter(catalog_ct_hypodd["LON"], catalog_ct_hypodd["LAT"], s=2) plt.show() @@ -170,6 +172,8 @@ catalog_cc_hypodd = catalog_cc_hypodd[catalog_cc_hypodd["DEPTH"] != "*********"] catalog_cc_hypodd["DEPTH"] = catalog_cc_hypodd["DEPTH"].astype(float) + catalog_cc_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_cc.csv", index=False) + plt.figure() plt.scatter(catalog_cc_hypodd["LON"], catalog_cc_hypodd["LAT"], s=2) plt.show() @@ -214,8 +218,12 @@ growclust_ct_catalog["time"] = growclust_ct_catalog["time"].apply( lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f") ) + growclust_ct_catalog = growclust_ct_catalog[growclust_ct_catalog["nbranch"] > 1] + growclust_ct_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_ct.csv", index=False) + + # %% growclust_file = f"{root_path}/{region}/growclust/growclust_cc_catalog.txt" growclust_cc_exist = False @@ -258,6 +266,8 @@ ) growclust_cc_catalog = growclust_cc_catalog[growclust_cc_catalog["nbranch"] > 1] + growclust_cc_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_cc.csv", index=False) + # %% Debug # def load_Shelly2020(): diff --git a/examples/japan/run_qtm.py b/examples/japan/run_qtm.py new file mode 100644 index 0000000..f2a5a98 --- /dev/null +++ b/examples/japan/run_qtm.py @@ -0,0 +1,133 @@ +# %% +import argparse +import json +import os +from datetime import datetime +from glob import glob +from itertools import product + +import numpy as np +import pandas as pd +import scipy +import torch +from tqdm import tqdm + +# %% +root_path = "local" +region = "hinet" + +result_path = f"{region}/qtm" +if not os.path.exists(f"{root_path}/{result_path}"): + 
os.makedirs(f"{root_path}/{result_path}") + +# %% +with open(f"{root_path}/{region}/config.json", "r") as fp: + config = json.load(fp) + +# %% Get mseed list +# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????/???/??/*.mseed")) +# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????-???/??/*.sac"), reverse=True) +mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/2024-???/??/*.sac"), reverse=True) +subdir = 3 # year-jday/hour/station_id.mseed +mseeds = pd.DataFrame(mseed_list, columns=["fname"]) +mseeds["mseed_id"] = mseeds["fname"].apply(lambda x: "/".join(x.replace(".sac", "").split("/")[-subdir:])) +mseeds["station_id"] = mseeds["fname"].apply(lambda x: x.replace(".sac", "").split("/")[-1]) +# remove .E/.N/.Z or .EB/.NB/.ZB +mseeds["mseed_id"] = mseeds["mseed_id"].apply(lambda x: ".".join(x.split(".")[:-1])) +mseeds["station_id"] = mseeds["station_id"].apply(lambda x: "." + ".".join(x.split(".")[:-1]) + "..") +mseeds["begin_time"] = mseeds["fname"].apply( + lambda x: datetime.strptime( + # f"{x.split('/')[-subdir]}-{x.split('/')[-subdir+1]}T{x.split('/')[-subdir+2]}", "%Y-%jT%H" + f"{x.split('/')[-subdir]}T{x.split('/')[-subdir+1]}", + "%Y-%jT%H", + ).strftime("%Y-%m-%dT%H:%M:%S.%f") +) +mseeds = ( + mseeds.groupby("mseed_id") + .agg( + { + "station_id": lambda x: ",".join(x.unique()), + "begin_time": lambda x: ",".join(x.unique()), + "fname": lambda x: ",".join(sorted(x)), + } + ) + .reset_index() +) +mseeds["idx_mseed"] = np.arange(len(mseeds)) +mseeds.to_csv(f"{root_path}/{region}/qtm/mseed_list.csv", index=False) +with open(f"{root_path}/{region}/qtm/mseed_list.txt", "w") as fp: + fp.write("\n".join(mseeds["fname"])) + +print(f"Number of mseed files: {len(mseeds)}") + +# %% +# with open(f"{root_path}/{region}/qtm/event_phase_station_id.txt", "r") as fp: +# event_phase_station_id = fp.read().splitlines() +# picks = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_picks.csv") +picks = pd.read_csv(f"{root_path}/{region}/qtm/qtm_picks.csv") +picks["phase_time"] = pd.to_datetime(picks["phase_time"], format="mixed") +picks["phase_time"] = picks["phase_time"].dt.tz_localize(None) +picks = picks[ + (picks["phase_time"] >= pd.to_datetime("2024-01-01T00:00:00")) + # & (picks["phase_time"] < pd.to_datetime("2024-01-02T00:00:00")) +] +stations = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_stations.csv") +picks = picks.merge(stations[["idx_sta", "station_id"]], on="idx_sta") +print(picks.iloc[:10]) +print(f"Number of picks: {len(picks)}") + +# %% +# events = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_events.csv") +events = pd.read_csv(f"{root_path}/{region}/qtm/qtm_events.csv") +events["event_time"] = pd.to_datetime(events["event_time"], format="mixed") +events["event_time"] = events["event_time"].dt.tz_localize(None) +events = events[ + (events["event_time"] >= pd.to_datetime("2024-01-01T00:00:00")) + # & (events["event_time"] < pd.to_datetime("2024-01-02T00:00:00")) +] +print(f"Number of events: {len(events)}") + +# %% Generate event mseed pairs +pairs = [] +unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique()) +print(f"Number of unique station ids: {len(unique_station_ids)}") + +# %% +with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp: + mseeds = mseeds.set_index("idx_mseed") + picks = picks.groupby("station_id") + for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"): + station_id = row["station_id"] + if station_id not in unique_station_ids: + continue + for 
idx_pick in picks.get_group(station_id)["idx_pick"].values: + fp.write(f"{idx_mseed},{idx_pick}\n") + +## based on GPU memory +batch = 16 +block_size1 = 1 +block_size2 = 100_000 # ~7GB + +# %% +base_cmd = ( + f"../../CCTorch/run.py --mode=TM --pair_list={root_path}/{region}/qtm/pairs.txt " + f"--data_list1={root_path}/{region}/qtm/mseed_list.txt --data_format1=mseed " + f"--data_list2={root_path}/{region}/cctorch/cctorch_picks.csv --data_path2={root_path}/{region}/cctorch/template.dat --data_format2=memmap " + f"--config={root_path}/{region}/cctorch/config.json --batch_size={batch} --block_size1={block_size1} --block_size2={block_size2} --normalize --reduce_c --result_path={root_path}/{region}/qtm/ccpairs" +) + +# %% +num_gpu = torch.cuda.device_count() +if num_gpu == 0: + if os.uname().sysname == "Darwin": + cmd = f"python {base_cmd} --device=mps" + else: + cmd = f"python {base_cmd} --device=cpu" +elif num_gpu == 1: + cmd = f"python {base_cmd}" +else: + cmd = f"torchrun --standalone --nproc_per_node {num_gpu} {base_cmd}" + +# %% +print(cmd) +os.system(cmd) diff --git a/scripts/run_growclust_cc.py b/scripts/run_growclust_cc.py index 93ab2ed..9e2e012 100644 --- a/scripts/run_growclust_cc.py +++ b/scripts/run_growclust_cc.py @@ -18,7 +18,7 @@ # %% # stations_json = f"{region}/results/data/stations.json" # stations = pd.read_json(f"{root_path}/{stations_json}", orient="index") -station_csv = f"{region}/adloc/ransac_stations.csv" +station_csv = f"{region}/cctorch/cctorch_stations.csv" stations = pd.read_csv(f"{root_path}/{station_csv}") stations.set_index("station_id", inplace=True) @@ -35,12 +35,14 @@ # %% # events_csv = f"{region}/results/phase_association/events.csv" -events_csv = f"{region}/adloc/ransac_events.csv" +# events_csv = f"{region}/adloc/ransac_events.csv" +events_csv = f"{region}/cctorch/cctorch_events.csv" # event_file = f"{region}/cctorch/events.csv" events = pd.read_csv(f"{root_path}/{events_csv}") # event_df = event_df[event_df["gamma_score"] > 10] # event_index = [f"{x:06d}" for x in event_df["event_index"]] -events["time"] = pd.to_datetime(events["time"]) +# events["time"] = pd.to_datetime(events["time"]) +events["time"] = pd.to_datetime(events["event_time"]) if "magnitude" not in events.columns: events["magnitude"] = 0.0 diff --git a/scripts/run_growclust_cc.sh b/scripts/run_growclust_cc.sh index 8f797bd..67d8bce 100644 --- a/scripts/run_growclust_cc.sh +++ b/scripts/run_growclust_cc.sh @@ -77,9 +77,9 @@ TT/tt.sg ***** GrowClust Algorithm Parameters ***** ****************************************** * rmin delmax rmsmax - 0.6 120 1.0 + 0.1 120 1.0 * rpsavgmin, rmincut ngoodmin iponly - 0 0.6 8 0 + 0 0.1 8 0 * ****************************************** ************ Output files **************** diff --git a/scripts/run_qtm.py b/scripts/run_qtm.py index cef6853..75b58a0 100644 --- a/scripts/run_qtm.py +++ b/scripts/run_qtm.py @@ -70,15 +70,16 @@ def parse_args(): # %% Generate event mseed pairs pairs = [] unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique()) - -for station_id in unique_station_ids: - mseed_index = mseeds.loc[mseeds["station_id"] == station_id, "idx_mseed"] - pick_index = picks.loc[picks["station_id"] == station_id, "idx_pick"] - pairs.extend(product(mseed_index, pick_index)) +print(f"Number of unique station ids: {len(unique_station_ids)}") # %% with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp: - fp.write("\n".join([f"{x[0]},{x[1]}" for x in pairs])) + mseeds = mseeds.set_index("idx_mseed") + picks = 
picks.groupby("station_id")
+    for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
+        station_id = row["station_id"]
+        # Skip stations with waveforms but no picks; picks.get_group() would raise a KeyError otherwise
+        if station_id not in unique_station_ids:
+            continue
+        for idx_pick in picks.get_group(station_id)["idx_pick"].values:
+            fp.write(f"{idx_mseed},{idx_pick}\n")
 
 ## based on GPU memory
 batch = 16
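
For reference, the event deduplication added in examples/japan/filter_similar_pairs.py boils down to clustering events on waveform similarity: take the median cross-correlation per event pair, treat 1 - CC as a distance, run DBSCAN with a precomputed metric, and keep one representative event per cluster. Below is a minimal, self-contained sketch of that idea on a synthetic pair list (the DataFrame values and the printed labels are illustrative only, not from the real catalog); it reuses the same thresholds as the script (CC > 0.9, eps = 0.1).

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# Synthetic cross-correlation pairs: events 1 and 2 are near-duplicates (CC > 0.9),
# event 3 correlates poorly with both.
pairs = pd.DataFrame({"idx_eve1": [1, 1, 2], "idx_eve2": [2, 3, 3], "cc": [0.97, 0.40, 0.35]})

# Median CC per event pair, keep only highly similar pairs, convert similarity to distance.
cc = pairs.groupby(["idx_eve1", "idx_eve2"])["cc"].median().reset_index()
neigh = cc[cc["cc"] > 0.9].copy()
neigh["distance"] = 1 - neigh["cc"]

# Precomputed distance matrix: 1 everywhere, 0 on the diagonal,
# 1 - CC for highly similar pairs, symmetrized with the element-wise minimum.
idx = np.union1d(pairs["idx_eve1"], pairs["idx_eve2"])
dist = pd.DataFrame(np.ones((len(idx), len(idx))), index=idx, columns=idx)
for _, row in neigh.iterrows():
    dist.loc[row["idx_eve1"], row["idx_eve2"]] = row["distance"]
dist = np.minimum(dist, dist.T)
np.fill_diagonal(dist.values, 0)

# eps=0.1 groups events whose waveforms correlate above 0.9; label -1 marks
# events without any near-duplicate, which are kept as-is.
labels = DBSCAN(metric="precomputed", eps=0.1, min_samples=2).fit_predict(dist)
print({int(k): int(v) for k, v in zip(dist.index, labels)})  # {1: 0, 2: 0, 3: -1}

In the full script, each cluster is then collapsed to the member with the most high-CC neighbors, and the filtered events/picks are written to qtm_events.csv and qtm_picks.csv for the template-matching step.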