Commit 9915917

add qtm to japan
1 parent acf2fd4 commit 9915917

6 files changed (+278, -11 lines)

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# %%
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# %%
root_path = "local"
region = "hinet"
data_path = f"{root_path}/{region}/cctorch"
result_path = f"{root_path}/{region}/qtm"
if not os.path.exists(result_path):
    os.makedirs(result_path)

# Load the datasets
events = pd.read_csv(f"{data_path}/cctorch_events.csv")
picks = pd.read_csv(f"{data_path}/cctorch_picks.csv")
pairs = pd.read_csv(f"{data_path}/ccpairs/CC_002.csv")
print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}")

# basic filtering
events = events[(events["num_picks"] > 12) & (events["adloc_score"] > 0.9)]
picks = picks[picks["idx_eve"].isin(events["idx_eve"])]
pairs = pairs[pairs["idx_eve1"].isin(events["idx_eve"]) & pairs["idx_eve2"].isin(events["idx_eve"])]
print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}")
# %%
# Step 1: Compute the median CC value for each event pair, keep pairs with CC > 0.9, and build a distance matrix
mean_cc = pairs.groupby(["idx_eve1", "idx_eve2"])["cc"].median().reset_index()
neigh_cc = mean_cc[mean_cc["cc"] > 0.9].copy()
neigh_cc["distance"] = 1 - neigh_cc["cc"]

# Build the distance matrix over all events that appear in at least one high-CC pair
all_events = np.union1d(neigh_cc["idx_eve1"], neigh_cc["idx_eve2"])
distance_matrix = pd.DataFrame(np.ones((len(all_events), len(all_events))), index=all_events, columns=all_events)

# Populate the distance matrix with valid distances from neigh_cc
for _, row in neigh_cc.iterrows():
    distance_matrix.loc[row["idx_eve1"], row["idx_eve2"]] = row["distance"]

# Symmetrize the matrix
distance_matrix = np.minimum(distance_matrix, distance_matrix.T)

# Set the diagonal to 0 (distance of an event to itself)
np.fill_diagonal(distance_matrix.values, 0)
# %%
# Step 2: Apply DBSCAN
dbscan = DBSCAN(metric="precomputed", eps=0.1, min_samples=2)
clusters = dbscan.fit_predict(distance_matrix)

# %%
# Step 3: Map events to clusters and find neighbors
cluster_dict = dict(zip(distance_matrix.index, clusters))
neighbors = defaultdict(list)

for idx, cluster_id in cluster_dict.items():
    if cluster_id == -1:  # Ignore noise
        continue
    # Count the number of neighbors (events with CC > 0.9)
    subset = neigh_cc[(neigh_cc["idx_eve1"] == idx) | (neigh_cc["idx_eve2"] == idx)]
    num_neighbors = subset["cc"].count()
    neighbors[cluster_id].append((idx, num_neighbors))

# For each cluster, select the event with the largest number of neighbors
selected_events = {cluster: max(event_list, key=lambda x: x[1])[0] for cluster, event_list in neighbors.items()}
# %%
# Step 4: Map the filtered `events` and `picks` based on the `selected_events`
# We will first create a mapping of the key events to their respective clusters
event_to_key_event = {}
for cluster, key_event in selected_events.items():
    for idx, _ in neighbors[cluster]:
        event_to_key_event[idx] = key_event

# %%
# Step 5: Filter Events by `idx_eve`, keeping the one with the largest `num_picks`
# Map `idx_eve` to the key event (to map neighbors to key events)
# events["mapped_idx_eve"] = events["idx_eve"].map(event_to_key_event)
events["mapped_idx_eve"] = events["idx_eve"].map(lambda x: event_to_key_event.get(x, x))

# %%
# Now filter events by mapped `idx_eve` (key events), keeping the one with the largest `num_picks`
filtered_events = events.loc[events.groupby("mapped_idx_eve")["num_picks"].idxmax()]

# Step 6: Filter Picks by `(idx_eve, idx_sta, phase_type)`, keeping the one with the largest `phase_score`
# Map `idx_eve` in picks to the key event (to map neighbors to key events)
# picks["mapped_idx_eve"] = picks["idx_eve"].map(event_to_key_event)
picks["mapped_idx_eve"] = picks["idx_eve"].map(lambda x: event_to_key_event.get(x, x))

# Now filter picks by mapped `idx_eve`, `idx_sta`, `phase_type`, keeping the one with the largest `phase_score`
filtered_picks = picks.loc[picks.groupby(["mapped_idx_eve", "idx_sta", "phase_type"])["phase_score"].idxmax()]

print(f"Filtered Events: {filtered_events.shape}, Filtered Picks: {filtered_picks.shape}")

# Save the results to files
filtered_events.to_csv(f"{result_path}/qtm_events.csv", index=False)
filtered_picks.to_csv(f"{result_path}/qtm_picks.csv", index=False)
# %%
plt.figure(figsize=(10, 10))
plt.scatter(events["longitude"], events["latitude"], s=1, c="blue", label="All Events")
plt.scatter(
    filtered_events["longitude"],
    filtered_events["latitude"],
    s=1,
    c="red",
    marker="x",
    label="Filtered Events",
)
plt.legend()
plt.savefig(f"{result_path}/filtered_events.png")
# %%
plt.figure(figsize=(10, 10))
plt.hist(events["adloc_score"], bins=100)
# %%
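For reference, a minimal sketch (not part of the commit) of the clustering idea used above: with distance defined as 1 - CC, DBSCAN with metric="precomputed" and eps=0.1 groups events whose median CC exceeds 0.9, and anything else is labeled noise. The toy distance values below are hypothetical.

# Toy example with three events: events 0 and 1 are near-duplicates (CC = 0.95, distance 0.05),
# event 2 matches neither, so DBSCAN leaves it as noise (-1).
import numpy as np
from sklearn.cluster import DBSCAN

toy_distance = np.array(
    [
        [0.00, 0.05, 1.00],
        [0.05, 0.00, 1.00],
        [1.00, 1.00, 0.00],
    ]
)
labels = DBSCAN(metric="precomputed", eps=0.1, min_samples=2).fit_predict(toy_distance)
print(labels)  # expected: [ 0  0 -1], i.e. events 0 and 1 share a cluster, event 2 is noise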

examples/japan/plot_catalog.py

Lines changed: 10 additions & 0 deletions
@@ -121,6 +121,8 @@
 catalog_ct_hypodd = catalog_ct_hypodd[catalog_ct_hypodd["DEPTH"] != "*********"]
 catalog_ct_hypodd["DEPTH"] = catalog_ct_hypodd["DEPTH"].astype(float)

+catalog_ct_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_ct.csv", index=False)
+
 plt.figure()
 plt.scatter(catalog_ct_hypodd["LON"], catalog_ct_hypodd["LAT"], s=2)
 plt.show()

@@ -170,6 +172,8 @@
 catalog_cc_hypodd = catalog_cc_hypodd[catalog_cc_hypodd["DEPTH"] != "*********"]
 catalog_cc_hypodd["DEPTH"] = catalog_cc_hypodd["DEPTH"].astype(float)

+catalog_cc_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_cc.csv", index=False)
+
 plt.figure()
 plt.scatter(catalog_cc_hypodd["LON"], catalog_cc_hypodd["LAT"], s=2)
 plt.show()

@@ -214,8 +218,12 @@
 growclust_ct_catalog["time"] = growclust_ct_catalog["time"].apply(
     lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f")
 )
+
 growclust_ct_catalog = growclust_ct_catalog[growclust_ct_catalog["nbranch"] > 1]

+growclust_ct_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_ct.csv", index=False)
+
+
 # %%
 growclust_file = f"{root_path}/{region}/growclust/growclust_cc_catalog.txt"
 growclust_cc_exist = False

@@ -258,6 +266,8 @@
 )
 growclust_cc_catalog = growclust_cc_catalog[growclust_cc_catalog["nbranch"] > 1]

+growclust_cc_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_cc.csv", index=False)
+

 # %% Debug
 # def load_Shelly2020():
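The four to_csv calls added above give each relocation workflow (HypoDD CT/CC, GrowClust CT/CC) a plain-CSV copy of its catalog. As a quick check, not part of the commit and using only the paths written above, the exports can be reloaded with pandas:

import pandas as pd

root_path, region = "local", "hinet"
catalog_files = {
    "hypodd_ct": f"{root_path}/{region}/hypodd/hypodd_ct.csv",
    "hypodd_cc": f"{root_path}/{region}/hypodd/hypodd_cc.csv",
    "growclust_ct": f"{root_path}/{region}/growclust/growclust_ct.csv",
    "growclust_cc": f"{root_path}/{region}/growclust/growclust_cc.csv",
}
for name, path in catalog_files.items():
    catalog = pd.read_csv(path)
    print(f"{name}: {len(catalog)} events")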

examples/japan/run_qtm.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# %%
import argparse
import json
import os
from datetime import datetime
from glob import glob
from itertools import product

import numpy as np
import pandas as pd
import scipy
import torch
from tqdm import tqdm

# %%
root_path = "local"
region = "hinet"

result_path = f"{region}/qtm"
if not os.path.exists(f"{root_path}/{result_path}"):
    os.makedirs(f"{root_path}/{result_path}")

# %%
with open(f"{root_path}/{region}/config.json", "r") as fp:
    config = json.load(fp)

# %% Get mseed list
# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????/???/??/*.mseed"))
# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????-???/??/*.sac"), reverse=True)
mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/2024-???/??/*.sac"), reverse=True)
subdir = 3  # year-jday/hour/station_id.mseed
mseeds = pd.DataFrame(mseed_list, columns=["fname"])
mseeds["mseed_id"] = mseeds["fname"].apply(lambda x: "/".join(x.replace(".sac", "").split("/")[-subdir:]))
mseeds["station_id"] = mseeds["fname"].apply(lambda x: x.replace(".sac", "").split("/")[-1])
# remove .E/.N/.Z or .EB/.NB/.ZB
mseeds["mseed_id"] = mseeds["mseed_id"].apply(lambda x: ".".join(x.split(".")[:-1]))
mseeds["station_id"] = mseeds["station_id"].apply(lambda x: "." + ".".join(x.split(".")[:-1]) + "..")
mseeds["begin_time"] = mseeds["fname"].apply(
    lambda x: datetime.strptime(
        # f"{x.split('/')[-subdir]}-{x.split('/')[-subdir+1]}T{x.split('/')[-subdir+2]}", "%Y-%jT%H"
        f"{x.split('/')[-subdir]}T{x.split('/')[-subdir+1]}",
        "%Y-%jT%H",
    ).strftime("%Y-%m-%dT%H:%M:%S.%f")
)
mseeds = (
    mseeds.groupby("mseed_id")
    .agg(
        {
            "station_id": lambda x: ",".join(x.unique()),
            "begin_time": lambda x: ",".join(x.unique()),
            "fname": lambda x: ",".join(sorted(x)),
        }
    )
    .reset_index()
)
mseeds["idx_mseed"] = np.arange(len(mseeds))
mseeds.to_csv(f"{root_path}/{region}/qtm/mseed_list.csv", index=False)
with open(f"{root_path}/{region}/qtm/mseed_list.txt", "w") as fp:
    fp.write("\n".join(mseeds["fname"]))

print(f"Number of mseed files: {len(mseeds)}")

# %%
# with open(f"{root_path}/{region}/qtm/event_phase_station_id.txt", "r") as fp:
#     event_phase_station_id = fp.read().splitlines()
# picks = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_picks.csv")
picks = pd.read_csv(f"{root_path}/{region}/qtm/qtm_picks.csv")
picks["phase_time"] = pd.to_datetime(picks["phase_time"], format="mixed")
picks["phase_time"] = picks["phase_time"].dt.tz_localize(None)
picks = picks[
    (picks["phase_time"] >= pd.to_datetime("2024-01-01T00:00:00"))
    # & (picks["phase_time"] < pd.to_datetime("2024-01-02T00:00:00"))
]
stations = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_stations.csv")
picks = picks.merge(stations[["idx_sta", "station_id"]], on="idx_sta")
print(picks.iloc[:10])
print(f"Number of picks: {len(picks)}")

# %%
# events = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_events.csv")
events = pd.read_csv(f"{root_path}/{region}/qtm/qtm_events.csv")
events["event_time"] = pd.to_datetime(events["event_time"], format="mixed")
events["event_time"] = events["event_time"].dt.tz_localize(None)
events = events[
    (events["event_time"] >= pd.to_datetime("2024-01-01T00:00:00"))
    # & (events["event_time"] < pd.to_datetime("2024-01-02T00:00:00"))
]
print(f"Number of events: {len(events)}")

# %% Generate event mseed pairs
pairs = []
unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique())
print(f"Number of unique station ids: {len(unique_station_ids)}")

# %%
with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp:
    mseeds = mseeds.set_index("idx_mseed")
    picks = picks.groupby("station_id")
    for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
        station_id = row["station_id"]
        if station_id not in unique_station_ids:
            continue
        for idx_pick in picks.get_group(station_id)["idx_pick"].values:
            fp.write(f"{idx_mseed},{idx_pick}\n")

## based on GPU memory
batch = 16
block_size1 = 1
block_size2 = 100_000  # ~7GB

# %%
base_cmd = (
    f"../../CCTorch/run.py --mode=TM --pair_list={root_path}/{region}/qtm/pairs.txt "
    f"--data_list1={root_path}/{region}/qtm/mseed_list.txt --data_format1=mseed "
    f"--data_list2={root_path}/{region}/cctorch/cctorch_picks.csv --data_path2={root_path}/{region}/cctorch/template.dat --data_format2=memmap "
    f"--config={root_path}/{region}/cctorch/config.json --batch_size={batch} --block_size1={block_size1} --block_size2={block_size2} --normalize --reduce_c --result_path={root_path}/{region}/qtm/ccpairs"
)

# %%
num_gpu = torch.cuda.device_count()
if num_gpu == 0:
    if os.uname().sysname == "Darwin":
        cmd = f"python {base_cmd} --device=mps"
    else:
        cmd = f"python {base_cmd} --device=cpu"
elif num_gpu == 1:
    cmd = f"python {base_cmd}"
else:
    cmd = f"torchrun --standalone --nproc_per_node {num_gpu} {base_cmd}"

# %%
print(cmd)
os.system(cmd)
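Each line of pairs.txt couples one waveform file (idx_mseed) with one template pick (idx_pick) for CCTorch to scan. A small sanity check, not part of the commit, that reads the pair list back and attaches the corresponding waveform and pick metadata; column names follow the files written above:

import pandas as pd

root_path, region = "local", "hinet"
pairs = pd.read_csv(f"{root_path}/{region}/qtm/pairs.txt", names=["idx_mseed", "idx_pick"])
mseed_list = pd.read_csv(f"{root_path}/{region}/qtm/mseed_list.csv")
picks = pd.read_csv(f"{root_path}/{region}/qtm/qtm_picks.csv")

# Join the first few pairs with waveform ids and pick metadata for a quick spot check
sample = (
    pairs.head(10)
    .merge(mseed_list[["idx_mseed", "mseed_id"]], on="idx_mseed")
    .merge(picks[["idx_pick", "idx_eve", "phase_type", "phase_time"]], on="idx_pick")
)
print(f"{len(pairs)} pairs written")
print(sample)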

scripts/run_growclust_cc.py

Lines changed: 5 additions & 3 deletions
@@ -18,7 +18,7 @@
 # %%
 # stations_json = f"{region}/results/data/stations.json"
 # stations = pd.read_json(f"{root_path}/{stations_json}", orient="index")
-station_csv = f"{region}/adloc/ransac_stations.csv"
+station_csv = f"{region}/cctorch/cctorch_stations.csv"
 stations = pd.read_csv(f"{root_path}/{station_csv}")
 stations.set_index("station_id", inplace=True)

@@ -35,12 +35,14 @@

 # %%
 # events_csv = f"{region}/results/phase_association/events.csv"
-events_csv = f"{region}/adloc/ransac_events.csv"
+# events_csv = f"{region}/adloc/ransac_events.csv"
+events_csv = f"{region}/cctorch/cctorch_events.csv"
 # event_file = f"{region}/cctorch/events.csv"
 events = pd.read_csv(f"{root_path}/{events_csv}")
 # event_df = event_df[event_df["gamma_score"] > 10]
 # event_index = [f"{x:06d}" for x in event_df["event_index"]]
-events["time"] = pd.to_datetime(events["time"])
+# events["time"] = pd.to_datetime(events["time"])
+events["time"] = pd.to_datetime(events["event_time"])
 if "magnitude" not in events.columns:
     events["magnitude"] = 0.0

scripts/run_growclust_cc.sh

Lines changed: 2 additions & 2 deletions
@@ -77,9 +77,9 @@ TT/tt.sg
 ***** GrowClust Algorithm Parameters *****
 ******************************************
 * rmin delmax rmsmax
-0.6 120 1.0
+0.1 120 1.0
 * rpsavgmin, rmincut ngoodmin iponly
-0 0.6 8 0
+0 0.1 8 0
 *
 ******************************************
 ************ Output files ****************

scripts/run_qtm.py

Lines changed: 7 additions & 6 deletions
@@ -70,15 +70,16 @@ def parse_args():
 # %% Generate event mseed pairs
 pairs = []
 unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique())
-
-for station_id in unique_station_ids:
-    mseed_index = mseeds.loc[mseeds["station_id"] == station_id, "idx_mseed"]
-    pick_index = picks.loc[picks["station_id"] == station_id, "idx_pick"]
-    pairs.extend(product(mseed_index, pick_index))
+print(f"Number of unique station ids: {len(unique_station_ids)}")

 # %%
 with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp:
-    fp.write("\n".join([f"{x[0]},{x[1]}" for x in pairs]))
+    mseeds = mseeds.set_index("idx_mseed")
+    picks = picks.groupby("station_id")
+    for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
+        station_id = row["station_id"]
+        for idx_pick in picks.get_group(station_id)["idx_pick"].values:
+            fp.write(f"{idx_mseed},{idx_pick}\n")


 ## based on GPU memory