add qtm to japan
zhuwq0 committed Sep 19, 2024
1 parent acf2fd4 commit 9915917
Showing 6 changed files with 278 additions and 11 deletions.
121 changes: 121 additions & 0 deletions examples/japan/filter_similar_pairs.py
@@ -0,0 +1,121 @@
# %%
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN

# %%
root_path = "local"
region = "hinet"
data_path = f"{root_path}/{region}/cctorch"
result_path = f"{root_path}/{region}/qtm"
if not os.path.exists(result_path):
os.makedirs(result_path)

# Load the datasets
events = pd.read_csv(f"{data_path}/cctorch_events.csv")
picks = pd.read_csv(f"{data_path}/cctorch_picks.csv")
pairs = pd.read_csv(f"{data_path}/ccpairs/CC_002.csv")
print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}")

# basic filtering
events = events[(events["num_picks"] > 12) & (events["adloc_score"] > 0.9)]
picks = picks[picks["idx_eve"].isin(events["idx_eve"])]
pairs = pairs[pairs["idx_eve1"].isin(events["idx_eve"]) & pairs["idx_eve2"].isin(events["idx_eve"])]
print(f"Events: {events.shape}, Picks: {picks.shape}, Pairs: {pairs.shape}")

# %%
# Step 1: Compute the median CC value for each event pair (stored in `mean_cc`), filter for CC > 0.9, and create the distance matrix
mean_cc = pairs.groupby(["idx_eve1", "idx_eve2"])["cc"].median().reset_index()
neigh_cc = mean_cc[mean_cc["cc"] > 0.9].copy()
neigh_cc["distance"] = 1 - neigh_cc["cc"]

# Ensure distance matrix includes all events (even those without high CC values)
all_events = np.union1d(neigh_cc["idx_eve1"], neigh_cc["idx_eve2"])
distance_matrix = pd.DataFrame(np.ones((len(all_events), len(all_events))), index=all_events, columns=all_events)

# Populate the distance matrix with valid distances from neigh_cc
for _, row in neigh_cc.iterrows():
distance_matrix.loc[row["idx_eve1"], row["idx_eve2"]] = row["distance"]

# Symmetrize the matrix
distance_matrix = np.minimum(distance_matrix, distance_matrix.T)

# Set diagonal to 0 (distance of event to itself)
np.fill_diagonal(distance_matrix.values, 0)

# %%
# Step 2: Apply DBSCAN
dbscan = DBSCAN(metric="precomputed", eps=0.1, min_samples=2)
clusters = dbscan.fit_predict(distance_matrix)
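
# (Illustration only, not part of this commit) With metric="precomputed" and eps=0.1,
# two events fall in the same cluster exactly when their distance 1 - CC is <= 0.1,
# i.e. CC >= 0.9. Toy check on a hypothetical 3x3 distance matrix:
# toy = np.array([[0.0, 0.05, 1.0], [0.05, 0.0, 1.0], [1.0, 1.0, 0.0]])
# DBSCAN(metric="precomputed", eps=0.1, min_samples=2).fit_predict(toy)  # -> [0, 0, -1]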

# %%
# Step 3: Map events to clusters and find neighbors
cluster_dict = dict(zip(distance_matrix.index, clusters))
neighbors = defaultdict(list)

for idx, cluster_id in cluster_dict.items():
if cluster_id == -1: # Ignore noise
continue
# Count the number of neighbors (events with CC > 0.9)
subset = neigh_cc[(neigh_cc["idx_eve1"] == idx) | (neigh_cc["idx_eve2"] == idx)]
num_neighbors = subset["cc"].count()
neighbors[cluster_id].append((idx, num_neighbors))

# For each cluster, select the event with the largest number of neighbors
selected_events = {cluster: max(event_list, key=lambda x: x[1])[0] for cluster, event_list in neighbors.items()}

# %%
# Step 4: Map the filtered `events` and `picks` onto the `selected_events`
# First build a mapping from each clustered event to its cluster's key event
event_to_key_event = {}
for cluster, key_event in selected_events.items():
for idx, _ in neighbors[cluster]:
event_to_key_event[idx] = key_event

# %%
# Step 5: Filter Events by `idx_eve`, keeping the one with the largest `num_picks`
# Map `idx_eve` to the key event (to map neighbors to key events)
# events["mapped_idx_eve"] = events["idx_eve"].map(event_to_key_event)
events["mapped_idx_eve"] = events["idx_eve"].map(lambda x: event_to_key_event.get(x, x))

# %%
# Now filter events by mapped `idx_eve` (key events), keeping the one with the largest `num_picks`
filtered_events = events.loc[events.groupby("mapped_idx_eve")["num_picks"].idxmax()]

# Step 6: Filter Picks by `(idx_eve, idx_sta, phase_type)`, keeping the one with the largest `phase_score`
# Map `idx_eve` in picks to the key event (to map neighbors to key events)
# picks["mapped_idx_eve"] = picks["idx_eve"].map(event_to_key_event)
picks["mapped_idx_eve"] = picks["idx_eve"].map(lambda x: event_to_key_event.get(x, x))

# Now filter picks by mapped `idx_eve`, `idx_sta`, `phase_type`, keeping the one with the largest `phase_score`
filtered_picks = picks.loc[picks.groupby(["mapped_idx_eve", "idx_sta", "phase_type"])["phase_score"].idxmax()]


print(f"Filtered Events: {filtered_events.shape}, Filtered Picks: {filtered_picks.shape}")

# Save the results to files
filtered_events.to_csv(f"{result_path}/qtm_events.csv", index=False)
filtered_picks.to_csv(f"{result_path}/qtm_picks.csv", index=False)


# %%
plt.figure(figsize=(10, 10))
plt.scatter(events["longitude"], events["latitude"], s=1, c="blue", label="All Events")
plt.scatter(
filtered_events["longitude"],
filtered_events["latitude"],
s=1,
c="red",
marker="x",
label="Filtered Events",
)
plt.legend()
plt.savefig(f"{result_path}/filtered_events.png")
# %%
plt.figure(figsize=(10, 10))
plt.hist(events["adloc_score"], bins=100)
# %%
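
A side note, not part of the commit: the per-row loop that populates distance_matrix above could also be vectorized with a pandas pivot. A minimal sketch under the same column names (idx_eve1, idx_eve2, distance) used in filter_similar_pairs.py:

import numpy as np
import pandas as pd

def build_distance_matrix(neigh_cc: pd.DataFrame) -> pd.DataFrame:
    """Symmetric (1 - cc) distance matrix with zeros on the diagonal."""
    all_events = np.union1d(neigh_cc["idx_eve1"], neigh_cc["idx_eve2"])
    dist = (
        neigh_cc.pivot_table(index="idx_eve1", columns="idx_eve2", values="distance")
        .reindex(index=all_events, columns=all_events)
        .fillna(1.0)  # pairs without a high-CC link keep the maximum distance of 1
    )
    dist = pd.DataFrame(np.minimum(dist.values, dist.values.T), index=all_events, columns=all_events)
    np.fill_diagonal(dist.values, 0.0)
    return dist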
10 changes: 10 additions & 0 deletions examples/japan/plot_catalog.py
@@ -121,6 +121,8 @@
catalog_ct_hypodd = catalog_ct_hypodd[catalog_ct_hypodd["DEPTH"] != "*********"]
catalog_ct_hypodd["DEPTH"] = catalog_ct_hypodd["DEPTH"].astype(float)

catalog_ct_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_ct.csv", index=False)

plt.figure()
plt.scatter(catalog_ct_hypodd["LON"], catalog_ct_hypodd["LAT"], s=2)
plt.show()
@@ -170,6 +172,8 @@
catalog_cc_hypodd = catalog_cc_hypodd[catalog_cc_hypodd["DEPTH"] != "*********"]
catalog_cc_hypodd["DEPTH"] = catalog_cc_hypodd["DEPTH"].astype(float)

catalog_cc_hypodd.to_csv(f"{root_path}/{region}/hypodd/hypodd_cc.csv", index=False)

plt.figure()
plt.scatter(catalog_cc_hypodd["LON"], catalog_cc_hypodd["LAT"], s=2)
plt.show()
@@ -214,8 +218,12 @@
growclust_ct_catalog["time"] = growclust_ct_catalog["time"].apply(
lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f")
)

growclust_ct_catalog = growclust_ct_catalog[growclust_ct_catalog["nbranch"] > 1]

growclust_ct_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_ct.csv", index=False)


# %%
growclust_file = f"{root_path}/{region}/growclust/growclust_cc_catalog.txt"
growclust_cc_exist = False
@@ -258,6 +266,8 @@
)
growclust_cc_catalog = growclust_cc_catalog[growclust_cc_catalog["nbranch"] > 1]

growclust_cc_catalog.to_csv(f"{root_path}/{region}/growclust/growclust_cc.csv", index=False)


# %% Debug
# def load_Shelly2020():
133 changes: 133 additions & 0 deletions examples/japan/run_qtm.py
@@ -0,0 +1,133 @@
# %%
import argparse
import json
import os
from datetime import datetime
from glob import glob
from itertools import product

import numpy as np
import pandas as pd
import scipy
import torch
from tqdm import tqdm

# %%
root_path = "local"
region = "hinet"

result_path = f"{region}/qtm"
if not os.path.exists(f"{root_path}/{result_path}"):
os.makedirs(f"{root_path}/{result_path}")

# %%
with open(f"{root_path}/{region}/config.json", "r") as fp:
config = json.load(fp)

# %% Get mseed list
# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????/???/??/*.mseed"))
# mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/????-???/??/*.sac"), reverse=True)
mseed_list = sorted(glob(f"{root_path}/{region}/waveforms/2024-???/??/*.sac"), reverse=True)
subdir = 3 # year-jday/hour/station_id.sac
mseeds = pd.DataFrame(mseed_list, columns=["fname"])
mseeds["mseed_id"] = mseeds["fname"].apply(lambda x: "/".join(x.replace(".sac", "").split("/")[-subdir:]))
mseeds["station_id"] = mseeds["fname"].apply(lambda x: x.replace(".sac", "").split("/")[-1])
# remove .E/.N/.Z or .EB/.NB/.ZB
mseeds["mseed_id"] = mseeds["mseed_id"].apply(lambda x: ".".join(x.split(".")[:-1]))
mseeds["station_id"] = mseeds["station_id"].apply(lambda x: "." + ".".join(x.split(".")[:-1]) + "..")
mseeds["begin_time"] = mseeds["fname"].apply(
lambda x: datetime.strptime(
# f"{x.split('/')[-subdir]}-{x.split('/')[-subdir+1]}T{x.split('/')[-subdir+2]}", "%Y-%jT%H"
f"{x.split('/')[-subdir]}T{x.split('/')[-subdir+1]}",
"%Y-%jT%H",
).strftime("%Y-%m-%dT%H:%M:%S.%f")
)
mseeds = (
mseeds.groupby("mseed_id")
.agg(
{
"station_id": lambda x: ",".join(x.unique()),
"begin_time": lambda x: ",".join(x.unique()),
"fname": lambda x: ",".join(sorted(x)),
}
)
.reset_index()
)
mseeds["idx_mseed"] = np.arange(len(mseeds))
mseeds.to_csv(f"{root_path}/{region}/qtm/mseed_list.csv", index=False)
with open(f"{root_path}/{region}/qtm/mseed_list.txt", "w") as fp:
fp.write("\n".join(mseeds["fname"]))

print(f"Number of mseed files: {len(mseeds)}")

# %%
# with open(f"{root_path}/{region}/qtm/event_phase_station_id.txt", "r") as fp:
# event_phase_station_id = fp.read().splitlines()
# picks = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_picks.csv")
picks = pd.read_csv(f"{root_path}/{region}/qtm/qtm_picks.csv")
picks["phase_time"] = pd.to_datetime(picks["phase_time"], format="mixed")
picks["phase_time"] = picks["phase_time"].dt.tz_localize(None)
picks = picks[
(picks["phase_time"] >= pd.to_datetime("2024-01-01T00:00:00"))
# & (picks["phase_time"] < pd.to_datetime("2024-01-02T00:00:00"))
]
stations = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_stations.csv")
picks = picks.merge(stations[["idx_sta", "station_id"]], on="idx_sta")
print(picks.iloc[:10])
print(f"Number of picks: {len(picks)}")

# %%
# events = pd.read_csv(f"{root_path}/{region}/cctorch/cctorch_events.csv")
events = pd.read_csv(f"{root_path}/{region}/qtm/qtm_events.csv")
events["event_time"] = pd.to_datetime(events["event_time"], format="mixed")
events["event_time"] = events["event_time"].dt.tz_localize(None)
events = events[
(events["event_time"] >= pd.to_datetime("2024-01-01T00:00:00"))
# & (events["event_time"] < pd.to_datetime("2024-01-02T00:00:00"))
]
print(f"Number of events: {len(events)}")

# %% Generate event mseed pairs
pairs = []
unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique())
print(f"Number of unique station ids: {len(unique_station_ids)}")

# %%
with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp:
mseeds = mseeds.set_index("idx_mseed")
picks = picks.groupby("station_id")
for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
station_id = row["station_id"]
if station_id not in unique_station_ids:
continue
for idx_pick in picks.get_group(station_id)["idx_pick"].values:
fp.write(f"{idx_mseed},{idx_pick}\n")

## based on GPU memory
batch = 16
block_size1 = 1
block_size2 = 100_000 # ~7GB

# %%
base_cmd = (
f"../../CCTorch/run.py --mode=TM --pair_list={root_path}/{region}/qtm/pairs.txt "
f"--data_list1={root_path}/{region}/qtm/mseed_list.txt --data_format1=mseed "
f"--data_list2={root_path}/{region}/cctorch/cctorch_picks.csv --data_path2={root_path}/{region}/cctorch/template.dat --data_format2=memmap "
f"--config={root_path}/{region}/cctorch/config.json --batch_size={batch} --block_size1={block_size1} --block_size2={block_size2} --normalize --reduce_c --result_path={root_path}/{region}/qtm/ccpairs"
)

# %%
num_gpu = torch.cuda.device_count()
if num_gpu == 0:
if os.uname().sysname == "Darwin":
cmd = f"python {base_cmd} --device=mps"
else:
cmd = f"python {base_cmd} --device=cpu"
elif num_gpu == 1:
cmd = f"python {base_cmd}"
else:
cmd = f"torchrun --standalone --nproc_per_node {num_gpu} {base_cmd}"

# %%
print(cmd)
os.system(cmd)
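
For reference, not part of the commit: the begin_time parsing above assumes a <year>-<jday>/<hour>/<file>.sac directory layout, so "%Y-%jT%H" recovers the start of each hour-long window. A small standalone check with a hypothetical path:

from datetime import datetime

fname = "local/hinet/waveforms/2024-015/03/N.ABCH.E.sac"  # hypothetical example path
subdir = 3  # year-jday / hour / station file
parts = fname.split("/")
begin_time = datetime.strptime(f"{parts[-subdir]}T{parts[-subdir + 1]}", "%Y-%jT%H")
print(begin_time.strftime("%Y-%m-%dT%H:%M:%S.%f"))  # 2024-01-15T03:00:00.000000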
8 changes: 5 additions & 3 deletions scripts/run_growclust_cc.py
@@ -18,7 +18,7 @@
# %%
# stations_json = f"{region}/results/data/stations.json"
# stations = pd.read_json(f"{root_path}/{stations_json}", orient="index")
station_csv = f"{region}/adloc/ransac_stations.csv"
station_csv = f"{region}/cctorch/cctorch_stations.csv"
stations = pd.read_csv(f"{root_path}/{station_csv}")
stations.set_index("station_id", inplace=True)

@@ -35,12 +35,14 @@

# %%
# events_csv = f"{region}/results/phase_association/events.csv"
events_csv = f"{region}/adloc/ransac_events.csv"
# events_csv = f"{region}/adloc/ransac_events.csv"
events_csv = f"{region}/cctorch/cctorch_events.csv"
# event_file = f"{region}/cctorch/events.csv"
events = pd.read_csv(f"{root_path}/{events_csv}")
# event_df = event_df[event_df["gamma_score"] > 10]
# event_index = [f"{x:06d}" for x in event_df["event_index"]]
events["time"] = pd.to_datetime(events["time"])
# events["time"] = pd.to_datetime(events["time"])
events["time"] = pd.to_datetime(events["event_time"])
if "magnitude" not in events.columns:
events["magnitude"] = 0.0

4 changes: 2 additions & 2 deletions scripts/run_growclust_cc.sh
Expand Up @@ -77,9 +77,9 @@ TT/tt.sg
***** GrowClust Algorithm Parameters *****
******************************************
* rmin delmax rmsmax
0.6 120 1.0
0.1 120 1.0
* rpsavgmin, rmincut ngoodmin iponly
0 0.6 8 0
0 0.1 8 0
*
******************************************
************ Output files ****************
13 changes: 7 additions & 6 deletions scripts/run_qtm.py
@@ -70,15 +70,16 @@ def parse_args():
# %% Generate event mseed pairs
pairs = []
unique_station_ids = np.intersect1d(mseeds["station_id"].unique(), picks["station_id"].unique())

for station_id in unique_station_ids:
mseed_index = mseeds.loc[mseeds["station_id"] == station_id, "idx_mseed"]
pick_index = picks.loc[picks["station_id"] == station_id, "idx_pick"]
pairs.extend(product(mseed_index, pick_index))
print(f"Number of unique station ids: {len(unique_station_ids)}")

# %%
with open(f"{root_path}/{region}/qtm/pairs.txt", "w") as fp:
fp.write("\n".join([f"{x[0]},{x[1]}" for x in pairs]))
mseeds = mseeds.set_index("idx_mseed")
picks = picks.groupby("station_id")
for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
station_id = row["station_id"]
for idx_pick in picks.get_group(station_id)["idx_pick"].values:
fp.write(f"{idx_mseed},{idx_pick}\n")

## based on GPU memory
batch = 16
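
A side note, not from the commit: the rewritten loop streams pairs straight to disk instead of materializing the full itertools.product list, which keeps memory flat for large pick sets. Unlike the examples/japan version above, the visible hunk calls picks.get_group(station_id) without first checking that the station has picks, so a station present in the waveform list but absent from the picks would raise KeyError. A hedged sketch of a defensive variant (function name and arguments are illustrative):

import pandas as pd
from tqdm import tqdm

def write_pairs(fp, mseeds: pd.DataFrame, picks_by_station) -> None:
    """Stream (idx_mseed, idx_pick) pairs, skipping stations without picks.

    Assumes `mseeds` is indexed by idx_mseed and `picks_by_station` is a
    DataFrameGroupBy keyed on station_id, as in the script above.
    """
    for idx_mseed, row in tqdm(mseeds.iterrows(), total=len(mseeds), desc="Writing pairs"):
        station_id = row["station_id"]
        if station_id not in picks_by_station.groups:  # get_group would raise KeyError
            continue
        for idx_pick in picks_by_station.get_group(station_id)["idx_pick"].values:
            fp.write(f"{idx_mseed},{idx_pick}\n")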
