Allow picking from the shots without requiring to save them on the disk #3

Merged 5 commits on Oct 31, 2024
36 changes: 36 additions & 0 deletions first_break_picking/data/data_utils.py
@@ -796,3 +796,39 @@ def load_dispersion_curve(curves: pd.DataFrame,
curve = curve[(curve["frequency"] <= fmax) & (curve["frequency"] >= fmin)]

return np.floor(curve) # / [dv, df])

def fb_pre_process_data(shot: np.ndarray,
                        fb: np.ndarray,
                        split_nt: int,
                        overlap: float,
                        time_window: List[int],
                        scale: bool,
                        grayscale: bool
                        ):
    """
    Preprocess one shot (and, optionally, its first breaks) in memory:
    crop to `time_window`, normalize, convert to grayscale, and split
    into overlapping sub-shots.
    """
if time_window is not None:
shot = shot[time_window[0]:time_window[1], :]
if scale:
shot = data_normalize_and_limiting(data=shot)
if grayscale:
shot = shot_to_gray_scale(shot)

points = starting_points(shot.shape[1], split_nt, overlap)
sub_shots = shot_spliting(
shot=shot,
points=points,
split_nt=split_nt
)

sub_fbs = [None]

if fb is not None:
sub_fbs = fb_spliting(
fb=fb,
points=points,
split_nt=split_nt
)

return sub_shots, shot.shape[1], sub_fbs
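
A quick usage sketch of this new helper (the input array is random and the shapes are only illustrative; in practice `shot` holds one record with time samples along the rows and traces along the columns):

```Python
import numpy as np

from first_break_picking.data.data_utils import fb_pre_process_data

shot = np.random.rand(1024, 48)  # (n_time_samples, n_traces)

# Crop to the first 512 samples, scale, convert to grayscale,
# and split into 22-trace sub-shots with 15 % overlap.
sub_shots, n_traces, sub_fbs = fb_pre_process_data(
    shot, fb=None,
    split_nt=22,
    overlap=0.15,
    time_window=[0, 512],
    scale=True,
    grayscale=True)

print(len(sub_shots), n_traces, sub_fbs)  # e.g. 3 48 [None]
```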
25 changes: 14 additions & 11 deletions first_break_picking/train_eval/_predicting_tools.py
@@ -66,7 +66,6 @@ def predict(model,
def fb_predict_validation(
batch: torch.Tensor,
model,
upsampler,
split_nt: int,
overlap: float,
shot_id: str,
@@ -76,6 +75,7 @@ def fb_predict_validation(
# case_specific_parameters: dict,
case_specific_parameters
):
# TODO Write test for this function
n_original_time_sampels = case_specific_parameters["n_original_time_sampels"]

n_trace = data_info.loc[shot_id][0]
@@ -93,9 +93,10 @@
batch = batch.squeeze(0).to(device=device)
n_subshots = batch.shape[0]

batch, true_mask = upsampler(
batch,
true_mask.squeeze(0))
# batch, true_mask = upsampler(
# batch,
# true_mask.squeeze(0))
true_mask.squeeze_(0)

predicted, prob = predict(model=model, data=batch,
binary=False)
@@ -126,7 +127,6 @@ def fb_predict_validation(
def fb_predict_test(
batch: torch.Tensor,
model,
upsampler,
split_nt: int,
overlap: float,
shot_id: str,
@@ -136,7 +136,7 @@
):
n_original_time_sampels = case_specific_parameters["n_original_time_sampels"]

n_trace = data_info.loc[shot_id][0]
n_trace = data_info.loc[shot_id].iloc[0]
device = "cpu"
model = model.to(device=device)

@@ -146,12 +146,13 @@
n_trace = n_trace,
overlap = overlap
)


# batch [1, n_subshot, 1, n_upsample_row, n_upsample_col]
batch = batch.squeeze(0).to(device=device)
n_subshots = batch.shape[0]

batch, _ = upsampler(
batch, batch)
# batch, _ = upsampler(
# batch, batch)

predicted, prob = predict( # Check why I use prob here but not for validation
model=model, data=batch,
@@ -160,14 +161,16 @@
prob1 = downsample(prob)
predicted = torch.argmax(prob1, dim=1)
batch = downsample(batch).squeeze(1)

# batch ([n_subshots, n_time_steps, 22])

predicted1 = torch.zeros((n_original_time_sampels, n_trace))
shot1 = torch.zeros((n_original_time_sampels, n_trace))

for i in range(n_subshots):
predicted1[:, points[i]:points[i+1]] = predicted[i,:, :points[i+1] - points[i]]
shot1[:, points[i]:points[i+1]] = batch[i,:, :points[i+1] - points[i]]

# shot1 ([n_time_steps, number of total traces in the original shot])

predicted_pick = _fb_smooth_result(
predicted=predicted1,
n_trace=n_trace,
6 changes: 4 additions & 2 deletions first_break_picking/train_eval/ai_tools.py
@@ -21,10 +21,12 @@ class Upsample:
def __init__(self, size: Tuple[int, int]):

self._upsampler = torch.nn.Upsample(size=size)

self.size = self._upsampler.size

def __call__(self, data, label):

return self._upsampler(data), self._upsampler(label).long()
return (self._upsampler(data),
(self._upsampler(label)).long())


def normalize_metrics(metrics: dict) -> dict:
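
A usage sketch of the updated `Upsample` wrapper (the shapes are chosen only for illustration):

```Python
import torch

from first_break_picking.train_eval.ai_tools import Upsample

upsampler = Upsample(size=(512, 512))

data = torch.rand(2, 1, 512, 22)  # (n_subshots, channels, rows, cols)
label = torch.randint(0, 2, (2, 1, 512, 22)).float()

up_data, up_label = upsampler(data, label)
print(upsampler.size)   # (512, 512)
print(up_data.shape)    # torch.Size([2, 1, 512, 512])
print(up_label.dtype)   # torch.int64, thanks to .long()
```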
51 changes: 49 additions & 2 deletions first_break_picking/train_eval/predict.py
@@ -20,12 +20,14 @@
from tqdm import tqdm
import numpy as np

# from first_break_picking.data import data_utils as data_tools
import first_break_picking.data.data_utils as data_tools
from first_break_picking.data.dataset import get_predict_dataset
from first_break_picking.train_eval.unet import UNet
import first_break_picking.train_eval.parameter_tools as ptools
import first_break_picking.train_eval.ai_tools as tools
from first_break_picking.data.data_segy2npy import load_one_general_file
from first_break_picking.data.data_segy2npy import (load_one_general_file,
general_transform,
info_dict_2_df)

class Predictor:
def __init__(self,
@@ -126,7 +128,52 @@ def __init__(self,
)

self.smoothing_value = smoothing_threshold

def predict_ndarray_fb(self,
shot: np.ndarray) -> np.ndarray:

# Placeholder shot id; it only matters when more than one shot is processed
ffid: str = "12"

sub_shots, n_total_traces, _ = data_tools.fb_pre_process_data(
shot, fb=None,
split_nt=self.split_nt,
overlap=self.overlap,
time_window=[0, self.upsampled_size_row],
scale=True,
grayscale=True
)

n_subshots = len(sub_shots)
data = torch.zeros((1, n_subshots, 1,
self.upsampled_size_row, self.split_nt))
# nsp: data.shape = [1, 2, 1, 512, 32]

_transformer = general_transform()
for i, sub in enumerate(sub_shots):
data[0, i, 0, ...] = _transformer(sub)
# nsp: data.shape = [1, 2, 1, 512, 32]

data_info = info_dict_2_df({str(ffid): [n_total_traces, n_subshots]})
data_info = data_info.set_index("shot_id")

# nsp: data.shape = [1, 2, 1, 512, 32]
data, _ = self.upsampler(data.squeeze(0), data.squeeze(0))
# nsp: data.shape = [2, 512, 512]
# data = data.unsqueeze(1)

shot, predicted_pick, predicted_segment = self.predict_test(
batch=data,
model=self.model,
split_nt=self.split_nt,
overlap=self.overlap,
shot_id=ffid,
smoothing_threshold=self.smoothing_threshold,
data_info=data_info,
case_specific_parameters=self.case_specific_parameters
)
return np.array(predicted_pick * self.dt)

def predict(self, path_data: str):
"""Predict FB

59 changes: 36 additions & 23 deletions readme.md
@@ -1,7 +1,8 @@
<a id="top"></a>

# First-Break Picking Using Deep Learning

This repository is used to implement the first-break (FB) picking task using deep learning.
For this purpose, we use a U-net to segment the data as before and after first arrivals.

- [First-Break Picking Using Deep Learning](#first-break-picking-using-deep-learning)
@@ -16,44 +17,48 @@
In a seismic shot record, the first arrival is usually the direct wave from the source followed by refractions (Figure 1). The travel time of a seismic wave from a source to a geophone is called the first break. First breaks are invaluable sources of information in near-surface studies. We can employ first breaks to obtain a velocity model of the near surface. In addition to their importance for refraction inversion and understanding the characteristics of the near surface, first breaks can be employed to perform successful reflection seismic processing and multi-channel analysis of surface waves (MASW).

![Alt text](./readme_files/waves.png)

## 1. Installation

To install this package, run

```console
pip install git+https://github.com/geo-stack/first_break_picking.git
```

## 2. First-Break Picking
We solve first-break picking as a segmentation problem. This means that we have two segments,

1. before FB,
2. after FB.

In this way, FB can be picked as the interface between two segments.
![segmentation](./readme_files/fb_introducing_fb_segmentation.png)
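
As a toy illustration of this idea (not the package's actual post-processing), if the network outputs a mask that is `0` before the first arrival and `1` after it, the pick for each trace is simply the first row where the mask switches on:

```Python
import numpy as np

# A tiny 6 x 4 "segmentation mask": rows are time samples, columns are traces.
mask = np.array([[0, 0, 0, 0],
                 [0, 0, 1, 0],
                 [1, 0, 1, 0],
                 [1, 1, 1, 0],
                 [1, 1, 1, 1],
                 [1, 1, 1, 1]])

fb_sample = mask.argmax(axis=0)  # first row where each column becomes 1
print(fb_sample)  # [2 3 1 4] -> multiply by dt to convert samples to time
```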

In the next sections, we show how to prepare the dataset and the processing steps that can be applied to improve the accuracy of the results.

### 2.1 Initial data files

To use this package, a user needs to prepare the dataset appropriately.
In one folder, we need the seismic data and the corresponding FB (for training) in `.npy` and `.txt` formats.
An example of the first-break file can be seen in the following figure.

![segmentation](./readme_files/fb_data.png)
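
As a minimal sketch of how such a pair of files could be loaded (the file names here are hypothetical, and the exact column layout of the FB file is the one shown in the figure above):

```Python
import numpy as np

shot = np.load("shot_101.npy")   # 2-D array: rows = time samples, columns = traces
fb = np.loadtxt("shot_101.txt")  # first-break picks for the same shot

print(shot.shape)  # e.g. (1024, 48) for a 48-trace record
```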


### 2.2 Data preprocessing

After preparing the initial data files in `.npy` and `.txt` formats, we can perform some preprocessing steps using `save_shots_fb`. To explain the arguments of this function, let's look at the following figure.

<a id="Figure2"></a>
![data_preprocessing](./readme_files/fb_data_preparing.png)

- We have a great data imbalance, which leads to a decrease in accuracy. To deal with this problem, we crop the data (a) to generate the data presented in (b). For this purpose, `save_shots_fb` gets an argument called `time_window`, a list with two integer values showing the beginning and the end of the cropping window (in terms of samples, NOT time). The first element of this list should normally be `0`. For example, I use `time_window = [0, 512]`.
- In the next step, we scale the data to increase the accuracy and ease of learning. This step leads to image (c). To do so, the user can set two boolean arguments, `scale` and `grayscale`, to `True`.
- For data augmentation, we divide each seismic shot into subimages with a specific overlap (d and e); see the windowing sketch after this list. For this purpose, `save_shots_fb` gets `split_nt` to specify the number of columns in each subimage and `overlap`, which defines the overlap of subimages as a fraction between `0.0` and `1.0`. I usually use `overlap = 0.15`. For shots with 48 traces, I use `split_nt = 22`, but for shots with more traces, we can use a larger value for `split_nt`.
- It is important to provide `save_shots_fb` with the correct value for the sampling rate as `dt`.
- This function also gets two other arguments to specify the extensions of the shot and first-break files, `shot_ext` and `fb_ext`. This can be used to extend the code easily in case we want to load `.segy` or `.json` files.
- `save_shots_fb` saves the processed data at `dir_to_save`.
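
To make the splitting concrete, here is a small sketch of the windowing arithmetic (`starting_points_sketch` is hypothetical; the package's own `starting_points` helper may choose the windows differently):

```Python
def starting_points_sketch(n_traces: int, split_nt: int, overlap: float) -> list:
    """Start columns of `split_nt`-trace windows that overlap by
    roughly `overlap * split_nt` traces."""
    stride = max(1, int(split_nt * (1 - overlap)))
    starts = list(range(0, max(n_traces - split_nt, 0) + 1, stride))
    # Clip the last window to the right edge so every trace is covered.
    if starts[-1] + split_nt < n_traces:
        starts.append(n_traces - split_nt)
    return starts

# A 48-trace shot, 22-trace windows, 15 % overlap:
print(starting_points_sketch(48, 22, 0.15))  # [0, 18, 26]
```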

So, here is how to call this function,

@@ -77,6 +82,7 @@

```Python
data_info = save_shots_fb(
    ...)  # arguments elided in the diff view
data_info.to_csv(f"{path_save}_data_info.txt", index=False)
```

The function `save_shots_fb` returns a pandas DataFrame, which should be saved for use during prediction.
Here is an example of saved data for a project.

@@ -87,7 +93,9 @@
</div>

### 2.3 Training for FB picking

To train a network, we use the function `train`. This function takes several arguments, described below.

- `train_data_path`: Path of the training dataset (can be a list of different datasets).
- `upsampled_size_row`: We upsample the data samples before sending them to the model. This variable defines the number of rows of the upsampled data (must be divisible by 16).
- `upsampled_size_col`: This variable defines the number of columns of the upsampled data (must be divisible by 16).

@@ -99,9 +107,10 @@

- `checkpoint_path`: In case a user wants to start training from a pretrained network, the path of the checkpoint should be specified here.
- `step_size_milestone`: Used to define a learning rate scheduler. If you want to halve the learning rate after a specific number of epochs, use this argument.
- `show`: A boolean that specifies whether the user wants to monitor the learning procedure. If set to `True`, a figure like the following example is presented.
![files](./readme_files/fb_train.gif)

Here is an example of calling this function,

```Python
from first_break_picking import train
from first_break_picking.tools import seed_everything
# ... (the diff view truncates this example)
train(train_data_path,
      ...)
```

### 2.4 Predicting the first break of one seismic shot
If you want to predict the first breaks in numerous shots, you should create the dataset as described [here](#22-data-preprocessing).
However, if you need to predict the first break of only one shot (or of all shots in a loop, without saving a dataset), the class `Predictor` should be used.
This object can be created as,

```Python
from first_break_picking import Predictor

predictor = Predictor(
path_to_save="path/to/save/results/",
checkpoint_path=checkpoint,
split_nt=split_nt,  # Number of traces in each split of the shot
overlap=overlap,  # Overlap between neighboring splits
upsampled_size_row=n_time_sampels,  # Number of rows in the upsampled shot
upsampled_size_col=upsampled_size_col,  # Number of columns in the upsampled shot
dt=dt,  # Temporal sampling rate
smoothing_threshold=smoothing_threshold,
model_name="unet_resnet"
)
```

- `path_to_save`: Path to a folder to save the result (will be overwritten).
- `checkpoint_path`: Path of the checkpoint that is saved after training.
- `split_nt`: Number of columns in each subimage.
@@ -162,14 +174,15 @@
- `upsampled_size_col`: Number of columns in the upsampled image.
- `dt`: Temporal sampling rate.
- `smoothing_threshold`: An integer used to suppress artifacts generated above the true FB.

By creating this object, we can now give the path of one seismic shot (as presented in [Figure 2a](#Figure2)) to the method `predict` and get the first break.

```Python
predictor.predict(
path_data=path_data
)
```

![data_preprocessing](./readme_files/fb_predict_one.png)
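
This pull request also adds `Predictor.predict_ndarray_fb`, which accepts a shot that is already in memory as a NumPy array and returns the picks scaled by `dt`, so nothing has to be written to disk first. A minimal sketch, assuming `predictor` is the object created above and the `.npy` path is a placeholder:

```Python
import numpy as np

shot = np.load("path/to/one_shot.npy")  # (n_time_samples, n_traces)

picks = predictor.predict_ndarray_fb(shot=shot)
print(picks)  # first-break times, i.e. picked samples multiplied by dt
```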

<div class="alert alert-block alert-warning">
<!-- note content elided in the diff view -->
</div>

@@ -180,6 +193,7 @@
## 3. Cite this work

If you use this package or the dataset, please cite the following paper.

```
@article{mardan2024fine,
title = {A fine‐tuning workflow for automatic first‐break picking with deep learning.},
@@ -191,11 +205,10 @@
doi = {https://doi.org/10.1002/nsg.12316}
}
```

<!-- ## Issues and Questions -->
**Acknowledgment:**<br>
This work, developed by [Amir Mardan](https://github.com/AmirMardan), was supported by Mitacs through the Mitacs Elevate Program.

[Top](#top)

2 changes: 1 addition & 1 deletion setup.py
@@ -12,7 +12,7 @@
packages=find_packages(exclude=["*.pyc"]),
include_package_data=True,
package_data={
"": ["*.txt", "checkpoint_20_2projects.tar"],
"": ["*.txt", "fb_20.tar"],
},
install_requires = [
# "segyio",