Merge branch 'daniel/training_dsec'
Daniel Gehrig committed Jan 27, 2025
2 parents 6853017 + 8012acf commit d79ac9b
Showing 13 changed files with 657 additions and 18 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -1,7 +1,6 @@
*.so
*.egg-info/
*.pyc
data/
build/
libs/
.idea/
.idea/
36 changes: 36 additions & 0 deletions config/dagr-l-ncaltech.yaml
@@ -0,0 +1,36 @@
path: "/data/storage/daniel/aegnn"
output_directory: "/data/storage/daniel/aegnn/logs"
pooling_dim_at_output: 5x7

task: detection
dataset: ncaltech101

# network
radius: 0.01
time_window_us: 1000000
max_neighbors: 16
n_nodes: 50000

batch_size: 64

activation: relu
edge_attr_dim: 2
aggr: sum
kernel_size: 5
pooling_aggr: max

base_width: 0.5
after_pool_width: 1
net_stem_width: 1
yolo_stem_width: 1
num_scales: 1

# learning
weight_decay: 0.00001
clip: 0.1

aug_trans: 0.1
aug_p_flip: 0
aug_zoom: 1
l_r: 0.001
tot_num_epochs: 801
33 changes: 33 additions & 0 deletions readme.md
@@ -147,3 +147,36 @@ python scripts/visualize_detections.py --detections_folder $LOG_DIR/$WANDB_DIR \
```
This will start a visualization window showing the detections over a given sequence. If you want to save the detections to a video, use the `--write_to_output` flag, which will create a video in the folder `$LOG_DIR/$WANDB_DIR/visualization`.
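
As a minimal sketch, saving a sequence to a video only requires appending that flag to the command above; any arguments omitted here are the same as in that command:

```bash
# Sketch: render detections to a video instead of an interactive window.
# All other arguments are identical to the visualization command above.
python scripts/visualize_detections.py --detections_folder $LOG_DIR/$WANDB_DIR \
    --write_to_output
```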

## Training
To train on N-Caltech101, download the files with

```bash
wget https://download.ifi.uzh.ch/rpg/dagr/data/ncaltech101.zip -P $DAGR_DIR/data/
cd $DAGR_DIR/data/
unzip ncaltech101.zip
rm -rf ncaltech101.zip
```

Then run training with

```bash
python scripts/train_ncaltech101.py --config config/dagr-l-ncaltech.yaml \
--exp_name ncaltech_l \
--dataset_directory $DAGR_DIR/data/ \
--output_directory $DAGR_DIR/logs/
```
To train on DSEC, make a symlink to the data directory via
```bash
ln -s $DSEC_ROOT $DAGR_DIR/data/dsec
```
Then run training with
```bash
python scripts/train_dsec.py --config config/dagr-s-dsec.yaml \
--exp_name dsec_s_50 \
--dataset_directory $DAGR_DIR/data/ \
--output_directory $DAGR_DIR/logs/ \
--use_image --img_net resnet50 --batch_size 32
```
184 changes: 184 additions & 0 deletions scripts/train_dsec.py
@@ -0,0 +1,184 @@
# avoid matlab error on server
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

import torch
import tqdm
import wandb
from pathlib import Path
import argparse

from torch_geometric.data import DataLoader

from dagr.utils.logging import Checkpointer, set_up_logging_directory, log_hparams
from dagr.utils.buffers import DetectionBuffer
from dagr.utils.args import FLAGS
from dagr.utils.learning_rate_scheduler import LRSchedule

from dagr.data.augment import Augmentations
from dagr.utils.buffers import format_data
from dagr.data.dsec_data import DSEC

from dagr.model.networks.dagr import DAGR
from dagr.model.networks.ema import ModelEMA


def gradients_broken(model):
    valid_gradients = True
    for name, param in model.named_parameters():
        if param.grad is not None:
            # valid_gradients = not (torch.isnan(param.grad).any() or torch.isinf(param.grad).any())
            valid_gradients = not (torch.isnan(param.grad).any())
            if not valid_gradients:
                break
    return not valid_gradients

def fix_gradients(model):
    for name, param in model.named_parameters():
        if param.grad is not None:
            param.grad = torch.nan_to_num(param.grad, nan=0.0)


def train(loader: DataLoader,
          model: torch.nn.Module,
          ema: ModelEMA,
          scheduler: torch.optim.lr_scheduler.LambdaLR,
          optimizer: torch.optim.Optimizer,
          args: argparse.ArgumentParser,
          run_name=""):

    model.train()

    for i, data in enumerate(tqdm.tqdm(loader, desc=f"Training {run_name}")):
        data = data.cuda(non_blocking=True)
        data = format_data(data)

        optimizer.zero_grad(set_to_none=True)

        model_outputs = model(data)

        loss_dict = {k: v for k, v in model_outputs.items() if "loss" in k}
        loss = loss_dict.pop("total_loss")

        loss.backward()

        torch.nn.utils.clip_grad_value_(model.parameters(), args.clip)

        fix_gradients(model)

        optimizer.step()
        scheduler.step()

        ema.update(model)

        training_logs = {f"training/loss/{k}": v for k, v in loss_dict.items()}
        wandb.log({"training/loss": loss.item(), "training/lr": scheduler.get_last_lr()[-1], **training_logs})

def run_test(loader: DataLoader,
             model: torch.nn.Module,
             dry_run_steps: int=-1,
             dataset="gen1"):

    model.eval()

    mapcalc = DetectionBuffer(height=loader.dataset.height, width=loader.dataset.width, classes=loader.dataset.classes)

    for i, data in enumerate(tqdm.tqdm(loader)):
        data = data.cuda()
        data = format_data(data)

        detections, targets = model(data)
        if i % 10 == 0:
            torch.cuda.empty_cache()

        mapcalc.update(detections, targets, dataset, data.height[0], data.width[0])

        if dry_run_steps > 0 and i == dry_run_steps:
            break

    torch.cuda.empty_cache()

    return mapcalc

if __name__ == '__main__':
    import torch_geometric
    import random
    import numpy as np

    seed = 42
    torch_geometric.seed.seed_everything(seed)
    torch.random.manual_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    args = FLAGS()

    output_directory = set_up_logging_directory(args.dataset, args.task, args.output_directory, exp_name=args.exp_name)
    log_hparams(args)

    augmentations = Augmentations(args)

    print("init datasets")
    dataset_path = args.dataset_directory / args.dataset

    train_dataset = DSEC(root=dataset_path, split="train", transform=augmentations.transform_training, debug=False,
                         min_bbox_diag=15, min_bbox_height=10)
    test_dataset = DSEC(root=dataset_path, split="val", transform=augmentations.transform_testing, debug=False,
                        min_bbox_diag=15, min_bbox_height=10)

    train_loader = DataLoader(train_dataset, follow_batch=['bbox', 'bbox0'], batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True)
    num_iters_per_epoch = len(train_loader)

    sampler = np.random.permutation(np.arange(len(test_dataset)))
    test_loader = DataLoader(test_dataset, sampler=sampler, follow_batch=['bbox', 'bbox0'], batch_size=args.batch_size, shuffle=False, num_workers=4, drop_last=True)

    print("init net")
    # load a dummy sample to get height, width
    model = DAGR(args, height=test_dataset.height, width=test_dataset.width)

    num_params = sum([np.prod(p.size()) for p in model.parameters()])
    print(f"Training with {num_params} parameters.")

    model = model.cuda()
    ema = ModelEMA(model)

    nominal_batch_size = 64
    lr = args.l_r * np.sqrt(args.batch_size) / np.sqrt(nominal_batch_size)
    optimizer = torch.optim.AdamW(list(model.parameters()), lr=lr, weight_decay=args.weight_decay)

    lr_func = LRSchedule(warmup_epochs=.3,
                         num_iters_per_epoch=num_iters_per_epoch,
                         tot_num_epochs=args.tot_num_epochs)

    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lr_func)

    checkpointer = Checkpointer(output_directory=output_directory,
                                model=model, optimizer=optimizer,
                                scheduler=lr_scheduler, ema=ema,
                                args=args)

    start_epoch = checkpointer.restore_if_existing(output_directory, resume_from_best=False)

    start_epoch = 0
    if "resume_checkpoint" in args:
        start_epoch = checkpointer.restore_checkpoint(args.resume_checkpoint, best=False)
        print(f"Resume from checkpoint at epoch {start_epoch}")

    with torch.no_grad():
        mapcalc = run_test(test_loader, ema.ema, dry_run_steps=2, dataset=args.dataset)
        mapcalc.compute()

    print("starting to train")
    for epoch in range(start_epoch, args.tot_num_epochs):
        train(train_loader, model, ema, lr_scheduler, optimizer, args, run_name=wandb.run.name)
        checkpointer.checkpoint(epoch, name=f"last_model")

        if epoch % 3 > 0:
            continue

        with torch.no_grad():
            mapcalc = run_test(test_loader, ema.ema, dataset=args.dataset)
            metrics = mapcalc.compute()
            checkpointer.process(metrics, epoch)
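
A note on the learning rate used above: `scripts/train_dsec.py` scales `l_r` by the square root of the ratio between the actual batch size and a nominal batch size of 64. The sketch below simply evaluates that formula for the readme's DSEC command (`--batch_size 32`); the value `l_r = 0.001` is an assumption borrowed from `config/dagr-l-ncaltech.yaml`, since the DSEC config is not shown in this commit.

```python
import numpy as np

# Square-root learning-rate scaling, as in scripts/train_dsec.py above.
l_r = 0.001               # assumption: taken from config/dagr-l-ncaltech.yaml
batch_size = 32           # as in the readme's DSEC training command
nominal_batch_size = 64

lr = l_r * np.sqrt(batch_size) / np.sqrt(nominal_batch_size)
print(f"effective learning rate: {lr:.6f}")  # ~0.000707
```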
