Showing 13 changed files with 657 additions and 18 deletions.
@@ -1,7 +1,6 @@
*.so
*.egg-info/
*.pyc
data/
build/
libs/
.idea/
.idea/
@@ -0,0 +1,36 @@
path: "/data/storage/daniel/aegnn"
output_directory: "/data/storage/daniel/aegnn/logs"
pooling_dim_at_output: 5x7

task: detection
dataset: ncaltech101

# network
radius: 0.01
time_window_us: 1000000
max_neighbors: 16
n_nodes: 50000

batch_size: 64

activation: relu
edge_attr_dim: 2
aggr: sum
kernel_size: 5
pooling_aggr: max

base_width: 0.5
after_pool_width: 1
net_stem_width: 1
yolo_stem_width: 1
num_scales: 1

# learning
weight_decay: 0.00001
clip: 0.1

aug_trans: 0.1
aug_p_flip: 0
aug_zoom: 1
l_r: 0.001
tot_num_epochs: 801
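
For reference only: the block above is plain YAML, so it could be read with PyYAML as sketched below. The filename detection_ncaltech101.yaml is an assumed placeholder; the diff does not show the file's actual name or how the repository parses it.

# hypothetical example: load the config above with PyYAML and inspect a few fields
import yaml

with open("detection_ncaltech101.yaml") as f:  # assumed filename, not shown in the diff
    cfg = yaml.safe_load(f)

print(cfg["task"], cfg["dataset"])    # -> detection ncaltech101
print(cfg["radius"], cfg["n_nodes"])  # -> 0.01 50000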
@@ -0,0 +1,184 @@
# avoid matlab error on server
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

import torch
import tqdm
import wandb
from pathlib import Path
import argparse

from torch_geometric.data import DataLoader

from dagr.utils.logging import Checkpointer, set_up_logging_directory, log_hparams
from dagr.utils.buffers import DetectionBuffer
from dagr.utils.args import FLAGS
from dagr.utils.learning_rate_scheduler import LRSchedule

from dagr.data.augment import Augmentations
from dagr.utils.buffers import format_data
from dagr.data.dsec_data import DSEC

from dagr.model.networks.dagr import DAGR
from dagr.model.networks.ema import ModelEMA


def gradients_broken(model):
    # return True if any parameter gradient contains a NaN
    valid_gradients = True
    for name, param in model.named_parameters():
        if param.grad is not None:
            # valid_gradients = not (torch.isnan(param.grad).any() or torch.isinf(param.grad).any())
            valid_gradients = not (torch.isnan(param.grad).any())
            if not valid_gradients:
                break
    return not valid_gradients


def fix_gradients(model):
    # replace NaN gradients with zero so the optimizer step stays finite
    for name, param in model.named_parameters():
        if param.grad is not None:
            param.grad = torch.nan_to_num(param.grad, nan=0.0)


def train(loader: DataLoader,
          model: torch.nn.Module,
          ema: ModelEMA,
          scheduler: torch.optim.lr_scheduler.LambdaLR,
          optimizer: torch.optim.Optimizer,
          args: argparse.Namespace,
          run_name=""):

    model.train()

    for i, data in enumerate(tqdm.tqdm(loader, desc=f"Training {run_name}")):
        data = data.cuda(non_blocking=True)
        data = format_data(data)

        optimizer.zero_grad(set_to_none=True)

        model_outputs = model(data)

        loss_dict = {k: v for k, v in model_outputs.items() if "loss" in k}
        loss = loss_dict.pop("total_loss")

        loss.backward()

        torch.nn.utils.clip_grad_value_(model.parameters(), args.clip)

        fix_gradients(model)

        optimizer.step()
        scheduler.step()

        ema.update(model)

        training_logs = {f"training/loss/{k}": v for k, v in loss_dict.items()}
        wandb.log({"training/loss": loss.item(), "training/lr": scheduler.get_last_lr()[-1], **training_logs})


def run_test(loader: DataLoader,
             model: torch.nn.Module,
             dry_run_steps: int = -1,
             dataset="gen1"):

    model.eval()

    mapcalc = DetectionBuffer(height=loader.dataset.height, width=loader.dataset.width, classes=loader.dataset.classes)

    for i, data in enumerate(tqdm.tqdm(loader)):
        data = data.cuda()
        data = format_data(data)

        detections, targets = model(data)
        if i % 10 == 0:
            torch.cuda.empty_cache()

        mapcalc.update(detections, targets, dataset, data.height[0], data.width[0])

        if dry_run_steps > 0 and i == dry_run_steps:
            break

    torch.cuda.empty_cache()

    return mapcalc


if __name__ == '__main__':
    import torch_geometric
    import random
    import numpy as np

    seed = 42
    torch_geometric.seed.seed_everything(seed)
    torch.random.manual_seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    args = FLAGS()

    output_directory = set_up_logging_directory(args.dataset, args.task, args.output_directory, exp_name=args.exp_name)
    log_hparams(args)

    augmentations = Augmentations(args)

    print("init datasets")
    dataset_path = args.dataset_directory / args.dataset

    train_dataset = DSEC(root=dataset_path, split="train", transform=augmentations.transform_training, debug=False,
                         min_bbox_diag=15, min_bbox_height=10)
    test_dataset = DSEC(root=dataset_path, split="val", transform=augmentations.transform_testing, debug=False,
                        min_bbox_diag=15, min_bbox_height=10)

    train_loader = DataLoader(train_dataset, follow_batch=['bbox', 'bbox0'], batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True)
    num_iters_per_epoch = len(train_loader)

    # fixed random permutation of the validation set (seeded above)
    sampler = np.random.permutation(np.arange(len(test_dataset)))
    test_loader = DataLoader(test_dataset, sampler=sampler, follow_batch=['bbox', 'bbox0'], batch_size=args.batch_size, shuffle=False, num_workers=4, drop_last=True)

    print("init net")
    # load a dummy sample to get height, width
    model = DAGR(args, height=test_dataset.height, width=test_dataset.width)

    num_params = sum([np.prod(p.size()) for p in model.parameters()])
    print(f"Training with {num_params} parameters.")

    model = model.cuda()
    ema = ModelEMA(model)

    # square-root learning-rate scaling relative to a nominal batch size of 64
    nominal_batch_size = 64
    lr = args.l_r * np.sqrt(args.batch_size) / np.sqrt(nominal_batch_size)
    optimizer = torch.optim.AdamW(list(model.parameters()), lr=lr, weight_decay=args.weight_decay)

    lr_func = LRSchedule(warmup_epochs=.3,
                         num_iters_per_epoch=num_iters_per_epoch,
                         tot_num_epochs=args.tot_num_epochs)

    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lr_func)

    checkpointer = Checkpointer(output_directory=output_directory,
                                model=model, optimizer=optimizer,
                                scheduler=lr_scheduler, ema=ema,
                                args=args)

    start_epoch = checkpointer.restore_if_existing(output_directory, resume_from_best=False)

    start_epoch = 0
    if "resume_checkpoint" in args:
        start_epoch = checkpointer.restore_checkpoint(args.resume_checkpoint, best=False)
        print(f"Resume from checkpoint at epoch {start_epoch}")

    # short dry run of the evaluation pipeline before training starts
    with torch.no_grad():
        mapcalc = run_test(test_loader, ema.ema, dry_run_steps=2, dataset=args.dataset)
        mapcalc.compute()

    print("starting to train")
    for epoch in range(start_epoch, args.tot_num_epochs):
        train(train_loader, model, ema, lr_scheduler, optimizer, args, run_name=wandb.run.name)
        checkpointer.checkpoint(epoch, name="last_model")

        # run validation only every third epoch
        if epoch % 3 > 0:
            continue

        with torch.no_grad():
            mapcalc = run_test(test_loader, ema.ema, dataset=args.dataset)
            metrics = mapcalc.compute()
            checkpointer.process(metrics, epoch)
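
The loop above maintains an exponential moving average (EMA) of the model weights via ModelEMA and evaluates the averaged copy (ema.ema) instead of the raw weights. Below is a minimal sketch of the general EMA-update technique, assuming the usual formulation ema_param = decay * ema_param + (1 - decay) * model_param; it is only an illustration, and the actual ModelEMA in dagr.model.networks.ema may differ in decay schedule and buffer handling.

# sketch of a generic EMA wrapper (illustrative only, not the repo's ModelEMA)
import copy
import torch

class SimpleEMA:
    def __init__(self, model: torch.nn.Module, decay: float = 0.999):
        # a frozen deep copy of the model serves as the "shadow" (averaged) model
        self.ema = copy.deepcopy(model).eval()
        self.decay = decay
        for p in self.ema.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model: torch.nn.Module):
        # blend the live weights into the shadow weights after every optimizer step
        for ema_p, p in zip(self.ema.parameters(), model.parameters()):
            ema_p.mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)

Used in the same way as in the script above: update(model) is called once per training iteration, and the shadow model (here SimpleEMA.ema, analogous to ema.ema) is the copy passed to evaluation.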