diff --git a/MANIFEST.in b/MANIFEST.in index cdfecc0..14c65d7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,6 +8,9 @@ include pyvision/detection/detr/utils/pallete include pyvision/detection/detr/config/*.json include pyvision/detection/detr/data/*.txt +include pyvision/detection/efficientdet/config/*.json +include pyvision/detection/efficientdet/config/*.yaml + include pyvision/segmentation/pspnet/config/*.json include pyvision/segmentation/pspnet/data/*.txt diff --git a/pyvision/detection/efficientdet/__init__.py b/pyvision/detection/efficientdet/__init__.py new file mode 100644 index 0000000..59e8c59 --- /dev/null +++ b/pyvision/detection/efficientdet/__init__.py @@ -0,0 +1 @@ +from .model import EffdetInferAPI as EfficientDet \ No newline at end of file diff --git a/pyvision/detection/efficientdet/config/__init__.py b/pyvision/detection/efficientdet/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyvision/detection/efficientdet/config/dataset_coco.yaml b/pyvision/detection/efficientdet/config/dataset_coco.yaml new file mode 100644 index 0000000..c8b61e2 --- /dev/null +++ b/pyvision/detection/efficientdet/config/dataset_coco.yaml @@ -0,0 +1,28 @@ +class_list : ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", + "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", + "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", + "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", + "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", + "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", + "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", + "teddy bear", "hair drier", "toothbrush"] + +model_name: "effdet_coco" + +colors : [(39, 129, 113), (164, 80, 133), (83, 122, 114), (99, 81, 172), (95, 56, 104), (37, 84, 86), (14, 89, 122), + (80, 7, 65), (10, 102, 25), (90, 185, 109), (106, 110, 132), (169, 158, 85), (188, 185, 26), (103, 1, 17), + (82, 144, 81), (92, 7, 184), (49, 81, 155), (179, 177, 69), (93, 187, 158), (13, 39, 73), (12, 50, 60), + (16, 179, 33), (112, 69, 165), (15, 139, 63), (33, 191, 159), (182, 173, 32), (34, 113, 133), (90, 135, 34), + (53, 34, 86), (141, 35, 190), (6, 171, 8), (118, 76, 112), (89, 60, 55), (15, 54, 88), (112, 75, 181), + (42, 147, 38), (138, 52, 63), (128, 65, 149), (106, 103, 24), (168, 33, 45), (28, 136, 135), (86, 91, 108), + (52, 11, 76), (142, 6, 189), (57, 81, 168), (55, 19, 148), (182, 101, 89), (44, 65, 179), (1, 33, 26), + (122, 164, 26), (70, 63, 134), (137, 106, 82), (120, 118, 52), (129, 74, 42), (182, 147, 112), (22, 157, 50), + (56, 50, 20), (2, 22, 177), (156, 100, 106), (21, 35, 42), (13, 8, 121), (142, 92, 28), (45, 118, 33), + (105, 118, 30), (7, 185, 124), (46, 34, 146), (105, 184, 169), (22, 18, 5), (147, 71, 73), (181, 64, 91), + (31, 39, 184), (164, 179, 33), (96, 50, 18), (95, 15, 106), (113, 68, 54), (136, 116, 112), (119, 139, 130), + (31, 139, 34), (66, 6, 127), (62, 39, 2), (49, 99, 180), (49, 119, 155), (153, 50, 183), (125, 38, 3), + (129, 87, 143), (49, 87, 40), (128, 62, 120), (73, 85, 148), (28, 144, 118), (29, 9, 24), (175, 45, 108), + (81, 175, 64), 
(178, 19, 157), (74, 188, 190), (18, 114, 2), (62, 128, 96), (21, 3, 150), (0, 6, 95), + (2, 20, 184), (122, 37, 185)] \ No newline at end of file diff --git a/pyvision/detection/efficientdet/config/weights_download.json b/pyvision/detection/efficientdet/config/weights_download.json new file mode 100644 index 0000000..4a208c7 --- /dev/null +++ b/pyvision/detection/efficientdet/config/weights_download.json @@ -0,0 +1,4 @@ + +{ + "effdet_coco": "1jvcGIWyZ3jjTltiErp-OPNTA7SLWlslR" +} diff --git a/pyvision/detection/efficientdet/lib/__init__.py b/pyvision/detection/efficientdet/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyvision/detection/efficientdet/lib/dataset.py b/pyvision/detection/efficientdet/lib/dataset.py new file mode 100644 index 0000000..c891e90 --- /dev/null +++ b/pyvision/detection/efficientdet/lib/dataset.py @@ -0,0 +1,200 @@ +import os +import torch +import numpy as np + +from torch.utils.data import Dataset, DataLoader +from pycocotools.coco import COCO + +import cv2 + +class CustomDataset(Dataset): + + def __init__(self, root_dir, img_dir="images", set_name="train2017", transform=None): + + self.root_dir = root_dir + self.img_dir = img_dir + self.set_name = set_name + self.transform = transform + + self.coco_tool = COCO(os.path.join(self.root_dir, 'annotations', 'instances_'+self.set_name+'.json')) + self.image_ids = self.coco_tool.getImgIds() + + self.load_classes() + + def load_classes(self): + + categories = self.coco_tool.loadCats(self.coco_tool.getCatIds()) + categories.sort(key = lambda x: x["id"]) + + # load name -> label + self.classes = {} + self.coco_labels = {} + self.coco_labels_inverse = {} + for category in categories: + self.coco_labels[len(self.classes)] = category['id'] + self.coco_labels_inverse[category['id']] = len(self.classes) + self.classes[category['name']] = len(self.classes) + + # load label -> name + self.labels = {} + for key, value in self.classes.items(): + self.labels[value] = key + + + def load_image(self, idx): + + img_info = self.coco_tool.loadImgs(self.image_ids[idx])[0] + img_path = os.path.join( + self.root_dir, self.img_dir, self.set_name, img_info['file_name'] + ) + img = cv2.imread(img_path) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + img = img.astype(np.float32) / 255.0 + + return img + + def coco_label_to_label(self, coco_label): + return self.coco_labels_inverse[coco_label] + + def label_to_coco_label(self, label): + return self.coco_labels[label] + + def num_classes(self): + return len(self.classes) + + def load_annotations(self, idx): + + anno_ids = self.coco_tool.getAnnIds( + imgIds=self.image_ids[idx], iscrowd=False + ) + annotations = np.zeros((0, 5)) + + # if some images miss annotations + if len(anno_ids) == 0: + return annotations + + # parsing the annotations here + coco_annotations = self.coco_tool.loadAnns(anno_ids) + for idx, a in enumerate(coco_annotations): + + # skip the annotations that have no height/width + if a['bbox'][2] < 1 or a['bbox'][3] < 1: + continue + + annotation = np.zeros((1, 5)) + annotation[0, :4] = a['bbox'] + annotation[0, 4] = self.coco_label_to_label(a['category_id']) + annotations = np.append(annotations, annotation, axis=0) + + # transform [x, y, w, h] -> [x1, y1, x2, y2] + annotations[:, 2] = annotations[:, 0] + annotations[:, 2] + annotations[:, 3] = annotations[:, 1] + annotations[:, 3] + + return annotations + + + def __len__(self): + return len(self.image_ids) + + + def __getitem__(self, idx): + + img = self.load_image(idx) + annot = 
self.load_annotations(idx) + + data = { + "img": img, + "annot": annot + } + + if self.transform: + data = self.transform(data) + + return data + + +def collater(data): + imgs = [s['img'] for s in data] + annots = [s['annot'] for s in data] + scales = [s['scale'] for s in data] + + imgs = torch.from_numpy(np.stack(imgs, axis=0)) + + max_num_annots = max(annot.shape[0] for annot in annots) + + if max_num_annots > 0: + + annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 + + if max_num_annots > 0: + for idx, annot in enumerate(annots): + if annot.shape[0] > 0: + annot_padded[idx, :annot.shape[0], :] = annot + else: + annot_padded = torch.ones((len(annots), 1, 5)) * -1 + + imgs = imgs.permute(0, 3, 1, 2) + + return {'img': imgs, 'annot': annot_padded, 'scale': scales} + + +class Resizer(object): + """Convert ndarrays in sample to Tensors.""" + + def __call__(self, sample, common_size=512): + image, annots = sample['img'], sample['annot'] + height, width, _ = image.shape + if height > width: + scale = common_size / height + resized_height = common_size + resized_width = int(width * scale) + else: + scale = common_size / width + resized_height = int(height * scale) + resized_width = common_size + + image = cv2.resize(image, (resized_width, resized_height)) + + new_image = np.zeros((common_size, common_size, 3)) + new_image[0:resized_height, 0:resized_width] = image + + annots[:, :4] *= scale + + return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale} + + +class Augmenter(object): + """Convert ndarrays in sample to Tensors.""" + + def __call__(self, sample, flip_x=0.5): + if np.random.rand() < flip_x: + image, annots = sample['img'], sample['annot'] + image = image[:, ::-1, :] + + rows, cols, channels = image.shape + + x1 = annots[:, 0].copy() + x2 = annots[:, 2].copy() + + x_tmp = x1.copy() + + annots[:, 0] = cols - x2 + annots[:, 2] = cols - x_tmp + + sample = {'img': image, 'annot': annots} + + return sample + + +class Normalizer(object): + + def __init__(self): + self.mean = np.array([[[0.485, 0.456, 0.406]]]) + self.std = np.array([[[0.229, 0.224, 0.225]]]) + + def __call__(self, sample): + image, annots = sample['img'], sample['annot'] + + return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots} + diff --git a/pyvision/detection/efficientdet/lib/losses.py b/pyvision/detection/efficientdet/lib/losses.py new file mode 100644 index 0000000..9781b44 --- /dev/null +++ b/pyvision/detection/efficientdet/lib/losses.py @@ -0,0 +1,292 @@ +import torch +import torch.nn as nn + + +def iou(a, b): + + area_a = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) + + iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], dim=1), b[:, 0]) + ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], dim=1), b[:, 1]) + iw = torch.clamp(iw, min=0) + ih = torch.clamp(ih, min=0) + + inter_area = iw * ih + union_area = area_a + area_b - inter_area + + union_area = torch.clamp(union_area, min=1e-8) + iou_score = inter_area / union_area + + return iou_score + +""" +class FocalLoss(nn.Module): + + def __init__(self, alpha=0.25, gamma=2, device="cuda"): + + super(FocalLoss, self).__init__() + + self.alpha = alpha + self.gamma = gamma + self.device = device + + def forward(self, classifications, regressions, anchors, annotations): + + batch_size = classifications.shape[0] + + classification_loss = 
[] + regression_loss = [] + + anchor = anchors[0, :, :] + + anchor_widths = anchor[:, 2] - anchor[:, 0] + anchor_heights = anchor[:, 3] - anchor[:, 1] + anchor_x = anchor[:, 0] + 0.5 * anchor_widths + anchor_y = anchor[:, 1] + 0.5 * anchor_heights + + for i in range(batch_size): + + classification = classifications[i, :, :] + regression = regressions[i, :, :] + + box_annotation = annotations[i, :, :] + box_annotation = box_annotation[box_annotation[:, 4] != -1] + + if box_annotation.shape[0] == 0: + if self.device == "cuda" and torch.cuda.is_available(): + regression_loss.append(torch.tensor(0).float().cuda()) + classification_loss.append(torch.tensor(0).float().cuda()) + else: + regression_loss.append(torch.tensor(0).float().cuda()) + classification_loss.append(torch.tensor(0).float().cuda()) + + # no loss or no det. Move on to the next item + continue + + classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) + + iou_score = iou(anchors[0, :, :], box_annotation[:, :4]) + iou_max, iou_argmax = torch.max(iou_score, dim=1) + + targets = torch.ones(classification.shape) * -1 + if self.device == "cuda" and torch.cuda.is_available(): + targets = targets.cuda() + + # zeroing out the indices with IOU less than 0.4 + targets[torch.lt(iou_max, 0.4), :] = 0 + + # getting the indices with IoU score > 0.5 + positive_idx = torch.ge(iou_max, 0.5) + num_positive_idx = positive_idx.sum() + + assigned_annots = box_annotation[iou_argmax, :] + + targets[positive_idx, :] = 0 + targets[positive_idx, assigned_annots[positive_idx, 4].long()] = 1 + + alpha_factor = torch.ones(targets.shape) * self.alpha + if self.device == "cuda" and torch.cuda.is_available(): + alpha_factor = alpha_factor.cuda() + + alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1.0 - alpha_factor) + focal_weight = torch.where(torch.eq(targets, 1.), 1.0 - classification, classification) + focal_weight = alpha_factor * torch.pow(focal_weight, self.gamma) + + bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) + cls_loss = alpha_factor * bce + + zeros = torch.zeros(cls_loss.shape) + if self.device == "cuda" and torch.cuda.is_available(): + zeros = zeros.cuda() + cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, zeros) + + classification_loss.append( + cls_loss.sum() / torch.clamp(num_positive_idx.float(), min=1.0) + ) + + # implement regression loss + if num_positive_idx > 0: + + assigned_annots = assigned_annots[positive_idx, :] + + anchor_widths_i = anchor_widths[positive_idx] + anchor_heights_i = anchor_heights[positive_idx] + anchor_xi = anchor_x[positive_idx] + anchor_yi = anchor_y[positive_idx] + + true_widths = assigned_annots[:, 2] - assigned_annots[:, 0] + true_heights = assigned_annots[:, 3] - assigned_annots[:, 1] + true_x = assigned_annots[:, 0] + 0.5 * true_widths + true_y = assigned_annots[:, 1] + 0.5 * true_heights + + true_heights = torch.clamp(true_heights, min=1) + true_widths = torch.clamp(true_widths, min=1) + + targets_dx = (true_x - anchor_xi) / anchor_widths_i + targets_dy = (true_y - anchor_yi) / anchor_heights_i + targets_dw = torch.log(true_widths / anchor_widths_i) + targets_dh = torch.log(true_heights / anchor_heights_i) + + targets = torch.stack(( + targets_dx, targets_dy, targets_dw, targets_dh + )) + targets = targets.t() + + norm = torch.Tensor([0.1, 0.1, 0.2, 0.2]) + if self.device == "cuda" and torch.cuda.is_available(): + norm = norm.cuda() + targets = targets / norm + + regression_diff = torch.abs(targets - regression[positive_idx, 
:]) + regression_loss_i = torch.where( + torch.le(regression_diff, 1.0/9.0), + 0.5 * 9.0 * torch.pow(regression_diff, 2), + regression_diff - 0.5 / 9.0 + ) + regression_loss.append(regression_loss_i.mean()) + + else: + + if self.device == "cuda" and torch.cuda.is_available(): + regression_loss.append(torch.tensor(0).float().cuda()) + else: + regression_loss.append(torch.tensor(0).float()) + + + return_cls_loss = torch.stack(classification_loss).mean(dim=0, keepdim=True) + return_reg_loss = torch.stack(regression_loss).mean(dim=0, keepdim=True) + + return return_cls_loss, return_reg_loss + +""" + +class FocalLoss(nn.Module): + def __init__(self, alpha=0.25, gamma=2, device="cuda"): + + super(FocalLoss, self).__init__() + + self.alpha = alpha + self.gamma = gamma + self.device = device + + def forward(self, classifications, regressions, anchors, annotations): + + batch_size = classifications.shape[0] + classification_losses = [] + regression_losses = [] + + anchor = anchors[0, :, :] + + anchor_widths = anchor[:, 2] - anchor[:, 0] + anchor_heights = anchor[:, 3] - anchor[:, 1] + anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths + anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights + + for j in range(batch_size): + + classification = classifications[j, :, :] + regression = regressions[j, :, :] + + bbox_annotation = annotations[j, :, :] + bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] + + if bbox_annotation.shape[0] == 0: + if self.device == "cuda" and torch.cuda.is_available(): + regression_losses.append(torch.tensor(0).float().cuda()) + classification_losses.append(torch.tensor(0).float().cuda()) + else: + regression_losses.append(torch.tensor(0).float()) + classification_losses.append(torch.tensor(0).float()) + + continue + + classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) + + IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) + + IoU_max, IoU_argmax = torch.max(IoU, dim=1) + + # compute the loss for classification + targets = torch.ones(classification.shape) * -1 + if self.device == "cuda" and torch.cuda.is_available(): + targets = targets.cuda() + + targets[torch.lt(IoU_max, 0.4), :] = 0 + + positive_indices = torch.ge(IoU_max, 0.5) + + num_positive_anchors = positive_indices.sum() + + assigned_annotations = bbox_annotation[IoU_argmax, :] + + targets[positive_indices, :] = 0 + targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 + + alpha_factor = torch.ones(targets.shape) * self.alpha + if self.device == "cuda" and torch.cuda.is_available(): + alpha_factor = alpha_factor.cuda() + + alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) + focal_weight = torch.where(torch.eq(targets, 1.), 1. 
- classification, classification) + focal_weight = alpha_factor * torch.pow(focal_weight, self.gamma) + + bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) + + cls_loss = focal_weight * bce + + zeros = torch.zeros(cls_loss.shape) + if self.device == "cuda" and torch.cuda.is_available(): + zeros = zeros.cuda() + cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, zeros) + + classification_losses.append(cls_loss.sum() / torch.clamp(num_positive_anchors.float(), min=1.0)) + + + if positive_indices.sum() > 0: + assigned_annotations = assigned_annotations[positive_indices, :] + + anchor_widths_pi = anchor_widths[positive_indices] + anchor_heights_pi = anchor_heights[positive_indices] + anchor_ctr_x_pi = anchor_ctr_x[positive_indices] + anchor_ctr_y_pi = anchor_ctr_y[positive_indices] + + gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] + gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] + gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths + gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights + + gt_widths = torch.clamp(gt_widths, min=1) + gt_heights = torch.clamp(gt_heights, min=1) + + targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi + targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi + targets_dw = torch.log(gt_widths / anchor_widths_pi) + targets_dh = torch.log(gt_heights / anchor_heights_pi) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) + targets = targets.t() + + norm = torch.Tensor([[0.1, 0.1, 0.2, 0.2]]) + if self.device == "cuda" and torch.cuda.is_available(): + norm = norm.cuda() + targets = targets / norm + + regression_diff = torch.abs(targets - regression[positive_indices, :]) + + regression_loss = torch.where( + torch.le(regression_diff, 1.0 / 9.0), + 0.5 * 9.0 * torch.pow(regression_diff, 2), + regression_diff - 0.5 / 9.0 + ) + regression_losses.append(regression_loss.mean()) + else: + if self.device == "cuda" and torch.cuda.is_available(): + regression_losses.append(torch.tensor(0).float().cuda()) + else: + regression_losses.append(torch.tensor(0).float()) + + return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, + keepdim=True) + + diff --git a/pyvision/detection/efficientdet/lib/model.py b/pyvision/detection/efficientdet/lib/model.py new file mode 100644 index 0000000..509edca --- /dev/null +++ b/pyvision/detection/efficientdet/lib/model.py @@ -0,0 +1,400 @@ +import torch.nn as nn +import torch + +import math + +from efficientnet_pytorch import EfficientNet as EffNet +from .utils import BBoxTransform, ClipBoxes, Anchors +from .losses import FocalLoss +from torchvision.ops.boxes import nms as torch_nms + +def nms(dets, thresh): + return torch_nms( + dets[:, :4], dets[:, 4], thresh + ) + +class ConvBlock(nn.Module): + + def __init__(self, num_channels): + + super(ConvBlock, self).__init__() + + self.conv = nn.Sequential( + nn.Conv2d(num_channels, num_channels, kernel_size=3, stride=1, padding=1, groups=num_channels), + nn.Conv2d(num_channels, num_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2d(num_features=num_channels, momentum=0.9997, eps=4e-5), + nn.ReLU() + ) + + def forward(self, x): + + return self.conv(x) + +class BiFPN(nn.Module): + + def __init__(self, num_channels, eps=1e-4): + + super(BiFPN, self).__init__() + + self.eps = eps + + # Here, we define the various conv layers + self.conv6_up = ConvBlock(num_channels) + self.conv5_up = 
ConvBlock(num_channels) + self.conv4_up = ConvBlock(num_channels) + self.conv3_up = ConvBlock(num_channels) + self.conv4_down = ConvBlock(num_channels) + self.conv5_down = ConvBlock(num_channels) + self.conv6_down = ConvBlock(num_channels) + self.conv7_down = ConvBlock(num_channels) + + # Feature scaling layers + self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest') + self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest') + + self.p4_downsample = nn.MaxPool2d(kernel_size=2) + self.p5_downsample = nn.MaxPool2d(kernel_size=2) + self.p6_downsample = nn.MaxPool2d(kernel_size=2) + self.p7_downsample = nn.MaxPool2d(kernel_size=2) + + # Weight + self.p6_w1 = nn.Parameter(torch.ones(2)) + self.p6_w1_relu = nn.ReLU() + self.p5_w1 = nn.Parameter(torch.ones(2)) + self.p5_w1_relu = nn.ReLU() + self.p4_w1 = nn.Parameter(torch.ones(2)) + self.p4_w1_relu = nn.ReLU() + self.p3_w1 = nn.Parameter(torch.ones(2)) + self.p3_w1_relu = nn.ReLU() + + self.p4_w2 = nn.Parameter(torch.ones(3)) + self.p4_w2_relu = nn.ReLU() + self.p5_w2 = nn.Parameter(torch.ones(3)) + self.p5_w2_relu = nn.ReLU() + self.p6_w2 = nn.Parameter(torch.ones(3)) + self.p6_w2_relu = nn.ReLU() + self.p7_w2 = nn.Parameter(torch.ones(2)) + self.p7_w2_relu = nn.ReLU() + + + def forward(self, inputs): + """ + P7_0 -------------------------- P7_2 --------> + P6_0 ---------- P6_1 ---------- P6_2 --------> + P5_0 ---------- P5_1 ---------- P5_2 --------> + P4_0 ---------- P4_1 ---------- P4_2 --------> + P3_0 -------------------------- P3_2 --------> + """ + + # P3_0, P4_0, P5_0, P6_0 and P7_0 + p3_in, p4_in, p5_in, p6_in, p7_in = inputs[0], inputs[1], inputs[2], inputs[3], inputs[4] + + # P7_0 to P7_2 + # Weights for P6_0 and P7_0 to P6_1 + p6_w1 = self.p6_w1_relu(self.p6_w1) + weight = p6_w1 / (torch.sum(p6_w1, dim=0) + self.eps) + + # Connections for P6_0 and P7_0 to P6_1 respectively + p6_up = self.conv6_up(weight[0] * p6_in + weight[1] * self.p6_upsample(p7_in)) + + # Weights for P5_0 and P6_0 to P5_1 + p5_w1 = self.p5_w1_relu(self.p5_w1) + weight = p5_w1 / (torch.sum(p5_w1, dim=0) + self.eps) + + # Connections for P5_0 and P6_0 to P5_1 respectively + p5_up = self.conv5_up(weight[0] * p5_in + weight[1] * self.p5_upsample(p6_up)) + + # Weights for P4_0 and P5_0 to P4_1 + p4_w1 = self.p4_w1_relu(self.p4_w1) + weight = p4_w1 / (torch.sum(p4_w1, dim=0) + self.eps) + + # Connections for P4_0 and P5_0 to P4_1 respectively + p4_up = self.conv4_up(weight[0] * p4_in + weight[1] * self.p4_upsample(p5_up)) + + # Weights for P3_0 and P4_1 to P3_2 + p3_w1 = self.p3_w1_relu(self.p3_w1) + weight = p3_w1 / (torch.sum(p3_w1, dim=0) + self.eps) + + # Connections for P3_0 and P4_1 to P3_2 respectively + p3_out = self.conv3_up(weight[0] * p3_in + weight[1] * self.p3_upsample(p4_up)) + + # Weights for P4_0, P4_1 and P3_2 to P4_2 + p4_w2 = self.p4_w2_relu(self.p4_w2) + weight = p4_w2 / (torch.sum(p4_w2, dim=0) + self.eps) + + # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively + p4_out = self.conv4_down( + weight[0] * p4_in + weight[1] * p4_up + weight[2] * self.p4_downsample(p3_out)) + + # Weights for P5_0, P5_1 and P4_2 to P5_2 + p5_w2 = self.p5_w2_relu(self.p5_w2) + weight = p5_w2 / (torch.sum(p5_w2, dim=0) + self.eps) + + # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively + p5_out = self.conv5_down( + weight[0] * p5_in + weight[1] * p5_up + weight[2] * self.p5_downsample(p4_out)) + + # Weights for P6_0, 
P6_1 and P5_2 to P6_2 + p6_w2 = self.p6_w2_relu(self.p6_w2) + weight = p6_w2 / (torch.sum(p6_w2, dim=0) + self.eps) + + # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively + p6_out = self.conv6_down( + weight[0] * p6_in + weight[1] * p6_up + weight[2] * self.p6_downsample(p5_out)) + + # Weights for P7_0 and P6_2 to P7_2 + p7_w2 = self.p7_w2_relu(self.p7_w2) + weight = p7_w2 / (torch.sum(p7_w2, dim=0) + self.eps) + + # Connections for P7_0 and P6_2 to P7_2 + p7_out = self.conv7_down(weight[0] * p7_in + weight[1] * self.p7_downsample(p6_out)) + + return p3_out, p4_out, p5_out, p6_out, p7_out + + +class Regressor(nn.Module): + + def __init__(self, in_channels, num_anchors, num_layers): + + super(Regressor, self).__init__() + + layers = [] + for _ in range(num_layers): + layers.append(nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)) + layers.append(nn.ReLU(True)) + + self.layers = nn.Sequential(*layers) + self.header = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1) + + def forward(self, inputs): + + inputs = self.layers(inputs) + inputs = self.header(inputs) + output = inputs.permute(0, 2, 3, 1) + + return output.contiguous().view(output.shape[0], -1, 4) + +class Classifier(nn.Module): + + def __init__(self, in_channels, num_anchors, num_classes, num_layers): + + super(Classifier, self).__init__() + + self.num_anchors = num_anchors + self.num_classes = num_classes + + layers = [] + for _ in range(num_layers): + layers.append( + nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + ) + layers.append( + nn.ReLU(True) + ) + + self.layers = nn.Sequential(*layers) + self.header = nn.Conv2d(in_channels, num_anchors*num_classes, kernel_size=3, stride=1, padding=1) + self.activation = nn.Sigmoid() + + def forward(self, x): + + x = self.layers(x) + x = self.header(x) + x = self.activation(x) + + x = x.permute(0, 2, 3, 1) + + output = x.contiguous().view( + x.shape[0], x.shape[1], x.shape[2], self.num_anchors, self.num_classes + ) + + return output.contiguous().view( + output.shape[0], -1, self.num_classes + ) + + +class EfficientNet(nn.Module): + + def __init__(self, model_coeff=0, pretrained=False): + + super(EfficientNet, self).__init__() + + if model_coeff not in [0, 1, 2, 3, 4, 5, 6, 7]: + raise ValueError(f"{model_coeff} not a valid model. 
Models supported are b0, b1, b2, b3, b4, b5, b6, b7") + model_version = f"efficientnet-b{model_coeff}" + + if pretrained: + model = EffNet.from_pretrained(model_version) # change to local load later + else: + model = EffNet.from_name(model_version) + + del model._conv_head + del model._bn1 + del model._avg_pooling + del model._dropout + del model._fc + + self.model = model + + def forward(self, x): + + x = self.model._swish(self.model._bn0(self.model._conv_stem(x))) + + feature_maps = [] + for idx, block in enumerate(self.model._blocks): + + drop_connect_rate = self.model._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / len(self.model._blocks) + + x = block(x, drop_connect_rate=drop_connect_rate) + + if block._depthwise_conv.stride == [2, 2]: + feature_maps.append(x) + + return feature_maps[1:] + + +class EfficientDet(nn.Module): + + def __init__(self, num_anchors=9, num_classes=20, model_coeff=0, focal_alpha=0.25, focal_gamma=2, pretrained=False, device="cuda"): + + super(EfficientDet, self).__init__() + + self.model_coeff = model_coeff + self.num_classes = num_classes + self.num_anchors = num_anchors + self.focal_alpha = focal_alpha + self.focal_gamma = focal_gamma + self.device = device + + self.num_channels = [64, 88, 112, 160, 224, 288, 384, 384][self.model_coeff] + + # model specific conv layer configurations + in_channels = [ + (40, 80, 192, 192), #b0 + (40, 80, 192, 192), #b1 + (48, 88, 208, 208), #b2 + (48, 96, 232, 232), #b3 + (56, 112, 272, 272), #b4 + (64, 128, 304, 304), #b5 + (72, 144, 344, 344), #b6 + (80, 160, 384, 384) #b7 + ] + + self.conv3 = nn.Conv2d(in_channels[self.model_coeff][0], self.num_channels, kernel_size=1, stride=1, padding=0) + self.conv4 = nn.Conv2d(in_channels[self.model_coeff][1], self.num_channels, kernel_size=1, stride=1, padding=0) + self.conv5 = nn.Conv2d(in_channels[self.model_coeff][2], self.num_channels, kernel_size=1, stride=1, padding=0) + self.conv6 = nn.Conv2d(in_channels[self.model_coeff][3], self.num_channels, kernel_size=3, stride=2, padding=1) + self.conv7 = nn.Sequential( + nn.ReLU(), + nn.Conv2d(self.num_channels, self.num_channels, kernel_size=3, stride=2, padding=1) + ) + + self.bifpn = nn.Sequential(*[BiFPN(self.num_channels) for _ in range(min(2+self.model_coeff, 8))]) + + self.regressor = Regressor( + in_channels=self.num_channels, + num_anchors=self.num_anchors, + num_layers=3 + self.model_coeff // 3 + ) + + self.classifier = Classifier( + in_channels=self.num_channels, + num_anchors=self.num_anchors, + num_classes=self.num_classes, + num_layers=3 + self.model_coeff // 3 + ) + + self.anchors = Anchors() + self.regressBoxes = BBoxTransform() + self.clipBoxes = ClipBoxes() + self.focalloss = FocalLoss(alpha=self.focal_alpha, gamma=self.focal_gamma, device=self.device) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + prior = 0.01 + self.classifier.header.weight.data.fill_(0) + self.classifier.header.bias.data.fill_(-math.log((1.0 - prior) / prior)) + + self.regressor.header.weight.data.fill_(0) + self.regressor.header.bias.data.fill_(0) + + self.backbone_net = EfficientNet(model_coeff=self.model_coeff, pretrained=pretrained) + + + def freeze_bn(self): + + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def forward(self, x): + + if len(x) == 2: + is_training = True + imgs, annots = x + else: + is_training = False + imgs = x + + c3, c4, c5 = self.backbone_net(imgs) + p3 = self.conv3(c3) + p4 = self.conv4(c4) + p5 = self.conv5(c5) + p6 = self.conv6(c5) + p7 = self.conv7(p6) + + features = [p3, p4, p5, p6, p7] + features = self.bifpn(features) + + regression = torch.cat([self.regressor(feature) for feature in features], dim=1) + classification = torch.cat([self.classifier(feature) for feature in features], dim=1) + anchors = self.anchors(imgs) + + if is_training: + return self.focalloss(classification, regression, anchors, annots) + else: + + transformed_anchors = self.regressBoxes(anchors, regression) + transformed_anchors = self.clipBoxes(transformed_anchors, imgs) + + scores = torch.max(classification, dim=2, keepdim=True)[0] + + scores_gt_thresh = (scores > 0.05)[0, :, 0] + if scores_gt_thresh.sum() == 0: + return [ + torch.zeros(0), + torch.zeros(0), + torch.zeros(0, 4) + ] + + classification = classification[:, scores_gt_thresh, :] + transformed_anchors = transformed_anchors[:, scores_gt_thresh, :] + scores = scores[:, scores_gt_thresh, :] + + anchors_nms_idx = nms(torch.cat([transformed_anchors, scores], dim=2)[0, :, :], 0.5) + + nms_scores, nms_classes = classification[0, anchors_nms_idx, :].max(dim=1) + + return [nms_scores, nms_classes, transformed_anchors[0, anchors_nms_idx, :]] + + + +if __name__ == '__main__': + + def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + model = EfficientDet(num_classes=80) + print (count_parameters(model)) + + diff --git a/pyvision/detection/efficientdet/lib/utils.py b/pyvision/detection/efficientdet/lib/utils.py new file mode 100644 index 0000000..4768423 --- /dev/null +++ b/pyvision/detection/efficientdet/lib/utils.py @@ -0,0 +1,173 @@ +import torch +import torch.nn as nn +import numpy as np + +class BBoxTransform(nn.Module): + + def __init__(self, mean=None, std=None, gpu=False): + + super(BBoxTransform, self).__init__() + + if mean is None: + self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)) + else: + self.mean = mean + if std is None: + self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)) + else: + self.std = std + if gpu and torch.cuda.is_available(): + self.mean = self.mean.cuda() + self.std = self.std.cuda() + + def forward(self, boxes, deltas): + + widths = boxes[:, :, 2] - boxes[:, :, 0] + heights = boxes[:, :, 3] - boxes[:, :, 1] + ctrx = boxes[:, :, 0] + 0.5 * widths + ctry = boxes[:, :, 1] + 0.5 * heights + + dx = deltas[:, :, 0] * self.std[0] + self.mean[0] + dy = deltas[:, :, 1] * self.std[1] + self.mean[1] + dw = deltas[:, :, 2] * self.std[2] + self.mean[2] + dh = deltas[:, :, 3] * self.std[3] + self.mean[3] + + pred_x = ctrx + dx * widths + pred_y = ctry + dy * heights + pred_w = torch.exp(dw) * widths + pred_h = torch.exp(dh) * heights + + pred_boxes_x1 = pred_x - 0.5 * pred_w + pred_boxes_y1 = pred_y - 0.5 * pred_h + 
pred_boxes_x2 = pred_x + 0.5 * pred_w + pred_boxes_y2 = pred_y + 0.5 * pred_h + + pred_boxes = torch.stack([ + pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2 + ], dim=2) + + return pred_boxes + + +class ClipBoxes(nn.Module): + + def __init__(self): + super(ClipBoxes, self).__init__() + + def forward(self, boxes, img): + + batch_size, num_channels, height, width = img.shape + + boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) + boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) + + boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) + boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) + + return boxes + +class Anchors(nn.Module): + def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): + super(Anchors, self).__init__() + + if pyramid_levels is None: + self.pyramid_levels = [3, 4, 5, 6, 7] + if strides is None: + self.strides = [2 ** x for x in self.pyramid_levels] + if sizes is None: + self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] + if ratios is None: + self.ratios = np.array([0.5, 1, 2]) + if scales is None: + self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) + + def forward(self, image): + + image_shape = image.shape[2:] + image_shape = np.array(image_shape) + image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels] + + all_anchors = np.zeros((0, 4)).astype(np.float32) + + for idx, p in enumerate(self.pyramid_levels): + anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) + shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors) + all_anchors = np.append(all_anchors, shifted_anchors, axis=0) + + all_anchors = np.expand_dims(all_anchors, axis=0) + + anchors = torch.from_numpy(all_anchors.astype(np.float32)) + if torch.cuda.is_available(): + anchors = anchors.cuda() + return anchors + + +def generate_anchors(base_size=16, ratios=None, scales=None): + if ratios is None: + ratios = np.array([0.5, 1, 2]) + + if scales is None: + scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) + + num_anchors = len(ratios) * len(scales) + anchors = np.zeros((num_anchors, 4)) + anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T + areas = anchors[:, 2] * anchors[:, 3] + anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) + anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) + anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T + anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T + + return anchors + + +def compute_shape(image_shape, pyramid_levels): + image_shape = np.array(image_shape[:2]) + image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] + return image_shapes + + +def shift(shape, stride, anchors): + shift_x = (np.arange(0, shape[1]) + 0.5) * stride + shift_y = (np.arange(0, shape[0]) + 0.5) * stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack(( + shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel() + )).transpose() + + A = anchors.shape[0] + K = shifts.shape[0] + all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + + return all_anchors + + +def colors(label): + + if isinstance(label, str): + label = int(label) + + box_colors = [(39, 129, 113), (164, 80, 133), (83, 122, 114), (99, 81, 172), (95, 56, 104), (37, 84, 86), (14, 89, 122), + (80, 7, 65), (10, 102, 25), (90, 185, 109), (106, 110, 132), (169, 158, 
85), (188, 185, 26), (103, 1, 17), + (82, 144, 81), (92, 7, 184), (49, 81, 155), (179, 177, 69), (93, 187, 158), (13, 39, 73), (12, 50, 60), + (16, 179, 33), (112, 69, 165), (15, 139, 63), (33, 191, 159), (182, 173, 32), (34, 113, 133), (90, 135, 34), + (53, 34, 86), (141, 35, 190), (6, 171, 8), (118, 76, 112), (89, 60, 55), (15, 54, 88), (112, 75, 181), + (42, 147, 38), (138, 52, 63), (128, 65, 149), (106, 103, 24), (168, 33, 45), (28, 136, 135), (86, 91, 108), + (52, 11, 76), (142, 6, 189), (57, 81, 168), (55, 19, 148), (182, 101, 89), (44, 65, 179), (1, 33, 26), + (122, 164, 26), (70, 63, 134), (137, 106, 82), (120, 118, 52), (129, 74, 42), (182, 147, 112), (22, 157, 50), + (56, 50, 20), (2, 22, 177), (156, 100, 106), (21, 35, 42), (13, 8, 121), (142, 92, 28), (45, 118, 33), + (105, 118, 30), (7, 185, 124), (46, 34, 146), (105, 184, 169), (22, 18, 5), (147, 71, 73), (181, 64, 91), + (31, 39, 184), (164, 179, 33), (96, 50, 18), (95, 15, 106), (113, 68, 54), (136, 116, 112), (119, 139, 130), + (31, 139, 34), (66, 6, 127), (62, 39, 2), (49, 99, 180), (49, 119, 155), (153, 50, 183), (125, 38, 3), + (129, 87, 143), (49, 87, 40), (128, 62, 120), (73, 85, 148), (28, 144, 118), (29, 9, 24), (175, 45, 108), + (81, 175, 64), (178, 19, 157), (74, 188, 190), (18, 114, 2), (62, 128, 96), (21, 3, 150), (0, 6, 95), + (2, 20, 184), (122, 37, 185)] + + return box_colors[label] + + + + + diff --git a/pyvision/detection/efficientdet/model.py b/pyvision/detection/efficientdet/model.py new file mode 100644 index 0000000..80f742f --- /dev/null +++ b/pyvision/detection/efficientdet/model.py @@ -0,0 +1,195 @@ +import os +import numpy as np +import shutil +import cv2 +from PIL import Image +import sys +import time +import yaml +import gdown + +import torch +import torch.nn as nn +from torchvision import transforms + +from .lib.model import EfficientDet +from .lib.utils import colors + +__PREFIX__ = os.path.dirname(os.path.realpath(__file__)) + +sys.path.append(__PREFIX__) + +import yaml +import json +import re + + + +class EffdetInferAPI(object): + + def __init__(self, dataset='coco', thresh=0.4, gpu=False, common_size=512, verbose=False, wtspath="weights/", model_path=None): + + self.model_path = model_path + self.verbose = verbose + self.common_size = common_size + self.thresh = thresh + + self.mean = np.array([[[0.485, 0.456, 0.406]]]) + self.std = np.array([[[0.229, 0.224, 0.225]]]) + + with open(__PREFIX__ + f"/config/dataset_{dataset}.yaml", "r") as f: + config_file = yaml.safe_load(f) + + self.class_list = config_file["class_list"] + self.model_name = config_file['model_name'] + + + if gpu and not torch.cuda.is_available(): + raise ValueError(f"gpu not available but found gpu={gpu}") + self.device = "cuda" if gpu else "cpu" + self.gpu = gpu + + + #Instantiate the model + self.model = EfficientDet( + model_coeff = 0, + num_classes = len(self.class_list), + device = self.device + ) + + wtspath = wtspath+"{}.pth".format(self.model_name) + resp = self._check_or_download_weights(__PREFIX__+"/"+wtspath) + if resp == 0: + print("weights downloaded.") + else: + print("weights found.") + + if self.model_path is None: + self.model_path = __PREFIX__+"/"+wtspath + self.model.load_state_dict(torch.load(self.model_path)) + + self.model = self.model.to(self.device) + + + def _check_or_download_weights(self, wtspath): + + if os.path.join(__PREFIX__, "weights") not in wtspath and not os.path.exists(wtspath): + raise FileNotFoundError("File not found. 
Either file doesnt exist or directory provided") + elif not os.path.exists(wtspath): + + if os.path.exists(__PREFIX__+"/weights/") and len(os.listdir(__PREFIX__+"/weights/")) == 0: + os.rmdir(__PREFIX__+"/"+"weights/") + os.mkdir(__PREFIX__+"/weights/") + + if not os.path.exists(__PREFIX__+"/weights/"): + os.mkdir(__PREFIX__+"/weights/") + + with open(os.path.join(__PREFIX__, "config/weights_download.json")) as fp: + json_file = json.load(fp) + print("fetching file ids for {}".format(self.model_name)) + file_id = json_file[self.model_name] + + url = 'https://drive.google.com/uc?id={}'.format(file_id) + wtspath = __PREFIX__ + "/weights/{}.pth".format(self.model_name) + gdown.download(url, wtspath, quiet=False) + + self.wtspath = wtspath + + return 0 + else: + self.wtspath = wtspath + return 1 + + def detect(self, img): + + if isinstance(img, str): + if os.path.exists(img): + img_name = os.path.basename(img) + img = cv2.imread(img) + else: + raise FileNotFoundError("2",img) + elif isinstance(img, np.ndarray): + pass + elif isinstance(img, Image.Image): + img = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR) + + orig_img = img + + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + img = img.astype(np.float32) / 255.0 + img = (img.astype(np.float32) - self.mean) / self.std + height, width, _ = img.shape + + if height > width: + scale = self.common_size / height + resized_height = self.common_size + resized_width = int(width * scale) + else: + scale = self.common_size / width + resized_height = int(height * scale) + resized_width = self.common_size + + img = cv2.resize(img, (resized_width, resized_height)) + + new_img = np.zeros((self.common_size, self.common_size, 3)) + new_img[0:resized_height, 0:resized_width] = img + + img = torch.from_numpy(img) + + start_time = time.time() + with torch.no_grad(): + img = img.to(self.device) + scores, labels, boxes = self.model(img.permute(2, 0, 1).float().unsqueeze(dim=0)) + boxes /= scale + duration = time.time() - start_time + + scores = scores.cpu().numpy() + labels = labels.cpu().numpy() + boxes = boxes.cpu().numpy() + + #try: + + to_delete = [] + if boxes.shape[0] > 0: + + for boxid in range(boxes.shape[0]): + pred_probs = float(scores[boxid]) + #print(pred_probs) + if pred_probs < self.thresh: + #print(f"small prob: {pred_probs}") + to_delete.append(boxid) + continue + pred_labels = int(labels[boxid]) + xmin, ymin, xmax, ymax = boxes[boxid, :] + + color = colors(pred_labels) + cv2.rectangle(orig_img, (xmin, ymin), (xmax, ymax), color, 1) + #print("drawing") + put_text = self.class_list[pred_labels]+":%.2f"%pred_probs + text_size = cv2.getTextSize(self.class_list[pred_labels]+":%.2f"%pred_probs, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0] + + # clipping text boxes to prevent out-of-frame boxes + text_x_max = xmin + text_size[0] + 3 if (xmin + text_size[0] + 3) < resized_width else resized_width + text_y_max = ymin + text_size[1] + 4 if (ymin + text_size[1] + 4) < resized_height else resized_height + + xmin = int(xmin) + ymin = int(ymin) + text_x_max = int(text_x_max) + text_y_max = int(text_y_max) + + cv2.rectangle(orig_img, (xmin, ymin), (text_x_max, text_y_max), color, -1) + cv2.putText( + orig_img, put_text, (xmin, ymin + text_size[1] + 4), + cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1 + ) + + + scores = np.delete(scores, to_delete) + labels = np.delete(labels, to_delete) + boxes = np.delete(boxes, to_delete) + + + labels = [self.class_list[label] for label in labels] + + return orig_img, duration, scores, labels, boxes + diff --git 
a/pyvision/detection/efficientdet/readme.md b/pyvision/detection/efficientdet/readme.md new file mode 100644 index 0000000..e02b8ef --- /dev/null +++ b/pyvision/detection/efficientdet/readme.md @@ -0,0 +1,20 @@ +# EfficientDet: Scalable and Efficient Object Detection + +A model zoo implementation of the EfficientDet algorithm. + +## Current Stat + +* Efficientdet-b0 trained on Dataset-v3 with a loss of 0.13 + +## Usage + +* To Train, from repo root, + +```shell +!python src/models/efficientdet/train.py +``` + +## To Do + +- [ ] Training b1 - b7 models. Experimenting with focal loss values. +- [ ] Train API diff --git a/pyvision/detection/efficientdet/train.py b/pyvision/detection/efficientdet/train.py new file mode 100644 index 0000000..de3cbd7 --- /dev/null +++ b/pyvision/detection/efficientdet/train.py @@ -0,0 +1,393 @@ +import os +import argparse +import time +from tqdm.auto import tqdm +import shutil +import numpy as np +import sys + +import torch.nn as nn +import torch +from torch.utils.data import DataLoader +from torchvision import transforms +from tensorboardX import SummaryWriter + +sys.path.append(os.path.basename(__file__)+"/lib") + +from lib.model import EfficientDet +from lib.dataset import CustomDataset, Resizer, Normalizer, Augmenter, collater + + +def parse_args(): + + parser = argparse.ArgumentParser(description="EfficientDet: Scalable and Efficient Object Detection training module") + + # General Parameters + parser.add_argument("--name", type=str, default="exp_0", help="Name of experiment") + + # Model parameters + parser.add_argument("--model_coeff", type=int, default=0, required=True, help="Efficientdet model coeff (b0, b1, ....)") + parser.add_argument("--image_size", type=int, default=512, help="The common height and width for all images") + parser.add_argument("--ckpt", type=str, help="path to checkpoint from where to resume training ") + + # Training parameters + parser.add_argument("--batch_size", type=int, default=8, help="Batch size for training") + parser.add_argument("--lr", type=float, default=1e-4, help="Initial Learning rate for training") + parser.add_argument("--gpu", type=bool, default=True, required=True, help="True if training is to use GPU. False if not.") + parser.add_argument("--alpha", type=float, default=0.25, help="Alpha parameter for focal loss") + parser.add_argument("--gamma", type=float, default=1.5, help="Gamma parameter for focal loss") + parser.add_argument("--epochs", type=int, default=100, help="Number of epochs to run training for") + parser.add_argument("--es_min_delta", type=float, default=0.0, help="Early Stopping's Parameter: minimum change in loss to qualify as improvement") + parser.add_argument("--es_patience", type=int, default=0, help="Early stopping's parameter: Number of epochs with no improvement in loss to stop training. 0 to disable") + + # Logging parameters + parser.add_argument("--log_path", type=str, default="tensorboard/", help="Path to store tensorboard logs") + parser.add_argument("--save_path", type=str, default="trained/", help="path to folder where to save trained model") + parser.add_argument("--best_epoch", type=int, default=0) + parser.add_argument("--best_loss", type=float, default=1e5) + + # Train Dataset parameters + + # Format of Dataset: + # - Root Directory + # - Annotations (COCO Format) + # - train_instance.json + # - test_instance.json + # - val_instance.json + # - train + # - img1 + # - img2 + # . + # . + # - imgn + # - test + # - img1 + # - img2 + # . + # . 
+ # - imgn + # - val + # - img1 + # - img2 + # . + # . + # - imgn + + parser.add_argument("--root_dir", type=str, required=True, help="Path to root dataset directory") + parser.add_argument("--coco_dir", type=str, default="./", required=True) + parser.add_argument("--img_dir", type=str, required=True, help="Name of the folder containing the imgs in the root dir") + parser.add_argument("--set_dir", type=str, required=True, help="name of set (train/test/val) being used for this") + parser.add_argument("--num_threads", type=int, default=2, help="Number of threads to utilize for loading data") + + # Validation parameters + parser.add_argument("--val", type=bool, default=False, help="Perform validation boolean") + parser.add_argument("--val_interval", type=int, default=5, help="Epochs interval after which to run validation") + parser.add_argument("--val_dir", type=str, help="Path to Validation set root directory") + parser.add_argument("--val_imgs", type=str, help="Path to Val set imgs") + parser.add_argument("--val_coco", type=str) + parser.add_argument("--val_set", type=str, help="Path to set dir") + + args = parser.parse_args() + + return args + +def Train(args): + + if args.gpu and not torch.cuda.is_available(): + raise ValueError(f"--gpu is {args.gpu} but cuda not found") + + if args.gpu: + device = "cuda" + else: + device = "cpu" + + # setting the trainloader + trainset = CustomDataset( + root_dir = args.root_dir + "/" + args.coco_dir, + img_dir = args.img_dir, + set_name = args.set_dir, + transform = transforms.Compose([Normalizer(), Augmenter(), Resizer()]) + ) + trainloader = DataLoader( + trainset, + batch_size = args.batch_size, + shuffle = False, + drop_last = False, + collate_fn = collater, + num_workers = args.num_threads + ) + + # If validation is enabled, set the val loader + if args.val: + + valset = CustomDataset( + root_dir = args.val_dir + "/" + args.val_coco, + img_dir = args.val_imgs, + set_name = args.val_set, + transform = transforms.Compose([Normalizer(), Resizer()]) + ) + + valloader = DataLoader( + valset, + batch_size=args.batch_size, + shuffle=False, + drop_last=False, + collate_fn=collater, + num_workers=args.num_threads + ) + + # setting the device and other model params + + num_classes = trainset.num_classes() + efficientdet = EfficientDet( + model_coeff = args.model_coeff, + num_classes=num_classes, + focal_alpha = args.alpha, + focal_gamma = args.gamma, + device = device + ) + + # loading pretrained models (if passed) + try: + efficientdet.load_state_dict(torch.load(args.ckpt)) + print("checkpoint loaded successfully!") + except Exception as e: + print("ERROR: Model Loading failed: ", e) + + + efficientdet = efficientdet.to(device) + efficientdet.train() + + # Setting the optimizer and scheduler + optimizer = torch.optim.Adam(efficientdet.parameters(), args.lr) + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) + + # set up logging and model save directories + args.log_path = args.log_path + "/" + "EfficientDet" + "/" + args.name + if os.path.isdir(args.log_path): + shutil.rmtree(args.log_path) + os.makedirs(args.log_path) + + if os.path.isdir(args.save_path): + shutil.rmtree(args.save_path) + os.makedirs(args.save_path) + + # setting up the tensorboard writer + writer = SummaryWriter(args.log_path) + + len_trainloader = len(trainloader) + + if args.val: + + for epoch in range(args.epochs): + + efficientdet.train() + + epoch_loss = [] + epoch_progress = tqdm(trainloader) + for idx, data in enumerate(epoch_progress): + 
try: + + # zero grading the optimizer + optimizer.zero_grad() + + # forward pass + + img_batch = data['img'].to(device).float() + annot_batch = data['annot'].to(device) + + cls_loss, reg_loss = efficientdet([img_batch, annot_batch]) + + # Optimization block + + cls_loss = cls_loss.mean() + reg_loss = reg_loss.mean() + + total_loss = cls_loss + reg_loss + if total_loss == 0: + continue + + total_loss.backward() + + torch.nn.utils.clip_grad_norm_(efficientdet.parameters(), 0.1) + + optimizer.step() + + epoch_loss.append(float(total_loss)) + total_mean_loss = np.mean(epoch_loss) + + epoch_progress.set_description( + "Epoch: {}/{}, Batch id: {}/{}, Classification Loss: {:.5f}, Regression Loss: {:.5f}, Batch Loss: {:.5f}, Total Loss: {:.5f}".format( + epoch+1, args.epochs, idx, len_trainloader, cls_loss, reg_loss, total_loss, total_mean_loss + ) + ) + + writer.add_scalar('Train/Total_Loss', total_mean_loss, epoch * len_trainloader + idx) + writer.add_scalar('Train/Regression_Loss', reg_loss, epoch * len_trainloader + idx) + writer.add_scalar('Train/Classification_loss (Focal Loss)', cls_loss, epoch * len_trainloader + idx) + + except Exception as e: + print(e) + continue + + scheduler.step(np.mean(epoch_loss)) + + if epoch % args.val_interval == 0: + + efficientdet.eval() + loss_reg_ls = [] + loss_cls_ls = [] + + for idx, data in enumerate(valloader): + + img_batch = data['img'].to(device).float() + annot_batch = data['annot'].to(device) + + with torch.no_grad(): + + cls_loss, reg_loss = efficientdet([img_batch, annot_batch]) + + cls_loss = cls_loss.mean() + reg_loss = reg_loss.mean() + + loss_cls_ls.append(float(cls_loss)) + loss_reg_ls.append(float(reg_loss)) + + cls_loss = np.mean(loss_cls_ls) + reg_loss = np.mean(loss_reg_ls) + loss = cls_loss + reg_loss + + print( + 'Epoch: {}/{}, Classification Loss: {:1.5f}, Regression Loss: {:1.5f}, Total Loss: {:1.5f}'.format( + epoch+1, args.epochs, cls_loss, reg_loss, np.mean(loss) + ) + ) + + + + writer.add_scalar('Val/Total_Loss', loss, epoch) + writer.add_scalar('Val/Regression_Loss', reg_loss, epoch) + writer.add_scalar('Val/Classification_Loss', cls_loss, epoch) + + if loss + args.es_min_delta < args.best_loss: + + args.best_loss = loss + args.best_epoch = epoch + torch.save(efficientdet, os.path.join(args.save_path, "efficientdet_best.pth")) + + dummy = torch.rand(1, 3, 512, 512) + dummy = dummy.to(device) + + if isinstance(efficientdet, nn.DataParallel): + + efficientdet.backbone_net.model.set_swish(memory_efficient=False) + + try: + torch.onnx.export( + efficientdet.module, dummy, os.path.join(args.save_path, "efficientdet_best.onnx"), + verbose=False, opset_version=11 + ) + except: + print("Failed ONNX export") + + else: + + efficientdet.backbone_net.model.set_swish(memory_efficient=False) + torch.onnx.export( + efficientdet, dummy, os.path.join(args.save_path, "efficientdet_best.onnx"), + verbose=False, opset_version=11 + ) + efficientdet.backbone_net.model.set_swish(memory_efficient=True) + + if epoch - args.best_epoch > args.es_patience > 0: + print(f"Stopped training at epoch: {epoch}, Lowerst loss: {loss}") + break + + else: + + for epoch in range(args.epochs): + + efficientdet.train() + + epoch_loss = [] + epoch_progress = tqdm(trainloader) + for idx, data in enumerate(epoch_progress): + try: + + # zero grading the optimizer + optimizer.zero_grad() + + # forward pass + + img_batch = data['img'].to(device).float() + annot_batch = data['annot'].to(device) + + cls_loss, reg_loss = efficientdet([img_batch, annot_batch]) + + # Optimization 
block + + cls_loss = cls_loss.mean() + reg_loss = reg_loss.mean() + + total_loss = cls_loss + reg_loss + if total_loss == 0: + continue + + total_loss.backward() + + torch.nn.utils.clip_grad_norm_(efficientdet.parameters(), 0.1) + + optimizer.step() + + epoch_loss.append(float(total_loss)) + total_mean_loss = np.mean(epoch_loss) + + epoch_progress.set_description( + "Epoch: {}/{}, Batch id: {}/{}, Classification Loss: {:.5f}, Regression Loss: {:.5f}, Batch Loss: {:.5f}, Total Loss: {:.5f}".format( + epoch+1, args.epochs, idx, len_trainloader, cls_loss, reg_loss, total_loss, total_mean_loss + ) + ) + + writer.add_scalar('Train/Total_Loss', total_mean_loss, epoch * len_trainloader + idx) + writer.add_scalar('Train/Regression_Loss', reg_loss, epoch * len_trainloader + idx) + writer.add_scalar('Train/Classification_loss (Focal Loss)', cls_loss, epoch * len_trainloader + idx) + + except Exception as e: + print(e) + continue + + scheduler.step(np.mean(epoch_loss)) + + torch.save(efficientdet, os.path.join(args.save_path, "efficientdet_best.pth")) + + dummy = torch.rand(1, 3, 512, 512) + dummy = dummy.to(device) + if isinstance(efficientdet, nn.DataParallel): + + efficientdet.backbone_net.model.set_swish(memory_efficient=False) + + try: + torch.onnx.export( + efficientdet.module, dummy, os.path.join(args.save_path, "efficientdet_best.onnx"), + verbose=False, opset_version=11 + ) + except: + print("Failed ONNX export") + + else: + + efficientdet.backbone_net.model.set_swish(memory_efficient=False) + torch.onnx.export( + efficientdet, dummy, os.path.join(args.save_path, "efficientdet_best.onnx"), + verbose=False, opset_version=11 + ) + efficientdet.backbone_net.model.set_swish(memory_efficient=True) + + + writer.close() + + + +if __name__ == "__main__": + opts = parse_args() + Train(opts) \ No newline at end of file diff --git a/tests/detection/effdet/2.jpg b/tests/detection/effdet/2.jpg new file mode 100644 index 0000000..6bbb7c1 Binary files /dev/null and b/tests/detection/effdet/2.jpg differ diff --git a/tests/detection/effdet/3.jpg b/tests/detection/effdet/3.jpg new file mode 100644 index 0000000..058416c Binary files /dev/null and b/tests/detection/effdet/3.jpg differ diff --git a/tests/detection/effdet/__init__.py b/tests/detection/effdet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/detection/effdet/test_effdet.py b/tests/detection/effdet/test_effdet.py new file mode 100644 index 0000000..1f3d4d1 --- /dev/null +++ b/tests/detection/effdet/test_effdet.py @@ -0,0 +1,17 @@ +import cv2 +from PIL import Image +from pyvision.detection import efficientdet + +model = efficientdet.EfficientDet("coco", thresh=0.95) + +img1 = cv2.imread("tests/detection/effdet/2.jpg") +img2 = cv2.imread("tests/detection/effdet/3.jpg") + +imgs = [img1, img2] + +for img in imgs: + img = cv2.resize(img, (416, 416)) + res = model.detect(img) + cv2.imshow("Frame", res[0]) + if cv2.waitKey() == ord('q'): + continue \ No newline at end of file
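
A quick usage sketch of the inference API this diff adds (`EffdetInferAPI`, exported as `EfficientDet`). The image path and output filename are placeholders; everything else follows the constructor and `detect()` signature in `pyvision/detection/efficientdet/model.py`:

```python
import cv2
from pyvision.detection import efficientdet

# COCO-trained EfficientDet-b0; the first call downloads weights into the package's weights/ folder.
model = efficientdet.EfficientDet(dataset="coco", thresh=0.4, gpu=False)

# "street.jpg" is a placeholder path -- detect() also accepts numpy arrays and PIL images.
drawn, duration, scores, labels, boxes = model.detect("street.jpg")

print(f"{len(labels)} detections in {duration:.3f}s")
for name, score in zip(labels, scores):
    print(name, f"{float(score):.2f}")

cv2.imwrite("street_detections.jpg", drawn)  # image with boxes and labels drawn on it
```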
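
`lib/utils.py` places 3 ratios x 3 scales = 9 anchors at every cell of pyramid levels P3-P7. For the default 512x512 input that works out to 49,104 anchors per image, which is the length of the second dimension the regressor and classifier heads produce. A small arithmetic check, using the same rounding as `Anchors.forward`:

```python
pyramid_levels = [3, 4, 5, 6, 7]
strides = [2 ** x for x in pyramid_levels]
num_anchors_per_cell = 3 * 3            # 3 aspect ratios x 3 scales, as in generate_anchors
image_size = 512                        # the default common_size

total = 0
for stride in strides:
    cells = (image_size + stride - 1) // stride   # same ceiling division as Anchors.forward
    total += cells * cells * num_anchors_per_cell
    print(f"stride {stride:3d}: {cells}x{cells} cells -> {cells * cells * num_anchors_per_cell} anchors")

print("total anchors:", total)  # 49104 for a 512x512 input
```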
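
`FocalLoss.forward` builds regression targets as anchor-relative deltas normalized by `[0.1, 0.1, 0.2, 0.2]`, and `BBoxTransform` in `lib/utils.py` applies the inverse mapping at inference time. A minimal round-trip sketch of the two directions, using made-up box coordinates:

```python
import torch

std = torch.tensor([0.1, 0.1, 0.2, 0.2])

def encode(anchor, gt):
    """Anchor-relative deltas as built in FocalLoss.forward (boxes are [x1, y1, x2, y2])."""
    aw, ah = anchor[2] - anchor[0], anchor[3] - anchor[1]
    ax, ay = anchor[0] + 0.5 * aw, anchor[1] + 0.5 * ah
    gw, gh = gt[2] - gt[0], gt[3] - gt[1]
    gx, gy = gt[0] + 0.5 * gw, gt[1] + 0.5 * gh
    deltas = torch.stack([(gx - ax) / aw, (gy - ay) / ah, torch.log(gw / aw), torch.log(gh / ah)])
    return deltas / std

def decode(anchor, deltas):
    """Inverse mapping, as applied by utils.BBoxTransform at inference time."""
    deltas = deltas * std
    aw, ah = anchor[2] - anchor[0], anchor[3] - anchor[1]
    ax, ay = anchor[0] + 0.5 * aw, anchor[1] + 0.5 * ah
    px, py = ax + deltas[0] * aw, ay + deltas[1] * ah
    pw, ph = torch.exp(deltas[2]) * aw, torch.exp(deltas[3]) * ah
    return torch.stack([px - 0.5 * pw, py - 0.5 * ph, px + 0.5 * pw, py + 0.5 * ph])

anchor = torch.tensor([10.0, 10.0, 50.0, 50.0])
gt = torch.tensor([12.0, 8.0, 60.0, 44.0])
print(decode(anchor, encode(anchor, gt)))  # recovers gt up to float error
```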
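
The classification head is trained with alpha-balanced focal loss. The helper below is a minimal standalone restatement of the per-element weighting used in `lib/losses.py`; it deliberately leaves out anchor matching and the ignored band of anchors with IoU between 0.4 and 0.5, which the full loss handles with the -1 targets:

```python
import torch

def focal_loss_elementwise(probs, targets, alpha=0.25, gamma=2.0):
    """Per-element alpha-balanced focal loss, mirroring the weighting in lib/losses.py.

    probs   -- sigmoid outputs in (0, 1)
    targets -- 0/1 tensor of the same shape (ignored anchors are handled upstream)
    """
    probs = torch.clamp(probs, 1e-4, 1.0 - 1e-4)
    alpha_factor = torch.where(targets == 1.0,
                               torch.full_like(probs, alpha),
                               torch.full_like(probs, 1.0 - alpha))
    focal_weight = torch.where(targets == 1.0, 1.0 - probs, probs)
    focal_weight = alpha_factor * focal_weight.pow(gamma)
    bce = -(targets * torch.log(probs) + (1.0 - targets) * torch.log(1.0 - probs))
    return focal_weight * bce

# Toy check: a confident correct prediction is down-weighted, a confident mistake is not.
probs = torch.tensor([0.95, 0.95])
targets = torch.tensor([1.0, 0.0])
print(focal_loss_elementwise(probs, targets))  # first value is orders of magnitude smaller
```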
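
`collater()` in `lib/dataset.py` pads each image's `[x1, y1, x2, y2, class]` annotation array to the batch maximum with rows of -1, and the loss later drops rows whose class column is -1, so images with different numbers of boxes (including none) batch cleanly. A minimal sketch of that padding step with made-up boxes:

```python
import torch

def pad_annotations(annots):
    """Pad per-image annotation tensors to a common length with -1 rows, as collater() does."""
    max_n = max(a.shape[0] for a in annots) or 1
    out = torch.full((len(annots), max_n, 5), -1.0)
    for i, a in enumerate(annots):
        if a.shape[0] > 0:
            out[i, : a.shape[0]] = a
    return out

a = torch.tensor([[0.0, 0.0, 10.0, 10.0, 3.0]])   # one box of class 3
b = torch.zeros((0, 5))                           # image with no boxes
padded = pad_annotations([a, b])
print(padded.shape)                               # torch.Size([2, 1, 5])
print((padded[1, :, 4] != -1).sum().item())       # 0 valid boxes for the second image
```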
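
Each BiFPN node in `lib/model.py` fuses its inputs with learnable non-negative weights normalized by their sum plus a small epsilon (fast normalized fusion), then applies a depthwise-separable conv, the same pattern as the `conv*_up`/`conv*_down` blocks. A reduced sketch of a single top-down node with illustrative channel counts and feature-map sizes:

```python
import torch
import torch.nn as nn

class FuseTwo(nn.Module):
    """One top-down BiFPN node: fast normalized fusion of a lateral input and an upsampled coarser input."""

    def __init__(self, channels, eps=1e-4):
        super().__init__()
        self.w = nn.Parameter(torch.ones(2))   # learnable fusion weights, kept non-negative via ReLU
        self.relu = nn.ReLU()
        self.eps = eps
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, groups=channels),  # depthwise
            nn.Conv2d(channels, channels, 1),                              # pointwise
            nn.BatchNorm2d(channels, momentum=0.9997, eps=4e-5),
            nn.ReLU(),
        )
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

    def forward(self, lateral, coarser):
        w = self.relu(self.w)
        w = w / (w.sum() + self.eps)
        return self.conv(w[0] * lateral + w[1] * self.upsample(coarser))

# 64-channel features at strides 16 and 32 for a 512x512 input (shapes are illustrative).
p5, p6 = torch.randn(1, 64, 32, 32), torch.randn(1, 64, 16, 16)
print(FuseTwo(64)(p5, p6).shape)  # torch.Size([1, 64, 32, 32])
```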