From 8ffb2c1b536215a1add39c7bd46b67e6e4058142 Mon Sep 17 00:00:00 2001 From: Sahil Date: Sun, 22 Oct 2023 09:22:00 +0000 Subject: [PATCH] yolox code and almost mmdetection --- ns_vfs/config/config.yaml | 7 + ns_vfs/config/loader.py | 16 + ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py | 547 +++++++++++++++++++ ns_vfs/model/vision/_base.py | 2 +- ns_vfs/model/vision/mmdetection.py | 62 +++ ns_vfs/model/vision/yolox.py | 167 ++++++ run_frame_to_automata.py | 29 +- 7 files changed, 820 insertions(+), 10 deletions(-) create mode 100644 ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py create mode 100644 ns_vfs/model/vision/mmdetection.py create mode 100644 ns_vfs/model/vision/yolox.py diff --git a/ns_vfs/config/config.yaml b/ns_vfs/config/config.yaml index 36c84bd..9637c9f 100644 --- a/ns_vfs/config/config.yaml +++ b/ns_vfs/config/config.yaml @@ -15,3 +15,10 @@ GROUNDING_DINO: YOLO: YOLO_CHECKPOINT_PATH: + +YOLOX: + YOLOX_CHECKPOINT_PATH: + +MMDETECTION: + MMDETECTION_CONFIG_PATH: + MMDETECTION_CHECKPOINT_PATH: \ No newline at end of file diff --git a/ns_vfs/config/loader.py b/ns_vfs/config/loader.py index 5ea6f68..7b18a31 100644 --- a/ns_vfs/config/loader.py +++ b/ns_vfs/config/loader.py @@ -31,4 +31,20 @@ def load_config(): "weights", "yolov8n.pt", ) + config.YOLOX.YOLOX_CHECKPOINT_PATH = os.path.join( + config.VERSION_AND_PATH.ARTIFACTS_PATH, + "weights", + "yolox_x.pth", + ) + config.MMDETECTION.MMDETECTION_CONFIG_PATH = os.path.join( + config.VERSION_AND_PATH.ROOT_PATH, + "ns_vfs", + "config", + "rtmdet_tiny_8xb32-300e_coco.py", + ) + config.MMDETECTION.MMDETECTION_CHECKPOINT_PATH = os.path.join( + config.VERSION_AND_PATH.ARTIFACTS_PATH, + "weights", + "rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth" + ) return config diff --git a/ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py b/ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py new file mode 100644 index 0000000..0180f23 --- /dev/null +++ b/ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py @@ -0,0 +1,547 @@ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +base_lr = 0.004 +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0002, + priority=49, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=280, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 640, + 640, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict(type='PackDetInputs'), + ], + type='PipelineSwitchHook'), +] +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=3, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +interval = 10 +load_from = 
None +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +max_epochs = 300 +model = dict( + backbone=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + channel_attention=True, + deepen_factor=0.167, + expand_ratio=0.5, + init_cfg=dict( + checkpoint= + 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth', + prefix='backbone.', + type='Pretrained'), + norm_cfg=dict(type='SyncBN'), + type='CSPNeXt', + widen_factor=0.375), + bbox_head=dict( + act_cfg=dict(inplace=True, type='SiLU'), + anchor_generator=dict( + offset=0, strides=[ + 8, + 16, + 32, + ], type='MlvlPointGenerator'), + bbox_coder=dict(type='DistancePointBBoxCoder'), + exp_on_reg=False, + feat_channels=96, + in_channels=96, + loss_bbox=dict(loss_weight=2.0, type='GIoULoss'), + loss_cls=dict( + beta=2.0, + loss_weight=1.0, + type='QualityFocalLoss', + use_sigmoid=True), + norm_cfg=dict(type='SyncBN'), + num_classes=80, + pred_kernel_size=1, + share_conv=True, + stacked_convs=2, + type='RTMDetSepBNHead', + with_objectness=False), + data_preprocessor=dict( + batch_augments=None, + bgr_to_rgb=False, + mean=[ + 103.53, + 116.28, + 123.675, + ], + std=[ + 57.375, + 57.12, + 58.395, + ], + type='DetDataPreprocessor'), + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + expand_ratio=0.5, + in_channels=[ + 96, + 192, + 384, + ], + norm_cfg=dict(type='SyncBN'), + num_csp_blocks=1, + out_channels=96, + type='CSPNeXtPAFPN'), + test_cfg=dict( + max_per_img=300, + min_bbox_size=0, + nms=dict(iou_threshold=0.65, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + allowed_border=-1, + assigner=dict(topk=13, type='DynamicSoftLabelAssigner'), + debug=False, + pos_weight=-1), + type='RTMDet') +optim_wrapper = dict( + optimizer=dict(lr=0.004, type='AdamW', weight_decay=0.05), + paramwise_cfg=dict( + bias_decay_mult=0, bypass_duplicate=True, norm_decay_mult=0), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=1000, start_factor=1e-05, + type='LinearLR'), + dict( + T_max=150, + begin=150, + by_epoch=True, + convert_to_iter_based=True, + end=300, + eta_min=0.0002, + type='CosineAnnealingLR'), +] +resume = False +stage2_num_epochs = 20 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=5, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='CocoMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 640, + 640, + ), type='Pad'), + dict(type='LoadAnnotations', with_bbox=True), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict( + dynamic_intervals=[ + ( + 280, + 1, + ), + ], + max_epochs=300, + type='EpochBasedTrainLoop', + val_interval=10) +train_dataloader = dict( + batch_sampler=None, + batch_size=32, + dataset=dict( + ann_file='annotations/instances_train2017.json', + backend_args=None, + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=20, + pad_val=114.0, + random_pop=False, + type='CachedMosaic'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 1280, + 1280, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=10, + pad_val=( + 114, + 114, + 114, + ), + prob=0.5, + random_pop=False, + ratio_range=( + 1.0, + 1.0, + ), + type='CachedMixUp'), + dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=10, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=20, + pad_val=114.0, + random_pop=False, + type='CachedMosaic'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 1280, + 1280, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 640, + 640, + ), type='Pad'), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=10, + pad_val=( + 114, + 114, + 114, + ), + prob=0.5, + random_pop=False, + ratio_range=( + 1.0, + 1.0, + ), + type='CachedMixUp'), + dict(type='PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 640, + 640, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 640, + 640, + ), type='Pad'), + dict(type='PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=100, nms=dict(iou_threshold=0.6, type='nms')), + type='DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict(keep_ratio=True, scale=( + 320, + 320, + ), type='Resize'), + dict(keep_ratio=True, scale=( + 960, + 960, + ), type='Resize'), + ], + [ + dict(prob=1.0, type='RandomFlip'), + dict(prob=0.0, type='RandomFlip'), + ], + [ + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 960, + 960, + ), + type='Pad'), + ], + [ + dict(type='LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'scale_factor', + 'flip', + 'flip_direction', + ), + type='PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=5, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='CocoMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) diff --git a/ns_vfs/model/vision/_base.py b/ns_vfs/model/vision/_base.py index c9beb39..f472bcc 100644 --- a/ns_vfs/model/vision/_base.py +++ b/ns_vfs/model/vision/_base.py @@ -35,7 +35,7 @@ def get_labels(self) -> list: def get_detections(self) -> sv.Detections: """Return sv.Detections""" - return self._detection + return self._detections def get_confidence(self) -> np.ndarray: return self._confidence diff --git a/ns_vfs/model/vision/mmdetection.py b/ns_vfs/model/vision/mmdetection.py new file mode 100644 index 0000000..104661e --- /dev/null +++ b/ns_vfs/model/vision/mmdetection.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from mmdet.apis import DetInferencer +from omegaconf import DictConfig +import supervision as sv +import numpy as np +import warnings + +from ns_vfs.model.vision._base import ComputerVisionDetector + +warnings.filterwarnings("ignore") + + +class MMDetection(ComputerVisionDetector): + """MMDetection""" + + def __init__( + self, + config: DictConfig, + config_path: str, + weight_path: str + ) -> None: + self.model = self.load_model(weight_path, config_path) + self._config = config + + def load_model(self, weight_path, config_path) -> DetInferencer: + """Load weight. + + Args: + weight_path (str): Path to weight file. + + Returns: + None + """ + init_args = {'model': config_path, 'weights': weight_path, 'device': 'cpu', 'palette': 'none'} + + return DetInferencer(**init_args) + + def _parse_class_name(self, class_names: list[str]) -> list[str]: + """Parse class name. + + Args: + class_names (list[str]): List of class names. + + Returns: + list[str]: List of class names. + """ + return [f"all {class_name}s" for class_name in class_names] + + def detect(self, frame_img: np.ndarray, classes: list) -> any: + """Detect object in frame. + + Args: + frame_img (np.ndarray): Frame image. + classes (list[str]): List of class names. + + Returns: + any: Detections. 
+ """ + + return None + diff --git a/ns_vfs/model/vision/yolox.py b/ns_vfs/model/vision/yolox.py new file mode 100644 index 0000000..fb078b2 --- /dev/null +++ b/ns_vfs/model/vision/yolox.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import warnings + +import os +import cv2 +import torch +import numpy as np +import supervision as sv +from omegaconf import DictConfig +from yolox.data.data_augment import ValTransform +from yolox.data.datasets import COCO_CLASSES +from yolox.exp import get_exp +from yolox.utils import fuse_model, get_model_info, postprocess, vis + + +from ns_vfs.model.vision._base import ComputerVisionDetector + +warnings.filterwarnings("ignore") + + +class YoloX(ComputerVisionDetector): + """YoloX.""" + + def __init__(self, config: DictConfig, weight_path: str) -> None: + self.model = self.load_model(weight_path) + self._config = config + + def load_model(self, weight_path) -> Predictor: + """Load weight. + Args: + weight_path (str): Path to weight file. + Returns: + None + """ + exp = get_exp(None, "yolox-x") # modify in case model changes + exp.test_conf = 0.25 + exp.nmsthre = 0.45 + exp.test_size = (640, 640) + + model = exp.get_model() + model.eval() + ckpt = torch.load(weight_path, map_location="cpu") + model.load_state_dict(ckpt["model"]) + + return Predictor(model, exp) + + def _parse_class_name(self, class_names: list[str]) -> list[str]: + """Parse class name. + Args: + class_names (list[str]): List of class names. + Returns: + list[str]: List of class names. + """ + return [f"all {class_name}s" for class_name in class_names] + + def detect(self, frame_img: np.ndarray, classes: list) -> any: + """Detect object in frame. + Args: + frame_img (np.ndarray): Frame image. + classes (list[str]): List of class names. + Returns: + any: Detections. 
+ """ + class_reversed = COCO_CLASSES.index(classes[0]) + outputs, img_info = self.model.inference(frame_img) + output = outputs[0].cpu() + cls = (output[:, 6]).numpy() + scores = (output[:, 4] * output[:, 5]).numpy() + self._confidence = np.array([]) + self._labels = [] + bbox_total = (output[:, 0:4]/img_info["ratio"]).cpu().detach().numpy() + print(bbox_total) + bbox = [] + for i in range(len(cls)): + print(cls[i]) + if cls[i] == class_reversed: + self._confidence = np.append(self._confidence, scores[i]) + self._labels.append(f"{COCO_CLASSES[int(cls[i])]} {scores[i]}") + bbox.append(bbox_total[i]) + + print(self._labels) + + print(bbox) + self._detections = sv.Detections(xyxy=np.array(bbox)) + self._size = len(self._confidence) + + # result_image = self.model.visual(output, img_info, self.model.confthre) + # file_name ="/opt/Neuro-Symbolic-Video-Frame-Search/ns_vfs/model/vision/test.png" + # cv2.imwrite(file_name, result_image) + # print(outputs) + + return outputs + + +class Predictor(object): + def __init__( + self, + model, + exp, + cls_names=COCO_CLASSES, + decoder=None, + device="cpu", + fp16=False, + legacy=False, + ): + self.model = model + self.cls_names = cls_names + self.decoder = decoder + self.num_classes = exp.num_classes + self.confthre = exp.test_conf + self.nmsthre = exp.nmsthre + self.test_size = exp.test_size + self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) + + def inference(self, img): + img_info = {} + + height, width = img.shape[:2] + img_info["height"] = height + img_info["width"] = width + img_info["raw_img"] = img + + ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + img_info["ratio"] = ratio + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + if self.device == "gpu": + img = img.cuda() + if self.fp16: + img = img.half() # to FP16 + + with torch.no_grad(): + outputs = self.model(img) + if self.decoder is not None: + outputs = self.decoder(outputs, dtype=outputs.type()) + outputs = postprocess( + outputs, self.num_classes, self.confthre, + self.nmsthre, class_agnostic=True + ) + return outputs, img_info + + def visual(self, output, img_info, cls_conf=0.35): + ratio = img_info["ratio"] + img = img_info["raw_img"] + if output is None: + return img + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + bboxes /= ratio + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + # print("CLS: %s" %cls) + # print("cls_conf: %s" %cls_conf) + # print("scores: %s" %scores) + + vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) + return vis_res \ No newline at end of file diff --git a/run_frame_to_automata.py b/run_frame_to_automata.py index ff6762b..cc44686 100644 --- a/run_frame_to_automata.py +++ b/run_frame_to_automata.py @@ -3,6 +3,8 @@ from ns_vfs.config.loader import load_config from ns_vfs.model.vision.grounding_dino import GroundingDino from ns_vfs.model.vision.yolo import Yolo +from ns_vfs.model.vision.yolox import YoloX +from ns_vfs.model.vision.mmdetection import MMDetection from ns_vfs.processor.video_processor import VideoFrameWindowProcessor from ns_vfs.video_to_automaton import VideotoAutomaton @@ -14,24 +16,33 @@ config = load_config() frame2automaton = VideotoAutomaton( + detector=YoloX( + config=config.YOLOX, + weight_path=config.YOLOX.YOLOX_CHECKPOINT_PATH, + ), # detector=Yolo( # config=config.YOLO, # weight_path=config.YOLO.YOLO_CHECKPOINT_PATH, # ), - 
detector=GroundingDino( - config=config.GROUNDING_DINO, - weight_path=config.GROUNDING_DINO.GROUNDING_DINO_CHECKPOINT_PATH, - config_path=config.GROUNDING_DINO.GROUNDING_DINO_CONFIG_PATH, - ), + # detector=MMDetection( + # config=config.MMDETECTION, + # config_path=config.MMDETECTION.MMDETECTION_CONFIG_PATH, + # weight_path=config.MMDETECTION.MMDETECTION_CHECKPOINT_PATH + # ), + # detector=GroundingDino( + # config=config.GROUNDING_DINO, + # weight_path=config.GROUNDING_DINO.GROUNDING_DINO_CHECKPOINT_PATH, + # config_path=config.GROUNDING_DINO.GROUNDING_DINO_CONFIG_PATH, + # ), video_processor=VideoFrameWindowProcessor( video_path=sample_video_path, artifact_dir=config.VERSION_AND_PATH.ARTIFACTS_PATH, ), artifact_dir=config.VERSION_AND_PATH.ARTIFACTS_PATH, - proposition_set=["person", "car"], - is_annotation=False, # TODO: Debug only - save_image=False, # TODO: Debug only - ltl_formula='P>=0.99 [F "person"]', + proposition_set=["car"], + is_annotation=True, # TODO: Debug only + save_image=True, # TODO: Debug only + ltl_formula='P>=0.99 [F "car"]', ) frame_window_automata = frame2automaton.run()
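
A quick way to sanity-check the new YoloX detector outside the VideotoAutomaton pipeline is sketched below. It is illustrative only, not part of the patch: it assumes the yolox_x.pth checkpoint already sits at the path that load_config() wires up, and "sample_frame.png" is a hypothetical stand-in for a real video frame.

    import cv2

    from ns_vfs.config.loader import load_config
    from ns_vfs.model.vision.yolox import YoloX

    config = load_config()
    detector = YoloX(
        config=config.YOLOX,
        weight_path=config.YOLOX.YOLOX_CHECKPOINT_PATH,
    )

    # cv2.imread yields a BGR ndarray, which is what YOLOX's ValTransform expects.
    frame = cv2.imread("sample_frame.png")  # hypothetical test image
    detector.detect(frame, ["car"])
    print(detector.get_labels())      # labels formatted as "<class> <score>"
    print(detector.get_confidence())  # matching per-detection scores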