From 8ffb2c1b536215a1add39c7bd46b67e6e4058142 Mon Sep 17 00:00:00 2001 From: Sahil Date: Sun, 22 Oct 2023 09:22:00 +0000 Subject: [PATCH] yolox code and almost mmdetection --- ns_vfs/config/config.yaml | 7 + ns_vfs/config/loader.py | 16 + ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py | 547 +++++++++++++++++++ ns_vfs/model/vision/_base.py | 2 +- ns_vfs/model/vision/mmdetection.py | 62 +++ ns_vfs/model/vision/yolox.py | 167 ++++++ run_frame_to_automata.py | 29 +- 7 files changed, 820 insertions(+), 10 deletions(-) create mode 100644 ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py create mode 100644 ns_vfs/model/vision/mmdetection.py create mode 100644 ns_vfs/model/vision/yolox.py diff --git a/ns_vfs/config/config.yaml b/ns_vfs/config/config.yaml index 36c84bd..9637c9f 100644 --- a/ns_vfs/config/config.yaml +++ b/ns_vfs/config/config.yaml @@ -15,3 +15,10 @@ GROUNDING_DINO: YOLO: YOLO_CHECKPOINT_PATH: + +YOLOX: + YOLOX_CHECKPOINT_PATH: + +MMDETECTION: + MMDETECTION_CONFIG_PATH: + MMDETECTION_CHECKPOINT_PATH: \ No newline at end of file diff --git a/ns_vfs/config/loader.py b/ns_vfs/config/loader.py index 5ea6f68..7b18a31 100644 --- a/ns_vfs/config/loader.py +++ b/ns_vfs/config/loader.py @@ -31,4 +31,20 @@ def load_config(): "weights", "yolov8n.pt", ) + config.YOLOX.YOLOX_CHECKPOINT_PATH = os.path.join( + config.VERSION_AND_PATH.ARTIFACTS_PATH, + "weights", + "yolox_x.pth", + ) + config.MMDETECTION.MMDETECTION_CONFIG_PATH = os.path.join( + config.VERSION_AND_PATH.ROOT_PATH, + "ns_vfs", + "config", + "rtmdet_tiny_8xb32-300e_coco.py", + ) + config.MMDETECTION.MMDETECTION_CHECKPOINT_PATH = os.path.join( + config.VERSION_AND_PATH.ARTIFACTS_PATH, + "weights", + "rtmdet_tiny_8xb32-300e_coco_20220902_112414-78e30dcc.pth" + ) return config diff --git a/ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py b/ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py new file mode 100644 index 0000000..0180f23 --- /dev/null +++ b/ns_vfs/config/rtmdet_tiny_8xb32-300e_coco.py @@ -0,0 +1,547 @@ +auto_scale_lr = dict(base_batch_size=16, enable=False) +backend_args = None +base_lr = 0.004 +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' +custom_hooks = [ + dict( + ema_type='ExpMomentumEMA', + momentum=0.0002, + priority=49, + type='EMAHook', + update_buffers=True), + dict( + switch_epoch=280, + switch_pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 640, + 640, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict(type='PackDetInputs'), + ], + type='PipelineSwitchHook'), +] +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +default_hooks = dict( + checkpoint=dict(interval=10, max_keep_ckpts=3, type='CheckpointHook'), + logger=dict(interval=50, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='DetVisualizationHook')) +default_scope = 'mmdet' +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +img_scales = [ + ( + 640, + 640, + ), + ( + 320, + 320, + ), + ( + 960, + 960, + ), +] +interval = 10 +load_from = 
None +log_level = 'INFO' +log_processor = dict(by_epoch=True, type='LogProcessor', window_size=50) +max_epochs = 300 +model = dict( + backbone=dict( + act_cfg=dict(inplace=True, type='SiLU'), + arch='P5', + channel_attention=True, + deepen_factor=0.167, + expand_ratio=0.5, + init_cfg=dict( + checkpoint= + 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth', + prefix='backbone.', + type='Pretrained'), + norm_cfg=dict(type='SyncBN'), + type='CSPNeXt', + widen_factor=0.375), + bbox_head=dict( + act_cfg=dict(inplace=True, type='SiLU'), + anchor_generator=dict( + offset=0, strides=[ + 8, + 16, + 32, + ], type='MlvlPointGenerator'), + bbox_coder=dict(type='DistancePointBBoxCoder'), + exp_on_reg=False, + feat_channels=96, + in_channels=96, + loss_bbox=dict(loss_weight=2.0, type='GIoULoss'), + loss_cls=dict( + beta=2.0, + loss_weight=1.0, + type='QualityFocalLoss', + use_sigmoid=True), + norm_cfg=dict(type='SyncBN'), + num_classes=80, + pred_kernel_size=1, + share_conv=True, + stacked_convs=2, + type='RTMDetSepBNHead', + with_objectness=False), + data_preprocessor=dict( + batch_augments=None, + bgr_to_rgb=False, + mean=[ + 103.53, + 116.28, + 123.675, + ], + std=[ + 57.375, + 57.12, + 58.395, + ], + type='DetDataPreprocessor'), + neck=dict( + act_cfg=dict(inplace=True, type='SiLU'), + expand_ratio=0.5, + in_channels=[ + 96, + 192, + 384, + ], + norm_cfg=dict(type='SyncBN'), + num_csp_blocks=1, + out_channels=96, + type='CSPNeXtPAFPN'), + test_cfg=dict( + max_per_img=300, + min_bbox_size=0, + nms=dict(iou_threshold=0.65, type='nms'), + nms_pre=30000, + score_thr=0.001), + train_cfg=dict( + allowed_border=-1, + assigner=dict(topk=13, type='DynamicSoftLabelAssigner'), + debug=False, + pos_weight=-1), + type='RTMDet') +optim_wrapper = dict( + optimizer=dict(lr=0.004, type='AdamW', weight_decay=0.05), + paramwise_cfg=dict( + bias_decay_mult=0, bypass_duplicate=True, norm_decay_mult=0), + type='OptimWrapper') +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=1000, start_factor=1e-05, + type='LinearLR'), + dict( + T_max=150, + begin=150, + by_epoch=True, + convert_to_iter_based=True, + end=300, + eta_min=0.0002, + type='CosineAnnealingLR'), +] +resume = False +stage2_num_epochs = 20 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=5, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='CocoMetric') +test_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 640, + 640, + ), type='Pad'), + dict(type='LoadAnnotations', with_bbox=True), + 
dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), +] +train_cfg = dict( + dynamic_intervals=[ + ( + 280, + 1, + ), + ], + max_epochs=300, + type='EpochBasedTrainLoop', + val_interval=10) +train_dataloader = dict( + batch_sampler=None, + batch_size=32, + dataset=dict( + ann_file='annotations/instances_train2017.json', + backend_args=None, + data_prefix=dict(img='train2017/'), + data_root='data/coco/', + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=20, + pad_val=114.0, + random_pop=False, + type='CachedMosaic'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 1280, + 1280, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=10, + pad_val=( + 114, + 114, + 114, + ), + prob=0.5, + random_pop=False, + ratio_range=( + 1.0, + 1.0, + ), + type='CachedMixUp'), + dict(type='PackDetInputs'), + ], + type='CocoDataset'), + num_workers=10, + persistent_workers=True, + pin_memory=True, + sampler=dict(shuffle=True, type='DefaultSampler')) +train_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=20, + pad_val=114.0, + random_pop=False, + type='CachedMosaic'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 1280, + 1280, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 640, + 640, + ), type='Pad'), + dict( + img_scale=( + 640, + 640, + ), + max_cached_images=10, + pad_val=( + 114, + 114, + 114, + ), + prob=0.5, + random_pop=False, + ratio_range=( + 1.0, + 1.0, + ), + type='CachedMixUp'), + dict(type='PackDetInputs'), +] +train_pipeline_stage2 = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 640, + 640, + ), + type='RandomResize'), + dict(crop_size=( + 640, + 640, + ), type='RandomCrop'), + dict(type='YOLOXHSVRandomAug'), + dict(prob=0.5, type='RandomFlip'), + dict(pad_val=dict(img=( + 114, + 114, + 114, + )), size=( + 640, + 640, + ), type='Pad'), + dict(type='PackDetInputs'), +] +tta_model = dict( + tta_cfg=dict(max_per_img=100, nms=dict(iou_threshold=0.6, type='nms')), + type='DetTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict(keep_ratio=True, scale=( + 320, + 320, + ), type='Resize'), + dict(keep_ratio=True, scale=( + 960, + 960, + ), type='Resize'), + ], + [ + dict(prob=1.0, type='RandomFlip'), + dict(prob=0.0, type='RandomFlip'), + ], + [ + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 960, + 960, + ), + type='Pad'), + ], + [ + dict(type='LoadAnnotations', with_bbox=True), + ], + [ + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 
'scale_factor', + 'flip', + 'flip_direction', + ), + type='PackDetInputs'), + ], + ], + type='TestTimeAug'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=5, + dataset=dict( + ann_file='annotations/instances_val2017.json', + backend_args=None, + data_prefix=dict(img='val2017/'), + data_root='data/coco/', + pipeline=[ + dict(backend_args=None, type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 640, + 640, + ), type='Resize'), + dict( + pad_val=dict(img=( + 114, + 114, + 114, + )), + size=( + 640, + 640, + ), + type='Pad'), + dict(type='LoadAnnotations', with_bbox=True), + dict( + meta_keys=( + 'img_id', + 'img_path', + 'ori_shape', + 'img_shape', + 'scale_factor', + ), + type='PackDetInputs'), + ], + test_mode=True, + type='CocoDataset'), + drop_last=False, + num_workers=10, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + ann_file='data/coco/annotations/instances_val2017.json', + backend_args=None, + format_only=False, + metric='bbox', + proposal_nums=( + 100, + 1, + 10, + ), + type='CocoMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='DetLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) diff --git a/ns_vfs/model/vision/_base.py b/ns_vfs/model/vision/_base.py index c9beb39..f472bcc 100644 --- a/ns_vfs/model/vision/_base.py +++ b/ns_vfs/model/vision/_base.py @@ -35,7 +35,7 @@ def get_labels(self) -> list: def get_detections(self) -> sv.Detections: """Return sv.Detections""" - return self._detection + return self._detections def get_confidence(self) -> np.ndarray: return self._confidence diff --git a/ns_vfs/model/vision/mmdetection.py b/ns_vfs/model/vision/mmdetection.py new file mode 100644 index 0000000..104661e --- /dev/null +++ b/ns_vfs/model/vision/mmdetection.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from mmdet.apis import DetInferencer +from omegaconf import DictConfig +import supervision as sv +import numpy as np +import warnings + +from ns_vfs.model.vision._base import ComputerVisionDetector + +warnings.filterwarnings("ignore") + + +class MMDetection(ComputerVisionDetector): + """MMDetection""" + + def __init__( + self, + config: DictConfig, + config_path: str, + weight_path: str + ) -> None: + self.model = self.load_model(weight_path, config_path) + self._config = config + + def load_model(self, weight_path, config_path) -> DetInferencer: + """Load weight. + + Args: + weight_path (str): Path to weight file. + + Returns: + None + """ + init_args = {'model': config_path, 'weights': weight_path, 'device': 'cpu', 'palette': 'none'} + + return DetInferencer(**init_args) + + def _parse_class_name(self, class_names: list[str]) -> list[str]: + """Parse class name. + + Args: + class_names (list[str]): List of class names. + + Returns: + list[str]: List of class names. + """ + return [f"all {class_name}s" for class_name in class_names] + + def detect(self, frame_img: np.ndarray, classes: list) -> any: + """Detect object in frame. + + Args: + frame_img (np.ndarray): Frame image. + classes (list[str]): List of class names. + + Returns: + any: Detections. 
+ """ + + return None + diff --git a/ns_vfs/model/vision/yolox.py b/ns_vfs/model/vision/yolox.py new file mode 100644 index 0000000..fb078b2 --- /dev/null +++ b/ns_vfs/model/vision/yolox.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +import warnings + +import os +import cv2 +import torch +import numpy as np +import supervision as sv +from omegaconf import DictConfig +from yolox.data.data_augment import ValTransform +from yolox.data.datasets import COCO_CLASSES +from yolox.exp import get_exp +from yolox.utils import fuse_model, get_model_info, postprocess, vis + + +from ns_vfs.model.vision._base import ComputerVisionDetector + +warnings.filterwarnings("ignore") + + +class YoloX(ComputerVisionDetector): + """YoloX.""" + + def __init__(self, config: DictConfig, weight_path: str) -> None: + self.model = self.load_model(weight_path) + self._config = config + + def load_model(self, weight_path) -> Predictor: + """Load weight. + Args: + weight_path (str): Path to weight file. + Returns: + None + """ + exp = get_exp(None, "yolox-x") # modify in case model changes + exp.test_conf = 0.25 + exp.nmsthre = 0.45 + exp.test_size = (640, 640) + + model = exp.get_model() + model.eval() + ckpt = torch.load(weight_path, map_location="cpu") + model.load_state_dict(ckpt["model"]) + + return Predictor(model, exp) + + def _parse_class_name(self, class_names: list[str]) -> list[str]: + """Parse class name. + Args: + class_names (list[str]): List of class names. + Returns: + list[str]: List of class names. + """ + return [f"all {class_name}s" for class_name in class_names] + + def detect(self, frame_img: np.ndarray, classes: list) -> any: + """Detect object in frame. + Args: + frame_img (np.ndarray): Frame image. + classes (list[str]): List of class names. + Returns: + any: Detections. 
+ """ + class_reversed = COCO_CLASSES.index(classes[0]) + outputs, img_info = self.model.inference(frame_img) + output = outputs[0].cpu() + cls = (output[:, 6]).numpy() + scores = (output[:, 4] * output[:, 5]).numpy() + self._confidence = np.array([]) + self._labels = [] + bbox_total = (output[:, 0:4]/img_info["ratio"]).cpu().detach().numpy() + print(bbox_total) + bbox = [] + for i in range(len(cls)): + print(cls[i]) + if cls[i] == class_reversed: + self._confidence = np.append(self._confidence, scores[i]) + self._labels.append(f"{COCO_CLASSES[int(cls[i])]} {scores[i]}") + bbox.append(bbox_total[i]) + + print(self._labels) + + print(bbox) + self._detections = sv.Detections(xyxy=np.array(bbox)) + self._size = len(self._confidence) + + # result_image = self.model.visual(output, img_info, self.model.confthre) + # file_name ="/opt/Neuro-Symbolic-Video-Frame-Search/ns_vfs/model/vision/test.png" + # cv2.imwrite(file_name, result_image) + # print(outputs) + + return outputs + + +class Predictor(object): + def __init__( + self, + model, + exp, + cls_names=COCO_CLASSES, + decoder=None, + device="cpu", + fp16=False, + legacy=False, + ): + self.model = model + self.cls_names = cls_names + self.decoder = decoder + self.num_classes = exp.num_classes + self.confthre = exp.test_conf + self.nmsthre = exp.nmsthre + self.test_size = exp.test_size + self.device = device + self.fp16 = fp16 + self.preproc = ValTransform(legacy=legacy) + + def inference(self, img): + img_info = {} + + height, width = img.shape[:2] + img_info["height"] = height + img_info["width"] = width + img_info["raw_img"] = img + + ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1]) + img_info["ratio"] = ratio + + img, _ = self.preproc(img, None, self.test_size) + img = torch.from_numpy(img).unsqueeze(0) + img = img.float() + if self.device == "gpu": + img = img.cuda() + if self.fp16: + img = img.half() # to FP16 + + with torch.no_grad(): + outputs = self.model(img) + if self.decoder is not None: + outputs = self.decoder(outputs, dtype=outputs.type()) + outputs = postprocess( + outputs, self.num_classes, self.confthre, + self.nmsthre, class_agnostic=True + ) + return outputs, img_info + + def visual(self, output, img_info, cls_conf=0.35): + ratio = img_info["ratio"] + img = img_info["raw_img"] + if output is None: + return img + output = output.cpu() + + bboxes = output[:, 0:4] + + # preprocessing: resize + bboxes /= ratio + + cls = output[:, 6] + scores = output[:, 4] * output[:, 5] + + # print("CLS: %s" %cls) + # print("cls_conf: %s" %cls_conf) + # print("scores: %s" %scores) + + vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names) + return vis_res \ No newline at end of file diff --git a/run_frame_to_automata.py b/run_frame_to_automata.py index ff6762b..cc44686 100644 --- a/run_frame_to_automata.py +++ b/run_frame_to_automata.py @@ -3,6 +3,8 @@ from ns_vfs.config.loader import load_config from ns_vfs.model.vision.grounding_dino import GroundingDino from ns_vfs.model.vision.yolo import Yolo +from ns_vfs.model.vision.yolox import YoloX +from ns_vfs.model.vision.mmdetection import MMDetection from ns_vfs.processor.video_processor import VideoFrameWindowProcessor from ns_vfs.video_to_automaton import VideotoAutomaton @@ -14,24 +16,33 @@ config = load_config() frame2automaton = VideotoAutomaton( + detector=YoloX( + config=config.YOLOX, + weight_path=config.YOLOX.YOLOX_CHECKPOINT_PATH, + ), # detector=Yolo( # config=config.YOLO, # weight_path=config.YOLO.YOLO_CHECKPOINT_PATH, # ), - 
detector=GroundingDino( - config=config.GROUNDING_DINO, - weight_path=config.GROUNDING_DINO.GROUNDING_DINO_CHECKPOINT_PATH, - config_path=config.GROUNDING_DINO.GROUNDING_DINO_CONFIG_PATH, - ), + # detector=MMDetection( + # config=config.MMDETECTION, + # config_path=config.MMDETECTION.MMDETECTION_CONFIG_PATH, + # weight_path=config.MMDETECTION.MMDETECTION_CHECKPOINT_PATH + # ), + # detector=GroundingDino( + # config=config.GROUNDING_DINO, + # weight_path=config.GROUNDING_DINO.GROUNDING_DINO_CHECKPOINT_PATH, + # config_path=config.GROUNDING_DINO.GROUNDING_DINO_CONFIG_PATH, + # ), video_processor=VideoFrameWindowProcessor( video_path=sample_video_path, artifact_dir=config.VERSION_AND_PATH.ARTIFACTS_PATH, ), artifact_dir=config.VERSION_AND_PATH.ARTIFACTS_PATH, - proposition_set=["person", "car"], - is_annotation=False, # TODO: Debug only - save_image=False, # TODO: Debug only - ltl_formula='P>=0.99 [F "person"]', + proposition_set=["car"], + is_annotation=True, # TODO: Debug only + save_image=True, # TODO: Debug only + ltl_formula='P>=0.99 [F "car"]', ) frame_window_automata = frame2automaton.run()
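
A quick way to sanity-check the new YoloX detector outside the VideotoAutomaton pipeline is sketched below. It is illustrative only, not part of the patch: it assumes the yolox_x.pth checkpoint already sits at the path that load_config() wires up, and "sample_frame.png" is a hypothetical stand-in for a real video frame.

    import cv2

    from ns_vfs.config.loader import load_config
    from ns_vfs.model.vision.yolox import YoloX

    config = load_config()
    detector = YoloX(
        config=config.YOLOX,
        weight_path=config.YOLOX.YOLOX_CHECKPOINT_PATH,
    )

    # cv2.imread yields a BGR ndarray, which is what YOLOX's ValTransform expects.
    frame = cv2.imread("sample_frame.png")  # hypothetical test image
    detector.detect(frame, ["car"])
    print(detector.get_labels())      # labels formatted as "<class> <score>"
    print(detector.get_confidence())  # matching per-detection scores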