diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bcb9644
--- /dev/null
+++ b/README.md
@@ -0,0 +1,225 @@
+
+# Exportable DensePose inference using TorchScript
+
+### This is an unofficial inference implementation of [DensePose from detectron2](https://github.com/facebookresearch/detectron2/tree/main/projects/DensePose)
+
+The project focuses on providing a simple, TorchScript-compilable inference interface for the original pretrained
+models, freeing them from the heavy dependency on the detectron2 framework.
+
+#### Only inference is supported, no training. Confidence estimation and bootstrapping pipelines are not implemented.
+
+# Quickstart
+To run an already exported model (available in the
+[Releases](https://github.com/dajes/DensePose-TorchScript/releases) section) you only need PyTorch and OpenCV
+(for image and video I/O):
+
+```
+pip install torch torchvision opencv-python
+```
+
+Then you can run the model on an image or video using the small example script:
+
+```
+python run.py <model> <input> [--cpu] [--fp32]
+```
+This will run the model and save the result next to the input file, with a `_pred` suffix added to the file name.
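+
+If you prefer to call the exported TorchScript module from your own Python code, the sketch below mirrors what
+`run.py` does for a single image. The file names are placeholders, and it assumes `End2EndVisualizer` is importable
+from this repository's `visualizer` module, as in `run.py`:
+
+```
+import cv2
+import torch
+
+from visualizer import End2EndVisualizer  # assumed import path for the repo's visualizer
+
+# Placeholder paths; substitute your exported model and input image.
+predictor = torch.jit.load("model.pt").eval()
+if torch.cuda.is_available():
+    predictor = predictor.cuda().half()  # half precision on GPU, as in run.py
+else:
+    predictor = predictor.float()        # fp16 weights are upcast for CPU inference
+
+img = cv2.imread("input.jpg")               # HxWx3 uint8 BGR image
+outputs = predictor(torch.from_numpy(img))  # raw uint8 tensor, exactly as run.py passes it
+
+visualizer = End2EndVisualizer(alpha=0.7, keep_bg=False)
+cv2.imwrite("input_pred.jpg", visualizer.visualize(img, outputs))
+```
+
+The returned dictionary contains the image size, boxes, scores, classes and DensePose tensors
+(see `detector_postprocess` in `detectron2/modeling/postprocessing.py` for the exact keys).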
+
+
+## Exporting a model by yourself
+
+To export a model you need a model checkpoint and a config file. You can find both in the tables below.
+
+```
+python export.py <config.yaml> <model.pkl or URL> [--fp16]
+```
+
+If `--fp16` is specified, the model is exported in half precision. This roughly halves the exported file size at the
+cost of some numerical precision.
+
+Example of exporting the R_50_FPN_s1x_legacy model in fp16:
+
+```
+python export.py configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x_legacy/164832157/model_final_d366fa.pkl --fp16
+```
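+
+An fp16 export does not lock you into half-precision inference: the loaded module can be cast back to fp32 at
+runtime, which is what `run.py` does when `--fp32` is passed (and always does on CPU). A minimal sketch, with a
+placeholder model path:
+
+```
+import torch
+
+# Placeholder path: substitute the TorchScript file written by export.py
+predictor = torch.jit.load("densepose_rcnn_R_50_FPN_s1x_legacy_fp16.pt").eval()
+
+if torch.cuda.is_available():
+    predictor = predictor.cuda().float()  # upcast fp16 weights, mirroring run.py --fp32
+else:
+    predictor = predictor.float()         # CPU inference always runs in fp32
+```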
+
+### License
+
+All models available for download are licensed under the
+[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/)
+
+### Legacy Models
+
+Baselines trained using schedules from [Güler et al, 2018](https://arxiv.org/pdf/1802.00434.pdf)
+
+
+
+
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | segm AP | dp. AP GPS | dp. AP GPSm | model id | download |
+|------|----------|---------------------|-----------------------|----------------|--------|---------|------------|-------------|----------|----------|
+| R_50_FPN_s1x_legacy | s1x | 0.307 | 0.051 | 3.2 | 58.1 | 58.2 | 52.1 | 54.9 | 164832157 | model \| metrics |
+| R_101_FPN_s1x_legacy | s1x | 0.390 | 0.063 | 4.3 | 59.5 | 59.3 | 53.2 | 56.0 | 164832182 | model \| metrics |
+
+
+
+```
+python export.py configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x_legacy/164832157/model_final_d366fa.pkl
+```
+
+```
+python export.py configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x_legacy/164832182/model_final_10af0e.pkl
+```
+
+### Improved Baselines, Original Fully Convolutional Head
+
+These models use an improved training schedule and the Panoptic FPN head
+from [Kirillov et al, 2019](https://arxiv.org/abs/1901.02446).
+
+
+
+
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | segm AP | dp. AP GPS | dp. AP GPSm | model id | download |
+|------|----------|---------------------|-----------------------|----------------|--------|---------|------------|-------------|----------|----------|
+| R_50_FPN_s1x | s1x | 0.359 | 0.066 | 4.5 | 61.2 | 67.2 | 63.7 | 65.3 | 165712039 | model \| metrics |
+| R_101_FPN_s1x | s1x | 0.428 | 0.079 | 5.8 | 62.3 | 67.8 | 64.5 | 66.2 | 165712084 | model \| metrics |
+
+
+
+```
+python export.py configs/densepose_rcnn_R_50_FPN_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl
+```
+
+```
+python export.py configs/densepose_rcnn_R_101_FPN_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_s1x/165712084/model_final_c6ab63.pkl
+```
+
+### Improved Baselines, DeepLabV3 Head
+
+These models use an improved training schedule, the Panoptic FPN head
+from [Kirillov et al, 2019](https://arxiv.org/abs/1901.02446), and the DeepLabV3 head
+from [Chen et al, 2017](https://arxiv.org/abs/1706.05587).
+
+
+
+
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | segm AP | dp. AP GPS | dp. AP GPSm | model id | download |
+|------|----------|---------------------|-----------------------|----------------|--------|---------|------------|-------------|----------|----------|
+| R_50_FPN_DL_s1x | s1x | 0.392 | 0.070 | 6.7 | 61.1 | 68.3 | 65.6 | 66.7 | 165712097 | model \| metrics |
+| R_101_FPN_DL_s1x | s1x | 0.478 | 0.083 | 7.0 | 62.3 | 68.7 | 66.3 | 67.6 | 165712116 | model \| metrics |
+
+
+
+```
+python export.py configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_DL_s1x/165712097/model_final_0ed407.pkl
+```
+
+```
+python export.py configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_101_FPN_DL_s1x/165712116/model_final_844d15.pkl
+```
+
+### Citing DensePose
+
+If you use DensePose in your research, please cite:
+
+```
+@InProceedings{Guler2018DensePose,
+  title={DensePose: Dense Human Pose Estimation In The Wild},
+  author={R{\i}za Alp G{\"u}ler and Natalia Neverova and Iasonas Kokkinos},
+  booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year={2018}
+}
+```
\ No newline at end of file
diff --git a/detectron2/modeling/postprocessing.py b/detectron2/modeling/postprocessing.py
index d169658..8340014 100644
--- a/detectron2/modeling/postprocessing.py
+++ b/detectron2/modeling/postprocessing.py
@@ -47,12 +47,11 @@ def detector_postprocess(
output_boxes = results['pred_boxes']
scale_boxes(output_boxes, scale_x, scale_y)
- clip_boxes(output_boxes, results['image_size'])
keep = nonempty_boxes(output_boxes)
return {
'image_size': new_size,
- 'pred_boxes': output_boxes[keep],
+ 'pred_boxes': clip_boxes(output_boxes[keep], new_size),
'scores': results['scores'][keep],
'pred_classes': results['pred_classes'][keep],
'pred_densepose_coarse_segm': results['pred_densepose_coarse_segm'][keep],
diff --git a/export.py b/export.py
index c05119d..0f962cd 100644
--- a/export.py
+++ b/export.py
@@ -12,7 +12,7 @@ def main():
parser = argparse.ArgumentParser(description='Export DensePose model to TorchScript module')
parser.add_argument("cfg", type=str, help="Config file")
parser.add_argument("model", type=str, help="Model file")
- parser.add_argument("--min_score", default=0.8, type=float,
+ parser.add_argument("--min_score", default=0.3, type=float,
help="Minimum detection score to visualize")
parser.add_argument("--nms_thresh", metavar="", default=None, type=float,
help="NMS threshold")
diff --git a/run.py b/run.py
index 718bdb4..d82f10b 100644
--- a/run.py
+++ b/run.py
@@ -1,5 +1,6 @@
import argparse
import os
+from itertools import count
import cv2
import torch
@@ -11,23 +12,53 @@
parser.add_argument("model", type=str, help="Model file")
parser.add_argument("input", type=str, help="Input data")
parser.add_argument("--cpu", action="store_true", help="Only use CPU")
+parser.add_argument("--fp32", action="store_true", help="Only use FP32")
args = parser.parse_args()
-file_list = [args.input]
-img = cv2.imread(args.input)
-tensor = torch.from_numpy(img)
-
-visualizer = End2EndVisualizer(alpha=1.0, keep_bg=False)
-predictor = torch.jit.load(args.model)
+visualizer = End2EndVisualizer(alpha=.7, keep_bg=False)
+predictor = torch.jit.load(args.model).eval()
if torch.cuda.is_available() and not args.cpu:
- tensor = tensor.cuda()
+ device = torch.device("cuda")
predictor = predictor.cuda()
+ if args.fp32:
+ predictor = predictor.float()
+ else:
+ predictor = predictor.half()
else:
+ device = torch.device("cpu")
predictor = predictor.float()
-outputs = predictor(tensor)
-image_vis = visualizer.visualize(img, outputs)
-
save_path = "_pred".join(os.path.splitext(args.input))
-cv2.imwrite(save_path, image_vis)
-print(f"Image saved to {save_path}")
+if os.path.splitext(args.input)[1].lower() in [".jpg", ".png", ".jpeg", ".bmp", ".tif", ".tiff"]:
+ img = cv2.imread(args.input)
+ tensor = torch.from_numpy(img)
+
+ outputs = predictor(tensor)
+ image_vis = visualizer.visualize(img, outputs)
+
+ cv2.imwrite(save_path, image_vis)
+ print(f"Image saved to {save_path}")
+else:
+ cap = cv2.VideoCapture(args.input)
+ n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+ writer = None
+ try:
+ for i in count():
+ ret, frame = cap.read()
+ if not ret:
+ break
+ tensor = torch.from_numpy(frame)
+ outputs = predictor(tensor)
+ image_vis = visualizer.visualize(frame, outputs)
+ if writer is None:
+ writer = cv2.VideoWriter(
+                    save_path, cv2.VideoWriter_fourcc(*'mp4v'), cap.get(cv2.CAP_PROP_FPS) or 30, (image_vis.shape[1], image_vis.shape[0]))
+ writer.write(image_vis)
+ print(f"Frame {i + 1}/{n_frames} processed", end="\r")
+ except KeyboardInterrupt:
+ pass
+ if writer is not None:
+ writer.release()
+ print(f"Video saved to {save_path}")
+ else:
+ print("No frames processed")
diff --git a/visualizer.py b/visualizer.py
index d4e6532..83b63fd 100644
--- a/visualizer.py
+++ b/visualizer.py
@@ -21,8 +21,8 @@ def resample_uv_tensors_to_bbox(u: torch.Tensor, v: torch.Tensor, labels: torch.
x, y, w, h = box_xywh_abs
w = max(int(w), 1)
h = max(int(h), 1)
- u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False)
- v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False)
+ u_bbox = F.interpolate(u, (h, w), mode="bilinear", align_corners=False).float()
+ v_bbox = F.interpolate(v, (h, w), mode="bilinear", align_corners=False).float()
uv = torch.zeros([2, h, w], dtype=torch.float32, device=u.device)
for part_id in range(1, u_bbox.size(1)):
uv[0][labels == part_id] = u_bbox[0, part_id][labels == part_id]