From 4e80df1e58901557e2824ce3b488d30209a9be33 Mon Sep 17 00:00:00 2001
From: Ly Cao <lycao@meta.com>
Date: Wed, 4 Oct 2023 08:17:29 -0700
Subject: [PATCH 1/4] Move inputs + model to GPU

Summary:
- Be conservative and say 94243.8571429/18355 ~ 5X perf improvement
- Add type hints for some methods to save time figuring out what methods can be used
- As titled
- Example log: P845438828

Differential Revision: D49662923

fbshipit-source-id: 5d7accc97ff58dc1fd586b921889520163fe72c3
---
 detectron2/engine/defaults.py | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/detectron2/engine/defaults.py b/detectron2/engine/defaults.py
index 5b95257455..ff5625ae86 100644
--- a/detectron2/engine/defaults.py
+++ b/detectron2/engine/defaults.py
@@ -312,8 +312,10 @@ def __call__(self, original_image):
             height, width = original_image.shape[:2]
             image = self.aug.get_transform(original_image).apply_image(original_image)
             image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+            image = image.to(self.cfg.MODEL.DEVICE)  # .to() returns a new tensor
 
             inputs = {"image": image, "height": height, "width": width}
+
             predictions = self.model([inputs])[0]
             return predictions
 

From 898507047cf441a1e4be7a729270961c401c4354 Mon Sep 17 00:00:00 2001
From: Alan Li <alansli@meta.com>
Date: Mon, 16 Oct 2023 09:45:42 -0700
Subject: [PATCH 2/4] Fix strict ordering checks

Summary:
Fixing strict weak ordering checks as per this post: https://fb.workplace.com/groups/fbcode/permalink/2185586471478180/

Unblocking D48814560
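Replaced the `std::sort` call in the CPU path of `convex_hull_graham` with a hand-written in-place sort based on the same cross-product and distance-to-origin criteria.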

Reviewed By: bunnypak

Differential Revision: D50251104

fbshipit-source-id: 24c0e7574cdba79853cde4a898719121163b2794
---
 .../box_iou_rotated/box_iou_rotated_utils.h   | 39 ++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
index b54a5dde2c..bc6967a768 100644
--- a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
+++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
@@ -225,19 +225,40 @@ HOST_DEVICE_INLINE int convex_hull_graham(
   }
 #else
   // CPU version
-  std::sort(
-      q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
-        T temp = cross_2d<T>(A, B);
-        if (fabs(temp) < 1e-6) {
-          return dot_2d<T>(A, A) < dot_2d<T>(B, B);
-        } else {
-          return temp > 0;
-        }
-      });
+  // std::sort(
+  //     q + 1, q + num_in, [](const Point<T>& A, const Point<T>& B) -> bool {
+  //       T temp = cross_2d<T>(A, B);
+
+  // if (fabs(temp) < 1e-6) {
+  //   return dot_2d<T>(A, A) < dot_2d<T>(B, B);
+  // } else {
+  //   return temp > 0;
+  // }
+  // });
+  for (int i = 0; i < num_in; i++) {
+    dist[i] = dot_2d<T>(q[i], q[i]);
+  }
+
+  for (int i = 1; i < num_in - 1; i++) {
+    for (int j = i + 1; j < num_in; j++) {
+      T crossProduct = cross_2d<T>(q[i], q[j]);
+      if ((crossProduct < -1e-6) ||
+          (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+        auto q_tmp = q[i];
+        q[i] = q[j];
+        q[j] = q_tmp;
+        auto dist_tmp = dist[i];
+        dist[i] = dist[j];
+        dist[j] = dist_tmp;
+      }
+    }
+  }
+
   // compute distance to origin after sort, since the points are now different.
   for (int i = 0; i < num_in; i++) {
     dist[i] = dot_2d<T>(q[i], q[i]);
   }
+
 #endif
 
   // Step 4:

From 337ca3490fa7879ceeeadf6c2b73d67504ff4b4f Mon Sep 17 00:00:00 2001
From: Gabriele <34710621+Gabrysse@users.noreply.github.com>
Date: Wed, 1 Nov 2023 08:55:51 -0700
Subject: [PATCH 3/4] Bug fix with prefetch_factor

Summary:
Fix https://github.com/facebookresearch/detectron2/issues/5086.
The PyTorch DataLoader class can accept "None" as prefetch_factor in recent versions (> 2.0). For backward compatibility, it is better to set a default value, specifically 2, as in previous PyTorch versions.
Looking at the more recent DataLoader source code, it sets the value 2 if prefetch_factor is found to be None.

Pull Request resolved: https://github.com/facebookresearch/detectron2/pull/5091

Reviewed By: wat3rBro

Differential Revision: D50693761

Pulled By: ezyang

fbshipit-source-id: 479ec794009be9e95d27c401143a88dcd45a6eff
---
 detectron2/data/build.py | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/detectron2/data/build.py b/detectron2/data/build.py
index 5d23f616b4..87b4a0050b 100644
--- a/detectron2/data/build.py
+++ b/detectron2/data/build.py
@@ -300,9 +300,7 @@ def build_batch_data_loader(
     num_workers=0,
     collate_fn=None,
     drop_last: bool = True,
-    prefetch_factor=None,
-    persistent_workers=False,
-    pin_memory=False,
+    **kwargs,
 ):
     """
     Build a batched dataloader. The main differences from `torch.utils.data.DataLoader` are:
@@ -341,9 +339,7 @@ def build_batch_data_loader(
             num_workers=num_workers,
             collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
             worker_init_fn=worker_init_reset_seed,
-            prefetch_factor=prefetch_factor,
-            persistent_workers=persistent_workers,
-            pin_memory=pin_memory,
+            **kwargs
         )  # yield individual mapped dict
         data_loader = AspectRatioGroupedDataset(data_loader, batch_size)
         if collate_fn is None:
@@ -357,9 +353,7 @@ def build_batch_data_loader(
             num_workers=num_workers,
             collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
             worker_init_fn=worker_init_reset_seed,
-            prefetch_factor=prefetch_factor,
-            persistent_workers=persistent_workers,
-            pin_memory=pin_memory,
+            **kwargs
         )
 
 
@@ -499,9 +493,7 @@ def build_detection_train_loader(
     aspect_ratio_grouping=True,
     num_workers=0,
     collate_fn=None,
-    prefetch_factor=None,
-    persistent_workers=False,
-    pin_memory=False,
+    **kwargs
 ):
     """
     Build a dataloader for object detection with some default features.
@@ -553,9 +545,7 @@ def build_detection_train_loader(
         aspect_ratio_grouping=aspect_ratio_grouping,
         num_workers=num_workers,
         collate_fn=collate_fn,
-        prefetch_factor=prefetch_factor,
-        persistent_workers=persistent_workers,
-        pin_memory=pin_memory,
+        **kwargs
     )
 
 

From 017abbfa5f2c2a2afa045200c2af9ccf2fc6227f Mon Sep 17 00:00:00 2001
From: Gary Zheng <dyz@meta.com>
Date: Mon, 6 Nov 2023 12:14:35 -0800
Subject: [PATCH 4/4] Log true batch size

Summary:
The true batch size was hard to determine because of batch-size auto scaling.

Fix: log the computed `batch_size` (`total_batch_size // world_size`) when building the batched data loader.

Reviewed By: tglik

Differential Revision: D50995954

fbshipit-source-id: f499bbfccbd8a7f96821e0344e82c2e8f075bcc6
---
 detectron2/data/build.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/detectron2/data/build.py b/detectron2/data/build.py
index 87b4a0050b..4ab982ec59 100644
--- a/detectron2/data/build.py
+++ b/detectron2/data/build.py
@@ -326,6 +326,8 @@ def build_batch_data_loader(
         total_batch_size, world_size
     )
     batch_size = total_batch_size // world_size
+    logger = logging.getLogger(__name__)
+    logger.info("Making batched data loader with batch_size=%d", batch_size)
 
     if isinstance(dataset, torchdata.IterableDataset):
         assert sampler is None, "sampler must be None if dataset is IterableDataset"