Commit 42c512c

seanx92 authored and facebook-github-bot committed
add option for MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT (#2544)

Summary:
Pull Request resolved: #2544

The current GPU rebatching logic for _get_batching_hinted_output does not work for pooling_factor > 1, so we need to disable it for GPU models to avoid rebatching flattened feature lengths.

Reviewed By: PaulZhang12

Differential Revision: D65499768

fbshipit-source-id: a450efa87d9e2b3e7cab4a3fcdbd94f99214e4cd
1 parent 509b0d2 commit 42c512c
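
The new module attribute defaults to True, so rebatching via _get_batching_hinted_output stays on unless a model explicitly opts out. A minimal sketch of how a GPU model could disable it, assuming a quantized embedding module instance named qec (the instance name is illustrative; the constant is the one added in this diff):

from torchrec.quant.embedding_modules import (
    MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT,
)

# Opt a GPU model out of rebatching on the non-unflattened-lengths path;
# forward() reads this with getattr(self, ..., True), so an unset
# attribute keeps the previous behavior.
setattr(qec, MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT, False)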

File tree

1 file changed (+4, −1)

torchrec/quant/embedding_modules.py

Lines changed: 4 additions & 1 deletion
@@ -85,6 +85,8 @@
     "__use_unflattened_lengths_for_batching"
 )
 
+MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT: str = "__use_batching_hinted_output"
+
 DEFAULT_ROW_ALIGNMENT = 16
 
 

@@ -913,7 +915,8 @@ def forward(
             lengths = _get_unflattened_lengths(lengths, len(embedding_names))
             lookup = _get_batching_hinted_output(lengths=lengths, output=lookup)
         else:
-            lookup = _get_batching_hinted_output(lengths=lengths, output=lookup)
+            if getattr(self, MODULE_ATTR_USE_BATCHING_HINTED_OUTPUT, True):
+                lookup = _get_batching_hinted_output(lengths=lengths, output=lookup)
             lengths = _get_unflattened_lengths(lengths, len(embedding_names))
         jt = construct_jagged_tensors_inference(
             embeddings=lookup,
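
Note that only the _get_batching_hinted_output call is gated: the _get_unflattened_lengths call in the else branch still runs regardless of the attribute, so lengths are always unflattened before construct_jagged_tensors_inference. The flag only skips rebatching the lookup output, which per the summary mishandles pooling_factor > 1 on GPU.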
