feat: use async vllm engine (only used in unit tests) #418

Merged
merged 15 commits on May 29, 2025
1 change: 1 addition & 0 deletions examples/configs/eval.yaml
@@ -15,6 +15,7 @@ generation:
   stop_token_ids: null
   stop_strings: null
   vllm_cfg:
+    async_engine: false
     precision: "bfloat16"
     tensor_parallel_size: 1
     gpu_memory_utilization: 0.9
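For reference, the `generation.vllm_cfg` section of `examples/configs/eval.yaml` after this change would read as follows (assembled from the hunk above; indentation reconstructed, keys outside the hunk elided):

```yaml
generation:
  stop_token_ids: null
  stop_strings: null
  vllm_cfg:
    async_engine: false   # flag introduced by this PR; off by default
    precision: "bfloat16"
    tensor_parallel_size: 1
    gpu_memory_utilization: 0.9
```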
1 change: 1 addition & 0 deletions examples/configs/grpo-deepscaler-1.5b-8K.yaml
@@ -90,6 +90,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
1 change: 1 addition & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -101,6 +101,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      async_engine: false # Only for internal testing, will be enabled by https://github.com/NVIDIA/NeMo-RL/issues/447.
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
1 change: 1 addition & 0 deletions examples/configs/grpo_sliding_puzzle.yaml
@@ -30,6 +30,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: ${policy.max_total_sequence_length}
@@ -83,6 +83,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
       - 128009
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
       - 128009
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
       - 151643
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
       - 151643
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
@@ -81,6 +81,7 @@ policy:
       - 151645
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
       - 151645
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
@@ -84,6 +84,7 @@ policy:
       - 151645
     stop_strings: null
     vllm_cfg:
+      async_engine: false
       precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
29 changes: 26 additions & 3 deletions nemo_rl/distributed/worker_groups.py
@@ -49,13 +49,36 @@ def get_results(self, worker_group: "RayWorkerGroup") -> list[Any]:
        worker group each worker belongs to, then selects only the first result from each group.

        Args:
            worker_group: The RayWorkerGroup that spawned the futures. The
                mapping contained in worker_group.worker_to_tied_group_index
                is required for the deduplication path.

        Returns:
            List of results, deduplicated by tied workers if respect_tied_workers is True.
        """
        from ray._raylet import ObjectRef, ObjectRefGenerator

        # Flatten futures into a flat list of ObjectRefs.
        object_refs: list[ObjectRef] = []

        has_generator = False

        for fut in self.futures:
            if isinstance(fut, ObjectRefGenerator):
                # ray.get cannot be called directly on the generator object; it
                # must be iterated to obtain the individual ObjectRef instances.
                for generated_ref in fut:
                    object_refs.append(generated_ref)
                has_generator = True
            else:
                object_refs.append(fut)

        # Retrieve the concrete results.
        all_results = ray.get(object_refs)

        # If an expanded generator was present, we are in streaming mode:
        # every ObjectRef now corresponds to a unique, ordered chunk of data.
        if has_generator:
            return all_results

        if self.return_from_workers is not None:
            if self.called_workers is not None:
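The generator-flattening logic added to `get_results` can be sketched in isolation. In this standalone sketch, plain Python generators stand in for Ray's `ObjectRefGenerator` and strings stand in for `ObjectRef`s; `flatten_futures` and `stream` are hypothetical names for illustration, not part of the PR:

```python
from types import GeneratorType


def flatten_futures(futures):
    """Flatten a mix of plain refs and generators into one ordered list.

    Mirrors the shape of the PR's loop: generators are iterated (their
    items are appended in order), plain refs are appended as-is, and a
    flag records whether any generator was seen (streaming mode).
    """
    refs = []
    has_generator = False
    for fut in futures:
        if isinstance(fut, GeneratorType):
            # A generator cannot be resolved directly; iterate it to
            # obtain the individual items first.
            refs.extend(fut)
            has_generator = True
        else:
            refs.append(fut)
    return refs, has_generator


def stream():
    yield "chunk-0"
    yield "chunk-1"


refs, streaming = flatten_futures(["ref-a", stream(), "ref-b"])
print(refs)       # ['ref-a', 'chunk-0', 'chunk-1', 'ref-b']
print(streaming)  # True
```

Note that ordering is preserved: items produced by a generator slot into the flat list exactly where the generator appeared, which is what lets the streaming path return the flattened results directly.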