fp8/pipeline: add profiler annotations for prepare/denoise/VAE decode

replicate · Oct 10, 2024 · 0039a42 · 0039a42
1 parent f676e86
commit 0039a42
Showing 1 changed file with 23 additions and 20 deletions.
diff --git a/fp8/flux_pipeline.py b/fp8/flux_pipeline.py
@@ -615,15 +615,16 @@ def generate(
         )
 
         # prepare inputs
-        img, img_ids, vec, txt, txt_ids = map(
-            lambda x: x, # x.contiguous(),
-            self.prepare(
-                img=img,
-                prompt=prompt,
-                target_device=self.device_flux,
-                target_dtype=self.dtype,
-            ),
-        )
+        with torch.profiler.record_function("prepare"):
+            img, img_ids, vec, txt, txt_ids = map(
+                lambda x: x, # x.contiguous(),
+                self.prepare(
+                    img=img,
+                    prompt=prompt,
+                    target_device=self.device_flux,
+                    target_dtype=self.dtype,
+                ),
+            )
 
         # dispatch to gpu if offloaded
         if self.offload_flow:
@@ -634,16 +635,17 @@ def generate(
         output_imgs = []
 
         for i in range(batch_size):
-            denoised_img = self.denoise_single_item(
-                img[i],
-                img_ids[i],
-                txt[i],
-                txt_ids[i],
-                vec[i],
-                timesteps,
-                guidance,
-                compiling
-            )
+            with torch.profiler.record_function("denoise-single-item"):
+                denoised_img = self.denoise_single_item(
+                    img[i],
+                    img_ids[i],
+                    txt[i],
+                    txt_ids[i],
+                    vec[i],
+                    timesteps,
+                    guidance,
+                    compiling
+                )
             output_imgs.append(denoised_img)
             compiling = False
 
@@ -655,7 +657,8 @@ def generate(
             torch.cuda.empty_cache()
 
         # decode latents to pixel space
-        img = self.vae_decode(img, height, width)
+        with torch.profiler.record_function("vae-decode"):
+            img = self.vae_decode(img, height, width)
 
         return self.as_img_tensor(img)