From 15451838d3f6dbcf4dae8d8d5da3c58b3ad8f378 Mon Sep 17 00:00:00 2001
From: Ivan Cheung
Date: Mon, 2 Dec 2024 00:14:21 -0500
Subject: [PATCH 1/2] Added configurable reduction step after longform chunk
 transcript generation

---
 podcastfy/content_generator.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py
index f3cfd91..d688151 100644
--- a/podcastfy/content_generator.py
+++ b/podcastfy/content_generator.py
@@ -14,7 +14,7 @@
 from langchain_community.chat_models import ChatLiteLLM
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_community.llms.llamafile import Llamafile
-from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain import hub
 from podcastfy.utils.config_conversation import load_conversation_config
@@ -503,6 +503,7 @@ def clean(self,
         # Then apply additional long-form specific cleaning
         return self._clean_transcript_response(standard_clean, config)
 
+
     def _clean_transcript_response(self, transcript: str, config: Dict[str, Any]) -> str:
         """
         Clean transcript using a two-step process with LLM-based cleaning.
@@ -522,7 +523,28 @@ def _clean_transcript_response(self, transcript: str, config: Dict[str, Any]) ->
         """
         logger.debug("Starting transcript cleaning process")
-        final_transcript = self._fix_alternating_tags(transcript)
+        # Run rewriting chain
+        llm = self.llm
+        rewrite_prompt = PromptTemplate(
+            input_variables=["transcript"],
+            template=config.get("rewrite_prompt_template", "Clean and improve this podcast transcript by deduping any repeated sections and improving conversational flow. Just output the improved conversation in the same format and nothing else. Do not add or omit any information.: \n\n{transcript}")
+        )
+        logger.debug("Executing rewriting chain")
+        rewrite_chain = rewrite_prompt | llm | StrOutputParser()
+
+        try:
+            rewritten_response = rewrite_chain.invoke({"transcript": transcript})
+            if not rewritten_response:
+                logger.warning("Rewriting chain returned empty response")
+                # Fall back to original
+                rewritten_response = transcript
+            logger.debug("Successfully rewrote transcript")
+            logger.debug("Successfully rewrote transcript, BEFORE = ", transcript, "AFTER = ", rewritten_response)
+        except Exception as e:
+            logger.error(f"Error in rewriting chain: {str(e)}")
+            rewritten_response = transcript  # Fall back to original
+
+        final_transcript = self._fix_alternating_tags(rewritten_response)
 
         logger.debug("Completed transcript cleaning process")

From 4526958ef98500ab3297e26b69b750cbbcbdd0f6 Mon Sep 17 00:00:00 2001
From: Ivan Cheung
Date: Mon, 2 Dec 2024 14:41:59 -0500
Subject: [PATCH 2/2] Added extra analysis step before rewriting

---
 podcastfy/content_generator.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py
index d688151..aa1454d 100644
--- a/podcastfy/content_generator.py
+++ b/podcastfy/content_generator.py
@@ -525,21 +525,33 @@ def _clean_transcript_response(self, transcript: str, config: Dict[str, Any]) ->
         # Run rewriting chain
         llm = self.llm
-        rewrite_prompt = PromptTemplate(
+
+        analysis_prompt = PromptTemplate(
             input_variables=["transcript"],
-            template=config.get("rewrite_prompt_template", "Clean and improve this podcast transcript by deduping any repeated sections and improving conversational flow. Just output the improved conversation in the same format and nothing else. Do not add or omit any information.: \n\n{transcript}")
+            template=config.get("analysis_prompt_template", "You are a podcast editor. Analyze this podcast transcript and identify duplicated/repeated lines and recommendations to improve flow. Do not remove too many facts or add any new facts: \n\n{transcript} \n\nAnalysis (bullet-points, with line numbers referring to problematic lines.):")
+        )
+        analysis_chain = analysis_prompt | llm | StrOutputParser()
+
+        rewrite_prompt = PromptTemplate(
+            input_variables=["transcript", "analysis"],
+            template=config.get("rewrite_prompt_template", "Rewrite the podcast transcript by applying only the following recommendations. Refrain from shortening the transcript too much.\n\nRecommendations: \n\n{analysis}\n\nOriginal Transcript: \n\n{transcript}\n\nRewritten Transcript:")
         )
-        logger.debug("Executing rewriting chain")
         rewrite_chain = rewrite_prompt | llm | StrOutputParser()
 
         try:
-            rewritten_response = rewrite_chain.invoke({"transcript": transcript})
+            logger.debug("Executing analysis chain")
+            analysis = analysis_chain.invoke({"transcript": transcript})
+            logger.debug(f"Successfully analyzed transcript: \n\n{analysis}")
+
+            logger.debug("Executing rewriting chain")
+            rewritten_response = rewrite_chain.invoke({"analysis": analysis, "transcript": transcript})
             if not rewritten_response:
                 logger.warning("Rewriting chain returned empty response")
                 # Fall back to original
                 rewritten_response = transcript
             logger.debug("Successfully rewrote transcript")
-            logger.debug("Successfully rewrote transcript, BEFORE = ", transcript, "AFTER = ", rewritten_response)
+            logger.debug(f"Successfully rewrote transcript, BEFORE = \n\n{transcript}")
+            logger.debug(f"Successfully rewrote transcript, AFTER = \n\n{rewritten_response}")
         except Exception as e:
             logger.error(f"Error in rewriting chain: {str(e)}")
             rewritten_response = transcript  # Fall back to original