Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate api calls for secondary summaries #2366

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions per/migrations/0124_alter_opslearningpromptresponsecache_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Generated by Django 4.2.17 on 2025-01-03 03:07

from django.db import migrations, models


class Migration(migrations.Migration):
    """Redefine the ``type`` field of ``opslearningpromptresponsecache``.

    The field is altered to an ``IntegerField`` whose choices are
    Primary (1), Secondary (2), Sector (3) and Component (4).
    """

    # Applied on top of the preceding migration of the "per" app.
    dependencies = [("per", "0123_alter_perdocumentupload_file_alter_perfile_file")]

    operations = [
        migrations.AlterField(
            model_name="opslearningpromptresponsecache",
            name="type",
            field=models.IntegerField(
                verbose_name="type",
                choices=[
                    (1, "Primary"),
                    (2, "Secondary"),
                    (3, "Sector"),
                    (4, "Component"),
                ],
            ),
        ),
    ]
2 changes: 2 additions & 0 deletions per/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,8 @@ class OpsLearningPromptResponseCache(models.Model):
# Integer-backed choices identifying which kind of prompt a cached response
# belongs to; the value is stored in the model's ``type`` field.
class PromptType(models.IntegerChoices):
PRIMARY = 1, _("Primary")
SECONDARY = 2, _("Secondary")
# Sector and component summaries are requested through separate prompts,
# so each gets its own type value.
SECTOR = 3, _("Sector")
COMPONENT = 4, _("Component")

prompt_hash = models.CharField(verbose_name=_("used prompt hash"), max_length=32)
prompt = models.TextField(verbose_name=_("used prompt"), null=True, blank=True)
Expand Down
130 changes: 86 additions & 44 deletions per/ops_learning_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class OpsLearningSummaryTask:
MIN_DIF_EXCERPTS = 3

primary_prompt = (
"Please aggregate and summarize the provided data into UP TO THREE structured paragraphs.\n"
"\n Please aggregate and summarize the provided data into UP TO THREE structured paragraphs.\n"
"The output MUST strictly adhere to the format below:\n"
"- *Title*: Each finding should begin with the main finding TITLE in bold.\n"
"Should be a high level summary of the finding below. "
Expand Down Expand Up @@ -87,11 +87,32 @@ class OpsLearningSummaryTask:
'"contradictory reports": "..."}'
)

secondary_prompt = (
"Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
component_prompt = (
"\n Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
"The output SHOULD ALWAYS follow the format below:\n"
"- *Type*: Whether the paragraph is related to a 'sector' or a 'component'\n"
"- *Subtype*: Provides the name of the sector or of the component to which the paragraph refers.\n"
"- *Type*: 'component'\n"
"- *Subtype*: Provides the name of the component to which the paragraph refers.\n"
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary.\n"
"*Content*: A short summary aggregating findings related to the Subtype, "
"so that they are supported by evidence coming from more than one report, "
"and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports "
"it from the data available from multiples reports or items, include year and country of the evidence. "
"The length of each paragraph MUST be between 20 and 30 words.\n"
" Important:\n\n"
"- ONLY create one summary per subtype\n"
"- DO NOT mention the ids of the excerpts in the content of the summary.\n"
"- DO NOT use data from any source other than the one provided.\n\n"
"Output Format:\n"
"Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY.\n"
'{"0": {"type": "component", "subtype": "Information Management", "excerpts id":"23, 235", "content": "lorem ipsum"}, '
'"1": {"type": "component", "subtype": "Logistics", "excerpts id":"45, 678", "content": "lorem ipsum"}}'
)

sector_prompt = (
"\n Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). \n "
"The output SHOULD ALWAYS follow the format below:\n"
"- *Type*: 'sector'\n"
"- *Subtype*: Provides the name of the sector to which the paragraph refers.\n"
"- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary.\n"
"*Content*: A short summary aggregating findings related to the Subtype, "
"so that they are supported by evidence coming from more than one report, "
Expand All @@ -105,8 +126,7 @@ class OpsLearningSummaryTask:
"Output Format:\n"
"Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY.\n"
'{"0": {"type": "sector", "subtype": "shelter", "excerpts id":"43, 1375, 14543", "content": "lorem ipsum"}, '
'"1": {"type": "component", "subtype": "Information Management", "excerpts id":"23, 235", "content": "lorem ipsum"}, '
'"2": {"type": "sector", "subtype": "WASH", "excerpts id":"30, 40", "content": "lorem ipsum"}}'
'"1": {"type": "sector", "subtype": "WASH", "excerpts id":"30, 40", "content": "lorem ipsum"}}'
)

system_message = (
Expand Down Expand Up @@ -686,44 +706,61 @@ def process_learnings_component(component, df, max_length_per_section):
)
return learnings_component

def _build_data_section(secondary_df: pd.DataFrame):
# Secondary learnings section
sectors = get_main_sectors(secondary_df)
def _build_component_data_section(secondary_df: pd.DataFrame):
# Component learnings section
components = get_main_components(secondary_df)
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT

if (len(sectors) + len(components)) > 0:
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT / (len(components) + len(sectors))
if len(components) > 0:
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT / len(components)

learnings_sectors = (
learnings_components = (
"\n----------------\n\n"
+ "TYPE: SECTORS"
+ "TYPE: COMPONENT"
+ "\n----------------\n".join(
[process_learnings_sector(x, secondary_df, max_length_per_section) for x in sectors if pd.notna(x)]
[process_learnings_component(x, secondary_df, max_length_per_section) for x in components if pd.notna(x)]
)
)
learnings_components = (
secondary_learnings_data = learnings_components
return secondary_learnings_data

def _build_sector_data_section(secondary_df: pd.DataFrame):
# Sector learnings section
sectors = get_main_sectors(secondary_df)
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT

if len(sectors) > 0:
max_length_per_section = cls.PROMPT_DATA_LENGTH_LIMIT / len(sectors)

learnings_sectors = (
"\n----------------\n\n"
+ "TYPE: COMPONENT"
+ "TYPE: SECTORS"
+ "\n----------------\n".join(
[process_learnings_component(x, secondary_df, max_length_per_section) for x in components if pd.notna(x)]
[process_learnings_sector(x, secondary_df, max_length_per_section) for x in sectors if pd.notna(x)]
)
)
secondary_learnings_data = learnings_sectors + learnings_components
secondary_learnings_data = learnings_sectors
return secondary_learnings_data

# Prompt intro section
prompt_intro = cls._build_intro_section()
secondary_prompt_instruction = cls._build_instruction_section(
filter_data, secondary_learning_df, cls.secondary_instruction_prompt
)
secondary_learnings_data = _build_data_section(secondary_learning_df)

# Sector Prompt and Data
sector_prompt_instruction = cls._build_instruction_section(filter_data, secondary_learning_df, cls.sector_prompt)
sector_learning_data = _build_sector_data_section(secondary_learning_df)

# Components Prompt and Data
component_prompt_instruction = cls._build_instruction_section(filter_data, secondary_learning_df, cls.component_prompt)
component_learning_data = _build_component_data_section(secondary_learning_df)

# format the prompts
secondary_learning_prompt = "".join(
[prompt_intro, secondary_prompt_instruction, secondary_learnings_data, cls.secondary_prompt]
sector_learning_prompt = "".join([prompt_intro, sector_prompt_instruction, sector_learning_data, cls.sector_prompt])
# The component prompt must end with the component-specific output
# instructions (``cls.component_prompt``). Appending ``cls.sector_prompt``
# here would ask the model for "sector"-typed output on component data.
component_learning_prompt = "".join(
    [prompt_intro, component_prompt_instruction, component_learning_data, cls.component_prompt]
)

logger.info("Secondary Prompt formatted.")
return secondary_learning_prompt
return sector_learning_prompt, component_learning_prompt

@classmethod
def generate_summary(cls, prompt, type: OpsLearningPromptResponseCache.PromptType) -> dict:
Expand Down Expand Up @@ -849,8 +886,10 @@ def _modify_summary(summary: dict) -> dict:

@classmethod
def _get_or_create_summary(
cls, prompt: str, prompt_hash: str, type: OpsLearningPromptResponseCache.PromptType, overwrite_prompt_cache: bool = False
cls, prompt: str, type: OpsLearningPromptResponseCache.PromptType, overwrite_prompt_cache: bool = False
) -> dict:
"""Retrieves or Generates the summary based on the provided prompt."""
prompt_hash = OpslearningSummaryCacheHelper.generate_hash(prompt)
instance, created = OpsLearningPromptResponseCache.objects.update_or_create(
prompt_hash=prompt_hash,
type=type,
Expand Down Expand Up @@ -952,13 +991,9 @@ def get_or_create_primary_summary(
"""Retrieves or Generates the primary summary based on the provided prompt."""
logger.info("Retrieving or generating primary summary.")

# generating hash for primary prompt
primary_prompt_hash = OpslearningSummaryCacheHelper.generate_hash(primary_learning_prompt)

# Checking the response for primary prompt
primary_summary = cls._get_or_create_summary(
prompt=primary_learning_prompt,
prompt_hash=primary_prompt_hash,
type=OpsLearningPromptResponseCache.PromptType.PRIMARY,
overwrite_prompt_cache=overwrite_prompt_cache,
)
Expand All @@ -981,30 +1016,37 @@ def get_or_create_primary_summary(
def get_or_create_secondary_summary(
cls,
ops_learning_summary_instance: OpsLearningCacheResponse,
secondary_learning_prompt: str,
sector_learning_prompt: str,
component_learning_prompt: str,
overwrite_prompt_cache: bool = False,
):
"""Retrieves or Generates the summary based on the provided prompts."""
logger.info("Retrieving or generating secondary summary.")

# generating hash for secondary prompt
secondary_prompt_hash = OpslearningSummaryCacheHelper.generate_hash(secondary_learning_prompt)

# Checking the response for secondary prompt
secondary_summary = cls._get_or_create_summary(
prompt=secondary_learning_prompt,
prompt_hash=secondary_prompt_hash,
type=OpsLearningPromptResponseCache.PromptType.SECONDARY,
overwrite_prompt_cache=overwrite_prompt_cache,
)
if overwrite_prompt_cache:
logger.info("Clearing the cache for secondary summary.")
# NOTE: find a better way to update the cache
OpsLearningComponentCacheResponse.objects.filter(filter_response=ops_learning_summary_instance).delete()
OpsLearningSectorCacheResponse.objects.filter(filter_response=ops_learning_summary_instance).delete()

# Saving into the database
# Checking the response for sector prompt
sector_summary = cls._get_or_create_summary(
prompt=sector_learning_prompt,
type=OpsLearningPromptResponseCache.PromptType.SECTOR,
overwrite_prompt_cache=overwrite_prompt_cache,
)
cls.secondary_response_save_to_db(
ops_learning_summary_instance=ops_learning_summary_instance,
secondary_summary=sector_summary,
)

# Checking the response for component prompt
component_summary = cls._get_or_create_summary(
prompt=component_learning_prompt,
type=OpsLearningPromptResponseCache.PromptType.COMPONENT,
overwrite_prompt_cache=overwrite_prompt_cache,
)
cls.secondary_response_save_to_db(
ops_learning_summary_instance=ops_learning_summary_instance,
secondary_summary=secondary_summary,
secondary_summary=component_summary,
)
9 changes: 7 additions & 2 deletions per/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,17 @@ def generate_ops_learning_summary(ops_learning_summary_id: int, filter_data: dic

# Prioritize excerpts for secondary insights
secondary_learning_df = OpsLearningSummaryTask.seconday_prioritize_excerpts(prioritized_learnings)

# Format secondary prompt
secondary_learning_prompt = OpsLearningSummaryTask.format_secondary_prompt(secondary_learning_df, filter_data)
sector_learning_prompt, component_learning_prompt = OpsLearningSummaryTask.format_secondary_prompt(
secondary_learning_df=secondary_learning_df, filter_data=filter_data
)

# Generate secondary summary
OpsLearningSummaryTask.get_or_create_secondary_summary(
ops_learning_summary_instance=ops_learning_summary_instance,
secondary_learning_prompt=secondary_learning_prompt,
sector_learning_prompt=sector_learning_prompt,
component_learning_prompt=component_learning_prompt,
overwrite_prompt_cache=overwrite_prompt_cache,
)

Expand Down
Loading