diff --git a/main/settings.py b/main/settings.py index f040a1143..12de49711 100644 --- a/main/settings.py +++ b/main/settings.py @@ -113,6 +113,10 @@ NS_DOCUMENT_API_KEY=(str, None), NS_INITIATIVES_API_KEY=(str, None), NS_INITIATIVES_API_TOKEN=(str, None), + # OpenAi Azure + AZURE_OPENAI_ENDPOINT=(str, None), + AZURE_OPENAI_KEY=(str, None), + AZURE_OPENAI_DEPLOYMENT_NAME=(str, None), ) diff --git a/per/cache.py b/per/cache.py index ba0403f4c..4181c9064 100644 --- a/per/cache.py +++ b/per/cache.py @@ -4,6 +4,7 @@ import django_filters from django.core.serializers.json import DjangoJSONEncoder +from django.db import transaction from per.models import OpsLearningCacheResponse from per.task import generate_summary @@ -51,7 +52,5 @@ def get_or_create( ops_learning_summary = OpsLearningCacheResponse.objects.filter(used_filters_hash=hash_value).first() if ops_learning_summary: return ops_learning_summary - # TODO: Create a new summary based on the filters - # returning a dummy object for now - # return OpsLearningCacheResponse.objects.first() - return generate_summary(filter_data, hash_value) + # Create a new summary and cache it + return transaction.on_commit(lambda: generate_summary.delay(filter_data, hash_value)) diff --git a/per/drf_views.py b/per/drf_views.py index d288cdbdd..8676d2b9f 100644 --- a/per/drf_views.py +++ b/per/drf_views.py @@ -810,16 +810,16 @@ def get_renderer_context(self): return context - @extend_schema(filters=True) + @extend_schema(response=OpsLearningSummarySerializer, filters=True) @action( detail=False, methods=["GET"], - permission_classes=[permissions.IsAuthenticated], + permission_classes=[permissions.AllowAny], url_path="summary", ) def summary(self, request): """ - Returns a summary of the OpsLearning data + Get the Ops Learning Summary based on the filters """ ops_learning_summary_instance = OpslearningSummaryCacheHelper.get_or_create(request, [self.filterset_class]) return response.Response(OpsLearningSummarySerializer(ops_learning_summary_instance).data) diff --git a/per/ops_learning_summary.py b/per/ops_learning_summary.py new file mode 100644 index 000000000..4c7ad38d2 --- /dev/null +++ b/per/ops_learning_summary.py @@ -0,0 +1,591 @@ +import ast +import os +import typing +from itertools import chain + +import pandas as pd +import tiktoken +from django.db.models import F +from openai import AzureOpenAI + +from api.logger import logger +from api.models import Country +from per.models import FormPrioritization, OpsLearning, Overview + + +class OpsLearningSummaryTask: + + PROMPT_DATA_LENGTH_LIMIT = 5000 + PROMPT_LENGTH_LIMIT = 7500 + ENCODING_NAME = "cl100k_base" + + MIN_DIF_COMPONENTS = 3 + MIN_DIF_EXCERPTS = 3 + + primary_prompt = ( + "Please aggregate and summarize the provided data into UP TO THREE structured paragraphs. " + "The output MUST strictly adhere to the format below: " + "Title: Each finding should begin with the main finding TITLE in bold. " + "Content: Aggregate findings so that they are supported by evidence from more than one report. " + "Always integrate evidence from multiple reports or items into the paragraph, and " + "include the year and country of the evidence. " + "Confidence Level: For each finding, based on the number of items/reports connected to the finding, " + "assign a score from 1 to 5 where 1 is the lowest and 5 is the highest. " + "The format should be 'Confidence level: #/5' (e.g., 'Confidence level: 4/5'). " + "At the end of the summary, please highlight any contradictory country reports. 
" + "DO NOT use data from any source other than the one provided. Provide your answer in JSON form. " + "Reply with only the answer in valid JSON form and include no other commentary: " + "Example: " + '{"0": {"title": "Flexible and Adaptive Response Planning", ' + '"content": "Responses in Honduras, Peru, Ecuador, and Panama highlight the importance of adaptable strategies. ' + "The shift from youth-focused MHPSS to inclusive care in Peru in 2021, the pivot from sanitation infrastructure " + "to direct aid in Ecuador in 2022, and the responsive livelihood support in Panama in 2020, " + "all underscore the need for continuous reassessment and agile adaptation to the complex, " + 'changing needs of disaster-affected communities.", "confidence level": "xxx"}, ' + '"1": {"title": "xxx", "content": "xxx", "confidence level": "xxx"}, ' + '"2": {"title": "xxx", "content": "xxx", "confidence level": "xxx"}, ' + '"contradictory reports": "xxx"}' + ) + + secondary_prompt = ( + "Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). " + "The output SHOULD ALWAYS follow the format below: " + "Type: Whether the paragraph is related to a 'sector' or a 'component'. " + "Subtype: Provides the name of the sector or of the component to which the paragraph refers. " + "Content: A short summary aggregating findings related to the Subtype, so that they are supported by " + "evidence coming from more than one report, " + "and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports it " + "from the data available from multiple reports or items, " + "include year and country of the evidence. DO NOT use data from any source other than the " + "one provided. Provide your answer in JSON form. " + "Reply with ONLY the answer in valid JSON form and include NO OTHER COMMENTARY: " + '{"0": {"type": "sector", "subtype": "shelter", "content": "lorem ipsum"}, ' + '"1": {"type": "component", "subtype": "Information Management (IM)", "content": "lorem ipsum"}, ' + '"2": {"type": "sector", "subtype": "WASH", "content": "lorem ipsum"}}' + ) + + system_message = ( + "# CONTEXT # I want to summarize a set of lessons learned from a set of past emergency response operations " + "to extract the most useful and actionable insights." + "# STYLE # Use a writing style that is professional but informal." + "# TONE # Encouraging and motivating." + "# AUDIENCE # The audience is emergency response personnel from the Red Cross and Red Crescent. " + "They are action-oriented people who have very little time so they need concise, " + "not obvious information that can be easily consumed and acted upon in the time of a response." + ) + + primary_instruction_prompt = ( + "You should:" + "1. Describe, Summarize and Compare: Identify and detail the who, what, where, when and how many." + "2. Explain and Connect: Analyze why events happened and how they are related" + "3. Identify gaps: Assess what data is available, what is missing and potential biases" + "4. Identify key messages: Determine important stories and signals hidden in the data" + "5. Select top three: Select up to three findings to report" + ) + + secondary_instruction_prompt = ( + "You should for each section in the data (TYPE & SUBTYPE combination):" + "1. Describe, Summarize and Compare: Identify and detail the who, what, where, when and how many." + "2. Explain and Connect: Analyze why events happened and how they are related" + "3. 
Identify gaps: Assess what data is available, what is missing and potential biases" + "4. Identify key messages: Determine if there are important stories and signals hidden in the data" + "5. Conclude and make your case" + ) + + client = AzureOpenAI( + azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"), api_key=os.getenv("AZURE_OPENAI_API_KEY"), api_version="2023-05-15" + ) + + def count_tokens(string, encoding_name): + """Returns the number of tokens in a text string.""" + encoding = tiktoken.get_encoding(encoding_name) + return len(encoding.encode(string)) + + @classmethod + def fetch_ops_learnings(self, filter_data): + """Fetches the OPS learnings from the database.""" + ops_learning_qs = OpsLearning.objects.all() + from per.drf_views import OpsLearningFilter + + ops_learning_filtered_qs = OpsLearningFilter(filter_data, queryset=ops_learning_qs).qs + ops_learning_df = pd.DataFrame( + list( + ops_learning_filtered_qs.values( + "id", + "per_component", + "learning", + "appeal_code__country_id", + "appeal_code__country__region_id", + "appeal_code__name", + "appeal_code__start_date", + "sector", + ) + ) + ) + ops_learning_df = ops_learning_df.rename( + columns={ + "per_component": "component", + "appeal_code__country_id": "country_id", + "appeal_code__country__region_id": "region_id", + "appeal_code__name": "appeal_name", + "appeal_code__start_date": "appeal_year", + } + ) + ops_learning_df.set_index("id", inplace=True) + return ops_learning_df + + @classmethod + def _generate_regional_prioritization_list(self, df: pd.DataFrame): + """Generates a list of regional prioritizations from the given data.""" + df_exploded = df.explode("components") + regional_df = df_exploded.groupby(["region", "components"]).size().reset_index(name="count") + regional_df = regional_df[regional_df["count"] > 2] + regional_list = regional_df.groupby("region")["components"].apply(list).reset_index() + return regional_list + + @classmethod + def _generate_global_prioritization_list(self, regional_df: pd.DataFrame): + """Generates a global prioritization list from regional data.""" + global_df = regional_df.explode("components").groupby("components").size().reset_index(name="count") + global_components = global_df[global_df["count"] > 2]["components"].tolist() + global_list = {"global": global_components} + return global_list + + @classmethod + def _generate_country_prioritization_list( + self, regional_df: pd.DataFrame, global_components: list, prioritization_df: pd.DataFrame, country_df: pd.DataFrame + ): + """Generates a country-level prioritization list.""" + regional_dict = dict(zip(regional_df["region"], regional_df["components"])) + merged_df = country_df.merge(prioritization_df, on=["country", "region"], how="left") + no_prioritization_df = merged_df[merged_df["components"].isna()].astype(object) + + for index, row in no_prioritization_df.iterrows(): + region_id = row["region"] + components = regional_dict.get(region_id, global_components["global"]) + no_prioritization_df.at[index, "components"] = components + + final_df = pd.concat([merged_df.dropna(subset=["components"]), no_prioritization_df]) + final_df["components"] = final_df["components"].apply(lambda x: int(x) if isinstance(x, float) else x) + final_df = final_df[["country", "components"]] + return final_df + + @classmethod + def generate_priotization_list(self): + logger.info("Generating prioritization list.") + exclusion_list = [ + "IFRC Africa", + "IFRC Americas", + "IFRC Asia-Pacific", + "IFRC Europe", + "IFRC Geneva", + "IFRC MENA", + "Benelux 
ERU", + "ICRC", + ] + + # Get all countries + country_qs = ( + Country.objects.filter(is_deprecated=False, society_name__isnull=False, region__isnull=False) + .exclude(name__in=exclusion_list) + .values("id", "region_id") + ) + country_df = pd.DataFrame(list(country_qs)) + country_df = country_df.rename(columns={"id": "country", "region_id": "region"}) + + # Get all PER Overview + per_overview_qs = Overview.objects.select_related("country").values( + "id", + "country_id", + "country__region", + "assessment_number", + ) + per_overview_df = pd.DataFrame(list(per_overview_qs)) + per_overview_df = per_overview_df.rename(columns={"id": "overview", "country_id": "country", "country__region": "region"}) + + # Get all PER Prioritization + per_priotization_qs = ( + FormPrioritization.objects.filter( + is_draft=False, + prioritized_action_responses__isnull=False, + ) + .annotate( + components=F("prioritized_action_responses__component"), + ) + .values( + "overview", + "components", + ) + ) + per_priotization_df = pd.DataFrame(list(per_priotization_qs)) + per_priotization_df = per_priotization_df.merge( + per_overview_df[["overview", "country", "region", "assessment_number"]], on="overview", how="left" + ) + per_priotization_df = per_priotization_df.sort_values("assessment_number").drop_duplicates(subset="country", keep="last") + per_priotization_df = per_priotization_df[["region", "country", "components"]] + + # Generate the prioritization list that are in dataframes + regional_list = self._generate_regional_prioritization_list(per_priotization_df) + global_list = self._generate_global_prioritization_list(regional_list) + country_list = self._generate_country_prioritization_list(regional_list, global_list, per_priotization_df, country_df) + logger.info("Prioritization list generated.") + return regional_list, global_list, country_list + + @classmethod + def prioritize( + self, + df: pd.DataFrame, + components_countries: dict, + components_regions: dict, + components_global: dict, + type_prioritization: typing.Union[list, None], + ): + """Prioritizes components based on the type of prioritization.""" + + def _add_new_component(prioritized_components, per_prioritized_components, df): + """Adds new components to the prioritized list based on availability and frequency.""" + available_components = list(df["component"].unique()) + remaining_components = [item for item in available_components if item not in prioritized_components] + + intersect_components = list(set(per_prioritized_components) & set(remaining_components)) + + if intersect_components: + mask = df["component"].isin(intersect_components) + else: + mask = df["component"].isin(remaining_components) + + component_counts = df[mask]["component"].value_counts() + most_frequent_components = component_counts[component_counts == component_counts.max()].index.tolist() + + return prioritized_components + most_frequent_components + + if type_prioritization == "single-country": + country_id = str(df["country_id"].iloc[0]) + per_prioritized_components = components_countries.get(country_id, []) + elif type_prioritization == "single-region": + region_id = str(df["region_id"].iloc[0]) + per_prioritized_components = components_regions.get(region_id, []) + per_prioritized_components = components_global.get("global", []) + + component_counts = df["component"].value_counts() + most_frequent_components = component_counts[component_counts == component_counts.max()].index.tolist() + + while len(most_frequent_components) < 3: + most_frequent_components = 
_add_new_component(most_frequent_components, per_prioritized_components, df) + + mask = df["component"].isin(most_frequent_components) + return df[mask] + + @classmethod + def prioritize_components( + self, + filter_data: dict, + regional_list, + global_list, + country_list, + ): + logger.info("Prioritizing components.") + + def _need_component_prioritization(df, MIN_DIF_COMPONENTS, MIN_DIF_EXCERPTS): + """Determines if prioritization is needed based on unique components and learnings.""" + nb_dif_components = len(df["component"].unique()) + nb_dif_learnings = len(df["learning"].unique()) + return nb_dif_components > MIN_DIF_COMPONENTS and nb_dif_learnings > MIN_DIF_EXCERPTS + + def _identify_type_prioritization(df): + """Identifies the type of prioritization required based on the data.""" + if len(df["country_id"].unique()) == 1: + return "single-country" + elif len(df["region_id"].unique()) == 1: + return "single-region" + elif len(df["region_id"].unique()) > 1: + return "multi-region" + return None + + def _contextualize_learnings(df): + """Adds appeal year and event name as a contextualization of the leannings.""" + for index, row in df.iterrows(): + df.at[index, "learning"] = f"In {row['appeal_year']} in {row['appeal_name']}: {row['learning']}" + + df = df.drop(columns=["appeal_name"]) + logger.info("Contextualization added to DataFrame.") + return df + + components_countries = country_list.to_dict(orient="records") + components_countries = {item["country"]: item["components"] for item in components_countries} + + components_regions = regional_list.to_dict(orient="records") + components_regions = {item["region"]: item["components"] for item in components_regions} + + ops_learning_df = self.fetch_ops_learnings(filter_data) + + if _need_component_prioritization(ops_learning_df, self.MIN_DIF_COMPONENTS, self.MIN_DIF_EXCERPTS): + type_prioritization = _identify_type_prioritization(ops_learning_df) + prioritized_learnings = self.prioritize( + ops_learning_df, components_countries, components_regions, global_list, type_prioritization + ) + prioritized_learnings = ops_learning_df + logger.info("Prioritization of components completed.") + prioritized_learnings = _contextualize_learnings(prioritized_learnings) + return prioritized_learnings + + @classmethod + def slice_dataframe(self, df, limit=2000, encoding_name="cl100k_base"): + df["count_temp"] = [self.count_tokens(x, encoding_name) for x in df["learning"]] + df["cumsum"] = df["count_temp"].cumsum() + + slice_index = None + for i in range(1, len(df)): + if df["cumsum"].iloc[i - 1] <= limit and df["cumsum"].iloc[i] > limit: + slice_index = i - 1 + break + + if slice_index is not None: + df_sliced = df.iloc[: slice_index + 1] + else: + df_sliced = df + return df_sliced + + @classmethod + def prioritize_excerpts(self, df: pd.DataFrame): + """Prioritize the most recent excerpts within the token limit.""" + logger.info("Prioritizing excerpts within token limit.") + + # Droping duplicates based on 'learning' column for primary DataFrame + primary_learning_df = df.drop_duplicates(subset="learning") + primary_learning_df = primary_learning_df.sort_values(by="appeal_year", ascending=False) + primary_learning_df.reset_index(inplace=True, drop=True) + + # Droping duplicates based on 'learning' and 'component' columns for secondary DataFrame + secondary_learning_df = df.drop_duplicates(subset=["learning", "component"]) + secondary_learning_df = secondary_learning_df.sort_values(by=["component", "appeal_year"], ascending=[True, False]) + grouped = 
secondary_learning_df.groupby("component") + + # Create an interleaved list of rows + interleaved = list(chain(*zip(*[group[1].itertuples(index=False) for group in grouped]))) + + # Convert the interleaved list of rows back to a DataFrame + result = pd.DataFrame(interleaved) + result.reset_index(inplace=True, drop=True) + + # Slice the Primary and secondary dataframes + sliced_primary_learning_df = self.slice_dataframe(primary_learning_df, self.PROMPT_DATA_LENGTH_LIMIT, self.ENCODING_NAME) + sliced_secondary_learning_df = self.slice_dataframe(result, self.PROMPT_DATA_LENGTH_LIMIT, self.ENCODING_NAME) + logger.info("Excerpts prioritized within token limit.") + return sliced_primary_learning_df, sliced_secondary_learning_df + + @classmethod + def format_prompt( + self, + primary_learning_df: pd.DataFrame, + secondary_learning_df: pd.DataFrame, + filter_data: dict, + ): + """Formats the prompt based on request filter and prioritized learnings.""" + logger.info("Formatting prompt.") + + def _build_intro_section(): + """Builds the introductory section of the prompt.""" + return ( + "I will provide you with a set of instructions, data, and formatting requests in three sections." + + " I will pass you the INSTRUCTIONS section, are you ready?" + + os.linesep + + os.linesep + ) + + def _build_instruction_section(request_filter: dict, df: pd.DataFrame, instruction: str): + """Builds the instruction section of the prompt based on the request filter and DataFrame.""" + instructions = ["INSTRUCTIONS\n========================\nSummarize essential insights from the DATA"] + + if "appeal_code__dtype__in" in request_filter: + dtypes = df["dtype_name"].dropna().unique() + dtype_str = '", "'.join(dtypes) + instructions.append(f'concerning "{dtype_str}" occurrences') + + if "appeal_code__country__in" in request_filter: + countries = df["country_name"].dropna().unique() + country_str = '", "'.join(countries) + instructions.append(f'in "{country_str}"') + + if "appeal_code__region" in request_filter: + regions = df["region_name"].dropna().unique() + region_str = '", "'.join(regions) + instructions.append(f'in "{region_str}"') + + if "sector_validated__in" in request_filter: + sectors = df["sector"].dropna().unique() + sector_str = '", "'.join(sectors) + instructions.append(f'focusing on "{sector_str}" aspects') + + if "per_component_validated__in" in request_filter: + components = df["component"].dropna().unique() + component_str = '", "'.join(components) + instructions.append(f'and "{component_str}" aspects') + + instructions.append("in Emergency Response.") + instructions.append("\n\n" + instruction) + instructions.append("\n\nI will pass you the DATA section, are you ready?\n\n") + return "\n".join(instructions) + + def get_main_sectors(df: pd.DataFrame): + """Get only information from technical sectorial information""" + temp = df[df["component"] == "NS-specific areas of intervention"] + available_sectors = list(temp["sector"].unique()) + nb_sectors = len(available_sectors) + if nb_sectors == 0: + logger.info("There were not specific technical sectorial learnings") + return [] + logger.info("Main sectors for secondary summaries selected") + return available_sectors + + def get_main_components(df: pd.DataFrame): + available_components = list(df["component"].unique()) + nb_components = len(available_components) + if nb_components == 0: + logger.info("There were not specific components") + return [] + logger.info("All components for secondary summaries selected") + return available_components + + def 
process_learnings_sector(sector, df, max_length_per_section): + df = df[df["sector"] == sector].dropna() + df_sliced = self.slice_dataframe(df, max_length_per_section, self.ENCODING_NAME) + learnings_sector = ( + "\n----------------\n" + + "SUBTYPE: " + + str(sector) + + "\n----------------\n" + + "\n----------------\n".join(df_sliced["learning"]) + ) + return learnings_sector + + def process_learnings_component(component, df, max_length_per_section): + df = df[df["component"] == component].dropna() + df_sliced = self.slice_dataframe(df, max_length_per_section, self.ENCODING_NAME) + learnings_component = ( + "\n----------------\n" + + "SUBTYPE: " + + str(component) + + "\n----------------\n" + + "\n----------------\n".join(df_sliced["learning"]) + ) + return learnings_component + + def _build_data_section(primary_df: pd.DataFrame, secondary_df: pd.DataFrame): + # Primary learnings section + primary_learnings_data = "\n----------------\n".join(primary_df["learning"].dropna()) + + # Secondary learnings section + sectors = get_main_sectors(secondary_df) + components = get_main_components(secondary_df) + max_length_per_section = self.PROMPT_DATA_LENGTH_LIMIT / (len(components) + len(sectors)) + learnings_sectors = ( + "\n----------------\n\n" + + "TYPE: SECTORS" + + "\n----------------\n".join( + [process_learnings_sector(int(x), secondary_df, max_length_per_section) for x in sectors if pd.notna(x)] + ) + ) + learnings_components = ( + "\n----------------\n\n" + + "TYPE: COMPONENT" + + "\n----------------\n".join( + [process_learnings_component(int(x), secondary_df, max_length_per_section) for x in components if pd.notna(x)] + ) + ) + secondary_learnings_data = learnings_sectors + learnings_components + return primary_learnings_data, secondary_learnings_data + + prompt_intro = _build_intro_section() + primary_prompt_instruction = _build_instruction_section(filter_data, primary_learning_df, self.primary_instruction_prompt) + secondary_prompt_instruction = _build_instruction_section( + filter_data, secondary_learning_df, self.secondary_instruction_prompt + ) + primary_learnings_data, secondary_learnings_data = _build_data_section(primary_learning_df, secondary_learning_df) + + # format the prompts + primary_learning_prompt = "".join([prompt_intro, primary_prompt_instruction, primary_learnings_data, self.primary_prompt]) + secondary_learning_prompt = "".join( + [prompt_intro, secondary_prompt_instruction, secondary_learnings_data, self.secondary_prompt] + ) + logger.info("Prompt formatted.") + return primary_learning_prompt, secondary_learning_prompt + + @classmethod + def generate_summaries(self, primary_learning_prompt, secondary_learning_prompt): + """Generates summaries using the provided system message and prompt.""" + logger.info("Generating summaries.") + + def _validate_length_prompt(messages, prompt_length_limit): + """Validates the length of the prompt.""" + message_content = [msg["content"] for msg in messages] + text = " ".join(message_content) + count = self.count_tokens(text, self.ENCODING_NAME) + logger.info(f"Token count: {count}") + return count <= prompt_length_limit + + def _summarize(prompt, system_message="You are a helpful assistant"): + """Summarizes the prompt using the provided system message.""" + messages = [ + {"role": "system", "content": system_message}, + {"role": "user", "content": prompt}, + { + "role": "assistant", + "content": "Understood, thank you for providing the data, and formatting requests. 
" + + "I am ready to proceed with the task.", + }, + ] + + if not _validate_length_prompt(messages, self.PROMPT_LENGTH_LIMIT): + logger.warning("The length of the prompt might be too long.") + return "{}" + + try: + response = self.client.chat.completions.create( + model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"), messages=messages, temperature=0.7 + ) + summary = response.choices[0].message.content + return summary + except Exception as e: + logger.error(f"Error in summarizing: {e}") + raise + + def _validate_format(summary) -> bool: + """ + Validates the format of the summary and modifies it if necessary. + """ + + def validate_text_is_dictionary(text): + formatted_text = ast.literal_eval(text) + return isinstance(formatted_text, dict) + + def modify_format(summary): + try: + # Find the index of the last closing brace before the "Note" + end_index = summary.rfind("}") + + # Truncate the string to include only the dictionary part + formatted_summary = summary[: end_index + 1] + + logger.info("Modification realized to response") + return formatted_summary + + except Exception as e: + logger.error(f"Modification failed: {e}") + return "{}" + + # Attempt to parse the summary as a dictionary + if validate_text_is_dictionary(summary): + formated_summary = ast.literal_eval(summary) + return formated_summary + else: + formatted_summary = modify_format(summary) + formatted_summary = ast.literal_eval(formatted_summary) + return formatted_summary + + primary_summary = _summarize(primary_learning_prompt, self.system_message) + secondary_summary = _summarize(secondary_learning_prompt, self.system_message) + formated_primary_summary = _validate_format(primary_summary) + formated_secondary_summary = _validate_format(secondary_summary) + logger.info("Summaries generated.") + return formated_primary_summary, formated_secondary_summary diff --git a/per/task.py b/per/task.py index e845acaf8..f7f901e33 100644 --- a/per/task.py +++ b/per/task.py @@ -1,474 +1,6 @@ -import os -import typing -from itertools import chain - -import pandas as pd -import tiktoken from celery import shared_task -from django.db.models import F - -from api.logger import logger -from api.models import Country -from per.models import FormPrioritization, OpsLearning, Overview - - -class OpsLearningSummaryTask: - - PROMPT_DATA_LENGTH_LIMIT = 5000 - ENCODING_NAME = "cl100k_base" - - MIN_DIF_COMPONENTS = 3 - MIN_DIF_EXCERPTS = 3 - primary_prompt = ( - "Please aggregate and summarize the provided data into UP TO THREE structured paragraphs. " - "The output MUST strictly adhere to the format below: " - "Title: Each finding should begin with the main finding TITLE in bold. " - "Content: Aggregate findings so that they are supported by evidence from more than one report. " - "Always integrate evidence from multiple reports or items into the paragraph, and " - "include the year and country of the evidence. " - "Confidence Level: For each finding, based on the number of items/reports connected to the finding, " - "assign a score from 1 to 5 where 1 is the lowest and 5 is the highest. " - "The format should be 'Confidence level: #/5' (e.g., 'Confidence level: 4/5'). " - "At the end of the summary, please highlight any contradictory country reports. " - "DO NOT use data from any source other than the one provided. Provide your answer in JSON form. 
" - "Reply with only the answer in valid JSON form and include no other commentary: " - '{"0": {"title": "xxx", "content": "xxx", "confidence level": "xxx"}, ' - '"1": {"title": "xxx", "content": "xxx", "confidence level": "xxx"}, ' - '"2": {"title": "xxx", "content": "xxx", "confidence level": "xxx"}, ' - '"contradictory reports": "xxx"}' - ) - - secondary_prompt = ( - "Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). " - "The output SHOULD ALWAYS follow the format below: " - "Type: Whether the paragraph is related to a 'sector' or a 'component'. " - "Subtype: Provides the name of the sector or of the component to which the paragraph refers. " - "Content: A short summary aggregating findings related to the Subtype, so that they are supported by " - "evidence coming from more than one report, " - "and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports it " - "from the data available from multiple reports or items, " - "include year and country of the evidence. DO NOT use data from any source other than the " - "one provided. Provide your answer in JSON form. " - "Reply with ONLY the answer in valid JSON form and include NO OTHER COMMENTARY: " - '{"0": {"type": "sector", "subtype": "shelter", "content": "lorem ipsum"}, ' - '"1": {"type": "component", "subtype": "Information Management (IM)", "content": "lorem ipsum"}, ' - '"2": {"type": "sector", "subtype": "WASH", "content": "lorem ipsum"}}' - ) - - @classmethod - def fetch_ops_learnings(self, filter_data): - """Fetches the OPS learnings from the database.""" - ops_learning_qs = OpsLearning.objects.all() - from per.drf_views import OpsLearningFilter - - ops_learning_filtered_qs = OpsLearningFilter(filter_data, queryset=ops_learning_qs).qs - ops_learning_df = pd.DataFrame( - list( - ops_learning_filtered_qs.values( - "id", - "per_component", - "learning", - "appeal_code__country_id", - "appeal_code__country__region_id", - "appeal_code__name", - "appeal_code__start_date", - "sector", - ) - ) - ) - ops_learning_df = ops_learning_df.rename( - columns={ - "per_component": "component", - "appeal_code__country_id": "country_id", - "appeal_code__country__region_id": "region_id", - "appeal_code__name": "appeal_name", - "appeal_code__start_date": "appeal_year", - } - ) - ops_learning_df.set_index("id", inplace=True) - return ops_learning_df - - @classmethod - def generate_regional_prioritization_list(self, df: pd.DataFrame): - """Generates a list of regional prioritizations from the given data.""" - df_exploded = df.explode("components") - regional_df = df_exploded.groupby(["region", "components"]).size().reset_index(name="count") - regional_df = regional_df[regional_df["count"] > 2] - regional_list = regional_df.groupby("region")["components"].apply(list).reset_index() - return regional_list - - @classmethod - def generate_global_prioritization_list(self, regional_df: pd.DataFrame): - """Generates a global prioritization list from regional data.""" - global_df = regional_df.explode("components").groupby("components").size().reset_index(name="count") - global_components = global_df[global_df["count"] > 2]["components"].tolist() - global_list = {"global": global_components} - return global_list - - @classmethod - def generate_country_prioritization_list( - self, regional_df: pd.DataFrame, global_components: list, prioritization_df: pd.DataFrame, country_df: pd.DataFrame - ): - """Generates a country-level prioritization list.""" - regional_dict = 
dict(zip(regional_df["region"], regional_df["components"])) - merged_df = country_df.merge(prioritization_df, on=["country", "region"], how="left") - no_prioritization_df = merged_df[merged_df["components"].isna()].astype(object) - - for index, row in no_prioritization_df.iterrows(): - region_id = row["region"] - components = regional_dict.get(region_id, global_components["global"]) - no_prioritization_df.at[index, "components"] = components - - final_df = pd.concat([merged_df.dropna(subset=["components"]), no_prioritization_df]) - final_df["components"] = final_df["components"].apply(lambda x: int(x) if isinstance(x, float) else x) - final_df = final_df[["country", "components"]] - return final_df - - @classmethod - def generate_priotization_list(self): - logger.info("Generating prioritization list.") - exclusion_list = [ - "IFRC Africa", - "IFRC Americas", - "IFRC Asia-Pacific", - "IFRC Europe", - "IFRC Geneva", - "IFRC MENA", - "Benelux ERU", - "ICRC", - ] - - # Get all countries - country_qs = ( - Country.objects.filter(is_deprecated=False, society_name__isnull=False, region__isnull=False) - .exclude(name__in=exclusion_list) - .values("id", "region_id") - ) - country_df = pd.DataFrame(list(country_qs)) - country_df = country_df.rename(columns={"id": "country", "region_id": "region"}) - - # Get all PER Overview - per_overview_qs = Overview.objects.select_related("country").values( - "id", - "country_id", - "country__region", - "assessment_number", - ) - per_overview_df = pd.DataFrame(list(per_overview_qs)) - per_overview_df = per_overview_df.rename(columns={"id": "overview", "country_id": "country", "country__region": "region"}) - - # Get all PER Prioritization - per_priotization_qs = ( - FormPrioritization.objects.filter( - is_draft=False, - prioritized_action_responses__isnull=False, - ) - .annotate( - components=F("prioritized_action_responses__component"), - ) - .values( - "overview", - "components", - ) - ) - per_priotization_df = pd.DataFrame(list(per_priotization_qs)) - per_priotization_df = per_priotization_df.merge( - per_overview_df[["overview", "country", "region", "assessment_number"]], on="overview", how="left" - ) - per_priotization_df = per_priotization_df.sort_values("assessment_number").drop_duplicates(subset="country", keep="last") - per_priotization_df = per_priotization_df[["region", "country", "components"]] - - # Generate the prioritization list that are in dataframes - regional_list = self.generate_regional_prioritization_list(per_priotization_df) - global_list = self.generate_global_prioritization_list(regional_list) - country_list = self.generate_country_prioritization_list(regional_list, global_list, per_priotization_df, country_df) - logger.info("Prioritization list generated.") - return regional_list, global_list, country_list - - @classmethod - def prioritize( - self, - df: pd.DataFrame, - components_countries: dict, - components_regions: dict, - components_global: dict, - type_prioritization: typing.Union[list, None], - ): - """Prioritizes components based on the type of prioritization.""" - - def add_new_component(prioritized_components, per_prioritized_components, df): - """Adds new components to the prioritized list based on availability and frequency.""" - available_components = list(df["component"].unique()) - remaining_components = [item for item in available_components if item not in prioritized_components] - - intersect_components = list(set(per_prioritized_components) & set(remaining_components)) - - if intersect_components: - mask = 
df["component"].isin(intersect_components) - else: - mask = df["component"].isin(remaining_components) - - component_counts = df[mask]["component"].value_counts() - most_frequent_components = component_counts[component_counts == component_counts.max()].index.tolist() - - return prioritized_components + most_frequent_components - - if type_prioritization == "single-country": - country_id = str(df["country_id"].iloc[0]) - per_prioritized_components = components_countries.get(country_id, []) - elif type_prioritization == "single-region": - region_id = str(df["region_id"].iloc[0]) - per_prioritized_components = components_regions.get(region_id, []) - per_prioritized_components = components_global.get("global", []) - - component_counts = df["component"].value_counts() - most_frequent_components = component_counts[component_counts == component_counts.max()].index.tolist() - - while len(most_frequent_components) < 3: - most_frequent_components = add_new_component(most_frequent_components, per_prioritized_components, df) - - mask = df["component"].isin(most_frequent_components) - return df[mask] - - @classmethod - def prioritize_components( - self, - filter_data: dict, - regional_list, - global_list, - country_list, - ): - logger.info("Prioritizing components.") - - def need_component_prioritization(df, MIN_DIF_COMPONENTS, MIN_DIF_EXCERPTS): - """Determines if prioritization is needed based on unique components and learnings.""" - nb_dif_components = len(df["component"].unique()) - nb_dif_learnings = len(df["learning"].unique()) - return nb_dif_components > MIN_DIF_COMPONENTS and nb_dif_learnings > MIN_DIF_EXCERPTS - - def identify_type_prioritization(df): - """Identifies the type of prioritization required based on the data.""" - if len(df["country_id"].unique()) == 1: - return "single-country" - elif len(df["region_id"].unique()) == 1: - return "single-region" - elif len(df["region_id"].unique()) > 1: - return "multi-region" - return None - - def contextualize_learnings(df): - """Adds appeal year and event name as a contextualization of the leannings.""" - for index, row in df.iterrows(): - df.at[index, "learning"] = f"In {row['appeal_year']} in {row['appeal_name']}: {row['learning']}" - - df = df.drop(columns=["appeal_name"]) - logger.info("Contextualization added to DataFrame.") - return df - - components_countries = country_list.to_dict(orient="records") - components_countries = {item["country"]: item["components"] for item in components_countries} - - components_regions = regional_list.to_dict(orient="records") - components_regions = {item["region"]: item["components"] for item in components_regions} - - ops_learning_df = self.fetch_ops_learnings(filter_data) - - if need_component_prioritization(ops_learning_df, self.MIN_DIF_COMPONENTS, self.MIN_DIF_EXCERPTS): - type_prioritization = identify_type_prioritization(ops_learning_df) - prioritized_learnings = self.prioritize( - ops_learning_df, components_countries, components_regions, global_list, type_prioritization - ) - prioritized_learnings = ops_learning_df - logger.info("Prioritization of components completed.") - prioritized_learnings = contextualize_learnings(prioritized_learnings) - return prioritized_learnings - - @classmethod - def slice_dataframe(self, df, limit=2000, encoding_name="cl100k_base"): - def count_tokens(string, encoding_name): - """Returns the number of tokens in a text string.""" - encoding = tiktoken.get_encoding(encoding_name) - return len(encoding.encode(string)) - - df["count_temp"] = [count_tokens(x, encoding_name) 
for x in df["learning"]] - df["cumsum"] = df["count_temp"].cumsum() - - slice_index = None - for i in range(1, len(df)): - if df["cumsum"].iloc[i - 1] <= limit and df["cumsum"].iloc[i] > limit: - slice_index = i - 1 - break - - if slice_index is not None: - df_sliced = df.iloc[: slice_index + 1] - else: - df_sliced = df - return df_sliced - - @classmethod - def prioritize_excerpts(self, df: pd.DataFrame): - """Prioritize the most recent excerpts within the token limit.""" - logger.info("Prioritizing excerpts within token limit.") - - # Droping duplicates based on 'learning' column for primary DataFrame - primary_learning_df = df.drop_duplicates(subset="learning") - primary_learning_df = primary_learning_df.sort_values(by="appeal_year", ascending=False) - primary_learning_df.reset_index(inplace=True, drop=True) - - # Droping duplicates based on 'learning' and 'component' columns for secondary DataFrame - secondary_learning_df = df.drop_duplicates(subset=["learning", "component"]) - secondary_learning_df = secondary_learning_df.sort_values(by=["component", "appeal_year"], ascending=[True, False]) - grouped = secondary_learning_df.groupby("component") - - # Create an interleaved list of rows - interleaved = list(chain(*zip(*[group[1].itertuples(index=False) for group in grouped]))) - - # Convert the interleaved list of rows back to a DataFrame - result = pd.DataFrame(interleaved) - result.reset_index(inplace=True, drop=True) - - # Slice the Primary and secondary dataframes - sliced_primary_learning_df = self.slice_dataframe(primary_learning_df, self.PROMPT_DATA_LENGTH_LIMIT, self.ENCODING_NAME) - sliced_secondary_learning_df = self.slice_dataframe(result, self.PROMPT_DATA_LENGTH_LIMIT, self.ENCODING_NAME) - logger.info("Excerpts prioritized within token limit.") - return sliced_primary_learning_df, sliced_secondary_learning_df - - @classmethod - def format_prompt( - self, - primary_learning_df: pd.DataFrame, - secondary_learning_df: pd.DataFrame, - filter_data: dict, - ): - """Formats the prompt based on request filter and prioritized learnings.""" - logger.info("Formatting prompt.") - - def build_intro_section(): - """Builds the introductory section of the prompt.""" - return ( - "I will provide you with a set of instructions, data, and formatting requests in three sections." - + " I will pass you the INSTRUCTIONS section, are you ready?" 
- + os.linesep - + os.linesep - ) - - def build_instruction_section(request_filter, df): - """Builds the instruction section of the prompt based on the request filter and DataFrame.""" - instructions = ["INSTRUCTIONS", "========================", "Summarize essential insights from the DATA"] - - if "appeal_code__dtype__in" in request_filter: - dtypes = df["dtype_name"].dropna().unique() - dtype_str = '", "'.join(dtypes) - instructions.append(f'concerning "{dtype_str}" occurrences') - - if "appeal_code__country__in" in request_filter: - countries = df["country_name"].dropna().unique() - country_str = '", "'.join(countries) - instructions.append(f'in "{country_str}"') - - if "appeal_code__region" in request_filter: - regions = df["region_name"].dropna().unique() - region_str = '", "'.join(regions) - instructions.append(f'in "{region_str}"') - - if "sector_validated__in" in request_filter: - sectors = df["sector"].dropna().unique() - sector_str = '", "'.join(sectors) - instructions.append(f'focusing on "{sector_str}" aspects') - - if "per_component_validated__in" in request_filter: - components = df["component"].dropna().unique() - component_str = '", "'.join(components) - instructions.append(f'and "{component_str}" aspects') - - instructions.append( - "In Emergency Response. You should prioritize the insights based on their recurrence " - "and potential impact on humanitarian operations, and provide the top insights. \n\n" - "I will pass you the DATA section, are you ready?\n\n" - ) - return "\n".join(instructions) - - def get_main_sectors(df: pd.DataFrame): - """Get only information from technical sectorial information""" - temp = df[df["component"] == "NS-specific areas of intervention"] - available_sectors = list(temp["sector"].unique()) - nb_sectors = len(available_sectors) - if nb_sectors == 0: - logger.info("There were not specific technical sectorial learnings") - return [] - logger.info("Main sectors for secondary summaries selected") - return available_sectors - - def get_main_components(df: pd.DataFrame): - available_components = list(df["component"].unique()) - nb_components = len(available_components) - if nb_components == 0: - logger.info("There were not specific components") - return [] - logger.info("All components for secondary summaries selected") - return available_components - - def process_learnings_sector(sector, df, max_length_per_section): - df = df[df["sector"] == sector].dropna() - df_sliced = self.slice_dataframe(df, max_length_per_section, self.ENCODING_NAME) - learnings_sector = ( - "\n----------------\n" - + "SUBTYPE: " - + str(sector) - + "\n----------------\n" - + "\n----------------\n".join(df_sliced["learning"]) - ) - return learnings_sector - - def process_learnings_component(component, df, max_length_per_section): - df = df[df["component"] == component].dropna() - df_sliced = self.slice_dataframe(df, max_length_per_section, self.ENCODING_NAME) - learnings_component = ( - "\n----------------\n" - + "SUBTYPE: " - + str(component) - + "\n----------------\n" - + "\n----------------\n".join(df_sliced["learning"]) - ) - return learnings_component - - def build_data_section(primary_df: pd.DataFrame, secondary_df: pd.DataFrame): - # Primary learnings section - primary_learnings_data = "\n----------------\n".join(primary_df["learning"].dropna()) - - # Secondary learnings section - sectors = get_main_sectors(secondary_df) - components = get_main_components(secondary_df) - max_length_per_section = self.PROMPT_DATA_LENGTH_LIMIT / (len(components) + len(sectors)) - 
learnings_sectors = ( - "\n----------------\n\n" - + "TYPE: SECTORS" - + "\n----------------\n".join( - [process_learnings_sector(int(x), secondary_df, max_length_per_section) for x in sectors if pd.notna(x)] - ) - ) - learnings_components = ( - "\n----------------\n\n" - + "TYPE: COMPONENT" - + "\n----------------\n".join( - [process_learnings_component(int(x), secondary_df, max_length_per_section) for x in components if pd.notna(x)] - ) - ) - secondary_learnings_data = learnings_sectors + learnings_components - return primary_learnings_data, secondary_learnings_data - - prompt_intro = build_intro_section() - primary_prompt_instruction = build_instruction_section(filter_data, primary_learning_df) - secondary_prompt_instruction = build_instruction_section(filter_data, secondary_learning_df) - primary_learnings_data, secondary_learnings_data = build_data_section(primary_learning_df, secondary_learning_df) - # format the prompts - primary_learning_prompt = "".join([prompt_intro, primary_prompt_instruction, primary_learnings_data, self.primary_prompt]) - secondary_learning_prompt = "".join( - [prompt_intro, secondary_prompt_instruction, secondary_learnings_data, self.secondary_prompt] - ) - logger.info("Prompt formatted.") - return primary_learning_prompt, secondary_learning_prompt +from per.ops_learning_summary import OpsLearningSummaryTask @shared_task @@ -479,3 +11,4 @@ def generate_summary(filter_data: dict, hash_value: str): primary_learning_prompt, secondary_learning_prompt = OpsLearningSummaryTask.format_prompt( primary_learning_df, secondary_learning_df, filter_data ) + OpsLearningSummaryTask.generate_summaries(primary_learning_prompt, secondary_learning_prompt) diff --git a/poetry.lock b/poetry.lock index a03133a33..b51087a69 100644 --- a/poetry.lock +++ b/poetry.lock @@ -25,6 +25,39 @@ files = [ {file = "aniso8601-7.0.0.tar.gz", hash = "sha256:513d2b6637b7853806ae79ffaca6f3e8754bdd547048f5ccc1420aec4b714f1e"}, ] +[[package]] +name = "annotated-types" +version = "0.7.0" +description = "Reusable constraint types to use with typing.Annotated" +optional = false +python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, + {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, +] + +[[package]] +name = "anyio" +version = "4.4.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.8" +files = [ + {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"}, + {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.23)"] + [[package]] name = "arabic-reshaper" version = "3.0.0" @@ -1557,6 +1590,17 @@ setproctitle = ["setproctitle"] testing = ["coverage", "eventlet", 
"gevent", "pytest", "pytest-cov"] tornado = ["tornado (>=0.2)"] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + [[package]] name = "html5lib" version = "1.1" @@ -1578,6 +1622,51 @@ chardet = ["chardet (>=2.2)"] genshi = ["genshi"] lxml = ["lxml"] +[[package]] +name = "httpcore" +version = "1.0.5" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"}, + {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.26.0)"] + +[[package]] +name = "httpx" +version = "0.27.0" +description = "The next generation HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.27.0-py3-none-any.whl", hash = "sha256:71d5465162c13681bff01ad59b2cc68dd838ea1f10e51574bac27103f00c91a5"}, + {file = "httpx-0.27.0.tar.gz", hash = "sha256:a0cb88a46f32dc874e04ee956e4c2764aba2aa228f650b06788ba6bda2962ab5"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + [[package]] name = "idna" version = "3.10" @@ -2050,6 +2139,29 @@ files = [ {file = "numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010"}, ] +[[package]] +name = "openai" +version = "1.37.0" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.7.1" +files = [ + {file = "openai-1.37.0-py3-none-any.whl", hash = "sha256:a903245c0ecf622f2830024acdaa78683c70abb8e9d37a497b851670864c9f73"}, + {file = "openai-1.37.0.tar.gz", hash = "sha256:dc8197fc40ab9d431777b6620d962cc49f4544ffc3011f03ce0a805e6eb54adb"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.7,<5" + +[package.extras] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] + [[package]] name = "opencensus" version = "0.11.4" @@ -3469,6 +3581,17 @@ nose = ["nose"] pytest = ["pytest"] test = ["django (>=1.10.6)", "nose", "pytest (>=4.6)", "pytest-cov", "six"] +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + [[package]] name = "sortedcontainers" version = "2.4.0" diff --git a/pyproject.toml b/pyproject.toml index 6b1bff1cc..ed7dce845 100644 
--- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,7 @@ colorlog = "*" mapbox-tilesets = "*" ipython = "*" tiktoken = "*" +openai = "*" [tool.poetry.dev-dependencies] pytest-profiling = "*"
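
For orientation, a sketch (not part of the diff) of the flow that the refactored generate_summary task in per/task.py drives through per/ops_learning_summary.py: build the prioritization lists, prioritize components and excerpts for the requested filters, format the primary and secondary prompts, then generate and validate the summaries. The filter payload below is hypothetical; in the real flow it is the validated OpsLearningFilter data passed in by OpslearningSummaryCacheHelper.

from per.ops_learning_summary import OpsLearningSummaryTask

# Hypothetical filter payload; real values come from the request's
# OpsLearningFilter query parameters captured by the cache helper.
filter_data = {"appeal_code__region": "1"}

# Same sequence of classmethod calls that the Celery task performs.
regional_list, global_list, country_list = OpsLearningSummaryTask.generate_priotization_list()
prioritized_learnings = OpsLearningSummaryTask.prioritize_components(
    filter_data, regional_list, global_list, country_list
)
primary_df, secondary_df = OpsLearningSummaryTask.prioritize_excerpts(prioritized_learnings)
primary_prompt, secondary_prompt = OpsLearningSummaryTask.format_prompt(
    primary_df, secondary_df, filter_data
)
primary_summary, secondary_summary = OpsLearningSummaryTask.generate_summaries(
    primary_prompt, secondary_prompt
)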
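
Usage note (illustrative only): OpsLearningSummaryTask builds its Azure OpenAI client from the environment variables introduced in main/settings.py. The sketch below mirrors that call pattern and assumes AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY and AZURE_OPENAI_DEPLOYMENT_NAME are exported in the environment; the settings block registers the key under AZURE_OPENAI_KEY, so whichever name the client actually reads must be the one set.

import os

from openai import AzureOpenAI

# Sketch only: mirrors the client construction and chat call used by
# OpsLearningSummaryTask; the environment variable names are assumptions
# taken from this diff.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2023-05-15",
)

response = client.chat.completions.create(
    model=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),  # Azure deployment name, not a model id
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Summarize the following operational lessons learned ..."},
    ],
    temperature=0.7,
)
print(response.choices[0].message.content)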