Initial contact center base scenario setup and added summarization scenario #2569

Open · wants to merge 4 commits into main
59 changes: 59 additions & 0 deletions src/helm/benchmark/scenarios/contactcenter_convo_base_scenario.py
@@ -0,0 +1,59 @@
"""
Base scenario for the contact center conversation.

This scenario defines the basic input structure of the conversation
and shared functions for the contact center conversation scenarios.

The conversation data is JSON with the following structure:
[
{
"conversation_name": "cresta-helm-cc-2018-01-12T19:31:31.404000",
"body": [
{
"id": "abcd-1",
"text": "Hello?",
"speaker_role": "visitor",
},
{
"id": "abcd-2",
"text": "Thank you for contacting xxx! My name is Jack. I am here to help you. How can I help you today?",
"speaker_role": "agent",
}
],
},
...
]
"""

import json
from .scenario import Scenario

class ContactCenterConversationScenario(Scenario):
"""Base scenario for the contact center conversation."""
name = "cc_conversation"
description = "Base scenario for contact center conversation tasks"
tags = ["cc_conversation"]
Comment on lines +33 to +35 (Collaborator):

You can just delete name, description and tags since this is not a concrete class and can't be instantiated.

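A minimal sketch of the base class as the reviewer suggests, with the class-level name, description and tags removed (illustrative only, not part of the diff):

class ContactCenterConversationScenario(Scenario):
    """Base scenario for the contact center conversation; subclasses provide name, description and tags."""

    def __init__(self, dataset_path: str) -> None:
        super().__init__()
        self.dataset_path = dataset_path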

def __init__(self, dataset_path: str) -> None:
"""
Initializes contact center base scenario.
Args:
dataset_path: path of dataset to load from.
"""
super().__init__()
self.dataset_path = dataset_path

def _load_conversations(self, dataset_path):
"""
Load the conversations from the given path.

        Only returns the raw list of conversation dictionaries; specific input/output text
        formatting is handled by the subclass scenario.
Args:
dataset_path: path of dataset to load from.
Returns:
dataset: List of conversation dictionaries.
"""
with open(dataset_path, 'r', encoding='utf-8') as f:
raw_chats = json.load(f)
return raw_chats
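
For reference, a self-contained sketch showing how _load_conversations consumes the JSON layout documented above. The stub subclass, file contents, and names here are purely illustrative; a stub is needed because, as the reviewer notes, the base class itself can't be instantiated:

import json
import os
import tempfile

from helm.benchmark.scenarios.contactcenter_convo_base_scenario import ContactCenterConversationScenario


class _DemoConversationScenario(ContactCenterConversationScenario):
    # Trivial stub so the abstract base can be instantiated for this demo.
    def get_instances(self):
        return []


sample_conversations = [
    {
        "conversation_name": "demo-0001",
        "body": [
            {"id": "demo-1", "text": "Hello?", "speaker_role": "visitor"},
            {"id": "demo-2", "text": "How can I help you today?", "speaker_role": "agent"},
        ],
    }
]

with tempfile.TemporaryDirectory() as tmp_dir:
    path = os.path.join(tmp_dir, "conversations.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(sample_conversations, f)
    chats = _DemoConversationScenario(dataset_path=path)._load_conversations(path)
    assert chats[0]["conversation_name"] == "demo-0001"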
@@ -0,0 +1,124 @@
"""
Scenario for contact center conversational summarization.

Loads input conversations defined in the ContactCenterConversationScenario
and packs task specific input/output format for summarization tasks.

Task structure

Conversation:
agent: message1
visitor: message2
....
Summary:
summary of the conversation


Example from the dataset
Conversation:
agent: hi how can i help you today
visitor: i need help with my account
agent: sure what is your account number
visitor: 123456
.....
Summary:
- agent helped visitor with account number 123456
"""


import json
import os
from typing import List, Optional
from helm.benchmark.scenarios.contactcenter_convo_base_scenario import ContactCenterConversationScenario
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
from .scenario import Instance, Reference, TEST_SPLIT, CORRECT_TAG, Input, Output


class ContactCenterConversationSummarizationScenario(ContactCenterConversationScenario):
"""
Scenario for contact center conversational summarization.
"""

name = "cc_convo_summarization"
Collaborator comment:

Pick a consistent name that is used both here and as the filename. I'd prefer both to be contact_center_conversation_summarization, but I'd also be fine with both being cc_convo_summarization.

Likewise with the other file.

description = "Scenario for contact centern summarization tasks"
tags = ["cc_conversation_summarization"]

def __init__(
self,
dataset_path: str,
sampling_min_length: Optional[int] = None,
sampling_max_length: Optional[int] = None,
doc_max_length: Optional[int] = None,
Comment on lines +49 to +51 (Collaborator):

suggestion: names like
doc_filter_min_length, doc_filter_max_length, doc_truncation_max_length
or
filter_doc_min_length, filter_doc_max_length, truncate_doc_max_length
or
doc_filter_min_words, doc_filter_max_words, doc_truncation_max_words
would be more self-evident.

):
"""
Initializes summarization scenario.
Args:
dataset_path: path of dataset to load from
sampling_min_length: Int indicating minimum length for training
documents. Training examples smaller than
sampling_min_length will be filtered out.
Useful for preventing the adapter from sampling
really small documents.
sampling_max_length: Int indicating maximum length for training
Collaborator comment:

change comments to indicate that "length" is the number of words, not characters (or change the variable names to reflect this)

documents. Training examples larger than
sampling_max_length will be filtered out.
Useful for preventing the adapter from
sampling really large documents.
doc_max_length: Int indicating the maximum length to truncate
documents. Documents in all splits will be
truncated to doc_max_length tokens.
NOTE: Currently uses whitespace tokenization.
"""
super().__init__()
self.dataset_path = dataset_path
self.sampling_min_length = sampling_min_length
self.sampling_max_length = sampling_max_length
self.doc_max_length = doc_max_length
Collaborator comment:

This doesn't seem to be used. Are you missing the truncation logic? Alternatively, just delete this parameter.
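
A sketch of the truncation step that the doc_max_length docstring describes, using the whitespace tokenization it mentions; this helper method is hypothetical and not part of the PR:

    def _truncate_document(self, text: str) -> str:
        """Truncate a conversation to at most doc_max_length whitespace-separated tokens."""
        if self.doc_max_length is None:
            return text
        words = text.split()
        return text if len(words) <= self.doc_max_length else ' '.join(words[:self.doc_max_length])

get_instances below could then pass full_conversation_text through this helper before building the Input.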


    def _filter(self, convo: str, summary: str) -> bool:
        """Return True if the conversation should be filtered out, i.e. it has 10 or fewer turns."""
        convo_len = len(convo.split('\n'))
        return convo_len <= 10

    def _load_summaries(self, dataset_path):
        """Load reference summaries from the given path, keyed by conversation_name."""
        with open(dataset_path, 'r', encoding='utf-8') as f:
            summaries_list = json.load(f)
        summaries = {item['conversation_name']: item['summary'] for item in summaries_list}
        return summaries
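
For reference, _load_summaries expects summaries.json to be a list of objects pairing conversation_name with summary. A minimal example of what json.load would return, with values taken from the docstring samples above (illustrative):

example_summaries = [
    {
        "conversation_name": "cresta-helm-cc-2018-01-12T19:31:31.404000",
        "summary": "- agent helped visitor with account number 123456",
    }
]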

def get_instances(self) -> List[Instance]:
conversation_path = os.path.join(self.dataset_path, "conversations.json")
summary_path = os.path.join(self.dataset_path, "summaries.json")
conversations = self._load_conversations(conversation_path)
summaries = self._load_summaries(summary_path)


instances: List[Instance] = []

for example in conversations:
conversation_name = example['conversation_name']
full_conversation_text = '\n'.join(f"{item['speaker_role']}:{item['text']}" for item in example['body'])
summary = summaries[conversation_name]

            # NOTE: counts whitespace-separated words; better tokenization could be used here
conversation_len = len(full_conversation_text.split())
if self.sampling_max_length and conversation_len > self.sampling_max_length:
continue
if self.sampling_min_length and conversation_len < self.sampling_min_length:
continue

if self._filter(full_conversation_text, summary):
continue

# always load TEST split as we don't offer train data
instances.append(
Instance(
input=Input(text=full_conversation_text),
references=[Reference(Output(text=summary), tags=[CORRECT_TAG])],
split=TEST_SPLIT,
)
)

return instances
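
A sketch of how the scenario might be exercised directly, assuming a local directory that contains conversations.json and summaries.json in the formats shown above; the path and word-count limits are illustrative:

scenario = ContactCenterConversationSummarizationScenario(
    dataset_path='/path/to/contact_center_data',  # illustrative path
    sampling_min_length=50,
    sampling_max_length=1500,
)
instances = scenario.get_instances()
for instance in instances[:3]:
    print(instance.input.text[:200])
    print('Reference summary:', instance.references[0].output.text)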