feat: added langchains LLM (#66)
fixes #46 
fixes #53 

You can now use LangChain's LLM abstraction to access all the LLM
endpoints LangChain supports. For example:
```python
from langchain.chat_models import ChatOpenAI

gpt4 = ChatOpenAI(model_name="gpt-4")
# the model is a regular LangChain LLM, so you can also call it directly, e.g.:
# gpt4.generate_prompt(prompts=[...])  # takes a list of PromptValues

# init a new Metric with the llm
cr = ContextRelevancy(llm=gpt4)
cr.init_model()
result = cr.score(ds.select(range(4)))  # ds: a datasets.Dataset of eval samples

result["context_relavency"]
# [0.46687018871307373, 0.1532887363433838, 0.17359847468989234, 0.17340516530234237]
```

We're also now using OpenAI's chat models as the default, which brings a
roughly 10x decrease in cost, and `gpt-3.5-turbo-16k` as the default model
for an even bigger context window.
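
Concretely, leaving `llm` unset now gives you the cheaper chat model. A minimal sketch of the default path (assuming `ContextRelevancy` is importable from `ragas.metrics`; the fallback mirrors the `Metric.__post_init__` change in `src/ragas/metrics/base.py` below):

```python
from ragas.metrics import ContextRelevancy

# no llm passed: Metric.__post_init__ falls back to
# ChatOpenAI(model_name="gpt-3.5-turbo-16k")
cr = ContextRelevancy()
cr.init_model()
```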
jjmachan authored Jul 20, 2023
1 parent eefb0ca commit b5770f0
Showing 9 changed files with 208 additions and 158 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
```diff
@@ -38,7 +38,6 @@ jobs:
       ragas:
         - "src/ragas/**"
         - "tests/**"
-        - "examples/**"
       docs:
         - *related
         - requirements/docs-requirements.txt
@@ -52,7 +51,7 @@ jobs:
     fail-fast: false
     matrix:
       os: [ubuntu-latest, macos-latest, windows-latest]
-      python-version: ["3.7", "3.8", "3.9", "3.10"]
+      python-version: ["3.8", "3.9", "3.10"]
 
     if: ${{ (github.event_name == 'pull_request' && needs.diff.outputs.ragas == 'true') || github.event_name == 'push' }}
     name: python${{ matrix.python-version }}_unit_tests (${{ matrix.os }})
@@ -86,6 +85,7 @@ jobs:
           pip install "."
           pip install -r requirements/test.txt
       - name: Run unit tests
         run: |
           # OPTS=(--cov-config pyproject.toml --cov=src/bentoml --cov-append)
@@ -94,7 +94,7 @@ jobs:
             OPTS=(--dist loadfile -n auto)
           fi
           # Now run the unit tests
-          pytest tests/unit "${OPTS[@]}"
+          OPENAI_API_KEY="test" pytest tests/unit "${OPTS[@]}"
 
   codestyle_check:
     runs-on: ubuntu-latest
```
105 changes: 57 additions & 48 deletions experiments/assesments/metrics_assesments.ipynb
```diff
@@ -106,16 +106,17 @@
 "source": [
 "import os\n",
 "import openai\n",
+"\n",
 "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n",
 "\n",
 "completion = openai.ChatCompletion.create(\n",
-"    model=\"gpt-3.5-turbo\",\n",
-"    messages=[\n",
-"        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
-"    ]\n",
+"    model=\"gpt-3.5-turbo\",\n",
+"    messages=[\n",
+"        {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
+"    ],\n",
 ")\n",
 "\n",
-"print(completion.choices[0].message)\n"
+"print(completion.choices[0].message)"
 ]
 },
 {
@@ -125,11 +126,10 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"\n",
 "def llm2(prompt, **kwargs):\n",
 "    response = openai.ChatCompletion.create(\n",
-"        model=kwargs.get(\"model\",\"gpt-3.5-turbo-16k\"),\n",
-"        messages=[{\"role\": \"system\", \"content\":prompt}],\n",
+"        model=kwargs.get(\"model\", \"gpt-3.5-turbo-16k\"),\n",
+"        messages=[{\"role\": \"system\", \"content\": prompt}],\n",
 "        temperature=kwargs.get(\"temperature\", 0),\n",
 "        top_p=kwargs.get(\"top_p\", 1),\n",
 "        frequency_penalty=kwargs.get(\"frequency_penalty\", 0.0),\n",
@@ -139,6 +139,7 @@
 "    )\n",
 "    return response\n",
 "\n",
+"\n",
 "def llm(prompt, **kwargs):\n",
 "    response = openai.Completion.create(\n",
 "        model=kwargs.get(\"model\", \"text-davinci-003\"),\n",
@@ -375,7 +376,7 @@
 }
 ],
 "source": [
-"llm2([Question_generation.format(2,answer)])"
+"llm2([Question_generation.format(2, answer)])"
 ]
 },
 {
@@ -1039,10 +1040,12 @@
 ],
 "source": [
 "def get_all_facts(item):\n",
-"    all_facts = item['context']['sentences']\n",
+"    all_facts = item[\"context\"][\"sentences\"]\n",
 "    all_facts = [sent for para in all_facts for sent in para]\n",
-"    return {\"full_context\":''.join(all_facts)}\n",
-"hotpot_qa = hotpot_qa.map(get_all_facts, batched=False) "
+"    return {\"full_context\": \"\".join(all_facts)}\n",
+"\n",
+"\n",
+"hotpot_qa = hotpot_qa.map(get_all_facts, batched=False)"
 ]
 },
 {
@@ -1090,8 +1093,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"i=15\n",
-"q,c = hotpot_qa[i]['question'],hotpot_qa[i]['full_context']"
+"i = 15\n",
+"q, c = hotpot_qa[i][\"question\"], hotpot_qa[i][\"full_context\"]"
 ]
 },
 {
@@ -1112,7 +1115,7 @@
 "outputs": [],
 "source": [
 "q = \"what is general relativity?\"\n",
-"n=2"
+"n = 2"
 ]
 },
 {
@@ -1123,20 +1126,21 @@
 "outputs": [],
 "source": [
 "import wikipediaapi\n",
+"\n",
 "wiki_wiki = wikipediaapi.Wikipedia(\n",
-"    language='en',\n",
-"    extract_format=wikipediaapi.ExtractFormat.WIKI\n",
+"    language=\"en\", extract_format=wikipediaapi.ExtractFormat.WIKI\n",
 ")\n",
 "\n",
 "p_wiki = wiki_wiki.page(\"Black hole\")\n",
 "\n",
+"\n",
 "def get_page_section(page, section):\n",
 "    all_text = \"\"\n",
 "    p_wiki = wiki_wiki.page(page)\n",
 "    sections = p_wiki.sections_by_title(section)\n",
 "    for s in sections:\n",
 "        all_text += s.full_text()\n",
-"    return all_text\n"
+"    return all_text"
 ]
 },
 {
@@ -1152,48 +1156,42 @@
 "\n",
 "cross_encoder = CrossEncoder(\"cross-encoder/stsb-TinyBERT-L-4\")\n",
 "\n",
-"    \n",
 "\n",
 "def sent_tokenize(sent):\n",
-"    return [s[:-1] if s.endswith('.') else s for s in sent.strip().split('. ')]\n",
+"    return [s[:-1] if s.endswith(\".\") else s for s in sent.strip().split(\". \")]\n",
 "\n",
 "\n",
 "class SentenceAgreement:\n",
-"    \n",
 "    def __init__(self, scoring=\"bert_score\"):\n",
-"        \n",
 "        self.scoring = scoring\n",
 "\n",
-"    \n",
 "    @staticmethod\n",
 "    def bert_score(para1, para2):\n",
-"        \n",
 "        sentences1, sentences2 = sent_tokenize(para1), sent_tokenize(para2)\n",
 "        scores = cross_encoder.predict(list(itertools.product(sentences1, sentences2)))\n",
 "        scores = scores.reshape(len(sentences1), len(sentences2))\n",
 "        return scores.max(axis=1).mean()\n",
 "\n",
 "    @staticmethod\n",
 "    def jaccard_score(para1, para2):\n",
-"        \n",
 "        sentences1, sentences2 = sent_tokenize(para1), sent_tokenize(para2)\n",
 "        intersect = len(np.intersect1d(sentences1, sentences2))\n",
 "        union = len(np.union1d(sentences1, sentences2))\n",
-"        return intersect/union\n",
-"    \n",
-"    def evaluate(self,answers:List[List[str]]):\n",
-"        \n",
+"        return intersect / union\n",
+"\n",
+"    def evaluate(self, answers: List[List[str]]):\n",
 "        \"\"\"\n",
 "        eval nC2 combinations\n",
 "        \"\"\"\n",
 "        scores = []\n",
-"        groups = combinations(answers,2)\n",
+"        groups = combinations(answers, 2)\n",
 "        for group in groups:\n",
 "            if self.scoring == \"jaccard\":\n",
 "                score = self.jaccard_score(*group)\n",
 "            elif self.scoring == \"bert_score\":\n",
 "                score = self.bert_score(*group)\n",
 "            scores.append(score)\n",
-"        return np.mean(scores)\n",
-"    "
+"        return np.mean(scores)"
 ]
 },
 {
@@ -1204,26 +1202,30 @@
 "outputs": [],
 "source": [
 "class ContextRelevacy:\n",
-"    \n",
-"    def __init__(self, strictness = 2, agreement_metric=\"bert_score\"):\n",
-"        \n",
+"    def __init__(self, strictness=2, agreement_metric=\"bert_score\"):\n",
 "        self.strictness = strictness\n",
 "        self.sent_agreement = SentenceAgreement(agreement_metric)\n",
-"    \n",
-"    def score(self,question,context):\n",
+"\n",
+"    def score(self, question, context):\n",
 "        scores = []\n",
-"        outputs = llm(Context_relevency.format(q,c),n=self.strictness,temperature=1)\n",
-"        outputs = [outputs['choices'][i]['text'].strip() for i in range(self.strictness)]\n",
+"        outputs = llm(Context_relevency.format(q, c), n=self.strictness, temperature=1)\n",
+"        outputs = [\n",
+"            outputs[\"choices\"][i][\"text\"].strip() for i in range(self.strictness)\n",
+"        ]\n",
 "        context_sents = sent_tokenize(context)\n",
 "        for output in outputs:\n",
-"            indices = [context.find(sent) for sent in sent_tokenize(output) if context.find(sent)!=-1]\n",
-"            scores.append(len(indices)/len(context_sents))\n",
-"        \n",
+"            indices = [\n",
+"                context.find(sent)\n",
+"                for sent in sent_tokenize(output)\n",
+"                if context.find(sent) != -1\n",
+"            ]\n",
+"            scores.append(len(indices) / len(context_sents))\n",
+"\n",
 "        if self.strictness > 1:\n",
 "            agr_score = self.sent_agreement.evaluate(outputs)\n",
 "        else:\n",
-"            agr_score =1 \n",
-"        return agr_score * np.mean(scores)\n"
+"            agr_score = 1\n",
+"        return agr_score * np.mean(scores)"
 ]
 },
 {
@@ -1234,7 +1236,7 @@
 "outputs": [],
 "source": [
 "c = get_page_section(\"HIV/AIDS\", \"Prevention\")\n",
-"c = ' '.join(c.split(' ')[:500])\n",
+"c = \" \".join(c.split(\" \")[:500])\n",
 "q = \"When was the first HIV case detected?\""
 ]
 },
@@ -1245,7 +1247,14 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"output = llm([Context_relevency.format(q,c), Context_relevency.format(\"How to prevent AIDS?\",c)],n=n,temperature=1)"
+"output = llm(\n",
+"    [\n",
+"        Context_relevency.format(q, c),\n",
+"        Context_relevency.format(\"How to prevent AIDS?\", c),\n",
+"    ],\n",
+"    n=n,\n",
+"    temperature=1,\n",
+")"
 ]
 },
 {
@@ -1397,7 +1406,7 @@
 }
 ],
 "source": [
-"context_relevancy.score(dataset[\"baseline\"].select(range(0,3)))"
+"context_relevancy.score(dataset[\"baseline\"].select(range(0, 3)))"
 ]
 },
 {
@@ -1491,7 +1500,7 @@
 }
 ],
 "source": [
-"context_relevancy.score(dataset[\"baseline\"].select(range(0,3)))"
+"context_relevancy.score(dataset[\"baseline\"].select(range(0, 3)))"
 ]
 },
 {
```
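
To make the notebook's scoring rule explicit: each sampled output contributes (matched context sentences ÷ total context sentences), and the mean of those fractions is damped by the inter-sample agreement score. A toy check of that arithmetic (numbers invented):

```python
import numpy as np

# two sampled outputs against a 4-sentence context (invented values)
per_sample = [2 / 4, 3 / 4]  # sentence-overlap fraction per output
agreement = 0.8              # SentenceAgreement score across the outputs

score = agreement * np.mean(per_sample)
print(score)  # 0.5
```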
3 changes: 2 additions & 1 deletion pyproject.toml
```diff
@@ -6,8 +6,9 @@ dependencies = [
     "sentence-transformers",
     "datasets",
     "protobuf<=3.20.0",
-    "backoff",
+    "langchain>=0.0.218",
+    "openai",
     "pydantic<2.0"
 ]
 dynamic = ["version", "readme"]
```
39 changes: 39 additions & 0 deletions src/ragas/async_utils.py
```python
"""Async utils."""
import asyncio
from typing import Any, Coroutine, List


def run_async_tasks(
    tasks: List[Coroutine],
    show_progress: bool = False,
    progress_bar_desc: str = "Running async tasks",
) -> List[Any]:
    """Run a list of async tasks."""

    tasks_to_execute: List[Any] = tasks
    if show_progress:
        try:
            import nest_asyncio
            from tqdm.asyncio import tqdm

            # jupyter notebooks already have an event loop running
            # we need to reuse it instead of creating a new one
            nest_asyncio.apply()
            loop = asyncio.get_event_loop()

            async def _tqdm_gather() -> List[Any]:
                return await tqdm.gather(*tasks_to_execute, desc=progress_bar_desc)

            tqdm_outputs: List[Any] = loop.run_until_complete(_tqdm_gather())
            return tqdm_outputs
        # fall back to running without tqdm if anything fails;
        # tqdm.asyncio is not supported in some environments
        except Exception:
            pass

    async def _gather() -> List[Any]:
        return await asyncio.gather(*tasks_to_execute)

    outputs: List[Any] = asyncio.run(_gather())
    return outputs
```
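
A quick usage sketch for `run_async_tasks` (the coroutine here is hypothetical; only the function itself comes from this diff):

```python
import asyncio

from ragas.async_utils import run_async_tasks


async def fake_llm_call(i: int) -> str:
    # stand-in for a real async LLM request
    await asyncio.sleep(0.1)
    return f"response {i}"


# runs the coroutines concurrently; with show_progress=True it tries
# tqdm + nest_asyncio first and silently falls back to plain asyncio.run
results = run_async_tasks([fake_llm_call(i) for i in range(4)], show_progress=True)
print(results)  # ['response 0', 'response 1', 'response 2', 'response 3']
```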
19 changes: 11 additions & 8 deletions src/ragas/metrics/base.py
```diff
@@ -12,6 +12,8 @@
 from math import floor
 
 from datasets import Dataset
+from langchain.chat_models.base import BaseChatModel
+from langchain.llms.base import BaseLLM
 
 
 def make_batches(total_size: int, batch_size: int) -> list[range]:
@@ -31,17 +33,18 @@ def make_batches(total_size: int, batch_size: int) -> list[range]:
 
 @dataclass
 class Metric(ABC):
-    @property
-    @abstractmethod
-    def batch_size(self: t.Self) -> int:
-        ...
+    batch_size: int
+    llm: t.Optional[BaseLLM | BaseChatModel] = None
+
+    def __post_init__(self: t.Self):
+        if self.llm is None:
+            from langchain.chat_models import ChatOpenAI
+
+            self.llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k")  # type: ignore
 
     @property
     @abstractmethod
-    def name(self: t.Self) -> str:
-        """
-        the metric name
-        """
+    def name(self) -> str:
         ...
 
     @abstractmethod
```
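
Downstream, every metric dataclass now carries the optional `llm` field. A hedged sketch of how a subclass picks it up (the subclass is hypothetical, and the remaining abstract methods elided by this truncated diff still have to be implemented):

```python
from dataclasses import dataclass

from langchain.chat_models import ChatOpenAI

from ragas.metrics.base import Metric


@dataclass
class MyMetric(Metric):  # hypothetical; implement the other abstract methods too
    batch_size: int = 16

    @property
    def name(self) -> str:
        return "my_metric"


# default: self.llm becomes ChatOpenAI(model_name="gpt-3.5-turbo-16k")
m = MyMetric()
# or inject any LangChain LLM / chat model:
m = MyMetric(llm=ChatOpenAI(model_name="gpt-4"))
```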
(Diffs for the remaining 4 changed files are not shown.)