Commit: contextual proximity
rahulnyk committed Nov 5, 2023
1 parent ed38728 commit cfb191c
Showing 32 changed files with 5,982 additions and 2,133 deletions.
Binary file added assets/INDIA NON JUDICIAL.pdf
790 changes: 552 additions & 238 deletions concept_graph.ipynb

Binary file removed data_input/AI in Health and Medicine.pdf
Binary file removed data_input/GlobalPublicHealth2022.pdf
Binary file removed data_input/GuideToHealthReform_FULL_18August2023.pdf
Binary file removed data_input/OrfPathHealth/BridgingTheDivide.pdf
Binary file removed data_input/OrfPathHealth/GlobalVaccineLibrary.pdf
Binary file removed data_input/WHO-HIS-SDS-2018.15-eng.pdf
Binary file added data_input/cureus-0015-00000040274.pdf
79 changes: 79 additions & 0 deletions data_input/cureus/cureus-0015-00000040274.txt

Binary file removed data_input/orf-path_health-ttemp.pdf
92 changes: 92 additions & 0 deletions data_output/MediumArticles/chunks.csv

360 changes: 360 additions & 0 deletions data_output/MediumArticles/concepts.csv

869 changes: 485 additions & 384 deletions data_output/OrfPathHealth/chunks.csv

1,221 changes: 1,221 additions & 0 deletions data_output/OrfPathHealth/graph.csv

28 changes: 28 additions & 0 deletions data_output/cureus/chunks.csv

156 changes: 156 additions & 0 deletions data_output/cureus/graph.csv

931 changes: 931 additions & 0 deletions docs/graph.html

1,810 changes: 314 additions & 1,496 deletions docs/index.html

1,145 changes: 1,145 additions & 0 deletions extract_graph.ipynb

38 changes: 36 additions & 2 deletions helpers/df_helpers.py
@@ -2,6 +2,8 @@
 import pandas as pd
 import numpy as np
 from .prompts import extractConcepts
+from .prompts import graphPrompt
 
+
 def documents2Dataframe(documents) -> pd.DataFrame:
     rows = []
@@ -16,10 +18,14 @@ def documents2Dataframe(documents) -> pd.DataFrame:
     df = pd.DataFrame(rows)
     return df
 
+
 def df2ConceptsList(dataframe: pd.DataFrame) -> list:
     # dataframe.reset_index(inplace=True)
     results = dataframe.apply(
-        lambda row: extractConcepts(row.text, {"chunk_id": row.chunk_id, "type": "concept"}), axis=1
+        lambda row: extractConcepts(
+            row.text, {"chunk_id": row.chunk_id, "type": "concept"}
+        ),
+        axis=1,
     )
     # invalid json results in NaN
     results = results.dropna()
@@ -29,9 +35,37 @@ def df2ConceptsList(dataframe: pd.DataFrame) -> list:
     concept_list = np.concatenate(results).ravel().tolist()
     return concept_list
 
+
 def concepts2Df(concepts_list) -> pd.DataFrame:
     ## Remove all NaN entities
     concepts_dataframe = pd.DataFrame(concepts_list).replace(" ", np.nan)
     concepts_dataframe = concepts_dataframe.dropna(subset=["entity"])
+    concepts_dataframe["entity"] = concepts_dataframe["entity"].apply(
+        lambda x: x.lower()
+    )
+
+    return concepts_dataframe
+
+
+def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
+    # dataframe.reset_index(inplace=True)
+    results = dataframe.apply(
+        lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}, model), axis=1
+    )
+    # invalid json results in NaN
+    results = results.dropna()
+    results = results.reset_index(drop=True)
+
+    ## Flatten the list of lists to one single list of entities.
+    concept_list = np.concatenate(results).ravel().tolist()
+    return concept_list
+
+
+def graph2Df(nodes_list) -> pd.DataFrame:
+    ## Remove all NaN entities
+    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)
+    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
+    graph_dataframe["node_1"] = graph_dataframe["node_1"].apply(lambda x: x.lower())
+    graph_dataframe["node_2"] = graph_dataframe["node_2"].apply(lambda x: x.lower())
 
-    return concepts_dataframe
+    return graph_dataframe
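
To see how the new helpers in helpers/df_helpers.py fit together, here is a minimal usage sketch. It is not code from this commit: the sample chunks and the output path are invented, and it assumes the repository root is on the Python path with an Ollama server serving mistral-openorca locally. df2Graph only relies on a dataframe having the text and chunk_id columns used above.

import uuid

import pandas as pd

from helpers.df_helpers import df2Graph, graph2Df

# Hypothetical chunks; in the repo they would come from documents2Dataframe.
chunks = pd.DataFrame(
    [
        {"text": "Oxytocin is produced in the hypothalamus.", "chunk_id": uuid.uuid4().hex},
        {"text": "The hypothalamus regulates the pituitary gland.", "chunk_id": uuid.uuid4().hex},
    ]
)

# Each row yields a list of {node_1, node_2, edge} dicts tagged with its chunk_id.
graph_list = df2Graph(chunks, model="mistral-openorca:latest")
graph_df = graph2Df(graph_list)  # drop rows with missing nodes, lowercase node names
graph_df.to_csv("data_output/graph.csv", index=False)  # assumed output location
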
55 changes: 48 additions & 7 deletions helpers/prompts.py
@@ -1,32 +1,73 @@
 import sys
-sys.path.append('..')
+from yachalk import chalk
+sys.path.append("..")
 
 import json
 import ollama.client as client
 
-def extractConcepts(prompt: str, metadata = {}, model="mistral-openorca:latest"):
+
+def extractConcepts(prompt: str, metadata={}, model="mistral-openorca:latest"):
     SYS_PROMPT = (
-        "Your task is extract the key concepts (and non personal entities) mentioned in the given text. "
+        "Your task is extract the key concepts (and non personal entities) mentioned in the given context. "
         "Extract only the most important and atomistic concepts, if needed break the concepts down to the simpler concepts."
         "Categorize the concepts in one of the following categories: "
         "[event, concept, place, object, document, organisation, condition, misc]\n"
         "Format your output as a list of json with the following format:\n"
         "[\n"
         " {\n"
-        ' "importance": The contextual importance of the concept on a scale of 1 to 5 (5 being the highest),\n'
         ' "entity": The Concept,\n'
+        ' "importance": The concontextual importance of the concept on a scale of 1 to 5 (5 being the highest),\n'
         ' "category": The Type of Concept,\n'
         " }, \n"
         "{ }, \n"
         "]\n"
     )
-    response, _ = client.generate(
-        model_name=model, system=SYS_PROMPT, prompt=prompt
+    response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=prompt)
+    try:
+        result = json.loads(response)
+        result = [dict(item, **metadata) for item in result]
+    except:
+        print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
+        result = None
+    return result
+
+
+def graphPrompt(input: str, metadata={}, model="mistral-openorca:latest"):
+    if model == None:
+        model = "mistral-openorca:latest"
+
+    # model_info = client.show(model_name=model)
+    # print( chalk.blue(model_info))
+
+    SYS_PROMPT = (
+        "You are a network graph maker who extracts terms and their relations from a given context"
+        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
+        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
+        "Thought 1: While travelling through every sentence, Think about the key terms mentioned in it.\n"
+        "\tTerms may include object, entity, location, organization, person, \n"
+        "\tcondition, acronym, documents, service, concept, etc.\n"
+        "\tTerms should be as atomistic as possible\n\n"
+        "Thought 3: Think about how these terms can have one on one relation with other terms.\n"
+        "\tTerms that are mentioned in same sentence or the same paragraph are typically related to each other.\n"
+        "\tTerms can be related to many other terms\n\n"
+        "Thought 4: Find out the relation between each such related pair of terms. \n\n"
+        "Format your output as a list of json. Each element of the list contains a pair of terms"
+        "and the relation between them, like the follwing: \n"
+        "[\n"
+        " {\n"
+        ' "node_1": "A concept from extracted ontology",\n'
+        ' "node_2": "A related concept from extracted ontology",\n'
+        ' "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
+        " }, {...}\n"
+        "]"
     )
+
+    USER_PROMPT = f"context: ```{input}``` \n\n output: "
+    response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=USER_PROMPT)
     try:
         result = json.loads(response)
         result = [dict(item, **metadata) for item in result]
     except:
         print("\n\nERROR ### Here is the buggy response: ", response, "\n\n")
         result = None
-    return result
+    return result
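
The commit title, the new graph.csv outputs, and the chunk_id that graphPrompt attaches to every extracted pair all point at the same idea: besides the LLM-extracted relations, terms that co-occur in the same chunk can be joined by a "contextual proximity" edge. The actual implementation sits in the updated notebooks, which are not rendered on this page; the function below is only a plausible pandas sketch of that step, with the "seen in more than one chunk" cut-off as an assumed threshold.

import pandas as pd


def contextual_proximity(graph_df: pd.DataFrame) -> pd.DataFrame:
    # Melt node_1/node_2 into one long table of (chunk_id, node) records.
    long_df = pd.melt(
        graph_df,
        id_vars=["chunk_id"],
        value_vars=["node_1", "node_2"],
        value_name="node",
    ).drop(columns=["variable"])

    # Self-join on chunk_id: every pair of terms sharing a chunk becomes a candidate edge.
    # Each unordered pair appears twice (once per direction), which is harmless for an undirected graph.
    pairs = pd.merge(long_df, long_df, on="chunk_id", suffixes=("_1", "_2"))
    pairs = pairs[pairs["node_1"] != pairs["node_2"]]

    # Weight each pair by the number of chunks it shares; keep pairs seen more than once (assumed threshold).
    edges = (
        pairs.groupby(["node_1", "node_2"], as_index=False)["chunk_id"]
        .count()
        .rename(columns={"chunk_id": "count"})
    )
    edges = edges[edges["count"] > 1]
    edges["edge"] = "contextual proximity"
    return edges

These proximity edges would then be concatenated with the LLM-extracted edges before the graph is visualised, presumably feeding the rebuilt docs/graph.html and docs/index.html pages in this commit.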
0 comments on commit cfb191c