From 60cc76e63a59d7a06e5e9f09e457446b42ac3572 Mon Sep 17 00:00:00 2001
From: framsouza
Date: Mon, 21 Oct 2024 11:47:26 +0200
Subject: [PATCH] Adding github assistant code (#340)

* Adding github assistant code
* fmt
* formatting evaluation.py
* lint
* formatting, second attempt
* removing extra spaces
* formatting index.py
* formatting file_summary.append
* formatting parse_document function
* third attempt at fixing index.py
* lint 4x
* removing extra lines
* Adding evaluation-result.txt
* adding README.md
---
 .../github-assistant/README.md              |  28 +++
 .../github-assistant/evaluation-result.txt  |  90 ++++++++
 .../github-assistant/evaluation.py          | 197 +++++++++++++++++
 .../github-assistant/index.py               | 201 ++++++++++++++++++
 .../github-assistant/query.py               |  53 +++++
 .../github-assistant/requirements.txt       |  11 +
 6 files changed, 580 insertions(+)
 create mode 100644 supporting-blog-content/github-assistant/README.md
 create mode 100644 supporting-blog-content/github-assistant/evaluation-result.txt
 create mode 100644 supporting-blog-content/github-assistant/evaluation.py
 create mode 100644 supporting-blog-content/github-assistant/index.py
 create mode 100644 supporting-blog-content/github-assistant/query.py
 create mode 100644 supporting-blog-content/github-assistant/requirements.txt

diff --git a/supporting-blog-content/github-assistant/README.md b/supporting-blog-content/github-assistant/README.md
new file mode 100644
index 00000000..f1dc2643
--- /dev/null
+++ b/supporting-blog-content/github-assistant/README.md
@@ -0,0 +1,28 @@
+# GitHub Assistant
+
+Easily ask questions about your GitHub repository using RAG and Elasticsearch as a vector database.
+
+### How to use this code
+
+1. Install the required libraries:
+
+```bash
+pip install -r requirements.txt
+```
+
+2. Set up the environment variables:
+`GITHUB_TOKEN`, `GITHUB_OWNER`, `GITHUB_REPO`, `GITHUB_BRANCH`, `ELASTIC_CLOUD_ID`, `ELASTIC_USER`, `ELASTIC_PASSWORD`, `ELASTIC_INDEX`, `OPENAI_API_KEY`
+
+3. Index your data and create the embeddings by running:
+
+```bash
+python index.py
+```
+
+An Elasticsearch index will be generated, housing the embeddings. You can then connect to your ESS deployment and run a search query against the index; you will see a new field named `embeddings`.
+
+4. Ask questions about your codebase by running:
+
+```bash
+python query.py
+```
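+
+### Verify the embeddings (optional)
+
+A minimal sketch for sanity-checking the ingestion, assuming the `elasticsearch` 8.x Python client is installed and the same environment variables are set. The `embeddings` field name follows the note in step 3:
+
+```python
+import os
+
+from elasticsearch import Elasticsearch
+
+es = Elasticsearch(
+    cloud_id=os.environ["ELASTIC_CLOUD_ID"],
+    basic_auth=(os.environ["ELASTIC_USER"], os.environ["ELASTIC_PASSWORD"]),
+)
+
+hit = es.search(index=os.environ["ELASTIC_INDEX"], size=1)["hits"]["hits"][0]
+print(sorted(hit["_source"].keys()))      # expect an "embeddings" field
+print(len(hit["_source"]["embeddings"]))  # 3072 dimensions for text-embedding-3-large
+```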
\ No newline at end of file
diff --git a/supporting-blog-content/github-assistant/evaluation-result.txt b/supporting-blog-content/github-assistant/evaluation-result.txt
new file mode 100644
index 00000000..d285c330
--- /dev/null
+++ b/supporting-blog-content/github-assistant/evaluation-result.txt
@@ -0,0 +1,90 @@
+```
+Number of documents loaded: 5
+
+All available questions generated:
+0. What is the purpose of chunking monitors in the updated push command as mentioned in the changelog?
+1. How does the changelog describe the improvement made to the performance of the push command?
+2. What new feature is added to the synthetics project when it is created via the `init` command?
+3. According to the changelog, what is the file size of the CHANGELOG.md document?
+4. On what date was the CHANGELOG.md file last modified?
+5. What is the significance of the example lightweight monitor yaml file mentioned in the changelog?
+6. How might the changes described in the changelog impact the workflow of users creating or updating monitors?
+7. What is the file path where the CHANGELOG.md document is located?
+8. Can you identify the issue numbers associated with the changes mentioned in the changelog?
+9. What is the creation date of the CHANGELOG.md file as per the context information?
+10. What type of file is the document described in the context information?
+11. On what date was the CHANGELOG.md file last modified?
+12. What is the file size of the CHANGELOG.md document?
+13. Identify one of the bug fixes mentioned in the CHANGELOG.md file.
+14. What command is referenced in the context of creating new synthetics projects?
+15. How does the CHANGELOG.md file address the issue of varying NDJSON chunked response sizes?
+16. What is the significance of the number #680 in the context of the document?
+17. What problem is addressed by skipping the addition of empty values for locations?
+18. How many bug fixes are explicitly mentioned in the provided context?
+19. What is the file path of the CHANGELOG.md document?
+20. What is the file path of the document being referenced in the context information?
+...
+
+Generated questions:
+1. What command is referenced in relation to the bug fix in the CHANGELOG.md?
+2. On what date was the CHANGELOG.md file created?
+3. What is the primary purpose of the document based on the context provided?
+
+Total number of questions generated: 3
+
+Processing Question 1 of 3:
+
+Evaluation Result:
++---------------------------------------------------+-------------------------------------------------+----------------------------------------------------+----------------------+----------------------+-------------------+------------------+------------------+
+| Query                                             | Response                                        | Source                                             | Relevancy Response   | Relevancy Feedback   | Relevancy Score   | Faith Response   | Faith Feedback   |
++===================================================+=================================================+====================================================+======================+======================+===================+==================+==================+
+| What command is referenced in relation to the bug | The `init` command is referenced in relation to | Bug Fixes                                          | Pass                 | YES                  | 1                 | Pass             | YES              |
+| fix in the CHANGELOG.md?                          | the bug fix in the CHANGELOG.md.                |                                                    |                      |                      |                   |                  |                  |
+|                                                   |                                                 |                                                    |                      |                      |                   |                  |                  |
+|                                                   |                                                 | - Pick the correct loader when bundling TypeScript |                      |                      |                   |                  |                  |
+|                                                   |                                                 | or JavaScript journey files                        |                      |                      |                   |                  |                  |
+|                                                   |                                                 |                                                    |                      |                      |                   |                  |                  |
+|                                                   |                                                 | during push command #626                           |                      |                      |                   |                  |                  |
++---------------------------------------------------+-------------------------------------------------+----------------------------------------------------+----------------------+----------------------+-------------------+------------------+------------------+
+
+Processing Question 2 of 3:
+
+Evaluation Result:
++-------------------------------------------------+------------------------------------------------+------------------------------+----------------------+----------------------+-------------------+------------------+------------------+
+| Query                                           | Response                                       | Source                       | Relevancy Response   | Relevancy Feedback   | Relevancy Score   | Faith Response   | Faith Feedback   |
++=================================================+================================================+==============================+======================+======================+===================+==================+==================+
+| On what date was the CHANGELOG.md file created? | The date mentioned in the CHANGELOG.md file is | v1.0.0-beta-38 (20222-11-02) | Pass                 | YES                  | 1                 | Pass             | YES              |
+|                                                 | November 2, 2022.                              |                              |                      |                      |                   |                  |                  |
++-------------------------------------------------+------------------------------------------------+------------------------------+----------------------+----------------------+-------------------+------------------+------------------+
+
+Processing Question 3 of 3:
+
+Evaluation Result:
++---------------------------------------------------+---------------------------------------------------+------------------------------+----------------------+----------------------+-------------------+------------------+------------------+
+| Query                                             | Response                                          | Source                       | Relevancy Response   | Relevancy Feedback   | Relevancy Score   | Faith Response   | Faith Feedback   |
++===================================================+===================================================+==============================+======================+======================+===================+==================+==================+
+| What is the primary purpose of the document based | The primary purpose of the document is to provide | v1.0.0-beta-38 (20222-11-02) | Pass                 | YES                  | 1                 | Pass             | YES              |
+| on the context provided?                          | a changelog detailing the features and            |                              |                      |                      |                   |                  |                  |
+|                                                   | improvements made in version 1.0.0-beta-38 of a   |                              |                      |                      |                   |                  |                  |
+|                                                   | software project. It highlights specific          |                              |                      |                      |                   |                  |                  |
+|                                                   | enhancements such as improved validation for      |                              |                      |                      |                   |                  |                  |
+|                                                   | monitor schedules and an enhanced push command    |                              |                      |                      |                   |                  |                  |
+|                                                   | experience.                                       |                              |                      |                      |                   |                  |                  |
++---------------------------------------------------+---------------------------------------------------+------------------------------+----------------------+----------------------+-------------------+------------------+------------------+
+```
\ No newline at end of file
diff --git a/supporting-blog-content/github-assistant/evaluation.py b/supporting-blog-content/github-assistant/evaluation.py
new file mode 100644
index 00000000..dd1bd8a6
--- /dev/null
+++ b/supporting-blog-content/github-assistant/evaluation.py
@@ -0,0 +1,197 @@
+import logging
+import sys
+import pandas as pd
+from dotenv import load_dotenv
+from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Response
+from llama_index.core.evaluation import (
+    DatasetGenerator,
+    RelevancyEvaluator,
+    FaithfulnessEvaluator,
+    EvaluationResult,
+)
+from llama_index.llms.openai import OpenAI
+from tabulate import tabulate
+import textwrap
+import argparse
+import traceback
+from httpx import ReadTimeout
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
+
+parser = argparse.ArgumentParser(
+    description="Process documents and questions for evaluation."
+)
+parser.add_argument(
+    "--num_documents",
+    type=int,
+    default=None,
+    help="Number of documents to process (default: all)",
+)
+parser.add_argument(
+    "--skip_documents",
+    type=int,
+    default=0,
+    help="Number of documents to skip at the beginning (default: 0)",
+)
+parser.add_argument(
+    "--num_questions",
+    type=int,
+    default=None,
+    help="Number of questions to process (default: all)",
+)
+parser.add_argument(
+    "--skip_questions",
+    type=int,
+    default=0,
+    help="Number of questions to skip at the beginning (default: 0)",
+)
+parser.add_argument(
+    "--process_last_questions",
+    action="store_true",
+    help="Process last N questions instead of first N",
+)
+args = parser.parse_args()
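+
+# Example invocations (using the flags defined above):
+#   python evaluation.py                                # all documents, all questions
+#   python evaluation.py --num_documents 5 --skip_documents 2
+#   python evaluation.py --num_questions 3 --process_last_questions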
+
+load_dotenv(".env")
+
+# Local checkout created by index.py (BASE_PATH/<owner>/<repo>).
+reader = SimpleDirectoryReader("/tmp/elastic/production-readiness-review")
+documents = reader.load_data()
+# Preview up to the first three documents without assuming how many were loaded.
+for i, doc in enumerate(documents[:3]):
+    print(f"Document {i + 1}: {doc.text}")
+
+if args.skip_documents > 0:
+    documents = documents[args.skip_documents :]
+
+if args.num_documents is not None:
+    documents = documents[: args.num_documents]
+
+print(f"Number of documents loaded: {len(documents)}")
+
+llm = OpenAI(model="gpt-4o", request_timeout=120)
+
+data_generator = DatasetGenerator.from_documents(documents, llm=llm)
+
+try:
+    eval_questions = data_generator.generate_questions_from_nodes()
+    if isinstance(eval_questions, str):
+        eval_questions_list = eval_questions.strip().split("\n")
+    else:
+        eval_questions_list = eval_questions
+    eval_questions_list = [q for q in eval_questions_list if q.strip()]
+
+    # Print the full question set before any skip/limit filtering is applied.
+    print("\nAll available questions generated:")
+    for idx, q in enumerate(eval_questions_list):
+        print(f"{idx}. {q}")
+
+    if args.skip_questions > 0:
+        eval_questions_list = eval_questions_list[args.skip_questions :]
+
+    if args.num_questions is not None:
+        if args.process_last_questions:
+            eval_questions_list = eval_questions_list[-args.num_questions :]
+        else:
+            eval_questions_list = eval_questions_list[: args.num_questions]
+
+    print("\nGenerated questions:")
+    for idx, q in enumerate(eval_questions_list, start=1):
+        print(f"{idx}. {q}")
+except ReadTimeout:
+    print(
+        "Request to OpenAI timed out during question generation. "
+        "Please check the server or increase the timeout duration."
+    )
+    traceback.print_exc()
+    sys.exit(1)
+except Exception as e:
+    print(f"An error occurred while generating questions: {e}")
+    traceback.print_exc()
+    sys.exit(1)
+
+print(f"\nTotal number of questions generated: {len(eval_questions_list)}")
+
+evaluator_relevancy = RelevancyEvaluator(llm=llm)
+evaluator_faith = FaithfulnessEvaluator(llm=llm)
+
+vector_index = VectorStoreIndex.from_documents(documents)
+
+
+def display_eval_df(
+    query: str,
+    response: Response,
+    eval_result_relevancy: EvaluationResult,
+    eval_result_faith: EvaluationResult,
+) -> None:
+    relevancy_feedback = getattr(eval_result_relevancy, "feedback", "")
+    relevancy_passing = getattr(eval_result_relevancy, "passing", False)
+    relevancy_passing_str = "Pass" if relevancy_passing else "Fail"
+
+    relevancy_score = 1.0 if relevancy_passing else 0.0
+
+    faithfulness_feedback = getattr(eval_result_faith, "feedback", "")
+    faithfulness_passing_bool = getattr(eval_result_faith, "passing", False)
+    faithfulness_passing = "Pass" if faithfulness_passing_bool else "Fail"
+
+    def wrap_text(text, width=50):
+        if text is None:
+            return ""
+        text = str(text)
+        text = text.replace("\r", "")
+        lines = text.split("\n")
+        wrapped_lines = []
+        for line in lines:
+            wrapped_lines.extend(textwrap.wrap(line, width=width))
+            wrapped_lines.append("")
+        return "\n".join(wrapped_lines)
+
+    if response.source_nodes:
+        source_content = wrap_text(response.source_nodes[0].node.get_content())
+    else:
+        source_content = ""
+
+    eval_data = {
+        "Query": wrap_text(query),
+        "Response": wrap_text(str(response)),
+        "Source": source_content,
+        "Relevancy Response": relevancy_passing_str,
+        "Relevancy Feedback": wrap_text(relevancy_feedback),
+        "Relevancy Score": wrap_text(str(relevancy_score)),
+        "Faith Response": faithfulness_passing,
+        "Faith Feedback": wrap_text(faithfulness_feedback),
+    }
+
+    eval_df = pd.DataFrame([eval_data])
+
+    print("\nEvaluation Result:")
+    print(
+        tabulate(
+            eval_df, headers="keys", tablefmt="grid", showindex=False, stralign="left"
+        )
+    )
+
+
+query_engine = vector_index.as_query_engine(llm=llm)
+
+total_questions = len(eval_questions_list)
+for idx, question in enumerate(eval_questions_list, start=1):
+    try:
+        response_vector = query_engine.query(question)
+        eval_result_relevancy = evaluator_relevancy.evaluate_response(
+            query=question, response=response_vector
+        )
+        eval_result_faith = evaluator_faith.evaluate_response(response=response_vector)
+
+        print(f"\nProcessing Question {idx} of {total_questions}:")
+        display_eval_df(
+            question, response_vector, eval_result_relevancy, eval_result_faith
+        )
+    except ReadTimeout:
+        print(f"Request to OpenAI timed out while processing question {idx}.")
+        traceback.print_exc()
+        continue
+    except Exception as e:
+        print(f"An error occurred while processing question {idx}: {e}")
+        traceback.print_exc()
+        continue
diff --git a/supporting-blog-content/github-assistant/index.py b/supporting-blog-content/github-assistant/index.py
new file mode 100644
index 00000000..20f5259c
--- /dev/null
+++ b/supporting-blog-content/github-assistant/index.py
@@ -0,0 +1,201 @@
+from llama_index.core import Settings, SimpleDirectoryReader
+from llama_index.core.node_parser import (
+    SentenceSplitter,
+    CodeSplitter,
+    JSONNodeParser,
+)
+from llama_index.vector_stores.elasticsearch import ElasticsearchStore
+from dotenv import load_dotenv
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.core.ingestion import IngestionPipeline
+from tree_sitter_languages import get_parser
+import nest_asyncio
+import elastic_transport
+import subprocess
+import shutil
+import time
+import glob
+import os
+
+
+nest_asyncio.apply()
+
+load_dotenv(".env")
+
+Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-large")
+Settings.chunk_lines = 1024
+Settings.chunk_size = 1024
+Settings.chunk_lines_overlap = 20
+Settings.max_chars = 1500
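+
+# A sketch of how these knobs map onto the parsers below (an assumption, based on
+# LlamaIndex 0.10.x): Settings.chunk_size is read by the default node parser, while
+# chunk_lines, chunk_lines_overlap and max_chars mirror CodeSplitter's constructor
+# arguments, so they can also be set per splitter:
+#
+#   CodeSplitter(language="python", parser=py_parser,
+#                chunk_lines=1024, chunk_lines_overlap=20, max_chars=1500)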
+
+
+def clone_repository(owner, repo, branch, base_path="/tmp"):
+    branch = branch or os.getenv("GITHUB_BRANCH")
+    if not branch:
+        raise ValueError(
+            "Branch is not provided and GITHUB_BRANCH environment variable is not set."
+        )
+
+    local_repo_path = os.path.join(base_path, owner, repo)
+    clone_url = f"https://github.com/{owner}/{repo}.git"
+
+    if os.path.exists(local_repo_path):
+        print(f"Repository already exists at {local_repo_path}. Skipping clone.")
+        return local_repo_path
+
+    attempts = 3
+
+    for attempt in range(attempts):
+        try:
+            os.makedirs(local_repo_path, exist_ok=True)
+            print(f"Attempting to clone repository... Attempt {attempt + 1}")
+            subprocess.run(
+                ["git", "clone", "-b", branch, clone_url, local_repo_path], check=True
+            )
+            print(f"Repository cloned into {local_repo_path}.")
+            return local_repo_path
+        except subprocess.CalledProcessError:
+            # Remove any partial clone so the next attempt starts from an empty directory.
+            shutil.rmtree(local_repo_path, ignore_errors=True)
+            if attempt < attempts - 1:
+                print(f"Attempt {attempt + 1} failed, retrying...")
+                time.sleep(10)
+            else:
+                raise Exception("Failed to clone repository after multiple attempts")
+
+
+def print_docs_and_nodes(docs, nodes):
+    print("\n=== Documents ===\n")
+    for doc in docs:
+        print(f"Document ID: {doc.doc_id}")
+        print(f"Document Content:\n{doc.text}\n\n---\n")
+
+    print("\n=== Nodes ===\n")
+    for node in nodes:
+        print(f"Node ID: {node.id_}")
+        print(f"Node Content:\n{node.text}\n\n---\n")
+
+
+def collect_and_print_file_summary(file_summary):
+    print("\n=== File Summary ===\n")
+    for summary in file_summary:
+        print(summary)
+
+
+def parse_documents():
+    owner = os.getenv("GITHUB_OWNER")
+    repo = os.getenv("GITHUB_REPO")
+    branch = os.getenv("GITHUB_BRANCH")
+    base_path = os.getenv("BASE_PATH", "/tmp")
+
+    if not owner or not repo:
+        raise ValueError(
+            "GITHUB_OWNER and GITHUB_REPO environment variables must be set."
+        )
+
+    local_repo_path = clone_repository(owner, repo, branch, base_path)
+
+    nodes = []
+    file_summary = []
+
+    ts_parser = get_parser("typescript")
+    py_parser = get_parser("python")
+    go_parser = get_parser("go")
+    js_parser = get_parser("javascript")
+    bash_parser = get_parser("bash")
+    yaml_parser = get_parser("yaml")
+
+    parsers_and_extensions = [
+        (SentenceSplitter(), [".md"]),
+        (CodeSplitter(language="python", parser=py_parser), [".py", ".ipynb"]),
+        (CodeSplitter(language="typescript", parser=ts_parser), [".ts"]),
+        (CodeSplitter(language="go", parser=go_parser), [".go"]),
+        (CodeSplitter(language="javascript", parser=js_parser), [".js"]),
+        (CodeSplitter(language="bash", parser=bash_parser), [".bash", ".sh"]),
+        (CodeSplitter(language="yaml", parser=yaml_parser), [".yaml", ".yml"]),
+        (JSONNodeParser(), [".json"]),
+    ]
+
+    for parser, extensions in parsers_and_extensions:
+        matching_files = []
+        for ext in extensions:
+            matching_files.extend(
+                glob.glob(f"{local_repo_path}/**/*{ext}", recursive=True)
+            )
+
+        if len(matching_files) > 0:
+            extension_list = ", ".join(extensions)
+            file_summary.append(
+                f"Found {len(matching_files)} {extension_list} files in the repository."
+            )
+
+            loader = SimpleDirectoryReader(
+                input_dir=local_repo_path, required_exts=extensions, recursive=True
+            )
+            docs = loader.load_data()
+            parsed_nodes = parser.get_nodes_from_documents(docs)
+
+            print_docs_and_nodes(docs, parsed_nodes)
+
+            nodes.extend(parsed_nodes)
+        else:
+            extension_list = ", ".join(extensions)
+            file_summary.append(f"No {extension_list} files found in the repository.")
+
+    collect_and_print_file_summary(file_summary)
+    print("\n")
+    return nodes
+
+
+def get_es_vector_store():
+    print("Initializing Elasticsearch store...")
+    es_cloud_id = os.getenv("ELASTIC_CLOUD_ID")
+    es_user = os.getenv("ELASTIC_USER")
+    es_password = os.getenv("ELASTIC_PASSWORD")
+    index_name = os.getenv("ELASTIC_INDEX")
+    retries = 20
+    for attempt in range(retries):
+        try:
+            es_vector_store = ElasticsearchStore(
+                index_name=index_name,
+                es_cloud_id=es_cloud_id,
+                es_user=es_user,
+                es_password=es_password,
+                batch_size=100,
+            )
+            print("Elasticsearch store initialized.")
+            return es_vector_store
+        except elastic_transport.ConnectionTimeout:
+            print(f"Connection attempt {attempt + 1}/{retries} timed out. Retrying...")
+            time.sleep(10)
+    raise Exception("Failed to initialize Elasticsearch store after multiple attempts")
+
+
+def main():
+    nodes = parse_documents()
+    es_vector_store = get_es_vector_store()
+
+    try:
+        pipeline = IngestionPipeline(
+            vector_store=es_vector_store,
+        )
+
+        pipeline.run(documents=nodes, show_progress=True)
+    finally:
+        if hasattr(es_vector_store, "close"):
+            es_vector_store.close()
+            print("Elasticsearch connection closed.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/supporting-blog-content/github-assistant/query.py b/supporting-blog-content/github-assistant/query.py
new file mode 100644
index 00000000..6926c862
--- /dev/null
+++ b/supporting-blog-content/github-assistant/query.py
@@ -0,0 +1,53 @@
+from llama_index.core import VectorStoreIndex, QueryBundle, Settings
+from llama_index.llms.openai import OpenAI
+from llama_index.embeddings.openai import OpenAIEmbedding
+from index import get_es_vector_store
+import httpx
+
+embed_model = OpenAIEmbedding(model="text-embedding-3-large")
+Settings.embed_model = embed_model
+
+
+def run_query_sync():
+    query = input("Please enter your query: ")
+
+    openai_llm = OpenAI(model="gpt-4o")
+
+    es_vector_store = get_es_vector_store()
+    index = VectorStoreIndex.from_vector_store(es_vector_store)
+
+    try:
+        query_engine = index.as_query_engine(
+            llm=openai_llm,
+            similarity_top_k=3,
+            streaming=False,
+            response_mode="tree_summarize",
+        )
+
+        bundle = QueryBundle(query, embedding=embed_model.get_query_embedding(query))
+
+        result = query_engine.query(bundle)
+        return result.response
+    except Exception as e:
+        print(f"An error occurred while running the query: {e}")
+        return None
+    finally:
+        if hasattr(openai_llm, "client") and isinstance(
+            openai_llm.client, httpx.Client
+        ):
+            openai_llm.client.close()
+        if hasattr(embed_model, "client") and isinstance(
+            embed_model.client, httpx.Client
+        ):
+            embed_model.client.close()
+        if hasattr(es_vector_store, "close"):
+            es_vector_store.close()
+        print("Elasticsearch connection closed.")
+
+
+if __name__ == "__main__":
+    try:
+        result = run_query_sync()
+        if result is not None:
+            print(result)
+    except Exception as e:
+        print(f"An error occurred: {e}")
diff --git a/supporting-blog-content/github-assistant/requirements.txt b/supporting-blog-content/github-assistant/requirements.txt
new file mode 100644
index 00000000..ec29a2e0
--- /dev/null
+++ b/supporting-blog-content/github-assistant/requirements.txt
@@ -0,0 +1,11 @@
+llama-index==0.10.37
+python-dotenv==1.0.1
+tree_sitter_languages==1.10.2
+tree-sitter==0.21.3
+llama-index-vector-stores-elasticsearch==0.3.2
+llama-index-embeddings-openai==0.2.5
+llama-index-llms-ollama==0.3.3
+spacy==3.8.2
+IPython==8.28.0
+tabulate==0.9.0
+streamlit==1.39.0