forked from peterw/Chat-with-Github-Repo
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgithub.py
64 lines (51 loc) · 2.05 KB
/
github.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import subprocess
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake
from langchain.embeddings.openai import OpenAIEmbeddings
from dotenv import load_dotenv
import openai
import logging
# Module-level setup; it runs once at import time and the order matters.
# load_dotenv() comes first so that values from a .env file (if one exists)
# are already in the environment when the key is read on the next line.
load_dotenv()
# os.environ.get returns None when OPENAI_API_KEY is unset, so the key may be
# None here; nothing in this file validates it.
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
def clone_repository(repo_url, local_path):
    """Clone ``repo_url`` into ``local_path`` using the ``git`` executable.

    Args:
        repo_url: Location of the repository handed to ``git clone``.
        local_path: Destination directory handed to ``git clone``.

    Returns:
        ``True`` when ``git clone`` exits with status 0, ``False`` otherwise.

    Raises:
        FileNotFoundError: If no ``git`` executable can be found.
    """
    # The command is passed as an argument list (no shell), so the URL and
    # the path are never interpreted by a shell.
    completed = subprocess.run(["git", "clone", repo_url, local_path])
    if completed.returncode != 0:
        # The exit status used to be ignored, which let a failed clone go
        # unnoticed; report it and tell the caller instead.
        logging.error(
            f"git clone of {repo_url} into {local_path} failed "
            f"with exit code {completed.returncode}"
        )
        return False
    return True
def is_binary(file_path):
    """Report whether the file looks binary.

    Only the first 1024 bytes are inspected; the file counts as binary when
    that prefix contains at least one NUL byte.
    """
    with open(file_path, 'rb') as handle:
        head = handle.read(1024)
    return head.find(b'\0') != -1
def load_docs(root_dir):
    """Load every non-binary file below ``root_dir`` as document chunks.

    Each file that ``is_binary`` does not flag is read as UTF-8 through
    ``TextLoader`` and split with ``load_and_split``. The first chunk of
    every file gets a ``// <filename>`` line prepended to its
    ``page_content``.

    A file that cannot be probed or loaded is logged and skipped; it never
    aborts the walk.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A flat list with the chunks of all files that loaded successfully
        (empty when nothing could be loaded).
    """
    docs = []
    # NOTE: the walk applies no directory filter, so VCS metadata folders
    # (for example ``.git``) are visited like any other directory.
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                # The binary probe opens the file, so it belongs inside the
                # ``try``: an unreadable file or a dangling symlink is
                # logged and skipped instead of crashing the whole run.
                if is_binary(file_path):
                    continue
                loader = TextLoader(file_path, encoding='utf-8')
                doc_chunks = loader.load_and_split()
                # Prepend the filename to the first chunk
                if doc_chunks:
                    doc_chunks[0].page_content = f"// {filename}\n{doc_chunks[0].page_content}"
                docs.extend(doc_chunks)
            except Exception as e:
                # Log the full path: bare file names repeat across folders.
                logging.error(f"Error loading file {file_path}: {str(e)}")
    return docs
def split_docs(docs, *, chunk_size=1000, chunk_overlap=0):
    """Split documents into smaller pieces with ``CharacterTextSplitter``.

    Args:
        docs: Documents to split, as accepted by
            ``CharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 1000, the value that used to be hard-coded.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def main(repo_url, root_dir, deep_lake_path):
    """Load, split and store the files under ``root_dir`` in a DeepLake dataset.

    Args:
        repo_url: Accepted but not used by this function; nothing is cloned
            here.
        root_dir: Directory whose files are loaded through ``load_docs``.
        deep_lake_path: Passed to ``DeepLake`` as ``dataset_path``.
    """
    chunks = split_docs(load_docs(root_dir))
    # No delete is issued on the dataset before the new documents are added.
    store = DeepLake(
        dataset_path=deep_lake_path,
        embedding_function=OpenAIEmbeddings(),
    )
    store.add_documents(chunks)
if __name__ == "__main__":
    # Script entry point: all settings come from the environment.
    repo_url = os.environ.get('REPO_URL')
    # The source directory used to be fixed to one developer's machine.
    # REPO_ROOT_DIR now overrides it; the previous path remains the fallback
    # so existing setups keep working unchanged.
    root_dir = (
        os.environ.get('REPO_ROOT_DIR')
        or "/Users/afik_cohen/repos/ppl-ai/clean-android/"
    )
    deep_lake_path = os.environ.get('DEEPLAKE_DATASET_PATH')
    main(repo_url, root_dir, deep_lake_path)