Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Document type and github loader #172

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions lib/document.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
defmodule LangChain.Document do
use Ecto.Schema
import Ecto.Changeset
require Logger
alias __MODULE__
alias LangChain.LangChainError

@type t :: %__MODULE__{}

@primary_key false
embedded_schema do
field :content, :string
field :metadata, :map
field :type, :string
end

@create_fields [:content, :metadata, :type]
@required_fields [:content, :type]

@doc """
Build a new document and return an `:ok`/`:error` tuple with the result.
"""
@spec new(attrs :: map()) :: {:ok, t()} | {:error, Ecto.Changeset.t()}
def new(attrs \\ %{}) do
%Document{}
|> cast(attrs, @create_fields)
|> common_validations()
|> apply_action(:insert)
end

@doc """
Build a new document and error out if the changeset is invalid
"""
@spec new!(attrs :: map()) :: t()
def new!(attrs \\ %{}) do
case new(attrs) do
{:ok, doc} ->
doc

{:error, changeset} ->
raise LangChainError, changeset
end
end

defp common_validations(changeset) do
changeset
|> validate_required(@required_fields)
end
end
5 changes: 5 additions & 0 deletions lib/document/loader.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
defmodule LangChain.Document.Loader do
alias LangChain.Document

@callback load(options :: map()) :: %Document{}
end
160 changes: 160 additions & 0 deletions lib/document/loaders/github.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
defmodule LangChain.Document.Loaders.Github do
@moduledoc """
Currently this module only supports grabbing issues.

Extending this to support other resources (like PRs, commits, etc) will require
a little more work, but at the moment I am not using those resources.
"""

@behaviour LangChain.Document.Loader

use Ecto.Schema
require Logger
import Ecto.Changeset
alias __MODULE__
alias LangChain.Config
alias LangChain.Document
alias LangChain.LangChainError

# allow up to 1 minute for response.
@receive_timeout 60_000

@primary_key false
embedded_schema do
# API endpoint to use. Defaults to Github's API
field :endpoint, :string, default: "https://api.github.com"
field :api_key, :string
field :receive_timeout, :integer, default: @receive_timeout
end

@type t :: %Github{}

@create_fields [
:endpoint,
:api_key,
:receive_timeout
]
@required_fields [:endpoint, :receive_timeout]

@spec load(t(), map()) :: [Document.t()] | []
def load(%Github{} = github, %{type: :issue} = options) do
make_request(github, options[:repo])
|> to_documents()
end

def load(options) do
raise LangChainError, "Unsupported type: #{inspect(options[:type])}"
end

@spec to_documents(issues :: [map()]) :: [Document.t()]
def to_documents(issues) do
Enum.map(issues, fn issue ->
%Document{
content: issue.body,
metadata: %{
id: issue.id,
title: issue.title
},
type: "github_issue"
}
end)
end

@spec make_request(t(), String.t()) :: [map()] | no_return()
def make_request(github, repo) do
make_request(github, repo, 1, 3, [])
end

@spec make_request(t(), String.t(), integer(), integer(), [map()]) :: [map()] | no_return()
def make_request(_gihub, _repo, _page, 0, _acc) do
raise LangChainError, "Retries exceeded. Connection failed."
end

def make_request(%Github{} = github, repo, page, retry_count, acc) do
req =
Req.new(
url: "#{github.endpoint}/repos/#{repo}/issues?page=#{page}",
headers: headers(get_api_key(github)),
receive_timeout: github.receive_timeout,
retry: :transient,
max_retries: 3,
retry_delay: fn attempt -> 300 * attempt end
)

req
|> Req.get()
|> case do
{:ok, %Req.Response{body: data, headers: _headers} = _response} ->
# @TODO check the headers and see if we need to do some pagination
# if so, call this function recursively with the next page
process_response(data)

{:error, %Req.TransportError{reason: :timeout}} ->
{:error, "Request timed out"}

{:error, %Req.TransportError{reason: :closed}} ->
# Force a retry by making a recursive call decrementing the counter
Logger.debug(fn -> "Mint connection closed: retry count = #{inspect(retry_count)}" end)
make_request(github, repo, page, retry_count - 1, acc)

other ->
Logger.error("Unexpected and unhandled API response! #{inspect(other)}")
other
end
end

@spec get_api_key(t()) :: String.t()
defp get_api_key(%Github{api_key: api_key}) do
api_key || Config.resolve(:github_key, "")
end

@doc """
Setup a Github client configuration.
"""
@spec new(attrs :: map()) :: {:ok, t} | {:error, Ecto.Changeset.t()}
def new(%{} = attrs \\ %{}) do
%Github{}
|> cast(attrs, @create_fields)
|> common_validation()
|> apply_action(:insert)
end

@doc """
Setup a Guthub client configuration and return it or raise an error if invalid.
"""
@spec new!(attrs :: map()) :: t() | no_return()
def new!(attrs \\ %{}) do
case new(attrs) do
{:ok, chain} ->
chain

{:error, changeset} ->
raise LangChainError, changeset
end
end

defp common_validation(changeset) do
changeset
|> validate_required(@required_fields)
end

defp headers("") do
%{}
end

defp headers(api_key) do
%{
"Authorization" => "Bearer #{api_key}"
}
end

def process_response(response) do
Enum.map(response, fn issue ->
%{
:id => issue["id"],
:title => issue["title"],
:body => issue["body"]
}
end)
end
end
87 changes: 87 additions & 0 deletions test/document/loaders/github_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
defmodule LangChain.Document.Loaders.GithubTest do
use ExUnit.Case, async: true

alias LangChain.Document.Loaders.Github
alias LangChain.Document
alias LangChain.LangChainError

describe "new/1" do
test "creates a new Github struct with default values" do
assert {:ok, %Github{endpoint: "https://api.github.com", receive_timeout: 60_000}} =
Github.new(%{})
end

test "creates a new Github struct with custom values" do
attrs = %{
endpoint: "https://custom.github.com",
api_key: "test_key",
receive_timeout: 30_000
}

assert {:ok,
%Github{
endpoint: "https://custom.github.com",
api_key: "test_key",
receive_timeout: 30_000
}} =
Github.new(attrs)
end
end

describe "load/1 with unsupported type" do
test "raises LangChainError when type is unsupported" do
options = %{type: :unsupported_type}

assert_raise LangChainError, "Unsupported type: :unsupported_type", fn ->
Github.load(options)
end
end
end

describe "new!/1" do
test "creates a new Github struct with default values" do
assert %Github{endpoint: "https://api.github.com", receive_timeout: 60_000} =
Github.new!(%{})
end
end

describe "to_documents/1" do
test "converts a list of issues into Document structs" do
issues = [
%{id: 1, title: "Issue 1", body: "Body 1"},
%{id: 2, title: "Issue 2", body: "Body 2"}
]

documents = Github.to_documents(issues)

assert [
%Document{
content: "Body 1",
metadata: %{id: 1, title: "Issue 1"},
type: "github_issue"
},
%Document{
content: "Body 2",
metadata: %{id: 2, title: "Issue 2"},
type: "github_issue"
}
] = documents
end
end

describe "process_response/1" do
test "processes a list of GitHub issues from the API response" do
response = [
%{"id" => 1, "title" => "Issue 1", "body" => "Body 1"},
%{"id" => 2, "title" => "Issue 2", "body" => "Body 2"}
]

processed_issues = Github.process_response(response)

assert [
%{id: 1, title: "Issue 1", body: "Body 1"},
%{id: 2, title: "Issue 2", body: "Body 2"}
] = processed_issues
end
end
end
36 changes: 36 additions & 0 deletions test/document/text_document_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
defmodule LangChain.Document.TextDocuemntTest do
use ExUnit.Case
doctest LangChain.Document
alias LangChain.Document
alias LangChain.LangChainError

describe "new/1" do
test "works with basic attrs" do
assert {:ok, %Document{} = document} =
Document.new(%{
"content" => "Here's some content",
"type" => "plain_text",
"metadata" => %{"source" => "https://example.com"}
})

assert document.type == "plain_text"
assert document.content == "Here's some content"
assert document.metadata["source"] == "https://example.com"
end

test "returns error when invalid" do
assert {:error, changeset} = Document.new(%{"content" => nil})
refute changeset.valid?
assert {"can't be blank", _} = changeset.errors[:content]
assert {"can't be blank", _} = changeset.errors[:type]
end
end

describe "new!/1" do
test "throws when invalid" do
assert_raise LangChainError, fn ->
Document.new!(%{"content" => nil})
end
end
end
end
Loading