Skip to content

Commit

Permalink
support tags filter in core server
Browse files Browse the repository at this point in the history
  • Loading branch information
yujonglee committed Oct 3, 2024
1 parent 605bc47 commit 915a636
Show file tree
Hide file tree
Showing 21 changed files with 233 additions and 82 deletions.
2 changes: 1 addition & 1 deletion core/lib/canary/accounts/public_key_config.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defmodule Canary.Accounts.PublicKeyConfig do
use Ash.Resource, data_layer: :embedded

attributes do
attribute :allowed_host, :string, allow_nil?: true
attribute :allowed_host, :string, allow_nil?: false
end

actions do
Expand Down
8 changes: 4 additions & 4 deletions core/lib/canary/application.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ defmodule Canary.Application do
def start(_type, _args) do
attach_oban_telemetry()

Canary.Index.Collection.ensure(:webpage)
Canary.Index.Collection.ensure(:github_issue)
Canary.Index.Collection.ensure(:github_discussion)
Canary.Index.Stopword.ensure()
:ok = Canary.Index.Collection.ensure(:webpage)
:ok = Canary.Index.Collection.ensure(:github_issue)
:ok = Canary.Index.Collection.ensure(:github_discussion)
:ok = Canary.Index.Stopword.ensure()

children =
[
Expand Down
1 change: 1 addition & 0 deletions core/lib/canary/index/collection.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ defmodule Canary.Index.Collection do
%{name: "source_id", type: "string"},
%{name: "embedding", type: "float[]", num_dim: 384, optional: true},
%{name: "tags", type: "string[]"},
%{name: "is_empty_tags", type: "bool"},
%{name: "meta", type: "object", index: false, optional: true}
]

Expand Down
1 change: 1 addition & 0 deletions core/lib/canary/index/document.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ defmodule Canary.Index.Document.Shared do
:source_id,
:embedding,
:tags,
:is_empty_tags,
:meta
]
end
Expand Down
9 changes: 7 additions & 2 deletions core/lib/canary/index/index.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ defmodule Canary.Index do
is_parent: chunk.is_parent
}

tags = chunk.tags || []

doc = %Document.Webpage{
id: chunk.index_id,
source_id: chunk.source_id,
title: chunk.title || "",
content: chunk.content || "",
tags: [],
tags: tags,
is_empty_tags: tags == [],
meta: meta
}

Expand All @@ -39,6 +42,7 @@ defmodule Canary.Index do
title: chunk.title || "",
content: chunk.content || "",
tags: [],
is_empty_tags: true,
meta: meta
}

Expand All @@ -58,6 +62,7 @@ defmodule Canary.Index do
title: chunk.title || "",
content: chunk.content || "",
tags: [],
is_empty_tags: true,
meta: meta
}

Expand Down Expand Up @@ -115,7 +120,7 @@ defmodule Canary.Index do
[
"source_id:=[#{source_id}]",
if(tags != nil and tags != []) do
"tags:=[#{Enum.join(tags, ",")}]"
"(tags:=[#{Enum.join(tags, ",")}] || is_empty_tags:=true)"
end
]
|> Enum.reject(&is_nil/1)
Expand Down
14 changes: 10 additions & 4 deletions core/lib/canary/index/stopword.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@ defmodule Canary.Index.Stopword do
def id(), do: "default_stopwords"

def ensure() do
Canary.Index.Client.upsert_stopwords_set(id(), %{
locale: "en",
stopwords: Canary.Native.stopwords()
})
result =
Canary.Index.Client.upsert_stopwords_set(id(), %{
locale: "en",
stopwords: Canary.Native.stopwords()
})

case result do
{:ok, _} -> :ok
error -> error
end
end
end
2 changes: 1 addition & 1 deletion core/lib/canary/interactions/responder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ defmodule Canary.Interactions.Responder.Default do
alias Canary.Sources.Document

def run(sources, query, handle_delta, opts) do
{:ok, results} = Canary.Searcher.run(sources, query, cache: opts[:cache])
{:ok, results} = Canary.Searcher.run(sources, query, opts)

docs =
results
Expand Down
39 changes: 21 additions & 18 deletions core/lib/canary/searcher.ex
Original file line number Diff line number Diff line change
@@ -1,34 +1,37 @@
defmodule Canary.Searcher do
@callback run(list(any()), String.t()) :: {:ok, list(map())} | {:error, any()}
@callback run(list(any()), String.t(), keyword()) :: {:ok, list(map())} | {:error, any()}

def run(sources, query, opts \\ []) do
if opts[:cache] do
with {:error, _} <- get_cache(sources, query),
{:ok, result} <- impl().run(sources, query) do
set_cache(sources, query, result)
{cache, opts} = Keyword.pop(opts, :cache, false)

if cache do
with {:error, _} <- get_cache(sources, query, opts),
{:ok, result} <- impl().run(sources, query, opts) do
set_cache(sources, query, opts, result)
{:ok, result}
end
else
impl().run(sources, query)
impl().run(sources, query, opts)
end
end

defp set_cache(sources, query, result) do
Cachex.put(:cache, key(sources, query), result, ttl: :timer.minutes(3))
defp set_cache(sources, query, opts, result) do
Cachex.put(:cache, key(sources, query, opts), result, ttl: :timer.minutes(3))
end

defp get_cache(sources, query) do
case Cachex.get(:cache, key(sources, query)) do
defp get_cache(sources, query, opts) do
case Cachex.get(:cache, key(sources, query, opts)) do
{:ok, nil} -> {:error, :not_found}
{:ok, hit} -> {:ok, hit}
end
end

defp key(sources, query) do
defp key(sources, query, opts) do
sources
|> Enum.map(& &1.id)
|> Enum.join(",")
|> Kernel.<>(":" <> query)
|> Kernel.<>(":" <> Jason.encode!(opts))
end

defp impl(), do: Application.get_env(:canary, :searcher, Canary.Searcher.Default)
Expand All @@ -39,29 +42,29 @@ defmodule Canary.Searcher.Default do

require Ash.Query

def run(sources, query) do
def run(sources, query, opts) do
if ai?(query) do
ai_search(sources, query)
ai_search(sources, query, opts)
else
normal_search(sources, query)
normal_search(sources, query, opts)
end
end

defp ai?(query) do
String.ends_with?(query, "?") or String.split(query, " ", trim: true) |> Enum.count() > 2
end

defp ai_search(sources, query) do
defp ai_search(sources, query, opts) do
keywords = Canary.Query.Understander.keywords(sources)

with {:ok, queries} = Canary.Query.Understander.run(query, keywords),
{:ok, hits} <- Canary.Index.search(sources, queries) do
{:ok, hits} <- Canary.Index.search(sources, queries, tags: opts[:tags]) do
{:ok, transform(sources, hits)}
end
end

defp normal_search(sources, query) do
{:ok, results} = Canary.Index.search(sources, [query])
defp normal_search(sources, query, opts) do
{:ok, results} = Canary.Index.search(sources, [query], tags: opts[:tags])
{:ok, transform(sources, results)}
end

Expand Down
3 changes: 2 additions & 1 deletion core/lib/canary/sources/document/create_webpage.ex
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ defmodule Canary.Sources.Document.CreateWebpage do
|> Ash.Changeset.change_attribute(opts[:meta_attribute], wrap_union(%Webpage.DocumentMeta{}))
|> Ash.Changeset.change_attribute(opts[:chunks_attribute], [])
|> Ash.Changeset.after_action(fn _, record ->
%Webpage.FetcherResult{url: url, html: html, items: items} = fetcher_result
%Webpage.FetcherResult{url: url, html: html, items: items, tags: tags} = fetcher_result
top_level_item = items |> Enum.at(0)

hash =
Expand All @@ -46,6 +46,7 @@ defmodule Canary.Sources.Document.CreateWebpage do
is_parent: index == 0,
title: item.title,
content: item.content,
tags: tags,
url: URI.parse(url) |> Map.put(:fragment, item.id) |> to_string()
}
end)
Expand Down
7 changes: 5 additions & 2 deletions core/lib/canary/sources/github_discussion_fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,11 @@ defmodule Canary.Sources.GithubDiscussion.Fetcher do
cursor: nil
}

nodes = GithubFetcher.run_all(query, variables)
{:ok, Enum.map(nodes, &transform_discussion_node/1)}
stream =
GithubFetcher.run_all(query, variables)
|> Stream.map(&transform_discussion_node/1)

{:ok, stream}
end

defp transform_discussion_node(discussion) do
Expand Down
1 change: 0 additions & 1 deletion core/lib/canary/sources/github_fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ defmodule Canary.Sources.GithubFetcher do
end
end)
|> Stream.flat_map(& &1)
|> Enum.to_list()
end

def run(query, variables) do
Expand Down
9 changes: 6 additions & 3 deletions core/lib/canary/sources/github_issue_fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,14 @@ defmodule Canary.Sources.GithubIssue.Fetcher do
issue_n: @default_issue_n,
comment_n: @default_comment_n,
cursor: nil,
since: DateTime.utc_now() |> DateTime.add(-365, :day) |> DateTime.to_iso8601()
since: DateTime.utc_now() |> DateTime.add(-180, :day) |> DateTime.to_iso8601()
}

nodes = GithubFetcher.run_all(query, variables)
{:ok, Enum.map(nodes, &transform_issue_node/1)}
stream =
GithubFetcher.run_all(query, variables)
|> Stream.map(&transform_issue_node/1)

{:ok, stream}
end

defp transform_issue_node(issue) do
Expand Down
2 changes: 2 additions & 0 deletions core/lib/canary/sources/webpage_chunk.ex
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ defmodule Canary.Sources.Webpage.Chunk do
attribute :document_id, :string, allow_nil?: false
attribute :is_parent, :boolean, allow_nil?: false

attribute :tags, {:array, :string}, default: []
attribute :url, :string, allow_nil?: false
attribute :title, :string, allow_nil?: false, constraints: [allow_empty?: true]
attribute :content, :string, allow_nil?: false
Expand All @@ -23,6 +24,7 @@ defmodule Canary.Sources.Webpage.Chunk do
:source_id,
:document_id,
:is_parent,
:tags,
:url,
:title,
:content,
Expand Down
7 changes: 5 additions & 2 deletions core/lib/canary/sources/webpage_config.ex
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
defmodule Canary.Sources.Webpage.Config do
use Ash.Resource, data_layer: :embedded

alias Canary.Sources.Webpage.TagDefinition

attributes do
attribute :start_urls, {:array, :string}, default: []
attribute :url_include_patterns, {:array, :string}, default: []
attribute :url_exclude_patterns, {:array, :string}, default: []
attribute :tag_definitions, {:array, TagDefinition}, default: []
end

actions do
defaults [:read]

create :create do
primary? true
accept [:start_urls, :url_include_patterns, :url_exclude_patterns]
accept [:start_urls, :url_include_patterns, :url_exclude_patterns, :tag_definitions]
end

update :update do
primary? true
accept [:start_urls, :url_include_patterns, :url_exclude_patterns]
accept [:start_urls, :url_include_patterns, :url_exclude_patterns, :tag_definitions]
end
end
end
33 changes: 23 additions & 10 deletions core/lib/canary/sources/webpage_fetcher.ex
Original file line number Diff line number Diff line change
@@ -1,35 +1,48 @@
defmodule Canary.Sources.Webpage.FetcherResult do
alias Canary.Scraper.Item

defstruct [:url, :html, :items]
@type t :: %__MODULE__{url: String.t(), html: String.t(), items: list(Item.t())}
defstruct [:url, :html, :tags, :items]

@type t :: %__MODULE__{
url: String.t(),
html: String.t(),
tags: list(String.t()),
items: list(Item.t())
}
end

defmodule Canary.Sources.Webpage.Fetcher do
alias Canary.Sources.Source
alias Canary.Sources.Webpage.FetcherResult
alias Canary.Sources.Webpage.Config
alias Canary.Sources.Webpage

def run(%Source{config: %Ash.Union{type: :webpage, value: %Config{} = config}}) do
def run(%Webpage.Config{} = config) do
case Canary.Crawler.run(config) do
{:ok, stream} ->
results =
stream =
stream
|> Stream.map(fn {url, html} ->
items = Canary.Scraper.run(html)

tags =
config.tag_definitions
|> Enum.filter(&is_matching_tag?(&1, url))
|> Enum.map(& &1.name)

if(length(items) == 0,
do: nil,
else: %FetcherResult{url: url, html: html, items: items}
else: %Webpage.FetcherResult{url: url, html: html, tags: tags, items: items}
)
end)
|> Stream.reject(&is_nil/1)
|> Enum.to_list()

{:ok, results}
{:ok, stream}

error ->
error
end
end

defp is_matching_tag?(%Webpage.TagDefinition{} = tag, url) do
tag.url_include_patterns
|> Enum.any?(&Canary.Native.glob_match(&1, url))
end
end
22 changes: 22 additions & 0 deletions core/lib/canary/sources/webpage_tag_definition.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
defmodule Canary.Sources.Webpage.TagDefinition do
use Ash.Resource, data_layer: :embedded

attributes do
attribute :name, :string, allow_nil?: false
attribute :url_include_patterns, {:array, :string}, default: []
end

actions do
defaults [:read, :destroy]

create :create do
primary? true
accept [:name, :url_include_patterns]
end

update :update do
primary? true
accept [:name, :url_include_patterns]
end
end
end
6 changes: 3 additions & 3 deletions core/lib/canary/workers/webpage_processor.ex
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ defmodule Canary.Workers.WebpageProcessor do
end
end

defp process(%Source{id: source_id} = source) do
with {:ok, incomings} = Webpage.Fetcher.run(source),
:ok <- Webpage.Syncer.run(source_id, incomings) do
defp process(%Source{id: source_id, config: %Ash.Union{type: :webpage, value: config}} = source) do
with {:ok, incomings} = Webpage.Fetcher.run(config),
:ok <- Webpage.Syncer.run(source_id, Enum.to_list(incomings)) do
Source.update_overview(source)
end
end
Expand Down
Loading

0 comments on commit 915a636

Please sign in to comment.