Skip to content

Commit

Permalink
Merge pull request #832 from bjwswang/main
Browse files Browse the repository at this point in the history
feat: configure chunksize and chunkoverlap in knowledgebase
bjwswang authored Mar 12, 2024
2 parents 5dfce73 + 74970fa commit 647ba0b
Showing 15 changed files with 353 additions and 46 deletions.
4 changes: 2 additions & 2 deletions api/app-node/documentloader/v1alpha1/document_loader_types.go
Original file line number Diff line number Diff line change
@@ -28,10 +28,10 @@ type DocumentLoaderSpec struct {
// CommonSpec
v1alpha1.CommonSpec `json:",inline"`
// ChunkSize for text splitter
// +kubebuilder:default=2048
// +kubebuilder:default=512
ChunkSize int `json:"chunkSize,omitempty"`
// ChunkOverlap for text splitter
// +kubebuilder:default=200
// +kubebuilder:default=100
ChunkOverlap int `json:"chunkOverlap,omitempty"`
// FileExtName the type of documents, can be .pdf, .txt, .mp3, etc ...
FileExtName string `json:"fileExtName,omitempty"`
13 changes: 13 additions & 0 deletions api/base/v1alpha1/knowledgebase.go
Original file line number Diff line number Diff line change
@@ -10,6 +10,19 @@ const (
UpdateSourceFileAnnotationKey = Group + "/update-source-file-time"
)

// EmbeddingOptions returns the embedding options of the knowledge base with
// defaults filled in. It operates on a copy of kb.Spec.EmbeddingOptions, so
// the spec stored on kb is never modified.
//
// A non-positive ChunkSize is replaced by 1024 and a non-positive
// ChunkOverlap is replaced by 100. Zero is what an unset field decodes to;
// negative values are not meaningful for a text splitter, so they fall back
// to the defaults as well instead of being handed through to the caller.
func (kb *KnowledgeBase) EmbeddingOptions() EmbeddingOptions {
	options := kb.Spec.EmbeddingOptions
	if options.ChunkSize <= 0 {
		// default 1024
		options.ChunkSize = 1024
	}
	if options.ChunkOverlap <= 0 {
		// default 100
		options.ChunkOverlap = 100
	}
	return options
}

// VectorStoreCollectionName builds the vector store collection name for this
// knowledge base: its namespace and its name joined by an underscore.
func (kb *KnowledgeBase) VectorStoreCollectionName() string {
	collection := kb.Namespace
	collection += "_" + kb.Name
	return collection
}
14 changes: 12 additions & 2 deletions api/base/v1alpha1/knowledgebase_types.go
Original file line number Diff line number Diff line change
@@ -27,13 +27,23 @@ type KnowledgeBaseSpec struct {
// Embedder defines the embedder to embedding files
Embedder *TypedObjectReference `json:"embedder,omitempty"`

// TODO: add EmbedderOptions

// VectorStore defines the vectorstore to store results
VectorStore *TypedObjectReference `json:"vectorStore,omitempty"`

// FileGroups included files Grouped by VersionedDataset
FileGroups []FileGroup `json:"fileGroups,omitempty"`

// Embedding Options
EmbeddingOptions `json:",inline"`
}

// EmbeddingOptions holds the text splitter settings (chunk size and chunk
// overlap) of a knowledge base.
type EmbeddingOptions struct {
// ChunkSize for text splitter
// +kubebuilder:default=1024
ChunkSize int `json:"chunkSize,omitempty"`
// ChunkOverlap for text splitter
// +kubebuilder:default=100
ChunkOverlap int `json:"chunkOverlap,omitempty"`
}

type FileGroupDetail struct {
16 changes: 16 additions & 0 deletions api/base/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

175 changes: 173 additions & 2 deletions apiserver/graph/generated/generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions apiserver/graph/generated/models_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 1 addition & 30 deletions apiserver/graph/impl/knowledgebase.resolvers.go
8 changes: 8 additions & 0 deletions apiserver/graph/schema/knowledgebase.gql
Original file line number Diff line number Diff line change
@@ -15,6 +15,8 @@ query listKnowledgeBases($input: ListKnowledgeBaseInput!){
creator
displayName
description
chunkSize
chunkOverlap
status
reason
message
@@ -63,6 +65,8 @@ query getKnowledgeBase($name: String!, $namespace: String!) {
creator
displayName
description
chunkSize
chunkOverlap
status
reason
message
@@ -109,6 +113,8 @@ mutation createKnowledgeBase($input: CreateKnowledgeBaseInput!) {
creator
displayName
description
chunkSize
chunkOverlap
status
reason
message
@@ -155,6 +161,8 @@ mutation updateKnowledgeBase($input: UpdateKnowledgeBaseInput) {
creator
displayName
description
chunkSize
chunkOverlap
status
reason
message
29 changes: 29 additions & 0 deletions apiserver/graph/schema/knowledgebase.graphqls
Original file line number Diff line number Diff line change
@@ -125,6 +125,16 @@ type KnowledgeBase {
fileGroupDetails为知识库中所处理的文件组的详细内容和状态
"""
fileGroupDetails: [filegroupdetail]


"""
chunkSize为知识库做文档拆分时的块大小
"""
chunkSize: Int
"""
chunkOverlap为知识库作文档拆分时相邻块的交集
"""
chunkOverlap: Int

"""
知识库整体连接状态
@@ -172,6 +182,16 @@ input CreateKnowledgeBaseInput{
vectorStore: TypedObjectReferenceInput
"""知识库文件"""
fileGroups: [filegroupinput!]


"""
chunkSize为知识库做文档拆分时的块大小
"""
chunkSize: Int
"""
chunkOverlap为知识库作文档拆分时相邻块的交集
"""
chunkOverlap: Int
}

"""知识库更新的输入"""
@@ -193,6 +213,15 @@ input UpdateKnowledgeBaseInput {

"""更新知识库文件"""
fileGroups: [filegroupinput!]

"""
chunkSize为知识库做文档拆分时的块大小
"""
chunkSize: Int
"""
chunkOverlap为知识库作文档拆分时相邻块的交集
"""
chunkOverlap: Int
}

"""知识库分页列表查询的输入"""
67 changes: 63 additions & 4 deletions apiserver/pkg/knowledgebase/knowledgebase.go
Original file line number Diff line number Diff line change
@@ -29,6 +29,12 @@ import (
"github.com/kubeagi/arcadia/apiserver/graph/generated"
"github.com/kubeagi/arcadia/apiserver/pkg/common"
graphqlutils "github.com/kubeagi/arcadia/apiserver/pkg/utils"
"github.com/kubeagi/arcadia/pkg/config"
)

const (
// DefaultChunkSize is the chunk size CreateKnowledgeBase applies when the
// input does not provide chunkSize.
DefaultChunkSize = 1024
// DefaultChunkOverlap is the chunk overlap CreateKnowledgeBase applies when
// the input does not provide chunkOverlap.
DefaultChunkOverlap = 100
)

func knowledgebase2modelConverter(ctx context.Context, c client.Client) func(obj client.Object) (generated.PageNode, error) {
@@ -101,6 +107,8 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1
embedderType = string(embedderResource.Spec.Provider.GetType())
}

embeddingOptions := knowledgebase.EmbeddingOptions()

md := generated.KnowledgeBase{
ID: &id,
Name: knowledgebase.GetName(),
@@ -122,6 +130,8 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1
Namespace: knowledgebase.Spec.VectorStore.Namespace,
},
FileGroupDetails: filegroupdetails,
ChunkSize: &embeddingOptions.ChunkSize,
ChunkOverlap: &embeddingOptions.ChunkOverlap,
// Status info
Status: &status,
Reason: &reason,
@@ -130,11 +140,49 @@ func knowledgebase2model(ctx context.Context, c client.Client, knowledgebase *v1
return &md, nil
}

func CreateKnowledgeBase(ctx context.Context, c client.Client, name, namespace, displayname, description, embedder string, vectorstore v1alpha1.TypedObjectReference, filegroups []v1alpha1.FileGroup) (*generated.KnowledgeBase, error) {
func CreateKnowledgeBase(ctx context.Context, c client.Client, input generated.CreateKnowledgeBaseInput) (*generated.KnowledgeBase, error) {
var filegroups []v1alpha1.FileGroup
var vectorstore v1alpha1.TypedObjectReference
vector, _ := config.GetVectorStore(ctx, c)
displayname, description, embedder := "", "", ""
if input.DisplayName != nil {
displayname = *input.DisplayName
}
if input.Description != nil {
description = *input.Description
}
if input.VectorStore != nil {
vectorstore = v1alpha1.TypedObjectReference(*input.VectorStore)
} else {
vectorstore = *vector
}
if input.Embedder != "" {
embedder = input.Embedder
}
if input.FileGroups != nil {
for _, f := range input.FileGroups {
filegroup := v1alpha1.FileGroup{
Source: (*v1alpha1.TypedObjectReference)(&f.Source),
Paths: f.Path,
}
filegroups = append(filegroups, filegroup)
}
}

// Embedding options
chunkSize := DefaultChunkSize
if input.ChunkSize != nil {
chunkSize = *input.ChunkSize
}
chunkOverlap := DefaultChunkOverlap
if input.ChunkOverlap != nil {
chunkOverlap = *input.ChunkOverlap
}

knowledgebase := &v1alpha1.KnowledgeBase{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: namespace,
Name: input.Name,
Namespace: input.Namespace,
},
Spec: v1alpha1.KnowledgeBaseSpec{
CommonSpec: v1alpha1.CommonSpec{
@@ -144,10 +192,14 @@ func CreateKnowledgeBase(ctx context.Context, c client.Client, name, namespace,
Embedder: &v1alpha1.TypedObjectReference{
Kind: "Embedder",
Name: embedder,
Namespace: &namespace,
Namespace: &input.Namespace,
},
VectorStore: &vectorstore,
FileGroups: filegroups,
EmbeddingOptions: v1alpha1.EmbeddingOptions{
ChunkSize: chunkSize,
ChunkOverlap: chunkOverlap,
},
},
}
common.SetCreator(ctx, &knowledgebase.Spec.CommonSpec)
@@ -213,6 +265,13 @@ func UpdateKnowledgeBase(ctx context.Context, c client.Client, input *generated.
kb.Spec.FileGroups = filegroups
}

if input.ChunkSize != nil {
kb.Spec.ChunkSize = *input.ChunkSize
}
if input.ChunkOverlap != nil {
kb.Spec.ChunkOverlap = *input.ChunkOverlap
}

err = c.Update(ctx, kb)
if err != nil {
return nil, err
Original file line number Diff line number Diff line change
@@ -36,11 +36,11 @@ spec:
description: DocumentLoaderSpec defines the desired state of DocumentLoader
properties:
chunkOverlap:
default: 200
default: 100
description: ChunkOverlap for text splitter
type: integer
chunkSize:
default: 2048
default: 512
description: ChunkSize for text splitter
type: integer
creator:
Original file line number Diff line number Diff line change
@@ -39,6 +39,14 @@ spec:
spec:
description: KnowledgeBaseSpec defines the desired state of KnowledgeBase
properties:
chunkOverlap:
default: 100
description: ChunkOverlap for text splitter
type: integer
chunkSize:
default: 1024
description: ChunkSize for text splitter
type: integer
creator:
description: Creator defines datasource creator (AUTO-FILLED by webhook)
type: string
6 changes: 4 additions & 2 deletions controllers/base/knowledgebase_controller.go
Original file line number Diff line number Diff line change
@@ -538,15 +538,17 @@ func (r *KnowledgeBaseReconciler) handleFile(ctx context.Context, log logr.Logge
}
case ".html", ".htm":
loader = documentloaders.NewHTML(dataReader)
// TODO: support .mp3,.wav
default:
loader = documentloaders.NewText(dataReader)
}

// initialize text splitter
// var split textsplitter.TextSplitter
embeddingOptions := kb.EmbeddingOptions()
split := textsplitter.NewRecursiveCharacter(
textsplitter.WithChunkSize(300),
textsplitter.WithChunkOverlap(30),
textsplitter.WithChunkSize(embeddingOptions.ChunkSize),
textsplitter.WithChunkOverlap(embeddingOptions.ChunkOverlap),
)
// switch {
// case "token":
Original file line number Diff line number Diff line change
@@ -36,11 +36,11 @@ spec:
description: DocumentLoaderSpec defines the desired state of DocumentLoader
properties:
chunkOverlap:
default: 200
default: 100
description: ChunkOverlap for text splitter
type: integer
chunkSize:
default: 2048
default: 512
description: ChunkSize for text splitter
type: integer
creator:
Original file line number Diff line number Diff line change
@@ -39,6 +39,14 @@ spec:
spec:
description: KnowledgeBaseSpec defines the desired state of KnowledgeBase
properties:
chunkOverlap:
default: 100
description: ChunkOverlap for text splitter
type: integer
chunkSize:
default: 1024
description: ChunkSize for text splitter
type: integer
creator:
description: Creator defines datasource creator (AUTO-FILLED by webhook)
type: string

0 comments on commit 647ba0b

Please sign in to comment.