From 57462a6d166f690d0894be6665e8ee70195c6132 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 24 Jun 2024 11:33:17 -0400 Subject: [PATCH 1/4] feat: btree extend --- pkg/btree/btree.go | 48 ++++++++++++++++++++++++++++++++++++++++ pkg/btree/btree_test.go | 1 + pkg/btree/node.go | 39 ++++++++++++++++++++++++++++++++ pkg/metapage/metapage.go | 13 ++++++++++- 4 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 pkg/btree/btree.go create mode 100644 pkg/btree/btree_test.go create mode 100644 pkg/btree/node.go diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go new file mode 100644 index 0000000..e8c72de --- /dev/null +++ b/pkg/btree/btree.go @@ -0,0 +1,48 @@ +package btree + +import ( + "github.com/kevmo314/appendable/pkg/metapage" + "github.com/kevmo314/appendable/pkg/pagefile" + "github.com/kevmo314/appendable/pkg/pointer" + "io" +) + +type BTree struct { + MetaPage metapage.MetaPage + PageFile pagefile.ReadWriteSeekPager + + Width uint16 +} + +func (t *BTree) root() (*BTreeNode, pointer.MemoryPointer, error) { + mp, err := t.MetaPage.Root() + if err != nil { + return nil, mp, err + } + + root, err := t.readNode(mp.Offset) + if err != nil { + return nil, mp, err + } + + return root, mp, nil +} + +func (t *BTree) readNode(offset uint64) (*BTreeNode, error) { + if _, err := t.PageFile.Seek(int64(offset), io.SeekStart); err != nil { + return nil, err + } + + node := &BTreeNode{Width: t.Width} + buf := make([]byte, t.PageFile.PageSize()) + + if _, err := t.PageFile.Read(buf); err != nil { + return nil, err + } + + if err := node.UnmarshalBinary(buf); err != nil { + return nil, err + } + + return node, nil +} diff --git a/pkg/btree/btree_test.go b/pkg/btree/btree_test.go new file mode 100644 index 0000000..3c38a25 --- /dev/null +++ b/pkg/btree/btree_test.go @@ -0,0 +1 @@ +package btree diff --git a/pkg/btree/node.go b/pkg/btree/node.go new file mode 100644 index 0000000..c3653dc --- /dev/null +++ 
b/pkg/btree/node.go @@ -0,0 +1,39 @@ +package btree + +import ( + "github.com/kevmo314/appendable/pkg/hnsw" + "io" +) + +type BTreeNode struct { + Ids []hnsw.Id + Vectors []hnsw.Point + + Pointers []uint64 + Width uint16 +} + +func (n *BTreeNode) Size() int64 { + return 0 +} + +// MarshalBinary TODO! +func (n *BTreeNode) MarshalBinary() ([]byte, error) { + b := []byte{} + + return b, nil +} + +// UnmarshalBinary TODO! +func (n *BTreeNode) UnmarshalBinary(buf []byte) error { + return nil +} + +func (n *BTreeNode) WriteTo(w io.Writer) (int64, error) { + buf, err := n.MarshalBinary() + if err != nil { + return 0, err + } + m, err := w.Write(buf) + return int64(m), err +} diff --git a/pkg/metapage/metapage.go b/pkg/metapage/metapage.go index eec1e38..1892704 100644 --- a/pkg/metapage/metapage.go +++ b/pkg/metapage/metapage.go @@ -1,6 +1,9 @@ package metapage -import "github.com/kevmo314/appendable/pkg/pointer" +import ( + "github.com/kevmo314/appendable/pkg/pointer" + "io" +) // MetaPage is an abstract interface over the root page of a bptree // This allows the caller to control the memory location of the meta @@ -9,3 +12,11 @@ type MetaPage interface { Root() (pointer.MemoryPointer, error) SetRoot(pointer.MemoryPointer) error } + +type NodeSerializable interface { + Size() int64 + NumPointers() int + MarshalBinary() ([]byte, error) + UnmarshalBinary([]byte) error + WriteTo(w io.Writer) (int64, error) +} From 26e6147134a2dd3a9d2a6001e237f1a7c760d965 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:27:00 -0400 Subject: [PATCH 2/4] stage current thinking --- pkg/btree/btree.go | 100 +++++++++++++++++++++++++++++++++++++++++++++ pkg/btree/node.go | 7 +++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index e8c72de..6fd49b8 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -1,10 +1,14 @@ package btree import ( + "encoding/binary" + 
"fmt" + "github.com/kevmo314/appendable/pkg/hnsw" "github.com/kevmo314/appendable/pkg/metapage" "github.com/kevmo314/appendable/pkg/pagefile" "github.com/kevmo314/appendable/pkg/pointer" "io" + "slices" ) type BTree struct { @@ -46,3 +50,99 @@ func (t *BTree) readNode(offset uint64) (*BTreeNode, error) { return node, nil } + +// Insert has the following assumptions: +// key.Value represents the Node Id. It is written to []bytes in LittleEndian. +func (t *BTree) Insert(key pointer.ReferencedValue, value hnsw.Point) error { + id := hnsw.Id(binary.LittleEndian.Uint64(key.Value)) + + root, rootOffset, err := t.root() + if err != nil { + return fmt.Errorf("read root node: %w, err") + } + + if root == nil { + node := &BTreeNode{ + Keys: []pointer.ReferencedValue{key}, + Vectors: []hnsw.Point{value}, + Width: t.Width, + } + buf, err := node.MarshalBinary() + if err != nil { + return err + } + + offset, err := t.PageFile.NewPage(buf) + if err != nil { + return err + } + + return t.MetaPage.SetRoot(pointer.MemoryPointer{ + Offset: uint64(offset), + Length: uint32(len(buf)), + }) + } + + parent := root + for !parent.Leaf() { + index, found := slices.BinarySearchFunc(parent.Keys, key, pointer.CompareReferencedValues) + if found { + index++ + } + + if index >= len(parent.Pointers) { + return fmt.Errorf("found index %d, but node.Pointers length is %d", index, len(parent.Pointers)) + } + + childPointer := parent.Pointers[index] + child, err := t.readNode(childPointer) + if err != nil { + return err + } + + if int(child.Size()) > t.PageFile.PageSize() { + rightChild, midKey, err := t.SplitChild(parent, index, child) + if err != nil { + return err + } + + switch pointer.CompareReferencedValues(midKey, key) { + case 1: + // key < midKey + parent = child + default: + // right child + parent = rightChild + } + } else { + parent = child + } + } + + return nil +} + +func (t *BTree) SplitChild(parent *BTreeNode, leftChildIndex int, leftChild *BTreeNode) (*BTreeNode, 
pointer.ReferencedValue, error) { + mid := len(leftChild.Keys) / 2 + + midKey, midVector, midPointer := leftChild.Keys[mid], leftChild.Vectors[mid], leftChild.Pointers[mid] + + rightChild := &BTreeNode{ + Keys: append([]pointer.ReferencedValue(nil), leftChild.Keys[mid+1:]...), + Vectors: append([]hnsw.Point(nil), leftChild.Vectors[mid+1:]...), + Pointers: append([]uint64(nil), leftChild.Pointers[mid+1:]...), + Width: t.Width, + } + + // now that right child has been properly copied, shrink leftChild + leftChild.Keys = leftChild.Keys[:mid] + leftChild.Vectors = leftChild.Vectors[:mid] + leftChild.Pointers = leftChild.Pointers[:mid] + + // Insert the middle key into the parent node at leftChildIndex + parent.Keys = append(parent.Keys[:leftChildIndex], append([]pointer.ReferencedValue{midKey}, parent.Keys[leftChildIndex:]...)...) + parent.Vectors = append(parent.Vectors[:leftChildIndex], append([]hnsw.Point{midVector}, parent.Vectors[leftChildIndex:]...)...) + parent.Pointers = append(parent.Pointers[:leftChildIndex+1], append([]uint64{midPointer}, parent.Pointers[leftChildIndex+1:]...)...) 
+ + return rightChild, midKey, nil +} diff --git a/pkg/btree/node.go b/pkg/btree/node.go index c3653dc..83d94a0 100644 --- a/pkg/btree/node.go +++ b/pkg/btree/node.go @@ -2,17 +2,22 @@ package btree import ( "github.com/kevmo314/appendable/pkg/hnsw" + "github.com/kevmo314/appendable/pkg/pointer" "io" ) type BTreeNode struct { - Ids []hnsw.Id + Keys []pointer.ReferencedValue Vectors []hnsw.Point Pointers []uint64 Width uint16 } +func (n *BTreeNode) Leaf() bool { + return len(n.Pointers) == 0 +} + func (n *BTreeNode) Size() int64 { return 0 } From f91eb927eeb525c96a91ef8df8cccd98816ca6a0 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 24 Jun 2024 17:30:38 -0400 Subject: [PATCH 3/4] create new page for right page, update parent pointers --- pkg/btree/btree.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index 6fd49b8..1a8313e 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -134,6 +134,15 @@ func (t *BTree) SplitChild(parent *BTreeNode, leftChildIndex int, leftChild *BTr Width: t.Width, } + rbuf, err := rightChild.MarshalBinary() + if err != nil { + return nil, pointer.ReferencedValue{}, err + } + roffset, err := t.PageFile.NewPage(rbuf) + if err != nil { + return err + } + // now that right child has been properly copied, shrink leftChild leftChild.Keys = leftChild.Keys[:mid] leftChild.Vectors = leftChild.Vectors[:mid] @@ -142,7 +151,7 @@ func (t *BTree) SplitChild(parent *BTreeNode, leftChildIndex int, leftChild *BTr // Insert the middle key into the parent node at leftChildIndex parent.Keys = append(parent.Keys[:leftChildIndex], append([]pointer.ReferencedValue{midKey}, parent.Keys[leftChildIndex:]...)...) parent.Vectors = append(parent.Vectors[:leftChildIndex], append([]hnsw.Point{midVector}, parent.Vectors[leftChildIndex:]...)...) 
- parent.Pointers = append(parent.Pointers[:leftChildIndex+1], append([]uint64{midPointer}, parent.Pointers[leftChildIndex+1:]...)...) + parent.Pointers = append(parent.Pointers[:leftChildIndex+1], append([]uint64{uint64(roffset)}, parent.Pointers[leftChildIndex+1:]...)...) return rightChild, midKey, nil } From 3c553ebef17ed4473e3415751e83ca86a5e3d28e Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 24 Jun 2024 17:32:33 -0400 Subject: [PATCH 4/4] resolve test --- pkg/btree/btree.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/pkg/btree/btree.go b/pkg/btree/btree.go index 1a8313e..aa7358b 100644 --- a/pkg/btree/btree.go +++ b/pkg/btree/btree.go @@ -1,7 +1,6 @@ package btree import ( - "encoding/binary" "fmt" "github.com/kevmo314/appendable/pkg/hnsw" "github.com/kevmo314/appendable/pkg/metapage" @@ -54,11 +53,9 @@ func (t *BTree) readNode(offset uint64) (*BTreeNode, error) { // Insert has the following assumptions: // key.Value represents the Node Id. It is written to []bytes in LittleEndian. 
func (t *BTree) Insert(key pointer.ReferencedValue, value hnsw.Point) error { - id := hnsw.Id(binary.LittleEndian.Uint64(key.Value)) - - root, rootOffset, err := t.root() + root, _, err := t.root() if err != nil { - return fmt.Errorf("read root node: %w, err") + return fmt.Errorf("read root node: %w", err) } if root == nil { @@ -125,7 +122,7 @@ func (t *BTree) Insert(key pointer.ReferencedValue, value hnsw.Point) error { func (t *BTree) SplitChild(parent *BTreeNode, leftChildIndex int, leftChild *BTreeNode) (*BTreeNode, pointer.ReferencedValue, error) { mid := len(leftChild.Keys) / 2 - midKey, midVector, midPointer := leftChild.Keys[mid], leftChild.Vectors[mid], leftChild.Pointers[mid] + midKey, midVector := leftChild.Keys[mid], leftChild.Vectors[mid] rightChild := &BTreeNode{ Keys: append([]pointer.ReferencedValue(nil), leftChild.Keys[mid+1:]...), @@ -140,15 +137,13 @@ func (t *BTree) SplitChild(parent *BTreeNode, leftChildIndex int, leftChild *BTr } roffset, err := t.PageFile.NewPage(rbuf) if err != nil { - return err + return nil, pointer.ReferencedValue{}, err } - // now that right child has been properly copied, shrink leftChild leftChild.Keys = leftChild.Keys[:mid] leftChild.Vectors = leftChild.Vectors[:mid] leftChild.Pointers = leftChild.Pointers[:mid] - // Insert the middle key into the parent node at leftChildIndex parent.Keys = append(parent.Keys[:leftChildIndex], append([]pointer.ReferencedValue{midKey}, parent.Keys[leftChildIndex:]...)...) parent.Vectors = append(parent.Vectors[:leftChildIndex], append([]hnsw.Point{midVector}, parent.Vectors[leftChildIndex:]...)...) parent.Pointers = append(parent.Pointers[:leftChildIndex+1], append([]uint64{uint64(roffset)}, parent.Pointers[leftChildIndex+1:]...)...)