Skip to content

Commit

Permalink
Use custom renderer when creating embeddings
Browse files Browse the repository at this point in the history
Rendering makes plugin output available and handles includes. It
might also help with #15.
The renderer uses markdown like output since all LLMs seem to be very
familiar with its syntax. This might help them to understand the
document structure better.
This also adds a breadcrumb trail at the top of each chunk which might
help with contextualization as well.
  • Loading branch information
splitbrain committed Jun 25, 2024
1 parent bcaa910 commit 661701e
Show file tree
Hide file tree
Showing 2 changed files with 660 additions and 14 deletions.
59 changes: 45 additions & 14 deletions Embeddings.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use dokuwiki\Extension\Event;
use dokuwiki\Extension\PluginInterface;
use dokuwiki\File\PageResolver;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
Expand Down Expand Up @@ -177,23 +178,20 @@ public function createPageChunks($page, $firstChunkID)
{
$chunkList = [];

$textRenderer = plugin_load('renderer', 'text');
if ($textRenderer instanceof PluginInterface) {
global $ID;
$ID = $page;
try {
$text = p_cached_output(wikiFN($page), 'text', $page);
} catch (\Throwable $e) {
if ($this->logger) $this->logger->error(
'Failed to render page {page} using raw text instead. {msg}',
['page' => $page, 'msg' => $e->getMessage()]
);
$text = rawWiki($page);
}
} else {
global $ID;
$ID = $page;
try {
$text = p_cached_output(wikiFN($page), 'aichat', $page);
} catch (\Throwable $e) {
if ($this->logger) $this->logger->error(
'Failed to render page {page}. Using raw text instead. {msg}',
['page' => $page, 'msg' => $e->getMessage()]
);
$text = rawWiki($page);
}

$crumbs = $this->breadcrumbTrail($page);

// allow plugins to modify the text before splitting
$eventData = [
'page' => $page,
Expand All @@ -211,6 +209,8 @@ public function createPageChunks($page, $firstChunkID)
foreach ($parts as $part) {
if (trim((string)$part) == '') continue; // skip empty chunks

$part = $crumbs . "\n\n" . $part; // add breadcrumbs to each chunk

try {
$embedding = $this->embedModel->getEmbedding($part);
} catch (\Exception $e) {
Expand Down Expand Up @@ -285,6 +285,37 @@ public function getSimilarChunks($query, $lang = '')
return $result;
}

/**
 * Create a breadcrumb trail for the given page
 *
 * Uses the first heading of each namespace and the page itself. This is added as a prefix to
 * each chunk to give the AI some context.
 *
 * Each crumb has the form "Title (name)" when a first heading is available, otherwise just
 * the bare namespace or page name. Pages without a namespace get a trail consisting of the
 * page crumb only.
 *
 * @param string $id the page ID to build the trail for
 * @return string the crumbs joined by ' » ', outermost namespace first, page last
 */
protected function breadcrumbTrail($id)
{
    // getNS() yields false for pages without a namespace. Exploding that would produce a
    // single empty segment and thus a bogus, empty-named namespace crumb, so skip it.
    $ns = getNS($id);
    $namespaces = ($ns === false || $ns === '') ? [] : explode(':', $ns);
    $resolver = new PageResolver($id);
    $crumbs = [];

    // all namespaces: walk from the outermost namespace inwards, accumulating the path
    $check = '';
    foreach ($namespaces as $namespace) {
        $check .= $namespace . ':';
        // a trailing colon addresses the namespace itself; the resolver picks the page for it
        $page = $resolver->resolveId($check);
        $title = p_get_first_heading($page);
        $crumbs[] = $title ? "$title ($namespace)" : $namespace;
    }

    // the page itself
    $title = p_get_first_heading($id);
    $page = noNS($id);
    $crumbs[] = $title ? "$title ($page)" : $page;

    return implode(' » ', $crumbs);
}

/**
* @param $text
Expand Down
Loading

0 comments on commit 661701e

Please sign in to comment.