Skip to content

Commit

Permalink
Merge branch 'refactor'
Browse files Browse the repository at this point in the history
* refactor: (32 commits)
  rebuild the index when the embedding model changed
  fix info output on used models
  auto style fixes
  ask the rephrased question only if it has more context
  print score in chat
  set custom info text for simulate sub command
  emit the INDEXER_PAGE_ADD event
  make threshold configurable
  mechanisms to override things on command line
  animate button on first show
  automatic stylefixes
  better JSON exception handling in storages
  small adjustments
  prefer prompted user messages over system prompts
  various refactoring and introduction of a simulate command
  separate the rephrasing model from the chat model
  do not hardcode dimensions in qdrant storage
  fix syntax error in qdrant storage
  fix info output
  correctly use storage setting
  ...
  • Loading branch information
splitbrain committed Apr 2, 2024
2 parents a55df08 + bae450a commit 50a1094
Show file tree
Hide file tree
Showing 44 changed files with 1,852 additions and 533 deletions.
54 changes: 54 additions & 0 deletions AbstractCLI.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
<?php

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\CLIPlugin;
use splitbrain\phpcli\Options;

/**
 * Base class for the command line tools of the aichat plugin
 *
 * Loads the aichat helper, registers this CLI as the helper's logger and
 * handles the `lang` command line option.
 */
abstract class AbstractCLI extends CLIPlugin
{
    /** @var \helper_plugin_aichat */
    protected $helper;

    /**
     * Load the helper plugin and prepare the environment
     *
     * @inheritdoc
     */
    public function __construct($autocatch = true)
    {
        parent::__construct($autocatch);
        $this->helper = plugin_load('helper', 'aichat');
        $this->helper->setLogger($this);
        $this->loadConfig();
        // -1 removes PHP's memory limit for this process
        ini_set('memory_limit', '-1');
    }

    /**
     * Enable compact help output and register the `lang` option
     *
     * @inheritdoc
     */
    protected function setup(Options $options)
    {
        $options->useCompactHelp();

        $options->registerOption(
            'lang',
            'When set to a language code, it overrides the lang and preferUIlanguage settings and asks the ' .
            'bot to always use this language instead. ' .
            'When set to "auto" the bot is asked to detect the language of the input falling back to the wiki lang.',
            '',
            'lang'
        );
    }

    /**
     * Apply the debug log level and the `lang` option to the helper
     *
     * @inheritDoc
     */
    protected function main(Options $options)
    {
        // pass the CLI's debug log level on to the helper's factory
        if ($this->loglevel['debug']['enabled']) {
            $this->helper->factory->setDebug(true);
        }

        $lc = $options->getOpt('lang');
        if ($lc === 'auto') {
            $this->helper->updateConfig(['preferUIlanguage' => 0]);
        } elseif ($lc) {
            // a concrete language code: set preferUIlanguage to 1 and override the wiki language
            $this->helper->updateConfig(['preferUIlanguage' => 1]);
            global $conf;
            $conf['lang'] = $lc;
        }
    }
}
6 changes: 5 additions & 1 deletion Chunk.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,11 @@ public function __construct(

/**
 * Human readable identifier of this chunk
 *
 * Formatted as "page#id". When a non-zero score is set, it is appended in
 * parentheses with two decimals.
 *
 * @return string
 */
public function __toString(): string
{
    $string = $this->page . '#' . $this->id;
    if ($this->score) {
        $string .= sprintf(' (%.2f)', $this->score);
    }
    return $string;
}

/**
Expand Down
103 changes: 82 additions & 21 deletions Embeddings.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

namespace dokuwiki\plugin\aichat;

use dokuwiki\Extension\Event;
use dokuwiki\Extension\PluginInterface;
use dokuwiki\plugin\aichat\Model\AbstractModel;
use dokuwiki\plugin\aichat\Model\ChatInterface;
use dokuwiki\plugin\aichat\Model\EmbeddingInterface;
use dokuwiki\plugin\aichat\Storage\AbstractStorage;
use dokuwiki\Search\Indexer;
use splitbrain\phpcli\CLI;
Expand All @@ -21,8 +23,12 @@ class Embeddings
/** @var int maximum overlap between chunks in tokens */
final public const MAX_OVERLAP_LEN = 200;

/** @var AbstractModel */
protected $model;
/** @var ChatInterface */
protected $chatModel;

/** @var EmbeddingInterface */
protected $embedModel;

/** @var CLI|null */
protected $logger;
/** @var Encoder */
Expand All @@ -34,10 +40,33 @@ class Embeddings
/** @var array remember sentences when chunking */
private $sentenceQueue = [];

public function __construct(AbstractModel $model, AbstractStorage $storage)
{
$this->model = $model;
/** @var int the time spent for the last similar chunk retrieval */
public $timeSpent = 0;

protected $configChunkSize;
protected $configContextChunks;
protected $similarityThreshold;

/**
 * Create a new Embeddings instance
 *
 * Reads the `chunkSize`, `contextChunks` and `similarityThreshold` keys from
 * the given configuration. The threshold is stored divided by 100.
 *
 * @param ChatInterface $chatModel The chat model
 * @param EmbeddingInterface $embedModel The model used to create embeddings
 * @param AbstractStorage $storage The storage backend
 * @param array $config The plugin configuration
 */
public function __construct(
    ChatInterface $chatModel,
    EmbeddingInterface $embedModel,
    AbstractStorage $storage,
    $config
) {
    // collaborators
    $this->chatModel = $chatModel;
    $this->embedModel = $embedModel;
    $this->storage = $storage;

    // configuration values
    $chunkSize = $config['chunkSize'];
    $contextChunks = $config['contextChunks'];
    $threshold = $config['similarityThreshold'] / 100;

    $this->configChunkSize = $chunkSize;
    $this->configContextChunks = $contextChunks;
    $this->similarityThreshold = $threshold;
}

/**
Expand Down Expand Up @@ -73,6 +102,20 @@ public function getTokenEncoder()
return $this->tokenEncoder;
}

/**
 * Return the chunk size to use
 *
 * This is the smallest of three values: a quarter of the chat model's
 * maximum input length, 90% of the embedding model's maximum input length
 * and the configured chunk size.
 *
 * @return int
 */
public function getChunkSize()
{
    // floor() returns a float, so cast to honor the documented integer return type
    return (int)min(
        floor($this->chatModel->getMaxInputTokenLength() / 4), // be able to fit 4 chunks into the max input
        floor($this->embedModel->getMaxInputTokenLength() * 0.9), // only use 90% of the embedding model to be safe
        $this->configChunkSize, // this is usually the smallest
    );
}

/**
* Update the embeddings storage
*
Expand All @@ -95,7 +138,7 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
!page_exists($page) ||
isHiddenPage($page) ||
filesize(wikiFN($page)) < 150 || // skip very small pages
($skipRE && preg_match($skipRE, (string) $page)) ||
($skipRE && preg_match($skipRE, (string)$page)) ||
($matchRE && !preg_match($matchRE, ":$page"))
) {
// this page should not be in the index (anymore)
Expand All @@ -111,7 +154,8 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
} else {
// page is newer than the chunks we have, create new chunks
$this->storage->deletePageChunks($page, $chunkID);
$this->storage->addPageChunks($this->createPageChunks($page, $chunkID));
$chunks = $this->createPageChunks($page, $chunkID);
if ($chunks) $this->storage->addPageChunks($chunks);
}
}
$this->storage->finalizeCreation();
Expand All @@ -126,9 +170,10 @@ public function createNewIndex($skipRE = '', $matchRE = '', $clear = false)
* @param string $page Name of the page to split
* @param int $firstChunkID The ID of the first chunk of this page
* @return Chunk[] A list of chunks created for this page
* @emits INDEXER_PAGE_ADD support plugins that add additional data to the page
* @throws \Exception
*/
protected function createPageChunks($page, $firstChunkID)
public function createPageChunks($page, $firstChunkID)
{
$chunkList = [];

Expand All @@ -141,12 +186,25 @@ protected function createPageChunks($page, $firstChunkID)
$text = rawWiki($page);
}

// allow plugins to modify the text before splitting
$eventData = [
'page' => $page,
'body' => '',
'metadata' => ['title' => $page, 'relation_references' => []],
];
$event = new Event('INDEXER_PAGE_ADD', $eventData);
if ($event->advise_before()) {
$text = $eventData['body'] . ' ' . $text;
} else {
$text = $eventData['body'];
}

$parts = $this->splitIntoChunks($text);
foreach ($parts as $part) {
if (trim((string) $part) == '') continue; // skip empty chunks
if (trim((string)$part) == '') continue; // skip empty chunks

try {
$embedding = $this->model->getEmbedding($part);
$embedding = $this->embedModel->getEmbedding($part);
} catch (\Exception $e) {
if ($this->logger instanceof CLI) {
$this->logger->error(
Expand Down Expand Up @@ -186,19 +244,20 @@ protected function createPageChunks($page, $firstChunkID)
public function getSimilarChunks($query, $lang = '')
{
global $auth;
$vector = $this->model->getEmbedding($query);
$vector = $this->embedModel->getEmbedding($query);

$fetch = ceil(
($this->model->getMaxContextTokenLength() / $this->model->getMaxEmbeddingTokenLength())
* 1.5 // fetch a few more than needed, since not all chunks are maximum length
$fetch = min(
($this->chatModel->getMaxInputTokenLength() / $this->getChunkSize()),
$this->configContextChunks
);

$time = microtime(true);
$chunks = $this->storage->getSimilarChunks($vector, $lang, $fetch);
$this->timeSpent = round(microtime(true) - $time, 2);
if ($this->logger instanceof CLI) {
$this->logger->info(
'Fetched {count} similar chunks from store in {time} seconds',
['count' => count($chunks), 'time' => round(microtime(true) - $time, 2)]
['count' => count($chunks), 'time' => $this->timeSpent]
);
}

Expand All @@ -207,9 +266,10 @@ public function getSimilarChunks($query, $lang = '')
foreach ($chunks as $chunk) {
// filter out chunks the user is not allowed to read
if ($auth && auth_quickaclcheck($chunk->getPage()) < AUTH_READ) continue;
if ($chunk->getScore() < $this->similarityThreshold) continue;

$chunkSize = count($this->getTokenEncoder()->encode($chunk->getText()));
if ($size + $chunkSize > $this->model->getMaxContextTokenLength()) break; // we have enough
if ($size + $chunkSize > $this->chatModel->getMaxInputTokenLength()) break; // we have enough

$result[] = $chunk;
$size += $chunkSize;
Expand All @@ -224,7 +284,7 @@ public function getSimilarChunks($query, $lang = '')
* @throws \Exception
* @todo support splitting too long sentences
*/
public function splitIntoChunks($text)
protected function splitIntoChunks($text)
{
$sentenceSplitter = new Sentence();
$tiktok = $this->getTokenEncoder();
Expand All @@ -236,23 +296,24 @@ public function splitIntoChunks($text)
$chunk = '';
while ($sentence = array_shift($sentences)) {
$slen = count($tiktok->encode($sentence));
if ($slen > $this->model->getMaxEmbeddingTokenLength()) {
if ($slen > $this->getChunkSize()) {
// sentence is too long, we need to split it further
if ($this->logger instanceof CLI) $this->logger->warning(
'Sentence too long, splitting not implemented yet'
);
continue;
}

if ($chunklen + $slen < $this->model->getMaxEmbeddingTokenLength()) {
if ($chunklen + $slen < $this->getChunkSize()) {
// add to current chunk
$chunk .= $sentence;
$chunklen += $slen;
// remember sentence for overlap check
$this->rememberSentence($sentence);
} else {
// add current chunk to result
$chunks[] = $chunk;
$chunk = trim($chunk);
if ($chunk !== '') $chunks[] = $chunk;

// start new chunk with remembered sentences
$chunk = implode(' ', $this->sentenceQueue);
Expand Down
Loading

0 comments on commit 50a1094

Please sign in to comment.