Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove graph entry from fts input #4660

Merged
merged 6 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions extension/fts/src/fts_extension.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
#include "catalog/catalog_entry/catalog_entry_type.h"
#include "function/create_fts_index.h"
#include "function/drop_fts_index.h"
#include "function/get_keys.h"
#include "function/query_fts_gds.h"
#include "function/query_fts_index.h"
#include "function/stem.h"
#include "main/client_context.h"
#include "main/database.h"
Expand All @@ -16,10 +14,8 @@ namespace fts_extension {
// Registers the FTS extension's functions when the extension is loaded.
//
// `context` is the client context; the database object is obtained from it via getDatabase()
// and the add*Function calls below are made on that database.
void FTSExtension::load(main::ClientContext* context) {
// Bound to the name `db`; the registration calls below use it directly.
auto& db = *context->getDatabase();
// ADD_SCALAR_FUNC / ADD_GDS_FUNC are macros defined outside this view.
// NOTE(review): presumably they register the named function on `db` - confirm against the
// macro definitions before renaming or removing the `db` local.
ADD_SCALAR_FUNC(StemFunction);
ADD_SCALAR_FUNC(GetKeysFunction);
ADD_GDS_FUNC(QFTSFunction);
// Each function below is registered under its own `name` constant together with the function
// set returned by its getFunctionSet().
db.addStandaloneCallFunction(CreateFTSFunction::name, CreateFTSFunction::getFunctionSet());
db.addTableFunction(QueryFTSFunction::name, QueryFTSFunction::getFunctionSet());
db.addStandaloneCallFunction(DropFTSFunction::name, DropFTSFunction::getFunctionSet());
}

Expand Down
4 changes: 1 addition & 3 deletions extension/fts/src/function/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@ add_library(kuzu_fts_function
stem.cpp
create_fts_index.cpp
fts_config.cpp
query_fts_index.cpp
drop_fts_index.cpp
query_fts_gds.cpp
fts_utils.cpp
get_keys.cpp)
fts_utils.cpp)

set(FTS_OBJECT_FILES
${FTS_OBJECT_FILES} $<TARGET_OBJECTS:kuzu_fts_function>
Expand Down
33 changes: 23 additions & 10 deletions extension/fts/src/function/create_fts_index.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#include "function/create_fts_index.h"

#include <fstream>
#include <iostream>

#include "binder/expression/expression_util.h"
#include "binder/expression/literal_expression.h"
#include "catalog/fts_index_catalog_entry.h"
Expand Down Expand Up @@ -104,11 +107,13 @@ std::string createFTSIndexQuery(ClientContext& context, const TableFuncBindData&
}

// Create the stop words table if not exists, or the user is not using the default english one.
query += createStopWordsTableIfNotExists(context, ftsBindData->getStopWordsTableName());
auto stopWordsTableName = FTSUtils::getStopWordsTableName();
query += createStopWordsTableIfNotExists(context, stopWordsTableName);

// Create the terms_in_doc table which serves as a temporary table to store the relationship
// between terms and docs.
auto appearsInfoTableName = ftsBindData->getAppearsInfoTableName();
auto appearsInfoTableName =
FTSUtils::getAppearsInfoTableName(ftsBindData->tableID, ftsBindData->indexName);
query +=
common::stringFormat("CREATE NODE TABLE `{}` (ID SERIAL, term string, docID INT64, primary "
"key(ID));",
Expand All @@ -123,11 +128,11 @@ std::string createFTSIndexQuery(ClientContext& context, const TableFuncBindData&
"WHERE t1 is NOT NULL AND SIZE(t1) > 0 AND "
"NOT EXISTS {MATCH (s:`{}` {sw: t1})} "
"RETURN STEM(t1, '{}'), id1);",
appearsInfoTableName, tableName, property, ftsBindData->getStopWordsTableName(),
appearsInfoTableName, tableName, property, stopWordsTableName,
ftsBindData->ftsConfig.stemmer);
}

auto docsTableName = ftsBindData->getDocsTableName();
auto docsTableName = FTSUtils::getDocsTableName(ftsBindData->tableID, ftsBindData->indexName);
// Create the docs table which records the number of words in each document.
query += common::stringFormat(
"CREATE NODE TABLE `{}` (docID INT64, len UINT64, primary key(docID));", docsTableName);
Expand All @@ -136,7 +141,7 @@ std::string createFTSIndexQuery(ClientContext& context, const TableFuncBindData&
"RETURN t.docID, CAST(count(t) AS UINT64)); ",
docsTableName, appearsInfoTableName);

auto termsTableName = ftsBindData->getTermsTableName();
auto termsTableName = FTSUtils::getTermsTableName(ftsBindData->tableID, ftsBindData->indexName);
// Create the terms (dictionary) table which records all distinct terms and their document
// frequency.
query += common::stringFormat(
"CREATE NODE TABLE `{}` (term STRING, df UINT64, PRIMARY KEY(term));", termsTableName);
Expand All @@ -145,7 +150,8 @@ std::string createFTSIndexQuery(ClientContext& context, const TableFuncBindData&
"RETURN t.term, CAST(count(distinct t.docID) AS UINT64));",
termsTableName, appearsInfoTableName);

auto appearsInTableName = ftsBindData->getAppearsInTableName();
auto appearsInTableName =
FTSUtils::getAppearsInTableName(ftsBindData->tableID, ftsBindData->indexName);
// Finally, create a terms table that records the documents in which the terms appear, along
// with the frequency of each term.
query +=
Expand All @@ -158,6 +164,12 @@ std::string createFTSIndexQuery(ClientContext& context, const TableFuncBindData&

// Drop the intermediate terms_in_doc table.
query += common::stringFormat("DROP TABLE `{}`;", appearsInfoTableName);
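// NOTE(review): the statements below write the generated query to /tmp/query.txt. This looks
// like leftover debugging output (hard-coded path, function-scope `using namespace std`).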

using namespace std;
ofstream myfile("/tmp/query.txt");
myfile << query << endl;
myfile.close();
return query;
}

Expand Down Expand Up @@ -194,9 +206,10 @@ void LenCompute::vertexCompute(const graph::VertexScanState::Chunk& chunk) {

// Do vertex compute to get the numDocs and avgDocLen.
static common::offset_t tableFunc(TableFuncInput& input, TableFuncOutput& /*output*/) {
auto& createFTSBindData = *input.bindData->constPtrCast<CreateFTSBindData>();
auto& bindData = *input.bindData->constPtrCast<CreateFTSBindData>();
auto& context = *input.context;
auto docTableName = createFTSBindData.getDocsTableName();
auto docTableName = FTSUtils::getDocsTableName(bindData.tableID, bindData.indexName);
;
auto docTableEntry = context.clientContext->getCatalog()->getTableCatalogEntry(
context.clientContext->getTx(), docTableName);
graph::GraphEntry entry{{docTableEntry}, {} /* relTableEntries */};
Expand All @@ -208,8 +221,8 @@ static common::offset_t tableFunc(TableFuncInput& input, TableFuncOutput& /*outp
auto numDocs = sharedState.numDocs.load();
auto avgDocLen = numDocs == 0 ? 0 : (double)sharedState.totalLen.load() / numDocs;
context.clientContext->getCatalog()->createIndex(context.clientContext->getTx(),
std::make_unique<fts_extension::FTSIndexCatalogEntry>(createFTSBindData.tableID,
createFTSBindData.indexName, numDocs, avgDocLen, createFTSBindData.ftsConfig));
std::make_unique<fts_extension::FTSIndexCatalogEntry>(bindData.tableID, bindData.indexName,
numDocs, avgDocLen, bindData.ftsConfig));
return 0;
}

Expand Down
12 changes: 7 additions & 5 deletions extension/fts/src/function/drop_fts_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ static std::unique_ptr<TableFuncBindData> bindFunc(ClientContext* context,

// Builds the query string that drops the tables backing an FTS index.
//
// `bindData` is cast to FTSBindData; its `tableID` and `indexName` are passed to the FTSUtils
// table-name helpers. The context parameter is unused.
//
// Returns four "DROP TABLE `<name>`;" statements concatenated in this order: stop words table,
// appears-in table, docs table, terms table.
//
// The table names are resolved only through the FTSUtils helpers so that `query` is declared
// once and every table is dropped exactly once.
std::string dropFTSIndexQuery(ClientContext& /*context*/, const TableFuncBindData& bindData) {
    auto ftsBindData = bindData.constPtrCast<FTSBindData>();
    // NOTE(review): getStopWordsTableName() takes no table/index arguments, so the stop words
    // table does not appear to be per-index - confirm that dropping it here is intended.
    std::string query = common::stringFormat("DROP TABLE `{}`;", FTSUtils::getStopWordsTableName());
    query += common::stringFormat("DROP TABLE `{}`;",
        FTSUtils::getAppearsInTableName(ftsBindData->tableID, ftsBindData->indexName));
    query += common::stringFormat("DROP TABLE `{}`;",
        FTSUtils::getDocsTableName(ftsBindData->tableID, ftsBindData->indexName));
    query += common::stringFormat("DROP TABLE `{}`;",
        FTSUtils::getTermsTableName(ftsBindData->tableID, ftsBindData->indexName));
    return query;
}

Expand Down
60 changes: 0 additions & 60 deletions extension/fts/src/function/get_keys.cpp

This file was deleted.

Loading