Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write vocabulary files to separate directory #1237

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions src/ServerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ int main(int argc, char** argv) {
// filled / set depending on the options.
using ad_utility::NonNegative;

std::string indexBasename;
std::string baseNameIndex;
std::string baseNameVocabulary;
std::string accessToken;
bool text = false;
unsigned short port;
Expand All @@ -59,8 +60,11 @@ int main(int argc, char** argv) {
};
add("help,h", "Produce this help message.");
// TODO<joka921> Can we output the "required" automatically?
add("index-basename,i", po::value<std::string>(&indexBasename)->required(),
add("index-basename,i", po::value<std::string>(&baseNameIndex)->required(),
"The basename of the index files (required).");
add("vocabulary-basename,v", po::value<std::string>(&baseNameVocabulary),
"The basename of the vocabulary files"
" (default: same as basename of the index files).");
add("port,p", po::value<unsigned short>(&port)->required(),
"The port on which HTTP requests are served (required).");
add("access-token,a", po::value<std::string>(&accessToken)->default_value(""),
Expand Down Expand Up @@ -122,14 +126,20 @@ int main(int argc, char** argv) {
return EXIT_FAILURE;
}

// If no vocabulary basename is given, use the index basename.
if (baseNameVocabulary.empty()) {
baseNameVocabulary = baseNameIndex;
}

LOG(INFO) << EMPH_ON << "QLever Server, compiled on "
<< qlever::version::DatetimeOfCompilation << " using git hash "
<< qlever::version::GitShortHash() << EMPH_OFF << std::endl;

try {
Server server(port, numSimultaneousQueries, memoryMaxSize,
std::move(accessToken), !noPatternTrick);
server.run(indexBasename, text, !noPatterns, !onlyPsoAndPosPermutations);
server.run(baseNameIndex, baseNameVocabulary, text, !noPatterns,
!onlyPsoAndPosPermutations);
Comment on lines 139 to +142
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think one thing you could do (the outer interface that interacts with the control script is mostly your area)
is to set up a simple struct ServerConfig that is then passed to the constructor as well as the run() function, where they grab their respectively needed arguments. That makes it much easier to add additional arguments.
(Probably we need a similar struct IndexConfig that then becomes part of the server config for exactly the same reason).
Then it will become much easier to add additional arguments.
Are you interested in setting this up as a separate PR, or should I do this?

} catch (const std::exception& e) {
// This code should never be reached as all exceptions should be handled
// within server.run()
Expand Down
2 changes: 1 addition & 1 deletion src/VocabularyMergerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ int main(int argc, char** argv) {
auto internalVocabularyAction = [&file](const auto& word) {
file << RdfEscaping::escapeNewlinesAndBackslashes(word) << '\n';
};
m.mergeVocabulary(basename, numFiles, TripleComponentComparator(),
m.mergeVocabulary(basename, basename, numFiles, TripleComponentComparator(),
internalVocabularyAction, 4_GB);
}
12 changes: 7 additions & 5 deletions src/engine/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,16 @@ Server::Server(unsigned short port, size_t numThreads,
}

// __________________________________________________________________________
void Server::initialize(const string& indexBaseName, bool useText,
void Server::initialize(const string& baseNameIndex,
const string& baseNameVocabulary, bool useText,
bool usePatterns, bool loadAllPermutations) {
LOG(INFO) << "Initializing server ..." << std::endl;

index_.usePatterns() = usePatterns;
index_.loadAllPermutations() = loadAllPermutations;

// Init the index.
index_.createFromOnDiskIndex(indexBaseName);
index_.createFromOnDiskIndex(baseNameIndex, baseNameVocabulary);
if (useText) {
index_.addTextFromOnDiskIndex();
}
Expand All @@ -78,8 +79,8 @@ void Server::initialize(const string& indexBaseName, bool useText,
}

// _____________________________________________________________________________
void Server::run(const string& indexBaseName, bool useText, bool usePatterns,
bool loadAllPermutations) {
void Server::run(const string& baseNameIndex, const string& baseNameVocabulary,
bool useText, bool usePatterns, bool loadAllPermutations) {
using namespace ad_utility::httpUtils;

// Function that handles a request asynchronously, will be passed as argument
Expand Down Expand Up @@ -154,7 +155,8 @@ void Server::run(const string& indexBaseName, bool useText, bool usePatterns,
std::move(webSocketSessionSupplier)};

// Initialize the index
initialize(indexBaseName, useText, usePatterns, loadAllPermutations);
initialize(baseNameIndex, baseNameVocabulary, useText, usePatterns,
loadAllPermutations);

// Start listening for connections on the server.
httpServer.run();
Expand Down
8 changes: 5 additions & 3 deletions src/engine/Server.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,15 @@ class Server {

private:
//! Initialize the server.
void initialize(const string& indexBaseName, bool useText,
bool usePatterns = true, bool loadAllPermutations = true);
void initialize(const string& baseNameIndex, const string& baseNameVocabulary,
bool useText, bool usePatterns = true,
bool loadAllPermutations = true);

public:
//! First initialize the server. Then loop, wait for requests and trigger
//! processing. This method never returns except when throwing an exception.
void run(const string& indexBaseName, bool useText, bool usePatterns = true,
void run(const string& baseNameIndex, const string& baseNameVocabulary,
bool useText, bool usePatterns = true,
bool loadAllPermutations = true);

Index& index() { return index_; }
Expand Down
5 changes: 3 additions & 2 deletions src/index/ConstantsIndexBuilding.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@ static const size_t BZIP2_MAX_TOTAL_BUFFER_SIZE = 1 << 30;
static const size_t THRESHOLD_RELATION_CREATION = 2 << 20;

// ________________________________________________________________
static const std::string PARTIAL_VOCAB_FILE_NAME = ".tmp.partial-vocabulary.";
static const std::string PARTIAL_MMAP_IDS = ".tmp.partial-ids-mmap.";
static const std::string PARTIAL_VOCAB_FILE_NAME =
".tmp.partial-vocabulary.words.";
static const std::string PARTIAL_MMAP_IDS = ".tmp.partial-vocabulary.ids.";

// ________________________________________________________________
static const std::string TMP_BASENAME_COMPRESSION =
Expand Down
10 changes: 6 additions & 4 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ void Index::createFromFile(const std::string& filename) {
}

// ____________________________________________________________________________
void Index::createFromOnDiskIndex(const std::string& onDiskBase) {
pimpl_->createFromOnDiskIndex(onDiskBase);
void Index::createFromOnDiskIndex(const std::string& onDiskBaseIndex,
const std::string& onDiskBaseVocabulary) {
pimpl_->createFromOnDiskIndex(onDiskBaseIndex, onDiskBaseVocabulary);
}

// ____________________________________________________________________________
Expand Down Expand Up @@ -193,8 +194,9 @@ const ad_utility::MemorySize& Index::memoryLimitIndexBuilding() const {
}

// ____________________________________________________________________________
void Index::setOnDiskBase(const std::string& onDiskBase) {
return pimpl_->setOnDiskBase(onDiskBase);
void Index::setOnDiskBase(const std::string& onDiskBaseIndex,
const std::string& onDiskBaseVocabulary) {
return pimpl_->setOnDiskBase(onDiskBaseIndex, onDiskBaseVocabulary);
}

// ____________________________________________________________________________
Expand Down
6 changes: 4 additions & 2 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ class Index {
// constructed using the `createFromFile` method which is typically called via
// `IndexBuilderMain`. Read necessary metadata into memory and open file
// handles.
void createFromOnDiskIndex(const std::string& onDiskBase);
void createFromOnDiskIndex(const std::string& onDiskBaseIndex,
const std::string& onDiskVocabulary);

// Add a text index to a complete KB index. First read the given context
// file (if file name not empty), then add words from literals (if true).
Expand Down Expand Up @@ -181,7 +182,8 @@ class Index {

ad_utility::MemorySize& blocksizePermutationsPerColumn();

void setOnDiskBase(const std::string& onDiskBase);
void setOnDiskBase(const std::string& onDiskBaseIndex,
const std::string& onDiskBaseVocabulary);

void setSettingsFile(const std::string& filename);

Expand Down
24 changes: 17 additions & 7 deletions src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ int main(int argc, char** argv) {
std::locale locWithNumberGrouping(loc, &facet);
ad_utility::Log::imbue(locWithNumberGrouping);

string baseName;
string baseNameIndex;
string baseNameVocabulary;
string wordsfile;
string docsfile;
string textIndexName;
Expand All @@ -86,8 +87,11 @@ int main(int argc, char** argv) {
boostOptions.add_options()(std::forward<Args>(args)...);
};
add("help,h", "Produce this help message.");
add("index-basename,i", po::value(&baseName)->required(),
"The basename of the output files (required).");
add("index-basename,i", po::value(&baseNameIndex)->required(),
"The basename of the index files (required).");
add("vocabulary-basename,v", po::value(&baseNameVocabulary),
"The basename of the vocabulary files"
"(default: same as basename of the index fles).");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"(default: same as basename of the index fles).");
"(default: same as basename of the index files).");

add("kg-input-file,f", po::value(&inputFile),
"The file with the knowledge graph data to be parsed from. If omitted, "
"will read from stdin.");
Expand Down Expand Up @@ -152,6 +156,12 @@ int main(int argc, char** argv) {
index.memoryLimitIndexBuilding() = stxxlMemory.value();
}

// If no external vocabulary basename was specified, use the same as the
// index basename.
if (baseNameVocabulary.empty()) {
baseNameVocabulary = baseNameIndex;
}
Comment on lines +161 to +163
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the baseNameVocabulary is a std::optional<string>, then you can pass it as-is all the way down, and you only have to resolve the default in a single place in the index (in setOnDiskBase), or not even there but only in the functions that interact with the vocabulary.


// If no text index name was specified, take the part of the wordsfile after
// the last slash.
if (textIndexName.empty() && !wordsfile.empty()) {
Expand All @@ -170,17 +180,17 @@ int main(int argc, char** argv) {

try {
LOG(TRACE) << "Configuring STXXL..." << std::endl;
size_t posOfLastSlash = baseName.rfind('/');
string location = baseName.substr(0, posOfLastSlash + 1);
string tail = baseName.substr(posOfLastSlash + 1);
size_t posOfLastSlash = baseNameIndex.rfind('/');
string location = baseNameIndex.substr(0, posOfLastSlash + 1);
string tail = baseNameIndex.substr(posOfLastSlash + 1);
writeStxxlConfigFile(location, tail);
string stxxlFileName = getStxxlDiskFileName(location, tail);
LOG(TRACE) << "done." << std::endl;

index.setKbName(kbIndexName);
index.setTextName(textIndexName);
index.usePatterns() = !noPatterns;
index.setOnDiskBase(baseName);
index.setOnDiskBase(baseNameIndex, baseNameVocabulary);
index.setKeepTempFiles(keepTemporaryFiles);
index.setSettingsFile(settingsFile);
index.setPrefixCompression(!noPrefixCompression);
Expand Down
24 changes: 12 additions & 12 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ void IndexImpl::addTextFromContextFile(const string& contextFile,
bool addWordsFromLiterals) {
LOG(INFO) << std::endl;
LOG(INFO) << "Adding text index ..." << std::endl;
string indexFilename = onDiskBase_ + ".text.index";
string indexFilename = onDiskBaseIndex_ + ".text.index";
// Either read words from given file or consider each literal as text record
// or both (but at least one of them, otherwise this function is not called).
if (!contextFile.empty()) {
Expand All @@ -107,14 +107,14 @@ void IndexImpl::addTextFromContextFile(const string& contextFile,
LOG(DEBUG) << "Reloading the RDF vocabulary ..." << std::endl;
vocab_ = RdfsVocabulary{};
readConfiguration();
vocab_.readFromFile(onDiskBase_ + INTERNAL_VOCAB_SUFFIX,
onDiskBase_ + EXTERNAL_VOCAB_SUFFIX);
vocab_.readFromFile(onDiskBaseVocabulary_ + INTERNAL_VOCAB_SUFFIX,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you make the onDiskBaseVocabulary_ an optional (see my suggestion above), and then introduce a function onDiskBaseVocabulary() that returns something like onDiskBaseVocabulary_.value_or(onDiskBaseIndex_), then everything is clean, nice, and typesafe :)

onDiskBaseVocabulary_ + EXTERNAL_VOCAB_SUFFIX);

// Build the text vocabulary (first scan over the text records).
LOG(INFO) << "Building text vocabulary ..." << std::endl;
size_t nofLines =
processWordsForVocabulary(contextFile, addWordsFromLiterals);
textVocab_.writeToFile(onDiskBase_ + ".text.vocabulary");
textVocab_.writeToFile(onDiskBaseVocabulary_ + ".text.vocabulary");

// Build the half-inverted lists (second scan over the text records).
LOG(INFO) << "Building the half-inverted index lists ..." << std::endl;
Expand All @@ -134,7 +134,7 @@ void IndexImpl::addTextFromContextFile(const string& contextFile,
void IndexImpl::buildDocsDB(const string& docsFileName) const {
LOG(INFO) << "Building DocsDB...\n";
std::ifstream docsFile{docsFileName};
std::ofstream ofs(onDiskBase_ + ".text.docsDB", std::ios_base::out);
std::ofstream ofs(onDiskBaseIndex_ + ".text.docsDB", std::ios_base::out);
// To avoid excessive use of RAM,
// we write the offsets to an stxxl::vector first.
stxxl::vector<off_t> offsets;
Expand All @@ -161,7 +161,7 @@ void IndexImpl::buildDocsDB(const string& docsFileName) const {

ofs.close();
// Now append the tmp file to the docsDB file.
ad_utility::File out(onDiskBase_ + ".text.docsDB", "a");
ad_utility::File out(onDiskBaseIndex_ + ".text.docsDB", "a");
for (size_t i = 0; i < offsets.size(); ++i) {
off_t cur = offsets[i];
out.write(&cur, sizeof(cur));
Expand All @@ -173,10 +173,10 @@ void IndexImpl::buildDocsDB(const string& docsFileName) const {
// _____________________________________________________________________________
void IndexImpl::addTextFromOnDiskIndex() {
// Read the text vocabulary (into RAM).
textVocab_.readFromFile(onDiskBase_ + ".text.vocabulary");
textVocab_.readFromFile(onDiskBaseVocabulary_ + ".text.vocabulary");

// Initialize the text index.
std::string textIndexFileName = onDiskBase_ + ".text.index";
std::string textIndexFileName = onDiskBaseIndex_ + ".text.index";
LOG(INFO) << "Reading metadata from file " << textIndexFileName << " ..."
<< std::endl;
textIndexFile_.open(textIndexFileName.c_str(), "r");
Expand All @@ -194,11 +194,11 @@ void IndexImpl::addTextFromOnDiskIndex() {
// without this, but then there is no content to show when a text record
// matches. This is perfectly fine when the text records come from IRIs or
// literals from our RDF vocabulary.
std::string docsDbFileName = onDiskBase_ + ".text.docsDB";
std::string docsDbFileName = onDiskBaseIndex_ + ".text.docsDB";
std::ifstream f(docsDbFileName.c_str());
if (f.good()) {
f.close();
docsDB_.init(string(onDiskBase_ + ".text.docsDB"));
docsDB_.init(string(onDiskBaseIndex_ + ".text.docsDB"));
LOG(INFO) << "Registered text records: #records = " << docsDB_._size
<< std::endl;
} else {
Expand Down Expand Up @@ -707,8 +707,8 @@ size_t IndexImpl::writeCodebook(const vector<T>& codebook,

// _____________________________________________________________________________
void IndexImpl::openTextFileHandle() {
AD_CONTRACT_CHECK(!onDiskBase_.empty());
textIndexFile_.open(string(onDiskBase_ + ".text.index").c_str(), "r");
AD_CONTRACT_CHECK(!onDiskBaseIndex_.empty());
textIndexFile_.open(string(onDiskBaseIndex_ + ".text.index").c_str(), "r");
}

// _____________________________________________________________________________
Expand Down
Loading
Loading