From fa54eafa4d37f5af0c7e57e6fec2015585c6a238 Mon Sep 17 00:00:00 2001 From: am9zZWY <46693545+am9zZWY@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:14:33 +0200 Subject: [PATCH] Add position of word in inverse index --- engine/custom_db.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/engine/custom_db.py b/engine/custom_db.py index 93cfc70..ec22b03 100644 --- a/engine/custom_db.py +++ b/engine/custom_db.py @@ -26,8 +26,32 @@ def save_pages(): def add_document_to_index(doc_id, words: list[str]): print(f"Adding stuff") global inverted_index - for word in set(words): - inverted_index[word].append(doc_id) + for word_index, word in enumerate(words): + position = word_index + 1 + # If the word is not in the invertex index, simply add it + if word not in inverted_index: + print(f"Adding new word {word}") + inverted_index[word] = [[doc_id, [position]]] + else: + # There are two cases. + # The word is in the inverted index: + # - but the document is not in the list + # - and the document is in the list but the position is not + + # Check if the document is in the list + doc_ids = [doc[0] for doc in inverted_index[word]] + + if doc_id not in doc_ids: + print(f"Adding new document {doc_id} to word {word}") + inverted_index[word].append([doc_id, [position]]) + else: + # The document is in the list, but the position is not + # Get the index of the document + doc_index = doc_ids.index(doc_id) + # Check if the position is in the list + if position not in inverted_index[word][doc_index][1]: + print(f"Adding new position {position} to word {word} in document {doc_id}") + inverted_index[word][doc_index][1].append(position) def index_pages(): @@ -42,5 +66,3 @@ def access_index(): # Convert the inverted index to a DataFrame - -