Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add position of word in inverted index #17

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions engine/custom_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,32 @@ def save_pages():
def add_document_to_index(doc_id, words: list[str]):
    """Record the position of every word of a document in ``inverted_index``.

    The module-level ``inverted_index`` maps each word to a list of postings.
    A posting is ``[doc_id, [position, ...]]`` where the positions are the
    1-based indexes of the word in ``words``.

    Args:
        doc_id: Identifier stored in the postings created for this document.
        words: The document's words, in document order. Duplicates are
            expected; each occurrence contributes its own position.

    Calling the function again for the same ``doc_id`` only adds positions
    that are not already recorded for that word and document.
    """
    # ``inverted_index`` is only mutated in place, never rebound, so no
    # ``global`` declaration is needed.
    print("Adding stuff")

    # Positions list of *this* document for each word met so far in this call.
    # Resolving it on a word's first occurrence means the postings of a word
    # are scanned once per distinct word instead of once per occurrence.
    positions_by_word = {}

    for position, word in enumerate(words, start=1):
        positions = positions_by_word.get(word)

        if positions is None:
            if word not in inverted_index:
                # The word is not in the inverted index yet: create it.
                print(f"Adding new word {word}")
                positions = [position]
                inverted_index[word] = [[doc_id, positions]]
            else:
                postings = inverted_index[word]
                # First posting belonging to this document, if there is one.
                posting = next((p for p in postings if p[0] == doc_id), None)
                if posting is None:
                    # The word is known, but not for this document.
                    print(f"Adding new document {doc_id} to word {word}")
                    positions = [position]
                    postings.append([doc_id, positions])
                else:
                    # The document is already listed; reuse its positions.
                    positions = posting[1]
            positions_by_word[word] = positions

        # A freshly created posting already contains ``position``; this only
        # fires for further occurrences or when extending an older posting.
        if position not in positions:
            print(f"Adding new position {position} to word {word} in document {doc_id}")
            positions.append(position)


def index_pages():
Expand All @@ -42,5 +66,3 @@ def access_index():


# Convert the inverted index to a DataFrame