-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
temp commit, just cleaning out my disk
- Loading branch information
1 parent
e6700d8
commit 21aa243
Showing
2 changed files
with
30 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import numpy as np | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
|
||
def sentence_to_random_embedding(sentence, embedding_dim=1024, seed=None):
    """Encode *sentence* as a count-weighted sum of random word vectors.

    Tokenizes the sentence with sklearn's ``CountVectorizer`` (bag of
    words), assigns each vocabulary word a freshly drawn Gaussian vector of
    length ``embedding_dim``, and returns the sum of those vectors weighted
    by each word's count in the sentence.

    Parameters
    ----------
    sentence : str
        Text to encode.
    embedding_dim : int, optional
        Dimensionality of the returned vector (default 1024).
    seed : int or None, optional
        Seed for the random word vectors. Pass an int to make the output
        reproducible; the default ``None`` preserves the original
        behavior, where every call draws new random vectors and the same
        sentence yields a different embedding each time.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(embedding_dim,)``. The zero vector when the
        sentence produces no tokens (empty or punctuation-only input).
    """
    rng = np.random.default_rng(seed)

    vectorizer = CountVectorizer()
    try:
        token_counts = vectorizer.fit_transform([sentence])
    except ValueError:
        # CountVectorizer raises "empty vocabulary" when no valid tokens
        # are found; treat that as an all-zero embedding instead of crashing.
        return np.zeros(embedding_dim)
    vocabulary = vectorizer.get_feature_names_out()

    # One random embedding_dim-dimensional Gaussian vector per vocabulary word.
    word_embeddings = {word: rng.standard_normal(embedding_dim) for word in vocabulary}

    # Sum the word vectors, weighted by how often each word occurs.
    sentence_embedding = np.zeros(embedding_dim)
    for word in vocabulary:
        count = token_counts[0, vectorizer.vocabulary_[word]]
        sentence_embedding += count * word_embeddings[word]

    return sentence_embedding
|
||
# Demonstration: embed one sentence and show the result.
demo_sentence = "This is an example sentence to encode into a 1024-dimensional vector."
vec = sentence_to_random_embedding(demo_sentence)
print(vec)
print("Embedding shape:", vec.shape)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters