Skip to content

Commit

Permalink
Create stemming.py
Browse files Browse the repository at this point in the history
  • Loading branch information
DianaHoefels authored Oct 13, 2024
1 parent 4b37276 commit 4509a40
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions scripts/stemming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Install the NLTK library if you haven't already: pip install nltk

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Make sure the tokenizer data needed by word_tokenize is downloaded.
# NLTK 3.8.2+ loads the "punkt_tab" resource rather than the pickled "punkt"
# models, so both are requested; on older releases that do not list
# "punkt_tab" the downloader is expected to report the miss and return False
# instead of raising.
nltk.download('punkt')
nltk.download('punkt_tab')

def stem_words(text):
    """
    Split the input text into word tokens and stem each token.

    Args:
        text (str): The text to tokenize and stem.

    Returns:
        list: The stemmed form of every token, in tokenizer order.
    """
    # A fresh Porter stemmer is created for each call.
    porter = PorterStemmer()

    # Tokenize first, then run every token through the stemmer.
    return list(map(porter.stem, word_tokenize(text)))

if __name__ == "__main__":
    # Demo: stem a short two-sentence passage and show the resulting list.
    sample_text = """Natural language processing includes tasks such as tokenization, stemming, and lemmatization.
It focuses on analyzing and understanding human languages."""

    print("Stemmed Words:", stem_words(sample_text))

# Output: Stemmed Words: ['natur', 'languag', 'process', 'includ', 'task', 'such', 'as', 'token', ',', 'stem', ',', 'and', 'lemmat', '.', 'it', 'focus', 'on', 'analyz', 'and', 'understand', 'human', 'languag', '.']

0 comments on commit 4509a40

Please sign in to comment.