"""Sentence and word tokenisation with NLTK.

Reconstructed from the patch "Create tokenisation.py", which adds this
module as scripts/tokenisation.py.
"""

from typing import Dict, List

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Install the NLTK library if you haven't already: pip install nltk

# Punkt tokenizer data needed by sent_tokenize / word_tokenize. Older NLTK
# releases load the "punkt" models, while recent releases look up "punkt_tab"
# instead, so both are ensured to work with whichever version is installed.
_PUNKT_RESOURCES = ("punkt", "punkt_tab")


def _ensure_punkt_resources() -> None:
    """Download each Punkt resource that is not already available locally."""
    for resource in _PUNKT_RESOURCES:
        try:
            nltk.data.find(f"tokenizers/{resource}")
        except LookupError:
            # Only hit the network when the resource is actually missing.
            nltk.download(resource)


# Make sure the necessary NLTK resources are present. This stays at import
# time (as in the original script) so importing the module is still enough
# to make tokenize_text usable.
_ensure_punkt_resources()


def tokenize_text(text: str) -> Dict[str, List[str]]:
    """Tokenize the input text into sentences and words.

    Args:
        text (str): The input text string.

    Returns:
        dict: A dictionary with two keys:
            'sentences' - the result of ``sent_tokenize(text)``;
            'words' - the result of ``word_tokenize(text)`` (punctuation
            marks are returned as separate tokens).
    """
    return {
        'sentences': sent_tokenize(text),
        'words': word_tokenize(text),
    }


def main() -> None:
    """Run the example: tokenize a short sample text and print the result."""
    sample_text = """Natural language processing (NLP) is a field of artificial intelligence.
    It focuses on the interaction between computers and humans through language."""

    tokenized_output = tokenize_text(sample_text)

    print("Sentences:", tokenized_output['sentences'])
    print("Words:", tokenized_output['words'])


if __name__ == "__main__":
    main()

# Output:
# Sentences: ['Natural language processing (NLP) is a field of artificial intelligence.', 'It focuses on the interaction between computers and humans through language.']
# Words: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '.', 'It', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'language', '.']

# How to Run:
# Run the script: python tokenisation.py