-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenisation.py
41 lines (31 loc) · 1.49 KB
/
tokenisation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# Install the NLTK library if you haven't already: pip install nltk
# sent_tokenize and word_tokenize rely on the Punkt sentence models, which are
# distributed separately from the library. Older NLTK releases load them from
# the 'punkt' package, while newer releases (3.8.2 and later) look for
# 'punkt_tab' instead, so both are requested to keep the script working
# regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
def tokenize_text(text):
    """
    Split the input text into sentences and into word-level tokens.

    Args:
        text (str): The input text string.

    Returns:
        dict: A dictionary with two keys:
            'sentences' - the result of sent_tokenize(text)
            'words'     - the result of word_tokenize(text)
    """
    # Both tokenizers run on the full, unmodified text; the sentence split
    # is evaluated first, then the word split.
    return dict(
        sentences=sent_tokenize(text),
        words=word_tokenize(text),
    )
if __name__ == "__main__":
    # Demo: tokenize a short two-sentence passage and print both views of it.
    sample_text = """Natural language processing (NLP) is a field of artificial intelligence.
It focuses on the interaction between computers and humans through language."""
    demo_result = tokenize_text(sample_text)
    # Print each part of the result under its label, sentences first.
    for label, key in (("Sentences:", 'sentences'), ("Words:", 'words')):
        print(label, demo_result[key])
# Output:
# Sentences: ['Natural language processing (NLP) is a field of artificial intelligence.', 'It focuses on the interaction between computers and humans through language.']
# Words: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'field', 'of', 'artificial', 'intelligence', '.', 'It', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'through', 'language', '.']
# How to Run:
# Run the script: python tokenisation.py