# code.py
# Part-of-speech tagger that labels each token using Assamese and English word lists.
import pandas as pd
import re
def _load_lexicon(csv_path, column):
    """Read *csv_path* with pandas and return the values of *column* as a set."""
    frame = pd.read_csv(csv_path)
    return set(frame[column].tolist())


# Assamese word lists: one CSV per part of speech. The column header is the
# same string pos_tag returns for words found in that list.
assamese_pronouns = _load_lexicon('/content/assamese_pronouns.csv', 'AS-PRON')
assamese_nouns = _load_lexicon('/content/assamese_nouns.csv', 'AS-NOUN')
assamese_verbs = _load_lexicon('/content/assamese_verbs.csv', 'AS-VERB')
assamese_adjectives = _load_lexicon('/content/assamese_adjectives.csv', 'AS-ADJ')
assamese_adverbs = _load_lexicon('/content/assamese_adverbs.csv', 'AS-ADV')
assamese_conjunctions = _load_lexicon('/content/assamese_conjunctions.csv', 'AS-CONJ')

# English word lists, laid out the same way.
english_pronouns = _load_lexicon('/content/english_pronouns.csv', 'EN-PRON')
english_nouns = _load_lexicon('/content/english_nouns.csv', 'EN-NOUN')
english_verbs = _load_lexicon('/content/english_verbs.csv', 'EN-VERB')
english_adjectives = _load_lexicon('/content/english_adjectives.csv', 'EN-ADJ')
english_adverbs = _load_lexicon('/content/english_adverbs.csv', 'EN-ADV')
english_prepositions = _load_lexicon('/content/english_prepositions.csv', 'EN-PREP')
english_conjunctions = _load_lexicon('/content/english_conjunctions.csv', 'EN-CONJ')
english_interjections = _load_lexicon('/content/english_interjections.csv', 'EN-INTJ')
english_determiners = _load_lexicon('/content/english_determiners.csv', 'EN-DT')
# Word endings that pos_tag checks with str.endswith, one table per tag.
# NOTE: entries containing a space can only match a token that itself contains
# a space; pos_tag_sentences splits sentences on whitespace before tagging.
assamese_noun_suffixes = [
    'ৰ', 'ত', 'লৈ', 'টি', 'খিনি',
    'ৰে', 'ডাল', 'জনা', 'বো', 'তকৈ',
    'ৰ পৰা', 'তত', 'হক', 'কে', 'কেইজন',
]
assamese_verb_suffixes = [
    'ছি', 'ছিল', 'ল', 'িব', 'য়', 'া',
    'ই আছো', 'িলেহে', 'ি থাকে', 'ইলো', 'না', 'িব পৰা',
    'ক', 'োৱা নাই', 'ৱে', 'হক', 'হল',
]
assamese_adjective_suffixes = [
    'তম', 'বিলাক', 'খিনি', 'পৰা', 'ীয়া', 'মুখী',
]
assamese_adverb_suffixes = [
    'তকৈ', 'দিনে', 'ই', 'তে', 'লৈকে', 'পৰা', 'হয়',
]

# English suffix tables. Nothing in the code shown in this file reads them.
english_noun_suffixes = [
    's', 'es', 'ion', 'ment', 'ity',
]
english_verb_suffixes = [
    'ed', 's', 'ing', 'ize', 'ify',
]
english_adjective_suffixes = [
    'able', 'ible', 'al', 'ful',
    'ive', 'ous', 'ish', 'less',
]
english_adverb_suffixes = [
    'ly', 'ward', 'wise',
]
def pos_tag(token, previous_token=None, next_token=None):
    """Return the part-of-speech tag for a single token.

    Lookup order:

    1. Exact match of ``token`` in an Assamese word list.
    2. Match in an English word list, trying the lower-cased token first
       and then the token exactly as written.
    3. Assamese suffix heuristics, for tokens found in no word list.

    Word lists are consulted before any suffix heuristic so that a listed
    word is never re-tagged because of its ending (for example a listed
    adjective ending in a noun suffix, or a listed conjunction ending in a
    verb suffix).

    Args:
        token: The word to tag.
        previous_token: The word to the left, or None. Accepted so existing
            callers keep working; it is not consulted, because a neighbour
            could only ever confirm the tag of the word list the token itself
            belongs to, and steps 1-2 already return that tag.
        next_token: The word to the right, or None. Same remark as above.

    Returns:
        One of the ``AS-*`` / ``EN-*`` tag strings, or ``'UNK'`` when neither
        a word list nor a suffix heuristic matches.
    """
    token_lower = token.lower()

    # Order within each table is the tie-break when a word sits in several lists.
    assamese_lexicons = (
        (assamese_pronouns, 'AS-PRON'),
        (assamese_nouns, 'AS-NOUN'),
        (assamese_verbs, 'AS-VERB'),
        (assamese_adjectives, 'AS-ADJ'),
        (assamese_adverbs, 'AS-ADV'),
        (assamese_conjunctions, 'AS-CONJ'),
    )
    for lexicon, tag in assamese_lexicons:
        if token in lexicon:
            return tag

    english_lexicons = (
        (english_pronouns, 'EN-PRON'),
        (english_nouns, 'EN-NOUN'),
        (english_verbs, 'EN-VERB'),
        (english_adjectives, 'EN-ADJ'),
        (english_adverbs, 'EN-ADV'),
        (english_prepositions, 'EN-PREP'),
        (english_conjunctions, 'EN-CONJ'),
        (english_interjections, 'EN-INTJ'),
        (english_determiners, 'EN-DT'),
    )
    # Lower-cased form first; the as-written form only matters when a word
    # list stores an entry with capital letters.
    if token_lower == token:
        candidates = (token_lower,)
    else:
        candidates = (token_lower, token)
    for candidate in candidates:
        for lexicon, tag in english_lexicons:
            if candidate in lexicon:
                return tag

    # Fallback for unlisted (typically inflected) Assamese words.
    suffix_rules = (
        (assamese_noun_suffixes, 'AS-NOUN'),
        (assamese_verb_suffixes, 'AS-VERB'),
        (assamese_adjective_suffixes, 'AS-ADJ'),
        (assamese_adverb_suffixes, 'AS-ADV'),
    )
    for suffixes, tag in suffix_rules:
        if token_lower.endswith(tuple(suffixes)):
            return tag

    return 'UNK'
def pos_tag_sentences(sentences):
    """Tag every whitespace-separated token of each sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of ``(sentence, tags)`` tuples, where ``tags`` is a single
        string holding one tag per token, joined by spaces.
    """

    def tag_sentence(sentence):
        words = sentence.split()
        last_index = len(words) - 1
        tags = [
            pos_tag(
                word,
                # Neighbours are None at the sentence edges.
                words[idx - 1] if idx > 0 else None,
                words[idx + 1] if idx < last_index else None,
            )
            for idx, word in enumerate(words)
        ]
        return sentence, ' '.join(tags)

    return [tag_sentence(sentence) for sentence in sentences]
def get_input_data():
    """Prompt the user for input and return the sentences to tag.

    The user chooses between typing a single sentence and giving the path of
    a CSV file that has a ``sentence`` column.

    Returns:
        A list of sentence strings. The list is empty when the choice is not
        recognised, the file cannot be read, or it has no ``sentence`` column.
        Missing values in the ``sentence`` column are dropped and the
        remaining values are converted to ``str`` so every item can be split
        into tokens.
    """
    choice = input("Do you want to enter a sentence or a CSV file? (Enter 'sentence' or 'csv'): ").strip().lower()

    if choice == 'sentence':
        return [input("Please enter the sentence: ")]

    if choice != 'csv':
        print("Invalid choice. Please enter 'sentence' or 'csv'.")
        return []

    file_path = input("Please enter the path to the CSV file: ").strip()
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # User-supplied path: report any read/parse failure instead of crashing.
        print(f"Error reading the CSV file: {e}")
        return []

    if 'sentence' not in df.columns:
        print("Error reading the CSV file: no 'sentence' column found")
        return []

    # Blank cells load as float NaN, which has no .split(); drop them and
    # coerce whatever is left (e.g. numeric cells) to text.
    return df['sentence'].dropna().astype(str).tolist()
# Script driver: collect input, tag it, save the result and print a preview.
sentences = get_input_data()
if sentences:
    output_file = 'tagged_sentences_output.csv'
    tagged_sentences = pos_tag_sentences(sentences)

    # One row per sentence: the original text and its space-joined tags.
    output_df = pd.DataFrame(tagged_sentences, columns=['Sentence', 'POS_Tags'])
    output_df.to_csv(output_file, index=False)
    print(f"Tagged sentences saved to {output_file}")

    # Echo at most the first ten (sentence, tags) pairs.
    preview = tagged_sentences[:10]
    for sent in preview:
        print(sent)