-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram.py
28 lines (23 loc) · 882 Bytes
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/25 16:01
# @Author : Ting
def ngram(sentences, n):
    """Estimate conditional n-gram probabilities from segmented sentences.

    Each sentence is padded with ``n - 1`` ``'START'`` tokens on the left and
    ``n - 1`` ``'END'`` tokens on the right, then every window of ``n``
    consecutive tokens is counted as "context of ``n - 1`` tokens followed by
    one token". Counts are finally normalized per context so that the values
    for each context sum to 1.

    Args:
        sentences: Iterable of sentences, each an iterable of already
            segmented words, e.g. ``[['Hello', 'World'], [...]]``.
        n: Order of the model (1 = unigram, 2 = bigram, ...). Must be >= 1.

    Returns:
        A dict mapping each context tuple (length ``n - 1``; the empty tuple
        for unigrams) to a dict ``{next_word: probability}`` with float
        probabilities.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be >= 1, got %r" % (n,))

    context_len = n - 1
    counter = dict()
    for sent in sentences:
        # list(sent) lets callers pass tuples or other iterables of tokens.
        padded = ['START'] * context_len + list(sent) + ['END'] * context_len
        # A sequence of length L holds L - n + 1 windows of size n; the "+ 1"
        # keeps the final window, which would otherwise be dropped (losing the
        # last word for unigrams and the transition to 'END' for bigrams).
        for i in range(len(padded) - n + 1):
            condition = tuple(padded[i:i + context_len])
            word = padded[i + context_len]
            followers = counter.setdefault(condition, dict())
            # Float counts keep the normalization below a true division.
            followers[word] = followers.get(word, 0.0) + 1.0

    # Turn raw counts into relative frequencies within each context.
    for followers in counter.values():
        total = sum(followers.values())
        for word in followers:
            followers[word] = followers[word] / total
    return counter