-
Notifications
You must be signed in to change notification settings - Fork 0
/
segment.rb
82 lines (67 loc) · 2.22 KB
/
segment.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
require 'rubygems'
require 'memoize'
require 'google_ngram'
# This code is based on Peter Norvig's chapter on "Natural Language Corpus Data" in
# Beautiful Data.
# Mix Memoize into the top-level object so the `memoize` calls near the end of
# this file resolve.
include Memoize
# Bigram model, constructed over the 2gms directory of the local n-gram data.
$bi_model = Google::Ngram.new(:path => '/local/data/web_5gram/data/2gms')
# Unigram model, constructed over the 1gms directory of the same data set.
$uni_model = Google::Ngram.new(:path => '/local/data/web_5gram/data/1gms')
# Log-probability cutoff: reasonable_splits keeps a split only when Pr(prefix)
# is at least this value.
$magic_pr = -16.142698764157398 # twice as uncommon as "kraig" last word in Bing 100k list
# Enumerate every [prefix, remainder] pair of +text+ whose prefix is between
# 1 and +max_len+ characters long (capped at the length of +text+).
# The final pair has an empty remainder when the prefix is the whole text.
def splits(text, max_len = text.size)
  longest_prefix = [text.size, max_len].min
  (1..longest_prefix).map do |cut|
    [text[0, cut], text[cut..-1]]
  end
end
# Filter the output of splits down to the pairs whose prefix has a unigram
# log probability of at least $magic_pr.
def reasonable_splits(text, max_len = text.size)
  candidates = splits(text, max_len)
  candidates.select { |head, _tail| Pr(head) >= $magic_pr }
end
# Natural log of the value the unigram model's +cp+ reports for +str+.
def Pr(str)
  unigram_value = $uni_model.cp(str)
  Math.log(unigram_value)
end
# Log of the conditional probability of +word+ following +prev+: the bigram
# model's value for "prev word" divided by the unigram model's value for +prev+.
# A NaN ratio is replaced by 0.0 before the log is taken.
def cPw(word, prev)
  bigram_key = [prev, word].join(' ')
  ratio = $bi_model.cp(bigram_key) / $uni_model.cp(prev)
  ratio = 0.0 if ratio.nan?
  Math.log(ratio)
end
# Prepend one word to a partial result.
# +pr+ is a [log_probability, words] pair; the result is a new pair whose
# log probability is +pfirst+ plus the pair's, and whose word list starts
# with +first+.
def combine(pfirst, first, pr)
  tail_logp, tail_words = pr
  total_logp = pfirst + tail_logp
  [total_logp, [first] + tail_words]
end
# Segment +text+, given that the previous token was +prev+ (the default
# "<s>" is used when no previous token is supplied).
#
# Returns a pair: [log probability, list of words] for the best-scoring
# segmentation. Nil or empty text yields [0.0, []].
#
# When reasonable_splits offers no candidate for +text+, the whole of +text+
# is kept as a single token scored with cPw(text, prev). Previously this case
# returned nil, which made the enclosing combine call raise on `pfirst + nil`.
def segment2(text, prev="<s>")
  # puts "segment2: #{text.inspect} prev: #{prev}"
  return [0.0, []] if !text || text.size == 0

  candidates = reasonable_splits(text).map do |first, rem|
    combine(cPw(first, prev), first, segment2(rem, first))
  end
  # No acceptable prefix: fall back to the unsplit text instead of nil.
  return [cPw(text, prev), [text]] if candidates.empty?

  # Array comparison: highest log probability wins, ties broken by word list.
  candidates.max
end
# Convenience wrapper around segment2: drop the score and hand back only the
# list of words.
def segment(text)
  best = segment2(text)
  best[1]
end
# Register each helper with Memoize, in the same order as before.
[:splits, :reasonable_splits, :Pr, :cPw, :segment2].each do |helper|
  memoize helper
end
# Demo: print the segmentation of a sample string.
p(segment("CardinalKeithOBrien"))
# These are some Twitter hash tags which I segmented.
# > segment("bpcares")
# => ["bp", "cares"]
# > segment("Twitter")
# => ["Twitter"]
# > segment("writers")
# => ["writers"]
# > segment("iamwriting")
# => ["i", "am", "writing"]
# > segment("backchannel")
# => ["back", "channel"]
# > segment("tcot")
# => ["tcot"]
# > segment("vacationfallout")
# => ["vacation", "fall", "out"]