forked from dfm/FakeArXiv
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram.py
101 lines (82 loc) · 2.62 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import (division, print_function, absolute_import,
unicode_literals)
import sys
import tweepy
import string
import numpy as np
import ConfigParser
import cPickle as pickle
from datetime import datetime
from collections import defaultdict
# Interpolation weight used when sampling the next word: the trigram
# estimate is weighted by `alpha`, the bigram estimate by `1 - alpha`.
alpha = 0.9
# Sentinel tokens: every title is padded with two START tokens in front and
# two STOP tokens behind before n-grams are counted.
START = "<S>"
STOP = "</S>"
# Print the current time each time the script runs.
print(datetime.now())
def build_sentence(words):
    """Join tokenized title words into one capitalized sentence.

    Tokens are processed in order:

    * ``"$"`` tokens are dropped.
    * Opening punctuation (``(``, ``{``, ``\\{``, ``[``, `````, ``````) is
      glued to the token that follows it.
    * Any other token made up only of punctuation is glued to the token
      that precedes it and followed by a space.
    * Every other token is appended followed by a space.

    :param words: iterable of string tokens.
    :returns: the assembled sentence with its first character upper-cased,
        or an empty string when no token contributes any text.
    """
    openers = ("(", "{", "\\{", "[", "`", "``")
    s = " "
    for w in words:
        if w == "$":
            continue
        if w.strip(string.punctuation):
            # Ordinary word: append it with a separating space.
            s += w + " "
        elif w in openers:
            # Opening punctuation: no space, so the next token attaches.
            s += w
        else:
            # Closing/inline punctuation attaches to the previous token.
            # Only drop a trailing *space*; right after an opener there is
            # none, and blindly cutting one character would eat the opener.
            if s.endswith(" "):
                s = s[:-1]
            s += w + " "
    s = s.strip()
    if not s:
        # Nothing to capitalize (empty input or only "$" tokens).
        return s
    return s[0].upper() + s[1:]
# Either rebuild the n-gram count tables from the raw titles (--build) or
# load the tables cached by a previous build.
if "--build" in sys.argv:
    # One title per line; pad with two START and two STOP tokens so that the
    # first real word has a full trigram context and the end is modelled.
    with open("titles.txt") as f:
        titles = [[START, START] + t.strip().split() + [STOP, STOP]
                  for t in f]
    # bigrams[prev][word] and trigrams["prev2 prev1"][word] are occurrence
    # counts of `word` following the given context.
    bigrams = defaultdict(lambda: defaultdict(int))
    trigrams = defaultdict(lambda: defaultdict(int))

    print("Building ngrams...")
    for title in titles:
        for i in range(2, len(title)):
            bigrams[title[i-1]][title[i]] += 1
            trigrams[title[i-2]+" "+title[i-1]][title[i]] += 1

    # Store plain outer dicts (the lambda default factory of the outer
    # defaultdicts is not picklable) using the highest pickle protocol.
    with open("ngrams.pkl", "wb") as f:
        pickle.dump((dict(bigrams), dict(trigrams)), f, -1)

else:
    print("Loading ngrams...")
    # The cache is a binary pickle, so it must be opened in binary mode.
    # NOTE: only load an ngrams.pkl produced by --build; unpickling data
    # from an untrusted source can execute arbitrary code.
    with open("ngrams.pkl", "rb") as f:
        bigrams, trigrams = pickle.load(f)
# Sample a title word by word from the interpolated bigram/trigram model,
# restarting whenever the result is too short or too long.
print("Generating title...")
title = [START, START]
while True:
    # Successor counts given the previous word (bigram) and the previous
    # two words (trigram), plus their totals for normalization.
    b_counts = bigrams[title[-1]]
    t_counts = trigrams[title[-2]+" "+title[-1]]
    b_total = sum(b_counts.values())
    t_total = sum(t_counts.values())

    # Candidate next words and their interpolated probabilities.
    words = list(set(b_counts.keys()) | set(t_counts.keys()))
    probs = [alpha * t_counts.get(w, 0.0)/t_total
             + (1-alpha) * b_counts.get(w, 0.0)/b_total
             for w in words]
    word = np.random.choice(words, p=probs)

    if word == STOP:
        # Accept the title only if it has at least three real words
        # (the list also holds the two START tokens).
        if len(title) >= 5:
            break
        print("Too short")
        title = [START, START]
        continue

    title.append(word)
    sent = build_sentence(title[2:])
    # Start over as soon as the rendered sentence exceeds 140 characters.
    if len(sent) > 140:
        print("Too long")
        title = [START, START]

print("Title: \"{0}\"".format(sent))
# Optionally post the generated title to Twitter (--tweet).
if "--tweet" in sys.argv:
    # Credentials are read from the [twitter] section of local.cfg.
    config = ConfigParser.ConfigParser()
    config.read("local.cfg")
    sect = "twitter"

    print("Posting to twitter...")
    consumer = (config.get(sect, "consumer_key"),
                config.get(sect, "consumer_secret"))
    auth = tweepy.OAuthHandler(*consumer)
    access = (config.get(sect, "user_key"),
              config.get(sect, "user_secret"))
    auth.set_access_token(*access)

    api = tweepy.API(auth)
    api.update_status(sent)