Add python 3 compatibility
ChrisCummins authored Apr 26, 2017
Merge commit fff1c6b (parents 690036d + 37a31bb)
Showing 3 changed files with 20 additions and 14 deletions.
requirements.txt (1 change: 0 additions & 1 deletion)
@@ -3,4 +3,3 @@ numpy==1.10.4
 argparse==1.2.1
 h5py==2.5.0
 six==1.10.0
-wsgiref==0.1.2
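
Note on the dropped pin: the standalone wsgiref==0.1.2 distribution on PyPI supports only Python 2, while the wsgiref module itself has shipped in the standard library since Python 2.5, so removing the requirement loses nothing. A quick standalone check (not part of the diff):

  from __future__ import print_function

  # wsgiref ships with the interpreter; no PyPI package is required.
  import wsgiref
  print(wsgiref.__file__)  # resolves inside the standard library on Python 2 and 3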
scripts/novel_substrings.py (9 changes: 6 additions & 3 deletions)
@@ -1,4 +1,7 @@
+from __future__ import print_function
+
 import argparse
+import six
 
 """
 Check how many substrings in sampled text are novel, not appearing in training
@@ -18,14 +21,14 @@
 with open(args.training_text, 'r') as f:
   s2 = f.read()
 
-for L in xrange(1, 50):
+for L in six.moves.range(1, 50):
   num_searched = 0
   num_found = 0
-  for i in xrange(len(s1) - L + 1):
+  for i in six.moves.range(len(s1) - L + 1):
     num_searched += 1
     sub = s1[i:(i+L)]
     assert len(sub) == L
     if sub in s2:
       num_found += 1
   novel_frac = (num_searched - num_found) / float(num_searched)
-  print L, novel_frac
+  print(L, novel_frac)
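
Both changes in this file follow the two idioms used throughout the commit: six.moves.range resolves to the lazy xrange built-in on Python 2 and to range on Python 3, and the __future__ import makes print a function on Python 2 so the parenthesized call form works on both interpreters. A minimal standalone sketch of the pattern (the loop bound is made up):

  from __future__ import print_function

  import six

  # six.moves.range is xrange on Python 2 and range on Python 3, so a
  # large bound never materializes a full list on either interpreter.
  total = 0
  for i in six.moves.range(1000000):
    total += i

  # With the __future__ import, print is a function everywhere.
  print('sum =', total)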
scripts/preprocess.py (24 changes: 14 additions & 10 deletions)
@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 
-import argparse, json, os
+import argparse
+import json
+import os
+import six
 import numpy as np
 import h5py
 import codecs
@@ -34,20 +38,20 @@
   val_size = int(args.val_frac * total_size)
   test_size = int(args.test_frac * total_size)
   train_size = total_size - val_size - test_size
 
   if not args.quiet:
-    print 'Total vocabulary size: %d' % len(token_to_idx)
-    print 'Total tokens in file: %d' % total_size
-    print '  Training size: %d' % train_size
-    print '  Val size: %d' % val_size
-    print '  Test size: %d' % test_size
+    print('Total vocabulary size: %d' % len(token_to_idx))
+    print('Total tokens in file: %d' % total_size)
+    print('  Training size: %d' % train_size)
+    print('  Val size: %d' % val_size)
+    print('  Test size: %d' % test_size)
 
   # Choose the datatype based on the vocabulary size
   dtype = np.uint8
   if len(token_to_idx) > 255:
     dtype = np.uint32
   if not args.quiet:
-    print 'Using dtype ', dtype
+    print('Using dtype ', dtype)
 
   # Just load data into memory ... we'll have to do something more clever
   # for huge datasets but this should be fine for now
@@ -77,7 +81,7 @@
   # doesn't crash
   if args.encoding is None:
     new_token_to_idx = {}
-    for token, idx in token_to_idx.iteritems():
+    for token, idx in six.iteritems(token_to_idx):
       if ord(token) > 127:
         new_token_to_idx['[%d]' % ord(token)] = idx
       else:
@@ -87,7 +91,7 @@
   # Dump a JSON file for the vocab
   json_data = {
     'token_to_idx': token_to_idx,
-    'idx_to_token': {v: k for k, v in token_to_idx.iteritems()},
+    'idx_to_token': {v: k for k, v in six.iteritems(token_to_idx)},
   }
   with open(args.output_json, 'w') as f:
     json.dump(json_data, f)
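
The iteritems replacements work because six.iteritems(d) dispatches to d.iteritems() on Python 2 and to d.items() on Python 3, so the surrounding dict comprehension is untouched. A standalone illustration with a toy vocabulary (the tokens are made up):

  from __future__ import print_function

  import six

  # Toy stand-in for the real token_to_idx mapping built by preprocess.py.
  token_to_idx = {'a': 0, 'b': 1, 'c': 2}

  # six.iteritems avoids building an intermediate list on Python 2 and
  # simply uses dict.items() on Python 3.
  idx_to_token = {v: k for k, v in six.iteritems(token_to_idx)}

  print(idx_to_token)  # {0: 'a', 1: 'b', 2: 'c'}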
