diff --git a/requirements.txt b/requirements.txt
index 9cb570ec..e0ba7699 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,3 @@ numpy==1.10.4
 argparse==1.2.1
 h5py==2.5.0
 six==1.10.0
-wsgiref==0.1.2
diff --git a/scripts/novel_substrings.py b/scripts/novel_substrings.py
index 9333e339..695cfc3d 100644
--- a/scripts/novel_substrings.py
+++ b/scripts/novel_substrings.py
@@ -1,4 +1,7 @@
+from __future__ import print_function
+
 import argparse
+import six
 
 """
 Check how many substrings in sampled text are novel, not appearing in training
@@ -18,14 +21,14 @@
 with open(args.training_text, 'r') as f:
   s2 = f.read()
 
-for L in xrange(1, 50):
+for L in six.moves.range(1, 50):
   num_searched = 0
   num_found = 0
-  for i in xrange(len(s1) - L + 1):
+  for i in six.moves.range(len(s1) - L + 1):
     num_searched += 1
     sub = s1[i:(i+L)]
     assert len(sub) == L
     if sub in s2:
       num_found += 1
   novel_frac = (num_searched - num_found) / float(num_searched)
-  print L, novel_frac
+  print(L, novel_frac)
diff --git a/scripts/preprocess.py b/scripts/preprocess.py
index 90b834b6..fecd56e5 100644
--- a/scripts/preprocess.py
+++ b/scripts/preprocess.py
@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 
-import argparse, json, os
+import argparse
+import json
+import os
+import six
 import numpy as np
 import h5py
 import codecs
@@ -34,20 +38,20 @@
   val_size = int(args.val_frac * total_size)
   test_size = int(args.test_frac * total_size)
   train_size = total_size - val_size - test_size
-  
+
   if not args.quiet:
-    print 'Total vocabulary size: %d' % len(token_to_idx)
-    print 'Total tokens in file: %d' % total_size
-    print '  Training size: %d' % train_size
-    print '  Val size: %d' % val_size
-    print '  Test size: %d' % test_size
+    print('Total vocabulary size: %d' % len(token_to_idx))
+    print('Total tokens in file: %d' % total_size)
+    print('  Training size: %d' % train_size)
+    print('  Val size: %d' % val_size)
+    print('  Test size: %d' % test_size)
 
   # Choose the datatype based on the vocabulary size
   dtype = np.uint8
   if len(token_to_idx) > 255:
     dtype = np.uint32
   if not args.quiet:
-    print 'Using dtype ', dtype
+    print('Using dtype ', dtype)
 
   # Just load data into memory ... we'll have to do something more clever
   # for huge datasets but this should be fine for now
@@ -77,7 +81,7 @@
   # doesn't crash
   if args.encoding is None:
     new_token_to_idx = {}
-    for token, idx in token_to_idx.iteritems():
+    for token, idx in six.iteritems(token_to_idx):
       if ord(token) > 127:
         new_token_to_idx['[%d]' % ord(token)] = idx
       else:
@@ -87,7 +91,7 @@
   # Dump a JSON file for the vocab
   json_data = {
     'token_to_idx': token_to_idx,
-    'idx_to_token': {v: k for k, v in token_to_idx.iteritems()},
+    'idx_to_token': {v: k for k, v in six.iteritems(token_to_idx)},
   }
   with open(args.output_json, 'w') as f:
     json.dump(json_data, f)