-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
569 lines (513 loc) · 23.4 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
from collections import defaultdict, OrderedDict
import json
from Levenshtein import distance as levenshtein
import numpy as np
import os
import random
import signal
import sys
from typing import Any, Callable, Dict, List, Tuple, Optional
from termcolor import colored
import time
from tqdm import tqdm
import util
# from naclo_problems import run_naclo_test_suite
# from tasks import (
# # gen_borple_1, gen_borple_2, gen_borple_3,
# # test_copycat_remove,
# # gen_substitute_1, gen_substitute_2,
# run_task_suite,
# run_synthetic_data,
# # run_phone_numbers,
# # run_novel_instructions,
# )
# from synthetic_data import (
# get_vocab,
# sample_multilevel_markov_chain,
# sample_from_multilevel_markov_chain,
# multilevel_markov_chain_sequence_to_str,
# sample_hmm,
# sample_from_hmm,
# hmm_sequence_to_str,
# )
# Import the OpenAI client and load the API key from a local `api-key` file.
# If either step fails (package not installed, key file missing), print the
# error and keep going; main() can still construct a MockGPT3.
# NOTE(review): open('api-key') leaves the file handle unclosed, and the
# fallback message is advisory only — main() picks GPT3 vs MockGPT3 from
# argv, not from whether this import succeeded.
try:
    import openai  # type: ignore
    # https://beta.openai.com/api-ref
    openai.api_key = open('api-key').read().strip()
except Exception as e:
    print(e)
    print('Falling back to MockGPT3')
# Path of the JSONL request/response cache used by read_cache/write_cache.
DEFAULT_CACHE_PATH = 'cache.jsonl'
# termcolor color names for printed headers and model responses.
HEADER_COLOR = 'magenta'
RESPONSE_COLOR = 'red'
# Defaults merged into every completion request; 'staged': True queues
# requests locally instead of sending them (see GPT3.make_query /
# GPT3.run_staged_queries).
DEFAULT_GENERATION_KWARGS = {
    'engine': 'davinci',
    'staged': True,
}
def make_header(s: Any):
    """Render *s* as a colored '===== ...' section header line."""
    header_text = '===== {}'.format(s)
    return colored(header_text, HEADER_COLOR)
def get_key(request):
    """Normalize *request* and return a hashable cache key for it.

    Note: mutates *request* in place when 'logit_bias' is a list of
    (token, bias) pairs, converting it to a dict before freezing.
    """
    logit_bias = request.get('logit_bias')
    if isinstance(logit_bias, list):
        request['logit_bias'] = dict(logit_bias)
    return util.make_immutable(request)
def read_cache(filename: str = DEFAULT_CACHE_PATH):
    """Load the JSONL request/response cache from *filename*.

    Each line is a JSON object {'request': ..., 'response': ...}. Malformed
    or incomplete lines are skipped (best-effort load). The filename is
    stored in the result under '__filename__' so write_cache() knows where
    to persist it later.

    Returns an OrderedDict mapping immutable request keys (see get_key) to
    their cached responses.
    """
    cache = OrderedDict()
    if os.path.exists(filename):
        # Use a context manager so the handle is closed deterministically
        # (the original relied on GC to close the file).
        with open(filename) as f:
            for line in f:
                try:
                    item = json.loads(line)
                    cache[get_key(item['request'])] = item['response']
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best-effort: skip corrupt lines instead of failing the
                    # whole load, but no longer swallow unrelated errors.
                    pass
    #print(f"Read {len(cache)} cache entries")
    cache['__filename__'] = filename
    return cache
def write_cache(cache: Dict, filename: Optional[str] = None):
    """Persist *cache* as JSONL, one {'request', 'response'} object per line.

    Target resolution order: cache['__filename__'], then the *filename*
    argument, then DEFAULT_CACHE_PATH. SIGINT is ignored while writing so a
    Ctrl-C cannot truncate the file mid-write.

    Fix: the previous SIGINT handler is now restored in a ``finally`` block;
    the original skipped restoration if an exception occurred during the
    write, leaving the process permanently ignoring Ctrl-C.
    """
    filename = cache.get('__filename__') or filename or DEFAULT_CACHE_PATH
    previous_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        with open(filename, 'w') as f:
            for key, value in cache.items():
                # Immutable keys (tuples of pairs) round-trip through dict();
                # plain string keys (e.g. '__filename__') are kept as-is.
                _key = key if isinstance(key, str) else dict(key)
                item = {
                    'request': _key,
                    'response': value,
                }
                print(json.dumps(item), file=f)
    finally:
        # Restore the original SIGINT handler even if writing failed.
        signal.signal(signal.SIGINT, previous_handler)
    #print(f"Wrote {len(cache)} cache entries")
def set_seed(seed: int = 0):
    """Seed both Python's and NumPy's global RNGs for reproducible runs."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # tf.random.set_seed(seed)
    # torch.manual_seed(seed)
class GPT3:
    """Thin wrapper around the OpenAI Completion API with a disk-backed
    request cache and support for 'staged' queries (recorded locally, sent
    later in a batch via run_staged_queries)."""

    def __init__(self, cache: Dict, default_generation_kwargs: Dict = DEFAULT_GENERATION_KWARGS):
        # cache: OrderedDict from read_cache(); keys are immutable request
        # structures, plus the special '__filename__' entry.
        self.cache = cache
        self.default_generation_kwargs = default_generation_kwargs
        # Drop staged-but-unsent requests left over from a previous run.
        self.clear_staged_queries()

    def make_query(self, **kwargs) -> Dict:
        """Return the (possibly cached) API response for *kwargs*.

        With staged=True the request is only recorded in the cache (value
        None) for later submission; otherwise the API is called and the
        response is written through to disk.
        """
        if 'logit_bias' in kwargs and kwargs['logit_bias'] is None:
            del kwargs['logit_bias']
        key = get_key(kwargs)
        # Lookups ignore the 'staged' flag so a staged request can be
        # satisfied by an already-completed identical request.
        _key = get_key({k: v for k, v in kwargs.items() if k != 'staged'})
        if _key in self.cache:
            response = self.cache[_key]
        elif 'staged' in kwargs and kwargs['staged']:
            # Record the request; the value stays None until actually run.
            self.cache[key] = response = None
        else:
            kwargs = dict(kwargs)
            if 'random' in kwargs:
                # 'random' exists only to differentiate cache keys; the API
                # would reject it as an unknown parameter.
                del kwargs['random']
            if 'staged' in kwargs:
                del kwargs['staged']
            try:
                response = openai.Completion.create(**kwargs)
                self.cache[_key] = response
                write_cache(self.cache)
            except openai.error.InvalidRequestError as e:
                print(e)
                # NOTE(review): this fallback response is dead code — the
                # raise below propagates before it can be returned. Decide
                # whether to return the placeholder or to re-raise.
                response = {
                    'choices': [{
                        'text': None
                    }]
                }
                raise Exception(e)
        return response

    def clear_staged_queries(self):
        """Delete all staged (unsent) requests from the cache and persist."""
        # Keys are immutable pair-structures, so membership of the
        # ('staged', True) pair identifies staged requests.
        staged = {key: value for key, value in self.cache.items() if key != '__filename__' and ('staged', True) in key}
        for key in staged.keys():
            del self.cache[key]
        write_cache(self.cache)

    def calculate_cost(self):
        """Estimate the total token count (prompt tokens + max_tokens) of
        all currently staged requests."""
        staged = {key: value for key, value in self.cache.items() if key != '__filename__' and ('staged', True) in key}
        total = 0
        # zip with tqdm(staged) only drives a progress bar over the items.
        for _, (key, value) in zip(tqdm(staged), staged.items()):
            # defaultdict(int) makes missing 'prompt'/'max_tokens' count as 0.
            kwargs = defaultdict(int, key)
            total += util.count_tokens(kwargs['prompt']) + kwargs['max_tokens']
        return total

    def _run_staged_query(self, item):
        # Worker for run_staged_queries_parallel: re-issue one staged
        # (key, value) cache entry as a real (non-staged) query.
        key, value = item
        kwargs = dict(key)
        del kwargs['staged']
        _ = self.make_query(**kwargs)

    def run_staged_queries_parallel(self):  # TODO test
        """Submit all staged requests in parallel, then remove them."""
        staged = {key: value for key, value in self.cache.items() if key != '__filename__' and ('staged', True) in key}
        util.run_parallel(self._run_staged_query, staged.items())
        for key in staged.keys():
            del self.cache[key]
        write_cache(self.cache)

    def run_staged_queries(self):
        """Interactively submit staged requests.

        Prompt options: y = submit all, n/q = abort, c = confirm each
        request individually (y/n/q, or 's <num>' to skip the next <num>).
        """
        staged = {key: value for key, value in self.cache.items() if key != '__filename__' and ('staged', True) in key}
        if not staged:
            return
        k = None
        while k not in list('ynqc'):
            k = input(f"Submit {len(staged)} staged request(s) to the server? [y/n/q/c] ")
        if k not in list('yc'):
            return
        # cntr > 0 means "skip this many upcoming requests" (set via 's N').
        cntr = 0
        for _, (key, value) in zip(tqdm(staged), staged.items()):
            if cntr > 0:
                cntr -= 1
                # if cntr > 0 and cntr % 5 == 0:
                #     print('%d staged requests left to skip' % cntr)
                continue
            # Show the request minus its (long) prompt, then the prompt tail.
            _key = [el for el in key if el[0] != 'prompt']
            print(str(_key))
            kwargs = dict(key)
            del kwargs['staged']
            print(kwargs['prompt'][-200:])
            if k == 'c':
                k2 = 'x'
                while k2[0] not in list('ynqs'):
                    k2 = input(f"Submit this staged request to the server? [y/n/q/s <num>] ")
                if k2 == 'q':
                    return
                if k2 == 'n':
                    continue
                if k2[0] == 's':
                    # 's <num>': skip the next <num> staged requests.
                    cntr = int(k2[2:])
                    print('Skipping %d staged requests' % cntr)
            response = self.make_query(**kwargs)
            if response is not None and response['choices']:
                for choice in response['choices']:
                    print(colored(choice['text'], 'yellow'))
                    # self.print_logprobs(choice)
        # Remove the staged entries; completed ones were re-cached without
        # the 'staged' flag by make_query.
        for key in staged.keys():
            del self.cache[key]
        write_cache(self.cache)

    def complete(self, verbose=True, **kwargs):
        """Run a completion with the default kwargs merged in; print the
        choices when *verbose*. Returns the raw response (None if staged)."""
        kwargs = {**self.default_generation_kwargs, **kwargs}
        response = self.make_query(**kwargs)
        prompt = kwargs['prompt']
        del kwargs['prompt']
        # print(make_header(kwargs))
        # print(prompt, end='')
        if verbose:
            if response is not None:
                for choice in response['choices']:
                    print(colored(choice['text'], RESPONSE_COLOR))
                    # self.print_logprobs(choice)
                print('')
        return response

    def few_shot(self, examples: List[Tuple[str, str]], x: str, y: Optional[str] = None, prefix: Optional[str] = None, x_label: str = 'Input', y_label: str = 'Output', return_kwargs: bool = False, formatter = None, verbose=True, **kwargs):
        """Few-shot query: build a prompt from (x, y) *examples* plus the
        query *x*, run it, and compare the prediction against *y* if given.

        Returns [response, rel] where rel classifies the match (EQUALS /
        CONTAINS / CLOSE / NOT EQUALS, None if y is None); kwargs appended
        when return_kwargs.
        """
        kwargs = {**self.default_generation_kwargs, **kwargs}
        if formatter is not None:
            # Custom formatter renders each (x, y) pair, including the query
            # with an empty y.
            prompt = '\n'.join(map(formatter, examples + [(x, '')])).rstrip()  # [:-1]
        else:
            prompt = f'{x_label}: {x}\n{y_label}:'
            if len(examples) > 0:
                prompt = '\n'.join([f'{x_label}: {x}\n{y_label}: {y}' for x, y in examples]) + '\n' + prompt
        if prefix is not None:
            prompt = prefix + '\n' + prompt
        kwargs['prompt'] = prompt
        # Default stop sequence is a newline; pass stop=None to disable.
        if 'stop' not in kwargs:
            kwargs['stop'] = '\n'
        if kwargs['stop'] is None:
            del kwargs['stop']
        response = self.make_query(**kwargs)
        #prompt = kwargs['prompt']
        #del kwargs['prompt']
        # print(make_header(kwargs))
        # print(prompt, end='')
        rel = None
        if y is not None:
            y = y.lstrip().rstrip()
        if response is not None:
            for choice in response['choices']:
                predicted_y = choice['text'].lstrip().rstrip()
                if y is not None:  # Correct answer given
                    if y == predicted_y:
                        rel = colored('EQUALS', 'green')
                    elif y in predicted_y:
                        rel = colored('CONTAINS', 'yellow')
                    elif 1. * levenshtein(y, predicted_y) / max(len(y), len(predicted_y)) <= .2:
                        # Within 20% normalized edit distance counts as close.
                        rel = colored('CLOSE', 'magenta')
                    else:
                        rel = 'NOT EQUALS'
                    extra = f' {rel} {y}'
                else:
                    extra = ''
                if verbose:
                    print(f'[{len(examples)} examples] {x} -> {colored(predicted_y, RESPONSE_COLOR)}{extra}')
                    # self.print_logprobs(choice)
        retval = [response, rel]
        if return_kwargs:
            retval.append(kwargs)
        return retval

    def print_logprobs(self, response_choice):
        """Pretty-print the top logprobs of the first token of a choice."""
        if 'logprobs' in response_choice and response_choice['logprobs'] is not None:
            # print(colored(' | ', 'yellow').join(response_choice['logprobs']['tokens']))
            arr = response_choice['logprobs']['top_logprobs']
            cur_data = []  # NOTE(review): unused; left over from older code.
            for obj in arr:
                # Sort candidate tokens by descending logprob.
                obj = OrderedDict(sorted(obj.items(), key=lambda x: -x[1]))
                print(json.dumps(obj, indent=4))  # , sort_keys=True))
                # for k, v in list(obj.items())[:2]:
                #     print(f"\"{k}\": " + "%.2f" % np.exp(v))
                # val = np.exp(obj[' True'])
                # ch = ''
                # if hasattr(self, 'prev'):
                #     ch = '↓' if val < self.prev else '↑'
                # self.prev = val
                # print("%.2f %s" % (val, ch))
                # Only the first position is printed.
                break
class MockGPT3:
    """Offline stand-in for GPT3 with the same interface: queries are only
    recorded in the cache (with empty 'choices') instead of being sent to
    the API. Used when 'submit' is not passed on the command line."""

    def __init__(self, cache: Dict, default_generation_kwargs: Dict = DEFAULT_GENERATION_KWARGS):
        self.cache = cache
        self.default_generation_kwargs = default_generation_kwargs
        # Drop staged-but-unsent requests left over from a previous run.
        self.clear_staged_queries()

    def make_query(self, **kwargs) -> Dict:
        """Return the cached response for *kwargs*; on first sight, cache
        and persist an empty mock response (no choices)."""
        # Unlike GPT3.make_query, the 'staged' flag is kept in the key here.
        key = get_key(kwargs)
        if key in self.cache:
            response = self.cache[key]
        else:
            kwargs = dict(kwargs)
            if 'random' in kwargs:
                del kwargs['random']
            response = {
                'choices': []  # mock: no generated text
            }
            self.cache[key] = response
            write_cache(self.cache)
        return response

    def complete(self, **kwargs):
        """Mirror GPT3.complete: merge defaults, record the query, and print
        any (mock) choices."""
        kwargs = {**self.default_generation_kwargs, **kwargs}
        response = self.make_query(**kwargs)
        prompt = kwargs['prompt']
        del kwargs['prompt']
        # print(make_header(kwargs))
        # print(prompt, end='')
        for choice in response['choices']:
            print(colored(choice['text'], RESPONSE_COLOR))
        print('')

    def few_shot(self, examples: List[Tuple[str, str]], x: str, y: Optional[str] = None, prefix: Optional[str] = None, x_label: str = 'Input', y_label: str = 'Output', return_kwargs: bool = False, formatter = None, stop: List[str] = ['\n'], verbose=True, **kwargs):
        # NOTE(review): *stop* is a mutable default argument and is never
        # read in this body ('stop' is set via kwargs below) — likely vestigial.
        """Mirror GPT3.few_shot, but only record the prompt; returns
        [response, None] (plus kwargs when return_kwargs)."""
        kwargs = {**self.default_generation_kwargs, **kwargs}
        if formatter is not None:
            prompt = '\n'.join(map(formatter, examples + [(x, '')])).rstrip()  # [:-1]
        else:
            prompt = f'{x_label}: {x}\n{y_label}:'
            if len(examples) > 0:
                prompt = '\n'.join([f'{x_label}: {x}\n{y_label}: {y}' for x, y in examples]) + '\n' + prompt
        if prefix is not None:
            prompt = prefix + '\n' + prompt
        if y is not None:
            # NOTE(review): the expected answer is appended to the prompt
            # with ANSI color codes embedded — display-oriented; confirm
            # this is intended to go into the cached request.
            prompt += ' ' + colored(y, 'yellow')
        kwargs['prompt'] = prompt
        if 'stop' not in kwargs:
            kwargs['stop'] = '\n'
        if kwargs['stop'] is None:
            del kwargs['stop']
        response = self.make_query(**kwargs)
        # print(prompt)
        retval = [response, None]
        if return_kwargs:
            retval.append(kwargs)
        return retval

    def clear_staged_queries(self):
        """Delete all staged (unsent) requests from the cache and persist."""
        staged = {key: value for key, value in self.cache.items() if key != '__filename__' and ('staged', True) in key}
        for key in staged.keys():
            del self.cache[key]
        write_cache(self.cache)

    def run_staged_queries(self):
        """Interactive dry-run of GPT3.run_staged_queries: same prompts and
        skip logic, but requests are only 'pretend'-submitted (re-cached)."""
        staged = {key: value for key, value in self.cache.items() if key != '__filename__' and ('staged', True) in key}
        if not staged:
            return
        k = None
        while k not in list('ynqc'):
            k = input(f"Pretend to submit {len(staged)} staged request(s) to the server? [y/n/q/c] ")
        if k not in list('yc'):
            return
        # cntr > 0 means "skip this many upcoming requests" (set via 's N').
        cntr = 0
        for _, (key, value) in zip(tqdm(staged), staged.items()):
            if cntr > 0:
                cntr -= 1
                # if cntr > 0 and cntr % 5 == 0:
                #     print('%d staged requests left to skip' % cntr)
                continue
            _key = [el for el in key if el[0] != 'prompt']
            print(str(_key))
            kwargs = dict(key)
            del kwargs['staged']
            print(kwargs['prompt'][-200:])
            if k == 'c':
                k2 = 'x'
                while k2[0] not in list('ynqs'):
                    k2 = input(f"Pretend to submit this staged request to the server? [y/n/q/s <num>] ")
                if k2 == 'q':
                    return
                if k2 == 'n':
                    continue
                if k2[0] == 's':
                    cntr = int(k2[2:])
                    print('Skipping %d staged requests' % cntr)
            response = self.make_query(**kwargs)
            if response is not None and response['choices']:
                for choice in response['choices']:
                    print(colored(choice['text'], 'yellow'))
                    # self.print_logprobs(choice)
        for key in staged.keys():
            del self.cache[key]
        write_cache(self.cache)
def run_percy_tasks(gpt3):
    """Exploratory prompt suite run against a GPT3/MockGPT3 client:
    free-form generation, made-up biographies, QA, arithmetic, NL-to-bash,
    string reversal, translation, logical forms, and robot task plans.
    Inline '# Wrong'/'# fail' comments record observed model behavior."""
    # Generate free-form stuff
    for i in range(10):
        gpt3.complete(prompt='Hello!', max_tokens=100, random=i)
    # Biographies just made up
    for i in range(10):
        gpt3.complete(prompt='John Duchi is an assistant professor in statistics and electrical engineering at Stanford University. He', temperature=0.5, random=i, max_tokens=100)
    for i in range(5):
        gpt3.complete(prompt='[PERSON] John Duchi is an assistant professor in statistics and electrical engineering at Stanford University.\n[PERSON] Tommi Jaakkola is a professor in computer science at MIT.\n[PERSON] Clyde Drexler is', temperature=1, random=i, max_tokens=100, stop='\n')
    # Story involving two novel characters
    for i in range(5):
        gpt3.complete(prompt='One day, Elon Musk invited George Washington over for dinner. Elon showed', temperature=1, random=i, max_tokens=100)
    for i in range(5):
        gpt3.complete(prompt='One day, Ghandi and Richard Stallman were sitting next to each other on the bus. Ghandi looked over to see what Richard was working on and was', temperature=1, random=i, max_tokens=100)
    # Question answering
    gpt3.few_shot([], 'What is the tallest mountain?')
    for i in range(5):
        gpt3.few_shot([], 'Who was the first president of the United States?', random=i)
    for i in range(5):
        gpt3.few_shot([('What is the tallest mountain?', 'Mount Everest')], 'Who was the first president of the United States?', random=i)
    gpt3.few_shot([('What is the tallest mountain?', 'Mount Everest')], 'Who was the first president of the United States?', x_label='Q', y_label='A')
    # Full-sentence QA examples.
    examples = [
        ('What is human life expectancy in the United States?', 'Human life expectancy in the United States is 78 years.'),
        ('Who was president of the United States in 1955?', 'Dwight D. Eisenhower was president of the United States in 1955.'),
        #('What party did he belong to?', 'He belonged to the Republican Party.'),
        #('Who was president of the United States before George W. Bush?', 'Bill Clinton was president of the United States before George W. Bush.'),
        #('Who won the World Series in 1995?', 'The Atlanta Braves won the World Series in 1995.')
    ]
    gpt3.few_shot(examples, 'How many children does Barack Obama have?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'Who was president before Bill Clinton?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'Who was the first person to climb Mt. Everest?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'What is the capital of Turkey?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'How many states border Texas?', x_label='Q', y_label='A')  # Wrong
    gpt3.few_shot(examples, 'How long did Mozart live?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'How old is Mozart?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'What languages did Mozart speak?', x_label='Q', y_label='A')
    gpt3.few_shot(examples, 'What are good restaurants in Palo Alto?', x_label='Q', y_label='A')  # Wrong
    people = [
        'George Washington',
        'Albert Einstein',
        'Geoff Hinton',
        'Andrej Karpathy',  # Wrong
    ]
    for person in people:
        gpt3.few_shot(examples, f'When was {person} born?', x_label='Input', y_label='Output')
    # Short-answer QA examples.
    examples = [
        ('Who was president of the United States in 1955?', 'Dwight D. Eisenhower'),
        ('What year was Microsoft founded?', '1975'),
        #('What is human life expectancy in the United States?', '78 years'),
        #('What party did he belong to?', 'He belonged to the Republican Party.'),
        #('Who was president of the United States before George W. Bush?', 'Bill Clinton was president of the United States before George W. Bush.'),
        #('Who won the World Series in 1995?', 'The Atlanta Braves won the World Series in 1995.')
    ]
    countries = ['United States', 'Canada', 'Mexico', 'Russia', 'China', 'Spain', 'Greece', 'Belgium', 'Japan', 'North Korea', 'Mongolia', 'Kenya', 'Ghana']
    for country in countries:
        gpt3.few_shot(examples, f'What is the capital of {country}?', x_label='Input', y_label='Output')
    gpt3.few_shot(examples, f'When was OpenAI founded?', x_label='Input', y_label='Output')
    gpt3.few_shot(examples, f'What does Microsoft do?', x_label='Input', y_label='Output')
    gpt3.few_shot(examples, f'What is Picasso known for?', x_label='Input', y_label='Output')
    gpt3.few_shot(examples, f'What did Stravinsky compose?', x_label='Input', y_label='Output')
    gpt3.few_shot(examples, f'Where is the tallest building in the world?', x_label='Input', y_label='Output')
    gpt3.few_shot(examples, f'Where is the tallest building in the world how how high is it?', x_label='Input', y_label='Output')  # Makes some stuff up
    gpt3.few_shot([], f'Where is the tallest building in the world?', x_label='Input', y_label='Output')  # Doesn't know how to continue
    # Math: messes up a bit
    examples = []
    for a in range(4):
        for b in range(4):
            examples.append((f'What is {a} + {b}?', f'{a + b}'))
    # Train on the first 5 sums, test on the rest.
    num_train = 5
    for x, y in examples[num_train:]:
        gpt3.few_shot(examples[:num_train], x=x, y=y)
    # Summarize
    # Natural language to bash
    examples = [
        ('list all files', 'ls'),
        ('make a directory called foo', 'mkdir foo'),
        ('print contents of report.txt', 'cat report.txt'),
    ]
    gpt3.few_shot(examples, 'delete file called yummy.pdf', temperature=1)
    gpt3.few_shot(examples, 'print the first 15 lines of a.txt', temperature=1)
    gpt3.few_shot(examples, 'print the last 15 lines of a.txt', temperature=1)
    gpt3.few_shot(examples, 'check if a.txt exists', temperature=1)  # fail
    gpt3.few_shot(examples, 'get the number of lines in a.txt', temperature=1)  # not perfect
    gpt3.few_shot(examples, 'print the lines in a.txt but in reverse', temperature=1)  # not quite what I had intended
    gpt3.few_shot(examples, 'search for "foo" in a.txt', temperature=1)
    gpt3.few_shot(examples, 'get the current directory', temperature=1)
    gpt3.few_shot(examples, 'print the largest file in the current directory', temperature=1)
    gpt3.few_shot(examples, 'print the size of file "massive.txt"', temperature=1)
    gpt3.few_shot(examples, 'remove the first 3 lines of a.txt', temperature=1, random=1)  # Wrong
    gpt3.few_shot(examples, 'print the 3rd line of a.txt', temperature=1, random=1)  # Wrong
    # Reverse
    examples = [
        ('a b c', 'c b a'),
        ('t h e', 'e h t'),
        ('h e l l o', 'o l l e h'),
        ('c a p i t a l', 'l a t i p a c'),
    ]
    gpt3.few_shot(examples, x='h o r s e', y='e s r o h', prefix='Reverse the input.', temperature=1, random=1)  # fail
    # Translation
    examples = [
        ('the house', 'la maison'),
        ('I am a cat.', 'je suis un chat.'),
    ]
    gpt3.few_shot(examples, 'I like to drink water.')
    # Systematicity (capitals)
    # Sensitivity to prompt?
    # Logical forms
    train_examples = [
        ('what\'s the capital of Maine?', 'capitalOf(Maine)'),
        ('how many states border Texas?', 'count(and(states, border(Texas)))'),
        ('what is the largest state?', 'argmax(states, population)'),
    ]
    test_examples = [
        ('how many states border Illinois?', 'count(and(states, border(Illinois)))'),
        ('how many states border the largest state?', 'count(and(states, border(argmax(states, population))))'),
        ('how many states are adjacent to Illinois?', 'count(and(states, border(Illinois)))'),
    ]
    for x, y in test_examples:
        gpt3.few_shot(train_examples, x=x, y=y, temperature=0)
    # Break robot actions into steps
    train_examples = [
        ('get the apples', 'go to apples; pick up apples'),
        ('wash the apples', 'go to apples; pick up apples; go to sink; turn on faucet; turn off faucet'),
        ('put the cereal on the shelf', 'go to cereal; pick up cereal; go to shelf; drop cereal'),
    ]
    test_examples = [
        ('wash the oranges', None),
        ('wash the bowl', None),
        ('wash the windows', None),
        ('put the milk in the fridge', None),
        ('put the watermelon on the counter', None),
        ('cut the apples', None),
        ('cut the oranges', None),
        ('peel the apple', None),
        ('boil an egg', None),
        ('put away the groceries', None),
        ('clean the living room', None),
        ('set the table', None),
    ]
    for x, y in train_examples + test_examples:
        gpt3.few_shot(train_examples, x=x, y=y, temperature=0)
    # Thesaurus
    # Things beyond few-shot learning (soft influence)
def run_simple_test(gpt3):
    """Smoke test: one free-form completion and one zero-shot question."""
    shared_kwargs = {'max_tokens': 100, 'random': 0}
    gpt3.complete(prompt='Hello!', **shared_kwargs)
    gpt3.few_shot([], 'What is the tallest mountain?', **shared_kwargs)
def main(argv):
    """Entry point: build a real GPT3 client when 'submit' appears in argv,
    otherwise a MockGPT3 backed by its own separate cache file."""
    use_real_api = 'submit' in argv
    if use_real_api:
        gpt3 = GPT3(read_cache())
    else:
        gpt3 = MockGPT3(read_cache('cache_mockgpt3.jsonl'))
    # run_percy_tasks(gpt3)
    # Begin section (frieda) =============================================================================
    # run_task_suite(gpt3)
    # run_synthetic_data(gpt3)
    # run_novel_instructions(gpt3)


if __name__ == '__main__':
    main(sys.argv)