-
Notifications
You must be signed in to change notification settings - Fork 25
/
lecture_13.py
461 lines (329 loc) · 18.8 KB
/
lecture_13.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
import math
from util import *
from typing import Set
import itertools
import mmh3
from bitarray import bitarray
def lecture_13():
    """Top-level driver for this lecture.

    Emits a short recap of the two previous lectures, runs the two main
    sections (`deduplication()` then `copyright()`), and finishes with the
    summary bullets. Nothing is returned; all output goes through `note`.
    """
    # Recap of what the preceding lectures covered.
    note("Lecture 11: overview of different services (e.g., GitHub), datasets (C4), processing methods (CCNet)")
    note("Lecture 12: mechanics of learned data filtering (KenLM, fastText, DSIR)")

    # The two main sections of this lecture, in presentation order.
    note("This lecture:")
    deduplication()  # 1. Mechanics of deduplication
    copyright()      # 2. Can you train on copyrighted data?

    # Summary: the first three bullets recap deduplication, the last three copyright.
    note("## Summary")
    note("- Hashing scales to large datasets")
    note("- Support fuzzy match (suffix arrays, MinHash)")
    note("- Use multiple hash functions to amplify probabilities (Bloom filter, LSH)")
    note("- Public domain or Creative Commons licenses")
    note("- License data (if you have the money)")
    note("- Fair use: nuanced (transformative, affect the market)")
def deduplication():
    """Motivate deduplication, lay out its design space, then run each technique.

    Emits the introductory notes (kinds of duplicates, examples, why it helps,
    design questions, the scaling challenge) and then calls the technique
    sections in order: `hash_functions`, `exact_deduplication`, `bloom_filter`,
    `suffix_arrays`, `jaccard_minhash`, `locality_sensitive_hashing`.
    """
    note("Deduplication: given a training corpus")

    note("Two types of duplicates")
    note("- Exact duplicates (mirror sites, GitHub forks)"), see("https://www.gutenberg.org/MIRRORS.ALL")
    note("- Near duplicates: same text differing by a few tokens")

    note("Examples of near duplicates")
    note("- Terms of service and licenses"), see("https://opensource.org/license/mit")
    note("- Formulaic writing (copy/paste or generated from template)"), image("https://d3i71xaburhd42.cloudfront.net/4566c0d22ebf3c31180066ab23b6c445aeec78d5/5-Table1-1.png")
    note("- Minor formatting differences in copy/pasting")

    note("Product description repeated 61,036 times in C4")
    # The quoted passage is reproduced verbatim (including its odd grammar);
    # the closing quote marks balance the opening '“ so the quote is delimited.
    note("'“by combining fantastic ideas, interesting arrangements, and follow the "
         "current trends in the field of that make you more inspired and give artistic "
         "touches. We’d be honored if you can apply some or all of these design in your "
         "wedding. believe me, brilliant ideas would be perfect if it can be applied in "
         "real and make the people around you amazed!”'")
    see("https://www.amazon.co.uk/suryagede-100-Graffiti-Gas-Mask/dp/B07CRHT3RG")
    see("https://apkpure.com/100-graffiti-gas-mask/com.GraffitiGasMask.suryagede")

    # Title spelled the same way as in suffix_arrays() ("Deduplicating ...").
    note("Deduplicating training data makes language models better [Lee+ 2022]"), see("https://arxiv.org/pdf/2107.06499")
    note("- Train more efficiently (because have fewer tokens)")
    note("- Avoid memorization (can mitigate copyright, privacy concerns)")

    note("Design space")
    note("1. What is an item (sentence, paragraph, document)?")
    note("2. How to match (exact match, existence of common subitem, fraction of common subitems)?")
    note("3. What action to take (remove all, remove all but one)?")

    note("Key challenge:")
    note("- Deduplication is fundamentally about comparing items to other items")
    note("- Need linear time algorithms to scale")

    # Technique sections, from exact matching to fuzzy matching.
    hash_functions()
    exact_deduplication()
    bloom_filter()
    suffix_arrays()
    jaccard_minhash()
    locality_sensitive_hashing()
def hash_functions():
    """Introduce hash functions and the speed vs. collision-resistance tradeoff.

    Emits the explanatory notes and evaluates one MurmurHash call as a usage
    example. Nothing is returned.
    """
    note("Hash function h maps item to a hash value (integer or string)")
    note("Hash value much smaller than item")
    note("Hash collision: h(x) = h(y) for x ≠ y")

    note("Tradeoff between efficiency and collision resistance")
    note("- Cryptographic hash functions (SHA-256): collision resistant, slow (used in bitcoin)")
    note("- DJB2, MurmurHash, CityHash: not collision resistant, fast (used for hash tables)")

    # Use MurmurHash
    # (`h` is not read again in this function; the call is here as an example.)
    h = mmh3.hash("hello")
    see("https://softwareengineering.stackexchange.com/questions/49550/which-hashing-algorithm-is-best-for-uniqueness-and-speed")
def exact_deduplication():
    """Present exact-match deduplication: the C4 recipe, then a toy example.

    The toy example sorts the strings by their MurmurHash value, groups equal
    hashes together, and keeps the first string of every group.
    """
    note("## C4"), see("https://arxiv.org/pdf/1910.10683v4")
    note("1. Item: 3-sentence spans")
    note("2. Exact match")
    note("3. Remove all but one")
    note("Warning: when a 3-sentence span is removed from the middle of a document, "
         "the resulting document might lose coherence")

    note("## Simple example")
    note("1. Item: string")
    note("2. Exact match")
    note("3. Remove all but one")
    items = ["Hello!", "hello", "hello there", "hello", "hi", "bye"]

    # "Map" step: bring items with the same hash next to each other.
    # sorted() is stable, so equal-hash items keep their original relative order.
    items_by_hash = sorted(items, key=mmh3.hash)
    # "Reduce" step: groupby yields (hash, iterator over that hash's items).
    grouped = itertools.groupby(items_by_hash, key=mmh3.hash)

    # Keep only the first item from each group.
    deduped_items = [next(members) for _hash_value, members in grouped]
    note(deduped_items)

    note("Pro: simple, clear semantics, high precision")
    note("Con: does not deduplicate near duplicates")
    note("This code is written in a MapReduce way, can easily parallelize")
def bloom_filter():
    """Walk through Bloom filters: a one-hash table, a k-hash table, and the math.

    First builds a tiny table with a single hash function to show a false
    positive, then repeats with `k` hash functions, and finally steps through
    the false-positive-rate derivation. The repeated assignments to `f` (and
    the later reassignments of `m` and `k`) are expository: each one overwrites
    the previous value.
    """
    note("Goal: efficient, approximate data structure for testing set membership")

    note("Features of Bloom filters")
    note("- Memory efficient")
    note("- Can update, but can't delete")
    note("- If return 'no', definitely 'no'")
    note("- If return 'yes', most likely 'yes', but small probability of 'no'")
    note("- Can drive the false positive rate down exponentially with more time/compute")

    items = ["the", "cat", "in", "the", "hat"]

    note("First, make the range of hash function small.")
    m = 8 # Number of bins
    table = build_table(items, m)
    assert query_table(table, "the", m) == 1 # Correct
    assert query_table(table, "mat", m) == 0 # Correct
    # "what" is not in `items`, so a 1 here means it shares a bin with an inserted item.
    assert query_table(table, "what", m) == 1 # False positive!

    note("Problem: false positives for small bins")
    note("Naive solution: increase the number of bins")
    note("Error probability is O(1/num_bins), decreases polynomially with memory")

    note("Better solution: use more hash functions")
    k = 2 # Number of hash functions
    table = build_table_k(items, m, k)
    assert query_table_k(table, "the", m, k) == 1 # Correct
    assert query_table_k(table, "mat", m, k) == 0 # Correct

    note("## False positive rate")
    note("Assume independence of hash functions and bits"), see("https://en.wikipedia.org/wiki/Bloom_filter")
    m = 1000 # Number of bins
    k = 10 # Number of hash functions
    n = 100 # Number of items we're inserting

    # Insert one item, ask if a given test bin is 1?
    f = 1 / m # P[B(i) = 1 after 1 insertion with 1 hash function]
    f = 1 - (1 - 1 / m) ** k # P[B(i) = 1 after 1 insertion with k hash functions]

    # Insert n items, ask if a given test bin is 1?
    # k * n bits are set in total, each missing the test bin with probability 1 - 1/m.
    f = 1 - (1 - 1 / m) ** (k * n) # P[B(i) = 1 after n insertions of k hashes each]
    # A query probes k bins; it is a false positive when all k of them are 1.
    f = (1 - (1 - 1 / m) ** (k * n)) ** k # P[all k probed bins are 1] = false positive rate

    note("Optimal value of k (given fixed m / n ratio)")
    k = math.log(2) * m / n
    note("Resulting false positive rate")
    f = 0.5 ** k

    note("Tradeoff between compute (k), memory (m), and false positive rate (f)")
    see("https://people.eecs.berkeley.edu/~daw/teaching/cs170-s03/Notes/lecture10.pdf")

    note("Example: Dolma")
    note("- Set false positive rate to 1e-15")
    note("- Perform on paragraphs")
def build_table(items: List[str], num_bins: int):
    """Build a Bloom filter table of size `num_bins`, inserting `items` into it.

    Uses a single hash function: `mmh3.hash(item)` reduced modulo `num_bins`
    selects the one bin that is set for each item.

    Args:
        items: Strings to insert.
        num_bins: Number of bits in the table.

    Returns:
        A bitarray of length `num_bins` whose set bits are exactly the bins
        hit by `items`.
    """
    table = bitarray(num_bins)
    # An integer-initialised bitarray is not guaranteed to start zeroed in
    # every bitarray release; clear it so stale bits cannot look like members.
    table.setall(0)
    for item in items:
        # Python's % always yields a value in [0, num_bins), even for a
        # negative hash, so this is a valid index.
        table[mmh3.hash(item) % num_bins] = 1
    return table
def build_table_k(items: List[str], num_bins: int, k: int):
    """Build a Bloom filter table of size `num_bins`, inserting `items` into it.

    Uses `k` hash functions, obtained by seeding MurmurHash with
    0, 1, ..., k - 1; every item sets one bin per seed.

    Args:
        items: Strings to insert.
        num_bins: Number of bits in the table.
        k: Number of hash functions (seeds) to apply to each item.

    Returns:
        A bitarray of length `num_bins` with the bins hit by any
        (item, seed) pair set to 1 and all others 0.
    """
    table = bitarray(num_bins)
    # An integer-initialised bitarray is not guaranteed to start zeroed in
    # every bitarray release; clear it so stale bits cannot look like members.
    table.setall(0)
    for item in items:
        # For each of the k functions
        for seed in range(k):
            table[mmh3.hash(item, seed) % num_bins] = 1
    return table
def query_table(table: bitarray, item: str, num_bins: int, seed: int = 0):
    """Return the bit stored in the bin that `item` hashes to under `seed`.

    The bin is `mmh3.hash(item, seed) % num_bins`; the value found there is
    returned unchanged.
    """
    return table[mmh3.hash(item, seed) % num_bins]
def query_table_k(table: bitarray, item: str, num_bins: int, k: int):
    """Report whether `item`'s bin is set under every seed 0 .. k - 1.

    Returns True when all `k` lookups via `query_table` are truthy (also when
    `k` is 0, since there is nothing to contradict), otherwise False.
    """
    for seed in range(k):
        # One unset bin is enough to rule the item out; stop at the first.
        if not query_table(table, item, num_bins, seed):
            return False
    return True
def suffix_arrays():
    """Present n-gram-overlap deduplication and illustrate it with a toy suffix array.

    Emits the explanatory notes, then builds a suffix array over a small token
    list (two phrases separated by an end-of-text marker) and prints every
    suffix in sorted order.
    """
    note("Definition: two items are near duplicates if "
         "they share an n-gram [Lee+ 2022]"), see("https://arxiv.org/pdf/2107.06499")

    note("Example of two phrases that share a 3-gram")
    note("- the cat in the hat")
    note("- the dog in the hat")

    note("Deduplicating training data makes language models better [Lee+ 2022]")
    note("1. Item: document")
    note("2. Share an n-gram (for n = 50 using BPE tokenization)")
    note("3. Remove all but one n-gram (but keep the rest of the document)")

    note("Naive solution: map each n-gram to list of documents containing it")
    note("Slicker solution: suffix arrays")

    note("Suffix array is a data structure that stores all suffixes of a string S")
    note("- O(|S|) time to build")
    note("- Only 8 bytes of memory per element of S")

    # Tokens of the two example phrases, joined by an end-of-text marker.
    items = ["the", "cat", "in", "the", "hat", "<|endoftext|>",
             "the", "dog", "in", "the", "hat"]

    # This is not an efficient implementation
    # (it materialises every suffix as its own list and sorts the lists;
    # lists of strings compare element by element).
    suffix_array = sorted(items[i:] for i in range(len(items)))

    note("Suffix array")
    # One output line per suffix, tokens joined by spaces.
    for suffix in suffix_array:
        note(" ".join(suffix), verbatim=True)

    note("To find documents with shared n-grams, "
         "simply look at adjacent documents and compute the longest n")
def jaccard_minhash():
    """Introduce Jaccard similarity and MinHash, and check one against the other.

    Computes the exact Jaccard similarity of two small sets, then estimates it
    as the fraction of seeded MinHash functions on which the two sets agree,
    and asserts that the estimate is close to the exact value.
    """
    note("## Jaccard similarity")
    note("Jaccard similarity: size of intersection divided by size of union")
    A = {"1", "2", "3", "4"}
    B = {"1", "2", "3", "5"}

    def compute_jaccard(A, B):
        """Return |A ∩ B| / |A ∪ B|."""
        intersection = len(A & B)
        union = len(A | B)
        return intersection / union

    # Here the intersection is {"1", "2", "3"} and the union has 5 elements: 3 / 5.
    jaccard = compute_jaccard(A, B)

    note("Definition: two documents are near duplicates if "
         "their Jaccard similarity is above a certain threshold")
    note("Algorithmic challenge: find near duplicates in linear time")

    note("## MinHash")
    note("MinHash: a random hash function h so that "
         "Pr[h(A) = h(B)] = Jaccard(A, B)")
    note("Normally, you want different items to hash to different hashes, "
         "but here, you want collision probability to depend on similarity")

    def minhash(S: Set[str], seed: int):
        """Return the smallest seeded MurmurHash value over the elements of `S`."""
        return min(mmh3.hash(x, seed) for x in S)

    note("Characteristic matrix representation:")
    note("  | A | B", verbatim=True)
    note("1 | 1 | 1", verbatim=True)
    note("2 | 1 | 1", verbatim=True)
    note("3 | 1 | 1", verbatim=True)
    note("4 | 1 | 0", verbatim=True)
    note("5 | 0 | 1", verbatim=True)
    note("Random hash function induces a permutation over items")
    note("If 1, 2, 3 is first (min), then hash matches")
    note("If 4, 5 is first (min), then hash doesn't match")

    # Verify MinHash approximates Jaccard as advertised
    n = 100  # Generate this many random hash functions
    matches = [minhash(A, seed) == minhash(B, seed) for seed in range(n)]
    estimated_jaccard = count(matches, True) / len(matches)
    # NOTE(review): with n = 100 the estimate moves in steps of 0.01, so this
    # tolerance leaves almost no slack around 0.6. It is deterministic for
    # these fixed sets and seeds, but revisit it if A, B or n change.
    assert abs(estimated_jaccard - jaccard) < 0.01

    note("We have reduced the footprint of an item from set size to n")
    note("However, recall our goal was to find (A, B) with Jaccard(A, B) > threshold.")
    note("Do we still have to iterate over all pairs?")
def locality_sensitive_hashing():
    """Explain LSH banding and tabulate collision probability vs. similarity.

    Splits n MinHash functions into b bands of r functions each, derives the
    probability that two items collide in at least one band, prints that
    probability for a range of similarities under two (b, r) settings, and
    finally computes the similarity threshold implied by a given (b, r).
    Several locals (`n`, `prob_match`, the last `prob_collision`) are assigned
    only to show the quantity; they are not read again.
    """
    note("Locality sensitive hashing (LSH)")
    note("Goal: hash similar items together")
    note("More precisely: have A and B collide if Jaccard(A, B) > threshold")

    note("Suppose we hash examples using just one MinHash function")
    note("P[A and B collide] = Jaccard(A, B)")
    note("On average, more similar items will collide, but very stochastic...")

    note("Solution: use n hash functions")
    note("Break up into b bands of r hash functions each (n = b * r)")

    n = 12  # Number of hash functions
    b = 3   # Number of bands
    r = 4   # Number of hash functions per band
    note("Hash functions: h1 h2 h3 h4  |  h5 h6 h7 h8  |  h9 h10 h11 h12")

    note("Key: A and B collide if for *some* band, *all* its hash functions return same value")
    note("As we will see, the and-or structure of the bands sharpens the threshold")

    note("Given Jaccard(A, B), what is the probability that A and B collide?")

    def get_prob_collision(sim, b, r):
        """Return P[at least one of b bands matches] when each hash matches w.p. `sim`."""
        prob_match = sim ** r                       # Probability that a fixed band matches
        prob_collision = 1 - (1 - prob_match) ** b  # Probability that some band matches
        return prob_collision

    note("An example")
    prob_collision = get_prob_collision(sim=0.8, b=5, r=10)
    image("https://cdn.sanity.io/images/vr8gru94/production/b470799575b8e77911bacb8500977afef06d6c85-1280x720.png")

    note("Increasing r sharpens the threshold, moves the curve to the right (harder to match)")
    note("---")
    for sim in [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98]:
        prob_collision = get_prob_collision(sim=sim, b=20, r=20)  # Used in [Lee+ 2022]
        note(f"sim={sim}: P(collision) = {prob_collision}")
    note("---")
    for sim in [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98]:
        prob_collision = get_prob_collision(sim=sim, b=20, r=30)
        note(f"sim={sim}: P(collision) = {prob_collision}")

    note("Increasing b moves the curve to the left (easier to match)")
    image("https://cdn.sanity.io/images/vr8gru94/production/aace49fa240778e8ecf6e85ad08a2de7f5385566-1280x720.png")

    note("What is the threshold?")
    b = 20
    r = 450
    # The threshold is the similarity at which a single band matches with
    # probability 1 / b, i.e. the solution of sim ** r == 1 / b.
    threshold = (1 / b) ** (1 / r)
    prob_match = (1 / b)
    prob_collision = 1 - (1 - 1 / b) ** b  # approximately 1 - 1 / e (a constant)
    note("Example setting [Lee+ 2022]: n = 9000, b = 20, r = 450")

    note("References"), see("http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf")
def copyright():
    """Present the legal background for training on copyrighted data.

    Emits notes on intellectual property and copyright law, licenses, the
    four fair-use factors and how they bear on foundation models, terms of
    service, and further reading. Nothing is computed or returned.
    """
    note("Lots of lawsuits around generative AI, mostly around copyright")
    see("https://www.bakerlaw.com/services/artificial-intelligence-ai/case-tracker-artificial-intelligence-copyrights-and-class-actions/")

    note("## Intellectual property law")
    note("Goal: *incentivize* the creation of intellectual goods")
    note("Types of intellectual property: copyright, patents, trademarks, trade secrets.")

    note("## Copyright law")
    note("Goes back to 1709 in England (Statute of Anne), "
         "first time regulated by governments and courts"), see("https://en.wikipedia.org/wiki/Statute_of_Anne")
    note("In United States, most recent: Copyright Act of 1976"), see("https://en.wikipedia.org/wiki/Copyright_Act_of_1976")
    note("Copyright protection applies to "
         "'original works of authorship fixed in any tangible medium of expression, "
         "now known or later developed, from which they can be perceived, reproduced, "
         "or otherwise communicated, either directly or with the aid of a machine or device'")

    note("Original works, so collections not copyrightable (e.g., telephone directories) "
         "unless there is some creativity in the selection or arrangement")
    note("Copyright applies to expression, not ideas (e.g., quicksort)")

    note("Expanded scope from 'published' (1909) to 'fixed'")
    note("Registration not required for copyright protection (in contrast with patents)")
    note("Threshold for copyright is extremely low (e.g., your website)")

    note("Registration is required before creator can sue someone for copyright infringement")
    note("Costs $65 to register"), see("https://www.copyright.gov/about/fees.html")
    # NOTE(review): confirm the stated duration against current law before reusing this slide.
    note("Lasts for 75 years, and then the copyright expires and it becomes part of the public domain "
         "(works of Shakespeare, Beethoven, most of Project Gutenberg, etc.)")

    note("Summary: most things on the Internet are actually copyrighted.")

    note("How to use a copyrighted work:")
    note("1. Get a license for it.")
    note("2. Appeal to the fair use clause.")

    note("## Licenses")
    note("A license (from contract law) is granted by a licensor to a licensee.")
    note("Effectively, 'a license is a promise not to sue'.")

    note("Creative Commons licenses enable free distribution of copyrighted work.")
    note("Examples: Wikipedia, Open Courseware, Khan Academy, Free Music Archive, "
         "307 million images from Flickr, 39 million images from MusicBrainz, 10 million videos from YouTube, etc.")
    note("Created by Lessig and Eldred in 2001 to bridge public domain and existing copyright")

    note("Many model developers license data for training foundation models")
    note("- Google and Reddit"), see("https://www.reuters.com/technology/reddit-ai-content-licensing-deal-with-google-sources-say-2024-02-22/")
    note("- OpenAI and Shutterstock"), see("https://investor.shutterstock.com/news-releases/news-release-details/shutterstock-expands-partnership-openai-signs-new-six-year")
    note("- OpenAI and StackExchange"), see("https://stackoverflow.co/company/press/archive/openai-partnership")

    note("## Fair use (section 107)")
    note("Four factors to determine whether fair use applies:")
    note("1. The purpose and character of the use "
         "(educational favored over commercial, transformative favored over reproductive)")
    # Phrased like factors 1 and 3: the first-listed option is the one that
    # weighs toward fair use, which for this factor is factual / non-creative work.
    note("2. The nature of the copyrighted work "
         "(factual favored over fictional, non-creative over creative)")
    note("3. The amount and substantiality of the portion of the original work used "
         "(using a snippet favored over using the whole work)")
    note("4. The effect of the use upon the market (or potential market) for the original work")

    note("Examples of fair use:")
    note("- You watch a movie and write a summary of it")
    note("- Reimplement an algorithm (the idea) rather than copying the code (the expression)")
    note("- Google Books index and show snippets (Authors Guild v. Google 2002-2013)")

    note("Copyright is not about verbatim memorization")
    note("- Plots and characters (e.g., Mickey Mouse) can be copyrightable")
    note("- Parody is likely fair use")
    note("Copyright is about semantics (and economics)")

    note("Considerations for foundation models")
    note("Copying data (first step of training) is violation already even if you don't do anything with it.")
    note("Training an ML model is transformative (far from just copy/pasting)")
    note("ML system is interested in idea (e.g., stop sign), "
         "not in the concrete expression (e.g., exact artistic choices of a particular image of a stop sign).")
    note("Problem: language models can definitely affect the market (writers, artists)")

    note("## Terms of service")
    note("Even if you have a license or can appeal to fair use, "
         "terms of service might impose additional restrictions.")
    note("Example: YouTube's terms of service prohibits downloading videos, "
         "even if the videos are licensed under Creative Commons.")

    # Further reading.
    note("Course notes"), see("https://stanford-cs324.github.io/winter2022/lectures/legality/")
    note("Fair learning [Lemley & Casey]"), see("https://texaslawreview.org/fair-learning/")
    note("Foundation models and fair use [Henderson+ 2023]"), see("https://arxiv.org/pdf/2303.15715")
# Script entry point: runs only when this file is executed directly.
if __name__ == "__main__":
    # `init_content` is defined outside this file (presumably in `util`);
    # presumably it selects the output file for the notes — confirm there.
    init_content("lecture_13-content.js")
    lecture_13()