<!DOCTYPE html>
<html lang="en" dir="auto">
<head>
<meta name="generator" content="Hugo 0.139.3"><meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="robots" content="index, follow">
<title>Lil'Log</title>
<meta name="description" content="Document my learning notes.">
<meta name="author" content="Lilian Weng">
<link rel="canonical" href="https://lilianweng.github.io/" />
<link crossorigin="anonymous" href="/assets/css/stylesheet.min.67a6fb6e33089cb29e856bcc95d7aa39f70049a42b123105531265a0d9f1258b.css" integrity="sha256-Z6b7bjMInLKehWvMldeqOfcASaQrEjEFUxJloNnxJYs=" rel="preload stylesheet" as="style">
<link rel="icon" href="https://lilianweng.github.io/favicon_wine.ico">
<link rel="icon" type="image/png" sizes="16x16" href="https://lilianweng.github.io/favicon-16x16.png">
<link rel="icon" type="image/png" sizes="32x32" href="https://lilianweng.github.io/favicon-32x32.png">
<link rel="apple-touch-icon" href="https://lilianweng.github.io/apple-touch-icon.png">
<link rel="mask-icon" href="https://lilianweng.github.io/safari-pinned-tab.svg">
<meta name="theme-color" content="#2e2e33">
<meta name="msapplication-TileColor" content="#2e2e33">
<link rel="alternate" type="application/rss+xml" href="https://lilianweng.github.io/index.xml">
<link rel="alternate" type="application/json" href="https://lilianweng.github.io/index.json">
<link rel="alternate" hreflang="en" href="https://lilianweng.github.io/" />
<noscript>
<style>
#theme-toggle,
.top-link {
display: none;
}
</style>
<style>
@media (prefers-color-scheme: dark) {
:root {
--theme: rgb(29, 30, 32);
--entry: rgb(46, 46, 51);
--primary: rgb(218, 218, 219);
--secondary: rgb(155, 156, 157);
--tertiary: rgb(65, 66, 68);
--content: rgb(196, 196, 197);
--hljs-bg: rgb(46, 46, 51);
--code-bg: rgb(55, 56, 62);
--border: rgb(51, 51, 51);
}
.list {
background: var(--theme);
}
.list:not(.dark)::-webkit-scrollbar-track {
background: 0 0;
}
.list:not(.dark)::-webkit-scrollbar-thumb {
border-color: var(--theme);
}
}
</style>
</noscript>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-HFT45VFBX6"></script>
<script>
var doNotTrack = false;
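// This appears to be Hugo's generated Google Analytics snippet; with do-not-track checking disabled in the site config, the hard-coded "false" below means the DNT branch never runs.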
if ( false ) {
var dnt = (navigator.doNotTrack || window.doNotTrack || navigator.msDoNotTrack);
var doNotTrack = (dnt == "1" || dnt == "yes");
}
if (!doNotTrack) {
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-HFT45VFBX6');
}
</script><meta property="og:title" content="Lil'Log" />
<meta property="og:description" content="Document my learning notes." />
<meta property="og:type" content="website" />
<meta property="og:url" content="https://lilianweng.github.io/" />
<meta name="twitter:card" content="summary"/>
<meta name="twitter:title" content="Lil'Log"/>
<meta name="twitter:description" content="Document my learning notes."/>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Organization",
"name": "Lil'Log",
"url": "https://lilianweng.github.io/",
"description": "Document my learning notes.",
"thumbnailUrl": "https://lilianweng.github.io/favicon_wine.ico",
"sameAs": [
"https://twitter.com/lilianweng/", "https://scholar.google.com/citations?user=dCa-pW8AAAAJ\u0026hl=en\u0026oi=ao", "index.xml", "https://github.com/lilianweng"
]
}
</script>
</head>
<body class="list" id="top">
<script>
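// Apply the saved or system-preferred color scheme before the page renders, so there is no flash of the wrong theme.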
if (localStorage.getItem("pref-theme") === "dark") {
document.body.classList.add('dark');
} else if (localStorage.getItem("pref-theme") === "light") {
document.body.classList.remove('dark')
} else if (window.matchMedia('(prefers-color-scheme: dark)').matches) {
document.body.classList.add('dark');
}
</script>
<script>
MathJax = {
tex: {
inlineMath: [['$', '$'], ['\\(', '\\)']],
displayMath: [['$$','$$'], ['\\[', '\\]']],
processEscapes: true,
processEnvironments: true
},
options: {
skipHtmlTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
};
window.addEventListener('load', (event) => {
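// After everything has loaded, mark the parent of each rendered MathJax container with a 'has-jax' class so the theme stylesheet can target elements that contain math.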
document.querySelectorAll("mjx-container").forEach(function(x){
x.parentElement.classList += 'has-jax'})
});
</script>
<script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
<script type="text/javascript" id="MathJax-script" async
src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<header class="header">
<nav class="nav">
<div class="logo">
<a href="https://lilianweng.github.io/" accesskey="h" title="Lil'Log (Alt + H)">Lil'Log</a>
<span class="logo-switches">
<button id="theme-toggle" accesskey="t" title="(Alt + T)">
<svg id="moon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"
fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round"
stroke-linejoin="round">
<path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"></path>
</svg>
<svg id="sun" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24"
fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round"
stroke-linejoin="round">
<circle cx="12" cy="12" r="5"></circle>
<line x1="12" y1="1" x2="12" y2="3"></line>
<line x1="12" y1="21" x2="12" y2="23"></line>
<line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
<line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
<line x1="1" y1="12" x2="3" y2="12"></line>
<line x1="21" y1="12" x2="23" y2="12"></line>
<line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
<line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
</svg>
</button>
<ul class="lang-switch"><li>|</li>
</ul>
</span>
</div>
<ul id="menu">
<li>
<a href="https://lilianweng.github.io/" title="Posts">
<span class="active">Posts</span>
</a>
</li>
<li>
<a href="https://lilianweng.github.io/archives" title="Archive">
<span>Archive</span>
</a>
</li>
<li>
<a href="https://lilianweng.github.io/search/" title="Search (Alt + /)" accesskey=/>
<span>Search</span>
</a>
</li>
<li>
<a href="https://lilianweng.github.io/tags/" title="Tags">
<span>Tags</span>
</a>
</li>
<li>
<a href="https://lilianweng.github.io/faq" title="FAQ">
<span>FAQ</span>
</a>
</li>
<li>
<a href="https://www.emojisearch.app/" title="emojisearch.app">
<span>emojisearch.app</span>
</a>
</li>
</ul>
</nav>
</header>
<main class="main">
<article class="first-entry home-info">
<header class="entry-header">
<h1>👋 Welcome to Lil’Log</h1>
</header>
<section class="entry-content">
<p>Hi, this is Lilian. I’ve been documenting my learning notes in this blog since 2017. Based on the number of grammar mistakes in my posts, you can tell how much ChatGPT is involved 😉.</p>
</section>
<footer class="entry-footer">
<div class="social-icons">
<a href="https://twitter.com/lilianweng/" target="_blank" rel="noopener noreferrer me" title="Twitter">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"
stroke-linecap="round" stroke-linejoin="round">
<path
d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z">
</path>
</svg>
</a>
<a href="https://scholar.google.com/citations?user=dCa-pW8AAAAJ&hl=en&oi=ao" target="_blank" rel="noopener noreferrer me" title="Other">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"
stroke-linecap="round" stroke-linejoin="round">
<path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"></path>
<path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"></path>
</svg>
</a>
<a href="index.xml" target="_blank" rel="noopener noreferrer me" title="Rss">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"
stroke-linecap="round" stroke-linejoin="round">
<path d="M4 11a9 9 0 0 1 9 9" />
<path d="M4 4a16 16 0 0 1 16 16" />
<circle cx="5" cy="19" r="1" />
</svg>
</a>
<a href="https://github.com/lilianweng" target="_blank" rel="noopener noreferrer me" title="Github">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"
stroke-linecap="round" stroke-linejoin="round">
<path
d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22">
</path>
</svg>
</a>
</div>
</footer>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Reward Hacking in Reinforcement Learning
</h2>
</header>
<section class="entry-content">
<p>Reward hacking occurs when a reinforcement learning (RL) agent exploits flaws or ambiguities in the reward function to achieve high rewards, without genuinely learning or completing the intended task. Reward hacking exists because RL environments are often imperfect, and it is fundamentally challenging to accurately specify a reward function.
With the rise of language models generalizing to a broad spectrum of tasks and RLHF becoming a de facto method for alignment training, reward hacking in the RL training of language models has become a critical practical challenge. Instances where the model learns to modify unit tests to pass coding tasks, or where responses contain biases that mimic a user’s preference, are quite concerning and are likely among the major blockers for real-world deployment of more autonomous AI use cases.
...</p>
</section>
<footer class="entry-footer">Date: November 28, 2024 | Estimated Reading Time: 37 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Reward Hacking in Reinforcement Learning" href="https://lilianweng.github.io/posts/2024-11-28-reward-hacking/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Extrinsic Hallucinations in LLMs
</h2>
</header>
<section class="entry-content">
<p>Hallucination in large language models usually refers to the model generating unfaithful, fabricated, inconsistent, or nonsensical content. As a term, hallucination has been somewhat generalized to cases when the model makes mistakes. Here, I would like to narrow down the problem of hallucination to cases where the model output is fabricated and not grounded by either the provided context or world knowledge.
There are two types of hallucination:
In-context hallucination: The model output should be consistent with the source content in context. Extrinsic hallucination: The model output should be grounded by the pre-training dataset. However, given the size of the pre-training dataset, it is too expensive to retrieve and identify conflicts per generation. If we consider the pre-training data corpus as a proxy for world knowledge, we essentially try to ensure the model output is factual and verifiable by external world knowledge. Equally importantly, when the model does not know about a fact, it should say so. This post focuses on extrinsic hallucination. To avoid hallucination, LLMs need to be (1) factual and (2) acknowledge not knowing the answer when applicable.
...</p>
</section>
<footer class="entry-footer">Date: July 7, 2024 | Estimated Reading Time: 30 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Extrinsic Hallucinations in LLMs" href="https://lilianweng.github.io/posts/2024-07-07-hallucination/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Diffusion Models for Video Generation
</h2>
</header>
<section class="entry-content">
<p>Diffusion models have demonstrated strong results on image synthesis in recent years. Now the research community has started working on a harder task—using them for video generation. The task itself is a superset of the image case, since an image is a video of a single frame, and it is much more challenging because:
It has extra requirements on temporal consistency across frames, which naturally demands more world knowledge to be encoded into the model. In comparison to text or images, it is more difficult to collect large amounts of high-quality, high-dimensional video data, let alone text-video pairs. 🥑 Required Pre-read: Please make sure you have read the previous blog on “What are Diffusion Models?” for image generation before continuing here. ...</p>
</section>
<footer class="entry-footer">Date: April 12, 2024 | Estimated Reading Time: 20 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Diffusion Models for Video Generation" href="https://lilianweng.github.io/posts/2024-04-12-diffusion-video/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Thinking about High-Quality Human Data
</h2>
</header>
<section class="entry-content">
<p>[Special thanks to Ian Kivlichan for many useful pointers (e.g. the 100+ year old Nature paper “Vox populi”) and nice feedback. 🙏]
High-quality data is the fuel for modern deep learning model training. Most task-specific labeled data comes from human annotation, such as for classification tasks or RLHF labeling (which can be framed as a classification task) for LLM alignment training. Many ML techniques in this post can help with data quality, but fundamentally human data collection requires attention to detail and careful execution. The community knows the value of high-quality data, but somehow we have this subtle impression that “Everyone wants to do the model work, not the data work” (Sambasivan et al. 2021).
...</p>
</section>
<footer class="entry-footer">Date: February 5, 2024 | Estimated Reading Time: 21 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Thinking about High-Quality Human Data" href="https://lilianweng.github.io/posts/2024-02-05-human-data-quality/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Adversarial Attacks on LLMs
</h2>
</header>
<section class="entry-content">
<p>The use of large language models in the real world has been strongly accelerated by the launch of ChatGPT. We (including my team at OpenAI, shoutout to them) have invested a lot of effort in building default safe behavior into the model during the alignment process (e.g. via RLHF). However, adversarial attacks or jailbreak prompts could potentially trigger the model to output something undesired.
A large body of groundwork on adversarial attacks exists for images, which, unlike text, live in a continuous, high-dimensional space. Attacks on discrete data like text have been considered a lot more challenging, due to the lack of direct gradient signals. My past post on Controllable Text Generation is quite relevant to this topic, as attacking LLMs is essentially about controlling the model to output a certain type of (unsafe) content.
...</p>
</section>
<footer class="entry-footer">Date: October 25, 2023 | Estimated Reading Time: 33 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Adversarial Attacks on LLMs" href="https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>LLM Powered Autonomous Agents
</h2>
</header>
<section class="entry-content">
<p>Building agents with an LLM (large language model) as their core controller is a cool concept. Several proof-of-concept demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potential of LLMs extends beyond generating well-written copy, stories, essays and programs; they can be framed as powerful general problem solvers.
Agent System Overview In an LLM-powered autonomous agent system, the LLM functions as the agent’s brain, complemented by several key components:
Planning Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks. Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of the final results. Memory Short-term memory: I would consider all the in-context learning (See Prompt Engineering) as utilizing the short-term memory of the model to learn. Long-term memory: This provides the agent with the capability to retain and recall (infinite) information over extended periods, often by leveraging an external vector store and fast retrieval. Tool use The agent learns to call external APIs for extra information that is missing from the model weights (often hard to change after pre-training), including current information, code execution capability, access to proprietary information sources and more. Fig. 1. Overview of an LLM-powered autonomous agent system. Component One: Planning A complicated task usually involves many steps. An agent needs to know what they are and plan ahead.
...</p>
</section>
<footer class="entry-footer">Date: June 23, 2023 | Estimated Reading Time: 31 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to LLM Powered Autonomous Agents" href="https://lilianweng.github.io/posts/2023-06-23-agent/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Prompt Engineering
</h2>
</header>
<section class="entry-content">
<p>Prompt Engineering, also known as In-Context Prompting, refers to methods for communicating with an LLM to steer its behavior toward desired outcomes without updating the model weights. It is an empirical science, and the effect of prompt engineering methods can vary a lot among models, thus requiring heavy experimentation and heuristics.
This post focuses only on prompt engineering for autoregressive language models, so it does not cover Cloze tests, image generation or multimodal models. At its core, the goal of prompt engineering is alignment and model steerability. Check my previous post on controllable text generation.
...</p>
</section>
<footer class="entry-footer">Date: March 15, 2023 | Estimated Reading Time: 21 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Prompt Engineering" href="https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>The Transformer Family Version 2.0
</h2>
</header>
<section class="entry-content">
<p>Many new Transformer architecture improvements have been proposed since my last post on “The Transformer Family” about three years ago. Here I did a big refactoring and enrichment of that 2020 post, restructuring the hierarchy of sections and improving many of them with more recent papers. Version 2.0 is a superset of the old version, about twice the length.
Notations Symbol Meaning $d$ The model size / hidden state dimension / positional encoding size. $h$ The number of heads in multi-head attention layer. $L$ The segment length of input sequence. $N$ The total number of attention layers in the model; not considering MoE. $\mathbf{X} \in \mathbb{R}^{L \times d}$ The input sequence where each element has been mapped into an embedding vector of shape $d$, same as the model size. $\mathbf{W}^k \in \mathbb{R}^{d \times d_k}$ The key weight matrix. $\mathbf{W}^q \in \mathbb{R}^{d \times d_k}$ The query weight matrix. $\mathbf{W}^v \in \mathbb{R}^{d \times d_v}$ The value weight matrix. Often we have $d_k = d_v = d$. $\mathbf{W}^k_i, \mathbf{W}^q_i \in \mathbb{R}^{d \times d_k/h}; \mathbf{W}^v_i \in \mathbb{R}^{d \times d_v/h}$ The weight matrices per head. $\mathbf{W}^o \in \mathbb{R}^{d_v \times d}$ The output weight matrix. $\mathbf{Q} = \mathbf{X}\mathbf{W}^q \in \mathbb{R}^{L \times d_k}$ The query embedding inputs. $\mathbf{K} = \mathbf{X}\mathbf{W}^k \in \mathbb{R}^{L \times d_k}$ The key embedding inputs. $\mathbf{V} = \mathbf{X}\mathbf{W}^v \in \mathbb{R}^{L \times d_v}$ The value embedding inputs. $\mathbf{q}_i, \mathbf{k}_i \in \mathbb{R}^{d_k}, \mathbf{v}_i \in \mathbb{R}^{d_v}$ Row vectors in query, key, value matrices, $\mathbf{Q}$, $\mathbf{K}$ and $\mathbf{V}$. $S_i$ A collection of key positions for the $i$-th query $\mathbf{q}_i$ to attend to. $\mathbf{A} \in \mathbb{R}^{L \times L}$ The self-attention matrix between an input sequence of length $L$ and itself. $\mathbf{A} = \text{softmax}(\mathbf{Q}\mathbf{K}^\top / \sqrt{d_k})$. $a_{ij} \in \mathbf{A}$ The scalar attention score between query $\mathbf{q}_i$ and key $\mathbf{k}_j$. $\mathbf{P} \in \mathbb{R}^{L \times d}$ The position encoding matrix, where the $i$-th row $\mathbf{p}_i$ is the positional encoding for input $\mathbf{x}_i$. Transformer Basics The Transformer (which will be referred to as “vanilla Transformer” to distinguish it from other enhanced versions; Vaswani, et al., 2017) model has an encoder-decoder architecture, as commonly used in many NMT models. Later, simplified Transformers were shown to achieve great performance on language modeling tasks, like the encoder-only BERT or decoder-only GPT.
...</p>
</section>
<footer class="entry-footer">Date: January 27, 2023 | Estimated Reading Time: 46 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to The Transformer Family Version 2.0" href="https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/"></a>
</article>
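<!--
  Illustrative sketch: a minimal JavaScript rendering of the scaled dot-product attention
  summarized in the preview above, A = softmax(Q K^T / sqrt(d_k)). Matrices are plain nested
  arrays and the example values are hypothetical placeholders; this is a sketch of the formula,
  not code from the post.

  function matmulTransposed(Q, K) {
    // Q: L x d_k, K: L x d_k; returns Q K^T of shape L x L.
    return Q.map(q => K.map(k => q.reduce((s, qi, i) => s + qi * k[i], 0)));
  }

  function softmaxRow(row) {
    // Numerically stable softmax over one row.
    const m = Math.max(...row);
    const exps = row.map(x => Math.exp(x - m));
    const z = exps.reduce((a, b) => a + b, 0);
    return exps.map(e => e / z);
  }

  function attentionMatrix(Q, K) {
    // A = softmax(Q K^T / sqrt(d_k)), softmax applied row by row.
    const dk = K[0].length;
    return matmulTransposed(Q, K).map(row => softmaxRow(row.map(x => x / Math.sqrt(dk))));
  }

  // Hypothetical example with L = 2, d_k = 2:
  // attentionMatrix([[1, 0], [0, 1]], [[1, 0], [0, 1]]);
-->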
<article class="post-entry">
<header class="entry-header">
<h2>Large Transformer Model Inference Optimization
</h2>
</header>
<section class="entry-content">
<p>[Updated on 2023-01-24: add a small section on Distillation.]
Large transformer models are mainstream nowadays, creating SoTA results for a variety of tasks. They are powerful but very expensive to train and use. The extremely high inference cost, in both time and memory, is a big bottleneck for adopting a powerful transformer for solving real-world tasks at scale.
Why is it hard to run inference for large transformer models? Besides the increasing size of SoTA models, there are two main factors contributing to the inference challenge (Pope et al. 2022):
...</p>
</section>
<footer class="entry-footer">Date: January 10, 2023 | Estimated Reading Time: 9 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Large Transformer Model Inference Optimization" href="https://lilianweng.github.io/posts/2023-01-10-inference-optimization/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Some Math behind Neural Tangent Kernel
</h2>
</header>
<section class="entry-content">
<p>Neural networks are well known to be over-parameterized and can often easily fit data to near-zero training loss while still achieving decent generalization performance on the test dataset. Although all these parameters are initialized at random, the optimization process consistently leads to similarly good outcomes. And this is true even when the number of model parameters exceeds the number of training data points.
The neural tangent kernel (NTK) (Jacot et al. 2018) is a kernel that explains the evolution of neural networks during training via gradient descent. It leads to great insights into why neural networks with enough width can consistently converge to a global minimum when trained to minimize an empirical loss. In this post, we will do a deep dive into the motivation and definition of NTK, as well as the proof of deterministic convergence at different initializations of infinitely wide neural networks by characterizing NTK in such a setting.
...</p>
</section>
<footer class="entry-footer">Date: September 8, 2022 | Estimated Reading Time: 17 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Some Math behind Neural Tangent Kernel" href="https://lilianweng.github.io/posts/2022-09-08-ntk/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Generalized Visual Language Models
</h2>
</header>
<section class="entry-content">
<p>Processing images to generate text, such as image captioning and visual question-answering, has been studied for years. Traditionally such systems rely on an object detection network as a vision encoder to capture visual features and then produce text via a text decoder. Given a large amount of existing literature, in this post, I would like to only focus on one approach for solving vision language tasks, which is to extend pre-trained generalized language models to be capable of consuming visual signals.
...</p>
</section>
<footer class="entry-footer">Date: June 9, 2022 | Estimated Reading Time: 25 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Generalized Visual Language Models" href="https://lilianweng.github.io/posts/2022-06-09-vlm/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Learning with not Enough Data Part 3: Data Generation
</h2>
</header>
<section class="entry-content">
<p>Here comes Part 3 on learning with not enough data (Previous: Part 1 and Part 2). Let’s consider two approaches for generating synthetic data for training.
Augmented data. Given a set of existing training samples, we can apply a variety of augmentations, distortions and transformations to derive new data points without losing the key attributes. We have covered a bunch of augmentation methods on text and images in a previous post on contrastive learning. For the sake of post completeness, I duplicate the section on data augmentation here with some edits. New data. Given few or even no data points, we can rely on powerful pretrained models to generate a number of new data points. This is especially true in recent years given the fast progress in large pretrained language models (LMs). Few-shot prompting has been shown to be effective for LMs to learn in context without extra training. Data Augmentation The goal of data augmentation is to modify the input format (e.g. text wording, visual appearance) while the semantic meaning stays unchanged.
...</p>
</section>
<footer class="entry-footer">Date: April 15, 2022 | Estimated Reading Time: 28 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Learning with not Enough Data Part 3: Data Generation" href="https://lilianweng.github.io/posts/2022-04-15-data-gen/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Learning with not Enough Data Part 2: Active Learning
</h2>
</header>
<section class="entry-content">
<p> This is part 2 of what to do when facing a limited amount of labeled data for supervised learning tasks. This time we will get some amount of human labeling work involved, but within a budget limit, and therefore we need to be smart when selecting which samples to label.
...</p>
</section>
<footer class="entry-footer">Date: February 20, 2022 | Estimated Reading Time: 22 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Learning with not Enough Data Part 2: Active Learning" href="https://lilianweng.github.io/posts/2022-02-20-active-learning/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Learning with not Enough Data Part 1: Semi-Supervised Learning
</h2>
</header>
<section class="entry-content">
<p> When facing a limited amount of labeled data for supervised learning tasks, four approaches are commonly discussed.
...</p>
</section>
<footer class="entry-footer">Date: December 5, 2021 | Estimated Reading Time: 26 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Learning with not Enough Data Part 1: Semi-Supervised Learning" href="https://lilianweng.github.io/posts/2021-12-05-semi-supervised/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>How to Train Really Large Models on Many GPUs?
</h2>
</header>
<section class="entry-content">
<p> [Updated on 2022-03-13: add expert choice routing.] [Updated on 2022-06-10: Greg and I wrote a shorter and upgraded version of this post, published on the OpenAI Blog: “Techniques for Training Large Neural Networks”.]
...</p>
</section>
<footer class="entry-footer">Date: September 24, 2021 | Estimated Reading Time: 21 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to How to Train Really Large Models on Many GPUs?" href="https://lilianweng.github.io/posts/2021-09-25-train-large/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>What are Diffusion Models?
</h2>
</header>
<section class="entry-content">
<p> [Updated on 2021-09-19: Highly recommend this blog post on score-based generative modeling by Yang Song (author of several key papers in the references).] [Updated on 2022-08-27: Added classifier-free guidance, GLIDE, unCLIP and Imagen.] [Updated on 2022-08-31: Added latent diffusion model.] [Updated on 2024-04-13: Added progressive distillation, consistency models, and the Model Architecture section.]
...</p>
</section>
<footer class="entry-footer">Date: July 11, 2021 | Estimated Reading Time: 32 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to What are Diffusion Models?" href="https://lilianweng.github.io/posts/2021-07-11-diffusion-models/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Contrastive Representation Learning
</h2>
</header>
<section class="entry-content">
<p> The goal of contrastive representation learning is to learn an embedding space in which similar sample pairs stay close to each other while dissimilar ones are far apart. Contrastive learning can be applied to both supervised and unsupervised settings. When working with unsupervised data, contrastive learning is one of the most powerful approaches in self-supervised learning.
...</p>
</section>
<footer class="entry-footer">Date: May 31, 2021 | Estimated Reading Time: 39 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Contrastive Representation Learning" href="https://lilianweng.github.io/posts/2021-05-31-contrastive/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Reducing Toxicity in Language Models
</h2>
</header>
<section class="entry-content">
<p> Large pretrained language models are trained over a sizable collection of online data. They unavoidably acquire certain toxic behaviors and biases from the Internet. Pretrained language models are very powerful and have shown great success in many NLP tasks. However, safely deploying them for practical real-world applications demands strong safety control over the model generation process.
...</p>
</section>
<footer class="entry-footer">Date: March 21, 2021 | Estimated Reading Time: 23 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Reducing Toxicity in Language Models" href="https://lilianweng.github.io/posts/2021-03-21-lm-toxicity/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>Controllable Neural Text Generation
</h2>
</header>
<section class="entry-content">
<p> [Updated on 2021-02-01: Updated to version 2.0 with several work added and many typos fixed.] [Updated on 2021-05-26: Add P-tuning and Prompt Tuning in the “prompt design” section.] [Updated on 2021-09-19: Add “unlikelihood training”.]
...</p>
</section>
<footer class="entry-footer">Date: January 2, 2021 | Estimated Reading Time: 42 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to Controllable Neural Text Generation" href="https://lilianweng.github.io/posts/2021-01-02-controllable-text-generation/"></a>
</article>
<article class="post-entry">
<header class="entry-header">
<h2>How to Build an Open-Domain Question Answering System?
</h2>
</header>
<section class="entry-content">
<p> [Updated on 2020-11-12: add an example on closed-book factual QA using the OpenAI API (beta).]
A model that can answer any question with regard to factual knowledge can lead to many useful and practical applications, such as working as a chatbot or an AI assistant 🤖. In this post, we will review several common approaches for building such an open-domain question answering system.
...</p>
</section>
<footer class="entry-footer">Date: October 29, 2020 | Estimated Reading Time: 33 min | Author: Lilian Weng</footer>
<a class="entry-link" aria-label="post link to How to Build an Open-Domain Question Answering System?" href="https://lilianweng.github.io/posts/2020-10-29-odqa/"></a>
</article>
<footer class="page-footer">
<nav class="pagination">
<a class="next" href="https://lilianweng.github.io/page/2/"> »</a>
</nav>
</footer>
</main>
<footer class="footer">
<span>© 2024 <a href="https://lilianweng.github.io/">Lil'Log</a></span>
<span>
Powered by
<a href="https://gohugo.io/" rel="noopener noreferrer" target="_blank">Hugo</a> &
<a href="https://git.io/hugopapermod" rel="noopener" target="_blank">PaperMod</a>
</span>
</footer>
<a href="#top" aria-label="go to top" title="Go to Top (Alt + G)" class="top-link" id="top-link" accesskey="g">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 12 6" fill="currentColor">
<path d="M12 6H0l6-6z" />
</svg>
</a>
<script>
let menu = document.getElementById('menu')
if (menu) {
menu.scrollLeft = localStorage.getItem("menu-scroll-position");
menu.onscroll = function () {
localStorage.setItem("menu-scroll-position", menu.scrollLeft);
}
}
document.querySelectorAll('a[href^="#"]').forEach(anchor => {
anchor.addEventListener("click", function (e) {
e.preventDefault();
var id = this.getAttribute("href").substr(1);
if (!window.matchMedia('(prefers-reduced-motion: reduce)').matches) {
document.querySelector(`[id='${decodeURIComponent(id)}']`).scrollIntoView({
behavior: "smooth"
});
} else {
document.querySelector(`[id='${decodeURIComponent(id)}']`).scrollIntoView();
}
if (id === "top") {
history.replaceState(null, null, " ");
} else {
history.pushState(null, null, `#${id}`);
}
});
});
</script>
<script>
var mybutton = document.getElementById("top-link");
window.onscroll = function () {
if (document.body.scrollTop > 800 || document.documentElement.scrollTop > 800) {
mybutton.style.visibility = "visible";
mybutton.style.opacity = "1";
} else {
mybutton.style.visibility = "hidden";
mybutton.style.opacity = "0";
}
};
</script>
<script>
document.getElementById("theme-toggle").addEventListener("click", () => {
if (document.body.className.includes("dark")) {
document.body.classList.remove('dark');
localStorage.setItem("pref-theme", 'light');
} else {
document.body.classList.add('dark');
localStorage.setItem("pref-theme", 'dark');
}
})
</script>
</body>
</html>