Add quickstart tutorial (#528)
dsmilkov authored Aug 10, 2023
1 parent bf714f9 commit 6f8870b
Showing 28 changed files with 194 additions and 87 deletions.
3 changes: 2 additions & 1 deletion .prettierrc
@@ -4,11 +4,12 @@
"arrowParens": "avoid",
"trailingComma": "none",
"printWidth": 100,
"proseWrap": "always",
"plugins": [
"prettier-plugin-svelte",
"prettier-plugin-organize-imports",
"prettier-plugin-tailwindcss"
],
"pluginSearchDirs": ["."],
"overrides": [{ "files": "*.svelte", "options": { "parser": "svelte" } }]
"overrides": [{"files": "*.svelte", "options": {"parser": "svelte"}}]
}
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -27,6 +27,9 @@
"[typescript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"[markdown]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"typescript.format.insertSpaceAfterOpeningAndBeforeClosingNonemptyBraces": false,
"[typescriptreact]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
1 change: 1 addition & 0 deletions docs/_static/custom.js
@@ -0,0 +1 @@
document.querySelectorAll('video').forEach(v => (v.playbackRate = 1.5));
Binary file added docs/_static/getting_started/orca-download.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-index-response.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-load.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-pii-enrichment.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-pii-filter.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-profanity-preview.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-profanity-stats.mp4
Binary file not shown.
Binary file added docs/_static/getting_started/orca-settings.mp4
Binary file not shown.
3 changes: 3 additions & 0 deletions docs/conf.py
@@ -23,6 +23,8 @@
'myst_parser',
]

myst_enable_extensions = ['attrs_block', 'attrs_inline']

autodoc_pydantic_model_show_json = False
autodoc_pydantic_field_list_validators = False
autodoc_pydantic_config_members = False
@@ -59,3 +61,4 @@
html_title = 'Lilac'
html_static_path = ['_static']
html_css_files = ['styles/custom.css']
html_js_files = ['custom.js']
8 changes: 2 additions & 6 deletions docs/firebase.json
@@ -1,10 +1,6 @@
{
"hosting": {
"public": "_build",
"ignore": [
"firebase.json",
"**/.*",
"**/node_modules/**"
]
"public": "_build/html",
"ignore": ["firebase.json", "**/.*", "**/node_modules/**"]
}
}
99 changes: 90 additions & 9 deletions docs/getting_started/quickstart.md
@@ -12,16 +12,97 @@ lilac start

This should open a browser tab pointing to `http://localhost:5432`.

## Adding a dataset
## Overview

Let's load a small dataset with movie descriptions.
Click the "Add dataset" button on the Getting Started page. Choose the `csv` loader and paste in
the following URL in the Filepaths section:
In this quick start we're going to:

```
https://storage.googleapis.com/lilac-data-us-east1/datasets/csv_datasets/the_movies_dataset/the_movies_dataset.csv
```
- Load [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca), a popular instruction dataset
for tuning LLMs.
- Find PII (emails, etc.)
- Find profanity in the responses (using powerful text embeddings)
- Download the enriched dataset as a JSON file so we can clean it in a Python notebook

## Add a dataset

Let's load [OpenOrca](https://huggingface.co/datasets/Open-Orca/OpenOrca), a popular instruction
dataset used for tuning LLMs.

Click the `Add dataset` button on the Getting Started page and add the dataset info as shown below.
While Lilac can scale to millions of rows on a single machine, we sample 100,000 rows here so we can
get started quickly.

<video loop muted autoplay controls src="../_static/getting_started/orca-load.mp4"></video>
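
If you prefer to script this step, a rough sketch with the Python API follows. The
`HuggingFaceSource` loader and its `sample_size` option are assumptions about the loader's
configuration and may differ across Lilac versions.

```python
import lilac as ll

# Sketch only: load a 100,000-row sample of OpenOrca from Hugging Face.
# The source class and parameter names here are assumptions.
ll.create_dataset(
  ll.DatasetConfig(
    namespace='local',
    name='open-orca-100k',
    source=ll.HuggingFaceSource(
      dataset_name='Open-Orca/OpenOrca', sample_size=100_000)))
```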

## Configure

When we load a dataset, Lilac creates a default UI configuration, inferring which fields are _media_
(e.g. unstructured documents), and which are _metadata_ fields. The two types of fields are
presented differently in the UI.

Let's edit the configuration by clicking the `Dataset settings` button in the top-right corner. If
your media field contains markdown, you can enable markdown rendering.

<video loop muted autoplay controls src="../_static/getting_started/orca-settings.mp4"></video>
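
The same settings can also be applied from code. This is a sketch; the `DatasetSettings` and
`DatasetUISettings` names and fields are assumptions that may not match every Lilac version.

```python
import lilac as ll

ds = ll.get_dataset('local', 'open-orca-100k')

# Sketch: mark `question` and `response` as media fields and render the
# response as markdown. Settings class and field names are assumptions.
ds.update_settings(
  ll.DatasetSettings(
    ui=ll.DatasetUISettings(
      media_paths=[('question',), ('response',)],
      markdown_paths=[('response',)])))
```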

## Enrich

Lilac can enrich your media fields with additional metadata by:

- Running a [signal](../signals/signals.md) (e.g. PII detection, language detection, text
statistics, etc.)
- Running a [concept](../concepts/concepts.md) (e.g. profanity, sentiment, etc. or a custom concept
that you create)

### PII detection

Let's run the PII detection signal on both the `question` and the `response` fields to see if there
is any PII, like emails or secret tokens.

<video loop muted autoplay controls src="../_static/getting_started/orca-pii-enrichment.mp4"></video>
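
The equivalent Python calls are sketched below; `PIISignal` is assumed to be exported at the top
level, and the exact import path may vary by version.

```python
import lilac as ll

ds = ll.get_dataset('local', 'open-orca-100k')

# Run PII detection over both media fields. Each call adds an enriched
# sub-field (e.g. detected email spans) under the source field.
ds.compute_signal(ll.PIISignal(), 'question')
ds.compute_signal(ll.PIISignal(), 'response')
```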

Once it's done, we can see that both the `question` and the `response` fields have emails present.
We can click on an email to apply a filter and see all the rows that contain that email.

<video loop muted autoplay controls src="../_static/getting_started/orca-pii-filter.mp4"></video>

We notice that the selected email in the `response` field was not hallucinated by the LLM, because
it also appears in the `question` field. Later, we can use the enriched metadata on both fields to
isolate responses that contain hallucinated emails.
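
That filter can be expressed in code with the `EXISTS` unary op that appears in this commit's
tests. The PII output path (`response.pii.emails`) is an assumption about the signal's schema.

```python
import lilac as ll

ds = ll.get_dataset('local', 'open-orca-100k')

# Sketch: select rows where PII detection found at least one email span
# in the response. The 'pii.emails' output path is an assumption.
emails_exist = (('response', 'pii', 'emails', '*'), 'exists')
for row in ds.select_rows(['question', 'response'], filters=[emails_exist]):
  print(row)
```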

### Profanity detection

Let's also run the profanity concept on the `response` field to see if the LLM produced any profane
content. To see the results, we need to _index_ the `response` field using a text embedding. We only
need to index once. For a fast on-device embedding, we recommend the
[GTE-Small embedding](https://huggingface.co/thenlper/gte-small).

<video loop muted autoplay controls src="../_static/getting_started/orca-index-response.mp4"></video>
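
In code, the one-time indexing step is a single call (a sketch; the embedding id `gte-small`
follows the name shown in the UI):

```python
import lilac as ll

ds = ll.get_dataset('local', 'open-orca-100k')

# Index the response field with the on-device GTE-Small embedding. This
# one-time step enables semantic and concept search over the field.
ds.compute_embedding('gte-small', 'response')
```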

It takes ~20 minutes to index the 100,000 responses on a MacBook M1. Now that the field is indexed,
we can do _semantic search_ and _concept search_ on it (in addition to the usual _keyword search_).

Let's search by the profanity concept and see if the LLM produced any profane content. Results in
the video are blurred due to sensitive content.

Concepts run in _preview_ mode by default, where we compute concept scores only for the top K
results. To compute the concept score over the entire dataset, we click the blue `Compute signal`
button next to `lilac/profanity/gte-small` in the schema.

<video loop muted autoplay controls src="../_static/getting_started/orca-profanity-preview.mp4"></video>
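
Computing the concept over the whole dataset can also be scripted. A minimal sketch, assuming a
`compute_concept` method keyed by concept namespace, name, and embedding:

```python
import lilac as ll

ds = ll.get_dataset('local', 'open-orca-100k')

# Sketch: score every response against the lilac/profanity concept using
# the gte-small index. Method name and argument order are assumptions.
ds.compute_concept('lilac', 'profanity', 'gte-small', path='response')
```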

Computing the concept takes ~20 seconds on a MacBook M1 laptop. Now that the concept is computed, we
can open the statistics panel to see the distribution of concept scores.

<video loop muted autoplay controls src="../_static/getting_started/orca-profanity-stats.mp4"></video>

## Download

Now that we've enriched the dataset, let's download it by clicking the `Download data` button in the
top-right corner. This downloads a JSON file with the same name as the dataset. Once we have the
data, we can continue working with it in a Python notebook, or any other language.

Click the "Add" button on the bottom on start a background job to load the dataset.
For other formats (csv, parquet, pandas, etc.) see the
[Download section](quickstart_python.md#download) in [Quick Start (Python)](quickstart_python.md).

<video loop muted autoplay controls src="../_static/getting_started/add-dataset.mp4"></video>
<video loop muted autoplay controls src="../_static/getting_started/orca-download.mp4"></video>
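
Back in a notebook, the downloaded file can be loaded with pandas. A minimal sketch, assuming the
export is a JSON array named after the dataset:

```python
import pandas as pd

# The file name mirrors the dataset name; adjust the path to wherever the
# browser saved the download. If the export is newline-delimited JSON,
# pass lines=True instead.
df = pd.read_json('open-orca-100k.json')
print(len(df), 'rows')
print(df.columns.tolist())
```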
11 changes: 11 additions & 0 deletions docs/getting_started/quickstart_python.md
@@ -0,0 +1,11 @@
# Quick Start (Python)

```{tip}
Make sure you've followed the [installation](installation.md) steps first.
```

[TODO: Write]

## Download

[TODO: Write]
1 change: 1 addition & 0 deletions docs/index.rst
@@ -12,6 +12,7 @@ Welcome to Lilac

getting_started/installation.md
getting_started/quickstart.md
getting_started/quickstart_python.md

.. toctree::
:caption: Concepts
2 changes: 1 addition & 1 deletion lilac/data/dataset_compute_signal_chain_test.py
@@ -205,7 +205,7 @@ def test_entity_on_split_signal(make_test_data: TestDataMaker) -> None:
dataset.compute_signal(TestSplitter(), 'text')
dataset.compute_signal(entity, ('text', 'test_splitter', '*'))

result = dataset.select_rows(['text'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'text'], combine_columns=True)
assert list(result) == [{
UUID_COLUMN: '1',
'text': enriched_item(
18 changes: 9 additions & 9 deletions lilac/data/dataset_compute_signal_test.py
@@ -226,7 +226,7 @@ def test_sparse_signal(make_test_data: TestDataMaker) -> None:

dataset.compute_signal(TestSparseSignal(), 'text')

result = dataset.select_rows(['text'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'text'], combine_columns=True)
assert list(result) == [{
UUID_COLUMN: '1',
'text': enriched_item('hello', {'test_sparse_signal': None})
@@ -247,7 +247,7 @@ def test_sparse_rich_signal(make_test_data: TestDataMaker) -> None:

dataset.compute_signal(TestSparseRichSignal(), 'text')

result = dataset.select_rows(['text'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'text'], combine_columns=True)
assert list(result) == [{
UUID_COLUMN: '1',
'text': enriched_item('hello', {'test_sparse_rich_signal': None})
@@ -299,7 +299,7 @@ def test_source_joined_with_signal(make_test_data: TestDataMaker) -> None:
}),
num_items=3)

result = dataset.select_rows(['str'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'str'], combine_columns=True)
assert list(result) == [{
UUID_COLUMN: '1',
'str': enriched_item('a', {'test_signal': {
@@ -321,7 +321,7 @@ def test_source_joined_with_signal(make_test_data: TestDataMaker) -> None:
}]

# Select a specific signal leaf test_signal.flen with 'str'.
result = dataset.select_rows(['str', ('str', 'test_signal', 'flen')])
result = dataset.select_rows([UUID_COLUMN, 'str', ('str', 'test_signal', 'flen')])

assert list(result) == [{
UUID_COLUMN: '1',
@@ -339,7 +339,7 @@ def test_source_joined_with_signal(make_test_data: TestDataMaker) -> None:

# Select multiple signal leafs with aliasing.
result = dataset.select_rows([
'str',
UUID_COLUMN, 'str',
Column(('str', 'test_signal', 'flen'), alias='flen'),
Column(('str', 'test_signal', 'len'), alias='len')
])
@@ -389,7 +389,7 @@ def test_parameterized_signal(make_test_data: TestDataMaker) -> None:
}),
num_items=2)

result = dataset.select_rows(['text'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'text'], combine_columns=True)
assert list(result) == [{
UUID_COLUMN: '1',
'text': enriched_item('hello', {
@@ -427,7 +427,7 @@ def test_split_signal(make_test_data: TestDataMaker) -> None:
}),
num_items=2)

result = dataset.select_rows(['text'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'text'], combine_columns=True)
expected_result = [{
UUID_COLUMN: '1',
'text': enriched_item('[1, 1] first sentence. [1, 1] second sentence.',
@@ -475,7 +475,7 @@ def test_signal_on_repeated_field(make_test_data: TestDataMaker) -> None:
}),
num_items=2)

result = dataset.select_rows([('text', '*')], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, ('text', '*')], combine_columns=True)

assert list(result) == [{
UUID_COLUMN: '1',
@@ -515,7 +515,7 @@ def test_text_splitter(make_test_data: TestDataMaker) -> None:

dataset.compute_signal(TestSplitSignal(), 'text')

result = dataset.select_rows(['text'], combine_columns=True)
result = dataset.select_rows([UUID_COLUMN, 'text'], combine_columns=True)
expected_result = [{
UUID_COLUMN: '1',
'text': enriched_item('[1, 1] first sentence. [1, 1] second sentence.',
10 changes: 2 additions & 8 deletions lilac/data/dataset_duckdb.py
@@ -369,7 +369,7 @@ def compute_signal(self,
manifest = self.manifest()

signal_col = Column(path=source_path, alias='value', signal_udf=signal)
select_rows_result = self.select_rows([signal_col],
select_rows_result = self.select_rows([UUID_COLUMN, signal_col],
task_step_id=task_step_id,
resolve_span=True)
df = select_rows_result.df()
@@ -420,7 +420,7 @@ def compute_embedding(self,

signal = get_signal_by_type(embedding, TextEmbeddingSignal)()
signal_col = Column(path=source_path, alias='value', signal_udf=signal)
select_rows_result = self.select_rows([signal_col],
select_rows_result = self.select_rows([UUID_COLUMN, signal_col],
task_step_id=task_step_id,
resolve_span=True)
df = select_rows_result.df()
@@ -811,12 +811,6 @@ def select_rows(self,
manifest = self.manifest()
cols = self._normalize_columns(columns, manifest.data_schema)
offset = offset or 0

# Always return the UUID column.
col_paths = [col.path for col in cols]
if (UUID_COLUMN,) not in col_paths:
cols.append(column_from_identifier(UUID_COLUMN))

schema = manifest.data_schema

if combine_columns:
6 changes: 3 additions & 3 deletions lilac/data/dataset_select_rows_filter_test.py
@@ -185,15 +185,15 @@ def test_filter_by_exists(make_test_data: TestDataMaker) -> None:
}))

exists_filter = ('name', UnaryOp.EXISTS)
result = dataset.select_rows(['name'], filters=[exists_filter])
result = dataset.select_rows([UUID_COLUMN, 'name'], filters=[exists_filter])
assert list(result) == [{UUID_COLUMN: '1', 'name': 'A'}, {UUID_COLUMN: '3', 'name': 'C'}]

exists_filter = ('info.lang', UnaryOp.EXISTS)
result = dataset.select_rows(['name'], filters=[exists_filter])
result = dataset.select_rows([UUID_COLUMN, 'name'], filters=[exists_filter])
assert list(result) == [{UUID_COLUMN: '1', 'name': 'A'}, {UUID_COLUMN: '2', 'name': None}]

exists_filter = ('ages.*.*', UnaryOp.EXISTS)
result = dataset.select_rows(['name'], filters=[exists_filter])
result = dataset.select_rows([UUID_COLUMN, 'name'], filters=[exists_filter])
assert list(result) == [{UUID_COLUMN: '3', 'name': 'C'}]

with pytest.raises(ValueError, match='Unable to filter on path'):
8 changes: 4 additions & 4 deletions lilac/data/dataset_select_rows_sort_test.py
@@ -186,7 +186,7 @@ def test_sort_by_signal_alias_no_repeated(make_test_data: TestDataMaker) -> None
# Sort by `signal.len`.
signal_alias = Column('text.test_signal', alias='signal')
result = dataset.select_rows(
columns=[signal_alias], sort_by=['signal.len'], sort_order=SortOrder.ASC)
columns=[UUID_COLUMN, signal_alias], sort_by=['signal.len'], sort_order=SortOrder.ASC)
assert list(result) == [{
UUID_COLUMN: '3',
'signal': {
@@ -207,7 +207,7 @@
}
}]
result = dataset.select_rows(
columns=[signal_alias], sort_by=['signal.len'], sort_order=SortOrder.DESC)
columns=[UUID_COLUMN, signal_alias], sort_by=['signal.len'], sort_order=SortOrder.DESC)
assert list(result) == [{
UUID_COLUMN: '2',
'signal': {
@@ -246,7 +246,7 @@ def test_sort_by_enriched_alias_no_repeated(make_test_data: TestDataMaker) -> No
# Sort by `document.test_signal.is_all_cap` where 'document' is an alias to 'text'.
text_alias = Column('text', alias='document')
result = dataset.select_rows(
columns=[text_alias],
columns=[UUID_COLUMN, text_alias],
sort_by=['document.test_signal.is_all_cap'],
sort_order=SortOrder.ASC,
combine_columns=True)
@@ -271,7 +271,7 @@
}]

result = dataset.select_rows(
columns=[text_alias],
columns=[UUID_COLUMN, text_alias],
sort_by=['document.test_signal.is_all_cap'],
sort_order=SortOrder.DESC,
combine_columns=True)
