diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4170c99..512b641 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -113,10 +113,14 @@ jobs:
         uses: actions/checkout@v4
         with:
           ref: "main"
+      - name: Install dependencies
+        run: |
+          sudo apt update && 
+          sudo apt install -y doxygen graphviz dia git && 
+          pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 && 
+          npm install -g jsdoc
       - name: Setup GitHub Pages
         uses: actions/configure-pages@v2
-      - name: Install dependencies
-        run: sudo apt update && sudo apt install -y doxygen graphviz dia git && pip install sphinx==7.1.2 breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery toml
       - name: Install UForm from PyPi
         run: pip install uform
       - name: Build documentation
diff --git a/.gitignore b/.gitignore
index af7d4af..1bbdc30 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,21 @@ test
 build/
 package-lock.json
 *.egg-info
-*.onnx
 __pycache__
 .build
-.swiftpm
\ No newline at end of file
+.swiftpm
+.hf_token
+
+dictionary*
+vocab*
+
+# Tensors & ML Model
+*.onnx
+*.pt
+*.safetensors
+*.mlpackage
+
+# NodeJS
+node_modules
+node_build
+yarn-error.log
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 59eb78c..92a1844 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -5,11 +5,29 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "Python Debugger: Current File with Arguments",
+            "name": "Python Debugger",
             "type": "debugpy",
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
+        },
+        {
+            "name": "PyTest Debugger",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "pytest",
+            "console": "integratedTerminal",
+            "args": [
+                "${file}",
+                "-s",
+                "-x",
+            ],
+        },
+        {
+            "name": "NodeJS Debugger",
+            "type": "node-terminal",
+            "request": "launch",
+            "command": "npm run test",
         }
     ]
 }
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a6cceb8..3275f93 100755
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,8 +1,10 @@
 {
     "cSpell.words": [
         "arange",
+        "astype",
         "CFURL",
         "coreml",
+        "crossattn",
         "cumsum",
         "dtype",
         "embs",
@@ -19,26 +21,37 @@
         "ndarray",
         "numpy",
         "ONNX",
+        "onnxconverter",
         "onnxruntime",
+        "opset",
         "packbits",
         "preprocess",
         "pretrained",
         "probs",
         "pypi",
+        "pytest",
+        "randn",
         "rerank",
         "reranker",
         "reranking",
+        "sandbeach",
         "sess",
         "SIMD",
         "softmax",
+        "Tensorrt",
+        "torchvision",
         "transfromers",
         "uform",
         "unimodal",
         "unsqueeze",
-        "Vardanian"
+        "Vardanian",
+        "whitespaces"
     ],
     "[python]": {
         "editor.defaultFormatter": "ms-python.black-formatter"
     },
-    "python.formatting.provider": "none"
+    "python.formatting.provider": "none",
+    "window.autoDetectColorScheme": true,
+    "workbench.colorTheme": "Default Dark+",
+    "workbench.preferredDarkColorTheme": "Default Dark+"
 }
\ No newline at end of file
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 0000000..07ff0bb
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,182 @@
+# UForm Model Benchmarks
+
+## Accuracy
+
+### Embedding Models
+
+Few retrieval benchmarks exist for multimodal embeddings.
+The most famous ones for English are "MS-COCO" and "Flickr30k".
+Evaluating the `uform-vl-english` model, one can expect the following numbers for search quality.
+
+| Dataset   | Recall @ 1 | Recall @ 5 | Recall @ 10 |
+| :-------- | ---------: | ---------: | ----------: |
+| Flickr    |      0.727 |      0.915 |       0.949 |
+| MS-COCO ¹ |      0.510 |      0.761 |       0.838 |
+
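+For reference, Recall@K is the fraction of queries whose ground-truth match appears among the top-K retrieved items.
+Below is a minimal NumPy sketch of the metric, assuming a square similarity matrix where query `i` matches gallery item `i` — not the exact evaluation script:
+
+```py
+import numpy as np
+
+def recall_at_k(similarities: np.ndarray, k: int) -> float:
+    # similarities[i, j] is the similarity between query i and gallery item j
+    ranking = (-similarities).argsort(axis=1)  # best matches first
+    top_k = ranking[:, :k]                     # indices of the top-K candidates per query
+    hits = (top_k == np.arange(len(similarities))[:, None]).any(axis=1)
+    return float(hits.mean())
+```
+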
+For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
+Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
+
+| Language  | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
+| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
+| English 🇺🇸 |     __37.8__ |      37.7 |         63.5 |  __65.0__ |          73.5 |   __75.9__ |  1'452 M |
+| Chinese 🇨🇳 |         27.3 |  __32.2__ |         51.3 |  __59.0__ |          62.1 |   __70.5__ |  1'118 M |
+| Hindi 🇮🇳   |         20.7 |  __31.3__ |         42.5 |  __57.9__ |          53.7 |   __69.6__ |    602 M |
+| Spanish 🇪🇸 |         32.6 |  __35.6__ |         58.0 |  __62.8__ |          68.8 |   __73.7__ |    548 M |
+| Arabic 🇸🇦  |         22.7 |  __31.7__ |         44.9 |  __57.8__ |          55.8 |   __69.2__ |    274 M |
+| French 🇫🇷  |         31.3 |  __35.4__ |         56.5 |  __62.6__ |          67.4 |   __73.3__ |    274 M |
+
+
+All languages:
+
+| Language             | OpenCLIP @ 1 |    UForm @ 1 | OpenCLIP @ 5 |    UForm @ 5 | OpenCLIP @ 10 |   UForm @ 10 | Speakers |
+| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
+| Arabic 🇸🇦             |         22.7 |     __31.7__ |         44.9 |     __57.8__ |          55.8 |     __69.2__ |    274 M |
+| Armenian 🇦🇲           |          5.6 |     __22.0__ |         14.3 |     __44.7__ |          20.2 |     __56.0__ |      4 M |
+| Chinese 🇨🇳            |         27.3 |     __32.2__ |         51.3 |     __59.0__ |          62.1 |     __70.5__ |  1'118 M |
+| English 🇺🇸            |     __37.8__ |         37.7 |         63.5 |     __65.0__ |          73.5 |     __75.9__ |  1'452 M |
+| French 🇫🇷             |         31.3 |     __35.4__ |         56.5 |     __62.6__ |          67.4 |     __73.3__ |    274 M |
+| German 🇩🇪             |         31.7 |     __35.1__ |         56.9 |     __62.2__ |          67.4 |     __73.3__ |    134 M |
+| Hebrew 🇮🇱             |         23.7 |     __26.7__ |         46.3 |     __51.8__ |          57.0 |     __63.5__ |      9 M |
+| Hindi 🇮🇳              |         20.7 |     __31.3__ |         42.5 |     __57.9__ |          53.7 |     __69.6__ |    602 M |
+| Indonesian 🇮🇩         |         26.9 |     __30.7__ |         51.4 |     __57.0__ |          62.7 |     __68.6__ |    199 M |
+| Italian 🇮🇹            |         31.3 |     __34.9__ |         56.7 |     __62.1__ |          67.1 |     __73.1__ |     67 M |
+| Japanese 🇯🇵           |         27.4 |     __32.6__ |         51.5 |     __59.2__ |          62.6 |     __70.6__ |    125 M |
+| Korean 🇰🇷             |         24.4 |     __31.5__ |         48.1 |     __57.8__ |          59.2 |     __69.2__ |     81 M |
+| Persian 🇮🇷            |         24.0 |     __28.8__ |         47.0 |     __54.6__ |          57.8 |     __66.2__ |     77 M |
+| Polish 🇵🇱             |         29.2 |     __33.6__ |         53.9 |     __60.1__ |          64.7 |     __71.3__ |     41 M |
+| Portuguese 🇵🇹         |         31.6 |     __32.7__ |         57.1 |     __59.6__ |          67.9 |     __71.0__ |    257 M |
+| Russian 🇷🇺            |         29.9 |     __33.9__ |         54.8 |     __60.9__ |          65.8 |     __72.0__ |    258 M |
+| Spanish 🇪🇸            |         32.6 |     __35.6__ |         58.0 |     __62.8__ |          68.8 |     __73.7__ |    548 M |
+| Thai 🇹🇭               |         21.5 |     __28.7__ |         43.0 |     __54.6__ |          53.7 |     __66.0__ |     61 M |
+| Turkish 🇹🇷            |         25.5 |     __33.0__ |         49.1 |     __59.6__ |          60.3 |     __70.8__ |     88 M |
+| Ukrainian 🇺🇦          |         26.0 |     __30.6__ |         49.9 |     __56.7__ |          60.9 |     __68.1__ |     41 M |
+| Vietnamese 🇻🇳         |         25.4 |     __28.3__ |         49.2 |     __53.9__ |          60.3 |     __65.5__ |     85 M |
+|                      |              |              |              |              |               |              |          |
+| Mean                 |     26.5±6.4 | __31.8±3.5__ |     49.8±9.8 | __58.1±4.5__ |     60.4±10.6 | __69.4±4.3__ |        - |
+| Google Translate     |     27.4±6.3 | __31.5±3.5__ |     51.1±9.5 | __57.8±4.4__ |     61.7±10.3 | __69.1±4.3__ |        - |
+| Microsoft Translator |     27.2±6.4 | __31.4±3.6__ |     50.8±9.8 | __57.7±4.7__ |     61.4±10.6 | __68.9±4.6__ |        - |
+| Meta NLLB            |     24.9±6.7 | __32.4±3.5__ |    47.5±10.3 | __58.9±4.5__ |     58.2±11.2 | __70.2±4.3__ |        - |
+
+### Generative Models
+
+| Model                | LLM Size |  SQA |    MME | MMBench | Average¹ |
+| :------------------- | -------: | ---: | -----: | ------: | -------: |
+| UForm-Gen2-Qwen-500m |     0.5B | 45.5 |  880.1 |    42.0 |    29.31 |
+| MobileVLM v2         |     1.4B | 52.1 | 1302.8 |    57.7 |    36.81 |
+| LLaVA-Phi            |     2.7B | 68.4 | 1335.1 |    59.8 |    42.95 |
+
+For captioning evaluation we measure CLIPScore and RefCLIPScore³.
+
+| Model                               | Size | Caption Length | CLIPScore | RefCLIPScore |
+| :---------------------------------- | ---: | -------------: | --------: | -----------: |
+| `llava-hf/llava-1.5-7b-hf`          |   7B |           Long |     0.878 |        0.529 |
+| `llava-hf/llava-1.5-7b-hf`          |   7B |          Short |     0.886 |        0.531 |
+|                                     |      |                |           |              |
+| `Salesforce/instructblip-vicuna-7b` |   7B |           Long |     0.902 |        0.534 |
+| `Salesforce/instructblip-vicuna-7b` |   7B |          Short |     0.848 |        0.523 |
+|                                     |      |                |           |              |
+| `unum-cloud/uform-gen`              | 1.5B |           Long |     0.847 |        0.523 |
+| `unum-cloud/uform-gen`              | 1.5B |          Short |     0.842 |        0.522 |
+|                                     |      |                |           |              |
+| `unum-cloud/uform-gen-chat`         | 1.5B |           Long |     0.860 |        0.525 |
+| `unum-cloud/uform-gen-chat`         | 1.5B |          Short |     0.858 |        0.525 |
+
+Results for VQAv2 evaluation.
+
+| Model                      | Size | Accuracy |
+| :------------------------- | ---: | -------: |
+| `llava-hf/llava-1.5-7b-hf` |   7B |     78.5 |
+| `unum-cloud/uform-gen`     | 1.5B |     66.5 |
+
+<br/>
+
+> ¹ Train split was in training data. <br/>
+> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section. <br/>
+> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
+
+## Speed
+
+### Embedding Models
+
+UForm comes pre-packaged with speed benchmarks for the models.
+    
+```bash
+$ python python/scripts/bench_encoders.py --help
+usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
+
+options:
+  -h, --help            show this help message and exit
+  --filter-out FILTER_OUT
+                        Filter out models, backends, or devices with a Regular Expression.
+  --batch-size BATCH_SIZE
+                        Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+```
+
+Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU, one can expect the following numbers.
+
+| Model Name                                     | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s |
+| :--------------------------------------------- | :----- | :------ | --------------------: | ---------------: | -------------------: | --------------: |
+| unum-cloud/uform3-image-text-english-base      | cpu    | torch   |                 23.03 | 76.57            | 15,978.03            | 562.28          |
+| unum-cloud/uform3-image-text-english-base      | cpu    | onnx    |                 23.11 | 77.75            | 13,880.27            | 1,067.40        |
+| unum-cloud/uform3-image-text-english-base      | cuda   | torch   |                 22.87 | 1,060.40         | 12,348.94            | 13,242.83       |
+| unum-cloud/uform3-image-text-english-large     | cpu    | torch   |                 22.41 | 10.84            | 13,350.45            | 145.12          |
+| unum-cloud/uform3-image-text-english-large     | cpu    | onnx    |                 23.13 | 19.60            | 18,031.85            | 960.09          |
+| unum-cloud/uform3-image-text-english-large     | cuda   | torch   |                 22.78 | 244.86           | 13,226.40            | 10,204.04       |
+| unum-cloud/uform3-image-text-english-small     | cpu    | torch   |                 20.08 | 71.68            | 12,147.05            | 249.63          |
+| unum-cloud/uform3-image-text-english-small     | cpu    | onnx    |                 22.84 | 195.27           | 13,636.99            | 1,385.25        |
+| unum-cloud/uform3-image-text-english-small     | cuda   | torch   |                 22.63 | 2,662.16         | 14,731.18            | 14,694.87       |
+| unum-cloud/uform3-image-text-multilingual-base | cpu    | torch   |                 22.98 | 64.28            | 10,129.27            | 209.76          |
+| unum-cloud/uform3-image-text-multilingual-base | cpu    | onnx    |                 23.06 | 66.81            | 8,963.13             | 1,104.32        |
+| unum-cloud/uform3-image-text-multilingual-base | cuda   | torch   |                 22.88 | 1,051.95         | 15,639.72            | 12,416.12       |
+
+If you are interested in performance numbers on consumer-grade hardware compared to third-party models, here are some rough estimates.
+On Nvidia RTX 3090:
+
+| Model                                            | Multilingual |                  Speed |    Speedup |
+| :----------------------------------------------- | -----------: | ---------------------: | ---------: |
+| `bert-base-uncased`                              |           No | 1'612 sequences/second |            |
+| `distilbert-base-uncased`                        |           No | 3'174 sequences/second |     x 1.96 |
+| `sentence-transformers/all-MiniLM-L12-v2`        |      __Yes__ | 3'604 sequences/second |     x 2.24 |
+| `unum-cloud/uform3-image-text-multilingual-base` |      __Yes__ | 6'809 sequences/second | __x 4.22__ |
+
+Given the small size of the model, it also works well on mobile devices.
+On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
+
+| Device                 |               Speed | Device TDP |        Efficiency |
+| :--------------------- | ------------------: | ---------: | ----------------: |
+| Nvidia RTX 3090        | ~ 140 tokens/second |     < 350W | 0.40 tokens/joule |
+| Apple M2 Pro unplugged |  ~ 19 tokens/second |      < 20W | 0.95 tokens/joule |
+| Apple M2 Max unplugged |  ~ 38 tokens/second |      < 36W | 1.06 tokens/joule |
+| Apple M2 Max plugged   |  ~ 56 tokens/second |      < 89W | 0.63 tokens/joule |
+
+### Generative Models
+
+```bash
+$ python python/scripts/bench_decoders.py --help
+usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
+
+options:
+  -h, --help            show this help message and exit
+  --batch-size BATCH_SIZE
+                        Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+  --max-length MAX_LENGTH
+                        Maximum length of the generated text in tokens.
+```
+
+On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model                               |  Size | Decoding Speed |    Decoding Parallel Streams |
+| :---------------------------------- | ----: | -------------: | ---------------------------: |
+| `llava-hf/llava-1.5-7b-hf`          |   7 B | ~ 141 tokens/s |  ~ 4 K tokens/s (32 streams) |
+| `Salesforce/instructblip-vicuna-7b` |   7 B | ~ 211 tokens/s |  ~ 2 K tokens/s (32 streams) |
+| `unum-cloud/uform-gen`              | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) |
+| `unum-cloud/uform-gen2-dpo`         | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) |
+
+On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model                               |  Size | Decoding Speed |   Speedup |
+| :---------------------------------- | ----: | -------------: | --------: |
+| `llava-hf/llava-1.5-7b-hf`          |   7 B |  ~ 40 tokens/s |           |
+| `Salesforce/instructblip-vicuna-7b` |   7 B |  ~ 40 tokens/s |           |
+| `unum-cloud/uform-gen`              | 1.5 B | ~ 140 tokens/s | __x 3.5__ |
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 181d9e2..65e0b26 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -7,12 +7,11 @@ We welcome contributions to UForm!
 Before submitting any changes, please make sure that the tests pass.
 
 ```sh
-pip install -e .                # For core dependencies
-
+pip install -e ".[dev]"         # For development dependencies
 pip install -e ".[torch]"       # For PyTorch
 pip install -e ".[onnx]"        # For ONNX on CPU
 pip install -e ".[onnx-gpu]"    # For ONNX on GPU, available for some platforms
-pip install -e ".[torch,onnx]"  # For PyTorch and ONNX Python tests
+pip install -e ".[torch,onnx,onnx-gpu,dev]"  # For all
 
 pytest python/scripts/ -s -x -Wd -v
 pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch
@@ -20,6 +19,13 @@ pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loa
 
 ## Swift
 
+To build and test the Swift package, use the following commands:
+
+```bash
+swift build
+swift test
+```
+
 Swift formatting is enforced with `swift-format` default utility from Apple.
 To install and run it on all the files in the project, use the following command:
 
@@ -30,3 +36,31 @@ swift-format . -i -r
 
 The style is controlled by the `.swift-format` JSON file in the root of the repository.
 As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings.
+
+## JavaScript
+
+For rapid development you can avoid the TypeScript precompilation step:
+
+```sh
+npm install -g ts-node
+ts-node javascript/embeddings.mts
+```
+
+Before submitting any changes, please make sure that the tests pass.
+
+```sh
+npm install
+npm run test
+```
+
+## Benchmarking
+
+If you want to double-check how fast the models run on your hardware, you can clone the repository and repeat the benchmarks locally.
+The following command excludes the PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU.
+
+```sh
+git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository
+cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies
+python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large"
+```
+
diff --git a/Package.resolved b/Package.resolved
index fe63c94..6e3b1f7 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -14,7 +14,7 @@
       "kind" : "remoteSourceControl",
       "location" : "https://github.com/ashvardanian/swift-transformers",
       "state" : {
-        "revision" : "9ef46a51eca46978b62773f8887926dfe72b0ab4"
+        "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
       }
     }
   ],
diff --git a/Package.swift b/Package.swift
index 6ac8372..c2f7fe7 100644
--- a/Package.swift
+++ b/Package.swift
@@ -19,7 +19,7 @@ let package = Package(
     dependencies: [
         .package(
             url: "https://github.com/ashvardanian/swift-transformers",
-            revision: "9ef46a51eca46978b62773f8887926dfe72b0ab4"
+            revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
         )
     ],
     targets: [
@@ -29,13 +29,13 @@ let package = Package(
                 .product(name: "Transformers", package: "swift-transformers")
             ],
             path: "swift",
-            exclude: ["EmbeddingsTests.swift"]
+            exclude: ["EncodersTests.swift"]
         ),
         .testTarget(
             name: "UFormTests",
             dependencies: ["UForm"],
             path: "swift",
-            sources: ["EmbeddingsTests.swift"]
+            sources: ["EncodersTests.swift"]
         ),
     ]
 )
diff --git a/README.md b/README.md
index 031c484..8484b0f 100755
--- a/README.md
+++ b/README.md
@@ -20,18 +20,24 @@ For Content Understanding and Generation<br/>
 <p align="center">
 Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
 <br/>
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
 <br/>
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+<br/>
+<a href="https://github.com/unum-cloud/uform/blob/main/python/README.md">Python</a>
+ • 
+<a href="https://github.com/unum-cloud/uform/blob/main/javascript/README.md">JavaScript</a>
+ • 
+<a href="https://github.com/unum-cloud/uform/blob/main/swift/README.md">Swift</a>
 </p>
 
 ---
 
-![](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true)
+![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true)
 
 Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient.
 UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages.
-UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are also capable of image captioning and Visual Question Answering (VQA).
+UForm [small generative models](#decoder), on the other hand, not only support conversational and chat use-cases, but are also great for fast image captioning and Visual Question Answering (VQA).
 With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone.
 
 ## Features
@@ -40,108 +46,167 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr
 - __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors.
 - __Portable__: Models come with native ONNX support, making them easy to deploy on any platform.
 - __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall.
-- __Multilingual__: Trained on a balanced dataset, the recall is great across over [20 languages](#evaluation).
+- __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages.
 
 [usearch]: https://github.com/unum-cloud/usearch
 [matryoshka]: https://arxiv.org/abs/2205.13147
 
 ## Models
 
-### Embedding Models
+For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md).
 
-| Model                                    | Parameters | Languages |                                 Architecture |
-| :--------------------------------------- | ---------: | --------: | -------------------------------------------: |
-| [`uform-vl-english-large`][model-e-l] 🆕  |       365M |         1 | 6 text layers, ViT-L/14, 6 multimodal layers |
-| [`uform-vl-english`][model-e]            |       143M |         1 | 2 text layers, ViT-B/16, 2 multimodal layers |
-| [`uform-vl-english-small`][model-e-s] 🆕  |        79M |         1 | 2 text layers, ViT-S/16, 2 multimodal layers |
-| [`uform-vl-multilingual-v2`][model-m-v2] |       206M |        21 | 8 text layers, ViT-B/16, 4 multimodal layers |
-| [`uform-vl-multilingual`][model-m]       |       206M |        12 | 8 text layers, ViT-B/16, 4 multimodal layers |
+### Embedding Models
 
-[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/
-[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/
-[model-e-s]: https://huggingface.co/unum-cloud/uform-vl-english-small/
-[model-m]: https://huggingface.co/unum-cloud/uform-vl-multilingual/
-[model-m-v2]: https://huggingface.co/unum-cloud/uform-vl-multilingual-v2/
+<table style="width:100%; border-collapse:collapse;">
+    <thead>
+        <tr>
+            <th>Model</th>
+            <th style="text-align:right;">Parameters</th>
+            <th style="text-align:right;">Languages</th>
+            <th style="text-align:right;">Architecture</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-vl-english-large/">uform3-image-text-english-large</a></code>  🆕</td>
+            <td style="text-align:right;">365 M</td>
+            <td style="text-align:right;">1</td>
+            <td style="text-align:right;">12 layer BERT, ViT-L/14</td>
+        </tr>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-vl-english/">uform3-image-text-english-base</a></code></td>
+            <td style="text-align:right;">143 M</td>
+            <td style="text-align:right;">1</td>
+            <td style="text-align:right;">4 layer BERT, ViT-B/16</td>
+        </tr>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-vl-english-small/">uform3-image-text-english-small</a></code>  🆕</td>
+            <td style="text-align:right;">79 M</td>
+            <td style="text-align:right;">1</td>
+            <td style="text-align:right;">4 layer BERT, ViT-S/16</td>
+        </tr>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-vl-multilingual-v2/">uform3-image-text-multilingual-base</a></code></td>
+            <td style="text-align:right;">206M</td>
+            <td style="text-align:right;">21</td>
+            <td style="text-align:right;">12 layer BERT, ViT-B/16</td>
+        </tr>
+    </tbody>
+</table>
 
 ### Generative Models
 
-| Model                              | Parameters |                     Purpose |           Architecture |
-| :--------------------------------- | ---------: | --------------------------: | ---------------------: |
-| [`uform-gen2-dpo`][model-g2] 🆕     |       1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
-| [`uform-gen2-qwen-500m`][model-g2] |       1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
-| [`uform-gen`][model-g1]            |       1.5B |       Image Captioning, VQA |   llama-1.3B, ViT-B/16 |
+<table style="width:100%; border-collapse:collapse;">
+    <thead>
+        <tr>
+            <th>Model</th>
+            <th style="text-align:right;">Parameters</th>
+            <th style="text-align:right;">Purpose</th>
+            <th style="text-align:right;">Architecture</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-gen2-dpo/">uform-gen2-dpo</a></code>  🆕</td>
+            <td style="text-align:right;">1.2 B</td>
+            <td style="text-align:right;">Chat, Image Captioning, VQA</td>
+            <td style="text-align:right;">qwen1.5-0.5B, ViT-H/14</td>
+        </tr>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/">uform-gen2-qwen-500m</a></code></td>
+            <td style="text-align:right;">1.2 B</td>
+            <td style="text-align:right;">Chat, Image Captioning, VQA</td>
+            <td style="text-align:right;">qwen1.5-0.5B, ViT-H/14</td>
+        </tr>
+        <tr>
+            <td><code><a href="https://huggingface.co/unum-cloud/uform-gen/">uform-gen</a></code> ⚠️</td>
+            <td style="text-align:right;">1.5 B</td>
+            <td style="text-align:right;">Image Captioning, VQA</td>
+            <td style="text-align:right;">llama-1.3B, ViT-B/16</td>
+        </tr>
+    </tbody>
+</table>
+
+## Quick Start Examples
+
+### Embedding Models
 
-[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/
-[model-g1]: https://huggingface.co/unum-cloud/uform-gen/
+First, `pip install uform`.
+Then, load the model:
 
-## Producing Embeddings
+```py
+from uform import get_model, Modality
 
-Add UForm to your dependencies list, or just install it locally:
+processors, models = get_model('unum-cloud/uform3-image-text-english-small')
 
-```bash
-pip install uform
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
 ```
 
-Then, you can use the following code to get embeddings for text and images.
-You can do that either with the PyTorch reference model or the lighter cross-platform ONNX weights.
+Embed images:
 
-```python
-import uform
+```py
+import requests
+from io import BytesIO
 from PIL import Image
 
-# If you want to use the PyTorch model
-model, processor = uform.get_model('unum-cloud/uform-vl-english-large') # Just English
-model, processor = uform.get_model('unum-cloud/uform-vl-multilingual-v2') # 21 Languages
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+image_data = processor_image(image)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
 
-# If you want to use the light-weight portable ONNX model
-# Available combinations: cpu & fp32, gpu & fp32, gpu & fp16
-# Check out Unum's Hugging Face space for more details: https://huggingface.co/unum-cloud
-model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-small', 'cpu', 'fp32')
-model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-large', 'gpu', 'fp16')
+Embed queries:
 
-text = 'a small red panda in a zoo'
-image = Image.open('red_panda.jpg')
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
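+
+To compare the embeddings, you can use cosine similarity. A minimal NumPy sketch, assuming both embeddings come back as 2D arrays (with the Torch backend, call `.detach().cpu().numpy()` first):
+
+```py
+import numpy as np
+
+# L2-normalize, then the row-wise dot product is the cosine similarity in [-1, 1]
+image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1)
+text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1)
+similarity = (image_embedding * text_embedding).sum(axis=1)
+```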
 
-image_data = processor.preprocess_image(image)
-text_data = processor.preprocess_text(text)
+For more details check out:
 
-image_features, image_embedding = model.encode_image(image_data, return_features=True)
-text_features, text_embedding = model.encode_text(text_data, return_features=True)
-```
+- Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models)
+- JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models)
+- Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models)
+
+### Generative Models
 
-To search for similar items, the embeddings can be compared using cosine similarity.
-The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match.
-PyTorch provides a built-in function for calculating cosine similarity, while for ONNX, you can use NumPy.
+The generative models are natively compatible with the Hugging Face Transformers library and can be used without installing the UForm package:
 
 ```python
-import torch.nn.functional as F
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
 
-similarity = F.cosine_similarity(image_embedding, text_embedding)
-```
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
 
-ONNX has no such function, but you can calculate the cosine similarity using [SimSIMD](https://github.com/ashvardanian/simsimd) or manually, with NumPy:
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
 
-```python
-import numpy as np
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
 
-image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1)
-text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1)
-similarity = (image_embedding * text_embedding).sum(axis=1)
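+# Greedy decoding with KV-caching; 151645 is assumed to be the Qwen tokenizer's end-of-turn token <|im_end|>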
+with torch.inference_mode():
+    output = model.generate(
+        **inputs,
+        do_sample=False,
+        use_cache=True,
+        max_new_tokens=256,
+        eos_token_id=151645,
+        pad_token_id=processor.tokenizer.pad_token_id
+    )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
 ```
 
-### Reranking
+For more details check out:
 
-Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list.
-The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match.
+- Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models)
+- JavaScript docs on generative models 🔜
+- Swift docs on generative models 🔜
 
-```python
-score, joint_embedding = model.encode_multimodal(
-    image_features=image_features,
-    text_features=text_features,
-    attention_mask=text_data['attention_mask'],
-    return_scores=True,
-)
-```
+## Technical Details
 
 ### Down-casting, Quantization, Matryoshka, and Slicing
 
@@ -153,7 +218,7 @@ Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is
 ```python
 import numpy as np
 
-f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
 f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
 i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
 b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
@@ -164,7 +229,7 @@ Alternative approach to quantization is to use the Matryoshka embeddings, where
 ```python
 import numpy as np
 
-large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+large_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
 small_embedding: np.ndarray = large_embedding[:, :256]
 tiny_embedding: np.ndarray = large_embedding[:, :64]
 ```
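+
+Note that sliced Matryoshka embeddings are generally no longer unit-length. Below is a minimal sketch of the re-normalization you would typically apply before cosine comparison — an assumption about downstream usage, not part of the API:
+
+```python
+import numpy as np
+
+# Re-normalize the truncated vectors so that dot products remain cosine similarities
+small_embedding = small_embedding / np.linalg.norm(small_embedding, keepdims=True, axis=1)
+tiny_embedding = tiny_embedding / np.linalg.norm(tiny_embedding, keepdims=True, axis=1)
+```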
@@ -219,253 +284,16 @@ You can pick one of many supported [ONNX execution providers][onnx-providers], w
 
 [onnx-providers]: https://onnxruntime.ai/docs/execution-providers/
 
----
-
-The configuration process may include a few additional steps, depending on the environment.
-When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
-
-```sh
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-sudo apt-get update
-sudo apt-get -y install cuda-toolkit-12
-pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
-export CUDA_PATH="/usr/local/cuda-12/bin"
-export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
-export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
-pytest python/scripts/ -s -x -Wd -v -k onnx
-```
-
-[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
-
-## Chat, Image Captioning and Question Answering
-
-UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
-Those models can be used to caption images or power multimodal chat experiences.
-
-```python
-from transformers import AutoModel, AutoProcessor
-
-model = AutoModel.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True)
-processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True)
+### Multimodal Chat in CLI
 
-prompt = 'Question or Instruction'
-image = Image.open('image.jpg')
-
-inputs = processor(text=[prompt], images=[image], return_tensors='pt')
-
-with torch.inference_mode():
-     output = model.generate(
-        **inputs,
-        do_sample=False,
-        use_cache=True,
-        max_new_tokens=256,
-        eos_token_id=151645,
-        pad_token_id=processor.tokenizer.pad_token_id
-    )
-prompt_len = inputs['input_ids'].shape[1]
-decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-```
-
-You can check examples of different prompts in our [demo space](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
-
-
-### Image Captioning and Question Answering
-
-__It is the instruction for the first version of UForm-Gen model. We highly recommend you use the new model, instructions for which you can find above.__
-
-
-The generative model can be used to caption images, summarize their content, or answer questions about them.
-The exact behavior is controlled by prompts.
-
-```python
-from uform.gen_model import VLMForCausalLM, VLMProcessor
-
-model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
-processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
-
-# [cap] Narrate the contents of the image with precision.
-# [cap] Summarize the visual content of the image.
-# [vqa] What is the main subject of the image?
-prompt = '[cap] Summarize the visual content of the image.'
-image = Image.open('zebra.jpg')
-
-inputs = processor(texts=[prompt], images=[image], return_tensors='pt')
-with torch.inference_mode():
-     output = model.generate(
-        **inputs,
-        do_sample=False,
-        use_cache=True,
-        max_new_tokens=128,
-        eos_token_id=32001,
-        pad_token_id=processor.tokenizer.pad_token_id
-    )
-
-prompt_len = inputs['input_ids'].shape[1]
-decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-```
-
-### Multimodal Chat
-
-The generative models can be used for chat-like experiences, where the user can provide both text and images as input.
-To use that feature, you can start with the following CLI command:
+The generative models can be used for chat-like experiences in the command line.
+For that, you can use the `uform-chat` CLI tool, which is available in the UForm package.
 
 ```bash
-uform-chat --model unum-cloud/uform-gen-chat --image=zebra.jpg
-uform-chat --model unum-cloud/uform-gen-chat \
-    --image="https://bit.ly/3tIVg9M" \
-    --device="cuda:0" \
-    --fp16
+$ pip install uform
+$ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg
+$ uform-chat --model unum-cloud/uform-gen2-dpo \
+>     --image="https://bit.ly/3tIVg9M" \
+>     --device="cuda:0" \
+>     --fp16
 ```
-
-### Multi-GPU
-
-To achieve higher throughput, you can launch UForm on multiple GPUs.
-For that pick the encoder of the model you want to run in parallel (`text_encoder` or `image_encoder`), and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
-
-```python
-import uform
-
-model, processor = uform.get_model('unum-cloud/uform-vl-english')
-model_image = nn.DataParallel(model.image_encoder)
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model_image.to(device)
-
-_, res = model_image(images, 0)
-```
-
-## Evaluation
-
-### Embedding Models
-
-Few retrieval benchmarks exist for multimodal embeddings.
-The most famous ones for English are "MS-COCO" and "Flickr30k".
-Evaluating `uform-vl-english` model, one can expect the following numbers for search quality.
-
-| Dataset  | Recall @ 1 | Recall @ 5 | Recall @ 10 |
-| :------- | ---------: | ---------: | ----------: |
-| Flickr   |      0.727 |      0.915 |       0.949 |
-| MS-COCO¹ |      0.510 |      0.761 |       0.838 |
-
-
-For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
-Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
-
-| Language  | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
-| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
-| English 🇺🇸 |     __37.8__ |      37.7 |         63.5 |  __65.0__ |          73.5 |   __75.9__ |  1'452 M |
-| Chinese 🇨🇳 |         27.3 |  __32.2__ |         51.3 |  __59.0__ |          62.1 |   __70.5__ |  1'118 M |
-| Hindi 🇮🇳   |         20.7 |  __31.3__ |         42.5 |  __57.9__ |          53.7 |   __69.6__ |    602 M |
-| Spanish 🇪🇸 |         32.6 |  __35.6__ |         58.0 |  __62.8__ |          68.8 |   __73.7__ |    548 M |
-| Arabic 🇸🇦  |         22.7 |  __31.7__ |         44.9 |  __57.8__ |          55.8 |   __69.2__ |    274 M |
-| French 🇫🇷  |         31.3 |  __35.4__ |         56.5 |  __62.6__ |          67.4 |   __73.3__ |    274 M |
-
-
-<details>
-<summary>All languages.</summary>
-<br>
-
-| Language             | OpenCLIP @ 1 |    UForm @ 1 | OpenCLIP @ 5 |    UForm @ 5 | OpenCLIP @ 10 |   UForm @ 10 | Speakers |
-| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
-| Arabic 🇸🇦             |         22.7 |     __31.7__ |         44.9 |     __57.8__ |          55.8 |     __69.2__ |    274 M |
-| Armenian 🇦🇲           |          5.6 |     __22.0__ |         14.3 |     __44.7__ |          20.2 |     __56.0__ |      4 M |
-| Chinese 🇨🇳            |         27.3 |     __32.2__ |         51.3 |     __59.0__ |          62.1 |     __70.5__ |  1'118 M |
-| English 🇺🇸            |     __37.8__ |         37.7 |         63.5 |     __65.0__ |          73.5 |     __75.9__ |  1'452 M |
-| French 🇫🇷             |         31.3 |     __35.4__ |         56.5 |     __62.6__ |          67.4 |     __73.3__ |    274 M |
-| German 🇩🇪             |         31.7 |     __35.1__ |         56.9 |     __62.2__ |          67.4 |     __73.3__ |    134 M |
-| Hebrew 🇮🇱             |         23.7 |     __26.7__ |         46.3 |     __51.8__ |          57.0 |     __63.5__ |      9 M |
-| Hindi 🇮🇳              |         20.7 |     __31.3__ |         42.5 |     __57.9__ |          53.7 |     __69.6__ |    602 M |
-| Indonesian 🇮🇩         |         26.9 |     __30.7__ |         51.4 |     __57.0__ |          62.7 |     __68.6__ |    199 M |
-| Italian 🇮🇹            |         31.3 |     __34.9__ |         56.7 |     __62.1__ |          67.1 |     __73.1__ |     67 M |
-| Japanese 🇯🇵           |         27.4 |     __32.6__ |         51.5 |     __59.2__ |          62.6 |     __70.6__ |    125 M |
-| Korean 🇰🇷             |         24.4 |     __31.5__ |         48.1 |     __57.8__ |          59.2 |     __69.2__ |     81 M |
-| Persian 🇮🇷            |         24.0 |     __28.8__ |         47.0 |     __54.6__ |          57.8 |     __66.2__ |     77 M |
-| Polish 🇵🇱             |         29.2 |     __33.6__ |         53.9 |     __60.1__ |          64.7 |     __71.3__ |     41 M |
-| Portuguese 🇵🇹         |         31.6 |     __32.7__ |         57.1 |     __59.6__ |          67.9 |     __71.0__ |    257 M |
-| Russian 🇷🇺            |         29.9 |     __33.9__ |         54.8 |     __60.9__ |          65.8 |     __72.0__ |    258 M |
-| Spanish 🇪🇸            |         32.6 |     __35.6__ |         58.0 |     __62.8__ |          68.8 |     __73.7__ |    548 M |
-| Thai 🇹🇭               |         21.5 |     __28.7__ |         43.0 |     __54.6__ |          53.7 |     __66.0__ |     61 M |
-| Turkish 🇹🇷            |         25.5 |     __33.0__ |         49.1 |     __59.6__ |          60.3 |     __70.8__ |     88 M |
-| Ukranian 🇺🇦           |         26.0 |     __30.6__ |         49.9 |     __56.7__ |          60.9 |     __68.1__ |     41 M |
-| Vietnamese 🇻🇳         |         25.4 |     __28.3__ |         49.2 |     __53.9__ |          60.3 |     __65.5__ |     85 M |
-|                      |              |              |              |              |               |              |          |
-| Mean                 |     26.5±6.4 | __31.8±3.5__ |     49.8±9.8 | __58.1±4.5__ |     60.4±10.6 | __69.4±4.3__ |        - |
-| Google Translate     |     27.4±6.3 | __31.5±3.5__ |     51.1±9.5 | __57.8±4.4__ |     61.7±10.3 | __69.1±4.3__ |        - |
-| Microsoft Translator |     27.2±6.4 | __31.4±3.6__ |     50.8±9.8 | __57.7±4.7__ |     61.4±10.6 | __68.9±4.6__ |        - |
-| Meta NLLB            |     24.9±6.7 | __32.4±3.5__ |    47.5±10.3 | __58.9±4.5__ |     58.2±11.2 | __70.2±4.3__ |        - |
-
-</details>
-
-### Generative Models
-
-| Model                | LLM Size |  SQA |    MME | MMBench | Average¹ |
-| :------------------- | -------: | ---: | -----: | ------: | -------: |
-| UForm-Gen2-Qwen-500m |     0.5B | 45.5 |  880.1 |    42.0 |    29.31 |
-| MobileVLM v2         |     1.4B | 52.1 | 1302.8 |    57.7 |    36.81 |
-| LLaVA-Phi            |     2.7B | 68.4 | 1335.1 |    59.8 |    42.95 |
-
-For captioning evaluation we measure CLIPScore and RefCLIPScore³.
-
-| Model                               | Size | Caption Length | CLIPScore | RefCLIPScore |
-| :---------------------------------- | ---: | -------------: | --------: | -----------: |
-| `llava-hf/llava-1.5-7b-hf`          |   7B |           Long |     0.878 |        0.529 |
-| `llava-hf/llava-1.5-7b-hf`          |   7B |          Short |     0.886 |        0.531 |
-|                                     |
-| `Salesforce/instructblip-vicuna-7b` |   7B |           Long |     0.902 |        0.534 |
-| `Salesforce/instructblip-vicuna-7b` |   7B |          Short |     0.848 |        0.523 |
-|                                     |
-| `unum-cloud/uform-gen`              | 1.5B |           Long |     0.847 |        0.523 |
-| `unum-cloud/uform-gen`              | 1.5B |          Short |     0.842 |        0.522 |
-|                                     |
-| `unum-cloud/uform-gen-chat`         | 1.5B |           Long |     0.860 |        0.525 |
-| `unum-cloud/uform-gen-chat`         | 1.5B |          Short |     0.858 |        0.525 |
-
-Results for VQAv2 evaluation.
-
-| Model                      | Size | Accuracy |
-| :------------------------- | ---: | -------: |
-| `llava-hf/llava-1.5-7b-hf` |   7B |     78.5 |
-| `unum-cloud/uform-gen`     | 1.5B |     66.5 |
-
-<br/>
-
-> ¹ Train split was in training data. <br/>
-> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section. <br/>
-> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
-
-## Speed
-
-On Nvidia RTX 3090, the following performance is expected on text encoding.
-
-| Model                                     | Multilingual |                  Speed |    Speedup |
-| :---------------------------------------- | -----------: | ---------------------: | ---------: |
-| `bert-base-uncased`                       |           No | 1'612 sequences/second |            |
-| `distilbert-base-uncased`                 |           No | 3'174 sequences/second |     x 1.96 |
-| `sentence-transformers/all-MiniLM-L12-v2` |      __Yes__ | 3'604 sequences/second |     x 2.24 |
-| `unum-cloud/uform-vl-multilingual-v2`     |      __Yes__ | 6'809 sequences/second | __x 4.22__ |
-
-On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
-
-| Model                               | Size |               Speed |   Speedup |
-| :---------------------------------- | ---: | ------------------: | --------: |
-| `llava-hf/llava-1.5-7b-hf`          |   7B |  ~ 40 tokens/second |           |
-| `Salesforce/instructblip-vicuna-7b` |   7B |  ~ 40 tokens/second |           |
-| `unum-cloud/uform-gen`              | 1.5B | ~ 140 tokens/second | __x 3.5__ |
-
-Given the small size of the model it also work well on mobile devices.
-On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
-
-| Device                 |               Speed | Device TDP |        Efficiency |
-| :--------------------- | ------------------: | ---------: | ----------------: |
-| Nvidia RTX 3090        | ~ 140 tokens/second |     < 350W | 0.40 tokens/joule |
-| Apple M2 Pro unplugged |  ~ 19 tokens/second |      < 20W | 0.95 tokens/joule |
-| Apple M2 Max unplugged |  ~ 38 tokens/second |      < 36W | 1.06 tokens/joule |
-| Apple M2 Max plugged   |  ~ 56 tokens/second |      < 89W | 0.63 tokens/joule |
-
-> [!WARNING]
-> The above numbers are for reference only and are not guaranteed to be accurate.
-
-## License
-
-All models come under the same license as the code - Apache 2.0.
diff --git a/docs/_static/custom.js b/docs/_static/custom.js
index b909a1d..3dd0974 100644
--- a/docs/_static/custom.js
+++ b/docs/_static/custom.js
@@ -3,5 +3,5 @@ $(document).ready(function () {
     <svg style="fill: var(--color-foreground-primary);" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" width="16" height="16"><path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0016 8c0-4.42-3.58-8-8-8z"></path></svg>
         </a>`
 
-    $(".sidebar-brand-text").html("Unum · UForm<br/> <span style='font-size:0.8em'>$(VERSION)</span>" + github_logo)
+    $(".sidebar-brand-text").html("Unum · UForm<br/> <span style='font-size:0.8em'>2.1.1</span>" + github_logo)
 })
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
new file mode 100644
index 0000000..7683788
--- /dev/null
+++ b/docs/benchmarks.rst
@@ -0,0 +1,5 @@
+====================
+Benchmarks
+====================
+
+.. mdinclude:: ../BENCHMARKS.md
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index acc061e..f9061f5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -5,12 +5,11 @@
 
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-import toml
 
 project = "Unum · UForm"
 copyright = "2023, Unum"
 author = "Unum"
-release = toml.load("../pyproject.toml")["project"]["version"]
+with open("../VERSION", "r") as version_file:
+    release = version_file.read().strip()
 with open("_static/custom.js", "r+") as js:
     content = js.read()
     js.seek(0)
@@ -24,6 +23,7 @@
     "breathe",
     "m2r2",
     "sphinx.ext.autodoc",
+    "sphinx_js",
     "sphinx.ext.autosummary",
     "sphinx.ext.intersphinx",
     "sphinx.ext.napoleon",
@@ -44,6 +44,9 @@
 html_static_path = ["_static"]
 html_css_files = ["custom.css"]
 html_js_files = ["custom.js"]
+html_baseurl = "/docs/uform/"
 
 breathe_projects = {"UForm": "../build/xml"}
 breathe_default_project = "UForm"
+
+js_source_path = "../javascript/"
diff --git a/docs/contributing.rst b/docs/contributing.rst
new file mode 100644
index 0000000..48893cf
--- /dev/null
+++ b/docs/contributing.rst
@@ -0,0 +1,5 @@
+====================
+Contributing
+====================
+
+.. mdinclude:: ../CONTRIBUTING.md
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index 162bbee..d3da0ec 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,11 +1,25 @@
-==========
+====================
 Overview
-==========
+====================
 .. mdinclude:: ../README.md
 
-.. toctree::
+.. toctree:: 
    :hidden:
+
+   python/index
+   javascript/index
+   swift/index
+
+.. toctree:: 
+   :hidden:
+
+   contributing
+   benchmarks
+
+.. toctree:: 
+   :hidden:
 
-   self
-   reference
    genindex
diff --git a/docs/javascript/index.rst b/docs/javascript/index.rst
new file mode 100644
index 0000000..771081c
--- /dev/null
+++ b/docs/javascript/index.rst
@@ -0,0 +1,9 @@
+====================
+JavaScript SDK
+====================
+
+
+.. mdinclude:: ../../javascript/README.md
+
+.. toctree::
+   :hidden:
diff --git a/docs/javascript/reference.rst.txt b/docs/javascript/reference.rst.txt
new file mode 100644
index 0000000..356176a
--- /dev/null
+++ b/docs/javascript/reference.rst.txt
@@ -0,0 +1,18 @@
+API Reference
+====================
+
+====================
+Encoders
+====================
+
+.. js:autoclass:: ../javascript/encoders.TextProcessor
+   :members:
+
+.. js:autoclass:: ../javascript/encoders.ImageProcessor
+   :members:
+
+.. js:autoclass:: ../javascript/encoders.TextEncoder
+   :members:
+
+.. js:autoclass:: ../javascript/encoders.ImageEncoder
+   :members:
diff --git a/docs/python/index.rst b/docs/python/index.rst
new file mode 100644
index 0000000..5f870d1
--- /dev/null
+++ b/docs/python/index.rst
@@ -0,0 +1,11 @@
+====================
+Python SDK
+====================
+
+
+.. mdinclude:: ../../python/README.md
+
+.. toctree::
+   :hidden:
+
+   reference
\ No newline at end of file
diff --git a/docs/python/reference.rst b/docs/python/reference.rst
new file mode 100644
index 0000000..d580583
--- /dev/null
+++ b/docs/python/reference.rst
@@ -0,0 +1,42 @@
+API Reference
+====================
+
+====================
+Root
+====================
+
+.. automodule:: uform
+    :members:
+    :undoc-members:
+
+====================
+Torch Encoders
+====================
+
+.. automodule:: uform.torch_encoders
+    :members:
+    :undoc-members:
+
+====================
+Torch Processors
+====================
+
+.. automodule:: uform.torch_processors
+    :members:
+    :undoc-members:
+
+====================
+ONNX Encoders
+====================
+
+.. automodule:: uform.onnx_encoders
+    :members:
+    :undoc-members:
+
+====================
+NumPy Processors
+====================
+
+.. automodule:: uform.numpy_processors
+    :members:
+    :undoc-members:
diff --git a/docs/reference.rst b/docs/reference.rst
deleted file mode 100644
index 5828f41..0000000
--- a/docs/reference.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-API Reference
-==============
-
-.. automodule:: uform
-    :members:
-    :undoc-members:
diff --git a/docs/swift/index.rst b/docs/swift/index.rst
new file mode 100644
index 0000000..5f2e213
--- /dev/null
+++ b/docs/swift/index.rst
@@ -0,0 +1,6 @@
+====================
+Swift SDK
+====================
+
+
+.. mdinclude:: ../../swift/README.md
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 0000000..0ef5c54
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,67 @@
+# UForm for JavaScript
+
+The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications.
+Built around ONNX, the SDK is designed to work with most runtimes and almost any hardware.
+
+## Installation
+
+There are several ways to install the UForm JavaScript SDK from NPM.
+
+```bash
+pnpm add uform 
+npm add uform  
+yarn add uform  
+```
+
+## Quick Start
+
+### Embeddings
+
+```js
+import { getModel, Modality } from 'uform';
+import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from 'uform';
+import assert from 'node:assert';
+
+const { configPath, modalityPaths, tokenizerPath } = await getModel(
+    'unum-cloud/uform3-image-text-english-small', // model ID on the Hugging Face Hub
+    [Modality.TextEncoder, Modality.ImageEncoder], // requested modalities
+    null, // optional Hugging Face token for private models
+    '.onnx', // model format
+    './models', // directory to save the model files to
+);
+
+const textProcessor = new TextProcessor(configPath, tokenizerPath);
+await textProcessor.init();
+const processedTexts = await textProcessor.process("a small red panda in a zoo");
+
+const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+await textEncoder.init();
+const textOutput = await textEncoder.encode(processedTexts);
+assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
+await textEncoder.dispose();
+
+const imageProcessor = new ImageProcessor(configPath);
+await imageProcessor.init();
+const processedImages = await imageProcessor.process("path/to/image.png");
+
+const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+await imageEncoder.init();
+const imageOutput = await imageEncoder.encode(processedImages);
+assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
+```
+
+The `textOutput` and `imageOutput` contain `features` and `embeddings` properties, mirroring the outputs of the Python SDK.
+The embeddings can later be compared using cosine similarity or other distance metrics, as in the sketch below.
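+
+A minimal sketch, assuming each output holds a single embedding, is to compare the raw `Float32Array` data of both tensors directly:
+
+```js
+// Plain cosine similarity over the raw tensor data.
+function cosineSimilarity(a, b) {
+    let dot = 0, normA = 0, normB = 0;
+    for (let i = 0; i < a.length; i++) {
+        dot += a[i] * b[i];
+        normA += a[i] * a[i];
+        normB += b[i] * b[i];
+    }
+    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
+}
+
+const similarity = cosineSimilarity(textOutput.embeddings.data, imageOutput.embeddings.data);
+console.log(`Cosine similarity: ${similarity}`);
+```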
+
+### Generative Models
+
+Coming soon ...
+
+## Technical Details
+
+### Faster Search
+
+Depending on the application, the embeddings can be down-cast to smaller numeric representations without losing much recall, as in the sketch below.
+Regardless of the quantization level, plain JavaScript loops may be too slow for large-scale search.
+In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd].
+
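+As a rough illustration (not part of the SDK), a `Float32Array` embedding can be L2-normalized and mapped to an `Int8Array`:
+
+```js
+// A hedged sketch: L2-normalize the embedding, then scale each component into the Int8 range.
+function downcastToInt8(embedding) {
+    let norm = 0;
+    for (let i = 0; i < embedding.length; i++) norm += embedding[i] * embedding[i];
+    norm = Math.sqrt(norm) || 1;
+    const result = new Int8Array(embedding.length);
+    for (let i = 0; i < embedding.length; i++) {
+        // After normalization every component lies in [-1, 1].
+        result[i] = Math.round((embedding[i] / norm) * 127);
+    }
+    return result;
+}
+```
+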
+[github-usearch]: https://github.com/unum-cloud/usearch
+[github-simsimd]: https://github.com/ashvardanian/simsimd
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
new file mode 100644
index 0000000..3c41636
--- /dev/null
+++ b/javascript/encoders.mjs
@@ -0,0 +1,311 @@
+import { readFileSync } from 'fs';
+import { InferenceSession, Tensor } from 'onnxruntime-node';
+import { PreTrainedTokenizer } from '@xenova/transformers';
+import sharp from 'sharp';
+
+/**
+ * A processor for text data that prepares input for the text encoder model.
+ */
+class TextProcessor {
+
+    /**
+     * Constructs a new TextProcessor instance.
+     *
+     * @param {string} configPath - The path to the configuration file for the text encoder.
+     * @param {string} tokenizerPath - The path to the tokenizer configuration file.
+     */
+    constructor(configPath, tokenizerPath) {
+        this.configPath = configPath;
+        this.tokenizerPath = tokenizerPath;
+
+        this.maxSeqLen = 0;
+        this.padTokenIdx = 0;
+        this.tokenizer = null;
+    }
+
+    /**
+     * Initializes the TextProcessor by loading configurations and setting up the tokenizer.
+     */
+    async init() {
+        var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
+        if (config.text_encoder !== undefined) {
+            config = config.text_encoder;
+        }
+
+        this.maxSeqLen = config.max_position_embeddings;
+        this.padTokenIdx = config.padding_idx;
+
+        const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' }));
+        this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
+        this.tokenizer.model_max_length = this.maxSeqLen;
+        this.tokenizer.pad_token_id = this.padTokenIdx;
+    }
+
+    /**
+     * Processes a list of text strings into model-ready format, including padding and attention masks.
+     *
+     * @param {Array<string>} texts - An array of text strings to process.
+     * @return {Object} The processed texts as model input features.
+     */
+    async process(texts) {
+
+        const encoded = await this.tokenizer(texts, {
+            add_special_tokens: true,
+            padding: 'max_length',
+            max_length: this.maxSeqLen,
+            truncation: true,
+        });
+
+        return {
+            'input_ids': encoded.input_ids,
+            'attention_mask': encoded.attention_mask,
+        };
+    }
+}
+
+/**
+ * An encoder for text data that uses a pre-trained model to encode text.
+ */
+class TextEncoder {
+
+    /**
+     * Constructs a new TextEncoder instance.
+     *
+     * @param {string} modelPath - The path to the pre-trained ONNX model.
+     */
+    constructor(modelPath) {
+        this.modelPath = modelPath;
+        this.session = null;
+    }
+
+    /**
+     * Initializes the ONNX session with the pre-trained model.
+     */
+    async init() {
+        this.session = await InferenceSession.create(this.modelPath);
+    }
+
+    /**
+     * Releases the ONNX session resources.
+     */
+    async dispose() {
+        if (this.session) {
+            await this.session.release();
+            this.session = null;
+        }
+    }
+
+    /**
+     * Encodes the input data using the pre-trained model.
+     *
+     * @param {Object} inputs - The input data containing input_ids and attention_mask.
+     * @return {Object} The encoded outputs from the model.
+     */
+    async encode(inputs) {
+        if (!this.session) {
+            throw new Error("Session is not initialized.");
+        }
+
+        // Helper function to convert BigInt64Array to Int32Array or validate Int32Array
+        function ensureInt32Array(data) {
+            if (data instanceof Int32Array) {
+                return data; // Use as is if already Int32Array
+            }
+            if (data instanceof BigInt64Array) {
+                // Convert BigInt64Array to Int32Array, ensuring all values are in range
+                return new Int32Array(Array.from(data).map(bigInt => {
+                    if (bigInt > 2147483647n || bigInt < -2147483648n) {
+                        throw new Error("Value out of range for Int32.");
+                    }
+                    return Number(bigInt); // Convert BigInt to Number
+                }));
+            }
+            // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array
+            if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) {
+                return new Int32Array(data); // Convert directly
+            }
+            throw new Error("Unsupported data type for tensor conversion.");
+        }
+
+        // Prepare tensor data
+        const inputIDsData = ensureInt32Array(inputs.input_ids.data);
+        const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
+
+        // Create ONNX Tensors as 'int32'
+        const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims);
+        const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims);
+
+        // Run model inference
+        return this.session.run({
+            input_ids: inputIDs,
+            attention_mask: attentionMask,
+        });
+    }
+
+}
+
+/**
+ * A processor for image data that prepares images for the image encoder model.
+ */
+class ImageProcessor {
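+    /**
+     * Constructs a new ImageProcessor instance.
+     *
+     * @param {string} configPath - The path to the configuration file for the image encoder.
+     */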
+    constructor(configPath) {
+        this.configPath = configPath;
+    }
+
+    /**
+     * Initializes the ImageProcessor by loading configuration settings for image preprocessing.
+     */
+    async init() {
+        var config = JSON.parse(readFileSync(this.configPath, 'utf8'));
+        if (config.image_encoder !== undefined) {
+            config = config.image_encoder;
+        }
+
+        this.imageSize = config.image_size;
+        this.normalizationMeans = config.normalization_means;
+        this.normalizationDeviations = config.normalization_deviations;
+
+        this.imageMean = new Float32Array(this.normalizationMeans);
+        this.imageStd = new Float32Array(this.normalizationDeviations);
+    }
+    /**
+     * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing.
+     *
+     * @param {Buffer|Array<Buffer>} images - A single image or an array of images to process.
+     * @return {Array<Float32Array>} The processed image data as an array of Float32Arrays.
+     */
+    async process(images) {
+        const processSingle = async (image) => {
+            let img = sharp(image).toColorspace('srgb');
+            const metadata = await img.metadata();
+            const scale = this.imageSize / Math.min(metadata.width, metadata.height);
+            const scaledWidth = Math.ceil(metadata.width * scale);
+            const scaledHeight = Math.ceil(metadata.height * scale);
+            img = img.resize({
+                width: scaledWidth,
+                height: scaledHeight,
+                fit: sharp.fit.cover,
+                position: sharp.strategy.entropy,
+                options: sharp.interpolators.bicubic
+            }).extract({
+                left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)),
+                top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)),
+                width: this.imageSize,
+                height: this.imageSize
+            }).removeAlpha();
+
+            let buffer = await img.raw().toBuffer();
+            let array = new Float32Array(buffer.length);
+
+            // When we export into the `array`, we reorder the dimensions of the tensor 
+            // from HWC to CHW, and normalize the pixel values.
+            let channelSize = this.imageSize * this.imageSize;
+            for (let i = 0; i < this.imageSize * this.imageSize; i++) {
+                let r = buffer[i * 3];
+                let g = buffer[i * 3 + 1];
+                let b = buffer[i * 3 + 2];
+                array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0];
+                array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1];
+                array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2];
+            }
+
+            return array;
+        };
+
+        if (Array.isArray(images)) {
+            return Promise.all(images.map(img => processSingle(img)));
+        } else {
+            return [await processSingle(images)];
+        }
+    }
+}
+
+/**
+ * An encoder for image data that uses a pre-trained model to encode images.
+ */
+class ImageEncoder {
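+    /**
+     * Constructs a new ImageEncoder instance.
+     *
+     * @param {string} modelPath - The path to the pre-trained ONNX model.
+     * @param {ImageProcessor} processor - The initialized image processor, used for its `imageSize`.
+     */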
+    constructor(modelPath, processor) {
+        this.modelPath = modelPath;
+        this.imageSize = processor.imageSize;
+        this.session = null;
+    }
+
+    /**
+     * Initializes the ONNX session with the pre-trained model.
+     */
+    async init() {
+        this.session = await InferenceSession.create(this.modelPath);
+    }
+
+    /**
+     * Releases the ONNX session resources.
+     */
+    async dispose() {
+        if (this.session) {
+            await this.session.release();
+            this.session = null;
+        }
+    }
+
+    /**
+     * Encodes the processed image data using the pre-trained model.
+     *
+     * @param {Float32Array|Array<Float32Array>} images - The processed image data.
+     * @return {Object} The encoded outputs from the model.
+     */
+    async encode(images) {
+        if (!this.session) {
+            throw new Error("Session is not initialized.");
+        }
+
+        // Helper function to ensure data is a Float32Array.
+        const ensureFloat32Array = (data) => {
+            if (!(data instanceof Float32Array)) {
+                throw new Error("Unsupported data type for tensor conversion.");
+            }
+            return data;
+        };
+
+        // Helper function to concatenate multiple Float32Arrays into a single Float32Array.
+        const concatFloat32Arrays = (arrays) => {
+            const totalLength = arrays.reduce((acc, val) => acc + val.length, 0);
+            const result = new Float32Array(totalLength);
+            let offset = 0;
+            for (let arr of arrays) {
+                result.set(arr, offset);
+                offset += arr.length;
+            }
+            return result;
+        };
+
+        let imagesData;
+        let dims;
+
+        if (Array.isArray(images)) {
+            // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size.
+            const arrays = images.map(ensureFloat32Array);
+            imagesData = concatFloat32Arrays(arrays);
+            const numImages = arrays.length;
+            const numChannels = 3;
+            const height = this.imageSize;
+            const width = this.imageSize;
+            dims = [numImages, numChannels, height, width];
+        } else {
+            // A single image, which is already a Float32Array.
+            imagesData = ensureFloat32Array(images);
+            const numChannels = 3;
+            const height = this.imageSize;
+            const width = this.imageSize;
+            dims = [1, numChannels, height, width];
+        }
+
+        // Create ONNX Tensor
+        const imagesTensor = new Tensor('float32', imagesData, dims);
+
+        // Run model inference
+        return this.session.run({
+            images: imagesTensor,
+        });
+    }
+}
+
+export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder };
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
new file mode 100644
index 0000000..30ea96a
--- /dev/null
+++ b/javascript/encoders_test.js
@@ -0,0 +1,233 @@
+import { existsSync, readFileSync } from 'fs';
+import { fileURLToPath } from 'url';
+import path from 'path';
+import assert from 'assert';
+import fetch from 'node-fetch';
+
+import { getModel, Modality } from "./hub.mjs";
+import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";
+
+// Check if the HuggingFace Hub API token is set in the environment variable.
+let hf_token = process.env.HUGGINGFACE_HUB_TOKEN;
+if (!hf_token) {
+    const dirname = path.dirname(fileURLToPath(import.meta.url));
+    const tokenPath = path.join(dirname, '../', '.hf_token');
+    if (existsSync(tokenPath)) {
+        hf_token = readFileSync(tokenPath, 'utf8').trim();
+    }
+}
+
+async function tryGettingCheckpoint(modelId, modalities) {
+    const { configPath, modalityPaths, tokenizerPath } = await getModel(
+        modelId,
+        modalities,
+        hf_token,
+        '.onnx'
+    );
+
+    assert(configPath !== null, "Config path should not be null");
+    assert(modalityPaths !== null, "Modality paths should not be null");
+    assert(tokenizerPath !== null, "Tokenizer path should not be null");
+
+    // Check if the file actually exists
+    assert(existsSync(configPath), `Config file should exist at ${configPath}`);
+    assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`);
+    for (const modalityPath of Object.values(modalityPaths)) {
+        assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`);
+    }
+}
+
+async function testGetCheckpoint() {
+    console.log("- `testGetCheckpoint`: Start");
+
+    try {
+        const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
+
+        for (const modelId of [
+            'unum-cloud/uform3-image-text-english-small',
+            'unum-cloud/uform3-image-text-english-base',
+            'unum-cloud/uform3-image-text-english-large',
+            'unum-cloud/uform3-image-text-multilingual-base',
+        ]) {
+            await tryGettingCheckpoint(modelId, modalities, hf_token);
+        }
+
+        console.log("- `testGetCheckpoint`: Success");
+    } catch (error) {
+        console.error("- `testGetCheckpoint`: Failed", error);
+    }
+}
+
+async function tryTextEncoderForwardPass(modelId) {
+    const modalities = [Modality.TextEncoder];
+    const { configPath, modalityPaths, tokenizerPath } = await getModel(
+        modelId,
+        modalities,
+        hf_token,
+        '.onnx'
+    );
+
+    const textProcessor = new TextProcessor(configPath, tokenizerPath);
+    await textProcessor.init();
+    const processedTexts = await textProcessor.process("a small red panda in a zoo");
+
+    const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+    await textEncoder.init();
+    const textOutput = await textEncoder.encode(processedTexts);
+    assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
+
+    await textEncoder.dispose();
+}
+
+async function tryImageEncoderForwardPass(modelId) {
+    const modalities = [Modality.ImageEncoder];
+    const { configPath, modalityPaths } = await getModel(
+        modelId,
+        modalities,
+        hf_token,
+        '.onnx'
+    );
+
+    const imageProcessor = new ImageProcessor(configPath);
+    await imageProcessor.init();
+    const processedImages = await imageProcessor.process("assets/unum.png");
+
+    const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+    await imageEncoder.init();
+    const imageOutput = await imageEncoder.encode(processedImages);
+    assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
+
+    await imageEncoder.dispose();
+}
+
+function cosineSimilarity(vecA, vecB) {
+    // We may be receiving a complex tensor type, so let's check if it
+    // has an array member named `data`.
+    if (vecA.data) {
+        vecA = vecA.data;
+    }
+    if (vecB.data) {
+        vecB = vecB.data;
+    }
+
+    let dotProduct = 0.0;
+    let normA = 0.0;
+    let normB = 0.0;
+    for (let i = 0; i < vecA.length; i++) {
+        dotProduct += vecA[i] * 1.0 * vecB[i];
+        normA += vecA[i] * 1.0 * vecA[i];
+        normB += vecB[i] * 1.0 * vecB[i];
+    }
+    if (normA === 0 || normB === 0) {
+        return 0;
+    } else {
+        return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
+    }
+}
+
+async function fetchImage(url) {
+    const response = await fetch(url);
+    const arrayBuffer = await response.arrayBuffer();
+    const buffer = Buffer.from(arrayBuffer);
+    return buffer;
+}
+
+async function tryCrossReferencingImageAndText(modelId) {
+
+    const modalities = [Modality.ImageEncoder, Modality.TextEncoder];
+    const { configPath, modalityPaths, tokenizerPath } = await getModel(
+        modelId,
+        modalities,
+        hf_token,
+        '.onnx'
+    );
+
+    const imageProcessor = new ImageProcessor(configPath);
+    await imageProcessor.init();
+    const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+    await imageEncoder.init();
+    const textProcessor = new TextProcessor(configPath, tokenizerPath);
+    await textProcessor.init();
+    const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+    await textEncoder.init();
+
+    const texts = [
+        "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
+        "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
+        "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+        "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
+        "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
+    ];
+    const imageUrls = [
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
+    ];
+
+    const textEmbeddings = [];
+    const imageEmbeddings = [];
+
+    for (let i = 0; i < texts.length; i++) {
+        const text = texts[i];
+        const imageUrl = imageUrls[i];
+        const imageBuffer = await fetchImage(imageUrl);
+
+        const processedText = await textProcessor.process(text);
+        const processedImage = await imageProcessor.process(imageBuffer);
+
+        const textEmbedding = await textEncoder.encode(processedText);
+        const imageEmbedding = await imageEncoder.encode(processedImage);
+
+        textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
+        imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
+
+        // Print-based debugging at its best :)
+        // console.log(`Text: ${text}, Image: ${imageUrl}`);
+        // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`);
+        // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`);
+        console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`)
+    }
+
+    for (let i = 0; i < texts.length; i++) {
+        const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]);
+        const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i]));
+        const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie));
+
+        const maxOtherTextSimilarity = Math.max(...otherTextSimilarities);
+        const maxOtherImageSimilarity = Math.max(...otherImageSimilarities);
+
+        assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images.");
+        assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts.");
+    }
+
+    await textEncoder.dispose();
+    await imageEncoder.dispose();
+}
+
+async function testEncoders() {
+    console.log("- `testEncoders`: Start");
+
+    try {
+
+        // Go through the bi-modal models
+        for (const modelId of [
+            'unum-cloud/uform3-image-text-english-small',
+            // 'unum-cloud/uform3-image-text-english-base',
+            // 'unum-cloud/uform3-image-text-english-large',
+            // 'unum-cloud/uform3-image-text-multilingual-base',
+        ]) {
+            await tryTextEncoderForwardPass(modelId, hf_token);
+            await tryImageEncoderForwardPass(modelId, hf_token);
+            await tryCrossReferencingImageAndText(modelId, hf_token);
+        }
+
+        console.log("- `testEncoders`: Success");
+    } catch (error) {
+        console.error("- `testEncoders`: Failed", error);
+    }
+}
+
+testGetCheckpoint();
+testEncoders();
diff --git a/javascript/hub.mjs b/javascript/hub.mjs
new file mode 100644
index 0000000..a59fb73
--- /dev/null
+++ b/javascript/hub.mjs
@@ -0,0 +1,104 @@
+import { join } from "path"
+import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs";
+
+import { downloadFile, listFiles } from "@huggingface/hub";
+
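+// Supported model parts; each modality is stored as a separate model file on the Hub.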
+const Modality = {
+    TextEncoder: "text_encoder",
+    ImageEncoder: "image_encoder",
+    VideoEncoder: "video_encoder",
+    TextDecoder: "text_decoder",
+};
+
+function isModality(value) {
+    return Object.values(Modality).includes(value);
+}
+
+function normalizeModalities(modalities) {
+    return modalities.map(x => {
+        if (typeof x === "string") {
+            if (isModality(x)) {
+                return x;
+            } else {
+                throw new Error(`Invalid modality: ${x}`);
+            }
+        }
+        return x;
+    });
+}
+
+async function ensureDirectoryExists(dirPath) {
+    if (!existsSync(dirPath)) {
+        mkdirSync(dirPath, { recursive: true });
+    }
+}
+
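+/**
+ * Downloads the model files for the requested modalities from the Hugging Face Hub.
+ *
+ * @param {string} modelId - The model identifier on the Hub, e.g. `unum-cloud/uform3-image-text-english-small`.
+ * @param {Array<string>} modalities - The modalities to download, e.g. `[Modality.TextEncoder, Modality.ImageEncoder]`.
+ * @param {string|null} token - An optional Hugging Face token for private models.
+ * @param {string} format - The model file extension, `.onnx` by default.
+ * @param {string} saveDir - The local directory to save the files to.
+ * @return {Promise<Object>} The resolved `configPath`, `modalityPaths`, and `tokenizerPath`.
+ */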
+async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
+    modalities = normalizeModalities(modalities);
+
+    const configNames = ['config.json'];
+    const tokenizerNames = ['tokenizer.json'];
+    const modelFileNames = modalities.map(modality => `${modality}${format}`);
+    const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames];
+
+    const repo = { type: "model", name: modelId };
+    const credentials = token ? { accessToken: token } : undefined;
+
+    let configPath = null;
+    let tokenizerPath = null;
+    const modalityPaths = {};
+    const modelSaveDir = join(saveDir, modelId);
+
+    await ensureDirectoryExists(modelSaveDir);
+
+    const fileIterator = listFiles({ repo, recursive: true, credentials });
+    for await (const file of fileIterator) {
+        const fileName = file.path.split('/').pop();
+        if (fileName && allowedPatterns.includes(fileName)) {
+            const filePath = file.path;
+            const savePath = join(modelSaveDir, fileName);
+
+            if (configNames.includes(fileName)) {
+                configPath = savePath;
+            } else if (tokenizerNames.includes(fileName)) {
+                tokenizerPath = savePath;
+            } else {
+                const modalityName = fileName.split('.')[0];
+                modalityPaths[modalityName] = savePath;
+            }
+
+            const response = await downloadFile({ repo, path: filePath, credentials });
+            if (response) {
+                // HuggingFace might be defining the `env.localModelPath` variable
+                // to store the downloaded files in a local directory.
+                // Let's check if the file is there.
+                // const localPath = join(env.localModelPath, repo, filePath);
+                // if (existsSync(localPath)) {
+                //     console.log(`File already exists locally at ${localPath}`);
+                // }
+
+                if (response.body && response.body.pipe) {
+                    const fileStream = createWriteStream(savePath);
+                    response.body.pipe(fileStream);
+                    await new Promise((resolve, reject) => {
+                        fileStream.on('finish', resolve);
+                        fileStream.on('error', reject);
+                    });
+                } else if (response.arrayBuffer) {
+                    // Handle non-streamable response for environments like Node.js
+                    const buffer = await response.arrayBuffer();
+                    writeFileSync(savePath, Buffer.from(buffer));
+                } else {
+                    console.error('Unexpected response type');
+                }
+                console.log(`Downloaded ${fileName} successfully to ${savePath}`);
+            } else {
+                console.log('No response received for the file download request.');
+            }
+        }
+    }
+
+    return { configPath, modalityPaths, tokenizerPath };
+}
+
+export { getModel, Modality };
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..948550b
--- /dev/null
+++ b/package.json
@@ -0,0 +1,33 @@
+{
+  "name": "uform",
+  "type": "module",
+  "private": true,
+  "version": "2.0.2",
+  "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+  "dependencies": {
+    "@huggingface/hub": "^0.14.8",
+    "@xenova/transformers": "^2.17.0",
+    "node-fetch": "^3.3.2",
+    "onnxruntime-node": "^1.17.0",
+    "onnxruntime-web": "^1.17.3"
+  },
+  "devDependencies": {
+    "nodemon": "^2.0.15"
+  },
+  "scripts": {
+    "start": "node javascript/encoders.mjs",
+    "test": "node javascript/encoders_test.js"
+  },
+  "main": "javascript/encoders.mjs",
+  "directories": {
+    "doc": "docs"
+  },
+  "keywords": [
+    "AI",
+    "multimodal",
+    "content generation",
+    "huggingface"
+  ],
+  "author": "Ash Vardanian, Unum Cloud",
+  "license": "Apache-2.0"
+}
diff --git a/pyproject.toml b/pyproject.toml
index 10f7a9b..fef02d3 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,8 @@ classifiers = [
 dependencies = [
     "huggingface_hub>=0.16.4",
     "tokenizers>=0.13.3",
-    "pillow"
+    "pillow",
+    "simsimd",
 ]
 description = "Pocket-Sized Multimodal AI for Content Understanding and Generation"
 maintainers = [
@@ -49,6 +50,7 @@ uform-chat = "uform.chat:main"
 torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"]
 onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"]
 onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"]
+dev = ["pytest", "pandas"]
 
 [project.urls]
 "Homepage" = "https://github.com/unum-cloud/uform"
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..dd7611d
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,148 @@
+# UForm Python SDK
+
+The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications.
+The SDK requires no deep learning expertise, no PyTorch, and no CUDA installation, and can run on almost any hardware.
+
+## Installation
+
+There are several ways to install the UForm Python SDK, depending on the backend you want to use.
+PyTorch is by far the heaviest, but also the most capable.
+ONNX is a lightweight alternative that can run on any CPU, and on some GPUs.
+
+```bash
+pip install "uform[torch]"       # For PyTorch
+pip install "uform[onnx]"        # For ONNX on CPU
+pip install "uform[onnx-gpu]"    # For ONNX on GPU, available for some platforms
+pip install "uform[torch,onnx]"  # For PyTorch and ONNX Python tests
+```
+
+## Quick Start
+
+### Embeddings
+
+Load the model:
+
+```py
+from uform import get_model, Modality
+
+model_name = 'unum-cloud/uform3-image-text-english-small'
+modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
+processors, models = get_model(model_name, modalities=modalities)
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+```
+
+Embed images:
+
+```py
+import requests
+from io import BytesIO
+from PIL import Image
+
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+image_data = processor_image(image)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
+
+Embed queries:
+
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
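+
+To compare the embeddings, you can compute cosine similarity with NumPy. A minimal sketch, using a hypothetical `to_numpy` helper to cover both the PyTorch and ONNX backends:
+
+```py
+import numpy as np
+
+# Bring the embeddings into NumPy; with the PyTorch backend, detach the tensors first.
+def to_numpy(embedding):
+    return embedding.detach().cpu().numpy() if hasattr(embedding, "detach") else np.asarray(embedding)
+
+image_vector = to_numpy(image_embedding).reshape(-1)
+text_vector = to_numpy(text_embedding).reshape(-1)
+similarity = image_vector @ text_vector / (np.linalg.norm(image_vector) * np.linalg.norm(text_vector))
+print(f"Cosine similarity: {similarity:.3f}")
+```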
+
+### Generative Models
+
+UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
+Those models can be used to caption images or power multimodal chat experiences.
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
+
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
+
+with torch.inference_mode():
+    output = model.generate(
+        **inputs,
+        do_sample=False,
+        use_cache=True,
+        max_new_tokens=256,
+        eos_token_id=151645,
+        pad_token_id=processor.tokenizer.pad_token_id
+    )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+```
+
+You can check examples of different prompts in our demo Gradio spaces on HuggingFace:
+
+- for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
+- for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo)
+
+## Technical Details
+
+### Multi-GPU Parallelism
+
+To achieve higher throughput, you can launch UForm on multiple GPUs.
+To do that, pick the encoders you want to run in parallel and wrap them in `nn.DataParallel` (or `nn.DistributedDataParallel`).
+
+```python
+from uform import get_model, Modality
+import torch.nn as nn
+
+processors, models = get_model('unum-cloud/uform-vl-english-small', backend='torch')
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+
+model_text.return_features = False
+model_image.return_features = False
+model_text_parallel = nn.DataParallel(model_text)
+model_image_parallel = nn.DataParallel(model_image)
+```
+
+Since we are now dealing with the PyTorch wrapper, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays.
+
+```python
+from typing import List
+from PIL.Image import Image
+
+def get_image_embedding(images: List[Image]):
+    preprocessed = processor_image(images)
+    embedding = model_image_parallel.forward(preprocessed)
+    return embedding.detach().cpu().numpy()
+
+def get_text_embedding(texts: List[str]):
+    preprocessed = processor_text(texts)
+    embedding = model_text_parallel.forward(preprocessed)
+    return embedding.detach().cpu().numpy()
+```
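+
+A hedged usage sketch, assuming a list of PIL images named `my_images` is already loaded:
+
+```python
+# Hypothetical batch; nn.DataParallel splits it across all visible GPUs.
+image_vectors = get_image_embedding(my_images)
+text_vectors = get_text_embedding(["a small red panda in a zoo"] * len(my_images))
+print(image_vectors.shape, text_vectors.shape)
+```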
+
+### ONNX and CUDA
+
+The configuration process may include a few additional steps, depending on the environment.
+When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
+
+```sh
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install cuda-toolkit-12
+pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+export CUDA_PATH="/usr/local/cuda-12/bin"
+export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
+export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+pytest python/scripts/ -s -x -Wd -v -k onnx
+```
+
+[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
diff --git a/python/scripts/bench.py b/python/scripts/bench_decoders.py
similarity index 60%
rename from python/scripts/bench.py
rename to python/scripts/bench_decoders.py
index 49c7004..0842ba9 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench_decoders.py
@@ -1,6 +1,8 @@
 from functools import partial
 from time import perf_counter
+from dataclasses import dataclass
 from typing import List
+import argparse
 
 import requests
 import torch
@@ -10,18 +12,38 @@
     InstructBlipForConditionalGeneration,
     InstructBlipProcessor,
     LlavaForConditionalGeneration,
+    AutoModel,
+    AutoProcessor,
 )
 
-from uform import get_model
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
 
 dtype = torch.bfloat16
 low_cpu_mem_usage = False
 device = "cuda:0"
 
 
-def caption(model, processor, prompt: str, image: Image.Image) -> str:
-    inputs = processor(prompt, image, return_tensors="pt")
+@dataclass
+class BenchmarkResult:
+    model_name: str
+    device_name: str
+    backend_name: str
+    duration_image_preprocessing: float
+    duration_image_embedding: float
+    duration_text_preprocessing: float
+    duration_text_embedding: float
+
+
+def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]:
+    # Some processors (e.g. BLIP) expect the prompt first, others the image first, so try both orders
+    prompt = [prompt] * batch_size
+    image = [image] * batch_size
+    try:
+        inputs = processor(prompt, image, return_tensors="pt")
+    except ValueError:
+        inputs = processor(image, prompt, return_tensors="pt")
+
+    # Downcast and move to device
     for possible_key in ["images", "pixel_values"]:
         if possible_key not in inputs:
             continue
@@ -33,19 +55,20 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str:
             **inputs,
             do_sample=False,
             # use_cache=True,
-            max_new_tokens=128,
+            max_new_tokens=max_length,
             eos_token_id=32001,
             pad_token_id=processor.tokenizer.pad_token_id,
         )
     prompt_len = inputs["input_ids"].shape[1]
-    decoded_text = processor.batch_decode(
+    decoded_texts = processor.batch_decode(
         output[:, prompt_len:],
         skip_special_tokens=True,
-    )[0].strip()
-    return decoded_text
+    )
+    return decoded_texts
 
 
 def duration(callable):
+    """Profile the duration of a callable and return the duration and the result."""
     start = perf_counter()
     result = callable()
     stop = perf_counter()
@@ -57,49 +80,35 @@ def bench_captions(
     processor,
     prompt: str,
     images: List[Image.Image],
+    max_length: int = 256,
+    batch_size: int = 10,
 ) -> List[str]:
     total_duration = 0
     total_length = 0
     model = torch.compile(model)
 
-    def caption_image(image, model=model, processor=processor, prompt=prompt):
-        return caption(model=model, processor=processor, prompt=prompt, image=image)
+    def caption_image(image):
+        return caption(
+            model=model,
+            processor=processor,
+            prompt=prompt,
+            image=image,
+            max_length=max_length,
+            batch_size=batch_size,
+        )
 
     for image in images:
-        seconds, text = duration(partial(caption_image, image=image))
+        seconds, captions = duration(partial(caption_image, image=image))
         total_duration += seconds
-        total_length += len(text)
+        total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions)
 
     del model
     del processor
     print(f"Throughput: {total_length/total_duration:.2f} tokens/s")
 
 
-def bench_image_embeddings(model, images):
-    total_duration = 0
-    total_embeddings = 0
-    images *= 10
-    while total_duration < 10:
-        seconds, embeddings = duration(lambda: model.encode_image(processor.preprocess_image(images)))
-        total_duration += seconds
-        total_embeddings += len(embeddings)
-
-    print(f"Throughput: {total_embeddings/total_duration:.2f} images/s")
-
-
-def bench_text_embeddings(model, texts):
-    total_duration = 0
-    total_embeddings = 0
-    texts *= 10
-    while total_duration < 10:
-        seconds, embeddings = duration(lambda: model.encode_text(processor.preprocess_text(texts)))
-        total_duration += seconds
-        total_embeddings += len(embeddings)
-
-    print(f"Throughput: {total_embeddings/total_duration:.2f} queries/s")
-
+def main(batch_size: int = 10, max_length: int = 256):
 
-if __name__ == "__main__":
     image_urls = [
         "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
         "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
@@ -116,18 +125,40 @@ def bench_text_embeddings(model, texts):
         "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
     ]
 
+    print("UForm-Gen2")
+    bench_captions(
+        model=AutoModel.from_pretrained(
+            "unum-cloud/uform-gen2-dpo",
+            trust_remote_code=True,
+            torch_dtype=dtype,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            ignore_mismatched_sizes=True,
+        ).to(device),
+        processor=AutoProcessor.from_pretrained(
+            "unum-cloud/uform-gen2-dpo",
+            trust_remote_code=True,
+        ),
+        prompt="Describe the picture in great detail",
+        images=images,
+        batch_size=batch_size,
+        max_length=max_length,
+    )
+
     print("UForm-Gen")
     bench_captions(
         model=VLMForCausalLM.from_pretrained(
             "unum-cloud/uform-gen",
             torch_dtype=dtype,
             low_cpu_mem_usage=low_cpu_mem_usage,
+            ignore_mismatched_sizes=True,
         ).to(device),
         processor=VLMProcessor.from_pretrained(
             "unum-cloud/uform-gen",
         ),
         prompt="[cap] Summarize the visual content of the image.",
         images=images,
+        batch_size=batch_size,
+        max_length=max_length,
     )
 
     print("LLaVA")
@@ -142,6 +173,8 @@ def bench_text_embeddings(model, texts):
         ),
         prompt="USER: <image>\nWhat are these?\nASSISTANT:",
         images=images,
+        batch_size=batch_size,
+        max_length=max_length,
     )
 
     print("InstructBLIP")
@@ -156,12 +189,26 @@ def bench_text_embeddings(model, texts):
         ),
         prompt="Summarize the visual content of the image.",
         images=images,
+        batch_size=batch_size,
+        max_length=max_length,
     )
 
-    print("UForm-English")
-    bench_image_embeddings(get_model("unum-cloud/uform-vl-english"), images)
-    bench_text_embeddings(get_model("unum-cloud/uform-vl-english"), captions)
 
-    print("UForm-Multilingual")
-    bench_image_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), images)
-    bench_text_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), captions)
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=10,
+        help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=256,
+        help="Maximum length of the generated text in tokens.",
+    )
+    args = parser.parse_args()
+
+    main(batch_size=args.batch_size, max_length=args.max_length)
diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py
new file mode 100644
index 0000000..b237126
--- /dev/null
+++ b/python/scripts/bench_encoders.py
@@ -0,0 +1,274 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This script provides the throughput of UForm multimodal embedding models.
+
+The output of the script will cover:
+    - Time to preprocess an image, and throughput in images/s.
+    - Time to tokenize the text, and throughput in queries/s.
+    - Time to encode the image, and throughput in images/s.
+    - Time to encode the text, and throughput in queries/s.
+    - Share of time spent on each part of the pipeline.
+    
+Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx), 
+and precision (float32 or bfloat16), producing a pretty comprehensive benchmark.
+
+Before running the script, install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`.
+Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled.
+"""
+
+from functools import partial
+from time import perf_counter
+from dataclasses import dataclass
+from typing import List, Tuple, Literal, Callable, Generator
+import re
+import argparse
+
+import requests
+from PIL import Image
+import pandas as pd
+
+from uform import get_model, Modality, ExecutionProviderError
+
+# Define global constants for the hardware availability
+torch_available = False
+try:
+    import torch
+
+    torch_available = True
+except ImportError:
+    pass
+onnx_available = False
+try:
+    import onnx
+
+    onnx_available = True
+except ImportError:
+    pass
+cuda_available = False
+try:
+    if torch_available:
+        cuda_available = torch.cuda.is_available()
+    elif onnx_available:
+        import onnxruntime
+
+        cuda_available = onnxruntime.get_device() == "GPU"
+except ImportError:
+    pass
+
+
+@dataclass
+class BenchmarkResult:
+    model_name: str
+    device_name: Literal["cpu", "cuda"] = "cpu"
+    backend_name: Literal["torch", "onnx"] = "torch"
+    duration_image_preprocessing: float = 0
+    duration_image_embedding: float = 0
+    duration_text_preprocessing: float = 0
+    duration_text_embedding: float = 0
+
+
+def duration(callable, synchronize=False):
+    """Profile the duration of a callable and return the duration and the result."""
+    if synchronize and torch_available and cuda_available:
+        torch.cuda.synchronize()  # Wait for CUDA operations to complete
+    start = perf_counter()
+    result = callable()
+    if synchronize and torch_available and cuda_available:
+        torch.cuda.synchronize()  # Ensure all CUDA kernels have finished
+    stop = perf_counter()
+    return stop - start, result
+
+
+def get_captioned_images() -> List[Tuple[Image.Image, str]]:
+    """Get a list of pre-downloaded and decoded images and their captions."""
+    image_urls = [
+        "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+        "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+        "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+        "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+        "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+    ]
+    images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]
+    captions = [
+        "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field",
+        "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta",
+        "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank",
+        "asian girl sleeping in a bed. top down view",
+        "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
+    ]
+    return list(zip(images, captions))
+
+
+def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]:
+    """Yields callable benchmarks for all supported backends of the given model."""
+
+    # Pull the content and artificially grow the batch size
+    images, captions = zip(*get_captioned_images())
+
+    if len(images) < batch_size:
+        import math
+
+        multiplier = int(math.ceil(batch_size / len(images)))
+        images *= multiplier
+        captions *= multiplier
+    images = images[:batch_size]
+    captions = captions[:batch_size]
+
+    def run(model_name: str, device: str, backend_name: str):
+        result = BenchmarkResult(
+            model_name=model_name,
+            backend_name=backend_name,
+            device_name=device,
+            duration_image_preprocessing=0,
+            duration_image_embedding=0,
+            duration_text_preprocessing=0,
+            duration_text_embedding=0,
+        )
+
+        sync = backend_name == "torch"
+        processors, models = get_model(
+            model_name,
+            device=device,
+            modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER],
+            backend=backend_name,
+        )
+
+        model_text = models[Modality.TEXT_ENCODER]
+        model_image = models[Modality.IMAGE_ENCODER]
+        processor_text = processors[Modality.TEXT_ENCODER]
+        processor_image = processors[Modality.IMAGE_ENCODER]
+
+        # Image preprocessing
+        total_duration = 0
+        total_iterations = 0
+        while total_duration < 10 and total_iterations < 100:
+            seconds, _ = duration(lambda: processor_image(images))
+            total_duration += seconds
+            total_iterations += len(images)
+        duration_per_iteration = total_duration / total_iterations
+        result.duration_image_preprocessing = duration_per_iteration
+
+        # Image embedding
+        total_duration = 0
+        total_iterations = 0
+        while total_duration < 10 and total_iterations < 100:
+            images_data = processor_image(images)
+            seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync)
+            total_duration += seconds
+            total_iterations += len(images)
+        duration_per_iteration = total_duration / total_iterations
+        result.duration_image_embedding = duration_per_iteration
+
+        # Text preprocessing
+        total_duration = 0
+        total_iterations = 0
+        while total_duration < 10 and total_iterations < 100:
+            seconds, _ = duration(lambda: processor_text(captions))
+            total_duration += seconds
+            total_iterations += len(captions)
+        duration_per_iteration = total_duration / total_iterations
+        result.duration_text_preprocessing = duration_per_iteration
+
+        # Text embedding
+        total_duration = 0
+        total_iterations = 0
+        while total_duration < 10 and total_iterations < 100:
+            texts_data = processor_text(captions)
+            seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync)
+            total_duration += seconds
+            total_iterations += len(captions)
+        duration_per_iteration = total_duration / total_iterations
+        result.duration_text_embedding = duration_per_iteration
+
+        return result
+
+    devices = ["cpu"]
+    if cuda_available:
+        devices.append("cuda")
+    backends = []
+    if torch_available:
+        backends.append("torch")
+    if onnx_available:
+        backends.append("onnx")
+
+    for device in devices:
+        for backend_name in backends:
+            for model_name in [
+                "unum-cloud/uform3-image-text-english-small",
+                "unum-cloud/uform3-image-text-english-base",
+                "unum-cloud/uform3-image-text-english-large",
+                "unum-cloud/uform3-image-text-multilingual-base",
+            ]:
+                yield BenchmarkResult(
+                    model_name=model_name,
+                    device_name=device,
+                    backend_name=backend_name,
+                ), partial(run, model_name, device, backend_name)
+
+
+def main(filter_out: str = None, batch_size: int = 10):
+    results = []
+    filter_pattern = re.compile(filter_out) if filter_out else None
+    for specs, func in yield_benchmarks(batch_size=batch_size):
+        if filter_pattern and (
+            filter_pattern.search(specs.model_name)
+            or filter_pattern.search(specs.backend_name)
+            or filter_pattern.search(specs.device_name)
+        ):
+            continue
+
+        try:
+            print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend")
+            result = func()
+            results.append(result)
+        except ExecutionProviderError as e:
+            print(f"- skipping missing backend")
+            print(e)
+
+    results = sorted(results, key=lambda x: x.model_name)
+    results = [x.__dict__ for x in results]
+
+    df = pd.DataFrame(results)
+    df.columns = [
+        "Model Name",
+        "Device",
+        "Backend",
+        "Images Preprocessed/s",
+        "Images Encoded/s",
+        "Texts Preprocessed/s",
+        "Texts Encoded/s",
+    ]
+
+    def inverse(x):
+        return 1 / x if x != 0 else 0
+
+    # Apply number formatting directly in the DataFrame
+    formatted_df = df.copy()
+    formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format)
+    formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format)
+    formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format)
+    formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format)
+
+    # Convert formatted DataFrame to Markdown
+    print(formatted_df.to_markdown())
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--filter-out",
+        type=str,
+        default=None,
+        help="Filter out models, backends, or devices with a Regular Expression.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=10,
+        help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+    )
+    args = parser.parse_args()
+
+    main(filter_out=args.filter_out, batch_size=args.batch_size)
diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb
deleted file mode 100644
index ce8cf10..0000000
--- a/python/scripts/export.ipynb
+++ /dev/null
@@ -1,666 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!pip install --upgrade \"uform[torch]\" coremltools"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n",
-      "  Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n",
-      "  Expected in:     <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n",
-      "  warn(f\"Failed to load image Python extension: {e}\")\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fadffc0299c04e249fd4f7a5b40ba0af",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(torch.Size([1, 197, 384]),\n",
-       " torch.Size([1, 64, 768]),\n",
-       " torch.Size([1, 256]),\n",
-       " torch.Size([1, 256]))"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import uform\n",
-    "from PIL import Image\n",
-    "\n",
-    "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
-    "text = 'a small red panda in a zoo'\n",
-    "image = Image.open('../../assets/unum.png')\n",
-    "\n",
-    "image_data = processor.preprocess_image(image)\n",
-    "text_data = processor.preprocess_text(text)\n",
-    "\n",
-    "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
-    "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
-    "\n",
-    "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "TextEncoder(model_type='bert', dim=768, context_dim=384, vocab_size=30522, padding_idx=0, num_layers=4, num_heads=12, embedding_dim=256, multimodal_layers_ids=[2, 3], head_one_neuron=False, pooling='cls', max_position_embeddings=64, dropout_prob=0.1)"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "model.text_encoder"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "VisualEncoder(dim=384, patch_size=16, image_size=224, num_layers=12, num_heads=6, embedding_dim=256, pooling='cls', num_reg_tokens=0)"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "model.image_encoder"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "First layer of image_encoder: patch_embed\n",
-      "First layer of text_encoder: word_embeddings\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
-    "for name, module in model.image_encoder.named_children():\n",
-    "    print(f\"First layer of image_encoder: {name}\")\n",
-    "    break  # We break after the first layer\n",
-    "\n",
-    "for name, module in model.text_encoder.named_children():\n",
-    "    print(f\"First layer of text_encoder: {name}\")\n",
-    "    break  # We break after the first layer"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## ONNX"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## CoreML"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n",
-      "Torch version 2.1.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.\n"
-     ]
-    }
-   ],
-   "source": [
-    "import coremltools as ct\n",
-    "import torch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
-    "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
-    "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
-    "text_features = ct.TensorType(name=\"features\")\n",
-    "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
-    "image_features = ct.TensorType(name=\"features\")\n",
-    "image_embeddings = ct.TensorType(name=\"embeddings\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "VisualEncoder(\n",
-       "  original_name=VisualEncoder\n",
-       "  (patch_embed): Conv2d(original_name=Conv2d)\n",
-       "  (blocks): Sequential(\n",
-       "    original_name=Sequential\n",
-       "    (0): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (1): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (2): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (3): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (4): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (5): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (6): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (7): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (8): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (9): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (10): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "    (11): VisualEncoderBlock(\n",
-       "      original_name=VisualEncoderBlock\n",
-       "      (norm1): LayerNorm(original_name=LayerNorm)\n",
-       "      (attn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls1): LayerScale(original_name=LayerScale)\n",
-       "      (norm2): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (ls2): LayerScale(original_name=LayerScale)\n",
-       "    )\n",
-       "  )\n",
-       "  (norm): LayerNorm(original_name=LayerNorm)\n",
-       "  (embedding_projection): Linear(original_name=Linear)\n",
-       ")"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "module = model.image_encoder\n",
-    "module.eval()\n",
-    "module.return_features = True\n",
-    "\n",
-    "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
-    "traced_script_module"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Tuple detected at graph output. This will be flattened in the converted model.\n",
-      "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n",
-      "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n",
-      "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n",
-      "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "coreml_model = ct.convert(\n",
-    "    traced_script_module, source=\"pytorch\",\n",
-    "    inputs=[image_input], outputs=[image_features, image_embeddings],\n",
-    "    convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
-    "\n",
-    "coreml_model.author = 'Unum Cloud'\n",
-    "coreml_model.license = 'Apache 2.0'\n",
-    "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
-    "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "TextEncoder(\n",
-       "  original_name=TextEncoder\n",
-       "  (word_embeddings): Embedding(original_name=Embedding)\n",
-       "  (position_embeddings): Embedding(original_name=Embedding)\n",
-       "  (layer_norm): LayerNorm(original_name=LayerNorm)\n",
-       "  (dropout): Dropout(original_name=Dropout)\n",
-       "  (blocks): ModuleList(\n",
-       "    original_name=ModuleList\n",
-       "    (0): TextEncoderBlock(\n",
-       "      original_name=TextEncoderBlock\n",
-       "      (norm_attn): LayerNorm(original_name=LayerNorm)\n",
-       "      (attention): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (dropout): Dropout(original_name=Dropout)\n",
-       "    )\n",
-       "    (1): TextEncoderBlock(\n",
-       "      original_name=TextEncoderBlock\n",
-       "      (norm_attn): LayerNorm(original_name=LayerNorm)\n",
-       "      (attention): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (dropout): Dropout(original_name=Dropout)\n",
-       "    )\n",
-       "    (2): TextEncoderBlock(\n",
-       "      original_name=TextEncoderBlock\n",
-       "      (norm_attn): LayerNorm(original_name=LayerNorm)\n",
-       "      (attention): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
-       "      (crossattn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (dropout): Dropout(original_name=Dropout)\n",
-       "    )\n",
-       "    (3): TextEncoderBlock(\n",
-       "      original_name=TextEncoderBlock\n",
-       "      (norm_attn): LayerNorm(original_name=LayerNorm)\n",
-       "      (attention): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
-       "      (crossattn): Attention(\n",
-       "        original_name=Attention\n",
-       "        (query): Linear(original_name=Linear)\n",
-       "        (key): Linear(original_name=Linear)\n",
-       "        (value): Linear(original_name=Linear)\n",
-       "        (out): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
-       "      (mlp): MLP(\n",
-       "        original_name=MLP\n",
-       "        (hidden_layer): Linear(original_name=Linear)\n",
-       "        (output_layer): Linear(original_name=Linear)\n",
-       "      )\n",
-       "      (dropout): Dropout(original_name=Dropout)\n",
-       "    )\n",
-       "  )\n",
-       "  (embedding_projection): Linear(original_name=Linear)\n",
-       "  (matching_head): Linear(original_name=Linear)\n",
-       "  (context_projection): Linear(original_name=Linear)\n",
-       ")"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "module = model.text_encoder\n",
-    "module.eval()\n",
-    "module.return_features = True\n",
-    "\n",
-    "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
-    "traced_script_module"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Tuple detected at graph output. This will be flattened in the converted model.\n",
-      "Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/157 [00:00<?, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.\n",
-      "Converting PyTorch Frontend ==> MIL Ops:  99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n",
-      "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n",
-      "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n",
-      "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "coreml_model = ct.convert(\n",
-    "    traced_script_module, source=\"pytorch\",\n",
-    "    inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
-    "    convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
-    "\n",
-    "coreml_model.author = 'Unum Cloud'\n",
-    "coreml_model.license = 'Apache 2.0'\n",
-    "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
-    "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb
new file mode 100644
index 0000000..26e463b
--- /dev/null
+++ b/python/scripts/export_decoders.ipynb
@@ -0,0 +1,91 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
+    "\n",
+    "Depending on the backend, we prefer different qunatization schemes.\n",
+    "\n",
+    "- For ONNX we use `uint8` quantization.\n",
+    "- For PyTorch we use `bfloat16` quantization.\n",
+    "- For CoreML we use `float32` representation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install --upgrade \"uform[torch]\" coremltools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "model_name = \"unum-cloud/uform-gen2-dpo\"\n",
+    "output_directory = \"../../\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import uform\n",
+    "from PIL import Image\n",
+    "from transformers import AutoModel, AutoProcessor\n",
+    "\n",
+    "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n",
+    "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n",
+    "\n",
+    "prompt = 'Describe the picture'\n",
+    "image = Image.open('../../assets/unum.png')\n",
+    "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n",
+    "\n",
+    "with torch.inference_mode():\n",
+    "     output = model.generate(\n",
+    "        **inputs,\n",
+    "        do_sample=False,\n",
+    "        use_cache=True,\n",
+    "        max_new_tokens=256,\n",
+    "        eos_token_id=151645,\n",
+    "        pad_token_id=processor.tokenizer.pad_token_id\n",
+    "    )\n",
+    "prompt_len = inputs['input_ids'].shape[1]\n",
+    "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n",
+    "\n",
+    "print(decoded_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
new file mode 100644
index 0000000..a8b868d
--- /dev/null
+++ b/python/scripts/export_encoders.ipynb
@@ -0,0 +1,681 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
+    "\n",
+    "Depending on the backend, we prefer different qunatization schemes.\n",
+    "\n",
+    "- For ONNX we use `uint8` quantization.\n",
+    "- For PyTorch we use `bfloat16` quantization.\n",
+    "- For CoreML we use `float32` representation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install --upgrade \"uform[torch]\" coremltools"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "working_directory = \"../..\"\n",
+    "model_name = \"uform3-image-text-english-small\"\n",
+    "model_directory = os.path.join(working_directory, \"models\", model_name)\n",
+    "model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n",
+    "config_path = os.path.join(model_directory, \"config.json\")\n",
+    "tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "\n",
+    "state_dict = torch.load(model_weights_path)\n",
+    "list(state_dict.keys())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from uform.torch_encoders import ImageEncoder, TextEncoder\n",
+    "from uform.torch_processors import ImageProcessor, TextProcessor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n",
+    "text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n",
+    "image_encoder, text_encoder"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_processor = TextProcessor(config_path, tokenizer_path)\n",
+    "image_processor = ImageProcessor(config_path)\n",
+    "text_processor, image_processor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import uform\n",
+    "from PIL import Image\n",
+    "\n",
+    "text = 'a small red panda in a zoo'\n",
+    "image = Image.open('../../assets/unum.png')\n",
+    "\n",
+    "text_data = text_processor(text)\n",
+    "image_data = image_processor(image)\n",
+    "\n",
+    "image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n",
+    "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
+    "\n",
+    "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## CoreML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import coremltools as ct\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "precision = ct.precision.FLOAT32"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n",
+    "\n",
+    "```python\n",
+    "        image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n",
+    "        text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
+    "        text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+    "```\n",
+    "\n",
+    "That, however, will only work for batch-size one. To support larger batches, we need to override the input shapes.\n",
+    "\n",
+    "```python\n",
+    "        ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generalize_first_dimensions(input_shape, upper_bound=64):\n",
+    "    if upper_bound == 1:\n",
+    "        return input_shape\n",
+    "    input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
+    "    return input_shape\n",
+    "\n",
+    "generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n",
+    "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
+    "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
+    "text_features = ct.TensorType(name=\"features\")\n",
+    "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
+    "image_features = ct.TensorType(name=\"features\")\n",
+    "image_embeddings = ct.TensorType(name=\"embeddings\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module = image_encoder\n",
+    "module.eval()\n",
+    "module.return_features = True\n",
+    "\n",
+    "traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n",
+    "traced_script_module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coreml_model = ct.convert(\n",
+    "    traced_script_module, source=\"pytorch\",\n",
+    "    inputs=[image_input], outputs=[image_features, image_embeddings],\n",
+    "    convert_to='mlprogram', compute_precision=precision)\n",
+    "\n",
+    "coreml_model.author = 'Unum Cloud'\n",
+    "coreml_model.license = 'Apache 2.0'\n",
+    "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+    "coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module = text_encoder\n",
+    "module.eval()\n",
+    "module.return_features = True\n",
+    "\n",
+    "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
+    "traced_script_module"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coreml_model = ct.convert(\n",
+    "    traced_script_module, source=\"pytorch\",\n",
+    "    inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
+    "    convert_to='mlprogram', compute_precision=precision)\n",
+    "\n",
+    "coreml_model.author = 'Unum Cloud'\n",
+    "coreml_model.license = 'Apache 2.0'\n",
+    "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+    "coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PyTorch\n",
+    "\n",
+    "Let's ensure:\n",
+    "\n",
+    "- the `model.text_encoder` inputs are called `input_ids` and `attention_mask`, and outputs are `embeddings` and `features`.\n",
+    "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n",
+    "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from safetensors import safe_open\n",
+    "from safetensors.torch import save_file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_encoder.eval()\n",
+    "image_encoder.to(dtype=torch.bfloat16)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_encoder.eval()\n",
+    "text_encoder.to(dtype=torch.bfloat16)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n",
+    "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
+    "\n",
+    "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## ONNX"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install onnx onnxconverter-common"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.onnx import export as onnx_export\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module = text_encoder\n",
+    "module.eval()\n",
+    "module.return_features = True\n",
+    "module.to(dtype=torch.float32)\n",
+    "\n",
+    "onnx_export(\n",
+    "    module,\n",
+    "    (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+    "    os.path.join(model_directory, \"text_encoder.onnx\"), \n",
+    "    export_params=True,\n",
+    "    opset_version=15,\n",
+    "    do_constant_folding=True,\n",
+    "    input_names = ['input_ids', 'attention_mask'], \n",
+    "    output_names = ['features', 'embeddings'],\n",
+    "    dynamic_axes={\n",
+    "        'input_ids' : {0 : 'batch_size'}, \n",
+    "        'attention_mask' : {0 : 'batch_size'}, \n",
+    "        'features' : {0 : 'batch_size'}, \n",
+    "        'embeddings' : {0 : 'batch_size'}})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now repeat the same for images."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module = image_encoder\n",
+    "module.eval()\n",
+    "module.return_features = True\n",
+    "module.to(dtype=torch.float32)\n",
+    "\n",
+    "torch.onnx.export(\n",
+    "    module,\n",
+    "    image_data[\"images\"], \n",
+    "    os.path.join(model_directory, \"image_encoder.onnx\"), \n",
+    "    export_params=True,\n",
+    "    opset_version=15,\n",
+    "    do_constant_folding=True,\n",
+    "    input_names = ['images'], \n",
+    "    output_names = ['features', 'embeddings'],\n",
+    "    dynamic_axes={\n",
+    "        'images' : {0 : 'batch_size'},\n",
+    "        'features' : {0 : 'batch_size'},\n",
+    "        'embeddings' : {0 : 'batch_size'}})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Quantizing to `float16`\n",
+    "\n",
+    "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnx\n",
+    "from onnxconverter_common import float16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+    "module = onnx.load(module_path)\n",
+    "module_fp16 = float16.convert_float_to_float16(module)\n",
+    "onnx.save(module_fp16, module_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
+    "module = onnx.load(module_path)\n",
+    "module_fp16 = float16.convert_float_to_float16(module)\n",
+    "onnx.save(module_fp16, module_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Quantizing to `uint8`\n",
+    "\n",
+    "We can further quantize the model into `uint8` using ONNX quantization tools.\n",
+    "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from onnxruntime.quantization import quantize_dynamic, QuantType"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+    "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
+    "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's make sure that all the text inputs are integers of identical type - `int32`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnx\n",
+    "import os\n",
+    "from onnx import helper\n",
+    "\n",
+    "# Load the ONNX model\n",
+    "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+    "module = onnx.load(module_path)\n",
+    "\n",
+    "# Get the module's graph\n",
+    "graph = module.graph\n",
+    "\n",
+    "# Iterate through the inputs and update the data type of `input_ids`\n",
+    "for input_tensor in graph.input:\n",
+    "    # Check if this is the tensor we want to change\n",
+    "    if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n",
+    "        # Get the tensor type information\n",
+    "        tensor_type = input_tensor.type.tensor_type\n",
+    "        # Set the element type to INT32 (int32's enum value in onnx is 6)\n",
+    "        tensor_type.elem_type = onnx.TensorProto.INT32\n",
+    "\n",
+    "# Optionally, check that the module is still valid\n",
+    "onnx.checker.check_model(module)\n",
+    "\n",
+    "# Save the modified module\n",
+    "onnx.save(module, module_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can use the following function to print and validate the input and output types of the ONNX model files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_model_inputs_and_outputs(onnx_model_path):\n",
+    "    model = onnx.load(onnx_model_path)\n",
+    "\n",
+    "    # Get the model's graph\n",
+    "    graph = model.graph\n",
+    "\n",
+    "    # Print input information\n",
+    "    print(\"Model Inputs:\")\n",
+    "    for input_tensor in graph.input:\n",
+    "        tensor_type = input_tensor.type.tensor_type\n",
+    "        # Get the element type (data type)\n",
+    "        elem_type = tensor_type.elem_type\n",
+    "        # Convert numeric type to readable format\n",
+    "        readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+    "        # Get tensor shape\n",
+    "        shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+    "        print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n",
+    "\n",
+    "    # Print output information similarly if needed\n",
+    "    print(\"\\nModel Outputs:\")\n",
+    "    for output_tensor in graph.output:\n",
+    "        tensor_type = output_tensor.type.tensor_type\n",
+    "        elem_type = tensor_type.elem_type\n",
+    "        readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+    "        shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+    "        print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")"
+   ]
+  },
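+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check (a minimal sketch, assuming the exported `.onnx` files are already present in `model_directory`), we can call the helper on both encoders:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print_model_inputs_and_outputs(os.path.join(model_directory, \"text_encoder.onnx\"))\n",
+    "print_model_inputs_and_outputs(os.path.join(model_directory, \"image_encoder.onnx\"))"
+   ]
+  },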
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's check that the runtime can actually load those models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import onnxruntime as ort\n",
+    "session_options = ort.SessionOptions()\n",
+    "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+    "session = ort.InferenceSession(module_path, sess_options=session_options)"
+   ]
+  },
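+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, we can run a quick smoke test through the freshly loaded text-encoder session. This is only a sketch: it assumes `text_data` still holds the tokenized example from above and that the graph inputs were patched to `int32` in the earlier step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# Feed the tokenized example into the quantized ONNX text encoder\n",
+    "features_onnx, embeddings_onnx = session.run(None, {\n",
+    "    \"input_ids\": text_data[\"input_ids\"].numpy().astype(np.int32),\n",
+    "    \"attention_mask\": text_data[\"attention_mask\"].numpy().astype(np.int32),\n",
+    "})\n",
+    "features_onnx.shape, embeddings_onnx.shape"
+   ]
+  },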
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
+    "session = ort.InferenceSession(module_path, sess_options=session_options)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Upload to Hugging Face"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py
similarity index 100%
rename from python/scripts/test_generative.py
rename to python/scripts/test_decoders.py
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_embeddings.py
deleted file mode 100644
index d71bf0b..0000000
--- a/python/scripts/test_embeddings.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from typing import Tuple
-
-import pytest
-from PIL import Image
-import uform
-
-# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
-try:
-    import torch
-
-    torch_available = True
-except:
-    torch_available = False
-
-# ONNX is not a very light dependency either
-try:
-    import onnx
-
-    onnx_available = True
-except:
-    onnx_available = False
-
-torch_models = [
-    "unum-cloud/uform-vl-english",
-    "unum-cloud/uform-vl-multilingual-v2",
-]
-
-onnx_models_and_providers = [
-    ("unum-cloud/uform-vl-english-small", "cpu", "fp32"),
-    ("unum-cloud/uform-vl-english-large", "cpu", "fp32"),
-    ("unum-cloud/uform-vl-english-small", "gpu", "fp32"),
-    ("unum-cloud/uform-vl-english-large", "gpu", "fp32"),
-    ("unum-cloud/uform-vl-english-small", "gpu", "fp16"),
-    ("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
-]
-
-
-@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
-@pytest.mark.parametrize("model_name", torch_models)
-def test_torch_one_embedding(model_name: str):
-    model, processor = uform.get_model(model_name)
-    text = "a small red panda in a zoo"
-    image_path = "assets/unum.png"
-
-    image = Image.open(image_path)
-    image_data = processor.preprocess_image(image)
-    text_data = processor.preprocess_text(text)
-
-    image_features, image_embedding = model.encode_image(image_data, return_features=True)
-    text_features, text_embedding = model.encode_text(text_data, return_features=True)
-
-    assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
-    assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
-
-    # Test reranking
-    score, joint_embedding = model.encode_multimodal(
-        image_features=image_features,
-        text_features=text_features,
-        attention_mask=text_data["attention_mask"],
-        return_scores=True,
-    )
-    assert score.shape[0] == 1, "Matching score batch size is not 1"
-    assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
-
-
-@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
-@pytest.mark.parametrize("model_name", torch_models)
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_torch_many_embeddings(model_name: str, batch_size: int):
-    model, processor = uform.get_model(model_name)
-    texts = ["a small red panda in a zoo"] * batch_size
-    image_paths = ["assets/unum.png"] * batch_size
-
-    images = [Image.open(path) for path in image_paths]
-    image_data = processor.preprocess_image(images)
-    text_data = processor.preprocess_text(texts)
-
-    image_embeddings = model.encode_image(image_data, return_features=False)
-    text_embeddings = model.encode_text(text_data, return_features=False)
-
-    assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
-    assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
-
-
-@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
-@pytest.mark.parametrize("model_specs", onnx_models_and_providers)
-def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
-
-    from uform.onnx_models import ExecutionProviderError
-
-    try:
-
-        model, processor = uform.get_model_onnx(*model_specs)
-        text = "a small red panda in a zoo"
-        image_path = "assets/unum.png"
-
-        image = Image.open(image_path)
-        image_data = processor.preprocess_image(image)
-        text_data = processor.preprocess_text(text)
-
-        image_features, image_embedding = model.encode_image(image_data, return_features=True)
-        text_features, text_embedding = model.encode_text(text_data, return_features=True)
-
-        assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
-        assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
-
-        score, joint_embedding = model.encode_multimodal(
-            image_features=image_features,
-            text_features=text_features,
-            attention_mask=text_data["attention_mask"],
-            return_scores=True,
-        )
-        assert score.shape[0] == 1, "Matching score batch size is not 1"
-        assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
-
-    except ExecutionProviderError as e:
-        pytest.skip(f"Execution provider error: {e}")
-
-
-@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
-@pytest.mark.parametrize("model_specs", onnx_models_and_providers)
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int):
-
-    from uform.onnx_models import ExecutionProviderError
-
-    try:
-
-        model, processor = uform.get_model_onnx(*model_specs)
-        texts = ["a small red panda in a zoo"] * batch_size
-        image_paths = ["assets/unum.png"] * batch_size
-
-        images = [Image.open(path) for path in image_paths]
-        image_data = processor.preprocess_image(images)
-        text_data = processor.preprocess_text(texts)
-
-        image_embeddings = model.encode_image(image_data, return_features=False)
-        text_embeddings = model.encode_text(text_data, return_features=False)
-
-        assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
-        assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
-
-    except ExecutionProviderError as e:
-        pytest.skip(f"Execution provider error: {e}")
-
-
-if __name__ == "__main__":
-    pytest.main(["-s", "-x", __file__])
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
new file mode 100644
index 0000000..20caed2
--- /dev/null
+++ b/python/scripts/test_encoders.py
@@ -0,0 +1,292 @@
+from functools import wraps
+from typing import Tuple
+import requests
+from io import BytesIO
+import os
+
+import pytest
+import numpy as np
+from PIL import Image
+
+from uform import Modality, get_model, ExecutionProviderError
+
+# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
+try:
+    import torch
+
+    torch_available = True
+except ImportError:
+    torch_available = False
+
+# ONNX is not a very light dependency either
+try:
+    import onnx
+
+    onnx_available = True
+except ImportError:
+    onnx_available = False
+
+torch_models = [
+    "unum-cloud/uform3-image-text-english-small",
+    "unum-cloud/uform3-image-text-english-base",
+    "unum-cloud/uform3-image-text-english-large",
+    "unum-cloud/uform3-image-text-multilingual-base",
+]
+
+onnx_models = [
+    "unum-cloud/uform3-image-text-english-small",
+    "unum-cloud/uform3-image-text-english-base",
+    "unum-cloud/uform3-image-text-english-large",
+    "unum-cloud/uform3-image-text-multilingual-base",
+]
+
+# Let's check if the HuggingFace Hub API token is set in the environment variable.
+# If it's not there, check if the `.hf_token` file is present in the current working directory.
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None)
+if token is None:
+    token_path = "./.hf_token"
+    if os.path.exists(token_path):
+        with open(token_path, "r") as file:
+            token = file.read().strip()
+
+
+def skip_on(exception, reason="No good reason :)"):
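+    """Decorator factory: run the decorated test, skipping it when `exception` is raised."""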
+    def decorator_func(f):
+        @wraps(f)
+        def wrapper(*args, **kwargs):
+            try:
+                # Try to run the test
+                return f(*args, **kwargs)
+            except exception:
+                pytest.skip(reason)
+
+        return wrapper
+
+    return decorator_func
+
+
+def cosine_similarity(x, y) -> float:
+    if not isinstance(x, np.ndarray):
+        x = x.detach().numpy()
+    if not isinstance(y, np.ndarray):
+        y = y.detach().numpy()
+
+    # Cast to float32 before the dot-product, as integer inputs can overflow and lose precision
+    x = x.astype(np.float32).flatten()
+    y = y.astype(np.float32).flatten()
+    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
+
+
+def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1):
+    """Test if the embeddings of text and image are semantically similar
+    using a small set of example text-image pairs."""
+
+    texts = [
+        "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
+        "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
+        "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+        "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
+        "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
+    ]
+
+    image_urls = [
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
+        "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
+    ]
+    assert len(texts) == len(image_urls), "Number of texts and images should be the same."
+
+    images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls]
+    count_pairs = len(texts)
+
+    # Ensure we have a sufficiently large batch
+    texts = texts * batch_size_multiple
+    images = images * batch_size_multiple
+
+    # Compute the embeddings in a batched fashion
+    text_embeddings = text_to_embedding(texts)
+    image_embeddings = image_to_embedding(images)
+
+    # Evaluate cosine similarity
+    for i in range(count_pairs):
+        pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i])
+        other_text_similarities = [
+            cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i
+        ]
+        other_image_similarities = [
+            cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i
+        ]
+
+        assert pair_similarity > max(
+            other_text_similarities
+        ), "Text should be more similar to its corresponding image than to other images."
+        assert pair_similarity > max(
+            other_image_similarities
+        ), "Image should be more similar to its corresponding text than to other texts."
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models)
+def test_torch_one_embedding(model_name: str):
+    processors, models = get_model(model_name, token=token, backend="torch")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    text = "a small red panda in a zoo"
+    image_path = "assets/unum.png"
+
+    image = Image.open(image_path)
+    image_data = processor_image(image)
+    text_data = processor_text(text)
+
+    image_features, image_embedding = model_image.encode(image_data, return_features=True)
+    text_features, text_embedding = model_text.encode(text_data, return_features=True)
+
+    assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
+    assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+
+    # Test if the model outputs actually make sense
+    cross_references_image_and_text_embeddings(
+        lambda text: model_text(processor_text(text)),
+        lambda image: model_image(processor_image(image)),
+    )
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models)
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_torch_many_embeddings(model_name: str, batch_size: int):
+
+    processors, models = get_model(model_name, token=token, backend="torch")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    texts = ["a small red panda in a zoo"] * batch_size
+    image_paths = ["assets/unum.png"] * batch_size
+
+    images = [Image.open(path) for path in image_paths]
+    image_data = processor_image(images)
+    text_data = processor_text(texts)
+
+    image_embeddings = model_image.encode(image_data, return_features=False)
+    text_embeddings = model_text.encode(text_data, return_features=False)
+
+    assert image_embeddings.shape[0] == batch_size, "Image embeddings batch size doesn't match the input"
+    assert text_embeddings.shape[0] == batch_size, "Text embeddings batch size doesn't match the input"
+
+
+@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
+@pytest.mark.parametrize("model_name", onnx_models)
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
+def test_onnx_one_embedding(model_name: str, device: str):
+
+    processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    text = "a small red panda in a zoo"
+    image_path = "assets/unum.png"
+
+    image = Image.open(image_path)
+    image_data = processor_image(image)
+    text_data = processor_text(text)
+
+    image_features, image_embedding = model_image.encode(image_data)
+    text_features, text_embedding = model_text.encode(text_data)
+
+    assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
+    assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+
+    # Nested functions are easier to debug than lambdas
+    def get_image_embedding(image_data):
+        features, embedding = model_image.encode(processor_image(image_data))
+        return embedding
+
+    def get_text_embedding(text_data):
+        features, embedding = model_text.encode(processor_text(text_data))
+        return embedding
+
+    # Test if the model outputs actually make sense
+    cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
+
+
+@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
+@pytest.mark.parametrize("model_name", onnx_models)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
+def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
+
+    processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    texts = ["a small red panda in a zoo"] * batch_size
+    image_paths = ["assets/unum.png"] * batch_size
+
+    images = [Image.open(path) for path in image_paths]
+    image_data = processor_image(images)
+    text_data = processor_text(texts)
+
+    image_embeddings = model_image.encode(image_data, return_features=False)
+    text_embeddings = model_text.encode(text_data, return_features=False)
+
+    assert image_embeddings.shape[0] == batch_size, "Image embeddings batch size doesn't match the input"
+    assert text_embeddings.shape[0] == batch_size, "Text embeddings batch size doesn't match the input"
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models[:1])
+def test_torch_multi_gpu(model_name: str):
+
+    count_cuda_devices = torch.cuda.device_count()
+    if count_cuda_devices < 2:
+        pytest.skip("Not enough CUDA devices to run multi-GPU test")
+
+    processors, models = get_model(model_name, token=token, backend="torch", device="cuda")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    import torch.nn as nn
+
+    model_text.return_features = False
+    model_image.return_features = False
+    model_text_parallel = nn.DataParallel(model_text)
+    model_image_parallel = nn.DataParallel(model_image)
+
+    # Nested functions are easier to debug than lambdas
+    def get_image_embedding(image_data):
+        preprocessed = processor_image(image_data)
+        embedding = model_image_parallel.forward(preprocessed)
+        return embedding.detach().cpu().numpy()
+
+    def get_text_embedding(text_data):
+        preprocessed = processor_text(text_data)
+        embedding = model_text_parallel.forward(preprocessed)
+        return embedding.detach().cpu().numpy()
+
+    # Test if the model outputs actually make sense
+    cross_references_image_and_text_embeddings(
+        get_text_embedding,
+        get_image_embedding,
+        batch_size_multiple=count_cuda_devices,
+    )
+
+
+if __name__ == "__main__":
+    # To run this test file on its own, uncomment the following line:
+    # pytest.main(["-s", "-x", __file__])
+    pass
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 1ecb242..7af8b75 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,59 +1,191 @@
-from json import load
-from os.path import join
-from typing import Mapping, Optional, Tuple
-
-from huggingface_hub import snapshot_download
-
-
-def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]:
-    import torch
-
-    model_path = snapshot_download(repo_id=model_name, token=token)
-    config_path = join(model_path, "torch_config.json")
-
-    state = torch.load(join(model_path, "torch_weight.pt"))
-    return config_path, state, join(model_path, "tokenizer.json")
-
-
-def get_model(model_name: str, token: Optional[str] = None):
-    from uform.torch_models import VLM
-    from uform.torch_preprocessor import TorchProcessor
-
-    config_path, state, tokenizer_path = get_checkpoint(model_name, token)
-
-    with open(config_path) as f:
-        config = load(f)
-
-    model = VLM(config, tokenizer_path)
-    model.image_encoder.load_state_dict(state["image_encoder"])
-    model.text_encoder.load_state_dict(state["text_encoder"])
-    processor = TorchProcessor(config, tokenizer_path)
-
-    return model.eval(), processor
-
-
-def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
-    from uform.onnx_models import VLM_ONNX
-    from uform.numpy_preprocessor import NumPyProcessor
-
-    assert device in (
-        "cpu",
-        "gpu",
-    ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`"
-    assert dtype in (
-        "fp32",
-        "fp16",
-    ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)"
-    assert (
-        device == "cpu" and dtype == "fp32"
-    ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported"
-
-    model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token)
-
-    with open(join(model_path, "config.json")) as f:
-        config = load(f)
-
-    model = VLM_ONNX(model_path, config, device, dtype)
-    processor = NumPyProcessor(config, join(model_path, "tokenizer.json"))
-
-    return model, processor
+from os.path import join, exists
+from typing import Dict, Optional, Tuple, Literal, Union, Callable
+
+from huggingface_hub import snapshot_download, utils
+
+from uform.shared import ExecutionProviderError, Modality
+
+
+def _normalize_modalities(modalities: Optional[Tuple[Union[str, Modality], ...]]) -> Tuple[Modality, ...]:
+    if modalities is None:
+        return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER)
+
+    return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities)
+
+
+def get_checkpoint(
+    model_name: str,
+    modalities: Tuple[Union[str, Modality], ...],
+    token: Optional[str] = None,
+    format: Literal[".pt", ".onnx"] = ".pt",
+) -> Tuple[str, Dict[Modality, str], Optional[str]]:
+    """Downloads a model checkpoint from the Hugging Face Hub.
+
+    :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small`
+    :param modalities: The modalities to download, like `("text_encoder", "image_encoder")`
+    :param token: The Hugging Face API token, if required
+    :param format: The format of the model checkpoint, either `.pt` or `.onnx`
+    :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path
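+
+    A minimal usage sketch (the model name below is one of the checkpoints exercised by the tests, not a requirement):
+
+        config_path, modality_paths, tokenizer_path = get_checkpoint(
+            "unum-cloud/uform3-image-text-english-small",
+            modalities=(Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER),
+            format=".onnx",
+        )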
+    """
+
+    modalities = _normalize_modalities(modalities)
+
+    # It is not recommended to use `.pth` extension when checkpointing models
+    # because it collides with Python path (`.pth`) configuration files.
+    merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]]
+    separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities]
+    config_names = ["torch_config.json", "config.json"]
+    tokenizer_names = ["tokenizer.json"]
+
+    old_progress_behavior = utils.are_progress_bars_disabled()
+    utils.disable_progress_bars()
+
+    # The download stats depend on the number of times the `config.json` is pulled
+    # https://huggingface.co/docs/hub/models-download-stats
+    model_path = snapshot_download(
+        repo_id=model_name,
+        token=token,
+        allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
+    )
+
+    if old_progress_behavior:
+        utils.enable_progress_bars()
+
+    # Find the first name in `config_names` that is present
+    config_path = None
+    for config_name in config_names:
+        if exists(join(model_path, config_name)):
+            config_path = join(model_path, config_name)
+            break
+
+    # Same for the tokenizer
+    tokenizer_path = None
+    for tokenizer_name in tokenizer_names:
+        if exists(join(model_path, tokenizer_name)):
+            tokenizer_path = join(model_path, tokenizer_name)
+            break
+
+    # Prefer a single merged checkpoint, if one is present.
+    # Otherwise, collect the separate per-modality files into a dictionary.
+    modality_paths = None
+    for file_name in merged_model_names:
+        if exists(join(model_path, file_name)):
+            modality_paths = join(model_path, file_name)
+            break
+
+    if modality_paths is None:
+        modality_paths = {}
+        for separate_modality_name in separate_modality_names:
+            if exists(join(model_path, separate_modality_name)):
+                modality_name, _, _ = separate_modality_name.partition(".")
+                modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name)
+
+    return config_path, modality_paths, tokenizer_path
+
+
+def get_model_torch(
+    model_name: str,
+    *,
+    token: Optional[str] = None,
+    device: Literal["cpu", "cuda"] = "cpu",
+    modalities: Optional[Tuple[Union[str, Modality], ...]] = None,
+) -> Tuple[Dict[Modality, Callable], Dict]:
+    """
+    Fetches and constructs a PyTorch model with its processors based on provided modalities.
+
+    :param model_name: The identifier of the model on the Hugging Face Hub.
+    :param token: Optional API token for authenticated access to the model.
+    :param device: The device to load the model onto ('cpu' or 'cuda').
+    :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
+    :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
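+
+    A minimal sketch, mirroring the PyTorch tests (the model name and `pil_image`, a Pillow image, are assumptions):
+
+        processors, models = get_model_torch("unum-cloud/uform3-image-text-english-small", device="cpu")
+        processor_image = processors[Modality.IMAGE_ENCODER]
+        model_image = models[Modality.IMAGE_ENCODER]
+        image_features, image_embedding = model_image.encode(processor_image(pil_image), return_features=True)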
+    """
+    from uform.torch_encoders import TextEncoder, ImageEncoder
+    from uform.torch_processors import TextProcessor, ImageProcessor
+
+    modalities = _normalize_modalities(modalities)
+    config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt")
+
+    result_processors = {}
+    result_models = {}
+
+    if Modality.TEXT_ENCODER in modalities:
+        processor = TextProcessor(config_path, tokenizer_path)
+        encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER))
+        encoder = encoder.eval().to(device)
+        result_processors[Modality.TEXT_ENCODER] = processor
+        result_models[Modality.TEXT_ENCODER] = encoder
+
+    if Modality.IMAGE_ENCODER in modalities:
+        processor = ImageProcessor(config_path)
+        encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER))
+        encoder = encoder.eval().to(device)
+        result_processors[Modality.IMAGE_ENCODER] = processor
+        result_models[Modality.IMAGE_ENCODER] = encoder
+
+    return result_processors, result_models
+
+
+def get_model_onnx(
+    model_name: str,
+    *,
+    device: Literal["cpu", "cuda"] = "cpu",
+    token: Optional[str] = None,
+    modalities: Optional[Tuple[Union[str, Modality], ...]] = None,
+) -> Tuple[Dict[Modality, Callable], Dict]:
+    """
+    Fetches and constructs an ONNX model with its processors based on provided modalities.
+
+    :param model_name: The identifier of the model on the Hugging Face Hub.
+    :param device: The device on which the model will operate ('cpu' or 'cuda').
+    :param token: Optional API token for authenticated access to the model.
+    :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
+    :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
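+
+    A minimal sketch; per the test suite, `device` may also name a specific ONNX execution provider:
+
+        processors, models = get_model_onnx(
+            "unum-cloud/uform3-image-text-english-small", device="CPUExecutionProvider"
+        )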
+    """
+    from uform.onnx_encoders import TextEncoder, ImageEncoder
+    from uform.numpy_processors import TextProcessor, ImageProcessor
+
+    modalities = _normalize_modalities(modalities)
+    config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx")
+
+    result_processors = {}
+    result_models = {}
+
+    if Modality.TEXT_ENCODER in modalities:
+        processor = TextProcessor(config_path, tokenizer_path)
+        encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device)
+        result_processors[Modality.TEXT_ENCODER] = processor
+        result_models[Modality.TEXT_ENCODER] = encoder
+
+    if Modality.IMAGE_ENCODER in modalities:
+        processor = ImageProcessor(config_path)
+        encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device)
+        result_processors[Modality.IMAGE_ENCODER] = processor
+        result_models[Modality.IMAGE_ENCODER] = encoder
+
+    return result_processors, result_models
+
+
+def get_model(
+    model_name: str,
+    *,
+    device: Literal["cpu", "cuda"] = "cpu",  # change this if you have a GPU
+    backend: Literal["onnx", "torch"] = "onnx",  # lighter = better
+    modalities: Optional[Tuple[Union[str, Modality], ...]] = None,  # all by default
+    token: Optional[str] = None,  # optional HuggingFace Hub token for private models
+) -> Tuple[Dict[Modality, Callable], Dict]:
+    """
+    Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend.
+
+    :param model_name: The identifier of the model on the Hugging Face Hub.
+    :param device: The device to load the model onto ('cpu' or 'cuda').
+    :param backend: The backend framework to use ('onnx' or 'torch').
+    :param modalities: A tuple specifying the types of model components to fetch.
+    :param token: Optional API token for authenticated access to the model.
+    :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
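+
+    A minimal end-to-end sketch (the model name and query text come from the test suite, not fixed requirements):
+
+        processors, models = get_model("unum-cloud/uform3-image-text-english-small", backend="onnx")
+        processor_text = processors[Modality.TEXT_ENCODER]
+        model_text = models[Modality.TEXT_ENCODER]
+        _, text_embedding = model_text.encode(processor_text("a small red panda in a zoo"))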
+    """
+    if backend == "onnx":
+        return get_model_onnx(model_name, device=device, token=token, modalities=modalities)
+    elif backend == "torch":
+        return get_model_torch(model_name, device=device, token=token, modalities=modalities)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..b9e4423 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -3,20 +3,16 @@
 import requests
 import torch
 from PIL import Image
-from transformers import TextStreamer
-
-from uform.gen_model import VLMForCausalLM, VLMProcessor
-
-EOS_TOKEN = 32001
+from transformers import TextStreamer, AutoModel, AutoProcessor
 
 
 def parse_args():
     parser = ArgumentParser(description="Chat with UForm generative model")
 
-    parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat")
-    parser.add_argument("--image", type=str, help="", required=True)
-    parser.add_argument("--device", type=str, required=True)
-    parser.add_argument("--fp16", action="store_true")
+    parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path")
+    parser.add_argument("--image", type=str, required=True, help="Path to image or URL")
+    parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`")
+    parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference")
 
     return parser.parse_args()
 
@@ -30,22 +26,18 @@ def run_chat(opts, model, processor):
 
     messages = [{"role": "system", "content": "You are a helpful assistant."}]
     is_first_message = True
+
     if opts.image.startswith("http"):
-        image = (
-            processor.image_processor(
-                Image.open(requests.get(opts.image, stream=True).raw),
-            )
-            .unsqueeze(0)
-            .to(torch.bfloat16 if opts.fp16 else torch.float32)
-            .to(opts.device)
-        )
+        image = Image.open(requests.get(opts.image, stream=True).raw)
     else:
-        image = (
-            processor.image_processor(Image.open(opts.image))
-            .unsqueeze(0)
-            .to(torch.bfloat16 if opts.fp16 else torch.float32)
-            .to(opts.device)
-        )
+        image = Image.open(opts.image)
+
+    image = (
+        processor.feature_extractor(image)  #
+        .unsqueeze(0)
+        .to(torch.bfloat16 if opts.fp16 else torch.float32)
+        .to(opts.device)
+    )
 
     while True:
         if messages[-1]["role"] in ("system", "assistant"):
@@ -68,7 +60,7 @@ def run_chat(opts, model, processor):
                 1,
                 input_ids.shape[1] + processor.num_image_latents - 1,
             ).to(opts.device)
-            x = {
+            inputs = {
                 "input_ids": input_ids,
                 "attention_mask": attention_mask,
                 "images": image,
@@ -76,18 +68,19 @@ def run_chat(opts, model, processor):
 
             print("Assistant: ", end="")
             with torch.inference_mode():
-                y = model.generate(
-                    **x,
+                output = model.generate(
+                    **inputs,
                     do_sample=False,
                     use_cache=True,
                     max_new_tokens=1024,
-                    eos_token_id=EOS_TOKEN,
+                    eos_token_id=151645,  # the chat template's end-of-turn token
                     pad_token_id=processor.tokenizer.pad_token_id,
                     streamer=streamer,
                 )
             print()
 
-            message = processor.batch_decode(y[:, x["input_ids"].shape[1] : -1])[0]
+            prompt_len = inputs["input_ids"].shape[1]
+            message = processor.batch_decode(output[:, prompt_len:-1])[0]
 
             messages.append({"role": "assistant", "content": message})
 
@@ -95,16 +88,17 @@ def run_chat(opts, model, processor):
 def main():
     try:
         opts = parse_args()
-
+        processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True)
         model = (
-            VLMForCausalLM.from_pretrained(
+            AutoModel.from_pretrained(
                 opts.model,
                 torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32,
+                ignore_mismatched_sizes=True,
+                trust_remote_code=True,
             )
             .eval()
             .to(opts.device)
         )
-        processor = VLMProcessor.from_pretrained(opts.model)
 
         run_chat(opts, model, processor)
 
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
index c03b6eb..6792120 100644
--- a/python/uform/gen_model.py
+++ b/python/uform/gen_model.py
@@ -1,464 +1 @@
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
-                                    Normalize, RandomResizedCrop, Resize,
-                                    ToTensor)
-from transformers import AutoConfig, AutoTokenizer
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.auto.modeling_auto import (AutoModel,
-                                                    AutoModelForCausalLM)
-from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding
-
-from uform.torch_models import VisualEncoder
-
-IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
-IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
-
-
-def convert_to_rgb(image):
-    return image.convert("RGB")
-
-
-class LayerScale(nn.Module):
-    def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
-        super().__init__()
-        self.weight = nn.Parameter(init_values * torch.ones(dim))
-        self.inplace = inplace
-
-    def forward(self, x):
-        return x.mul_(self.weight) if self.inplace else x * self.weight
-
-
-class ImageFeaturesPooler(nn.Module):
-    def __init__(
-        self,
-        input_size,
-        hidden_size,
-        num_attn_heads,
-        intermediate_size,
-        num_latents,
-        initializer_range,
-    ):
-        super().__init__()
-        self.projection = nn.Linear(input_size, hidden_size)
-
-        self.pooler = nn.TransformerDecoderLayer(
-            hidden_size,
-            num_attn_heads,
-            intermediate_size,
-            activation=nn.functional.silu,
-            batch_first=True,
-            norm_first=True,
-        )
-        self.image_latents = nn.Parameter(
-            torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
-        )
-
-    def forward(self, features):
-        features = self.projection(features)
-        return self.pooler(
-            self.image_latents.expand(features.shape[0], -1, -1),
-            features,
-        )
-
-
-class VLMConfig(PretrainedConfig):
-    model_type = "vlm"
-
-    def __init__(
-        self,
-        text_decoder_name_or_path: str = "",
-        tokenizer_name_or_path: str = "",
-        image_size: int = 224,
-        image_encoder_hidden_size: int = 768,
-        image_encoder_patch_size: int = 16,
-        image_encoder_num_layers: int = 12,
-        image_encoder_num_heads: int = 12,
-        image_encoder_embedding_dim: int = 256,
-        image_encoder_pooling: str = "cls",
-        image_pooler_num_attn_heads: int = 16,
-        image_pooler_intermediate_size: int = 5504,
-        image_pooler_num_latents: int = 196,
-        image_token_id: int = 32002,
-        initializer_range: float = 0.02,
-        use_cache: bool = True,
-        center_crop: bool = True,
-        **kwargs,
-    ):
-        self.text_decoder_name_or_path = text_decoder_name_or_path
-        self.tokenizer_name_or_path = tokenizer_name_or_path
-
-        self.image_size = image_size
-        self.image_encoder_hidden_size = image_encoder_hidden_size
-        self.image_encoder_patch_size = image_encoder_patch_size
-        self.image_encoder_num_layers = image_encoder_num_layers
-        self.image_encoder_num_heads = image_encoder_num_heads
-        self.image_encoder_embedding_dim = image_encoder_embedding_dim
-        self.image_encoder_pooling = image_encoder_pooling
-
-        self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
-        self.image_pooler_intermediate_size = image_pooler_intermediate_size
-        self.image_pooler_num_latents = image_pooler_num_latents
-
-        self.image_token_id = image_token_id
-
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        self.center_crop = center_crop
-
-        super().__init__(**kwargs)
-
-
-class VLMPreTrainedModel(PreTrainedModel):
-    config_class = VLMConfig
-    base_model_prefix = "vlm"
-    supports_gradient_checkpointing = True
-    _no_split_modules = []
-    _skip_keys_device_placement = "past_key_values"
-
-    def _init_weights(self, module):
-        pass
-
-    def _initialize_weights(self, module):
-        pass
-
-
-class VLMForCausalLM(VLMPreTrainedModel):
-    def __init__(self, config: VLMConfig):
-        super().__init__(config)
-
-        self.config = config
-        self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
-        self.text_config.vocab_size += 3
-        self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
-
-        self.image_encoder = VisualEncoder(
-            self.config.image_encoder_hidden_size,
-            self.config.image_encoder_patch_size,
-            self.config.image_size,
-            self.config.image_encoder_num_layers,
-            self.config.image_encoder_num_heads,
-            self.config.image_encoder_embedding_dim,
-            self.config.image_encoder_pooling,
-        )
-
-        # replace models' layerscales because `transformers` automatically renames keys in state_dict
-        for i in range(len(self.image_encoder.blocks)):
-            self.image_encoder.blocks[i].ls1 = LayerScale(
-                self.image_encoder.blocks[i].ls1.dim,
-            )
-            self.image_encoder.blocks[i].ls2 = LayerScale(
-                self.image_encoder.blocks[i].ls2.dim,
-            )
-
-        self.image_pooler = ImageFeaturesPooler(
-            self.config.image_encoder_hidden_size,
-            self.text_config.hidden_size,
-            self.config.image_pooler_num_attn_heads,
-            self.config.image_pooler_intermediate_size,
-            self.config.image_pooler_num_latents,
-            self.config.initializer_range,
-        )
-
-    def get_input_embeddings(self):
-        return self.text_decoder.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.text_decoder.set_input_embeddings(value)
-
-    def get_images_embeddings(self, images):
-        features = self.image_encoder.forward_features(images)
-        return self.image_pooler(features)
-
-    def gather_continuous_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        word_embeddings: torch.Tensor,
-        image_embeddings: torch.Tensor,
-    ) -> torch.Tensor:
-        start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
-        embeddings = []
-
-        for sample_idx, start_idx in enumerate(start_indices.tolist()):
-            embeddings.append(
-                torch.cat(
-                    (
-                        word_embeddings[sample_idx, :start_idx],
-                        image_embeddings[sample_idx],
-                        word_embeddings[sample_idx, start_idx + 1 :],
-                    ),
-                    dim=0,
-                ),
-            )
-
-        return torch.stack(embeddings, dim=0)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        images: torch.Tensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        labels: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError(
-                "You cannot specify both input_ids and inputs_embeds at the same time",
-            )
-        elif input_ids is None and inputs_embeds is None:
-            raise ValueError("You have to specify either input_is or inputs_embeds")
-
-        if inputs_embeds is None and past_key_values is None:
-            inputs_embeds = self.get_input_embeddings()(input_ids)
-
-            if images is not None:
-                image_embeds = self.get_images_embeddings(images)
-                inputs_embeds = self.gather_continuous_embeddings(
-                    input_ids,
-                    inputs_embeds,
-                    image_embeds,
-                )
-
-        if position_ids is None:
-            seq_length = (
-                inputs_embeds.shape[1]
-                if inputs_embeds is not None
-                else input_ids.shape[1]
-            )
-            past_key_values_length = 0
-
-            if past_key_values is not None:
-                past_key_values_length = past_key_values[0][0].shape[2]
-
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length,
-                seq_length + past_key_values_length,
-                dtype=torch.long,
-                device=device,
-            )
-            position_ids = position_ids.unsqueeze(0)
-
-        outputs = self.text_decoder(
-            inputs_embeds=inputs_embeds,
-            input_ids=input_ids if past_key_values is not None else None,
-            attention_mask=attention_mask,
-            labels=labels,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            use_cache=use_cache,
-            return_dict=return_dict,
-        )
-
-        return outputs
-
-    def prepare_inputs_for_generation(
-        self,
-        input_ids,
-        images=None,
-        past_key_values=None,
-        attention_mask=None,
-        inputs_embeds=None,
-        **kwargs,
-    ):
-        if past_key_values:
-            input_ids = input_ids[:, -1:]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -1].unsqueeze(-1)
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        if images is not None:
-            model_inputs["images"] = images
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "images": images if past_key_values is None else None,
-            },
-        )
-        return model_inputs
-
-    @classmethod
-    def from_config(cls, config, **kwargs):
-        return cls._from_config(config, **kwargs)
-
-
-class VLMProcessor(ProcessorMixin):
-    def __init__(self, config, **kwargs):
-        self.feature_extractor = None
-        self.config = config
-
-        if config.center_crop:
-            self.image_processor = Compose(
-                [
-                    Resize(256, interpolation=InterpolationMode.BICUBIC),
-                    CenterCrop(config.image_size),
-                    convert_to_rgb,
-                    ToTensor(),
-                    Normalize(
-                        mean=IMAGENET_MEAN,
-                        std=IMAGENET_STD,
-                    ),
-                ],
-            )
-        else:
-            self.image_processor = Compose(
-                [
-                    RandomResizedCrop(
-                        config.image_size,
-                        scale=(0.8, 1),
-                        interpolation=InterpolationMode.BICUBIC,
-                    ),
-                    convert_to_rgb,
-                    ToTensor(),
-                    Normalize(
-                        mean=IMAGENET_MEAN,
-                        std=IMAGENET_STD,
-                    ),
-                ],
-            )
-
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            config.tokenizer_name_or_path,
-            additional_special_tokens=["<|im_end|>"],
-        )
-        self.num_image_latents = config.image_pooler_num_latents
-
-    def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
-        if texts is not None:
-            if isinstance(texts, str):
-                texts = [texts]
-
-            tokenized_texts = []
-            for text in texts:
-                messages = [
-                    {"role": "system", "content": "You are a helpful assistant."},
-                    {"role": "user", "content": f" <image> {text}"},
-                ]
-                tokenized_prompt = self.tokenizer.apply_chat_template(
-                    messages,
-                    add_generation_prompt=True,
-                    return_tensors=return_tensors,
-                )
-
-                tokenized_texts.append(tokenized_prompt)
-
-            max_len = max(len(t[0]) for t in tokenized_texts)
-            input_ids = torch.full(
-                (len(tokenized_texts), max_len),
-                fill_value=self.tokenizer.pad_token_id,
-                dtype=torch.int64,
-            )
-            attention_mask = torch.full(
-                (len(tokenized_texts), max_len),
-                fill_value=0,
-                dtype=torch.int64,
-            )
-
-            for i, tokens in enumerate(tokenized_texts):
-                input_ids[i, -len(tokens[0]) :] = tokens[0]
-                attention_mask[i, -len(tokens[0]) :] = 1
-
-            attention_mask = F.pad(
-                attention_mask,
-                pad=(0, self.num_image_latents - 1),
-                value=1,
-            )
-
-            encoding = BatchEncoding(
-                data={"input_ids": input_ids, "attention_mask": attention_mask},
-            )
-
-        if images is not None:
-            if isinstance(images, (list, tuple)):
-                image_features = torch.empty(
-                    (len(images), 3, self.config.image_size, self.config.image_size),
-                    dtype=torch.float32,
-                )
-
-                for i, image in enumerate(images):
-                    image_features[i] = self.image_processor(image)
-            else:
-                image_features = self.image_processor(images).unsqueeze(0)
-
-        if texts is not None and images is not None:
-            encoding["images"] = image_features
-            return encoding
-
-        if texts is not None:
-            return encoding
-
-        return BatchEncoding(
-            data={
-                "images": image_features,
-            },
-            tensor_type=return_tensors,
-        )
-
-    def batch_decode(self, *args, **kwargs):
-        return self.tokenizer.batch_decode(*args, **kwargs)
-
-    def decode(self, *args, **kwargs):
-        return self.tokenizer.decode(*args, **kwargs)
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path,
-        cache_dir=None,
-        force_download: bool = False,
-        local_files_only: bool = False,
-        token=None,
-        revision: str = "main",
-        **kwargs,
-    ):
-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-        return cls(config)
-
-
-AutoConfig.register("vlm", VLMConfig)
-AutoModel.register(VLMConfig, VLMForCausalLM)
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor  # legacy path
diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py
similarity index 62%
rename from python/uform/numpy_preprocessor.py
rename to python/uform/numpy_processors.py
index a556db4..166ecf4 100644
--- a/python/uform/numpy_preprocessor.py
+++ b/python/uform/numpy_processors.py
@@ -1,29 +1,31 @@
 from os import PathLike
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Sequence
+import json
 
 from PIL.Image import Image, BICUBIC
 from tokenizers import Tokenizer
 import numpy as np
 
+from uform.shared import read_config
 
-class NumPyProcessor:
-    def __init__(self, config: Dict, tokenizer_path: PathLike):
+
+class TextProcessor:
+    def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
         """
         :param config: model config
         :param tokenizer_path: path to tokenizer file
-        :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
         """
 
-        self._image_size = config["image_encoder"]["image_size"]
-        self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
+        config = read_config(config_path)
+        if "text_encoder" in config:
+            config = config["text_encoder"]
+
+        self._max_seq_len = config["max_position_embeddings"]
         self._tokenizer = Tokenizer.from_file(tokenizer_path)
         self._tokenizer.no_padding()
-        self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
-        self.image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)[None, None]
-        self.image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)[None, None]
+        self._pad_token_idx = config["padding_idx"]
 
-    def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
+    def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]:
         """Transforms one or more strings into dictionary with tokenized strings and attention masks.
 
         :param texts: text of list of texts to tokenizer
@@ -34,7 +36,7 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]
         input_ids = np.full(
             (len(texts), self._max_seq_len),
             fill_value=self._pad_token_idx,
-            dtype=np.int64,
+            dtype=np.int32,
         )
 
         attention_mask = np.zeros(
@@ -51,13 +53,37 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]
 
         return {"input_ids": input_ids, "attention_mask": attention_mask}
 
-    def preprocess_image(self, images: Union[Image, List[Image]]) -> np.ndarray:
+
+class ImageProcessor:
+    def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
+        """
+        :param config_path: path to the model config file
+        :param tokenizer_path: unused, kept for signature parity with `TextProcessor`
+        """
+
+        config = read_config(config_path)
+        if "image_encoder" in config:
+            config = config["image_encoder"]
+
+        self._image_size = config["image_size"]
+        self._normalization_means = config["normalization_means"]
+        self._normalization_deviations = config["normalization_deviations"]
+
+        assert isinstance(self._image_size, int) and self._image_size > 0
+        assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
+        assert len(self._normalization_means) == len(self._normalization_deviations) == 3
+
+        self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None]
+        self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None]
+
+    def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray:
         """Transforms one or more Pillow images into Torch Tensors.
 
         :param images: image or list of images to preprocess
         """
 
-        if isinstance(images, list):
+        if isinstance(images, Sequence):
             batch_images = np.empty(
                 (len(images), 3, self._image_size, self._image_size),
                 dtype=np.float32,
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
new file mode 100644
index 0000000..b9c4cc4
--- /dev/null
+++ b/python/uform/onnx_encoders.py
@@ -0,0 +1,139 @@
+from os import PathLike
+from typing import Dict, Optional, Tuple, Union, Literal
+import json
+
+import onnxruntime as ort
+from numpy import ndarray
+
+from uform.shared import ExecutionProviderError
+
+
+def available_providers(device: Optional[str]) -> Tuple[str, ...]:
+    """Returns a tuple of available execution providers based on the requested device.
+    https://onnxruntime.ai/docs/execution-providers/
+
+    :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name.
+    :return: Tuple of available execution providers.
+    :raises ExecutionProviderError: If the requested device is not available.
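+
+    Rough examples (the providers actually returned depend on the local `onnxruntime` build):
+
+        providers = available_providers(None)                       # all available, GPU ones first
+        providers = available_providers("cpu")                      # CPU-capable providers only
+        providers = available_providers("CoreMLExecutionProvider")  # one explicitly named provider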
+    """
+
+    gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider")
+    cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider")
+    available = ort.get_available_providers()
+
+    # If no target device is specified, let's sort all the available ones with respect to our preference
+    if device is None:
+        preferences = gpu_providers + cpu_providers
+        filtered_preferences = tuple(provider for provider in preferences if provider in available)
+        if len(filtered_preferences):
+            return filtered_preferences
+        if len(available):
+            return available
+        raise ExecutionProviderError("No execution providers are available")
+
+    # If a GPU is requested, but no GPU providers are available, raise an error
+    if device == "gpu" or device == "cuda":
+        if all(provider not in available for provider in gpu_providers):
+            raise ExecutionProviderError(
+                f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}"
+            )
+        return [x for x in gpu_providers if x in available]
+
+    # If a CPU is requested, but no CPU providers are available, raise an error
+    if device == "cpu":
+        if all(provider not in available for provider in cpu_providers):
+            raise ExecutionProviderError(
+                f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}"
+            )
+        return [x for x in cpu_providers if x in available]
+
+    if device not in available:
+        available_names = ", ".join(available)
+        raise ExecutionProviderError(
+            f"Execution provider {device} is not available. Currently installed: {available_names}"
+        )
+
+    return (device,)
+
+
+class ImageEncoder:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        device: Literal["cpu", "cuda"] = "cpu",
+        return_features: bool = True,
+    ):
+        """
+        :param model_path: Path to the `.onnx` model file
+        :param device: Device name (`cpu` or `cuda`), or a specific ONNX execution provider
+        :param return_features: Whether `encode` should also return intermediate features by default
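+
+        A rough usage sketch (the `.onnx` path, `processor_image`, and `image` are assumptions):
+
+            encoder = ImageEncoder("image_encoder.onnx", device="cpu")
+            features, embedding = encoder.encode(processor_image(image))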
+        """
+
+        session_options = ort.SessionOptions()
+        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        self.return_features = return_features
+        self.session = ort.InferenceSession(
+            model_path,
+            sess_options=session_options,
+            providers=available_providers(device),
+        )
+
+    def encode(
+        self, images: ndarray, return_features: Optional[bool] = None
+    ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
+        features, embeddings = self.session.run(None, {"images": images})
+        return_features = return_features if return_features is not None else self.return_features
+        if return_features:
+            return features, embeddings
+        return embeddings
+
+
+class TextEncoder:
+    def __init__(
+        self,
+        model_path: str,
+        *,
+        device: Literal["cpu", "cuda"] = "cpu",
+        return_features: bool = True,
+    ):
+        """
+        :param model_path: Path to the `.onnx` text encoder model
+        :param device: Device name (`cpu` or `cuda`), or a specific ONNX execution provider
+        :param return_features: Whether `encode` should also return intermediate features by default
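+
+        A rough usage sketch (the `.onnx` path and `processor_text` are assumptions):
+
+            encoder = TextEncoder("text_encoder.onnx", device="cpu")
+            features, embedding = encoder.encode(processor_text("a small red panda in a zoo"))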
+        """
+
+        session_options = ort.SessionOptions()
+        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        self.return_features = return_features
+        self.text_encoder_session = ort.InferenceSession(
+            model_path,
+            sess_options=session_options,
+            providers=available_providers(device),
+        )
+
+    def encode(
+        self,
+        x: Union[ndarray, dict],
+        attention_mask: Optional[ndarray] = None,
+        return_features: Optional[bool] = None,
+    ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
+        if isinstance(x, dict):
+            assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
+            attention_mask = x["attention_mask"]
+            input_ids = x["input_ids"]
+        else:
+            input_ids = x
+
+        features, embeddings = self.text_encoder_session.run(
+            None,
+            {
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
+            },
+        )
+
+        return_features = return_features if return_features is not None else self.return_features
+        if return_features:
+            return features, embeddings
+        return embeddings
diff --git a/python/uform/onnx_models.py b/python/uform/onnx_models.py
deleted file mode 100644
index 8e2a87a..0000000
--- a/python/uform/onnx_models.py
+++ /dev/null
@@ -1,231 +0,0 @@
-from os.path import join
-from typing import Dict, Optional, Tuple, Union
-
-import onnxruntime as ort
-from numpy import ndarray
-
-
-class ExecutionProviderError(Exception):
-    """Exception raised when a requested execution provider is not available."""
-
-
-def available_providers(device: str) -> Tuple[str, ...]:
-    gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider")
-    cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider")
-    available = ort.get_available_providers()
-    if device == "gpu":
-        if all(provider not in available for provider in gpu_providers):
-            raise ExecutionProviderError(
-                f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}"
-            )
-        return gpu_providers
-
-    return cpu_providers
-
-
-class VisualEncoderONNX:
-    def __init__(self, model_path: str, device: str):
-        """
-        :param model_path: Path to onnx model
-        :param device: Device name, either cpu or gpu
-        """
-
-        session_options = ort.SessionOptions()
-        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-
-        self.session = ort.InferenceSession(
-            model_path,
-            sess_options=session_options,
-            providers=available_providers(device),
-        )
-
-    def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
-        return self.session.run(None, {"images": images})
-
-
-class TextEncoderONNX:
-    def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
-        """
-        :param text_encoder_path: Path to onnx of text encoder
-        :param reranker_path: Path to onnx of reranker
-        :param device: Device name, either cpu or gpu
-        """
-
-        session_options = ort.SessionOptions()
-        session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-
-        self.text_encoder_session = ort.InferenceSession(
-            text_encoder_path,
-            sess_options=session_options,
-            providers=available_providers(device),
-        )
-
-        self.reranker_session = ort.InferenceSession(
-            reranker_path,
-            sess_options=session_options,
-            providers=available_providers(device),
-        )
-
-    def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]:
-        return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
-
-    def forward_multimodal(
-        self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray
-    ) -> Tuple[ndarray, ndarray]:
-        return self.reranker_session.run(
-            None,
-            {
-                "text_features": text_features,
-                "attention_mask": attention_mask,
-                "image_features": image_features,
-            },
-        )
-
-
-class VLM_ONNX:
-    def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
-        assert device in (
-            "cpu",
-            "gpu",
-        ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`"
-        assert dtype in (
-            "fp32",
-            "fp16",
-        ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)"
-        assert (
-            device == "cpu" and dtype == "fp32"
-        ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported"
-
-        self.device = device
-        self.dtype = dtype
-
-        self._embedding_dim = config["text_encoder"]["embedding_dim"]
-        self._text_encoder_dim = config["text_encoder"]["dim"]
-        self._image_encoder_dim = config["image_encoder"]["dim"]
-
-        self.text_encoder = TextEncoderONNX(
-            join(checkpoint_path, f"text_encoder.onnx"),
-            join(checkpoint_path, f"reranker.onnx"),
-            device,
-        )
-
-        self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device)
-
-    def encode_image(
-        self,
-        images: ndarray,
-        return_features: bool = False,
-    ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
-        """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings.
-
-        :param images: Preprocessed image
-        :param return_features: Whether to return images features or return only embeddings
-        """
-
-        features, embeddings = self.image_encoder(images)
-
-        if return_features:
-            return features, embeddings
-
-        return embeddings
-
-    def encode_text(
-        self,
-        texts: Dict[str, ndarray],
-        return_features: bool = False,
-    ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
-        """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings.
-
-        :param texts: Dictionary with tokenized texts and attention masks
-        :param return_features: Whether to return texts features or return only embeddings
-        """
-
-        features, embeddings = self.text_encoder(**texts)
-
-        if return_features:
-            return features, embeddings
-
-        return embeddings
-
-    def encode_multimodal(
-        self,
-        image: Optional[ndarray] = None,
-        text: Dict[str, ndarray] = None,
-        image_features: Optional[ndarray] = None,
-        text_features: Optional[ndarray] = None,
-        attention_mask: Optional[ndarray] = None,
-        return_scores: bool = False,
-    ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
-        """Passes preprocessed texts (or precomputed texts features) and
-            preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings.
-
-        :param image: Preprocessed images
-        :param text: Preprocessed texts
-        :param image_features: Precomputed images features
-        :param text_features: Precomputed text features
-        :param attention_mask: Attention masks, not required if pass `text` instead of text_features
-        """
-
-        assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None"
-        assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None"
-
-        if text_features is not None:
-            assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`"
-
-        if image_features is None:
-            image_features = self.image_encoder(image)
-
-        if text_features is None:
-            text_features = self.text_encoder(
-                text["input_ids"],
-                text["attention_mask"],
-            )
-
-        matching_scores, embeddings = self.text_encoder.forward_multimodal(
-            text_features,
-            attention_mask if attention_mask is not None else text["attention_mask"],
-            image_features,
-        )
-
-        if return_scores:
-            return matching_scores, embeddings
-
-        return embeddings
-
-    def forward(
-        self,
-        images: ndarray,
-        texts: Dict[str, ndarray],
-    ) -> Union[ndarray, ndarray]:
-        """Inference forward method
-
-        :param images: Preprocessed images
-        :param texts: Preprocessed texts
-        :return: embeddings for images and texts
-        """
-        _, image_embeddings = self.image_encoder(images)
-        _, text_embeddings = self.text_encoder(texts)
-        return image_embeddings, text_embeddings
-
-    @property
-    def text_features_dim(self) -> int:
-        """Dimensionality of the text encoder features."""
-
-        return self._text_encoder_dim
-
-    @property
-    def image_features_dim(self) -> int:
-        """Dimensionality of the image encoder features."""
-
-        return self._image_encoder_dim
-
-    @property
-    def embedding_dim(self) -> int:
-        """Dimensionality of shared space embedding."""
-
-        return self._embedding_dim
-
-    @property
-    def multimodal_embedding_dim(self) -> int:
-        """Dimensionality of multimodal joint embedding."""
-        return self._text_encoder_dim
diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py
deleted file mode 100644
index d3d833e..0000000
--- a/python/uform/preprocessing.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from os import PathLike
-from typing import Dict, List, Union
-
-import torch
-from PIL import Image
-from tokenizers import Tokenizer
-from torch import Tensor
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
-                                    Normalize, Resize, ToTensor)
-
-
-# lambda is not pickable
-def convert_to_rgb(image):
-    return image.convert("RGB")
-
-
-class Processor:
-    def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"):
-        """
-        :param config: model config
-        :param tokenizer_path: path to tokenizer file
-        :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
-        """
-
-        assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`"
-
-        self._image_size = config["image_encoder"]["image_size"]
-        self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
-        self._tokenizer = Tokenizer.from_file(tokenizer_path)
-        self._tokenizer.no_padding()
-        self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
-        self.tensor_type = tensor_type
-
-        self._image_transform = Compose(
-            [
-                Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
-                convert_to_rgb,
-                CenterCrop(self._image_size),
-                ToTensor(),
-                Normalize(
-                    mean=(0.48145466, 0.4578275, 0.40821073),
-                    std=(0.26862954, 0.26130258, 0.27577711),
-                ),
-            ],
-        )
-
-    def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
-        """Transforms one or more strings into dictionary with tokenized strings and attention masks.
-
-        :param texts: text of list of texts to tokenizer
-        """
-        if isinstance(texts, str):
-            texts = [texts]
-
-        input_ids = torch.full(
-            (len(texts), self._max_seq_len),
-            fill_value=self._pad_token_idx,
-            dtype=torch.int64,
-        )
-
-        attention_mask = torch.zeros(
-            len(texts),
-            self._max_seq_len,
-            dtype=torch.int32,
-        )
-        encoded = self._tokenizer.encode_batch(texts)
-
-        for i, seq in enumerate(encoded):
-            seq_len = min(len(seq), self._max_seq_len)
-            input_ids[i, :seq_len] = torch.LongTensor(
-                seq.ids[: self._max_seq_len],
-            )
-            attention_mask[i, :seq_len] = 1
-
-        if self.tensor_type == "np":
-            return {
-                "input_ids": input_ids.numpy(),
-                "attention_mask": attention_mask.numpy(),
-            }
-
-        return {"input_ids": input_ids, "attention_mask": attention_mask}
-
-    def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
-        """Transforms one or more Pillow images into Torch Tensors.
-
-        :param images: image or list of images to preprocess
-        """
-
-        if isinstance(images, list):
-            batch_images = torch.empty(
-                (len(images), 3, self._image_size, self._image_size),
-                dtype=torch.float32,
-            )
-
-            for i, image in enumerate(images):
-                batch_images[i] = self._image_transform(image)
-
-        else:
-            batch_images = self._image_transform(images).unsqueeze(0)
-
-        if self.tensor_type == "np":
-            return batch_images.numpy()
-
-        return batch_images
diff --git a/python/uform/shared.py b/python/uform/shared.py
new file mode 100644
index 0000000..37d256b
--- /dev/null
+++ b/python/uform/shared.py
@@ -0,0 +1,26 @@
+from enum import Enum
+from typing import Union
+from os import PathLike
+import json
+
+
+class Modality(Enum):
+    TEXT_ENCODER = "text_encoder"
+    IMAGE_ENCODER = "image_encoder"
+    VIDEO_ENCODER = "video_encoder"
+    TEXT_DECODER = "text_decoder"
+
+
+class ExecutionProviderError(Exception):
+    """Exception raised when a requested execution provider is not available."""
+
+
+ConfigOrPath = Union[PathLike, str, object]
+
+
+def read_config(path_or_object: ConfigOrPath) -> object:
+    if isinstance(path_or_object, (PathLike, str)):
+        with open(path_or_object, "r") as f:
+            return json.load(f)
+    else:
+        return path_or_object
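+
+
+# Usage sketch (illustrative only; the "config.json" path is a placeholder).
+# `read_config` accepts either a path to a JSON file or an already-parsed
+# mapping, so both call styles below hand the same structure to downstream code:
+#
+#   from uform.shared import Modality, read_config
+#
+#   config = read_config("config.json")                   # loaded from disk
+#   config = read_config({"text_encoder": {"dim": 768}})  # passed through as-is
+#   assert Modality.TEXT_ENCODER.value == "text_encoder"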
diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py
new file mode 100644
index 0000000..475f5b0
--- /dev/null
+++ b/python/uform/torch_decoders.py
@@ -0,0 +1,469 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.transforms import (
+    CenterCrop,
+    Compose,
+    InterpolationMode,
+    Normalize,
+    RandomResizedCrop,
+    Resize,
+    ToTensor,
+)
+from transformers import AutoConfig, AutoTokenizer
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import BatchEncoding
+
+from uform.torch_encoders import ImageEncoder
+
+IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+def convert_to_rgb(image):
+    return image.convert("RGB")
+
+
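+# A minimal LayerScale: a learnable per-channel scale applied to residual
+# branches. The scale is stored in a parameter named `weight` (rather than the
+# usual `gamma`), since `transformers` renames `gamma`/`beta` keys while
+# loading state dicts; see the note in `VLMForCausalLM.__init__`.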
+class LayerScale(nn.Module):
+    def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
+        super().__init__()
+        self.weight = nn.Parameter(init_values * torch.ones(dim))
+        self.inplace = inplace
+
+    def forward(self, x):
+        return x.mul_(self.weight) if self.inplace else x * self.weight
+
+
+class ImageFeaturesPooler(nn.Module):
+    def __init__(
+        self,
+        input_size,
+        hidden_size,
+        num_attn_heads,
+        intermediate_size,
+        num_latents,
+        initializer_range,
+    ):
+        super().__init__()
+        self.projection = nn.Linear(input_size, hidden_size)
+
+        self.pooler = nn.TransformerDecoderLayer(
+            hidden_size,
+            num_attn_heads,
+            intermediate_size,
+            activation=nn.functional.silu,
+            batch_first=True,
+            norm_first=True,
+        )
+        self.image_latents = nn.Parameter(
+            torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
+        )
+
+    def forward(self, features):
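+        # A fixed set of learned latent queries cross-attends to the projected
+        # patch features, pooling a variable number of patches into exactly
+        # `num_latents` tokens sized for the text decoder.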
+        features = self.projection(features)
+        return self.pooler(
+            self.image_latents.expand(features.shape[0], -1, -1),
+            features,
+        )
+
+
+class VLMConfig(PretrainedConfig):
+    model_type = "vlm"
+
+    def __init__(
+        self,
+        text_decoder_name_or_path: str = "",
+        tokenizer_name_or_path: str = "",
+        image_size: int = 224,
+        image_encoder_hidden_size: int = 768,
+        image_encoder_patch_size: int = 16,
+        image_encoder_num_layers: int = 12,
+        image_encoder_num_heads: int = 12,
+        image_encoder_embedding_dim: int = 256,
+        image_encoder_pooling: str = "cls",
+        image_pooler_num_attn_heads: int = 16,
+        image_pooler_intermediate_size: int = 5504,
+        image_pooler_num_latents: int = 196,
+        image_token_id: int = 32002,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        center_crop: bool = True,
+        **kwargs,
+    ):
+        self.text_decoder_name_or_path = text_decoder_name_or_path
+        self.tokenizer_name_or_path = tokenizer_name_or_path
+
+        self.image_size = image_size
+        self.image_encoder_hidden_size = image_encoder_hidden_size
+        self.image_encoder_patch_size = image_encoder_patch_size
+        self.image_encoder_num_layers = image_encoder_num_layers
+        self.image_encoder_num_heads = image_encoder_num_heads
+        self.image_encoder_embedding_dim = image_encoder_embedding_dim
+        self.image_encoder_pooling = image_encoder_pooling
+
+        self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
+        self.image_pooler_intermediate_size = image_pooler_intermediate_size
+        self.image_pooler_num_latents = image_pooler_num_latents
+
+        self.image_token_id = image_token_id
+
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.center_crop = center_crop
+
+        super().__init__(**kwargs)
+
+
+class VLMPreTrainedModel(PreTrainedModel):
+    config_class = VLMConfig
+    base_model_prefix = "vlm"
+    supports_gradient_checkpointing = True
+    _no_split_modules = []
+    _skip_keys_device_placement = "past_key_values"
+
+    def _init_weights(self, module):
+        pass
+
+    def _initialize_weights(self, module):
+        pass
+
+
+class VLMForCausalLM(VLMPreTrainedModel):
+    def __init__(self, config: VLMConfig):
+        super().__init__(config)
+
+        self.config = config
+        self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
+        self.text_config.vocab_size += 3
+        self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
+
+        self.image_encoder = ImageEncoder(
+            self.config.image_encoder_hidden_size,
+            self.config.image_encoder_patch_size,
+            self.config.image_size,
+            self.config.image_encoder_num_layers,
+            self.config.image_encoder_num_heads,
+            self.config.image_encoder_embedding_dim,
+            self.config.image_encoder_pooling,
+        )
+
+        # Replace the model's LayerScale modules, because `transformers` automatically renames keys in `state_dict`
+        for i in range(len(self.image_encoder.blocks)):
+            self.image_encoder.blocks[i].ls1 = LayerScale(
+                self.image_encoder.blocks[i].ls1.dim,
+            )
+            self.image_encoder.blocks[i].ls2 = LayerScale(
+                self.image_encoder.blocks[i].ls2.dim,
+            )
+
+        self.image_pooler = ImageFeaturesPooler(
+            self.config.image_encoder_hidden_size,
+            self.text_config.hidden_size,
+            self.config.image_pooler_num_attn_heads,
+            self.config.image_pooler_intermediate_size,
+            self.config.image_pooler_num_latents,
+            self.config.initializer_range,
+        )
+
+    def get_input_embeddings(self):
+        return self.text_decoder.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_decoder.set_input_embeddings(value)
+
+    def get_images_embeddings(self, images):
+        features = self.image_encoder.forward_features(images)
+        return self.image_pooler(features)
+
+    def gather_continuous_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        word_embeddings: torch.Tensor,
+        image_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
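+        # Each prompt is expected to contain a single `image_token_id`
+        # placeholder; its word embedding is replaced by the pooled image
+        # latents, so every sequence grows by `num_latents - 1` positions.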
+        embeddings = []
+
+        for sample_idx, start_idx in enumerate(start_indices.tolist()):
+            embeddings.append(
+                torch.cat(
+                    (
+                        word_embeddings[sample_idx, :start_idx],
+                        image_embeddings[sample_idx],
+                        word_embeddings[sample_idx, start_idx + 1 :],
+                    ),
+                    dim=0,
+                ),
+            )
+
+        return torch.stack(embeddings, dim=0)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        images: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time",
+            )
+        elif input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_is or inputs_embeds")
+
+        if inputs_embeds is None and past_key_values is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            if images is not None:
+                image_embeds = self.get_images_embeddings(images)
+                inputs_embeds = self.gather_continuous_embeddings(
+                    input_ids,
+                    inputs_embeds,
+                    image_embeds,
+                )
+
+        if position_ids is None:
+            seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1]
+            past_key_values_length = 0
+
+            if past_key_values is not None:
+                past_key_values_length = past_key_values[0][0].shape[2]
+
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        outputs = self.text_decoder(
+            inputs_embeds=inputs_embeds,
+            input_ids=input_ids if past_key_values is not None else None,
+            attention_mask=attention_mask,
+            labels=labels,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            return_dict=return_dict,
+        )
+
+        return outputs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        images=None,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        **kwargs,
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        if images is not None:
+            model_inputs["images"] = images
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "images": images if past_key_values is None else None,
+            },
+        )
+        return model_inputs
+
+    @classmethod
+    def from_config(cls, config, **kwargs):
+        return cls._from_config(config, **kwargs)
+
+
+class VLMProcessor(ProcessorMixin):
+    def __init__(self, config, **kwargs):
+        self.feature_extractor = None
+        self.config = config
+
+        if config.center_crop:
+            self.image_processor = Compose(
+                [
+                    Resize(256, interpolation=InterpolationMode.BICUBIC),
+                    CenterCrop(config.image_size),
+                    convert_to_rgb,
+                    ToTensor(),
+                    Normalize(
+                        mean=IMAGENET_MEAN,
+                        std=IMAGENET_STD,
+                    ),
+                ],
+            )
+        else:
+            self.image_processor = Compose(
+                [
+                    RandomResizedCrop(
+                        config.image_size,
+                        scale=(0.8, 1),
+                        interpolation=InterpolationMode.BICUBIC,
+                    ),
+                    convert_to_rgb,
+                    ToTensor(),
+                    Normalize(
+                        mean=IMAGENET_MEAN,
+                        std=IMAGENET_STD,
+                    ),
+                ],
+            )
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            config.tokenizer_name_or_path,
+            additional_special_tokens=["<|im_end|>"],
+        )
+        self.num_image_latents = config.image_pooler_num_latents
+
+    def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
+        if texts is not None:
+            if isinstance(texts, str):
+                texts = [texts]
+
+            tokenized_texts = []
+            for text in texts:
+                messages = [
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": f" <image> {text}"},
+                ]
+                tokenized_prompt = self.tokenizer.apply_chat_template(
+                    messages,
+                    add_generation_prompt=True,
+                    return_tensors=return_tensors,
+                )
+
+                tokenized_texts.append(tokenized_prompt)
+
+            max_len = max(len(t[0]) for t in tokenized_texts)
+            input_ids = torch.full(
+                (len(tokenized_texts), max_len),
+                fill_value=self.tokenizer.pad_token_id,
+                dtype=torch.int64,
+            )
+            attention_mask = torch.full(
+                (len(tokenized_texts), max_len),
+                fill_value=0,
+                dtype=torch.int64,
+            )
+
+            for i, tokens in enumerate(tokenized_texts):
+                input_ids[i, -len(tokens[0]) :] = tokens[0]
+                attention_mask[i, -len(tokens[0]) :] = 1
+
+            attention_mask = F.pad(
+                attention_mask,
+                pad=(0, self.num_image_latents - 1),
+                value=1,
+            )
+
+            encoding = BatchEncoding(
+                data={
+                    "input_ids": input_ids,
+                    "attention_mask": attention_mask,
+                },
+            )
+
+        if images is not None:
+            if isinstance(images, (list, tuple)):
+                image_features = torch.empty(
+                    (len(images), 3, self.config.image_size, self.config.image_size),
+                    dtype=torch.float32,
+                )
+
+                for i, image in enumerate(images):
+                    image_features[i] = self.image_processor(image)
+            else:
+                image_features = self.image_processor(images).unsqueeze(0)
+
+        if texts is not None and images is not None:
+            encoding["images"] = image_features
+            return encoding
+
+        if texts is not None:
+            return encoding
+
+        return BatchEncoding(
+            data={
+                "images": image_features,
+            },
+            tensor_type=return_tensors,
+        )
+
+    def batch_decode(self, *args, **kwargs):
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path,
+        cache_dir=None,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token=None,
+        revision: str = "main",
+        **kwargs,
+    ):
+        config = AutoConfig.from_pretrained(
+            pretrained_model_name_or_path,
+            cache_dir=cache_dir,
+            force_download=force_download,
+            local_files_only=local_files_only,
+            revision=revision,
+            token=token,
+            **kwargs,
+        )
+        return cls(config)
+
+
+AutoConfig.register("vlm", VLMConfig)
+AutoModel.register(VLMConfig, VLMForCausalLM)
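+
+# Usage sketch (illustrative only; the checkpoint name is a placeholder and the
+# weights are assumed to be hosted next to a `config.json` with `"model_type": "vlm"`):
+#
+#   from PIL import Image
+#   from transformers import AutoModel
+#   from uform.torch_decoders import VLMProcessor  # importing registers the "vlm" config type
+#
+#   model = AutoModel.from_pretrained("your-namespace/uform-gen-checkpoint")
+#   processor = VLMProcessor.from_pretrained("your-namespace/uform-gen-checkpoint")
+#   inputs = processor(texts="Describe the image", images=Image.open("photo.jpg"))
+#   output_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
+#   print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])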
diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py
similarity index 63%
rename from python/uform/torch_models.py
rename to python/uform/torch_encoders.py
index ab86622..89f6631 100644
--- a/python/uform/torch_models.py
+++ b/python/uform/torch_encoders.py
@@ -1,11 +1,23 @@
+from __future__ import annotations
+
 from dataclasses import dataclass
 from os import PathLike
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Union, Mapping, Any, Tuple
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import Tensor
+from PIL.Image import Image
+
+from uform.shared import read_config
+
+
+def _is_on_gpu(model: nn.Module) -> bool:
+    try:
+        return next(model.parameters()).device.type == "cuda"
+    except StopIteration:
+        return False
 
 
 @dataclass(eq=False)
@@ -132,7 +144,7 @@ def forward(
 
 
 @dataclass(eq=False)
-class VisualEncoderBlock(nn.Module):
+class ImageEncoderBlock(nn.Module):
     dim: int
     num_heads: int
 
@@ -219,36 +231,14 @@ def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
 
         return x
 
-    def forward_multimodal(
-        self,
-        x: Tensor,
-        attn_mask: Tensor,
-        context: Tensor,
-    ) -> Tensor:
-        context = self.context_projection(context)
-        expanded_attn_mask = self.get_attention_mask(attn_mask, x.dtype)
-        for block in self.blocks:
-            if block.cross_attention:
-                x = block(x, expanded_attn_mask, context)
-
-        return self.pool_features(x, attn_mask)
-
     def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor:
         return self.embedding_projection(self.pool_features(x, attn_mask))
 
-    def forward_matching(self, x: Tensor) -> Tensor:
-        logits = self.matching_head(x)
-        if self.head_one_neuron:
-            return torch.sigmoid(logits)[:, 0]
-
-        return F.softmax(logits, dim=1)[:, 1]
-
     def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
         if self.pooling == "cls":
             return x[:, 0]
 
         attn_mask = attn_mask.unsqueeze(2).type_as(x)
-
         return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1)
 
     def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor:
@@ -273,7 +263,8 @@ def forward(
         x: Union[Tensor, dict],
         attention_mask: Optional[Tensor] = None,
         return_features: Optional[bool] = None,
-    ) -> Tensor:
+    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+
         if isinstance(x, dict):
             assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
             attention_mask = x["attention_mask"]
@@ -282,6 +273,11 @@ def forward(
             # If no attention mask is provided - create one with all ones
             attention_mask = torch.ones_like(x)
 
+        # If the model is on the GPU and the input tensors are not, move them there
+        if _is_on_gpu(self) and not x.is_cuda:
+            x = x.cuda()
+            attention_mask = attention_mask.cuda()
+
         features = self.forward_features(x, attention_mask)
         embeddings = self.forward_embedding(features, attention_mask)
 
@@ -290,9 +286,48 @@ def forward(
             return features, embeddings
         return embeddings
 
+    def encode(
+        self,
+        x: Union[Tensor, dict],
+        attention_mask: Optional[Tensor] = None,
+        return_features: Optional[bool] = None,
+    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+
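+        # Run the regular forward pass, then detach so callers can keep the
+        # embeddings without holding onto the autograd graph.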
+        result = self.forward(x, attention_mask, return_features)
+        if isinstance(result, tuple):
+            return result[0].detach(), result[1].detach()
+        else:
+            return result.detach()
+
+    @staticmethod
+    def from_pretrained(
+        config: Union[PathLike, str, object],
+        model: Union[PathLike, str, Mapping[str, Any]],
+    ) -> TextEncoder:
+        """Load the image encoder from the given configuration and model path.
+
+        :param config: the configuration dictionary or path to the JSON configuration file
+        :param model: the model state dictionary or path to the `.pt` model file
+        """
+        config = read_config(config)
+        if "text_encoder" in config:
+            config = config["text_encoder"]
+
+        # We must strip all the non-member attributes before initializing the classes.
+        text_fields = TextEncoder.__dataclass_fields__
+        config = {k: v for k, v in config.items() if k in text_fields}
+        encoder = TextEncoder(**config)
+
+        # Load from disk
+        if isinstance(model, (PathLike, str)):
+            state = torch.load(model)
+        else:
+            state = model
+        if "text_encoder" in state:
+            state = state["text_encoder"]
+        encoder.load_state_dict(state)
+        return encoder
+
 
 @dataclass(eq=False)
-class VisualEncoder(nn.Module):
+class ImageEncoder(nn.Module):
     dim: int
     patch_size: int
     image_size: int
@@ -314,26 +349,23 @@ def __post_init__(self):
             self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim))
 
         self.blocks = nn.Sequential(
-            *[VisualEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
+            *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
         )
 
         self.norm = nn.LayerNorm(self.dim, eps=1e-6)
         self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False)
         self.return_features = False
 
-    def forward_features(self, x: Tensor) -> Tensor:
+    def forward_features(self, x: Union[Tensor, dict]) -> Tensor:
         x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1)
         x = x + self.pos_embed
-
         special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)]
 
         if self.num_reg_tokens > 0:
             special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1))
 
         x = torch.cat(special_tokens + [x], dim=1)
-
         x = self.blocks(x)
-
         return self.norm(x)
 
     def forward_embedding(self, x: Tensor) -> Tensor:
@@ -344,7 +376,14 @@ def forward_embedding(self, x: Tensor) -> Tensor:
 
         return self.embedding_projection(x)
 
-    def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
+    def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+        if isinstance(x, dict):
+            x = x["images"]
+
+        # If the model is on the GPU and the input tensors are not, move them there
+        if _is_on_gpu(self) and not x.is_cuda:
+            x = x.cuda()
+
         features = self.forward_features(x)
         embeddings = self.forward_embedding(features)
         return_features = return_features if return_features is not None else self.return_features
@@ -352,154 +391,38 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
             return features, embeddings
         return embeddings
 
+    def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+        result = self.forward(x, return_features)
+        if isinstance(result, tuple):
+            return result[0].detach(), result[1].detach()
+        else:
+            return result.detach()
 
-class VLM(nn.Module):
-    """
-    Vision-Language Model for Multimodal embeddings.
-    """
-
-    def __init__(self, config: Dict, tokenizer_path: PathLike):
-        """
-        :param config: Model config
-        """
-
-        super().__init__()
-        self._embedding_dim = config["text_encoder"]["embedding_dim"]
-
-        self.text_encoder = TextEncoder(**config["text_encoder"])
-        self.image_encoder = VisualEncoder(**config["image_encoder"])
-
-    def encode_image(
-        self,
-        images: Tensor,
-        return_features: bool = False,
-    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
-        """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings.
-
-        :param images: Preprocessed image
-        :param return_features: Whether to return images features or return only embeddings
-        """
-
-        features = self.image_encoder.forward_features(images)
-        embeddings = self.image_encoder.forward_embedding(features)
-
-        if return_features:
-            return features, embeddings
-
-        return embeddings
-
-    def encode_text(
-        self,
-        texts: Dict[str, Tensor],
-        return_features: bool = False,
-    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
-        """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings.
-
-        :param texts: Dictionary with tokenized texts and attention masks
-        :param return_features: Whether to return texts features or return only embeddings
-        """
-
-        features = self.text_encoder.forward_features(
-            texts["input_ids"],
-            texts["attention_mask"],
-        )
-        embeddings = self.text_encoder.forward_embedding(
-            features,
-            texts["attention_mask"],
-        )
-
-        if return_features:
-            return features, embeddings
-
-        return embeddings
-
-    def encode_multimodal(
-        self,
-        image: Optional[Tensor] = None,
-        text: Optional[Dict] = None,
-        image_features: Optional[Tensor] = None,
-        text_features: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        return_scores: bool = False,
-    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
-        """Passes preprocessed texts (or precomputed texts features) and
-            preprocessed images (or precomputed images features) through multimodal encoded to produce multimodal joint embeddings.
-
-        :param image: Preprocessed images
-        :param text: Preprocessed texts
-        :param image_features: Precomputed images features
-        :param text_features: Precomputed text features
-        :param attention_mask: Attention masks, not required if pass `text` instead of text_features
-        """
-
-        assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None"
-        assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None"
-
-        if text_features is not None:
-            assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`"
-
-        if image_features is None:
-            image_features = self.image_encoder.forward_features(image)
-
-        if text_features is None:
-            text_features = self.text_encoder.forward_features(
-                text["input_ids"],
-                text["attention_mask"],
-            )
-
-        embeddings = self.text_encoder.forward_multimodal(
-            text_features,
-            attention_mask if attention_mask is not None else text["attention_mask"],
-            image_features,
-        )
-
-        if return_scores:
-            return self.get_matching_scores(embeddings), embeddings
-
-        return embeddings
-
-    def get_matching_scores(self, embeddings: Tensor) -> Tensor:
-        """Computes the probability that there is a match between images and texts based on their multimodal embeddings
-
-        :param embeddings: multimodal joint embeddings
-        """
-
-        return self.text_encoder.forward_matching(embeddings)
+    @staticmethod
+    def from_pretrained(
+        config: Union[PathLike, str, object],
+        model: Union[PathLike, str, Mapping[str, Any]],
+    ) -> ImageEncoder:
+        """Load the image encoder from the given configuration and model path.
 
-    def forward(
-        self,
-        images: Tensor,
-        texts: Dict[str, Tensor],
-    ) -> Union[Tensor, Tensor]:
-        """Inference forward method
-
-        :param images: Preprocessed images
-        :param texts: Preprocessed texts
-        :return: embeddings for images and texts
+        :param config: the configuration dictionary or path to the JSON configuration file
+        :param model: the model state dictionary or path to the `.pt` model file
         """
-        _, image_embeddings = self.image_encoder(images)
-        _, text_embeddings = self.text_encoder(texts)
-        return image_embeddings, text_embeddings
-
-    @property
-    def text_features_dim(self) -> int:
-        """Dimensionality of the text encoder features."""
-
-        return self.text_encoder.dim
-
-    @property
-    def image_features_dim(self) -> int:
-        """Dimensionality of the image encoder features."""
-
-        return self.image_encoder.dim
-
-    @property
-    def embedding_dim(self) -> int:
-        """Dimensionality of shared space embedding."""
-
-        return self._embedding_dim
-
-    @property
-    def multimodal_embedding_dim(self) -> int:
-        """Dimensionality of multimodal joint embedding."""
-        return self.text_encoder.dim
+        config = read_config(config)
+        if "image_encoder" in config:
+            config = config["image_encoder"]
+
+        # We must strip all the non-member attributes before initializing the classes.
+        image_fields = ImageEncoder.__dataclass_fields__
+        config = {k: v for k, v in config.items() if k in image_fields}
+        encoder = ImageEncoder(**config)
+
+        # Load from disk
+        if isinstance(model, (PathLike, str)):
+            state = torch.load(model)
+        else:
+            state = model
+        if "image_encoder" in state:
+            state = state["image_encoder"]
+        encoder.load_state_dict(state)
+        return encoder
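+
+
+# Usage sketch (paths are placeholders): both encoders are plain `nn.Module`s
+# that load from a config plus a state dict and expose `encode`, which runs
+# `forward` and detaches the outputs for inference-time use.
+#
+#   from uform.torch_encoders import TextEncoder, ImageEncoder
+#
+#   text_encoder = TextEncoder.from_pretrained("config.json", "text_encoder.pt")
+#   image_encoder = ImageEncoder.from_pretrained("config.json", "image_encoder.pt")
+#   text_embeddings = text_encoder.encode(text_batch)    # dict with `input_ids` & `attention_mask`
+#   image_embeddings = image_encoder.encode(image_batch) # dict with an `images` tensor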
diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py
similarity index 57%
rename from python/uform/torch_preprocessor.py
rename to python/uform/torch_processors.py
index 8bdc70b..79c7e87 100644
--- a/python/uform/torch_preprocessor.py
+++ b/python/uform/torch_processors.py
@@ -1,5 +1,6 @@
 from os import PathLike
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Sequence
+import json
 
 import torch
 from PIL.Image import Image
@@ -14,43 +15,35 @@
     ToTensor,
 )
 
+from uform.shared import read_config
 
-# lambda is not pickable
+
+# lambda is not pickle-able
 def convert_to_rgb(image):
     return image.convert("RGB")
 
 
-class TorchProcessor:
-    def __init__(self, config: Dict, tokenizer_path: PathLike):
+class TextProcessor:
+    def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
         """
         :param config: model config
         :param tokenizer_path: path to tokenizer file
-        :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
         """
 
-        self._image_size = config["image_encoder"]["image_size"]
-        self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
+        config = read_config(config_path)
+        if "text_encoder" in config:
+            config = config["text_encoder"]
+
+        self._max_seq_len = config["max_position_embeddings"]
         self._tokenizer = Tokenizer.from_file(tokenizer_path)
         self._tokenizer.no_padding()
-        self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
-        self._image_transform = Compose(
-            [
-                Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
-                convert_to_rgb,
-                CenterCrop(self._image_size),
-                ToTensor(),
-                Normalize(
-                    mean=(0.48145466, 0.4578275, 0.40821073),
-                    std=(0.26862954, 0.26130258, 0.27577711),
-                ),
-            ],
-        )
+        self._pad_token_idx = config["padding_idx"]
 
-    def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
+    def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
         """Transforms one or more strings into dictionary with tokenized strings and attention masks.
 
         :param texts: text of list of texts to tokenizer
+        :return: dictionary with tokenized strings and attention masks as values
         """
         if isinstance(texts, str):
             texts = [texts]
@@ -77,13 +70,46 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
 
         return {"input_ids": input_ids, "attention_mask": attention_mask}
 
-    def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
+
+class ImageProcessor:
+    def __init__(self, config_path: PathLike):
+        """
+        :param config_path: path to the model config file (or a pre-loaded config mapping)
+        """
+
+        config = read_config(config_path)
+        if "image_encoder" in config:
+            config = config["image_encoder"]
+
+        self._image_size = config["image_size"]
+        self._normalization_means = config["normalization_means"]
+        self._normalization_deviations = config["normalization_deviations"]
+
+        assert isinstance(self._image_size, int) and self._image_size > 0
+        assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
+        assert len(self._normalization_means) == len(self._normalization_deviations) == 3
+
+        self._image_transform = Compose(
+            [
+                Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
+                convert_to_rgb,
+                CenterCrop(self._image_size),
+                ToTensor(),
+                Normalize(
+                    mean=tuple(self._normalization_means),
+                    std=tuple(self._normalization_deviations),
+                ),
+            ],
+        )
+
+    def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]:
         """Transforms one or more Pillow images into Torch Tensors.
 
         :param images: image or list of images to preprocess
+        :return: dictionary with the preprocessed image batch under the `images` key
         """
 
-        if isinstance(images, list):
+        if isinstance(images, Sequence):
             batch_images = torch.empty(
                 (len(images), 3, self._image_size, self._image_size),
                 dtype=torch.float32,
@@ -95,4 +121,4 @@ def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
         else:
             batch_images = self._image_transform(images).unsqueeze(0)
 
-        return batch_images
+        return {"images": batch_images}
diff --git a/swift/Embeddings.swift b/swift/Embeddings.swift
deleted file mode 100644
index 6d973ac..0000000
--- a/swift/Embeddings.swift
+++ /dev/null
@@ -1,403 +0,0 @@
-//
-//  Embeddings.swift
-//
-//
-//  Created by Ash Vardanian on 3/27/24.
-//
-import Accelerate
-import CoreGraphics
-import CoreML
-import Foundation
-import Hub  // `Config`
-import Tokenizers  // `AutoTokenizer`
-
-public enum Embedding {
-    case i32s([Int32])
-    case f16s([Float16])
-    case f32s([Float32])
-    case f64s([Float64])
-
-    init?(from multiArray: MLMultiArray) {
-        switch multiArray.dataType {
-        case .float64:
-            self = .f64s(
-                Array(
-                    UnsafeBufferPointer(
-                        start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self),
-                        count: Int(truncating: multiArray.shape[1])
-                    )
-                )
-            )
-        case .float32:
-            self = .f32s(
-                Array(
-                    UnsafeBufferPointer(
-                        start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self),
-                        count: Int(truncating: multiArray.shape[1])
-                    )
-                )
-            )
-        case .float16:
-            self = .f16s(
-                Array(
-                    UnsafeBufferPointer(
-                        start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self),
-                        count: Int(truncating: multiArray.shape[1])
-                    )
-                )
-            )
-        case .int32:
-            self = .i32s(
-                Array(
-                    UnsafeBufferPointer(
-                        start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self),
-                        count: Int(truncating: multiArray.shape[1])
-                    )
-                )
-            )
-        @unknown default:
-            return nil  // return nil for unsupported data types
-        }
-    }
-
-    public func asFloats() -> [Float] {
-        switch self {
-        case .f32s(let array):
-            return array
-        case .i32s(let array):
-            return array.map { Float($0) }
-        case .f16s(let array):
-            return array.map { Float($0) }
-        case .f64s(let array):
-            return array.map { Float($0) }
-        }
-    }
-}
-
-// MARK: - Helpers
-
-func readConfig(fromPath path: String) throws -> [String: Any] {
-    // If it's not an absolute path, let's assume it's a path relative to the current working directory
-    let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
-    let data = try Data(contentsOf: URL(fileURLWithPath: absPath))
-    return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any]
-}
-
-func readModel(fromURL modelURL: URL) throws -> MLModel {
-    let compiledModelURL = try MLModel.compileModel(at: modelURL)
-    return try MLModel(contentsOf: compiledModelURL)
-}
-
-func readModel(fromPath path: String) throws -> MLModel {
-    // If it's not an absolute path, let's assume it's a path relative to the current working directory
-    let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
-    let modelURL = URL(fileURLWithPath: absPath, isDirectory: true)
-    return try readModel(fromURL: modelURL)
-}
-
-// MARK: - Encoders
-
-public class TextEncoder {
-    let model: MLModel
-    let processor: TextProcessor
-
-    public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws {
-        let finalConfigPath = configPath ?? modelPath + "/config.json"
-        let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
-        self.model = try readModel(fromPath: modelPath)
-        self.processor = try TextProcessor(configPath: finalConfigPath, tokenizerPath: finalTokenizerPath, model: self.model)
-    }
-
-    
-    public init(modelName: String, hubApi: HubApi = .shared) async throws {
-        let repo = Hub.Repo(id: modelName)
-        let modelURL = try await hubApi.snapshot(from: repo, matching: ["text.mlpackage/*", "config.json", "tokenizer.json"])
-        let configPath = modelURL.appendingPathComponent("config.json").path
-        let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
-        self.model = try readModel(fromURL: modelURL.appendingPathComponent("text.mlpackage", isDirectory: true))
-        self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
-    }
-
-    public func forward(with text: String) throws -> Embedding {
-        let inputFeatureProvider = try self.processor.preprocess(text)
-        let prediction = try self.model.prediction(from: inputFeatureProvider)
-        guard let predictionFeature = prediction.featureValue(for: "embeddings"),
-            let output = predictionFeature.multiArrayValue,
-            let embedding = Embedding(from: output)
-        else {
-            throw NSError(
-                domain: "TextEncoder",
-                code: 0,
-                userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."]
-            )
-        }
-        return embedding
-    }
-}
-
-public class ImageEncoder {
-    let model: MLModel
-    let processor: ImageProcessor
-
-    public init(modelPath: String, configPath: String? = nil) throws {
-        let finalConfigPath = configPath ?? modelPath + "/config.json"
-        self.model = try readModel(fromPath: modelPath)
-        self.processor = try ImageProcessor(configPath: finalConfigPath)
-    }
-
-    public init(modelName: String, hubApi: HubApi = .shared) async throws {
-        let repo = Hub.Repo(id: modelName)
-        let modelURL = try await hubApi.snapshot(from: repo, matching: ["image.mlpackage/*", "config.json"])
-        let configPath = modelURL.appendingPathComponent("config.json").path
-        self.model = try readModel(fromURL: modelURL.appendingPathComponent("image.mlpackage", isDirectory: true))
-        self.processor = try ImageProcessor(configPath: configPath)
-    }
-    
-    public func forward(with image: CGImage) throws -> Embedding {
-        let inputFeatureProvider = try self.processor.preprocess(image)
-        let prediction = try self.model.prediction(from: inputFeatureProvider)
-        guard let predictionFeature = prediction.featureValue(for: "embeddings"),
-            let output = predictionFeature.multiArrayValue,
-            let embedding = Embedding(from: output)
-        else {
-            throw NSError(
-                domain: "ImageEncoder",
-                code: 0,
-                userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."]
-            )
-        }
-        return embedding
-    }
-}
-
-// MARK: - Processors
-
-class TextProcessor {
-    let tokenizer: Tokenizer
-    let minContextLength: Int
-    let maxContextLength: Int
-
-    public init(configPath: String, tokenizerPath: String, model: MLModel) throws {
-        var configDict = try readConfig(fromPath: configPath)
-        let tokenizerDict = try readConfig(fromPath: tokenizerPath)
-
-        // Check if there's a specific 'text_encoder' configuration within the main configuration
-        if let textEncoderConfig = configDict["text_encoder"] as? [String: Any] {
-            configDict = textEncoderConfig  // Use the specific 'text_encoder' configuration
-        }
-
-        let config = Config(configDict)
-        let tokenizerData = Config(tokenizerDict)
-        self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData)
-
-        let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"]
-        guard let shapeConstraint = inputDescription?.multiArrayConstraint?.shapeConstraint else {
-            fatalError("Cannot obtain shape information")
-        }
-
-        switch shapeConstraint.type {
-        case .enumerated:
-            minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue
-            maxContextLength = minContextLength
-        case .range:
-            let range = inputDescription?.multiArrayConstraint?.shapeConstraint.sizeRangeForDimension[1] as? NSRange
-            minContextLength = range?.location ?? 1
-            maxContextLength = range?.length ?? 128
-        case .unspecified:
-            minContextLength = 128
-            maxContextLength = 128
-        @unknown default:
-            minContextLength = 128
-            maxContextLength = 128
-        }
-    }
-
-    public func preprocess(_ text: String) throws -> MLFeatureProvider {
-        let inputIDs = self.tokenizer.encode(text: text)
-        return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength)
-    }
-}
-
-class ImageProcessor {
-    let imageSize: Int
-    let mean: [Float] = [0.485, 0.456, 0.406]  // Common mean values for normalization
-    let std: [Float] = [0.229, 0.224, 0.225]  // Common std values for normalization
-
-    init(configPath: String) throws {
-        var configDict = try readConfig(fromPath: configPath)
-        // Check if there's a specific 'image_encoder' configuration within the main configuration
-        if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
-            configDict = imageEncoderConfig
-        }
-        
-        let config = Config(configDict)
-        self.imageSize = config.imageSize!.intValue!
-    }
-
-    func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider {
-        // Populate a tensor of size 3 x `imageSize` x `imageSize`,
-        // by resizing the image, then performing a center crop.
-        // Then normalize with the `mean` and `std` and export as a provider.
-        let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize)!
-        let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)!
-        let featureValue = MLFeatureValue(multiArray: normalized)
-        return try ImageInput(precomputedFeature: featureValue)
-    }
-
-    private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? {
-        let originalWidth = CGFloat(image.width)
-        let originalHeight = CGFloat(image.height)
-
-        // Calculate new size preserving the aspect ratio
-        let widthRatio = CGFloat(imageSize) / originalWidth
-        let heightRatio = CGFloat(imageSize) / originalHeight
-        let scaleFactor = max(widthRatio, heightRatio)
-
-        let scaledWidth = originalWidth * scaleFactor
-        let scaledHeight = originalHeight * scaleFactor
-
-        // Calculate the crop rectangle
-        let dx = (scaledWidth - CGFloat(imageSize)) / 2.0
-        let dy = (scaledHeight - CGFloat(imageSize)) / 2.0
-        guard
-            let context = CGContext(
-                data: nil,
-                width: imageSize,
-                height: imageSize,
-                bitsPerComponent: image.bitsPerComponent,
-                bytesPerRow: 0,
-                space: image.colorSpace ?? CGColorSpaceCreateDeviceRGB(),
-                bitmapInfo: image.bitmapInfo.rawValue
-            )
-        else { return nil }
-
-        // Draw the scaled and cropped image in the context
-        context.interpolationQuality = .high
-        context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight))
-        return context.makeImage()
-    }
-
-    private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? {
-        let width = image.width
-        let height = image.height
-
-        // Prepare the bitmap context for drawing the image.
-        var pixelData = [UInt8](repeating: 0, count: width * height * 4)
-        let colorSpace = CGColorSpaceCreateDeviceRGB()
-        let context = CGContext(
-            data: &pixelData,
-            width: width,
-            height: height,
-            bitsPerComponent: 8,
-            bytesPerRow: 4 * width,
-            space: colorSpace,
-            bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
-        )
-        context?.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
-
-        // Normalize the pixel data
-        var floatPixels = [Float](repeating: 0, count: width * height * 3)
-        for c in 0 ..< 3 {
-            for i in 0 ..< (width * height) {
-                floatPixels[i * 3 + c] = (Float(pixelData[i * 4 + c]) / 255.0 - mean[c]) / std[c]
-            }
-        }
-
-        // Create the tensor array
-        var tensor = [Float](repeating: 0, count: 3 * width * height)
-        for i in 0 ..< (width * height) {
-            for c in 0 ..< 3 {
-                tensor[c * width * height + i] = floatPixels[i * 3 + c]
-            }
-        }
-
-        let multiArray = try? MLMultiArray(
-            shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
-            dataType: .float32
-        )
-        for i in 0 ..< tensor.count {
-            multiArray?[i] = NSNumber(value: tensor[i])
-        }
-        return multiArray
-    }
-
-}
-
-// MARK: - Feature Providers
-
-class TextInput: MLFeatureProvider {
-    var inputIDs: [Int]
-    var sequenceLength: Int
-    var paddingID: Int
-
-    init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) {
-        self.inputIDs = inputIDs
-        self.sequenceLength = sequenceLength
-        self.paddingID = paddingID
-    }
-
-    var featureNames: Set<String> {
-        return Set(["input_ids", "attention_mask"])
-    }
-
-    // The model expects the input IDs to be an array of integers
-    // of length `sequenceLength`, padded with `paddingID` if necessary
-    func featureValue(for featureName: String) -> MLFeatureValue? {
-        switch featureName {
-        case "input_ids", "attention_mask":
-            return createFeatureValue(for: featureName)
-        default:
-            return nil
-        }
-    }
-
-    private func createFeatureValue(for featureName: String) -> MLFeatureValue? {
-        let count = min(inputIDs.count, sequenceLength)
-        let totalElements = sequenceLength
-        guard let multiArray = try? MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else {
-            return nil
-        }
-
-        if featureName == "input_ids" {
-            for i in 0 ..< count {
-                multiArray[i] = NSNumber(value: inputIDs[i])
-            }
-            for i in count ..< totalElements {
-                multiArray[i] = NSNumber(value: paddingID)
-            }
-        }
-        else if featureName == "attention_mask" {
-            for i in 0 ..< count {
-                multiArray[i] = NSNumber(value: 1)
-            }
-            for i in count ..< totalElements {
-                multiArray[i] = NSNumber(value: 0)
-            }
-        }
-
-        return MLFeatureValue(multiArray: multiArray)
-    }
-}
-
-class ImageInput: MLFeatureProvider {
-    var precomputedFeature: MLFeatureValue
-
-    init(precomputedFeature: MLFeatureValue) throws {
-        self.precomputedFeature = precomputedFeature
-    }
-
-    var featureNames: Set<String> {
-        return Set(["input"])
-    }
-
-    // The model expects the input IDs to be an array of integers
-    // of length `sequenceLength`, padded with `paddingID` if necessary
-    func featureValue(for featureName: String) -> MLFeatureValue? {
-        switch featureName {
-        case "input":
-            return precomputedFeature
-        default:
-            return nil
-        }
-    }
-}
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
new file mode 100644
index 0000000..509ad11
--- /dev/null
+++ b/swift/Encoders.swift
@@ -0,0 +1,505 @@
+//
+//  Encoders.swift
+//
+//
+//  Created by Ash Vardanian on 3/27/24.
+//
+import Accelerate
+import CoreGraphics
+import CoreML
+import Foundation
+import Hub  // `Config`
+import Tokenizers  // `AutoTokenizer`
+
+/// Defines custom errors related to the encoder's functionality.
+enum EncoderError: Error {
+    case downloadError(String)
+    case loadingError(String)
+    case invalidInput(String)
+    case modelPredictionFailed(String)
+    case unknownError(String)
+}
+
+/// Represents different types of embeddings as arrays of different numeric types.
+public enum Embedding {
+    case i32s([Int32])
+    case f16s([Float16])
+    case f32s([Float32])
+    case f64s([Float64])
+
+    /// Initializes an embedding from a `MLMultiArray`.
+    /// - Parameter multiArray: The MLMultiArray to convert into an Embedding.
+    /// - Returns: nil if the data type is unsupported.
+    init?(from multiArray: MLMultiArray) {
+        switch multiArray.dataType {
+        case .float64:
+            self = .f64s(
+                Array(
+                    UnsafeBufferPointer(
+                        start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self),
+                        count: Int(truncating: multiArray.shape[1])
+                    )
+                )
+            )
+        case .float32:
+            self = .f32s(
+                Array(
+                    UnsafeBufferPointer(
+                        start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self),
+                        count: Int(truncating: multiArray.shape[1])
+                    )
+                )
+            )
+        case .float16:
+            self = .f16s(
+                Array(
+                    UnsafeBufferPointer(
+                        start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self),
+                        count: Int(truncating: multiArray.shape[1])
+                    )
+                )
+            )
+        case .int32:
+            self = .i32s(
+                Array(
+                    UnsafeBufferPointer(
+                        start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self),
+                        count: Int(truncating: multiArray.shape[1])
+                    )
+                )
+            )
+        @unknown default:
+            return nil
+        }
+    }
+
+    /// Converts the embedding to an array of `Float`.
+    public func asFloats() -> [Float] {
+        switch self {
+        case .f32s(let array): return array
+        case .i32s(let array): return array.map(Float.init)
+        case .f16s(let array): return array.map(Float.init)
+        case .f64s(let array): return array.map(Float.init)
+        }
+    }
+}
+
+/// Reads a JSON configuration file and returns its contents as a dictionary.
+/// - Parameter path: The file path where the configuration file is located.
+/// - Returns: A dictionary containing the configuration data.
+func readConfig(fromPath path: String) throws -> [String: Any] {
+    let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
+    let data = try Data(contentsOf: URL(fileURLWithPath: absPath))
+    return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any]
+}
+
+/// Compiles and loads a machine learning model from a URL.
+/// - Parameter modelURL: The URL where the model package is located.
+/// - Returns: An instance of `MLModel`.
+func readModel(fromURL modelURL: URL) throws -> MLModel {
+    let compiledModelURL = try MLModel.compileModel(at: modelURL)
+    return try MLModel(contentsOf: compiledModelURL)
+}
+
+/// Loads a machine learning model from a local file path.
+/// - Parameter path: The file path where the model file is located.
+/// - Returns: An instance of `MLModel`.
+func readModel(fromPath path: String) throws -> MLModel {
+    let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
+    let modelURL = URL(fileURLWithPath: absPath, isDirectory: true)
+    return try readModel(fromURL: modelURL)
+}
+
+/// Encodes text input into embeddings using a machine learning model.
+public class TextEncoder {
+    let model: MLModel
+    let processor: TextProcessor
+
+    /// Initializes a `TextEncoder` using paths for the model and configuration.
+    /// - Parameters:
+    ///   - modelPath: The path to the directory containing the machine learning model.
+    ///   - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
+    ///   - tokenizerPath: Optional. The path to the tokenizer file. Defaults to tokenizer.json in the model directory.
+    public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws {
+        let finalConfigPath = configPath ?? modelPath + "/config.json"
+        let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
+        self.model = try readModel(fromPath: modelPath)
+        self.processor = try TextProcessor(
+            configPath: finalConfigPath,
+            tokenizerPath: finalTokenizerPath,
+            model: self.model
+        )
+    }
+
+    /// Initializes a `TextEncoder` using a model name and an API for fetching models.
+    /// - Parameters:
+    ///   - modelName: The identifier for the model repository.
+    ///   - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
+    public init(modelName: String, hubApi: HubApi = .shared) async throws {
+        let repo = Hub.Repo(id: modelName)
+        let modelURL = try await hubApi.snapshot(
+            from: repo,
+            matching: ["text_encoder.mlpackage/*", "config.json", "tokenizer.json"]
+        )
+        let configPath = modelURL.appendingPathComponent("config.json").path
+        let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
+        self.model = try readModel(
+            fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)
+        )
+        self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
+    }
+
+    /// Processes text and returns embeddings. Throws an error if processing fails.
+    /// - Parameter text: The text input to encode.
+    /// - Returns: An `Embedding` object containing the model output.
+    public func encode(_ text: String) throws -> Embedding {
+        let inputFeatureProvider = try self.processor.preprocess(text)
+        guard let prediction = try? self.model.prediction(from: inputFeatureProvider),
+            let predictionFeature = prediction.featureValue(for: "embeddings"),
+            let output = predictionFeature.multiArrayValue,
+            let embedding = Embedding(from: output)
+        else {
+            throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.")
+        }
+        return embedding
+    }
+}
+
+/// Encodes image input into embeddings using a machine learning model.
+public class ImageEncoder {
+    let model: MLModel
+    let processor: ImageProcessor
+
+    /// Initializes an `ImageEncoder` using a path for the model and optionally a configuration file.
+    /// - Parameters:
+    ///   - modelPath: The path to the directory containing the machine learning model.
+    ///   - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
+    public init(modelPath: String, configPath: String? = nil) throws {
+        let finalConfigPath = configPath ?? modelPath + "/config.json"
+        self.model = try readModel(fromPath: modelPath)
+        self.processor = try ImageProcessor(configPath: finalConfigPath)
+    }
+
+    /// Initializes an `ImageEncoder` using a model name and an API for fetching models.
+    /// - Parameters:
+    ///   - modelName: The identifier for the model repository.
+    ///   - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
+    public init(modelName: String, hubApi: HubApi = .shared) async throws {
+        let repo = Hub.Repo(id: modelName)
+        let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"])
+        let configPath = modelURL.appendingPathComponent("config.json").path
+        self.model = try readModel(
+            fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)
+        )
+        self.processor = try ImageProcessor(configPath: configPath)
+    }
+
+    /// Processes an image and returns embeddings. Throws an error if processing fails.
+    /// - Parameter image: The `CGImage` to encode.
+    /// - Returns: An `Embedding` object containing the model output.
+    public func encode(_ image: CGImage) throws -> Embedding {
+        let inputFeatureProvider = try self.processor.preprocess(image)
+        guard let prediction = try? self.model.prediction(from: inputFeatureProvider),
+            let predictionFeature = prediction.featureValue(for: "embeddings"),
+            let output = predictionFeature.multiArrayValue,
+            let embedding = Embedding(from: output)
+        else {
+            throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.")
+        }
+        return embedding
+    }
+}
+
+// MARK: - Processors
+
+/// Handles the preprocessing of text data to be used by a machine learning model.
+class TextProcessor {
+    let tokenizer: Tokenizer
+    let minContextLength: Int
+    let maxContextLength: Int
+
+    /// Initializes a `TextProcessor` with specific configuration.
+    /// - Parameters:
+    ///   - configPath: The path to the configuration file specifying tokenizer and model configurations.
+    ///   - tokenizerPath: The path to the tokenizer configuration.
+    ///   - model: The machine learning model to be used with this processor.
+    /// - Throws: An error if the configuration is invalid or missing necessary components.
+    public init(configPath: String, tokenizerPath: String, model: MLModel) throws {
+        var configDict = try readConfig(fromPath: configPath)
+        let tokenizerDict = try readConfig(fromPath: tokenizerPath)
+
+        // Check if there's a specific 'text_encoder' configuration within the main configuration
+        if let textEncoderConfig = configDict["text_encoder"] as? [String: Any] {
+            configDict = textEncoderConfig  // Use the specific 'text_encoder' configuration
+        }
+
+        // Initialize the tokenizer with its configuration.
+        let config = Config(configDict)
+        let tokenizerData = Config(tokenizerDict)
+        self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData)
+
+        // Extract the model's input shape constraints.
+        guard let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"],
+            let multiArrayConstraint = inputDescription.multiArrayConstraint
+        else {
+            throw EncoderError.invalidInput("Cannot obtain shape information from the model.")
+        }
+
+        // Determine the context length constraints based on the model's input shape constraint.
+        let shapeConstraint = multiArrayConstraint.shapeConstraint
+        switch shapeConstraint.type {
+        case .enumerated:
+            minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue
+            maxContextLength = minContextLength
+        case .range:
+            guard let range = shapeConstraint.sizeRangeForDimension[1] as? NSRange else {
+                throw EncoderError.unknownError("Model input shape has a range constraint that cannot be interpreted.")
+            }
+            minContextLength = range.location
+            maxContextLength = range.length
+        case .unspecified:
+            throw EncoderError.unknownError("Model input shape is unspecified.")
+        @unknown default:
+            throw EncoderError.unknownError("Unknown model input shape constraint type.")
+        }
+    }
+
+    /// Preprocesses a string of text into a format suitable for model prediction.
+    /// - Parameter text: The text to preprocess.
+    /// - Returns: A `MLFeatureProvider` containing the processed text ready for the model.
+    /// - Throws: An error if the text encoding fails.
+    public func preprocess(_ text: String) throws -> MLFeatureProvider {
+        let inputIDs = self.tokenizer.encode(text: text)
+        return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength)
+    }
+}
+
+/// Handles the preprocessing of image data to be used by a machine learning model.
+class ImageProcessor {
+    let imageSize: Int
+    let mean: [Float]
+    let std: [Float]
+
+    /// Initializes an `ImageProcessor` with specific configuration.
+    /// - Parameter configPath: The path to the configuration file specifying image size, mean, and std.
+    init(configPath: String) throws {
+        var configDict = try readConfig(fromPath: configPath)
+        if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
+            configDict = imageEncoderConfig
+        }
+
+        let config = Config(configDict)
+        guard let imageSize = config.imageSize?.value as? Int else {
+            throw EncoderError.invalidInput("Invalid or missing image size.")
+        }
+        self.imageSize = imageSize
+
+        guard let meanArray = config.normalizationMeans?.value as? [Any],
+            let stdArray = config.normalizationDeviations?.value as? [Any]
+        else {
+            throw EncoderError.invalidInput("Normalization means or deviations are missing.")
+        }
+
+        self.mean = try meanArray.compactMap({
+            guard let doubleValue = $0 as? Double else {
+                throw EncoderError.invalidInput("Normalization means should be an array of floats.")
+            }
+            return Float(doubleValue)
+        })
+
+        self.std = try stdArray.compactMap({
+            guard let doubleValue = $0 as? Double else {
+                throw EncoderError.invalidInput("Normalization deviations should be an array of floats.")
+            }
+            return Float(doubleValue)
+        })
+
+        // Check if the arrays have 3 values for the 3 channels
+        if self.mean.count != 3 || self.std.count != 3 {
+            throw EncoderError.invalidInput("Normalization means should contain 3 values.")
+        }
+    }
+
+    /// Preprocesses a `CGImage` into a format suitable for model prediction.
+    /// - Parameter cgImage: The image to preprocess.
+    /// - Returns: An `MLFeatureProvider` containing the preprocessed image data.
+    func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider {
+        guard let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize),
+            let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)
+        else {
+            throw EncoderError.invalidInput("Image preprocessing failed.")
+        }
+        let featureValue = MLFeatureValue(multiArray: normalized)
+        return try ImageInput(precomputedFeature: featureValue)
+    }
+
+    private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? {
+        let originalWidth = CGFloat(image.width)
+        let originalHeight = CGFloat(image.height)
+
+        let widthRatio = CGFloat(imageSize) / originalWidth
+        let heightRatio = CGFloat(imageSize) / originalHeight
+        let scaleFactor = max(widthRatio, heightRatio)
+
+        let scaledWidth = originalWidth * scaleFactor
+        let scaledHeight = originalHeight * scaleFactor
+
+        let dx = (scaledWidth - CGFloat(imageSize)) / 2.0
+        let dy = (scaledHeight - CGFloat(imageSize)) / 2.0
+        guard
+            let context = CGContext(
+                data: nil,
+                width: imageSize,
+                height: imageSize,
+                bitsPerComponent: image.bitsPerComponent,
+                bytesPerRow: 0,
+                space: image.colorSpace ?? CGColorSpaceCreateDeviceRGB(),
+                bitmapInfo: image.bitmapInfo.rawValue
+            )
+        else { return nil }
+
+        // Draw the scaled and cropped image in the context
+        context.interpolationQuality = .high
+        context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight))
+        return context.makeImage()
+    }
+
+    private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? {
+        let width = image.width
+        let height = image.height
+
+        // Prepare the bitmap context for drawing the image.
+        var pixelData = [UInt8](repeating: 0, count: width * height * 4)
+        let colorSpace = CGColorSpaceCreateDeviceRGB()
+        guard
+            let context = CGContext(
+                data: &pixelData,
+                width: width,
+                height: height,
+                bitsPerComponent: 8,
+                bytesPerRow: 4 * width,
+                space: colorSpace,
+                bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+            )
+        else { return nil }
+        context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
+
+        // While normalizing the pixels, let's also transpose them from HWC to CHW
+        let channelSize = width * height
+        var floatPixels = [Float](repeating: 0, count: channelSize * 3)
+        for i in 0 ..< channelSize {
+            floatPixels[channelSize * 0 + i] = (Float(pixelData[i * 4 + 0]) / 255.0 - mean[0]) / std[0]
+            floatPixels[channelSize * 1 + i] = (Float(pixelData[i * 4 + 1]) / 255.0 - mean[1]) / std[1]
+            floatPixels[channelSize * 2 + i] = (Float(pixelData[i * 4 + 2]) / 255.0 - mean[2]) / std[2]
+        }
+
+        // The MLMultiArray constructor can throw, so wrap it and return nil on failure
+        do {
+            let tensor = try MLMultiArray(
+                shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
+                dataType: .float32
+            )
+            for i in 0 ..< floatPixels.count {
+                tensor[i] = NSNumber(value: floatPixels[i])
+            }
+            return tensor
+        }
+        catch {
+            return nil
+        }
+    }
+}
+
+// MARK: - Feature Providers
+
+/// Provides features for text input to a machine learning model, handling padding and attention mask generation.
+class TextInput: MLFeatureProvider {
+    var inputIDs: [Int]
+    var sequenceLength: Int
+    var paddingID: Int
+
+    /// Initializes a new instance for providing text input features.
+    /// - Parameters:
+    ///   - inputIDs: Array of integer IDs representing the encoded text.
+    ///   - sequenceLength: The fixed length to which the input sequence should be padded.
+    ///   - paddingID: The integer ID used for padding shorter sequences. Defaults to 0.
+    init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) {
+        self.inputIDs = inputIDs
+        self.sequenceLength = sequenceLength
+        self.paddingID = paddingID
+    }
+
+    var featureNames: Set<String> {
+        return Set(["input_ids", "attention_mask"])
+    }
+
+    /// Returns the feature value for the specified feature name.
+    /// - Parameter featureName: The name of the feature for which the value is requested.
+    /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature.
+    func featureValue(for featureName: String) -> MLFeatureValue? {
+        switch featureName {
+        case "input_ids", "attention_mask":
+            return createFeatureValue(for: featureName)
+        default:
+            return nil
+        }
+    }
+
+    /// Creates the feature value for input IDs or attention mask based on the specified feature name.
+    /// - Parameter featureName: The name of the feature.
+    /// - Returns: An `MLFeatureValue` if the array can be created, otherwise nil.
+    private func createFeatureValue(for featureName: String) -> MLFeatureValue? {
+        let count = min(inputIDs.count, sequenceLength)
+        let totalElements = sequenceLength
+        guard let multiArray = try? MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else {
+            return nil
+        }
+
+        if featureName == "input_ids" {
+            for i in 0 ..< count {
+                multiArray[i] = NSNumber(value: inputIDs[i])
+            }
+            for i in count ..< totalElements {
+                multiArray[i] = NSNumber(value: paddingID)
+            }
+        }
+        else if featureName == "attention_mask" {
+            for i in 0 ..< count {
+                multiArray[i] = NSNumber(value: 1)
+            }
+            for i in count ..< totalElements {
+                multiArray[i] = NSNumber(value: 0)
+            }
+        }
+
+        return MLFeatureValue(multiArray: multiArray)
+    }
+}
+
+/// Provides a precomputed feature for image inputs to a machine learning model.
+class ImageInput: MLFeatureProvider {
+    var precomputedFeature: MLFeatureValue
+
+    /// Initializes a new instance with a precomputed feature.
+    /// - Parameter precomputedFeature: The `MLFeatureValue` containing the precomputed feature data.
+    /// - Throws: An error if the precomputed feature is not valid for the model.
+    init(precomputedFeature: MLFeatureValue) throws {
+        self.precomputedFeature = precomputedFeature
+    }
+
+    var featureNames: Set<String> {
+        return Set(["images"])
+    }
+
+    /// Returns the feature value for the specified feature name.
+    /// - Parameter featureName: The name of the feature for which the value is requested.
+    /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature.
+    func featureValue(for featureName: String) -> MLFeatureValue? {
+        switch featureName {
+        case "images":
+            return precomputedFeature
+        default:
+            return nil
+        }
+    }
+}
diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift
similarity index 75%
rename from swift/EmbeddingsTests.swift
rename to swift/EncodersTests.swift
index 5efb87f..645d531 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EncodersTests.swift
@@ -1,11 +1,26 @@
 import CoreGraphics
+import Hub
 import ImageIO
 import UForm
-import Hub
 import XCTest
 
 final class TokenizerTests: XCTestCase {
 
+    var hfToken: String?
+
+    override func setUp() {
+        super.setUp()
+        // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory
+        let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token")
+        if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines)
+        {
+            hfToken = token
+        }
+
+        hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"]
+        hfToken = hfToken ?? "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"
+    }
+
     func cosineSimilarity<T: FloatingPoint>(between vectorA: [T], and vectorB: [T]) -> T {
         guard vectorA.count == vectorB.count else {
             fatalError("Vectors must be of the same length.")
@@ -23,11 +38,11 @@ final class TokenizerTests: XCTestCase {
         return dotProduct / (magnitudeA * magnitudeB)
     }
 
-    func testTextEmbeddings() async throws {
+    func testTextEmbeddings(forModel modelName: String) async throws {
 
-        let api = HubApi(hfToken: "xxx")
+        let api = HubApi(hfToken: hfToken)
         let textModel = try await TextEncoder(
-            modelName: "unum-cloud/uform-vl2-english-small",
+            modelName: "unum-cloud/uform3-image-text-english-small",
             hubApi: api
         )
 
@@ -40,7 +55,7 @@ final class TokenizerTests: XCTestCase {
 
         var textEmbeddings: [[Float32]] = []
         for text in texts {
-            let embedding: [Float32] = try textModel.forward(with: text).asFloats()
+            let embedding: [Float32] = try textModel.encode(text).asFloats()
             textEmbeddings.append(embedding)
         }
 
@@ -60,36 +75,47 @@ final class TokenizerTests: XCTestCase {
         )
     }
 
-    func testImageEmbeddings() async throws {
+    func testTextEmbeddings() async throws {
+        for model in [
+            "unum-cloud/uform3-image-text-english-small",
+            "unum-cloud/uform3-image-text-english-base",
+            "unum-cloud/uform3-image-text-english-large",
+            "unum-cloud/uform3-image-text-multilingual-base",
+        ] {
+            try await testTextEmbeddings(forModel: model)
+        }
+    }
+
+    func testImageEmbeddings(forModel modelName: String) async throws {
 
         // One option is to use a local model repository.
         //
         //        let root = "uform/"
         //        let textModel = try TextEncoder(
-        //            modelPath: root + "uform-vl-english-large-text.mlpackage",
+        //            modelPath: root + "uform-vl-english-large-text_encoder.mlpackage",
         //            configPath: root + "uform-vl-english-large-text.json",
         //            tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json"
         //        )
         //        let imageModel = try ImageEncoder(
-        //            modelPath: root + "uform-vl-english-large-image.mlpackage",
+        //            modelPath: root + "uform-vl-english-large-image_encoder.mlpackage",
         //            configPath: root + "uform-vl-english-large-image.json"
         //        )
         //
         // A better option is to fetch directly from HuggingFace, similar to how users would do that:
-        let api = HubApi(hfToken: "xxx")
+        let api = HubApi(hfToken: hfToken)
         let textModel = try await TextEncoder(
-            modelName: "unum-cloud/uform-vl2-english-small",
+            modelName: modelName,
             hubApi: api
         )
         let imageModel = try await ImageEncoder(
-            modelName: "unum-cloud/uform-vl2-english-small",
+            modelName: modelName,
             hubApi: api
         )
 
         let texts = [
             "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
             "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
-            "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+            "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
             "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
             "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
         ]
@@ -115,9 +141,9 @@ final class TokenizerTests: XCTestCase {
                 )
             }
 
-            let textEmbedding: [Float32] = try textModel.forward(with: text).asFloats()
+            let textEmbedding: [Float32] = try textModel.encode(text).asFloats()
             textEmbeddings.append(textEmbedding)
-            let imageEmbedding: [Float32] = try imageModel.forward(with: cgImage).asFloats()
+            let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats()
             imageEmbeddings.append(imageEmbedding)
         }
 
@@ -143,4 +169,15 @@ final class TokenizerTests: XCTestCase {
         }
     }
 
+    func testImageEmbeddings() async throws {
+        for model in [
+            "unum-cloud/uform3-image-text-english-small",
+            "unum-cloud/uform3-image-text-english-base",
+            "unum-cloud/uform3-image-text-english-large",
+            "unum-cloud/uform3-image-text-multilingual-base",
+        ] {
+            try await testImageEmbeddings(forModel: model)
+        }
+    }
+
 }
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..8fa0eb8
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,73 @@
+# UForm Swift SDK
+
+UForm offers first-party support for Swift.
+To get started, add UForm to your project using Swift Package Manager.
+
+```bash
+swift package init --type executable
+swift package add uform
+```
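+
+Alternatively, you can declare the dependency manually in `Package.swift`. The manifest below is a minimal sketch: the `MyApp` name, the tools version, and the `2.0.0` version requirement are placeholders to adapt, and it assumes the product is named `UForm`, matching the module you import:
+
+```swift
+// swift-tools-version:5.9
+import PackageDescription
+
+let package = Package(
+    name: "MyApp",  // placeholder executable name
+    dependencies: [
+        // Pin to the UForm release you actually target.
+        .package(url: "https://github.com/unum-cloud/uform.git", from: "2.0.0")
+    ],
+    targets: [
+        .executableTarget(
+            name: "MyApp",
+            dependencies: [.product(name: "UForm", package: "uform")]
+        )
+    ]
+)
+```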
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.encode(text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
+
+### Image Embeddings
+
+```swift
+import CoreGraphics
+import Foundation
+import ImageIO
+
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+guard let url = URL(string: imageURL),
+    let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
+else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.encode(cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
+
+### Computing Distances
+
+There are several ways to compute distances between embeddings, once you have them.
+Naive Swift code might look like this:
+
+```swift
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+    let dotProduct = zip(a, b).map(*).reduce(0, +)
+    let normA = sqrt(a.map { $0 * $0 }.reduce(0, +))
+    let normB = sqrt(b.map { $0 * $0 }.reduce(0, +))
+    return dotProduct / (normA * normB)
+}
+```
+
+A faster way to compute distances is to use the Accelerate framework:
+
+```swift
+import Accelerate
+
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+    var result: Float32 = 0
+    var aNorm: Float32 = 0
+    var bNorm: Float32 = 0
+    vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count))
+    vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count))
+    vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count))
+    return result / sqrt(aNorm * bNorm)
+}
+```
+
+An even faster approach is to use USearch or SimSIMD, which work not only with `Float32` and `Float64`, but also with `Float16`, `Int8`, and binary embeddings.
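+
+To give a sense of what a binary embedding is, the sketch below simply keeps the sign of every component and packs the bits into `UInt64` words; the Hamming distance between two such bit vectors then approximates the angular distance between the original embeddings. Libraries like SimSIMD implement the same idea with SIMD instructions:
+
+```swift
+func binarize(_ vector: [Float32]) -> [UInt64] {
+    // One bit per component: 1 when the value is positive, 0 otherwise.
+    var words = [UInt64](repeating: 0, count: (vector.count + 63) / 64)
+    for (i, value) in vector.enumerated() where value > 0 {
+        words[i / 64] |= UInt64(1) << (i % 64)
+    }
+    return words
+}
+
+func hammingDistance(_ a: [UInt64], _ b: [UInt64]) -> Int {
+    // Count the differing bits across all packed words.
+    return zip(a, b).map { ($0 ^ $1).nonzeroBitCount }.reduce(0, +)
+}
+```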
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
index 0000000..5ab5bbe
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,594 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1
+
+
+"@huggingface/hub@^0.14.8":
+  version "0.14.8"
+  resolved "https://registry.npmjs.org/@huggingface/hub/-/hub-0.14.8.tgz"
+  integrity sha512-vdJRham99E5Uzsc4rO0gTz0ykafmx6V78pgPpJ7LGz5X+P2exe/izPFndqczAzy8jVWN55Jjtnuqg+Y0zrjc+Q==
+  dependencies:
+    hash-wasm "^4.9.0"
+
+"@huggingface/jinja@^0.2.2":
+  version "0.2.2"
+  resolved "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz"
+  integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==
+
+"@protobufjs/aspromise@^1.1.1", "@protobufjs/aspromise@^1.1.2":
+  version "1.1.2"
+  resolved "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz"
+  integrity sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==
+
+"@protobufjs/base64@^1.1.2":
+  version "1.1.2"
+  resolved "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz"
+  integrity sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==
+
+"@protobufjs/codegen@^2.0.4":
+  version "2.0.4"
+  resolved "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz"
+  integrity sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==
+
+"@protobufjs/eventemitter@^1.1.0":
+  version "1.1.0"
+  resolved "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz"
+  integrity sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==
+
+"@protobufjs/fetch@^1.1.0":
+  version "1.1.0"
+  resolved "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz"
+  integrity sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==
+  dependencies:
+    "@protobufjs/aspromise" "^1.1.1"
+    "@protobufjs/inquire" "^1.1.0"
+
+"@protobufjs/float@^1.0.2":
+  version "1.0.2"
+  resolved "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz"
+  integrity sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==
+
+"@protobufjs/inquire@^1.1.0":
+  version "1.1.0"
+  resolved "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz"
+  integrity sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==
+
+"@protobufjs/path@^1.1.2":
+  version "1.1.2"
+  resolved "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz"
+  integrity sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==
+
+"@protobufjs/pool@^1.1.0":
+  version "1.1.0"
+  resolved "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz"
+  integrity sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==
+
+"@protobufjs/utf8@^1.1.0":
+  version "1.1.0"
+  resolved "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz"
+  integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==
+
+"@types/long@^4.0.1":
+  version "4.0.2"
+  resolved "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz"
+  integrity sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==
+
+"@types/node@>=13.7.0":
+  version "20.12.7"
+  resolved "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz"
+  integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==
+  dependencies:
+    undici-types "~5.26.4"
+
+"@xenova/transformers@^2.17.0":
+  version "2.17.0"
+  resolved "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.0.tgz"
+  integrity sha512-usmDut7hwnrc4EqP59cboYqE6C8up63SqMy3E9RjG9nCsOhrsLndEU7DMu+bZ9R+HcAI8jRGabTIxH+B6agBVA==
+  dependencies:
+    "@huggingface/jinja" "^0.2.2"
+    onnxruntime-web "1.14.0"
+    sharp "^0.32.0"
+  optionalDependencies:
+    onnxruntime-node "1.14.0"
+
+b4a@^1.6.4:
+  version "1.6.6"
+  resolved "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz"
+  integrity sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg==
+
+bare-events@^2.0.0, bare-events@^2.2.0:
+  version "2.2.2"
+  resolved "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz"
+  integrity sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ==
+
+bare-fs@^2.1.1:
+  version "2.2.3"
+  resolved "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz"
+  integrity sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw==
+  dependencies:
+    bare-events "^2.0.0"
+    bare-path "^2.0.0"
+    streamx "^2.13.0"
+
+bare-os@^2.1.0:
+  version "2.2.1"
+  resolved "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz"
+  integrity sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w==
+
+bare-path@^2.0.0, bare-path@^2.1.0:
+  version "2.1.1"
+  resolved "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz"
+  integrity sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A==
+  dependencies:
+    bare-os "^2.1.0"
+
+base64-js@^1.3.1:
+  version "1.5.1"
+  resolved "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz"
+  integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
+
+bl@^4.0.3:
+  version "4.1.0"
+  resolved "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz"
+  integrity sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==
+  dependencies:
+    buffer "^5.5.0"
+    inherits "^2.0.4"
+    readable-stream "^3.4.0"
+
+buffer@^5.5.0:
+  version "5.7.1"
+  resolved "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz"
+  integrity sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==
+  dependencies:
+    base64-js "^1.3.1"
+    ieee754 "^1.1.13"
+
+chownr@^1.1.1:
+  version "1.1.4"
+  resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz"
+  integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
+
+color-convert@^2.0.1:
+  version "2.0.1"
+  resolved "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz"
+  integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==
+  dependencies:
+    color-name "~1.1.4"
+
+color-name@^1.0.0, color-name@~1.1.4:
+  version "1.1.4"
+  resolved "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz"
+  integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
+
+color-string@^1.9.0:
+  version "1.9.1"
+  resolved "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz"
+  integrity sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==
+  dependencies:
+    color-name "^1.0.0"
+    simple-swizzle "^0.2.2"
+
+color@^4.2.3:
+  version "4.2.3"
+  resolved "https://registry.npmjs.org/color/-/color-4.2.3.tgz"
+  integrity sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==
+  dependencies:
+    color-convert "^2.0.1"
+    color-string "^1.9.0"
+
+decompress-response@^6.0.0:
+  version "6.0.0"
+  resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz"
+  integrity sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==
+  dependencies:
+    mimic-response "^3.1.0"
+
+deep-extend@^0.6.0:
+  version "0.6.0"
+  resolved "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz"
+  integrity sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==
+
+detect-libc@^2.0.0, detect-libc@^2.0.2:
+  version "2.0.3"
+  resolved "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz"
+  integrity sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw==
+
+end-of-stream@^1.1.0, end-of-stream@^1.4.1:
+  version "1.4.4"
+  resolved "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz"
+  integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==
+  dependencies:
+    once "^1.4.0"
+
+expand-template@^2.0.3:
+  version "2.0.3"
+  resolved "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz"
+  integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==
+
+fast-fifo@^1.1.0, fast-fifo@^1.2.0:
+  version "1.3.2"
+  resolved "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz"
+  integrity sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==
+
+flatbuffers@^1.12.0:
+  version "1.12.0"
+  resolved "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz"
+  integrity sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==
+
+fs-constants@^1.0.0:
+  version "1.0.0"
+  resolved "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz"
+  integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==
+
+github-from-package@0.0.0:
+  version "0.0.0"
+  resolved "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz"
+  integrity sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==
+
+guid-typescript@^1.0.9:
+  version "1.0.9"
+  resolved "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz"
+  integrity sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==
+
+hash-wasm@^4.9.0:
+  version "4.11.0"
+  resolved "https://registry.npmjs.org/hash-wasm/-/hash-wasm-4.11.0.tgz"
+  integrity sha512-HVusNXlVqHe0fzIzdQOGolnFN6mX/fqcrSAOcTBXdvzrXVHwTz11vXeKRmkR5gTuwVpvHZEIyKoePDvuAR+XwQ==
+
+ieee754@^1.1.13:
+  version "1.2.1"
+  resolved "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz"
+  integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==
+
+inherits@^2.0.3, inherits@^2.0.4:
+  version "2.0.4"
+  resolved "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz"
+  integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
+
+ini@~1.3.0:
+  version "1.3.8"
+  resolved "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz"
+  integrity sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==
+
+is-arrayish@^0.3.1:
+  version "0.3.2"
+  resolved "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz"
+  integrity sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==
+
+long@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.npmjs.org/long/-/long-4.0.0.tgz"
+  integrity sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==
+
+long@^5.0.0:
+  version "5.2.3"
+  resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz"
+  integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==
+
+long@^5.2.3:
+  version "5.2.3"
+  resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz"
+  integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==
+
+lru-cache@^6.0.0:
+  version "6.0.0"
+  resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz"
+  integrity sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==
+  dependencies:
+    yallist "^4.0.0"
+
+mimic-response@^3.1.0:
+  version "3.1.0"
+  resolved "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz"
+  integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==
+
+minimist@^1.2.0, minimist@^1.2.3:
+  version "1.2.8"
+  resolved "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz"
+  integrity sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==
+
+mkdirp-classic@^0.5.2, mkdirp-classic@^0.5.3:
+  version "0.5.3"
+  resolved "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz"
+  integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==
+
+napi-build-utils@^1.0.1:
+  version "1.0.2"
+  resolved "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz"
+  integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==
+
+node-abi@^3.3.0:
+  version "3.57.0"
+  resolved "https://registry.npmjs.org/node-abi/-/node-abi-3.57.0.tgz"
+  integrity sha512-Dp+A9JWxRaKuHP35H77I4kCKesDy5HUDEmScia2FyncMTOXASMyg251F5PhFoDA5uqBrDDffiLpbqnrZmNXW+g==
+  dependencies:
+    semver "^7.3.5"
+
+node-addon-api@^6.1.0:
+  version "6.1.0"
+  resolved "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz"
+  integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==
+
+once@^1.3.1, once@^1.4.0:
+  version "1.4.0"
+  resolved "https://registry.npmjs.org/once/-/once-1.4.0.tgz"
+  integrity sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==
+  dependencies:
+    wrappy "1"
+
+onnx-proto@^4.0.4:
+  version "4.0.4"
+  resolved "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz"
+  integrity sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==
+  dependencies:
+    protobufjs "^6.8.8"
+
+onnxruntime-common@~1.14.0:
+  version "1.14.0"
+  resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz"
+  integrity sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==
+
+onnxruntime-common@1.17.3:
+  version "1.17.3"
+  resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.17.3.tgz"
+  integrity sha512-IkbaDelNVX8cBfHFgsNADRIq2TlXMFWW+nG55mwWvQT4i0NZb32Jf35Pf6h9yjrnK78RjcnlNYaI37w394ovMw==
+
+onnxruntime-node@1.14.0:
+  version "1.14.0"
+  resolved "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz"
+  integrity sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==
+  dependencies:
+    onnxruntime-common "~1.14.0"
+
+onnxruntime-web@^1.17.3:
+  version "1.17.3"
+  resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.17.3.tgz"
+  integrity sha512-MSDrNUWgc1biP0YzY488OJ9n/jTMS9EXysgm9Aw4CUj2A836ALbO2J1sgzguWJeVUHTlM6p7tRzo8IGAgaXWKw==
+  dependencies:
+    flatbuffers "^1.12.0"
+    guid-typescript "^1.0.9"
+    long "^5.2.3"
+    onnxruntime-common "1.17.3"
+    platform "^1.3.6"
+    protobufjs "^7.2.4"
+
+onnxruntime-web@1.14.0:
+  version "1.14.0"
+  resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz"
+  integrity sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==
+  dependencies:
+    flatbuffers "^1.12.0"
+    guid-typescript "^1.0.9"
+    long "^4.0.0"
+    onnx-proto "^4.0.4"
+    onnxruntime-common "~1.14.0"
+    platform "^1.3.6"
+
+platform@^1.3.6:
+  version "1.3.6"
+  resolved "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz"
+  integrity sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==
+
+prebuild-install@^7.1.1:
+  version "7.1.2"
+  resolved "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.2.tgz"
+  integrity sha512-UnNke3IQb6sgarcZIDU3gbMeTp/9SSU1DAIkil7PrqG1vZlBtY5msYccSKSHDqa3hNg436IXK+SNImReuA1wEQ==
+  dependencies:
+    detect-libc "^2.0.0"
+    expand-template "^2.0.3"
+    github-from-package "0.0.0"
+    minimist "^1.2.3"
+    mkdirp-classic "^0.5.3"
+    napi-build-utils "^1.0.1"
+    node-abi "^3.3.0"
+    pump "^3.0.0"
+    rc "^1.2.7"
+    simple-get "^4.0.0"
+    tar-fs "^2.0.0"
+    tunnel-agent "^0.6.0"
+
+protobufjs@^6.8.8:
+  version "6.11.4"
+  resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz"
+  integrity sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==
+  dependencies:
+    "@protobufjs/aspromise" "^1.1.2"
+    "@protobufjs/base64" "^1.1.2"
+    "@protobufjs/codegen" "^2.0.4"
+    "@protobufjs/eventemitter" "^1.1.0"
+    "@protobufjs/fetch" "^1.1.0"
+    "@protobufjs/float" "^1.0.2"
+    "@protobufjs/inquire" "^1.1.0"
+    "@protobufjs/path" "^1.1.2"
+    "@protobufjs/pool" "^1.1.0"
+    "@protobufjs/utf8" "^1.1.0"
+    "@types/long" "^4.0.1"
+    "@types/node" ">=13.7.0"
+    long "^4.0.0"
+
+protobufjs@^7.2.4:
+  version "7.2.6"
+  resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.6.tgz"
+  integrity sha512-dgJaEDDL6x8ASUZ1YqWciTRrdOuYNzoOf27oHNfdyvKqHr5i0FV7FSLU+aIeFjyFgVxrpTOtQUi0BLLBymZaBw==
+  dependencies:
+    "@protobufjs/aspromise" "^1.1.2"
+    "@protobufjs/base64" "^1.1.2"
+    "@protobufjs/codegen" "^2.0.4"
+    "@protobufjs/eventemitter" "^1.1.0"
+    "@protobufjs/fetch" "^1.1.0"
+    "@protobufjs/float" "^1.0.2"
+    "@protobufjs/inquire" "^1.1.0"
+    "@protobufjs/path" "^1.1.2"
+    "@protobufjs/pool" "^1.1.0"
+    "@protobufjs/utf8" "^1.1.0"
+    "@types/node" ">=13.7.0"
+    long "^5.0.0"
+
+pump@^3.0.0:
+  version "3.0.0"
+  resolved "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz"
+  integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==
+  dependencies:
+    end-of-stream "^1.1.0"
+    once "^1.3.1"
+
+queue-tick@^1.0.1:
+  version "1.0.1"
+  resolved "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz"
+  integrity sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==
+
+rc@^1.2.7:
+  version "1.2.8"
+  resolved "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz"
+  integrity sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==
+  dependencies:
+    deep-extend "^0.6.0"
+    ini "~1.3.0"
+    minimist "^1.2.0"
+    strip-json-comments "~2.0.1"
+
+readable-stream@^3.1.1, readable-stream@^3.4.0:
+  version "3.6.2"
+  resolved "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz"
+  integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==
+  dependencies:
+    inherits "^2.0.3"
+    string_decoder "^1.1.1"
+    util-deprecate "^1.0.1"
+
+safe-buffer@^5.0.1, safe-buffer@~5.2.0:
+  version "5.2.1"
+  resolved "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz"
+  integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
+
+semver@^7.3.5, semver@^7.5.4:
+  version "7.6.0"
+  resolved "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz"
+  integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==
+  dependencies:
+    lru-cache "^6.0.0"
+
+sharp@^0.32.0:
+  version "0.32.6"
+  resolved "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz"
+  integrity sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==
+  dependencies:
+    color "^4.2.3"
+    detect-libc "^2.0.2"
+    node-addon-api "^6.1.0"
+    prebuild-install "^7.1.1"
+    semver "^7.5.4"
+    simple-get "^4.0.1"
+    tar-fs "^3.0.4"
+    tunnel-agent "^0.6.0"
+
+simple-concat@^1.0.0:
+  version "1.0.1"
+  resolved "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz"
+  integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==
+
+simple-get@^4.0.0, simple-get@^4.0.1:
+  version "4.0.1"
+  resolved "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz"
+  integrity sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==
+  dependencies:
+    decompress-response "^6.0.0"
+    once "^1.3.1"
+    simple-concat "^1.0.0"
+
+simple-swizzle@^0.2.2:
+  version "0.2.2"
+  resolved "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz"
+  integrity sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==
+  dependencies:
+    is-arrayish "^0.3.1"
+
+streamx@^2.13.0, streamx@^2.15.0:
+  version "2.16.1"
+  resolved "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz"
+  integrity sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==
+  dependencies:
+    fast-fifo "^1.1.0"
+    queue-tick "^1.0.1"
+  optionalDependencies:
+    bare-events "^2.2.0"
+
+string_decoder@^1.1.1:
+  version "1.3.0"
+  resolved "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz"
+  integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
+  dependencies:
+    safe-buffer "~5.2.0"
+
+strip-json-comments@~2.0.1:
+  version "2.0.1"
+  resolved "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz"
+  integrity sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==
+
+tar-fs@^2.0.0:
+  version "2.1.1"
+  resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz"
+  integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
+  dependencies:
+    chownr "^1.1.1"
+    mkdirp-classic "^0.5.2"
+    pump "^3.0.0"
+    tar-stream "^2.1.4"
+
+tar-fs@^3.0.4:
+  version "3.0.5"
+  resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz"
+  integrity sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg==
+  dependencies:
+    pump "^3.0.0"
+    tar-stream "^3.1.5"
+  optionalDependencies:
+    bare-fs "^2.1.1"
+    bare-path "^2.1.0"
+
+tar-stream@^2.1.4:
+  version "2.2.0"
+  resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz"
+  integrity sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==
+  dependencies:
+    bl "^4.0.3"
+    end-of-stream "^1.4.1"
+    fs-constants "^1.0.0"
+    inherits "^2.0.3"
+    readable-stream "^3.1.1"
+
+tar-stream@^3.1.5:
+  version "3.1.7"
+  resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz"
+  integrity sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==
+  dependencies:
+    b4a "^1.6.4"
+    fast-fifo "^1.2.0"
+    streamx "^2.15.0"
+
+tunnel-agent@^0.6.0:
+  version "0.6.0"
+  resolved "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz"
+  integrity sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==
+  dependencies:
+    safe-buffer "^5.0.1"
+
+undici-types@~5.26.4:
+  version "5.26.5"
+  resolved "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz"
+  integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==
+
+util-deprecate@^1.0.1:
+  version "1.0.2"
+  resolved "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz"
+  integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==
+
+wrappy@1:
+  version "1.0.2"
+  resolved "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz"
+  integrity sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==
+
+yallist@^4.0.0:
+  version "4.0.0"
+  resolved "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz"
+  integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==