Skip to content

Commit b9c497e

Browse files
committed
Add sql2dbx: LLM-powered SQL to Databricks notebook converter
1 parent ec65d02 commit b9c497e

File tree

82 files changed

+11536
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+11536
-0
lines changed

sql2dbx/.gitignore

+173
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# poetry
98+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99+
# This is especially recommended for binary packages to ensure reproducibility, and is more
100+
# commonly ignored for libraries.
101+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102+
#poetry.lock
103+
104+
# pdm
105+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106+
#pdm.lock
107+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108+
# in version control.
109+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110+
.pdm.toml
111+
.pdm-python
112+
.pdm-build/
113+
114+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115+
__pypackages__/
116+
117+
# Celery stuff
118+
celerybeat-schedule
119+
celerybeat.pid
120+
121+
# SageMath parsed files
122+
*.sage.py
123+
124+
# Environments
125+
.env
126+
.venv
127+
env/
128+
venv/
129+
ENV/
130+
env.bak/
131+
venv.bak/
132+
133+
# Spyder project settings
134+
.spyderproject
135+
.spyproject
136+
137+
# Rope project settings
138+
.ropeproject
139+
140+
# mkdocs documentation
141+
/site
142+
143+
# mypy
144+
.mypy_cache/
145+
.dmypy.json
146+
dmypy.json
147+
148+
# Pyre type checker
149+
.pyre/
150+
151+
# pytype static type analyzer
152+
.pytype/
153+
154+
# Cython debug symbols
155+
cython_debug/
156+
157+
# PyCharm
158+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160+
# and can be added to the global gitignore or merged into this file. For a more nuclear
161+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
162+
#.idea/
163+
164+
# Databricks
165+
.databricks/
166+
167+
# Project specific
168+
.clinerules/
169+
.vscode/
170+
scratch/**
171+
!scratch/README.md
172+
test_output/
173+
databricks.yml

sql2dbx/README.md

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
---
2+
title: "sql2dbx"
3+
language: python
4+
author: "Hiroyuki Nakazato"
5+
date: 2025-04-25
6+
7+
tags:
8+
- sql-migration-tool
9+
- multi-dialect-sql
10+
- llm
11+
- automation
12+
---
13+
14+
# sql2dbx
15+
**sql2dbx** is an automation tool designed to convert SQL files into Databricks notebooks. It leverages Large Language Models (LLMs) to perform the conversion based on system prompts tailored for various SQL dialects. sql2dbx consists of a series of Databricks notebooks.
16+
17+
## How to Execute
18+
1. Clone the [databrickslabs/sandbox](https://github.com/databrickslabs/sandbox) repository.
19+
2. Import the `sql2dbx` folder into your Databricks workspace.
20+
3. Run either notebook as your entry point:
21+
- `notebooks/00_main` (English)
22+
- `notebooks/00_main_ja` (Japanese)
23+
24+
These notebooks contain all instructions and documentation needed to use sql2dbx.
Original file line numberDiff line numberDiff line change
-- ==========================================
-- MySQL EXAMPLE #1: Multi-Statement Data Transformation
-- ==========================================

-- Orders table: one row per customer order.
CREATE TABLE Orders (
    OrderID INT,
    CustomerName VARCHAR(100),
    OrderDate DATETIME DEFAULT NOW(),
    OrderTotal DECIMAL(10,2)
);

-- Seed a handful of sample orders.
INSERT INTO Orders (OrderID, CustomerName, OrderTotal)
VALUES
    (101, 'Alice', 200.00),
    (102, 'Bob', 350.75),
    (103, 'Charlie', 99.99);

-- Session-scoped scratch table holding per-order statuses.
CREATE TEMPORARY TABLE TempOrderStatus (
    OrderID INT,
    Status VARCHAR(50)
);

-- OrderID 104 deliberately has no matching order row.
INSERT INTO TempOrderStatus (OrderID, Status)
VALUES
    (101, 'PROCESSING'),
    (102, 'SHIPPED'),
    (104, 'CANCELLED');

-- Apply a 10% discount to orders that appear in the scratch table
-- as SHIPPED. Uses MySQL's UPDATE ... JOIN syntax (dialect-specific).
UPDATE Orders AS od
JOIN TempOrderStatus AS ts ON od.OrderID = ts.OrderID
SET od.OrderTotal = od.OrderTotal * 0.90 -- 10% discount
WHERE ts.Status = 'SHIPPED';

-- Purge orders older than 90 days that carry no status entry.
DELETE od
FROM Orders AS od
WHERE od.OrderDate < DATE_SUB(NOW(), INTERVAL 90 DAY)
  AND od.OrderID NOT IN (SELECT OrderID FROM TempOrderStatus);

-- Inspect the surviving rows.
SELECT * FROM Orders;

-- Clean up. TempOrderStatus is session-scoped and is dropped
-- automatically when the session ends, so only Orders needs a DROP.
DROP TABLE IF EXISTS Orders;
Original file line numberDiff line numberDiff line change
-- ==========================================
-- MySQL EXAMPLE #2: Stored Procedure with Threshold Checking
-- ==========================================

DELIMITER $$

-- Caps the `metric` column of an arbitrary table at a threshold.
--   p_table_name   : name of the target table (must have id, metric columns)
--   p_threshold    : cap value; rows with metric above it are updated
--   p_rows_updated : OUT — number of rows capped, or -1 on any SQL error
CREATE PROCEDURE DemoThresholdCheck(
    IN p_table_name VARCHAR(64),
    IN p_threshold DECIMAL(10,2),
    OUT p_rows_updated INT
)
BEGIN
    -- On any SQL error: undo the transaction and signal failure via -1.
    DECLARE EXIT HANDLER FOR SQLEXCEPTION
    BEGIN
        ROLLBACK;
        SET p_rows_updated = -1;
    END;

    -- Start a transaction so both statements commit or roll back together.
    START TRANSACTION;

    -- 1) Snapshot the rows above the threshold into a temporary table.
    --    The target table name is only known at runtime, so the statement
    --    is assembled dynamically and run via PREPARE/EXECUTE.
    -- NOTE(review): p_table_name is concatenated into SQL unescaped —
    --    callers must never pass untrusted input to this procedure.
    SET @sql = CONCAT(
        'CREATE TEMPORARY TABLE TempData AS ',
        'SELECT id, metric ',
        'FROM ', p_table_name, ' ',
        'WHERE metric > ', p_threshold
    );

    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;

    -- 2) Cap every over-threshold metric at the threshold value.
    SET @sql = CONCAT(
        'UPDATE ', p_table_name, ' ',
        'SET metric = ', p_threshold, ' ',
        'WHERE metric > ', p_threshold
    );

    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    SET p_rows_updated = ROW_COUNT(); -- rows changed by the UPDATE above
    DEALLOCATE PREPARE stmt;

    -- TempData is session-scoped and dropped automatically at session end.

    -- Commit the transaction
    COMMIT;
END $$

DELIMITER ;
Original file line numberDiff line numberDiff line change
# Databricks notebook source
# MAGIC %md
# MAGIC # mysql_example1_multi_statement_transformation
# MAGIC This notebook was automatically converted from the script below. It may contain errors, so use it as a starting point and make necessary corrections.
# MAGIC
# MAGIC Source script: `/Workspace/Users/[email protected]/.bundle/sql2dbx/dev/files/examples/mysql/input/mysql_example1_multi_statement_transformation.sql`

# COMMAND ----------

# Create the Orders table.
# NOTE(review): a column DEFAULT on a Delta table requires the
# 'allowColumnDefaults' table feature — confirm it is enabled on the
# target runtime, otherwise this CREATE fails.
spark.sql("""
CREATE TABLE Orders (
OrderID INT,
CustomerName STRING,
OrderDate TIMESTAMP DEFAULT current_timestamp(),
OrderTotal DECIMAL(10,2)
)
""")

# COMMAND ----------

# Insert sample data into the Orders table.
spark.sql("""
INSERT INTO Orders (OrderID, CustomerName, OrderTotal)
VALUES
(101, 'Alice', 200.00),
(102, 'Bob', 350.75),
(103, 'Charlie', 99.99)
""")

# COMMAND ----------

# MySQL's TEMPORARY table has no direct Delta equivalent, so the converter
# materializes it as a regular (replaceable) Delta table; it is dropped
# explicitly at the end of the notebook instead of at session end.
spark.sql("""
CREATE OR REPLACE TABLE TempOrderStatus (
OrderID INT,
Status STRING
)
""")

# COMMAND ----------

# Insert statuses (OrderID 104 intentionally has no matching order).
spark.sql("""
INSERT INTO TempOrderStatus (OrderID, Status)
VALUES
(101, 'PROCESSING'),
(102, 'SHIPPED'),
(104, 'CANCELLED')
""")

# COMMAND ----------

# Databricks SQL has no UPDATE ... JOIN, so the original MySQL update is
# expressed as a MERGE keyed on OrderID, restricted to SHIPPED rows.
spark.sql("""
MERGE INTO Orders o
USING (SELECT * FROM TempOrderStatus WHERE Status = 'SHIPPED') t
ON o.OrderID = t.OrderID
WHEN MATCHED THEN
UPDATE SET o.OrderTotal = o.OrderTotal * 0.90
""")

# COMMAND ----------

# Delete orders older than 90 days that have no status entry.
# NOTE(review): date_sub() yields a DATE, while MySQL's
# DATE_SUB(NOW(), INTERVAL 90 DAY) keeps the time-of-day — rows within
# the boundary day may be treated differently. Confirm this is acceptable.
spark.sql("""
DELETE FROM Orders
WHERE OrderDate < date_sub(current_timestamp(), 90)
AND OrderID NOT IN (SELECT OrderID FROM TempOrderStatus)
""")

# COMMAND ----------

# Query the results and render them in the notebook UI.
orders_df = spark.sql("SELECT * FROM Orders")
display(orders_df)

# COMMAND ----------

# Clean up both tables (TempOrderStatus is a real Delta table here, so it
# must be dropped explicitly — see the note above its CREATE).
spark.sql("DROP TABLE IF EXISTS Orders")
spark.sql("DROP TABLE IF EXISTS TempOrderStatus")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Static Syntax Check Results
# MAGIC No syntax errors were detected during the static check.
# MAGIC However, please review the code carefully as some issues may only be detected during runtime.

0 commit comments

Comments
 (0)