diff --git a/_posts/2023-07-07-python-udf.md b/_posts/2023-07-07-python-udf.md index 6fbe04389d1..40438cc4352 100644 --- a/_posts/2023-07-07-python-udf.md +++ b/_posts/2023-07-07-python-udf.md @@ -69,6 +69,7 @@ con.create_function("wc_titles", world_cups, [VARCHAR], INTEGER) ``` That's it, the function is then registered and ready to be called through SQL. + ```python # Let's create an example countries table with the countries we are interested in using con.execute("CREATE TABLE countries(country VARCHAR)") @@ -76,7 +77,6 @@ con.execute("INSERT INTO countries VALUES ('Brazil'), ('Germany'), ('Italy'), (' # We can simply call the function through SQL, and even use the function return to eliminate the countries that never won a world cup con.sql("SELECT country, wc_titles(country) as world_cups from countries").fetchall() # [('Brazil', 5), ('Germany', 4), ('Italy', 4), ('Argentina', 2), ('Uruguay', 2), ('France', 2), ('England', 1), ('Spain', 1), ('Netherlands', None)] - ``` ### Generating Fake Data with Faker (Built-In Type UDF) diff --git a/docs/guides/snippets/create_synthetic_data.md b/docs/guides/snippets/create_synthetic_data.md index 1b6f37f8ce2..9ae5e7a21d5 100644 --- a/docs/guides/snippets/create_synthetic_data.md +++ b/docs/guides/snippets/create_synthetic_data.md @@ -21,13 +21,29 @@ import duckdb from duckdb.typing import * from faker import Faker +fake = Faker() + def random_date(): - fake = Faker() return fake.date_between() -duckdb.create_function("random_date", random_date, [], DATE, type="native", side_effects=True) -res = duckdb.sql(""" - SELECT hash(i * 10 + j) AS id, random_date() AS creationDate, IF (j % 2, true, false) +def random_short_text(): + return fake.text(max_nb_chars=20) + +def random_long_text(): + return fake.text(max_nb_chars=200) + +con = duckdb.connect() +con.create_function("random_date", random_date, [], DATE, type="native", side_effects=True) +con.create_function("random_short_text", random_short_text, [], VARCHAR, type="native", side_effects=True) +con.create_function("random_long_text", random_long_text, [], VARCHAR, type="native", side_effects=True) + +res = con.sql(""" + SELECT + hash(i * 10 + j) AS id, + random_date() AS creationDate, + random_short_text() AS short, + random_long_text() AS long, + IF (j % 2, true, false) AS bool FROM generate_series(1, 5) s(i) CROSS JOIN generate_series(1, 2) t(j) """)