From d6514bedf6d18e997bff763e42d8a9cc82d1bbe5 Mon Sep 17 00:00:00 2001 From: Renato Leite Date: Fri, 19 Apr 2024 09:19:46 +0000 Subject: [PATCH] remove files from notebooks folder --- notebooks/1_env_setup_script.py | 67 - notebooks/1_environment_setup.ipynb | 303 ---- notebooks/aux_data/bq_tag_generation.py | 169 --- notebooks/aux_data/customers_aux_data.py | 1517 ------------------- notebooks/aux_data/data_gen.py | 237 --- notebooks/aux_data/events_aux_data.py | 22 - notebooks/aux_data/metadata_aux_data.py | 213 --- notebooks/aux_data/transactions_aux_data.py | 45 - 8 files changed, 2573 deletions(-) delete mode 100644 notebooks/1_env_setup_script.py delete mode 100644 notebooks/1_environment_setup.ipynb delete mode 100644 notebooks/aux_data/bq_tag_generation.py delete mode 100644 notebooks/aux_data/customers_aux_data.py delete mode 100644 notebooks/aux_data/data_gen.py delete mode 100644 notebooks/aux_data/events_aux_data.py delete mode 100644 notebooks/aux_data/metadata_aux_data.py delete mode 100644 notebooks/aux_data/transactions_aux_data.py diff --git a/notebooks/1_env_setup_script.py b/notebooks/1_env_setup_script.py deleted file mode 100644 index 4473750..0000000 --- a/notebooks/1_env_setup_script.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.` -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -from google.cloud import bigquery -from google.cloud import datacatalog_v1 -from aux_data import data_gen -from aux_data import bq_tag_generation - - -print( - """This installation script will: - - Create a dataset on BigQuery with data about Audiences. - - Create a TagTemplate with metadata about the dataset. - - This process can take up to 3 minutes to finish.""") - -# Set environmental variables -PROJECT_ID = os.environ['PROJECT_ID'] -LOCATION = os.environ['LOCATION'] -DATASET_ID = 'cdp_dataset' - -# Tag template definitions -TAG_TEMPLATE_ID = 'llmcdptemplate' -TAG_TEMPLATE_PATH = f"projects/{PROJECT_ID}/locations/{LOCATION}/tagTemplates/{TAG_TEMPLATE_ID}" - -# Create clients for BigQuery and DataCatalog -bq_client = bigquery.Client(project=PROJECT_ID) -datacatalog_client = datacatalog_v1.DataCatalogClient() - -dataset_id = "{}.{}".format(bq_client.project, DATASET_ID) -dataset = bigquery.Dataset(dataset_id) -dataset.location = "US" - -# Create the dataset -try: - dataset = bq_client.create_dataset(dataset, timeout=30) - print(f'Dataset {DATASET_ID} create successfully.') -except Exception as e: - print(e) - - -data_gen.generate_and_populate_dataset( - PROJECT_ID=PROJECT_ID, - DATASET_ID=DATASET_ID -) - -bq_tag_generation.create_template_and_tag_bq( - PROJECT_ID, - DATASET_ID, - TAG_TEMPLATE_ID, - LOCATION -) - -print("Dataset and TagTemplate created successfully.") \ No newline at end of file diff --git a/notebooks/1_environment_setup.ipynb b/notebooks/1_environment_setup.ipynb deleted file mode 100644 index d4f85f3..0000000 --- a/notebooks/1_environment_setup.ipynb +++ /dev/null @@ -1,303 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2023 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Environment Setup" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the following python packages to setup the environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install -U google-cloud-datacatalog\n", - "! pip install -U google-cloud-storage\n", - "! pip install -U google-cloud-bigquery\n", - "! pip install -U numpy" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Specify your project ID in the next cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "PROJECT_ID = '' # Change to your project ID\n", - "LOCATION = 'us-central1'\n", - "DATASET_ID = 'cdp_dataset'\n", - "\n", - "# Tag template \n", - "TAG_TEMPLATE_ID = 'llmcdptemplate'\n", - "TAG_TEMPLATE_PATH = f\"projects/{PROJECT_ID}/locations/{LOCATION}/tagTemplates/{TAG_TEMPLATE_ID}\"\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BigQuery: Create dataset" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a BigQuery dataset to upload the CDP data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create BigQuery Dataset talktodata on your project\n", - "from google.cloud import bigquery\n", - "from google.cloud import datacatalog_v1\n", - "\n", - "bq_client = bigquery.Client(project=PROJECT_ID)\n", - "datacatalog_client = datacatalog_v1.DataCatalogClient()\n", - "\n", - "dataset_id = \"{}.{}\".format(bq_client.project, DATASET_ID)\n", - "dataset = bigquery.Dataset(dataset_id)\n", - "dataset.location = \"US\"\n", - "\n", - "# Create the dataset\n", - "try:\n", - " dataset = bq_client.create_dataset(dataset, timeout=30)\n", - " print(f'Dataset {DATASET_ID} create successfully.')\n", - "except Exception as e:\n", - " print(e)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BigQuery: Create tables and populate with data\n", - "\n", - "The next cell will generate the synthetic data for the tables and load to BigQuery.\n", - "\n", - "> This process will take approximately 2 minute and 40 seconds.\n", - "\n", - "If this process fails, try to recreate the dataset with the cell above and regenerate the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from aux_data import data_gen\n", - "\n", - "data_gen.generate_and_populate_dataset(\n", - " PROJECT_ID=PROJECT_ID,\n", - " DATASET_ID=DATASET_ID\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup Data Catalog" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The cell bellow will execute the following steps:\n", - "\n", - "1) Specify a query to retrieve the metadata from the tables you just uploaded;\n", - "2) Create a TagTemplate on Google Dataplex that specifies how the table will be tagged with medatada;\n", - "3) Tag all the tables you created on BigQuery." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from aux_data import bq_tag_generation\n", - "\n", - "bq_tag_generation.create_template_and_tag_bq(\n", - " PROJECT_ID,\n", - " DATASET_ID,\n", - " TAG_TEMPLATE_ID,\n", - " LOCATION\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quick test" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test the integration by retrieving the metadata from BigQuery tables." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "QUERY = f'SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.TABLES` WHERE table_name NOT LIKE \"%metadata%\"'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_tags_from_table(table_id):\n", - " # Lookup Data Catalog's Entry referring to the table.\n", - " resource_name = (\n", - " f\"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{DATASET_ID}/tables/{table_id}\"\n", - " )\n", - " table_entry = datacatalog_client.lookup_entry(\n", - " request={\"linked_resource\": resource_name}\n", - " )\n", - "\n", - " # Make the request\n", - " page_result = datacatalog_client.list_tags(parent=table_entry.name)\n", - " # print(page_result)\n", - "\n", - " tags_str = ''\n", - "\n", - " # Handle the response\n", - " for response in page_result:\n", - " if response.template == TAG_TEMPLATE_PATH:\n", - " desc = response.fields[\"description\"].string_value\n", - " data_type = response.fields[\"data_type\"].string_value\n", - " pk = response.fields[\"is_primary_key\"].bool_value\n", - " fk = response.fields[\"is_foreign_key\"].bool_value \n", - " tags_str += (\"Table: {} \"\n", - " \"- Column: {} \" \n", - " \"- Data Type: {} \" \n", - " \"- Primary Key: {} \" \n", - " \"- Foreing Key: {} \" \n", - " \"- Description: {}\\n\".format(\n", - " table_id, response.column, data_type, pk, fk, desc))\n", - " return tags_str" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_metadata_from_dataset(\n", - " query: str\n", - "):\n", - " # print(\"Gets the metadata once\")\n", - " query_job = bq_client.query(query) # API request\n", - " rows = query_job.result()\n", - " metadata = []\n", - "\n", - " for row in rows:\n", - " table_metadata = {}\n", - " table_metadata['ddl'] = row.ddl\n", - " table_metadata['description'] = get_tags_from_table(row.table_name)\n", - " metadata.append(table_metadata)\n", - " \n", - " return metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tags = get_metadata_from_dataset(QUERY)\n", - "for i in tags:\n", - " print(i['description'])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "llm-dev-py311", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/aux_data/bq_tag_generation.py b/notebooks/aux_data/bq_tag_generation.py deleted file mode 100644 index b07fb79..0000000 --- a/notebooks/aux_data/bq_tag_generation.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from google.cloud import datacatalog_v1 -from google.cloud import bigquery -from typing import List - -datacatalog_client = datacatalog_v1.DataCatalogClient() - - -def define_query(PROJECT_ID: str, DATASET_ID: str): - # Perform a query to retrieve metadata - return f''' - SELECT - meta.*, - cols.data_type as data_type - FROM - `{PROJECT_ID}.{DATASET_ID}.metadata` meta - JOIN - `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.COLUMNS` cols - ON - cols.table_schema = meta.dataset_id - AND cols.table_name = meta.table_id - AND cols.column_name = meta.column_id - ''' - - -def create_tag_template( - TAG_TEMPLATE_ID: str, - PROJECT_ID: str, - LOCATION: str -): - tag_template = datacatalog_v1.TagTemplate() - tag_template.name = TAG_TEMPLATE_ID - tag_template.display_name = "Talk to data catalog template" - - field_desc = datacatalog_v1.TagTemplateField() - field_desc.type_.primitive_type = datacatalog_v1.FieldType.PrimitiveType.STRING - field_desc.display_name = "Description" - field_desc.is_required = True - tag_template.fields["description"] = field_desc - - field_pk = datacatalog_v1.TagTemplateField() - field_pk.type_.primitive_type = datacatalog_v1.FieldType.PrimitiveType.BOOL - field_pk.display_name = "is_primary_key" - field_pk.is_required = True - tag_template.fields["is_primary_key"] = field_pk - - field_fk = datacatalog_v1.TagTemplateField() - field_fk.type_.primitive_type = datacatalog_v1.FieldType.PrimitiveType.BOOL - field_fk.display_name = "is_foreign_key" - field_fk.is_required = True - tag_template.fields["is_foreign_key"] = field_fk - - field_type = datacatalog_v1.TagTemplateField() - field_type.type_.primitive_type = datacatalog_v1.FieldType.PrimitiveType.STRING - field_type.display_name = "Data Type" - field_type.is_required = False - tag_template.fields["data_type"] = field_type - - try: - tag_full_path = datacatalog_client.create_tag_template( - parent=f'projects/{PROJECT_ID}/locations/{LOCATION}', - tag_template_id=TAG_TEMPLATE_ID, - tag_template=tag_template) - print('Tag created') - except Exception as e: - print(e) - - return tag_full_path.name - - -def tag_bq_columns( - PROJECT_ID: str, - TAG_TEMPLATE_PATH: str, - TAG_TEMPLATE_ID: str, - DATASET_ID: str, - table_id: str, - column_id: str, - values: List): - # Lookup Data Catalog's Entry referring to the table. - resource_name = ( - f"//bigquery.googleapis.com/projects/{PROJECT_ID}/datasets/{DATASET_ID}/tables/{table_id}" - ) - - table_entry = datacatalog_client.lookup_entry( - request={"linked_resource": resource_name} - ) - # Attach a Tag to the table. - tag = datacatalog_v1.types.Tag() - - tag.template = TAG_TEMPLATE_PATH - tag.name = "talktodata tag" - tag.fields["description"] = datacatalog_v1.types.TagField() - tag.fields["description"].string_value = values[0] - - tag.fields["is_primary_key"] = datacatalog_v1.types.TagField() - tag.fields["is_primary_key"].bool_value = values[1] - - tag.fields["is_foreign_key"] = datacatalog_v1.types.TagField() - tag.fields["is_foreign_key"].bool_value = values[2] - - tag.fields["data_type"] = datacatalog_v1.types.TagField() - tag.fields["data_type"].string_value = values[3] - - tag.column = column_id - try: - tag = datacatalog_client.create_tag(parent=table_entry.name, tag=tag) - print('tag created/updated for {}'.format(column_id)) - except Exception as e: - print(e) - print( - 'Failed to create template {} for {}.{}.{}.{}'.format( - TAG_TEMPLATE_ID, - DATASET_ID, - DATASET_ID, - table_id, - column_id) - ) - -def tag_metadata_from_bq( - PROJECT_ID, - DATASET_ID, - TAG_TEMPLATE_PATH, - TAG_TEMPLATE_ID -): - bq_client = bigquery.Client(project=PROJECT_ID) - query_job = bq_client.query(define_query(PROJECT_ID, DATASET_ID)) - rows = query_job.result() - for row in rows: - tag_bq_columns( - PROJECT_ID, - TAG_TEMPLATE_PATH, - TAG_TEMPLATE_ID, - row.dataset_id, - row.table_id, - row.column_id, - [row.description, row.is_primary_key, row.is_foreign_key, row.data_type]) - - -def create_template_and_tag_bq( - PROJECT_ID: str, - DATASET_ID: str, - TAG_TEMPLATE_ID: str, - LOCATION: str -): - - TAG_TEMPLATE_PATH = create_tag_template( - TAG_TEMPLATE_ID, - PROJECT_ID, - LOCATION) - - tag_metadata_from_bq( - PROJECT_ID, - DATASET_ID, - TAG_TEMPLATE_PATH, - TAG_TEMPLATE_ID - ) \ No newline at end of file diff --git a/notebooks/aux_data/customers_aux_data.py b/notebooks/aux_data/customers_aux_data.py deleted file mode 100644 index 2c71561..0000000 --- a/notebooks/aux_data/customers_aux_data.py +++ /dev/null @@ -1,1517 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -channel = [{ - "channel": "email" -}, { - "channel": "other" -}, { - "channel": "inbound" -}, { - "channel": "in-store" -}, { - "channel": "outbound" -}, { - "channel": "mobile-app" -}, { - "channel": "online-store" -}] - - -locations = [{ - "city": "Harrisburg", - "state": "Pennsylvania" -}, { - "city": "Washington", - "state": "District of Columbia" -}, { - "city": "El Paso", - "state": "Texas" -}, { - "city": "Punta Gorda", - "state": "Florida" -}, { - "city": "Newport News", - "state": "Virginia" -}, { - "city": "Houston", - "state": "Texas" -}, { - "city": "New Haven", - "state": "Connecticut" -}, { - "city": "Clearwater", - "state": "Florida" -}, { - "city": "Evansville", - "state": "Indiana" -}, { - "city": "Lexington", - "state": "Kentucky" -}, { - "city": "Los Angeles", - "state": "California" -}, { - "city": "Fort Wayne", - "state": "Indiana" -}, { - "city": "Saint Petersburg", - "state": "Florida" -}, { - "city": "North Las Vegas", - "state": "Nevada" -}, { - "city": "Lubbock", - "state": "Texas" -}, { - "city": "Portland", - "state": "Oregon" -}, { - "city": "Glendale", - "state": "California" -}, { - "city": "Wilmington", - "state": "Delaware" -}, { - "city": "Boston", - "state": "Massachusetts" -}, { - "city": "Chula Vista", - "state": "California" -}, { - "city": "Las Vegas", - "state": "Nevada" -}, { - "city": "Madison", - "state": "Wisconsin" -}, { - "city": "Knoxville", - "state": "Tennessee" -}, { - "city": "Atlanta", - "state": "Georgia" -}, { - "city": "Cleveland", - "state": "Ohio" -}, { - "city": "Cincinnati", - "state": "Ohio" -}, { - "city": "Honolulu", - "state": "Hawaii" -}, { - "city": "Indianapolis", - "state": "Indiana" -}, { - "city": "Huntington", - "state": "West Virginia" -}, { - "city": "Dayton", - "state": "Ohio" -}, { - "city": "Miami", - "state": "Florida" -}, { - "city": "Jacksonville", - "state": "Florida" -}, { - "city": "Whittier", - "state": "California" -}, { - "city": "Tampa", - "state": "Florida" -}, { - "city": "Akron", - "state": "Ohio" -}, { - "city": "Lancaster", - "state": "California" -}, { - "city": "Orlando", - "state": "Florida" -}, { - "city": "Englewood", - "state": "Colorado" -}, { - "city": "Riverside", - "state": "California" -}, { - "city": "Monroe", - "state": "Louisiana" -}, { - "city": "Tyler", - "state": "Texas" -}, { - "city": "Metairie", - "state": "Louisiana" -}, { - "city": "Greensboro", - "state": "North Carolina" -}, { - "city": "Bethesda", - "state": "Maryland" -}, { - "city": "New York City", - "state": "New York" -}, { - "city": "Birmingham", - "state": "Alabama" -}, { - "city": "Irving", - "state": "Texas" -}, { - "city": "Oklahoma City", - "state": "Oklahoma" -}, { - "city": "Shawnee Mission", - "state": "Kansas" -}, { - "city": "San Jose", - "state": "California" -}, { - "city": "Alexandria", - "state": "Virginia" -}, { - "city": "Nashville", - "state": "Tennessee" -}, { - "city": "Fort Lauderdale", - "state": "Florida" -}, { - "city": "Alexandria", - "state": "Louisiana" -}, { - "city": "Odessa", - "state": "Texas" -}, { - "city": "San Diego", - "state": "California" -}, { - "city": "Oakland", - "state": "California" -}, { - "city": "Salt Lake City", - "state": "Utah" -}, { - "city": "Lafayette", - "state": "Indiana" -}, { - "city": "Melbourne", - "state": "Florida" -}, { - "city": "Trenton", - "state": "New Jersey" -}, { - "city": "West Palm Beach", - "state": "Florida" -}, { - "city": "Norfolk", - "state": "Virginia" -}, { - "city": "Kansas City", - "state": "Missouri" -}, { - "city": "Sacramento", - "state": "California" -}, { - "city": "Stockton", - "state": "California" -}, { - "city": "Chandler", - "state": "Arizona" -}, { - "city": "Charlotte", - "state": "North Carolina" -}, { - "city": "Bronx", - "state": "New York" -}, { - "city": "Wilkes Barre", - "state": "Pennsylvania" -}, { - "city": "Boulder", - "state": "Colorado" -}, { - "city": "San Angelo", - "state": "Texas" -}, { - "city": "Colorado Springs", - "state": "Colorado" -}, { - "city": "Baltimore", - "state": "Maryland" -}, { - "city": "Hartford", - "state": "Connecticut" -}, { - "city": "Phoenix", - "state": "Arizona" -}, { - "city": "Portsmouth", - "state": "New Hampshire" -}, { - "city": "Louisville", - "state": "Kentucky" -}, { - "city": "San Francisco", - "state": "California" -}, { - "city": "Gainesville", - "state": "Florida" -}, { - "city": "Omaha", - "state": "Nebraska" -}, { - "city": "Tulsa", - "state": "Oklahoma" -}, { - "city": "Buffalo", - "state": "New York" -}, { - "city": "Memphis", - "state": "Tennessee" -}, { - "city": "Gatesville", - "state": "Texas" -}, { - "city": "Anchorage", - "state": "Alaska" -}, { - "city": "Monticello", - "state": "Minnesota" -}, { - "city": "Pensacola", - "state": "Florida" -}, { - "city": "Cape Coral", - "state": "Florida" -}, { - "city": "Richmond", - "state": "Virginia" -}, { - "city": "Fullerton", - "state": "California" -}, { - "city": "Augusta", - "state": "Georgia" -}, { - "city": "Lees Summit", - "state": "Missouri" -}, { - "city": "Detroit", - "state": "Michigan" -}, { - "city": "Garland", - "state": "Texas" -}, { - "city": "Columbus", - "state": "Georgia" -}, { - "city": "Loretto", - "state": "Minnesota" -}, { - "city": "Denver", - "state": "Colorado" -}, { - "city": "Juneau", - "state": "Alaska" -}, { - "city": "San Antonio", - "state": "Texas" -}, { - "city": "Seattle", - "state": "Washington" -}, { - "city": "San Rafael", - "state": "California" -}, { - "city": "Chicago", - "state": "Illinois" -}, { - "city": "Baton Rouge", - "state": "Louisiana" -}, { - "city": "Boise", - "state": "Idaho" -}, { - "city": "Greeley", - "state": "Colorado" -}, { - "city": "Rochester", - "state": "New York" -}, { - "city": "Philadelphia", - "state": "Pennsylvania" -}, { - "city": "Palm Bay", - "state": "Florida" -}, { - "city": "Dallas", - "state": "Texas" -}, { - "city": "Topeka", - "state": "Kansas" -}, { - "city": "Shreveport", - "state": "Louisiana" -}, { - "city": "Inglewood", - "state": "California" -}, { - "city": "Boynton Beach", - "state": "Florida" -}, { - "city": "Spokane", - "state": "Washington" -}, { - "city": "Springfield", - "state": "Illinois" -}, { - "city": "Sterling", - "state": "Virginia" -}, { - "city": "Tacoma", - "state": "Washington" -}, { - "city": "Saint Louis", - "state": "Missouri" -}, { - "city": "Decatur", - "state": "Illinois" -}, { - "city": "Sarasota", - "state": "Florida" -}, { - "city": "Olympia", - "state": "Washington" -}, { - "city": "Winston Salem", - "state": "North Carolina" -}, { - "city": "Texarkana", - "state": "Texas" -}, { - "city": "Arlington", - "state": "Texas" -}, { - "city": "Worcester", - "state": "Massachusetts" -}, { - "city": "Santa Fe", - "state": "New Mexico" -}, { - "city": "Corona", - "state": "California" -}, { - "city": "Pittsburgh", - "state": "Pennsylvania" -}, { - "city": "Newark", - "state": "New Jersey" -}, { - "city": "Erie", - "state": "Pennsylvania" -}, { - "city": "Missoula", - "state": "Montana" -}, { - "city": "Flushing", - "state": "New York" -}, { - "city": "Minneapolis", - "state": "Minnesota" -}, { - "city": "Long Beach", - "state": "California" -}, { - "city": "Ocala", - "state": "Florida" -}, { - "city": "Mc Keesport", - "state": "Pennsylvania" -}, { - "city": "Florence", - "state": "South Carolina" -}, { - "city": "Waterbury", - "state": "Connecticut" -}, { - "city": "Austin", - "state": "Texas" -}, { - "city": "Brooklyn", - "state": "New York" -}, { - "city": "Anaheim", - "state": "California" -}, { - "city": "Jefferson City", - "state": "Missouri" -}, { - "city": "Fresno", - "state": "California" -}, { - "city": "Des Moines", - "state": "Iowa" -}, { - "city": "Biloxi", - "state": "Mississippi" -}, { - "city": "Billings", - "state": "Montana" -}, { - "city": "Paterson", - "state": "New Jersey" -}, { - "city": "Columbia", - "state": "South Carolina" -}, { - "city": "Apache Junction", - "state": "Arizona" -}, { - "city": "Reno", - "state": "Nevada" -}, { - "city": "Silver Spring", - "state": "Maryland" -}, { - "city": "New Orleans", - "state": "Louisiana" -}, { - "city": "Tucson", - "state": "Arizona" -}, { - "city": "Hollywood", - "state": "Florida" -}, { - "city": "Bloomington", - "state": "Illinois" -}, { - "city": "Norcross", - "state": "Georgia" -}, { - "city": "Newark", - "state": "Delaware" -}, { - "city": "Anderson", - "state": "South Carolina" -}, { - "city": "Mobile", - "state": "Alabama" -}, { - "city": "Savannah", - "state": "Georgia" -}, { - "city": "Kent", - "state": "Washington" -}, { - "city": "Cheyenne", - "state": "Wyoming" -}, { - "city": "Albany", - "state": "New York" -}, { - "city": "Southfield", - "state": "Michigan" -}, { - "city": "Spartanburg", - "state": "South Carolina" -}, { - "city": "Bradenton", - "state": "Florida" -}, { - "city": "Migrate", - "state": "Kentucky" -}, { - "city": "Waco", - "state": "Texas" -}, { - "city": "Grand Rapids", - "state": "Michigan" -}, { - "city": "Arlington", - "state": "Virginia" -}, { - "city": "Youngstown", - "state": "Ohio" -}, { - "city": "Montgomery", - "state": "Alabama" -}, { - "city": "Idaho Falls", - "state": "Idaho" -}, { - "city": "Fredericksburg", - "state": "Virginia" -}, { - "city": "Cedar Rapids", - "state": "Iowa" -}, { - "city": "Johnson City", - "state": "Tennessee" -}, { - "city": "Santa Barbara", - "state": "California" -}, { - "city": "Fort Worth", - "state": "Texas" -}, { - "city": "Roanoke", - "state": "Virginia" -}, { - "city": "Lake Charles", - "state": "Louisiana" -}, { - "city": "Helena", - "state": "Montana" -}, { - "city": "Troy", - "state": "Michigan" -}, { - "city": "Gulfport", - "state": "Mississippi" -}, { - "city": "Irvine", - "state": "California" -}, { - "city": "Hagerstown", - "state": "Maryland" -}, { - "city": "Port Charlotte", - "state": "Florida" -}, { - "city": "Albuquerque", - "state": "New Mexico" -}, { - "city": "Raleigh", - "state": "North Carolina" -}, { - "city": "Scottsdale", - "state": "Arizona" -}, { - "city": "Laurel", - "state": "Maryland" -}, { - "city": "Midland", - "state": "Michigan" -}, { - "city": "Carson City", - "state": "Nevada" -}, { - "city": "Fargo", - "state": "North Dakota" -}, { - "city": "Killeen", - "state": "Texas" -}, { - "city": "Hot Springs National Park", - "state": "Arkansas" -}, { - "city": "Lawrenceville", - "state": "Georgia" -}, { - "city": "Bonita Springs", - "state": "Florida" -}, { - "city": "Hattiesburg", - "state": "Mississippi" -}, { - "city": "Lansing", - "state": "Michigan" -}, { - "city": "Amarillo", - "state": "Texas" -}, { - "city": "Rockville", - "state": "Maryland" -}, { - "city": "Charlottesville", - "state": "Virginia" -}, { - "city": "Saint Paul", - "state": "Minnesota" -}, { - "city": "San Bernardino", - "state": "California" -}, { - "city": "Jackson", - "state": "Mississippi" -}, { - "city": "Peoria", - "state": "Illinois" -}, { - "city": "Salem", - "state": "Oregon" -}, { - "city": "Fayetteville", - "state": "North Carolina" -}, { - "city": "Saint Augustine", - "state": "Florida" -}, { - "city": "Alpharetta", - "state": "Georgia" -}, { - "city": "Staten Island", - "state": "New York" -}, { - "city": "Lincoln", - "state": "Nebraska" -}, { - "city": "Springfield", - "state": "Ohio" -}, { - "city": "Waterloo", - "state": "Iowa" -}, { - "city": "Vancouver", - "state": "Washington" -}, { - "city": "Van Nuys", - "state": "California" -}, { - "city": "Toledo", - "state": "Ohio" -}, { - "city": "Stamford", - "state": "Connecticut" -}, { - "city": "Plano", - "state": "Texas" -}, { - "city": "South Bend", - "state": "Indiana" -}, { - "city": "Wichita", - "state": "Kansas" -}, { - "city": "Littleton", - "state": "Colorado" -}, { - "city": "Manchester", - "state": "New Hampshire" -}, { - "city": "Modesto", - "state": "California" -}, { - "city": "Macon", - "state": "Georgia" -}, { - "city": "Sparks", - "state": "Nevada" -}, { - "city": "Battle Creek", - "state": "Michigan" -}, { - "city": "Joliet", - "state": "Illinois" -}, { - "city": "Lakewood", - "state": "Washington" -}, { - "city": "Montpelier", - "state": "Vermont" -}, { - "city": "Greenville", - "state": "South Carolina" -}, { - "city": "North Little Rock", - "state": "Arkansas" -}, { - "city": "Mesa", - "state": "Arizona" -}, { - "city": "Canton", - "state": "Ohio" -}, { - "city": "Gilbert", - "state": "Arizona" -}, { - "city": "Schaumburg", - "state": "Illinois" -}, { - "city": "Pompano Beach", - "state": "Florida" -}, { - "city": "Concord", - "state": "California" -}, { - "city": "Pasadena", - "state": "California" -}, { - "city": "New Brunswick", - "state": "New Jersey" -}, { - "city": "Jamaica", - "state": "New York" -}, { - "city": "Tallahassee", - "state": "Florida" -}, { - "city": "Watertown", - "state": "Massachusetts" -}, { - "city": "Athens", - "state": "Georgia" -}, { - "city": "Muncie", - "state": "Indiana" -}, { - "city": "Lakeland", - "state": "Florida" -}, { - "city": "San Luis Obispo", - "state": "California" -}, { - "city": "Fort Collins", - "state": "Colorado" -}, { - "city": "Huntsville", - "state": "Alabama" -}, { - "city": "West Hartford", - "state": "Connecticut" -}, { - "city": "Champaign", - "state": "Illinois" -}, { - "city": "Young America", - "state": "Minnesota" -}, { - "city": "Newton", - "state": "Massachusetts" -}, { - "city": "Crawfordsville", - "state": "Indiana" -}, { - "city": "Spring", - "state": "Texas" -}, { - "city": "Camden", - "state": "New Jersey" -}, { - "city": "Chico", - "state": "California" -}, { - "city": "Fairfield", - "state": "Connecticut" -}, { - "city": "Bakersfield", - "state": "California" -}, { - "city": "Rockford", - "state": "Illinois" -}, { - "city": "Torrance", - "state": "California" -}, { - "city": "Lynchburg", - "state": "Virginia" -}, { - "city": "Temple", - "state": "Texas" -}, { - "city": "Simi Valley", - "state": "California" -}, { - "city": "Lancaster", - "state": "Pennsylvania" -}, { - "city": "Saint Cloud", - "state": "Minnesota" -}, { - "city": "Fort Smith", - "state": "Arkansas" -}, { - "city": "Duluth", - "state": "Georgia" -}, { - "city": "Saginaw", - "state": "Michigan" -}, { - "city": "Ogden", - "state": "Utah" -}, { - "city": "Lima", - "state": "Ohio" -}, { - "city": "Davenport", - "state": "Iowa" -}, { - "city": "Longview", - "state": "Texas" -}, { - "city": "Pinellas Park", - "state": "Florida" -}, { - "city": "Redwood City", - "state": "California" -}, { - "city": "Reading", - "state": "Pennsylvania" -}, { - "city": "Lehigh Acres", - "state": "Florida" -}, { - "city": "Santa Clara", - "state": "California" -}, { - "city": "Kansas City", - "state": "Kansas" -}, { - "city": "Green Bay", - "state": "Wisconsin" -}, { - "city": "Santa Monica", - "state": "California" -}, { - "city": "Iowa City", - "state": "Iowa" -}, { - "city": "Pueblo", - "state": "Colorado" -}, { - "city": "Brea", - "state": "California" -}, { - "city": "Bowie", - "state": "Maryland" -}, { - "city": "Norman", - "state": "Oklahoma" -}, { - "city": "Sioux Falls", - "state": "South Dakota" -}, { - "city": "Frankfort", - "state": "Kentucky" -}, { - "city": "Fort Pierce", - "state": "Florida" -}, { - "city": "Corpus Christi", - "state": "Texas" -}, { - "city": "Burbank", - "state": "California" -}, { - "city": "Little Rock", - "state": "Arkansas" -}, { - "city": "York", - "state": "Pennsylvania" -}, { - "city": "Warren", - "state": "Ohio" -}, { - "city": "Gastonia", - "state": "North Carolina" -}, { - "city": "Fairbanks", - "state": "Alaska" -}, { - "city": "Largo", - "state": "Florida" -}, { - "city": "Santa Ana", - "state": "California" -}, { - "city": "Valdosta", - "state": "Georgia" -}, { - "city": "Valley Forge", - "state": "Pennsylvania" -}, { - "city": "Naples", - "state": "Florida" -}, { - "city": "Oxnard", - "state": "California" -}, { - "city": "Zephyrhills", - "state": "Florida" -}, { - "city": "Petaluma", - "state": "California" -}, { - "city": "Northridge", - "state": "California" -}, { - "city": "Kingsport", - "state": "Tennessee" -}, { - "city": "Columbia", - "state": "Missouri" -}, { - "city": "Carlsbad", - "state": "California" -}, { - "city": "Arvada", - "state": "Colorado" -}, { - "city": "Henderson", - "state": "Nevada" -}, { - "city": "Abilene", - "state": "Texas" -}, { - "city": "Everett", - "state": "Washington" -}, { - "city": "Chattanooga", - "state": "Tennessee" -}, { - "city": "Elmira", - "state": "New York" -}, { - "city": "Santa Cruz", - "state": "California" -}, { - "city": "Virginia Beach", - "state": "Virginia" -}, { - "city": "Sandy", - "state": "Utah" -}, { - "city": "Kalamazoo", - "state": "Michigan" -}, { - "city": "Charleston", - "state": "South Carolina" -}, { - "city": "Conroe", - "state": "Texas" -}, { - "city": "Muskegon", - "state": "Michigan" -}, { - "city": "Ventura", - "state": "California" -}, { - "city": "Eugene", - "state": "Oregon" -}, { - "city": "Charleston", - "state": "West Virginia" -}, { - "city": "Vero Beach", - "state": "Florida" -}, { - "city": "Columbus", - "state": "Ohio" -}, { - "city": "Maple Plain", - "state": "Minnesota" -}, { - "city": "Tuscaloosa", - "state": "Alabama" -}, { - "city": "Providence", - "state": "Rhode Island" -}, { - "city": "Brooksville", - "state": "Florida" -}, { - "city": "Fairfax", - "state": "Virginia" -}, { - "city": "Milwaukee", - "state": "Wisconsin" -}, { - "city": "Hialeah", - "state": "Florida" -}, { - "city": "Denton", - "state": "Texas" -}, { - "city": "Laredo", - "state": "Texas" -}, { - "city": "New Castle", - "state": "Pennsylvania" -}, { - "city": "Manassas", - "state": "Virginia" -}, { - "city": "Flint", - "state": "Michigan" -}, { - "city": "Mount Vernon", - "state": "New York" -}, { - "city": "Oceanside", - "state": "California" -}, { - "city": "Lake Worth", - "state": "Florida" -}, { - "city": "San Mateo", - "state": "California" -}, { - "city": "Moreno Valley", - "state": "California" -}, { - "city": "Norwalk", - "state": "Connecticut" -}, { - "city": "Rochester", - "state": "Minnesota" -}, { - "city": "Sunnyvale", - "state": "California" -}, { - "city": "Bryan", - "state": "Texas" -}, { - "city": "South Lake Tahoe", - "state": "California" -}, { - "city": "Racine", - "state": "Wisconsin" -}, { - "city": "Pomona", - "state": "California" -}, { - "city": "Las Cruces", - "state": "New Mexico" -}, { - "city": "North Hollywood", - "state": "California" -}, { - "city": "Newport Beach", - "state": "California" -}, { - "city": "Jackson", - "state": "Tennessee" -}, { - "city": "Albany", - "state": "Georgia" -}, { - "city": "Marietta", - "state": "Georgia" -}, { - "city": "Port Saint Lucie", - "state": "Florida" -}, { - "city": "Jeffersonville", - "state": "Indiana" -}, { - "city": "Gainesville", - "state": "Georgia" -}, { - "city": "White Plains", - "state": "New York" -}, { - "city": "New Hyde Park", - "state": "New York" -}, { - "city": "Cambridge", - "state": "Massachusetts" -}, { - "city": "Wilmington", - "state": "North Carolina" -}, { - "city": "Winter Haven", - "state": "Florida" -}, { - "city": "Syracuse", - "state": "New York" -}, { - "city": "Lafayette", - "state": "Louisiana" -}, { - "city": "Meridian", - "state": "Mississippi" -}, { - "city": "Santa Rosa", - "state": "California" -}, { - "city": "Mansfield", - "state": "Ohio" -}, { - "city": "Yakima", - "state": "Washington" -}, { - "city": "Huntsville", - "state": "Texas" -}, { - "city": "Daytona Beach", - "state": "Florida" -}, { - "city": "Berkeley", - "state": "California" -}, { - "city": "Springfield", - "state": "Missouri" -}, { - "city": "Garden Grove", - "state": "California" -}, { - "city": "Hyattsville", - "state": "Maryland" -}, { - "city": "Bethlehem", - "state": "Pennsylvania" -}, { - "city": "Seminole", - "state": "Florida" -}, { - "city": "Ann Arbor", - "state": "Michigan" -}, { - "city": "Boca Raton", - "state": "Florida" -}, { - "city": "Myrtle Beach", - "state": "South Carolina" -}, { - "city": "Brockton", - "state": "Massachusetts" -}, { - "city": "Midland", - "state": "Texas" -}, { - "city": "Palmdale", - "state": "California" -}, { - "city": "Saint Joseph", - "state": "Missouri" -}, { - "city": "Evanston", - "state": "Illinois" -}, { - "city": "North Port", - "state": "Florida" -}, { - "city": "New Bedford", - "state": "Massachusetts" -}, { - "city": "Fort Myers", - "state": "Florida" -}, { - "city": "Port Washington", - "state": "New York" -}, { - "city": "Gaithersburg", - "state": "Maryland" -}, { - "city": "Tempe", - "state": "Arizona" -}, { - "city": "Columbus", - "state": "Mississippi" -}, { - "city": "Beaverton", - "state": "Oregon" -}, { - "city": "Panama City", - "state": "Florida" -}, { - "city": "Peoria", - "state": "Arizona" -}, { - "city": "Springfield", - "state": "Massachusetts" -}, { - "city": "Great Neck", - "state": "New York" -}, { - "city": "Richmond", - "state": "California" -}, { - "city": "Escondido", - "state": "California" -}, { - "city": "Spring Hill", - "state": "Florida" -}, { - "city": "Aiken", - "state": "South Carolina" -}, { - "city": "Suffolk", - "state": "Virginia" -}, { - "city": "Carol Stream", - "state": "Illinois" -}, { - "city": "Warren", - "state": "Michigan" -}, { - "city": "Beaumont", - "state": "Texas" -}, { - "city": "Pocatello", - "state": "Idaho" -}, { - "city": "Ashburn", - "state": "Virginia" -}, { - "city": "Elizabeth", - "state": "New Jersey" -}, { - "city": "Merrifield", - "state": "Virginia" -}, { - "city": "Palo Alto", - "state": "California" -}, { - "city": "Glendale", - "state": "Arizona" -}, { - "city": "Salinas", - "state": "California" -}, { - "city": "Naperville", - "state": "Illinois" -}, { - "city": "Aurora", - "state": "Colorado" -}, { - "city": "Cumming", - "state": "Georgia" -}, { - "city": "Sioux City", - "state": "Iowa" -}, { - "city": "Vienna", - "state": "Virginia" -}, { - "city": "Durham", - "state": "North Carolina" -}, { - "city": "Levittown", - "state": "Pennsylvania" -}, { - "city": "Scranton", - "state": "Pennsylvania" -}, { - "city": "Aurora", - "state": "Illinois" -}, { - "city": "Portsmouth", - "state": "Virginia" -}, { - "city": "Edmond", - "state": "Oklahoma" -}, { - "city": "Provo", - "state": "Utah" -}, { - "city": "Asheville", - "state": "North Carolina" -}, { - "city": "Wichita Falls", - "state": "Texas" -}, { - "city": "Prescott", - "state": "Arizona" -}, { - "city": "Miami Beach", - "state": "Florida" -}, { - "city": "Hampton", - "state": "Virginia" -}, { - "city": "Herndon", - "state": "Virginia" -}, { - "city": "Gary", - "state": "Indiana" -}, { - "city": "Bridgeport", - "state": "Connecticut" -}, { - "city": "Bloomington", - "state": "Indiana" -}, { - "city": "Anderson", - "state": "Indiana" -}, { - "city": "Huntington Beach", - "state": "California" -}, { - "city": "Kissimmee", - "state": "Florida" -}, { - "city": "Hicksville", - "state": "New York" -}, { - "city": "College Station", - "state": "Texas" -}, { - "city": "Waltham", - "state": "Massachusetts" -}, { - "city": "Anniston", - "state": "Alabama" -}, { - "city": "Duluth", - "state": "Minnesota" -}, { - "city": "Bismarck", - "state": "North Dakota" -}, { - "city": "Terre Haute", - "state": "Indiana" -}, { - "city": "Alhambra", - "state": "California" -}, { - "city": "Orange", - "state": "California" -}, { - "city": "Mesquite", - "state": "Texas" -}, { - "city": "East Saint Louis", - "state": "Illinois" -}, { - "city": "Pasadena", - "state": "Texas" -}, { - "city": "Katy", - "state": "Texas" -}, { - "city": "Chesapeake", - "state": "Virginia" -}, { - "city": "Round Rock", - "state": "Texas" -}, { - "city": "Dearborn", - "state": "Michigan" -}, { - "city": "Annapolis", - "state": "Maryland" -}, { - "city": "Danbury", - "state": "Connecticut" -}, { - "city": "Gadsden", - "state": "Alabama" -}, { - "city": "Reston", - "state": "Virginia" -}, { - "city": "Yonkers", - "state": "New York" -}, { - "city": "Dulles", - "state": "Virginia" -}, { - "city": "Johnstown", - "state": "Pennsylvania" -}, { - "city": "Bellevue", - "state": "Washington" -}, { - "city": "Allentown", - "state": "Pennsylvania" -}, { - "city": "Decatur", - "state": "Georgia" -}, { - "city": "High Point", - "state": "North Carolina" -}, { - "city": "Lynn", - "state": "Massachusetts" -}, { - "city": "London", - "state": "Kentucky" -}, { - "city": "Frederick", - "state": "Maryland" -}, { - "city": "Visalia", - "state": "California" -}, { - "city": "Utica", - "state": "New York" -}, { - "city": "Independence", - "state": "Missouri" -}, { - "city": "Murfreesboro", - "state": "Tennessee" -}, { - "city": "Portland", - "state": "Maine" -}, { - "city": "Springfield", - "state": "Virginia" -}, { - "city": "Grand Junction", - "state": "Colorado" -}, { - "city": "Ridgely", - "state": "Maryland" -}, { - "city": "Humble", - "state": "Texas" -}, { - "city": "Schenectady", - "state": "New York" -}, { - "city": "Jersey City", - "state": "New Jersey" -}, { - "city": "Morgantown", - "state": "West Virginia" -}, { - "city": "Mountain View", - "state": "California" -}, { - "city": "Appleton", - "state": "Wisconsin" -}, { - "city": "Palatine", - "state": "Illinois" -}, { - "city": "Delray Beach", - "state": "Florida" -}, { - "city": "Hayward", - "state": "California" -}, { - "city": "Galveston", - "state": "Texas" -}, { - "city": "Homestead", - "state": "Florida" -}, { - "city": "Falls Church", - "state": "Virginia" -}, { - "city": "Hamilton", - "state": "Ohio" -}, { - "city": "Grand Forks", - "state": "North Dakota" -}, { - "city": "Beaufort", - "state": "South Carolina" -}, { - "city": "Bozeman", - "state": "Montana" -}, { - "city": "Farmington", - "state": "Michigan" -}, { - "city": "Woburn", - "state": "Massachusetts" -}] - - diff --git a/notebooks/aux_data/data_gen.py b/notebooks/aux_data/data_gen.py deleted file mode 100644 index dc71124..0000000 --- a/notebooks/aux_data/data_gen.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from google.cloud import bigquery -from aux_data.metadata_aux_data import get_metadata_data -from typing import List, Dict -import numpy as np -from datetime import datetime, timedelta - -SEED = 1 -rng = np.random.default_rng(SEED) - - -def create_and_populate_customers(num_customers: int = 50000) -> List[Dict]: - from aux_data.customers_aux_data import channel, locations - - customers_location = rng.choice(locations, size=(num_customers)) - customers_channel = rng.choice(channel, size=(num_customers)) - customers_total_purchases = rng.integers(1, 100, size=(num_customers)) - customers_total_value = rng.integers(10, 1000, size=(num_customers)) - customers_total_emails = rng.integers(1, 100, size=(num_customers)) - customers_loyalty_score = rng.integers(1, 100, size=(num_customers)) - customers_is_media_follower = rng.choice([False, True], size=(num_customers)) - - baseline_datetime = datetime(2023, 4, 1) - customers_last_sign_up_date = rng.integers(500, 1000, size=(num_customers)) - customers_last_purchase_date = rng.integers(20, 100, size=(num_customers)) - customers_last_activity_date = customers_last_purchase_date - rng.integers(10, 20, size=(num_customers)) - - customers_cart_total = rng.uniform(0.0, 800.0, size=(num_customers)) - - customers_data = [] - - for i in range(num_customers): - customer = {} - customer['customer_id'] = int(i) - customer['email'] = f'user{i}@sample_user{i}.sample' - customer['city'] = customers_location[i]['city'] - customer['state'] = customers_location[i]['state'] - customer['channel'] = customers_channel[i]['channel'] - customer['total_purchases'] = int(customers_total_purchases[i]) - customer['total_value'] = int(customers_total_value[i]) - customer['total_emails'] = int(customers_total_emails[i]) - customer['loyalty_score'] = int(customers_loyalty_score[i]) - customer['is_media_follower'] = bool(customers_is_media_follower[i]) - - customer['last_sign_up_date'] = baseline_datetime - timedelta(days=int(customers_last_sign_up_date[i])) - customer['last_sign_up_date'] = customer['last_sign_up_date'].strftime('%Y-%m-%d') - - customer['last_purchase_date'] = baseline_datetime - timedelta(days=int(customers_last_purchase_date[i])) - customer['last_purchase_date'] = customer['last_purchase_date'].strftime('%Y-%m-%d') - - customer['last_activity_date'] = baseline_datetime - timedelta(days=int(customers_last_activity_date[i])) - customer['last_activity_date'] = customer['last_activity_date'].strftime('%Y-%m-%d') - - customer['cart_total'] = round(float(customers_cart_total[i]), 2) - - customers_data.append(customer) - - return customers_data - - -# Generate and load events table to BQ -def create_and_populate_events(num_customers: int = 50000) -> Dict: - from aux_data.events_aux_data import event_type - - events_per_customer = list(map(int, np.absolute(np.floor(rng.normal(1, 1, size=(num_customers)) * 100)))) - num_events = sum(events_per_customer) - - events_type = rng.choice(event_type, size=(num_events)) - - baseline_datetime = datetime(2023, 4, 1) - events_date_delta = rng.integers(20, 200, size=(num_events)) - - events_data = [] - idx = 0 - - for i in range(num_customers): - for _ in range(events_per_customer[i]): - event = {} - event['customer_id'] = i - event['event_id'] = idx - event['event_date'] = baseline_datetime - timedelta(days=int(events_date_delta[idx])) - event['event_date'] = event['event_date'].strftime('%Y-%m-%d') - event['event_type'] = events_type[idx]['event_type'] - idx += 1 - events_data.append(event) - return events_data - - -# Generate and load transactions to BQ -def create_and_populate_transactions(num_customers: int = 50000) -> Dict: - from aux_data.transactions_aux_data import product_name, transaction_type - - transactions_per_customer = list(map(int, np.absolute(np.floor(rng.normal(1, 1, size=(num_customers)) * 100)))) - num_transactions = sum(transactions_per_customer) - - product_name_choice = rng.choice(product_name, size=(num_transactions)) - transaction_type_choice = rng.choice(transaction_type, size=(num_transactions)) - - transaction_qtn = rng.integers(1, 30, size=(num_transactions)) - transaction_value = rng.integers(1, 5000, size=(num_transactions)) - app_purchase_quantity = rng.integers(1, 10, size=(num_transactions)) - - transaction_is_online = rng.choice([False, True], size=(num_transactions)) - - baseline_datetime = datetime(2023, 4, 1) - transactions_date_delta = rng.integers(20, 200, size=(num_transactions)) - - transaction_data = [] - - transaction_id = 0 - for i in range(num_customers): - for _ in range(transactions_per_customer[i]): - transaction = {} - transaction['transaction_id'] = transaction_id - transaction['customer_id'] = i - transaction['transaction_quantity'] = int(transaction_qtn[transaction_id]) - transaction['transaction_value'] = int(transaction_value[transaction_id]) - transaction['transaction_type'] = transaction_type_choice[transaction_id]['transaction_type'] - transaction['app_purchase_quantity'] = int(app_purchase_quantity[transaction_id]) - transaction['is_online'] = bool(transaction_is_online[transaction_id]) - - transaction['transaction_date'] = baseline_datetime - timedelta(days=int(transactions_date_delta[transaction_id])) - transaction['transaction_date'] = transaction['transaction_date'].strftime('%Y-%m-%d') - - transaction['product_name'] = product_name_choice[transaction_id]['product_name'] - transaction['product_id'] = product_name_choice[transaction_id]['product_id'] - - transaction_id += 1 - transaction_data.append(transaction) - - return transaction_data - - -def generate_and_populate_dataset( - PROJECT_ID: str, - DATASET_ID: str, - create_tables: bool = True -): - bq_client = bigquery.Client(project=PROJECT_ID) - - # Define tables schema - customers_schema = [ - bigquery.SchemaField('customer_id', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('email', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('city', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('state', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('channel', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('total_purchases', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('total_value', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('total_emails', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('loyalty_score', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('is_media_follower', 'BOOLEAN', mode='NULLABLE'), - bigquery.SchemaField('last_sign_up_date', 'DATE', mode='NULLABLE'), - bigquery.SchemaField('last_purchase_date', 'DATE', mode='NULLABLE'), - bigquery.SchemaField('last_activity_date', 'DATE', mode='NULLABLE'), - bigquery.SchemaField('cart_total', 'FLOAT', mode='NULLABLE') - ] - - events_schema = [ - bigquery.SchemaField('customer_id', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('event_id', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('event_date', 'DATE', mode='NULLABLE'), - bigquery.SchemaField('event_type', 'STRING', mode='NULLABLE') - ] - - transactions_schema = [ - bigquery.SchemaField('transaction_id', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('customer_id', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('transaction_quantity', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('transaction_value', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('transaction_type', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('app_purchase_quantity', 'INTEGER', mode='NULLABLE'), - bigquery.SchemaField('is_online', 'BOOLEAN', mode='NULLABLE'), - bigquery.SchemaField('transaction_date', 'DATE', mode='NULLABLE'), - bigquery.SchemaField('product_name', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('product_id', 'INTEGER', mode='NULLABLE') - ] - - metadata_schema = [ - bigquery.SchemaField('dataset_id', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('table_id', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('column_id', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('description', 'STRING', mode='NULLABLE'), - bigquery.SchemaField('is_primary_key', 'BOOLEAN', mode='NULLABLE'), - bigquery.SchemaField('is_foreign_key', 'BOOLEAN', mode='NULLABLE') - ] - - if(create_tables): - print('Creating tables ...') - for table_id, table_schema in zip(['customers', 'events', 'transactions', 'metadata'], - [customers_schema, events_schema, transactions_schema, metadata_schema]): - table_id = f'{PROJECT_ID}.{DATASET_ID}.{table_id}' - table = bigquery.Table(table_id, schema=table_schema) - table = bq_client.create_table(table) - - - print('Generating and populating METADATA table ...') - table_id = f"{PROJECT_ID}.{DATASET_ID}.metadata" - bq_client.load_table_from_json( - get_metadata_data(DATASET_ID=DATASET_ID), - destination=bigquery.Table(table_ref=table_id, schema=metadata_schema) - ) - - print('Generating and populating CUSTOMERS table ...') - customers_data = create_and_populate_customers() - table_id = f"{PROJECT_ID}.{DATASET_ID}.customers" - bq_client.load_table_from_json( - customers_data, - destination=bigquery.Table(table_ref=table_id, schema=customers_schema)) - - print('Generating and populating EVENTS table ...') - events_data = create_and_populate_events() - table_id = f"{PROJECT_ID}.{DATASET_ID}.events" - bq_client.load_table_from_json( - events_data, - destination=bigquery.Table(table_ref=table_id, schema=events_schema)) - - print('Generating and populating TRANSACTIONS table ...') - transactions_data = create_and_populate_transactions() - table_id = f"{PROJECT_ID}.{DATASET_ID}.transactions" - bq_client.load_table_from_json( - transactions_data, - destination=bigquery.Table(table_ref=table_id, schema=transactions_schema)) \ No newline at end of file diff --git a/notebooks/aux_data/events_aux_data.py b/notebooks/aux_data/events_aux_data.py deleted file mode 100644 index 9bf478e..0000000 --- a/notebooks/aux_data/events_aux_data.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -event_type = [{ - "event_type": "email-open" -}, { - "event_type": "in-store-purchase" -}, { - "event_type": "online-interaction" -}] \ No newline at end of file diff --git a/notebooks/aux_data/metadata_aux_data.py b/notebooks/aux_data/metadata_aux_data.py deleted file mode 100644 index 8ff8499..0000000 --- a/notebooks/aux_data/metadata_aux_data.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def get_metadata_data(DATASET_ID: str = 'cdp_dataset') -> list: - return [{ - "dataset_id": DATASET_ID, - "table_id": "events", - "column_id": "event_id", - "description": "A unique identifier for the event.", - "is_primary_key": "true", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "events", - "column_id": "event_date", - "description": "The date of the event.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "events", - "column_id": "event_type", - "description": "The type of event.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "customer_id", - "description": "A unique identifier of the customer.", - "is_primary_key": "true", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "email", - "description": "The customer's email address.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "city", - "description": "The city where the customer lives.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "state", - "description": "The state where the customer lives.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "channel", - "description": "The channel through which the customer was acquired.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "total_purchases", - "description": "The total number of purchases made by the customer.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "total_value", - "description": "The total value of all purchases made by the customer.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "total_emails", - "description": "The total number of emails opened by the customer.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "loyalty_score", - "description": "A score that measures the customer's engagement with the company.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "is_media_follower", - "description": "Whether the customer is a social media follower.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "last_sign_up_date", - "description": "The date the customer signed up.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "last_purchase_date", - "description": "The date the customer made their last purchase.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "last_activity_date", - "description": "The date of the customer's last account activity.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "customers", - "column_id": "cart_total", - "description": "The value of the items in the customer's shopping cart.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "transaction_id", - "description": "A unique identifier for the transaction.", - "is_primary_key": "true", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "transaction_quantity", - "description": "The quantity of items purchased in the transaction.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "transaction_value", - "description": "The total value of the transaction.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "transaction_type", - "description": "The type of transaction (e.g., purchase, refund, etc.).", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "app_purchase_qnt", - "description": "The value of the in-app purchase.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "is_purchase_online", - "description": "Whether the purchase was made online.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "transaction_date", - "description": "The date the transaction was made.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "product_name", - "description": "The name of the product that was purchased.", - "is_primary_key": "false", - "is_foreign_key": "false" - }, { - "dataset_id": DATASET_ID, - "table_id": "events", - "column_id": "customer_id", - "description": "A unique identifier of the customer.", - "is_primary_key": "false", - "is_foreign_key": "true" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "customer_id", - "description": "A unique identifier of the customer.", - "is_primary_key": "false", - "is_foreign_key": "true" - }, { - "dataset_id": DATASET_ID, - "table_id": "transactions", - "column_id": "product_id", - "description": "The code of the inventory item that was purchased.", - "is_primary_key": "false", - "is_foreign_key": "true" - }] \ No newline at end of file diff --git a/notebooks/aux_data/transactions_aux_data.py b/notebooks/aux_data/transactions_aux_data.py deleted file mode 100644 index bc57424..0000000 --- a/notebooks/aux_data/transactions_aux_data.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -transaction_type = [{ - "transaction_type": "in-app-purchase" -}, { - "transaction_type": "online-purchase" -}, { - "transaction_type": "in-store-purchase" -}, { - "transaction_type": "recurrent-customer" -}, { - "transaction_type": "thirdparty-purchase" -}] - - -product_name = [{ - "product_name": "Coat", "product_id": 1 -}, { - "product_name": "misc", "product_id": 2 -}, { - "product_name": "Handbag", "product_id": 3 -}, { - "product_name": "Headset", "product_id": 4 -}, { - "product_name": "Backpack", "product_id": 5 -}, { - "product_name": "Flashlight", "product_id": 6 -}, { - "product_name": "Hiking Boots", "product_id": 7 -}, { - "product_name": "Running Shoes", "product_id": 8 -}] \ No newline at end of file