Update the management command to accept datasets_list env var

The management commands for create, relate and truncate data now accept the datasets_list and datasets_exclude env vars. These values can also be set using the named arguments in the command. For backwards compatibility the current positional argument and the --exclude argument are kept in place.
Amsterdam · Jan 20, 2025 · b8614c3 · b8614c3
1 parent 0e7558d
commit b8614c3
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 30 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,3 +1,13 @@
+# 2025-01-16 (6.1.3)
+
+* Fixed `create_mock_data` to accept valid Amsterdam Schema
+* Updated the arguments for the `create_mock_data`, `relate_mock_data` and `truncate_tables` commands
+  to accept the DATASETS_LIST and DATASETS_EXCLUDE env vars.
+
+# 2024-10-12 (6.1.2)
+
+* Added pool_pre_ping=True to fix connection pool issues.
+
 # 2024-10-12 (6.1.2)
 
 * Added pool_pre_ping=True to fix connection pool issues.

diff --git a/README.md b/README.md
@@ -132,15 +132,16 @@ schema-tools, make sure one of the commit increments the version number in
 The schematools library contains two Django management commands to generate
 mock data. The first one is `create_mock_data` which generates mock data for all
 the datasets that are found at the configured schema location `SCHEMA_URL`
-(where `SCHEMA_URL` can be configure to point to a path at the local filesystem).
+(where `SCHEMA_URL` can be configured to point to a path at the local filesystem).
 
-The `create_mock_data` command processes all datasets. However, it is possible
-to limit this by adding positional arguments. These positional arguments can be
-dataset ids or paths to the location of the `dataset.json` on the local filesystem.
+The `create_mock_data` command expects either a list of dataset ids to include or a
+list of dataset ids to exclude. The datasets to include can be provided as positional arguments
+or using the `--datasets-list` argument, which defaults to the environment variable
+`DATASETS_LIST`. To exclude datasets the `--datasets-exclude` argument or the
+environment variable `DATASETS_EXCLUDE` can be used.
 
-Furthermore, the command has some options, e.g. to change
-the default number of generated records (`--size`) or to reverse meaning of the positional
-arguments using `--exclude`.
+Furthermore, the command has an option to change the default number of
+generated records (`--size`).
 
 To avoid duplicate primary keys on subsequent runs the `--start-at` options can be used
 to start autonumbering of primary keys at an offset.
@@ -152,10 +153,17 @@ autonumbering of primary keys at 50.
     django create_mock_data bag gebieden --size 5 --start-at 50
 ```
 
+or by using the environment variable
+
+```
+    export DATASETS_LIST=bag,gebieden
+    django create_mock_data --size 5 --start-at 50
+```
+
 To generate records for all datasets, except for the `fietspaaltjes` dataset:
 
 ```
-    django create_mock_data fietspaaltjes --exclude  # or -x
+    django create_mock_data --datasets-exclude fietspaaltjes  # or --exclude
 ```
 
 To generate records for the `bbga` dataset, by loading the schema from the local filesystem:
@@ -188,7 +196,7 @@ To add relations for `bag` and `gebieden` only:
 To add relations for all datasets except `meetbouten`:
 
 ```
-    django relate_mock_data meetbouten --exclude  # or -x
+    django relate_mock_data --datasets-exclude meetbouten # or --exclude
 ```
 
 NB. When only a subset of the datasets is being mocked, the command can fail when datasets that

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = amsterdam-schema-tools
-version = 6.1.2
+version = 6.1.3
 url = https://github.com/amsterdam/schema-tools
 license = Mozilla Public 2.0
 author = Team Data Diensten, van het Dataplatform onder de Directie Digitale Voorzieningen (Gemeente Amsterdam)

diff --git a/src/schematools/contrib/django/management/commands/__init__.py b/src/schematools/contrib/django/management/commands/__init__.py
@@ -3,6 +3,7 @@
 from collections.abc import Iterable
 from typing import Any
 
+from django.conf import settings
 from django.core.management import BaseCommand, CommandError, CommandParser
 
 from schematools.contrib.django.models import Dataset
@@ -13,32 +14,52 @@ class BaseDatasetCommand(BaseCommand):
 
     def add_arguments(self, parser: CommandParser) -> None:
         """Provide default arguments to pass dataset names to this management command."""
-        parser.add_argument("dataset", nargs="*", help="Names of the datasets.")
+        parser.add_argument(
+            "dataset", nargs="*", help="Datasets to use. Takes precedence over --datasets-list"
+        )
+        parser.add_argument(
+            "--datasets-list",
+            nargs="*",
+            default=settings.DATASETS_LIST,
+            help=f"Datasets to use (default: {settings.DATASETS_LIST})",
+        )
         parser.add_argument(
             "-x",
             "--exclude",
-            dest="exclude",
+            "--datasets-exclude",
+            dest="datasets_exclude",
             nargs="*",
-            default=[],
-            help="Datasets that need to be skipped.",
+            default=settings.DATASETS_EXCLUDE,
+            help=f"Datasets that need to be skipped. (default: {settings.DATASETS_EXCLUDE})",
         )
 
     def get_datasets(
         self, options: dict[str, Any], enable_db=None, default_all=False
     ) -> Iterable[Dataset]:
         """Provide the datasets based on the command options"""
-        datasets = Dataset.objects.all()
+        # Provide backwards compatibility for the positional argument datasets
+        options["datasets_list"] = (
+            options["dataset"] if options["dataset"] else options["datasets_list"]
+        )
+
+        if not options["datasets_list"] and not options["datasets_exclude"] and not default_all:
+            raise CommandError(
+                "Provide at least a dataset using --datasets-list, "
+                "or use the --datasets-exclude option."
+            )
+        queryset = Dataset.objects.all()
         if enable_db is not None:
-            datasets = datasets.filter(enable_db=enable_db)
-
-        if options["dataset"]:
-            names = set(options["dataset"]) - set(options["exclude"])
-            datasets = datasets.filter(name__in=names)
-            if invalid_names := names - {ds.name for ds in datasets}:
-                raise CommandError(f"Datasets not found: {', '.join(sorted(invalid_names))}")
-        elif options["exclude"]:
-            datasets = datasets.exclude(name__in=options["exclude"])
-        elif not default_all:
-            raise CommandError("Provide at least a dataset by name, or use the --exclude option.")
-
-        return datasets
+            queryset = queryset.filter(enable_db=enable_db)
+
+        datasets = {ds.name for ds in queryset}
+        if options["datasets_list"]:  # an empty list counts as "not provided"
+            datasets = set(options["datasets_list"])
+
+        if options["datasets_exclude"]:
+            datasets = datasets - set(options["datasets_exclude"])
+
+        queryset = queryset.filter(name__in=datasets)
+        if invalid_names := datasets - {ds.name for ds in queryset}:
+            raise CommandError(f"Datasets not found: {', '.join(sorted(invalid_names))}")
+
+        return queryset
diff --git a/src/schematools/contrib/django/management/commands/truncate_tables.py b/src/schematools/contrib/django/management/commands/truncate_tables.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from django.db import connection
+from django.db import ProgrammingError, connection
 
 from . import BaseDatasetCommand
 
@@ -20,4 +20,8 @@ def handle(self, *args, **options):  # noqa: D102
         with connection.cursor() as cursor:
             for db_table in sorted(db_tables):
                 self.stdout.write(f"Truncating {db_table}")
-                cursor.execute(f"TRUNCATE {db_table}")
+                try:
+                    cursor.execute(f"TRUNCATE {db_table}")
+                except ProgrammingError:
+                    # Catch missing tables, happens when views aren't generated on import_schemas
+                    self.stdout.write(f"Failed to truncate {db_table}")