From 83fc6f5eb300de03fa5367279974ffd47c7e9534 Mon Sep 17 00:00:00 2001 From: Adam Sachs Date: Wed, 22 Jan 2025 17:43:40 -0500 Subject: [PATCH] initial cut of asset model --- .../versions/021166731846_add_asset_table.py | 101 +++++ src/fides/api/db/base.py | 1 + src/fides/api/models/asset.py | 159 ++++++++ src/fides/api/models/sql_models.py | 4 + tests/conftest.py | 20 + tests/ctl/models/test_asset.py | 346 ++++++++++++++++++ 6 files changed, 631 insertions(+) create mode 100644 src/fides/api/alembic/migrations/versions/021166731846_add_asset_table.py create mode 100644 src/fides/api/models/asset.py create mode 100644 tests/ctl/models/test_asset.py diff --git a/src/fides/api/alembic/migrations/versions/021166731846_add_asset_table.py b/src/fides/api/alembic/migrations/versions/021166731846_add_asset_table.py new file mode 100644 index 0000000000..07295f26ea --- /dev/null +++ b/src/fides/api/alembic/migrations/versions/021166731846_add_asset_table.py @@ -0,0 +1,101 @@ +"""add asset table + +Revision ID: 021166731846 +Revises: 58f8edd66b69 +Create Date: 2025-01-22 22:14:35.548869 + +""" + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "021166731846" +down_revision = "58f8edd66b69" +branch_labels = None +depends_on = None + + +def upgrade(): + # stored separately to be referenced in md5 expression on index + + asset_table = op.create_table( + "asset", + sa.Column("id", sa.String(length=255), nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column( + "updated_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column("name", sa.String(), nullable=False), + sa.Column("asset_type", sa.String(), nullable=False), + sa.Column("domain", sa.String(), nullable=True), + sa.Column("parent", sa.String(), nullable=True), + sa.Column("parent_domain", sa.String(), nullable=True), + sa.Column( + "locations", + postgresql.ARRAY(sa.String()), + server_default="{}", + nullable=False, + ), + sa.Column("with_consent", sa.BOOLEAN(), nullable=False), + sa.Column( + "data_uses", + postgresql.ARRAY(sa.String()), + server_default="{}", + nullable=False, + ), + sa.Column( + "meta", + postgresql.JSONB(astext_type=sa.Text()), + server_default="{}", + nullable=False, + ), + sa.Column("path", sa.String(), nullable=True), + sa.Column("base_url", sa.String(), nullable=True), + sa.Column("system_id", sa.String(), nullable=True), + sa.ForeignKeyConstraint(["system_id"], ["ctl_systems.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index(op.f("ix_asset_asset_type"), "asset", ["asset_type"], unique=False) + op.create_index(op.f("ix_asset_domain"), "asset", ["domain"], unique=False) + op.create_index(op.f("ix_asset_id"), "asset", ["id"], unique=False) + op.create_index(op.f("ix_asset_name"), "asset", ["name"], unique=False) + op.create_index(op.f("ix_asset_system_id"), "asset", ["system_id"], unique=False) + + op.create_index( + op.f("ix_asset_name_asset_type_domain_base_url_system_id"), + "asset", + [ + "name", + "asset_type", + "domain", + sa.text("coalesce(md5(base_url), 'NULL')"), + "system_id", + ], + unique=True, + ) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f("ix_asset_system_id"), table_name="asset") + op.drop_index(op.f("ix_asset_name"), table_name="asset") + op.drop_index(op.f("ix_asset_id"), table_name="asset") + op.drop_index(op.f("ix_asset_domain"), table_name="asset") + op.drop_index(op.f("ix_asset_asset_type"), table_name="asset") + op.drop_index( + op.f("ix_asset_name_asset_type_domain_base_url_system_id"), table_name="asset" + ) + op.drop_table("asset") + # ### end Alembic commands ### diff --git a/src/fides/api/db/base.py b/src/fides/api/db/base.py index 9743c8dc29..4a7a3495a0 100644 --- a/src/fides/api/db/base.py +++ b/src/fides/api/db/base.py @@ -3,6 +3,7 @@ # imported by Alembic from fides.api.db.base_class import Base from fides.api.models.application_config import ApplicationConfig +from fides.api.models.asset import Asset from fides.api.models.audit_log import AuditLog from fides.api.models.authentication_request import AuthenticationRequest from fides.api.models.client import ClientDetail diff --git a/src/fides/api/models/asset.py b/src/fides/api/models/asset.py new file mode 100644 index 0000000000..e351875ea7 --- /dev/null +++ b/src/fides/api/models/asset.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +from typing import Any, Dict, Optional, Type + +from sqlalchemy import ( + ARRAY, + BOOLEAN, + Column, + ForeignKey, + Index, + String, + func, + insert, + select, + update, +) +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.ext.mutable import MutableDict +from sqlalchemy.orm import relationship + +from fides.api.db.base_class import Base +from fides.api.models.sql_models import System + + +class Asset(Base): + """ + Web assets associated with a system. This model will supersede `Cookies` once we have established + a migration path and backward compatibility with all `Cookies` related APIs. + """ + + # Common attributes + name = Column(String, index=True, nullable=False) + asset_type = Column(String, index=True, nullable=False) + domain = Column(String, index=True) + parent = Column(String) + parent_domain = Column(String) + locations = Column(ARRAY(String), server_default="{}", nullable=False) + with_consent = Column(BOOLEAN, default=False, nullable=False) + data_uses = Column(ARRAY(String), server_default="{}", nullable=False) + + # generic object to store additional attributes, specific to asset type + meta = Column( + MutableDict.as_mutable(JSONB), + nullable=False, + server_default="{}", + default=dict, + ) + + # Cookie-specific attributes + path = Column(String) + + # Browser request-specific attributes + base_url = Column(String) + + system_id = Column( + String, ForeignKey(System.id_field_path, ondelete="CASCADE"), index=True + ) # If system is deleted, remove the associated assets. + + system = relationship( + System, + back_populates="assets", + cascade="all,delete", + uselist=False, + lazy="selectin", + ) + + # we need to use an md5 of the base_url to avoid constraint/index length issues + # and we need to use a unique index, rather than constraint, since postgresql constraints + # do not support expressions, only direct column references + __table_args__ = ( + Index( + "ix_asset_name_asset_type_domain_base_url_system_id", + name, + asset_type, + domain, + func.coalesce(func.md5(base_url), "NULL"), + system_id, + unique=True, + ), + ) + + @classmethod + async def upsert_async( + cls: Type[Asset], + async_session: AsyncSession, + *, + data: Dict[str, Any], + ) -> Asset: + """ + Creates a new Asset record if it does not exist, otherwise updates the existing Asset record + with the attribute values provided in the `data` dict. + + Assets are looked up by the provided attributes that make up their uniqueness criteria: + - name + - asset_type + - domain + - base_url (if applicable) + - system_id. + """ + if ( + "name" not in data + or "asset_type" not in data + or "domain" not in data + or "system_id" not in data + ): + raise ValueError( + "name, asset_type, domain, and system_id are required fields on assets" + ) + + result = await async_session.execute( + select(cls).where( + cls.name == data["name"], + cls.asset_type == data["asset_type"], + cls.domain == data["domain"], + cls.system_id == data["system_id"], + cls.base_url == data.get("base_url"), + ) + ) # type: ignore[arg-type] + existing_record = result.scalars().first() + record_id: str + if existing_record: + await async_session.execute( + update(cls).where(cls.id == existing_record.id).values(data) # type: ignore[arg-type] + ) + record_id = existing_record.id + else: + result = await async_session.execute(insert(cls).values(data)) # type: ignore[arg-type] + record_id = result.inserted_primary_key.id + + result = await async_session.execute(select(cls).where(cls.id == record_id)) # type: ignore[arg-type] + return result.scalars().first() + + @classmethod + async def get_by_system_async( + cls: Type[Asset], + async_session: AsyncSession, + system_id: Optional[str] = None, + system_fides_key: Optional[str] = None, + ) -> list[Asset]: + """ + Retrieves all assets associated with a given system, + using the provided system `id` or `fides_key`, whichever is provided + """ + if system_id: + query = select(cls).where(cls.system_id == system_id) + else: + if not system_fides_key: + raise ValueError( + "Either system_id or system_fides_key must be provided" + ) + query = ( + select(cls) + .join(System, System.id == cls.system_id) + .where(System.fides_key == system_fides_key) + ) + + result = await async_session.execute(query) + return result.scalars().all() diff --git a/src/fides/api/models/sql_models.py b/src/fides/api/models/sql_models.py index ff45fd8b7a..db8add5351 100644 --- a/src/fides/api/models/sql_models.py +++ b/src/fides/api/models/sql_models.py @@ -438,6 +438,10 @@ class System(Base, FidesBase): "Cookies", back_populates="system", lazy="selectin", uselist=True, viewonly=True ) + assets = relationship( + "Asset", back_populates="system", lazy="selectin", uselist=True, viewonly=True + ) + @classmethod def get_data_uses( cls: Type[System], systems: List[System], include_parents: bool = True diff --git a/tests/conftest.py b/tests/conftest.py index 98e8f07f97..2a30eefe8d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ from fastapi import Query from fastapi.testclient import TestClient from fideslang import DEFAULT_TAXONOMY, models +from fideslang.models import System as SystemSchema from httpx import AsyncClient from loguru import logger from sqlalchemy.engine.base import Engine @@ -28,6 +29,7 @@ JWE_PAYLOAD_SYSTEMS, ) from fides.api.db.ctl_session import sync_engine +from fides.api.db.system import create_system from fides.api.main import app from fides.api.models.privacy_request import ( EXITED_EXECUTION_LOG_STATUSES, @@ -1271,6 +1273,24 @@ def system(db: Session) -> System: return system +@pytest.fixture() +@pytest.mark.asyncio +async def system_async(async_session): + """Creates a system for testing with an async session, to be used in async tests""" + resource = SystemSchema( + fides_key=str(uuid4()), + organization_fides_key="default_organization", + name="test_system_1", + system_type="test", + privacy_declarations=[], + ) + + system = await create_system( + resource, async_session, CONFIG.security.oauth_root_client_id + ) + return system + + @pytest.fixture(scope="function") def system_hidden(db: Session) -> Generator[System, None, None]: system = System.create( diff --git a/tests/ctl/models/test_asset.py b/tests/ctl/models/test_asset.py new file mode 100644 index 0000000000..70f9c50c49 --- /dev/null +++ b/tests/ctl/models/test_asset.py @@ -0,0 +1,346 @@ +from typing import Any, Dict +from uuid import uuid4 + +import pytest +from fideslang.models import System as SystemSchema +from sqlalchemy import delete, insert, select +from sqlalchemy.exc import IntegrityError + +from fides.api.db.system import create_system +from fides.api.models.asset import Asset + +""" +Unit tests for the Asset model class. + +These tests are in the `test/ctl` subdir to load async db fixtures correctly. +""" + + +@pytest.fixture(autouse=True) +async def clear_table(async_session): + """Ensure a clean table state before and after each test.""" + async with async_session.begin(): + await async_session.execute(delete(Asset)) + yield + async with async_session.begin(): + await async_session.execute(delete(Asset)) + + +@pytest.fixture() +async def javascript_asset_data(system_async) -> Dict[str, Any]: + return { + "name": "gtm.js", + "asset_type": "javascript", + "base_url": "https://www.googletagmanager.com/gtm.js", + "domain": "www.googletagmanager.com", + "system_id": system_async.id, + "locations": ["US", "Canada"], + "data_uses": ["analytics"], + } + + +@pytest.fixture() +async def cookie_asset_data(system_async) -> Dict[str, Any]: + # cookies have no `base_url` + return { + "name": "cookie1", + "asset_type": "cookie", + "domain": "www.googletagmanager.com", + "system_id": system_async.id, + "locations": ["US", "Canada"], + "data_uses": ["analytics"], + } + + +@pytest.fixture +async def system_2_async(async_session): + resource = SystemSchema( + fides_key=str(uuid4()), + organization_fides_key="default_organization", + name="test_system_2", + system_type="test", + privacy_declarations=[], + ) + + system = await create_system( + resource, + async_session, + ) + return system + + +class TestUpsertAsset: + + async def test_upsert_asset_async(self, async_session, javascript_asset_data): + """ + Tests basic upsert (create and update) function of the Asset model. + + Ensures that upsert function defines uniqueness criteria based on input data. + """ + async with async_session.begin(): + created_asset = await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + # ensure our asset was stored in the DB properly + created_asset: Asset = ( + ( + await async_session.execute( + select(Asset).where(Asset.id == created_asset.id) + ) + ) + .scalars() + .first() + ) + + assert created_asset.name == "gtm.js" + assert created_asset.asset_type == "javascript" + assert created_asset.domain == "www.googletagmanager.com" + assert created_asset.system_id == javascript_asset_data["system_id"] + assert created_asset.base_url == "https://www.googletagmanager.com/gtm.js" + assert created_asset.locations == ["US", "Canada"] + assert created_asset.data_uses == ["analytics"] + + # update a field on the asset - specifically, this field is _not_ part of uniqueness criteria + javascript_asset_data["locations"] = ["US", "Canada", "EU"] + # and update the asset + await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + # retrieve updated asset, ensure it's the same record as before, + # but with updated locations + updated_asset: Asset = ( + ( + await async_session.execute( + select(Asset).where( + Asset.id == created_asset.id + ) # same ID as created asset above + ) + ) + .scalars() + .first() + ) + + # check locations have been updated + assert updated_asset.locations == [ + "US", + "Canada", + "EU", + ] # check updated locations + + # ensure all other attributes have stayed the same + assert updated_asset.name == created_asset.name + assert updated_asset.asset_type == created_asset.asset_type + assert updated_asset.domain == created_asset.domain + assert updated_asset.system_id == created_asset.system_id + assert updated_asset.base_url == created_asset.base_url + + # now, let's change an attribute that's part of the uniqueness criteria, + # and ensure a new asset is created + javascript_asset_data["name"] = "gtm2.js" + new_asset = await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + # now, retrieve new asset + new_asset: Asset = ( + ( + await async_session.execute( + select(Asset).where(Asset.id == new_asset.id) + ) + ) + .scalars() + .first() + ) + # ensure it is a _new_ record, i.e. `id` is different + assert new_asset.id != updated_asset.id + # ensure all other attributes are as expected + assert new_asset.name == "gtm2.js" + assert new_asset.asset_type == "javascript" + assert new_asset.domain == "www.googletagmanager.com" + assert new_asset.system_id == javascript_asset_data["system_id"] + assert new_asset.base_url == "https://www.googletagmanager.com/gtm.js" + + # @pytest.mark.skip(reason="Figuring out unique index constraints") + async def test_upsert_asset_no_base_url(self, async_session, cookie_asset_data): + """ + Ensures assets can be created and updated without a `base_url` even if `base_url` is part of unique key. + + Ensures adding a base URL to an asset properly creates a new record with the upsert function. + """ + + async with async_session.begin(): + created_asset = await Asset.upsert_async( + async_session=async_session, + data=cookie_asset_data, + ) + + # ensure our asset was stored in the DB properly + created_asset: Asset = ( + ( + await async_session.execute( + select(Asset).where(Asset.id == created_asset.id) + ) + ) + .scalars() + .first() + ) + + assert created_asset.name == "cookie1" + assert created_asset.asset_type == "cookie" + assert created_asset.domain == "www.googletagmanager.com" + assert created_asset.system_id == cookie_asset_data["system_id"] + assert created_asset.base_url is None + assert created_asset.locations == ["US", "Canada"] + assert created_asset.data_uses == ["analytics"] + + # now add a base_url, and ensure a _new_ record is created + cookie_asset_data["base_url"] = "https://www.googletagmanager.com/gtm.js" + new_asset = await Asset.upsert_async( + async_session=async_session, + data=cookie_asset_data, + ) + + # retrieve new asset + new_asset: Asset = ( + ( + await async_session.execute( + select(Asset).where(Asset.id == new_asset.id) + ) + ) + .scalars() + .first() + ) + # ensure this is an entirely new record + assert new_asset.id != created_asset.id + # ensure base_url is set + assert new_asset.base_url == "https://www.googletagmanager.com/gtm.js" + # sanity check other attributes are set correctly + assert new_asset.name == "cookie1" + assert new_asset.asset_type == "cookie" + assert new_asset.domain == "www.googletagmanager.com" + assert new_asset.system_id == cookie_asset_data["system_id"] + assert new_asset.locations == ["US", "Canada"] + + async def test_upsert_asset_requires_uniqueness_attributes( + self, async_session, javascript_asset_data + ): + """ + Ensures the upsert function raises a ValueError if required uniqueness attributes are not provided. + """ + # remove a required attribute + del javascript_asset_data["domain"] + with pytest.raises(ValueError) as e: + await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + +class TestGetAssetBySystem: + @pytest.fixture + async def create_assets_for_systems( + self, async_session, javascript_asset_data, system_2_async + ): + async with async_session.begin(): + # create one asset tied to system 1 + created_asset_1 = await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + # create another asset tied to system 1 + javascript_asset_data["name"] = "gtm2.js" + created_asset_2 = await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + # create a third asset tied to system 2 + javascript_asset_data["name"] = "gtm3.js" + javascript_asset_data["system_id"] = system_2_async.id + created_asset_3 = await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + return (created_asset_1, created_asset_2, created_asset_3) + + async def test_get_asset_by_system_async( + self, async_session, create_assets_for_systems, system_async, system_2_async + ): + """ + Tests the get_by_system_async function of the Asset model. + + Ensures that the function returns the correct assets for a given system. + """ + + async with async_session.begin(): + # now, retrieve assets for system 1 by its id + assets = await Asset.get_by_system_async( + async_session=async_session, + system_id=system_async.id, + ) + + # ensure we have 2 assets for system 1 + assert len(assets) == 2 + + # ensure the assets are as expected + assert create_assets_for_systems[0] in assets + assert create_assets_for_systems[1] in assets + + # and retrieve assets for system 1 by its key + assets = await Asset.get_by_system_async( + async_session=async_session, + system_fides_key=system_async.fides_key, + ) + + # ensure we have 2 assets for system 1 + assert len(assets) == 2 + + # ensure the assets are as expected + assert create_assets_for_systems[0] in assets + assert create_assets_for_systems[1] in assets + + # now, retrieve assets for system 2 by its id + assets = await Asset.get_by_system_async( + async_session=async_session, + system_id=system_2_async.id, + ) + + # ensure we have 1 assets for system 2 + assert len(assets) == 1 + + # ensure the assets are as expected + assert create_assets_for_systems[2] in assets + + +class TestAssetDbConstraints: + async def test_asset_unique_index(self, async_session, javascript_asset_data): + """ + Ensures the db-level unique index works as expected for the Asset model. + """ + async with async_session.begin(): + created_asset = await Asset.upsert_async( + async_session=async_session, + data=javascript_asset_data, + ) + + # now try to 'manually' insert a different asset but with same uniqueness criteria + # and ensure it's rejected at the db level. + # this is not done through the upsert function, as that would attempt to update the existing record + javascript_asset_data["locations"] = ["US", "Canada", "EU"] + with pytest.raises(IntegrityError): + await async_session.execute(insert(Asset).values(javascript_asset_data)) + + async with async_session.begin(): + # if we remove the base_url, this should be valid, as it's hitting a different unique key + del javascript_asset_data["base_url"] + result = await async_session.execute( + insert(Asset).values(javascript_asset_data) + ) + assert result.inserted_primary_key.id