-
Notifications
You must be signed in to change notification settings - Fork 213
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add DAG for filtering archived providers in catalog (#3259)
* Add deleted media tables * Set up new columns * Add dag for deleting records * Add tests * Update dag docs * Remove unnecessary retries * Pull RETURN_ROW_COUNT out into utility * Clean up comments * Update dag docs * Simplify table creation
- Loading branch information
Showing
18 changed files
with
617 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from datetime import datetime, timedelta | ||
|
||
|
||
# Identifier of the DAG, also passed to Slack as the originating dag_id.
DAG_ID = "delete_records"
# Display name and emoji used when the DAG posts to Slack.
SLACK_USERNAME = "Upstream Delete Records"
SLACK_ICON = ":database:"
START_DATE = datetime(2023, 10, 25)
# Upper bound for a whole DAG run: three 31-day periods.
DAGRUN_TIMEOUT = timedelta(days=31 * 3)
# Per-task execution timeouts: copying rows into the deleted-media table,
# and deleting the rows from the media table, respectively.
CREATE_TIMEOUT = timedelta(hours=6)
DELETE_TIMEOUT = timedelta(hours=1)

# SQL templates, filled in with `str.format`. `select_query` is expected to
# be a complete `WHERE ...` clause and is inserted verbatim.
CREATE_RECORDS_QUERY = """
INSERT INTO {destination_table} ({destination_cols})
SELECT {source_cols}
FROM {source_table}
{select_query}
"""
DELETE_RECORDS_QUERY = """
DELETE FROM {table}
{select_query}
"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
import logging | ||
from datetime import timedelta | ||
|
||
from airflow.decorators import task | ||
from airflow.models.abstractoperator import AbstractOperator | ||
|
||
from common import slack | ||
from common.constants import POSTGRES_CONN_ID | ||
from common.sql import RETURN_ROW_COUNT, PostgresHook | ||
from common.storage.columns import DELETED_ON, Column | ||
from common.storage.db_columns import ( | ||
setup_db_columns_for_media_type, | ||
setup_deleted_db_columns_for_media_type, | ||
) | ||
from database.delete_records import constants | ||
|
||
|
||
logger = logging.getLogger(__name__)


def run_sql(
    sql_template: str,
    postgres_conn_id: str = POSTGRES_CONN_ID,
    task: AbstractOperator = None,
    timeout: timedelta = None,
    handler: callable = RETURN_ROW_COUNT,
    **kwargs,
):
    """
    Format `sql_template` with `kwargs` and run the result against Postgres.

    :param sql_template: SQL text containing `str.format` placeholders.
    :param postgres_conn_id: connection id handed to the `PostgresHook`.
    :param task: the running task; only consulted to derive a statement
        timeout when `timeout` is not given.
    :param timeout: explicit statement timeout. When falsy, the value of
        `PostgresHook.get_execution_timeout(task)` is used instead.
    :param handler: passed through to `PostgresHook.run` to post-process the
        cursor; defaults to `RETURN_ROW_COUNT`.
    :param kwargs: values substituted into `sql_template`. They are inserted
        verbatim, so callers are responsible for any quoting.
    :return: whatever `PostgresHook.run` returns for the given handler.
    """
    query = sql_template.format(**kwargs)

    # An explicit timeout wins; otherwise fall back to the task's timeout.
    statement_timeout = timeout or PostgresHook.get_execution_timeout(task)
    postgres = PostgresHook(
        postgres_conn_id=postgres_conn_id,
        default_statement_timeout=statement_timeout,
    )

    return postgres.run(query, handler=handler)
|
||
|
||
@task
@setup_deleted_db_columns_for_media_type
@setup_db_columns_for_media_type
def create_deleted_records(
    *,
    select_query: str,
    deleted_reason: str,
    media_type: str,
    db_columns: list[Column] = None,
    deleted_db_columns: list[Column] = None,
    task: AbstractOperator = None,
    postgres_conn_id: str = POSTGRES_CONN_ID,
):
    """
    Select records from the given media table using the select query, and then for each
    record create a corresponding record in the Deleted Media table.

    :param select_query: `WHERE ...` clause selecting the rows to copy; it is
        inserted into the statement verbatim.
    :param deleted_reason: free-text reason stored with every copied row.
    :param media_type: name of the source media table; the destination table
        is `deleted_<media_type>`.
    :param db_columns: columns of the media table (presumably supplied by the
        `setup_db_columns_for_media_type` decorator -- confirm).
    :param deleted_db_columns: columns of the deleted-media table (presumably
        supplied by `setup_deleted_db_columns_for_media_type` -- confirm).
    :param task: the running task, forwarded to `run_sql` for its timeout.
    :param postgres_conn_id: connection id forwarded to `run_sql`.
    :return: the result of `run_sql` (by default handled by `RETURN_ROW_COUNT`).
    """
    destination_cols = ", ".join(col.db_name for col in deleted_db_columns)

    # To build the source columns, we first list all columns in the main media table
    source_cols = ", ".join(col.db_name for col in db_columns)
    # Then add the deleted-media specific columns.
    # NOTE(review): this assumes `deleted_db_columns` is `db_columns` followed by
    # `deleted_on` and `deleted_reason`, in that order -- confirm.
    # `deleted_on` is set to its insert value to get the current timestamp:
    source_cols += f", {DELETED_ON.get_insert_value()}"
    # `deleted_reason` is set to the given string. It is embedded as a SQL string
    # literal, so single quotes are doubled to keep a reason such as
    # "provider's request" from terminating the literal early.
    escaped_reason = deleted_reason.replace("'", "''")
    source_cols += f", '{escaped_reason}'"

    return run_sql(
        sql_template=constants.CREATE_RECORDS_QUERY,
        postgres_conn_id=postgres_conn_id,
        task=task,
        destination_table=f"deleted_{media_type}",
        destination_cols=destination_cols,
        source_table=media_type,
        source_cols=source_cols,
        select_query=select_query,
    )
|
||
|
||
@task
def delete_records_from_media_table(
    table: str, select_query: str, postgres_conn_id: str = POSTGRES_CONN_ID
):
    """
    Delete records matching the select_query from the given media table.

    :param table: name of the media table to delete from.
    :param select_query: `WHERE ...` clause selecting the rows to delete; it
        is inserted into the statement verbatim.
    :param postgres_conn_id: connection id forwarded to `run_sql`.
    :return: the result of `run_sql` (by default handled by `RETURN_ROW_COUNT`).
    """
    # NOTE(review): no `task` is forwarded here, so `run_sql` derives its
    # statement timeout from `PostgresHook.get_execution_timeout(None)` --
    # confirm that this is intended.
    return run_sql(
        sql_template=constants.DELETE_RECORDS_QUERY,
        # Forward the connection id so a caller-supplied value is honoured
        # instead of silently falling back to the default.
        postgres_conn_id=postgres_conn_id,
        table=table,
        select_query=select_query,
    )
|
||
|
||
@task
def notify_slack(text: str) -> str:
    """
    Send a message to Slack.

    The message is sent with this DAG's configured username, icon and dag id.

    :param text: the message body.
    :return: `text`, unchanged.
    """
    slack.send_message(
        text,
        username=constants.SLACK_USERNAME,
        icon_emoji=constants.SLACK_ICON,
        dag_id=constants.DAG_ID,
    )

    return text
114 changes: 114 additions & 0 deletions
114
catalog/dags/database/delete_records/delete_records_dag.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
"""
# Delete Records DAG

This DAG is used to delete records from the Catalog media tables, after creating a
corresponding record in the associated `deleted_<media_type>` table for each record
to be deleted. It is important to note that records deleted by this DAG will still be
available in the API until the next data refresh runs.

Required Dagrun Configuration parameters:

* table_name: the name of the table to delete from. Must be a valid media table
* select_query: a SQL `WHERE` clause used to select the rows that will be deleted
* reason: a string explaining the reason for deleting the records. Ex ('deadlink')

An example dag_run configuration used to delete all records for the "foo" image provider
due to deadlinks would look like this:

```
{
    "table_name": "image",
    "select_query": "WHERE provider='foo'",
    "reason": "deadlink"
}
```

## Warnings

Presently, there is no logic to prevent records that have an entry in a Deleted Media
table from simply being reingested during provider ingestion. Therefore in its current
state, the DAG should _only_ be used to delete records that we can guarantee will not
be reingested (for example, because the provider is archived).

This DAG does not have automated handling for deadlocks, so you must be certain that
records selected for deletion in this DAG are not also being written to by a provider
DAG, for instance. The simplest way to do this is to ensure that any affected provider
DAGs are not currently running.
"""
|
||
|
||
import logging | ||
|
||
from airflow.decorators import dag | ||
from airflow.models.param import Param | ||
|
||
from common.constants import AUDIO, DAG_DEFAULT_ARGS, MEDIA_TYPES | ||
from database.delete_records import constants | ||
from database.delete_records.delete_records import ( | ||
create_deleted_records, | ||
delete_records_from_media_table, | ||
notify_slack, | ||
) | ||
|
||
|
||
logger = logging.getLogger(__name__)


@dag(
    dag_id=constants.DAG_ID,
    schedule=None,
    start_date=constants.START_DATE,
    tags=["database"],
    dagrun_timeout=constants.DAGRUN_TIMEOUT,
    doc_md=__doc__,
    # Retries are disabled for every task in this DAG.
    default_args={**DAG_DEFAULT_ARGS, "retries": 0},
    render_template_as_native_obj=True,
    params={
        "table_name": Param(
            default=AUDIO,
            enum=MEDIA_TYPES,
            description="The name of the media table from which to select records.",
        ),
        "select_query": Param(
            default="WHERE...",
            type="string",
            description=(
                "The `WHERE` clause of a query that selects all the rows to"
                " be deleted."
            ),
            pattern="^WHERE",
        ),
        "reason": Param(
            default="",
            type="string",
            description="Short descriptor of the reason for deleting the records.",
        ),
    },
)
def delete_records():
    """
    Define the task flow of the DAG.

    The selected rows are first copied into the Deleted Media table, then
    deleted from the media table, and finally a Slack message reports the
    result of the delete task.
    """
    # Create the records in the Deleted Media table
    insert_into_deleted_media_table = create_deleted_records.override(
        task_id="update_deleted_media_table", execution_timeout=constants.CREATE_TIMEOUT
    )(
        select_query="{{ params.select_query }}",
        deleted_reason="{{ params.reason }}",
        media_type="{{ params.table_name }}",
    )

    # If successful, delete the records from the media table
    delete_from_media_table = delete_records_from_media_table.override(
        execution_timeout=constants.DELETE_TIMEOUT
    )(table="{{ params.table_name }}", select_query="{{ params.select_query }}")

    # Only the first literal is an f-string: it embeds the delete task's
    # result reference, while the `{{ ... }}` in the second literal must
    # reach Airflow untouched so they are rendered as templates at run time.
    notify_complete = notify_slack(
        text=(
            f"Deleted {delete_from_media_table} records from the"
            " {{ params.table_name }} table matching query: `{{ params.select_query }}`"
        ),
    )

    insert_into_deleted_media_table >> delete_from_media_table >> notify_complete


delete_records()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.