@@ -131,9 +131,11 @@ class ConfigureSourcesWorkflowStep(WorkflowStep):
     end_date: str
     force_reimport: bool
     force_reimport_not_chunk_partitioned: bool
+    enable_incremental_snapshots: bool
 
     def __init__(self, idt, name, type_value, state, timing, engine_size, config_files, rel_config_dir, sources,
-                 paths_builders, start_date, end_date, force_reimport, force_reimport_not_chunk_partitioned):
+                 paths_builders, start_date, end_date, force_reimport, force_reimport_not_chunk_partitioned,
+                 enable_incremental_snapshots):
         super().__init__(idt, name, type_value, state, timing, engine_size)
         self.config_files = config_files
         self.rel_config_dir = rel_config_dir
@@ -143,6 +145,7 @@ def __init__(self, idt, name, type_value, state, timing, engine_size, config_fil
         self.end_date = end_date
         self.force_reimport = force_reimport
         self.force_reimport_not_chunk_partitioned = force_reimport_not_chunk_partitioned
+        self.enable_incremental_snapshots = enable_incremental_snapshots
 
     def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: RaiConfig):
         rai.install_models(logger, rai_config, env_config, build_models(self.config_files, self.rel_config_dir))
@@ -152,6 +155,15 @@ def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: Ra
         declared_sources = {src["source"]: src for src in
                             rai.execute_relation_json(logger, rai_config, env_config,
                                                       constants.DECLARED_DATE_PARTITIONED_SOURCE_REL)}
+
+        # if `enable_incremental_snapshots` is set to True, we need to filter out snapshot sources
+        if self.enable_incremental_snapshots:
+            # get a set of snapshot source names from the `relation` field
+            snapshot_sources = {src.relation for src in self.sources if src.snapshot_validity_days and
+                                src.snapshot_validity_days >= 0}
+            logger.info(f"Snapshot sources skipped, will be reloaded incrementally: {snapshot_sources}")
+            # filter out snapshot sources from the declared sources
+            declared_sources = {k: v for k, v in declared_sources.items() if k not in snapshot_sources}
         expired_sources = self._calculate_expired_sources(logger, declared_sources)
 
         # mark declared sources for reimport
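A minimal standalone sketch of the filtering step added in the hunk above, for readers skimming the diff. The toy data and the reduced Source stand-in (only the two fields the filter reads) are illustrative, not the framework's real types:

# Illustrative sketch: `Source` is a stand-in carrying only the fields the filter uses.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Source:
    relation: str
    snapshot_validity_days: Optional[int] = None

sources = [
    Source("orders_snapshot", snapshot_validity_days=30),  # snapshot source
    Source("orders_changes"),                              # regular (non-snapshot) source
]
declared_sources = {
    "orders_snapshot": {"source": "orders_snapshot"},
    "orders_changes": {"source": "orders_changes"},
}

snapshot_sources = {src.relation for src in sources if src.snapshot_validity_days and
                    src.snapshot_validity_days >= 0}
declared_sources = {k: v for k, v in declared_sources.items() if k not in snapshot_sources}
print(declared_sources)  # only 'orders_changes' remains to be checked for expiry/reimport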
@@ -284,14 +296,16 @@ def _get_step(self, logger: logging.Logger, config: WorkflowConfig, idt, name, t
         force_reimport = config.step_params.get(constants.FORCE_REIMPORT, False)
         force_reimport_not_chunk_partitioned = config.step_params.get(constants.FORCE_REIMPORT_NOT_CHUNK_PARTITIONED,
                                                                       False)
+        enable_incremental_snapshots = config.step_params[constants.ENABLE_INCREMENTAL_SNAPSHOTS]
         paths_builders = {}
         for src in sources:
             container = src.container
             if container.name not in paths_builders:
                 paths_builders[container.name] = paths.PathsBuilderFactory.get_path_builder(container)
         return ConfigureSourcesWorkflowStep(idt, name, type_value, state, timing, engine_size, step["configFiles"],
                                             rel_config_dir, sources, paths_builders, start_date, end_date,
-                                            force_reimport, force_reimport_not_chunk_partitioned)
+                                            force_reimport, force_reimport_not_chunk_partitioned,
+                                            enable_incremental_snapshots)
 
     @staticmethod
     def _parse_sources(step: dict, env_config: EnvConfig) -> List[Source]:
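A small aside on this hunk: unlike the force-reimport flags above it, which are read with .get(..., False), the new flag is read with direct indexing, so a missing key fails loudly rather than defaulting. A toy comparison, assuming step_params is a plain dict and using hypothetical literal keys in place of the constants members:

# Toy illustration only; the real code reads constants.ENABLE_INCREMENTAL_SNAPSHOTS from config.step_params.
step_params = {"FORCE_REIMPORT": False}

print(step_params.get("ENABLE_INCREMENTAL_SNAPSHOTS", False))  # .get() -> False, silent default
try:
    step_params["ENABLE_INCREMENTAL_SNAPSHOTS"]                 # indexing -> KeyError if absent
except KeyError as missing:
    print(f"missing step parameter: {missing}")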
@@ -328,11 +342,14 @@ def _parse_sources(step: dict, env_config: EnvConfig) -> List[Source]:
 class LoadDataWorkflowStep(WorkflowStep):
     collapse_partitions_on_load: bool
     load_jointly: bool
+    enable_incremental_snapshots: bool
 
-    def __init__(self, idt, name, type_value, state, timing, engine_size, collapse_partitions_on_load, load_jointly):
+    def __init__(self, idt, name, type_value, state, timing, engine_size, collapse_partitions_on_load, load_jointly,
+                 enable_incremental_snapshots):
         super().__init__(idt, name, type_value, state, timing, engine_size)
         self.collapse_partitions_on_load = collapse_partitions_on_load
         self.load_jointly = load_jointly
+        self.enable_incremental_snapshots = enable_incremental_snapshots
 
     def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: RaiConfig):
         rai.execute_query(logger, rai_config, env_config, q.DELETE_REFRESHED_SOURCES_DATA, readonly=False)
@@ -353,6 +370,7 @@ def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: Ra
 
         self._load_simple_resources(logger, env_config, rai_config, simple_resources)
 
+        # note: async resources do not support snapshot diffing, as CDC should be incremental
         if async_resources:
             self._load_async_resources(logger, env_config, rai_config, async_resources)
 
@@ -425,28 +443,29 @@ def _get_date_part_load_query(self, logger: logging.Logger, config, src):
             resources = []
             for d in srcs:
                 resources += d["resources"]
-            return [q.load_resources(logger, config, resources, src)]
+            return [q.load_resources(logger, config, resources, src, self.enable_incremental_snapshots)]
         else:
             logger.info(f"Loading '{source_name}' one date partition at a time")
             batch = []
             for d in src["dates"]:
                 logger.info(f"Loading partition for date {d['date']}")
 
                 for res in d["resources"]:
-                    batch.append(q.load_resources(logger, config, [res], src))
+                    batch.append(
+                        q.load_resources(logger, config, [res], src, self.enable_incremental_snapshots))
             return batch
 
     def _get_simple_src_load_query(self, logger: logging.Logger, config, src):
         source_name = src["source"]
         logger.info(f"Loading source '{source_name}' not partitioned by date")
         if self.collapse_partitions_on_load:
             logger.info(f"Loading '{source_name}' all chunk partitions simultaneously")
-            return [q.load_resources(logger, config, src["resources"], src)]
+            return [q.load_resources(logger, config, src["resources"], src, self.enable_incremental_snapshots)]
         else:
             logger.info(f"Loading '{source_name}' one chunk partition at a time")
             batch = []
             for res in src["resources"]:
-                batch.append(q.load_resources(logger, config, [res], src))
+                batch.append(q.load_resources(logger, config, [res], src, self.enable_incremental_snapshots))
             return batch
 
     @staticmethod
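All four q.load_resources call sites in this class now pass the flag as a trailing positional argument. The query module's actual definition is outside this diff; a hypothetical signature consistent with these call sites would be:

# Hypothetical shape inferred from the call sites above; the real definition in the
# query module may differ. Presumably the flag switches snapshot sources to an
# incremental (diff-based) load query instead of a full reload.
def load_resources(logger, config, resources, src, enable_incremental_snapshots=False):
    ...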
@@ -470,8 +489,9 @@ def _get_step(self, logger: logging.Logger, config: WorkflowConfig, idt, name, t
                   engine_size, step: dict) -> WorkflowStep:
         collapse_partitions_on_load = config.step_params[constants.COLLAPSE_PARTITIONS_ON_LOAD]
         load_jointly = config.step_params[constants.LOAD_DATA_JOINTLY]
+        enable_incremental_snapshots = config.step_params[constants.ENABLE_INCREMENTAL_SNAPSHOTS]
         return LoadDataWorkflowStep(idt, name, type_value, state, timing, engine_size, collapse_partitions_on_load,
-                                    load_jointly)
+                                    load_jointly, enable_incremental_snapshots)
 
 
 class MaterializeWorkflowStep(WorkflowStep):