Commit f2d398c

add incremental snapshot support (#82)
* add incremental snapshot support
* disable expiration reloading for snapshots
* add snapshot catalog
* fix test
* fix test
* update oracle
* up version

1 parent 81a5609 commit f2d398c

21 files changed: +165 -41 lines changed

@@ -0,0 +1,2 @@
+device,IMEI
+IPhone8,125

@@ -0,0 +1,4 @@
+device,last_seen
+IPhone1,2022-01-05
+IPhone2,2022-01-05
+IPhone7,2022-01-01

@@ -0,0 +1,3 @@
+name
+Los Angeles
+Santa Monica

@@ -0,0 +1,6 @@
+device,last_seen
+IPhone1,2022-01-09
+IPhone2,2022-01-09
+IPhone6,2021-11-16
+IPhone7,2022-01-01
+IPhone8,2022-01-09

@@ -0,0 +1,4 @@
+name
+IPhone1
+IPhone2
+IPhone8

@@ -0,0 +1,4 @@
+name
+mobile_phone
+router
+adapter

@@ -0,0 +1,3 @@
+name
+Store1
+Store2

@@ -0,0 +1,4 @@
+name
+Andrew
+Sam
+Jordan

cli-e2e-test/rel/device_seen.rel (+9 -7)

@@ -5,13 +5,15 @@ module import_config:device_seen_snapshot
     }
 
     bound syntax:header
-end
 
-module device_seen_snapshot
-    def DEVICE_NAME[idx, row] = source_catalog:device_seen_snapshot[idx, :device, row]
-    def LAST_SEEN[idx, row] = source_catalog:device_seen_snapshot[idx, :last_seen, row]
+    @inline
+    def row_key_map[R](idx, row, key) {
+        ^Device(R[idx, :device, row], key)
+    }
 end
 
+entity type Device = String
+
 /*
  * device_seen snapshot is a snapshot of the last time a device was seen, which is
  * essentially an aggregation over the snapshot data and the current day's data
@@ -28,9 +30,9 @@ def device_last_seen[d] = max[t:
     device_seen_today(d, t)
     or
     (
-        device_seen_snapshot:DEVICE_NAME(idx, row, d) and
-        device_seen_snapshot:LAST_SEEN(idx, row, t)
-        from idx, row
+        snapshot_catalog:device_seen_snapshot:device(key, d) and
+        snapshot_catalog:device_seen_snapshot:last_seen(key, t)
+        from key
     )
 ]
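
For orientation, here is a minimal Python sketch of the aggregation that device_last_seen expresses above: for each device, take the latest of the snapshot's last_seen value and today's sighting. The dictionaries are illustrative stand-ins, not the repository's API; in the Rel model the snapshot rows are now addressed through the Device entity key produced by row_key_map rather than by (idx, row) positions.

from datetime import date

# Illustrative inputs: the previous snapshot and the current day's sightings, keyed by device.
snapshot_last_seen = {"IPhone1": date(2022, 1, 5), "IPhone7": date(2022, 1, 1)}
seen_today = {"IPhone1": date(2022, 1, 9), "IPhone8": date(2022, 1, 9)}

# device_last_seen[d] = max over both sources, mirroring max[t: ...] in the rule above.
device_last_seen = {
    d: max(t for t in (snapshot_last_seen.get(d), seen_today.get(d)) if t is not None)
    for d in snapshot_last_seen.keys() | seen_today.keys()
}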

cli-e2e-test/test_e2e.py (+14 -3)

@@ -41,15 +41,26 @@ def test_scenario1_model(self):
         self.assert_output_dir_files(self.test_scenario1_model.__name__)
 
     def test_scenario1_load_data_jointly(self):
-        # when
+        # when loading as of 20220105
         test_args = ["--batch-config", "./config/model/scenario1.json",
                      "--end-date", "20220105",
-                     "--drop-db", "--load-data-jointly"]
+                     "--drop-db", "--load-data-jointly",
+                     "--enable-incremental-snapshots"]
         rsp = call(self.cmd_with_common_arguments + test_args)
-        # then
+        # then should get the same result as other tests for scenario1
         self.assertNotEqual(rsp, 1)
         self.assert_output_dir_files(self.test_scenario1_model.__name__)
 
+        # when loading as of 20220109
+        test_args = ["--batch-config", "./config/model/scenario1.json",
+                     "--end-date", "20220109",
+                     "--load-data-jointly",
+                     "--enable-incremental-snapshots"]
+        rsp = call(self.cmd_with_common_arguments + test_args)
+        # then should get an updated snapshot
+        self.assertNotEqual(rsp, 1)
+        self.assert_output_dir_files(self.test_scenario1_load_data_jointly.__name__)
+
     def test_scenario1_model_yaml(self):
         # when
         test_args = ["--batch-config", "./config/model/scenario1.yaml",

cli/args.py (+6)

@@ -153,6 +153,12 @@ def parse() -> Namespace:
         action=BooleanOptionalAction,
         default=True
     )
+    parser.add_argument(
+        "--enable-incremental-snapshots",
+        help="Load snapshot sources by first computing a diff and applying insert:* and delete:* deltas",
+        action=BooleanOptionalAction,
+        default=False
+    )
     parser.add_argument(
         "--recover",
         help="Recover a batch run starting from a FAILED step",
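
Because the flag uses argparse's BooleanOptionalAction (Python 3.9+), a negated form is generated automatically. A minimal, self-contained sketch of the resulting behavior (not the project's parser):

from argparse import ArgumentParser, BooleanOptionalAction

parser = ArgumentParser()
parser.add_argument("--enable-incremental-snapshots", action=BooleanOptionalAction, default=False)

print(parser.parse_args([]).enable_incremental_snapshots)                                     # False (default)
print(parser.parse_args(["--enable-incremental-snapshots"]).enable_incremental_snapshots)     # True
print(parser.parse_args(["--no-enable-incremental-snapshots"]).enable_incremental_snapshots)  # False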

cli/runner.py (+2 -1)

@@ -52,7 +52,8 @@ def start(factories: dict[str, workflow.executor.WorkflowStepFactory] = MappingP
         workflow.constants.FORCE_REIMPORT: args.force_reimport,
         workflow.constants.FORCE_REIMPORT_NOT_CHUNK_PARTITIONED: args.force_reimport_not_chunk_partitioned,
         workflow.constants.COLLAPSE_PARTITIONS_ON_LOAD: args.collapse_partitions_on_load,
-        workflow.constants.LOAD_DATA_JOINTLY: args.load_data_jointly
+        workflow.constants.LOAD_DATA_JOINTLY: args.load_data_jointly,
+        workflow.constants.ENABLE_INCREMENTAL_SNAPSHOTS: args.enable_incremental_snapshots
     }
     config = workflow.executor.WorkflowConfig(env_config, workflow.common.BatchConfig(args.batch_config_name,
                                                                                       batch_config_json),

rel/source_configs/config.rel (+14)

@@ -4,6 +4,7 @@ bound date_partitioned_source_relation = String
 bound source_declares_resource = String, String, String
 bound source_has_input_format = String, String
 bound source_has_container_type = String, String
+bound snapshot_catalog
 bound source_catalog
 bound simple_source_catalog
 bound part_resource_date_pattern = String
@@ -268,6 +269,10 @@ module source
     }
 
     def resource_part_index = declares . part_resource:part_index
+
+    def has_batch_config(s, cfg) {
+        source:relname[s] = #(batch_source:relation[cfg])
+    }
 end
 
 /**
@@ -396,6 +401,15 @@ def missing_resources_json(:[], n, :is_date_partitioned, v) {
     from s
 }
 
+def missing_resources_json(:[], n, :is_snapshot, v) {
+    source:needs_resource(s) and
+    source:index[s] = n and
+    source:has_batch_config(s, cfg) and
+    batch_source:snapshot_validity_days(cfg, _) and
+    boolean_true(v)
+    from s, cfg
+}
+
 def missing_resources_json(:[], i, :resources, :[], k, :uri, n) {
     source:index[s] = i and
     resource:index[s, r] = k and
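
The new :is_snapshot field lets the Python side distinguish snapshot sources when it reads missing resources back as JSON. A hypothetical entry, inferring only the fields visible in the rules above (a real entry carries more attributes, and the URI below is purely illustrative):

missing_resources_example = [
    {
        "is_date_partitioned": False,
        # emitted only when the source's batch config defines snapshot_validity_days
        "is_snapshot": True,
        "resources": [
            {"uri": "s3://example-bucket/device_seen_snapshot/part-0.csv"},  # illustrative
        ],
    },
]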

rel/source_configs/data_reload.rel (+4 -1)

@@ -123,7 +123,10 @@ def resources_to_delete(rel) {
     rel = #(transpose[resource_to_invalidate][_])
 }
 
-def resources_data_to_delete_idx = enumerate[resources_data_to_delete]
+// the following enumerate is split in order to help the compiler handle arity overloads
+def resources_data_to_delete_idx = enumerate[r, ix: resources_data_to_delete(r, ix)]
+def resources_data_to_delete_idx = enumerate[r: resources_data_to_delete(r)]
+
 def resources_data_to_delete_json(:[], i, :relation, s) {
     resources_data_to_delete_idx(i, r, _) and
     s = relname_string[r]

rel/util/snapshot_diff.rel (+5)

@@ -0,0 +1,5 @@
+@outline @nomaintain
+module snapshot_diff[{old}, {new}]
+    def insertions(x...) = new(x...) and not old(x...)
+    def deletions(x...) = old(x...) and not new(x...)
+end
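
The module above is the core of the incremental load: given the previously loaded snapshot and the newly read one, only the difference is applied. A minimal Python sketch of the same semantics over illustrative rows (plain tuples standing in for arbitrary-arity Rel tuples; this is not the repository's API):

old_snapshot = {("IPhone1", "2022-01-05"), ("IPhone2", "2022-01-05"), ("IPhone7", "2022-01-01")}
new_snapshot = {("IPhone1", "2022-01-09"), ("IPhone2", "2022-01-09"),
                ("IPhone7", "2022-01-01"), ("IPhone8", "2022-01-09")}

insertions = new_snapshot - old_snapshot   # rows to apply as insert:* deltas
deletions = old_snapshot - new_snapshot    # rows to apply as delete:* deltas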

test/test_cfg_src_step.py (+2 -1)

@@ -581,7 +581,8 @@ def _create_cfg_sources_step(sources: List[Source], paths_builders: dict[str, pa
         start_date=start_date,
         end_date=end_date,
         force_reimport=False,
-        force_reimport_not_chunk_partitioned=False
+        force_reimport_not_chunk_partitioned=False,
+        enable_incremental_snapshots=False
     )

test/test_cfg_src_step_factory.py (+2 -1)

@@ -77,7 +77,8 @@ def _create_wf_cfg(env_config: EnvConfig, batch_config: BatchConfig) -> Workflow
         constants.FORCE_REIMPORT: False,
         constants.FORCE_REIMPORT_NOT_CHUNK_PARTITIONED: False,
         constants.COLLAPSE_PARTITIONS_ON_LOAD: False,
-        constants.LOAD_DATA_JOINTLY: False
+        constants.LOAD_DATA_JOINTLY: False,
+        constants.ENABLE_INCREMENTAL_SNAPSHOTS: False,
     }
     return WorkflowConfig(
         env=env_config,

workflow/__init__.py (+1 -1)

@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version_info__ = (0, 0, 49)
+__version_info__ = (0, 0, 50)
 __version__ = ".".join(map(str, __version_info__))

workflow/constants.py (+2)

@@ -11,6 +11,7 @@
     "batch_config/workflow/steps/load_data.rel",
     "batch_config/workflow/steps/materialize.rel",
     "batch_config/workflow/steps/execute_command.rel",
+    "util/snapshot_diff.rel",
 ]
 
 COMMON_MODEL_RELATIVE_PATH = "/../rel"
@@ -81,6 +82,7 @@
 FORCE_REIMPORT_NOT_CHUNK_PARTITIONED = "force_reimport_not_chunk_partitioned"
 COLLAPSE_PARTITIONS_ON_LOAD = "collapse_partitions_on_load"
 LOAD_DATA_JOINTLY = "load_data_jointly"
+ENABLE_INCREMENTAL_SNAPSHOTS = "enable_incremental_snapshots"
 
 # Snowflake constants
workflow/executor.py

+28-8
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,11 @@ class ConfigureSourcesWorkflowStep(WorkflowStep):
131131
end_date: str
132132
force_reimport: bool
133133
force_reimport_not_chunk_partitioned: bool
134+
enable_incremental_snapshots: bool
134135

135136
def __init__(self, idt, name, type_value, state, timing, engine_size, config_files, rel_config_dir, sources,
136-
paths_builders, start_date, end_date, force_reimport, force_reimport_not_chunk_partitioned):
137+
paths_builders, start_date, end_date, force_reimport, force_reimport_not_chunk_partitioned,
138+
enable_incremental_snapshots):
137139
super().__init__(idt, name, type_value, state, timing, engine_size)
138140
self.config_files = config_files
139141
self.rel_config_dir = rel_config_dir
@@ -143,6 +145,7 @@ def __init__(self, idt, name, type_value, state, timing, engine_size, config_fil
143145
self.end_date = end_date
144146
self.force_reimport = force_reimport
145147
self.force_reimport_not_chunk_partitioned = force_reimport_not_chunk_partitioned
148+
self.enable_incremental_snapshots = enable_incremental_snapshots
146149

147150
def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: RaiConfig):
148151
rai.install_models(logger, rai_config, env_config, build_models(self.config_files, self.rel_config_dir))
@@ -152,6 +155,15 @@ def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: Ra
152155
declared_sources = {src["source"]: src for src in
153156
rai.execute_relation_json(logger, rai_config, env_config,
154157
constants.DECLARED_DATE_PARTITIONED_SOURCE_REL)}
158+
159+
# if `enable_incremental_snapshots` is set to True, we need to filter out snapshot sources
160+
if self.enable_incremental_snapshots:
161+
# get a set of snapshot source names from the `relation` field
162+
snapshot_sources = {src.relation for src in self.sources if src.snapshot_validity_days and
163+
src.snapshot_validity_days >= 0}
164+
logger.info(f"Snapshot sources skipped, will be reloaded incrementally: {snapshot_sources}")
165+
# filter out snapshot sources from the declared sources
166+
declared_sources = {k: v for k, v in declared_sources.items() if k not in snapshot_sources}
155167
expired_sources = self._calculate_expired_sources(logger, declared_sources)
156168

157169
# mark declared sources for reimport
@@ -284,14 +296,16 @@ def _get_step(self, logger: logging.Logger, config: WorkflowConfig, idt, name, t
284296
force_reimport = config.step_params.get(constants.FORCE_REIMPORT, False)
285297
force_reimport_not_chunk_partitioned = config.step_params.get(constants.FORCE_REIMPORT_NOT_CHUNK_PARTITIONED,
286298
False)
299+
enable_incremental_snapshots = config.step_params[constants.ENABLE_INCREMENTAL_SNAPSHOTS]
287300
paths_builders = {}
288301
for src in sources:
289302
container = src.container
290303
if container.name not in paths_builders:
291304
paths_builders[container.name] = paths.PathsBuilderFactory.get_path_builder(container)
292305
return ConfigureSourcesWorkflowStep(idt, name, type_value, state, timing, engine_size, step["configFiles"],
293306
rel_config_dir, sources, paths_builders, start_date, end_date,
294-
force_reimport, force_reimport_not_chunk_partitioned)
307+
force_reimport, force_reimport_not_chunk_partitioned,
308+
enable_incremental_snapshots)
295309

296310
@staticmethod
297311
def _parse_sources(step: dict, env_config: EnvConfig) -> List[Source]:
@@ -328,11 +342,14 @@ def _parse_sources(step: dict, env_config: EnvConfig) -> List[Source]:
328342
class LoadDataWorkflowStep(WorkflowStep):
329343
collapse_partitions_on_load: bool
330344
load_jointly: bool
345+
enable_incremental_snapshots: bool
331346

332-
def __init__(self, idt, name, type_value, state, timing, engine_size, collapse_partitions_on_load, load_jointly):
347+
def __init__(self, idt, name, type_value, state, timing, engine_size, collapse_partitions_on_load, load_jointly,
348+
enable_incremental_snapshots):
333349
super().__init__(idt, name, type_value, state, timing, engine_size)
334350
self.collapse_partitions_on_load = collapse_partitions_on_load
335351
self.load_jointly = load_jointly
352+
self.enable_incremental_snapshots = enable_incremental_snapshots
336353

337354
def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: RaiConfig):
338355
rai.execute_query(logger, rai_config, env_config, q.DELETE_REFRESHED_SOURCES_DATA, readonly=False)
@@ -353,6 +370,7 @@ def _execute(self, logger: logging.Logger, env_config: EnvConfig, rai_config: Ra
353370

354371
self._load_simple_resources(logger, env_config, rai_config, simple_resources)
355372

373+
# note: async resources do not support snapshot diffing, as CDC should be incremental
356374
if async_resources:
357375
self._load_async_resources(logger, env_config, rai_config, async_resources)
358376

@@ -425,28 +443,29 @@ def _get_date_part_load_query(self, logger: logging.Logger, config, src):
425443
resources = []
426444
for d in srcs:
427445
resources += d["resources"]
428-
return [q.load_resources(logger, config, resources, src)]
446+
return [q.load_resources(logger, config, resources, src, self.enable_incremental_snapshots)]
429447
else:
430448
logger.info(f"Loading '{source_name}' one date partition at a time")
431449
batch = []
432450
for d in src["dates"]:
433451
logger.info(f"Loading partition for date {d['date']}")
434452

435453
for res in d["resources"]:
436-
batch.append(q.load_resources(logger, config, [res], src))
454+
batch.append(
455+
q.load_resources(logger, config, [res], src, self.enable_incremental_snapshots))
437456
return batch
438457

439458
def _get_simple_src_load_query(self, logger: logging.Logger, config, src):
440459
source_name = src["source"]
441460
logger.info(f"Loading source '{source_name}' not partitioned by date")
442461
if self.collapse_partitions_on_load:
443462
logger.info(f"Loading '{source_name}' all chunk partitions simultaneously")
444-
return [q.load_resources(logger, config, src["resources"], src)]
463+
return [q.load_resources(logger, config, src["resources"], src, self.enable_incremental_snapshots)]
445464
else:
446465
logger.info(f"Loading '{source_name}' one chunk partition at a time")
447466
batch = []
448467
for res in src["resources"]:
449-
batch.append(q.load_resources(logger, config, [res], src))
468+
batch.append(q.load_resources(logger, config, [res], src, self.enable_incremental_snapshots))
450469
return batch
451470

452471
@staticmethod
@@ -470,8 +489,9 @@ def _get_step(self, logger: logging.Logger, config: WorkflowConfig, idt, name, t
470489
engine_size, step: dict) -> WorkflowStep:
471490
collapse_partitions_on_load = config.step_params[constants.COLLAPSE_PARTITIONS_ON_LOAD]
472491
load_jointly = config.step_params[constants.LOAD_DATA_JOINTLY]
492+
enable_incremental_snapshots = config.step_params[constants.ENABLE_INCREMENTAL_SNAPSHOTS]
473493
return LoadDataWorkflowStep(idt, name, type_value, state, timing, engine_size, collapse_partitions_on_load,
474-
load_jointly)
494+
load_jointly, enable_incremental_snapshots)
475495

476496

477497
class MaterializeWorkflowStep(WorkflowStep):
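
Taken together, the executor changes wire the flag through both steps: ConfigureSourcesWorkflowStep stops treating snapshot sources as expired (so they are no longer reimported wholesale), and LoadDataWorkflowStep forwards the flag to every load_resources call so snapshot sources can be loaded as diffs. A condensed, simplified sketch of that flow (function names and bodies here are illustrative, not verbatim from the module above):

def filter_declared_sources(declared_sources, sources, enable_incremental_snapshots):
    # With the flag on, snapshot sources skip expiration-driven reimport;
    # they are refreshed incrementally at load time instead.
    if not enable_incremental_snapshots:
        return declared_sources
    snapshot_sources = {s.relation for s in sources
                        if s.snapshot_validity_days and s.snapshot_validity_days >= 0}
    return {name: src for name, src in declared_sources.items() if name not in snapshot_sources}

def build_load_queries(q, logger, config, src, enable_incremental_snapshots):
    # Every generated load query now carries the flag, so query generation can emit
    # snapshot-diff (insert:*/delete:*) loads instead of full reloads for snapshot sources.
    return [q.load_resources(logger, config, src["resources"], src, enable_incremental_snapshots)]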
