diff --git a/csd3-side/scripts/lsst-backup.py b/csd3-side/scripts/lsst-backup.py
index 1aa749a..4be6f7e 100644
--- a/csd3-side/scripts/lsst-backup.py
+++ b/csd3-side/scripts/lsst-backup.py
@@ -1198,17 +1198,17 @@ def process_files(s3, bucket_name, api, current_objects, exclude, local_dir, des
                 del zip_batch_files, zip_batch_object_names, zip_batch_sizes
             else:
-                to_collate = dd.read_csv(collate_list_file)
-                p = to_collate.npartitions
-                to_collate.object_names = dd.from_pandas(to_collate.object_names.compute().apply(literal_eval),npartitions=p)
-                to_collate.file_paths = dd.from_pandas(to_collate.file_paths.compute().apply(literal_eval),npartitions=p)
+                npart = len(client.scheduler_info()['workers'])*10
+                to_collate = dd.from_pandas(pd.read_csv(collate_list_file).drop(columns=['upload']), npartitions=npart)
+                to_collate.object_names = dd.from_pandas(to_collate.object_names.compute().apply(literal_eval),npartitions=npart)
+                to_collate.file_paths = dd.from_pandas(to_collate.file_paths.compute().apply(literal_eval),npartitions=npart)
                 print(f'Loaded collate list from {collate_list_file}.', flush=True)
             if not current_objects.empty:
                 # now using pandas for both current_objects and to_collate - this could be rewritten to use vectorised operations
                 # client.scatter([current_objects,to_collate])
                 # to_collate = dd.from_pandas(to_collate, npartitions=len(client.scheduler_info()['workers'])*10)
                 print('Created Dask dataframe for to_collate.', flush=True)
-                to_collate = to_collate.drop(columns=['upload'])
+                # to_collate = to_collate.drop(columns=['upload'])
                 print(to_collate.index)
                 print(to_collate.columns)
                 print(to_collate.dtypes)
@@ -1217,6 +1217,7 @@ def process_files(s3, bucket_name, api, current_objects, exclude, local_dir, des
                 print('Comparison setup.', flush=True)
                 to_collate = to_collate.compute()
                 print('Comparison complete.', flush=True)
+                print(to_collate)
                 exit()
         if save_collate_file:
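
Note: below is a minimal, self-contained sketch of the loading pattern the new lines rely on — read the collate list eagerly with pandas, drop the bookkeeping 'upload' column, parse the stringified list columns with literal_eval, and distribute the frame at 10 partitions per Dask worker. The file name 'collate_list.csv' and the standalone Client() are illustrative assumptions; the diff instead round-trips each list column through compute() after conversion, but the end state is roughly equivalent.

    from ast import literal_eval

    import dask.dataframe as dd
    import pandas as pd
    from dask.distributed import Client

    client = Client()  # stand-in local cluster; the script reuses its existing client

    # 10 partitions per worker: enough parallelism without creating tiny tasks
    npart = len(client.scheduler_info()['workers']) * 10

    # Eager pandas read, then a single conversion to Dask (illustrative file name)
    df = pd.read_csv('collate_list.csv').drop(columns=['upload'])

    # The CSV stores these columns as stringified Python lists; rebuild them as
    # real lists while the frame is still in pandas, before distributing it
    for col in ('object_names', 'file_paths'):
        df[col] = df[col].apply(literal_eval)

    to_collate = dd.from_pandas(df, npartitions=npart)

Doing the literal_eval step in pandas before dd.from_pandas avoids the per-column compute()/from_pandas round trips in the diff, and sizing npartitions from the live worker count (rather than whatever dd.read_csv infers from block size) keeps partition count proportional to available parallelism.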