"Bug fixed" make_dataset supports dicts as input and doesn't do unnecessary save-load #65

Merged: 2 commits, Jun 26, 2023
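For orientation, a minimal sketch of the new call pattern. This is not part of the diff: the demo config path, the Parser import, and the CONFIGURATION_1_images attribute come from the test added below; treat the rest as illustrative.

import deeplenstronomy.deeplenstronomy as dl
from deeplenstronomy.input_reader import Parser

# make_dataset now accepts a pre-parsed config dict in place of a yaml filename
config_dict = Parser('../Notebooks/data/demo.yaml', survey=None).config_dict

# random_seed makes the run reproducible when the config has no SEED entry
dataset = dl.make_dataset(config_dict, random_seed=42)
images = dataset.CONFIGURATION_1_images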
90 changes: 59 additions & 31 deletions deeplenstronomy/deeplenstronomy.py
@@ -27,7 +27,7 @@ def __init__(self, config=None, save=False, store=True):
"""

if config:
make_dataset(config, dataset=self, save=save, store=store)
make_dataset(config, dataset=self, save_to_disk=save, store_in_memory=store)
return

def update_param(self, new_param_dict, configuration):
@@ -267,29 +267,29 @@ def _format_time(elapsed_time):
def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,
verbose=False, store_sample=False, image_file_format='npy',
survey=None, return_planes=False, skip_image_generation=False,
solve_lens_equation=False):
solve_lens_equation=False, random_seed=None):
"""
Generate a dataset from a config file.

Args:
config (str or dict): name of a yaml file specifying the dataset characteristics, or a pre-parsed yaml config as a dictionary
verbose (bool, optional, default=False): print progress and status updates at runtime
store_in_memory (bool, optional, default=True): save images and metadata as attributes
save_to_disk (bool, optional, default=False): save images and metadata to disk
store_sample (bool, optional, default=False): save five images and metadata as attributes
image_file_format (str, optional, default='npy'): outfile format type, options include ('npy', 'h5')
survey (str or None, optional, default=None): a default astronomical survey to use
return_planes (bool, optional, default=False): return the lens, source, noise, and point source planes of the simulated images
skip_image_generation (bool, optional, default=False): skip image generation
solve_lens_equation (bool, optional, default=False): calculate the source positions

random_seed (int, optional, default=None): seed for random number generation, used when the config does not specify DATASET.PARAMETERS.SEED
Returns:
dataset (Dataset): an instance of the Dataset class

Raises:
RuntimeError: If `skip_image_generation == True` and `solve_lens_equation == True`
RuntimeError: If `survey` is not a valid survey name

"""

if solve_lens_equation and skip_image_generation:
@@ -305,6 +305,21 @@ def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,

if isinstance(config, dict):
dataset.config_dict = config

# Paths and configurations of 'BACKGROUNDS' images (copied from Parser._get_image_locations)
image_paths = []
image_configurations = []
if "BACKGROUNDS" in config.keys():
image_paths.append(config['BACKGROUNDS']['PATH'])
# NOTE: this should probably also be an append; it is worth testing how Parser
# behaves when there are several background files
image_configurations = config['BACKGROUNDS']['CONFIGURATIONS'][:]

# Paths of 'DISTRIBUTIONS' files (copied from Parser._get_image_locations)
file_paths = []
if "DISTRIBUTIONS" in config.keys():
for k in config['DISTRIBUTIONS'].keys():
file_paths.append('DISTRIBUTIONS.' + k)
else:
# Store config file
dataset.config_file = config
@@ -315,18 +330,28 @@ def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,
parser = Parser(config, survey=survey)
dataset.config_dict = parser.config_dict

image_paths = parser.image_paths
if len(parser.image_paths) > 0:
image_configurations = parser.image_configurations
file_paths = parser.file_paths

# store parser
dataset.parser = parser
#dataset.parser = parser

# Store top-level dataset info
dataset.name = dataset.config_dict['DATASET']['NAME']
dataset.size = dataset.config_dict['DATASET']['PARAMETERS']['SIZE']
dataset.outdir = dataset.config_dict['DATASET']['PARAMETERS']['OUTDIR']
dataset.bands = dataset.config_dict['SURVEY']['PARAMETERS']['BANDS'].split(',')
# Check that the key 'SEED' exists and is convertible to a 32-bit unsigned integer
# (try/except blocks were already slow in C++, let alone Python)
try:
dataset.seed = int(dataset.config_dict['DATASET']['PARAMETERS']["SEED"])
except KeyError:
dataset.seed = random.randint(0, 100)
if random_seed is not None:
dataset.seed = random_seed
else:
dataset.seed = random.randint(0, 100)
np.random.seed(dataset.seed)
random.seed(dataset.seed)
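Aside: the seed resolution added above, restated as a standalone sketch. The resolve_seed helper is hypothetical (it does not exist in the codebase); it only mirrors the precedence implemented in this hunk: an explicit SEED in the config wins, then the random_seed argument, then a random fallback.

import random

def resolve_seed(config_dict, random_seed=None):
    # An explicit DATASET.PARAMETERS.SEED in the config takes precedence
    try:
        return int(config_dict['DATASET']['PARAMETERS']['SEED'])
    except KeyError:
        # otherwise use the caller-supplied seed, then a random fallback
        if random_seed is not None:
            return random_seed
        return random.randint(0, 100)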

@@ -339,33 +364,33 @@ def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,
dataset.configurations = list(dataset.config_dict['GEOMETRY'].keys())

# Handle image backgrounds if they exist
if len(parser.image_paths) > 0:
im_dir = parser.config_dict['BACKGROUNDS']["PATH"]
image_backgrounds = read_images(im_dir, parser.config_dict['IMAGE']['PARAMETERS']['numPix'], dataset.bands)
if len(image_paths) > 0:
im_dir = dataset.config_dict['BACKGROUNDS']["PATH"]
image_backgrounds = read_images(im_dir, dataset.config_dict['IMAGE']['PARAMETERS']['numPix'], dataset.bands)
else:
image_backgrounds = np.zeros((len(dataset.bands), parser.config_dict['IMAGE']['PARAMETERS']['numPix'], parser.config_dict['IMAGE']['PARAMETERS']['numPix']))[np.newaxis,:]
image_backgrounds = np.zeros((len(dataset.bands), dataset.config_dict['IMAGE']['PARAMETERS']['numPix'], dataset.config_dict['IMAGE']['PARAMETERS']['numPix']))[np.newaxis,:]

# If user-specified distributions exist, draw from them
forced_inputs = {}
max_size = dataset.size * 100 # maximum 100 epochs if timeseries

for fp in parser.file_paths:
filename = eval("parser.config_dict['" + fp.replace('.', "']['") + "']" + "['FILENAME']")
mode = eval("parser.config_dict['" + fp.replace('.', "']['") + "']" + "['MODE']")
for fp in file_paths:
filename = eval("dataset.config_dict['" + fp.replace('.', "']['") + "']" + "['FILENAME']")
mode = eval("dataset.config_dict['" + fp.replace('.', "']['") + "']" + "['MODE']")
try:
step = eval("parser.config_dict['" + fp.replace('.', "']['") + "']" + "['STEP']")
step = eval("dataset.config_dict['" + fp.replace('.', "']['") + "']" + "['STEP']")
except KeyError:
step = 10
try:
params = eval("parser.config_dict['"+fp.replace('.',"']['")+"']"+"['PARAMS']")
params = eval("dataset.config_dict['"+fp.replace('.',"']['")+"']"+"['PARAMS']")
except KeyError:
params = None
draw_param_names, draw_param_values = draw_from_user_dist(filename, max_size, mode, step, params=params)
forced_inputs[fp] = {'names': draw_param_names, 'values': draw_param_values}
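For readers puzzled by the eval() calls above: fp is a dotted path like 'DISTRIBUTIONS.MYDIST' (key name illustrative), and the string surgery builds a nested dict lookup such as dataset.config_dict['DISTRIBUTIONS']['MYDIST']['FILENAME']. A hypothetical eval-free equivalent (nested_get is not in the codebase):

def nested_get(config_dict, dotted_path, key):
    # walk a dotted path like 'DISTRIBUTIONS.MYDIST' down the nested config dict
    node = config_dict
    for part in dotted_path.split('.'):
        node = node[part]
    return node[key]

# equivalent to the eval() above:
# filename = nested_get(dataset.config_dict, fp, 'FILENAME')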
# If we want to iterate through map.txt, add the parameters to the forced inputs
if len(parser.image_paths) > 0 and "ITERATE" in parser.config_dict['BACKGROUNDS']:
if len(image_paths) > 0 and "ITERATE" in dataset.config_dict['BACKGROUNDS']:
background_iterate = True
im_dir = parser.config_dict['BACKGROUNDS']["PATH"]
im_dir = dataset.config_dict['BACKGROUNDS']["PATH"]
draw_param_names, draw_param_values = treat_map_like_user_dist(im_dir, max_size)
forced_inputs[im_dir + '/map.txt'] = {'names': draw_param_names, 'values': draw_param_values}
else:
@@ -397,18 +422,21 @@ def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,
# Initialize the ImageGenerator
ImGen = ImageGenerator(return_planes, solve_lens_equation)

# Clear the sim_dicts out of memory
if not os.path.exists(dataset.outdir):
os.system('mkdir ' + dataset.outdir)
if save_to_disk:
# Clear the sim_dicts out of memory
if not os.path.exists(dataset.outdir):
os.system('mkdir ' + dataset.outdir)

for configuration in dataset.configurations:
np.save("{0}/{1}_sim_dicts.npy".format(dataset.outdir, configuration), {0: organizer.configuration_sim_dicts[configuration]}, allow_pickle=True)
del organizer.configuration_sim_dicts[configuration]
for configuration in dataset.configurations:
np.save("{0}/{1}_sim_dicts.npy".format(dataset.outdir, configuration), {0: organizer.configuration_sim_dicts[configuration]}, allow_pickle=True)
#del organizer.configuration_sim_dicts[configuration]

# Simulate images
#for configuration, sim_inputs in organizer.configuration_sim_dicts.items():
for configuration in dataset.configurations:
sim_inputs = np.load("{0}/{1}_sim_dicts.npy".format(dataset.outdir, configuration), allow_pickle=True).item()[0]
sim_inputs = dataset.organizer.configuration_sim_dicts[configuration]
# previously this entry was deleted here (why?)
#del organizer.configuration_sim_dicts[configuration]

if verbose:
print("Generating images for {0}".format(configuration))
@@ -418,7 +446,7 @@ def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,

# Handle image backgrounds if they exist
real_image_indices = []
if len(parser.image_paths) > 0 and configuration in parser.image_configurations:
if len(image_paths) > 0 and configuration in image_configurations:
image_indices = organize_image_backgrounds(im_dir, len(image_backgrounds), [_flatten_image_info(sim_input) for sim_input in sim_inputs], configuration, overwrite=background_iterate)
check_background_indices(image_indices, background_iterate)
else:
@@ -504,10 +532,10 @@ def make_dataset(config, dataset=None, save_to_disk=False, store_in_memory=True,
configuration_planes = np.array(planes)

# Add image backgrounds -- will just add zeros if no backgrounds have been specified
if len(parser.image_paths) > 0 and configuration in parser.image_configurations:
if len(image_paths) > 0 and configuration in image_configurations:
additive_image_backgrounds = image_backgrounds[np.array(real_image_indices)]
else:
temp_array = np.zeros((len(dataset.bands), parser.config_dict['IMAGE']['PARAMETERS']['numPix'], parser.config_dict['IMAGE']['PARAMETERS']['numPix']))[np.newaxis,:]
temp_array = np.zeros((len(dataset.bands), dataset.config_dict['IMAGE']['PARAMETERS']['numPix'], dataset.config_dict['IMAGE']['PARAMETERS']['numPix']))[np.newaxis,:]
additive_image_backgrounds = temp_array[np.array(real_image_indices)]


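The other half of the change in this file: sim_dicts are no longer round-tripped through <OUTDIR>/<CONFIGURATION>_sim_dicts.npy on every run; they stay on dataset.organizer and are written out only on request. A minimal sketch of the resulting behavior (demo config path as in the test below; attribute names from this diff):

import deeplenstronomy.deeplenstronomy as dl

# Default: no *_sim_dicts.npy files are written; the simulation inputs
# stay in memory on the dataset object
dataset = dl.make_dataset('../Notebooks/data/demo.yaml', random_seed=42)
sim_inputs = dataset.organizer.configuration_sim_dicts[dataset.configurations[0]]

# Opting in recreates the old on-disk artifacts under dataset.outdir
dataset = dl.make_dataset('../Notebooks/data/demo.yaml', save_to_disk=True, random_seed=42)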
63 changes: 63 additions & 0 deletions test/test_make_dataset.py
@@ -0,0 +1,63 @@
import unittest2

import sys
Folder = '../deeplenstronomy'
sys.path.append(Folder)

import os
import numpy as np

import deeplenstronomy.deeplenstronomy as dl
from deeplenstronomy.input_reader import Parser

class test_make_dataset(unittest2.TestCase):

@classmethod
def setUpClass(cls):
cls.filename = '../Notebooks/data/demo.yaml'
cls.config_dict = Parser(cls.filename, survey=None).config_dict

def test_make_from_dict(self):
dataset_from_file = dl.make_dataset(self.filename, random_seed=42)
images_from_file = dataset_from_file.CONFIGURATION_1_images

dataset_from_dict = dl.make_dataset(self.config_dict, random_seed=42)
images_from_dict = dataset_from_dict.CONFIGURATION_1_images

# Test that datasets generated from dict and from .yaml file with this dict are the same
self.assertTrue((images_from_file == images_from_dict).all())

def test_no_save_load_of_sim_dicts(self):
dataset = dl.make_dataset(self.config_dict, save_to_disk=False, random_seed=42)
# create temporary directory
os.system('mkdir temp')
# replicate previous behaviour
np.save("temp/{0}_sim_dicts.npy".format( dataset.configurations[0]),
{0: dataset.organizer.configuration_sim_dicts[dataset.configurations[0]]}, allow_pickle=True)
saved_loaded_file = np.load("temp/{1}_sim_dicts.npy".format(dataset.outdir, dataset.configurations[0]),
allow_pickle=True).item()[0]
# remove temporary directory
os.system('rm -r temp')

# results of new behaviour
original_file = dataset.organizer.configuration_sim_dicts[dataset.configurations[0]]

similarity_arr = []
for i, band_dict in enumerate(saved_loaded_file):
for band in band_dict.keys():
for key in band_dict[band].keys():
similarity_arr += [saved_loaded_file[i][band][key] == original_file[i][band][key]]

# Check that the in-memory sim_dicts match what the old save-load round trip produced
self.assertTrue(np.array(similarity_arr).all())

# Test that no files are saved if save_to_disk=False
self.assertNotIn('MySimulationResults', os.listdir('./'))

# Check that the data generated by the new code path has the expected shape
images_from_dict = dataset.CONFIGURATION_1_images
self.assertEqual(images_from_dict.shape, (24, 5, 100, 100))


if __name__ == '__main__':
unittest2.main()
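To run the new test locally, execute python test_make_dataset.py from the test/ directory, so that the relative paths ../deeplenstronomy and ../Notebooks/data/demo.yaml resolve; unittest2 must be installed.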