diff --git a/README.md b/README.md
index 9828715..adce894 100644
--- a/README.md
+++ b/README.md
@@ -1,69 +1,26 @@
-# Brut
+# Brut-v1.1
 
-This repository contains the code and manuscript text used in the paper
+This repository contains an updated version of [Brut](https://github.com/ChrisBeaumont/brut) used in the paper
 
-*The Milky Way Project: Leveraging Citizen Science and Machine Learning to Detect Interstellar Bubbles. Beaumont, Goodman, Kendrew, Williams, Simpson 2014, ApJS, in press ([arXiv link](http://arxiv.org/abs/1406.2692))*
+*Assessing the Performance of a Machine Learning Algorithm in Identifying Bubbles in Dust Emission, ApJ, in press ([arXiv link](https://arxiv.org/abs/1711.03480))*
 
-The `v1` tag represents the state of the code at the time of publication.
+We made slight changes to the modules that Brut imports. The current version of Brut runs with the following library versions:
 
-Data associated with this project is also archived at [The Dataverse](http://thedata.harvard.edu/dvn/dv/brut) (doi:10.7910/DVN/26463)
+* astropy 2.0.2
+* h5py 2.7.0
+* matplotlib 2.0.2
+* numpy 1.13.3
+* scipy 1.0.0
+* skimage 0.13.0
+* sklearn 0.19.1
+* cloud 2.8.5
 
-## High level summary
+The retrained models are provided in the models/ directory.
 
-Brut uses a database of known bubbles (from the [Milky Way Project](http://www.milkywayproject.org/)) and Spitzer images from our galaxy to build an automatic bubble classifier. The classifier is based on the Random Forest algorithm, and uses the [WiseRF](http://docs.wise.io/wiserf_python.html) implementation of this algorithm.
-
-The main question that Brut attempts to answer is "does this image contain a bubble?" The images presented to Brut are 2-color square postage stamps extracted from 8 and 24 micron Spitzer images of the Galactic plane.
-
-The [picloud](http://www.picloud.com/) platform was used to perform some of the computation in parallel, in the cloud.
-
-If you want to dig into the details of how the model is built, start with the Makefile in the scripts/ directory.
 
 ## Organization
 
-### bubbly/
-Contains the python library used to fit Random Forest classification models to Spitzer images
-
-### figures/
-Contains code to generate figures in the paper
-
-### notebooks/
-Contains several IPython notebooks in various states of organization -- some are polished documents describing aspects of the analysis, others are temporary workbooks.
-
-### paper/
-Contains the manuscript text itself
-
-### scripts/
-Python scripts to fit models and generate other derived data products
-
-
-## Reproduction
-
-This repository is MIT Licensed.
-
-To reproduce the figures and models generated for the paper, type:
-
-```
-python setup.py develop
-cd bubbly/data && make
-cd ../../paper && make
-```
-
-Though I promise you you'll have to play with dependencies to get this all set up :)
-
-## Dependencies
-
-Brut is built on top of several python libraries, and uses data from the GLIMPSE and MIPSGAL surveys from the Spitzer Space Telescope. You'll need the following libraries
-
-* aplpy
-* astropy
-* h5py
-* IPython
-* matplotlib
-* numpy
-* scipy
-* skimage
-* sklearn
-* picloud
-* WiseRF
+### models/
+Contains the original trained model and the models retrained on synthetic images plus the original training set.
+The synthetic bubble images are archived at [The Dataverse](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OSMNDG).
 
-In addition, you need to download the GLIMPSE and MIPSGAL mosaic data. The Makefile inside bubbly/data does this.
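For reference, a minimal sketch for verifying the pinned versions listed above (not part of the repository; note that `skimage` and `sklearn` are the import names of scikit-image and scikit-learn):

```python
# Print the installed version of each dependency listed in the README.
import astropy, h5py, matplotlib, numpy, scipy, skimage, sklearn

for mod in (astropy, h5py, matplotlib, numpy, scipy, skimage, sklearn):
    print('%s %s' % (mod.__name__, mod.__version__))
```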
diff --git a/bubbly/extractors.py b/bubbly/extractors.py
index b6f93cc..8bc4493 100644
--- a/bubbly/extractors.py
+++ b/bubbly/extractors.py
@@ -7,7 +7,7 @@ import numpy as np
 from scipy.optimize import fmin_l_bfgs_b as minimize
 from skimage.morphology import disk
-from skimage.filter.rank import percentile_autolevel
+from skimage.filters.rank import autolevel_percentile
 from skimage.feature import daisy
 
 from .field import get_field
@@ -60,6 +60,25 @@ def extract(self, lon, l, b, r, **kwargs):
         rgb = self._preprocess_rgb(rgb)
         return self._extract_rgb(rgb)
 
+    def extract_xd(self, lon, l, b, r, **kwargs):
+        """Like extract(), but return the preprocessed RGB stamp itself
+        instead of the feature vector."""
+        kwargs.setdefault('limits', [1, 97])
+        kwargs.setdefault('shp', self.shp)
+
+        rgb = get_field(lon).extract_stamp(l, b, r, **kwargs)
+
+        if rgb is None:
+            raise ValueError("Field is out of bounds")
+        elif (rgb[:, :, 1] == 0).mean() > 0.1:
+            raise ValueError("Field has no green channel")
+
+        return self._preprocess_rgb(rgb)
+
     def _extract_rgb(self, rgb):
         raise NotImplementedError()
@@ -182,7 +201,22 @@ def _prepare_templates(self, shp):
         ts *= (s / 2) / 20
         self.rings = np.column_stack(np.exp(-(r - rr) ** 2 / tt ** 2).ravel()
                                      for rr, tt in product(rs, ts))
 
+    def _normal_templates_coeff(self, shp):
+        """Build and return the ring-template matrix for a stamp of shape
+        shp, without caching it on the instance."""
+        s = shp[0]
+        y, x = np.mgrid[0:s, 0:s].astype(np.float)
+        r = np.hypot(y - s / 2, x - s / 2)
+        rs = np.linspace(1., s / 2, 7)
+        ts = np.array([2, 4, 6, 8, 10, 15, 20]).astype(np.float)
+        ts *= (s / 2) / 20
+        return np.column_stack(np.exp(-(r - rr) ** 2 / tt ** 2).ravel()
+                               for rr, tt in product(rs, ts))
+
     def _extract_rgb(self, rgb):
         self._prepare_templates(rgb.shape)
@@ -197,6 +231,8 @@ def _extract_rgb(self, rgb):
                            np.dot(rnorm, self.rings),
                            np.dot(gnorm, self.rings),
                            np.dot(rnorm - gnorm, self.rings)])
+        # Alternative (unused): red-channel features only
+        # result = np.hstack([np.dot(r, self.rings), np.dot(rnorm, self.rings)])
         return result.reshape(1, -1)
@@ -204,6 +240,9 @@ class DaisyExtractor(Extractor):
     def _extract_rgb(self, rgb):
         kwargs = dict(step=rgb.shape[0]/5, radius=rgb.shape[0] / 10,
                       rings=2, histograms=6, orientations=8)
+        # Cache the most recent feature vector for inspection by callers.
+        self.daisyextractor_xd = np.hstack(daisy(rgb[:, :, i], **kwargs).ravel()
+                                           for i in [0, 1])
         return np.hstack(daisy(rgb[:, :, i], **kwargs).ravel()
                          for i in [0, 1])
@@ -212,12 +251,49 @@ def __init__(self, orig):
         self.orig = orig
 
     def extract(self, lon, l, b, r):
         return np.hstack((self.orig.extract(lon, l, b, r),
                           self.orig.extract(lon, l, b, r / 2),
                           self.orig.extract(lon, l, b, r * 2),
                           self.orig.extract(lon, l, b + r / 2, r)))
 
+    def extract_xd_mve(self, lon, l, b, r):
+        """Return the preprocessed RGB stamp from the wrapped extractor."""
+        return self.orig.extract_xd(lon, l, b, r)
+
+    def extract_xd_daisy(self, lon, l, b, r):
+        """Daisy features computed on the raw stamp."""
+        return DaisyExtractor()._extract_rgb(self.orig.extract_xd(lon, l, b, r))
+
+    def extract_xd_ring(self, lon, l, b, r):
+        """Ring-template features computed on the raw stamp."""
+        return RingExtractor()._extract_rgb(self.orig.extract_xd(lon, l, b, r))
+
+    def extract_xd_compression(self, lon, l, b, r):
+        """Compression features computed on the raw stamp."""
+        return CompressionExtractor()._extract_rgb(self.orig.extract_xd(lon, l, b, r))
+
+    def extract_xd_wavelet(self, lon, l, b, r):
+        """Wavelet features computed on the raw stamp."""
+        return MultiWaveletExtractor()._extract_rgb(self.orig.extract_xd(lon, l, b, r))
+
+    def extract_xd_ring_templates(self, lon, l, b, r):
+        """Ring-template matrix matching the raw stamp's shape."""
+        shp = self.orig.extract_xd(lon, l, b, r).shape
+        return RingExtractor()._normal_templates_coeff(shp)
+
 
 class CompositeExtractor(Extractor):
     composite_classes = []
@@ -258,5 +334,5 @@ def enhance_contrast(rgb):
     s = rgb.shape
     d = disk(s[0] / 5)
     for i in range(3):
-        rgb[:, :, i] = percentile_autolevel(rgb[:, :, i], d, p0=.1, p1=.9)
+        rgb[:, :, i] = autolevel_percentile(rgb[:, :, i], d, p0=.1, p1=.9)
     return rgb
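The `extract_xd_*` helpers added above return the preprocessed RGB stamp, or features computed directly on it, rather than the concatenated multi-scale feature vector. A hedged usage sketch (the coordinates are made-up placeholders, the GLIMPSE/MIPSGAL mosaics must be available locally, and we assume the wrapped extractor inherits `extract_xd` from the base `Extractor`):

```python
# Sketch: fetch a preprocessed 2-color stamp, then compute Daisy features on it.
from bubbly.extractors import MultiViewExtractor, ManyManyExtractors

ex = MultiViewExtractor(ManyManyExtractors())
rgb = ex.extract_xd_mve(35, 34.7, -0.4, 0.05)      # raw stamp (ny, nx, 3)
feats = ex.extract_xd_daisy(35, 34.7, -0.4, 0.05)  # Daisy feature vector
```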
diff --git a/bubbly/util.py b/bubbly/util.py
index ae9aac6..6bd5760 100644
--- a/bubbly/util.py
+++ b/bubbly/util.py
@@ -3,7 +3,7 @@ import logging
 
 from skimage.transform import resize
-from sklearn.metrics import recall_score, auc_score
+from sklearn.metrics import recall_score, roc_auc_score
 
 import numpy as np
@@ -56,9 +56,14 @@ def scale(x, mask=None, limits=None):
     if mask is None:
         lo, hi = np.percentile(x, limits)
     else:
-        lo, hi = np.percentile(x[mask], limits)
-
+        # Guard against an empty mask, which would crash np.percentile.
+        if x[mask].size > 0:
+            lo, hi = np.percentile(x[mask], limits)
+        else:
+            lo, hi = 0, 0
+
     x = (np.clip(x, lo, hi) - lo) / (hi - lo)
     return (np.sqrt(x) * 255).astype(np.uint8)
@@ -67,7 +72,7 @@ def resample(arr, shape):
     # skimage's resize needs scaled data
     lo, hi = np.nanmin(arr), np.nanmax(arr)
     arr = (arr - lo) / (hi - lo)
-    result = resize(arr, shape, mode='nearest')
+    result = resize(arr, shape, mode='edge')
     return result * (hi - lo) + lo
diff --git a/bubbly/wiserf.py b/bubbly/wiserf.py
index 4d94420..f3cbbaa 100644
--- a/bubbly/wiserf.py
+++ b/bubbly/wiserf.py
@@ -1,6 +1,7 @@
 import os
 
-import PyWiseRF
+# PyWiseRF is no longer available; use scikit-learn's random forest instead.
+from sklearn.ensemble import RandomForestClassifier
 import cloud
 
 import numpy as np
@@ -14,7 +15,12 @@ def test():
     clf = WiseRF().fit(x, y)
     return clf
 
-class WiseRF(PyWiseRF.WiseRF):
+class WiseRF(RandomForestClassifier):
+    """Drop-in replacement for PyWiseRF.WiseRF, backed by scikit-learn's
+    RandomForestClassifier."""
     def decision_function(self, x):
         p = self.predict_proba(x)
         return p[:, 1] - p[:, 0]
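Because `WiseRF` is now a thin subclass of scikit-learn's `RandomForestClassifier`, the shim can be exercised without any Spitzer data. A minimal sketch on toy data (the parameters here are illustrative, not the paper's):

```python
# The decision_function added by the shim is p(class 1) - p(class 0).
import numpy as np
from bubbly.wiserf import WiseRF

X = np.random.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)       # toy binary labels
clf = WiseRF(n_estimators=50).fit(X, y)
scores = clf.decision_function(X)     # in [-1, 1]; positive favors class 1
```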
+ +* "full_classifier_retrain_xd_all_gini_noise_0722.dat": retrained on the synthetic bubbles with and without noise and the original training set from MWP +* "full_classifier_retrain_xd_all_nonnoise_gini_0722.dat": retrained on the synthetic bubbles without noise and the original training set from MWP +* "full_classifier_xd_only_sim_non_noi_1102.dat": retrained on the synthetic bubbles without noise +* "full_classifier_xd_only_simulation_1029.dat": retrained on the synthetic bubbles with and without noise +* "full_classifier_xd_reduceMWP_simulation_1025.dat": retrained on the synthetic bubbles with and without noise and half of the original training set from MWP +* "full_classifier.dat": original training + + + + diff --git a/models/full_classifier_retrain_xd_all_gini_noise_0722.dat b/models/full_classifier_retrain_xd_all_gini_noise_0722.dat new file mode 100644 index 0000000..c04a8e3 Binary files /dev/null and b/models/full_classifier_retrain_xd_all_gini_noise_0722.dat differ diff --git a/models/full_classifier_retrain_xd_all_nonnoise_gini_0722.dat b/models/full_classifier_retrain_xd_all_nonnoise_gini_0722.dat new file mode 100644 index 0000000..18275bf Binary files /dev/null and b/models/full_classifier_retrain_xd_all_nonnoise_gini_0722.dat differ diff --git a/models/full_classifier_xd_only_sim_non_noi_1102.dat b/models/full_classifier_xd_only_sim_non_noi_1102.dat new file mode 100644 index 0000000..788c668 Binary files /dev/null and b/models/full_classifier_xd_only_sim_non_noi_1102.dat differ diff --git a/models/full_classifier_xd_only_simulation_1029.dat b/models/full_classifier_xd_only_simulation_1029.dat new file mode 100644 index 0000000..8db9ce1 Binary files /dev/null and b/models/full_classifier_xd_only_simulation_1029.dat differ diff --git a/models/full_classifier_xd_reduceMWP_simulation_1025.dat b/models/full_classifier_xd_reduceMWP_simulation_1025.dat new file mode 100644 index 0000000..2e985d9 Binary files /dev/null and b/models/full_classifier_xd_reduceMWP_simulation_1025.dat differ diff --git a/scripts/build_full_classifier.py b/scripts/build_full_classifier.py index 7d4a647..f4016bf 100644 --- a/scripts/build_full_classifier.py +++ b/scripts/build_full_classifier.py @@ -1,35 +1,119 @@ import json import cPickle as pickle +import numpy as np from bubbly.model import Model, ModelGroup from bubbly.extractors import MultiViewExtractor, ManyManyExtractors -from bubbly.dr1 import WideLocationGenerator +from bubbly.dr1 import WideLocationGenerator,LocationGenerator from bubbly.wiserf import WiseRF +#from sklearn.ensemble import RandomForestClassifier - +def add_traningset_1(data,lon): + for ctt_l in range(10): + for ctt_b in range(4): + data['pos'].append([lon, lon%360-0.95+ctt_l*0.1, (ctt_b-1)*0.1, 0.046]) + return data + +def add_traningset_2(data,lon): + for ctt_l in range(10): + for ctt_b in range(4): + data['pos'].append([lon, lon%360-0.95+ctt_l*0.1, (ctt_b-1)*0.1, 0.038]) + return data + +def add_traningset_neg(data,lon): + for ctt_l in range(20): + for ctt_b in range(8): + data['neg'].append([lon, lon%360-0.95+ctt_l*0.1, (ctt_b-3.5)*0.1, 0.046]) + return data + + def make_model(mod3): params = {'max_features': 'auto', - 'min_samples_split': 4, 'n_jobs': 2, - 'criterion': 'infogain', + 'min_samples_split': 4, +# 'criterion': 'infogain', + 'criterion': 'gini', ### entropy +# 'criterion': 'entropy', ### 'n_estimators': 800} ex = MultiViewExtractor(ManyManyExtractors()) loc = WideLocationGenerator(mod3) +# clf = RandomForestClassifier(**params) clf = WiseRF(**params) return 
diff --git a/scripts/build_full_classifier.py b/scripts/build_full_classifier.py
index 7d4a647..f4016bf 100644
--- a/scripts/build_full_classifier.py
+++ b/scripts/build_full_classifier.py
@@ -1,35 +1,119 @@
 import json
 import cPickle as pickle
+import numpy as np
 
 from bubbly.model import Model, ModelGroup
 from bubbly.extractors import MultiViewExtractor, ManyManyExtractors
-from bubbly.dr1 import WideLocationGenerator
+from bubbly.dr1 import WideLocationGenerator, LocationGenerator
 from bubbly.wiserf import WiseRF
 
+
+def add_trainingset_1(data, lon):
+    """Append a 10x4 grid of synthetic positive stamps (r = 0.046 deg)."""
+    for ctt_l in range(10):
+        for ctt_b in range(4):
+            data['pos'].append([lon, lon % 360 - 0.95 + ctt_l * 0.1,
+                                (ctt_b - 1) * 0.1, 0.046])
+    return data
+
+
+def add_trainingset_2(data, lon):
+    """Append a 10x4 grid of synthetic positive stamps (r = 0.038 deg)."""
+    for ctt_l in range(10):
+        for ctt_b in range(4):
+            data['pos'].append([lon, lon % 360 - 0.95 + ctt_l * 0.1,
+                                (ctt_b - 1) * 0.1, 0.038])
+    return data
+
+
+def add_trainingset_neg(data, lon):
+    """Append a 20x8 grid of synthetic negative stamps."""
+    for ctt_l in range(20):
+        for ctt_b in range(8):
+            data['neg'].append([lon, lon % 360 - 0.95 + ctt_l * 0.1,
+                                (ctt_b - 3.5) * 0.1, 0.046])
+    return data
+
+
 def make_model(mod3):
     params = {'max_features': 'auto',
-              'min_samples_split': 4, 'n_jobs': 2,
-              'criterion': 'infogain',
+              'min_samples_split': 4,
+              'criterion': 'gini',  # 'entropy' is the other supported option
               'n_estimators': 800}
 
     ex = MultiViewExtractor(ManyManyExtractors())
     loc = WideLocationGenerator(mod3)
     clf = WiseRF(**params)
 
     return Model(ex, loc, clf)
 
 
 def train_model(model, mod3):
-    data = json.load(open('../models/training_data_%i.json' % mod3))
+    data = json.load(open('../models/training_dataxdno_%i.json' % mod3))
+
+    # Augment the training set with grids of synthetic bubble stamps; the
+    # synthetic fields are indexed by Galactic longitude + 360.
+    if mod3 == 0:
+        for lon_all in np.array([71, 82, 74, 76, 121, 112, 124, 116]) + 360:
+            data = add_trainingset_1(data, int(lon_all))
+        for lon_all in np.array([73, 85, 77, 113, 115, 127]) + 360:
+            data = add_trainingset_2(data, int(lon_all))
+        data = add_trainingset_neg(data, 82)
+    if mod3 == 1:
+        for lon_all in np.array([71, 72, 74, 86, 111, 122, 114, 116]) + 360:
+            data = add_trainingset_1(data, int(lon_all))
+        for lon_all in np.array([83, 75, 77, 113, 125, 117]) + 360:
+            data = add_trainingset_2(data, int(lon_all))
+        data = add_trainingset_neg(data, 83)
+    if mod3 == 2:
+        for lon_all in np.array([81, 72, 84, 76, 111, 112, 114, 126]) + 360:
+            data = add_trainingset_1(data, int(lon_all))
+        for lon_all in np.array([73, 75, 87, 123, 115, 117]) + 360:
+            data = add_trainingset_2(data, int(lon_all))
+        data = add_trainingset_neg(data, 82)
+
     model.fit(data['pos'], data['neg'])
     return model
 
 
 def main():
     models = [train_model(make_model(i), i) for i in [0, 1, 2]]
     mg = ModelGroup(*models)
-    mg.save('../models/full_classifier.dat')
+    # The other classifiers in models/ were produced by swapping the input
+    # JSON in train_model and the output path below (see models/README.md).
+    mg.save('../models/full_classifier_xd_only_sim_non_noi_1102.dat')
 
 
 if __name__ == "__main__":
     main()
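For reference, a standalone sketch of the stamp grid that `add_trainingset_1` above lays down for a single synthetic field (pure Python, no repository dependencies):

```python
# Each synthetic field contributes a 10x4 raster of (lon, l, b, r) stamps.
lon = 71 + 360  # the first mod3 == 0 field above
stamps = [[lon, lon % 360 - 0.95 + i * 0.1, (j - 1) * 0.1, 0.046]
          for i in range(10) for j in range(4)]
print(len(stamps))  # 40 positive examples for this field
print(stamps[0])    # [431, ~70.05, -0.1, 0.046]
```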