From c8d5329d376e03c8a1aca08fb419df023a143d9c Mon Sep 17 00:00:00 2001 From: rlphilli Date: Tue, 19 Jan 2021 11:59:53 -0500 Subject: [PATCH 001/116] Integrated SQF pipeline --- examples/configs/datasets.py | 20 ++ examples/configs/supported.py | 2 + wilds/common/metrics/all_metrics.py | 37 ++++ wilds/common/utils.py | 4 +- wilds/datasets/sqf_dataset.py | 276 ++++++++++++++++++++++++++++ 5 files changed, 337 insertions(+), 2 deletions(-) create mode 100644 wilds/datasets/sqf_dataset.py diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 1d15c7af..e760ff50 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -223,6 +223,26 @@ 'n_epochs': 3, 'n_groups_per_batch': 2, }, + 'sqf': { + 'split_scheme': 'all_race', + 'model': 'logistic_regression', + 'train_transform': None, + 'eval_transform': None, + 'model_kwargs': {'in_features': 104}, + 'loss_function': 'cross_entropy', + 'groupby_fields': ['y'], + 'val_metric': 'precision_at_global_recall__all', + 'val_metric_decreasing': False, + 'algo_log_metric': 'accuracy', + 'optimizer': 'Adam', + 'optimizer_kwargs': {}, + 'scheduler': None, + 'batch_size': 4, + 'lr': 5e-5, + 'weight_decay': 0, + 'n_epochs': 4, + # 'dataset_kwargs' : {'uniform_over_groups' :True} + }, } ########################################## diff --git a/examples/configs/supported.py b/examples/configs/supported.py index bcbe54a9..34c470c9 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -11,6 +11,7 @@ from wilds.datasets.iwildcam_dataset import IWildCamDataset from wilds.datasets.ogbmolpcba_dataset import OGBPCBADataset from wilds.datasets.poverty_dataset import PovertyMapDataset +from wilds.datasets.sqf_dataset import SQFDataset from wilds.datasets.waterbirds_dataset import WaterbirdsDataset from wilds.datasets.yelp_dataset import YelpDataset # metrics @@ -29,6 +30,7 @@ 'poverty': PovertyMapDataset, 'fmow': FMoWDataset, 'bdd100k': BDD100KDataset, + 'sqf': SQFDataset, } losses = { diff --git a/wilds/common/metrics/all_metrics.py b/wilds/common/metrics/all_metrics.py index 3c2af169..33328071 100644 --- a/wilds/common/metrics/all_metrics.py +++ b/wilds/common/metrics/all_metrics.py @@ -128,3 +128,40 @@ def __init__(self, name=None): if name is None: name = 'mse' super().__init__(name=name, loss_fn=mse_loss) + +class BinaryAUPRC(Metric): + def __init__(self, score_fn=logits_to_score, name=None): + self.score_fn = score_fn + if name is None: + name = f'auprc' + + super().__init__(name=name) + + def _compute(self, y_pred, y_true): + if self.score_fn is not None: + score = self.score_fn(y_pred) + try: + return torch.tensor(sklearn.metrics.average_precision_score(y_true, score)) + except ValueError: + print('Warning: AUPRC not defined when there are no positive cases.') + return torch.FloatTensor([float('nan')]) + + def worst(self, metrics): + return minimum(metrics) + +class PrecisionAtRecall(Metric): + """Given a specific model threshold, determine the precision score achieved""" + def __init__(self, threshold, score_fn=logits_to_score, name=None): + self.score_fn = score_fn + self.threshold = threshold + if name is None: + name = "precision_at_global_recall_" + super().__init__(name=name) + + def _compute(self, y_pred, y_true): + score = self.score_fn(y_pred) + predictions = (score > self.threshold) + return torch.tensor(sklearn.metrics.precision_score(y_true, predictions)) + + def worst(self, metrics): + return minimum(metrics) diff --git a/wilds/common/utils.py b/wilds/common/utils.py index 
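# The two metrics added above wrap sklearn's binary-classification scores.
# A self-contained sketch of the computations they perform, assuming (as
# sqf_dataset.py's eval() later does explicitly) that logits_to_score maps
# two-class logits to the positive-class softmax probability; data invented:

import torch
import torch.nn.functional as F
import sklearn.metrics

logits = torch.tensor([[0.2, 1.1], [1.5, 0.3], [0.1, 0.9], [0.8, 0.1]])
y_true = torch.tensor([1, 0, 1, 0])

score = F.softmax(logits, dim=1)[:, 1]                          # P(y = 1)
auprc = sklearn.metrics.average_precision_score(y_true, score)  # BinaryAUPRC core
preds = score > 0.5                                             # threshold chosen elsewhere
precision = sklearn.metrics.precision_score(y_true, preds)      # PrecisionAtRecall core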
9fd6426f..7854393a 100644 --- a/wilds/common/utils.py +++ b/wilds/common/utils.py @@ -122,7 +122,7 @@ def shuffle_arr(arr, seed=None): rng.shuffle(arr) return arr -def threshold_at_recall(y_pred, y_true, global_recall=0.6): +def threshold_at_recall(y_pred, y_true, global_recall=60): """ Calculate the model threshold to use to achieve a desired global_recall level. Assumes that y_true is a vector of the true binary labels.""" - return np.percentile(y_pred[y_true == 1], global_recall) + return np.percentile(y_pred[y_true == 1], 100-global_recall) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py new file mode 100644 index 00000000..2b236bc1 --- /dev/null +++ b/wilds/datasets/sqf_dataset.py @@ -0,0 +1,276 @@ +import os +import torch +import pandas as pd +import numpy as np +from wilds.datasets.wilds_dataset import WILDSDataset +from wilds.common.metrics.all_metrics import Accuracy, BinaryAUPRC, PrecisionAtRecall +from wilds.common.grouper import CombinatorialGrouper +from wilds.common.utils import subsample_idxs, threshold_at_recall +import torch.nn.functional as F + +class SQFDataset(WILDSDataset): + """ + New york stop and frisk data. CPW (weapons stops) from 2009 - 2012, as orginally provided by the NYPD and later + cleaned by Goel, Rao, and Shroff 2016 https://projecteuclid.org/euclid.aoas/1458909920 . Shared with permission. + https://5harad.com/#research for the full dataset. + + Supported `split_scheme`: + 'black', 'all_race', 'bronx', or 'all_borough' + + Input (x): + Either 29 one-hot pre-stop observable features or 104=29 observables + 75 one-hot district indicators + + Label (y): + y is binary. It is 1 if the stop is listed as finding a weapon, 0 otherwise. + + Metadata: + Each stop is annotated with the borough the stop took place, the race of the stopped person, and whether the stop + took place in the early or later time periond + + Website: + NYPD - https://www1.nyc.gov/site/nypd/stats/reports-analysis/stopfrisk.page + Cleaned data - https://5harad.com/data/sqf.RData + + Cleaning and analysis citation: + @article{goel_precinct_2016, + title = {Precinct or prejudice? {Understanding} racial disparities in {New} {York} {City}’s stop-and-frisk policy}, + volume = {10}, + issn = {1932-6157}, + shorttitle = {Precinct or prejudice?}, + url = {http://projecteuclid.org/euclid.aoas/1458909920}, + doi = {10.1214/15-AOAS897}, + language = {en}, + number = {1}, + journal = {The Annals of Applied Statistics}, + author = {Goel, Sharad and Rao, Justin M. 
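# The percentile flip above (q = 100 - recall) is what makes the returned
# threshold deliver the requested recall: 60% of the positives must score
# ABOVE the threshold, so it sits at the 40th percentile of the positive-class
# scores. Worked check with invented numbers:

import numpy as np

y_true = np.array([1, 1, 1, 1, 1, 0, 0])
y_pred = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3])
t = np.percentile(y_pred[y_true == 1], 100 - 60)   # 40th pct of positives = 0.66
recall = (y_pred[y_true == 1] > t).mean()          # 3 of 5 positives -> 0.6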
and Shroff, Ravi}, + month = mar, + year = {2016}, + pages = {365--394}, + } + """ + def __init__(self, root_dir, download, split_scheme): + # set variables + self._dataset_name = 'sqf' + self._version = '1.0' + self._split_scheme = split_scheme + self._y_size = 1 + self._n_classes = 2 + self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xea27fd7daef642d2aa95b02f1e3ac404/contents/blob/' + # path + self._data_dir = self.initialize_data_dir(root_dir, download) + + # Load data + data_df = pd.read_csv(os.path.join(self.data_dir, 'sqf.csv') , index_col=0) + + categories = ['black', 'white hispanic', 'black hispanic', 'hispanic', 'white'] + data_df = data_df.loc[data_df['suspect.race'].map(lambda x: x in categories)] + data_df['suspect.race'] = data_df['suspect.race'].map(lambda x: 'Hispanic' if 'hispanic' in x else x.title()) + + # Only track weapons stops + data_df = data_df[data_df['suspected.crime']=='cpw'] + + # Get district features if measuring race, don't if measuring boroughs + self.feats_to_use = self.get_split_features(data_df.columns) + + # Drop data that doesn't have the all of the predictive features. This preserves almost all rows. + data_df = data_df.dropna(subset=self.feats_to_use) + + # Get indices based on new index / after dropping rows with missing data + train_idxs, test_idxs, val_idxs = self.get_split_indices(data_df) + + # Drop rows with unused metadata categories + data_df = data_df.loc[train_idxs + test_idxs + val_idxs] + + # Reindex for simplicity + data_df.index = range(data_df.shape[0]) + train_idxs = range(0, len(train_idxs)) + test_idxs = range(len(train_idxs), len(train_idxs)+ len(test_idxs)) + val_idxs = range(test_idxs[-1], data_df.shape[0] ) + + # Normalize continuous features + data_df = self.normalize_data(data_df, train_idxs) + self._input_array = data_df + + # Create split dictionaries + self.initialize_split_dicts() + + # Get whether a weapon was found for various groups + self._y_array = torch.from_numpy(data_df['found.weapon'].values).long() + + # Metadata will be int dicts + self._identity_vars = [ 'suspect.race', 'borough', 'train.period'] + + explicit_identity_label_df, self._metadata_map = self.load_metadata(data_df) + + self._metadata_array = torch.cat( + ( + torch.LongTensor(explicit_identity_label_df.values), + self._y_array.reshape((-1, 1)) + ), + dim=1 + ) + self._metadata_fields = ['suspect race', 'borough', '2010 or earlier?'] + ['y'] + + self.get_split_maps( data_df, train_idxs, test_idxs, val_idxs) + + data_df = data_df[self.feats_to_use] + self._input_array = pd.get_dummies(data_df, columns=[i for i in self.feats_to_use if 'suspect.' 
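# The race recoding above collapses the three Hispanic categories into one
# and title-cases the remainder; a toy run of the same lambda:

import pandas as pd

s = pd.Series(['black', 'white hispanic', 'black hispanic', 'white'])
s = s.map(lambda x: 'Hispanic' if 'hispanic' in x else x.title())
list(s)   # ['Black', 'Hispanic', 'Hispanic', 'White']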
not in i and + 'observation.period' not in i], drop_first=True) + # Recover relevant features after taking dummies + new_feats = [] + for i in self.feats_to_use: + for j in self._input_array: + if i in j: + new_feats.append(j) + else: + pass + self._input_array = self._input_array[new_feats] + self.initialize_eval_grouper() + + + def load_metadata(self, data_df): + metadata_df = data_df[self._identity_vars].copy() + metadata_names = ['suspect race', 'borough', '2010 or earlier?'] + metadata_ordered_maps = {} + for col_name, meta_name in zip(metadata_df.columns, metadata_names): + + col_order = sorted(set(metadata_df[col_name])) + col_dict = dict(zip(col_order, range(len(col_order)))) + metadata_ordered_maps[col_name] = col_order + metadata_df[meta_name] = metadata_df[col_name].map(col_dict) + return metadata_df[metadata_names], metadata_ordered_maps + + + def get_split_indices(self, data_df): + """Finds splits based on the split type """ + test_idxs = data_df[data_df.year > 2010].index.tolist() + train_df = data_df[data_df.year <= 2010] + validation_id_idxs = subsample_idxs(train_df.index.tolist(), num=int(train_df.shape[0] * 0.2), seed=2851, + take_rest=False) + + train_df = train_df[~train_df.index.isin(validation_id_idxs)] + + if 'black' == self._split_scheme: + train_idxs = train_df[train_df['suspect.race'] == 'Black'].index.tolist() + + elif 'all_race' in self._split_scheme: + black_train_size = train_df[train_df['suspect.race'] == 'Black'].shape[0] + train_idxs = subsample_idxs(train_df.index.tolist(), num=black_train_size, take_rest=False, seed=4999) + + elif 'all_borough' == self._split_scheme: + bronx_train_size = train_df[train_df['borough'] == 'Bronx'].shape[0] + train_idxs = subsample_idxs(train_df.index.tolist(), num=bronx_train_size, take_rest=False, seed=8614) + + elif 'bronx' == self._split_scheme: + train_idxs = train_df[train_df['borough'] == 'Bronx'].index.tolist() + + else: + raise ValueError(f'Split scheme {self.split_scheme} not recognized') + + return train_idxs, test_idxs, validation_id_idxs + + def indices_to_dict(self, indices, int_val): + local_idx_dict = {} + for i in indices: + local_idx_dict[i] = int_val + return local_idx_dict + + def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): + """Using the existing split indices, create a map to put entries to training and validation sets. 
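# load_metadata's coding idiom, stated on its own: each categorical identity
# column becomes integer codes via a sorted-order dict, and the sorted list
# itself is kept as the reverse map. Toy illustration (values invented):

import pandas as pd

col = pd.Series(['Bronx', 'Queens', 'Bronx', 'Brooklyn'])
col_order = sorted(set(col))                        # ['Bronx', 'Brooklyn', 'Queens']
col_dict = dict(zip(col_order, range(len(col_order))))
codes = col.map(col_dict)                           # 0, 2, 0, 1
assert col_order[codes[1]] == 'Queens'              # codes index back into col_order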
Set class var.""" + index_dict = {} + for arg, idx_set in enumerate([train_idxs, test_idxs, val_idxs]): + index_dict.update(self.indices_to_dict(idx_set, arg)) + + index_accumulator = [] + + for index, sample in data_df.iterrows(): + index_accumulator.append(index_dict[index]) + + self._split_array = np.array(index_accumulator) + + def get_split_features(self, columns): + """Get features that include precinct if we're splitting on race or don't include if we're using borough splits.""" + feats_to_use = [] + if 'bronx' not in self._split_scheme and 'borough' not in self._split_scheme: + feats_to_use.append('precinct') + + feats_to_use += ['suspect.height', 'suspect.weight', 'suspect.age', 'observation.period', + 'inside.outside', 'location.housing', 'radio.run', 'officer.uniform'] + # Primary stop reasoning features + feats_to_use += [i for i in columns if 'stopped.bc' in i] + # Secondary stop reasoning features, if any + feats_to_use += [i for i in columns if 'additional' in i] + + return feats_to_use + + def normalize_data(self, df, train_idxs): + """"Normalizes the data as Goel et al do - continuous features only""" + columns_to_norm = ['suspect.height', 'suspect.weight', 'suspect.age', 'observation.period'] + df_unnormed_train = df.loc[train_idxs].copy() + for feature_name in columns_to_norm: + df[feature_name] = df[feature_name] - np.mean(df_unnormed_train[feature_name]) + df[feature_name] = df[feature_name] / np.std(df_unnormed_train[feature_name]) + return df + + + def initialize_split_dicts(self): + """Identify split indices and name splits""" + if 'all_borough' == self.split_scheme : + self._split_dict = {'train': 0, 'test': 1, 'val':2} + self._split_names = {'train': 'All Boroughs 2009 & 10 subsampled to match Bronx train set size', 'test':'All Stops 2010 & 11', \ + 'val':'20% sample of all stops 2009 & 10'} + elif 'bronx' == self.split_scheme: + self._split_dict = {'train': 0, 'test': 1, 'val': 2} + self._split_names = {'train': 'Bronx 2009 & 10', 'test': 'All Stops 2010 & 11', \ + 'val': '20% sample of all stops 2009 & 10'} + elif 'black' == self.split_scheme: + self._split_dict = {'train': 0, 'test': 1, 'val':2} + self._split_names = {'train': 'train: 80% Black Stops 2009 and 2010', 'test':'Test: All Stops 2011 and 2012. ', \ + 'val':'20% sample of all stops 2009 & 10'} + elif 'all_race' == self.split_scheme or 'test' == self.split_scheme : + self._split_dict = {'train': 0, 'test': 1, 'val':2} + self._split_names = {'train': 'train: Stops 2009 and 2010 subsampled to the size of Black people training set', 'test':'Test: All Stops 2011 and 2012. 
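# normalize_data standardizes the continuous features with statistics taken
# from the training rows only, so val/test information never leaks into the
# scaling. Condensed, runnable form of the same loop (toy data):

import numpy as np
import pandas as pd

df = pd.DataFrame({'suspect.age': [20., 30., 40., 50.]})
train_idxs = [0, 1]
train = df.loc[train_idxs]                 # copy of the unnormalized train rows
for c in ['suspect.age']:
    df[c] = (df[c] - np.mean(train[c])) / np.std(train[c])
# train rows become -1 and 1; the other rows are scaled by the same statistics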
', \ + 'val':'20% sample of all stops 2009 & 10'} + else: + raise ValueError(f'Split scheme {self.split_scheme} not recognized') + + + def get_input(self, idx): + return torch.FloatTensor(self._input_array.loc[idx].values) + + def eval(self, y_pred, y_true, metadata): + """Evaluate the precision achieve overall and across groups for a given global recall""" + g = self._eval_grouper.metadata_to_group(metadata) + + y_scores = F.softmax(y_pred, dim=1)[:,1] + threshold_60 = threshold_at_recall(y_scores, y_true) + results = Accuracy().compute(y_pred, y_true) + results.update(BinaryAUPRC().compute(y_pred, y_true)) + results.update(PrecisionAtRecall(threshold_60).compute(y_pred, y_true)) + results.update(Accuracy().compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) + results.update(BinaryAUPRC().compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) + results.update( + PrecisionAtRecall(threshold_60).compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) + + results_str = ( + f"Average {PrecisionAtRecall(threshold=threshold_60).name }: {results[PrecisionAtRecall(threshold=threshold_60).agg_metric_field]:.3f}\n" + f"Average {BinaryAUPRC().name}: {results[BinaryAUPRC().agg_metric_field]:.3f}\n" + f"Average {Accuracy().name}: {results[Accuracy().agg_metric_field]:.3f}\n" + ) + + return results, results_str + + def initialize_eval_grouper(self): + if 'black' in self.split_scheme or 'race' in self.split_scheme or 'test' in self.split_scheme: + self._eval_grouper = CombinatorialGrouper( + dataset=self, + groupby_fields = ['suspect race'] + ) + elif 'bronx' in self.split_scheme or 'all_borough' == self.split_scheme: + self._eval_grouper = CombinatorialGrouper( + dataset=self, + groupby_fields = ['borough']) + else: + raise ValueError(f'Split scheme {self.split_scheme} not recognized') + + From 69937098241339f8d075458b5a95bd4a107bc18e Mon Sep 17 00:00:00 2001 From: rlphilli Date: Tue, 19 Jan 2021 21:54:01 -0500 Subject: [PATCH 002/116] Updated to balance groups while training --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index e760ff50..5c7956f7 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -241,7 +241,7 @@ 'lr': 5e-5, 'weight_decay': 0, 'n_epochs': 4, - # 'dataset_kwargs' : {'uniform_over_groups' :True} + 'uniform_over_groups':True, }, } From 39e0d66fb960590bdd101e63041c08c4b62914c9 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 12:22:55 -0500 Subject: [PATCH 003/116] Removed AUPRC --- wilds/common/metrics/all_metrics.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/wilds/common/metrics/all_metrics.py b/wilds/common/metrics/all_metrics.py index 33328071..0dff4f47 100644 --- a/wilds/common/metrics/all_metrics.py +++ b/wilds/common/metrics/all_metrics.py @@ -129,26 +129,6 @@ def __init__(self, name=None): name = 'mse' super().__init__(name=name, loss_fn=mse_loss) -class BinaryAUPRC(Metric): - def __init__(self, score_fn=logits_to_score, name=None): - self.score_fn = score_fn - if name is None: - name = f'auprc' - - super().__init__(name=name) - - def _compute(self, y_pred, y_true): - if self.score_fn is not None: - score = self.score_fn(y_pred) - try: - return torch.tensor(sklearn.metrics.average_precision_score(y_true, score)) - except ValueError: - print('Warning: AUPRC not defined when there are no positive cases.') - return torch.FloatTensor([float('nan')]) - - def 
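# eval() above derives one pooled threshold at 60% global recall and then
# reports precision both overall and per group (race or borough). A compressed,
# self-contained approximation of that computation - in the real code the
# WILDS Metric and CombinatorialGrouper classes handle the bookkeeping:

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import precision_score

def precision_at_global_recall(logits, y_true, groups, n_groups, recall=60):
    scores = F.softmax(logits, dim=1)[:, 1].numpy()
    y = y_true.numpy()
    t = np.percentile(scores[y == 1], 100 - recall)        # pooled threshold
    results = {'all': precision_score(y, scores > t)}
    for k in range(n_groups):                              # same t for every group
        m = groups.numpy() == k
        if m.any():
            results[k] = precision_score(y[m], scores[m] > t)
    return results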
worst(self, metrics): - return minimum(metrics) - class PrecisionAtRecall(Metric): """Given a specific model threshold, determine the precision score achieved""" def __init__(self, threshold, score_fn=logits_to_score, name=None): From 5501e23f6927cfe8a99101a21952c63163b8b312 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 12:44:41 -0500 Subject: [PATCH 004/116] Light formatting fixes --- wilds/datasets/sqf_dataset.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index 2b236bc1..4a82e8c1 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -3,7 +3,7 @@ import pandas as pd import numpy as np from wilds.datasets.wilds_dataset import WILDSDataset -from wilds.common.metrics.all_metrics import Accuracy, BinaryAUPRC, PrecisionAtRecall +from wilds.common.metrics.all_metrics import Accuracy, PrecisionAtRecall from wilds.common.grouper import CombinatorialGrouper from wilds.common.utils import subsample_idxs, threshold_at_recall import torch.nn.functional as F @@ -61,13 +61,15 @@ def __init__(self, root_dir, download, split_scheme): # Load data data_df = pd.read_csv(os.path.join(self.data_dir, 'sqf.csv') , index_col=0) - + data_df = data_df[data_df['suspected.crime'] == 'cpw'] + print('!!!!!', data_df.shape) categories = ['black', 'white hispanic', 'black hispanic', 'hispanic', 'white'] data_df = data_df.loc[data_df['suspect.race'].map(lambda x: x in categories)] data_df['suspect.race'] = data_df['suspect.race'].map(lambda x: 'Hispanic' if 'hispanic' in x else x.title()) # Only track weapons stops data_df = data_df[data_df['suspected.crime']=='cpw'] + print(data_df.shape) # Get district features if measuring race, don't if measuring boroughs self.feats_to_use = self.get_split_features(data_df.columns) @@ -245,16 +247,13 @@ def eval(self, y_pred, y_true, metadata): y_scores = F.softmax(y_pred, dim=1)[:,1] threshold_60 = threshold_at_recall(y_scores, y_true) results = Accuracy().compute(y_pred, y_true) - results.update(BinaryAUPRC().compute(y_pred, y_true)) results.update(PrecisionAtRecall(threshold_60).compute(y_pred, y_true)) results.update(Accuracy().compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) - results.update(BinaryAUPRC().compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) results.update( PrecisionAtRecall(threshold_60).compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) results_str = ( f"Average {PrecisionAtRecall(threshold=threshold_60).name }: {results[PrecisionAtRecall(threshold=threshold_60).agg_metric_field]:.3f}\n" - f"Average {BinaryAUPRC().name}: {results[BinaryAUPRC().agg_metric_field]:.3f}\n" f"Average {Accuracy().name}: {results[Accuracy().agg_metric_field]:.3f}\n" ) @@ -269,7 +268,7 @@ def initialize_eval_grouper(self): elif 'bronx' in self.split_scheme or 'all_borough' == self.split_scheme: self._eval_grouper = CombinatorialGrouper( dataset=self, - groupby_fields = ['borough']) + groupby_fields = ['borough']) else: raise ValueError(f'Split scheme {self.split_scheme} not recognized') From e9d694e0c04d9f6bc87d49a3851b29ea006ae7db Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 12:46:07 -0500 Subject: [PATCH 005/116] Change PAR name --- wilds/common/metrics/all_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/common/metrics/all_metrics.py b/wilds/common/metrics/all_metrics.py index 0dff4f47..146b96a3 100644 --- a/wilds/common/metrics/all_metrics.py 
+++ b/wilds/common/metrics/all_metrics.py @@ -135,7 +135,7 @@ def __init__(self, threshold, score_fn=logits_to_score, name=None): self.score_fn = score_fn self.threshold = threshold if name is None: - name = "precision_at_global_recall_" + name = "precision_at_global_recall" super().__init__(name=name) def _compute(self, y_pred, y_true): From 8c39d8e654bf859d92c66d501115502a0bb89ece Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 12:47:20 -0500 Subject: [PATCH 006/116] removed test split --- wilds/datasets/sqf_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index 4a82e8c1..e88586d0 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -229,7 +229,7 @@ def initialize_split_dicts(self): self._split_dict = {'train': 0, 'test': 1, 'val':2} self._split_names = {'train': 'train: 80% Black Stops 2009 and 2010', 'test':'Test: All Stops 2011 and 2012. ', \ 'val':'20% sample of all stops 2009 & 10'} - elif 'all_race' == self.split_scheme or 'test' == self.split_scheme : + elif 'all_race' == self.split_scheme : self._split_dict = {'train': 0, 'test': 1, 'val':2} self._split_names = {'train': 'train: Stops 2009 and 2010 subsampled to the size of Black people training set', 'test':'Test: All Stops 2011 and 2012. ', \ 'val':'20% sample of all stops 2009 & 10'} @@ -260,7 +260,7 @@ def eval(self, y_pred, y_true, metadata): return results, results_str def initialize_eval_grouper(self): - if 'black' in self.split_scheme or 'race' in self.split_scheme or 'test' in self.split_scheme: + if 'black' in self.split_scheme or 'race' in self.split_scheme : self._eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields = ['suspect race'] From bdd6cd2ae89dea787f3425d9d32e8cfee20b0f3e Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 12:48:37 -0500 Subject: [PATCH 007/116] Removed uniform_over_groups --- examples/configs/datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 5c7956f7..a8d1e521 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -241,7 +241,6 @@ 'lr': 5e-5, 'weight_decay': 0, 'n_epochs': 4, - 'uniform_over_groups':True, }, } From b64d924680a0a46860c29a5b0852c3b7ae4c1070 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 12:57:13 -0500 Subject: [PATCH 008/116] removed spurious print statements --- examples/train.py | 1 + wilds/datasets/sqf_dataset.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/train.py b/examples/train.py index ba29be90..2251f8d5 100644 --- a/examples/train.py +++ b/examples/train.py @@ -91,6 +91,7 @@ def train(algorithm, datasets, general_logger, config, epoch_offset, best_val_me # Then run val val_results = run_epoch(algorithm, datasets['val'], general_logger, epoch, config, train=False) + print(val_results) curr_val_metric = val_results[config.val_metric] general_logger.write(f'Validation {config.val_metric}: {curr_val_metric:.3f}\n') diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index e88586d0..3fca77cf 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -62,14 +62,12 @@ def __init__(self, root_dir, download, split_scheme): # Load data data_df = pd.read_csv(os.path.join(self.data_dir, 'sqf.csv') , index_col=0) data_df = data_df[data_df['suspected.crime'] == 'cpw'] - print('!!!!!', data_df.shape) categories = ['black', 'white 
hispanic', 'black hispanic', 'hispanic', 'white'] data_df = data_df.loc[data_df['suspect.race'].map(lambda x: x in categories)] data_df['suspect.race'] = data_df['suspect.race'].map(lambda x: 'Hispanic' if 'hispanic' in x else x.title()) # Only track weapons stops data_df = data_df[data_df['suspected.crime']=='cpw'] - print(data_df.shape) # Get district features if measuring race, don't if measuring boroughs self.feats_to_use = self.get_split_features(data_df.columns) From 0f0dd2d0fac4197c79bc482cf5abb792a967ccd9 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 13:07:13 -0500 Subject: [PATCH 009/116] Fixed dataset label --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index a8d1e521..a3150e0e 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -231,7 +231,7 @@ 'model_kwargs': {'in_features': 104}, 'loss_function': 'cross_entropy', 'groupby_fields': ['y'], - 'val_metric': 'precision_at_global_recall__all', + 'val_metric': 'precision_at_global_recall_all', 'val_metric_decreasing': False, 'algo_log_metric': 'accuracy', 'optimizer': 'Adam', From 543f9ccfecf39ee4d7c32d421feed5d651640ff5 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Mon, 8 Feb 2021 13:10:11 -0500 Subject: [PATCH 010/116] Removed debug print --- examples/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/train.py b/examples/train.py index 2251f8d5..ba29be90 100644 --- a/examples/train.py +++ b/examples/train.py @@ -91,7 +91,6 @@ def train(algorithm, datasets, general_logger, config, epoch_offset, best_val_me # Then run val val_results = run_epoch(algorithm, datasets['val'], general_logger, epoch, config, train=False) - print(val_results) curr_val_metric = val_results[config.val_metric] general_logger.write(f'Validation {config.val_metric}: {curr_val_metric:.3f}\n') From f6dcc1d1248f0b9a190c61ed6060a3e4a3f3160d Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Mon, 8 Feb 2021 14:09:59 -0800 Subject: [PATCH 011/116] add py150 --- examples/algorithms/deepCORAL.py | 15 +++- examples/algorithms/initializer.py | 2 + examples/configs/datasets.py | 18 ++++ examples/configs/model.py | 5 ++ examples/configs/supported.py | 9 +- examples/models/code_gpt.py | 35 ++++++++ examples/models/initializer.py | 10 +++ examples/optimizer.py | 2 +- examples/train.py | 12 ++- wilds/common/metrics/loss.py | 11 ++- wilds/common/metrics/metric.py | 4 +- wilds/datasets/py150_dataset.py | 129 +++++++++++++++++++++++++++++ 12 files changed, 240 insertions(+), 12 deletions(-) create mode 100644 examples/models/code_gpt.py create mode 100644 wilds/datasets/py150_dataset.py diff --git a/examples/algorithms/deepCORAL.py b/examples/algorithms/deepCORAL.py index 7069d127..70d3287a 100644 --- a/examples/algorithms/deepCORAL.py +++ b/examples/algorithms/deepCORAL.py @@ -27,9 +27,14 @@ def __init__(self, config, d_out, grouper, loss, metric, n_train_steps): assert config.uniform_over_groups assert config.distinct_groups # initialize models - featurizer = initialize_model(config, d_out=None).to(config.device) - classifier = torch.nn.Linear(featurizer.d_out, d_out).to(config.device) - model = torch.nn.Sequential(featurizer, classifier).to(config.device) + if config.model == 'code-gpt-py': #in case of pre-trained language model (`classifier` is also pre-trained) + model = initialize_model(config, d_out=None).to(config.device) + featurizer = model.transformer + classifier = model.lm_head + else: + 
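# Patches 005 and 009 pair up: WILDS reports a metric's aggregate value under
# the key f'{metric.name}_all' (assumption: Metric.agg_metric_field in
# wilds/common/metrics/metric.py follows that convention). With the name
# trimmed to 'precision_at_global_recall', the config's val_metric must read
# 'precision_at_global_recall_all' rather than the earlier double-underscore
# 'precision_at_global_recall__all':

name = 'precision_at_global_recall'
val_metric = f'{name}_all'        # -> 'precision_at_global_recall_all'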
featurizer = initialize_model(config, d_out=None).to(config.device) + classifier = torch.nn.Linear(featurizer.d_out, d_out).to(config.device) + model = torch.nn.Sequential(featurizer, classifier).to(config.device) # initialize module super().__init__( config=config, @@ -48,6 +53,10 @@ def __init__(self, config, d_out, grouper, loss, metric, n_train_steps): self.classifier = classifier def coral_penalty(self, x, y): + if x.dim() == 3: #in case of language model [batch_size, seqlen, d_out] + x = x.view(-1, x.size(-1)) + y = y.view(-1, y.size(-1)) + mean_x = x.mean(0, keepdim=True) mean_y = y.mean(0, keepdim=True) cent_x = x - mean_x diff --git a/examples/algorithms/initializer.py b/examples/algorithms/initializer.py index 9c6a5444..00748cfc 100644 --- a/examples/algorithms/initializer.py +++ b/examples/algorithms/initializer.py @@ -14,6 +14,8 @@ def initialize_algorithm(config, datasets, train_grouper): if (train_dataset.is_classification) and (train_dataset.y_size == 1): # For single-task classification, we have one output per class d_out = train_dataset.n_classes + elif (train_dataset.is_classification) and (train_dataset.y_size is None): + d_out = train_dataset.n_classes elif (train_dataset.is_classification) and (train_dataset.y_size > 1) and (train_dataset.n_classes == 2): # For multi-task binary classification (each output is the logit for each binary class) d_out = train_dataset.y_size diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 1d15c7af..3990901e 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -157,6 +157,24 @@ 'coral_penalty_weight': 0.1, 'no_group_logging': True, }, + 'py150': { + 'split_scheme': 'official', + 'model': 'code-gpt-py', + 'loss_function': 'lm_cross_entropy', + 'val_metric': 'acc', + 'val_metric_decreasing': False, + 'optimizer': 'AdamW', + 'optimizer_kwargs': {'eps':1e-8}, + 'lr': 8e-5, + 'weight_decay': 0.01, + 'n_epochs': 5, + 'batch_size': 40, + 'groupby_fields': ['repo',], + 'n_groups_per_batch': 10, + 'irm_lambda': 1., + 'coral_penalty_weight': 0.1, + 'no_group_logging': True, + }, 'poverty': { 'split_scheme': 'official', 'dataset_kwargs': { diff --git a/examples/configs/model.py b/examples/configs/model.py index 12a429a7..f70d24e1 100644 --- a/examples/configs/model.py +++ b/examples/configs/model.py @@ -4,6 +4,11 @@ 'max_grad_norm': 1.0, 'scheduler': 'linear_schedule_with_warmup', }, + 'code-gpt-py': { + 'optimizer': 'AdamW', + 'max_grad_norm': 1.0, + 'scheduler': 'linear_schedule_with_warmup', + }, 'densenet121': { 'model_kwargs':{ 'pretrained':True, diff --git a/examples/configs/supported.py b/examples/configs/supported.py index bcbe54a9..6cbdf1cb 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -13,8 +13,9 @@ from wilds.datasets.poverty_dataset import PovertyMapDataset from wilds.datasets.waterbirds_dataset import WaterbirdsDataset from wilds.datasets.yelp_dataset import YelpDataset +from wilds.datasets.py150_dataset import Py150Dataset # metrics -from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss +from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss, lm_cross_entropy_loss from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE datasets = { @@ -29,10 +30,12 @@ 'poverty': PovertyMapDataset, 'fmow': FMoWDataset, 'bdd100k': BDD100KDataset, + 'py150': Py150Dataset, } losses = { 'cross_entropy': ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')), + 'lm_cross_entropy': 
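# coral_penalty above flattens [batch, seqlen, d_out] language-model features
# to 2-D before matching the two groups' feature statistics. A self-contained
# sketch, assuming the unshown remainder of the method follows the standard
# deep CORAL objective (mean- and covariance-distance penalty):

import torch

def coral_penalty(x, y):
    if x.dim() == 3:                          # [batch, seqlen, d] -> [batch*seqlen, d]
        x = x.view(-1, x.size(-1))
        y = y.view(-1, y.size(-1))
    mean_x, mean_y = x.mean(0, keepdim=True), y.mean(0, keepdim=True)
    cent_x, cent_y = x - mean_x, y - mean_y
    cova_x = cent_x.t() @ cent_x / (x.size(0) - 1)
    cova_y = cent_y.t() @ cent_y / (y.size(0) - 1)
    return (mean_x - mean_y).pow(2).mean() + (cova_x - cova_y).pow(2).mean()

penalty = coral_penalty(torch.randn(4, 8, 16), torch.randn(4, 8, 16))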
ElementwiseLoss(loss_fn=lm_cross_entropy_loss), 'mse': MSE(name='loss'), 'multitask_bce': MultiTaskLoss(loss_fn=nn.BCEWithLogitsLoss(reduction='none')), } @@ -46,8 +49,8 @@ # see initialize_*() functions for correspondence transforms = ['bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train'] -models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', - 'logistic_regression'] +models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', + 'logistic_regression', 'code-gpt-py'] algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM'] optimizers = ['SGD', 'Adam', 'AdamW'] schedulers = ['linear_schedule_with_warmup', 'ReduceLROnPlateau', 'StepLR'] diff --git a/examples/models/code_gpt.py b/examples/models/code_gpt.py new file mode 100644 index 00000000..a85ef064 --- /dev/null +++ b/examples/models/code_gpt.py @@ -0,0 +1,35 @@ +from transformers import GPT2LMHeadModel, GPT2Model +import torch + +class GPT2LMHeadLogit(GPT2LMHeadModel): + def __init__(self, config): + super().__init__(config) + self.d_out = config.vocab_size + + def __call__(self, x): + outputs = super().__call__(x) + logits = outputs[0] #[batch_size, seqlen, vocab_size] + return logits + + +class GPT2Featurizer(GPT2Model): + def __init__(self, config): + super().__init__(config) + self.d_out = config.n_embd + + def __call__(self, x): + outputs = super().__call__(x) + hidden_states = outputs[0] #[batch_size, seqlen, n_embd] + return hidden_states + + +class GPT2FeaturizerLMHeadLogit(GPT2LMHeadModel): + def __init__(self, config): + super().__init__(config) + self.d_out = config.vocab_size + self.transformer = GPT2Featurizer(config) + + def __call__(self, x): + hidden_states = self.transformer(x) #[batch_size, seqlen, n_embd] + logits = self.lm_head(hidden_states) #[batch_size, seqlen, vocab_size] + return logits diff --git a/examples/models/initializer.py b/examples/models/initializer.py index cea5ebfc..6aa72272 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -4,6 +4,8 @@ from models.resnet_multispectral import ResNet18 from models.layers import Identity from models.gnn import GINVirtual +from models.code_gpt import GPT2LMHeadLogit, GPT2FeaturizerLMHeadLogit +from transformers import GPT2Tokenizer def initialize_model(config, d_out): if config.model == 'resnet18_ms': @@ -22,6 +24,14 @@ def initialize_model(config, d_out): config.model, num_labels=d_out, **config.model_kwargs) + elif config.model == 'code-gpt-py': + name = 'microsoft/CodeGPT-small-py' + if d_out is None: + model = GPT2FeaturizerLMHeadLogit.from_pretrained(name) + else: + model = GPT2LMHeadLogit.from_pretrained(name) + tokenizer = GPT2Tokenizer.from_pretrained(name) + model.resize_token_embeddings(len(tokenizer)) elif config.model == 'logistic_regression': model = nn.Linear(out_features=d_out, **config.model_kwargs) elif config.model == 'gin-virtual': diff --git a/examples/optimizer.py b/examples/optimizer.py index a31777ff..96fcffd3 100644 --- a/examples/optimizer.py +++ b/examples/optimizer.py @@ -13,7 +13,7 @@ def initialize_optimizer(config, model): weight_decay=config.weight_decay, **config.optimizer_kwargs) elif config.optimizer=='AdamW': - assert config.model.startswith('bert'), "Only BERT supported for AdamW" + assert config.model.startswith('bert') or 'gpt' in config.model, "Only BERT/GPT supported for AdamW" no_decay = ['bias', 'LayerNorm.weight'] params = [ {'params': [p for n, p in 
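# Usage sketch for the featurizer/head split defined in code_gpt.py: the same
# pretrained weights serve either as a whole LM (token logits) or as a
# featurizer plus linear head, which is what deepCORAL needs. This mirrors
# models/initializer.py; downloading the checkpoint is assumed to work:

import torch
from transformers import GPT2Tokenizer
from models.code_gpt import GPT2FeaturizerLMHeadLogit

name = 'microsoft/CodeGPT-small-py'
tokenizer = GPT2Tokenizer.from_pretrained(name)
model = GPT2FeaturizerLMHeadLogit.from_pretrained(name)
model.resize_token_embeddings(len(tokenizer))

x = tokenizer('def f(x):', return_tensors='pt')['input_ids']   # [1, seqlen]
hidden = model.transformer(x)        # [1, seqlen, n_embd]  - featurizer stage
logits = model.lm_head(hidden)       # [1, seqlen, vocab]   - linear head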
model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay}, diff --git a/examples/train.py b/examples/train.py index ba29be90..f597e401 100644 --- a/examples/train.py +++ b/examples/train.py @@ -49,7 +49,11 @@ def run_epoch(algorithm, dataset, general_logger, epoch, config, train): # The subsequent detach is just for safety # (they should already be detached in batch_results) epoch_y_true.append(batch_results['y_true'].clone().detach()) - epoch_y_pred.append(batch_results['y_pred'].clone().detach()) + if batch_results['y_pred'].dim() == 3: + #language model preds have a very big vocab size (e.g. 50000), so need to do argmax here. otherwise get OOM + epoch_y_pred.append(batch_results['y_pred'].clone().detach().argmax(-1)) + else: + epoch_y_pred.append(batch_results['y_pred'].clone().detach()) epoch_metadata.append(batch_results['metadata'].clone().detach()) if train and (batch_idx+1) % config.log_every==0: @@ -135,7 +139,11 @@ def evaluate(algorithm, datasets, epoch, general_logger, config): for batch in iterator: batch_results = algorithm.evaluate(batch) epoch_y_true.append(batch_results['y_true'].clone().detach()) - epoch_y_pred.append(batch_results['y_pred'].clone().detach()) + if batch_results['y_pred'].dim() == 3: + #language model preds have a very big vocab size (e.g. 50000), so need to do argmax here. otherwise get OOM + epoch_y_pred.append(batch_results['y_pred'].clone().detach().argmax(-1)) + else: + epoch_y_pred.append(batch_results['y_pred'].clone().detach()) epoch_metadata.append(batch_results['metadata'].clone().detach()) results, results_str = dataset['dataset'].eval( diff --git a/wilds/common/metrics/loss.py b/wilds/common/metrics/loss.py index 40df9b0b..9d8f8a27 100644 --- a/wilds/common/metrics/loss.py +++ b/wilds/common/metrics/loss.py @@ -29,7 +29,7 @@ def worst(self, metrics): - worst_metric (float): Worst-case metric """ return maximum(metrics) - + class ElementwiseLoss(ElementwiseMetric): def __init__(self, loss_fn, name=None): self.loss_fn = loss_fn @@ -82,3 +82,12 @@ def worst(self, metrics): """ return maximum(metrics) + +def lm_cross_entropy_loss(input, target): + """ + Cross entropy loss for language model head (input's dimenstionality is 3) + input: [batch_size, seqlen, vocab_size] + target: [batch_size, seqlen] + """ + loss_fn = torch.nn.CrossEntropyLoss(reduction='none') + return loss_fn(input.transpose(1,2), target) #[batch_size, seqlen] diff --git a/wilds/common/metrics/metric.py b/wilds/common/metrics/metric.py index 4c3e8440..39985afd 100644 --- a/wilds/common/metrics/metric.py +++ b/wilds/common/metrics/metric.py @@ -135,7 +135,7 @@ def _compute_group_wise(self, y_pred, y_true, g, n_groups): y_true[g == group_idx])) group_metrics = torch.stack(group_metrics) worst_group_metric = self.worst(group_metrics[group_counts>0]) - + return group_metrics, group_counts, worst_group_metric class ElementwiseMetric(Metric): @@ -203,7 +203,7 @@ def compute_element_wise(self, y_pred, y_true, return_dict=True): """ element_wise_metrics = self._compute_element_wise(y_pred, y_true) batch_size = y_pred.size()[0] - assert element_wise_metrics.dim()==1 and element_wise_metrics.numel()==batch_size + assert (element_wise_metrics.dim()==1 and element_wise_metrics.numel()==batch_size) or (element_wise_metrics.dim()==2 and element_wise_metrics.size(0)==batch_size) if return_dict: return {self.name: element_wise_metrics} diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py new file mode 100644 index 
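# Why the transpose in lm_cross_entropy_loss: nn.CrossEntropyLoss expects the
# class dimension second ([batch, vocab, seqlen]), while the LM head emits
# [batch, seqlen, vocab]. Shape check with invented sizes:

import torch

batch, seqlen, vocab = 2, 5, 11
logits = torch.randn(batch, seqlen, vocab)
target = torch.randint(vocab, (batch, seqlen))
loss = torch.nn.CrossEntropyLoss(reduction='none')(logits.transpose(1, 2), target)
assert loss.shape == (batch, seqlen)   # one loss value per token position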
00000000..efde4764 --- /dev/null +++ b/wilds/datasets/py150_dataset.py @@ -0,0 +1,129 @@ +from pathlib import Path +import os + +import pandas as pd +import numpy as np +import torch +import json +import gc + +from wilds.datasets.wilds_dataset import WILDSDataset +from transformers import GPT2Tokenizer + +class Py150Dataset(WILDSDataset): + """ + The Py150 dataset. + This is a modified version of the original Py150 dataset. + Input (x): + A Python code snippet (a sequence of tokens) + Label (y): + A sequence of next tokens (shifted x) + Metadata: + Each image is annotated with the original GitHub repo id and file name + Website: + https://www.sri.inf.ethz.ch/py150 + https://github.com/microsoft/CodeXGLUE + Original publication: + @article{raychev2016probabilistic, + title={Probabilistic model for code with decision trees}, + author={Raychev, Veselin and Bielik, Pavol and Vechev, Martin}, + journal={ACM SIGPLAN Notices}, + year={2016}, + } + @article{CodeXGLUE, + title={CodeXGLUE: A Benchmark Dataset and Open Challenge for Code Intelligence}, + year={2020}, + } + License: + This dataset is distributed under the MIT license. + """ + + def __init__(self, root_dir='data', download=False, split_scheme='official'): + + self._dataset_name = 'py150' + self._version = '1.0' + self._split_scheme = split_scheme + if self._split_scheme != 'official': + raise ValueError(f'Split scheme {self._split_scheme} not recognized') + + # path + self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x45343bd9e1c64acfbcb4a22a76302994/contents/blob/' + self._data_dir = Path(self.initialize_data_dir(root_dir, download)) + + # Load data + df = self._load_all_data() + + # Splits + data = {} + self._split_dict = {'train': 0, 'val': 1, 'test': 2, 'IDval': 3, 'IDtest': 4} + self._split_names = {'train': 'Train', 'val': 'Validation (OOD)', + 'test': 'Test (OOD)', 'IDval': 'Validation (ID)', + 'IDtest': 'Test (ID)'} + + df['split_id'] = df['split'].apply(lambda x: self._split_dict[x]) + self._split_array = df['split_id'].values + + # Input + self._input_array = torch.tensor(list(df['input'].apply(lambda x: x[:-1]).values)) #[n_samples, seqlen-1] + + # Labels + name = 'microsoft/CodeGPT-small-py' + tokenizer = GPT2Tokenizer.from_pretrained(name) + self._n_classes = len(tokenizer) + self._y_array = torch.tensor(list(df['input'].apply(lambda x: x[1:]).values)) + self._y_size = None + + _repo = torch.tensor(df['repo'].values).reshape(-1,1) #[n_samples, 1] + _mask = torch.tensor(list(df['mask'].apply(lambda x: x[1:]).values)) #[n_samples, seqlen-1] + self._metadata_array = _repo + self._metadata_fields = ['repo'] + + self._y_array = self._y_array * _mask + torch.full(self._y_array.size(), -100) * (1-_mask) + + super().__init__(root_dir, download, split_scheme) + + def eval(self, y_pred, y_true, metadata): + #y_pred: [n_samples, seqlen-1] + #y_true: [n_samples, seqlen-1] + mask = (y_true != -100).long() + assert y_pred.size() == mask.size() == y_true.size(), (y_pred.size(), y_true.size(), mask.size()) + acc = ((y_pred==y_true)*mask).float().sum() / (mask.float().sum() +1e-8) + + results = {'acc': acc} + results_str = f"Average acc: {results['acc']:.3f}\n" + return results, results_str + + def get_input(self, idx): + """ + Args: + - idx (int): Index of a data point + Output: + - x (Tensor): Input features of the idx-th data point + """ + return self._input_array[idx] + + + def _load_all_data(self): + def fname2repo_id(fname, repo_name2id): + return repo_name2id['/'.join(fname.split('/')[:2])] + + _df = 
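# The input/label construction in Py150Dataset is the standard next-token
# objective: position t of the input predicts token t + 1, so inputs are
# x[:-1] and labels x[1:]. Toy illustration:

tokens = [10, 11, 12, 13]      # one tokenized snippet
x = tokens[:-1]                # [10, 11, 12] -> model input
y = tokens[1:]                 # [11, 12, 13] -> labels, aligned position-wise
# masked (padding) positions are later excluded from the loss and metrics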
pd.read_csv(self._data_dir/'metadata/repo_file_names/repo_ids.csv') + repo_name2id = {repo_name: id for id, repo_name in zip(_df.id, _df.repo_name)} + + dfs = [] + pad_token_id = 1 + for type in ['train', 'IDval', 'OODval', 'IDtest', 'OODtest']: + inputs = json.load(open(self._data_dir/f'processed/{type}_input.json')) + fnames = open(self._data_dir/f'metadata/repo_file_names/{type}.txt').readlines() + repo_ids = [fname2repo_id(fname, repo_name2id) for fname in fnames] + splits = [type.replace('OOD','')] * len(inputs) + if type == 'train': + masks = (np.array(inputs) != pad_token_id).astype(int).tolist() + else: + masks = json.load(open(self._data_dir/f'processed/{type}_input_mask.json')) + assert len(repo_ids) == len(inputs) == len(masks) + + _df = pd.DataFrame({'input':inputs, 'mask': masks, 'repo': repo_ids, 'split': splits}) + dfs.append(_df) + + return pd.concat(dfs) From 59a3dd6eeb71f03f5d525854a0aeb7a385a93ba4 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 8 Feb 2021 15:19:03 -0800 Subject: [PATCH 012/116] dev branch --- wilds/common/metrics/metric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wilds/common/metrics/metric.py b/wilds/common/metrics/metric.py index 4c3e8440..9c4372b0 100644 --- a/wilds/common/metrics/metric.py +++ b/wilds/common/metrics/metric.py @@ -135,7 +135,7 @@ def _compute_group_wise(self, y_pred, y_true, g, n_groups): y_true[g == group_idx])) group_metrics = torch.stack(group_metrics) worst_group_metric = self.worst(group_metrics[group_counts>0]) - + return group_metrics, group_counts, worst_group_metric class ElementwiseMetric(Metric): @@ -212,7 +212,7 @@ def compute_element_wise(self, y_pred, y_true, return_dict=True): def compute_flattened(self, y_pred, y_true, return_dict=True): flattened_metrics = self.compute_element_wise(y_pred, y_true, return_dict=False) - index = torch.arange(y_true.numel()) + index = torch.arange(y_true.numel()) if return_dict: return {self.name: flattened_metrics, 'index': index} else: From 3dd771bafa13c6e70658ce990625cff178d918cd Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Tue, 9 Feb 2021 12:42:44 -0800 Subject: [PATCH 013/116] minor edits for readability --- wilds/datasets/sqf_dataset.py | 124 +++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 54 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index 3fca77cf..c0352495 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -10,22 +10,29 @@ class SQFDataset(WILDSDataset): """ - New york stop and frisk data. CPW (weapons stops) from 2009 - 2012, as orginally provided by the NYPD and later - cleaned by Goel, Rao, and Shroff 2016 https://projecteuclid.org/euclid.aoas/1458909920 . Shared with permission. - https://5harad.com/#research for the full dataset. + New York City stop-question-and-frisk data. + The dataset covers data from 2009 - 2012, as orginally provided by the New York Police Department (NYPD) and later cleaned by Goel, Rao, and Shroff, 2016. Supported `split_scheme`: 'black', 'all_race', 'bronx', or 'all_borough' Input (x): - Either 29 one-hot pre-stop observable features or 104=29 observables + 75 one-hot district indicators + For the 'black' and 'all_race' split schemes: + 29 pre-stop observable features + + 75 one-hot district indicators = 104 features + + For the 'bronx' and 'all_borough' split schemes: + 29 pre-stop observable features. 
+ As these split schemes study location shifts, we remove the district + indicators here as they prevent generalizing to new locations. Label (y): - y is binary. It is 1 if the stop is listed as finding a weapon, 0 otherwise. + Binary. It is 1 if the stop is listed as finding a weapon, and 0 otherwise. Metadata: - Each stop is annotated with the borough the stop took place, the race of the stopped person, and whether the stop - took place in the early or later time periond + Each stop is annotated with the borough the stop took place, + the race of the stopped person, and whether the stop took + place in 2009-2010 or in 2011-2012 Website: NYPD - https://www1.nyc.gov/site/nypd/stats/reports-analysis/stopfrisk.page @@ -47,6 +54,10 @@ class SQFDataset(WILDSDataset): year = {2016}, pages = {365--394}, } + + License: + The original data frmo the NYPD is in the public domain. + The cleaned data from Goel, Rao, and Shroff is shared with permission. """ def __init__(self, root_dir, download, split_scheme): # set variables @@ -70,9 +81,10 @@ def __init__(self, root_dir, download, split_scheme): data_df = data_df[data_df['suspected.crime']=='cpw'] # Get district features if measuring race, don't if measuring boroughs - self.feats_to_use = self.get_split_features(data_df.columns) + self.feats_to_use = self.get_split_features(data_df.columns) - # Drop data that doesn't have the all of the predictive features. This preserves almost all rows. + # Drop rows that don't have all of the predictive features. + # This preserves almost all rows. data_df = data_df.dropna(subset=self.feats_to_use) # Get indices based on new index / after dropping rows with missing data @@ -85,23 +97,20 @@ def __init__(self, root_dir, download, split_scheme): data_df.index = range(data_df.shape[0]) train_idxs = range(0, len(train_idxs)) test_idxs = range(len(train_idxs), len(train_idxs)+ len(test_idxs)) - val_idxs = range(test_idxs[-1], data_df.shape[0] ) + val_idxs = range(test_idxs[-1], data_df.shape[0]) # Normalize continuous features data_df = self.normalize_data(data_df, train_idxs) self._input_array = data_df # Create split dictionaries - self.initialize_split_dicts() + self._split_dict, self._split_names = self.initialize_split_dicts() # Get whether a weapon was found for various groups self._y_array = torch.from_numpy(data_df['found.weapon'].values).long() # Metadata will be int dicts - self._identity_vars = [ 'suspect.race', 'borough', 'train.period'] - - explicit_identity_label_df, self._metadata_map = self.load_metadata(data_df) - + explicit_identity_label_df, self._metadata_map = self.load_metadata(data_df, ['suspect.race', 'borough', 'train.period']) self._metadata_array = torch.cat( ( torch.LongTensor(explicit_identity_label_df.values), @@ -111,11 +120,14 @@ def __init__(self, root_dir, download, split_scheme): ) self._metadata_fields = ['suspect race', 'borough', '2010 or earlier?'] + ['y'] - self.get_split_maps( data_df, train_idxs, test_idxs, val_idxs) - + self._split_array = self.get_split_maps(data_df, train_idxs, test_idxs, val_idxs) data_df = data_df[self.feats_to_use] - self._input_array = pd.get_dummies(data_df, columns=[i for i in self.feats_to_use if 'suspect.' not in i and - 'observation.period' not in i], drop_first=True) + self._input_array = pd.get_dummies( + data_df, + columns=[i for i in self.feats_to_use + if 'suspect.' 
not in i and 'observation.period' not in i], + drop_first=True) + # Recover relevant features after taking dummies new_feats = [] for i in self.feats_to_use: @@ -125,28 +137,28 @@ def __init__(self, root_dir, download, split_scheme): else: pass self._input_array = self._input_array[new_feats] - self.initialize_eval_grouper() + self._eval_grouper = self.initialize_eval_grouper() - - def load_metadata(self, data_df): - metadata_df = data_df[self._identity_vars].copy() + def load_metadata(self, data_df, identity_vars): + metadata_df = data_df[identity_vars].copy() metadata_names = ['suspect race', 'borough', '2010 or earlier?'] metadata_ordered_maps = {} for col_name, meta_name in zip(metadata_df.columns, metadata_names): - col_order = sorted(set(metadata_df[col_name])) col_dict = dict(zip(col_order, range(len(col_order)))) metadata_ordered_maps[col_name] = col_order metadata_df[meta_name] = metadata_df[col_name].map(col_dict) return metadata_df[metadata_names], metadata_ordered_maps - def get_split_indices(self, data_df): """Finds splits based on the split type """ - test_idxs = data_df[data_df.year > 2010].index.tolist() + test_idxs = data_df[data_df.year > 2010].index.tolist() train_df = data_df[data_df.year <= 2010] - validation_id_idxs = subsample_idxs(train_df.index.tolist(), num=int(train_df.shape[0] * 0.2), seed=2851, - take_rest=False) + validation_id_idxs = subsample_idxs( + train_df.index.tolist(), + num=int(train_df.shape[0] * 0.2), + seed=2851, + take_rest=False) train_df = train_df[~train_df.index.isin(validation_id_idxs)] @@ -175,18 +187,15 @@ def indices_to_dict(self, indices, int_val): local_idx_dict[i] = int_val return local_idx_dict - def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): + def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): """Using the existing split indices, create a map to put entries to training and validation sets. 
Set class var.""" index_dict = {} for arg, idx_set in enumerate([train_idxs, test_idxs, val_idxs]): index_dict.update(self.indices_to_dict(idx_set, arg)) - index_accumulator = [] - for index, sample in data_df.iterrows(): index_accumulator.append(index_dict[index]) - - self._split_array = np.array(index_accumulator) + return np.array(index_accumulator) def get_split_features(self, columns): """Get features that include precinct if we're splitting on race or don't include if we're using borough splits.""" @@ -212,38 +221,46 @@ def normalize_data(self, df, train_idxs): df[feature_name] = df[feature_name] / np.std(df_unnormed_train[feature_name]) return df - def initialize_split_dicts(self): """Identify split indices and name splits""" + split_dict = {'train': 0, 'test': 1, 'val':2} if 'all_borough' == self.split_scheme : - self._split_dict = {'train': 0, 'test': 1, 'val':2} - self._split_names = {'train': 'All Boroughs 2009 & 10 subsampled to match Bronx train set size', 'test':'All Stops 2010 & 11', \ - 'val':'20% sample of all stops 2009 & 10'} + split_names = { + 'train': 'Stops in 2009 & 2010, subsampled to match Bronx train set size', + 'test': 'All stops in 2011 & 2012', + 'val': '20% sample of all stops 2009 & 2010' + } elif 'bronx' == self.split_scheme: - self._split_dict = {'train': 0, 'test': 1, 'val': 2} - self._split_names = {'train': 'Bronx 2009 & 10', 'test': 'All Stops 2010 & 11', \ - 'val': '20% sample of all stops 2009 & 10'} + split_names = { + 'train': 'Bronx stops in 2009 & 2010', + 'test': 'All stops in 2011 & 2012', + 'val': '20% sample of all stops 2009 & 2010' + } elif 'black' == self.split_scheme: - self._split_dict = {'train': 0, 'test': 1, 'val':2} - self._split_names = {'train': 'train: 80% Black Stops 2009 and 2010', 'test':'Test: All Stops 2011 and 2012. ', \ - 'val':'20% sample of all stops 2009 & 10'} - elif 'all_race' == self.split_scheme : - self._split_dict = {'train': 0, 'test': 1, 'val':2} - self._split_names = {'train': 'train: Stops 2009 and 2010 subsampled to the size of Black people training set', 'test':'Test: All Stops 2011 and 2012. 
', \ - 'val':'20% sample of all stops 2009 & 10'} + split_names = { + 'train': '80% Black Stops 2009 and 2010', + 'test': 'All stops in 2011 & 2012', + 'val': '20% sample of all stops 2009 & 2010' + } + elif 'all_race' == self.split_scheme: + split_names = { + 'train': 'Stops in 2009 & 2010, subsampled to match Black people train set size', + 'test': 'All stops in 2011 & 2012', + 'val': '20% sample of all stops 2009 & 2010' + } else: raise ValueError(f'Split scheme {self.split_scheme} not recognized') - + return split_dict, split_names def get_input(self, idx): return torch.FloatTensor(self._input_array.loc[idx].values) def eval(self, y_pred, y_true, metadata): - """Evaluate the precision achieve overall and across groups for a given global recall""" + """Evaluate the precision achieved overall and across groups for a given global recall""" g = self._eval_grouper.metadata_to_group(metadata) y_scores = F.softmax(y_pred, dim=1)[:,1] - threshold_60 = threshold_at_recall(y_scores, y_true) + threshold_60 = threshold_at_recall(y_scores, y_true, global_recall=60) results = Accuracy().compute(y_pred, y_true) results.update(PrecisionAtRecall(threshold_60).compute(y_pred, y_true)) results.update(Accuracy().compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) @@ -259,15 +276,14 @@ def eval(self, y_pred, y_true, metadata): def initialize_eval_grouper(self): if 'black' in self.split_scheme or 'race' in self.split_scheme : - self._eval_grouper = CombinatorialGrouper( + eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields = ['suspect race'] ) elif 'bronx' in self.split_scheme or 'all_borough' == self.split_scheme: - self._eval_grouper = CombinatorialGrouper( + eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields = ['borough']) else: raise ValueError(f'Split scheme {self.split_scheme} not recognized') - - + return eval_grouper From 7571cfa326ba3326671378f5b6d8a3ad6aa1dd43 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Tue, 9 Feb 2021 12:47:48 -0800 Subject: [PATCH 014/116] docstring edit --- wilds/datasets/sqf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index c0352495..35217fd6 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -188,7 +188,7 @@ def indices_to_dict(self, indices, int_val): return local_idx_dict def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): - """Using the existing split indices, create a map to put entries to training and validation sets. 
Set class var.""" + """Using the existing split indices, create a map to put entries to training and validation sets, and returns split_array.""" index_dict = {} for arg, idx_set in enumerate([train_idxs, test_idxs, val_idxs]): index_dict.update(self.indices_to_dict(idx_set, arg)) From 010fab1acc76db0a96fad53cfd65cc61aeda5a87 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Sat, 13 Feb 2021 17:11:38 -0500 Subject: [PATCH 015/116] simplified index assignment --- wilds/datasets/sqf_dataset.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index 3fca77cf..e37c1c54 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -127,7 +127,6 @@ def __init__(self, root_dir, download, split_scheme): self._input_array = self._input_array[new_feats] self.initialize_eval_grouper() - def load_metadata(self, data_df): metadata_df = data_df[self._identity_vars].copy() metadata_names = ['suspect race', 'borough', '2010 or earlier?'] @@ -176,17 +175,11 @@ def indices_to_dict(self, indices, int_val): return local_idx_dict def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): - """Using the existing split indices, create a map to put entries to training and validation sets. Set class var.""" - index_dict = {} - for arg, idx_set in enumerate([train_idxs, test_idxs, val_idxs]): - index_dict.update(self.indices_to_dict(idx_set, arg)) - - index_accumulator = [] - - for index, sample in data_df.iterrows(): - index_accumulator.append(index_dict[index]) - - self._split_array = np.array(index_accumulator) + """Using the existing split indices, create a map to put entries to training and validation sets. """ + self._split_array = np.zeros(data_df.shape[0]) + self._split_array[train_idxs] = 0 + self._split_array[test_idxs] = 1 + self._split_array[val_idxs] = 2 def get_split_features(self, columns): """Get features that include precinct if we're splitting on race or don't include if we're using borough splits.""" From e8e6eacf78373c2ad59e2bdb9ac22639a4eb6173 Mon Sep 17 00:00:00 2001 From: rlphilli Date: Sat, 13 Feb 2021 17:12:08 -0500 Subject: [PATCH 016/116] removed unused func --- wilds/datasets/sqf_dataset.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index e37c1c54..630b59c9 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -168,12 +168,6 @@ def get_split_indices(self, data_df): return train_idxs, test_idxs, validation_id_idxs - def indices_to_dict(self, indices, int_val): - local_idx_dict = {} - for i in indices: - local_idx_dict[i] = int_val - return local_idx_dict - def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): """Using the existing split indices, create a map to put entries to training and validation sets. 
""" self._split_array = np.zeros(data_df.shape[0]) From 00d213988929733c8d69bc173483c7bd0acbb59b Mon Sep 17 00:00:00 2001 From: rlphilli Date: Sat, 13 Feb 2021 18:18:45 -0500 Subject: [PATCH 017/116] mergefix --- wilds/datasets/sqf_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index dc40d6f6..e911b62e 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -183,10 +183,11 @@ def get_split_indices(self, data_df): def get_split_maps(self, data_df, train_idxs, test_idxs, val_idxs): """Using the existing split indices, create a map to put entries to training and validation sets. """ - self._split_array = np.zeros(data_df.shape[0]) - self._split_array[train_idxs] = 0 - self._split_array[test_idxs] = 1 - self._split_array[val_idxs] = 2 + split_array = np.zeros(data_df.shape[0]) + split_array[train_idxs] = 0 + split_array[test_idxs] = 1 + split_array[val_idxs] = 2 + return split_array def get_split_features(self, columns): """Get features that include precinct if we're splitting on race or don't include if we're using borough splits.""" From 21ab71ac0b1d0482d511bc77e37d3d87c5ef9b45 Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Tue, 16 Feb 2021 19:37:33 -0800 Subject: [PATCH 018/116] py150 update to multitaskmetric --- examples/configs/supported.py | 4 ++-- wilds/common/metrics/loss.py | 12 ++---------- wilds/common/metrics/metric.py | 2 +- wilds/datasets/py150_dataset.py | 27 ++++++++++++++++++--------- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/examples/configs/supported.py b/examples/configs/supported.py index 6cbdf1cb..771a78b7 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -15,7 +15,7 @@ from wilds.datasets.yelp_dataset import YelpDataset from wilds.datasets.py150_dataset import Py150Dataset # metrics -from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss, lm_cross_entropy_loss +from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE datasets = { @@ -35,7 +35,7 @@ losses = { 'cross_entropy': ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')), - 'lm_cross_entropy': ElementwiseLoss(loss_fn=lm_cross_entropy_loss), + 'lm_cross_entropy': MultiTaskLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')), 'mse': MSE(name='loss'), 'multitask_bce': MultiTaskLoss(loss_fn=nn.BCEWithLogitsLoss(reduction='none')), } diff --git a/wilds/common/metrics/loss.py b/wilds/common/metrics/loss.py index 9d8f8a27..4d2aa1ad 100644 --- a/wilds/common/metrics/loss.py +++ b/wilds/common/metrics/loss.py @@ -69,6 +69,8 @@ def _compute_flattened(self, flattened_y_pred, flattened_y_true): if isinstance(self.loss_fn, torch.nn.BCEWithLogitsLoss): flattened_y_pred = flattened_y_pred.float() flattened_y_true = flattened_y_true.float() + elif isinstance(self.loss_fn, torch.nn.CrossEntropyLoss): + flattened_y_true = flattened_y_true.long() flattened_loss = self.loss_fn(flattened_y_pred, flattened_y_true) return flattened_loss @@ -81,13 +83,3 @@ def worst(self, metrics): - worst_metric (float): Worst-case metric """ return maximum(metrics) - - -def lm_cross_entropy_loss(input, target): - """ - Cross entropy loss for language model head (input's dimenstionality is 3) - input: [batch_size, seqlen, vocab_size] - target: [batch_size, seqlen] - """ - loss_fn = torch.nn.CrossEntropyLoss(reduction='none') - return 
loss_fn(input.transpose(1,2), target) #[batch_size, seqlen] diff --git a/wilds/common/metrics/metric.py b/wilds/common/metrics/metric.py index 39985afd..4e07bc53 100644 --- a/wilds/common/metrics/metric.py +++ b/wilds/common/metrics/metric.py @@ -203,7 +203,7 @@ def compute_element_wise(self, y_pred, y_true, return_dict=True): """ element_wise_metrics = self._compute_element_wise(y_pred, y_true) batch_size = y_pred.size()[0] - assert (element_wise_metrics.dim()==1 and element_wise_metrics.numel()==batch_size) or (element_wise_metrics.dim()==2 and element_wise_metrics.size(0)==batch_size) + assert element_wise_metrics.dim()==1 and element_wise_metrics.numel()==batch_size if return_dict: return {self.name: element_wise_metrics} diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index efde4764..5a362426 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -6,7 +6,7 @@ import torch import json import gc - +from wilds.common.metrics.all_metrics import Accuracy from wilds.datasets.wilds_dataset import WILDSDataset from transformers import GPT2Tokenizer @@ -55,10 +55,10 @@ def __init__(self, root_dir='data', download=False, split_scheme='official'): # Splits data = {} - self._split_dict = {'train': 0, 'val': 1, 'test': 2, 'IDval': 3, 'IDtest': 4} + self._split_dict = {'train': 0, 'val': 1, 'test': 2, 'id_val': 3, 'id_test': 4} self._split_names = {'train': 'Train', 'val': 'Validation (OOD)', - 'test': 'Test (OOD)', 'IDval': 'Validation (ID)', - 'IDtest': 'Test (ID)'} + 'test': 'Test (OOD)', 'id_val': 'Validation (ID)', + 'id_test': 'Test (ID)'} df['split_id'] = df['split'].apply(lambda x: self._split_dict[x]) self._split_array = df['split_id'].values @@ -78,16 +78,20 @@ def __init__(self, root_dir='data', download=False, split_scheme='official'): self._metadata_array = _repo self._metadata_fields = ['repo'] - self._y_array = self._y_array * _mask + torch.full(self._y_array.size(), -100) * (1-_mask) + self._y_array = self._y_array.float() + self._y_array[(1-_mask).bool()] = float('nan') + super().__init__(root_dir, download, split_scheme) def eval(self, y_pred, y_true, metadata): #y_pred: [n_samples, seqlen-1] #y_true: [n_samples, seqlen-1] - mask = (y_true != -100).long() - assert y_pred.size() == mask.size() == y_true.size(), (y_pred.size(), y_true.size(), mask.size()) - acc = ((y_pred==y_true)*mask).float().sum() / (mask.float().sum() +1e-8) + is_labeled = ~torch.isnan(y_true) + flattened_y_pred = y_pred[is_labeled] + flattened_y_true = y_true[is_labeled] + assert flattened_y_pred.size() == flattened_y_true.size() and flattened_y_pred.dim() == 1 + acc = (flattened_y_pred==flattened_y_true).float().sum() / (len(flattened_y_pred) +1e-8) results = {'acc': acc} results_str = f"Average acc: {results['acc']:.3f}\n" @@ -107,6 +111,11 @@ def _load_all_data(self): def fname2repo_id(fname, repo_name2id): return repo_name2id['/'.join(fname.split('/')[:2])] + def get_split_name(name): + if name.startswith('OOD'): return name.replace('OOD','') + if name.startswith('ID'): return name.replace('ID','id_') + return name + _df = pd.read_csv(self._data_dir/'metadata/repo_file_names/repo_ids.csv') repo_name2id = {repo_name: id for id, repo_name in zip(_df.id, _df.repo_name)} @@ -116,7 +125,7 @@ def fname2repo_id(fname, repo_name2id): inputs = json.load(open(self._data_dir/f'processed/{type}_input.json')) fnames = open(self._data_dir/f'metadata/repo_file_names/{type}.txt').readlines() repo_ids = [fname2repo_id(fname, repo_name2id) for fname in fnames] - 
splits = [type.replace('OOD','')] * len(inputs)
+            splits = [get_split_name(type)] * len(inputs)
             if type == 'train':
                 masks = (np.array(inputs) != pad_token_id).astype(int).tolist()
             else:

From d602ce287bc9dee5f8ed369dfe746847f9515e15 Mon Sep 17 00:00:00 2001
From: Anonymous
Date: Thu, 18 Feb 2021 18:05:45 -0800
Subject: [PATCH 019/116] iwildcam_v2.0

---
 dataset_preprocessing/iwildcam/create_split.py | 12 ++++++++----
 examples/configs/datasets.py                   |  2 +-
 wilds/datasets/iwildcam_dataset.py             |  6 +++---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py
index d181a7fd..85913ca7 100644
--- a/dataset_preprocessing/iwildcam/create_split.py
+++ b/dataset_preprocessing/iwildcam/create_split.py
@@ -65,6 +65,10 @@ def _create_split(data_dir, seed, skip=True):
         })

+    # Extract the date from the datetime.
+    df['datetime_obj'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'))
+    df['date'] = df['datetime_obj'].apply(lambda x: x.date())
+
     # Split by location to get the cis & trans validation set
     locations = np.unique(df['location'])
     n_locations = len(locations)
@@ -84,7 +88,7 @@ def _create_split(data_dir, seed, skip=True):
     # Split remaining samples by dates to get the cis validation and test set
     frac_validation = 0.05
     frac_test = 0.05
-    unique_dates = np.unique(remaining_df['datetime'])
+    unique_dates = np.unique(remaining_df['date'])
     n_dates = len(unique_dates)
     n_val_dates = int(n_dates * frac_validation)
     n_test_dates = int(n_dates * frac_test)
@@ -94,9 +98,9 @@ def _create_split(data_dir, seed, skip=True):
     train_dates, val_cis_dates = unique_dates[:n_train_dates], unique_dates[n_train_dates:(n_train_dates+n_val_dates)]
     test_cis_dates = unique_dates[(n_train_dates+n_val_dates):]

-    val_cis_df = remaining_df[remaining_df['datetime'].isin(val_cis_dates)]
-    test_cis_df = remaining_df[remaining_df['datetime'].isin(test_cis_dates)]
-    train_df = remaining_df[remaining_df['datetime'].isin(train_dates)]
+    val_cis_df = remaining_df[remaining_df['date'].isin(val_cis_dates)]
+    test_cis_df = remaining_df[remaining_df['date'].isin(test_cis_dates)]
+    train_df = remaining_df[remaining_df['date'].isin(train_dates)]

     # Locations in val_cis and test_cis but not in train are all moved to train set
     # since we want all locations in the cis splits to be in the train set. 
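For intuition about the create_split.py hunk above: np.unique over the raw `datetime` strings treats every timestamp as its own "date", so two captures from the same day could previously land in different cis splits. A minimal sketch of the distinction (toy timestamps; only the format string is taken from the patch):

    from datetime import datetime

    # Two captures from the same day: distinct datetimes, one calendar date.
    stamps = ['2013-06-05 02:00:00.000', '2013-06-05 14:30:00.000']
    parsed = [datetime.strptime(s, '%Y-%m-%d %H:%M:%S.%f') for s in stamps]
    print(len(set(parsed)), len({dt.date() for dt in parsed}))  # 2 1

Splitting on the extracted `date` therefore keeps same-day images in the same split.
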
diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 1d15c7af..46b9beb1 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -122,7 +122,7 @@ 'model_kwargs': {'pretrained': True}, 'train_transform': 'image_base', 'eval_transform': 'image_base', - 'target_resolution': (224, 224), + 'target_resolution': (448, 448), 'val_metric_decreasing': False, 'algo_log_metric': 'accuracy', 'model': 'resnet50', diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 011c1f1c..1a0b0c05 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -39,14 +39,14 @@ class IWildCamDataset(WILDSDataset): def __init__(self, root_dir='data', download=False, split_scheme='official'): self._dataset_name = 'iwildcam' - self._version = '1.0' + self._version = '2.0' self._split_scheme = split_scheme if self._split_scheme != 'official': raise ValueError(f'Split scheme {self._split_scheme} not recognized') # path - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/' - self._compressed_size = 90_094_666_806 + self._download_url = 'https://worksheets.codalab.org/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/' + self._compressed_size = 12_000_000_000 self._data_dir = Path(self.initialize_data_dir(root_dir, download)) # Load splits From ce79af58cbbf13bfc76c19f66e3c3af3ecfe5a52 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Thu, 18 Feb 2021 22:26:53 -0800 Subject: [PATCH 020/116] url fix --- wilds/datasets/iwildcam_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 1a0b0c05..8f460528 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -45,7 +45,7 @@ def __init__(self, root_dir='data', download=False, split_scheme='official'): raise ValueError(f'Split scheme {self._split_scheme} not recognized') # path - self._download_url = 'https://worksheets.codalab.org/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/' + self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/' self._compressed_size = 12_000_000_000 self._data_dir = Path(self.initialize_data_dir(root_dir, download)) From 7e20c6958da15ff21a3bc46cafd93c5f63e359c1 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Wed, 24 Feb 2021 12:56:50 -0800 Subject: [PATCH 021/116] sqf docstring --- wilds/datasets/sqf_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index e911b62e..b85f06ae 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -25,6 +25,9 @@ class SQFDataset(WILDSDataset): 29 pre-stop observable features. As these split schemes study location shifts, we remove the district indicators here as they prevent generalizing to new locations. + In order to run the example code with these split_schemes, + pass in the command-line parameter `--model_kwargs in_features=29` + to `examples/run_expt.py`. Label (y): Binary. It is 1 if the stop is listed as finding a weapon, and 0 otherwise. 
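As a usage sketch for the docstring added above (a hedged example, not part of the patch: it assumes the SQF data already sits under `data/` and that integer indexing survives preprocessing; the 29-feature expectation comes from the docstring's 29 pre-stop observables under the location splits):

    from wilds.datasets.sqf_dataset import SQFDataset

    # Location-shift schemes drop the 75 district indicators, leaving the 29
    # observables, hence `--model_kwargs in_features=29` for examples/run_expt.py.
    dataset = SQFDataset(root_dir='data', download=False, split_scheme='all_borough')
    x = dataset.get_input(0)   # FloatTensor of pre-stop features for one stop
    print(x.shape)             # expected: torch.Size([29])
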
From a6da1fea503985469ce053ad6b21f1d511ced172 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Wed, 24 Feb 2021 21:02:33 -0800 Subject: [PATCH 022/116] add datetime components as metadata --- .../iwildcam/create_split.py | 1 + wilds/datasets/iwildcam_dataset.py | 21 +++++++++++++++---- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py index 85913ca7..af13cfdd 100644 --- a/dataset_preprocessing/iwildcam/create_split.py +++ b/dataset_preprocessing/iwildcam/create_split.py @@ -1,3 +1,4 @@ +from datetime import datetime import pandas as pd import numpy as np diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 8f460528..bb0c17c1 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -1,3 +1,4 @@ +from datetime import datetime from pathlib import Path import os @@ -93,15 +94,27 @@ def __init__(self, root_dir='data', download=False, split_scheme='official'): df['group_id' ] = df['location'].apply(lambda x: location_to_group_id[x]) self._n_groups = n_groups - self._metadata_array = torch.tensor(np.stack([df['group_id'].values, self.y_array], axis=1)) - self._metadata_fields = ['location', 'y'] + + # Extract datetime subcomponents and include in metadata + df['datetime_obj'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')) + df['year'] = df['datetime_obj'].apply(lambda x: int(x.year)) + df['month'] = df['datetime_obj'].apply(lambda x: int(x.month)) + df['day'] = df['datetime_obj'].apply(lambda x: int(x.day)) + df['hour'] = df['datetime_obj'].apply(lambda x: int(x.hour)) + df['minute'] = df['datetime_obj'].apply(lambda x: int(x.minute)) + df['second'] = df['datetime_obj'].apply(lambda x: int(x.second)) + + self._metadata_array = torch.tensor(np.stack([df['group_id'].values, + df['year'].values, df['month'].values, df['day'].values, + df['hour'].values, df['minute'].values, df['second'].values, + self.y_array], axis=1)) + self._metadata_fields = ['location', 'year', 'month', 'day', 'hour', 'minute', 'second', 'y'] # eval grouper self._eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields=(['location'])) - self._metrics = [Accuracy(), Recall(average='macro'), Recall(average='weighted'), - F1(average='macro'), F1(average='weighted')] + self._metrics = [Accuracy(), Recall(average='macro'), F1(average='macro')] super().__init__(root_dir, download, split_scheme) def eval(self, y_pred, y_true, metadata): From 2cd1b47ba044ee029cf02a4bd8e5c81cb7f616d3 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Wed, 24 Feb 2021 21:47:33 -0800 Subject: [PATCH 023/116] metric fix --- wilds/datasets/iwildcam_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index bb0c17c1..4f338b79 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -128,9 +128,7 @@ def eval(self, y_pred, y_true, metadata): results_str = ( f"Average acc: {results[self._metrics[0].agg_metric_field]:.3f}\n" f"Recall macro: {results[self._metrics[1].agg_metric_field]:.3f}\n" - f"Recall weighted: {results[self._metrics[2].agg_metric_field]:.3f}\n" - f"F1 macro: {results[self._metrics[3].agg_metric_field]:.3f}\n" - f"F1 weighted: {results[self._metrics[4].agg_metric_field]:.3f}\n" + f"F1 macro: {results[self._metrics[2].agg_metric_field]:.3f}\n" ) return results, results_str From 
b7a1a7addcdff8349d7099c164bfbf5cdf56c9a0 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Wed, 24 Feb 2021 22:17:49 -0800 Subject: [PATCH 024/116] efficientnet --- examples/configs/model.py | 9 +++++++++ examples/configs/supported.py | 4 ++-- examples/models/initializer.py | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/examples/configs/model.py b/examples/configs/model.py index 12a429a7..28f077bf 100644 --- a/examples/configs/model.py +++ b/examples/configs/model.py @@ -26,5 +26,14 @@ 'resnet18_ms': { 'target_resolution': (224, 224), }, + 'efficientnet-b0': { + 'target_resolution': (224, 224), + }, + 'efficientnet-b1': { + 'target_resolution': (224, 224), + }, + 'efficientnet-b2': { + 'target_resolution': (224, 224), + }, 'logistic_regression': {}, } diff --git a/examples/configs/supported.py b/examples/configs/supported.py index bcbe54a9..82085de0 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -46,8 +46,8 @@ # see initialize_*() functions for correspondence transforms = ['bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train'] -models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', - 'logistic_regression'] +models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', + 'logistic_regression', 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2'] algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM'] optimizers = ['SGD', 'Adam', 'AdamW'] schedulers = ['linear_schedule_with_warmup', 'ReduceLROnPlateau', 'StepLR'] diff --git a/examples/models/initializer.py b/examples/models/initializer.py index cea5ebfc..de501175 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -2,6 +2,7 @@ import torchvision from models.bert import BertClassifier, BertFeaturizer from models.resnet_multispectral import ResNet18 +from efficientnet_pytorch import EfficientNet from models.layers import Identity from models.gnn import GINVirtual @@ -26,6 +27,8 @@ def initialize_model(config, d_out): model = nn.Linear(out_features=d_out, **config.model_kwargs) elif config.model == 'gin-virtual': model = GINVirtual(num_tasks=d_out, **config.model_kwargs) + elif config.model in ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2'): + model = EfficientNet.from_pretrained(config.model) else: raise ValueError('Model not recognized.') return model From 1052ebac301edb7f8c781107a4e78be0f6aadb91 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Wed, 24 Feb 2021 22:35:57 -0800 Subject: [PATCH 025/116] updated default resolutions --- examples/configs/model.py | 12 ++++++++++-- examples/configs/supported.py | 6 ++++-- examples/models/initializer.py | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/configs/model.py b/examples/configs/model.py index 28f077bf..e744bf0e 100644 --- a/examples/configs/model.py +++ b/examples/configs/model.py @@ -1,3 +1,5 @@ + + model_defaults = { 'bert-base-uncased': { 'optimizer': 'AdamW', @@ -30,10 +32,16 @@ 'target_resolution': (224, 224), }, 'efficientnet-b1': { - 'target_resolution': (224, 224), + 'target_resolution': (240, 240), }, 'efficientnet-b2': { - 'target_resolution': (224, 224), + 'target_resolution': (260, 260), + }, + 'efficientnet-b3': { + 'target_resolution': (300, 300), + }, + 'efficientnet-b4': { + 'target_resolution': (380, 380), }, 'logistic_regression': {}, } diff --git a/examples/configs/supported.py 
b/examples/configs/supported.py index 82085de0..3e74924b 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -46,8 +46,10 @@ # see initialize_*() functions for correspondence transforms = ['bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train'] -models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', - 'logistic_regression', 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2'] +models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', + 'densenet121', 'bert-base-uncased', 'gin-virtual', + 'logistic_regression', + 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3', 'efficientnet-b4'] algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM'] optimizers = ['SGD', 'Adam', 'AdamW'] schedulers = ['linear_schedule_with_warmup', 'ReduceLROnPlateau', 'StepLR'] diff --git a/examples/models/initializer.py b/examples/models/initializer.py index de501175..32c5baab 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -27,7 +27,7 @@ def initialize_model(config, d_out): model = nn.Linear(out_features=d_out, **config.model_kwargs) elif config.model == 'gin-virtual': model = GINVirtual(num_tasks=d_out, **config.model_kwargs) - elif config.model in ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2'): + elif config.model in ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3', 'efficientnet-b4'): model = EfficientNet.from_pretrained(config.model) else: raise ValueError('Model not recognized.') From 7e02d985d04b8befe253db81c05998ba9423ef99 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Thu, 25 Feb 2021 00:15:44 -0800 Subject: [PATCH 026/116] update create split new proportions --- dataset_preprocessing/iwildcam/create_split.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py index af13cfdd..53d97ffe 100644 --- a/dataset_preprocessing/iwildcam/create_split.py +++ b/dataset_preprocessing/iwildcam/create_split.py @@ -1,4 +1,7 @@ from datetime import datetime +from pathlib import Path +import argparse +import json import pandas as pd import numpy as np @@ -44,7 +47,6 @@ def create_split(data_dir): def _create_split(data_dir, seed, skip=True): - data_dir = Path(data_dir) np_rng = np.random.default_rng(seed) # Load Kaggle train data @@ -87,8 +89,8 @@ def _create_split(data_dir, seed, skip=True): test_trans_df = df[df['location'].isin(test_trans_locations)] # Split remaining samples by dates to get the cis validation and test set - frac_validation = 0.05 - frac_test = 0.05 + frac_validation = 0.08 + frac_test = 0.06 unique_dates = np.unique(remaining_df['date']) n_dates = len(unique_dates) n_val_dates = int(n_dates * frac_validation) @@ -157,3 +159,12 @@ def check_overlap(df1, df2): n_intersection = len(intersection) return False if n_intersection == 0 else True + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', type=str) + args = parser.parse_args() + + create_split(Path(args.data_dir)) From 0e47e4556d5a87d699c54a3dc6a72a8f0bdaff2f Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Thu, 25 Feb 2021 00:38:59 -0800 Subject: [PATCH 027/116] split ratio update --- dataset_preprocessing/iwildcam/create_split.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dataset_preprocessing/iwildcam/create_split.py 
b/dataset_preprocessing/iwildcam/create_split.py
index 53d97ffe..bc7840c4 100644
--- a/dataset_preprocessing/iwildcam/create_split.py
+++ b/dataset_preprocessing/iwildcam/create_split.py
@@ -90,7 +90,7 @@ def _create_split(data_dir, seed, skip=True):

     # Split remaining samples by dates to get the cis validation and test set
     frac_validation = 0.08
-    frac_test = 0.06
+    frac_test = 0.07
     unique_dates = np.unique(remaining_df['date'])
     n_dates = len(unique_dates)
     n_val_dates = int(n_dates * frac_validation)
@@ -142,6 +142,11 @@ def _create_split(data_dir, seed, skip=True):

     for df in [val_cis_df, val_trans_df, test_cis_df, test_trans_df]:
         assert not check_overlap(train_df, df)
+
+    print("val cis df : ", len(val_cis_df))
+    print("test cis df : ", len(test_cis_df))
+    print("train df : ", len(train_df))
+
     return train_df, val_cis_df, val_trans_df, test_cis_df, test_trans_df

 def remove(dfs):

From d19fcfee8da18693d8818ec6d0da723bdeb5312e Mon Sep 17 00:00:00 2001
From: Henrik Marklund
Date: Thu, 25 Feb 2021 11:32:25 -0800
Subject: [PATCH 028/116] remove location 485 from dataset

---
 dataset_preprocessing/iwildcam/create_split.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py
index bc7840c4..268f898e 100644
--- a/dataset_preprocessing/iwildcam/create_split.py
+++ b/dataset_preprocessing/iwildcam/create_split.py
@@ -6,8 +6,9 @@
 import pandas as pd
 import numpy as np

-# Examples to skip due to e.g them missing, loading issues
-LOCATIONS_TO_SKIP = [537]
+# For more info see https://www.kaggle.com/c/iwildcam-2020-fgvc7/discussion/135200
+# 485 had multiple images from indoors, and just a few were actually from out in the wild.
+LOCATIONS_TO_SKIP = [537, 485]

 CANNOT_OPEN = ['99136aa6-21bc-11ea-a13a-137349068a90.jpg',
                '87022118-21bc-11ea-a13a-137349068a90.jpg',

From a7bbbb07d54fe3dba607ff53049b630b42861eef Mon Sep 17 00:00:00 2001
From: Pang Wei Koh
Date: Thu, 25 Feb 2021 13:27:40 -0800
Subject: [PATCH 029/116] Refactored downloads; support for multiple dataset
 versions; and download script

---
 examples/configs/datasets.py            |   2 +-
 examples/configs/supported.py           |   9 ++
 examples/download_datasets.py           |  32 +++++
 examples/run_expt.py                    |   8 +-
 wilds/datasets/amazon_dataset.py        |  14 +-
 wilds/datasets/bdd100k_dataset.py       |  30 +++--
 wilds/datasets/camelyon17_dataset.py    |  13 +-
 wilds/datasets/celebA_dataset.py        |  13 +-
 wilds/datasets/civilcomments_dataset.py |  13 +-
 wilds/datasets/fmow_dataset.py          |  11 +-
 wilds/datasets/iwildcam_dataset.py      |  19 ++-
 wilds/datasets/ogbmolpcba_dataset.py    |  12 +-
 wilds/datasets/poverty_dataset.py       |  17 ++-
 wilds/datasets/waterbirds_dataset.py    |  12 +-
 wilds/datasets/wilds_dataset.py         | 172 +++++++++++------------
 wilds/datasets/yelp_dataset.py          |  15 ++-
 16 files changed, 233 insertions(+), 159 deletions(-)
 create mode 100644 examples/download_datasets.py

diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py
index 46b9beb1..4ed1c412 100644
--- a/examples/configs/datasets.py
+++ b/examples/configs/datasets.py
@@ -122,7 +122,7 @@
         'model_kwargs': {'pretrained': True},
         'train_transform': 'image_base',
         'eval_transform': 'image_base',
-        'target_resolution': (448, 448),
+        # 'target_resolution': (448, 448),
         'val_metric_decreasing': False,
         'algo_log_metric': 'accuracy',
         'model': 'resnet50',
diff --git a/examples/configs/supported.py b/examples/configs/supported.py
index 3e74924b..c69e187e 100644
--- a/examples/configs/supported.py
+++ b/examples/configs/supported.py
@@
-17,6 +17,15 @@ from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE +benchmark_datasets = [ + 'amazon', + 'camelyon17', + 'civilcomments', + 'iwildcam', + 'ogb-molpcba', + 'poverty', + 'fmow'] + datasets = { 'amazon': AmazonDataset, 'camelyon17': Camelyon17Dataset, diff --git a/examples/download_datasets.py b/examples/download_datasets.py new file mode 100644 index 00000000..2688ef70 --- /dev/null +++ b/examples/download_datasets.py @@ -0,0 +1,32 @@ +import os, sys +import argparse +import configs.supported as supported + +def main(): + """ + Downloads the latest versions of all specified datasets, + if they do not already exist. + """ + parser = argparse.ArgumentParser() + parser.add_argument('--root_dir', required=True, + help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).') + parser.add_argument('--datasets', nargs='*', default=None, + help=f'Specify a space-separated list of dataset names to download. If left unspecified, the script will download all of the official benchmark datasets. Available choices are {list(supported.datasets.keys())}.') + config = parser.parse_args() + + if config.datasets is None: + config.datasets = supported.benchmark_datasets + + for dataset in config.datasets: + if dataset not in supported.datasets: + raise ValueError(f'{dataset} not recognized; must be one of {list(supported.datasets.keys())}.') + + print(f'Downloading the following datasets: {config.datasets}') + for dataset in config.datasets: + print(f'=== {dataset} ===') + constructor = supported.datasets[dataset] + constructor(root_dir=config.root_dir, download=True) + + +if __name__=='__main__': + main() diff --git a/examples/run_expt.py b/examples/run_expt.py index 166df04f..8a0b11e7 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -53,7 +53,7 @@ def main(): # Transforms parser.add_argument('--train_transform', choices=supported.transforms) parser.add_argument('--eval_transform', choices=supported.transforms) - parser.add_argument('--target_resolution', nargs='+', type=int, help='target resolution. for example --target_resolution 224 224 for standard resnet.') + parser.add_argument('--target_resolution', nargs='+', type=int, help='The input resolution that images will be resized to before being passed into the model. 
For example, use --target_resolution 224 224 for a standard ResNet.') parser.add_argument('--resize_scale', type=float) parser.add_argument('--max_token_length', type=int) @@ -193,7 +193,7 @@ def main(): datasets[split]['split'] = split datasets[split]['name'] = full_dataset.split_names[split] datasets[split]['verbose'] = verbose - # Loggers + # Loggers datasets[split]['eval_logger'] = BatchLogger( os.path.join(config.log_dir, f'{split}_eval.csv'), mode=mode, use_wandb=(config.use_wandb and verbose)) @@ -204,7 +204,8 @@ def main(): initialize_wandb(config) # Logging dataset info - if config.no_group_logging and full_dataset.is_classification and full_dataset.y_size==1: + # Show class breakdown if feasible + if config.no_group_logging and full_dataset.is_classification and full_dataset.y_size==1 and full_dataset.n_classes <= 10: log_grouper = CombinatorialGrouper( dataset=full_dataset, groupby_fields=['y']) @@ -244,7 +245,6 @@ def main(): epoch_offset=0 best_val_metric=None - train( algorithm=algorithm, datasets=datasets, diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index 518beee2..abe1def9 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -50,12 +50,14 @@ class AmazonDataset(WILDSDataset): License: None. However, the original authors request that the data be used for research purposes only. """ - def __init__(self, root_dir='data', download=False, split_scheme='official'): - # set variables - self._dataset_name = 'amazon' - self._version = '1.0' - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x60237058e01749cda7b0701c2bd01420/contents/blob/' - self._compressed_size = 4_066_541_568 + _dataset_name = 'amazon' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x60237058e01749cda7b0701c2bd01420/contents/blob/', + 'compressed_size': 4_066_541_568}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + self._version = version # the official split is the user split if split_scheme=='official': split_scheme = 'user' diff --git a/wilds/datasets/bdd100k_dataset.py b/wilds/datasets/bdd100k_dataset.py index 0b97df31..31d4f7b7 100644 --- a/wilds/datasets/bdd100k_dataset.py +++ b/wilds/datasets/bdd100k_dataset.py @@ -45,18 +45,18 @@ class BDD100KDataset(WILDSDataset): License (original text): Copyright ©2018. The Regents of the University of California (Regents). All Rights Reserved. - Permission to use, copy, modify, and distribute this software and its documentation for educational, research, and - not-for-profit purposes, without fee and without a signed licensing agreement; and permission use, copy, modify and - distribute this software for commercial purposes (such rights not subject to transfer) to BDD member and its affiliates, - is hereby granted, provided that the above copyright notice, this paragraph and the following two paragraphs appear in - all copies, modifications, and distributions. 
Contact The Office of Technology Licensing, UC Berkeley, 2150 Shattuck + Permission to use, copy, modify, and distribute this software and its documentation for educational, research, and + not-for-profit purposes, without fee and without a signed licensing agreement; and permission use, copy, modify and + distribute this software for commercial purposes (such rights not subject to transfer) to BDD member and its affiliates, + is hereby granted, provided that the above copyright notice, this paragraph and the following two paragraphs appear in + all copies, modifications, and distributions. Contact The Office of Technology Licensing, UC Berkeley, 2150 Shattuck Avenue, Suite 510, Berkeley, CA 94720-1620, (510) 643-7201, otl@berkeley.edu, http://ipira.berkeley.edu/industry-info for commercial licensing opportunities. - IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, - INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN ADVISED + IN NO EVENT SHALL REGENTS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, + INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF REGENTS HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY - AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS PROVIDED + REGENTS SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE AND ACCOMPANYING DOCUMENTATION, IF ANY, PROVIDED HEREUNDER IS PROVIDED "AS IS". REGENTS HAS NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
""" @@ -65,11 +65,15 @@ class BDD100KDataset(WILDSDataset): TIMEOFDAY_SPLITS = ['daytime', 'night', 'dawn/dusk', 'undefined'] LOCATION_SPLITS = ['New York', 'California'] - def __init__(self, root_dir='data', download=False, split_scheme='official'): - self._dataset_name = 'bdd100k' - self._version = '1.0' + _dataset_name = 'bdd100k' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x0ac62ae89a644676a57fa61d6aa2f87d/contents/blob/', + 'compressed_size': None}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + self._version = version self._original_resolution = (1280, 720) - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x0ac62ae89a644676a57fa61d6aa2f87d/contents/blob/' self._data_dir = self.initialize_data_dir(root_dir, download) self.root = Path(self.data_dir) diff --git a/wilds/datasets/camelyon17_dataset.py b/wilds/datasets/camelyon17_dataset.py index 0a76f615..f62216db 100644 --- a/wilds/datasets/camelyon17_dataset.py +++ b/wilds/datasets/camelyon17_dataset.py @@ -45,11 +45,14 @@ class Camelyon17Dataset(WILDSDataset): https://creativecommons.org/publicdomain/zero/1.0/ """ - def __init__(self, root_dir='data', download=False, split_scheme='official'): - self._dataset_name = 'camelyon17' - self._version = '1.0' - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xe45e15f39fb54e9d9e919556af67aabe/contents/blob/' - self._compressed_size = 10_658_709_504 + _dataset_name = 'camelyon17' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xe45e15f39fb54e9d9e919556af67aabe/contents/blob/', + 'compressed_size': 10_658_709_504}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) self._original_resolution = (96,96) diff --git a/wilds/datasets/celebA_dataset.py b/wilds/datasets/celebA_dataset.py index 37b9ffd2..caa1a021 100644 --- a/wilds/datasets/celebA_dataset.py +++ b/wilds/datasets/celebA_dataset.py @@ -51,11 +51,14 @@ class CelebADataset(WILDSDataset): It is available for non-commercial research purposes only. 
""" - - def __init__(self, root_dir='data', download=False, split_scheme='official'): - self._dataset_name = 'celebA' - self._version = '1.0' - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xa174edc9c11041869d11f98d1dc19935/contents/blob/' + _dataset_name = 'celebA' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xa174edc9c11041869d11f98d1dc19935/contents/blob/', + 'compressed_size': None}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) target_name = 'Blond_Hair' confounder_names = ['Male'] diff --git a/wilds/datasets/civilcomments_dataset.py b/wilds/datasets/civilcomments_dataset.py index 78fe9310..82c57adc 100644 --- a/wilds/datasets/civilcomments_dataset.py +++ b/wilds/datasets/civilcomments_dataset.py @@ -55,11 +55,14 @@ class CivilCommentsDataset(WILDSDataset): https://creativecommons.org/publicdomain/zero/1.0/ """ - def __init__(self, root_dir='data', download=False, split_scheme='official'): - self._dataset_name = 'civilcomments' - self._version = '1.0' - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/' - self._compressed_size = 90_644_480 + _dataset_name = 'civilcomments' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/', + 'compressed_size': 90_644_480}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) # Read in metadata diff --git a/wilds/datasets/fmow_dataset.py b/wilds/datasets/fmow_dataset.py index 7c6e1814..82e6d7df 100644 --- a/wilds/datasets/fmow_dataset.py +++ b/wilds/datasets/fmow_dataset.py @@ -57,12 +57,13 @@ class FMoWDataset(WILDSDataset): """ _dataset_name = 'fmow' - _download_url = 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/' - _version = '1.0' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/', + 'compressed_size': 70_000_000_000}} - def __init__(self, root_dir='data', download=False, split_scheme='official', - oracle_training_set=False, seed=111, use_ood_val=False): - self._compressed_size = 70_000_000_000 + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official', oracle_training_set=False, seed=111, use_ood_val=False): + self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) self._split_dict = {'train': 0, 'id_val': 1, 'id_test': 2, 'val': 3, 'test': 4} diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 4f338b79..0ef13a7e 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -36,18 +36,23 @@ class IWildCamDataset(WILDSDataset): This dataset is distributed under Community Data License Agreement – Permissive – Version 1.0 https://cdla.io/permissive-1-0/ """ - - def __init__(self, root_dir='data', download=False, split_scheme='official'): - - self._dataset_name = 'iwildcam' - self._version = '2.0' + _dataset_name = 'iwildcam' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/', + 
'compressed_size': 90_094_666_806},
+        '2.0': {
+            'download_url': 'https://worksheets.codalab.org/rest/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/',
+            'compressed_size': 12_000_000_000}}
+
+    def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'):
+
+        self._version = version
         self._split_scheme = split_scheme
         if self._split_scheme != 'official':
             raise ValueError(f'Split scheme {self._split_scheme} not recognized')

         # path
-        self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/'
-        self._compressed_size = 12_000_000_000
         self._data_dir = Path(self.initialize_data_dir(root_dir, download))

diff --git a/wilds/datasets/ogbmolpcba_dataset.py b/wilds/datasets/ogbmolpcba_dataset.py
index 38ddd4ab..0fbb8e10 100644
--- a/wilds/datasets/ogbmolpcba_dataset.py
+++ b/wilds/datasets/ogbmolpcba_dataset.py
@@ -51,12 +51,20 @@ class OGBPCBADataset(WILDSDataset):
         https://github.com/snap-stanford/ogb/blob/master/LICENSE
     """

-    def __init__(self, root_dir='data', download=False, split_scheme='official'):
+    _dataset_name = 'ogbg-molpcba'
+    _versions_dict = {
+        '1.0': {
+            'download_url': None,
+            'compressed_size': None}}
+
+    def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'):
+        self._version = version
+        if version is not None:
+            raise ValueError('Versioning for OGB-MolPCBA is handled through the OGB package. Please set version=None.')
         # internally call ogb package
         self.ogb_dataset = PygGraphPropPredDataset(name = 'ogbg-molpcba', root = root_dir)

         # set variables
-        self._dataset_name = 'ogbg-molpcba'
         self._data_dir = self.ogb_dataset.root
         if split_scheme=='official':
             split_scheme = 'scaffold'
diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py
index 889881c7..55e67a5c 100644
--- a/wilds/datasets/poverty_dataset.py
+++ b/wilds/datasets/poverty_dataset.py
@@ -142,13 +142,16 @@ class PovertyMapDataset(WILDSDataset):
     """
     _dataset_name = 'poverty'
-    _download_url = 'https://worksheets.codalab.org/rest/bundles/0x9a2add5219db4ebc89965d7f42719750/contents/blob/'
-    _version = '1.0'
-
-    def __init__(self, root_dir='data', download=False, split_scheme='official',
-                 no_nl=True, fold='A', oracle_training_set=False, use_ood_val=False):
-
-        self._compressed_size = 18_630_656_000
+    _versions_dict = {
+        '1.0': {
+            'download_url': 'https://worksheets.codalab.org/rest/bundles/0x9a2add5219db4ebc89965d7f42719750/contents/blob/',
+            'compressed_size': 18_630_656_000}}
+
+    def __init__(self, version=None, root_dir='data', download=False,
+                 split_scheme='official',
+                 no_nl=False, fold='A', oracle_training_set=False,
+                 use_ood_val=True):
+        self._version = version
         self._data_dir = self.initialize_data_dir(root_dir, download)

         self._split_dict = {'train': 0, 'id_val': 1, 'id_test': 2, 'val': 3, 'test': 4}
diff --git a/wilds/datasets/waterbirds_dataset.py b/wilds/datasets/waterbirds_dataset.py
index d9e69349..1fb2c561 100644
--- a/wilds/datasets/waterbirds_dataset.py
+++ b/wilds/datasets/waterbirds_dataset.py
@@ -53,10 +53,14 @@ class WaterbirdsDataset(WILDSDataset):
         The use of this dataset is restricted to non-commercial research and educational purposes. 
""" - def __init__(self, root_dir='data', download=False, split_scheme='official'): - self._dataset_name = 'waterbirds' - self._version = '1.0' - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x505056d5cdea4e4eaa0e242cbfe2daa4/contents/blob/' + _dataset_name = 'waterbirds' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x505056d5cdea4e4eaa0e242cbfe2daa4/contents/blob/', + 'compressed_size': None}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) if not os.path.exists(self.data_dir): diff --git a/wilds/datasets/wilds_dataset.py b/wilds/datasets/wilds_dataset.py index ab149bac..47e00dc6 100644 --- a/wilds/datasets/wilds_dataset.py +++ b/wilds/datasets/wilds_dataset.py @@ -109,6 +109,24 @@ def check_init(self): if self.y_size == 1: assert 'y' in self.metadata_fields + @property + def latest_version(cls): + def is_later(u, v): + """Returns true if u is a later version than v.""" + u_major, u_minor = tuple(map(int, u.split('.'))) + v_major, v_minor = tuple(map(int, v.split('.'))) + if (u_major > v_major) or ( + (u_major == v_major) and (u_minor > v_minor)): + return True + else: + return False + + latest_version = '0.0' + for key in cls.versions_dict.keys(): + if is_later(key, latest_version): + latest_version = key + return latest_version + @property def dataset_name(self): """ @@ -121,16 +139,25 @@ def version(self): """ A string that identifies the dataset version, e.g., '1.0'. """ - return self._version + if self._version is None: + return self.latest_version + else: + return self._version @property - def download_url(self): + def versions_dict(self): """ - URL for downloading the dataset archive. + A dictionary where each key is a version string (e.g., '1.0') + and each value is a dictionary containing the 'download_url' and + 'compressed_size' keys. + + 'download_url' is the URL for downloading the dataset archive. If None, the dataset cannot be downloaded automatically (e.g., because it first requires accepting a usage agreement). + + 'compressed_size' is the approximate size of the compressed dataset in bytes. """ - return getattr(self, '_download_url', None) + return self._versions_dict @property def data_dir(self): @@ -256,13 +283,6 @@ def original_resolution(self): """ return getattr(self, '_original_resolution', None) - @property - def compressed_size(self): - """ - Size of the compressed bundle - """ - return getattr(self, '_compressed_size', None) - def initialize_data_dir(self, root_dir, download): """ Helper function for downloading/updating the dataset if required. @@ -271,102 +291,74 @@ def initialize_data_dir(self, root_dir, download): Datasets for which we don't control the download, like Yelp, might not handle versions similarly. """ + if self.version not in self.versions_dict: + raise ValueError(f'Version {self.version} not recognized. Must be in {self.versions_dict.keys()}.') + + download_url = self.versions_dict[self.version]['download_url'] + compressed_size = self.versions_dict[self.version]['compressed_size'] + os.makedirs(root_dir, exist_ok=True) data_dir = os.path.join(root_dir, f'{self.dataset_name}_v{self.version}') version_file = os.path.join(data_dir, f'RELEASE_v{self.version}.txt') current_major_version, current_minor_version = tuple(map(int, self.version.split('.'))) + # Check if we specified the latest version. Otherwise, print a warning. 
+ latest_major_version, latest_minor_version = tuple(map(int, self.latest_version.split('.'))) + if latest_major_version > current_major_version: + print( + f'*****************************\n' + f'{self.dataset_name} has been updated to version {self.latest_version}.\n' + f'You are currently using version {self.version}.\n' + f'We highly recommend updating the dataset.\n' + f'See https://wilds.stanford.edu/changelog for changes.\n' + f'*****************************\n') + elif latest_minor_version > current_minor_version: + print( + f'*****************************\n' + f'{self.dataset_name} has been updated to version {self.latest_version}.\n' + f'You are currently using version {self.version}.\n' + f'Please consider updating the dataset.\n' + f'See https://wilds.stanford.edu/changelog for changes.\n' + f'*****************************\n') + # If the data_dir exists and contains the right RELEASE file, # we assume the dataset is correctly set up if os.path.exists(data_dir) and os.path.exists(version_file): return data_dir - # If the data_dir exists and is not empty, and the download_url is set, + # If the data_dir exists and does not contain the right RELEASE file, but it is not empty and the download_url is not set, # we assume the dataset is correctly set up if ((os.path.exists(data_dir)) and (len(os.listdir(data_dir)) > 0) and - (self.download_url is None)): + (download_url is None)): return data_dir - # Otherwise, check if there's an older version of the dataset around - old_major_version, old_minor_version = -1, -1 - old_folders = [ - f for f in os.listdir(root_dir) if ( - os.path.isdir(os.path.join(root_dir, f)) and - f.startswith(self.dataset_name))] - for old_folder in old_folders: - prefix = f'{self.dataset_name}_v' - try: - version = old_folder.split(prefix)[1] - if os.path.exists( - os.path.join(root_dir, old_folder, f'RELEASE_v{version}.txt')): - major_version, minor_version = tuple(map(int, version.split('.'))) - if ((old_major_version < major_version) or - ((old_major_version == major_version) and - (old_minor_version < minor_version))): - old_major_version, old_minor_version = major_version, minor_version - latest_existing_data_dir = os.path.join(root_dir, old_folder) - except: - continue - - do_download = False - - # No existing dataset - if (old_major_version == -1): - if download == False: - if self.download_url is None: - raise FileNotFoundError(f'The {self.dataset_name} dataset could not be found in {data_dir}. {self.dataset_name} cannot be automatically downloaded. Please download it manually.') - else: - raise FileNotFoundError(f'The {self.dataset_name} dataset could not be found in {data_dir}. Initialize the dataset with download=True to download the dataset. If you are using the example script, run with --download. This might take some time for large datasets.') + # Otherwise, we assume the dataset needs to be downloaded. + # If download == False, then return an error. + if download == False: + if download_url is None: + raise FileNotFoundError(f'The {self.dataset_name} dataset could not be found in {data_dir}. {self.dataset_name} cannot be automatically downloaded. Please download it manually.') else: - do_download = True - - # Older major version: - # Prompt for update, ignore `download` flag - elif (old_major_version < current_major_version): - print( - '***********\n' - f'{self.dataset_name} has been updated to a new major version.\n' - f'We recommend updating the dataset.\n') - confirm = input(f'Will you update the dataset now? 
This might take some time for large datasets. (y/n)\n').lower() - if confirm == 'y': - do_download = True - - # Same major version, older minor version: - # Notify user but do not prompt unless `download` is set - elif ((old_major_version == current_major_version) and - (old_minor_version < current_minor_version)): - print( - '***********\n' - f'{self.dataset_name} has been updated to a new minor version.\n') - if download == False: - print( - 'Initialize the dataset with download=True to download the dataset. If you are using the example script, run with --download. This might take some time for large datasets.\n' - '***********\n') - else: - do_download = True - - # Download if necessary - if do_download == False: - data_dir = latest_existing_data_dir - else: - if self.download_url is None: - raise ValueError(f'Sorry, {self.dataset_name} cannot be automatically downloaded. Please download it manually.') - - from wilds.datasets.download_utils import download_and_extract_archive - print(f'Downloading dataset to {data_dir}...') - print(f'You can also download the dataset manually at https://wilds.stanford.edu/downloads.') - try: - download_and_extract_archive( - url=self.download_url, - download_root=data_dir, - filename='archive.tar.gz', - remove_finished=True, - size=self.compressed_size) - except Exception as e: - print(f"\n{os.path.join(data_dir, 'archive.tar.gz')} may be corrupted. Please try deleting it and rerunning this command.\n") - print(f"Exception: ", e) + raise FileNotFoundError(f'The {self.dataset_name} dataset could not be found in {data_dir}. Initialize the dataset with download=True to download the dataset. If you are using the example script, run with --download. This might take some time for large datasets.') + + # Otherwise, proceed with downloading. + if download_url is None: + raise ValueError(f'Sorry, {self.dataset_name} cannot be automatically downloaded. Please download it manually.') + + from wilds.datasets.download_utils import download_and_extract_archive + print(f'Downloading dataset to {data_dir}...') + print(f'You can also download the dataset manually at https://wilds.stanford.edu/downloads.') + try: + download_and_extract_archive( + url=download_url, + download_root=data_dir, + filename='archive.tar.gz', + remove_finished=True, + size=compressed_size) + except Exception as e: + print(f"\n{os.path.join(data_dir, 'archive.tar.gz')} may be corrupted. Please try deleting it and rerunning this command.\n") + print(f"Exception: ", e) return data_dir diff --git a/wilds/datasets/yelp_dataset.py b/wilds/datasets/yelp_dataset.py index 39923e8f..a5214e0f 100644 --- a/wilds/datasets/yelp_dataset.py +++ b/wilds/datasets/yelp_dataset.py @@ -41,12 +41,17 @@ class YelpDataset(WILDSDataset): License: Because of the Dataset License provided by Yelp, we are unable to redistribute the data. Please download the data through the website (https://www.yelp.com/dataset/download) by - agreeing to the Dataset License. + agreeing to the Dataset License. 
""" - def __init__(self, root_dir='data', download=False, split_scheme='official'): - # set variables - self._dataset_name = 'yelp' - self._version = '1.0' + _dataset_name = 'yelp' + _versions_dict = { + '1.0': { + 'download_url': None, + 'compressed_size': None}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + # set variables + self._version = version if split_scheme=='official': split_scheme = 'time' self._split_scheme = split_scheme From 696a40bb5ae70670b3285845f7b7feb44ff3ceec Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Thu, 25 Feb 2021 14:38:42 -0800 Subject: [PATCH 030/116] updated url --- examples/run_expt.py | 2 ++ examples/utils.py | 2 +- wilds/datasets/iwildcam_dataset.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/run_expt.py b/examples/run_expt.py index 8a0b11e7..1bde3100 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -35,6 +35,7 @@ def main(): help='If true, tries to downloads the dataset if it does not exist in root_dir.') parser.add_argument('--frac', type=float, default=1.0, help='Convenience parameter that scales all dataset splits down to the specified fraction, for development purposes.') + parser.add_argument('--version', default=None, type=str) # Loaders parser.add_argument('--loader_kwargs', nargs='*', action=ParseKwargs, default={}) @@ -134,6 +135,7 @@ def main(): # Data full_dataset = supported.datasets[config.dataset]( + version=config.version, root_dir=config.root_dir, download=config.download, split_scheme=config.split_scheme, diff --git a/examples/utils.py b/examples/utils.py index dcfcba3e..8a12f859 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -30,7 +30,7 @@ class ParseKwargs(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): setattr(namespace, self.dest, dict()) for value in values: - key, value_str = value.split('=') + key, value_str = value.split('=') if value_str.replace('-','').isnumeric(): processed_val = int(value_str) elif value_str.replace('-','').replace('.','').isnumeric(): diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 0ef13a7e..67f7e168 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -42,7 +42,7 @@ class IWildCamDataset(WILDSDataset): 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/', 'compressed_size': 90_094_666_806}, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/', + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x95b53cfe322f44a08b70cc638d946422/contents/blob/', 'compressed_size': 12_000_000_000}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): From 574e68b6cd019002c62bf54fdef1609efc17d1ba Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Thu, 25 Feb 2021 14:53:01 -0800 Subject: [PATCH 031/116] removing efficientnets due to slow pytorch implementation --- examples/configs/model.py | 17 ----------------- examples/configs/supported.py | 3 +-- examples/models/initializer.py | 5 +---- 3 files changed, 2 insertions(+), 23 deletions(-) diff --git a/examples/configs/model.py b/examples/configs/model.py index e744bf0e..12a429a7 100644 --- a/examples/configs/model.py +++ b/examples/configs/model.py @@ -1,5 +1,3 @@ - - model_defaults = { 'bert-base-uncased': { 'optimizer': 'AdamW', @@ -28,20 +26,5 @@ 'resnet18_ms': { 
'target_resolution': (224, 224), }, - 'efficientnet-b0': { - 'target_resolution': (224, 224), - }, - 'efficientnet-b1': { - 'target_resolution': (240, 240), - }, - 'efficientnet-b2': { - 'target_resolution': (260, 260), - }, - 'efficientnet-b3': { - 'target_resolution': (300, 300), - }, - 'efficientnet-b4': { - 'target_resolution': (380, 380), - }, 'logistic_regression': {}, } diff --git a/examples/configs/supported.py b/examples/configs/supported.py index c69e187e..999a233e 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -57,8 +57,7 @@ transforms = ['bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train'] models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', 'densenet121', 'bert-base-uncased', 'gin-virtual', - 'logistic_regression', - 'efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3', 'efficientnet-b4'] + 'logistic_regression'] algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM'] optimizers = ['SGD', 'Adam', 'AdamW'] schedulers = ['linear_schedule_with_warmup', 'ReduceLROnPlateau', 'StepLR'] diff --git a/examples/models/initializer.py b/examples/models/initializer.py index 32c5baab..7a0cb718 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -2,7 +2,6 @@ import torchvision from models.bert import BertClassifier, BertFeaturizer from models.resnet_multispectral import ResNet18 -from efficientnet_pytorch import EfficientNet from models.layers import Identity from models.gnn import GINVirtual @@ -26,9 +25,7 @@ def initialize_model(config, d_out): elif config.model == 'logistic_regression': model = nn.Linear(out_features=d_out, **config.model_kwargs) elif config.model == 'gin-virtual': - model = GINVirtual(num_tasks=d_out, **config.model_kwargs) - elif config.model in ('efficientnet-b0', 'efficientnet-b1', 'efficientnet-b2', 'efficientnet-b3', 'efficientnet-b4'): - model = EfficientNet.from_pretrained(config.model) + model = GINVirtual(num_tasks=d_out, **config.model_kwargs) else: raise ValueError('Model not recognized.') return model From 26adb15c13612de338ddd01c37864666e045d8c6 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Thu, 25 Feb 2021 15:15:32 -0800 Subject: [PATCH 032/116] iwildcam url update --- wilds/datasets/iwildcam_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 4f338b79..337b44f1 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -46,7 +46,7 @@ def __init__(self, root_dir='data', download=False, split_scheme='official'): raise ValueError(f'Split scheme {self._split_scheme} not recognized') # path - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xc7205ccf81d34247b68f34a40f54747f/contents/blob/' + self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x95b53cfe322f44a08b70cc638d946422/contents/blob/' self._compressed_size = 12_000_000_000 self._data_dir = Path(self.initialize_data_dir(root_dir, download)) From a47f571ae7a1d6717500981588cee826ca719714 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Thu, 25 Feb 2021 15:21:28 -0800 Subject: [PATCH 033/116] target res default change for iwildcam --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 4ed1c412..46b9beb1 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -122,7 +122,7 @@ 
'model_kwargs': {'pretrained': True}, 'train_transform': 'image_base', 'eval_transform': 'image_base', - # 'target_resolution': (448, 448), + 'target_resolution': (448, 448), 'val_metric_decreasing': False, 'algo_log_metric': 'accuracy', 'model': 'resnet50', From d5bc386127771985e6b1e8111762a6033256692d Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Thu, 25 Feb 2021 15:56:39 -0800 Subject: [PATCH 034/116] camelyon default changes: target res and resnet50 --- examples/configs/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 4af89a0c..b728f924 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -34,10 +34,11 @@ }, 'camelyon17': { 'split_scheme': 'official', - 'model': 'densenet121', + 'model': 'resnet50', 'model_kwargs': {'pretrained': False}, 'train_transform': 'image_base', 'eval_transform': 'image_base', + 'target_resolution': (96, 96), 'loss_function': 'cross_entropy', 'groupby_fields': ['hospital'], 'val_metric': 'acc_avg', From 072d839840fde8eceea384891feed822a708b7b6 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sat, 27 Feb 2021 09:57:23 -0800 Subject: [PATCH 035/116] fmow and povertymap v1.1 --- .../fmow/convert_npy_to_jpg.py | 28 ++++++++++++++++ dataset_preprocessing/poverty/split_npys.py | 25 +++++++++++++++ examples/configs/datasets.py | 6 +--- wilds/datasets/celebA_dataset.py | 2 +- wilds/datasets/fmow_dataset.py | 21 ++++++------ wilds/datasets/poverty_dataset.py | 32 +++++++------------ wilds/datasets/wilds_dataset.py | 2 +- 7 files changed, 79 insertions(+), 37 deletions(-) create mode 100644 dataset_preprocessing/fmow/convert_npy_to_jpg.py create mode 100644 dataset_preprocessing/poverty/split_npys.py diff --git a/dataset_preprocessing/fmow/convert_npy_to_jpg.py b/dataset_preprocessing/fmow/convert_npy_to_jpg.py new file mode 100644 index 00000000..c883198b --- /dev/null +++ b/dataset_preprocessing/fmow/convert_npy_to_jpg.py @@ -0,0 +1,28 @@ +import os, sys +import argparse +import numpy as np +from PIL import Image +from pathlib import Path +from tqdm import tqdm + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--root_dir', required=True, + help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).') + config = parser.parse_args() + data_dir = Path(config.root_dir) / 'fmow_v1.0' + image_dir = Path(config.root_dir) / 'fmow_v1.0_images_jpg' + os.makedirs(image_dir, exist_ok=True) + + img_counter = 0 + for chunk in tqdm(range(101)): + npy_chunk = np.load(data_dir / f'rgb_all_imgs_{chunk}.npy', mmap_mode='r') + for i in range(len(npy_chunk)): + npy_image = npy_chunk[i] + img = Image.fromarray(npy_image, mode='RGB') + img.save(image_dir / f'rgb_img_{img_counter}.jpg') + img_counter += 1 + +if __name__=='__main__': + main() diff --git a/dataset_preprocessing/poverty/split_npys.py b/dataset_preprocessing/poverty/split_npys.py new file mode 100644 index 00000000..4bf9f023 --- /dev/null +++ b/dataset_preprocessing/poverty/split_npys.py @@ -0,0 +1,25 @@ +import os, sys +import argparse +import numpy as np +from PIL import Image +from pathlib import Path +from tqdm import tqdm + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('--root_dir', required=True, + help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).') + config = parser.parse_args() + data_dir = Path(config.root_dir) / 'poverty_v1.0' + indiv_dir 
= Path(config.root_dir) / 'poverty_v1.0_indiv_npz' + os.makedirs(indiv_dir, exist_ok=True) + + f = np.load(data_dir / 'landsat_poverty_imgs.npy', mmap_mode='r') + f = f.transpose((0, 3, 1, 2)) + for i in tqdm(range(len(f))): + x = f[i] + np.savez_compressed(indiv_dir / f'landsat_poverty_img_{i}.npz', x=x) + +if __name__=='__main__': + main() diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index b728f924..e66a43f5 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -165,11 +165,7 @@ 'fold': 'A', 'oracle_training_set': False, 'use_ood_val': True - }, - 'loader_kwargs': { - 'num_workers': 1, - 'pin_memory': False, - }, + }, 'model': 'resnet18_ms', 'model_kwargs': {'num_channels': 8}, 'train_transform': 'poverty_train', diff --git a/wilds/datasets/celebA_dataset.py b/wilds/datasets/celebA_dataset.py index caa1a021..5422ed47 100644 --- a/wilds/datasets/celebA_dataset.py +++ b/wilds/datasets/celebA_dataset.py @@ -54,7 +54,7 @@ class CelebADataset(WILDSDataset): _dataset_name = 'celebA' _versions_dict = { '1.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xa174edc9c11041869d11f98d1dc19935/contents/blob/', + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xfe55077f5cd541f985ebf9ec50473293/contents/blob/', 'compressed_size': None}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): diff --git a/wilds/datasets/fmow_dataset.py b/wilds/datasets/fmow_dataset.py index 82e6d7df..ad97974c 100644 --- a/wilds/datasets/fmow_dataset.py +++ b/wilds/datasets/fmow_dataset.py @@ -17,6 +17,8 @@ from wilds.common.grouper import CombinatorialGrouper from wilds.datasets.wilds_dataset import WILDSDataset +import IPython + Image.MAX_IMAGE_PIXELS = 10000000000 @@ -60,7 +62,11 @@ class FMoWDataset(WILDSDataset): _versions_dict = { '1.0': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/', - 'compressed_size': 70_000_000_000}} + 'compressed_size': 70_000_000_000}, + '1.1': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xaec91eb7c9d548ebb15e1b5e60f966ab/contents/blob/', + 'compressed_size': 53_893_324_800} + } def __init__(self, version=None, root_dir='data', download=False, split_scheme='official', oracle_training_set=False, seed=111, use_ood_val=False): self._version = version @@ -172,14 +178,11 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' super().__init__(root_dir, download, split_scheme) def get_input(self, idx): - """ - Returns x for a given idx. - """ - idx = self.full_idxs[idx] - batch_idx = idx // self.chunk_size - within_batch_idx = idx % self.chunk_size - img_batch = np.load(self.root / f'rgb_all_imgs_{batch_idx}.npy', mmap_mode='r') - return img_batch[within_batch_idx] + """ + Returns x for a given idx. 
+ """ + img = Image.open(self.root / 'images' /f'rgb_img_{idx}.png').convert('RGB') + return img def eval(self, y_pred, y_true, metadata): # Overall evaluation + evaluate by year diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index 55e67a5c..0c121ea2 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -145,13 +145,16 @@ class PovertyMapDataset(WILDSDataset): _versions_dict = { '1.0': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x9a2add5219db4ebc89965d7f42719750/contents/blob/', - 'compressed_size': 18_630_656_000}} + 'compressed_size': 18_630_656_000}, + '1.1': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xfc0aa86ad9af4eb08c42dfc40eacf094/contents/blob/', + 'compressed_size': 13_091_823_616}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='official', no_nl=False, fold='A', oracle_training_set=False, use_ood_val=True): - self._version = version + self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) self._split_dict = {'train': 0, 'id_val': 1, 'id_test': 2, 'val': 3, 'test': 4} @@ -211,10 +214,6 @@ def __init__(self, version=None, root_dir='data', download=False, self._split_dict = {'train': 0, 'val': 1, 'id_test': 2, 'ood_val': 3, 'test': 4} self._split_names = {'train': 'Train', 'val': 'ID Val', 'id_test': 'ID Test', 'ood_val': 'OOD Val', 'test': 'OOD Test'} - - self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') - - self.imgs = self.imgs.transpose((0, 3, 1, 2)) self._y_array = torch.from_numpy(np.asarray(self.metadata['wealthpooled'])[:, np.newaxis]).float() self._y_size = 1 @@ -236,21 +235,12 @@ def __init__(self, version=None, root_dir='data', download=False, super().__init__(root_dir, download, split_scheme) def get_input(self, idx): - """ - Returns x for a given idx. - """ - img = self.imgs[idx].copy() - if self.no_nl: - img[-1] = 0 - img = torch.from_numpy(img).float() - - self.cache_counter += 1 - if self.cache_counter > 1000: - self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') - self.imgs = self.imgs.transpose((0, 3, 1, 2)) - self.cache_counter = 0 - - return img + """ + Returns x for a given idx. 
+ """ + img = np.load(self.root / 'images' / f'landsat_poverty_img_{idx}.npz')['x'] + img = torch.from_numpy(img).float() + return img def eval(self, y_pred, y_true, metadata): all_results = {} diff --git a/wilds/datasets/wilds_dataset.py b/wilds/datasets/wilds_dataset.py index 47e00dc6..b19c9782 100644 --- a/wilds/datasets/wilds_dataset.py +++ b/wilds/datasets/wilds_dataset.py @@ -310,7 +310,7 @@ def initialize_data_dir(self, root_dir, download): f'*****************************\n' f'{self.dataset_name} has been updated to version {self.latest_version}.\n' f'You are currently using version {self.version}.\n' - f'We highly recommend updating the dataset.\n' + f'We highly recommend updating the dataset by not specifying the older version in the command-line argument or dataset constructor.\n' f'See https://wilds.stanford.edu/changelog for changes.\n' f'*****************************\n') elif latest_minor_version > current_minor_version: From 4f47348767284e39fd4284746be9e3924c8d37ef Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sat, 27 Feb 2021 10:42:41 -0800 Subject: [PATCH 036/116] Fix path --- examples/run_expt.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/run_expt.py b/examples/run_expt.py index 1bde3100..1faca2b5 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -8,6 +8,9 @@ import sys from collections import defaultdict +# TODO: delete later -Tony +sys.path.insert(1, os.path.join(sys.path[0], '..')) + from wilds.common.data_loaders import get_train_loader, get_eval_loader from wilds.common.grouper import CombinatorialGrouper From 7d9f5369dc593f77b0f50ea1ec32fc2ebe5612f2 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sat, 27 Feb 2021 13:30:50 -0800 Subject: [PATCH 037/116] removing grouper warning for povertymap and adding celeba size --- examples/run_expt.py | 2 +- wilds/common/grouper.py | 7 ++++--- wilds/datasets/celebA_dataset.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/run_expt.py b/examples/run_expt.py index 1bde3100..7f6d3101 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -34,7 +34,7 @@ def main(): parser.add_argument('--download', default=False, type=parse_bool, const=True, nargs='?', help='If true, tries to downloads the dataset if it does not exist in root_dir.') parser.add_argument('--frac', type=float, default=1.0, - help='Convenience parameter that scales all dataset splits down to the specified fraction, for development purposes.') + help='Convenience parameter that scales all dataset splits down to the specified fraction, for development purposes. Note that this also scales the test set down, so the reported numbers are not comparable with the full test set.') parser.add_argument('--version', default=None, type=str) # Loaders diff --git a/wilds/common/grouper.py b/wilds/common/grouper.py index 2c6f8d82..599be554 100644 --- a/wilds/common/grouper.py +++ b/wilds/common/grouper.py @@ -3,6 +3,7 @@ from wilds.common.utils import get_counts from wilds.datasets.wilds_dataset import WILDSSubset import warnings +import IPython class Grouper: """ @@ -87,10 +88,11 @@ def __init__(self, dataset, groupby_fields): # Note that this might result in some empty groups. 
self.groupby_field_indices = [i for (i, field) in enumerate(dataset.metadata_fields) if field in groupby_fields]
         if len(self.groupby_field_indices) != len(self.groupby_fields):
-            raise ValueError('at least one group field not found in dataset.metadata_fields')
+            raise ValueError('At least one group field not found in dataset.metadata_fields')
         grouped_metadata = dataset.metadata_array[:, self.groupby_field_indices]
         if not isinstance(grouped_metadata, torch.LongTensor):
-            warnings.warn(f'CombinatorialGrouper: converting metadata with fields [{", ".join(groupby_fields)}] into long')
+            if not torch.all(grouped_metadata == grouped_metadata.long()):
+                warnings.warn(f'CombinatorialGrouper: converting metadata with fields [{", ".join(groupby_fields)}] into long')
             grouped_metadata = grouped_metadata.long()
         for idx, field in enumerate(self.groupby_fields):
             min_value = grouped_metadata[:,idx].min()
@@ -150,4 +152,3 @@ def group_str(self, group):
 
     def group_field_str(self, group):
         return self.group_str(group).replace('=', ':').replace(',','_').replace(' ','')
-
diff --git a/wilds/datasets/celebA_dataset.py b/wilds/datasets/celebA_dataset.py
index 5422ed47..9c7d44ca 100644
--- a/wilds/datasets/celebA_dataset.py
+++ b/wilds/datasets/celebA_dataset.py
@@ -55,7 +55,7 @@ class CelebADataset(WILDSDataset):
     _versions_dict = {
         '1.0': {
             'download_url': 'https://worksheets.codalab.org/rest/bundles/0xfe55077f5cd541f985ebf9ec50473293/contents/blob/',
-            'compressed_size': None}}
+            'compressed_size': 1_308_557_312}}
 
     def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'):

From f46edd801ccb5670c81fd59e3e3e6e4fee9fc24d Mon Sep 17 00:00:00 2001
From: Tony Lee
Date: Sun, 28 Feb 2021 10:31:01 -0800
Subject: [PATCH 038/116] Fix paths
---
 examples/run_expt.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/run_expt.py b/examples/run_expt.py
index 1bde3100..1faca2b5 100644
--- a/examples/run_expt.py
+++ b/examples/run_expt.py
@@ -8,6 +8,9 @@
 import sys
 from collections import defaultdict
 
+# TODO: delete later -Tony
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+
 from wilds.common.data_loaders import get_train_loader, get_eval_loader
 from wilds.common.grouper import CombinatorialGrouper

From 895129d6444d595d358a9c0d992daa3631355ba2 Mon Sep 17 00:00:00 2001
From: Tony Lee
Date: Sun, 28 Feb 2021 10:38:35 -0800
Subject: [PATCH 039/116] Updated hyperparameters for iWildCam and Camelyon
---
 examples/configs/datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py
index b728f924..9dbeb729 100644
--- a/examples/configs/datasets.py
+++ b/examples/configs/datasets.py
@@ -34,7 +34,7 @@
     },
     'camelyon17': {
         'split_scheme': 'official',
-        'model': 'resnet50',
+        'model': 'densenet121',
         'model_kwargs': {'pretrained': False},
         'train_transform': 'image_base',
         'eval_transform': 'image_base',
@@ -127,10 +127,10 @@
         'val_metric_decreasing': False,
         'algo_log_metric': 'accuracy',
         'model': 'resnet50',
-        'lr': 1e-5,
+        'lr': 3e-5,
         'weight_decay': 0.0,
         'batch_size': 16,
-        'n_epochs': 18,
+        'n_epochs': 12,
         'optimizer': 'Adam',
         'split_scheme': 'official',
         'scheduler': None,

From 2ac1ef1e3e2ebebfc0210bf5f72981b7d984061e Mon Sep 17 00:00:00 2001
From: Tony Lee
Date: Sun, 28 Feb 2021 16:51:57 -0800
Subject: [PATCH 040/116] Amazon v2.0 + Support DistilBERT
---
 .../amazon_yelp/subsample_amazon.py           | 91 +++++++++++++++++++
 examples/configs/datasets.py                  | 18 +++-
 examples/configs/model.py                     |  5 +
examples/configs/supported.py | 4 +- examples/models/bert/__init__.py | 0 examples/models/{ => bert}/bert.py | 0 examples/models/bert/distilbert.py | 31 +++++++ examples/models/initializer.py | 31 +++++-- examples/optimizer.py | 7 +- examples/run_expt.py | 3 - examples/transforms.py | 33 +++++-- wilds/datasets/amazon_dataset.py | 8 +- wilds/datasets/wilds_dataset.py | 5 +- 13 files changed, 207 insertions(+), 29 deletions(-) create mode 100644 dataset_preprocessing/amazon_yelp/subsample_amazon.py create mode 100644 examples/models/bert/__init__.py rename examples/models/{ => bert}/bert.py (100%) create mode 100644 examples/models/bert/distilbert.py diff --git a/dataset_preprocessing/amazon_yelp/subsample_amazon.py b/dataset_preprocessing/amazon_yelp/subsample_amazon.py new file mode 100644 index 00000000..5be0f8f0 --- /dev/null +++ b/dataset_preprocessing/amazon_yelp/subsample_amazon.py @@ -0,0 +1,91 @@ +import argparse +import csv +import os + +import pandas as pd +import numpy as np + + +""" +Subsample the Amazon dataset. + +Usage: + python dataset_preprocessing/amazon_yelp/subsample_amazon.py +""" + +NOT_IN_DATASET = -1 + + +def main(dataset_path, frac=0.25): + def output_dataset_sizes(split_df): + print("-" * 50) + print(f'Train size: {len(split_df[split_df["split"] == 0])}') + print(f'Val size: {len(split_df[split_df["split"] == 1])}') + print(f'Test size: {len(split_df[split_df["split"] == 2])}') + print( + f'Number of examples not included: {len(split_df[split_df["split"] == NOT_IN_DATASET])}' + ) + print("-" * 50) + print("\n") + + data_df = pd.read_csv( + os.path.join(dataset_path, "reviews.csv"), + dtype={ + "reviewerID": str, + "asin": str, + "reviewTime": str, + "unixReviewTime": int, + "reviewText": str, + "summary": str, + "verified": bool, + "category": str, + "reviewYear": int, + }, + keep_default_na=False, + na_values=[], + quoting=csv.QUOTE_NONNUMERIC, + ) + + user_csv_path = os.path.join(dataset_path, "splits", "user.csv") + split_df = pd.read_csv(user_csv_path) + output_dataset_sizes(split_df) + + train_data_df = data_df[split_df["split"] == 0] + train_reviewer_ids = train_data_df.reviewerID.unique() + print(f"Number of unique reviewers in train set: {len(train_reviewer_ids)}") + + blackout_indices = [] + for i, reviewer_id in enumerate(train_reviewer_ids): + reviews = train_data_df[train_data_df["reviewerID"] == reviewer_id] + + # Randomly sample (1 - frac) x number of reviews this particular user has. + # Add to blackout_indices to blackout later, so frac x number of reviews remain. 
+ blackout_count = int((1 - frac) * len(reviews)) + blackout_indices.extend( + np.random.choice(reviews.index, blackout_count, replace=False) + ) + + # Mark all the corresponding reviews of blackout_indices as -1 + split_df.loc[blackout_indices, "split"] = NOT_IN_DATASET + output_dataset_sizes(split_df) + + # Write out the new splits to user.csv + split_df.to_csv(user_csv_path, index=False) + print("Done.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Subsample the Amazon dataset.") + parser.add_argument( + "path", + type=str, + help="Path to the Amazon dataset", + ) + parser.add_argument( + "frac", + type=float, + help="Subsample fraction", + ) + + args = parser.parse_args() + main(args.path, args.frac) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index b728f924..5bddab4c 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -1,19 +1,23 @@ dataset_defaults = { 'amazon': { 'split_scheme': 'official', - 'model': 'bert-base-uncased', + 'model': 'distilbert-base-uncased', 'train_transform': 'bert', 'eval_transform': 'bert', 'max_token_length': 512, 'loss_function': 'cross_entropy', 'algo_log_metric': 'accuracy', 'batch_size': 8, - 'lr': 2e-6, + 'lr': 1e-5, 'weight_decay': 0.01, 'n_epochs': 3, 'n_groups_per_batch': 2, 'irm_lambda': 1.0, - 'coral_penalty_weight': 10.0, + 'coral_penalty_weight': 0.1, + 'loader_kwargs': { + 'num_workers': 1, + 'pin_memory': True, + }, }, 'bdd100k': { 'split_scheme': 'official', @@ -76,7 +80,7 @@ }, 'civilcomments': { 'split_scheme': 'official', - 'model': 'bert-base-uncased', + 'model': 'distilbert-base-uncased', 'train_transform': 'bert', 'eval_transform': 'bert', 'loss_function': 'cross_entropy', @@ -89,6 +93,12 @@ 'n_epochs': 5, 'algo_log_metric': 'accuracy', 'max_token_length': 300, + 'irm_lambda': 1.0, + 'coral_penalty_weight': 10.0, + 'loader_kwargs': { + 'num_workers': 1, + 'pin_memory': True, + }, }, 'fmow': { 'split_scheme': 'official', diff --git a/examples/configs/model.py b/examples/configs/model.py index 12a429a7..f587b3ff 100644 --- a/examples/configs/model.py +++ b/examples/configs/model.py @@ -4,6 +4,11 @@ 'max_grad_norm': 1.0, 'scheduler': 'linear_schedule_with_warmup', }, + 'distilbert-base-uncased': { + 'optimizer': 'AdamW', + 'max_grad_norm': 1.0, + 'scheduler': 'linear_schedule_with_warmup', + }, 'densenet121': { 'model_kwargs':{ 'pretrained':True, diff --git a/examples/configs/supported.py b/examples/configs/supported.py index 898797cb..1b7f13c3 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -58,8 +58,8 @@ # see initialize_*() functions for correspondence transforms = ['bert', 'image_base', 'image_resize_and_center_crop', 'poverty_train'] models = ['resnet18_ms', 'resnet50', 'resnet34', 'wideresnet50', - 'densenet121', 'bert-base-uncased', 'gin-virtual', - 'logistic_regression'] + 'densenet121', 'bert-base-uncased', 'distilbert-base-uncased', + 'gin-virtual', 'logistic_regression'] algorithms = ['ERM', 'groupDRO', 'deepCORAL', 'IRM'] optimizers = ['SGD', 'Adam', 'AdamW'] schedulers = ['linear_schedule_with_warmup', 'ReduceLROnPlateau', 'StepLR'] diff --git a/examples/models/bert/__init__.py b/examples/models/bert/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/models/bert.py b/examples/models/bert/bert.py similarity index 100% rename from examples/models/bert.py rename to examples/models/bert/bert.py diff --git a/examples/models/bert/distilbert.py b/examples/models/bert/distilbert.py new 
file mode 100644 index 00000000..c803fe35 --- /dev/null +++ b/examples/models/bert/distilbert.py @@ -0,0 +1,31 @@ +from transformers import DistilBertForSequenceClassification, DistilBertModel + +class DistilBertClassifier(DistilBertForSequenceClassification): + def __init__(self, config): + super().__init__(config) + self.d_out = config.num_labels + + def __call__(self, x): + input_ids = x[:, :, 0] + attention_mask = x[:, :, 1] + outputs = super().__call__( + input_ids=input_ids, + attention_mask=attention_mask, + )[0] + return outputs + + +class DistilBertFeaturizer(DistilBertModel): + def __init__(self, config): + super().__init__(config) + self.d_out = config.hidden_size + + def __call__(self, x): + input_ids = x[:, :, 0] + attention_mask = x[:, :, 1] + hidden_state = super().__call__( + input_ids=input_ids, + attention_mask=attention_mask, + )[0] + pooled_output = hidden_state[:, 0] + return pooled_output diff --git a/examples/models/initializer.py b/examples/models/initializer.py index 7a0cb718..ee532a07 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -1,6 +1,7 @@ import torch.nn as nn import torchvision -from models.bert import BertClassifier, BertFeaturizer +from models.bert.bert import BertClassifier, BertFeaturizer +from models.bert.distil_bert import DistilBertClassifier, DistilBertFeaturizer from models.resnet_multispectral import ResNet18 from models.layers import Identity from models.gnn import GINVirtual @@ -14,7 +15,18 @@ def initialize_model(config, d_out): name=config.model, d_out=d_out, **config.model_kwargs) - elif config.model.startswith('bert'): + elif 'bert' in config.model: + model = initializeBertBasedModel(config, d_out) + elif config.model == 'logistic_regression': + model = nn.Linear(out_features=d_out, **config.model_kwargs) + elif config.model == 'gin-virtual': + model = GINVirtual(num_tasks=d_out, **config.model_kwargs) + else: + raise ValueError(f'Model: {config.model} not recognized.') + return model + +def initializeBertBasedModel(config, d_out): + if config.model == 'bert-base-uncased': if d_out is None: model = BertFeaturizer.from_pretrained(config.model, **config.model_kwargs) else: @@ -22,12 +34,17 @@ def initialize_model(config, d_out): config.model, num_labels=d_out, **config.model_kwargs) - elif config.model == 'logistic_regression': - model = nn.Linear(out_features=d_out, **config.model_kwargs) - elif config.model == 'gin-virtual': - model = GINVirtual(num_tasks=d_out, **config.model_kwargs) + elif config.model == 'distilbert-base-uncased': + if d_out is None: + model = DistilBertFeaturizer.from_pretrained(config.model, **config.model_kwargs) + else: + model = DistilBertClassifier.from_pretrained( + config.model, + num_labels=d_out, + **config.model_kwargs) else: - raise ValueError('Model not recognized.') + raise ValueError(f'Model: {config.model} not recognized.') + return model def initialize_torchvision_model(name, d_out, **kwargs): diff --git a/examples/optimizer.py b/examples/optimizer.py index a31777ff..50eb80f7 100644 --- a/examples/optimizer.py +++ b/examples/optimizer.py @@ -2,8 +2,9 @@ from transformers import AdamW def initialize_optimizer(config, model): - if config.model.startswith('bert'): - assert config.optimizer=='AdamW', 'Only AdamW supported for BERT models' + if 'bert' in config.model: + assert config.optimizer=='AdamW', 'Only AdamW is supported for BERT-based models' + # initialize optimizers if config.optimizer=='SGD': params = filter(lambda p: p.requires_grad, model.parameters()) @@ -13,7 
+14,7 @@ def initialize_optimizer(config, model): weight_decay=config.weight_decay, **config.optimizer_kwargs) elif config.optimizer=='AdamW': - assert config.model.startswith('bert'), "Only BERT supported for AdamW" + assert 'bert' in config.model, 'Only BERT-based models are supported for AdamW' no_decay = ['bias', 'LayerNorm.weight'] params = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay}, diff --git a/examples/run_expt.py b/examples/run_expt.py index 1faca2b5..1bde3100 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -8,9 +8,6 @@ import sys from collections import defaultdict -# TODO: delete later -Tony -sys.path.insert(1, os.path.join(sys.path[0], '..')) - from wilds.common.data_loaders import get_train_loader, get_eval_loader from wilds.common.grouper import CombinatorialGrouper diff --git a/examples/transforms.py b/examples/transforms.py index cbacb1f1..bafbd42f 100644 --- a/examples/transforms.py +++ b/examples/transforms.py @@ -1,5 +1,5 @@ import torchvision.transforms as transforms -from transformers import BertTokenizerFast +from transformers import BertTokenizerFast, DistilBertTokenizerFast import torch def initialize_transform(transform_name, config, dataset): @@ -17,9 +17,10 @@ def initialize_transform(transform_name, config, dataset): raise ValueError(f"{transform_name} not recognized") def initialize_bert_transform(config): - assert config.model.startswith('bert') + assert 'bert' in config.model assert config.max_token_length is not None - tokenizer = BertTokenizerFast.from_pretrained(config.model) + + tokenizer = getBertTokenizer(config.model) def transform(text): tokens = tokenizer( text, @@ -27,15 +28,31 @@ def transform(text): truncation=True, max_length=config.max_token_length, return_tensors='pt') - x = torch.stack( - (tokens['input_ids'], - tokens['attention_mask'], - tokens['token_type_ids']), - dim=2) + if config.model == 'bert-base-uncased': + x = torch.stack( + (tokens['input_ids'], + tokens['attention_mask'], + tokens['token_type_ids']), + dim=2) + elif config.model == 'distilbert-base-uncased': + x = torch.stack( + (tokens['input_ids'], + tokens['attention_mask']), + dim=2) x = torch.squeeze(x, dim=0) # First shape dim is always 1 return x return transform +def getBertTokenizer(model): + if model == 'bert-base-uncased': + tokenizer = BertTokenizerFast.from_pretrained(model) + elif model == 'distilbert-base-uncased': + tokenizer = DistilBertTokenizerFast.from_pretrained(model) + else: + raise ValueError(f'Model: {model} not recognized.') + + return tokenizer + def initialize_image_base_transform(config, dataset): transform_steps = [] if dataset.original_resolution is not None and min(dataset.original_resolution)!=max(dataset.original_resolution): diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index abe1def9..37e42bbf 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -54,7 +54,13 @@ class AmazonDataset(WILDSDataset): _versions_dict = { '1.0': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x60237058e01749cda7b0701c2bd01420/contents/blob/', - 'compressed_size': 4_066_541_568}} + 'compressed_size': 4_066_541_568 + }, + '2.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x690dfddf794148b2aec3def6db8fe25a/contents/blob/', + 'compressed_size': 1_987_520_225 + }, + } def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): 
self._version = version diff --git a/wilds/datasets/wilds_dataset.py b/wilds/datasets/wilds_dataset.py index 47e00dc6..a97b6e94 100644 --- a/wilds/datasets/wilds_dataset.py +++ b/wilds/datasets/wilds_dataset.py @@ -1,5 +1,6 @@ import os -import shutil +import time + import torch import numpy as np @@ -350,12 +351,14 @@ def initialize_data_dir(self, root_dir, download): print(f'Downloading dataset to {data_dir}...') print(f'You can also download the dataset manually at https://wilds.stanford.edu/downloads.') try: + start_time = time.time() download_and_extract_archive( url=download_url, download_root=data_dir, filename='archive.tar.gz', remove_finished=True, size=compressed_size) + print(f"It took {(time.time() - start_time) / 60} minutes to download and uncompress the dataset.") except Exception as e: print(f"\n{os.path.join(data_dir, 'archive.tar.gz')} may be corrupted. Please try deleting it and rerunning this command.\n") print(f"Exception: ", e) From a3b5b6995cba028af107960540138bcc64c02737 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sun, 28 Feb 2021 20:06:58 -0800 Subject: [PATCH 041/116] Fix import for distilbert --- examples/models/initializer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index ee532a07..87a746c6 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -1,7 +1,7 @@ import torch.nn as nn import torchvision from models.bert.bert import BertClassifier, BertFeaturizer -from models.bert.distil_bert import DistilBertClassifier, DistilBertFeaturizer +from models.bert.distilbert import DistilBertClassifier, DistilBertFeaturizer from models.resnet_multispectral import ResNet18 from models.layers import Identity from models.gnn import GINVirtual From ad6168fc1011adbaf43eae8a9329b87ac44d3b24 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sun, 28 Feb 2021 21:47:25 -0800 Subject: [PATCH 042/116] added support back for v1.0, fixed indexing issues with fmow, and pulled in earlier changes from Michael for v1.0 --- wilds/common/grouper.py | 1 - wilds/datasets/fmow_dataset.py | 12 +++++++++--- wilds/datasets/poverty_dataset.py | 26 +++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/wilds/common/grouper.py b/wilds/common/grouper.py index 599be554..ad4ca460 100644 --- a/wilds/common/grouper.py +++ b/wilds/common/grouper.py @@ -3,7 +3,6 @@ from wilds.common.utils import get_counts from wilds.datasets.wilds_dataset import WILDSSubset import warnings -import IPython class Grouper: """ diff --git a/wilds/datasets/fmow_dataset.py b/wilds/datasets/fmow_dataset.py index ad97974c..42211795 100644 --- a/wilds/datasets/fmow_dataset.py +++ b/wilds/datasets/fmow_dataset.py @@ -17,8 +17,6 @@ from wilds.common.grouper import CombinatorialGrouper from wilds.datasets.wilds_dataset import WILDSDataset -import IPython - Image.MAX_IMAGE_PIXELS = 10000000000 @@ -181,7 +179,15 @@ def get_input(self, idx): """ Returns x for a given idx. 
""" - img = Image.open(self.root / 'images' /f'rgb_img_{idx}.png').convert('RGB') + idx = self.full_idxs[idx] + if self.version == '1.0': + batch_idx = idx // self.chunk_size + within_batch_idx = idx % self.chunk_size + img_batch = np.load(self.root / f'rgb_all_imgs_{batch_idx}.npy', mmap_mode='r') + img = img_batch[within_batch_idx].copy() + elif self.version == '1.1': + img = Image.open(self.root / 'images' / f'rgb_img_{idx}.png').convert('RGB') + return img def eval(self, y_pred, y_true, metadata): diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index 0c121ea2..34f702e1 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -153,7 +153,8 @@ class PovertyMapDataset(WILDSDataset): def __init__(self, version=None, root_dir='data', download=False, split_scheme='official', no_nl=False, fold='A', oracle_training_set=False, - use_ood_val=True): + use_ood_val=True, + cache_size=100): self._version = version self._data_dir = self.initialize_data_dir(root_dir, download) @@ -214,6 +215,10 @@ def __init__(self, version=None, root_dir='data', download=False, self._split_dict = {'train': 0, 'val': 1, 'id_test': 2, 'ood_val': 3, 'test': 4} self._split_names = {'train': 'Train', 'val': 'ID Val', 'id_test': 'ID Test', 'ood_val': 'OOD Val', 'test': 'OOD Test'} + if self.version == '1.0': + self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') + self.imgs = self.imgs.transpose((0, 3, 1, 2)) + self._y_array = torch.from_numpy(np.asarray(self.metadata['wealthpooled'])[:, np.newaxis]).float() self._y_size = 1 @@ -238,8 +243,23 @@ def get_input(self, idx): """ Returns x for a given idx. """ - img = np.load(self.root / 'images' / f'landsat_poverty_img_{idx}.npz')['x'] - img = torch.from_numpy(img).float() + if self.version == '1.0': + img = self.imgs[idx].copy() + if self.no_nl: + img[-1] = 0 + img = torch.from_numpy(img).float() + # consider refreshing cache if cache_size is limited + if self.cache_size < self.imgs.shape[0]: + self.cache_counter += 1 + if self.cache_counter > self.cache_size: + self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') + self.imgs = self.imgs.transpose((0, 3, 1, 2)) + self.cache_counter = 0 + + elif self.version == '1.1': + img = np.load(self.root / 'images' / f'landsat_poverty_img_{idx}.npz')['x'] + img = torch.from_numpy(img).float() + return img def eval(self, y_pred, y_true, metadata): From 5813757b96e48018beec4b4fa1e52a592088f737 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sun, 28 Feb 2021 21:54:25 -0800 Subject: [PATCH 043/116] now with no_nl --- wilds/datasets/poverty_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index 34f702e1..66b806c1 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -258,6 +258,8 @@ def get_input(self, idx): elif self.version == '1.1': img = np.load(self.root / 'images' / f'landsat_poverty_img_{idx}.npz')['x'] + if self.no_nl: + img[-1] = 0 img = torch.from_numpy(img).float() return img From 66721b0eed20e2bd2eaff1c3528214f06a33345d Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sun, 28 Feb 2021 21:56:32 -0800 Subject: [PATCH 044/116] povertymap cache --- wilds/datasets/poverty_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index 66b806c1..7837d94f 100644 --- a/wilds/datasets/poverty_dataset.py +++ 
b/wilds/datasets/poverty_dataset.py @@ -216,6 +216,7 @@ def __init__(self, version=None, root_dir='data', download=False, self._split_names = {'train': 'Train', 'val': 'ID Val', 'id_test': 'ID Test', 'ood_val': 'OOD Val', 'test': 'OOD Test'} if self.version == '1.0': + self.cache_size = cache_size self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') self.imgs = self.imgs.transpose((0, 3, 1, 2)) From 39f9b8d38525ca81a90038a98c40cd56689a2a1c Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sun, 28 Feb 2021 22:18:25 -0800 Subject: [PATCH 045/116] grouper change --- wilds/common/grouper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/wilds/common/grouper.py b/wilds/common/grouper.py index ad4ca460..07dc92a3 100644 --- a/wilds/common/grouper.py +++ b/wilds/common/grouper.py @@ -90,9 +90,10 @@ def __init__(self, dataset, groupby_fields): raise ValueError('At least one group field not found in dataset.metadata_fields') grouped_metadata = dataset.metadata_array[:, self.groupby_field_indices] if not isinstance(grouped_metadata, torch.LongTensor): - if not torch.all(grouped_metadata == grouped_metadata.long()): + grouped_metadata_long = grouped_metadata.long() + if not torch.all(grouped_metadata == grouped_metadata_long): warnings.warn(f'CombinatorialGrouper: converting metadata with fields [{", ".join(groupby_fields)}] into long') - grouped_metadata = grouped_metadata.long() + grouped_metadata = grouped_metadata_long for idx, field in enumerate(self.groupby_fields): min_value = grouped_metadata[:,idx].min() if min_value < 0: From c1ab7566d6278e84825e87367b1748efd0ddd644 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 1 Mar 2021 09:50:33 -0800 Subject: [PATCH 046/116] moved cache_counter --- wilds/datasets/poverty_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index 7837d94f..cde3c437 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -217,6 +217,7 @@ def __init__(self, version=None, root_dir='data', download=False, if self.version == '1.0': self.cache_size = cache_size + self.cache_counter = 0 self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') self.imgs = self.imgs.transpose((0, 3, 1, 2)) @@ -236,7 +237,6 @@ def __init__(self, version=None, root_dir='data', download=False, groupby_fields=['urban']) self._metrics = [MSE(), PearsonCorrelation()] - self.cache_counter = 0 super().__init__(root_dir, download, split_scheme) From 19151bcb8dd0e9a4f4a6b71ad56159a8218d0c0f Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Mon, 1 Mar 2021 12:12:13 -0800 Subject: [PATCH 047/116] py150 update splits --- wilds/datasets/py150_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 5a362426..244628bf 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -47,7 +47,7 @@ def __init__(self, root_dir='data', download=False, split_scheme='official'): raise ValueError(f'Split scheme {self._split_scheme} not recognized') # path - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x45343bd9e1c64acfbcb4a22a76302994/contents/blob/' + self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x3441a145a298405a966f7288373349bf/contents/blob/' self._data_dir = Path(self.initialize_data_dir(root_dir, download)) # Load data From 
8e0e1f20d4a1e333c7af463c24bd910944ccf30f Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 1 Mar 2021 12:47:47 -0800 Subject: [PATCH 048/116] comment --- examples/models/initializer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index 7a0cb718..f9198e80 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -6,6 +6,9 @@ from models.gnn import GINVirtual def initialize_model(config, d_out): + """ + This function is called within each algorithm (e.g., ERM, groupDRO). + """ if config.model == 'resnet18_ms': # multispectral resnet 18 model = ResNet18(num_classes=d_out, **config.model_kwargs) @@ -25,7 +28,7 @@ def initialize_model(config, d_out): elif config.model == 'logistic_regression': model = nn.Linear(out_features=d_out, **config.model_kwargs) elif config.model == 'gin-virtual': - model = GINVirtual(num_tasks=d_out, **config.model_kwargs) + model = GINVirtual(num_tasks=d_out, **config.model_kwargs) else: raise ValueError('Model not recognized.') return model From 96c4020d077df3a53b01355fc044479d7598ff0e Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 1 Mar 2021 13:12:57 -0800 Subject: [PATCH 049/116] switch camelyon back to densenet --- examples/configs/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index e66a43f5..4fd0f047 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -34,7 +34,7 @@ }, 'camelyon17': { 'split_scheme': 'official', - 'model': 'resnet50', + 'model': 'densenet121', 'model_kwargs': {'pretrained': False}, 'train_transform': 'image_base', 'eval_transform': 'image_base', @@ -165,7 +165,7 @@ 'fold': 'A', 'oracle_training_set': False, 'use_ood_val': True - }, + }, 'model': 'resnet18_ms', 'model_kwargs': {'num_channels': 8}, 'train_transform': 'poverty_train', From 5edcf7ba854838796c64833f67fc9eaa011e7c2c Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Mon, 1 Mar 2021 13:52:17 -0800 Subject: [PATCH 050/116] update py150 versioning --- wilds/datasets/py150_dataset.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 244628bf..76caf356 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -38,16 +38,20 @@ class Py150Dataset(WILDSDataset): This dataset is distributed under the MIT license. 
""" - def __init__(self, root_dir='data', download=False, split_scheme='official'): + _dataset_name = 'py150' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3441a145a298405a966f7288373349bf/contents/blob/', + 'compressed_size': 154_304_512}} - self._dataset_name = 'py150' - self._version = '1.0' + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + + self._version = version self._split_scheme = split_scheme if self._split_scheme != 'official': raise ValueError(f'Split scheme {self._split_scheme} not recognized') # path - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0x3441a145a298405a966f7288373349bf/contents/blob/' self._data_dir = Path(self.initialize_data_dir(root_dir, download)) # Load data From 068b608b59fa37308c9852c3f7ca7e4a9921807f Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Mon, 1 Mar 2021 16:42:53 -0800 Subject: [PATCH 051/116] generalize featurizer code to remove codeGPT2-specific code --- examples/algorithms/deepCORAL.py | 16 +++---- examples/models/initializer.py | 77 +++++++++++++++++++++++--------- 2 files changed, 64 insertions(+), 29 deletions(-) diff --git a/examples/algorithms/deepCORAL.py b/examples/algorithms/deepCORAL.py index 70d3287a..11eff952 100644 --- a/examples/algorithms/deepCORAL.py +++ b/examples/algorithms/deepCORAL.py @@ -27,14 +27,10 @@ def __init__(self, config, d_out, grouper, loss, metric, n_train_steps): assert config.uniform_over_groups assert config.distinct_groups # initialize models - if config.model == 'code-gpt-py': #in case of pre-trained language model (`classifier` is also pre-trained) - model = initialize_model(config, d_out=None).to(config.device) - featurizer = model.transformer - classifier = model.lm_head - else: - featurizer = initialize_model(config, d_out=None).to(config.device) - classifier = torch.nn.Linear(featurizer.d_out, d_out).to(config.device) - model = torch.nn.Sequential(featurizer, classifier).to(config.device) + featurizer, classifier = initialize_model(config, d_out=d_out, featurizer=True) + featurizer = featurizer.to(config.device) + classifier = classifier.to(config.device) + model = torch.nn.Sequential(featurizer, classifier).to(config.device) # initialize module super().__init__( config=config, @@ -53,7 +49,9 @@ def __init__(self, config, d_out, grouper, loss, metric, n_train_steps): self.classifier = classifier def coral_penalty(self, x, y): - if x.dim() == 3: #in case of language model [batch_size, seqlen, d_out] + if x.dim() > 2: + # featurizers output Tensors of size (batch_size, ..., feature dimensionality). 
+ # we flatten to Tensors of size (*, feature dimensionality) x = x.view(-1, x.size(-1)) y = y.view(-1, y.size(-1)) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index 6aa5c0ca..027a61e1 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -7,35 +7,73 @@ from models.code_gpt import GPT2LMHeadLogit, GPT2FeaturizerLMHeadLogit from transformers import GPT2Tokenizer -def initialize_model(config, d_out): - if config.model == 'resnet18_ms': - # multispectral resnet 18 - model = ResNet18(num_classes=d_out, **config.model_kwargs) - elif config.model in ('resnet50', 'resnet34', 'wideresnet50','densenet121'): - model = initialize_torchvision_model( - name=config.model, - d_out=d_out, - **config.model_kwargs) +def initialize_model(config, d_out, featurizer=False): + """ + Initializes models according to the config + Args: + - config (dictionary): config dictionary + - d_out (int): the dimensionality of the model output + - featurizer (bool): whether to return a model or a (featurizer, classifier) pair that constitutes a model. + Output: + If feauturizer=True: + - featurizer: a model that outputs feature Tensors of shape (batch_size, ..., feature dimensionality) + - classifier: a model that takes in feature Tensors and outputs predictions. In most cases, this is a linear layer. + + If featurizer=False: + - model: a model that is equivalent to nn.Sequential(featurizer, classifier) + """ + if config.model in ('resnet50', 'resnet34', 'wideresnet50','densenet121'): + if featurizer: + featurizer = initialize_torchvision_model( + name=config.model, + d_out=None, + **config.model_kwargs) + classifier = nn.Linear(featurizer.d_out, d_out) + model = (featurizer, classifier) + else: + model = initialize_torchvision_model( + name=config.model, + d_out=d_out, + **config.model_kwargs) elif config.model.startswith('bert'): - if d_out is None: - model = BertFeaturizer.from_pretrained(config.model, **config.model_kwargs) + if featurizer: + featurizer = BertFeaturizer.from_pretrained(config.model, **config.model_kwargs) + classifier = nn.Linear(featurizer.d_out, d_out) + model = (featurizer, classifier) else: model = BertClassifier.from_pretrained( config.model, num_labels=d_out, **config.model_kwargs) + elif config.model == 'resnet18_ms': # multispectral resnet 18 + if featurizer: + featurizer = ResNet18(num_classes=None, **config.model_kwargs) + classifier = nn.Linear(featurizer.d_out, d_out) + model = (featurizer, classifier) + else: + model = ResNet18(num_classes=d_out, **config.model_kwargs) + elif config.model == 'gin-virtual': + if featurizer: + featurizer = GINVirtual(num_tasks=None, **config.model_kwargs) + classifier = nn.Linear(featurizer.d_out, d_out) + model = (featurizer, classifier) + else: + model = GINVirtual(num_tasks=d_out, **config.model_kwargs) elif config.model == 'code-gpt-py': name = 'microsoft/CodeGPT-small-py' - if d_out is None: + tokenizer = GPT2Tokenizer.from_pretrained(name) + if featurizer: model = GPT2FeaturizerLMHeadLogit.from_pretrained(name) + model.resize_token_embeddings(len(tokenizer)) + featurizer = model.transformer + classifier = model.lm_head + model = (featurizer, classifier) else: model = GPT2LMHeadLogit.from_pretrained(name) - tokenizer = GPT2Tokenizer.from_pretrained(name) - model.resize_token_embeddings(len(tokenizer)) + model.resize_token_embeddings(len(tokenizer)) elif config.model == 'logistic_regression': + assert not featurizer, "Featurizer not supported for logistic regression" model = 
nn.Linear(out_features=d_out, **config.model_kwargs)
-    elif config.model == 'gin-virtual':
-        model = GINVirtual(num_tasks=d_out, **config.model_kwargs)
     else:
         raise ValueError('Model not recognized.')
     return model
@@ -57,13 +95,12 @@ def initialize_torchvision_model(name, d_out, **kwargs):
     constructor = getattr(torchvision.models, constructor_name)
     model = constructor(**kwargs)
     # adjust the last layer
-    d = getattr(model, last_layer_name).in_features
+    d_features = getattr(model, last_layer_name).in_features
     if d_out is None:  # want to initialize a featurizer model
-        last_layer = Identity(d)
-        model.d_out = d
+        last_layer = Identity(d_features)
+        model.d_out = d_features
     else: # want to initialize a classifier for a particular num_classes
-        last_layer = nn.Linear(d, d_out)
+        last_layer = nn.Linear(d_features, d_out)
         model.d_out = d_out
         setattr(model, last_layer_name, last_layer)
-    # set the feature dimension as an attribute for convenience
     return model

From 99d565df97ee2fad38a49cfc5812c7b3032a6dec Mon Sep 17 00:00:00 2001
From: Tony Lee
Date: Mon, 1 Mar 2021 17:23:44 -0800
Subject: [PATCH 052/116] Fix subsampling Amazon
---
 .../amazon_yelp/subsample_amazon.py | 20 +++++++++++--------
 examples/models/bert/distilbert.py  |  2 --
 wilds/datasets/amazon_dataset.py    |  4 ++--
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/dataset_preprocessing/amazon_yelp/subsample_amazon.py b/dataset_preprocessing/amazon_yelp/subsample_amazon.py
index 5be0f8f0..82472cba 100644
--- a/dataset_preprocessing/amazon_yelp/subsample_amazon.py
+++ b/dataset_preprocessing/amazon_yelp/subsample_amazon.py
@@ -5,6 +5,8 @@
 import pandas as pd
 import numpy as np
 
+# Fix the seed for reproducibility
+np.random.seed(0)
 
 """
 Subsample the Amazon dataset.
 
 Usage:
     python dataset_preprocessing/amazon_yelp/subsample_amazon.py
 """
@@ -54,16 +56,18 @@ def output_dataset_sizes(split_df):
     train_reviewer_ids = train_data_df.reviewerID.unique()
     print(f"Number of unique reviewers in train set: {len(train_reviewer_ids)}")
 
+    # Randomly sample (1 - frac) x number of reviewers
+    # Blackout all the reviews belonging to the randomly sampled reviewers
+    subsampled_reviewers_count = int((1 - frac) * len(train_reviewer_ids))
+    subsampled_reviewers = np.random.choice(
+        train_reviewer_ids, subsampled_reviewers_count, replace=False
+    )
+    print(subsampled_reviewers)
+
     blackout_indices = []
-    for i, reviewer_id in enumerate(train_reviewer_ids):
+    for reviewer_id in subsampled_reviewers:
         reviews = train_data_df[train_data_df["reviewerID"] == reviewer_id]
-
-        # Randomly sample (1 - frac) x number of reviews this particular user has.
-        # Add to blackout_indices to blackout later, so frac x number of reviews remain.
- blackout_count = int((1 - frac) * len(reviews)) - blackout_indices.extend( - np.random.choice(reviews.index, blackout_count, replace=False) - ) + blackout_indices.extend(reviews.index) # Mark all the corresponding reviews of blackout_indices as -1 split_df.loc[blackout_indices, "split"] = NOT_IN_DATASET diff --git a/examples/models/bert/distilbert.py b/examples/models/bert/distilbert.py index c803fe35..ca7664b6 100644 --- a/examples/models/bert/distilbert.py +++ b/examples/models/bert/distilbert.py @@ -3,7 +3,6 @@ class DistilBertClassifier(DistilBertForSequenceClassification): def __init__(self, config): super().__init__(config) - self.d_out = config.num_labels def __call__(self, x): input_ids = x[:, :, 0] @@ -18,7 +17,6 @@ def __call__(self, x): class DistilBertFeaturizer(DistilBertModel): def __init__(self, config): super().__init__(config) - self.d_out = config.hidden_size def __call__(self, x): input_ids = x[:, :, 0] diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index 37e42bbf..0dae3dba 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -57,8 +57,8 @@ class AmazonDataset(WILDSDataset): 'compressed_size': 4_066_541_568 }, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x690dfddf794148b2aec3def6db8fe25a/contents/blob/', - 'compressed_size': 1_987_520_225 + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x9de7edd6e3f14034809dc8cf3b060ba3/contents/blob/', + 'compressed_size': 1_987_523_759 }, } From 1a84eea7efb746ab2345700aafd98293c89f4e66 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Mon, 1 Mar 2021 18:13:34 -0800 Subject: [PATCH 053/116] py150 as a benchmark set --- examples/configs/supported.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/configs/supported.py b/examples/configs/supported.py index 7392db42..813cad4c 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -26,7 +26,8 @@ 'iwildcam', 'ogb-molpcba', 'poverty', - 'fmow'] + 'fmow', + 'py150'] datasets = { 'amazon': AmazonDataset, From 3ef4280dd80f106e7e9ac0245dc1ed7cf68928f2 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Mon, 1 Mar 2021 19:35:32 -0800 Subject: [PATCH 054/116] Fix subsampling Amazon --- .../amazon_yelp/subsample_amazon.py | 75 +++++++++++++++++-- 1 file changed, 68 insertions(+), 7 deletions(-) diff --git a/dataset_preprocessing/amazon_yelp/subsample_amazon.py b/dataset_preprocessing/amazon_yelp/subsample_amazon.py index 82472cba..33fe7004 100644 --- a/dataset_preprocessing/amazon_yelp/subsample_amazon.py +++ b/dataset_preprocessing/amazon_yelp/subsample_amazon.py @@ -16,14 +16,18 @@ """ NOT_IN_DATASET = -1 +# Split: {'train': 0, 'val': 1, 'id_val': 2, 'test': 3, 'id_test': 4} +TRAIN, OOD_VAL, ID_VAL, OOD_TEST, ID_TEST = range(5) def main(dataset_path, frac=0.25): def output_dataset_sizes(split_df): print("-" * 50) - print(f'Train size: {len(split_df[split_df["split"] == 0])}') - print(f'Val size: {len(split_df[split_df["split"] == 1])}') - print(f'Test size: {len(split_df[split_df["split"] == 2])}') + print(f'Train size: {len(split_df[split_df["split"] == TRAIN])}') + print(f'Val size: {len(split_df[split_df["split"] == OOD_VAL])}') + print(f'ID Val size: {len(split_df[split_df["split"] == ID_VAL])}') + print(f'Test size: {len(split_df[split_df["split"] == OOD_TEST])}') + print(f'ID Test size: {len(split_df[split_df["split"] == ID_TEST])}') print( f'Number of examples not included: {len(split_df[split_df["split"] == NOT_IN_DATASET])}' ) 
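
For reference, the subsampling script being fixed here is driven entirely by its two positional arguments, path and frac, as defined in its argparse block. A minimal sketch of an invocation follows; the data directory is illustrative and depends on where the Amazon dataset was extracted:

    python dataset_preprocessing/amazon_yelp/subsample_amazon.py data/amazon_v2.0 0.25

The fraction 0.25 mirrors the default frac=0.25 in main() and keeps reviews from roughly a quarter of the training reviewers.
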
@@ -64,16 +68,73 @@ def output_dataset_sizes(split_df): ) print(subsampled_reviewers) - blackout_indices = [] - for reviewer_id in subsampled_reviewers: - reviews = train_data_df[train_data_df["reviewerID"] == reviewer_id] - blackout_indices.extend(reviews.index) + blackout_indices = train_data_df[ + train_data_df["reviewerID"].isin(subsampled_reviewers) + ].index # Mark all the corresponding reviews of blackout_indices as -1 split_df.loc[blackout_indices, "split"] = NOT_IN_DATASET output_dataset_sizes(split_df) + # Mark duplicates + duplicated_within_user = data_df[["reviewerID", "reviewText"]].duplicated() + df_deduplicated_within_user = data_df[~duplicated_within_user] + duplicated_text = df_deduplicated_within_user[ + df_deduplicated_within_user["reviewText"] + .apply(lambda x: x.lower()) + .duplicated(keep=False) + ]["reviewText"] + duplicated_text = set(duplicated_text.values) + data_df["duplicate"] = ( + data_df["reviewText"].isin(duplicated_text) + ) | duplicated_within_user + + # Mark html candidates + data_df["contains_html"] = data_df["reviewText"].apply( + lambda x: "<" in x and ">" in x + ) + + # Mark clean ones + data_df["clean"] = ~data_df["duplicate"] & ~data_df["contains_html"] + + # Clear ID val and ID test since we're regenerating + split_df.loc[split_df["split"] == ID_VAL, "split"] = NOT_IN_DATASET + split_df.loc[split_df["split"] == ID_TEST, "split"] = NOT_IN_DATASET + + # Regenerate ID val and ID test + train_reviewer_ids = data_df[split_df["split"] == TRAIN]["reviewerID"].unique() + cutoff = int(len(train_reviewer_ids) / 2) + id_val_reviewer_ids = train_reviewer_ids[:cutoff] + id_test_reviewer_ids = train_reviewer_ids[cutoff:] + split_df.loc[ + (split_df["split"] == NOT_IN_DATASET) + & data_df["clean"] + & data_df["reviewerID"].isin(id_val_reviewer_ids), + "split", + ] = ID_VAL + split_df.loc[ + (split_df["split"] == NOT_IN_DATASET) + & data_df["clean"] + & data_df["reviewerID"].isin(id_test_reviewer_ids), + "split", + ] = ID_TEST + + # Sanity check + assert ( + data_df[(split_df["split"] == ID_VAL)]["reviewerID"].value_counts().min() == 75 + ) + assert ( + data_df[(split_df["split"] == ID_VAL)]["reviewerID"].value_counts().max() == 75 + ) + assert ( + data_df[(split_df["split"] == ID_TEST)]["reviewerID"].value_counts().min() == 75 + ) + assert ( + data_df[(split_df["split"] == ID_TEST)]["reviewerID"].value_counts().max() == 75 + ) + # Write out the new splits to user.csv + output_dataset_sizes(split_df) split_df.to_csv(user_csv_path, index=False) print("Done.") From cee6053189a84604244f39a80b16be180c5019fa Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Mon, 1 Mar 2021 19:38:12 -0800 Subject: [PATCH 055/116] Update bundle link --- wilds/datasets/amazon_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index 0dae3dba..991611e3 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -57,7 +57,7 @@ class AmazonDataset(WILDSDataset): 'compressed_size': 4_066_541_568 }, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x9de7edd6e3f14034809dc8cf3b060ba3/contents/blob/', + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x2732a175b5a644468b0342081544d1fd/contents/blob/', 'compressed_size': 1_987_523_759 }, } From ad77efb0790b04ae59117320298ec82d722d7c2b Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 1 Mar 2021 21:54:46 -0800 Subject: [PATCH 056/116] update docstring for py150 --- 
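Before the py150 docstring diff that follows, the duplicate-marking logic in the Amazon subsampling fix above deserves a closer look. A hedged sketch of the two-stage deduplication, using a toy frame with the same column names as the patch:

```python
import pandas as pd

# Toy stand-in for the Amazon reviews frame in the patch above
data_df = pd.DataFrame({
    "reviewerID": ["u1", "u1", "u2", "u3"],
    "reviewText": ["Great!", "Great!", "great!", "Different"],
})

# Stage 1: exact duplicates within a single user
duplicated_within_user = data_df[["reviewerID", "reviewText"]].duplicated()

# Stage 2: case-insensitive duplicates across the remaining rows
df_dedup = data_df[~duplicated_within_user]
duplicated_text = set(
    df_dedup[df_dedup["reviewText"].str.lower().duplicated(keep=False)]["reviewText"].values
)

# A row is a duplicate if it repeats within a user or shares lowercased
# text with another row; here rows 0-2 are flagged and row 3 is clean
data_df["duplicate"] = data_df["reviewText"].isin(duplicated_text) | duplicated_within_user
print(data_df)
```

The vectorized `isin`-based blackout above follows the same idea: one boolean mask over the whole frame instead of a Python loop over reviewers.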
wilds/datasets/py150_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 76caf356..306e797e 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -19,7 +19,10 @@ class Py150Dataset(WILDSDataset): Label (y): A sequence of next tokens (shifted x) Metadata: - Each image is annotated with the original GitHub repo id and file name + Each example is annotated with the original GitHub repo id. + This repo id can be matched with the name of the repo in natural language by + looking it up in the contents of the metadata/ folder in the downloaded dataset. + Similarly, each example can also be associated with the name of the file in natural language. Website: https://www.sri.inf.ethz.ch/py150 https://github.com/microsoft/CodeXGLUE From 1a4c94d6717e841aa68944696109e654ad59ee77 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Tue, 2 Mar 2021 04:13:25 -0800 Subject: [PATCH 057/116] Add d_out back for DistilBertModel --- examples/models/bert/distilbert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/models/bert/distilbert.py b/examples/models/bert/distilbert.py index ca7664b6..c508fea2 100644 --- a/examples/models/bert/distilbert.py +++ b/examples/models/bert/distilbert.py @@ -17,6 +17,7 @@ def __call__(self, x): class DistilBertFeaturizer(DistilBertModel): def __init__(self, config): super().__init__(config) + self.d_out = config.hidden_size def __call__(self, x): input_ids = x[:, :, 0] From a1059697ef529fb0a8d248fd8c6e8c45c1022dae Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Tue, 2 Mar 2021 05:05:48 -0800 Subject: [PATCH 058/116] Fix BERT import --- examples/models/initializer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index b87f324a..54fb0adb 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -1,6 +1,6 @@ import torch.nn as nn import torchvision -from models.bert import BertClassifier, BertFeaturizer +from models.bert.bert import BertClassifier, BertFeaturizer from models.bert.distilbert import DistilBertClassifier, DistilBertFeaturizer from models.resnet_multispectral import ResNet18 from models.layers import Identity From 4cd3e91c5d5f51c262f310ed3c0765d7424032b7 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Tue, 2 Mar 2021 06:14:56 -0800 Subject: [PATCH 059/116] Fix DistilBERT for featurizer --- examples/models/initializer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index 54fb0adb..72888014 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -16,7 +16,7 @@ def initialize_model(config, d_out, featurizer=False): - d_out (int): the dimensionality of the model output - featurizer (bool): whether to return a model or a (featurizer, classifier) pair that constitutes a model. Output: - If feauturizer=True: + If featurizer=True: - featurizer: a model that outputs feature Tensors of shape (batch_size, ..., feature dimensionality) - classifier: a model that takes in feature Tensors and outputs predictions. In most cases, this is a linear layer.
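A minimal sketch of the (featurizer, classifier) contract this docstring describes, with a toy featurizer in place of the real BERT or ResNet ones (WILDS featurizers expose `d_out` so the linear head can be sized to match):

```python
import torch
import torch.nn as nn

# Toy featurizer; the real ones are e.g. DistilBertFeaturizer or a headless ResNet
featurizer = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128))
featurizer.d_out = 128
classifier = nn.Linear(featurizer.d_out, 10)

x = torch.randn(4, 3, 32, 32)
features = featurizer(x)        # (batch_size, d_out)
logits = classifier(features)   # (batch_size, n_classes)

# featurizer=False returns the composition of the pair
model = nn.Sequential(featurizer, classifier)
assert torch.equal(model(x), logits)
```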
@@ -37,7 +37,12 @@ def initialize_model(config, d_out, featurizer=False): d_out=d_out, **config.model_kwargs) elif 'bert' in config.model: - model = initializeBertBasedModel(config, d_out) + if featurizer: + featurizer = initialize_bert_based_model(config, d_out, featurizer) + classifier = nn.Linear(featurizer.d_out, d_out) + model = (featurizer, classifier) + else: + model = initialize_bert_based_model(config, d_out) elif config.model == 'resnet18_ms': # multispectral resnet 18 if featurizer: featurizer = ResNet18(num_classes=None, **config.model_kwargs) @@ -71,9 +76,9 @@ def initialize_model(config, d_out, featurizer=False): raise ValueError(f'Model: {config.model} not recognized.') return model -def initializeBertBasedModel(config, d_out): +def initialize_bert_based_model(config, d_out, is_featurizer=False): if config.model == 'bert-base-uncased': - if d_out is None: + if is_featurizer: model = BertFeaturizer.from_pretrained(config.model, **config.model_kwargs) else: model = BertClassifier.from_pretrained( @@ -81,7 +86,7 @@ def initializeBertBasedModel(config, d_out): num_labels=d_out, **config.model_kwargs) elif config.model == 'distilbert-base-uncased': - if d_out is None: + if is_featurizer: model = DistilBertFeaturizer.from_pretrained(config.model, **config.model_kwargs) else: model = DistilBertClassifier.from_pretrained( From 15509072c86facdd3e1dbb23c0cfb7dbc890e159 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Tue, 2 Mar 2021 06:17:51 -0800 Subject: [PATCH 060/116] Fix torch vision models --- examples/models/initializer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index 027a61e1..845bc5bd 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -100,7 +100,7 @@ def initialize_torchvision_model(name, d_out, **kwargs): last_layer = Identity(d_features) model.d_out = d_features else: # want to initialize a classifier for a particular num_classes - last_layer = nn.Linear(d, d_out) + last_layer = nn.Linear(d_features, d_out) model.d_out = d_out setattr(model, last_layer_name, last_layer) return model From 139a8d900c43b91d639abe2537c55bb2358666b5 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 14:15:11 -0800 Subject: [PATCH 061/116] Update README.md --- README.md | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1cc09658..84ca82ff 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ pip install wilds If you have already installed it, please check that you have the latest version: ```bash python -c "import wilds; print(wilds.__version__)" -# This should print "1.0.0". If it doesn't, update by running: +# This should print "1.1.0". If it doesn't, update by running: pip install -U wilds ``` @@ -48,7 +48,7 @@ pip install -e . - tqdm>=4.53.0 - pytz>=2020.4 - outdated>=0.2.0 -- ogb>=1.2.3 +- ogb>=1.2.5 - torch-scatter>=2.0.5 - torch-geometric>=1.6.1 @@ -70,9 +70,12 @@ To run these scripts, you will need to install these additional dependencies: All baseline experiments in the paper were run on Python 3.8.5 and CUDA 10.1. + ## Usage -### Default models -In the `examples/` folder, we provide a set of scripts that we used to train models on the WILDS package. These scripts are configured with the default models and hyperparameters that we used for all of the baselines described in our paper. 
All baseline results in the paper can be easily replicated with commands like: + +### Example scripts with default models and dataset downloading +In the `examples/` folder, we provide a set of scripts that can be used to download WILDS datasets and train models on them. +These scripts are configured with the default models and hyperparameters that we used for all of the baselines described in our paper. All baseline results in the paper can be easily replicated with commands like: ```bash cd examples @@ -87,6 +90,27 @@ The first time you run these scripts, you might need to download the datasets. Y python run_expt.py --dataset civilcomments --algorithm groupDRO --root_dir data --download ``` +Alternatively, you can use the standalone `examples/download_datasets.py` script, for example: + +```bash +python download_datasets.py --root_dir data +``` + +This will download all datasets to the specified `data` folder. You can also pass in the `--datasets` argument to only download specified datasets. + +These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single run using a NVIDIA V100 GPU. + +| Dataset | Download size (GB) | Size on disk (GB) | Train+eval time | +|---------------|--------------------|-------------------|-----------------| +| iWildCam | 11 | 25 | | +| Camelyon17 | 10 | 15 | | +| MolPCBA | 0.04 | 2 | | +| CivilComments | 0.1 | 0.3 | | +| FMoW | 50 | 55 | | +| PovertyMap | 12 | 14 | | +| Amazon | | | | +| Py150 | 0.1 | 0.8 | | + ### Data loading The WILDS package provides a simple, standardized interface for all datasets in the benchmark. @@ -175,7 +199,7 @@ Invoking the `eval` method of each dataset yields all metrics reported in the pa ## Citing WILDS If you use WILDS datasets in your work, please cite [our paper](https://arxiv.org/abs/2012.07421) ([Bibtex](https://wilds.stanford.edu/assets/files/bibtex.md)): -- **WILDS: A Benchmark of in-the-Wild Distribution Shifts** (2020). Pang Wei Koh*, Shiori Sagawa*, Henrik Marklund, Sang Michael Xie, Marvin Zhang, Akshay Balsubramani, Weihua Hu, Michihiro Yasunaga, Richard Lanas Phillips, Sara Beery, Jure Leskovec, Anshul Kundaje, Emma Pierson, Sergey Levine, Chelsea Finn, and Percy Liang. +- **WILDS: A Benchmark of in-the-Wild Distribution Shifts** (2021). Pang Wei Koh*, Shiori Sagawa*, Henrik Marklund, Sang Michael Xie, Marvin Zhang, Akshay Balsubramani, Weihua Hu, Michihiro Yasunaga, Richard Lanas Phillips, Irena Gao, Tony Lee, Etienne David, Ian Stavness, Wei Guo, Berton A. Earnshaw, Imran S. Haque, Sara Beery, Jure Leskovec, Anshul Kundaje, Emma Pierson, Sergey Levine, Chelsea Finn, and Percy Liang. Please also cite the original papers that introduce the datasets, as listed on the [datasets page](https://wilds.stanford.edu/datasets/). From 4df91230856fb49173e2bc4433b6b8364a38df2c Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 14:15:49 -0800 Subject: [PATCH 062/116] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 84ca82ff..bf29c3a0 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Alternatively, you can use the standalone `examples/download_datasets.py` script python download_datasets.py --root_dir data ``` -This will download all datasets to the specified `data` folder. You can also pass in the `--datasets` argument to only download specified datasets. +This will download all datasets to the specified `data` folder. 
You can also use the `--datasets` argument to download particular datasets. These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single run using a NVIDIA V100 GPU. From b932182d5547795f3ec971cf77d091608d9e92ee Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 14:24:16 -0800 Subject: [PATCH 063/116] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bf29c3a0..29876969 100644 --- a/README.md +++ b/README.md @@ -71,9 +71,8 @@ To run these scripts, you will need to install these additional dependencies: All baseline experiments in the paper were run on Python 3.8.5 and CUDA 10.1. -## Usage +## Using the example scripts -### Example scripts with default models and dataset downloading In the `examples/` folder, we provide a set of scripts that can be used to download WILDS datasets and train models on them. These scripts are configured with the default models and hyperparameters that we used for all of the baselines described in our paper. All baseline results in the paper can be easily replicated with commands like: @@ -111,6 +110,10 @@ These are the sizes of each of our datasets, as well as their approximate time t | Amazon | | | | | Py150 | 0.1 | 0.8 | | +We have an [executable version](https://worksheets.codalab.org/worksheets/0x52cea64d1d3f4fa89de326b4e31aa50a) of our paper on CodaLab that contains all of the output of This contains the exact commands, code, and data used for each experiment reported in our paper. The trained model weights for every experiment can also be found there. + + +## Using the WILDS package ### Data loading The WILDS package provides a simple, standardized interface for all datasets in the benchmark. From 2cddfcefc3fff47c75cb96a16ca042848879b5d1 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 14:26:26 -0800 Subject: [PATCH 064/116] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 29876969..3310efff 100644 --- a/README.md +++ b/README.md @@ -42,15 +42,15 @@ pip install -e . ### Requirements - numpy>=1.19.1 +- ogb>=1.2.5 +- outdated>=0.2.0 - pandas>=1.1.0 - pillow>=7.2.0 -- torch>=1.7.0 -- tqdm>=4.53.0 - pytz>=2020.4 -- outdated>=0.2.0 -- ogb>=1.2.5 +- torch>=1.7.0 - torch-scatter>=2.0.5 - torch-geometric>=1.6.1 +- tqdm>=4.53.0 Running `pip install wilds` or `pip install -e .` will automatically check for and install all of these requirements except for the `torch-scatter` and `torch-geometric` packages, which require a [quick manual install](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html#installation-via-binaries). 
From b1460936a3eace81224dd73a11faab994c8f07df Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Tue, 2 Mar 2021 21:39:15 -0800 Subject: [PATCH 065/116] Set default CORAL penalty weight to 1.0 --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index c6fb09f3..c9b3241c 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -13,7 +13,7 @@ 'n_epochs': 3, 'n_groups_per_batch': 2, 'irm_lambda': 1.0, - 'coral_penalty_weight': 0.1, + 'coral_penalty_weight': 1.0, 'loader_kwargs': { 'num_workers': 1, 'pin_memory': True, From 362ad534cd238b1cc281e85dc2bc9014175a6284 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 22:04:42 -0800 Subject: [PATCH 066/116] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 3310efff..9f25b4be 100644 --- a/README.md +++ b/README.md @@ -99,16 +99,16 @@ This will download all datasets to the specified `data` folder. You can also use These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single run using a NVIDIA V100 GPU. -| Dataset | Download size (GB) | Size on disk (GB) | Train+eval time | -|---------------|--------------------|-------------------|-----------------| -| iWildCam | 11 | 25 | | -| Camelyon17 | 10 | 15 | | -| MolPCBA | 0.04 | 2 | | -| CivilComments | 0.1 | 0.3 | | -| FMoW | 50 | 55 | | -| PovertyMap | 12 | 14 | | -| Amazon | | | | -| Py150 | 0.1 | 0.8 | | +| Dataset command | Download size (GB) | Size on disk (GB) | Train+eval time | +|-----------------|--------------------|-------------------|-----------------| +| iwildcam | 11 | 25 | | +| camelyon17 | 10 | 15 | | +| ogb-molpcba | 0.04 | 2 | | +| civilcomments | 0.1 | 0.3 | | +| fmow | 50 | 55 | | +| poverty | 12 | 14 | | +| amazon | | | | +| py150 | 0.1 | 0.8 | | We have an [executable version](https://worksheets.codalab.org/worksheets/0x52cea64d1d3f4fa89de326b4e31aa50a) of our paper on CodaLab that contains all of the output of This contains the exact commands, code, and data used for each experiment reported in our paper. The trained model weights for every experiment can also be found there. From 2c34a4abc83e4b01f8b77ce87f2a1bbe145d1d35 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Tue, 2 Mar 2021 23:10:55 -0800 Subject: [PATCH 067/116] fix --- examples/models/initializer.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/models/initializer.py b/examples/models/initializer.py index d1f9824c..4d414763 100644 --- a/examples/models/initializer.py +++ b/examples/models/initializer.py @@ -8,23 +8,23 @@ from models.code_gpt import GPT2LMHeadLogit, GPT2FeaturizerLMHeadLogit from transformers import GPT2Tokenizer -def initialize_model(config, d_out, featurizer=False): +def initialize_model(config, d_out, is_featurizer=False): """ Initializes models according to the config Args: - config (dictionary): config dictionary - d_out (int): the dimensionality of the model output - - featurizer (bool): whether to return a model or a (featurizer, classifier) pair that constitutes a model. + - is_featurizer (bool): whether to return a model or a (featurizer, classifier) pair that constitutes a model. 
Output: - If featurizer=True: + If is_featurizer=True: - featurizer: a model that outputs feature Tensors of shape (batch_size, ..., feature dimensionality) - classifier: a model that takes in feature Tensors and outputs predictions. In most cases, this is a linear layer. - If featurizer=False: + If is_featurizer=False: - model: a model that is equivalent to nn.Sequential(featurizer, classifier) """ if config.model in ('resnet50', 'resnet34', 'wideresnet50', 'densenet121'): - if featurizer: + if is_featurizer: featurizer = initialize_torchvision_model( name=config.model, d_out=None, @@ -37,21 +37,21 @@ def initialize_model(config, d_out, featurizer=False): d_out=d_out, **config.model_kwargs) elif 'bert' in config.model: - if featurizer: - featurizer = initialize_bert_based_model(config, d_out, featurizer) + if is_featurizer: + featurizer = initialize_bert_based_model(config, d_out, is_featurizer) classifier = nn.Linear(featurizer.d_out, d_out) model = (featurizer, classifier) else: model = initialize_bert_based_model(config, d_out) elif config.model == 'resnet18_ms': # multispectral resnet 18 - if featurizer: + if is_featurizer: featurizer = ResNet18(num_classes=None, **config.model_kwargs) classifier = nn.Linear(featurizer.d_out, d_out) model = (featurizer, classifier) else: model = ResNet18(num_classes=d_out, **config.model_kwargs) elif config.model == 'gin-virtual': - if featurizer: + if is_featurizer: featurizer = GINVirtual(num_tasks=None, **config.model_kwargs) classifier = nn.Linear(featurizer.d_out, d_out) model = (featurizer, classifier) @@ -60,7 +60,7 @@ def initialize_model(config, d_out, featurizer=False): elif config.model == 'code-gpt-py': name = 'microsoft/CodeGPT-small-py' tokenizer = GPT2Tokenizer.from_pretrained(name) - if featurizer: + if is_featurizer: model = GPT2FeaturizerLMHeadLogit.from_pretrained(name) model.resize_token_embeddings(len(tokenizer)) featurizer = model.transformer @@ -70,7 +70,7 @@ def initialize_model(config, d_out, featurizer=False): model = GPT2LMHeadLogit.from_pretrained(name) model.resize_token_embeddings(len(tokenizer)) elif config.model == 'logistic_regression': - assert not featurizer, "Featurizer not supported for logistic regression" + assert not is_featurizer, "Featurizer not supported for logistic regression" model = nn.Linear(out_features=d_out, **config.model_kwargs) else: raise ValueError(f'Model: {config.model} not recognized.') From 7bbb89eeac3a9af7961d8584eb54422526512467 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Tue, 2 Mar 2021 23:37:04 -0800 Subject: [PATCH 068/116] relaxed optimizer restriction --- examples/configs/datasets.py | 2 +- examples/optimizer.py | 10 +++++----- setup.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index c9b3241c..fff4e6df 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -104,7 +104,7 @@ 'split_scheme': 'official', 'dataset_kwargs': { 'oracle_training_set': False, - 'seed':111, + 'seed': 111, 'use_ood_val': True }, 'model': 'densenet121', diff --git a/examples/optimizer.py b/examples/optimizer.py index 931fc882..bc390394 100644 --- a/examples/optimizer.py +++ b/examples/optimizer.py @@ -2,9 +2,6 @@ from transformers import AdamW def initialize_optimizer(config, model): - if 'bert' in config.model: - assert config.optimizer=='AdamW', 'Only AdamW is supported for BERT-based models' - # initialize optimizers if config.optimizer=='SGD': params = filter(lambda p: p.requires_grad, 
model.parameters()) @@ -14,8 +11,11 @@ def initialize_optimizer(config, model): weight_decay=config.weight_decay, **config.optimizer_kwargs) elif config.optimizer=='AdamW': - assert 'bert' in config.model or 'gpt' in config.model, "Only BERT-based models and GPT supported for AdamW" - no_decay = ['bias', 'LayerNorm.weight'] + if 'bert' in config.model or 'gpt' in config.model: + no_decay = ['bias', 'LayerNorm.weight'] + else: + no_decay = [] + params = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} diff --git a/setup.py b/setup.py index ab7c7c98..c9c397d0 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'scikit-learn>=0.20.0', 'pillow>=7.2.0', 'torch>=1.7.0', - 'ogb>=1.2.3', + 'ogb>=1.2.5', 'tqdm>=4.53.0', 'outdated>=0.2.0', 'pytz>=2020.4', From a528a6561b13f72c51b9c174f5497112dddf6661 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 23:41:15 -0800 Subject: [PATCH 069/116] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f25b4be..93eea7e2 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ python download_datasets.py --root_dir data This will download all datasets to the specified `data` folder. You can also use the `--datasets` argument to download particular datasets. -These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single run using a NVIDIA V100 GPU. +These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single ERM run using a NVIDIA V100 GPU. | Dataset command | Download size (GB) | Size on disk (GB) | Train+eval time | |-----------------|--------------------|-------------------|-----------------| From ba4b55a63ff203fd67ce4eb04286f60de50c84ca Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Tue, 2 Mar 2021 23:41:59 -0800 Subject: [PATCH 070/116] Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 93eea7e2..03ef5b01 100644 --- a/README.md +++ b/README.md @@ -99,16 +99,16 @@ This will download all datasets to the specified `data` folder. You can also use These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single ERM run using a NVIDIA V100 GPU. 
-| Dataset command | Download size (GB) | Size on disk (GB) | Train+eval time | -|-----------------|--------------------|-------------------|-----------------| -| iwildcam | 11 | 25 | | -| camelyon17 | 10 | 15 | | -| ogb-molpcba | 0.04 | 2 | | -| civilcomments | 0.1 | 0.3 | | -| fmow | 50 | 55 | | -| poverty | 12 | 14 | | -| amazon | | | | -| py150 | 0.1 | 0.8 | | +| Dataset command | Download size (GB) | Size on disk (GB) | Train+eval time (h) | +|-----------------|--------------------|-------------------|---------------------| +| iwildcam | 11 | 25 | | +| camelyon17 | 10 | 15 | | +| ogb-molpcba | 0.04 | 2 | | +| civilcomments | 0.1 | 0.3 | | +| fmow | 50 | 55 | | +| poverty | 12 | 14 | | +| amazon | | | | +| py150 | 0.1 | 0.8 | | We have an [executable version](https://worksheets.codalab.org/worksheets/0x52cea64d1d3f4fa89de326b4e31aa50a) of our paper on CodaLab that contains all of the output of This contains the exact commands, code, and data used for each experiment reported in our paper. The trained model weights for every experiment can also be found there. From 285705002f36b784f84ac8599ee03ae397419a1c Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Wed, 3 Mar 2021 13:44:56 -0800 Subject: [PATCH 071/116] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 03ef5b01..d08c8e12 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,8 @@ These are the sizes of each of our datasets, as well as their approximate time t | amazon | | | | | py150 | 0.1 | 0.8 | | +The image datasets (iwildcam, camelyon17, fmow, and poverty) tend to have high disk I/O usage. If training time is much slower for you than the approximate times listed above, consider checking if I/O is a bottleneck (e.g., by moving to a local disk if you are using a network drive, or by increasing the number of data loader workers). To speed up training, you could also disable evaluation at each epoch or for all splits by toggling `--evaluate_all_splits` and related arguments. + We have an [executable version](https://worksheets.codalab.org/worksheets/0x52cea64d1d3f4fa89de326b4e31aa50a) of our paper on CodaLab that contains all of the output of This contains the exact commands, code, and data used for each experiment reported in our paper. The trained model weights for every experiment can also be found there. From a1eecd59df267dd0fdd93b8434bc2abd8b8bb986 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Wed, 3 Mar 2021 16:31:01 -0800 Subject: [PATCH 072/116] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d08c8e12..62fad627 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ These are the sizes of each of our datasets, as well as their approximate time t The image datasets (iwildcam, camelyon17, fmow, and poverty) tend to have high disk I/O usage. If training time is much slower for you than the approximate times listed above, consider checking if I/O is a bottleneck (e.g., by moving to a local disk if you are using a network drive, or by increasing the number of data loader workers). To speed up training, you could also disable evaluation at each epoch or for all splits by toggling `--evaluate_all_splits` and related arguments. -We have an [executable version](https://worksheets.codalab.org/worksheets/0x52cea64d1d3f4fa89de326b4e31aa50a) of our paper on CodaLab that contains all of the output of This contains the exact commands, code, and data used for each experiment reported in our paper. 
The trained model weights for every experiment can also be found there. +We have an [executable version](https://wilds.stanford.edu/codalab) of our paper on CodaLab that contains the exact commands, code, and data used for the experiments reported in our paper. Trained model weights for all datasets can also be found there. ## Using the WILDS package From 9494e58e6933b2b3cd35cbd1d5abca5e8913e434 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Wed, 3 Mar 2021 21:29:33 -0800 Subject: [PATCH 073/116] sqf version fix --- wilds/datasets/sqf_dataset.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index b85f06ae..e910786a 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -62,14 +62,18 @@ class SQFDataset(WILDSDataset): The original data frmo the NYPD is in the public domain. The cleaned data from Goel, Rao, and Shroff is shared with permission. """ - def __init__(self, root_dir, download, split_scheme): + _dataset_name = 'sqf' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xea27fd7daef642d2aa95b02f1e3ac404/contents/blob/', + 'compressed_size': None}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='all_race'): # set variables - self._dataset_name = 'sqf' - self._version = '1.0' + self._version = version self._split_scheme = split_scheme self._y_size = 1 self._n_classes = 2 - self._download_url = 'https://worksheets.codalab.org/rest/bundles/0xea27fd7daef642d2aa95b02f1e3ac404/contents/blob/' # path self._data_dir = self.initialize_data_dir(root_dir, download) From e64f04b75a72f7d87b08d7dd265105b8d906f076 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Wed, 3 Mar 2021 22:09:20 -0800 Subject: [PATCH 074/116] update eval input signatures --- examples/configs/datasets.py | 17 ++++++++++++- examples/configs/supported.py | 13 +++++++--- examples/run_expt.py | 1 + examples/train.py | 19 +++++++------- wilds/common/metrics/all_metrics.py | 33 +++++++++++-------------- wilds/datasets/amazon_dataset.py | 24 +++++++++--------- wilds/datasets/bdd100k_dataset.py | 10 ++++---- wilds/datasets/camelyon17_dataset.py | 7 +++--- wilds/datasets/celebA_dataset.py | 6 ++--- wilds/datasets/civilcomments_dataset.py | 24 +++++++++--------- wilds/datasets/fmow_dataset.py | 28 ++++++++++----------- wilds/datasets/iwildcam_dataset.py | 20 +++++++++------ wilds/datasets/ogbmolpcba_dataset.py | 3 ++- wilds/datasets/poverty_dataset.py | 10 +++++--- wilds/datasets/py150_dataset.py | 7 +++--- wilds/datasets/sqf_dataset.py | 23 +++++++++-------- wilds/datasets/waterbirds_dataset.py | 6 ++--- wilds/datasets/yelp_dataset.py | 24 +++++++++--------- 18 files changed, 152 insertions(+), 123 deletions(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index fff4e6df..87a60805 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -18,6 +18,7 @@ 'num_workers': 1, 'pin_memory': True, }, + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'bdd100k': { 'split_scheme': 'official', @@ -32,9 +33,10 @@ 'lr': 0.001, 'weight_decay': 0.0001, 'n_epochs': 10, - 'algo_log_metric': 'multitask_accuracy', + 'algo_log_metric': 'multitask_binary_accuracy', 'train_transform': 'image_base', 'eval_transform': 'image_base', + 'process_outputs_function': 'binary_logits_to_pred', }, 'camelyon17': { 'split_scheme': 'official', @@ -58,6 +60,7 @@ 'irm_lambda': 1.0, 'coral_penalty_weight': 0.1, 
'algo_log_metric': 'accuracy', + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'celebA': { 'split_scheme': 'official', @@ -77,6 +80,7 @@ 'weight_decay': 0.0, 'n_epochs': 200, 'algo_log_metric': 'accuracy', + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'civilcomments': { 'split_scheme': 'official', @@ -99,6 +103,7 @@ 'num_workers': 1, 'pin_memory': True, }, + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'fmow': { 'split_scheme': 'official', @@ -126,6 +131,7 @@ 'irm_lambda': 1.0, 'coral_penalty_weight': 0.1, 'algo_log_metric': 'accuracy', + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'iwildcam': { 'loss_function': 'cross_entropy', @@ -149,6 +155,7 @@ 'irm_lambda': 1., 'coral_penalty_weight': 0.1, 'no_group_logging': True, + 'process_outputs_function': 'multiclass_logits_to_pred' }, 'ogb-molpcba': { 'split_scheme': 'official', @@ -167,6 +174,8 @@ 'irm_lambda': 1., 'coral_penalty_weight': 0.1, 'no_group_logging': True, + 'process_outputs_function': None, + 'algo_log_metric': 'multitask_binary_accuracy', }, 'py150': { 'split_scheme': 'official', @@ -185,6 +194,8 @@ 'irm_lambda': 1., 'coral_penalty_weight': 0.1, 'no_group_logging': True, + 'algo_log_metric': 'multitask_accuracy', + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'poverty': { 'split_scheme': 'official', @@ -213,6 +224,7 @@ 'n_groups_per_batch': 8, 'irm_lambda': 1.0, 'coral_penalty_weight': 10, + 'process_outputs_function': None, }, 'waterbirds': { 'split_scheme': 'official', @@ -233,6 +245,7 @@ 'lr': 1e-5, 'weight_decay': 1.0, 'n_epochs': 300, + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'yelp': { 'split_scheme': 'official', @@ -247,6 +260,7 @@ 'weight_decay': 0.01, 'n_epochs': 3, 'n_groups_per_batch': 2, + 'process_outputs_function': 'multiclass_logits_to_pred', }, 'sqf': { 'split_scheme': 'all_race', @@ -266,6 +280,7 @@ 'lr': 5e-5, 'weight_decay': 0, 'n_epochs': 4, + 'process_outputs_function': None, }, } diff --git a/examples/configs/supported.py b/examples/configs/supported.py index ec6e8b9a..fd68b4bd 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -17,7 +17,7 @@ from wilds.datasets.py150_dataset import Py150Dataset # metrics from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss -from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE +from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE, multiclass_logits_to_pred, binary_logits_to_pred benchmark_datasets = [ 'amazon', @@ -53,9 +53,16 @@ } algo_log_metrics = { - 'accuracy': Accuracy(), + 'accuracy': Accuracy(prediction_fn=multiclass_logits_to_pred), 'mse': MSE(), - 'multitask_accuracy': MultiTaskAccuracy(), + 'multitask_accuracy': MultiTaskAccuracy(prediction_fn=multiclass_logits_to_pred), + 'multitask_binary_accuracy': MultiTaskAccuracy(prediction_fn=binary_logits_to_pred), + None: None, +} + +process_outputs_functions = { + 'binary_logits_to_pred': binary_logits_to_pred, + 'multiclass_logits_to_pred': multiclass_logits_to_pred, None: None, } diff --git a/examples/run_expt.py b/examples/run_expt.py index 7f6d3101..031c8252 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -88,6 +88,7 @@ def main(): parser.add_argument('--scheduler_metric_name') # Evaluation + parser.add_argument('--process_outputs_function', choices = supported.process_outputs_functions) parser.add_argument('--evaluate_all_splits', type=parse_bool, const=True, nargs='?', default=True) 
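The `process_outputs_functions` registry and the `--process_outputs_function` flag added just above are wired through `train.py` in the hunks that follow, replacing the earlier dim-3 special case for language-model outputs. A minimal self-contained sketch of the dispatch, with the two helpers inlined rather than imported from `wilds.common.metrics.all_metrics`:

```python
import torch

# Inlined copies of the helpers this patch adds to all_metrics.py
def multiclass_logits_to_pred(logits):
    # (batch_size, ..., n_classes) -> argmax over the last dimension
    return logits.argmax(-1)

def binary_logits_to_pred(logits):
    return (logits > 0).long()

process_outputs_functions = {
    'binary_logits_to_pred': binary_logits_to_pred,
    'multiclass_logits_to_pred': multiclass_logits_to_pred,
    None: None,
}

# As in train.py: convert raw outputs to predictions before accumulating,
# so huge (batch, seqlen, vocab) logit tensors are never stored for LMs
y_pred = torch.randn(8, 5)  # toy multi-class logits
fn = process_outputs_functions['multiclass_logits_to_pred']
if fn is not None:
    y_pred = fn(y_pred)     # shape (8,)
```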
parser.add_argument('--eval_splits', nargs='+', default=[]) parser.add_argument('--eval_only', type=parse_bool, const=True, nargs='?', default=False) diff --git a/examples/train.py b/examples/train.py index f597e401..3a754214 100644 --- a/examples/train.py +++ b/examples/train.py @@ -3,6 +3,7 @@ import torch from utils import save import torch.autograd.profiler as profiler +from configs.supported import process_outputs_functions def log_results(algorithm, dataset, general_logger, epoch, batch_idx): if algorithm.has_log: @@ -49,11 +50,10 @@ def run_epoch(algorithm, dataset, general_logger, epoch, config, train): # The subsequent detach is just for safety # (they should already be detached in batch_results) epoch_y_true.append(batch_results['y_true'].clone().detach()) - if batch_results['y_pred'].dim() == 3: - #language model preds have a very big vocab size (e.g. 50000), so need to do argmax here. otherwise get OOM - epoch_y_pred.append(batch_results['y_pred'].clone().detach().argmax(-1)) - else: - epoch_y_pred.append(batch_results['y_pred'].clone().detach()) + y_pred = batch_results['y_pred'].clone().detach() + if config.process_outputs_function is not None: + y_pred = process_outputs_functions[config.process_outputs_function](y_pred) + epoch_y_pred.append(y_pred) epoch_metadata.append(batch_results['metadata'].clone().detach()) if train and (batch_idx+1) % config.log_every==0: @@ -139,11 +139,10 @@ def evaluate(algorithm, datasets, epoch, general_logger, config): for batch in iterator: batch_results = algorithm.evaluate(batch) epoch_y_true.append(batch_results['y_true'].clone().detach()) - if batch_results['y_pred'].dim() == 3: - #language model preds have a very big vocab size (e.g. 50000), so need to do argmax here. otherwise get OOM - epoch_y_pred.append(batch_results['y_pred'].clone().detach().argmax(-1)) - else: - epoch_y_pred.append(batch_results['y_pred'].clone().detach()) + y_pred = batch_results['y_pred'].clone().detach() + if config.process_outputs_function is not None: + y_pred = process_outputs_functions[config.process_outputs_function](y_pred) + epoch_y_pred.append(y_pred) epoch_metadata.append(batch_results['metadata'].clone().detach()) results, results_str = dataset['dataset'].eval( diff --git a/wilds/common/metrics/all_metrics.py b/wilds/common/metrics/all_metrics.py index 146b96a3..0f5d7eb1 100644 --- a/wilds/common/metrics/all_metrics.py +++ b/wilds/common/metrics/all_metrics.py @@ -8,7 +8,7 @@ import sklearn.metrics from scipy.stats import pearsonr -def logits_to_score(logits): +def binary_logits_to_score(logits): assert logits.dim() in (1,2) if logits.dim()==2: #multi-class logits assert logits.size(1)==2, "Only binary classification" @@ -17,22 +17,19 @@ def logits_to_score(logits): score = logits return score -def logits_to_pred(logits): - assert logits.dim() in (1,2) - if logits.dim()==2: #multi-class logits - pred = torch.argmax(logits, 1) - else: - pred = (logits>0).long() - return pred - -def logits_to_binary_pred(logits): - assert logits.dim() in (1,2) - pred = (logits>0).long() - return pred +def multiclass_logits_to_pred(logits): + """ + Takes multi-class logits of size (batch_size, ..., n_classes) and returns predictions + by taking an argmax at the last dimension + """ + assert logits.dim() > 1 + return logits.argmax(-1) +def binary_logits_to_pred(logits): + return (logits>0).long() class Accuracy(ElementwiseMetric): - def __init__(self, prediction_fn=logits_to_pred, name=None): + def __init__(self, prediction_fn=None, name=None): self.prediction_fn = 
prediction_fn if name is None: name = 'acc' @@ -47,7 +44,7 @@ def worst(self, metrics): return minimum(metrics) class MultiTaskAccuracy(MultiTaskMetric): - def __init__(self, prediction_fn=logits_to_binary_pred, name=None): + def __init__(self, prediction_fn=None, name=None): self.prediction_fn = prediction_fn # should work on flattened inputs if name is None: name = 'acc' @@ -62,7 +59,7 @@ def worst(self, metrics): return minimum(metrics) class Recall(Metric): - def __init__(self, prediction_fn=logits_to_pred, name=None, average='binary'): + def __init__(self, prediction_fn=None, name=None, average='binary'): self.prediction_fn = prediction_fn if name is None: name = f'recall' @@ -81,7 +78,7 @@ def worst(self, metrics): return minimum(metrics) class F1(Metric): - def __init__(self, prediction_fn=logits_to_pred, name=None, average='binary'): + def __init__(self, prediction_fn=None, name=None, average='binary'): self.prediction_fn = prediction_fn if name is None: name = f'F1' @@ -131,7 +128,7 @@ def __init__(self, name=None): class PrecisionAtRecall(Metric): """Given a specific model threshold, determine the precision score achieved""" - def __init__(self, threshold, score_fn=logits_to_score, name=None): + def __init__(self, threshold, score_fn=None, name=None): self.score_fn = score_fn self.threshold = threshold if name is None: diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index 991611e3..7fae11b0 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -93,41 +93,41 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self.initialize_split_dicts() # eval self.initialize_eval_grouper() - self._metric = Accuracy() super().__init__(root_dir, download, split_scheme) def get_input(self, idx): return self._input_array[idx] - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) if self.split_scheme=='user': # first compute groupwise accuracies g = self._eval_grouper.metadata_to_group(metadata) results = { - **self._metric.compute(y_pred, y_true), - **self._metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups) + **metric.compute(y_pred, y_true), + **metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups) } accs = [] for group_idx in range(self._eval_grouper.n_groups): group_str = self._eval_grouper.group_field_str(group_idx) - group_metric = results.pop(self._metric.group_metric_field(group_idx)) - group_counts = results.pop(self._metric.group_count_field(group_idx)) - results[f'{self._metric.name}_{group_str}'] = group_metric + group_metric = results.pop(metric.group_metric_field(group_idx)) + group_counts = results.pop(metric.group_count_field(group_idx)) + results[f'{metric.name}_{group_str}'] = group_metric results[f'count_{group_str}'] = group_counts if group_counts>0: accs.append(group_metric) accs = np.array(accs) results['10th_percentile_acc'] = np.percentile(accs, 10) - results[f'{self._metric.worst_group_metric_field}'] = self._metric.worst(accs) + results[f'{metric.worst_group_metric_field}'] = metric.worst(accs) results_str = ( - f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n" - f"10th percentile {self._metric.name}: {results['10th_percentile_acc']:.3f}\n" - f"Worst-group {self._metric.name}: {results[self._metric.worst_group_metric_field]:.3f}\n" + f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n" + 
f"10th percentile {metric.name}: {results['10th_percentile_acc']:.3f}\n" + f"Worst-group {metric.name}: {results[metric.worst_group_metric_field]:.3f}\n" ) return results, results_str else: return self.standard_group_eval( - self._metric, + metric, self._eval_grouper, y_pred, y_true, metadata) diff --git a/wilds/datasets/bdd100k_dataset.py b/wilds/datasets/bdd100k_dataset.py index 31d4f7b7..599450bb 100644 --- a/wilds/datasets/bdd100k_dataset.py +++ b/wilds/datasets/bdd100k_dataset.py @@ -107,14 +107,14 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' split_names = (self.TIMEOFDAY_SPLITS if split_to_load == 'timeofday' else self.LOCATION_SPLITS) self._metadata_map = {split_to_load: split_names} - self._metric = MultiTaskAccuracy() def get_input(self, idx): img = Image.open(self.root / 'images' / self._image_array[idx]) return img - def eval(self, y_pred, y_true, metadata): - results = self._metric.compute(y_pred, y_true) - results_str = (f'{self._metric.name}: ' - f'{results[self._metric.agg_metric_field]:.3f}\n') + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = MultiTaskAccuracy(prediction_fn=prediction_fn) + results = metric.compute(y_pred, y_true) + results_str = (f'{metric.name}: ' + f'{results[metric.agg_metric_field]:.3f}\n') return results, results_str diff --git a/wilds/datasets/camelyon17_dataset.py b/wilds/datasets/camelyon17_dataset.py index f62216db..5d67ae91 100644 --- a/wilds/datasets/camelyon17_dataset.py +++ b/wilds/datasets/camelyon17_dataset.py @@ -123,8 +123,6 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' dataset=self, groupby_fields=['slide']) - self._metric = Accuracy() - super().__init__(root_dir, download, split_scheme) def get_input(self, idx): @@ -137,8 +135,9 @@ def get_input(self, idx): x = Image.open(img_filename).convert('RGB') return x - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) return self.standard_group_eval( - self._metric, + metric, self._eval_grouper, y_pred, y_true, metadata) diff --git a/wilds/datasets/celebA_dataset.py b/wilds/datasets/celebA_dataset.py index 9c7d44ca..63d0e3a6 100644 --- a/wilds/datasets/celebA_dataset.py +++ b/wilds/datasets/celebA_dataset.py @@ -103,7 +103,6 @@ def attr_idx(attr_name): self._eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields=(confounder_names + ['y'])) - self._metric = Accuracy() # Extract splits self._split_scheme = split_scheme @@ -124,8 +123,9 @@ def get_input(self, idx): x = Image.open(img_filename).convert('RGB') return x - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) return self.standard_group_eval( - self._metric, + metric, self._eval_grouper, y_pred, y_true, metadata) diff --git a/wilds/datasets/civilcomments_dataset.py b/wilds/datasets/civilcomments_dataset.py index 82c57adc..fcaf8461 100644 --- a/wilds/datasets/civilcomments_dataset.py +++ b/wilds/datasets/civilcomments_dataset.py @@ -124,18 +124,18 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' dataset=self, groupby_fields=[identity_var, 'y']) for identity_var in self._identity_vars] - self._metric = Accuracy() super().__init__(root_dir, download, split_scheme) def get_input(self, idx): return self._text_array[idx] - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, 
y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) results = { - **self._metric.compute(y_pred, y_true), + **metric.compute(y_pred, y_true), } - results_str = f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n" + results_str = f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n" # Each eval_grouper is over label + a single identity # We only want to keep the groups where the identity is positive # The groups are: @@ -148,31 +148,31 @@ def eval(self, y_pred, y_true, metadata): for identity_var, eval_grouper in zip(self._identity_vars, self._eval_groupers): g = eval_grouper.metadata_to_group(metadata) group_results = { - **self._metric.compute_group_wise(y_pred, y_true, g, eval_grouper.n_groups) + **metric.compute_group_wise(y_pred, y_true, g, eval_grouper.n_groups) } results_str += f" {identity_var:20s}" for group_idx in range(eval_grouper.n_groups): group_str = eval_grouper.group_field_str(group_idx) if f'{identity_var}:1' in group_str: - group_metric = group_results[self._metric.group_metric_field(group_idx)] - group_counts = group_results[self._metric.group_count_field(group_idx)] - results[f'{self._metric.name}_{group_str}'] = group_metric + group_metric = group_results[metric.group_metric_field(group_idx)] + group_counts = group_results[metric.group_count_field(group_idx)] + results[f'{metric.name}_{group_str}'] = group_metric results[f'count_{group_str}'] = group_counts if f'y:0' in group_str: label_str = 'non_toxic' else: label_str = 'toxic' results_str += ( - f" {self._metric.name} on {label_str}: {group_metric:.3f}" + f" {metric.name} on {label_str}: {group_metric:.3f}" f" (n = {results[f'count_{group_str}']:6.0f}) " ) if worst_group_metric is None: worst_group_metric = group_metric else: - worst_group_metric = self._metric.worst( + worst_group_metric = metric.worst( [worst_group_metric, group_metric]) results_str += f"\n" - results[f'{self._metric.worst_group_metric_field}'] = worst_group_metric - results_str += f"Worst-group {self._metric.name}: {worst_group_metric:.3f}\n" + results[f'{metric.worst_group_metric_field}'] = worst_group_metric + results_str += f"Worst-group {metric.name}: {worst_group_metric:.3f}\n" return results, results_str diff --git a/wilds/datasets/fmow_dataset.py b/wilds/datasets/fmow_dataset.py index 42211795..c690d09f 100644 --- a/wilds/datasets/fmow_dataset.py +++ b/wilds/datasets/fmow_dataset.py @@ -172,7 +172,6 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' 'region': CombinatorialGrouper(dataset=self, groupby_fields=['region']), } - self._metric = Accuracy() super().__init__(root_dir, download, split_scheme) def get_input(self, idx): @@ -190,35 +189,36 @@ def get_input(self, idx): return img - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) # Overall evaluation + evaluate by year all_results, all_results_str = self.standard_group_eval( - self._metric, + metric, self._eval_groupers['year'], y_pred, y_true, metadata) # Evaluate by region and ignore the "Other" region region_grouper = self._eval_groupers['region'] - region_results = self._metric.compute_group_wise( + region_results = metric.compute_group_wise( y_pred, y_true, region_grouper.metadata_to_group(metadata), region_grouper.n_groups) - all_results[f'{self._metric.name}_worst_year'] = all_results.pop(self._metric.worst_group_metric_field) + all_results[f'{metric.name}_worst_year'] 
= all_results.pop(metric.worst_group_metric_field) region_metric_list = [] for group_idx in range(region_grouper.n_groups): group_str = region_grouper.group_field_str(group_idx) - group_metric = region_results[self._metric.group_metric_field(group_idx)] - group_counts = region_results[self._metric.group_count_field(group_idx)] - all_results[f'{self._metric.name}_{group_str}'] = group_metric + group_metric = region_results[metric.group_metric_field(group_idx)] + group_counts = region_results[metric.group_count_field(group_idx)] + all_results[f'{metric.name}_{group_str}'] = group_metric all_results[f'count_{group_str}'] = group_counts - if region_results[self._metric.group_count_field(group_idx)] == 0 or "Other" in group_str: + if region_results[metric.group_count_field(group_idx)] == 0 or "Other" in group_str: continue all_results_str += ( f' {region_grouper.group_str(group_idx)} ' - f"[n = {region_results[self._metric.group_count_field(group_idx)]:6.0f}]:\t" - f"{self._metric.name} = {region_results[self._metric.group_metric_field(group_idx)]:5.3f}\n") - region_metric_list.append(region_results[self._metric.group_metric_field(group_idx)]) - all_results[f'{self._metric.name}_worst_region'] = self._metric.worst(region_metric_list) - all_results_str += f"Worst-group {self._metric.name}: {all_results[f'{self._metric.name}_worst_region']:.3f}\n" + f"[n = {region_results[metric.group_count_field(group_idx)]:6.0f}]:\t" + f"{metric.name} = {region_results[metric.group_metric_field(group_idx)]:5.3f}\n") + region_metric_list.append(region_results[metric.group_metric_field(group_idx)]) + all_results[f'{metric.name}_worst_region'] = metric.worst(region_metric_list) + all_results_str += f"Worst-group {metric.name}: {all_results[f'{metric.name}_worst_region']:.3f}\n" return all_results, all_results_str diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 67f7e168..87e22180 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -119,21 +119,26 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' dataset=self, groupby_fields=(['location'])) - self._metrics = [Accuracy(), Recall(average='macro'), F1(average='macro')] super().__init__(root_dir, download, split_scheme) - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metrics = [ + Accuracy(prediction_fn=prediction_fn), + Recall(prediction_fn=prediction_fn, average='macro'), + F1(prediction_fn=prediction_fn, average='macro'), + ] + results = {} - for i in range(len(self._metrics)): + for i in range(len(metrics)): results.update({ - **self._metrics[i].compute(y_pred, y_true), + **metrics[i].compute(y_pred, y_true), }) results_str = ( - f"Average acc: {results[self._metrics[0].agg_metric_field]:.3f}\n" - f"Recall macro: {results[self._metrics[1].agg_metric_field]:.3f}\n" - f"F1 macro: {results[self._metrics[2].agg_metric_field]:.3f}\n" + f"Average acc: {results[metrics[0].agg_metric_field]:.3f}\n" + f"Recall macro: {results[metrics[1].agg_metric_field]:.3f}\n" + f"F1 macro: {results[metrics[2].agg_metric_field]:.3f}\n" ) return results, results_str @@ -150,5 +155,4 @@ def get_input(self, idx): img_path = self.data_dir / 'train' / self._input_array[idx] img = Image.open(img_path) - return img diff --git a/wilds/datasets/ogbmolpcba_dataset.py b/wilds/datasets/ogbmolpcba_dataset.py index 0fbb8e10..13f50412 100644 --- a/wilds/datasets/ogbmolpcba_dataset.py +++ b/wilds/datasets/ogbmolpcba_dataset.py @@ 
-96,7 +96,8 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' def get_input(self, idx): return self.ogb_dataset[int(idx)] - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + assert prediction_fn is None, "OGBPCBADataset.eval() does not support prediction_fn. Only binary logits accepted" input_dict = {"y_true": y_true, "y_pred": y_pred} results = self._metric.eval(input_dict) diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index cde3c437..0ba47f92 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -236,8 +236,6 @@ def __init__(self, version=None, root_dir='data', download=False, dataset=self, groupby_fields=['urban']) - self._metrics = [MSE(), PearsonCorrelation()] - super().__init__(root_dir, download, split_scheme) def get_input(self, idx): @@ -265,10 +263,14 @@ def get_input(self, idx): return img - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + assert prediction_fn is None, "PovertyMapDataset.eval() does not support prediction_fn" + + metrics = [MSE(), PearsonCorrelation()] + all_results = {} all_results_str = '' - for metric in self._metrics: + for metric in metrics: results, results_str = self.standard_group_eval( metric, self._eval_grouper, diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 306e797e..9501009d 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -88,19 +88,20 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self._y_array = self._y_array.float() self._y_array[(1-_mask).bool()] = float('nan') - super().__init__(root_dir, download, split_scheme) - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): #y_pred: [n_samples, seqlen-1] #y_true: [n_samples, seqlen-1] is_labeled = ~torch.isnan(y_true) flattened_y_pred = y_pred[is_labeled] + if prediction_fn is not None: + flattened_y_pred = prediction_fn(flattened_y_pred) flattened_y_true = y_true[is_labeled] assert flattened_y_pred.size() == flattened_y_true.size() and flattened_y_pred.dim() == 1 acc = (flattened_y_pred==flattened_y_true).float().sum() / (len(flattened_y_pred) +1e-8) - results = {'acc': acc} + results = {'acc': acc.item()} results_str = f"Average acc: {results['acc']:.3f}\n" return results, results_str diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index b85f06ae..36694543 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -3,7 +3,7 @@ import pandas as pd import numpy as np from wilds.datasets.wilds_dataset import WILDSDataset -from wilds.common.metrics.all_metrics import Accuracy, PrecisionAtRecall +from wilds.common.metrics.all_metrics import Accuracy, PrecisionAtRecall, binary_logits_to_score, binary_logits_to_pred from wilds.common.grouper import CombinatorialGrouper from wilds.common.utils import subsample_idxs, threshold_at_recall import torch.nn.functional as F @@ -250,21 +250,24 @@ def initialize_split_dicts(self): def get_input(self, idx): return torch.FloatTensor(self._input_array.loc[idx].values) - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=binary_logits_to_pred, score_fn=binary_logits_to_score): """Evaluate the precision achieved overall and across groups for a given global recall""" g = 
self._eval_grouper.metadata_to_group(metadata) - y_scores = F.softmax(y_pred, dim=1)[:,1] + y_scores = score_fn(y_pred) threshold_60 = threshold_at_recall(y_scores, y_true, global_recall=60) - results = Accuracy().compute(y_pred, y_true) - results.update(PrecisionAtRecall(threshold_60).compute(y_pred, y_true)) - results.update(Accuracy().compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) - results.update( - PrecisionAtRecall(threshold_60).compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) + + accuracy_metric = Accuracy(prediction_fn=prediction_fn) + PAR_metric = PrecisionAtRecall(threshold_60) + + results = accuracy_metric.compute(y_pred, y_true) + results.update(PAR_metric.compute(y_pred, y_true)) + results.update(accuracy_metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) + results.update(PAR_metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups)) results_str = ( - f"Average {PrecisionAtRecall(threshold=threshold_60).name }: {results[PrecisionAtRecall(threshold=threshold_60).agg_metric_field]:.3f}\n" - f"Average {Accuracy().name}: {results[Accuracy().agg_metric_field]:.3f}\n" + f"Average {PAR_metric.name}: {results[PAR_metric.agg_metric_field]:.3f}\n" + f"Average {accuracy_metric.name}: {results[accuracy_metric.agg_metric_field]:.3f}\n" ) return results, results_str diff --git a/wilds/datasets/waterbirds_dataset.py b/wilds/datasets/waterbirds_dataset.py index 1fb2c561..b8834688 100644 --- a/wilds/datasets/waterbirds_dataset.py +++ b/wilds/datasets/waterbirds_dataset.py @@ -100,7 +100,6 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self._eval_grouper = CombinatorialGrouper( dataset=self, groupby_fields=(['background', 'y'])) - self._metric = Accuracy() super().__init__(root_dir, download, split_scheme) @@ -114,8 +113,9 @@ def get_input(self, idx): x = Image.open(img_filename).convert('RGB') return x - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) return self.standard_group_eval( - self._metric, + metric, self._eval_grouper, y_pred, y_true, metadata) diff --git a/wilds/datasets/yelp_dataset.py b/wilds/datasets/yelp_dataset.py index a5214e0f..f4ae755c 100644 --- a/wilds/datasets/yelp_dataset.py +++ b/wilds/datasets/yelp_dataset.py @@ -80,41 +80,41 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self.initialize_split_dicts() # eval self.initialize_eval_grouper() - self._metric = Accuracy() super().__init__(root_dir, download, split_scheme) def get_input(self, idx): return self._input_array[idx] - def eval(self, y_pred, y_true, metadata): + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + metric = Accuracy(prediction_fn=prediction_fn) if self.split_scheme=='user': # first compute groupwise accuracies g = self._eval_grouper.metadata_to_group(metadata) results = { - **self._metric.compute(y_pred, y_true), - **self._metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups) + **metric.compute(y_pred, y_true), + **metric.compute_group_wise(y_pred, y_true, g, self._eval_grouper.n_groups) } accs = [] for group_idx in range(self._eval_grouper.n_groups): group_str = self._eval_grouper.group_field_str(group_idx) - group_metric = results.pop(self._metric.group_metric_field(group_idx)) - group_counts = results.pop(self._metric.group_count_field(group_idx)) - results[f'{self._metric.name}_{group_str}'] = group_metric + 
group_metric = results.pop(metric.group_metric_field(group_idx)) + group_counts = results.pop(metric.group_count_field(group_idx)) + results[f'{metric.name}_{group_str}'] = group_metric results[f'count_{group_str}'] = group_counts if group_counts>0: accs.append(group_metric) accs = np.array(accs) results['10th_percentile_acc'] = np.percentile(accs, 10) - results[f'{self._metric.worst_group_metric_field}'] = self._metric.worst(accs) + results[f'{metric.worst_group_metric_field}'] = metric.worst(accs) results_str = ( - f"Average {self._metric.name}: {results[self._metric.agg_metric_field]:.3f}\n" - f"10th percentile {self._metric.name}: {results['10th_percentile_acc']:.3f}\n" - f"Worst-group {self._metric.name}: {results[self._metric.worst_group_metric_field]:.3f}\n" + f"Average {metric.name}: {results[metric.agg_metric_field]:.3f}\n" + f"10th percentile {metric.name}: {results['10th_percentile_acc']:.3f}\n" + f"Worst-group {metric.name}: {results[metric.worst_group_metric_field]:.3f}\n" ) return results, results_str else: return self.standard_group_eval( - self._metric, + metric, self._eval_grouper, y_pred, y_true, metadata) From 329eb897296f730f0def8d53379d697adbdf586b Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Wed, 3 Mar 2021 22:48:41 -0800 Subject: [PATCH 075/116] add documentation --- wilds/datasets/amazon_dataset.py | 13 +++++++++++++ wilds/datasets/bdd100k_dataset.py | 13 +++++++++++++ wilds/datasets/camelyon17_dataset.py | 13 +++++++++++++ wilds/datasets/celebA_dataset.py | 13 +++++++++++++ wilds/datasets/civilcomments_dataset.py | 13 +++++++++++++ wilds/datasets/fmow_dataset.py | 13 +++++++++++++ wilds/datasets/iwildcam_dataset.py | 13 +++++++++++++ wilds/datasets/ogbmolpcba_dataset.py | 12 ++++++++++++ wilds/datasets/poverty_dataset.py | 11 +++++++++++ wilds/datasets/py150_dataset.py | 13 +++++++++++++ wilds/datasets/sqf_dataset.py | 13 +++++++++++++ wilds/datasets/waterbirds_dataset.py | 13 +++++++++++++ wilds/datasets/yelp_dataset.py | 13 +++++++++++++ 13 files changed, 166 insertions(+) diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index 7fae11b0..f50f7771 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -99,6 +99,19 @@ def get_input(self, idx): return self._input_array[idx] def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) if self.split_scheme=='user': # first compute groupwise accuracies diff --git a/wilds/datasets/bdd100k_dataset.py b/wilds/datasets/bdd100k_dataset.py index 599450bb..29f4f16a 100644 --- a/wilds/datasets/bdd100k_dataset.py +++ b/wilds/datasets/bdd100k_dataset.py @@ -113,6 +113,19 @@ def get_input(self, idx): return img def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). 
+ But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = MultiTaskAccuracy(prediction_fn=prediction_fn) results = metric.compute(y_pred, y_true) results_str = (f'{metric.name}: ' diff --git a/wilds/datasets/camelyon17_dataset.py b/wilds/datasets/camelyon17_dataset.py index 5d67ae91..2efeaa41 100644 --- a/wilds/datasets/camelyon17_dataset.py +++ b/wilds/datasets/camelyon17_dataset.py @@ -136,6 +136,19 @@ def get_input(self, idx): return x def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) return self.standard_group_eval( metric, diff --git a/wilds/datasets/celebA_dataset.py b/wilds/datasets/celebA_dataset.py index 63d0e3a6..06fcde93 100644 --- a/wilds/datasets/celebA_dataset.py +++ b/wilds/datasets/celebA_dataset.py @@ -124,6 +124,19 @@ def get_input(self, idx): return x def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) return self.standard_group_eval( metric, diff --git a/wilds/datasets/civilcomments_dataset.py b/wilds/datasets/civilcomments_dataset.py index fcaf8461..c4d6bb8b 100644 --- a/wilds/datasets/civilcomments_dataset.py +++ b/wilds/datasets/civilcomments_dataset.py @@ -131,6 +131,19 @@ def get_input(self, idx): return self._text_array[idx] def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. 
+ - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) results = { **metric.compute(y_pred, y_true), diff --git a/wilds/datasets/fmow_dataset.py b/wilds/datasets/fmow_dataset.py index c690d09f..a67b5c34 100644 --- a/wilds/datasets/fmow_dataset.py +++ b/wilds/datasets/fmow_dataset.py @@ -190,6 +190,19 @@ def get_input(self, idx): return img def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) # Overall evaluation + evaluate by year all_results, all_results_str = self.standard_group_eval( diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 87e22180..746658b9 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -122,6 +122,19 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' super().__init__(root_dir, download, split_scheme) def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metrics = [ Accuracy(prediction_fn=prediction_fn), Recall(prediction_fn=prediction_fn, average='macro'), diff --git a/wilds/datasets/ogbmolpcba_dataset.py b/wilds/datasets/ogbmolpcba_dataset.py index 13f50412..413fd330 100644 --- a/wilds/datasets/ogbmolpcba_dataset.py +++ b/wilds/datasets/ogbmolpcba_dataset.py @@ -97,6 +97,18 @@ def get_input(self, idx): return self.ogb_dataset[int(idx)] def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (FloatTensor): Binary logits from a model + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels. + Only None is supported because OGB Evaluators accept binary logits + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ assert prediction_fn is None, "OGBPCBADataset.eval() does not support prediction_fn. 
Only binary logits accepted" input_dict = {"y_true": y_true, "y_pred": y_pred} results = self._metric.eval(input_dict) diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index 0ba47f92..c9376c9b 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -264,6 +264,17 @@ def get_input(self, idx): return img def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model + - y_true (LongTensor): Ground-truth values + - metadata (Tensor): Metadata + - prediction_fn (function): Only None supported + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ assert prediction_fn is None, "PovertyMapDataset.eval() does not support prediction_fn" metrics = [MSE(), PearsonCorrelation()] diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 9501009d..f460a774 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -91,6 +91,19 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' super().__init__(root_dir, download, split_scheme) def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ #y_pred: [n_samples, seqlen-1] #y_true: [n_samples, seqlen-1] is_labeled = ~torch.isnan(y_true) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index aafc9e8c..b01ab47f 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -255,6 +255,19 @@ def get_input(self, idx): return torch.FloatTensor(self._input_array.loc[idx].values) def eval(self, y_pred, y_true, metadata, prediction_fn=multiclass_logits_to_pred, score_fn=binary_logits_to_score): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are multi-class logits (FloatTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels and score_fn(y_pred) are confidence scores. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ """Evaluate the precision achieved overall and across groups for a given global recall""" g = self._eval_grouper.metadata_to_group(metadata) diff --git a/wilds/datasets/waterbirds_dataset.py b/wilds/datasets/waterbirds_dataset.py index b8834688..9caeb4cb 100644 --- a/wilds/datasets/waterbirds_dataset.py +++ b/wilds/datasets/waterbirds_dataset.py @@ -114,6 +114,19 @@ def get_input(self, idx): return x def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. 
By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) return self.standard_group_eval( metric, diff --git a/wilds/datasets/yelp_dataset.py b/wilds/datasets/yelp_dataset.py index f4ae755c..36e9ea10 100644 --- a/wilds/datasets/yelp_dataset.py +++ b/wilds/datasets/yelp_dataset.py @@ -86,6 +86,19 @@ def get_input(self, idx): return self._input_array[idx] def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ metric = Accuracy(prediction_fn=prediction_fn) if self.split_scheme=='user': # first compute groupwise accuracies From 2c9a8b6f2bdf60da73ca6db630285bd3db9b595d Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Thu, 4 Mar 2021 08:42:35 -0800 Subject: [PATCH 076/116] py150 token type metadata/evaluation --- wilds/datasets/py150_dataset.py | 67 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index f460a774..b1a1fceb 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -44,8 +44,8 @@ class Py150Dataset(WILDSDataset): _dataset_name = 'py150' _versions_dict = { '1.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3441a145a298405a966f7288373349bf/contents/blob/', - 'compressed_size': 154_304_512}} + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x442a0661a84649e69c0a946cc5f84237/contents/blob/', + 'compressed_size': 162_811_706}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): @@ -59,6 +59,8 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' # Load data df = self._load_all_data() + self._TYPE2ID = {'class':0, 'method':1, 'punctuation':2, 'keyword':3, 'builtin':4, 'literal':5, 'other_identifier':6} + self._ID2TYPE = {v: k for k, v in self._TYPE2ID.items()} # Splits data = {} @@ -81,15 +83,23 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self._y_size = None _repo = torch.tensor(df['repo'].values).reshape(-1,1) #[n_samples, 1] - _mask = torch.tensor(list(df['mask'].apply(lambda x: x[1:]).values)) #[n_samples, seqlen-1] - self._metadata_array = _repo - self._metadata_fields = ['repo'] + _tok_type = torch.tensor(list(df['tok_type'].apply(lambda x: x[1:]).values)) #[n_samples, seqlen-1] + length = _tok_type.size(1) + self._metadata_fields = ['repo'] + [f'tok_{i+1}_type' for i in range(length)] + self._metadata_array = torch.cat([_repo, _tok_type], dim=1) self._y_array = self._y_array.float() - self._y_array[(1-_mask).bool()] = 
float('nan') + self._y_array[(_tok_type==-100).bool()] = float('nan') super().__init__(root_dir, download, split_scheme) + def _compute_acc(self, y_pred, y_true, eval_pos): + flattened_y_pred = y_pred[eval_pos] + flattened_y_true = y_true[eval_pos] + assert flattened_y_pred.size()==flattened_y_true.size() and flattened_y_pred.dim()==1 + acc = (flattened_y_pred==flattened_y_true).sum() / (len(flattened_y_pred) +1e-8) + return acc + def eval(self, y_pred, y_true, metadata, prediction_fn=None): """ Computes all evaluation metrics. @@ -99,23 +109,37 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=None): are predicted labels. - y_true (LongTensor): Ground-truth labels - metadata (Tensor): Metadata - - prediction_fn (function): A function that turns y_pred into predicted labels + - prediction_fn (function): A function that turns y_pred into predicted labels Output: - results (dictionary): Dictionary of evaluation metrics - results_str (str): String summarizing the evaluation metrics """ #y_pred: [n_samples, seqlen-1] #y_true: [n_samples, seqlen-1] - is_labeled = ~torch.isnan(y_true) - flattened_y_pred = y_pred[is_labeled] - if prediction_fn is not None: - flattened_y_pred = prediction_fn(flattened_y_pred) - flattened_y_true = y_true[is_labeled] - assert flattened_y_pred.size() == flattened_y_true.size() and flattened_y_pred.dim() == 1 - acc = (flattened_y_pred==flattened_y_true).float().sum() / (len(flattened_y_pred) +1e-8) - - results = {'acc': acc.item()} - results_str = f"Average acc: {results['acc']:.3f}\n" + tok_type = metadata[:, 1:] #[n_samples, seqlen-1] + results = {} + results_str = "" + + #Acc for class & method combined + eval_pos = (tok_type == self._TYPE2ID['class']) | (tok_type == self._TYPE2ID['method']) + acc = self._compute_acc(y_pred, y_true, eval_pos) + results['acc'] = acc + results['Acc (Class-Method)'] = acc + results_str += f"Acc (Class-Method): {acc:.3f}\n" + + #Overall acc + eval_pos = ~torch.isnan(y_true) + acc = self._compute_acc(y_pred, y_true, eval_pos) + results['Acc (Overall)'] = acc + results_str += f"Acc (Overall): {acc:.3f}\n" + + #Acc for each token type + for TYPE, TYPEID in self._TYPE2ID.items(): + eval_pos = (tok_type == TYPEID) + acc = self._compute_acc(y_pred, y_true, eval_pos) + results[f'Acc ({TYPE})'] = acc + results_str += f"Acc ({TYPE}): {acc:.3f}\n" + return results, results_str def get_input(self, idx): @@ -147,13 +171,10 @@ def get_split_name(name): fnames = open(self._data_dir/f'metadata/repo_file_names/{type}.txt').readlines() repo_ids = [fname2repo_id(fname, repo_name2id) for fname in fnames] splits = [get_split_name(type)] * len(inputs) - if type == 'train': - masks = (np.array(inputs) != pad_token_id).astype(int).tolist() - else: - masks = json.load(open(self._data_dir/f'processed/{type}_input_mask.json')) - assert len(repo_ids) == len(inputs) == len(masks) + tok_types = json.load(open(self._data_dir/f'processed/{type}_input_tok_type.json')) + assert len(repo_ids) == len(inputs) == len(tok_types) - _df = pd.DataFrame({'input':inputs, 'mask': masks, 'repo': repo_ids, 'split': splits}) + _df = pd.DataFrame({'input': inputs, 'tok_type': tok_types, 'repo': repo_ids, 'split': splits}) dfs.append(_df) return pd.concat(dfs) From ae1a38f1b239870e8ffe73fe531eacdc0b809cae Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Thu, 4 Mar 2021 08:48:07 -0800 Subject: [PATCH 077/116] py150 config --- examples/configs/datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/configs/datasets.py 
b/examples/configs/datasets.py index 87a60805..7edb3492 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -187,10 +187,10 @@ 'optimizer_kwargs': {'eps':1e-8}, 'lr': 8e-5, 'weight_decay': 0.01, - 'n_epochs': 5, - 'batch_size': 40, + 'n_epochs': 3, + 'batch_size': 6, 'groupby_fields': ['repo',], - 'n_groups_per_batch': 10, + 'n_groups_per_batch': 3, 'irm_lambda': 1., 'coral_penalty_weight': 0.1, 'no_group_logging': True, From dc96ca146be8a881998ff669a1d69479a2966bfb Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Thu, 4 Mar 2021 09:55:42 -0800 Subject: [PATCH 078/116] featurizer initialization --- examples/algorithms/deepCORAL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/algorithms/deepCORAL.py b/examples/algorithms/deepCORAL.py index 11eff952..e82981d4 100644 --- a/examples/algorithms/deepCORAL.py +++ b/examples/algorithms/deepCORAL.py @@ -27,7 +27,7 @@ def __init__(self, config, d_out, grouper, loss, metric, n_train_steps): assert config.uniform_over_groups assert config.distinct_groups # initialize models - featurizer, classifier = initialize_model(config, d_out=d_out, featurizer=True) + featurizer, classifier = initialize_model(config, d_out=d_out, is_featurizer=True) featurizer = featurizer.to(config.device) classifier = classifier.to(config.device) model = torch.nn.Sequential(featurizer, classifier).to(config.device) From 86221ac12db3d2c35a6bd2e738497f5d613a642a Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Thu, 4 Mar 2021 10:01:38 -0800 Subject: [PATCH 079/116] sqf file size --- wilds/datasets/sqf_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wilds/datasets/sqf_dataset.py b/wilds/datasets/sqf_dataset.py index b01ab47f..d7f233c5 100644 --- a/wilds/datasets/sqf_dataset.py +++ b/wilds/datasets/sqf_dataset.py @@ -66,7 +66,7 @@ class SQFDataset(WILDSDataset): _versions_dict = { '1.0': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xea27fd7daef642d2aa95b02f1e3ac404/contents/blob/', - 'compressed_size': None}} + 'compressed_size': 36_708_352}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='all_race'): # set variables @@ -263,7 +263,7 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=multiclass_logits_to_pred are predicted labels and score_fn(y_pred) are confidence scores. 
- y_true (LongTensor): Ground-truth labels - metadata (Tensor): Metadata - - prediction_fn (function): A function that turns y_pred into predicted labels + - prediction_fn (function): A function that turns y_pred into predicted labels Output: - results (dictionary): Dictionary of evaluation metrics - results_str (str): String summarizing the evaluation metrics From b073f3db666a567d031f37dd290cfc066df145af Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Thu, 4 Mar 2021 14:20:48 -0800 Subject: [PATCH 080/116] incorporate PR review comment --- wilds/datasets/py150_dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index b1a1fceb..4523d92a 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -59,7 +59,7 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' # Load data df = self._load_all_data() - self._TYPE2ID = {'class':0, 'method':1, 'punctuation':2, 'keyword':3, 'builtin':4, 'literal':5, 'other_identifier':6} + self._TYPE2ID = {'class':0, 'method':1, 'punctuation':2, 'keyword':3, 'builtin':4, 'literal':5, 'other_identifier':6, 'masked':-100} self._ID2TYPE = {v: k for k, v in self._TYPE2ID.items()} # Splits @@ -85,11 +85,11 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' _repo = torch.tensor(df['repo'].values).reshape(-1,1) #[n_samples, 1] _tok_type = torch.tensor(list(df['tok_type'].apply(lambda x: x[1:]).values)) #[n_samples, seqlen-1] length = _tok_type.size(1) - self._metadata_fields = ['repo'] + [f'tok_{i+1}_type' for i in range(length)] + self._metadata_fields = ['repo'] + [f'tok_{i}_type' for i in range(length)] self._metadata_array = torch.cat([_repo, _tok_type], dim=1) self._y_array = self._y_array.float() - self._y_array[(_tok_type==-100).bool()] = float('nan') + self._y_array[(_tok_type==self._TYPE2ID['masked']).bool()] = float('nan') super().__init__(root_dir, download, split_scheme) @@ -135,6 +135,8 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=None): #Acc for each token type for TYPE, TYPEID in self._TYPE2ID.items(): + if TYPE == 'masked': + continue eval_pos = (tok_type == TYPEID) acc = self._compute_acc(y_pred, y_true, eval_pos) results[f'Acc ({TYPE})'] = acc From 3f957eab00cc78298cef13eed6f3a3382ed43d9e Mon Sep 17 00:00:00 2001 From: Michihiro Yasunaga Date: Thu, 4 Mar 2021 14:26:24 -0800 Subject: [PATCH 081/116] incorporate PR review --- wilds/datasets/py150_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 4523d92a..44f4587e 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -97,7 +97,10 @@ def _compute_acc(self, y_pred, y_true, eval_pos): flattened_y_pred = y_pred[eval_pos] flattened_y_true = y_true[eval_pos] assert flattened_y_pred.size()==flattened_y_true.size() and flattened_y_pred.dim()==1 - acc = (flattened_y_pred==flattened_y_true).sum() / (len(flattened_y_pred) +1e-8) + if len(flattened_y_pred) == 0: + acc = 0 + else: + acc = (flattened_y_pred==flattened_y_true).float().mean().item() return acc def eval(self, y_pred, y_true, metadata, prediction_fn=None): @@ -136,7 +139,7 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=None): #Acc for each token type for TYPE, TYPEID in self._TYPE2ID.items(): if TYPE == 'masked': - continue + continue eval_pos = (tok_type == TYPEID) acc = self._compute_acc(y_pred, 
y_true, eval_pos) results[f'Acc ({TYPE})'] = acc From fa68a6fd3767f1fad6cc92528eddc38f50fdc49d Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Thu, 4 Mar 2021 15:39:01 -0800 Subject: [PATCH 082/116] py150 config --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 7edb3492..4c104dfe 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -190,7 +190,7 @@ 'n_epochs': 3, 'batch_size': 6, 'groupby_fields': ['repo',], - 'n_groups_per_batch': 3, + 'n_groups_per_batch': 2, 'irm_lambda': 1., 'coral_penalty_weight': 0.1, 'no_group_logging': True, From e5d91b7e06de0bdb1156bbb4b52ad85ce255436a Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Thu, 4 Mar 2021 16:46:01 -0800 Subject: [PATCH 083/116] Change validation metrics for FMoW and Poverty --- examples/algorithms/deepCORAL.py | 2 +- examples/configs/datasets.py | 4 ++-- examples/run_expt.py | 3 +++ wilds/datasets/wilds_dataset.py | 4 +++- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/algorithms/deepCORAL.py b/examples/algorithms/deepCORAL.py index 11eff952..e82981d4 100644 --- a/examples/algorithms/deepCORAL.py +++ b/examples/algorithms/deepCORAL.py @@ -27,7 +27,7 @@ def __init__(self, config, d_out, grouper, loss, metric, n_train_steps): assert config.uniform_over_groups assert config.distinct_groups # initialize models - featurizer, classifier = initialize_model(config, d_out=d_out, featurizer=True) + featurizer, classifier = initialize_model(config, d_out=d_out, is_featurizer=True) featurizer = featurizer.to(config.device) classifier = classifier.to(config.device) model = torch.nn.Sequential(featurizer, classifier).to(config.device) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 87a60805..5a040abc 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -118,7 +118,7 @@ 'eval_transform': 'image_base', 'loss_function': 'cross_entropy', 'groupby_fields': ['year',], - 'val_metric': 'acc_avg', + 'val_metric': 'acc_worst_region', 'val_metric_decreasing': False, 'optimizer': 'Adam', 'scheduler': 'StepLR', @@ -211,7 +211,7 @@ 'eval_transform': None, 'loss_function': 'mse', 'groupby_fields': ['country',], - 'val_metric': 'r_all', + 'val_metric': 'r_wg', 'val_metric_decreasing': False, 'algo_log_metric': 'mse', 'optimizer': 'Adam', diff --git a/examples/run_expt.py b/examples/run_expt.py index 031c8252..784be080 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -8,6 +8,9 @@ import sys from collections import defaultdict +# TODO: delete later -Tony +sys.path.insert(1, os.path.join(sys.path[0], '..')) + from wilds.common.data_loaders import get_train_loader, get_eval_loader from wilds.common.grouper import CombinatorialGrouper diff --git a/wilds/datasets/wilds_dataset.py b/wilds/datasets/wilds_dataset.py index 8b02243b..66a8d6e8 100644 --- a/wilds/datasets/wilds_dataset.py +++ b/wilds/datasets/wilds_dataset.py @@ -358,7 +358,9 @@ def initialize_data_dir(self, root_dir, download): filename='archive.tar.gz', remove_finished=True, size=compressed_size) - print(f"It took {(time.time() - start_time) / 60} minutes to download and uncompress the dataset.") + + download_time_in_minutes = (time.time() - start_time) / 60 + print(f"It took {round(download_time_in_minutes, 2)} minutes to download and uncompress the dataset.") except Exception as e: print(f"\n{os.path.join(data_dir, 'archive.tar.gz')} may be corrupted. 
Please try deleting it and rerunning this command.\n") print(f"Exception: ", e) From 2ea1b91cccac199de77d956693210e08f02917aa Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Thu, 4 Mar 2021 22:03:15 -0800 Subject: [PATCH 084/116] URL update --- wilds/datasets/iwildcam_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 8ea8fed8..353e184f 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -42,7 +42,7 @@ class IWildCamDataset(WILDSDataset): 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/', 'compressed_size': 90_094_666_806}, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x95b53cfe322f44a08b70cc638d946422/contents/blob/', + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x5a405f743c4b4c66a16cc09cc3a858ca/contents/blob/', 'compressed_size': 12_000_000_000}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): From 440803e4a0b94003d59c8bb4636eeb9eb2470df1 Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Thu, 4 Mar 2021 22:16:47 -0800 Subject: [PATCH 085/116] remove print statements create split --- dataset_preprocessing/iwildcam/create_split.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py index 1bb9f065..660899db 100644 --- a/dataset_preprocessing/iwildcam/create_split.py +++ b/dataset_preprocessing/iwildcam/create_split.py @@ -90,10 +90,6 @@ def _create_split(data_dir, seed, skip=True): n_test_locations = int(frac_test_locations * n_locations) n_train_locations = n_locations - n_val_locations - n_test_locations - print("n val locations", n_val_locations) - print("n test locations", n_test_locations) - print("n train locations", n_train_locations) - np_rng.shuffle(locations) # Shuffle, then split train_locations, val_trans_locations = locations[:n_train_locations], locations[n_train_locations:(n_train_locations+n_val_locations)] test_trans_locations = locations[(n_train_locations+n_val_locations):] @@ -167,13 +163,6 @@ def _create_split(data_dir, seed, skip=True): for split_df in [val_cis_df, val_trans_df, test_cis_df, test_trans_df]: assert not check_overlap(train_df, split_df) - print("val trans df : ", len(val_trans_df)) - print("test trans df : ", len(test_trans_df)) - - print("val cis df : ", len(val_cis_df)) - print("test cis df : ", len(test_cis_df)) - print("train cis df : ", len(train_df)) - return train_df, val_cis_df, val_trans_df, test_cis_df, test_trans_df def remove(dfs): From 6e0c6d4b61cd5e0844e4fbb39ab2090c76a90ad1 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Fri, 5 Mar 2021 08:19:02 -0800 Subject: [PATCH 086/116] Update weight decay for py150 --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 4c104dfe..061625ca 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -186,7 +186,7 @@ 'optimizer': 'AdamW', 'optimizer_kwargs': {'eps':1e-8}, 'lr': 8e-5, - 'weight_decay': 0.01, + 'weight_decay': 0., 'n_epochs': 3, 'batch_size': 6, 'groupby_fields': ['repo',], From 950c5c73ff1ab4c2c61dc9f0c9497f1bd4cf29e1 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Fri, 5 Mar 2021 08:40:23 -0800 Subject: [PATCH 087/116] Update coral penalty weight for Poverty 
--- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 5a040abc..d2477035 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -223,7 +223,7 @@ 'n_epochs': 200, 'n_groups_per_batch': 8, 'irm_lambda': 1.0, - 'coral_penalty_weight': 10, + 'coral_penalty_weight': 0.1, 'process_outputs_function': None, }, 'waterbirds': { From f252f1aa259870ed1381e36d1467849b729bdd20 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Fri, 5 Mar 2021 08:59:30 -0800 Subject: [PATCH 088/116] fix path --- examples/run_expt.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/run_expt.py b/examples/run_expt.py index 784be080..031c8252 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -8,9 +8,6 @@ import sys from collections import defaultdict -# TODO: delete later -Tony -sys.path.insert(1, os.path.join(sys.path[0], '..')) - from wilds.common.data_loaders import get_train_loader, get_eval_loader from wilds.common.grouper import CombinatorialGrouper From 0b5115eac3abfa2fef5d0ff98c78ce02be7a4b22 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Fri, 5 Mar 2021 18:03:25 -0800 Subject: [PATCH 089/116] Updated learning rate for iWildCam --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 1b08845b..34dcb474 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -143,7 +143,7 @@ 'val_metric_decreasing': False, 'algo_log_metric': 'accuracy', 'model': 'resnet50', - 'lr': 3e-5, + 'lr': 1e-4, 'weight_decay': 0.0, 'batch_size': 16, 'n_epochs': 12, From abd545956ffbf6dd31d2835e937599de682f6885 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Fri, 5 Mar 2021 18:10:01 -0800 Subject: [PATCH 090/116] Fix path --- examples/run_expt.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/run_expt.py b/examples/run_expt.py index 784be080..031c8252 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -8,9 +8,6 @@ import sys from collections import defaultdict -# TODO: delete later -Tony -sys.path.insert(1, os.path.join(sys.path[0], '..')) - from wilds.common.data_loaders import get_train_loader, get_eval_loader from wilds.common.grouper import CombinatorialGrouper From 2dfc9a0c7ca8d33e35a899025ae6de256e04c6af Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Fri, 5 Mar 2021 22:53:52 -0800 Subject: [PATCH 091/116] Update coral penalty weight for Py150 --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 35217ef1..c819c573 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -192,7 +192,7 @@ 'groupby_fields': ['repo',], 'n_groups_per_batch': 2, 'irm_lambda': 1., - 'coral_penalty_weight': 0.1, + 'coral_penalty_weight': 1., 'no_group_logging': True, 'algo_log_metric': 'multitask_accuracy', 'process_outputs_function': 'multiclass_logits_to_pred', From 75dd11e83e77cc6dd0e6d9c9ab0b28ce933b754a Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sat, 6 Mar 2021 07:40:30 -0800 Subject: [PATCH 092/116] Update coral penalty weight for iWildCam --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index 34dcb474..57eba16e 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -153,7 +153,7 @@ 
'groupby_fields': ['location',], 'n_groups_per_batch': 2, 'irm_lambda': 1., - 'coral_penalty_weight': 0.1, + 'coral_penalty_weight': 1., 'no_group_logging': True, 'process_outputs_function': 'multiclass_logits_to_pred' }, From 05a80187c5a9937a28eceb9f09f62a1d09b6aad9 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Sat, 6 Mar 2021 15:46:33 -0800 Subject: [PATCH 093/116] Update README.md --- README.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 62fad627..cff34ecf 100644 --- a/README.md +++ b/README.md @@ -99,18 +99,20 @@ This will download all datasets to the specified `data` folder. You can also use These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single ERM run using an NVIDIA V100 GPU. -| Dataset command | Download size (GB) | Size on disk (GB) | Train+eval time (h) | -|-----------------|--------------------|-------------------|---------------------| -| iwildcam | 11 | 25 | | -| camelyon17 | 10 | 15 | | -| ogb-molpcba | 0.04 | 2 | | -| civilcomments | 0.1 | 0.3 | | -| fmow | 50 | 55 | | -| poverty | 12 | 14 | | -| amazon | | | | -| py150 | 0.1 | 0.8 | | - -The image datasets (iwildcam, camelyon17, fmow, and poverty) tend to have high disk I/O usage. If training time is much slower for you than the approximate times listed above, consider checking if I/O is a bottleneck (e.g., by moving to a local disk if you are using a network drive, or by increasing the number of data loader workers). To speed up training, you could also disable evaluation at each epoch or for all splits by toggling `--evaluate_all_splits` and related arguments. +| Dataset command | Modality | Download size (GB) | Size on disk (GB) | Train+eval time | +|-----------------|----------|--------------------|-------------------|-----------------| +| iwildcam | Image | 11 | 25 | | +| camelyon17 | Image | 10 | 15 | | +| ogb-molpcba | Graph | 0.04 | 2 | | +| civilcomments | Text | 0.1 | 0.3 | | +| fmow | Image | 50 | 55 | | +| poverty | Image | 12 | 14 | | +| amazon | Text | | | | +| py150 | Text | 0.1 | 0.8 | | + +While the `camelyon17` dataset is small and fast to train on, we advise against using it as the only dataset to prototype methods on, as the test performance of models trained on this dataset tends to exhibit a large degree of variability over random seeds. + +The image datasets (`iwildcam`, `camelyon17`, `fmow`, and `poverty`) tend to have high disk I/O usage. If training time is much slower for you than the approximate times listed above, consider checking if I/O is a bottleneck (e.g., by moving to a local disk if you are using a network drive, or by increasing the number of data loader workers). To speed up training, you could also disable evaluation at each epoch or for all splits by toggling `--evaluate_all_splits` and related arguments. We have an [executable version](https://wilds.stanford.edu/codalab) of our paper on CodaLab that contains the exact commands, code, and data used for the experiments reported in our paper. Trained model weights for all datasets can also be found there.
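For readers following the eval() refactors in this series, the signature documented in the patches above can be exercised end to end as in the sketch below. This is an illustrative example, not part of any patch: it assumes the iWildCam data is already on disk under data/, uses random logits as a stand-in for a trained model's outputs, and uses a plain argmax lambda where a helper such as multiclass_logits_to_pred would normally serve as the prediction_fn.

    import torch
    from wilds.datasets.iwildcam_dataset import IWildCamDataset

    # Sketch: exercise the documented eval() signature with stand-in logits.
    dataset = IWildCamDataset(root_dir='data', download=False)
    test_data = dataset.get_subset('test')

    y_pred = torch.randn(len(test_data), dataset.n_classes)  # stand-in logits
    y_true = test_data.y_array                               # ground-truth labels
    metadata = test_data.metadata_array                      # per-example metadata

    # prediction_fn maps raw model outputs to predicted labels before the
    # Accuracy / Recall / F1 metrics are computed.
    results, results_str = dataset.eval(
        y_pred, y_true, metadata,
        prediction_fn=lambda logits: logits.argmax(dim=-1))
    print(results_str)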
From 585e81d424249d4208df844a957c1a485d1dec9a Mon Sep 17 00:00:00 2001 From: Henrik Marklund Date: Sat, 6 Mar 2021 16:10:01 -0800 Subject: [PATCH 094/116] fix loading of json in create split, put all metadata into one csv, save category id to name mapping in csv --- .../iwildcam/create_split.py | 159 +++++++++--------- wilds/datasets/iwildcam_dataset.py | 41 ++--- 2 files changed, 90 insertions(+), 110 deletions(-) diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py index 660899db..0894e737 100644 --- a/dataset_preprocessing/iwildcam/create_split.py +++ b/dataset_preprocessing/iwildcam/create_split.py @@ -7,70 +7,37 @@ import pandas as pd import numpy as np -# For more info see https://www.kaggle.com/c/iwildcam-2020-fgvc7/discussion/135200 -# 485 had multiple images from indoors, and just a few were actually from out in the wild. -LOCATIONS_TO_SKIP = [485] - -CANNOT_OPEN = ['99136aa6-21bc-11ea-a13a-137349068a90.jpg', - '87022118-21bc-11ea-a13a-137349068a90.jpg', - '8f17b296-21bc-11ea-a13a-137349068a90.jpg', - '883572ba-21bc-11ea-a13a-137349068a90.jpg', - '896c1198-21bc-11ea-a13a-137349068a90.jpg', - '8792549a-21bc-11ea-a13a-137349068a90.jpg', - '94529be0-21bc-11ea-a13a-137349068a90.jpg'] - -CANNOT_LOAD = ['929da9de-21bc-11ea-a13a-137349068a90.jpg', - '9631e6a0-21bc-11ea-a13a-137349068a90.jpg', - '8c3a31fc-21bc-11ea-a13a-137349068a90.jpg', - '88313344-21bc-11ea-a13a-137349068a90.jpg', - '8c53e822-21bc-11ea-a13a-137349068a90.jpg', - '911848a8-21bc-11ea-a13a-137349068a90.jpg', - '98bd006c-21bc-11ea-a13a-137349068a90.jpg', - '91ba7b50-21bc-11ea-a13a-137349068a90.jpg', - '9799f64a-21bc-11ea-a13a-137349068a90.jpg', - '88007592-21bc-11ea-a13a-137349068a90.jpg', - '94860606-21bc-11ea-a13a-137349068a90.jpg', - '9166fbd8-21bc-11ea-a13a-137349068a90.jpg'] - -OTHER = ['8e0c091a-21bc-11ea-a13a-137349068a90.jpg'] # This one got slightly different error - - -IDS_TO_SKIP = CANNOT_OPEN + CANNOT_LOAD + OTHER - - -def create_split(data_dir): - train_df, val_cis_df, val_trans_df, test_cis_df, test_trans_df = _create_split(data_dir, seed=0) - - train_df.to_csv(data_dir / 'train.csv') - val_cis_df.to_csv(data_dir / 'val_cis.csv') - val_trans_df.to_csv(data_dir / 'val_trans.csv') - test_cis_df.to_csv(data_dir / 'test_cis.csv') - test_trans_df.to_csv(data_dir / 'test_trans.csv') - - -def _create_split(data_dir, seed, skip=True): +def create_split(data_dir, seed): np_rng = np.random.default_rng(seed) - # Load Kaggle train data - filename = f'iwildcam2021_train_annotations.json' + # Loading json was adapted from + # https://www.kaggle.com/ateplyuk/iwildcam2020-pytorch-start + filename = f'iwildcam2021_train_annotations_final.json' with open(data_dir / filename ) as json_file: data = json.load(json_file) + df_annotations = pd.DataFrame({ + 'category_id': [item['category_id'] for item in data['annotations']], + 'image_id': [item['image_id'] for item in data['annotations']] + }) + + df_metadata = pd.DataFrame({ + 'image_id': [item['id'] for item in data['images']], + 'location': [item['location'] for item in data['images']], + 'filename': [item['file_name'] for item in data['images']], + 'datetime': [item['datetime'] for item in data['images']], + 'frame_num': [item['frame_num'] for item in data['images']], # this attribute is not used + 'seq_id': [item['seq_id'] for item in data['images']] # this attribute is not used + }) - # This line was adapted from - # https://www.kaggle.com/ateplyuk/iwildcam2020-pytorch-start - df = pd.DataFrame( - { - 'id': [item['id'] 
for item in data['annotations']], - 'category_id': [item['category_id'] for item in data['annotations']], - 'image_id': [item['image_id'] for item in data['annotations']], - 'location': [item['location'] for item in data['images']], - 'filename': [item['file_name'] for item in data['images']], - 'datetime': [item['datetime'] for item in data['images']], - 'frame_num': [item['frame_num'] for item in data['images']], # this attribute is not used - 'seq_id': [item['seq_id'] for item in data['images']] # this attribute is not used - }) + df = df_metadata.merge(df_annotations, on='image_id', how='inner') + + # Create category_id to name dictionary + cat_id_to_name_map = {} + for item in data['categories']: + cat_id_to_name_map[item['id']] = item['name'] + df['category_name'] = df['category_id'].apply(lambda x: cat_id_to_name_map[x]) # Extract the date from the datetime. df['datetime_obj'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')) @@ -137,41 +104,75 @@ def _create_split(data_dir, seed, skip=True): test_cis_df = test_cis_df[test_cis_df['category_id'].isin(train_classes)] test_trans_df = test_trans_df[test_trans_df['category_id'].isin(train_classes)] - - - # Remove examples that are corrupted in some way - if skip: - train_df, val_cis_df, val_trans_df, test_cis_df, test_trans_df = remove([train_df, val_cis_df, - val_trans_df, test_cis_df, - test_trans_df]) - # Assert that all sequences that spanned across multiple days ended up in the same split for seq_id in seq_ids_that_span_across_days: n_splits = 0 - for split_df in train_df, val_cis_df, test_cis_df: + for split_df in [train_df, val_cis_df, test_cis_df]: if seq_id in split_df['seq_id'].values: n_splits += 1 assert n_splits == 1, "Each sequence should only be in one split. 
Please move manually" - - # Reset index - train_df.reset_index(inplace=True), val_cis_df.reset_index(inplace=True), val_trans_df.reset_index(inplace=True) - test_cis_df.reset_index(inplace=True), test_trans_df.reset_index(inplace=True) + train_df.reset_index(inplace=True, drop=True), val_cis_df.reset_index(inplace=True, drop=True), val_trans_df.reset_index(inplace=True, drop=True) + test_cis_df.reset_index(inplace=True, drop=True), test_trans_df.reset_index(inplace=True, drop=True) + + print("n train: ", len(train_df)) + print("n val trans: ", len(val_trans_df)) + print("n test trans: ", len(test_trans_df)) + print("n val cis: ", len(val_cis_df)) + print("n test cis: ", len(test_cis_df)) + + # Merge into one df + train_df['split'] = 'train' + val_trans_df['split'] = 'val' + test_trans_df['split'] = 'test' + val_cis_df['split'] = 'id_val' + test_cis_df['split'] = 'id_test' + df = pd.concat([train_df, val_trans_df, test_trans_df, test_cis_df, val_cis_df]) + df = df.reset_index(drop=True) + + # Create y labels by remapping the category ids to be contiguous + unique_categories = np.unique(df['category_id']) + n_classes = len(unique_categories) + category_to_label = dict([(i, j) for i, j in zip(unique_categories, range(n_classes))]) + df['y'] = df['category_id'].apply(lambda x: category_to_label[x]).values + print("N classes: ", n_classes) + + # Create y to category name map and save + categories_df = pd.DataFrame({ + 'category_id': [item['id'] for item in data['categories']], + 'name': [item['name'] for item in data['categories']] + }) + + categories_df['y'] = categories_df['category_id'].apply(lambda x: category_to_label[x] if x in category_to_label else 99999) + categories_df = categories_df.sort_values('y').reset_index(drop=True) + categories_df = categories_df[['y','category_id','name']] + + # Create remapped location id such that they are contigious contiguous + location_ids = df['location'] + locations = np.unique(location_ids) + n_groups = len(locations) + location_to_group_id = {locations[i]: i for i in range(n_groups)} + df['location_remapped' ] = df['location'].apply(lambda x: location_to_group_id[x]) + + # Create remapped location id such that they are contigious contiguous + sequence_ids = df['seq_id'] + sequences = np.unique(sequence_ids) + n_sequences = len(sequences) + sequence_to_normalized_id = {sequences[i]: i for i in range(n_sequences)} + df['sequence_remapped' ] = df['seq_id'].apply(lambda x: sequence_to_normalized_id[x]) + # Make sure there's no overlap for split_df in [val_cis_df, val_trans_df, test_cis_df, test_trans_df]: assert not check_overlap(train_df, split_df) - return train_df, val_cis_df, val_trans_df, test_cis_df, test_trans_df + # Save + df = df.sort_values(['split','location_remapped', 'sequence_remapped','datetime']).reset_index(drop=True) + cols = ['split', 'location_remapped', 'location', 'sequence_remapped', 'seq_id', 'y', 'category_id', 'datetime', 'filename', 'image_id'] + df[cols].to_csv(data_dir / 'metadata.csv') + categories_df.to_csv(data_dir / 'categories.csv', index=False) -def remove(dfs): - new_dfs = [] - for df in dfs: - df = df[~df['location'].isin(LOCATIONS_TO_SKIP)] - df = df[~df['filename'].isin(IDS_TO_SKIP)] - new_dfs.append(df) - return new_dfs def check_overlap(df1, df2, column='filename'): files1 = set(df1[column]) @@ -188,4 +189,4 @@ def check_overlap(df1, df2, column='filename'): parser.add_argument('--data_dir', type=str) args = parser.parse_args() - create_split(Path(args.data_dir)) + create_split(Path(args.data_dir), seed=0) diff 
--git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 353e184f..40b65a1c 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -56,22 +56,9 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self._data_dir = Path(self.initialize_data_dir(root_dir, download)) # Load splits - train_df = pd.read_csv(self._data_dir / 'train.csv') - val_trans_df = pd.read_csv(self._data_dir / 'val_trans.csv') - test_trans_df = pd.read_csv(self._data_dir / 'test_trans.csv') - val_cis_df = pd.read_csv(self._data_dir / 'val_cis.csv') - test_cis_df = pd.read_csv(self._data_dir / 'test_cis.csv') - - # Merge all dfs - train_df['split'] = 'train' - val_trans_df['split'] = 'val' - test_trans_df['split'] = 'test' - val_cis_df['split'] = 'id_val' - test_cis_df['split'] = 'id_test' - df = pd.concat([train_df, val_trans_df, test_trans_df, test_cis_df, val_cis_df]) + df = pd.read_csv(self._data_dir / 'metadata.csv') # Splits - data = {} self._split_dict = {'train': 0, 'val': 1, 'test': 2, 'id_val': 3, 'id_test': 4} self._split_names = {'train': 'Train', 'val': 'Validation (OOD/Trans)', 'test': 'Test (OOD/Trans)', 'id_val': 'Validation (ID/Cis)', @@ -84,28 +71,20 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' self._input_array = df['filename'].values # Labels - unique_categories = np.unique(df['category_id']) - self._n_classes = len(unique_categories) - category_to_label = dict([(i, j) for i, j in zip(unique_categories, range(self._n_classes))]) - label_to_category = dict([(v, k) for k, v in category_to_label.items()]) - self._y_array = torch.tensor(df['category_id'].apply(lambda x: category_to_label[x]).values) + self._y_array = torch.tensor(df['y'].values) + self._n_classes = max(df['y']) self._y_size = 1 + assert len(np.unique(df['y']) == self._n_classes) # Location/group info - location_ids = df['location'] - locations = np.unique(location_ids) - n_groups = len(locations) - location_to_group_id = {locations[i]: i for i in range(n_groups)} - df['group_id' ] = df['location'].apply(lambda x: location_to_group_id[x]) + n_groups = max(df['location_remapped']) + 1 self._n_groups = n_groups + assert len(np.unique(df['location_remapped']) == self._n_groups) # Sequence info - sequence_ids = df['seq_id'] - sequences = np.unique(sequence_ids) - n_sequences = len(sequences) - sequence_to_normalized_id = {sequences[i]: i for i in range(n_sequences)} - df['sequence_id_normalized' ] = df['seq_id'].apply(lambda x: sequence_to_normalized_id[x]) + n_sequences = max(df['sequence_remapped']) + 1 self._n_sequences = n_sequences + assert len(np.unique(df['sequence_remapped']) == self._n_sequences) # Extract datetime subcomponents and include in metadata df['datetime_obj'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')) @@ -116,8 +95,8 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' df['minute'] = df['datetime_obj'].apply(lambda x: int(x.minute)) df['second'] = df['datetime_obj'].apply(lambda x: int(x.second)) - self._metadata_array = torch.tensor(np.stack([df['group_id'].values, - df['sequence_id_normalized'].values, + self._metadata_array = torch.tensor(np.stack([df['location_remapped'].values, + df['sequence_remapped'].values, df['year'].values, df['month'].values, df['day'].values, df['hour'].values, df['minute'].values, df['second'].values, self.y_array], axis=1)) From 756d3ca318c5d422aa2d3ad49762f5f6991d8180 Mon Sep 17 00:00:00 2001 
From: Henrik Marklund Date: Sat, 6 Mar 2021 16:42:33 -0800 Subject: [PATCH 095/116] assert fix --- dataset_preprocessing/iwildcam/create_split.py | 4 ++-- wilds/datasets/iwildcam_dataset.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dataset_preprocessing/iwildcam/create_split.py b/dataset_preprocessing/iwildcam/create_split.py index 0894e737..249c4dcd 100644 --- a/dataset_preprocessing/iwildcam/create_split.py +++ b/dataset_preprocessing/iwildcam/create_split.py @@ -148,14 +148,14 @@ def create_split(data_dir, seed): categories_df = categories_df.sort_values('y').reset_index(drop=True) categories_df = categories_df[['y','category_id','name']] - # Create remapped location id such that they are contigious contiguous + # Create remapped location id such that they are contiguous location_ids = df['location'] locations = np.unique(location_ids) n_groups = len(locations) location_to_group_id = {locations[i]: i for i in range(n_groups)} df['location_remapped' ] = df['location'].apply(lambda x: location_to_group_id[x]) - # Create remapped location id such that they are contigious contiguous + # Create remapped sequence id such that they are contiguous sequence_ids = df['seq_id'] diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 40b65a1c..9b5bfbdd 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -72,19 +72,19 @@ def __init__(self, version=None, root_dir='data', download=False, split_scheme=' # Labels self._y_array = torch.tensor(df['y'].values) - self._n_classes = max(df['y']) + self._n_classes = max(df['y']) + 1 self._y_size = 1 - assert len(np.unique(df['y']) == self._n_classes) + assert len(np.unique(df['y'])) == self._n_classes # Location/group info n_groups = max(df['location_remapped']) + 1 self._n_groups = n_groups - assert len(np.unique(df['location_remapped']) == self._n_groups) + assert len(np.unique(df['location_remapped'])) == self._n_groups # Sequence info n_sequences = max(df['sequence_remapped']) + 1 self._n_sequences = n_sequences - assert len(np.unique(df['sequence_remapped']) == self._n_sequences) + assert len(np.unique(df['sequence_remapped'])) == self._n_sequences # Extract datetime subcomponents and include in metadata df['datetime_obj'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')) From 67b6849accfbe0a0c1bd817ad3f7cd09d93c4647 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sat, 6 Mar 2021 22:21:35 -0800 Subject: [PATCH 096/116] iWC 2.0 URL and OGB update --- setup.py | 2 +- wilds/datasets/iwildcam_dataset.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index c9c397d0..72b12368 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,7 @@ 'scikit-learn>=0.20.0', 'pillow>=7.2.0', 'torch>=1.7.0', - 'ogb>=1.2.5', + 'ogb>=1.2.6', 'tqdm>=4.53.0', 'outdated>=0.2.0', 'pytz>=2020.4', diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index 045c609f..d42082f2 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -42,7 +42,7 @@ class IWildCamDataset(WILDSDataset): 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/', 'compressed_size': 90_094_666_806}, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x5a405f743c4b4c66a16cc09cc3a858ca/contents/blob/', + 'download_url':
'https://worksheets.codalab.org/rest/bundles/0x6313da2b204647e79a14b468131fcd64/contents/blob/', 'compressed_size': 12_000_000_000}} def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): @@ -118,14 +118,14 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=None): are predicted labels. - y_true (LongTensor): Ground-truth labels - metadata (Tensor): Metadata - - prediction_fn (function): A function that turns y_pred into predicted labels + - prediction_fn (function): A function that turns y_pred into predicted labels Output: - results (dictionary): Dictionary of evaluation metrics - results_str (str): String summarizing the evaluation metrics """ metrics = [ - Accuracy(prediction_fn=prediction_fn), - Recall(prediction_fn=prediction_fn, average='macro'), + Accuracy(prediction_fn=prediction_fn), + Recall(prediction_fn=prediction_fn, average='macro'), F1(prediction_fn=prediction_fn, average='macro'), ] From 2fc4d292b458b8e07814ac35e4764e426e9018d0 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sun, 7 Mar 2021 02:40:47 -0800 Subject: [PATCH 097/116] Update README with train+eval times --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index cff34ecf..0fca5e09 100644 --- a/README.md +++ b/README.md @@ -99,16 +99,16 @@ This will download all datasets to the specified `data` folder. You can also use These are the sizes of each of our datasets, as well as their approximate time taken to train and evaluate the default model for a single ERM run using a NVIDIA V100 GPU. -| Dataset command | Modality | Download size (GB) | Size on disk (GB) | Train+eval time | -|-----------------|----------|--------------------|-------------------|-----------------| -| iwildcam | Image | 11 | 25 | | -| camelyon17 | Image | 10 | 15 | | -| ogb-molpcba | Graph | 0.04 | 2 | | -| civilcomments | Text | 0.1 | 0.3 | | -| fmow | Image | 50 | 55 | | -| poverty | Image | 12 | 14 | | -| amazon | Text | | | | -| py150 | Text | 0.1 | 0.8 | | +| Dataset command | Modality | Download size (GB) | Size on disk (GB) | Train+eval time (Hours) | +|-----------------|----------|--------------------|-------------------|-------------------------| +| iwildcam | Image | 11 | 25 | 7.5 | +| camelyon17 | Image | 10 | 15 | 2 | +| ogb-molpcba | Graph | 0.04 | 2 | 15 | +| civilcomments | Text | 0.1 | 0.3 | 4.5 | +| fmow | Image | 50 | 55 | 6 | +| poverty | Image | 12 | 14 | 5 | +| amazon | Text | 6.6 | 7 | 5 | +| py150 | Text | 0.1 | 0.8 | 9.5 | While the `camelyon17` dataset is small and fast to train on, we advise against using it as the only dataset to prototype methods on, as the test performance of models trained on this dataset tend to exhibit a large degree of variability over random seeds. 
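The two iWildCam patches above (PATCH 094/095) hinge on labels and group IDs having been remapped to contiguous ranges in `create_split.py`, so that `max(...) + 1` equals the number of unique values and the corrected asserts hold. A minimal sketch of that invariant, using a small hypothetical DataFrame in place of the real camera-trap metadata:

```py
import numpy as np
import pandas as pd

# Hypothetical raw camera-trap IDs; real values come from the iWildCam metadata.
df = pd.DataFrame({'location': [503, 17, 503, 88, 17]})

# Remap raw IDs to a contiguous 0..n-1 range, as create_split.py does above.
locations = np.unique(df['location'])
location_to_group_id = {loc: i for i, loc in enumerate(locations)}
df['location_remapped'] = df['location'].map(location_to_group_id)

# The invariant the fixed asserts check: with contiguous IDs,
# max(id) + 1 equals the number of unique IDs.
n_groups = max(df['location_remapped']) + 1
assert len(np.unique(df['location_remapped'])) == n_groups
```

The same idiom is applied to `sequence_remapped` and to the class labels `y`, which is what lets the dataset constructor read off `_n_classes` and `_n_groups` directly from the precomputed metadata.csv.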
From e45d57f34ecdb505475199b507c84937fdb70428 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sun, 7 Mar 2021 07:45:59 -0800 Subject: [PATCH 098/116] Update learning rate for iWildCam --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index f94ac6be..feb38077 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -143,7 +143,7 @@ 'val_metric_decreasing': False, 'algo_log_metric': 'accuracy', 'model': 'resnet50', - 'lr': 1e-4, + 'lr': 3e-5, 'weight_decay': 0.0, 'batch_size': 16, 'n_epochs': 12, From b894093b7498570f3544579b2ffb9c8aaba2d2ef Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sun, 7 Mar 2021 07:47:18 -0800 Subject: [PATCH 099/116] Updated iWildCam runtime --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0fca5e09..331ff038 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ These are the sizes of each of our datasets, as well as their approximate time t | Dataset command | Modality | Download size (GB) | Size on disk (GB) | Train+eval time (Hours) | |-----------------|----------|--------------------|-------------------|-------------------------| -| iwildcam | Image | 11 | 25 | 7.5 | +| iwildcam | Image | 11 | 25 | 7 | | camelyon17 | Image | 10 | 15 | 2 | | ogb-molpcba | Graph | 0.04 | 2 | 15 | | civilcomments | Text | 0.1 | 0.3 | 4.5 | From 93a001731956ab05dc6d644d8ab593babd6cc6fa Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Sun, 7 Mar 2021 13:45:47 -0800 Subject: [PATCH 100/116] remove iwc v1.0 support --- wilds/datasets/iwildcam_dataset.py | 3 --- wilds/datasets/wilds_dataset.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/wilds/datasets/iwildcam_dataset.py b/wilds/datasets/iwildcam_dataset.py index d42082f2..533f7fbb 100644 --- a/wilds/datasets/iwildcam_dataset.py +++ b/wilds/datasets/iwildcam_dataset.py @@ -38,9 +38,6 @@ class IWildCamDataset(WILDSDataset): """ _dataset_name = 'iwildcam' _versions_dict = { - '1.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/', - 'compressed_size': 90_094_666_806}, '2.0': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x6313da2b204647e79a14b468131fcd64/contents/blob/', 'compressed_size': 12_000_000_000}} diff --git a/wilds/datasets/wilds_dataset.py b/wilds/datasets/wilds_dataset.py index 66a8d6e8..1f8bf21a 100644 --- a/wilds/datasets/wilds_dataset.py +++ b/wilds/datasets/wilds_dataset.py @@ -293,7 +293,7 @@ def initialize_data_dir(self, root_dir, download): might not handle versions similarly. """ if self.version not in self.versions_dict: - raise ValueError(f'Version {self.version} not recognized. Must be in {self.versions_dict.keys()}.') + raise ValueError(f'Version {self.version} not supported. 
Must be in {self.versions_dict.keys()}.') download_url = self.versions_dict[self.version]['download_url'] compressed_size = self.versions_dict[self.version]['compressed_size'] From 60acde17c1becc18c3f98866c622bb80bdae9f67 Mon Sep 17 00:00:00 2001 From: Tony Lee Date: Sun, 7 Mar 2021 16:04:37 -0800 Subject: [PATCH 101/116] Updated coral penalty weight for iWildCam --- examples/configs/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/datasets.py b/examples/configs/datasets.py index feb38077..cd2d1d6f 100644 --- a/examples/configs/datasets.py +++ b/examples/configs/datasets.py @@ -153,7 +153,7 @@ 'groupby_fields': ['location',], 'n_groups_per_batch': 2, 'irm_lambda': 1., - 'coral_penalty_weight': 1., + 'coral_penalty_weight': 10., 'no_group_logging': True, 'process_outputs_function': 'multiclass_logits_to_pred' }, From 54a2a3d2aee0b678f1ae33e3dcf8e0620d635772 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Sun, 7 Mar 2021 19:12:22 -0800 Subject: [PATCH 102/116] support prediction_fn on py150 --- wilds/datasets/py150_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wilds/datasets/py150_dataset.py b/wilds/datasets/py150_dataset.py index 44f4587e..e821c632 100644 --- a/wilds/datasets/py150_dataset.py +++ b/wilds/datasets/py150_dataset.py @@ -117,6 +117,9 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=None): - results (dictionary): Dictionary of evaluation metrics - results_str (str): String summarizing the evaluation metrics """ + if prediction_fn is not None: + y_pred = prediction_fn(y_pred) + #y_pred: [n_samples, seqlen-1] #y_true: [n_samples, seqlen-1] tok_type = metadata[:, 1:] #[n_samples, seqlen-1] From 818704e8387d6d10bd85985f2d569b4903d8a6e6 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Sun, 7 Mar 2021 23:16:38 -0800 Subject: [PATCH 103/116] amazon subsample update --- dataset_preprocessing/amazon_yelp/subsample_amazon.py | 1 + wilds/datasets/amazon_dataset.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dataset_preprocessing/amazon_yelp/subsample_amazon.py b/dataset_preprocessing/amazon_yelp/subsample_amazon.py index 33fe7004..7b4971bd 100644 --- a/dataset_preprocessing/amazon_yelp/subsample_amazon.py +++ b/dataset_preprocessing/amazon_yelp/subsample_amazon.py @@ -103,6 +103,7 @@ def output_dataset_sizes(split_df): # Regenerate ID val and ID test train_reviewer_ids = data_df[split_df["split"] == TRAIN]["reviewerID"].unique() + np.random.shuffle(train_reviewer_ids) cutoff = int(len(train_reviewer_ids) / 2) id_val_reviewer_ids = train_reviewer_ids[:cutoff] id_test_reviewer_ids = train_reviewer_ids[cutoff:] diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index f50f7771..0e1210be 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -57,7 +57,7 @@ class AmazonDataset(WILDSDataset): 'compressed_size': 4_066_541_568 }, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x2732a175b5a644468b0342081544d1fd/contents/blob/', + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xadbf6198d3a64bdc96fb64d6966b5e79/contents/blob/', 'compressed_size': 1_987_523_759 }, } From 6ca8ace22373e4f3f2f12a6ccb1adf8222cab73b Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Sun, 7 Mar 2021 23:16:38 -0800 Subject: [PATCH 104/116] amazon subsample update --- dataset_preprocessing/amazon_yelp/subsample_amazon.py | 1 + wilds/datasets/amazon_dataset.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git
a/dataset_preprocessing/amazon_yelp/subsample_amazon.py b/dataset_preprocessing/amazon_yelp/subsample_amazon.py index 33fe7004..7b4971bd 100644 --- a/dataset_preprocessing/amazon_yelp/subsample_amazon.py +++ b/dataset_preprocessing/amazon_yelp/subsample_amazon.py @@ -103,6 +103,7 @@ def output_dataset_sizes(split_df): # Regenerate ID val and ID test train_reviewer_ids = data_df[split_df["split"] == TRAIN]["reviewerID"].unique() + np.random.shuffle(train_reviewer_ids) cutoff = int(len(train_reviewer_ids) / 2) id_val_reviewer_ids = train_reviewer_ids[:cutoff] id_test_reviewer_ids = train_reviewer_ids[cutoff:] diff --git a/wilds/datasets/amazon_dataset.py b/wilds/datasets/amazon_dataset.py index f50f7771..0e1210be 100644 --- a/wilds/datasets/amazon_dataset.py +++ b/wilds/datasets/amazon_dataset.py @@ -57,7 +57,7 @@ class AmazonDataset(WILDSDataset): 'compressed_size': 4_066_541_568 }, '2.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x2732a175b5a644468b0342081544d1fd/contents/blob/', + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xadbf6198d3a64bdc96fb64d6966b5e79/contents/blob/', 'compressed_size': 1_987_523_759 }, } From f26cdade85b065cc993f5a7aeb9d41ea000a46c6 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Sun, 7 Mar 2021 23:33:41 -0800 Subject: [PATCH 105/116] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 331ff038..acfbdcf2 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,7 @@ Invoking the `eval` method of each dataset yields all metrics reported in the pa >>> dataset.eval(all_y_pred, all_y_true, all_metadata) {'recall_macro_all': 0.66, ...} ``` +The `eval` method expects `all_y_pred` in a certain form by default and this varies by dataset (e.g., predicted labels for most datasets, and binary logits for OGB-MolPCBA), as documented in the docstrings of each `eval` method. ## Citing WILDS If you use WILDS datasets in your work, please cite [our paper](https://arxiv.org/abs/2012.07421) ([Bibtex](https://wilds.stanford.edu/assets/files/bibtex.md)): From 4e0263145aa9811b04a1174f9e804eea1fe7f359 Mon Sep 17 00:00:00 2001 From: Shiori Sagawa Date: Sun, 7 Mar 2021 23:54:33 -0800 Subject: [PATCH 106/116] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index acfbdcf2..4602f644 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ Invoking the `eval` method of each dataset yields all metrics reported in the pa >>> dataset.eval(all_y_pred, all_y_true, all_metadata) {'recall_macro_all': 0.66, ...} ``` -The `eval` method expects `all_y_pred` in a certain form by default and this varies by dataset (e.g., predicted labels for most datasets, and binary logits for OGB-MolPCBA), as documented in the docstrings of each `eval` method. +Most `eval` methods take in predicted labels for `all_y_pred` by default, but the default inputs vary across datasets and are documented in docstrings.
## Citing WILDS If you use WILDS datasets in your work, please cite [our paper](https://arxiv.org/abs/2012.07421) ([Bibtex](https://wilds.stanford.edu/assets/files/bibtex.md)): From 8d5bb483390bc7a00f1832ebb716663eae97dc0c Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 8 Mar 2021 12:23:46 -0800 Subject: [PATCH 107/116] backwards support for old datasets --- examples/configs/supported.py | 41 +-- examples/run_expt.py | 6 +- examples/train.py | 3 - wilds/__init__.py | 28 ++ wilds/datasets/archive/__init__.py | 0 wilds/datasets/archive/fmow_v1_0_dataset.py | 230 ++++++++++++++ .../datasets/archive/iwildcam_v1_0_dataset.py | 168 +++++++++++ .../datasets/archive/poverty_v1_0_dataset.py | 280 ++++++++++++++++++ wilds/datasets/fmow_dataset.py | 14 +- wilds/datasets/poverty_dataset.py | 31 +- {examples => wilds}/download_datasets.py | 18 +- wilds/get_dataset.py | 79 +++++ 12 files changed, 806 insertions(+), 92 deletions(-) create mode 100644 wilds/datasets/archive/__init__.py create mode 100644 wilds/datasets/archive/fmow_v1_0_dataset.py create mode 100644 wilds/datasets/archive/iwildcam_v1_0_dataset.py create mode 100644 wilds/datasets/archive/poverty_v1_0_dataset.py rename {examples => wilds}/download_datasets.py (69%) create mode 100644 wilds/get_dataset.py diff --git a/examples/configs/supported.py b/examples/configs/supported.py index fd68b4bd..8b66b74e 100644 --- a/examples/configs/supported.py +++ b/examples/configs/supported.py @@ -1,50 +1,11 @@ import torch.nn as nn import torch import sys, os -# Datasets -from wilds.datasets.amazon_dataset import AmazonDataset -from wilds.datasets.bdd100k_dataset import BDD100KDataset -from wilds.datasets.camelyon17_dataset import Camelyon17Dataset -from wilds.datasets.celebA_dataset import CelebADataset -from wilds.datasets.civilcomments_dataset import CivilCommentsDataset -from wilds.datasets.fmow_dataset import FMoWDataset -from wilds.datasets.iwildcam_dataset import IWildCamDataset -from wilds.datasets.ogbmolpcba_dataset import OGBPCBADataset -from wilds.datasets.poverty_dataset import PovertyMapDataset -from wilds.datasets.sqf_dataset import SQFDataset -from wilds.datasets.waterbirds_dataset import WaterbirdsDataset -from wilds.datasets.yelp_dataset import YelpDataset -from wilds.datasets.py150_dataset import Py150Dataset + # metrics from wilds.common.metrics.loss import ElementwiseLoss, Loss, MultiTaskLoss from wilds.common.metrics.all_metrics import Accuracy, MultiTaskAccuracy, MSE, multiclass_logits_to_pred, binary_logits_to_pred -benchmark_datasets = [ - 'amazon', - 'camelyon17', - 'civilcomments', - 'iwildcam', - 'ogb-molpcba', - 'poverty', - 'fmow', - 'py150'] - -datasets = { - 'amazon': AmazonDataset, - 'camelyon17': Camelyon17Dataset, - 'celebA': CelebADataset, - 'civilcomments': CivilCommentsDataset, - 'iwildcam': IWildCamDataset, - 'waterbirds': WaterbirdsDataset, - 'yelp': YelpDataset, - 'ogb-molpcba': OGBPCBADataset, - 'poverty': PovertyMapDataset, - 'fmow': FMoWDataset, - 'bdd100k': BDD100KDataset, - 'py150': Py150Dataset, - 'sqf': SQFDataset, -} - losses = { 'cross_entropy': ElementwiseLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')), 'lm_cross_entropy': MultiTaskLoss(loss_fn=nn.CrossEntropyLoss(reduction='none')), diff --git a/examples/run_expt.py b/examples/run_expt.py index 031c8252..adadfa61 100644 --- a/examples/run_expt.py +++ b/examples/run_expt.py @@ -8,6 +8,7 @@ import sys from collections import defaultdict +import wilds from wilds.common.data_loaders import get_train_loader, get_eval_loader from 
wilds.common.grouper import CombinatorialGrouper @@ -23,7 +24,7 @@ def main(): parser = argparse.ArgumentParser() # Required arguments - parser.add_argument('-d', '--dataset', choices=supported.datasets, required=True) + parser.add_argument('-d', '--dataset', choices=wilds.supported_datasets, required=True) parser.add_argument('--algorithm', required=True, choices=supported.algorithms) parser.add_argument('--root_dir', required=True, help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).') @@ -135,7 +136,8 @@ def main(): set_seed(config.seed) # Data - full_dataset = supported.datasets[config.dataset]( + full_dataset = wilds.get_dataset( + dataset=config.dataset, version=config.version, root_dir=config.root_dir, download=config.download, diff --git a/examples/train.py b/examples/train.py index 3a754214..63deaee1 100644 --- a/examples/train.py +++ b/examples/train.py @@ -23,9 +23,6 @@ def run_epoch(algorithm, dataset, general_logger, epoch, config, train): algorithm.train() else: algorithm.eval() - # process = psutil.Process(os.getpid()) - - # process = psutil.Process(os.getpid()) # Not preallocating memory is slower # but makes it easier to handle different types of data loaders diff --git a/wilds/__init__.py b/wilds/__init__.py index 77ac4a0d..ac377605 100644 --- a/wilds/__init__.py +++ b/wilds/__init__.py @@ -1 +1,29 @@ from .version import __version__ +from .get_dataset import get_dataset + +supported_datasets = [ + 'amazon', + 'camelyon17', + 'celebA', + 'civilcomments', + 'iwildcam', + 'waterbirds', + 'yelp', + 'ogb-molpcba', + 'poverty', + 'fmow', + 'bdd100k', + 'py150', + 'sqf', +] + +benchmark_datasets = [ + 'amazon', + 'camelyon17', + 'civilcomments', + 'iwildcam', + 'ogb-molpcba', + 'poverty', + 'fmow', + 'py150', +] diff --git a/wilds/datasets/archive/__init__.py b/wilds/datasets/archive/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wilds/datasets/archive/fmow_v1_0_dataset.py b/wilds/datasets/archive/fmow_v1_0_dataset.py new file mode 100644 index 00000000..2fef7d51 --- /dev/null +++ b/wilds/datasets/archive/fmow_v1_0_dataset.py @@ -0,0 +1,230 @@ +from pathlib import Path +import shutil +import pandas as pd +import torch +from torch.utils.data import Dataset +import pickle +import numpy as np +import torchvision.transforms.functional as F +from torchvision import transforms +import tarfile +import datetime +import pytz +from PIL import Image +from tqdm import tqdm +from wilds.common.utils import subsample_idxs +from wilds.common.metrics.all_metrics import Accuracy +from wilds.common.grouper import CombinatorialGrouper +from wilds.datasets.wilds_dataset import WILDSDataset + +Image.MAX_IMAGE_PIXELS = 10000000000 + + +categories = ["airport", "airport_hangar", "airport_terminal", "amusement_park", "aquaculture", "archaeological_site", "barn", "border_checkpoint", "burial_site", "car_dealership", "construction_site", "crop_field", "dam", "debris_or_rubble", "educational_institution", "electric_substation", "factory_or_powerplant", "fire_station", "flooded_road", "fountain", "gas_station", "golf_course", "ground_transportation_station", "helipad", "hospital", "impoverished_settlement", "interchange", "lake_or_pond", "lighthouse", "military_facility", "multi-unit_residential", "nuclear_powerplant", "office_building", "oil_or_gas_facility", "park", "parking_lot_or_garage", "place_of_worship", "police_station", "port", "prison", "race_track", "railway_bridge", "recreational_facility", "road_bridge", "runway", 
"shipyard", "shopping_mall", "single-unit_residential", "smokestack", "solar_farm", "space_facility", "stadium", "storage_tank", "surface_mine", "swimming_pool", "toll_booth", "tower", "tunnel_opening", "waste_disposal", "water_treatment_facility", "wind_farm", "zoo"] + + +class FMoWDataset(WILDSDataset): + """ + The Functional Map of the World land use / building classification dataset. + This is a processed version of the Functional Map of the World dataset originally sourced from https://github.com/fMoW/dataset. + + Support `split_scheme` + 'official': official split, which is equivalent to 'time_after_2016' + `time_after_{YEAR}` for YEAR between 2002--2018 + + Input (x): + 224 x 224 x 3 RGB satellite image. + + Label (y): + y is one of 62 land use / building classes + + Metadata: + each image is annotated with a location coordinate, timestamp, country code. This dataset computes region as a derivative of country code. + + Website: https://github.com/fMoW/dataset + + Original publication: + @inproceedings{fmow2018, + title={Functional Map of the World}, + author={Christie, Gordon and Fendley, Neil and Wilson, James and Mukherjee, Ryan}, + booktitle={CVPR}, + year={2018} + } + + License: + Distributed under the FMoW Challenge Public License. + https://github.com/fMoW/dataset/blob/master/LICENSE + + """ + _dataset_name = 'fmow' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/', + 'compressed_size': 70_000_000_000} + } + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official', oracle_training_set=False, seed=111, use_ood_val=False): + self._version = version + self._data_dir = self.initialize_data_dir(root_dir, download) + + self._split_dict = {'train': 0, 'id_val': 1, 'id_test': 2, 'val': 3, 'test': 4} + self._split_names = {'train': 'Train', 'id_val': 'ID Val', 'id_test': 'ID Test', 'val': 'OOD Val', 'test': 'OOD Test'} + if split_scheme=='official': + split_scheme='time_after_2016' + self._split_scheme = split_scheme + self.oracle_training_set = oracle_training_set + + self.root = Path(self._data_dir) + self.seed = int(seed) + self._original_resolution = (224, 224) + + self.category_to_idx = {cat: i for i, cat in enumerate(categories)} + + self.metadata = pd.read_csv(self.root / 'rgb_metadata.csv') + country_codes_df = pd.read_csv(self.root / 'country_code_mapping.csv') + countrycode_to_region = {k: v for k, v in zip(country_codes_df['alpha-3'], country_codes_df['region'])} + regions = [countrycode_to_region.get(code, 'Other') for code in self.metadata['country_code'].to_list()] + self.metadata['region'] = regions + all_countries = self.metadata['country_code'] + + self.num_chunks = 101 + self.chunk_size = len(self.metadata) // (self.num_chunks - 1) + + if self._split_scheme.startswith('time_after'): + year = int(self._split_scheme.split('_')[2]) + year_dt = datetime.datetime(year, 1, 1, tzinfo=pytz.UTC) + self.test_ood_mask = np.asarray(pd.to_datetime(self.metadata['timestamp']) >= year_dt) + # use 3 years of the training set as validation + year_minus_3_dt = datetime.datetime(year-3, 1, 1, tzinfo=pytz.UTC) + self.val_ood_mask = np.asarray(pd.to_datetime(self.metadata['timestamp']) >= year_minus_3_dt) & ~self.test_ood_mask + self.ood_mask = self.test_ood_mask | self.val_ood_mask + else: + raise ValueError(f"Not supported: self._split_scheme = {self._split_scheme}") + + self._split_array = -1 * np.ones(len(self.metadata)) + for split in 
self._split_dict.keys(): + idxs = np.arange(len(self.metadata)) + if split == 'test': + test_mask = np.asarray(self.metadata['split'] == 'test') + idxs = idxs[self.test_ood_mask & test_mask] + elif split == 'val': + val_mask = np.asarray(self.metadata['split'] == 'val') + idxs = idxs[self.val_ood_mask & val_mask] + elif split == 'id_test': + test_mask = np.asarray(self.metadata['split'] == 'test') + idxs = idxs[~self.ood_mask & test_mask] + elif split == 'id_val': + val_mask = np.asarray(self.metadata['split'] == 'val') + idxs = idxs[~self.ood_mask & val_mask] + else: + split_mask = np.asarray(self.metadata['split'] == split) + idxs = idxs[~self.ood_mask & split_mask] + + if self.oracle_training_set and split == 'train': + test_mask = np.asarray(self.metadata['split'] == 'test') + unused_ood_idxs = np.arange(len(self.metadata))[self.ood_mask & ~test_mask] + subsample_unused_ood_idxs = subsample_idxs(unused_ood_idxs, num=len(idxs)//2, seed=self.seed+2) + subsample_train_idxs = subsample_idxs(idxs.copy(), num=len(idxs) // 2, seed=self.seed+3) + idxs = np.concatenate([subsample_unused_ood_idxs, subsample_train_idxs]) + self._split_array[idxs] = self._split_dict[split] + + if not use_ood_val: + self._split_dict = {'train': 0, 'val': 1, 'id_test': 2, 'ood_val': 3, 'test': 4} + self._split_names = {'train': 'Train', 'val': 'ID Val', 'id_test': 'ID Test', 'ood_val': 'OOD Val', 'test': 'OOD Test'} + + # filter out sequestered images from full dataset + seq_mask = np.asarray(self.metadata['split'] == 'seq') + # take out the sequestered images + self._split_array = self._split_array[~seq_mask] + self.full_idxs = np.arange(len(self.metadata))[~seq_mask] + + self._y_array = np.asarray([self.category_to_idx[y] for y in list(self.metadata['category'])]) + self.metadata['y'] = self._y_array + self._y_array = torch.from_numpy(self._y_array).long()[~seq_mask] + self._y_size = 1 + self._n_classes = 62 + + # convert region to idxs + all_regions = list(self.metadata['region'].unique()) + region_to_region_idx = {region: i for i, region in enumerate(all_regions)} + self._metadata_map = {'region': all_regions} + region_idxs = [region_to_region_idx[region] for region in self.metadata['region'].tolist()] + self.metadata['region'] = region_idxs + + # make a year column in metadata + year_array = -1 * np.ones(len(self.metadata)) + ts = pd.to_datetime(self.metadata['timestamp']) + for year in range(2002, 2018): + year_mask = np.asarray(ts >= datetime.datetime(year, 1, 1, tzinfo=pytz.UTC)) \ + & np.asarray(ts < datetime.datetime(year+1, 1, 1, tzinfo=pytz.UTC)) + year_array[year_mask] = year - 2002 + self.metadata['year'] = year_array + self._metadata_map['year'] = list(range(2002, 2018)) + + self._metadata_fields = ['region', 'year', 'y'] + self._metadata_array = torch.from_numpy(self.metadata[self._metadata_fields].astype(int).to_numpy()).long()[~seq_mask] + + self._eval_groupers = { + 'year': CombinatorialGrouper(dataset=self, groupby_fields=['year']), + 'region': CombinatorialGrouper(dataset=self, groupby_fields=['region']), + } + + super().__init__(root_dir, download, split_scheme) + + def get_input(self, idx): + """ + Returns x for a given idx. + """ + idx = self.full_idxs[idx] + batch_idx = idx // self.chunk_size + within_batch_idx = idx % self.chunk_size + img_batch = np.load(self.root / f'rgb_all_imgs_{batch_idx}.npy', mmap_mode='r') + img = img_batch[within_batch_idx].copy() + return img + + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. 
+ Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ + metric = Accuracy(prediction_fn=prediction_fn) + # Overall evaluation + evaluate by year + all_results, all_results_str = self.standard_group_eval( + metric, + self._eval_groupers['year'], + y_pred, y_true, metadata) + # Evaluate by region and ignore the "Other" region + region_grouper = self._eval_groupers['region'] + region_results = metric.compute_group_wise( + y_pred, + y_true, + region_grouper.metadata_to_group(metadata), + region_grouper.n_groups) + all_results[f'{metric.name}_worst_year'] = all_results.pop(metric.worst_group_metric_field) + region_metric_list = [] + for group_idx in range(region_grouper.n_groups): + group_str = region_grouper.group_field_str(group_idx) + group_metric = region_results[metric.group_metric_field(group_idx)] + group_counts = region_results[metric.group_count_field(group_idx)] + all_results[f'{metric.name}_{group_str}'] = group_metric + all_results[f'count_{group_str}'] = group_counts + if region_results[metric.group_count_field(group_idx)] == 0 or "Other" in group_str: + continue + all_results_str += ( + f' {region_grouper.group_str(group_idx)} ' + f"[n = {region_results[metric.group_count_field(group_idx)]:6.0f}]:\t" + f"{metric.name} = {region_results[metric.group_metric_field(group_idx)]:5.3f}\n") + region_metric_list.append(region_results[metric.group_metric_field(group_idx)]) + all_results[f'{metric.name}_worst_region'] = metric.worst(region_metric_list) + all_results_str += f"Worst-group {metric.name}: {all_results[f'{metric.name}_worst_region']:.3f}\n" + + return all_results, all_results_str diff --git a/wilds/datasets/archive/iwildcam_v1_0_dataset.py b/wilds/datasets/archive/iwildcam_v1_0_dataset.py new file mode 100644 index 00000000..49c53d1e --- /dev/null +++ b/wilds/datasets/archive/iwildcam_v1_0_dataset.py @@ -0,0 +1,168 @@ +from datetime import datetime +from pathlib import Path +import os + +from PIL import Image +import pandas as pd +import numpy as np +import torch +import json + +from wilds.datasets.wilds_dataset import WILDSDataset +from wilds.common.grouper import CombinatorialGrouper +from wilds.common.metrics.all_metrics import Accuracy, Recall, F1 + + +class IWildCamDataset(WILDSDataset): + """ + The iWildCam2020 dataset. + This is a modified version of the original iWildCam2020 competition dataset. + Input (x): + RGB images from camera traps + Label (y): + y is one of 186 classes corresponding to animal species + Metadata: + Each image is annotated with the ID of the location (camera trap) it came from. 
+ Website: + https://www.kaggle.com/c/iwildcam-2020-fgvc7 + Original publication: + @article{beery2020iwildcam, + title={The iWildCam 2020 Competition Dataset}, + author={Beery, Sara and Cole, Elijah and Gjoka, Arvi}, + journal={arXiv preprint arXiv:2004.10340}, + year={2020} + } + License: + This dataset is distributed under Community Data License Agreement – Permissive – Version 1.0 + https://cdla.io/permissive-1-0/ + """ + _dataset_name = 'iwildcam' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x3f1b346ff2d74b5daf1a08685d68c6ec/contents/blob/', + 'compressed_size': 90_094_666_806}} + + def __init__(self, version=None, root_dir='data', download=False, split_scheme='official'): + + self._version = version + self._split_scheme = split_scheme + if self._split_scheme != 'official': + raise ValueError(f'Split scheme {self._split_scheme} not recognized') + + # path + self._data_dir = Path(self.initialize_data_dir(root_dir, download)) + + # Load splits + train_df = pd.read_csv(self._data_dir / 'train.csv') + val_trans_df = pd.read_csv(self._data_dir / 'val_trans.csv') + test_trans_df = pd.read_csv(self._data_dir / 'test_trans.csv') + val_cis_df = pd.read_csv(self._data_dir / 'val_cis.csv') + test_cis_df = pd.read_csv(self._data_dir / 'test_cis.csv') + + # Merge all dfs + train_df['split'] = 'train' + val_trans_df['split'] = 'val' + test_trans_df['split'] = 'test' + val_cis_df['split'] = 'id_val' + test_cis_df['split'] = 'id_test' + df = pd.concat([train_df, val_trans_df, test_trans_df, test_cis_df, val_cis_df]) + + # Splits + data = {} + self._split_dict = {'train': 0, 'val': 1, 'test': 2, 'id_val': 3, 'id_test': 4} + self._split_names = {'train': 'Train', 'val': 'Validation (OOD/Trans)', + 'test': 'Test (OOD/Trans)', 'id_val': 'Validation (ID/Cis)', + 'id_test': 'Test (ID/Cis)'} + + df['split_id'] = df['split'].apply(lambda x: self._split_dict[x]) + self._split_array = df['split_id'].values + + # Filenames + self._input_array = df['filename'].values + + # Labels + unique_categories = np.unique(df['category_id']) + self._n_classes = len(unique_categories) + category_to_label = dict([(i, j) for i, j in zip(unique_categories, range(self._n_classes))]) + label_to_category = dict([(v, k) for k, v in category_to_label.items()]) + self._y_array = torch.tensor(df['category_id'].apply(lambda x: category_to_label[x]).values) + self._y_size = 1 + + # Location/group info + location_ids = df['location'] + locations = np.unique(location_ids) + n_groups = len(locations) + location_to_group_id = {locations[i]: i for i in range(n_groups)} + df['group_id' ] = df['location'].apply(lambda x: location_to_group_id[x]) + + self._n_groups = n_groups + + # Extract datetime subcomponents and include in metadata + df['datetime_obj'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f')) + df['year'] = df['datetime_obj'].apply(lambda x: int(x.year)) + df['month'] = df['datetime_obj'].apply(lambda x: int(x.month)) + df['day'] = df['datetime_obj'].apply(lambda x: int(x.day)) + df['hour'] = df['datetime_obj'].apply(lambda x: int(x.hour)) + df['minute'] = df['datetime_obj'].apply(lambda x: int(x.minute)) + df['second'] = df['datetime_obj'].apply(lambda x: int(x.second)) + + self._metadata_array = torch.tensor(np.stack([df['group_id'].values, + df['year'].values, df['month'].values, df['day'].values, + df['hour'].values, df['minute'].values, df['second'].values, + self.y_array], axis=1)) + self._metadata_fields = ['location', 'year', 'month', 'day', 
'hour', 'minute', 'second', 'y'] + # eval grouper + self._eval_grouper = CombinatorialGrouper( + dataset=self, + groupby_fields=(['location'])) + + super().__init__(root_dir, download, split_scheme) + + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model. By default, they are predicted labels (LongTensor). + But they can also be other model outputs such that prediction_fn(y_pred) + are predicted labels. + - y_true (LongTensor): Ground-truth labels + - metadata (Tensor): Metadata + - prediction_fn (function): A function that turns y_pred into predicted labels + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ + metrics = [ + Accuracy(prediction_fn=prediction_fn), + Recall(prediction_fn=prediction_fn, average='macro'), + F1(prediction_fn=prediction_fn, average='macro'), + ] + + results = {} + + for i in range(len(metrics)): + results.update({ + **metrics[i].compute(y_pred, y_true), + }) + + results_str = ( + f"Average acc: {results[metrics[0].agg_metric_field]:.3f}\n" + f"Recall macro: {results[metrics[1].agg_metric_field]:.3f}\n" + f"F1 macro: {results[metrics[2].agg_metric_field]:.3f}\n" + ) + + return results, results_str + + def get_input(self, idx): + """ + Args: + - idx (int): Index of a data point + Output: + - x (Tensor): Input features of the idx-th data point + """ + + # All images are in the train folder + img_path = self.data_dir / 'train' / self._input_array[idx] + img = Image.open(img_path) + + return img diff --git a/wilds/datasets/archive/poverty_v1_0_dataset.py b/wilds/datasets/archive/poverty_v1_0_dataset.py new file mode 100644 index 00000000..438e7beb --- /dev/null +++ b/wilds/datasets/archive/poverty_v1_0_dataset.py @@ -0,0 +1,280 @@ +from pathlib import Path +import pandas as pd +import torch +from torch.utils.data import Dataset +import pickle +import numpy as np +from wilds.datasets.wilds_dataset import WILDSDataset +from wilds.common.metrics.all_metrics import MSE, PearsonCorrelation +from wilds.common.grouper import CombinatorialGrouper +from wilds.common.utils import subsample_idxs, shuffle_arr + +DATASET = '2009-17' +BAND_ORDER = ['BLUE', 'GREEN', 'RED', 'SWIR1', 'SWIR2', 'TEMP1', 'NIR', 'NIGHTLIGHTS'] + + +DHS_COUNTRIES = [ + 'angola', 'benin', 'burkina_faso', 'cameroon', 'cote_d_ivoire', + 'democratic_republic_of_congo', 'ethiopia', 'ghana', 'guinea', 'kenya', + 'lesotho', 'malawi', 'mali', 'mozambique', 'nigeria', 'rwanda', 'senegal', + 'sierra_leone', 'tanzania', 'togo', 'uganda', 'zambia', 'zimbabwe'] + +_SURVEY_NAMES_2009_17A = { + 'train': ['cameroon', 'democratic_republic_of_congo', 'ghana', 'kenya', + 'lesotho', 'malawi', 'mozambique', 'nigeria', 'senegal', + 'togo', 'uganda', 'zambia', 'zimbabwe'], + 'val': ['benin', 'burkina_faso', 'guinea', 'sierra_leone', 'tanzania'], + 'test': ['angola', 'cote_d_ivoire', 'ethiopia', 'mali', 'rwanda'], +} +_SURVEY_NAMES_2009_17B = { + 'train': ['angola', 'cote_d_ivoire', 'democratic_republic_of_congo', + 'ethiopia', 'kenya', 'lesotho', 'mali', 'mozambique', + 'nigeria', 'rwanda', 'senegal', 'togo', 'uganda', 'zambia'], + 'val': ['cameroon', 'ghana', 'malawi', 'zimbabwe'], + 'test': ['benin', 'burkina_faso', 'guinea', 'sierra_leone', 'tanzania'], +} +_SURVEY_NAMES_2009_17C = { + 'train': ['angola', 'benin', 'burkina_faso', 'cote_d_ivoire', 'ethiopia', + 'guinea', 'kenya', 'lesotho', 'mali', 'rwanda', 'senegal', + 'sierra_leone', 
'tanzania', 'zambia'], + 'val': ['democratic_republic_of_congo', 'mozambique', 'nigeria', 'togo', 'uganda'], + 'test': ['cameroon', 'ghana', 'malawi', 'zimbabwe'], +} +_SURVEY_NAMES_2009_17D = { + 'train': ['angola', 'benin', 'burkina_faso', 'cameroon', 'cote_d_ivoire', + 'ethiopia', 'ghana', 'guinea', 'malawi', 'mali', 'rwanda', + 'sierra_leone', 'tanzania', 'zimbabwe'], + 'val': ['kenya', 'lesotho', 'senegal', 'zambia'], + 'test': ['democratic_republic_of_congo', 'mozambique', 'nigeria', 'togo', 'uganda'], +} +_SURVEY_NAMES_2009_17E = { + 'train': ['benin', 'burkina_faso', 'cameroon', 'democratic_republic_of_congo', + 'ghana', 'guinea', 'malawi', 'mozambique', 'nigeria', 'sierra_leone', + 'tanzania', 'togo', 'uganda', 'zimbabwe'], + 'val': ['angola', 'cote_d_ivoire', 'ethiopia', 'mali', 'rwanda'], + 'test': ['kenya', 'lesotho', 'senegal', 'zambia'], +} + +SURVEY_NAMES = { + '2009-17A': _SURVEY_NAMES_2009_17A, + '2009-17B': _SURVEY_NAMES_2009_17B, + '2009-17C': _SURVEY_NAMES_2009_17C, + '2009-17D': _SURVEY_NAMES_2009_17D, + '2009-17E': _SURVEY_NAMES_2009_17E, +} + + +# means and standard deviations calculated over the entire dataset (train + val + test), +# with negative values set to 0, and ignoring any pixel that is 0 across all bands +# all images have already been mean subtracted and normalized (x - mean) / std + +_MEANS_2009_17 = { + 'BLUE': 0.059183, + 'GREEN': 0.088619, + 'RED': 0.104145, + 'SWIR1': 0.246874, + 'SWIR2': 0.168728, + 'TEMP1': 299.078023, + 'NIR': 0.253074, + 'DMSP': 4.005496, + 'VIIRS': 1.096089, + # 'NIGHTLIGHTS': 5.101585, # nightlights overall +} + +_STD_DEVS_2009_17 = { + 'BLUE': 0.022926, + 'GREEN': 0.031880, + 'RED': 0.051458, + 'SWIR1': 0.088857, + 'SWIR2': 0.083240, + 'TEMP1': 4.300303, + 'NIR': 0.058973, + 'DMSP': 23.038301, + 'VIIRS': 4.786354, + # 'NIGHTLIGHTS': 23.342916, # nightlights overall +} + + +def split_by_countries(idxs, ood_countries, metadata): + countries = np.asarray(metadata['country'].iloc[idxs]) + is_ood = np.any([(countries == country) for country in ood_countries], axis=0) + return idxs[~is_ood], idxs[is_ood] + + +class PovertyMapDataset(WILDSDataset): + """ + The PovertyMap poverty measure prediction dataset. + This is a processed version of LandSat 5/7/8 satellite imagery originally from Google Earth Engine under the names `LANDSAT/LC08/C01/T1_SR`,`LANDSAT/LE07/C01/T1_SR`,`LANDSAT/LT05/C01/T1_SR`, + nighttime light imagery from the DMSP and VIIRS satellites (Google Earth Engine names `NOAA/DMSP-OLS/CALIBRATED_LIGHTS_V4` and `NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG`) + and processed DHS survey metadata obtained from https://github.com/sustainlab-group/africa_poverty and originally from `https://dhsprogram.com/data/available-datasets.cfm`. + + Supported `split_scheme`: + 'official' and `countries`, which are equivalent + + Input (x): + 224 x 224 x 8 satellite image, with 7 channels from LandSat and 1 nighttime light channel from DMSP/VIIRS. Already mean/std normalized. + + Output (y): + y is a real-valued asset wealth index. Higher index corresponds to more asset wealth. + + Metadata: + each image is annotated with location coordinates (noised for anonymity), survey year, urban/rural classification, country, nighttime light mean, nighttime light median. 
+ + Website: https://github.com/sustainlab-group/africa_poverty + + Original publication: + @article{yeh2020using, + author = {Yeh, Christopher and Perez, Anthony and Driscoll, Anne and Azzari, George and Tang, Zhongyi and Lobell, David and Ermon, Stefano and Burke, Marshall}, + day = {22}, + doi = {10.1038/s41467-020-16185-w}, + issn = {2041-1723}, + journal = {Nature Communications}, + month = {5}, + number = {1}, + title = {{Using publicly available satellite imagery and deep learning to understand economic well-being in Africa}}, + url = {https://www.nature.com/articles/s41467-020-16185-w}, + volume = {11}, + year = {2020} + } + + License: + LandSat/DMSP/VIIRS data is U.S. Public Domain. + + """ + _dataset_name = 'poverty' + _versions_dict = { + '1.0': { + 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x9a2add5219db4ebc89965d7f42719750/contents/blob/', + 'compressed_size': 18_630_656_000}} + + def __init__(self, version=None, root_dir='data', download=False, + split_scheme='official', + no_nl=False, fold='A', oracle_training_set=False, + use_ood_val=True, + cache_size=100): + self._version = version + self._data_dir = self.initialize_data_dir(root_dir, download) + + self._split_dict = {'train': 0, 'id_val': 1, 'id_test': 2, 'val': 3, 'test': 4} + self._split_names = {'train': 'Train', 'id_val': 'ID Val', 'id_test': 'ID Test', 'val': 'OOD Val', 'test': 'OOD Test'} + + if split_scheme=='official': + split_scheme = 'countries' + self._split_scheme = split_scheme + if self._split_scheme != 'countries': + raise ValueError("Split scheme not recognized") + + self.oracle_training_set = oracle_training_set + + self.no_nl = no_nl + if fold not in {'A', 'B', 'C', 'D', 'E'}: + raise ValueError("Fold must be A, B, C, D, or E") + + self.root = Path(self._data_dir) + self.metadata = pd.read_csv(self.root / 'dhs_metadata.csv') + # country folds, split off OOD + country_folds = SURVEY_NAMES[f'2009-17{fold}'] + + self._split_array = -1 * np.ones(len(self.metadata)) + + incountry_folds_split = np.arange(len(self.metadata)) + # take the test countries to be ood + idxs_id, idxs_ood_test = split_by_countries(incountry_folds_split, country_folds['test'], self.metadata) + # also create a validation OOD set + idxs_id, idxs_ood_val = split_by_countries(idxs_id, country_folds['val'], self.metadata) + for split in ['test', 'val', 'id_test', 'id_val', 'train']: + # keep ood for test, otherwise throw away ood data + if split == 'test': + idxs = idxs_ood_test + elif split == 'val': + idxs = idxs_ood_val + else: + idxs = idxs_id + num_eval = 2000 + # if oracle, do 50-50 split between OOD and ID + if split == 'train' and self.oracle_training_set: + idxs = subsample_idxs(incountry_folds_split, num=len(idxs_id), seed=ord(fold))[num_eval:] + elif split != 'train' and self.oracle_training_set: + eval_idxs = subsample_idxs(incountry_folds_split, num=len(idxs_id), seed=ord(fold))[:num_eval] + elif split == 'train': + idxs = subsample_idxs(idxs, take_rest=True, num=num_eval, seed=ord(fold)) + else: + eval_idxs = subsample_idxs(idxs, take_rest=False, num=num_eval, seed=ord(fold)) + + if split != 'train': + if split == 'id_val': + idxs = eval_idxs[:num_eval//2] + else: + idxs = eval_idxs[num_eval//2:] + self._split_array[idxs] = self._split_dict[split] + + if not use_ood_val: + self._split_dict = {'train': 0, 'val': 1, 'id_test': 2, 'ood_val': 3, 'test': 4} + self._split_names = {'train': 'Train', 'val': 'ID Val', 'id_test': 'ID Test', 'ood_val': 'OOD Val', 'test': 'OOD Test'} + + self.cache_size = cache_size + 
self.cache_counter = 0 + self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') + self.imgs = self.imgs.transpose((0, 3, 1, 2)) + + self._y_array = torch.from_numpy(np.asarray(self.metadata['wealthpooled'])[:, np.newaxis]).float() + self._y_size = 1 + + # add country group field + country_to_idx = {country: i for i, country in enumerate(DHS_COUNTRIES)} + self.metadata['country'] = [country_to_idx[country] for country in self.metadata['country'].tolist()] + self._metadata_map = {'country': DHS_COUNTRIES} + self._metadata_array = torch.from_numpy(self.metadata[['urban', 'wealthpooled', 'country']].astype(float).to_numpy()) + # rename wealthpooled to y + self._metadata_fields = ['urban', 'y', 'country'] + + self._eval_grouper = CombinatorialGrouper( + dataset=self, + groupby_fields=['urban']) + + super().__init__(root_dir, download, split_scheme) + + def get_input(self, idx): + """ + Returns x for a given idx. + """ + img = self.imgs[idx].copy() + if self.no_nl: + img[-1] = 0 + img = torch.from_numpy(img).float() + # consider refreshing cache if cache_size is limited + if self.cache_size < self.imgs.shape[0]: + self.cache_counter += 1 + if self.cache_counter > self.cache_size: + self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') + self.imgs = self.imgs.transpose((0, 3, 1, 2)) + self.cache_counter = 0 + + return img + + def eval(self, y_pred, y_true, metadata, prediction_fn=None): + """ + Computes all evaluation metrics. + Args: + - y_pred (Tensor): Predictions from a model + - y_true (LongTensor): Ground-truth values + - metadata (Tensor): Metadata + - prediction_fn (function): Only None supported + Output: + - results (dictionary): Dictionary of evaluation metrics + - results_str (str): String summarizing the evaluation metrics + """ + assert prediction_fn is None, "PovertyMapDataset.eval() does not support prediction_fn" + + metrics = [MSE(), PearsonCorrelation()] + + all_results = {} + all_results_str = '' + for metric in metrics: + results, results_str = self.standard_group_eval( + metric, + self._eval_grouper, + y_pred, y_true, metadata) + all_results.update(results) + all_results_str += results_str + return all_results, all_results_str diff --git a/wilds/datasets/fmow_dataset.py b/wilds/datasets/fmow_dataset.py index a67b5c34..4a310b40 100644 --- a/wilds/datasets/fmow_dataset.py +++ b/wilds/datasets/fmow_dataset.py @@ -58,9 +58,6 @@ class FMoWDataset(WILDSDataset): """ _dataset_name = 'fmow' _versions_dict = { - '1.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xc59ea8261dfe4d2baa3820866e33d781/contents/blob/', - 'compressed_size': 70_000_000_000}, '1.1': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xaec91eb7c9d548ebb15e1b5e60f966ab/contents/blob/', 'compressed_size': 53_893_324_800} @@ -179,14 +176,7 @@ def get_input(self, idx): Returns x for a given idx. """ idx = self.full_idxs[idx] - if self.version == '1.0': - batch_idx = idx // self.chunk_size - within_batch_idx = idx % self.chunk_size - img_batch = np.load(self.root / f'rgb_all_imgs_{batch_idx}.npy', mmap_mode='r') - img = img_batch[within_batch_idx].copy() - elif self.version == '1.1': - img = Image.open(self.root / 'images' / f'rgb_img_{idx}.png').convert('RGB') - + img = Image.open(self.root / 'images' / f'rgb_img_{idx}.png').convert('RGB') return img def eval(self, y_pred, y_true, metadata, prediction_fn=None): @@ -198,7 +188,7 @@ def eval(self, y_pred, y_true, metadata, prediction_fn=None): are predicted labels. 
- y_true (LongTensor): Ground-truth labels - metadata (Tensor): Metadata - - prediction_fn (function): A function that turns y_pred into predicted labels + - prediction_fn (function): A function that turns y_pred into predicted labels Output: - results (dictionary): Dictionary of evaluation metrics - results_str (str): String summarizing the evaluation metrics diff --git a/wilds/datasets/poverty_dataset.py b/wilds/datasets/poverty_dataset.py index c9376c9b..7b062002 100644 --- a/wilds/datasets/poverty_dataset.py +++ b/wilds/datasets/poverty_dataset.py @@ -143,9 +143,6 @@ class PovertyMapDataset(WILDSDataset): """ _dataset_name = 'poverty' _versions_dict = { - '1.0': { - 'download_url': 'https://worksheets.codalab.org/rest/bundles/0x9a2add5219db4ebc89965d7f42719750/contents/blob/', - 'compressed_size': 18_630_656_000}, '1.1': { 'download_url': 'https://worksheets.codalab.org/rest/bundles/0xfc0aa86ad9af4eb08c42dfc40eacf094/contents/blob/', 'compressed_size': 13_091_823_616}} @@ -215,12 +212,6 @@ def __init__(self, version=None, root_dir='data', download=False, self._split_dict = {'train': 0, 'val': 1, 'id_test': 2, 'ood_val': 3, 'test': 4} self._split_names = {'train': 'Train', 'val': 'ID Val', 'id_test': 'ID Test', 'ood_val': 'OOD Val', 'test': 'OOD Test'} - if self.version == '1.0': - self.cache_size = cache_size - self.cache_counter = 0 - self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') - self.imgs = self.imgs.transpose((0, 3, 1, 2)) - self._y_array = torch.from_numpy(np.asarray(self.metadata['wealthpooled'])[:, np.newaxis]).float() self._y_size = 1 @@ -242,24 +233,10 @@ def get_input(self, idx): """ Returns x for a given idx. """ - if self.version == '1.0': - img = self.imgs[idx].copy() - if self.no_nl: - img[-1] = 0 - img = torch.from_numpy(img).float() - # consider refreshing cache if cache_size is limited - if self.cache_size < self.imgs.shape[0]: - self.cache_counter += 1 - if self.cache_counter > self.cache_size: - self.imgs = np.load(self.root / 'landsat_poverty_imgs.npy', mmap_mode='r') - self.imgs = self.imgs.transpose((0, 3, 1, 2)) - self.cache_counter = 0 - - elif self.version == '1.1': - img = np.load(self.root / 'images' / f'landsat_poverty_img_{idx}.npz')['x'] - if self.no_nl: - img[-1] = 0 - img = torch.from_numpy(img).float() + img = np.load(self.root / 'images' / f'landsat_poverty_img_{idx}.npz')['x'] + if self.no_nl: + img[-1] = 0 + img = torch.from_numpy(img).float() return img diff --git a/examples/download_datasets.py b/wilds/download_datasets.py similarity index 69% rename from examples/download_datasets.py rename to wilds/download_datasets.py index 2688ef70..bf085739 100644 --- a/examples/download_datasets.py +++ b/wilds/download_datasets.py @@ -1,6 +1,6 @@ import os, sys import argparse -import configs.supported as supported +import wilds def main(): """ @@ -11,21 +11,23 @@ def main(): parser.add_argument('--root_dir', required=True, help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).') parser.add_argument('--datasets', nargs='*', default=None, - help=f'Specify a space-separated list of dataset names to download. If left unspecified, the script will download all of the official benchmark datasets. Available choices are {list(supported.datasets.keys())}.') + help=f'Specify a space-separated list of dataset names to download. If left unspecified, the script will download all of the official benchmark datasets. 
Available choices are {wilds.supported_datasets}.') config = parser.parse_args() if config.datasets is None: - config.datasets = supported.benchmark_datasets + config.datasets = wilds.benchmark_datasets for dataset in config.datasets: - if dataset not in supported.datasets: - raise ValueError(f'{dataset} not recognized; must be one of {list(supported.datasets.keys())}.') + if dataset not in wilds.supported_datasets: + raise ValueError(f'{dataset} not recognized; must be one of {wilds.supported_datasets}.') print(f'Downloading the following datasets: {config.datasets}') for dataset in config.datasets: - print(f'=== {dataset} ===') - constructor = supported.datasets[dataset] - constructor(root_dir=config.root_dir, download=True) + print(f'=== {dataset} ===') + wilds.get_dataset( + dataset=dataset, + root_dir=config.root_dir, + download=True) if __name__=='__main__': diff --git a/wilds/get_dataset.py b/wilds/get_dataset.py new file mode 100644 index 00000000..1073100f --- /dev/null +++ b/wilds/get_dataset.py @@ -0,0 +1,79 @@ +import wilds + +def get_dataset(dataset, version=None, **dataset_kwargs): + """ + Returns the appropriate WILDS dataset class. + Input: + dataset (str): Name of the dataset + version (str): Dataset version number, e.g., '1.0'. + Defaults to the latest version. + dataset_kwargs: Other keyword arguments to pass to the dataset constructors. + Output: + The specified WILDSDataset class. + """ + if version is not None: + version = str(version) + + if dataset not in wilds.supported_datasets: + raise ValueError(f'The dataset {dataset} is not recognized. Must be one of {wilds.supported_datasets}.') + + if dataset == 'amazon': + from wilds.datasets.amazon_dataset import AmazonDataset + return AmazonDataset(version=version, **dataset_kwargs) + + elif dataset == 'camelyon17': + from wilds.datasets.camelyon17_dataset import Camelyon17Dataset + return Camelyon17Dataset(version=version, **dataset_kwargs) + + elif dataset == 'celebA': + from wilds.datasets.celebA_dataset import CelebADataset + return CelebADataset(version=version, **dataset_kwargs) + + elif dataset == 'civilcomments': + from wilds.datasets.civilcomments_dataset import CivilCommentsDataset + return CivilCommentsDataset(version=version, **dataset_kwargs) + + elif dataset == 'iwildcam': + if version == '1.0': + from wilds.datasets.archive.iwildcam_v1_0_dataset import IWildCamDataset + else: + from wilds.datasets.iwildcam_dataset import IWildCamDataset + return IWildCamDataset(version=version, **dataset_kwargs) + + elif dataset == 'waterbirds': + from wilds.datasets.waterbirds_dataset import WaterbirdsDataset + return WaterbirdsDataset(version=version, **dataset_kwargs) + + elif dataset == 'yelp': + from wilds.datasets.yelp_dataset import YelpDataset + return YelpDataset(version=version, **dataset_kwargs) + + elif dataset == 'ogb-molpcba': + from wilds.datasets.ogbmolpcba_dataset import OGBPCBADataset + return OGBPCBADataset(version=version, **dataset_kwargs) + + elif dataset == 'poverty': + if version == '1.0': + from wilds.datasets.archive.poverty_v1_0_dataset import PovertyMapDataset + else: + from wilds.datasets.poverty_dataset import PovertyMapDataset + return PovertyMapDataset(version=version, **dataset_kwargs) + + elif dataset == 'fmow': + if version == '1.0': + from wilds.datasets.archive.fmow_v1_0_dataset import FMoWDataset + else: + from wilds.datasets.fmow_dataset import FMoWDataset + return FMoWDataset(version=version, **dataset_kwargs) + + elif dataset == 'bdd100k': + from wilds.datasets.bdd100k_dataset import 
BDD100KDataset + return BDD100KDataset(version=version, **dataset_kwargs) + + elif dataset == 'py150': + from wilds.datasets.py150_dataset import Py150Dataset + return Py150Dataset(version=version, **dataset_kwargs) + + elif dataset == 'sqf': + from wilds.datasets.sqf_dataset import SQFDataset + return SQFDataset(version=version, **dataset_kwargs) From 70ccb105d28b3a2f09d040a4c3e70681313b9a00 Mon Sep 17 00:00:00 2001 From: Pang Wei Koh Date: Mon, 8 Mar 2021 12:41:37 -0800 Subject: [PATCH 108/116] clean up supported_datasets --- wilds/__init__.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/wilds/__init__.py b/wilds/__init__.py index ac377605..77f0ad5a 100644 --- a/wilds/__init__.py +++ b/wilds/__init__.py @@ -1,29 +1,23 @@ from .version import __version__ from .get_dataset import get_dataset -supported_datasets = [ +benchmark_datasets = [ 'amazon', 'camelyon17', - 'celebA', 'civilcomments', 'iwildcam', - 'waterbirds', - 'yelp', 'ogb-molpcba', 'poverty', 'fmow', - 'bdd100k', 'py150', - 'sqf', ] -benchmark_datasets = [ - 'amazon', - 'camelyon17', - 'civilcomments', - 'iwildcam', - 'ogb-molpcba', - 'poverty', - 'fmow', - 'py150', +additional_datasets = [ + 'celebA', + 'waterbirds', + 'yelp', + 'bdd100k', + 'sqf', ] + +supported_datasets = benchmark_datasets + additional_datasets From 9f99e61617a2bf84f8fa7caecaccc0cf4acb0375 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Mon, 8 Mar 2021 12:59:21 -0800 Subject: [PATCH 109/116] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4602f644..3a203a45 100644 --- a/README.md +++ b/README.md @@ -124,16 +124,16 @@ The WILDS package provides a simple, standardized interface for all datasets in This short Python snippet covers all of the steps of getting started with a WILDS dataset, including dataset download and initialization, accessing various splits, and preparing a user-customizable data loader. ```py ->>> from wilds.datasets.iwildcam_dataset import IWildCamDataset +>>> from wilds import get_dataset >>> from wilds.common.data_loaders import get_train_loader >>> import torchvision.transforms as transforms # Load the full dataset, and download it if necessary ->>> dataset = IWildCamDataset(download=True) +>>> dataset = get_dataset(dataset='iwildcam', download=True) # Get the training set >>> train_data = dataset.get_subset('train', -... transform=transforms.Compose([transforms.Resize((224,224)), +... transform=transforms.Compose([transforms.Resize((448,448)), ... transforms.ToTensor()])) # Prepare the standard data loader From 20c1d10c2990b8033325da4b73a682993f0231d8 Mon Sep 17 00:00:00 2001 From: kohpangwei Date: Mon, 8 Mar 2021 13:06:21 -0800 Subject: [PATCH 110/116] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3a203a45..1954b951 100644 --- a/README.md +++ b/README.md @@ -202,7 +202,7 @@ Invoking the `eval` method of each dataset yields all metrics reported in the pa >>> dataset.eval(all_y_pred, all_y_true, all_metadata) {'recall_macro_all': 0.66, ...} ``` -Most `eval` methods take in predicted labels for `all_y_pred` by default, but the default inputs vary across datasets and are documented in docstrings. +Most `eval` methods take in predicted labels for `all_y_pred` by default, but the default inputs vary across datasets and are documented in the `eval` docstrings of the corresponding dataset class. 
 
 ## Citing WILDS
 If you use WILDS datasets in your work, please cite [our paper](https://arxiv.org/abs/2012.07421) ([Bibtex](https://wilds.stanford.edu/assets/files/bibtex.md)):

From 829630bcbef9b9dc13044d3348f93452d363ac29 Mon Sep 17 00:00:00 2001
From: kohpangwei
Date: Mon, 8 Mar 2021 13:06:45 -0800
Subject: [PATCH 111/116] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1954b951..d54f137c 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ pip install -e .
 
 ### Requirements
 - numpy>=1.19.1
-- ogb>=1.2.5
+- ogb>=1.2.6
 - outdated>=0.2.0
 - pandas>=1.1.0
 - pillow>=7.2.0

From b90aebb6c5d3fe408267accbf6f0766b6cca046b Mon Sep 17 00:00:00 2001
From: kohpangwei
Date: Mon, 8 Mar 2021 13:09:35 -0800
Subject: [PATCH 112/116] Update README.md

---
 README.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d54f137c..ca56d6a0 100644
--- a/README.md
+++ b/README.md
@@ -77,22 +77,21 @@ In the `examples/` folder, we provide a set of scripts that can be used to downl
 These scripts are configured with the default models and hyperparameters that we used for all of the baselines described in our paper. All baseline results in the paper can be easily replicated with commands like:
 
 ```bash
-cd examples
-python run_expt.py --dataset iwildcam --algorithm ERM --root_dir data
-python run_expt.py --dataset civilcomments --algorithm groupDRO --root_dir data
+python examples/run_expt.py --dataset iwildcam --algorithm ERM --root_dir data
+python examples/run_expt.py --dataset civilcomments --algorithm groupDRO --root_dir data
 ```
 
 The scripts are set up to facilitate general-purpose algorithm development: new algorithms can be added to `examples/algorithms` and then run on all of the WILDS datasets using the default models.
 
 The first time you run these scripts, you might need to download the datasets. You can do so with the `--download` argument, for example:
 ```
-python run_expt.py --dataset civilcomments --algorithm groupDRO --root_dir data --download
+python examples/run_expt.py --dataset civilcomments --algorithm groupDRO --root_dir data --download
 ```
 
-Alternatively, you can use the standalone `examples/download_datasets.py` script, for example:
+Alternatively, you can use the standalone `wilds/download_datasets.py` script to download the datasets, for example:
 
 ```bash
-python download_datasets.py --root_dir data
+python wilds/download_datasets.py --root_dir data
 ```
 
 This will download all datasets to the specified `data` folder. You can also use the `--datasets` argument to download particular datasets.
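Between them, the `wilds/get_dataset.py` factory introduced above and the registry split in patch 108 define the package's public entry point. Below is a minimal sketch of how downstream code might exercise that API; it is not part of the patch series, and the `'waterbirds'` name and `root_dir='data'` are illustrative choices:

```py
import wilds

# After patch 108, supported_datasets is just the concatenation of the
# benchmark and additional registries.
assert set(wilds.supported_datasets) == (
    set(wilds.benchmark_datasets) | set(wilds.additional_datasets)
)

# Unrecognized names fail fast with the ValueError raised in get_dataset.
try:
    wilds.get_dataset(dataset='not-a-dataset')
except ValueError as err:
    print(err)

# Recognized names dispatch lazily: the dataset module is only imported
# when requested, and download=True fetches the data on first use.
dataset = wilds.get_dataset(dataset='waterbirds', root_dir='data', download=True)
print(type(dataset).__name__)  # WaterbirdsDataset
```

Routing every constructor through function-local imports is what keeps `import wilds` cheap: a dataset's dependencies are only pulled in when that dataset is actually requested.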
From 58956af7514219bcd9c74a00a49fbf4143a8745d Mon Sep 17 00:00:00 2001
From: Pang Wei Koh
Date: Mon, 8 Mar 2021 13:31:30 -0800
Subject: [PATCH 113/116] eval epoch comments

---
 examples/run_expt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_expt.py b/examples/run_expt.py
index adadfa61..acee29db 100644
--- a/examples/run_expt.py
+++ b/examples/run_expt.py
@@ -93,7 +93,7 @@ def main():
     parser.add_argument('--evaluate_all_splits', type=parse_bool, const=True, nargs='?', default=True)
     parser.add_argument('--eval_splits', nargs='+', default=[])
     parser.add_argument('--eval_only', type=parse_bool, const=True, nargs='?', default=False)
-    parser.add_argument('--eval_epoch', default=None, type=int)
+    parser.add_argument('--eval_epoch', default=None, type=int, help='If eval_only is set, then eval_epoch allows you to specify evaluating at a particular epoch. By default, it evaluates the best epoch by validation performance.')
 
     # Misc
     parser.add_argument('--device', type=int, default=0)

From 1b34bed8b2cd9710e3223c65284c440b21f922ba Mon Sep 17 00:00:00 2001
From: Shiori Sagawa
Date: Tue, 9 Mar 2021 17:29:09 -0800
Subject: [PATCH 114/116] update to version 1.1.0

---
 wilds/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wilds/version.py b/wilds/version.py
index 3f7bf4a6..6d19cfa3 100644
--- a/wilds/version.py
+++ b/wilds/version.py
@@ -4,7 +4,7 @@
 import logging
 from threading import Thread
 
-__version__ = '1.0.0'
+__version__ = '1.1.0'
 
 try:
     os.environ['OUTDATED_IGNORE'] = '1'

From 1ec63b75aa5e8ce8b3b74624ae1dcb9ac66bd85b Mon Sep 17 00:00:00 2001
From: Shiori Sagawa
Date: Tue, 9 Mar 2021 20:50:22 -0800
Subject: [PATCH 115/116] clean up for pypi release

---
 examples/models/bert/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 examples/models/bert/__init__.py

diff --git a/examples/models/bert/__init__.py b/examples/models/bert/__init__.py
deleted file mode 100644
index e69de29b..00000000

From 9c84fec9e05f0d7e46808ee9b34be74417af88c4 Mon Sep 17 00:00:00 2001
From: Shiori Sagawa
Date: Tue, 9 Mar 2021 21:13:33 -0800
Subject: [PATCH 116/116] clean up for pypi release

---
 examples/models/bert/__init__.py | 0
 setup.py                         | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 examples/models/bert/__init__.py

diff --git a/examples/models/bert/__init__.py b/examples/models/bert/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/setup.py b/setup.py
index 72b12368..9cd1f596 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@
         'pytz>=2020.4',
     ],
     license='MIT',
-    packages=setuptools.find_packages(exclude=['dataset_preprocessing', 'examples', 'examples.models']),
+    packages=setuptools.find_packages(exclude=['dataset_preprocessing', 'examples', 'examples.models', 'examples.models.bert']),
     classifiers=[
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Intended Audience :: Science/Research',
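The series closes with documentation and release chores: patch 113 adds help text for `--eval_epoch` (with `--eval_only` set, a command like `python examples/run_expt.py --dataset civilcomments --algorithm groupDRO --root_dir data --eval_only --eval_epoch 3` evaluates the model saved at epoch 3 rather than the best epoch by validation performance; the dataset, algorithm, and epoch here are illustrative), patch 114 bumps the package to 1.1.0, and patches 115 and 116 adjust `find_packages` so `examples.models.bert` is kept out of the distribution. A short sanity-check sketch of the released state, under the stated assumptions:

```py
# A sketch, assuming the wheel built from this patch series is installed
# (not an editable install run from the repository root).
import importlib.util
import wilds

# Patch 114 bumps wilds/version.py for the PyPI release.
print(wilds.__version__)  # expected: 1.1.0

# Patches 115-116 exclude the examples tree (including examples.models.bert)
# from the packaged distribution, so it should not be importable here.
assert importlib.util.find_spec('examples') is None
```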