diff --git a/Project.toml b/Project.toml
index 2abfd76b..8e1fd321 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,20 +1,20 @@
 name = "CausalELM"
 uuid = "26abab4e-b12e-45db-9809-c199ca6ddca8"
 authors = ["Darren Colby <dscolby17@gmail.com> and contributors"]
-version = "0.6"
+version = "0.7.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
 [compat]
-LinearAlgebra = "1.7"
-Random = "1.7"
-julia = "1.7"
 Aqua = "0.8"
 DataFrames = "1.5"
 Documenter = "1.2"
+LinearAlgebra = "1.7"
+Random = "1.7"
 Test = "1.7"
+julia = "1.7"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
diff --git a/README.md b/README.md
index 79df210f..7e4b9d01 100644
--- a/README.md
+++ b/README.md
@@ -41,11 +41,11 @@ series analysis, G-computation, and double machine learning; average treatment e
 treated (ATT) with G-computation; cumulative treatment effect with interrupted time series 
 analysis; and the conditional average treatment effect (CATE) via S-learning, T-learning, 
 X-learning, R-learning, and doubly robust estimation. Underlying all of these estimators are 
-extreme learning machines, a simple neural network that uses randomized weights instead of 
-using gradient descent. Once a model has been estimated, CausalELM can summarize the model, 
-including computing p-values via randomization inference, and conduct sensitivity analysis 
-to calidate the plausibility of modeling assumptions. Furthermore, all of this can be done 
-in four lines of code.
+ensembles of extreme learning machines, a simple neural network that uses randomized weights 
+and least squares optimization instead of gradient descent. Once a model has been estimated, 
+CausalELM can summarize the model and conduct sensitivity analysis to validate the 
+plausibility of modeling assumptions. Furthermore, all of this can be done in four lines of 
+code.
 </p>
 
 <h2>Extreme Learning Machines and Causal Inference</h2>
@@ -73,37 +73,39 @@ to adjust the initial estimates. This approach has three advantages. First, it i
 efficient with high dimensional data than conventional methods. Metalearners take a similar 
 approach to estimate the CATE. While all of these models are different, they have one thing 
 in common: how well they perform depends on the underlying model they fit to the data. To 
-that end, CausalELMs use extreme learning machines because they are simple yet flexible 
-enough to be universal function approximators.
+that end, CausalELMs use bagged ensembles of extreme learning machines because they are 
+simple yet flexible enough to be universal function approximators with lower variance than 
+single extreme learning machines.
 </p>
 
 <h2>CausalELM Features</h2>
 <ul>
   <li>Estimate a causal effect, get a summary, and validate assumptions in just four lines of code</li>
-  <li>All models automatically select the best number of neurons and L2 penalty</li>
+  <li>Bagging improves performance and reduces variance without the need to tune a regularization parameter</li>
   <li>Enables using the same structs for regression and classification</li>
   <li>Includes 13 activation functions and allows user-defined activation functions</li>
   <li>Most inference and validation tests do not assume functional or distributional forms</li>
   <li>Implements the latest techniques form statistics, econometrics, and biostatistics</li>
-  <li>Works out of the box with DataFrames or arrays</li>
+  <li>Works out of the box with arrays or any data structure that implements the Tables.jl interface</li>
   <li>Codebase is high-quality, well tested, and regularly updated</li>
 </ul>
 
 <h2>What's New?</h2>
 <ul>
   <li>Now includes doubly robust estimator for CATE estimation</li>
-  <li>Uses generalized cross validation with successive halving to find the best ridge penalty</li>
-  <li>Double machine learning, R-learning, and doubly robust estimators suppot specifying confounders and covariates of interest separately</li>
-  <li>Counterfactual consistency validation simulates outcomes that violate the assumption rather than the previous binning approach</li>
-  <li>Standardized and improved docstrings and added doctests</li>
+  <li>All estimators now implement bagging to improve predictive performance and reduce variance</li>
+  <li>Counterfactual consistency validation simulates more realistic violations of the counterfactual consistency assumption</li>
+  <li>Uses a simple heuristic to choose the number of neurons, which reduces training time and still works well in practice</li>
+  <li>Probability clipping for classifier predictions and residuals is no longer necessary due to the bagging procedure</li>
   <li>CausalELM talk has been accepted to JuliaCon 2024!</li> 
 </ul>
 
 <h2>What's Next?</h2>
 <p>
-Newer versions of CausalELM will hopefully support using GPUs and provide textual 
-interpretations of the results of calling validate on a model that has been estimated. 
-However, these priorities could also change depending on feedback recieved at JuliaCon.
+Newer versions of CausalELM will hopefully support using GPUs and provide interpretations of 
+the results of calling validate on a model that has been estimated. In addition, some 
+estimators will also support using instrumental variables. However, these priorities could 
+also change depending on feedback received at JuliaCon.
 </p>
 
 <h2>Disclaimer</h2>
diff --git a/docs/src/api.md b/docs/src/api.md
index 3ccdbf86..41e0ff3a 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -1,7 +1,7 @@
 # CausalELM
-Most of the methods and structs here are private, not exported, should not be called by the 
-user, and are documented for the purpose of developing CausalELM or to facilitate 
-understanding of the implementation.
+```@docs
+CausalELM.CausalELM
+```
 
 ## Types
 ```@docs
@@ -15,9 +15,8 @@ RLearner
 DoublyRobustLearner
 CausalELM.CausalEstimator
 CausalELM.Metalearner
-CausalELM.ExtremeLearningMachine
 CausalELM.ExtremeLearner
-CausalELM.RegularizedExtremeLearner
+CausalELM.ELMEnsemble
 CausalELM.Nonbinary
 CausalELM.Binary
 CausalELM.Count
@@ -41,28 +40,15 @@ elish
 fourier
 ```
 
-## Cross Validation
-```@docs
-CausalELM.generate_folds
-CausalELM.generate_temporal_folds
-CausalELM.validation_loss
-CausalELM.cross_validate
-CausalELM.best_size
-CausalELM.shuffle_data
-```
-
 ## Average Causal Effect Estimators
 ```@docs
 CausalELM.g_formula!
-CausalELM.causal_loss!
 CausalELM.predict_residuals
-CausalELM.make_folds
 CausalELM.moving_average
 ```
 
 ## Metalearners
 ```@docs
-CausalELM.causal_loss
 CausalELM.doubly_robust_formula!
 CausalELM.stage1!
 CausalELM.stage2!
@@ -94,7 +80,6 @@ CausalELM.e_value
 CausalELM.binarize
 CausalELM.risk_ratio
 CausalELM.positivity
-CausalELM.var_type
 ```
 
 ## Validation Metrics
@@ -114,17 +99,17 @@ CausalELM.fit!
 CausalELM.predict
 CausalELM.predict_counterfactual!
 CausalELM.placebo_test
-CausalELM.ridge_constant
 CausalELM.set_weights_biases
 ```
 
 ## Utility Functions
 ```@docs
+CausalELM.var_type
 CausalELM.mean
 CausalELM.var
 CausalELM.one_hot_encode
 CausalELM.clip_if_binary
 CausalELM.@model_config
 CausalELM.@standard_input_data
-CausalELM.@double_learner_input_data
+CausalELM.generate_folds
 ```
diff --git a/docs/src/contributing.md b/docs/src/contributing.md
index cce36f11..eda36ddc 100644
--- a/docs/src/contributing.md
+++ b/docs/src/contributing.md
@@ -27,15 +27,15 @@ code follows the guidelines below.
 
 *   Most new structs for estimating causal effects should have mostly the same fields. To 
     reduce the burden of repeatedly defining all these fields, it is advisable to use the 
-    model_config, standard_input_data, and double_learner_input_data macros to 
-    programmatically generate fields for new structs. Doing so will ensure that with little 
-    to no effort the new structs will work with the summarize and validate methods.
+    model_config and standard_input_data macros to programmatically generate fields for new 
+    structs. Doing so will ensure that with little to no effort the new structs will work 
+    with the summarize and validate methods.
 
 *   There are no repeated code blocks. If there are repeated codeblocks, then they should be 
     consolidated into a separate function.
 
-*   Methods should generally include types and be type stable. If there is a strong reason 
-    to deviate from this point, there should be a comment in the code explaining why.
+*   Internal methods can contain types and be parametric but public methods should be as 
+    general as possible.
 
 *   Minimize use of new constants and macros. If they must be included, the reason for their 
     inclusion should be obvious or included in the docstring.
diff --git a/docs/src/guide/doublemachinelearning.md b/docs/src/guide/doublemachinelearning.md
index de870e50..ff0657cb 100644
--- a/docs/src/guide/doublemachinelearning.md
+++ b/docs/src/guide/doublemachinelearning.md
@@ -4,13 +4,8 @@ estimating causal effects when the dimensionality of the covariates is too high
 regression or the treatment or outcomes cannot be easily modeled parametrically. Double 
 machine learning estimates models of the treatment assignment and outcome and then combines 
 them in a final model. This is a semiparametric model in the sense that the first stage 
-models can take on any functional form but the final stage model is linear.
-
-!!! note
-    If regularized is set to true then the ridge penalty will be estimated using generalized 
-    cross validation where the maximum number of iterations is 2 * folds for the successive 
-    halving procedure. However, if the penalty in on iteration is approximately the same as in 
-    the previous penalty, then the procedure will stop early.
+models can take on any functional form but the final stage model is a linear combination of 
+the residuals from the first stage models.
 
 !!! note
     For more information see:
@@ -19,70 +14,53 @@ models can take on any functional form but the final stage model is linear.
     Whitney Newey, and James Robins. "Double/debiased machine learning for treatment and 
     structural parameters." (2018): C1-C68.
 
-
 ## Step 1: Initialize a Model
-The DoubleMachineLearning constructor takes at least three arguments, an array of 
-covariates, a treatment vector, and an outcome vector. This estimator supports binary, count, 
-or continuous treatments and binary, count, continuous, or time to event outcomes. You can 
-also specify confounders that you do not want to estimate the CATE for by passing a parameter 
-to the W argument. Otherwise, the model assumes all possible confounders are contained in X.
+The DoubleMachineLearning constructor takes at least three arguments—covariates, 
+treatment statuses, and outcomes, all of which may be either an array or any struct that 
+implements the Tables.jl interface (e.g. DataFrames). This estimator supports binary, count, 
+or continuous treatments and binary, count, continuous, or time to event outcomes.
 
 !!! note
-    Internally, the outcome and treatment models are treated as a regression since extreme 
-    learning machines minimize the MSE. This means that predicted treatments and outcomes 
-    under treatment and control groups could fall outside [0, 1], although this is not likely 
-    in practice. To deal with this, predicted binary variables are automatically clipped to 
-    [0.0000001, 0.9999999]. This also means that count outcomes will be predicted as continuous 
-    variables.
+    Non-binary categorical outcomes are treated as continuous.
 
 !!! tip
-    You can also specify the following options: whether the treatment vector is categorical ie 
-    not continuous and containing more than two classes, whether to use L2 regularization, the 
-    activation function, the validation metric to use when searching for the best number of 
-    neurons, the minimum and maximum number of neurons to consider, the number of folds to use 
-    for cross validation, the number of iterations to perform cross validation, and the number 
-    of neurons to use in the ELM used to learn the function from number of neurons to validation 
-    loss. These arguments are specified with the following keyword arguments: t\_cat, 
-    regularized, activation, validation\_metric, min\_neurons, max\_neurons, folds, iterations, 
-    and approximator\_neurons.
+    You can also specify the number of folds to use for cross-fitting, the number of 
+    extreme learning machines to incorporate in the ensemble, the number of features to 
+    consider for each extreme learning machine, the activation function to use, the number 
+    of observations to bootstrap in each extreme learning machine, and the number of neurons 
+    in each extreme learning machine. These arguments are specified with the folds, 
+    num\_machines, num\_feats, activation, sample\_size, and num\_neurons keywords.
+
 ```julia
 # Create some data with a binary treatment
 X, T, Y, W = rand(100, 5), [rand()<0.4 for i in 1:100], rand(100), rand(100, 4)
 
-# We could also use DataFrames
+# We could also use DataFrames or any other package implementing the Tables.jl API
 # using DataFrames
 # X = DataFrame(x1=rand(100), x2=rand(100), x3=rand(100), x4=rand(100), x5=rand(100))
 # T, Y = DataFrame(t=[rand()<0.4 for i in 1:100]), DataFrame(y=rand(100))
-# W = DataFrame(w1=rand(100), w2=rand(100), w3=rand(100), w4=rand(100))
-
-# W is optional and means there are confounders that you are not interested in estimating
-# the CATE for
-dml = DoubleMachineLearning(X, T, Y, W=W)
+dml = DoubleMachineLearning(X, T, Y)
 ```
 
 ## Step 2: Estimate the Causal Effect
-To estimate the causal effect, we call estimatecausaleffect! on the model above.
+To estimate the causal effect, we call estimate_causal_effect! on the model above.
 ```julia
 # we could also estimate the ATT by passing quantity_of_interest="ATT"
 estimate_causal_effect!(dml)
 ```
 
 # Get a Summary
-We can get a summary that includes a p-value and standard error estimated via asymptotic 
-randomization inference by passing our model to the summarize method.
-
-Calling the summarize method returns a dictionary with the estimator's task (regression or 
-classification), the quantity of interest being estimated (ATE), whether the model uses an 
-L2 penalty (always true for DML), the activation function used in the model's outcome 
-predictors, whether the data is temporal (always false for DML), the validation metric used 
-for cross validation to find the best number of neurons, the number of neurons used in the 
-ELMs used by the estimator, the number of neurons used in the ELM used to learn a mapping 
-from number of neurons to validation loss during cross validation, the causal effect, 
-standard error, and p-value.
+We can get a summary of the model by passing the model to the summarize method.
+
+!!! note
+    To calculate the p-value and standard error for the treatment effect, you can set the 
+    inference argument to true. However, p-values and standard errors are calculated via 
+    randomization inference, which will take a long time, but this can be sped up by 
+    launching Julia with a higher number of threads.
+
 ```julia
 # Can also use the British spelling
 # summarise(dml)
-
 summarize(dml)
 ```
 
@@ -94,12 +72,12 @@ tests do not provide definitive evidence of a violation of these assumptions. To
 counterfactual consistency assumption, we simulate counterfactual outcomes that are 
 different from the observed outcomes, estimate models with the simulated counterfactual 
 outcomes, and take the averages. If the outcome is continuous, the noise for the simulated 
-counterfactuals is drawn from N(0, dev) for each element in devs, otherwise the default is 
-0.25, 0.5, 0.75, and 1.0 standard deviations from the mean outcome. For discrete variables, 
-each outcome is replaced with a different value in the range of outcomes with probability ϵ 
-for each ϵ in devs, otherwise the default is 0.025, 0.05, 0.075, 0.1. If the average 
-estimate for a given level of violation differs greatly from the effect estimated on the 
-actual data, then the model is very sensitive to violations of the counterfactual 
+counterfactuals is drawn from N(0, dev) for each element in devs and each outcome, 
+multiplied by the original outcome, and added to the original outcome. For discrete 
+variables, each outcome is replaced with a different value in the range of outcomes with 
+probability ϵ for each ϵ in devs, otherwise the default is 0.025, 0.05, 0.075, 0.1. If the 
+average estimate for a given level of violation differs greatly from the effect estimated on 
+the actual data, then the model is very sensitive to violations of the counterfactual 
 consistency assumption for that level of violation. Next, this method tests the model's 
 sensitivity to a violation of the exchangeability assumption by calculating the E-value, 
 which is the minimum strength of association, on the risk ratio scale, that an unobserved 
diff --git a/docs/src/guide/estimatorselection.md b/docs/src/guide/estimatorselection.md
index f42af92f..ca948056 100644
--- a/docs/src/guide/estimatorselection.md
+++ b/docs/src/guide/estimatorselection.md
@@ -5,15 +5,13 @@ given dataset and causal question.
 
 | Model                            | Struct                | Causal Estimands                 | Supported Treatment Types | Supported Outcome Types                  |
 |----------------------------------|-----------------------|----------------------------------|---------------------------|------------------------------------------|
-| Interrupted Time Series Analysis | InterruptedTimeSeries | ATE, Cumulative Treatment Effect | Binary                   | Continuous, Count[^2], Time to Event         |
-| G-computation                    | GComputation          | ATE, ATT, ITT                    | Binary                   | Binary[^1],Continuous, Time to Event, Count[^2] |
-| Double Machine Learning          | DoubleMachineLearning | ATE                              | Binary[^1], Count[^2], Continuous | Binary[^1], Count[^2], Continuous, Time to Event |
-| S-learning                       | SLearner              | CATE                             | Binary                    | Binary[^1], Continuous, Time to Event, Count[^2] |
-| T-learning                       | TLearner              | CATE                             | Binary                    | Binary[^1], Continuous, Count[^2], Time to Event |
-| X-learning                       | XLearner              | CATE                             | Binary[^1]                    | Binary[^1], Continuous, Count[^2], Time to Event |
-| R-learning                       | RLearner              | CATE                             | Binary[^1], Count[^2], Continuous | Binary[^1], Count[^2], Continuous, Time to Event |
-| Doubly Robust Estimation         | DoublyRobustLearner   | CATE                             | Binary                    | Binary[^1], Continuous, Count[^2], Time to Event |
+| Interrupted Time Series Analysis | InterruptedTimeSeries | ATE, Cumulative Treatment Effect | Binary                   | Continuous, Count[^1], Time to Event         |
+| G-computation                    | GComputation          | ATE, ATT, ITT                    | Binary                   | Binary, Continuous, Time to Event, Count[^1] |
+| Double Machine Learning          | DoubleMachineLearning | ATE                              | Binary, Count[^1], Continuous | Binary, Count[^1], Continuous, Time to Event |
+| S-learning                       | SLearner              | CATE                             | Binary                    | Binary, Continuous, Time to Event, Count[^1] |
+| T-learning                       | TLearner              | CATE                             | Binary                    | Binary, Continuous, Count[^1], Time to Event |
+| X-learning                       | XLearner              | CATE                             | Binary                    | Binary, Continuous, Count[^1], Time to Event |
+| R-learning                       | RLearner              | CATE                             | Binary, Count[^1], Continuous | Binary, Count[^1], Continuous, Time to Event |
+| Doubly Robust Estimation         | DoublyRobustLearner   | CATE                             | Binary                    | Binary, Continuous, Count[^1], Time to Event |
 
-[^1]: Models that use propensity scores or predict binary treatment assignment may, on very rare occasions, return values outside of [0, 1]. In that case, values are clipped to be between 0.0000001 and 0.9999999.
-
-[^2]: Similar to other packages, predictions of count variables is treated as a continuous regression task.
\ No newline at end of file
+[^1]: Similar to other packages, prediction of count variables is treated as a continuous regression task.
\ No newline at end of file
diff --git a/docs/src/guide/gcomputation.md b/docs/src/guide/gcomputation.md
index 950be7d9..8f3a266d 100644
--- a/docs/src/guide/gcomputation.md
+++ b/docs/src/guide/gcomputation.md
@@ -5,12 +5,6 @@ given at multiple times whose status depends on the health of the patient at a g
 One way to get an unbiased estimate of the causal effect is to use G-computation. The basic 
 steps for using G-computation in CausalELM are below.
 
-!!! note
-    If regularized is set to true then the ridge penalty will be estimated using generalized 
-    cross validation where the maximum number of iterations is 2 * folds for the successive 
-    halving procedure. However, if the penalty in on iteration is approximately the same as in 
-    the previous penalty, then the procedure will stop early.
-
 !!! note 
     For a good overview of G-Computation see:
     
@@ -21,57 +15,50 @@ steps for using G-computation in CausalELM are below.
         study." Scientific reports 10, no. 1 (2020): 9219.
 
 ## Step 1: Initialize a Model
-The GComputation method takes at least three arguments: an array of covariates, a vector of 
-treatment statuses, and an outcome vector. It can support binary treatments and binary, 
-continuous, time to event, and count outcome variables.
-
-!!! tip
-    You can also specify the causal estimand, whether to employ L2 regularization, which 
-    activation function to use, whether the data is of a temporal nature, the metric to use when 
-    using cross validation to find the best number of neurons, the minimum number of neurons to 
-    consider, the maximum number of neurons to consider, the number of folds to use during cross 
-    caidation, and the number of neurons to use in the ELM that learns a mapping from number of 
-    neurons to validation loss. These options are specified with the following keyword 
-    arguments: quantity\_of\_interest, regularized, activation, temporal, validation\_metric, 
-    min\_neurons, max\_neurons, folds, iterations, and approximator\_neurons.
+The GComputation constructor takes at least three arguments: covariates, treatment statuses, 
+and outcomes, all of which can be either an array or any data structure that implements 
+the Tables.jl interface (e.g. DataFrames). This implementation supports binary treatments 
+and binary, continuous, time to event, and count outcome variables.
 
 !!! note
-    Internally, the outcome model is treated as a regression since extreme learning machines 
-    minimize the MSE. This means that predicted outcomes under treatment and control groups 
-    could fall outside [0, 1], although this is not likely in practice. To deal with this, 
-    predicted binary variables are automatically clipped to [0.0000001, 0.9999999]. This also 
-    means that count outcomes will be predicted as continuous variables.
+    Non-binary categorical outcomes are treated as continuous.
+
+!!! tip
+    You can also specify the causal estimand, which activation function to use, whether the 
+    data is of a temporal nature, the number of extreme learning machines to use, the 
+    number of features to consider for each extreme learning machine, the number of 
+    bootstrapped observations to include in each extreme learning machine, and the number of 
+    neurons to use during estimation. These options are specified with the following keyword 
+    arguments: quantity\_of\_interest, activation, temporal, num\_machines, num\_feats, 
+    sample\_size, and num\_neurons.
 
 ```julia
 # Create some data with a binary treatment
 X, T, Y =  rand(1000, 5), [rand()<0.4 for i in 1:1000], rand(1000)
 
-# We could also use DataFrames
+# We could also use DataFrames or any other package that implements the Tables.jl API
 # using DataFrames
 # X = DataFrame(x1=rand(1000), x2=rand(1000), x3=rand(1000), x4=rand(1000), x5=rand(1000))
 # T, Y = DataFrame(t=[rand()<0.4 for i in 1:1000]), DataFrame(y=rand(1000))
-
 g_computer = GComputation(X, T, Y)
 ```
 
 ## Step 2: Estimate the Causal Effect
-To estimate the causal effect, we pass the model above to estimatecausaleffect!.
+To estimate the causal effect, we pass the model above to estimate_causal_effect!.
 ```julia
 # Note that we could also estimate the ATT by setting quantity_of_interest="ATT"
 estimate_causal_effect!(g_computer)
 ```
 
 ## Step 3: Get a Summary
-We get a summary of the model that includes a p-value and standard error estimated via 
-asymptotic randomization inference by passing our model to the summarize method.
-
-Calling the summarize method returns a dictionary with the estimator's task (regression or 
-classification), the quantity of interest being estimated (ATE or ATT), whether the model 
-uses an L2 penalty, the activation function used in the model's outcome predictors, whether 
-the data is temporal, the validation metric used for cross validation to find the best 
-number of neurons, the number of neurons used in the ELMs used by the estimator, the number 
-of neurons used in the ELM used to learn a mapping from number of neurons to validation 
-loss during cross validation, the causal effect, standard error, and p-value.
+We can get a summary of the model by passing the model to the summarize method.
+
+!!! note
+    To calculate the p-value and standard error for the treatment effect, you can set the 
+    inference argument to true. However, p-values and standard errors are calculated via 
+    randomization inference, which will take a long time, but this can be sped up by 
+    launching Julia with a higher number of threads.
+
 ```julia
 summarize(g_computer)
 ```
@@ -84,12 +71,12 @@ tests do not provide definitive evidence of a violation of these assumptions. To
 counterfactual consistency assumption, we simulate counterfactual outcomes that are 
 different from the observed outcomes, estimate models with the simulated counterfactual 
 outcomes, and take the averages. If the outcome is continuous, the noise for the simulated 
-counterfactuals is drawn from N(0, dev) for each element in devs, otherwise the default is 
-0.25, 0.5, 0.75, and 1.0 standard deviations from the mean outcome. For discrete variables, 
-each outcome is replaced with a different value in the range of outcomes with probability ϵ 
-for each ϵ in devs, otherwise the default is 0.025, 0.05, 0.075, 0.1. If the average 
-estimate for a given level of violation differs greatly from the effect estimated on the 
-actual data, then the model is very sensitive to violations of the counterfactual 
+counterfactuals is drawn from N(0, dev) for each element in devs and each outcome, 
+multiplied by the original outcome, and added to the original outcome. For discrete 
+variables, each outcome is replaced with a different value in the range of outcomes with 
+probability ϵ for each ϵ in devs, otherwise the default is 0.025, 0.05, 0.075, 0.1. If the 
+average estimate for a given level of violation differs greatly from the effect estimated on 
+the actual data, then the model is very sensitive to violations of the counterfactual 
 consistency assumption for that level of violation. Next, this method tests the model's 
 sensitivity to a violation of the exchangeability assumption by calculating the E-value, 
 which is the minimum strength of association, on the risk ratio scale, that an unobserved 
@@ -102,8 +89,7 @@ an estimated zero probability of treatment, which implies the positivity assumpt
 satisfied.
 
 !!! tip
-    One can also specify the maxium number of possible treatments to consider for the causal 
-    consistency assumption and the minimum and maximum probabilities of treatment for the 
+    One can also specify the minimum and maximum probabilities of treatment for the 
     positivity assumption with the num\_treatments, min, and max keyword arguments.
 
 !!! danger
diff --git a/docs/src/guide/its.md b/docs/src/guide/its.md
index 8fddf609..982dd65d 100644
--- a/docs/src/guide/its.md
+++ b/docs/src/guide/its.md
@@ -1,23 +1,17 @@
 # Interrupted Time Series Analysis
 Sometimes we want to know how an outcome variable for a single unit changed after an event 
 or intervention. For example, if regulators announce sanctions against company A, we might 
-want to know how the price of stock A changed after the announcement. Since we do not know
-what the price of Company A's stock would have been if the santions were not announced, we
-need some way to predict those values. An interrupted time series analysis does this by 
-using some covariates that are related to the oucome variable but not related to whether the 
-event happened to predict what would have happened. The estimated effects are the 
-differences between the predicted post-event counterfactual outcomes and the observed 
+want to know how the price of company A's stock changed after the announcement. Since we do 
+not know what the price of Company A's stock would have been if the sanctions were not 
+announced, we need some way to predict those values. An interrupted time series analysis 
+does this by using some covariates that are related to the outcome but not related to 
+whether the event happened to predict what would have happened. The estimated effects are 
+the differences between the predicted post-event counterfactual outcomes and the observed 
 post-event outcomes, which can also be aggregated to mean or cumulative effects. 
 Estimating an interrupted time series design in CausalELM consists of three steps.
 
 !!! note
-    If regularized is set to true then the ridge penalty will be estimated using generalized 
-    cross validation where the maximum number of iterations is 2 * folds for the successive 
-    halving procedure. However, if the penalty in on iteration is approximately the same as in 
-    the previous penalty, then the procedure will stop early.
-
-!!! note
-    For a deeper dive on interrupted time series estimation see:
+    For a general overview of interrupted time series estimation see:
     
         Bernal, James Lopez, Steven Cummins, and Antonio Gasparrini. "Interrupted time series 
         regression for the evaluation of public health interventions: a tutorial." International 
@@ -35,35 +29,32 @@ Estimating an interrupted time series design in CausalELM consists of three step
     opposed to the commonly used segment linear regression.
 
 ## Step 1: Initialize an interrupted time series estimator
-The InterruptedTimeSeries method takes at least four agruments: an array of pre-event 
-covariates, a vector of pre-event outcomes, an array of post-event covariates, and a vector 
-of post-event outcomes. The interrupted time series estimator assumes outcomes are either 
-continuous, count, or time to event variables.
+The InterruptedTimeSeries constructor takes at least four arguments: pre-event covariates, 
+pre-event outcomes, post-event covariates, and post-event outcomes, all of which can be 
+either an array or any data structure that implements the Tables.jl interface (e.g. 
+DataFrames). The interrupted time series estimator assumes outcomes are either continuous, 
+count, or time to event variables.
 
 !!! note
-    Since extreme learning machines minimize the MSE, count outcomes will be predicted as 
-    continuous variables.
+    Non-binary categorical outcomes are treated as continuous.
 
 !!! tip
-    You can also specify whether or not to use L2 regularization, which activation function to 
-    use, the metric to use when using cross validation to find the best number of neurons, the 
-    minimum number of neurons to consider, the maximum number of neurons to consider, the number 
-    of folds to use during cross caidation, the number of neurons to use in the ELM that learns 
-    a mapping from number of neurons to validation loss, and whether to include a rolling 
-    average autoregressive term. These options can be specified using the keyword arguments 
-    regularized, activation, validation\_metric, min\_neurons, max\_neurons, folds, iterations, 
-    approximator\_neurons, and autoregression.
+    You can also specify which activation function to use, the number of extreme learning 
+    machines to use, the number of features to consider for each extreme learning machine, 
+    the number of bootstrapped observations to include in each extreme learning machine, and 
+    the number of neurons to use during estimation. These options are specified with the 
+    following keyword arguments: activation, num\_machines, num\_feats, sample\_size, and 
+    num\_neurons.
 
 ```julia
 # Generate some data to use
 X₀, Y₀, X₁, Y₁ =  rand(1000, 5), rand(1000), rand(100, 5), rand(100)
 
-# We could also use DataFrames
+# We could also use DataFrames or any other package that implements the Tables.jl interface
 # using DataFrames
 # X₀ = DataFrame(x1=rand(1000), x2=rand(1000), x3=rand(1000), x4=rand(1000), x5=rand(1000))
 # X₁ = DataFrame(x1=rand(1000), x2=rand(1000), x3=rand(1000), x4=rand(1000), x5=rand(1000))
 # Y₀, Y₁ = DataFrame(y=rand(1000)), DataFrame(y=rand(1000))
-
 its = InterruptedTimeSeries(X₀, Y₀, X₁, Y₁)
 ```
 
@@ -75,16 +66,14 @@ estimate_causal_effect!(its)
 ```
 
 ## Step 3: Get a Summary
-We can get a summary of the model, including a p-value and statndard via asymptotic 
-randomization inference, by pasing the model to the summarize method.
-
-Calling the summarize method returns a dictionary with the estimator's task (always 
-regression for interrupted time series analysis), whether the model uses an L2 penalty, 
-the activation function used in the model's outcome predictors, the validation metric used 
-for cross validation to find the best number of neurons, the number of neurons used in the 
-ELMs used by the estimator, the number of neurons used in the ELM used to learn a mapping 
-from number of neurons to validation loss during cross validation, the causal effect, 
-standard error, and p-value.
+We can get a summary of the model by passing the model to the summarize method.
+
+!!! note
+    To calculate the p-value and standard error for the treatment effect, you can set the 
+    inference argument to true. However, p-values and standard errors are calculated via 
+    randomization inference, which will take a long time, but can be sped up by launching 
+    Julia with a higher number of threads.
+
 ```julia
 summarize(its)
 ```
diff --git a/docs/src/guide/metalearners.md b/docs/src/guide/metalearners.md
index f5cbe56e..76718c60 100644
--- a/docs/src/guide/metalearners.md
+++ b/docs/src/guide/metalearners.md
@@ -11,12 +11,6 @@ doubly robust learners, they can only handle binary treatments. On the other han
 R-learners can handle binary, categorical, count, or continuous treatments but only supports 
 continuous outcomes.
 
-!!! note
-    If regularized is set to true then the ridge penalty will be estimated using generalized 
-    cross validation where the maximum number of iterations is 2 * folds for the successive 
-    halving procedure. However, if the penalty in on iteration is approximately the same as 
-    in the previous penalty, then the procedure will stop early.
-
 !!! note
     For a deeper dive on S-learning, T-learning, and X-learning see:
     
@@ -30,45 +24,44 @@ continuous outcomes.
         Nie, Xinkun, and Stefan Wager. "Quasi-oracle estimation of heterogeneous treatment 
         effects." Biometrika 108, no. 2 (2021): 299-319.
 
+
     To see the details out doubly robust estimation implemented in CausalELM see:
+
         Kennedy, Edward H. "Towards optimal doubly robust estimation of heterogeneous causal 
         effects." Electronic Journal of Statistics 17, no. 2 (2023): 3008-3049.
 
 # Initialize a Metalearner
 S-learners, T-learners, X-learners, R-learners, and doubly robust estimators all take at 
-least three arguments: an array of covariates, a vector of outcomes, and a vector of 
-treatment statuses. S, T, X, and doubly robust learners support binary treatment variables 
-and binary, continuous, count, or time to event outcomes. The R-learning estimator supports 
-binary, continuous, or count treatment variables and binary, continuous, count, or time to 
-event outcomes.
+least three arguments—covariates, treatment statuses, and outcomes, all of which can be 
+either an array or any struct that implements the Tables.jl interface (e.g. DataFrames). S, 
+T, X, and doubly robust learners support binary treatment variables and binary, continuous, 
+count, or time to event outcomes. The R-learning estimator supports binary, continuous, or 
+count treatment variables and binary, continuous, count, or time to event outcomes.
 
 !!! note
-    Internally, the outcome and treatment models of the metalearners are treated as a regression 
-    since extreme learning machines minimize the MSE. This means that predicted treatments and 
-    outcomes under treatment and control groups could fall outside [0, 1], although this is not 
-    likely in practice. To deal with this, predicted binary variables are automatically clipped to 
-    [0.0000001, 0.9999999].This also means that count outcomes will be predicted as continuous 
-    variables.
+    Non-binary categorical outcomes are treated as continuous.
 
 !!! tip
-    Additional options can be specified for each type of metalearner using its keyword arguments.
+    You can also specify the number of folds to use for cross-fitting, the number of 
+    extreme learning machines to incorporate in the ensemble, the number of features to 
+    consider for each extreme learning machine, the activation function to use, the number 
+    of observations to bootstrap in each extreme learning machine, and the number of neurons 
+    in each extreme learning machine. These arguments are specified with the folds, 
+    num\_machines, num\_features, activation, sample\_size, and num\_neurons keywords.
+
 ```julia
 # Generate data to use
 X, Y, T =  rand(1000, 5), rand(1000), [rand()<0.4 for i in 1:1000]
 
-# We can also speficy potential confounders that we are not interested in
-W = randn(1000, 6)
-
-# We could also use DataFrames
+# We could also use DataFrames or any other package that implements the Tables.jl API
 # using DataFrames
 # X = DataFrame(x1=rand(1000), x2=rand(1000), x3=rand(1000), x4=rand(1000), x5=rand(1000))
 # T, Y = DataFrame(t=[rand()<0.4 for i in 1:1000]), DataFrame(y=rand(1000))
-
 s_learner = SLearner(X, Y, T)
 t_learner = TLearner(X, Y, T)
 x_learner = XLearner(X, Y, T)
-r_learner = RLearner(X, Y, T, W=W)
-dr_learner = DoublyRobustLearner(X, T, Y, W=W)
+r_learner = RLearner(X, Y, T)
+dr_learner = DoublyRobustLearner(X, T, Y)
 ```
 
 # Estimate the CATE
@@ -82,16 +75,14 @@ estimate_causal_effect!(dr_lwarner)
 ```
 
 # Get a Summary
-We can get a summary of the models that includes p0values and standard errors for the 
-average treatment effect by passing the models to the summarize method.
-
-Calling the summarize methodd returns a dictionary with the estimator's task (regression or 
-classification), the quantity of interest being estimated (CATE), whether the model 
-uses an L2 penalty, the activation function used in the model's outcome predictors, whether 
-the data is temporal, the validation metric used for cross validation to find the best 
-number of neurons, the number of neurons used in the ELMs used by the estimator, the number 
-of neurons used in the ELM used to learn a mapping from number of neurons to validation 
-loss during cross validation, the causal effect, standard error, and p-value for the ATE.
+We can get a summary of the model by passing the model to the summarize method.
+
+!!! note
+    To calculate the p-value and standard error for the treatment effect, you can set the 
+    inference argument to true. However, p-values and standard errors are calculated via 
+    randomization inference, which will take a long time, but can be sped up by launching 
+    Julia with a higher number of threads.
+
 ```julia
 summarize(s_learner)
 summarize(t_learner)
@@ -108,12 +99,12 @@ tests do not provide definitive evidence of a violation of these assumptions. To
 counterfactual consistency assumption, we simulate counterfactual outcomes that are 
 different from the observed outcomes, estimate models with the simulated counterfactual 
 outcomes, and take the averages. If the outcome is continuous, the noise for the simulated 
-counterfactuals is drawn from N(0, dev) for each element in devs, otherwise the default is 
-0.25, 0.5, 0.75, and 1.0 standard deviations from the mean outcome. For discrete variables, 
-each outcome is replaced with a different value in the range of outcomes with probability ϵ 
-for each ϵ in devs, otherwise the default is 0.025, 0.05, 0.075, 0.1. If the average 
-estimate for a given level of violation differs greatly from the effect estimated on the 
-actual data, then the model is very sensitive to violations of the counterfactual 
+counterfactuals is drawn from N(0, dev) for each element in devs and each outcome, 
+multiplied by the original outcome, and added to the original outcome. For discrete 
+variables, each outcome is replaced with a different value in the range of outcomes with 
+probability ϵ for each ϵ in devs, otherwise the default is 0.025, 0.05, 0.075, 0.1. If the 
+average estimate for a given level of violation differs greatly from the effect estimated on 
+the actual data, then the model is very sensitive to violations of the counterfactual 
 consistency assumption for that level of violation. Next, this method tests the model's 
 sensitivity to a violation of the exchangeability assumption by calculating the E-value, 
 which is the minimum strength of association, on the risk ratio scale, that an unobserved 
diff --git a/docs/src/index.md b/docs/src/index.md
index 57c2583e..5b777f0a 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -16,33 +16,33 @@ CurrentModule = CausalELM
 CausalELM leverages new techniques in machine learning and statistics to estimate individual 
 and aggregate treatment effects in situations where traditional methods are unsatisfactory 
 or infeasible. To enable this, CausalELM provides a simple API to initialize a model, 
-estimate a causal effect, get a summary from the model, and test the robustness of the 
-model. CausalELM includes estimators for interupted time series analysis, G-Computation, 
-double machine learning, S-Learning, T-Learning, X-Learning, R-learning, and doubly robust 
-estimation. Underlying all these estimators are extreme learning machines. Like tree-based 
-learners, which are often used in causal machine learning, extreme learning machines are 
-simple and can capture non-linear relationships. However, unlike random forests or other 
-ensemble models, they essentially only require two hyperparameters—the number of neurons, 
-and the L2 penalty (when using regularization)—which are automatically tuned when 
-estimate_causal_effect! is called. This makes CausalELM both very simple and very powerful 
-for estimating treatment effects.
+estimate a causal effect, get a summary of the model, and test its robustness. CausalELM 
+includes estimators for interrupted time series analysis, G-Computation, double machine 
+learning, S-Learning, T-Learning, X-Learning, R-learning, and doubly robust estimation. 
+Underlying all these estimators are bagged extreme learning machines. Extreme learning 
+machines are a single layer feedforward neural network that relies on randomized weights and 
+least squares optimization, making them expressive, simple, and computationally 
+efficient. Combining them with bagging reduces the variance caused by the randomization of 
+weights and provides a form of regularization that does not have to be tuned through cross 
+validation. These attributes make CausalELM a very simple and powerful package for 
+estimating treatment effects.
 
 ### Features
 *   Estimate a causal effect, get a summary, and validate assumptions in just four lines of code
-*   All models automatically select the best number of neurons and L2 penalty
+*   Bagging improves performance and reduces variance without the need to tune a regularization parameter
 *   Enables using the same structs for regression and classification
 *   Includes 13 activation functions and allows user-defined activation functions
 *   Most inference and validation tests do not assume functional or distributional forms
-*   Implements the latest techniques form statistics, econometrics, and biostatistics
-*   Works out of the box with DataFrames or arrays
+*   Implements the latest techniques from statistics, econometrics, and biostatistics
+*   Works out of the box with arrays or any data structure that implements the Tables.jl interface
 *   Codebase is high-quality, well tested, and regularly updated
 
 ### What's New?
 *   Now includes doubly robust estimator for CATE estimation
-*   Uses generalized cross validation with successive halving to find the best ridge penalty
-*   Double machine learning, R-learning, and doubly robust estimators suppot specifying confounders and covariates of interest separately
-*   Counterfactual consistency validation simulates outcomes that violate the assumption rather than the previous binning approach
-*   Standardized and improved docstrings and added doctests
+*   All estimators now implement bagging to improve predictive performance and reduce variance
+*   Counterfactual consistency validation simulates more realistic violations of the counterfactual consistency assumption
+*   Uses a simple heuristic to choose the number of neurons, which reduces training time and still works well in practice
+*   Probability clipping for classifier predictions and residuals is no longer necessary due to the bagging procedure
 *   CausalELM talk has been accepted to JuliaCon 2024!
 
 ### What makes CausalELM different?
@@ -50,16 +50,16 @@ Other packages, mainly EconML, DoWhy, CausalAI, and CausalML, have similar funci
 Beides being written in Julia rather than Python, the main differences between CausalELM and 
 these libraries are:
 *   Simplicity is core to casualELM's design philosophy. CausalELM only uses one type of
-    machine learning model, extreme learning machines (with optional L2 regularization) and 
-    does not require you to import any other packages or initialize machine learning models, 
-    pass machine learning structs to CausalELM's estimators, convert dataframes or arrays to 
-    a special type, or one hot encode categorical treatments. By trading a little bit of 
-    flexibility for a simpler API, all of CausalELM's functionality can be used with just 
-    four lines of code.
-*   As part of this design principle, CausalELM's estimators handle all of the work in 
-    finding the best number of neurons during estimation. They create folds or rolling 
-    rolling for time series data and use an extreme learning machine interpolator to find 
-    the best number of neurons.
+    machine learning model, extreme learning machines (with bagging) and does not require 
+    you to import any other packages or initialize machine learning models, pass machine 
+    learning structs to CausalELM's estimators, convert dataframes or arrays to a special 
+    type, or one hot encode categorical treatments. By trading a little bit of flexibility 
+    for a simpler API, all of CausalELM's functionality can be used with just four lines of 
+    code.
+*   As part of this design principle, CausalELM's estimators decide whether to use regression 
+    or classification based on the type of outcome variable. This is in contrast to most 
+    machine learning packages, which have separate classes or structs for regressors and 
+    classifiers of the same model.
 *   CausalELM's validate method, which is specific to each estimator, allows you to validate 
     or test the sentitivity of an estimator to possible violations of identifying assumptions.
 *   Unlike packages that do not allow you to estimate p-values and standard errors, use 
diff --git a/docs/src/release_notes.md b/docs/src/release_notes.md
index bc71e7d1..2e3cb566 100644
--- a/docs/src/release_notes.md
+++ b/docs/src/release_notes.md
@@ -1,7 +1,22 @@
 # Release Notes
 These release notes adhere to the [keep a changelog](https://keepachangelog.com/en/1.0.0/) format. Below is a list of changes since CausalELM was first released.
 
-## Version [v0.6.0](https://github.com/dscolby/CausalELM.jl/releases/tag/v0.6.0) - 2024-03-23
+## Version [v0.7.0](https://github.com/dscolby/CausalELM.jl/releases/tag/v0.7.0) - 2024-06-22
+### Added
+*   Implemented bagged ensemble of extreme learning machines to use with estimators [#67](https://github.com/dscolby/CausalELM.jl/issues/67)
+*   Implemented multithreading for testing the sensitivity of estimators to the counterfactual consistency assumption
+### Changed
+*   Compute the number of neurons to use with log heuristic instead of cross validation [#62](https://github.com/dscolby/CausalELM.jl/issues/62)
+*   Calculate probabilities as the average label predicted by the ensemble instead of clipping [#71](https://github.com/dscolby/CausalELM.jl/issues/71)
+*   Made calculation of p-values and standard errors optional and not executed by default in summarize methods [#65](https://github.com/dscolby/CausalELM.jl/issues/65)
+*   Removed redundant W argument for double machine learning, R-learning, and doubly robust estimation [#68](https://github.com/dscolby/CausalELM.jl/issues/68)
+*   Use swish as the default activation function [#72](https://github.com/dscolby/CausalELM.jl/issues/72)
+*   Implemented noise as a function of each observation instead of the variance of the outcome when testing the sensitivity of the counterfactual consistency assumption [#74](https://github.com/dscolby/CausalELM.jl/issues/74)
+*   p-values and standard errors for randomization inference are generated in parallel
+### Fixed
+*   Applying the weight trick for R-learning [#70](https://github.com/dscolby/CausalELM.jl/issues/70)
+
+## Version [v0.6.0](https://github.com/dscolby/CausalELM.jl/releases/tag/v0.6.0) - 2024-06-15
 ### Added
 *   Implemented doubly robust learner for CATE estimation [#31](https://github.com/dscolby/CausalELM.jl/issues/31)
 *   Provided better explanations of supported treatment and outcome variable types in the docs [#41](https://github.com/dscolby/CausalELM.jl/issues/41)
diff --git a/src/CausalELM.jl b/src/CausalELM.jl
index f53299e0..949e42ae 100644
--- a/src/CausalELM.jl
+++ b/src/CausalELM.jl
@@ -1,10 +1,9 @@
 """
-Macros, functions, and structs for applying Extreme Learning Machines to causal inference
-tasks where the counterfactual is unavailable or biased and must be predicted. Supports 
-causal inference via interrupted time series designs, parametric G-computation, double 
-machine learning, and S-learning, T-learning, X-learning, R-learning, and doubly robust 
-estimation. Additionally, these tasks can be performed with or without L2 penalization and
-will automatically choose the best number of neurons and L2 penalty. 
+Macros, functions, and structs for applying ensembles of extreme learning machines to causal 
+inference tasks where the counterfactual is unavailable or biased and must be predicted. 
+Supports causal inference via interrupted time series designs, parametric G-computation, 
+double machine learning, and S-learning, T-learning, X-learning, R-learning, and doubly 
+robust estimation.
 
 For more details on Extreme Learning Machines see:
     Huang, Guang-Bin, Qin-Yu Zhu, and Chee-Kheong Siew. "Extreme learning machine: theory 
@@ -22,11 +21,10 @@ export estimate_causal_effect!, summarize, summarise
 export InterruptedTimeSeries, GComputation, DoubleMachineLearning
 export SLearner, TLearner, XLearner, RLearner, DoublyRobustLearner
 
-include("utilities.jl")
 include("activation.jl")
+include("utilities.jl")
 include("models.jl")
 include("metrics.jl")
-include("crossval.jl")
 include("estimators.jl")
 include("metalearners.jl")
 include("inference.jl")
diff --git a/src/crossval.jl b/src/crossval.jl
deleted file mode 100644
index b41825d6..00000000
--- a/src/crossval.jl
+++ /dev/null
@@ -1,228 +0,0 @@
-using Random: randperm
-
-"""
-    generate_folds(X, Y, folds)
-
-Create folds for cross validation.
-
-# Examples
-```jldoctest
-julia> xfolds, y_folds = CausalELM.generate_folds(zeros(4, 2), zeros(4), 2)
-([[0.0 0.0], [0.0 0.0; 0.0 0.0; 0.0 0.0]], [[0.0], [0.0, 0.0, 0.0]])
-```
-"""
-function generate_folds(X, Y, folds)
-    msg = """the number of folds must be less than the number of observations"""
-    n = length(Y)
-
-    if folds >= n
-        throw(ArgumentError(msg))
-    end
-
-    fold_setx = Array{Array{Float64,2}}(undef, folds)
-    fold_sety = Array{Array{Float64,1}}(undef, folds)
-
-    # Indices to start and stop for each fold
-    stops = round.(Int, range(; start=1, stop=n, length=folds + 1))
-
-    # Indices to use for making folds
-    indices = [s:(e - (e < n) * 1) for (s, e) in zip(stops[1:(end - 1)], stops[2:end])]
-
-    for (i, idx) in enumerate(indices)
-        fold_setx[i], fold_sety[i] = X[idx, :], Y[idx]
-    end
-
-    return fold_setx, fold_sety
-end
-
-"""
-    generate_temporal_folds(X, Y, folds)
-
-Create rolling folds for cross validation of time series data.
-
-# Examples
-```jldoctest
-julia> xfolds, yfolds = CausalELM.generate_temporal_folds([1 1; 1 1; 0 0; 0 0], zeros(4), 2)
-([[1 1; 1 1], [1 1; 1 1; 0 0; 0 0]], [[0.0, 0.0], [0.0, 0.0, 0.0, 0.0]])
-```
-"""
-function generate_temporal_folds(X, Y, folds=5)
-    msg = """the number of folds must be less than the number of 
-             observations and greater than or equal to iteration"""
-    n = length(Y)
-
-    # Make sure there aren't more folds than observations
-    if folds >= n
-        throw(ArgumentError(msg))
-    end
-
-    # The indices are evely spaced and start at the top to make rolling splits for TS data
-    indices = Int.(floor.(collect(range(1, size(X, 1), folds + 1))))
-    x_folds, y_folds = [X[1:i, :] for i in indices[2:end]], [Y[1:i] for i in indices[2:end]]
-
-    return x_folds, y_folds
-end
-
-"""
-    validation_loss(xtrain, ytrain, xtest, ytest, nodes, metric; kwargs...)
-
-Calculate a validation metric for a single fold in k-fold cross validation.
-
-# Arguments
-- `xtrain::Any`: an array of features to train on.
-- `ytrain::Any`: an array of training labels.
-- `xtest::Any`: an array of features to test on.
-- `ytrain::Any`: an array of testing labels.
-- `nodes::Int`: the number of neurons in the extreme learning machine.
-- `metric::Function`: the validation metric to calculate.
-
-# Keywords
-- `activation::Function=relu`: the activation function to use.
-- `regularized::Function=true`: whether to use L2 regularization.
-
-# Examples
-```julia
-julia> x = rand(100, 5); y = Float64.(rand(100) .> 0.5)
-julia> validation_loss(x, y, 5, accuracy, 3)
-0.5402532843396273
-```
-"""
-function validation_loss(
-    xtrain, ytrain, xtest, ytest, nodes, metric; activation=relu, regularized=true
-)
-    if regularized
-        network = RegularizedExtremeLearner(xtrain, ytrain, nodes, activation)
-    else
-        network = ExtremeLearner(xtrain, ytrain, nodes, activation)
-    end
-
-    fit!(network)
-    predictions = predict(network, xtest)
-
-    return metric(ytest[1, :], predictions[1, :])
-end
-
-"""
-    cross_validate(X, Y, neurons, metric, activation, regularized, folds, temporal)
-
-Calculate a validation metric for k folds using a single set of hyperparameters.
-
-# Arguments
-- `X::Array`: array of features to train on.
-- `Y::Vector`: vector of labels to train on.
-- `neurons::Int`: number of neurons to use in the extreme learning machine.
-- `metric::Function`: validation metric to calculate.
-- `activation::Function=relu`: activation function to use.
-- `regularized::Function=true`: whether to use L2 regularization
-- `folds::Int`: number of folds to use for cross validation.
-- `temporal::Function=true`: whether the data is of a time series or panel nature.
-
-# Examples
-```julia
-julia> x = rand(100, 5); y = Float64.(rand(100) .> 0.5)
-julia> cross_validate(x, y, 5, accuracy)
-0.8891028047100136
-```
-"""
-function cross_validate(X, Y, neurons, metric, activation, regularized, folds, temporal)
-    mean_metric = 0.0
-    xfs, yfs = temporal ? generate_temporal_folds(X, Y, folds) : generate_folds(X, Y, folds)
-
-    @inbounds for fold in 1:folds
-        if !temporal
-            xtr = reduce(vcat, [xfs[f] for f in 1:folds if f != fold])
-            ytr = reduce(vcat, [yfs[f] for f in 1:folds if f != fold])
-            xtst, ytst = xfs[fold], yfs[fold]
-            # The last fold can't be used to training since it will leave nothing to predict
-        elseif temporal && fold < folds
-            xtr, ytr = reduce(vcat, xfs[1:fold]), reduce(vcat, yfs[1:fold])
-            xtst, ytst = reduce(vcat, xfs[(fold + 1):end]),
-            reduce(vcat, yfs[(fold + 1):end])
-        else
-            continue
-        end
-
-        mean_metric += validation_loss(
-            xtr,
-            ytr,
-            xtst,
-            ytst,
-            neurons,
-            metric;
-            activation=activation,
-            regularized=regularized,
-        )
-    end
-    return mean_metric / folds
-end
-
-"""
-    best_size(m)
-
-Compute the best number of neurons for an estimator.
-
-# Notes
-The procedure tests networks with numbers of neurons in a sequence whose length is given 
-by iterations on the interval [min_neurons, max_neurons]. Then, it uses the networks 
-sizes and validation errors from the sequence to predict the validation error or metric 
-for every network size between min_neurons and max_neurons using the function 
-approximation ability of an Extreme Learning Machine. Finally, it returns the network 
-size with the best predicted validation error or metric.
-
-# Arguments
-- `m::Any`: estimator to find the best number of neurons for.
-
-# Examples
-```julia
-julia> X, T, Y = rand(100, 5), rand(0:1, 100), rand(100)
-julia> m1 = GComputation(X, T, y)
-julia> best_size(m1)
-8
-```
-"""
-function best_size(m)
-    loss = Vector{Float64}(undef, m.iterations)
-    num_neurons = round.(Int, range(m.min_neurons, m.max_neurons; length=m.iterations))
-    (X, Y) = m isa InterruptedTimeSeries ? (m.X₀, m.Y₀) : (m.X, m.Y)
-
-    # Use cross validation to get testing loss from [min_neurons, max_neurons] by iterations
-    @inbounds for (idx, potential_neurons) in pairs(num_neurons)
-        loss[idx] = cross_validate(
-            X,
-            Y,
-            round(Int, potential_neurons),
-            m.validation_metric,
-            m.activation,
-            m.regularized,
-            m.folds,
-            m.temporal,
-        )
-    end
-
-    # Use an extreme learning machine to learn a function F:num_neurons -> loss
-    mapper = ExtremeLearner(
-        reshape(num_neurons, :, 1), reshape(loss, :, 1), m.approximator_neurons, relu
-    )
-    fit!(mapper)
-    pred_metrics = predict(mapper, Float64[(m.min_neurons):(m.max_neurons);])
-    return ifelse(startswith(m.task, "c"), argmax([pred_metrics]), argmin([pred_metrics]))
-end
-
-"""
-    shuffle_data(X, Y)
-
-Shuffles covariates and outcome vector for cross validation.
-
-# Examples
-```julia
-julia> shuffle_data([1 1; 2 2; 3 3; 4 4], collect(1:4))
-([4 4; 2 2; 1 1; 3 3], [4, 2, 1, 3])
-```
-"""
-function shuffle_data(X, Y)
-    idx = randperm(size(X, 1))
-    new_data = mapslices.(x -> x[idx], [X, Y], dims=1)
-    X, Y = new_data
-
-    return Array(X), vec(Y)
-end
diff --git a/src/estimators.jl b/src/estimators.jl
index 89cbc5ee..02afe8e0 100644
--- a/src/estimators.jl
+++ b/src/estimators.jl
@@ -7,28 +7,23 @@ abstract type CausalEstimator end
 Initialize an interrupted time series estimator. 
 
 # Arguments
-- `X₀::Any`: an array or DataFrame of covariates from the pre-treatment period.
-- `Y₁::Any`: an array or DataFrame of outcomes from the pre-treatment period.
-- `X₁::Any`: an array or DataFrame of covariates from the post-treatment period.
-- `Y₁::Any`: an array or DataFrame of outcomes from the post-treatment period.
-- `regularized::Function=true`: whether to use L2 regularization
+- `X₀::Any`: array or DataFrame of covariates from the pre-treatment period.
+- `Y₀::Any`: array or DataFrame of outcomes from the pre-treatment period.
+- `X₁::Any`: array or DataFrame of covariates from the post-treatment period.
+- `Y₁::Any`: array or DataFrame of outcomes from the post-treatment period.
 
 # Keywords
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme learner.
-- `folds::Real`: the number of cross validation folds to find the best number of neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-    min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-    network.
+- `activation::Function=swish`: activation function to use.
+- `sample_size::Integer=size(X₀, 1)`: number of bootstrapped samples for the extreme 
+    learner.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X₀, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as in 
-the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For a simple linear regression-based tutorial on interrupted time series analysis see:
@@ -36,10 +31,6 @@ For a simple linear regression-based tutorial on interrupted time series analysi
     regression for the evaluation of public health interventions: a tutorial." International 
     journal of epidemiology 46, no. 1 (2017): 348-355.
 
-For details and a derivation of the generalized cross validation estimator see:
-    Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-    method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 215-223.
-
 # Examples
 ```julia
 julia> X₀, Y₀, X₁, Y₁ =  rand(100, 5), rand(100), rand(10, 5), rand(10)
@@ -65,17 +56,13 @@ function InterruptedTimeSeries(
     Y₀,
     X₁,
     Y₁;
-    regularized::Bool=true,
-    activation::Function=relu,
-    validation_metric::Function=mse,
-    min_neurons::Real=1,
-    max_neurons::Real=100,
-    folds::Real=5,
-    iterations::Real=round(size(X₀, 1) / 10),
-    approximator_neurons::Real=round(size(X₀, 1) / 10),
+    activation::Function=swish,
+    sample_size::Integer=size(X₀, 1),
+    num_machines::Integer=50,
+    num_feats::Integer=Int(round(0.75 * size(X₀, 2))),
+    num_neurons::Integer=round(Int, log10(size(X₀, 1)) * size(X₀, 2)),
     autoregression::Bool=true,
 )
-
     # Convert to arrays
     X₀, X₁, Y₀, Y₁ = Matrix{Float64}(X₀), Matrix{Float64}(X₁), Y₀[:, 1], Y₁[:, 1]
 
@@ -83,23 +70,21 @@ function InterruptedTimeSeries(
     X₀ = ifelse(autoregression == true, reduce(hcat, (X₀, moving_average(Y₀))), X₀)
     X₁ = ifelse(autoregression == true, reduce(hcat, (X₁, moving_average(Y₁))), X₁)
 
+    task = var_type(Y₀) isa Binary ? "classification" : "regression"
+
     return InterruptedTimeSeries(
         X₀,
-        Float64.(Y₀),
-        Float64.(X₁),
-        Float64.(Y₁),
+        float(Y₀),
+        X₁,
+        float(Y₁),
         "difference",
         true,
-        "regression",
-        regularized,
+        task,
         activation,
-        validation_metric,
-        min_neurons,
-        max_neurons,
-        folds,
-        iterations,
-        approximator_neurons,
-        0,
+        sample_size,
+        num_machines,
+        num_feats,
+        num_neurons,
         fill(NaN, size(Y₁, 1)),
     )
 end
@@ -110,30 +95,24 @@ end
 Initialize a G-Computation estimator.
 
 # Arguments
-- `X::Any`: an array or DataFrame of covariates.
-- `T::Any`: an vector or DataFrame of treatment statuses.
-- `Y::Any`: an array or DataFrame of outcomes.
+- `X::Any`: array or DataFrame of covariates.
+- `T::Any`: vector or DataFrame of treatment statuses.
+- `Y::Any`: array or DataFrame of outcomes.
 
 # Keywords
 - `quantity_of_interest::String`: ATE for average treatment effect or ATT for average 
     treatment effect on the treated.
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross 
-    validation.
-- `min_neurons::Real: the minimum number of neurons to consider for the extreme learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme learner.
-- `folds::Real`: the number of cross validation folds to find the best number of neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-    min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-    network.
+- `activation::Function=swish`: activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as in 
-the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For a good overview of G-Computation see:
@@ -143,11 +122,6 @@ For a good overview of G-Computation see:
     estimator for causal inference with different covariates sets: a comparative simulation 
     study." Scientific reports 10, no. 1 (2020): 9219.
 
-
-For details and a derivation of the generalized cross validation estimator see:
-    Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-    method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 215-223.
-
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), rand(100), [rand()<0.4 for i in 1:100]
@@ -163,22 +137,19 @@ julia> m5 = GComputation(x_df, t_df, y_df)
 mutable struct GComputation <: CausalEstimator
     @standard_input_data
     @model_config average_effect
-    learner::ExtremeLearningMachine
+    ensemble::ELMEnsemble
 
     function GComputation(
         X,
         T,
         Y;
         quantity_of_interest::String="ATE",
-        regularized::Bool=true,
-        activation::Function=relu,
+        activation::Function=swish,
+        sample_size::Integer=size(X, 1),
+        num_machines::Integer=50,
+        num_feats::Integer=Int(round(0.75 * size(X, 2))),
+        num_neurons::Integer=round(Int, log10(size(X, 1)) * size(X, 2)),
         temporal::Bool=true,
-        validation_metric::Function=mse,
-        min_neurons::Real=1,
-        max_neurons::Real=100,
-        folds::Real=5,
-        iterations::Real=round(size(X, 1) / 10),
-        approximator_neurons::Real=round(size(X, 1) / 10),
     )
         if quantity_of_interest ∉ ("ATE", "ITT", "ATT")
             throw(ArgumentError("quantity_of_interest must be ATE, ITT, or ATT"))
@@ -190,21 +161,17 @@ mutable struct GComputation <: CausalEstimator
         task = var_type(Y) isa Binary ? "classification" : "regression"
 
         return new(
-            Float64.(X),
-            Float64.(T),
-            Float64.(Y),
+            X,
+            float(T),
+            float(Y),
             quantity_of_interest,
             temporal,
             task,
-            regularized,
             activation,
-            validation_metric,
-            min_neurons,
-            max_neurons,
-            folds,
-            iterations,
-            approximator_neurons,
-            0,
+            sample_size,
+            num_machines,
+            num_feats,
+            num_neurons,
             NaN,
         )
     end
@@ -216,32 +183,23 @@ end
 Initialize a double machine learning estimator with cross fitting.
 
 # Arguments
-- `X::Any`: an array or DataFrame of covariates of interest.
-- `T::Any`: an vector or DataFrame of treatment statuses.
-- `Y::Any`: an array or DataFrame of outcomes.
+- `X::Any`: array or DataFrame of covariates of interest.
+- `T::Any`: vector or DataFrame of treatment statuses.
+- `Y::Any`: array or DataFrame of outcomes.
 
 # Keywords
-- `W::Any`: an array or dataframe of all possible confounders.
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme learner.
-- `folds::Real`: the number of cross validation folds to find the best number of neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-    min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-    network.
+- `activation::Function=swish`: activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
+- `folds::Integer`: number of folds to use for cross fitting.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as in 
-the previous penalty, then the procedure will stop early.
-
-Unlike other estimators, this method does not support time series or panel data. This method 
-also does not work as well with smaller datasets because it estimates separate outcome 
-models for the treatment and control groups.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For more information see:
@@ -249,65 +207,56 @@ For more information see:
     Whitney Newey, and James Robins. "Double/debiased machine learning for treatment and 
     structural parameters." (2016): C1-C68.
 
-
-For details and a derivation of the generalized cross validation estimator see:
-    Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-    method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 215-223.
-
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
 julia> m1 = DoubleMachineLearning(X, T, Y)
-julia> m2 = DoubleMachineLearning(X, T, Y; task="regression")
 
 julia> x_df = DataFrame(x1=rand(100), x2=rand(100), x3=rand(100), x4=rand(100))
 julia> t_df, y_df = DataFrame(t=rand(0:1, 100)), DataFrame(y=rand(100))
-julia> m3 = DoubleMachineLearning(x_df, t_df, y_df)
+julia> m2 = DoubleMachineLearning(x_df, t_df, y_df)
 ```
 """
 mutable struct DoubleMachineLearning <: CausalEstimator
-    @double_learner_input_data
+    @standard_input_data
     @model_config average_effect
+    folds::Integer
 end
 
 function DoubleMachineLearning(
     X,
     T,
     Y;
-    W=X,
-    regularized::Bool=true,
-    activation::Function=relu,
-    validation_metric::Function=mse,
-    min_neurons::Real=1,
-    max_neurons::Real=100,
-    folds::Real=5,
-    iterations::Real=round(size(X, 1) / 10),
-    approximator_neurons::Real=round(size(X, 1) / 10),
+    activation::Function=swish,
+    sample_size::Integer=size(X, 1),
+    num_machines::Integer=50,
+    num_feats::Integer=Int(round(0.75 * size(X, 2))),
+    num_neurons::Integer=round(Int, log10(size(X, 1)) * num_feats),
+    folds::Integer=5,
 )
-
     # Convert to arrays
-    X, T, Y, W = Matrix{Float64}(X), T[:, 1], Y[:, 1], Matrix{Float64}(W)
+    X, T, Y = Matrix{Float64}(X), T[:, 1], Y[:, 1]
+
+    # Shuffle data with random indices
+    indices = shuffle(1:length(Y))
+    X, T, Y = X[indices, :], T[indices], Y[indices]
 
     task = var_type(Y) isa Binary ? "classification" : "regression"
 
     return DoubleMachineLearning(
         X,
-        Float64.(T),
-        Float64.(Y),
-        Float64.(W),
+        float(T),
+        float(Y),
         "ATE",
         false,
         task,
-        regularized,
         activation,
-        validation_metric,
-        min_neurons,
-        max_neurons,
-        folds,
-        iterations,
-        approximator_neurons,
-        0,
+        sample_size, 
+        num_machines, 
+        num_feats,
+        num_neurons,
         NaN,
+        folds,
     )
 end
 
@@ -324,28 +273,22 @@ julia> estimate_causal_effect!(m1)
 ```
 """
 function estimate_causal_effect!(its::InterruptedTimeSeries)
-    # We will not find the best number of neurons after we have already estimated the causal
-    # effect and are getting p-values, confidence intervals, or standard errors. We will use
-    # the same number that was found when calling this method.
-    its.num_neurons = its.num_neurons === 0 ? best_size(its) : its.num_neurons
-
-    if its.regularized
-        learner = RegularizedExtremeLearner(its.X₀, its.Y₀, its.num_neurons, its.activation)
-    else
-        learner = ExtremeLearner(its.X₀, its.Y₀, its.num_neurons, its.activation)
-    end
+    learner = ELMEnsemble(
+        its.X₀, 
+        its.Y₀, 
+        its.sample_size, 
+        its.num_machines,
+        its.num_feats, 
+        its.num_neurons, 
+        its.activation
+    )
 
     fit!(learner)
-    its.causal_effect = predict_counterfactual!(learner, its.X₁) - its.Y₁
+    its.causal_effect = predict(learner, its.X₁) - its.Y₁
 
     return its.causal_effect
 end
 
-function estimate_causal_effect!(g::GComputation)
-    g.causal_effect = mean(g_formula!(g))
-    return g.causal_effect
-end
-
 """
     estimate_causal_effect!(g)
 
@@ -358,14 +301,34 @@ no periods. For example, given that ividuals 1, 2, ..., i ∈ I recieved either
 or a placebo in p different periods, the model would estimate the average treatment effect 
 as E[Yᵢ|T₁=1, T₂=1, ... Tₚ=1, Xₚ] - E[Yᵢ|T₁=0, T₂=0, ... Tₚ=0, Xₚ].
 
+# Examples
+```julia
+julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
+julia> m1 = GComputation(X, T, Y)
+julia> estimate_causal_effect!(m1)
+```
+"""
+function estimate_causal_effect!(g::GComputation)
+    g.causal_effect = mean(g_formula!(g))
+    return g.causal_effect
+end
+
+"""
+    g_formula!(g)
+
+Compute the G-formula for G-computation and S-learning.
+
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
 julia> m1 = GComputation(X, T, Y)
 julia> g_formula!(m1)
+
+julia> m2 = SLearner(X, T, Y)
+julia> g_formula!(m2)
 ```
 """
-function g_formula!(g)
+function g_formula!(g)  # Keeping this separate enables it to be reused for S-Learning
     covariates, y = hcat(g.X, g.T), g.Y
 
     if g.quantity_of_interest ∈ ("ITT", "ATE", "CATE")
@@ -376,17 +339,20 @@ function g_formula!(g)
         Xᵤ = hcat(covariates[g.T .== 1, 1:(end - 1)], zeros(size(g.T[g.T .== 1], 1)))
     end
 
-    g.num_neurons = g.num_neurons === 0 ? best_size(g) : g.num_neurons
+    g.ensemble = ELMEnsemble(
+        covariates, 
+        y, 
+        g.sample_size, 
+        g.num_machines, 
+        g.num_feats,
+        g.num_neurons, 
+        g.activation
+    )
 
-    if g.regularized
-        g.learner = RegularizedExtremeLearner(covariates, y, g.num_neurons, g.activation)
-    else
-        g.learner = ExtremeLearner(covariates, y, g.num_neurons, g.activation)
-    end
+    fit!(g.ensemble)
+    
+    yₜ, yᵤ = predict(g.ensemble, Xₜ), predict(g.ensemble, Xᵤ)
 
-    fit!(g.learner)
-    yₜ = clip_if_binary(predict(g.learner, Xₜ), var_type(g.Y))
-    yᵤ = clip_if_binary(predict(g.learner, Xᵤ), var_type(g.Y))
     return vec(yₜ) - vec(yᵤ)
 end
 
@@ -407,35 +373,7 @@ julia> estimate_causal_effect!(m2)
 ```
 """
 function estimate_causal_effect!(DML::DoubleMachineLearning)
-    # Uses the same number of neurons for all phases of estimation
-    DML.num_neurons = DML.num_neurons === 0 ? best_size(DML) : DML.num_neurons
-
-    causal_loss!(DML)
-    DML.causal_effect /= DML.folds
-
-    return DML.causal_effect
-end
-
-"""
-    causal_loss!(DML, [,cate])
-
-Minimize the causal loss function for double machine learning.
-
-# Notes
-This method should not be called directly.
-
-# Arguments
-- `DML::DoubleMachineLearning`: the DoubleMachineLearning struct to estimate the effect for.
-
-# Examples
-```julia
-julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
-julia> m1 = DoubleMachineLearning(X, T, Y)
-julia> causal_loss!(m1)
-```
-"""
-function causal_loss!(DML::DoubleMachineLearning)
-    X, T, W, Y = make_folds(DML)
+    X, T, Y = generate_folds(DML.X, DML.T, DML.Y, DML.folds)
     DML.causal_effect = 0
 
     # Cross fitting by training on the main folds and predicting residuals on the auxillary
@@ -443,13 +381,14 @@ function causal_loss!(DML::DoubleMachineLearning)
         X_train, X_test = reduce(vcat, X[1:end .!== fld]), X[fld]
         Y_train, Y_test = reduce(vcat, Y[1:end .!== fld]), Y[fld]
         T_train, T_test = reduce(vcat, T[1:end .!== fld]), T[fld]
-        W_train, W_test = reduce(vcat, W[1:end .!== fld]), W[fld]
 
-        Ỹ, T̃ = predict_residuals(
-            DML, X_train, X_test, Y_train, Y_test, T_train, T_test, W_train, W_test
-        )
-        DML.causal_effect += (vec(sum(T̃ .* X_test; dims=2)) \ Ỹ)[1]
+        Ỹ, T̃ = predict_residuals(DML, X_train, X_test, Y_train, Y_test, T_train, T_test)
+
+        DML.causal_effect += T̃\Ỹ
     end
+    DML.causal_effect /= DML.folds
+
+    return DML.causal_effect
 end
 
 """
@@ -471,50 +410,28 @@ julia> predict_residuals(m1, x_train, x_test, y_train, y_test, t_train, t_test)
 ```
 """
 function predict_residuals(
-    D, x_train, x_test, y_train, y_test, t_train, t_test, w_train, w_test
+    D, 
+    xₜᵣ::Array{Float64}, 
+    xₜₑ::Array{Float64}, 
+    yₜᵣ::Vector{Float64}, 
+    yₜₑ::Vector{Float64}, 
+    tₜᵣ::Vector{Float64}, 
+    tₜₑ::Vector{Float64}, 
 )
-    V = x_train != w_train && x_test != w_test ? reduce(hcat, (x_train, w_train)) : x_train
-    V_test = V == x_train ? x_test : reduce(hcat, (x_test, w_test))
+    y = ELMEnsemble(
+        xₜᵣ, yₜᵣ, D.sample_size, D.num_machines, D.num_feats, D.num_neurons, D.activation
+    )
 
-    if D.regularized
-        y = RegularizedExtremeLearner(V, y_train, D.num_neurons, D.activation)
-        t = RegularizedExtremeLearner(V, t_train, D.num_neurons, D.activation)
-    else
-        y = ExtremeLearner(V, y_train, D.num_neurons, D.activation)
-        t = ExtremeLearner(V, t_train, D.num_neurons, D.activation)
-    end
+    t = ELMEnsemble(
+        xₜᵣ, tₜᵣ, D.sample_size, D.num_machines, D.num_feats, D.num_neurons, D.activation
+    )
 
     fit!(y)
     fit!(t)
-    y_pred = clip_if_binary(predict(y, V_test), var_type(D.Y))
-    t_pred = clip_if_binary(predict(t, V_test), var_type(D.T))
-    ỹ, t̃ = y_test - y_pred, t_test - t_pred
-
-    return ỹ, t̃
-end
-
-"""
-    make_folds(D)
 
-Make folds for cross fitting for a double machine learning estimator.
-
-# Notes
-This method should not be called directly.
-
-# Examples
-```julia
-julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
-julia> m1 = DoubleMachineLearning(X, T, Y)
-julia> make_folds(m1)
-```
-"""
-function make_folds(D)
-    X_T_W, Y = generate_folds(reduce(hcat, (D.X, D.T, D.W)), D.Y, D.folds)
-    X = [fl[:, 1:size(D.X, 2)] for fl in X_T_W]
-    T = [fl[:, size(D.X, 2) + 1] for fl in X_T_W]
-    W = [fl[:, (size(D.X, 2) + 2):end] for fl in X_T_W]
+    yₚᵣ, tₚᵣ = predict(y, xₜₑ), predict(t, xₜₑ)
 
-    return X, T, W, Y
+    return yₜₑ - yₚᵣ, tₜₑ - tₚᵣ
 end
 
 """
diff --git a/src/inference.jl b/src/inference.jl
index 1fcbad0f..23226e04 100644
--- a/src/inference.jl
+++ b/src/inference.jl
@@ -1,17 +1,21 @@
 using Random: shuffle
 
 """
-    summarize(mod, n)
+    summarize(mod, kwargs...)
 
 Get a summary from a CausalEstimator or Metalearner.
 
 # Arguments
 - `mod::Union{CausalEstimator, Metalearner}`: a model to summarize.
+
+# Keywords
 - `n::Int=100`: the number of iterations to generate the numll distribution for 
     randomization inference.
+- `inference::Bool=false`: whether to calculate p-values and standard errors.
 
 # Notes
-p-values and standard errors are estimated using approximate randomization inference.
+p-values and standard errors are estimated using approximate randomization inference when 
+`inference` is true, which takes a VERY long time due to repeated matrix inversions.
 
 # References
 For a primer on randomization inference see: 
@@ -33,39 +37,41 @@ julia> estimate_causal_effect!(m3)
 julia> summarise(m3)  # British spelling works too!
 ```
 """
-function summarize(mod, n=1000)
+function summarize(mod; n=1000, inference=false)
     if all(isnan, mod.causal_effect)
         throw(ErrorException("call estimate_causal_effect! before calling summarize"))
     end
 
     summary_dict = Dict()
-    double_estimators = (DoubleMachineLearning, DoublyRobustLearner)
-    task = typeof(mod) in double_estimators ? "regression" : mod.task
     nicenames = [
         "Task",
         "Quantity of Interest",
-        "Regularized",
         "Activation Function",
-        "Time Series/Panel Data",
-        "Validation Metric",
+        "Sample Size",
+        "Number of Machines",
+        "Number of Features",
         "Number of Neurons",
-        "Number of Neurons in Approximator",
+        "Time Series/Panel Data",
         "Causal Effect",
         "Standard Error",
         "p-value",
     ]
 
-    p, stderr = quantities_of_interest(mod, n)
+    if inference
+        p, stderr = quantities_of_interest(mod, n)
+    else
+        p, stderr = NaN, NaN
+    end
 
     values = [
-        task,
+        mod.task,
         mod.quantity_of_interest,
-        mod.regularized,
         mod.activation,
-        mod.temporal,
-        mod.validation_metric,
+        mod.sample_size,
+        mod.num_machines,
+        mod.num_feats,
         mod.num_neurons,
-        mod.approximator_neurons,
+        mod.temporal,
         mod.causal_effect,
         stderr,
         p,
@@ -79,16 +85,23 @@ function summarize(mod, n=1000)
 end
 
 """
-    summarize(its, n, mean_effect)
+    summarize(its, kwargs...)
 
 Get a summary from an interrupted time series estimator.
 
 # Arguments
 - `its::InterruptedTimeSeries`: interrupted time series estimator
+
+# Keywords
 - `n::Int=100`: number of iterations to generate the numll distribution for randomization 
     inference.
 - `mean_effect::Bool=true`: whether to estimate the mean or cumulative effect for an 
     interrupted time series estimator.
+- `inference::Bool=false`: whether to calculate p-values and standard errors.
+
+# Notes
+p-values and standard errors are estimated using approximate randomization inference when 
+`inference` is true, which takes a VERY long time due to repeated matrix inversions.
 
 # Examples
 ```julia
@@ -98,35 +111,44 @@ julia> estimate_causal_effect!(m4)
 julia> summarize(m4)
 ```
 """
-function summarize(its::InterruptedTimeSeries, n=1000, mean_effect=true)
+function summarize(its::InterruptedTimeSeries; n=1000, mean_effect=true, inference=false)
     if all(isnan, its.causal_effect)
         throw(ErrorException("call estimate_causal_effect! before calling summarize"))
     end
 
     effect = ifelse(mean_effect, mean(its.causal_effect), sum(its.causal_effect))
+    qoi = mean_effect ? "Average Difference" : "Cumulative Difference"
 
-    p, stderr = quantities_of_interest(its, n, mean_effect)
+    if inference
+        p, stderr = quantities_of_interest(its, n, mean_effect)
+    else
+        p, stderr = NaN, NaN
+    end
 
     summary_dict = Dict()
     nicenames = [
         "Task",
-        "Regularized",
+        "Quantity of Interest",
         "Activation Function",
-        "Validation Metric",
+        "Sample Size",
+        "Number of Machines",
+        "Number of Features",
         "Number of Neurons",
-        "Number of Neurons in Approximator",
+        "Time Series/Panel Data",
         "Causal Effect",
         "Standard Error",
         "p-value",
     ]
 
     values = [
-        "Regression",
-        its.regularized,
+        its.task,
+        qoi,
         its.activation,
-        its.validation_metric,
+        its.sample_size,
+        its.num_machines,
+        its.num_feats,
         its.num_neurons,
-        its.approximator_neurons,
+        its.temporal,
         effect,
         stderr,
         p,
@@ -167,22 +189,26 @@ julia> generate_null_distribution(g_computer, 500)
 ```
 """
 function generate_null_distribution(mod, n)
-    local m = deepcopy(mod)
-    nobs = size(m.T, 1)
+    nobs, mods = size(mod.T, 1), [deepcopy(mod) for i ∈ 1:n]
     results = Vector{Float64}(undef, n)
 
     # Generate random treatment assignments and estimate the causal effects
-    for iter in 1:n
+    Threads.@threads for i ∈ 1:n
 
         # Sample from a continuous distribution if the treatment is continuous
         if var_type(mod.T) isa Continuous
-            m.T = (maximum(m.T) - minimum(m.T)) .* rand(nobs) .+ minimum(m.T)
+            mods[i].T = (maximum(mod.T) - minimum(mod.T)) .* rand(nobs) .+ minimum(mod.T)
         else
-            m.T = float(rand(unique(m.T), nobs))
+            mods[i].T = float(rand(unique(mod.T), nobs))
         end
 
-        estimate_causal_effect!(m)
-        results[iter] = mod isa Metalearner ? mean(m.causal_effect) : m.causal_effect
+        estimate_causal_effect!(mods[i])
+
+        results[i] = if mod isa Metalearner
+            mean(mods[i].causal_effect)
+        else 
+            mods[i].causal_effect
+        end
     end
     return results
 end
@@ -206,28 +232,28 @@ julia> generate_null_distribution(its, 10)
 ```
 """
 function generate_null_distribution(its::InterruptedTimeSeries, n, mean_effect)
-    model = deepcopy(its)
-    split_idx = size(model.Y₀, 1)
+    mods = [deepcopy(its) for i ∈ 1:n]
+    split_idx = size(its.Y₀, 1)
     results = Vector{Float64}(undef, n)
     data = reduce(hcat, (reduce(vcat, (its.X₀, its.X₁)), reduce(vcat, (its.Y₀, its.Y₁))))
 
     # Generate random treatment assignments and estimate the causal effects
-    for iter in 1:n
-        permuted_data = data[shuffle(1:end), :]
-        permuted_x₀ = permuted_data[1:split_idx, 1:(end - 1)]
-        permuted_x₁ = permuted_data[(split_idx + 1):end, 1:(end - 1)]
-        permuted_y₀ = permuted_data[1:split_idx, end]
-        permuted_y₁ = permuted_data[(split_idx + 1):end, end]
+    Threads.@threads for iter in 1:n
+        local permuted_data = data[shuffle(1:end), :]
+        local permuted_x₀ = permuted_data[1:split_idx, 1:(end - 1)]
+        local permuted_x₁ = permuted_data[(split_idx + 1):end, 1:(end - 1)]
+        local permuted_y₀ = permuted_data[1:split_idx, end]
+        local permuted_y₁ = permuted_data[(split_idx + 1):end, end]
 
         # Reestimate the model with the intervention now at the nth interval
-        model.X₀, model.Y₀ = permuted_x₀, permuted_y₀
-        model.X₁, model.Y₁ = permuted_x₁, permuted_y₁
-        estimate_causal_effect!(model)
+        mods[iter].X₀, mods[iter].Y₀ = permuted_x₀, permuted_y₀
+        mods[iter].X₁, mods[iter].Y₁ = permuted_x₁, permuted_y₁
+        estimate_causal_effect!(mods[iter])
 
         results[iter] = if mean_effect
-            mean(model.causal_effect)
+            mean(mods[iter].causal_effect)
         else
-            sum(model.causal_effect)
+            sum(mods[iter].causal_effect)
         end
     end
     return results
diff --git a/src/metalearners.jl b/src/metalearners.jl
index 6b358d6c..68ccfec6 100644
--- a/src/metalearners.jl
+++ b/src/metalearners.jl
@@ -12,33 +12,23 @@ Initialize a S-Learner.
 - `Y::Any`: an array or DataFrame of outcomes.
 
 # Keywords
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme learner.
-- `folds::Real`: the number of cross validation folds to find the best number of neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-network.
+- `activation::Function=swish`: the activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as 
-in the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For an overview of S-Learners and other metalearners see:
-Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu. "Metalearners for 
-estimating heterogeneous treatment effects using machine learning." Proceedings of 
-the national academy of sciences 116, no. 10 (2019): 4156-4165.
-
-For details and a derivation of the generalized cross validation estimator see:
-Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 
-215-223.
+    Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu. "Metalearners for 
+    estimating heterogeneous treatment effects using machine learning." Proceedings of 
+    the national academy of sciences 116, no. 10 (2019): 4156-4165.
 
 # Examples
 ```julia
@@ -55,20 +45,17 @@ julia> m4 = SLearner(x_df, t_df, y_df)
 mutable struct SLearner <: Metalearner
     @standard_input_data
     @model_config individual_effect
-    learner::ExtremeLearningMachine
+    ensemble::ELMEnsemble
 
     function SLearner(
         X,
         T,
         Y;
-        regularized::Bool=true,
-        activation::Function=relu,
-        validation_metric::Function=mse,
-        min_neurons::Real=1,
-        max_neurons::Real=100,
-        folds::Real=5,
-        iterations::Real=round(size(X, 1) / 10),
-        approximator_neurons::Real=round(size(X, 1) / 10),
+        activation::Function=swish,
+        sample_size::Integer=size(X, 1),
+        num_machines::Integer=50,
+        num_feats::Integer=Int(round(0.75 * size(X, 2))),
+        num_neurons::Integer=round(Int, log10(size(X, 1)) * size(X, 2)),
     )
 
         # Convert to arrays
@@ -83,15 +70,11 @@ mutable struct SLearner <: Metalearner
             "CATE",
             false,
             task,
-            regularized,
             activation,
-            validation_metric,
-            min_neurons,
-            max_neurons,
-            folds,
-            iterations,
-            approximator_neurons,
-            0,
+            sample_size,
+            num_machines,
+            num_feats,
+            num_neurons,
             fill(NaN, size(T, 1)),
         )
     end
@@ -108,70 +91,51 @@ Initialize a T-Learner.
 - `Y::Any`: an array or DataFrame of outcomes.
 
 # Keywords
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross 
-validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme 
-learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme 
-learner.
-- `folds::Real`: the number of cross validation folds to find the best number of 
-neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-network.
+- `activation::Function=swish`: the activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as 
-in the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For an overview of T-Learners and other metalearners see:
-Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu. "Metalearners for 
-estimating heterogeneous treatment effects using machine learning." Proceedings of 
-the national academy of sciences 116, no. 10 (2019): 4156-4165.
-
-For details and a derivation of the generalized cross validation estimator see:
-Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 
-215-223.
+    Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu. "Metalearners for 
+    estimating heterogeneous treatment effects using machine learning." Proceedings of 
+    the national academy of sciences 116, no. 10 (2019): 4156-4165.
 
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
 julia> m1 = TLearner(X, T, Y)
-julia> m2 = TLearner(X, T, Y; task="regression")
-julia> m3 = TLearner(X, T, Y; task="regression", regularized=true)
+julia> m2 = TLearner(X, T, Y; num_machines=25)
 
 julia> x_df = DataFrame(x1=rand(100), x2=rand(100), x3=rand(100), x4=rand(100))
 julia> t_df, y_df = DataFrame(t=rand(0:1, 100)), DataFrame(y=rand(100))
-julia> m4 = TLearner(x_df, t_df, y_df)
+julia> m3 = TLearner(x_df, t_df, y_df)
 ```
 """
 mutable struct TLearner <: Metalearner
     @standard_input_data
     @model_config individual_effect
-    μ₀::ExtremeLearningMachine
-    μ₁::ExtremeLearningMachine
+    μ₀::ELMEnsemble
+    μ₁::ELMEnsemble
 
     function TLearner(
         X,
         T,
         Y;
-        regularized::Bool=true,
-        activation::Function=relu,
-        validation_metric::Function=mse,
-        min_neurons::Real=1,
-        max_neurons::Real=100,
-        folds::Real=5,
-        iterations::Real=round(size(X, 1) / 10),
-        approximator_neurons::Real=round(size(X, 1) / 10),
+        activation::Function=swish,
+        sample_size::Integer=size(X, 1),
+        num_machines::Integer=50,
+        num_feats::Integer=Int(round(0.75 * size(X, 2))),
+        num_neurons::Integer=round(Int, log10(size(X, 1)) * size(X, 2)),
     )
-
         # Convert to arrays
         X, T, Y = Matrix{Float64}(X), T[:, 1], Y[:, 1]
 
@@ -184,15 +148,11 @@ mutable struct TLearner <: Metalearner
             "CATE",
             false,
             task,
-            regularized,
             activation,
-            validation_metric,
-            min_neurons,
-            max_neurons,
-            folds,
-            iterations,
-            approximator_neurons,
-            0,
+            sample_size,
+            num_machines,
+            num_feats,
+            num_neurons,
             fill(NaN, size(T, 1)),
         )
     end
@@ -209,71 +169,52 @@ Initialize an X-Learner.
 - `Y::Any`: an array or DataFrame of outcomes.
 
 # Keywords
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross 
-validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme 
-learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme 
-learner.
-- `folds::Real`: the number of cross validation folds to find the best number of 
-neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss 
-approximator network.
+- `activation::Function=swish`: the activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as 
-in the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For an overview of X-Learners and other metalearners see:
-Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu. "Metalearners for 
-estimating heterogeneous treatment effects using machine learning." Proceedings of 
-the national academy of sciences 116, no. 10 (2019): 4156-4165.
-
-For details and a derivation of the generalized cross validation estimator see:
-Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 
-215-223.
+    Künzel, Sören R., Jasjeet S. Sekhon, Peter J. Bickel, and Bin Yu. "Metalearners for 
+    estimating heterogeneous treatment effects using machine learning." Proceedings of the 
+    national academy of sciences 116, no. 10 (2019): 4156-4165.
 
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
 julia> m1 = XLearner(X, T, Y)
-julia> m2 = XLearner(X, T, Y; task="regression")
-julia> m3 = XLearner(X, T, Y; task="regression", regularized=true)
+julia> m2 = XLearner(X, T, Y; num_machines=25)
 
 julia> x_df = DataFrame(x1=rand(100), x2=rand(100), x3=rand(100), x4=rand(100))
 julia> t_df, y_df = DataFrame(t=rand(0:1, 100)), DataFrame(y=rand(100))
-julia> m4 = XLearner(x_df, t_df, y_df)
+julia> m3 = XLearner(x_df, t_df, y_df)
 ```
 """
 mutable struct XLearner <: Metalearner
     @standard_input_data
     @model_config individual_effect
-    μ₀::ExtremeLearningMachine
-    μ₁::ExtremeLearningMachine
+    μ₀::ELMEnsemble
+    μ₁::ELMEnsemble
     ps::Array{Float64}
 
     function XLearner(
         X,
         T,
         Y;
-        regularized::Bool=true,
-        activation::Function=relu,
-        validation_metric::Function=mse,
-        min_neurons::Real=1,
-        max_neurons::Real=100,
-        folds::Real=5,
-        iterations::Real=round(size(X, 1) / 10),
-        approximator_neurons::Real=round(size(X, 1) / 10),
+        activation::Function=swish,
+        sample_size::Integer=size(X, 1),
+        num_machines::Integer=50,
+        num_feats::Integer=Int(round(0.75 * size(X, 2))),
+        num_neurons::Integer=round(Int, log10(size(X, 1)) * size(X, 2)),
     )
-
         # Convert to arrays
         X, T, Y = Matrix{Float64}(X), T[:, 1], Y[:, 1]
 
@@ -286,15 +227,11 @@ mutable struct XLearner <: Metalearner
             "CATE",
             false,
             task,
-            regularized,
             activation,
-            validation_metric,
-            min_neurons,
-            max_neurons,
-            folds,
-            iterations,
-            approximator_neurons,
-            0,
+            sample_size,
+            num_machines,
+            num_feats,
+            num_neurons,
             fill(NaN, size(T, 1)),
         )
     end
@@ -311,68 +248,57 @@ Initialize an R-Learner.
 - `Y::Any`: an array or DataFrame of outcomes.
 
 # Keywords
-- `W::Any` : an array of all possible confounders.
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme learner.
-- `folds::Real`: the number of cross validation folds to find the best number of neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-    min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-    network.
+- `activation::Function=swish`: the activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as in 
-the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 ## References
 For an explanation of R-Learner estimation see:
     Nie, Xinkun, and Stefan Wager. "Quasi-oracle estimation of heterogeneous treatment 
     effects." Biometrika 108, no. 2 (2021): 299-319.
-    
-For details and a derivation of the generalized cross validation estimator see:
-    Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-    method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 215-223.
 
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
 julia> m1 = RLearner(X, T, Y)
-julia> m2 = RLearner(X, T, Y; t_cat=true)
 
 julia> x_df = DataFrame(x1=rand(100), x2=rand(100), x3=rand(100), x4=rand(100))
 julia> t_df, y_df = DataFrame(t=rand(0:1, 100)), DataFrame(y=rand(100))
-julia> m4 = RLearner(x_df, t_df, y_df)
-
-julia> w = rand(100, 6)
-julia> m5 = RLearner(X, T, Y, W=w)
+julia> m2 = RLearner(x_df, t_df, y_df)
 ```
 """
 mutable struct RLearner <: Metalearner
-    @double_learner_input_data
+    @standard_input_data
     @model_config individual_effect
+    folds::Integer
 end
 
 function RLearner(
     X,
     T,
     Y;
-    W=X,
-    activation::Function=relu,
-    validation_metric::Function=mse,
-    min_neurons::Real=1,
-    max_neurons::Real=100,
-    folds::Real=5,
-    iterations::Real=round(size(X, 1) / 10),
-    approximator_neurons::Real=round(size(X, 1) / 10),
+    activation::Function=swish,
+    sample_size::Integer=size(X, 1),
+    num_machines::Integer=50,
+    num_feats::Integer=Int(round(0.75 * size(X, 2))),
+    num_neurons::Integer=round(Int, log10(size(X, 1)) * size(X, 2)),
+    folds::Integer=5,
 )
 
     # Convert to arrays
-    X, T, Y, W = Matrix{Float64}(X), T[:, 1], Y[:, 1], Matrix{Float64}(W)
+    X, T, Y = Matrix{Float64}(X), T[:, 1], Y[:, 1]
+
+    # Shuffle data with random indices
+    indices = shuffle(1:length(Y))
+    X, T, Y = X[indices, :], T[indices], Y[indices]
 
     task = var_type(Y) isa Binary ? "classification" : "regression"
 
@@ -380,20 +306,16 @@ function RLearner(
         X,
         Float64.(T),
         Float64.(Y),
-        W,
         "CATE",
         false,
         task,
-        true,
         activation,
-        validation_metric,
-        min_neurons,
-        max_neurons,
-        folds,
-        iterations,
-        approximator_neurons,
-        0,
+        sample_size,
+        num_machines,
+        num_feats,
+        num_neurons,
         fill(NaN, size(T, 1)),
+        folds,
     )
 end
 
@@ -408,68 +330,58 @@ Initialize a doubly robust CATE estimator.
 - `Y::Any`: an array or DataFrame of outcomes.
 
 # Keywords
-- `W::Any`: an array or dataframe of all possible confounders.
-- `regularized::Function=true`: whether to use L2 regularization
-- `activation::Function=relu`: the activation function to use.
-- `validation_metric::Function`: the validation metric to calculate during cross validation.
-- `min_neurons::Real`: the minimum number of neurons to consider for the extreme learner.
-- `max_neurons::Real`: the maximum number of neurons to consider for the extreme learner.
-- `folds::Real`: the number of cross validation folds to find the best number of neurons.
-- `iterations::Real`: the number of iterations to perform cross validation between 
-    min_neurons and max_neurons.
-- `approximator_neurons::Real`: the number of nuerons in the validation loss approximator 
-    network.
+- `activation::Function=swish`: the activation function to use.
+- `sample_size::Integer=size(X, 1)`: number of bootstrapped samples for the extreme 
+    learners.
+- `num_machines::Integer=50`: number of extreme learning machines for the ensemble.
+- `num_feats::Integer=Int(round(0.75 * size(X, 2)))`: number of features to bootstrap for 
+    each learner in the ensemble.
+- `num_neurons::Integer`: number of neurons to use in the extreme learning machines.
 
 # Notes
-If regularized is set to true then the ridge penalty will be estimated using generalized 
-cross validation where the maximum number of iterations is 2 * folds for the successive 
-halving procedure. However, if the penalty in on iteration is approximately the same as in 
-the previous penalty, then the procedure will stop early.
+To reduce the computational complexity you can reduce sample_size, num_machines, or 
+num_neurons.
 
 # References
 For an explanation of doubly robust cate estimation see:
     Kennedy, Edward H. "Towards optimal doubly robust estimation of heterogeneous causal 
     effects." Electronic Journal of Statistics 17, no. 2 (2023): 3008-3049.
 
-For details and a derivation of the generalized cross validation estimator see:
-    Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-    method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 215-223.
-
 # Examples
 ```julia
 julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
 julia> m1 = DoublyRobustLearner(X, T, Y)
-julia> m2 = DoublyRobustLearnerLearner(X, T, Y; t_cat=true)
 
 julia> x_df = DataFrame(x1=rand(100), x2=rand(100), x3=rand(100), x4=rand(100))
 julia> t_df, y_df = DataFrame(t=rand(0:1, 100)), DataFrame(y=rand(100))
-julia> m4 = DoublyRobustLearner(x_df, t_df, y_df)
+julia> m2 = DoublyRobustLearner(x_df, t_df, y_df)
 
 julia> w = rand(100, 6)
-julia> m5 = DoublyRobustLearner(X, T, Y, W=w)
+julia> m3 = DoublyRobustLearner(X, T, Y; num_machines=25)
 ```
 """
 mutable struct DoublyRobustLearner <: Metalearner
-    @double_learner_input_data
+    @standard_input_data
     @model_config individual_effect
+    folds::Integer
 end
 
 function DoublyRobustLearner(
     X,
     T,
     Y;
-    W=X,
-    regularized::Bool=true,
-    activation::Function=relu,
-    validation_metric::Function=mse,
-    min_neurons::Real=1,
-    max_neurons::Real=100,
-    iterations::Real=round(size(X, 1) / 10),
-    approximator_neurons::Real=round(size(X, 1) / 10),
+    activation::Function=swish,
+    sample_size::Integer=size(X, 1),
+    num_machines::Integer=50,
+    num_feats::Integer=Int(round(0.75 * size(X, 2))),
+    num_neurons::Integer=round(Int, log10(size(X, 1)) * size(X, 2)),
 )
-
     # Convert to arrays
-    X, T, Y, W = Matrix{Float64}(X), T[:, 1], Y[:, 1], Matrix{Float64}(W)
+    X, T, Y = Matrix{Float64}(X), T[:, 1], Y[:, 1]
+
+    # Shuffle data with random indices
+    indices = shuffle(1:length(Y))
+    X, T, Y = X[indices, :], T[indices], Y[indices]
 
     task = var_type(Y) isa Binary ? "classification" : "regression"
 
@@ -477,20 +389,16 @@ function DoublyRobustLearner(
         X,
         Float64.(T),
         Float64.(Y),
-        W,
         "CATE",
         false,
         task,
-        regularized,
         activation,
-        validation_metric,
-        min_neurons,
-        max_neurons,
-        2,
-        iterations,
-        approximator_neurons,
-        0,
+        sample_size,
+        num_machines,
+        num_feats,
+        num_neurons,
         fill(NaN, size(T, 1)),
+        2,
     )
 end
 
@@ -537,24 +445,19 @@ julia> estimate_causal_effect!(m5)
 """
 function estimate_causal_effect!(t::TLearner)
     x₀, x₁, y₀, y₁ = t.X[t.T .== 0, :], t.X[t.T .== 1, :], t.Y[t.T .== 0], t.Y[t.T .== 1]
-    type = var_type(t.Y)
 
-    # Only search for the best number of neurons once and use the same number for inference
-    t.num_neurons = t.num_neurons === 0 ? best_size(t) : t.num_neurons
+    t.μ₀ = ELMEnsemble(
+        x₀, y₀, t.sample_size, t.num_machines, t.num_feats, t.num_neurons, t.activation
+    )
 
-    if t.regularized
-        t.μ₀ = RegularizedExtremeLearner(x₀, y₀, t.num_neurons, t.activation)
-        t.μ₁ = RegularizedExtremeLearner(x₁, y₁, t.num_neurons, t.activation)
-    else
-        t.μ₀ = ExtremeLearner(x₀, y₀, t.num_neurons, t.activation)
-        t.μ₁ = ExtremeLearner(x₁, y₁, t.num_neurons, t.activation)
-    end
+    t.μ₁ = ELMEnsemble(
+        x₁, y₁, t.sample_size, t.num_machines, t.num_feats, t.num_neurons, t.activation
+    )
 
     fit!(t.μ₀)
     fit!(t.μ₁)
-    predictionsₜ = clip_if_binary(predict(t.μ₁, t.X), type)
-    predictionsᵪ = clip_if_binary(predict(t.μ₀, t.X), type)
-    t.causal_effect = @fastmath vec(predictionsₜ .- predictionsᵪ)
+    predictionsₜ, predictionsᵪ = predict(t.μ₁, t.X), predict(t.μ₀, t.X)
+    t.causal_effect = @fastmath vec(predictionsₜ - predictionsᵪ)
 
     return t.causal_effect
 end
@@ -578,16 +481,11 @@ julia> estimate_causal_effect!(m1)
 ```
 """
 function estimate_causal_effect!(x::XLearner)
-    # Only search for the best number of neurons once and use the same number for inference
-    x.num_neurons = x.num_neurons === 0 ? best_size(x) : x.num_neurons
-
-    type = var_type(x.Y)
     stage1!(x)
     μχ₀, μχ₁ = stage2!(x)
 
     x.causal_effect = @fastmath vec((
-        (x.ps .* clip_if_binary(predict(μχ₀, x.X), type)) .+
-        ((1 .- x.ps) .* clip_if_binary(predict(μχ₁, x.X), type))
+        (x.ps .* predict(μχ₀, x.X)) .+ ((1 .- x.ps) .* predict(μχ₁, x.X))
     ))
 
     return x.causal_effect
@@ -611,58 +509,29 @@ julia> estimate_causal_effect!(m1)
 ```
 """
 function estimate_causal_effect!(R::RLearner)
-    # Uses the same number of neurons for all phases of estimation
-    R.num_neurons = R.num_neurons === 0 ? best_size(R) : R.num_neurons
-
-    # Just estimate the causal effect using the underlying DML and the weight trick
-    R.causal_effect = causal_loss(R)
-
-    return R.causal_effect
-end
-
-"""
-    causal_loss(R)
-
-Minimize the causal loss function for an R-learner.
-
-# Notes
-This function should not be called directly.
+    X, T̃, Ỹ = generate_folds(R.X, R.T, R.Y, R.folds)
+    R.X, R.T, R.Y = reduce(vcat, X), reduce(vcat, T̃), reduce(vcat, Ỹ)
+
+    # Get residuals from out-of-fold predictions
+    for f in 1:(R.folds)
+        X_train, X_test = reduce(vcat, X[1:end .!== f]), X[f]
+        Y_train, Y_test = reduce(vcat, Ỹ[1:end .!== f]), Ỹ[f]
+        T_train, T_test = reduce(vcat, T̃[1:end .!== f]), T̃[f]
+        Ỹ[f], T̃[f] = predict_residuals(R, X_train, X_test, Y_train, Y_test, T_train, T_test)
+    end
 
-# References
-For an overview of R-learning see:
-    Nie, Xinkun, and Stefan Wager. "Quasi-oracle estimation of heterogeneous treatment 
-    effects." Biometrika 108, no. 2 (2021): 299-319.
+    # Using target transformation and the weight trick to minimize the causal loss
+    T̃², target = reduce(vcat, T̃).^2, reduce(vcat, Ỹ) ./ reduce(vcat, T̃)
+    Xʷ, Yʷ = R.X .* T̃², target .* T̃²
 
-# Examples
-```julia
-julia> X, T, Y =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100)
-julia> m1 = RLearner(X, T, Y)
-julia> causal_loss(m1)
-```
-"""
-function causal_loss(R::RLearner)
-    X, T, W, Y = make_folds(R)
-    predictors = Vector{RegularizedExtremeLearner}(undef, R.folds)
-
-    # Cross fitting by training on the main folds and predicting residuals on the auxillary
-    for fld in 1:(R.folds)
-        X_train, X_test = reduce(vcat, X[1:end .!== fld]), X[fld]
-        Y_train, Y_test = reduce(vcat, Y[1:end .!== fld]), Y[fld]
-        T_train, T_test = reduce(vcat, T[1:end .!== fld]), T[fld]
-        W_train, W_test = reduce(vcat, W[1:end .!== fld]), W[fld]
-
-        Ỹ, T̃ = predict_residuals(
-            R, X_train, X_test, Y_train, Y_test, T_train, T_test, W_train, W_test
-        )
+    # Fit a weighted residual-on-residual model
+    final_model = ELMEnsemble(
+        Xʷ, Yʷ, R.sample_size, R.num_machines, R.num_feats, R.num_neurons, R.activation
+    )
+    fit!(final_model)
+    R.causal_effect = predict(final_model, R.X)
 
-        # Using the weight trick to get the non-parametric CATE for an R-learner
-        X[fld], Y[fld] = (T̃ .^ 2) .* X_test, (T̃ .^ 2) .* (Ỹ ./ T̃)
-        mod = RegularizedExtremeLearner(X[fld], Y[fld], R.num_neurons, R.activation)
-        fit!(mod)
-        predictors[fld] = mod
-    end
-    final_predictions = [predict(m, reduce(vcat, X)) for m in predictors]
-    return vec(mapslices(mean, reduce(hcat, final_predictions); dims=2))
+    return R.causal_effect
 end
 
 """
@@ -683,17 +552,13 @@ julia> estimate_causal_effect!(m1)
 ```
 """
 function estimate_causal_effect!(DRE::DoublyRobustLearner)
-    X, T, W, Y = make_folds(DRE)
-    Z = DRE.W == DRE.X ? X : [reduce(hcat, (z)) for z in zip(X, W)]
+    X, T, Y = generate_folds(DRE.X, DRE.T, DRE.Y, DRE.folds)
     causal_effect = zeros(size(DRE.T, 1))
 
-    # Uses the same number of neurons for all phases of estimation
-    DRE.num_neurons = DRE.num_neurons === 0 ? best_size(DRE) : DRE.num_neurons
-
     # Rotating folds for cross fitting
-    for i in 1:(DRE.folds)
-        causal_effect .+= doubly_robust_formula!(DRE, X, T, Y, Z)
-        X, T, Y, Z = [X[2], X[1]], [T[2], T[1]], [Y[2], Y[1]], [Z[2], Z[1]]
+    for i in 1:2
+        causal_effect .+= doubly_robust_formula!(DRE, X, T, Y)
+        X, T, Y = [X[2], X[1]], [T[2], T[1]], [Y[2], Y[1]]
     end
 
     causal_effect ./= 2
@@ -703,7 +568,7 @@ function estimate_causal_effect!(DRE::DoublyRobustLearner)
 end
 
 """
-    doubly_robust_formula!(DRE, X, T, Y, Z)
+    doubly_robust_formula!(DRE, X, T, Y)
 
 Estimate the CATE for a single cross fitting iteration via doubly robust estimation.
 
@@ -715,33 +580,52 @@ This method should not be called directly.
 - `X`: a vector of three covariate folds.
 - `T`: a vector of three treatment folds.
 - `Y`: a vector of three outcome folds.
-- `Z` : a vector of three confounder folds and covariate folds.
 
 # Examples
 ```julia
 julia> X, T, Y, W =  rand(100, 5), [rand()<0.4 for i in 1:100], rand(100), rand(6, 100)
-julia> m1 = DoublyRobustLearner(X, T, Y, W=W)
+julia> m1 = DoublyRobustLearner(X, T, Y)
 
 julia> X, T, W, Y = make_folds(m1)
 julia> Z = m1.W == m1.X ? X : [reduce(hcat, (z)) for z in zip(X, W)]
 julia> g_formula!(m1, X, T, Y, Z)
 ```
 """
-function doubly_robust_formula!(DRE::DoublyRobustLearner, X, T, Y, Z)
-    π_arg, P = (Z[1], T[1], DRE.num_neurons, σ), var_type(DRE.Y)
-    μ₀_arg = Z[1][T[1] .== 0, :], Y[1][T[1] .== 0], DRE.num_neurons, DRE.activation
-    μ₁_arg = Z[1][T[1] .== 1, :], Y[1][T[1] .== 1], DRE.num_neurons, DRE.activation
-
+function doubly_robust_formula!(DRE::DoublyRobustLearner, X, T, Y)
     # Propensity scores
-    π_e = DRE.regularized ? RegularizedExtremeLearner(π_arg...) : ExtremeLearner(π_arg...)
+    π_e = ELMEnsemble(
+        X[1], 
+        T[1], 
+        DRE.sample_size, 
+        DRE.num_machines, 
+        DRE.num_feats, 
+        DRE.num_neurons, 
+        DRE.activation
+    )
+
+    # Outcome models
+    μ₀ = ELMEnsemble(
+        X[1][T[1] .== 0, :], 
+        Y[1][T[1] .== 0], 
+        DRE.sample_size, 
+        DRE.num_machines, 
+        DRE.num_feats,
+        DRE.num_neurons, 
+        DRE.activation
+    )
 
-    # Outcome predictions
-    μ₀ = DRE.regularized ? RegularizedExtremeLearner(μ₀_arg...) : ExtremeLearner(μ₀_arg...)
-    μ₁ = DRE.regularized ? RegularizedExtremeLearner(μ₁_arg...) : ExtremeLearner(μ₁_arg...)
+    μ₁ = ELMEnsemble(
+        X[1][T[1] .== 1, :], 
+        Y[1][T[1] .== 1], 
+        DRE.sample_size, 
+        DRE.num_machines, 
+        DRE.num_feats,
+        DRE.num_neurons, 
+        DRE.activation
+    )
 
     fit!.((π_e, μ₀, μ₁))
-    π̂ = clip_if_binary(predict(π_e, Z[2]), Binary())
-    μ₀̂, μ₁̂ = clip_if_binary(predict(μ₀, Z[2]), P), clip_if_binary(predict(μ₁, Z[2]), P)
+    π̂ , μ₀̂, μ₁̂  = predict(π_e, X[2]), predict(μ₀, X[2]), predict(μ₁, X[2])
 
     # Pseudo outcomes
     ϕ̂ =
@@ -749,11 +633,17 @@ function doubly_robust_formula!(DRE::DoublyRobustLearner, X, T, Y, Z)
         (Y[2] .- T[2] .* μ₁̂ .- (1 .- T[2]) .* μ₀̂) .+ μ₁̂ .- μ₀̂
 
     # Final model
-    τ_arg = X[2], ϕ̂, DRE.num_neurons, DRE.activation
-    τ_est = DRE.regularized ? RegularizedExtremeLearner(τ_arg...) : ExtremeLearner(τ_arg...)
+    τ_est = ELMEnsemble(
+        X[2], 
+        ϕ̂, 
+        DRE.sample_size, 
+        DRE.num_machines, 
+        DRE.num_feats, 
+        DRE.num_neurons, 
+        DRE.activation
+    )
     fit!(τ_est)
-
-    return clip_if_binary(predict(τ_est, DRE.X), P)
+    return predict(τ_est, DRE.X)
 end
 
 """
@@ -772,27 +662,33 @@ julia> stage1!(m1)
 ```
 """
 function stage1!(x::XLearner)
-    if x.regularized
-        g = RegularizedExtremeLearner(x.X, x.T, x.num_neurons, x.activation)
-        x.μ₀ = RegularizedExtremeLearner(
-            x.X[x.T .== 0, :], x.Y[x.T .== 0], x.num_neurons, x.activation
-        )
-        x.μ₁ = RegularizedExtremeLearner(
-            x.X[x.T .== 1, :], x.Y[x.T .== 1], x.num_neurons, x.activation
-        )
-    else
-        g = ExtremeLearner(x.X, x.T, x.num_neurons, x.activation)
-        x.μ₀ = ExtremeLearner(
-            x.X[x.T .== 0, :], x.Y[x.T .== 0], x.num_neurons, x.activation
-        )
-        x.μ₁ = ExtremeLearner(
-            x.X[x.T .== 1, :], x.Y[x.T .== 1], x.num_neurons, x.activation
-        )
-    end
+    g = ELMEnsemble(
+        x.X, x.T, x.sample_size, x.num_machines, x.num_feats, x.num_neurons, x.activation
+    )
+
+    x.μ₀ = ELMEnsemble(
+        x.X[x.T .== 0, :], 
+        x.Y[x.T .== 0], 
+        x.sample_size, 
+        x.num_machines, 
+        x.num_feats,
+        x.num_neurons, 
+        x.activation
+    )
+
+    x.μ₁ = ELMEnsemble(
+        x.X[x.T .== 1, :], 
+        x.Y[x.T .== 1], 
+        x.sample_size, 
+        x.num_machines, 
+        x.num_feats,
+        x.num_neurons, 
+        x.activation
+    )
 
     # Get propensity scores
     fit!(g)
-    x.ps = clip_if_binary(predict(g, x.X), Binary())
+    x.ps = predict(g, x.X)
 
     # Fit first stage outcome models
     fit!(x.μ₀)
@@ -816,21 +712,28 @@ julia> stage2!(m1)
 ```
 """
 function stage2!(x::XLearner)
-    m₁ = clip_if_binary(predict(x.μ₁, x.X .- x.Y), var_type(x.Y))
-    m₀ = clip_if_binary(predict(x.μ₀, x.X), var_type(x.Y))
+    m₁, m₀ = predict(x.μ₁, x.X .- x.Y), predict(x.μ₀, x.X)
     d = ifelse(x.T === 0, m₁, x.Y .- m₀)
+    
+    μχ₀ = ELMEnsemble(
+        x.X[x.T .== 0, :], 
+        d[x.T .== 0], 
+        x.sample_size, 
+        x.num_machines, 
+        x.num_feats,
+        x.num_neurons, 
+        x.activation
+    )
 
-    if x.regularized
-        μχ₀ = RegularizedExtremeLearner(
-            x.X[x.T .== 0, :], d[x.T .== 0], x.num_neurons, x.activation
-        )
-        μχ₁ = RegularizedExtremeLearner(
-            x.X[x.T .== 1, :], d[x.T .== 1], x.num_neurons, x.activation
-        )
-    else
-        μχ₀ = ExtremeLearner(x.X[x.T .== 0, :], d[x.T .== 0], x.num_neurons, x.activation)
-        μχ₁ = ExtremeLearner(x.X[x.T .== 1, :], d[x.T .== 1], x.num_neurons, x.activation)
-    end
+    μχ₁ = ELMEnsemble(
+        x.X[x.T .== 1, :], 
+        d[x.T .== 1], 
+        x.sample_size, 
+        x.num_machines, 
+        x.num_feats,
+        x.num_neurons, 
+        x.activation
+    )
 
     fit!(μχ₀)
     fit!(μχ₁)
diff --git a/src/model_validation.jl b/src/model_validation.jl
index c51bc032..30e0d8ba 100644
--- a/src/model_validation.jl
+++ b/src/model_validation.jl
@@ -1,37 +1,3 @@
-"""Abstract type used to dispatch risk_ratio on nonbinary treatments"""
-abstract type Nonbinary end
-
-"""Type used to dispatch risk_ratio on binary treatments"""
-struct Binary end
-
-"""Type used to dispatch risk_ratio on count treatments"""
-struct Count <: Nonbinary end
-
-"""Type used to dispatch risk_ratio on continuous treatments"""
-struct Continuous <: Nonbinary end
-
-"""
-    var_type(x)
-
-Determine the type of variable held by a vector.
-
-# Examples
-```jldoctest
-julia> CausalELM.var_type([1, 2, 3, 2, 3, 1, 1, 3, 2])
-CausalELM.Count()
-```
-"""
-function var_type(x::Array{<:Real})
-    x_set = Set(x)
-    if x_set == Set([0, 1]) || x_set == Set([0]) || x_set == Set([1])
-        return Binary()
-    elseif x_set == Set(round.(x_set))
-        return Count()
-    else
-        return Continuous()
-    end
-end
-
 """
     validate(its; kwargs...)
 
@@ -208,7 +174,7 @@ function covariate_independence(its::InterruptedTimeSeries; n=1000)
     x₀ = reduce(hcat, (its.X₀[:, 1:(end - 1)], zeros(size(its.X₀, 1))))
     x₁ = reduce(hcat, (its.X₁[:, 1:(end - 1)], ones(size(its.X₁, 1))))
     x = reduce(vcat, (x₀, x₁))
-    results = Dict{String,Float64}()
+    results = Dict{String, Float64}()
 
     # Estimate a linear regression with each covariate as a dependent variable and all other
     # covariates and time as independent variables
@@ -424,10 +390,11 @@ julia> counterfactual_consistency(g_computer)
 """
 function counterfactual_consistency(model, devs, iterations)
     counterfactual_model = deepcopy(model)
-    avg_counterfactual_effects = Dict{Float64,Float64}()
+    avg_counterfactual_effects = Dict{String,Float64}()
 
     for dev in devs
-        avg_counterfactual_effects[dev] = 0.0
+        key = string(dev) * " Standard Deviations from Observed Outcomes"
+        avg_counterfactual_effects[key] = 0.0
 
         # Averaging multiple iterations of random violatons for each std dev
         for iteration in 1:iterations
@@ -435,12 +402,12 @@ function counterfactual_consistency(model, devs, iterations)
             estimate_causal_effect!(counterfactual_model)
 
             if counterfactual_model isa Metalearner
-                avg_counterfactual_effects[dev] += mean(counterfactual_model.causal_effect)
+                avg_counterfactual_effects[key] += mean(counterfactual_model.causal_effect)
             else
-                avg_counterfactual_effects[dev] += counterfactual_model.causal_effect
+                avg_counterfactual_effects[key] += counterfactual_model.causal_effect
             end
         end
-        avg_counterfactual_effects[dev] /= iterations
+        avg_counterfactual_effects[key] /= iterations
     end
     return avg_counterfactual_effects
 end
@@ -465,10 +432,10 @@ function simulate_counterfactual_violations(y::Vector{<:Real}, dev::Float64)
     min_y, max_y = minimum(y), maximum(y)
 
     if var_type(y) isa Continuous
-        violations = (sqrt(var(y)) * dev) * randn(length(y))
-        counterfactual_Y = y .+ violations
+        violations = dev .* randn(length(y))
+        counterfactual_Y = y .+ (violations .* y)
     else
-        counterfactual_Y = ifelse.(rand() > dev, Float64(rand(min_y:max_y)), y)
+        counterfactual_Y = ifelse.(rand() < dev, Float64(rand(min_y:max_y)), y)
     end
     return counterfactual_Y
 end
@@ -592,7 +559,7 @@ function risk_ratio(::Nonbinary, mod)
         # Otherwise, we convert the treatment variable to a binary variable and then 
         # dispatch based on the type of outcome variable
     else
-        original_T, binary_T = mod.T, binarize(mod.T, mean(mod.Y))
+        original_T, binary_T = mod.T, binarize(mod.T, mean(mod.T))
         mod.T = binary_T
         rr = risk_ratio(Binary(), mod)
 
@@ -609,8 +576,8 @@ function risk_ratio(::Binary, ::Binary, mod)
     Xₜ, Xᵤ = reduce(hcat, (Xₜ, ones(size(Xₜ, 1)))), reduce(hcat, (Xᵤ, ones(size(Xᵤ, 1))))
 
     # For algorithms that use one model to estimate the outcome
-    if hasfield(typeof(mod), :learner)
-        return @fastmath mean(predict(mod.learner, Xₜ)) / mean(predict(mod.learner, Xᵤ))
+    if hasfield(typeof(mod), :ensemble)
+        return @fastmath (mean(predict(mod.ensemble, Xₜ)) / mean(predict(mod.ensemble, Xᵤ)))
 
         # For models that use separate models for outcomes in the treatment and control group
     else
@@ -627,26 +594,27 @@ function risk_ratio(::Binary, ::Count, mod)
     Xₜ, Xᵤ = reduce(hcat, (Xₜ, ones(m))), reduce(hcat, (Xᵤ, ones(n)))
 
     # For estimators with a single model of the outcome variable
-    if hasfield(typeof(mod), :learner)
-        return @fastmath (sum(predict(mod.learner, Xₜ)) / m) /
-            (sum(predict(mod.learner, Xᵤ)) / n)
+    if hasfield(typeof(mod), :ensemble)
+        return @fastmath (sum(predict(mod.ensemble, Xₜ)) / m) /
+            (sum(predict(mod.ensemble, Xᵤ)) / n)
 
         # For models that use separate models for outcomes in the treatment and control group
     elseif hasfield(typeof(mod), :μ₀)
         Xₜ, Xᵤ = mod.X[mod.T .== 1, :], mod.X[mod.T .== 0, :]
         return @fastmath mean(predict(mod.μ₁, Xₜ)) / mean(predict(mod.μ₀, Xᵤ))
     else
-        if mod.regularized
-            learner = RegularizedExtremeLearner(
-                reduce(hcat, (mod.X, mod.T)), mod.Y, mod.num_neurons, mod.activation
-            )
-        else
-            learner = ExtremeLearner(
-                reduce(hcat, (mod.X, mod.T)), mod.Y, mod.num_neurons, mod.activation
+        learner = ELMEnsemble(
+                reduce(hcat, (mod.X, mod.T)), 
+                mod.Y, 
+                mod.sample_size, 
+                mod.num_machines, 
+                mod.num_feats, 
+                mod.num_neurons, 
+                mod.activation
             )
-        end
+
         fit!(learner)
-        @fastmath (sum(predict(learner, Xₜ)) / m) / (sum(predict(learner, Xᵤ)) / n)
+        @fastmath mean(predict(learner, Xₜ)) / mean(predict(learner, Xᵤ))
     end
 end
 
@@ -686,13 +654,15 @@ julia> positivity(g_computer)
 ```
 """
 function positivity(model, min=1.0e-6, max=1 - min)
-    if model.regularized
-        ps_mod = RegularizedExtremeLearner(
-            model.X, model.T, model.num_neurons, model.activation
-            )
-    else
-        ps_mod = ExtremeLearner(model.X, model.T, model.num_neurons, model.activation)
-    end
+    ps_mod = ELMEnsemble(
+            model.X, 
+            model.T, 
+            model.sample_size, 
+            model.num_machines, 
+            model.num_feats, 
+            model.num_neurons, 
+            model.activation
+        )
 
     fit!(ps_mod)
     propensity_scores = predict(ps_mod, model.X)
@@ -717,25 +687,3 @@ function positivity(model::XLearner, min=1.0e-6, max=1 - min)
         ),
     )
 end
-
-function positivity(model::Union{DoubleMachineLearning,RLearner}, min=1.0e-6, max=1 - min)
-    num_neurons = best_size(model)
-
-    if model.regularized
-        ps_mod = RegularizedExtremeLearner(model.X, model.T, num_neurons, model.activation)
-    else
-        ps_mod = ExtremeLearner(model.X, model.T, num_neurons, model.activation)
-    end
-
-    fit!(ps_mod)
-    propensity_scores = predict(ps_mod, model.X)
-
-    # Observations that have a zero probability of treatment or control assignment
-    return reduce(
-        hcat,
-        (
-            model.X[propensity_scores .<= min .|| propensity_scores .>= max, :],
-            propensity_scores[propensity_scores .<= min .|| propensity_scores .>= max],
-        ),
-    )
-end
diff --git a/src/models.jl b/src/models.jl
index c61b2803..b13edda5 100644
--- a/src/models.jl
+++ b/src/models.jl
@@ -1,7 +1,5 @@
-using LinearAlgebra: pinv, I, norm, tr
-
-"""Abstract type that includes vanilla and L2 regularized Extreme Learning Machines"""
-abstract type ExtremeLearningMachine end
+using Random: shuffle
+using CausalELM: mean, var_type, clip_if_binary
 
 """
     ExtremeLearner(X, Y, hidden_neurons, activation)
@@ -17,15 +15,13 @@ For more details see:
     Huang, Guang-Bin, Qin-Yu Zhu, and Chee-Kheong Siew. "Extreme learning machine: theory 
     and applications." Neurocomputing 70, no. 1-3 (2006): 489-501.
 
-See also [`CausalELM.RegularizedExtremeLearner`](@ref).
-
 # Examples
 ```julia
 julia> x, y = [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0], [0.0, 1.0, 0.0, 1.0]
 julia> m1 = ExtremeLearner(x, y, 10, σ)
 ```
 """
-mutable struct ExtremeLearner <: ExtremeLearningMachine
+mutable struct ExtremeLearner
     X::Array{Float64}
     Y::Array{Float64}
     training_samples::Int64
@@ -33,52 +29,70 @@ mutable struct ExtremeLearner <: ExtremeLearningMachine
     hidden_neurons::Int64
     activation::Function
     __fit::Bool
-    __estimated::Bool
     weights::Array{Float64}
     β::Array{Float64}
     H::Array{Float64}
     counterfactual::Array{Float64}
 
     function ExtremeLearner(X, Y, hidden_neurons, activation)
-        return new(X, Y, size(X, 1), size(X, 2), hidden_neurons, activation, false, false)
+        return new(X, Y, size(X, 1), size(X, 2), hidden_neurons, activation, false)
     end
 end
 
 """
-    RegularizedExtremeLearner(X, Y, hidden_neurons, activation)
+    ELMEnsemble(X, Y, sample_size, num_machines, num_feats, num_neurons, activation)
+
+Initialize a bagging ensemble of extreme learning machines. 
+
+# Arguments
+- `X::Array{Float64}`: array of features for predicting labels.
+- `Y::Array{Float64}`: array of labels to predict.
+- `sample_size::Integer`: how many data points to use for each extreme learning machine.
+- `num_machines::Integer`: how many extreme learning machines to use.
+- `num_feats::Integer`: how many features to consider for each extreme learning machine.
+- `num_neurons::Integer`: how many neurons to use for each extreme learning machine.
+- `activation::Function`: activation function to use for the extreme learning machines.
 
-Construct a RegularizedExtremeLearner for fitting and prediction.
+# Notes
+ELMEnsemble uses the same bagging approach as random forests when the labels are continuous 
+but uses the average predicted probability, rather than voting, for classification.
 
 # Examples
 ```julia
-julia> x, y = [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0], [0.0, 1.0, 0.0, 1.0]
-julia> m1 = RegularizedExtremeLearner(x, y, 10, σ)
+julia> X, Y =  rand(100, 5), rand(100)
+julia> m1 = ELMEnsemble(X, Y, 10, 50, 5, 5, CausalELM.relu)
 ```
 """
-mutable struct RegularizedExtremeLearner <: ExtremeLearningMachine
+mutable struct ELMEnsemble
     X::Array{Float64}
     Y::Array{Float64}
-    training_samples::Int64
-    features::Int64
-    hidden_neurons::Int64
-    activation::Function
-    __fit::Bool
-    __estimated::Bool
-    weights::Array{Float64}
-    β::Array{Float64}
-    k::Float64
-    H::Array{Float64}
-    counterfactual::Array{Float64}
+    elms::Array{ExtremeLearner}
+    feat_indices::Vector{Vector{Int64}}
+end
 
-    function RegularizedExtremeLearner(X, Y, hidden_neurons, activation)
-        return new(X, Y, size(X, 1), size(X, 2), hidden_neurons, activation, false, false)
-    end
+function ELMEnsemble(
+    X::Array{Float64}, 
+    Y::Array{Float64}, 
+    sample_size::Integer, 
+    num_machines::Integer,
+    num_feats::Integer, 
+    num_neurons::Integer,
+    activation::Function
+)
+    # Sampling from the data with replacement
+    indices = [rand(1:length(Y), sample_size) for i ∈ 1:num_machines]
+    feat_indices = [shuffle(1:size(X, 2))[1:num_feats] for i ∈ 1:num_machines]
+    xs = [X[indices[i], feat_indices[i]] for i ∈ 1:num_machines]
+    ys = [Y[indices[i]] for i ∈ 1:num_machines]
+    elms = [ExtremeLearner(xs[i], ys[i], num_neurons, activation) for i ∈ eachindex(xs)]
+
+    return ELMEnsemble(X, Y, elms, feat_indices)
 end
 
 """
     fit!(model)
 
-Make predictions with an ExtremeLearner.
+Fit an ExtremeLearner to the data.
 
 # References
 For more details see: 
@@ -95,45 +109,43 @@ function fit!(model::ExtremeLearner)
     set_weights_biases(model)
 
     model.__fit = true
-    model.β = @fastmath pinv(model.H) * model.Y
+    model.β = model.H\model.Y
     return model.β
 end
 
 """
     fit!(model)
 
-Fit a Regularized Extreme Learner.
+Fit an ensemble of ExtremeLearners to the data. 
 
-# References
-For more details see: 
-    Li, Guoqiang, and Peifeng Niu. "An enhanced extreme learning machine based on ridge 
-    regression for regression." Neural Computing and Applications 22, no. 3 (2013): 
-    803-810.
+# Arguments
+- `model::ELMEnsemble`: ensemble of ExtremeLearners to fit.
+
+# Notes
+This uses the same bagging approach as random forests when the labels are continuous but 
+uses the average predicted probability, rather than voting, for classification.
 
 # Examples
 ```julia
-julia> x, y = [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0], [0.0, 1.0, 0.0, 1.0]
-julia> m1 = RegularizedExtremeLearner(x, y, 10, σ)
-julia> f1 = fit!(m1)
+julia> X, Y =  rand(100, 5), rand(100)
+julia> m1 = ELMEnsemble(X, Y, 10, 50, 5, 5, CausalELM.relu)
+julia> fit!(m1)
 ```
 """
-function fit!(model::RegularizedExtremeLearner)
-    set_weights_biases(model)
-    k = ridge_constant(model)
-    Id = Matrix(I, size(model.H, 2), size(model.H, 2))
-
-    model.β = @fastmath pinv(transpose(model.H) * model.H + k * Id) *
-        transpose(model.H) *
-        model.Y
-    model.__fit = true  # Enables running predict
-
-    return model.β
+function fit!(model::ELMEnsemble)
+    Threads.@threads for elm in model.elms
+        fit!(elm)
+    end
 end
 
 """
     predict(model, X)
 
-Use an ExtremeLearningMachine to make predictions.
+Use an ExtremeLearner or ELMEnsemble to make predictions.
+
+# Notes
+If using an ensemble to make predictions, this method returns a vector with one element
+per row of X, each being the mean of the individual learners' predictions for that row.
 
 # References
 For more details see: 
@@ -144,16 +156,31 @@ For more details see:
 ```julia
 julia> x, y = [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0], [0.0, 1.0, 0.0, 1.0]
 julia> m1 = ExtremeLearner(x, y, 10, σ)
-julia> f1 = fit(m1, sigmoid)
+julia> fit!(m1)
 julia> predict(m1, [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0])
+
+julia> m2 = ELMEnsemble(x, y, 4, 50, 2, 5, CausalELM.relu)
+julia> fit!(m2)
+julia> predict(m2, x)
 ```
 """
-function predict(model::ExtremeLearningMachine, X)
+function predict(model::ExtremeLearner, X)
     if !model.__fit
         throw(ErrorException("run fit! before calling predict"))
     end
 
-    return @fastmath model.activation(X * model.weights) * model.β
+    predictions = model.activation(X * model.weights) * model.β
+
+    return clip_if_binary(predictions, var_type(model.Y))
+end
+
+@inline function predict(model::ELMEnsemble, X) 
+    predictions = reduce(
+        hcat, 
+        [predict(model.elms[i], X[:, model.feat_indices[i]]) for i ∈ 1:length(model.elms)]
+    )
+
+    return vec(mapslices(mean, predictions, dims=2))
 end
 
 """
@@ -175,8 +202,8 @@ julia> f1 = fit(m1, sigmoid)
 julia> predict_counterfactual!(m1, [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0])
 ```
 """
-function predict_counterfactual!(model::ExtremeLearningMachine, X)
-    model.counterfactual, model.__estimated = predict(model, X), true
+function predict_counterfactual!(model::ExtremeLearner, X)
+    model.counterfactual = predict(model, X)
 
     return model.counterfactual
 end
@@ -202,69 +229,18 @@ julia> predict_counterfactual(m1, [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0])
 julia> placebo_test(m1)
 ```
 """
-function placebo_test(model::ExtremeLearningMachine)
+function placebo_test(model::ExtremeLearner)
     m = "Use predict_counterfactual! to estimate a counterfactual before using placebo_test"
-    if !model.__estimated
+    if !isdefined(model, :counterfactual)
         throw(ErrorException(m))
     end
     return predict(model, model.X), model.counterfactual
 end
 
-"""
-    ridge_constant(model, [,iterations])
-
-Calculate the L2 penalty for a regularized extreme learning machine using generalized cross 
-validation with successive halving.
-
-# Arguments
-- `model::RegularizedExtremeLearner`: regularized extreme learning machine.
-- `iterations::Int`: number of iterations to perform for successive halving.
-
-# References
-For more information see: 
-    Golub, Gene H., Michael Heath, and Grace Wahba. "Generalized cross-validation as a 
-    method for choosing a good ridge parameter." Technometrics 21, no. 2 (1979): 215-223.
-
-# Examples
-```julia
-julia> m1 = RegularizedExtremeLearner(x, y, 10, σ)
-julia> ridge_constant(m1)
-julia> ridge_constant(m1, iterations=20)
-```
-"""
-function ridge_constant(model::RegularizedExtremeLearner, iterations::Int=10)
-    S(λ, X, X̂, n) = X * pinv(X̂ .+ (n * λ * Matrix(I, n, n))) * transpose(X)
-    set_weights_biases(model)
-    Ĥ = transpose(model.H) * model.H
-
-    function gcv(H, Y, λ)  # Estimates the generalized cross validation function for given λ
-        S̃, n = S(λ, H, Ĥ, size(H, 2)), size(H, 1)
-        return ((norm((ones(n) .- S̃) * Y)^2) / n) / ((tr(Matrix(I, n, n) .- S̃) / n)^2)
-    end
-
-    k₁, k₂, Λ = 1e-9, 1 - 1e-9, sum((1e-9, 1 - 1e-9)) / 2  # Initial window to search
-    for i in 1:iterations
-        gcv₁, gcv₂ = @fastmath gcv(model.H, model.Y, k₁), gcv(model.H, model.Y, k₂)
-
-        # Divide the search space in half
-        if gcv₁ < gcv₂
-            k₂ /= 2
-        elseif gcv₁ > gcv₂
-            k₁ *= 2
-        elseif gcv₁ ≈ gcv₂
-            return (k₁ + k₂) / 2  # Early stopping
-        end
-
-        Λ = (k₁ + k₂) / 2
-    end
-    return Λ
-end
-
 """
     set_weights_biases(model)
 
-Calculate the weights and biases for an extreme learning machine or regularized extreme 
-learning machine.
+Calculate the weights and biases for an extreme learning machine.
 
 # Notes
 Initialization is done using uniform Xavier initialization.
@@ -280,9 +256,8 @@ julia> m1 = RegularizedExtremeLearner(x, y, 10, σ)
 julia> set_weights_biases(m1)
 ```
 """
-function set_weights_biases(model::ExtremeLearningMachine)
-    n_in, n_out = size(model.X, 2), model.hidden_neurons
-    a, b = -sqrt(6) / sqrt(n_in + n_out), sqrt(6) / sqrt(n_in + n_out)
+function set_weights_biases(model::ExtremeLearner)
+    a, b = -1, 1
     model.weights = @fastmath a .+ ((b - a) .* rand(model.features, model.hidden_neurons))
 
     return model.H = @fastmath model.activation((model.X * model.weights))
@@ -294,11 +269,8 @@ function Base.show(io::IO, model::ExtremeLearner)
     )
 end
 
-function Base.show(io::IO, model::RegularizedExtremeLearner)
+function Base.show(io::IO, model::ELMEnsemble)
     return print(
-        io,
-        "Regularized Extreme Learning Machine with ",
-        model.hidden_neurons,
-        " hidden neurons",
+        io, "Extreme Learning Machine Ensemble with ", length(model.elms), " learners"
     )
 end
diff --git a/src/utilities.jl b/src/utilities.jl
index 9bcd3917..3c44495c 100644
--- a/src/utilities.jl
+++ b/src/utilities.jl
@@ -1,3 +1,40 @@
+using Random: shuffle
+
+"""Abstract type used to dispatch risk_ratio on nonbinary treatments"""
+abstract type Nonbinary end
+
+"""Type used to dispatch risk_ratio on binary treatments"""
+struct Binary end
+
+"""Type used to dispatch risk_ratio on count treatments"""
+struct Count <: Nonbinary end
+
+"""Type used to dispatch risk_ratio on continuous treatments"""
+struct Continuous <: Nonbinary end
+
+"""
+    var_type(x)
+
+Determine the type of variable held by a vector.
+
+# Examples
+```jldoctest
+julia> CausalELM.var_type([1, 2, 3, 2, 3, 1, 1, 3, 2])
+CausalELM.Count()
+```
+"""
+function var_type(x::Array{<:Real})
+    x_set = Set(x)
+    
+    if x_set == Set([0, 1]) || x_set == Set([0]) || x_set == Set([1])
+        return Binary()
+    elseif x_set == Set(round.(x_set))
+        return Count()
+    else
+        return Continuous()
+    end
+end
+
 """
     mean(x)
 
@@ -60,8 +97,8 @@ See also [`var_type`](@ref).
 ```jldoctest
 julia> CausalELM.clip_if_binary([1.2, -0.02], CausalELM.Binary())
 2-element Vector{Float64}:
- 0.9999999
- 1.0e-7
+ 1.0
+ 0.0
 
 julia> CausalELM.clip_if_binary([1.2, -0.02], CausalELM.Count())
 2-element Vector{Float64}:
@@ -69,7 +106,7 @@ julia> CausalELM.clip_if_binary([1.2, -0.02], CausalELM.Count())
  -0.02
 ```
 """
-clip_if_binary(x::Array{<:Real}, var) = var isa Binary ? clamp.(x, 1e-7, 1 - 1e-7) : x
+clip_if_binary(x::Array{<:Real}, var) = var isa Binary ? clamp.(x, 0.0, 1.0) : x
 
 """
     model_config(effect_type)
@@ -103,15 +140,11 @@ macro model_config(effect_type)
         quantity_of_interest::String
         temporal::Bool
         task::String
-        regularized::Bool
         activation::Function
-        validation_metric::Function
-        min_neurons::Int64
-        max_neurons::Int64
-        folds::Int64
-        iterations::Int64
-        approximator_neurons::Int64
-        num_neurons::Int64
+        sample_size::Integer
+        num_machines::Integer
+        num_feats::Integer
+        num_neurons::Integer
         causal_effect::$field_type
     end
     return esc(fields)
@@ -140,23 +173,35 @@ macro standard_input_data()
 end
 
 """
-    double_learner_input_data()
+    generate_folds(X, T, Y, folds)
 
-Generate fields common to DoubleMachineLearning, RLearner, and DoublyRobustLearner.
+Create folds for cross validation.
 
 # Examples
-```julia
-julia> struct TestStruct CausalELM.@double_learner_input_data end
-julia> TestStruct([5.2], [0.8], [0.96], [0.87 1.8])
-TestStruct([5.2], [0.8], [0.96], [0.87 1.8])
+```jldoctest
+julia> xfolds, tfolds, yfolds = CausalELM.generate_folds(zeros(4, 2), zeros(4), ones(4), 2)
+([[0.0 0.0], [0.0 0.0; 0.0 0.0; 0.0 0.0]], [[0.0], [0.0, 0.0, 0.0]], [[1.0], [1.0, 1.0, 1.0]])
 ```
 """
-macro double_learner_input_data()
-    inputs = quote
-        X::Array{Float64}
-        T::Array{Float64}
-        Y::Array{Float64}
-        W::Array{Float64}
+function generate_folds(X, T, Y, folds)
+    msg = """the number of folds must be less than the number of observations"""
+    n = length(Y)
+
+    if folds >= n throw(ArgumentError(msg))end
+
+    x_folds = Array{Array{Float64, 2}}(undef, folds)
+    t_folds = Array{Array{Float64, 1}}(undef, folds)
+    y_folds = Array{Array{Float64, 1}}(undef, folds)
+
+    # Indices to start and stop for each fold
+    stops = round.(Int, range(; start=1, stop=n, length=folds + 1))
+
+    # Indices to use for making folds
+    indices = [s:(e - (e < n) * 1) for (s, e) in zip(stops[1:(end - 1)], stops[2:end])]
+
+    for (i, idx) in enumerate(indices)
+        x_folds[i], t_folds[i], y_folds[i] = X[idx, :], T[idx], Y[idx]
     end
-    return esc(inputs)
+
+    return x_folds, t_folds, y_folds
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index a801b185..18b1e7ad 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,15 +1,18 @@
-using Test, Documenter, CausalELM
+using Test
+using Aqua
+using Documenter
+using CausalELM
 
 include("test_activation.jl")
 include("test_models.jl")
 include("test_metrics.jl")
-include("test_crossval.jl")
 include("test_estimators.jl")
 include("test_metalearners.jl")
 include("test_inference.jl")
 include("test_model_validation.jl")
 include("test_utilities.jl")
-include("test_aqua.jl")
+
+Aqua.test_all(CausalELM)
 
 DocMeta.setdocmeta!(CausalELM, :DocTestSetup, :(using CausalELM); recursive=true)
 doctest(CausalELM)
diff --git a/test/test_aqua.jl b/test/test_aqua.jl
deleted file mode 100644
index 865951c2..00000000
--- a/test/test_aqua.jl
+++ /dev/null
@@ -1,3 +0,0 @@
-using Aqua
-
-Aqua.test_all(CausalELM; persistent_tasks=false)
diff --git a/test/test_crossval.jl b/test/test_crossval.jl
deleted file mode 100644
index 58eb508f..00000000
--- a/test/test_crossval.jl
+++ /dev/null
@@ -1,165 +0,0 @@
-using Test
-using CausalELM
-
-using CausalELM: relu
-include("../src/crossval.jl")
-
-x, y = shuffle_data(rand(100, 5), Float64.([rand() < 0.4 for i in 1:100]))
-xfolds, yfolds = generate_folds(zeros(20, 2), zeros(20), 5)
-xfolds_ts, yfolds_ts = generate_temporal_folds(
-    float.(hcat([1:10;], 11:20)), [1.0:1.0:10.0;], 5
-)
-
-X₀, Y₀, X₁, Y₁ = rand(100, 5), rand(100), rand(10, 5), rand(10)
-its = InterruptedTimeSeries(X₀, Y₀, X₁, Y₁)
-
-X, T, Y = rand(100, 5), rand(100), [rand() < 0.4 for i in 1:100]
-g_computation_regression = GComputation(X, T, Y)
-g_computation_classification = GComputation(X, T, rand(0:1, 100))
-
-@testset "Fold Generation" begin
-    @test_throws ArgumentError generate_folds(zeros(5, 2), zeros(5), 6)
-    @test_throws ArgumentError generate_folds(zeros(5, 2), zeros(5), 5)
-    @test size(xfolds, 1) == 5
-    @test size(xfolds[1], 1) == 4
-    @test size(xfolds[2], 2) == 2
-    @test length(yfolds) == 5
-    @test size(yfolds[1], 1) == 4
-    @test size(yfolds[2], 2) == 1
-    @test isa(xfolds, Array)
-    @test isa(yfolds, Array)
-
-    # Time series or panel data
-    # Testing incorrect input
-    @test_throws ArgumentError generate_temporal_folds(zeros(5, 2), zeros(5), 6)
-    @test_throws ArgumentError generate_temporal_folds(zeros(5, 2), zeros(5), 5)
-    @test_throws ArgumentError generate_temporal_folds(zeros(10, 2), zeros(5), 6)
-    @test_throws ArgumentError generate_temporal_folds(zeros(10, 2), zeros(5), 5)
-
-    @test size(xfolds_ts, 1) == 5
-    @test size(xfolds_ts[1], 1) == 2
-    @test size(xfolds_ts[2], 2) == 2
-    @test length(yfolds_ts) == 5
-    @test size(yfolds_ts[1], 1) == 2
-    @test size(yfolds_ts[2], 2) == 1
-    @test isa(xfolds_ts, Array)
-    @test isa(yfolds_ts, Array)
-end
-
-@testset "Single cross validation iteration" begin
-
-    # Regression: Not TS L2, TS L2
-    @test isa(
-        validation_loss(rand(100, 5), rand(100), rand(20, 5), rand(20), 5, mse), Float64
-    )
-    @test isa(
-        validation_loss(rand(100, 5), rand(100), rand(20, 5), rand(20), 5, mse), Float64
-    )
-    @test isa(
-        validation_loss(
-            rand(100, 5), rand(100), rand(20, 5), rand(20), 5, mse; regularized=false
-        ),
-        Float64,
-    )
-    @test isa(
-        validation_loss(
-            rand(100, 5),
-            rand(100),
-            rand(20, 5),
-            rand(20),
-            5,
-            mse;
-            regularized=false,
-            activation=gelu,
-        ),
-        Float64,
-    )
-
-    # Classification: Not TS L2, TS L2
-    @test isa(
-        validation_loss(
-            rand(100, 5),
-            Float64.(rand(100) .> 0.5),
-            rand(20, 5),
-            Float64.(rand(20) .> 0.5),
-            5,
-            accuracy,
-        ),
-        Float64,
-    )
-    @test isa(
-        validation_loss(
-            rand(100, 5),
-            Float64.(rand(100) .> 0.5),
-            rand(20, 5),
-            Float64.(rand(20) .> 0.5),
-            5,
-            accuracy,
-        ),
-        Float64,
-    )
-    @test isa(
-        validation_loss(
-            rand(100, 5),
-            Float64.(rand(100) .> 0.5),
-            rand(20, 5),
-            Float64.(rand(20) .> 0.5),
-            5,
-            accuracy;
-            regularized=false,
-            activation=gelu,
-        ),
-        Float64,
-    )
-
-    @test isa(
-        validation_loss(
-            rand(100, 5),
-            Float64.(rand(100) .> 0.5),
-            rand(20, 5),
-            Float64.(rand(20) .> 0.5),
-            5,
-            accuracy;
-            regularized=false,
-        ),
-        Float64,
-    )
-end
-
-@testset "Cross validation" begin
-
-    # Regression
-    @test isa(
-        cross_validate(rand(100, 5), rand(100), 5, mse, relu, true, 5, false), Float64
-    )
-    @test isa(
-        cross_validate(rand(100, 5), rand(100), 5, mse, relu, false, 5, true), Float64
-    )
-
-    # Classification
-    @test isa(
-        cross_validate(
-            rand(100, 5), Float64.(rand(100) .> 0.5), 5, accuracy, relu, true, 5, false
-        ),
-        Float64,
-    )
-    @test isa(
-        cross_validate(
-            rand(100, 5), Float64.(rand(100) .> 0.5), 5, accuracy, relu, false, 5, true
-        ),
-        Float64,
-    )
-end
-
-@testset "Best network size" begin
-    @test 100 >= best_size(its) >= 1
-    @test 100 >= best_size(g_computation_regression) >= 1
-    @test 100 >= best_size(g_computation_classification) >= 1
-end
-
-@testset "Data Shuffling" begin
-    @test size(x) === (100, 5)
-    @test x isa Array{Float64}
-    @test size(y, 1) === 100
-    @test y isa Vector{Float64}
-end
diff --git a/test/test_estimators.jl b/test/test_estimators.jl
index 29e859e0..f9916840 100644
--- a/test/test_estimators.jl
+++ b/test/test_estimators.jl
@@ -19,10 +19,6 @@ its_df = InterruptedTimeSeries(x₀_df, y₀_df, x₁_df, y₁_df)
 its_no_ar = InterruptedTimeSeries(x₀, y₀, x₁, y₁)
 estimate_causal_effect!(its_no_ar)
 
-# Testing without regularization
-its_noreg = InterruptedTimeSeries(x₀, y₀, x₁, y₁; regularized=false)
-estimate_causal_effect!(its_noreg)
-
 x, t, y = rand(100, 5), rand(0:1, 100), vec(rand(1:100, 100, 1))
 g_computer = GComputation(x, t, y; temporal=false)
 estimate_causal_effect!(g_computer)
@@ -37,15 +33,14 @@ t_df, y_df = DataFrame(; t=rand(0:1, 100)), DataFrame(; y=rand(100))
 g_computer_df = GComputation(x_df, t_df, y_df)
 gcomputer_att = GComputation(x, t, y; quantity_of_interest="ATT", temporal=false)
 estimate_causal_effect!(gcomputer_att)
-gcomputer_noreg = GComputation(x, t, y; regularized=false)
-estimate_causal_effect!(gcomputer_noreg)
 
 # Make sure the data isn't shuffled
 g_computer_ts = GComputation(
     float.(hcat([1:10;], 11:20)), Float64.([rand() < 0.4 for i in 1:10]), rand(10)
 )
 
-dm = DoubleMachineLearning(x, t, y)
+big_x, big_t, big_y = rand(10000, 8), rand(0:1, 10000), vec(rand(1:100, 10000, 1))
+dm = DoubleMachineLearning(big_x, big_t, big_y)
 estimate_causal_effect!(dm)
 
 # Testing with a binary outcome
@@ -55,30 +50,13 @@ estimate_causal_effect!(dm_binary_out)
 # With dataframes instead of arrays
 dm_df = DoubleMachineLearning(x_df, t_df, y_df)
 
-# No regularization
-dm_noreg = DoubleMachineLearning(x, t, y; regularized=false)
-estimate_causal_effect!(dm_noreg)
-
-# Specifying W
-dm_w = DoubleMachineLearning(x, t, y; W=rand(100, 4))
-estimate_causal_effect!(dm_w)
-
-# Calling estimate_effect!
-dm_estimate_effect = DoubleMachineLearning(x, t, y)
-dm_estimate_effect.num_neurons = 5
-CausalELM.causal_loss!(dm_estimate_effect)
-
-# Generating folds
-x_fold, t_fold, w_fold, y_fold = CausalELM.make_folds(dm)
-
 # Test predicting residuals
 x_train, x_test = x[1:80, :], x[81:end, :]
-t_train, t_test = t[1:80], t[81:100]
-y_train, y_test = y[1:80], y[81:end]
-residual_predictor = DoubleMachineLearning(x, t, y)
-residual_predictor.num_neurons = 5
+t_train, t_test = float(t[1:80]), float(t[81:end])
+y_train, y_test = float(y[1:80]), float(y[81:end])
+residual_predictor = DoubleMachineLearning(x, t, y, num_neurons=5)
 residuals = CausalELM.predict_residuals(
-    residual_predictor, x_train, x_test, y_train, y_test, t_train, t_test, x_train, x_test
+    residual_predictor, x_train, x_test, y_train, y_test, t_train, t_test
 )
 
 @testset "Interrupted Time Series Estimation" begin
@@ -106,9 +84,6 @@ residuals = CausalELM.predict_residuals(
 
         # Without autocorrelation
         @test isa(its_no_ar.causal_effect, Array)
-
-        # Without regularization
-        @test isa(its_noreg.causal_effect, Array)
     end
 end
 
@@ -132,9 +107,6 @@ end
 
     @testset "G-Computation Estimation" begin
         @test isa(g_computer.causal_effect, Float64)
-
-        # Estimation without regularization
-        @test isa(gcomputer_noreg.causal_effect, Float64)
         @test isa(g_computer_binary_out.causal_effect, Float64)
 
         # Check that the estimats for ATE and ATT are different
@@ -148,37 +120,23 @@ end
         @test dm.T !== Nothing
         @test dm.Y !== Nothing
 
-        # No regularization
-        @test dm_noreg.X !== Nothing
-        @test dm_noreg.T !== Nothing
-        @test dm_noreg.Y !== Nothing
-
         # Intialized with dataframes
         @test dm_df.X !== Nothing
         @test dm_df.T !== Nothing
         @test dm_df.Y !== Nothing
     end
 
-    @testset "Double Machine Learning Estimation Helpers" begin
-        @test dm_estimate_effect.causal_effect isa Float64
-        @test size(x_fold[1], 2) == size(dm.X, 2)
-        @test size(w_fold[1], 2) == size(dm.W, 2)
-        @test y_fold isa Vector{Vector{Float64}}
-        @test t_fold isa Vector{Vector{Float64}}
-        @test length(t_fold) == dm.folds
+    @testset "Generating Residuals" begin
         @test residuals[1] isa Vector
         @test residuals[2] isa Vector
     end
 
     @testset "Double Machine Learning Post-estimation Structure" begin
         @test dm.causal_effect isa Float64
-        @test dm_binary_out.causal_effect isa Float64
-        @test dm_noreg.causal_effect isa Float64
-        @test dm_w.causal_effect isa Float64
     end
 end
 
-@testset "Summarization and Inference" begin
+@testset "Miscellaneous Tests" begin
     @testset "Quanities of Interest Errors" begin
         @test_throws ArgumentError GComputation(x, y, t, quantity_of_interest="abc")
     end
diff --git a/test/test_inference.jl b/test/test_inference.jl
index 18a73cb6..116857e4 100644
--- a/test/test_inference.jl
+++ b/test/test_inference.jl
@@ -7,27 +7,28 @@ Float64.([rand() < 0.4 for i in 1:100])
 
 g_computer = GComputation(x, t, y)
 estimate_causal_effect!(g_computer)
-g_inference = CausalELM.generate_null_distribution(g_computer, 1000)
-p1, stderr1 = CausalELM.quantities_of_interest(g_computer, 1000)
-summary1 = summarize(g_computer)
+g_inference = CausalELM.generate_null_distribution(g_computer, 10)
+p1, stderr1 = CausalELM.quantities_of_interest(g_computer, 10)
+summary1 = summarize(g_computer, n=10, inference=true)
 
 dm = DoubleMachineLearning(x, 5 * randn(100) .+ 2, y)
 estimate_causal_effect!(dm)
-dm_inference = CausalELM.generate_null_distribution(dm, 1000)
-p2, stderr2 = CausalELM.quantities_of_interest(dm, 1000)
-summary2 = summarize(dm)
+dm_inference = CausalELM.generate_null_distribution(dm, 10)
+p2, stderr2 = CausalELM.quantities_of_interest(dm, 10)
+summary2 = summarize(dm, n=10)
 
 # With a continuous treatment variable
 dm_continuous = DoubleMachineLearning(x, t, rand(1:4, 100))
 estimate_causal_effect!(dm_continuous)
-dm_continuous_inference = CausalELM.generate_null_distribution(dm_continuous, 1000)
-p3, stderr3 = CausalELM.quantities_of_interest(dm_continuous, 1000)
-summary3 = summarize(dm_continuous)
+dm_continuous_inference = CausalELM.generate_null_distribution(dm_continuous, 10)
+p3, stderr3 = CausalELM.quantities_of_interest(dm_continuous, 10)
+summary3 = summarize(dm_continuous, n=10)
 
 x₀, y₀, x₁, y₁ = rand(1:100, 100, 5), rand(100), rand(10, 5), rand(10)
 its = InterruptedTimeSeries(x₀, y₀, x₁, y₁)
 estimate_causal_effect!(its)
-summary4 = summarize(its, 10)
+summary4 = summarize(its, n=10)
+summary4_inference = summarize(its, n=10, inference=true)
 
 # Null distributions for the mean and cummulative changes
 its_inference1 = CausalELM.generate_null_distribution(its, 10, true)
@@ -36,47 +37,47 @@ p4, stderr4 = CausalELM.quantities_of_interest(its, 10, true)
 
 slearner = SLearner(x, t, y)
 estimate_causal_effect!(slearner)
-summary5 = summarize(slearner)
+summary5 = summarize(slearner, n=10)
 
 tlearner = TLearner(x, t, y)
 estimate_causal_effect!(tlearner)
-tlearner_inference = CausalELM.generate_null_distribution(tlearner, 1000)
-p6, stderr6 = CausalELM.quantities_of_interest(tlearner, 1000)
-summary6 = summarize(tlearner)
+tlearner_inference = CausalELM.generate_null_distribution(tlearner, 10)
+p6, stderr6 = CausalELM.quantities_of_interest(tlearner, 10)
+summary6 = summarize(tlearner, n=10)
 
 xlearner = XLearner(x, t, y)
 estimate_causal_effect!(xlearner)
-xlearner_inference = CausalELM.generate_null_distribution(xlearner, 1000)
-p7, stderr7 = CausalELM.quantities_of_interest(xlearner, 1000)
-summary7 = summarize(xlearner)
-summary8 = summarise(xlearner)
+xlearner_inference = CausalELM.generate_null_distribution(xlearner, 10)
+p7, stderr7 = CausalELM.quantities_of_interest(xlearner, 10)
+summary7 = summarize(xlearner, n=10)
+summary8 = summarise(xlearner, n=10)
 
 rlearner = RLearner(x, t, y)
 estimate_causal_effect!(rlearner)
-summary9 = summarize(rlearner)
+summary9 = summarize(rlearner, n=10)
 
 dr_learner = DoublyRobustLearner(x, t, y)
 estimate_causal_effect!(dr_learner)
-dr_learner_inference = CausalELM.generate_null_distribution(dr_learner, 1000)
-p8, stderr8 = CausalELM.quantities_of_interest(dr_learner, 1000)
-summary10 = summarize(dr_learner)
+dr_learner_inference = CausalELM.generate_null_distribution(dr_learner, 10)
+p8, stderr8 = CausalELM.quantities_of_interest(dr_learner, 10)
+summary10 = summarize(dr_learner, n=10)
 
 @testset "Generating Null Distributions" begin
-    @test size(g_inference, 1) === 1000
+    @test size(g_inference, 1) === 10
     @test g_inference isa Array{Float64}
-    @test size(dm_inference, 1) === 1000
+    @test size(dm_inference, 1) === 10
     @test dm_inference isa Array{Float64}
-    @test size(dm_continuous_inference, 1) === 1000
+    @test size(dm_continuous_inference, 1) === 10
     @test dm_continuous_inference isa Array{Float64}
     @test size(its_inference1, 1) === 10
     @test its_inference1 isa Array{Float64}
     @test size(its_inference2, 1) === 10
     @test its_inference2 isa Array{Float64}
-    @test size(tlearner_inference, 1) === 1000
+    @test size(tlearner_inference, 1) === 10
     @test tlearner_inference isa Array{Float64}
-    @test size(xlearner_inference, 1) === 1000
+    @test size(xlearner_inference, 1) === 10
     @test xlearner_inference isa Array{Float64}
-    @test size(dr_learner_inference, 1) === 1000
+    @test size(dr_learner_inference, 1) === 10
     @test dr_learner_inference isa Array{Float64}
 end
 
@@ -118,6 +119,10 @@ end
         @test !isnothing(v)
     end
 
+    # Interrupted Time Series with randomization inference; isnan catches every NaN payload
+    @test !isnan(summary4_inference["Standard Error"])
+    @test !isnan(summary4_inference["p-value"])
+
     # S-Learners
     for (k, v) in summary5
         @test !isnothing(v)
@@ -150,7 +155,7 @@ end
 end
 
 @testset "Error Handling" begin
-    @test_throws ErrorException summarize(InterruptedTimeSeries(x₀, y₀, x₁, y₁), 10)
+    @test_throws ErrorException summarize(InterruptedTimeSeries(x₀, y₀, x₁, y₁), n=10)
     @test_throws ErrorException summarize(GComputation(x, y, t))
     @test_throws ErrorException summarize(TLearner(x, y, t))
 end
diff --git a/test/test_metalearners.jl b/test/test_metalearners.jl
index c0bd6eba..d63fd857 100644
--- a/test/test_metalearners.jl
+++ b/test/test_metalearners.jl
@@ -5,9 +5,8 @@ using DataFrames
 include("../src/models.jl")
 
 x, t, y = rand(100, 5), Float64.([rand() < 0.4 for i in 1:100]), vec(rand(1:100, 100, 1))
-slearner1, slearner2 = SLearner(x, t, y), SLearner(x, t, y; regularized=false)
-estimate_causal_effect!(slearner1);
-estimate_causal_effect!(slearner2);
+slearner1 = SLearner(x, t, y)
+estimate_causal_effect!(slearner1)
 
 # S-learner with a binary outcome
 s_learner_binary = SLearner(x, y, t)
@@ -19,12 +18,11 @@ t_df, y_df = DataFrame(; t=rand(0:1, 100)), DataFrame(; y=rand(100))
 
 s_learner_df = SLearner(x_df, t_df, y_df)
 
-tlearner1, tlearner2 = TLearner(x, t, y), TLearner(x, t, y; regularized=false)
-estimate_causal_effect!(tlearner1);
-estimate_causal_effect!(tlearner2);
+tlearner1 = TLearner(x, t, y)
+estimate_causal_effect!(tlearner1)
 
 # T-learner initialized with DataFrames
-t_learner_df = TLearner(x_df, t_df, y_df, regularized=false)
+t_learner_df = TLearner(x_df, t_df, y_df)
 
 # Testing with a binary outcome
 t_learner_binary = TLearner(x, t, Float64.([rand() < 0.8 for i in 1:100]))
@@ -35,7 +33,7 @@ xlearner1.num_neurons = 5
 CausalELM.stage1!(xlearner1)
 stage21 = CausalELM.stage2!(xlearner1)
 
-xlearner2 = XLearner(x, t, y; regularized=false)
+xlearner2 = XLearner(x, t, y)
 xlearner2.num_neurons = 5
 CausalELM.stage1!(xlearner2);
 CausalELM.stage2!(xlearner2);
@@ -44,9 +42,6 @@ stage22 = CausalELM.stage2!(xlearner1)
 xlearner3 = XLearner(x, t, y)
 estimate_causal_effect!(xlearner3)
 
-xlearner4 = XLearner(x, t, y; regularized=true)
-estimate_causal_effect!(xlearner4)
-
 # Testing initialization with DataFrames
 x_learner_df = XLearner(x_df, t_df, y_df)
 
@@ -57,28 +52,17 @@ estimate_causal_effect!(x_learner_binary)
 rlearner = RLearner(x, t, y)
 estimate_causal_effect!(rlearner)
 
-# Testing with a W arguments
-r_learner_w = RLearner(x, t, y; W=rand(100, 4))
-estimate_causal_effect!(r_learner_w)
-
 # Testing initialization with DataFrames
 r_learner_df = RLearner(x_df, t_df, y_df)
 
 # Doubly Robust Estimation
-dr_learner = DoublyRobustLearner(x, t, y; W=rand(100, 4))
-X_T, Y = generate_folds(
-    reduce(hcat, (dr_learner.X, dr_learner.T, dr_learner.W)), dr_learner.Y, 2
-)
-X = [fl[:, 1:size(dr_learner.X, 2)] for fl in X_T]
-T = [fl[:, size(dr_learner.X, 2) + 1] for fl in X_T]
-W = [fl[:, (size(dr_learner.W, 2) + 2):end] for fl in X_T]
-τ̂ = CausalELM.doubly_robust_formula!(dr_learner, X, T, Y, reduce(hcat, (W, X)))
+dr_learner = DoublyRobustLearner(x, t, y)
+X, T, Y = CausalELM.generate_folds(
+    dr_learner.X, dr_learner.T, dr_learner.Y, dr_learner.folds
+)
+τ̂ = CausalELM.doubly_robust_formula!(dr_learner, X, T, Y)
 estimate_causal_effect!(dr_learner)
 
-# Doubly Robust Estimation with no regularization
-dr_no_reg = DoublyRobustLearner(x, t, y; W=rand(100, 4), regularized=false)
-estimate_causal_effect!(dr_no_reg)
-
 # Testing Doubly Robust Estimation with a binary outcome
 dr_learner_binary = DoublyRobustLearner(x, t, Float64.([rand() < 0.8 for i in 1:100]))
 estimate_causal_effect!(dr_learner_binary)
@@ -93,10 +77,6 @@ estimate_causal_effect!(dr_learner_df)
         @test slearner1.T isa Array{Float64}
         @test slearner1.Y isa Array{Float64}
 
-        @test slearner2.X isa Array{Float64}
-        @test slearner2.T isa Array{Float64}
-        @test slearner2.Y isa Array{Float64}
-
         @test s_learner_df.X isa Array{Float64}
         @test s_learner_df.T isa Array{Float64}
         @test s_learner_df.Y isa Array{Float64}
@@ -104,7 +84,6 @@ estimate_causal_effect!(dr_learner_df)
 
     @testset "S-Learner Estimation" begin
         @test isa(slearner1.causal_effect, Array{Float64})
-        @test isa(slearner2.causal_effect, Array{Float64})
         @test isa(s_learner_binary.causal_effect, Array{Float64})
     end
 end
@@ -114,9 +93,6 @@ end
         @test tlearner1.X !== Nothing
         @test tlearner1.T !== Nothing
         @test tlearner1.Y !== Nothing
-        @test tlearner2.X !== Nothing
-        @test tlearner2.T !== Nothing
-        @test tlearner2.Y !== Nothing
         @test t_learner_df.X !== Nothing
         @test t_learner_df.T !== Nothing
         @test t_learner_df.Y !== Nothing
@@ -124,47 +100,39 @@ end
 
     @testset "T-Learner Estimation" begin
         @test isa(tlearner1.causal_effect, Array{Float64})
-        @test isa(tlearner2.causal_effect, Array{Float64})
         @test isa(t_learner_binary.causal_effect, Array{Float64})
     end
 end
 
 @testset "X-Learners" begin
     @testset "First Stage X-Learner" begin
-        @test typeof(xlearner1.μ₀) <: CausalELM.ExtremeLearningMachine
-        @test typeof(xlearner1.μ₁) <: CausalELM.ExtremeLearningMachine
+        @test typeof(xlearner1.μ₀) <: CausalELM.ELMEnsemble
+        @test typeof(xlearner1.μ₁) <: CausalELM.ELMEnsemble
         @test xlearner1.ps isa Array{Float64}
-        @test xlearner1.μ₀.__fit === true
-        @test xlearner1.μ₁.__fit === true
-        @test typeof(xlearner2.μ₀) <: CausalELM.ExtremeLearningMachine
-        @test typeof(xlearner2.μ₁) <: CausalELM.ExtremeLearningMachine
+        @test typeof(xlearner2.μ₀) <: CausalELM.ELMEnsemble
+        @test typeof(xlearner2.μ₁) <: CausalELM.ELMEnsemble
         @test xlearner2.ps isa Array{Float64}
-        @test xlearner2.μ₀.__fit === true
-        @test xlearner2.μ₁.__fit === true
     end
 
     @testset "Second Stage X-Learner" begin
         @test length(stage21) == 2
-        @test eltype(stage21) <: CausalELM.ExtremeLearningMachine
+        @test eltype(stage21) <: CausalELM.ELMEnsemble
         @test length(stage22) == 2
-        @test eltype(stage22) <: CausalELM.ExtremeLearningMachine
+        @test eltype(stage22) <: CausalELM.ELMEnsemble
     end
 
     @testset "X-Learner Structure" begin
         @test xlearner3.X !== Nothing
         @test xlearner3.T !== Nothing
         @test xlearner3.Y !== Nothing
-        @test xlearner4.X !== Nothing
-        @test xlearner4.T !== Nothing
-        @test xlearner4.Y !== Nothing
         @test x_learner_df.X !== Nothing
         @test x_learner_df.T !== Nothing
         @test x_learner_df.Y !== Nothing
     end
 
     @testset "X-Learner Estimation" begin
-        @test typeof(xlearner3.μ₀) <: CausalELM.ExtremeLearningMachine
-        @test typeof(xlearner3.μ₁) <: CausalELM.ExtremeLearningMachine
+        @test typeof(xlearner3.μ₀) <: CausalELM.ELMEnsemble
+        @test typeof(xlearner3.μ₁) <: CausalELM.ELMEnsemble
         @test xlearner3.ps isa Array{Float64}
         @test xlearner3.causal_effect isa Array{Float64}
         @test x_learner_binary.causal_effect isa Array{Float64}
@@ -176,20 +144,16 @@ end
         @test rlearner.X isa Array{Float64}
         @test rlearner.T isa Array{Float64}
         @test rlearner.Y isa Array{Float64}
-        @test rlearner.W isa Array{Float64}
         @test r_learner_df.X isa Array{Float64}
         @test r_learner_df.T isa Array{Float64}
         @test r_learner_df.Y isa Array{Float64}
-        @test r_learner_df.W isa Array{Float64}
     end
 
     @testset "R-learner estimation" begin
         @test rlearner.causal_effect isa Vector
         @test length(rlearner.causal_effect) == length(y)
         @test eltype(rlearner.causal_effect) == Float64
-        @test r_learner_w.causal_effect isa Vector
-        @test length(r_learner_w.causal_effect) == length(y)
-        @test eltype(r_learner_w.causal_effect) == Float64
+        @test !all(isnan, rlearner.causal_effect)
     end
 end
 
@@ -212,14 +176,12 @@ end
         @test dr_learner.causal_effect isa Vector
         @test length(dr_learner.causal_effect) === length(y)
         @test eltype(dr_learner.causal_effect) == Float64
+        @test !all(isnan, dr_learner.causal_effect)
         @test dr_learner_df.causal_effect isa Vector
         @test length(dr_learner_df.causal_effect) === length(y)
         @test eltype(dr_learner_df.causal_effect) == Float64
         @test dr_learner_binary.causal_effect isa Vector
         @test length(dr_learner_binary.causal_effect) === length(y)
         @test eltype(dr_learner_binary.causal_effect) == Float64
-        @test dr_no_reg.causal_effect isa Vector
-        @test length(dr_no_reg.causal_effect) === length(y)
-        @test eltype(dr_no_reg.causal_effect) == Float64
     end
 end
diff --git a/test/test_model_validation.jl b/test/test_model_validation.jl
index f855fe35..c5823c3d 100644
--- a/test/test_model_validation.jl
+++ b/test/test_model_validation.jl
@@ -37,10 +37,6 @@ discrete_counterfactual_violations = CausalELM.simulate_counterfactual_violation
 dml = DoubleMachineLearning(x, t, y)
 estimate_causal_effect!(dml)
 
-# Create double machine learning estimator without regularization
-dml_noreg = DoubleMachineLearning(x, t, y; regularized=false)
-estimate_causal_effect!(dml_noreg)
-
 # Testing the risk ratio with a nonbinary treatment variable
 nonbinary_dml = DoubleMachineLearning(x, rand(1:3, 100), y)
 estimate_causal_effect!(nonbinary_dml)
@@ -141,7 +137,7 @@ end
         @test_throws ErrorException CausalELM.omitted_predictor(
             InterruptedTimeSeries(x₀, y₀, x₁, y₁)
         )
-        @test ovb isa Dict{String,Float64}
+        @test ovb isa Dict{String, Float64}
         @test isa.(values(ovb), Float64) == Bool[1, 1, 1, 1]
     end
 
@@ -158,7 +154,6 @@ end
         @test CausalELM.e_value(count_g_computer) isa Real
         @test CausalELM.e_value(g_computer) isa Real
         @test CausalELM.e_value(dml) isa Real
-        @test CausalELM.e_value(dml_noreg) isa Real
         @test CausalELM.e_value(t_learner) isa Real
         @test CausalELM.e_value(x_learner) isa Real
         @test CausalELM.e_value(dr_learner) isa Real
@@ -169,7 +164,7 @@ end
     @testset "Counterfactual Consistency" begin
         @test CausalELM.counterfactual_consistency(
             g_computer, (0.25, 0.5, 0.75, 1.0), 10
-        ) isa Dict{Float64,Float64}
+        ) isa Dict{String,Float64}
     end
 
     @testset "Exchangeability" begin
@@ -188,7 +183,6 @@ end
         @test size(CausalELM.positivity(count_g_computer), 2) ==
             size(count_g_computer.X, 2) + 1
         @test size(CausalELM.positivity(g_computer), 2) == size(g_computer.X, 2) + 1
-        @test size(CausalELM.positivity(dm_noreg), 2) == size(dm_noreg.X, 2) + 1
     end
 
     @testset "All Assumptions for G-computation" begin
@@ -200,7 +194,7 @@ end
 
 @testset "Double Machine Learning Assumptions" begin
     @test CausalELM.counterfactual_consistency(dml, (0.25, 0.5, 0.75, 1.0), 10) isa
-        Dict{Float64,Float64}
+        Dict{String, Float64}
     @test CausalELM.exchangeability(dml) isa Real
     @test size(CausalELM.positivity(dml), 2) == size(dml.X, 2) + 1
     @test length(validate(dml)) == 3
@@ -210,19 +204,19 @@ end
     @testset "Counterfactual Consistency" begin
         @test CausalELM.counterfactual_consistency(
             s_learner, (0.25, 0.5, 0.75, 1.0), 10
-        ) isa Dict{Float64,Float64}
+        ) isa Dict{String, Float64}
 
         @test CausalELM.counterfactual_consistency(
             t_learner, (0.25, 0.5, 0.75, 1.0), 10
-        ) isa Dict{Float64,Float64}
+        ) isa Dict{String, Float64}
 
         @test CausalELM.counterfactual_consistency(
             x_learner, (0.25, 0.5, 0.75, 1.0), 10
-        ) isa Dict{Float64,Float64}
+        ) isa Dict{String, Float64}
 
         @test CausalELM.counterfactual_consistency(
             dr_learner, (0.25, 0.5, 0.75, 1.0), 10
-        ) isa Dict{Float64,Float64}
+        ) isa Dict{String, Float64}
     end
 
     @testset "Exchangeability" begin
diff --git a/test/test_models.jl b/test/test_models.jl
index 17beb158..ce2304e8 100644
--- a/test/test_models.jl
+++ b/test/test_models.jl
@@ -9,22 +9,20 @@ x = [1.0 1.0; 0.0 1.0; 0.0 0.0; 1.0 0.0]
 y = [0.0, 1.0, 0.0, 1.0]
 x_test = [1.0 1.0; 0.0 1.0; 0.0 0.0]
 
+big_x, big_y = rand(10000, 7), rand(10000)
+
 x1 = rand(20, 5)
 y1 = rand(20)
 x1test = rand(30, 5)
 
+mock_model = ExtremeLearner(x, y, 10, σ)
+
 m1 = ExtremeLearner(x, y, 10, σ)
 f1 = fit!(m1)
 predictions1 = predict(m1, x_test)
 predict_counterfactual!(m1, x_test)
 placebo1 = placebo_test(m1)
 
-m2 = RegularizedExtremeLearner(x1, y1, 10, σ)
-f2 = fit!(m2)
-predictions2 = predict(m2, x1test)
-predict_counterfactual!(m2, x1test)
-placebo2 = placebo_test(m2)
-
 m3 = ExtremeLearner(x1, y1, 10, σ)
 fit!(m3)
 predictions3 = predict(m3, x1test)
@@ -32,58 +30,78 @@ predictions3 = predict(m3, x1test)
 m4 = ExtremeLearner(rand(100, 5), rand(100), 5, relu)
 fit!(m4)
 
-m5 = RegularizedExtremeLearner(rand(100, 5), rand(100), 5, relu)
-fit!(m5)
-
 nofit = ExtremeLearner(x1, y1, 10, σ)
-
-helper_elm = RegularizedExtremeLearner(x1, y1, 5, σ)
-set_weights_biases(helper_elm)
-k = ridge_constant(helper_elm)
-
-@testset "Model Fit" begin
-    @test length(m1.β) == 10
-    @test size(m1.weights) == (2, 10)
-    @test size(helper_elm.H) == (20, 5)
-    @test length(m4.β) == size(m4.X, 2)
-    @test length(m5.β) == size(m5.X, 2)
-end
-
-@testset "Regularization" begin
-    @test k isa Float64
-end
-
-@testset "Model Predictions" begin
-    @test predictions1[1] < 0.1
-    @test predictions1[2] > 0.9
-    @test predictions1[3] < 0.1
-
-    # Regularized case
-    @test predictions1[1] < 0.1
-    @test predictions1[2] > 0.9
-    @test predictions1[3] < 0.1
-
-    # Ensure the counterfactual attribute gets step
-    @test m1.counterfactual == predictions1
-    @test m2.counterfactual == predictions2
-
-    # Ensure we can predict with a test set with more data points than the training set
-    @test isa(predictions3, Array{Float64})
-end
-
-@testset "Placebo Test" begin
-    @test length(placebo1) == 2
-    @test length(placebo2) == 2
-end
-
-@testset "Predict Before Fit" begin
-    @test_throws ErrorException predict(nofit, x1test)
-    @test_throws ErrorException placebo_test(nofit)
+set_weights_biases(nofit)
+
+ensemble = ELMEnsemble(big_x, big_y, 10000, 100, 5, 10, relu)
+fit!(ensemble)
+predictions = predict(ensemble, big_x)
+
+@testset "Extreme Learning Machines" begin
+    @testset "Extreme Learning Machine Structure" begin
+        @test mock_model.X isa Array{Float64}
+        @test mock_model.Y isa Array{Float64}
+        @test mock_model.training_samples == size(x, 1)
+        @test mock_model.hidden_neurons == 10
+        @test mock_model.activation == σ
+        @test !mock_model.__fit
+    end
+
+    @testset "Model Fit" begin
+        @test length(m1.β) == 10
+        @test size(m1.weights) == (2, 10)
+        @test length(m4.β) == size(m4.X, 2)
+    end
+
+    @testset "Model Predictions" begin
+        @test predictions1[1] < 0.1
+        @test predictions1[2] > 0.9
+        @test predictions1[3] < 0.1
+
+        # Ensure the counterfactual attribute gets set
+        @test m1.counterfactual == predictions1
+
+        # Ensure we can predict with a test set with more data points than the training set
+        @test isa(predictions3, Array{Float64})
+    end
+
+    @testset "Placebo Test" begin
+        @test length(placebo1) == 2
+    end
+
+    @testset "Predict Before Fit" begin
+        @test isdefined(nofit, :H)
+        @test_throws ErrorException predict(nofit, x1test)
+        @test_throws ErrorException placebo_test(nofit)
+    end
+
+    @testset "Print Models" begin
+        # print should report the number of hidden neurons held by the model
+        expected = "Extreme Learning Machine with $(m1.hidden_neurons) hidden neurons"
+        @test sprint(print, m1) == expected
+    end
 end
 
-@testset "Print Models" begin
-    msg1, msg2 = "Extreme Learning Machine with ", "hidden neurons"
-    msg3 = "Regularized " * msg1
-    @test sprint(print, m1) === msg1 * string(m1.hidden_neurons) * " " * msg2
-    @test sprint(print, m2) === msg3 * string(m2.hidden_neurons) * " " * msg2
+@testset "Extreme Learning Machine Ensembles" begin
+    @testset "Initializing Ensembles" begin
+        @test ensemble isa ELMEnsemble
+        @test ensemble.X isa Array{Float64}
+        @test ensemble.Y isa Array{Float64}
+        @test ensemble.elms isa Array{ExtremeLearner}
+        @test length(ensemble.elms) == 100
+        @test ensemble.feat_indices isa Vector{Vector{Int64}}
+        @test length(ensemble.feat_indices) == 100
+    end
+    
+    @testset "Ensemble Fitting and Prediction" begin
+        @test all(learner -> learner.__fit, ensemble.elms)
+        @test predictions isa Vector{Float64}
+        @test length(predictions) == 10000
+    end
+
+    @testset "Print Models" begin
+        n_learners = length(ensemble.elms)  # print should report the ensemble size
+        expected = "Extreme Learning Machine Ensemble with $n_learners learners"
+        @test sprint(print, ensemble) == expected
+    end
 end
diff --git a/test/test_utilities.jl b/test/test_utilities.jl
index 20b8f0ac..7a63ef50 100644
--- a/test/test_utilities.jl
+++ b/test/test_utilities.jl
@@ -1,27 +1,19 @@
 using Test
-
-include("../src/utilities.jl")
-
-struct Binary end
-struct Count end
+using CausalELM
 
 # Variables for checking the output of the model_config macro because it is difficult
-model_config_avg_expr = @macroexpand @model_config average_effect
-model_config_ind_expr = @macroexpand @model_config individual_effect
-model_config_avg_idx = Int64.(collect(range(2, 26, 13)))
-model_config_ind_idx = Int64.(collect(range(2, 26, 13)))
+model_config_avg_expr = @macroexpand CausalELM.@model_config average_effect
+model_config_ind_expr = @macroexpand CausalELM.@model_config individual_effect
+model_config_avg_idx = Int64.(collect(range(2, 18, 9)))
+model_config_ind_idx = Int64.(collect(range(2, 18, 9)))
 model_config_avg_ground_truth = quote
     quantity_of_interest::String
     temporal::Bool
     task::String
-    regularized::Bool
     activation::Function
-    validation_metric::Function
-    min_neurons::Int64
-    max_neurons::Int64
-    folds::Int64
-    iterations::Int64
-    approximator_neurons::Int64
+    sample_size::Integer
+    num_machines::Integer
+    num_feats::Integer
     num_neurons::Int64
     causal_effect::Float64
 end
@@ -32,18 +24,15 @@ model_config_ind_ground_truth = quote
     task::String
     regularized::Bool
     activation::Function
-    validation_metric::Function
-    min_neurons::Int64
-    max_neurons::Int64
-    folds::Int64
-    iterations::Int64
-    approximator_neurons::Int64
+    sample_size::Integer
+    num_machines::Integer
+    num_feats::Integer
     num_neurons::Int64
     causal_effect::Array{Float64}
 end
 
 # Fields for the user supplied data
-standard_input_expr = @macroexpand @standard_input_data
+standard_input_expr = @macroexpand CausalELM.@standard_input_data
 standard_input_idx = [2, 4, 6]
 standard_input_ground_truth = quote
     X::Array{Float64}
@@ -52,7 +41,7 @@ standard_input_ground_truth = quote
 end
 
 # Fields for the user supplied data
-double_model_input_expr = @macroexpand @standard_input_data
+double_model_input_expr = @macroexpand CausalELM.@standard_input_data
 double_model_input_idx = [2, 4, 6]
 double_model_input_ground_truth = quote
     X::Array{Float64}
@@ -61,18 +50,24 @@ double_model_input_ground_truth = quote
     W::Array{Float64}
 end
 
+# Generating folds
+big_x, big_t, big_y = rand(10000, 8), rand(0:1, 10000), vec(rand(1:100, 10000, 1))
+dm = DoubleMachineLearning(big_x, big_t, big_y)
+estimate_causal_effect!(dm)
+x_fold, t_fold, y_fold = CausalELM.generate_folds(dm.X, dm.T, dm.Y, dm.folds)
+
 @testset "Moments" begin
     @test mean([1, 2, 3]) == 2
-    @test var([1, 2, 3]) == 1
+    @test CausalELM.var([1, 2, 3]) == 1
 end
 
 @testset "One Hot Encoding" begin
-    @test one_hot_encode([1, 2, 3]) == [1 0 0; 0 1 0; 0 0 1]
+    @test CausalELM.one_hot_encode([1, 2, 3]) == [1 0 0; 0 1 0; 0 0 1]
 end
 
 @testset "Clipping" begin
-    @test clip_if_binary([1.2, -0.02], Binary()) == [0.9999999, 1.0e-7]
-    @test clip_if_binary([1.2, -0.02], Count()) == [1.2, -0.02]
+    @test CausalELM.clip_if_binary([1.2, -0.02], CausalELM.Binary()) == [1.0, 0.0]
+    @test CausalELM.clip_if_binary([1.2, -0.02], CausalELM.Count()) == [1.2, -0.02]
 end
 
 @testset "Generating Fields with Macros" begin
@@ -91,7 +86,7 @@ end
         model_config_ind_ground_truth.args[model_config_avg_idx]
     )
 
-    @test_throws ArgumentError @macroexpand @model_config mean
+    @test_throws ArgumentError @macroexpand CausalELM.@model_config mean
 
     @test standard_input_expr.head == standard_input_ground_truth.head
 
@@ -107,3 +102,10 @@ end
         double_model_input_ground_truth.args[double_model_input_idx]
     )
 end
+
+@testset "Generating Folds" begin
+    @test size(x_fold[1], 2) == size(dm.X, 2)
+    @test y_fold isa Vector{Vector{Float64}}
+    @test t_fold isa Vector{Vector{Float64}}
+    @test length(t_fold) == dm.folds
+end