From eba94517472641e924e67536b72bf81de82206bc Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Tue, 31 May 2022 02:09:02 +0200 Subject: [PATCH 01/36] Add batchsize utils --- Project.toml | 4 +- src/NaiveGAflux.jl | 6 +- src/batchsize.jl | 206 +++++++++++++++++++++++++++++++++++++++++++++ src/candidate.jl | 43 +++++++++- test/batchsize.jl | 81 ++++++++++++++++++ test/candidate.jl | 26 ++++-- test/runtests.jl | 5 +- 7 files changed, 359 insertions(+), 12 deletions(-) create mode 100644 src/batchsize.jl create mode 100644 test/batchsize.jl diff --git a/Project.toml b/Project.toml index bfc76dcb..0cb4ab63 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NaiveGAflux" uuid = "81ede08e-ab29-11e9-16d3-79edd30a1d76" authors = ["DrChainsaw"] -version = "0.9.1" +version = "0.10.0" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" @@ -31,4 +31,4 @@ NaiveNASflux = "2" NaiveNASlib = "2" Reexport = "0.2.0, 1" Setfield = "0.3.4, 0.5, 0.6, 0.7, 0.8" -julia = "1" +julia = "1.7" diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index 3786edec..971fbfbb 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -3,6 +3,7 @@ module NaiveGAflux using Base: release using Reexport @reexport using NaiveNASflux +using NaiveNASlib: name using NaiveNASflux: FluxDense, FluxConv, FluxConvolutional, FluxNoParLayer, FluxParNorm, FluxRnn, FluxBatchNorm using NaiveNASflux: nograd, layertype using NaiveNASlib.Advanced, NaiveNASlib.Extend @@ -32,7 +33,7 @@ const modeldir = "models" export fitness, AbstractFitness, LogFitness, GpuFitness, AccuracyFitness, TrainThenFitness, TrainAccuracyFitness, MapFitness, EwmaFitness, TimeFitness, SizeFitness, AggFitness # Candidate -export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, FittedCandidate, model, opt, lossfun +export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, CandidateBatchSize, FittedCandidate, model, opt, lossfun, batchsize # Evolution export evolve, AbstractEvolution, NoOpEvolution, AfterEvolution, EliteSelection, SusSelection, TournamentSelection, CombinedEvolution, EvolutionChain, PairCandidates, ShuffleCandidates, EvolveCandidates @@ -43,6 +44,8 @@ export Population, generation # misc types export Probability, MutationShield, ApplyIf, RemoveIfSingleInput, PersistentArray, ShieldedOpt +export BatchSizeSelectionWithDefaultInShape, BatchSizeSelectionScaled, BatchSizeSelectionFromAlternatives, BatchSizeSelectionMaxSize + # Iterators. These should preferably come from somewhere else, but I haven't found anything which fits the bill w.r.t repeatability over subsets export RepeatPartitionIterator, SeedIterator, MapIterator, GpuIterator, BatchIterator, ShuffleIterator, TimedIterator, TimedIteratorStop, StatefulGenerationIter @@ -78,6 +81,7 @@ export PlotFitness, ScatterPop, ScatterOpt, MultiPlot, CbAll include("util.jl") include("shape.jl") +include("batchsize.jl") include("archspace.jl") include("mutation.jl") include("crossover.jl") diff --git a/src/batchsize.jl b/src/batchsize.jl new file mode 100644 index 00000000..5db7f986 --- /dev/null +++ b/src/batchsize.jl @@ -0,0 +1,206 @@ +generic_batchsizefun_docstring(fname="batchsizefun") = """ + +`$(fname)` is a function with the following signature: + +`$(fname)(model, batchsize; inshape_nobatch, availablebytes)` + +It returns the largest batch size not larger than `batchsize` which can be used for `model` without using more than `availablebytes` bytes of memory. +The type of `batchsize` may be used to e.g. 
determine if one shall account for backwards pass (if `typeof(batchsize) === TrainBatchSize`) or not (if `typeof(batchsize) == ValidationBatchSize`). + +""" + +generic_batchsizefun_testgraph() = """ +julia> v0 = conv2dinputvertex("v0", 3); + +julia> v1 = fluxvertex("v1", Conv((3,3), nout(v0) => 8), v0); + +julia> model = CompGraph(v0, v1); +""" + +generic_batchsizeselection_example(sbs, kwres...) = """ + +# availablebytes is automatically computed if omitted, but here we supply it to avoid doctest errors +julia> bs(model, TrainBatchSize(512); $(first(kwres[1]))availablebytes = 10_000_000) +$(last(kwres[1])) + +julia> bs(model, TrainBatchSize(512); $(first(kwres[2]))availablebytes = 1000_000_000) +$(last(kwres[2])) + +julia> $sbs + +julia> sbs(model, TrainBatchSize(512); $(first(kwres[3]))availablebytes = 10_000_000) +$(last(kwres[3])) + +julia> sbs(model, TrainBatchSize(512); $(first(kwres[4]))availablebytes = 1000_000_000) +$(last(kwres[4])) + +julia> bs(model, ValidationBatchSize(512); $(first(kwres[5]))availablebytes=10_000_000) +$(last(kwres[5])) +""" + +# Mostly to enable dispatch when mutating since that happens to be the only way to know what about a candidate to mutate :( +# We make use of types below as well, but that is mostly because they happen to already be there. +struct TrainBatchSize + size::Int +end +batchsize(bs::TrainBatchSize) = bs.size + +struct ValidationBatchSize + size::Int +end +batchsize(bs::ValidationBatchSize) = bs.size + + +""" + BatchSizeSelectionWithDefaultInShape{T, F} + BatchSizeSelectionWithDefaultInShape(default_inshape) + BatchSizeSelectionWithDefaultInShape(batchsizefun, default_inshape) + +Batch size selection with a default assumed inshape used for estimating valid batch sizes. + +$(generic_batchsizefun_docstring()) + +Return the result of `batchsizefun` with default value of `inshape_nobatch = default_inshape` when called as a function. + +Composable with other batch size selection types which may be used as `batchsizefun`. See examples. + +# Examples +```jldoctest +julia> using NaiveGAflux + +$(generic_batchsizefun_testgraph()) + +julia> bs = BatchSizeSelectionWithDefaultInShape((32,32,3)); + +$(generic_batchsizeselection_example( + "sbs = BatchSizeSelectionWithDefaultInShape(BatchSizeSelectionScaled(0.5), (32,32,3))", + "" => "120", + "" => "512", + "" => "60", + "" => "512", + "" => "243")) +``` +""" +struct BatchSizeSelectionWithDefaultInShape{T, F} + batchsizefun::F + default_inshape::T +end +function BatchSizeSelectionWithDefaultInShape(default_inshape) + BatchSizeSelectionWithDefaultInShape(limit_maxbatchsize, default_inshape) +end +function (bs::BatchSizeSelectionWithDefaultInShape)(args...; inshape_nobatch=bs.default_inshape ,kwargs...) + bs.batchsizefun(args...; inshape_nobatch, kwargs...) +end + + +""" + BatchSizeSelectionScaled{F} + BatchSizeSelectionScaled(scale) + BatchSizeSelectionScaled(batchsizefun, scale) + +Batch size selection with a margin applied when estimating valid batch sizes. + +$(generic_batchsizefun_docstring()) + +Return the result of `batchsizefun` with default value of `availablebytes = floor(scale * availablebytes)` when called as a function. + +Composable with other batch size selection types which may be used as `batchsizefun`. See examples. 
+ +# Examples +```jldoctest +julia> using NaiveGAflux + +$(generic_batchsizefun_testgraph()) + +julia> bs = BatchSizeSelectionScaled(0.5); + +$(generic_batchsizeselection_example( + "sbs = BatchSizeSelectionScaled(BatchSizeSelectionWithDefaultInShape((32,32,3)), 0.5);", + "inshape_nobatch=(32,32,3), " => "60", + "inshape_nobatch=(32,32,3), " => "512", + "" => "60", + "" => "512", + "inshape_nobatch=(32,32,3), " => "121")) +``` +""" +struct BatchSizeSelectionScaled{F} + batchsizefun::F + scale::Float64 +end +BatchSizeSelectionScaled(scale::AbstractFloat) = BatchSizeSelectionScaled(limit_maxbatchsize, scale) +function (bs::BatchSizeSelectionScaled)(args...; availablebytes=_availablebytes(), kwargs...) + bs.batchsizefun(args...;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...) +end + + +struct BatchSizeSelectionFromAlternatives{F, T} + batchsizefun::F + alts::T +end +BatchSizeSelectionFromAlternatives(alts) = BatchSizeSelectionFromAlternatives(limit_maxbatchsize, alts) + +function (bs::BatchSizeSelectionFromAlternatives)(args...;kwargs...) + select_bestfit_smaller(bs.batchsizefun(args...;kwargs...), bs.alts) +end + +function select_bestfit_smaller(bs::Integer, alts) + validalts = filter(<=(bs), alts) + isempty(validalts) && return nothing + argmin(x -> abs(bs - x), validalts) +end + +struct BatchSizeSelectionMaxSize{F} + batchsizefun::F + uppersize::Int +end +BatchSizeSelectionMaxSize(uppersize=1024) = BatchSizeSelectionMaxSize(limit_maxbatchsize, uppersize) +function (bs::BatchSizeSelectionMaxSize)(c, ::Any, args...; kwargs...) + bs.batchsizefun(c, bs.uppersize, args...; kwargs...) +end + +function limit_maxbatchsize(model, tbs::TrainBatchSize; inshape_nobatch, availablebytes = _availablebytes()) + min(batchsize(tbs), maxtrainbatchsize(model, inshape_nobatch, availablebytes)) +end + +function limit_maxbatchsize(model, + tbs::ValidationBatchSize; + inshape_nobatch, + availablebytes = _availablebytes() + ) + min(batchsize(tbs), maxvalidationbatchsize(model, inshape_nobatch, availablebytes)) +end + +function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes()) + paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model)) + actsize = activationsizes(model, inshape_nobatch) + return fld(availablebytes - paramsize, paramsize + 2 * actsize) +end + +function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availablebytes()) + paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model)) + actsize = activationsizes(model, inshape_nobatch) + return fld(availablebytes - paramsize, actsize) +end + +function activationsizes(model::CompGraph, inshape_nobatch, elemsize = model |> params |> first |> eltype |> sizeof) + activations = if length(inputs(model)) == 1 + Dict{AbstractVertex, Any}(v => Flux.nil_input(true, inshape_nobatch) for v in inputs(model)) + else + Dict{AbstractVertex, Any}(v => Flux.nil_input(true, inshape_nobatch)[i] for (i, v) in inputs(model)) + end + for v in outputs(model) + output!(activations, v) + end + + mapreduce(act -> length(act) * elemsize, +, values(activations)) +end + + +function _availablebytes() + if CUDA.functional() + info = CUDA.MemoryInfo() + info.free_bytes + info.pool_reserved_bytes - info.pool_used_bytes + else + Int(Sys.free_memory()) + end +end diff --git a/src/candidate.jl b/src/candidate.jl index f2710de8..bd0a49d5 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -45,8 +45,10 @@ opt(::AbstractCandidate; default=nothing) = default Return the loss function of candidate 
`c` if `c` has a lossfunction, `default` (which defaults to `nothing`) otherwise. """ lossfun(::AbstractCandidate; default=nothing) = default + fitness(::AbstractCandidate; default=nothing) = default generation(::AbstractCandidate; default=nothing) = default +batchsize(::AbstractCandidate; withgradient, default=nothing) = default wrappedcand(::T) where T <: AbstractCandidate = error("$T does not wrap any candidate! Check your base case!") @@ -72,7 +74,7 @@ opt(c::AbstractWrappingCandidate; kwargs...) = opt(wrappedcand(c); kwargs...) lossfun(c::AbstractWrappingCandidate; kwargs...) = lossfun(wrappedcand(c); kwargs...) fitness(c::AbstractWrappingCandidate; kwargs...) = fitness(wrappedcand(c); kwargs...) generation(c::AbstractWrappingCandidate; kwargs...) = generation(wrappedcand(c); kwargs...) - +batchsize(c::AbstractWrappingCandidate; kwargs...) = batchsize(wrappedcand(c); kwargs...) """ CandidateModel <: Candidate @@ -118,6 +120,45 @@ opt(c::CandidateOptModel; kwargs...) = c.opt newcand(c::CandidateOptModel, mapfield) = CandidateOptModel(mapfield(c.opt), newcand(wrappedcand(c), mapfield)) + +struct CandidateBatchSize{F, C <: AbstractCandidate} <: AbstractWrappingCandidate + tbs::TrainBatchSize + vbs::ValidationBatchSize + limitfun::F + c::C + + function CandidateBatchSize{F, C}(limitfun::F, tbs::TrainBatchSize, vbs::ValidationBatchSize, c::C) where {F, C} + new{F, C}(TrainBatchSize(limitfun(c, tbs)), ValidationBatchSize(limitfun(c, vbs)), limitfun, c) + end +end + +@functor CandidateBatchSize + +function CandidateBatchSize(limitfun, tbs::Integer, vbs::Integer, c) + CandidateBatchSize(limitfun, TrainBatchSize(tbs), ValidationBatchSize(vbs), c) +end +function CandidateBatchSize(limitfun::F, tbs::TrainBatchSize, vbs::ValidationBatchSize, c::C) where {C<:AbstractCandidate, F} + CandidateBatchSize{F, C}(limitfun, tbs, vbs, c) +end + + +function batchsize(c::CandidateBatchSize; withgradient, inshape_nobatch=nothing, default=nothing, kwargs...) + bs = withgradient ? c.tbs : c.vbs + isnothing(inshape_nobatch) ? batchsize(bs) : c.limitfun(c, bs; inshape_nobatch, kwargs...) 
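    # Note: tbs and vbs were already limited by limitfun at construction; supplying inshape_nobatch
    # re-checks the requested size against the wrapped model and the currently available memory.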
+end + +function newcand(c::CandidateBatchSize, mapfield) + CandidateBatchSize(mapfield(c.limitfun), + mapfield(c.tbs), + mapfield(c.vbs), + newcand(c.c, mapfield)) +end + +limit_maxbatchsize(c::AbstractCandidate, bs; inshape_nobatch, availablebytes = _availablebytes()) = model(c) do model + isnothing(model) && return bs + limit_maxbatchsize(model, bs; inshape_nobatch, availablebytes) +end + """ FileCandidate <: AbstractWrappingCandidate FileCandidate(c::AbstractCandidate) diff --git a/test/batchsize.jl b/test/batchsize.jl new file mode 100644 index 00000000..cabb5bf7 --- /dev/null +++ b/test/batchsize.jl @@ -0,0 +1,81 @@ +@testset "BatchSizeSelection" begin + + @testset "BatchSizeSelectionWithDefaultInShape" begin + testfun = function(x; inshape_nobatch) + return x => inshape_nobatch + end + + @test BatchSizeSelectionWithDefaultInShape(testfun, (2,3,4))(13) == (13 => (2,3,4)) + + @test BatchSizeSelectionWithDefaultInShape(testfun, (2,3,4))(13; inshape_nobatch=(3,)) == (13 => (3,)) + end + + @testset "BatchSizeSelectionScaled" begin + testfun = function(x; availablebytes=1) + return x => availablebytes + end + @test BatchSizeSelectionScaled(testfun, 0.5)(4; availablebytes=6) == (4 => 3) + end + + @testset "BatchSizeSelectionFromAlternatives" begin + bs = BatchSizeSelectionFromAlternatives(identity, [1, 3, 7]) + @test bs(0) === nothing + @test bs(1) === 1 + @test bs(2) === 1 + @test bs(3) === 3 + @test bs(4) === 3 + @test bs(5) === 3 + @test bs(6) === 3 + @test bs(7) === 7 + @test bs(8) === 7 + end + + @testset "BatchSizeSelectionMaxSize" begin + BatchSizeSelectionMaxSize(10) do x,y + @test x === 1 + @test y === 10 + end(1, 13) + end + + @testset "availablebytes" begin + # Just a smoketest so that we e.g don't crash if CUDA.functional() is false + @test NaiveGAflux._availablebytes() > 0 + end + + function testgraph(insize) + v0 = denseinputvertex("v0", insize) + v1 = fluxvertex("v1", Dense(nout(v0) => 5), v0) + v2 = fluxvertex("v2", Dense(nout(v1) => 2), v1) + v3 = concat("v3", v1, v2) + CompGraph(v0, "v4" >> v3 + v3) + end + + @testset "activationsizes" begin + graph = testgraph(3) + @test NaiveGAflux.activationsizes(graph, (3,)) == sum(nout, vertices(graph)) * 4 + end + + @testset "Max batch size" begin + import NaiveGAflux: maxtrainbatchsize, maxvalidationbatchsize + graph = testgraph(5) + + @test maxtrainbatchsize(graph, (5,), 1000) == 2 + @test maxtrainbatchsize(graph, (5,), 2000) == 4 + + @test maxvalidationbatchsize(graph, (5,), 1000) == 8 + @test maxvalidationbatchsize(graph, (5,), 2000) == 17 + end + + @testset "limit_maxbatchsize" begin + import NaiveGAflux: limit_maxbatchsize, TrainBatchSize, ValidationBatchSize + graph = testgraph(5) + + @test limit_maxbatchsize(graph, TrainBatchSize(1); inshape_nobatch=(5,), availablebytes=1000) == 1 + @test limit_maxbatchsize(graph, TrainBatchSize(2); inshape_nobatch=(5,), availablebytes=1000) == 2 + @test limit_maxbatchsize(graph, TrainBatchSize(3); inshape_nobatch=(5,), availablebytes=1000) == 2 + + @test limit_maxbatchsize(graph, ValidationBatchSize(6); inshape_nobatch=(5,), availablebytes=1000) == 6 + @test limit_maxbatchsize(graph, ValidationBatchSize(8); inshape_nobatch=(5,), availablebytes=1000) == 8 + @test limit_maxbatchsize(graph, ValidationBatchSize(10); inshape_nobatch=(5,), availablebytes=1000) == 8 + end +end \ No newline at end of file diff --git a/test/candidate.jl b/test/candidate.jl index 01df5153..830bc0f9 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -7,7 +7,8 @@ import MemPool @testset "$ctype" for (ctype, 
candfun) in ( (CandidateModel, CandidateModel), - (CandidateOptModel, g -> CandidateOptModel(Descent(0.01), g)) + (CandidateOptModel, g -> CandidateOptModel(Descent(0.01), g)), + (CandidateBatchSize, g -> CandidateBatchSize(BatchSizeSelectionWithDefaultInShape((3,)), 16, 32, CandidateModel(g))) ) @testset " $lbl" for (lbl, wrp) in ( @@ -38,15 +39,26 @@ @test NaiveGAflux.model(nvertices, newcand) == 4 @test NaiveGAflux.model(nvertices, cand) == 3 - optimizer(c) = typeof(opt(c)) + opttype(c) = typeof(opt(c)) if ctype === CandidateOptModel - @test optimizer(newcand) !== optimizer(cand) !== Nothing + @test opttype(newcand) !== opttype(cand) !== Nothing fmapped = fmap(identity, newcand) @test opt(fmapped) !== opt(newcand) - @test optimizer(fmapped) === optimizer(newcand) + @test opttype(fmapped) === opttype(newcand) else - @test optimizer(newcand) === optimizer(cand) === Nothing + @test opttype(newcand) === opttype(cand) === Nothing + end + + if ctype == CandidateBatchSize + @test batchsize(cand; withgradient=true, default=64) == 16 + @test batchsize(cand; withgradient=false, default=128) == 32 + # TODO Add mutation + @test batchsize(newcand; withgradient=true, default=64) == 16 + @test batchsize(newcand; withgradient=false, default=128) == 32 + else + @test batchsize(cand; withgradient=true, default=64) == 64 + @test batchsize(cand; withgradient=false, default=128) == 128 end teststrat() = NaiveGAflux.default_crossoverswap_strategy(v -> 1) @@ -56,8 +68,8 @@ newcand1, newcand2 = crossfun((cand, newcand)) - @test optimizer(newcand1) === optimizer(newcand) - @test optimizer(newcand2) === optimizer(cand) + @test opttype(newcand1) === opttype(newcand) + @test opttype(newcand2) === opttype(cand) @test NaiveGAflux.model(nvertices, newcand1) == 4 @test NaiveGAflux.model(nvertices, newcand2) == 3 diff --git a/test/runtests.jl b/test/runtests.jl index 8b27ff1d..034e88b8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -30,6 +30,9 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend @info "Testing shape" include("shape.jl") + @info "Testing batch size utils" + include("batchsize.jl") + @info "Testing archspace" include("archspace.jl") @@ -57,7 +60,7 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend @info "Testing visualization" include("visualization/callbacks.jl") - if VERSION === v"1.7.2" + if VERSION === v"1.7.3" @info "Testing README examples" include("examples.jl") else From 3524dc47bbfd469369a6d749da760cc2795c01cf Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Tue, 31 May 2022 16:03:19 +0200 Subject: [PATCH 02/36] Fix limit_batchsize method ambiguity Finish docs for existing batch size stuff --- src/NaiveGAflux.jl | 3 +- src/batchsize.jl | 173 +++++++++++++++++++++++++++++++++++++++++---- src/candidate.jl | 9 ++- test/batchsize.jl | 35 +++++++-- test/candidate.jl | 2 +- 5 files changed, 201 insertions(+), 21 deletions(-) diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index 971fbfbb..690ef8da 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -44,7 +44,8 @@ export Population, generation # misc types export Probability, MutationShield, ApplyIf, RemoveIfSingleInput, PersistentArray, ShieldedOpt -export BatchSizeSelectionWithDefaultInShape, BatchSizeSelectionScaled, BatchSizeSelectionFromAlternatives, BatchSizeSelectionMaxSize +# Batch size selection +export BatchSizeSelectionWithDefaultInShape, BatchSizeSelectionScaled, BatchSizeSelectionFromAlternatives, BatchSizeSelectionMaxSize, batchsizeselection # Iterators. 
These should preferably come from somewhere else, but I haven't found anything which fits the bill w.r.t repeatability over subsets export RepeatPartitionIterator, SeedIterator, MapIterator, GpuIterator, BatchIterator, ShuffleIterator, TimedIterator, TimedIteratorStop, StatefulGenerationIter diff --git a/src/batchsize.jl b/src/batchsize.jl index 5db7f986..071173ff 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -18,7 +18,6 @@ julia> model = CompGraph(v0, v1); """ generic_batchsizeselection_example(sbs, kwres...) = """ - # availablebytes is automatically computed if omitted, but here we supply it to avoid doctest errors julia> bs(model, TrainBatchSize(512); $(first(kwres[1]))availablebytes = 10_000_000) $(last(kwres[1])) @@ -60,7 +59,7 @@ Batch size selection with a default assumed inshape used for estimating valid ba $(generic_batchsizefun_docstring()) -Return the result of `batchsizefun` with default value of `inshape_nobatch = default_inshape` when called as a function. +Returns the result of `batchsizefun` with default value of `inshape_nobatch = default_inshape` when called as a function with valid inputs to `batchsizefun`. Composable with other batch size selection types which may be used as `batchsizefun`. See examples. @@ -68,6 +67,9 @@ Composable with other batch size selection types which may be used as `batchsize ```jldoctest julia> using NaiveGAflux +# These should not generally be needed in user code, but here we need them to make the examples +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize + $(generic_batchsizefun_testgraph()) julia> bs = BatchSizeSelectionWithDefaultInShape((32,32,3)); @@ -102,7 +104,7 @@ Batch size selection with a margin applied when estimating valid batch sizes. $(generic_batchsizefun_docstring()) -Return the result of `batchsizefun` with default value of `availablebytes = floor(scale * availablebytes)` when called as a function. +Returns the result of `batchsizefun` with default value of `availablebytes = floor(scale * availablebytes)` when called as a function with valid inputs to `batchsizefun`. Composable with other batch size selection types which may be used as `batchsizefun`. See examples. @@ -110,6 +112,9 @@ Composable with other batch size selection types which may be used as `batchsize ```jldoctest julia> using NaiveGAflux +# These should not generally be needed in user code, but here we need them to make the examples +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize + $(generic_batchsizefun_testgraph()) julia> bs = BatchSizeSelectionScaled(0.5); @@ -132,7 +137,39 @@ function (bs::BatchSizeSelectionScaled)(args...; availablebytes=_availablebytes( bs.batchsizefun(args...;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...) end +""" + BatchSizeSelectionFromAlternatives{F, T} + BatchSizeSelectionFromAlternatives(alts) + BatchSizeSelectionFromAlternatives(batchsizefun, alts) + +Batch size selection from a set of available alternatives. Useful for iterators which need to be pre-loaded with batch size, for example the iterators in this package. + +$(generic_batchsizefun_docstring()) + +Returns the largest number in `alts` smaller than the result of `batchsizefun` when called as a function with valid inputs to `batchsizefun`. + +Composable with other batch size selection types which may be used as `batchsizefun`. See examples. 
+# Examples +```jldoctest +julia> using NaiveGAflux + +# These should not generally be needed in user code, but here we need them to make the examples +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize + +$(generic_batchsizefun_testgraph()) + +julia> bs = BatchSizeSelectionFromAlternatives(2 .^ (0:10)); + +$(generic_batchsizeselection_example( + "sbs = BatchSizeSelectionFromAlternatives(BatchSizeSelectionScaled(0.5), 2 .^ (0:10));", + "inshape_nobatch=(32,32,3), " => "64", + "inshape_nobatch=(32,32,3), " => "512", + "inshape_nobatch=(32,32,3), " => "32", + "inshape_nobatch=(32,32,3), " => "512", + "inshape_nobatch=(32,32,3), " => "128")) +``` +""" struct BatchSizeSelectionFromAlternatives{F, T} batchsizefun::F alts::T @@ -145,29 +182,138 @@ end function select_bestfit_smaller(bs::Integer, alts) validalts = filter(<=(bs), alts) - isempty(validalts) && return nothing - argmin(x -> abs(bs - x), validalts) + isempty(validalts) && return 0 + argmin(x -> bs - x, validalts) end +""" + BatchSizeSelectionMaxSize{F} + BatchSizeSelectionMaxSize(uppersize) + BatchSizeSelectionMaxSize(batchsizefun, uppersize) + +Batch size selection which always try to select `uppersize`. Basically the strategy to select the largest batchsize which fits in memory. + +$(generic_batchsizefun_docstring()) + +Returns the result of `batchsizefun` but with the `batchsize` as `uppersize` of the same type as `batchsize` (i.e. to differentiate between train size and validation size). + +Composable with other batch size selection types which may be used as `batchsizefun`. See examples. + +# Examples +```jldoctest +julia> using NaiveGAflux + +# These should not generally be needed in user code, but here we need them to make the examples +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize + +$(generic_batchsizefun_testgraph()) + +julia> bs = BatchSizeSelectionMaxSize(1024); + +$(generic_batchsizeselection_example( + "sbs = BatchSizeSelectionMaxSize(BatchSizeSelectionScaled(0.5), 1024);", + "inshape_nobatch=(32,32,3), " => "120", + "inshape_nobatch=(32,32,3), " => "1024", + "inshape_nobatch=(32,32,3), " => "60", + "inshape_nobatch=(32,32,3), " => "1024", + "inshape_nobatch=(32,32,3), " => "243")) +``` +""" struct BatchSizeSelectionMaxSize{F} batchsizefun::F uppersize::Int end -BatchSizeSelectionMaxSize(uppersize=1024) = BatchSizeSelectionMaxSize(limit_maxbatchsize, uppersize) -function (bs::BatchSizeSelectionMaxSize)(c, ::Any, args...; kwargs...) - bs.batchsizefun(c, bs.uppersize, args...; kwargs...) +BatchSizeSelectionMaxSize(uppersize) = BatchSizeSelectionMaxSize(limit_maxbatchsize, uppersize) +function (bs::BatchSizeSelectionMaxSize)(c, orgbs, args...; kwargs...) + bs.batchsizefun(c, newbatchsize(orgbs, bs.uppersize), args...; kwargs...) +end +# For strange batch size types which can't be created from just a number +newbatchsize(::T, newsize) where T = T(newsize) + +""" + batchsizeselection(inshape_nobatch::Tuple; maxmemutil=0.7, uppersize=nothing, alternatives=nothing, batchsizefun=limit_maxbatchsize) + +Return a batch size selection callable which may be used to select an appropriate batch size when given a model and +a suggested batch size. + +`inshape_nobatch` is the size of the input without the batch dimension (e.g. 3 values for images) to be assumed. See [`BatchSizeSelectionWithDefaultInShape`](@ref) + +$(generic_batchsizefun_docstring()) + +`maxmemutil` is the maximum memory utilization which typically need to be `< 1` to account for inaccuracies in the estimation. 
See [`BatchSizeSelectionScaled`](@ref) + +If `uppersize` is not `nothing` the maximum possible batchsize smaller or equal to `uppersize` will be used. See [`BatchSizeSelectionMaxSize`](@ref) + +If `alternatives` is not nothing, the returned batchsize will be quantized to the closest matching size in `alternatives` which is not bigger than the unquantized batch size. See [`BatchSizeSelectionFromAlternatives`](@ref). + +# Examples +```jldoctest +julia> using NaiveGAflux + +# These should not generally be needed in user code, but here we need them to make the examples +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize + +$(generic_batchsizefun_testgraph()) + +julia> bs = batchsizeselection((32,32,3)); + +# availablebytes is automatically computed if omitted, but here we supply it to avoid doctest errors +julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +84 + +julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +128 + +julia> bs = batchsizeselection((32,32,3); maxmemutil=0.1); + +julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +12 + +julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +24 + +julia> bs = batchsizeselection((32,32,3); uppersize=1024); + +julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +84 + +julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +170 + +julia> bs = batchsizeselection((32,32,3); uppersize=1024, alternatives = 2 .^ (0:10)); + +julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +64 + +julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +128 +""" +function batchsizeselection(inshape_nobatch::Tuple; + batchsizefun=limit_maxbatchsize, + maxmemutil=0.7, + uppersize=nothing, + alternatives=nothing) + bs = BatchSizeSelectionWithDefaultInShape(batchsizefun, inshape_nobatch) + bs = isnothing(maxmemutil) ? bs : BatchSizeSelectionScaled(bs, maxmemutil) + bs = isnothing(uppersize) ? bs : BatchSizeSelectionMaxSize(bs, uppersize) + bs = isnothing(alternatives) ? 
bs : BatchSizeSelectionFromAlternatives(bs, alternatives) end -function limit_maxbatchsize(model, tbs::TrainBatchSize; inshape_nobatch, availablebytes = _availablebytes()) - min(batchsize(tbs), maxtrainbatchsize(model, inshape_nobatch, availablebytes)) + +# specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( +# Consider refactoring +function limit_maxbatchsize(model::CompGraph, bs::TrainBatchSize; inshape_nobatch, availablebytes = _availablebytes()) + min(batchsize(bs), maxtrainbatchsize(model, inshape_nobatch, availablebytes)) end -function limit_maxbatchsize(model, - tbs::ValidationBatchSize; +# specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( +# Consider refactoring +function limit_maxbatchsize(model::CompGraph, + bs::ValidationBatchSize; inshape_nobatch, availablebytes = _availablebytes() ) - min(batchsize(tbs), maxvalidationbatchsize(model, inshape_nobatch, availablebytes)) + min(batchsize(bs), maxvalidationbatchsize(model, inshape_nobatch, availablebytes)) end function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes()) @@ -195,7 +341,6 @@ function activationsizes(model::CompGraph, inshape_nobatch, elemsize = model |> mapreduce(act -> length(act) * elemsize, +, values(activations)) end - function _availablebytes() if CUDA.functional() info = CUDA.MemoryInfo() diff --git a/src/candidate.jl b/src/candidate.jl index bd0a49d5..81b7d0d7 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -94,7 +94,7 @@ newcand(c::CandidateModel, mapfield) = CandidateModel(map(mapfield, getproperty. """ CandidateOptModel <: AbstractCandidate - CandidateOptModel(candidate::AbstractCandidate, optimizer) + CandidateOptModel(optimizer, candidate) A candidate adding an optimizer to another candidate. The optimizer is accessed by [`opt(c)`] for `CandidateOptModel c`. """ @@ -120,7 +120,14 @@ opt(c::CandidateOptModel; kwargs...) = c.opt newcand(c::CandidateOptModel, mapfield) = CandidateOptModel(mapfield(c.opt), newcand(wrappedcand(c), mapfield)) +""" + CandidateBatchSize <: AbstractWrappingCandidate + CandidateBatchSize(limitfun, trainbatchsize, validationbatchsize, candidate) + +A candidate adding batch sizes to another candiate. `limitfun` is used to try to ensure that batch sizes are small enough so that training and validating the model does not risk an out of memory error. Use [`batchsizeselection`](@ref) to create an appropriate `limitfun`. +The batch sizes are accessed by [`batchsize(c; withgradient)`] for `CandidateBatchSize c` where `withgradient=true` gives the training batch size and `withgradient=false` gives the validation batch size. 
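
A minimal usage sketch (plain code rather than a doctest, since the selected sizes depend on the
available memory of the machine running it):

```julia
v0 = denseinputvertex("v0", 3)
graph = CompGraph(v0, fluxvertex("v1", Dense(3 => 4), v0))
cand = CandidateBatchSize(batchsizeselection((3,)), 32, 64, CandidateModel(graph))

batchsize(cand; withgradient=true)  # 32, or smaller if limitfun had to shrink it
batchsize(cand; withgradient=false) # 64, or smaller if limitfun had to shrink it
```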
+""" struct CandidateBatchSize{F, C <: AbstractCandidate} <: AbstractWrappingCandidate tbs::TrainBatchSize vbs::ValidationBatchSize diff --git a/test/batchsize.jl b/test/batchsize.jl index cabb5bf7..2a39daa7 100644 --- a/test/batchsize.jl +++ b/test/batchsize.jl @@ -18,10 +18,10 @@ end @testset "BatchSizeSelectionFromAlternatives" begin - bs = BatchSizeSelectionFromAlternatives(identity, [1, 3, 7]) - @test bs(0) === nothing - @test bs(1) === 1 - @test bs(2) === 1 + bs = BatchSizeSelectionFromAlternatives(identity, [2, 3, 7]) + @test bs(0) === 0 + @test bs(1) === 0 + @test bs(2) === 2 @test bs(3) === 3 @test bs(4) === 3 @test bs(5) === 3 @@ -78,4 +78,31 @@ @test limit_maxbatchsize(graph, ValidationBatchSize(8); inshape_nobatch=(5,), availablebytes=1000) == 8 @test limit_maxbatchsize(graph, ValidationBatchSize(10); inshape_nobatch=(5,), availablebytes=1000) == 8 end + + @testset "batchsizeselection" begin + import NaiveGAflux: limit_maxbatchsize, TrainBatchSize, ValidationBatchSize + # Pretty much the integration tests as it uses all the above components + graph = testgraph(4) + bs = batchsizeselection((4,)) + + @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 19 + @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 31 + + bs = batchsizeselection((4,); maxmemutil=0.1) + @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 2 + @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 8 + + bs = batchsizeselection((4,); uppersize=64) + @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 19 + @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 64 + + bs = batchsizeselection((4,); alternatives=2 .^ (0:10)) + @test bs(graph, TrainBatchSize(33), availablebytes=10000) == 16 + @test bs(graph, ValidationBatchSize(33), availablebytes=10000) == 32 + + bs = batchsizeselection((4,); uppersize=65, alternatives=2 .^ (0:10)) + @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 16 + @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 64 + + end end \ No newline at end of file diff --git a/test/candidate.jl b/test/candidate.jl index 830bc0f9..d6e7159e 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -8,7 +8,7 @@ @testset "$ctype" for (ctype, candfun) in ( (CandidateModel, CandidateModel), (CandidateOptModel, g -> CandidateOptModel(Descent(0.01), g)), - (CandidateBatchSize, g -> CandidateBatchSize(BatchSizeSelectionWithDefaultInShape((3,)), 16, 32, CandidateModel(g))) + (CandidateBatchSize, g -> CandidateBatchSize(batchsizeselection((3,)), 16, 32, CandidateModel(g))) ) @testset " $lbl" for (lbl, wrp) in ( From 291d28ba8c088d72cca94230c7bba46529e5051b Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Tue, 31 May 2022 22:19:52 +0200 Subject: [PATCH 03/36] Change order of members in batch size selection structs Add reference docs and doctests for batch size selection --- docs/Project.toml | 3 + docs/make.jl | 1 + docs/src/reference/batchsize.md | 9 +++ docs/src/reference/candidate.md | 1 + src/batchsize.jl | 106 ++++++++++++++------------------ src/candidate.jl | 16 ++--- test/Project.toml | 4 ++ test/batchsize.jl | 14 ++--- test/candidate.jl | 4 +- test/runtests.jl | 5 +- 10 files changed, 85 insertions(+), 78 deletions(-) create mode 100644 docs/src/reference/batchsize.md diff --git a/docs/Project.toml b/docs/Project.toml index 8f061370..e9c6bf05 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -2,3 +2,6 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 
NaiveGAflux = "81ede08e-ab29-11e9-16d3-79edd30a1d76" + +[compat] +Documenter = "0.27" diff --git a/docs/make.jl b/docs/make.jl index 94965ad3..1457ec6e 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -46,6 +46,7 @@ makedocs( sitename="NaiveGAflux", "reference/fitness.md", "reference/candidate.md", "reference/evolution.md", + "reference/batchsize.md" "reference/iterators.md", "reference/utils.md", ] diff --git a/docs/src/reference/batchsize.md b/docs/src/reference/batchsize.md new file mode 100644 index 00000000..832ec298 --- /dev/null +++ b/docs/src/reference/batchsize.md @@ -0,0 +1,9 @@ +# [Batch Size Utilities](@id BatchSizeUtilsAPI) + +```@docs +batchsizeselection +BatchSizeSelectionWithDefaultInShape +BatchSizeSelectionScaled +BatchSizeSelectionFromAlternatives +BatchSizeSelectionMaxSize +``` \ No newline at end of file diff --git a/docs/src/reference/candidate.md b/docs/src/reference/candidate.md index 2b57ae9b..9b4bad96 100644 --- a/docs/src/reference/candidate.md +++ b/docs/src/reference/candidate.md @@ -3,6 +3,7 @@ ```@docs CandidateModel CandidateOptModel +CandidateBatchSize FittedCandidate evolvemodel Population diff --git a/src/batchsize.jl b/src/batchsize.jl index 071173ff..7fa6f9cd 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -14,26 +14,25 @@ julia> v0 = conv2dinputvertex("v0", 3); julia> v1 = fluxvertex("v1", Conv((3,3), nout(v0) => 8), v0); -julia> model = CompGraph(v0, v1); +julia> graph = CompGraph(v0, v1); """ generic_batchsizeselection_example(sbs, kwres...) = """ -# availablebytes is automatically computed if omitted, but here we supply it to avoid doctest errors -julia> bs(model, TrainBatchSize(512); $(first(kwres[1]))availablebytes = 10_000_000) +julia> bs(graph, TrainBatchSize(512); $(first(kwres[1]))availablebytes = 10_000_000) # availablebytes supplied for doctest reasons $(last(kwres[1])) -julia> bs(model, TrainBatchSize(512); $(first(kwres[2]))availablebytes = 1000_000_000) +julia> bs(graph, TrainBatchSize(512); $(first(kwres[2]))availablebytes = 1000_000_000) $(last(kwres[2])) julia> $sbs -julia> sbs(model, TrainBatchSize(512); $(first(kwres[3]))availablebytes = 10_000_000) +julia> sbs(graph, TrainBatchSize(512); $(first(kwres[3]))availablebytes = 10_000_000) $(last(kwres[3])) -julia> sbs(model, TrainBatchSize(512); $(first(kwres[4]))availablebytes = 1000_000_000) +julia> sbs(graph, TrainBatchSize(512); $(first(kwres[4]))availablebytes = 1000_000_000) $(last(kwres[4])) -julia> bs(model, ValidationBatchSize(512); $(first(kwres[5]))availablebytes=10_000_000) +julia> bs(graph, ValidationBatchSize(512); $(first(kwres[5]))availablebytes=10_000_000) $(last(kwres[5])) """ @@ -53,7 +52,7 @@ batchsize(bs::ValidationBatchSize) = bs.size """ BatchSizeSelectionWithDefaultInShape{T, F} BatchSizeSelectionWithDefaultInShape(default_inshape) - BatchSizeSelectionWithDefaultInShape(batchsizefun, default_inshape) + BatchSizeSelectionWithDefaultInShape(default_inshape, batchsizefun) Batch size selection with a default assumed inshape used for estimating valid batch sizes. 
@@ -65,17 +64,15 @@ Composable with other batch size selection types which may be used as `batchsize # Examples ```jldoctest -julia> using NaiveGAflux +julia> using NaiveGAflux, Flux -# These should not generally be needed in user code, but here we need them to make the examples -julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize # Needed only for examples $(generic_batchsizefun_testgraph()) - julia> bs = BatchSizeSelectionWithDefaultInShape((32,32,3)); $(generic_batchsizeselection_example( - "sbs = BatchSizeSelectionWithDefaultInShape(BatchSizeSelectionScaled(0.5), (32,32,3))", + "sbs = BatchSizeSelectionWithDefaultInShape((32,32,3), BatchSizeSelectionScaled(0.5));", "" => "120", "" => "512", "" => "60", @@ -84,11 +81,11 @@ $(generic_batchsizeselection_example( ``` """ struct BatchSizeSelectionWithDefaultInShape{T, F} - batchsizefun::F default_inshape::T + batchsizefun::F end function BatchSizeSelectionWithDefaultInShape(default_inshape) - BatchSizeSelectionWithDefaultInShape(limit_maxbatchsize, default_inshape) + BatchSizeSelectionWithDefaultInShape(default_inshape, limit_maxbatchsize) end function (bs::BatchSizeSelectionWithDefaultInShape)(args...; inshape_nobatch=bs.default_inshape ,kwargs...) bs.batchsizefun(args...; inshape_nobatch, kwargs...) @@ -98,7 +95,7 @@ end """ BatchSizeSelectionScaled{F} BatchSizeSelectionScaled(scale) - BatchSizeSelectionScaled(batchsizefun, scale) + BatchSizeSelectionScaled(scale, batchsizefun) Batch size selection with a margin applied when estimating valid batch sizes. @@ -110,17 +107,15 @@ Composable with other batch size selection types which may be used as `batchsize # Examples ```jldoctest -julia> using NaiveGAflux +julia> using NaiveGAflux, Flux -# These should not generally be needed in user code, but here we need them to make the examples -julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize # Needed only for examples $(generic_batchsizefun_testgraph()) - julia> bs = BatchSizeSelectionScaled(0.5); $(generic_batchsizeselection_example( - "sbs = BatchSizeSelectionScaled(BatchSizeSelectionWithDefaultInShape((32,32,3)), 0.5);", + "sbs = BatchSizeSelectionScaled(0.5, BatchSizeSelectionWithDefaultInShape((32,32,3)));", "inshape_nobatch=(32,32,3), " => "60", "inshape_nobatch=(32,32,3), " => "512", "" => "60", @@ -129,18 +124,18 @@ $(generic_batchsizeselection_example( ``` """ struct BatchSizeSelectionScaled{F} - batchsizefun::F scale::Float64 + batchsizefun::F end -BatchSizeSelectionScaled(scale::AbstractFloat) = BatchSizeSelectionScaled(limit_maxbatchsize, scale) +BatchSizeSelectionScaled(scale::AbstractFloat) = BatchSizeSelectionScaled(scale, limit_maxbatchsize) function (bs::BatchSizeSelectionScaled)(args...; availablebytes=_availablebytes(), kwargs...) bs.batchsizefun(args...;availablebytes = floor(Int, bs.scale * availablebytes), kwargs...) end """ - BatchSizeSelectionFromAlternatives{F, T} + BatchSizeSelectionFromAlternatives{T, F} BatchSizeSelectionFromAlternatives(alts) - BatchSizeSelectionFromAlternatives(batchsizefun, alts) + BatchSizeSelectionFromAlternatives(alts, batchsizefun) Batch size selection from a set of available alternatives. Useful for iterators which need to be pre-loaded with batch size, for example the iterators in this package. 
@@ -152,17 +147,15 @@ Composable with other batch size selection types which may be used as `batchsize # Examples ```jldoctest -julia> using NaiveGAflux +julia> using NaiveGAflux, Flux -# These should not generally be needed in user code, but here we need them to make the examples -julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize # Needed only for examples $(generic_batchsizefun_testgraph()) - julia> bs = BatchSizeSelectionFromAlternatives(2 .^ (0:10)); $(generic_batchsizeselection_example( - "sbs = BatchSizeSelectionFromAlternatives(BatchSizeSelectionScaled(0.5), 2 .^ (0:10));", + "sbs = BatchSizeSelectionFromAlternatives(2 .^ (0:10), BatchSizeSelectionScaled(0.5));", "inshape_nobatch=(32,32,3), " => "64", "inshape_nobatch=(32,32,3), " => "512", "inshape_nobatch=(32,32,3), " => "32", @@ -170,11 +163,11 @@ $(generic_batchsizeselection_example( "inshape_nobatch=(32,32,3), " => "128")) ``` """ -struct BatchSizeSelectionFromAlternatives{F, T} - batchsizefun::F +struct BatchSizeSelectionFromAlternatives{T, F} alts::T + batchsizefun::F end -BatchSizeSelectionFromAlternatives(alts) = BatchSizeSelectionFromAlternatives(limit_maxbatchsize, alts) +BatchSizeSelectionFromAlternatives(alts) = BatchSizeSelectionFromAlternatives(alts, limit_maxbatchsize) function (bs::BatchSizeSelectionFromAlternatives)(args...;kwargs...) select_bestfit_smaller(bs.batchsizefun(args...;kwargs...), bs.alts) @@ -189,7 +182,7 @@ end """ BatchSizeSelectionMaxSize{F} BatchSizeSelectionMaxSize(uppersize) - BatchSizeSelectionMaxSize(batchsizefun, uppersize) + BatchSizeSelectionMaxSize(uppersize, batchsizefun) Batch size selection which always try to select `uppersize`. Basically the strategy to select the largest batchsize which fits in memory. @@ -201,17 +194,15 @@ Composable with other batch size selection types which may be used as `batchsize # Examples ```jldoctest -julia> using NaiveGAflux +julia> using NaiveGAflux, Flux -# These should not generally be needed in user code, but here we need them to make the examples -julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize # Needed only for examples $(generic_batchsizefun_testgraph()) - julia> bs = BatchSizeSelectionMaxSize(1024); $(generic_batchsizeselection_example( - "sbs = BatchSizeSelectionMaxSize(BatchSizeSelectionScaled(0.5), 1024);", + "sbs = BatchSizeSelectionMaxSize(1024, BatchSizeSelectionScaled(0.5));", "inshape_nobatch=(32,32,3), " => "120", "inshape_nobatch=(32,32,3), " => "1024", "inshape_nobatch=(32,32,3), " => "60", @@ -220,10 +211,10 @@ $(generic_batchsizeselection_example( ``` """ struct BatchSizeSelectionMaxSize{F} - batchsizefun::F uppersize::Int + batchsizefun::F end -BatchSizeSelectionMaxSize(uppersize) = BatchSizeSelectionMaxSize(limit_maxbatchsize, uppersize) +BatchSizeSelectionMaxSize(uppersize) = BatchSizeSelectionMaxSize(uppersize, limit_maxbatchsize) function (bs::BatchSizeSelectionMaxSize)(c, orgbs, args...; kwargs...) bs.batchsizefun(c, newbatchsize(orgbs, bs.uppersize), args...; kwargs...) 
end @@ -248,58 +239,55 @@ If `alternatives` is not nothing, the returned batchsize will be quantized to th # Examples ```jldoctest -julia> using NaiveGAflux +julia> using NaiveGAflux, Flux -# These should not generally be needed in user code, but here we need them to make the examples -julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize +julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize # Needed only for examples $(generic_batchsizefun_testgraph()) - julia> bs = batchsizeselection((32,32,3)); -# availablebytes is automatically computed if omitted, but here we supply it to avoid doctest errors -julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) # availablebytes supplied for doctest reasons 84 -julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) 128 julia> bs = batchsizeselection((32,32,3); maxmemutil=0.1); -julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) 12 -julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) 24 julia> bs = batchsizeselection((32,32,3); uppersize=1024); -julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) 84 -julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) 170 julia> bs = batchsizeselection((32,32,3); uppersize=1024, alternatives = 2 .^ (0:10)); -julia> bs(model, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) 64 -julia> bs(model, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) 128 +``` """ function batchsizeselection(inshape_nobatch::Tuple; batchsizefun=limit_maxbatchsize, maxmemutil=0.7, uppersize=nothing, alternatives=nothing) - bs = BatchSizeSelectionWithDefaultInShape(batchsizefun, inshape_nobatch) - bs = isnothing(maxmemutil) ? bs : BatchSizeSelectionScaled(bs, maxmemutil) - bs = isnothing(uppersize) ? bs : BatchSizeSelectionMaxSize(bs, uppersize) - bs = isnothing(alternatives) ? bs : BatchSizeSelectionFromAlternatives(bs, alternatives) + bs = BatchSizeSelectionWithDefaultInShape(inshape_nobatch, batchsizefun) + bs = isnothing(maxmemutil) ? bs : BatchSizeSelectionScaled(maxmemutil, bs) + bs = isnothing(uppersize) ? bs : BatchSizeSelectionMaxSize(uppersize, bs) + bs = isnothing(alternatives) ? 
bs : BatchSizeSelectionFromAlternatives(alternatives, bs) end - # specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( # Consider refactoring function limit_maxbatchsize(model::CompGraph, bs::TrainBatchSize; inshape_nobatch, availablebytes = _availablebytes()) diff --git a/src/candidate.jl b/src/candidate.jl index 81b7d0d7..5886652e 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -134,18 +134,18 @@ struct CandidateBatchSize{F, C <: AbstractCandidate} <: AbstractWrappingCandidat limitfun::F c::C - function CandidateBatchSize{F, C}(limitfun::F, tbs::TrainBatchSize, vbs::ValidationBatchSize, c::C) where {F, C} + function CandidateBatchSize{F, C}(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, c::C) where {F, C} new{F, C}(TrainBatchSize(limitfun(c, tbs)), ValidationBatchSize(limitfun(c, vbs)), limitfun, c) end end @functor CandidateBatchSize -function CandidateBatchSize(limitfun, tbs::Integer, vbs::Integer, c) - CandidateBatchSize(limitfun, TrainBatchSize(tbs), ValidationBatchSize(vbs), c) +function CandidateBatchSize(tbs::Integer, vbs::Integer, limitfun, c) + CandidateBatchSize(TrainBatchSize(tbs), ValidationBatchSize(vbs), limitfun, c) end -function CandidateBatchSize(limitfun::F, tbs::TrainBatchSize, vbs::ValidationBatchSize, c::C) where {C<:AbstractCandidate, F} - CandidateBatchSize{F, C}(limitfun, tbs, vbs, c) +function CandidateBatchSize(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, c::C) where {C<:AbstractCandidate, F} + CandidateBatchSize{F, C}(tbs, vbs, limitfun, c) end @@ -155,9 +155,9 @@ function batchsize(c::CandidateBatchSize; withgradient, inshape_nobatch=nothing, end function newcand(c::CandidateBatchSize, mapfield) - CandidateBatchSize(mapfield(c.limitfun), - mapfield(c.tbs), - mapfield(c.vbs), + CandidateBatchSize(mapfield(c.tbs), + mapfield(c.vbs), + mapfield(c.limitfun), newcand(c.c, mapfield)) end diff --git a/test/Project.toml b/test/Project.toml index ba739221..b4e261c6 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" @@ -12,3 +13,6 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[compat] +Documenter = "0.27" diff --git a/test/batchsize.jl b/test/batchsize.jl index 2a39daa7..aa3f9f1c 100644 --- a/test/batchsize.jl +++ b/test/batchsize.jl @@ -5,20 +5,20 @@ return x => inshape_nobatch end - @test BatchSizeSelectionWithDefaultInShape(testfun, (2,3,4))(13) == (13 => (2,3,4)) + @test BatchSizeSelectionWithDefaultInShape((2,3,4), testfun)(13) == (13 => (2,3,4)) - @test BatchSizeSelectionWithDefaultInShape(testfun, (2,3,4))(13; inshape_nobatch=(3,)) == (13 => (3,)) + @test BatchSizeSelectionWithDefaultInShape((2,3,4), testfun)(13; inshape_nobatch=(3,)) == (13 => (3,)) end @testset "BatchSizeSelectionScaled" begin testfun = function(x; availablebytes=1) return x => availablebytes end - @test BatchSizeSelectionScaled(testfun, 0.5)(4; availablebytes=6) == (4 => 3) + @test BatchSizeSelectionScaled(0.5, testfun)(4; availablebytes=6) == (4 => 3) end @testset "BatchSizeSelectionFromAlternatives" begin - bs = BatchSizeSelectionFromAlternatives(identity, [2, 3, 7]) + bs = BatchSizeSelectionFromAlternatives([2, 3, 7], 
identity) @test bs(0) === 0 @test bs(1) === 0 @test bs(2) === 2 @@ -31,10 +31,8 @@ end @testset "BatchSizeSelectionMaxSize" begin - BatchSizeSelectionMaxSize(10) do x,y - @test x === 1 - @test y === 10 - end(1, 13) + + BatchSizeSelectionMaxSize(10, Pair)(1, 13) == 10 => 13 end @testset "availablebytes" begin diff --git a/test/candidate.jl b/test/candidate.jl index d6e7159e..52f01d90 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -8,7 +8,7 @@ @testset "$ctype" for (ctype, candfun) in ( (CandidateModel, CandidateModel), (CandidateOptModel, g -> CandidateOptModel(Descent(0.01), g)), - (CandidateBatchSize, g -> CandidateBatchSize(batchsizeselection((3,)), 16, 32, CandidateModel(g))) + (CandidateBatchSize, g -> CandidateBatchSize(16, 32, batchsizeselection((3,)), CandidateModel(g))) ) @testset " $lbl" for (lbl, wrp) in ( @@ -141,7 +141,7 @@ @testset "Hold in mem" begin import NaiveGAflux: wrappedcand, callcand, candinmem struct BoolCand <: AbstractCandidate - x::Ref{Bool} + x::Base.RefValue{Bool} end testref(c::BoolCand, f=identity) = f(c.x) testref(c::AbstractWrappingCandidate) = testref(wrappedcand(c)) diff --git a/test/runtests.jl b/test/runtests.jl index 034e88b8..714b363d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -64,9 +64,12 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend @info "Testing README examples" include("examples.jl") else - @warn "README examples will only be tested in julia version 1.7.2 due to rng dependency. Skipping..." + @warn "README examples will only be tested in julia version 1.7.3 due to rng dependency. Skipping..." end @info "Testing AutoFlux" include("app/autoflux.jl") + + import Documenter + Documenter.doctest(NaiveGAflux) end From 38930e9c661e182ce50cd82d353fa3627348e487 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 1 Jun 2022 00:59:33 +0200 Subject: [PATCH 04/36] Add missing comma in array --- docs/make.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index 1457ec6e..c8bdb2bb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -46,7 +46,7 @@ makedocs( sitename="NaiveGAflux", "reference/fitness.md", "reference/candidate.md", "reference/evolution.md", - "reference/batchsize.md" + "reference/batchsize.md", "reference/iterators.md", "reference/utils.md", ] From 7c68e598ec2e6abbb748c55b9e360c16e1d36e47 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Thu, 2 Jun 2022 00:02:16 +0200 Subject: [PATCH 05/36] Add ReBatchingIterator --- docs/Project.toml | 1 + src/iterators.jl | 147 ++++++++++++++++++++++++++++++++++++++------ src/util.jl | 2 +- test/iterators.jl | 151 ++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 269 insertions(+), 32 deletions(-) diff --git a/docs/Project.toml b/docs/Project.toml index e9c6bf05..569dec2a 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,6 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" NaiveGAflux = "81ede08e-ab29-11e9-16d3-79edd30a1d76" diff --git a/src/iterators.jl b/src/iterators.jl index 21eefee6..387e0521 100644 --- a/src/iterators.jl +++ b/src/iterators.jl @@ -25,8 +25,8 @@ end ``` """ -struct RepeatPartitionIterator{T, VS} - base::Iterators.Stateful{T, VS} +struct RepeatPartitionIterator{I <: Iterators.Stateful} + base::I ntake::Int end RepeatPartitionIterator(base, nrep) = RepeatPartitionIterator(Iterators.Stateful(base), nrep) @@ -41,16 +41,16 @@ function Base.iterate(itr::RepeatPartitionIterator, reset=true) end 
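
# Parameterizing on the concrete Stateful iterator type means eltype and IteratorSize can be
# derived from the wrapper type alone, which the ::Type methods below make use of.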
Base.length(itr::RepeatPartitionIterator) = cld(length(itr.base), itr.ntake) -Base.eltype(itr::RepeatPartitionIterator) = eltype(itr.base) -Base.size(itr::RepeatPartitionIterator) = size(itr.base.itr) +Base.eltype(::Type{RepeatPartitionIterator{I}}) where {I} = eltype(I) +Base.size(itr::RepeatPartitionIterator) = tuple(length(itr)) -Base.IteratorSize(itr::RepeatPartitionIterator) = Base.IteratorSize(itr.base.itr) -Base.IteratorEltype(itr::RepeatPartitionIterator) = Base.IteratorEltype(itr.base.itr) +Base.IteratorSize(::Type{RepeatPartitionIterator{I}}) where {I} = Base.IteratorSize(I) +Base.IteratorEltype(::Type{RepeatPartitionIterator{I}}) where {I} = Base.IteratorEltype(I) -struct RepeatStatefulIterator{T, VS} - base::Iterators.Stateful{T, VS} +struct RepeatStatefulIterator{I <: Iterators.Stateful, VS} + base::I start::VS taken::Int end @@ -66,21 +66,21 @@ function Base.iterate(itr::RepeatStatefulIterator, reset=true) end Base.length(itr::RepeatStatefulIterator) = length(itr.base.itr) - itr.taken -Base.eltype(itr::RepeatStatefulIterator) = eltype(itr.base) +Base.eltype(::Type{RepeatStatefulIterator{I,VS}}) where {I,VS} = eltype(I) Base.size(itr::RepeatStatefulIterator) = size(itr.base.itr) -Base.IteratorSize(::Type{RepeatStatefulIterator{T,VS}}) where {T, VS} = Base.IteratorSize(Iterators.Stateful{T,VS}) -Base.IteratorEltype(::Type{RepeatStatefulIterator{T,VS}}) where {T, VS} = Base.IteratorEltype(Iterators.Stateful{T,VS}) +Base.IteratorSize(::Type{RepeatStatefulIterator{I,VS}}) where {I,VS} = Base.IteratorSize(I) +Base.IteratorEltype(::Type{RepeatStatefulIterator{I,VS}}) where {I,VS} = Base.IteratorEltype(I) """ StatefulGenerationIter{T, VS} Uses a `RepeatPartitionIterator` to ensure that the same iterator is returned for the same generation number. """ -struct StatefulGenerationIter{I, T, VS} - currgen::Ref{Int} - curriter::Ref{I} - iter::RepeatPartitionIterator{T, VS} +struct StatefulGenerationIter{I, R} + currgen::Base.RefValue{Int} + curriter::Base.RefValue{I} + iter::RepeatPartitionIterator{R} end # TODO : This is a bit of cludge-on-cludge. Try to refactor someday to a more straighforward design, perhaps use LearnBase.getobs StatefulGenerationIter(iter::RepeatPartitionIterator, gen=0) = StatefulGenerationIter(Ref(gen), Ref(first(iterate(iter))), iter) @@ -108,7 +108,7 @@ Calls `Random.seed!(rng, seed)` every iteration so that wrapped iterators which Useful in conjunction with [`RepeatPartitionIterator`](@ref) and [`BatchIterator`](@ref) and/or random data augmentation so that all candidates in a generation are trained with identical data. """ -struct SeedIterator{R <: AbstractRNG,T} +struct SeedIterator{R <: AbstractRNG, T} rng::R seed::UInt32 base::T @@ -129,7 +129,7 @@ function Base.iterate(itr::SeedIterator, state) end Base.length(itr::SeedIterator) = length(itr.base) -Base.eltype(itr::SeedIterator) = eltype(itr.base) +Base.eltype(::Type{SeedIterator{R,T}}) where {R,T} = eltype(T) Base.size(itr::SeedIterator) = size(itr.base) Base.IteratorSize(::Type{SeedIterator{R, T}}) where {R,T}= Base.IteratorSize(T) @@ -140,7 +140,7 @@ Base.IteratorEltype(::Type{SeedIterator{R, T}}) where {R,T} = Base.IteratorEltyp Return an iterator which sends values from `itr` to the GPU. 
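
A small usage sketch (assuming `Flux.gpu` is a no-op when no functional GPU is present):

```julia
itr = GpuIterator(BatchIterator((randn(Float32, 3, 8), rand(0:9, 8)), 4))
x, y = first(itr) # first batch of features and labels, moved to the GPU if one is available
```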
""" -GpuIterator(itr) = Iterators.map(gpuitr, itr) +GpuIterator(itr) = Iterators.map(gpuitr, itr) # Iterator.map can't infer eltypes, but we can't either as we don't know for sure what Flux.gpu will do gpuitr(a) = Flux.gpu(a) gpuitr(a::SubArray) = gpuitr(collect(a)) gpuitr(a::Tuple) = gpuitr.(a) @@ -185,9 +185,14 @@ function Base.iterate(itr::BatchIterator, inds = shuffle(itr.rng, 1:itr.nobs)) end Base.length(itr::BatchIterator) = cld(itr.nobs, itr.batchsize) +Base.eltype(::Type{BatchIterator{R,D}}) where {R,D} = D +Base.eltype(::Type{BatchIterator{R,Singleton{D}}}) where {R,D} = D +Base.eltype(::Type{BatchIterator{R, D}}) where {R <: AbstractRNG, D <: AbstractRange} = Array{eltype(D), ndims(D)} Base.size(itr::BatchIterator) = tuple(length(itr)) -Base.IteratorEltype(::Type{BatchIterator{R,D}}) where {R, D} = Base.IteratorEltype(D) +Base.IteratorEltype(::Type{BatchIterator{R,D}}) where {R,D} = Base.IteratorEltype(D) +Base.IteratorEltype(::Type{BatchIterator{R,Singleton{D}}}) where {R,D} = Base.IteratorEltype(D) + batch(s::Singleton, inds) = batch(val(s), inds) batch(b::Tuple, inds) = batch.(b, Ref(inds)) @@ -227,6 +232,7 @@ struct TimedIteratorStop end Base.length(itr::TimedIterator) = length(itr.base) +Base.eltype(::Type{TimedIterator{F,A,I}}) where {F,A,I} = eltype(I) Base.size(itr::TimedIterator) = size(itr.base) Base.IteratorSize(::Type{TimedIterator{F,A,I}}) where {F,A,I} = Base.IteratorSize(I) @@ -259,4 +265,105 @@ function Base.iterate(itr::TimedIterator, (tstamp, ntimeout, bstate)::Tuple) end # itergeneration on timeoutaction just in case user e.g. wants to deepcopy it to avoid some shared state. -itergeneration(itr::TimedIterator, gen) = TimedIterator(itr.timelimit, itr.patience, itergeneration(itr.timeoutaction, gen), itr.accumulate_timeouts, itergeneration(itr.base, gen)) \ No newline at end of file +itergeneration(itr::TimedIterator, gen) = TimedIterator(itr.timelimit, itr.patience, itergeneration(itr.timeoutaction, gen), itr.accumulate_timeouts, itergeneration(itr.base, gen)) + +""" + ReBatchingIterator{I} + ReBatchingIterator(base, batchsize) + +Return and iterator which iterates `batchsize` samples from `base` where `base` is in itself assumed to provide batches of another batchsize. + +Reason for this convoluted construct is to provide a way to use different batch sizes for different models while still allowing all models to see the same samples (including data augmentation) in the same order. As we don't want to make assumption about what `base` is, this iterator is used by default. + +Implement `setbatchsize(itr::T, batchsize::Int)` for iterator types `T` where it is possible to set the batch size (or create a new iterator). 
+# Examples +```jldoctest +julia> using NaiveGAflux + +julia> itr = ReBatchingIterator(BatchIterator(1:20, 8), 4); # Batch size 8 rebatched to 4 + +julia> collect(itr) +5-element Vector{Array{Int64}}: + [1, 2, 3, 4] + [5, 6, 7, 8] + [9, 10, 11, 12] + [13, 14, 15, 16] + [17, 18, 19, 20] + +julia> itr = ReBatchingIterator(BatchIterator((1:20, 21:40), 4), 8); # Batch size 4 rebatched to 8 + +julia> map(x -> Pair(x...), itr) # Pair to make results a bit easier on the eyes +3-element Vector{Pair{SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}, SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}}}: + [1, 2, 3, 4, 5, 6, 7, 8] => [21, 22, 23, 24, 25, 26, 27, 28] + [9, 10, 11, 12, 13, 14, 15, 16] => [29, 30, 31, 32, 33, 34, 35, 36] + [17, 18, 19, 20] => [37, 38, 39, 40] +""" +struct ReBatchingIterator{I} + batchsize::Int + base::I +end +ReBatchingIterator(base, batchsize::Int) = ReBatchingIterator(batchsize, base) + +""" + setbatchsize(itr, batchsize) + +Return an iterator which iterates over the same data in the same order as `itr` with batch size `batchsize`. + +Defaults to [`ReBatchingIterator`](@ref) for iterators which don't have a specialized method. +""" +setbatchsize(itr, batchsize) = ReBatchingIterator(itr, batchsize) + +Base.eltype(::Type{ReBatchingIterator{I}}) where I = _rangetoarr(eltype(I)) + +# We can only know the size if the underlying iterator does not produce partial batches (i.e smaller than the batch size) +Base.IteratorSize(::Type{ReBatchingIterator{I}}) where I = Base.SizeUnknown() +Base.IteratorEltype(::Type{ReBatchingIterator{I}}) where I = Base.IteratorEltype(I) + +_rangetoarr(a) = a +_rangetoarr(::Tuple{T1, T2}) where {T1, T2} = Tuple{_rangetoarr(T1), _rangetoarr(T2)} +_rangetoarr(t::Type{<:Tuple}) = Tuple{_rangetoarr.(t.parameters)...} +_rangetoarr(a::Type{<:Array}) = a +_rangetoarr(a::Type{<:CUDA.CuArray}) = a +_rangetoarr(::Type{<:AbstractArray{T,N}}) where {T,N} = Array{T,N} + +function Base.iterate(itr::ReBatchingIterator) + innerval, innerstate = IterTools.@ifsomething iterate(itr.base) + innerval, innerstate = _concat_inner(itr.base, itr.batchsize, _collectbatch(innerval), innerstate) + bitr = BatchIterator(innerval, itr.batchsize) + outerval, outerstate = IterTools.@ifsomething iterate(bitr) + return outerval, (bitr, outerstate, innerstate) +end + + +function Base.iterate(itr::ReBatchingIterator, (bitr, outerstate, innerstate)) + outervalstate = iterate(bitr, outerstate) + if outervalstate === nothing + innerval, innerstate = IterTools.@ifsomething iterate(itr.base, innerstate) + innerval, innerstate = _concat_inner(itr.base, itr.batchsize, _collectbatch(innerval), innerstate) + bitr = BatchIterator(innerval, itr.batchsize) + outerval, outerstate = IterTools.@ifsomething iterate(bitr) + return outerval, (bitr, outerstate, innerstate) + end + outerval, outerstate = outervalstate + return outerval, (bitr, outerstate, innerstate) +end + +function _concat_inner(inneritr, batchsize, innerval, innerstate) + while _innerbatchsize(innerval) < batchsize + innervalstate = iterate(inneritr, innerstate) + innervalstate === nothing && break + innerval, innerstate = _catbatch(innerval, first(innervalstate)), last(innervalstate) + end + return innerval, innerstate +end + +_catbatch(b1::Tuple, b2::Tuple) = _catbatch.(b1, b2) +_catbatch(b1::AbstractArray{T, N}, b2::AbstractArray{T, N}) where {T,N} = cat(b1, b2; dims=ndims(b1)) +_catbatch(::T1, ::T2) where {T1, T2}= throw(DimensionMismatch("Tried to cat incompatible types when rebatching: $T1 vs $T2")) 
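The `setbatchsize` extension point described above is perhaps easiest to see with a small sketch. The iterator type below is hypothetical (it is not part of NaiveGAflux or this patch); it only illustrates how an iterator which controls its own batch size can bypass the generic `ReBatchingIterator` fallback:

```julia
using NaiveGAflux

# Hypothetical batching iterator; stands in for any iterator type that can
# simply be rebuilt with a new batch size.
struct FixedBatchIterator{T}
    data::T
    batchsize::Int
end

Base.length(itr::FixedBatchIterator) = cld(length(itr.data), itr.batchsize)

function Base.iterate(itr::FixedBatchIterator, start=1)
    start > length(itr.data) && return nothing
    stop = min(start + itr.batchsize - 1, length(itr.data))
    return view(itr.data, start:stop), stop + 1
end

# Overloading setbatchsize means setbatchsize(FixedBatchIterator(1:20, 8), 4) returns
# a new FixedBatchIterator instead of wrapping the old one in a ReBatchingIterator.
NaiveGAflux.setbatchsize(itr::FixedBatchIterator, batchsize::Int) =
    FixedBatchIterator(itr.data, batchsize)
```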
+ +_collectbatch(b::Tuple) = _collectbatch.(b) +_collectbatch(b::AbstractRange) = collect(b) +_collectbatch(b) = b + +_innerbatchsize(t::Tuple) = _innerbatchsize(first(t)) +_innerbatchsize(a::AbstractArray) = size(a, ndims(a)) diff --git a/src/util.jl b/src/util.jl index 62337dd6..4cb8d049 100644 --- a/src/util.jl +++ b/src/util.jl @@ -202,7 +202,7 @@ julia> extrema(cumsum([brw() for i in 1:10000])) struct BoundedRandomWalk{T <: Real, R <: Function} lb::T ub::T - state::Ref{T} + state::Base.RefValue{T} rfun::R end BoundedRandomWalk(lb::T,ub::T, rfun = (x...) -> 0.2randn(rng_default)) where T = BoundedRandomWalk(lb,ub, Ref(zero(ub)), rfun) diff --git a/test/iterators.jl b/test/iterators.jl index 8513b404..73d98769 100644 --- a/test/iterators.jl +++ b/test/iterators.jl @@ -4,10 +4,17 @@ bitr = RepeatPartitionIterator(1:20, 5) - for (itr, exp) in zip(bitr, [1:5, 6:10, 11:15, 16:20]) - @test collect(itr) == exp + @test length(bitr) == 4 + @test size(bitr) == (4,) + @test eltype(bitr) == Int + + @testset "Iteration $i" for (i, (itr, exp)) in enumerate(zip(bitr, [1:5, 6:10, 11:15, 16:20])) @test collect(itr) == exp @test collect(itr) == exp + @test collect(itr) == exp + + @test length(itr) == 5 + @test eltype(itr) == Int end end @@ -15,7 +22,7 @@ bitr = RepeatPartitionIterator(Iterators.partition(1:20, 5), 2) - for (itr, exp) in zip(bitr, [[1:5, 6:10], [11:15, 16:20]]) + @testset "Iteration $i" for (i, (itr, exp)) in enumerate(zip(bitr, [[1:5, 6:10], [11:15, 16:20]])) @test collect(itr) == exp @test collect(itr) == exp @test collect(itr) == exp @@ -26,17 +33,21 @@ import IterTools: ncycle bitr = RepeatPartitionIterator(ncycle(Iterators.partition(1:20, 5), 3), 2) - cnt = 0; - for (itr, exp) in zip(bitr, [[1:5, 6:10], [11:15, 16:20],[1:5, 6:10], [11:15, 16:20],[1:5, 6:10], [11:15, 16:20]]) + @testset "Iteration $i" for (i, (itr, exp)) in enumerate(zip(bitr, [[1:5, 6:10], [11:15, 16:20],[1:5, 6:10], [11:15, 16:20],[1:5, 6:10], [11:15, 16:20]])) @test collect(itr) == exp @test collect(itr) == exp @test collect(itr) == exp - cnt += 1 end end end @testset "SeedIterator" begin + basicitr = SeedIterator(1:10) + + @test length(basicitr) == 10 + @test size(basicitr) == (10,) + @test eltype(basicitr) == Int + rng = MersenneTwister(123) testitr = SeedIterator(Iterators.map(x -> x * rand(rng, Int), ones(10)); rng=rng, seed=12) @test collect(testitr) == collect(testitr) @@ -59,7 +70,12 @@ end @testset "Single array" begin itr = BatchIterator(collect(reshape(1:2*3*4*5,2,3,4,5)), 2) - for (i, batch) in enumerate(itr) + + @test length(itr) == 3 + @test size(itr) == (3,) + @test eltype(itr) == Array{Int, 4} + + @testset "Iteration $i" for (i, batch) in enumerate(itr) @test size(batch) == (2,3,4,i==3 ? 1 : 2) end @@ -68,7 +84,12 @@ end @testset "Tuple data shuffle=$shuffle" for shuffle in (true, false) itr = BatchIterator((collect([1:10 21:30]'), 110:10:200), 3; shuffle) - for (i, (x, y)) in enumerate(itr) + + @test length(itr) == 4 + @test size(itr) == (4,) + @test eltype(itr) == Tuple{Matrix{Int64}, StepRange{Int64, Int64}} + + @testset "Iteration $i" for (i, (x, y)) in enumerate(itr) expsize = i == 4 ? 
1 : 3 @test size(x) == (2, expsize) @test size(y) == (expsize,) @@ -76,10 +97,16 @@ end end @testset "BatchIterator singleton" begin + import NaiveGAflux: Singleton itr = BatchIterator(Singleton([1,3,5,7,9,11]), 2) - for (i, b) in enumerate(itr) + + @test eltype(itr) == Vector{Int} + + @testset "Iteration $i" for (i, b) in enumerate(itr) @test b == [1,3] .+ 4(i-1) end + + @test collect(itr) == [[1,3],[5,7],[9,11]] end @testset "BatchIterator shuffle basic" begin @@ -93,7 +120,7 @@ end sitr = BatchIterator(collect(reshape(1:prod(dims),dims...)), 2;shuffle=MersenneTwister(12)) bitr = BatchIterator(collect(reshape(1:prod(dims),dims...)), 2) sall, nall = Set{Int}(), Set{Int}() - for (sb, nb) in zip(sitr, bitr) + @testset "Iteration $i" for (i,(sb, nb)) in enumerate(zip(sitr, bitr)) @test sb != nb @test size(sb) == size(nb) push!(sall, sb...) @@ -108,7 +135,8 @@ end @testset "Single epoch small" begin ritr = RepeatPartitionIterator(BatchIterator(1:20, 3; shuffle=MersenneTwister(123)), 4) - for itr in ritr + + @testset "Iteration $i" for (i, itr) in enumerate(ritr) @test collect(itr) == collect(itr) end end @@ -179,3 +207,104 @@ end end end + +@testset "ReBatchingIterator" begin + import NaiveGAflux: ReBatchingIterator + + @testset "_concat_inner" begin + import NaiveGAflux: _concat_inner + @testset "Single array" begin + itr = BatchIterator(1:20, 4) + v, s = _concat_inner(itr, 9, iterate(itr)...) + @test v == 1:12 + + v, s = _concat_inner(itr, 4, iterate(itr, s)...) + @test v == 13:16 + end + + @testset "Tuple" begin + itr = BatchIterator((1:20, collect(21:40)), 5) + + v, s = _concat_inner(itr, 10, iterate(itr)...) + @test v == (1:10, 21:30) + + v, s = _concat_inner(itr, 1, iterate(itr, s)...) + @test v == (11:15, 31:35) + + v, s = _concat_inner(itr, 100, iterate(itr, s)...) 
+ @test v == (16:20, 36:40) + end + end + + @testset "Even split" begin + itr = ReBatchingIterator(BatchIterator(1:20, 4), 2) + @test eltype(itr) == Vector{Int} + + @testset "Iteration $i" for (i, (res, exp)) in enumerate(zip(itr, (a:a+1 for a in 1:2:20))) + @test res == exp + end + + @testset "Iteration $i again" for (i, (res, exp)) in enumerate(zip(itr, (a:a+1 for a in 1:2:20))) + @test res == exp + end + end + + @testset "Even concat" begin + itr = ReBatchingIterator(BatchIterator(1:20, 4), 8) + @test eltype(itr) == Vector{Int} + + @testset "Iteration $i" for (i, (res, exp)) in enumerate(zip(itr, (a:min(a+7,20) for a in 1:8:20))) + @test res == exp + end + + @testset "Iteration $i again" for (i, (res, exp)) in enumerate(zip(itr, (a:min(a+7, 20) for a in 1:8:20))) + @test res == exp + end + end + + @testset "Odd split" begin + itr = ReBatchingIterator(BatchIterator(1:31, 5), 3) + @testset "Iteration $i" for (i, res) in enumerate(itr) + @test length(res) <= 3 + end + @test reduce(vcat,itr) == 1:31 + end + + @testset "Odd concat" begin + itr = ReBatchingIterator(BatchIterator(1:31, 3), 5) + @testset "Iteration $i" for (i, res) in enumerate(itr) + @test length(res) <= 5 + end + @test reduce(vcat,itr) == 1:31 + end + + @testset "With tuple" begin + itr = ReBatchingIterator(BatchIterator((1:20, 21:40), 4), 2) + + @test eltype(itr) == Tuple{Vector{Int}, Vector{Int}} + + @testset "Iteration $i" for (i, (res, exp)) in enumerate(zip(itr, ((a:a+1, 20+a:21+a) for a in 1:2:20))) + @test res == exp + end + end + + @testset "With $(length(dims)) dims" for dims in ((8), (3,8),(2,3,9), (2,3,4,10), (2,3,4,5,11), (2,3,4,5,6,12)) + itr = ReBatchingIterator(BatchIterator(collect(reshape(1:prod(dims),dims...)), 4), 2) + @testset "Iteration $i" for (i, res) in enumerate(itr) + @test size(res, ndims(res)) <= 2 + end + end + + @testset "Split with StatefulGenerationIter" begin + import NaiveGAflux: itergeneration + sitr = StatefulGenerationIter(RepeatPartitionIterator(BatchIterator(1:20, 3), 4)) + + @testset "Generation $i" for i in 1:5 + itr = ReBatchingIterator(itergeneration(sitr, i), 3) + @testset "Iteration $j" for (j, res) in enumerate(itr) + @test length(res) <= 3 + end + @test collect(itr) == collect(itergeneration(sitr, i)) + end + end +end From 1b0bf38aab1c6c8845522b9204ba5908806109c3 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Thu, 2 Jun 2022 00:34:45 +0200 Subject: [PATCH 06/36] Add validation of batch sizes --- src/iterators.jl | 13 ++++++++++++- test/iterators.jl | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/iterators.jl b/src/iterators.jl index 387e0521..1d8579c5 100644 --- a/src/iterators.jl +++ b/src/iterators.jl @@ -163,6 +163,10 @@ struct BatchIterator{R, D} batchsize::Int rng::R data::D + function BatchIterator(nobs::Int, batchsize::Int, rng::R, data::D) where {R,D} + batchsize > 1 || throw(ArgumentError("Batch size must be > 0. Got $(batchsize)!")) + new{R,D}(nobs, batchsize, rng, data) + end end BatchIterator(data::Union{AbstractArray, Singleton}, bs::Int; kwargs...) = BatchIterator(size(data)[end], bs, data; kwargs...) function BatchIterator(data::Tuple, bs; kwargs...) @@ -301,8 +305,15 @@ julia> map(x -> Pair(x...), itr) # Pair to make results a bit easier on the eyes struct ReBatchingIterator{I} batchsize::Int base::I + + function ReBatchingIterator(batchsize::Int, base::I) where I + batchsize > 1 || throw(ArgumentError("Batch size must be > 0. 
Got $(batchsize)!")) + new{I}(batchsize, base) + end +end +function ReBatchingIterator(base, batchsize::Int) + ReBatchingIterator(batchsize, base) end -ReBatchingIterator(base, batchsize::Int) = ReBatchingIterator(batchsize, base) """ setbatchsize(itr, batchsize) diff --git a/test/iterators.jl b/test/iterators.jl index 73d98769..c3ecbcbc 100644 --- a/test/iterators.jl +++ b/test/iterators.jl @@ -82,6 +82,10 @@ end @test "biter: $itr" == "biter: BatchIterator(size=(2, 3, 4, 5), batchsize=2, shuffle=false)" end + @testset "Batch size 0" begin + @test_throws ArgumentError BatchIterator(1:20, 0) + end + @testset "Tuple data shuffle=$shuffle" for shuffle in (true, false) itr = BatchIterator((collect([1:10 21:30]'), 110:10:200), 3; shuffle) @@ -278,6 +282,10 @@ end @test reduce(vcat,itr) == 1:31 end + @testset "Batch size 0" begin + @test_throws ArgumentError ReBatchingIterator(BatchIterator(1:20, 4), 0) + end + @testset "With tuple" begin itr = ReBatchingIterator(BatchIterator((1:20, 21:40), 4), 2) From 28c49214452b6a9f20a296f1cccaaf1ed9962a89 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sat, 4 Jun 2022 01:54:51 +0200 Subject: [PATCH 07/36] Replace batchsize with trainiterator and validationiterator Determine automatically if iterator shall be wrapped in GpuIterator --- src/NaiveGAflux.jl | 2 +- src/app/imageclassification/strategy.jl | 8 +++--- src/batchsize.jl | 4 ++- src/candidate.jl | 17 ++++++++---- src/fitness.jl | 32 ++++++++++++++++----- src/iterators.jl | 37 +++++++++++++++++++++++-- test/candidate.jl | 14 +++++----- test/iterators.jl | 14 +++++++--- 8 files changed, 97 insertions(+), 31 deletions(-) diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index 690ef8da..a0ba882d 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -33,7 +33,7 @@ const modeldir = "models" export fitness, AbstractFitness, LogFitness, GpuFitness, AccuracyFitness, TrainThenFitness, TrainAccuracyFitness, MapFitness, EwmaFitness, TimeFitness, SizeFitness, AggFitness # Candidate -export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, CandidateBatchSize, FittedCandidate, model, opt, lossfun, batchsize +export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, CandidateBatchSize, FittedCandidate, model, opt, lossfun # Evolution export evolve, AbstractEvolution, NoOpEvolution, AfterEvolution, EliteSelection, SusSelection, TournamentSelection, CombinedEvolution, EvolutionChain, PairCandidates, ShuffleCandidates, EvolveCandidates diff --git a/src/app/imageclassification/strategy.jl b/src/app/imageclassification/strategy.jl index a06279e0..e7c18a62 100644 --- a/src/app/imageclassification/strategy.jl +++ b/src/app/imageclassification/strategy.jl @@ -133,7 +133,7 @@ end """ struct BatchedIterConfig{T, V} - BatchedIterConfig(;batchsize=32, dataaug=identity, iterwrap=GpuIterator) + BatchedIterConfig(;batchsize=32, dataaug=identity, iterwrap=identity) Configuration for creating batch iterators from array data. 
@@ -146,12 +146,12 @@ struct BatchedIterConfig{T, V} dataaug::T iterwrap::V end -BatchedIterConfig(;batchsize=32, dataaug=identity, iterwrap=GpuIterator) = BatchedIterConfig(batchsize, dataaug, iterwrap) +BatchedIterConfig(;batchsize=32, dataaug=identity, iterwrap=identity) = BatchedIterConfig(batchsize, dataaug, iterwrap) dataiter(s::BatchedIterConfig, x, y) = dataiter(x, y, s.batchsize, s.dataaug) |> s.iterwrap """ struct ShuffleIterConfig{T, V} - ShuffleIterConfig(;batchsize=32, seed=123, dataaug=identity, iterwrap=GpuIterator) + ShuffleIterConfig(;batchsize=32, seed=123, dataaug=identity, iterwrap=identity) Configuration for creating shuffled batch iterators from array data. Data will be re-shuffled every time the iterator restarts. @@ -168,7 +168,7 @@ struct ShuffleIterConfig{T, V} dataaug::T iterwrap::V end -ShuffleIterConfig(;batchsize=32, seed=123, dataaug=identity, iterwrap=GpuIterator) = ShuffleIterConfig(batchsize, seed, dataaug, iterwrap) +ShuffleIterConfig(;batchsize=32, seed=123, dataaug=identity, iterwrap=identity) = ShuffleIterConfig(batchsize, seed, dataaug, iterwrap) dataiter(s::ShuffleIterConfig, x, y) = dataiter(x, y, s.batchsize, s.seed, s.dataaug) |> s.iterwrap diff --git a/src/batchsize.jl b/src/batchsize.jl index 7fa6f9cd..42534938 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -317,18 +317,20 @@ function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availabl end function activationsizes(model::CompGraph, inshape_nobatch, elemsize = model |> params |> first |> eltype |> sizeof) + model = cpu(model) # Flux.outputsize does not work for CuArrays activations = if length(inputs(model)) == 1 Dict{AbstractVertex, Any}(v => Flux.nil_input(true, inshape_nobatch) for v in inputs(model)) else Dict{AbstractVertex, Any}(v => Flux.nil_input(true, inshape_nobatch)[i] for (i, v) in inputs(model)) end - for v in outputs(model) + for v in outputs(model) output!(activations, v) end mapreduce(act -> length(act) * elemsize, +, values(activations)) end +# TODO: Take model as input and look at params to determine of cpu or gpu function _availablebytes() if CUDA.functional() info = CUDA.MemoryInfo() diff --git a/src/candidate.jl b/src/candidate.jl index 5886652e..373c306a 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -48,7 +48,8 @@ lossfun(::AbstractCandidate; default=nothing) = default fitness(::AbstractCandidate; default=nothing) = default generation(::AbstractCandidate; default=nothing) = default -batchsize(::AbstractCandidate; withgradient, default=nothing) = default +trainiterator(::AbstractCandidate; default=nothing) = default +validationiterator(::AbstractCandidate; default=nothing) = default wrappedcand(::T) where T <: AbstractCandidate = error("$T does not wrap any candidate! Check your base case!") @@ -74,7 +75,8 @@ opt(c::AbstractWrappingCandidate; kwargs...) = opt(wrappedcand(c); kwargs...) lossfun(c::AbstractWrappingCandidate; kwargs...) = lossfun(wrappedcand(c); kwargs...) fitness(c::AbstractWrappingCandidate; kwargs...) = fitness(wrappedcand(c); kwargs...) generation(c::AbstractWrappingCandidate; kwargs...) = generation(wrappedcand(c); kwargs...) -batchsize(c::AbstractWrappingCandidate; kwargs...) = batchsize(wrappedcand(c); kwargs...) +trainiterator(c::AbstractWrappingCandidate; kwargs...) = trainiterator(wrappedcand(c); kwargs...) +validationiterator(c::AbstractWrappingCandidate; kwargs...) = validationiterator(wrappedcand(c); kwargs...) 
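A rough sketch of how the accessors added to `candidate.jl` above are meant to be used (the model and data below are made-up placeholders, not taken from the patch): a fitness strategy passes its own iterator as `default`, a plain candidate just hands it back, and candidates carrying iterator-related state can transform it instead.

```julia
using NaiveGAflux, Flux
import NaiveGAflux: trainiterator

# Made-up toy candidate, just to exercise the accessor.
iv = denseinputvertex("in", 3)
cand = CandidateModel(CompGraph(iv, fluxvertex("out", Dense(3 => 2), iv)))

# Iterator supplied by e.g. a fitness strategy.
defaultiter = BatchIterator((randn(Float32, 3, 16), rand(0:1, 16)), 4)

# CandidateModel carries no iterator-related state, so the default comes back unchanged.
trainiterator(cand; default=defaultiter) === defaultiter  # true
```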
""" CandidateModel <: Candidate @@ -149,9 +151,14 @@ function CandidateBatchSize(tbs::TrainBatchSize, vbs::ValidationBatchSize, limit end -function batchsize(c::CandidateBatchSize; withgradient, inshape_nobatch=nothing, default=nothing, kwargs...) - bs = withgradient ? c.tbs : c.vbs - isnothing(inshape_nobatch) ? batchsize(bs) : c.limitfun(c, bs; inshape_nobatch, kwargs...) +function trainiterator(c::CandidateBatchSize; kwargs...) + iter = trainiterator(wrappedcand(c); kwargs...) + setbatchsize(iter, batchsize(c.tbs)) +end + +function validationiterator(c::CandidateBatchSize; kwargs...) + iter = validationiterator(wrappedcand(c); kwargs...) + setbatchsize(iter, batchsize(c.vbs)) end function newcand(c::CandidateBatchSize, mapfield) diff --git a/src/fitness.jl b/src/fitness.jl index cb6485a6..81391b28 100644 --- a/src/fitness.jl +++ b/src/fitness.jl @@ -115,16 +115,33 @@ struct AccuracyFitness{D} <: AbstractFitness dataset::D end function _fitness(s::AccuracyFitness, c::AbstractCandidate) - acc,cnt = 0, 0 + m = model(c) - for (x,y) in s.dataset - correct = Flux.onecold(cpu(m(x))) .== Flux.onecold(cpu(y)) + ninput = ninputs(m) + + iter = _fitnessiterator(validationiterator, c, s.dataset) + + acc,cnt = 0.0, 0 + for (data) in iter + xs = data[1:ninput] + ys = data[ninput+1:end] + + correct = Flux.onecold(cpu(m(xs...))) .== Flux.onecold(cpu(ys)...) acc += sum(correct) cnt += length(correct) end - return acc / cnt + return cnt == 0 ? acc : acc / cnt end +function _fitnessiterator(f, c::AbstractCandidate, iter) + geniter = itergeneration(iter, generation(c; default=0)) + canditer = f(c; default=geniter) + matchdatatype(params(c) |> first, canditer) +end + +matchdatatype(::CUDA.CuArray, iter) = GpuGcIterator(GpuIterator(iter)) +matchdatatype(::AbstractArray, iter) = iter + """ TrainThenFitness{I,L,O,F} <: AbstractFitness TrainThenFitness(;dataiter, defaultloss, defaultopt, fitstrat, invalidfitness=0.0) @@ -158,7 +175,6 @@ function _fitness(s::TrainThenFitness, c::AbstractCandidate) m = model(c) o = opt(c; default=s.defaultopt) ninput = ninputs(m) - gen = generation(c; default=0) valid = let valid = true nanguard = function(data...) @@ -178,8 +194,9 @@ function _fitness(s::TrainThenFitness, c::AbstractCandidate) end return l end - iter = itergeneration(s.dataiter, gen) - Flux.train!(nanguard, params(m), iter, o) + iter = _fitnessiterator(trainiterator, c, s.dataiter) + + Flux.train!(nanguard, params(m), GpuGcIterator(GpuIterator(iter)), o) cleanopt!(o) valid end @@ -198,6 +215,7 @@ function checkvalid(ifnot, x) return true end + """ TrainAccuracyCandidate{C} <: AbstractWrappingCandidate diff --git a/src/iterators.jl b/src/iterators.jl index 1d8579c5..f8270d65 100644 --- a/src/iterators.jl +++ b/src/iterators.jl @@ -164,7 +164,7 @@ struct BatchIterator{R, D} rng::R data::D function BatchIterator(nobs::Int, batchsize::Int, rng::R, data::D) where {R,D} - batchsize > 1 || throw(ArgumentError("Batch size must be > 0. Got $(batchsize)!")) + batchsize > 0 || throw(ArgumentError("Batch size must be > 0. Got $(batchsize)!")) new{R,D}(nobs, batchsize, rng, data) end end @@ -271,6 +271,8 @@ end # itergeneration on timeoutaction just in case user e.g. wants to deepcopy it to avoid some shared state. 
itergeneration(itr::TimedIterator, gen) = TimedIterator(itr.timelimit, itr.patience, itergeneration(itr.timeoutaction, gen), itr.accumulate_timeouts, itergeneration(itr.base, gen)) +setbatchsize(itr::TimedIterator, batchsize) = TimedIterator(itr.timelimit, itr.patience, itr.timeoutaction, itr.accumulate_timeouts, setbatchsize(itr.base, batchsize)) + """ ReBatchingIterator{I} ReBatchingIterator(base, batchsize) @@ -307,7 +309,7 @@ struct ReBatchingIterator{I} base::I function ReBatchingIterator(batchsize::Int, base::I) where I - batchsize > 1 || throw(ArgumentError("Batch size must be > 0. Got $(batchsize)!")) + batchsize > 0 || throw(ArgumentError("Batch size must be > 0. Got $(batchsize)!")) new{I}(batchsize, base) end end @@ -378,3 +380,34 @@ _collectbatch(b) = b _innerbatchsize(t::Tuple) = _innerbatchsize(first(t)) _innerbatchsize(a::AbstractArray) = size(a, ndims(a)) + + +## Temp workaround for CUDA memory issue where it for some reason takes very long time to make use of available memory +struct GpuGcIterator{I} + base::I +end + +function Base.iterate(itr::GpuGcIterator) + valstate = iterate(itr.base) + valstate === nothing && return nothing + val, state = valstate + return val, (2, state) +end + +function Base.iterate(itr::GpuGcIterator, (cnt, state)) + meminfo = CUDA.MemoryInfo() + if meminfo.total_bytes - meminfo.pool_reserved_bytes < 2e9 + NaiveGAflux.gpu_gc() + end + valstate = iterate(itr.base, state) + valstate === nothing && return nothing + val, state = valstate + return val, (cnt+1, state) +end + +Base.IteratorSize(::Type{GpuGcIterator{I}}) where I = Base.IteratorSize(I) +Base.IteratorEltype(::Type{GpuGcIterator{I}}) where I = Base.IteratorEltype(I) + +Base.length(itr::GpuGcIterator) = length(itr.base) +Base.size(itr::GpuGcIterator) = size(itr.base) +Base.eltype(::Type{GpuGcIterator{I}}) where I = eltype(I) diff --git a/test/candidate.jl b/test/candidate.jl index 52f01d90..428811fa 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -2,7 +2,7 @@ struct DummyFitness <: AbstractFitness end NaiveGAflux._fitness(::DummyFitness, f::AbstractCandidate) = 17 - using NaiveGAflux: FileCandidate, AbstractWrappingCandidate, FittedCandidate + using NaiveGAflux: FileCandidate, AbstractWrappingCandidate, FittedCandidate, trainiterator, validationiterator using Functors: fmap import MemPool @testset "$ctype" for (ctype, candfun) in ( @@ -51,14 +51,14 @@ end if ctype == CandidateBatchSize - @test batchsize(cand; withgradient=true, default=64) == 16 - @test batchsize(cand; withgradient=false, default=128) == 32 + @test length(first(trainiterator(cand; default=(1:100,)))) == 16 + @test length(first(validationiterator(cand; default=(1:100,)))) == 32 # TODO Add mutation - @test batchsize(newcand; withgradient=true, default=64) == 16 - @test batchsize(newcand; withgradient=false, default=128) == 32 + @test length(first(trainiterator(newcand; default=(1:100,)))) == 16 + @test length(first(validationiterator(newcand; default=(1:100,)))) == 32 else - @test batchsize(cand; withgradient=true, default=64) == 64 - @test batchsize(cand; withgradient=false, default=128) == 128 + @test length(first(trainiterator(cand; default=(1:100,)))) == 100 + @test length(first(validationiterator(cand; default=(1:100,)))) == 100 end teststrat() = NaiveGAflux.default_crossoverswap_strategy(v -> 1) diff --git a/test/iterators.jl b/test/iterators.jl index c3ecbcbc..0f7f0e65 100644 --- a/test/iterators.jl +++ b/test/iterators.jl @@ -82,8 +82,11 @@ end @test "biter: $itr" == "biter: BatchIterator(size=(2, 3, 4, 5), 
batchsize=2, shuffle=false)" end - @testset "Batch size 0" begin - @test_throws ArgumentError BatchIterator(1:20, 0) + @testset "Batch size 0 and 1" begin + @test_throws ArgumentError BatchIterator(1:20, 0) + @testset "Iteration $i" for (i, j) in enumerate(BatchIterator(1:20, 1)) + @test [i] == j + end end @testset "Tuple data shuffle=$shuffle" for shuffle in (true, false) @@ -282,8 +285,11 @@ end @test reduce(vcat,itr) == 1:31 end - @testset "Batch size 0" begin - @test_throws ArgumentError ReBatchingIterator(BatchIterator(1:20, 4), 0) + @testset "Batch size 0 and 1" begin + @test_throws ArgumentError ReBatchingIterator(BatchIterator(1:20, 4), 0) + @testset "Iteration $i" for (i, j) in enumerate(ReBatchingIterator(BatchIterator(1:20, 7), 1)) + @test [i] == j + end end @testset "With tuple" begin From 5d7853d91cc744e2652ae43e36c48c338b14dc53 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sat, 4 Jun 2022 13:27:57 +0200 Subject: [PATCH 08/36] Add default ninputs Add .vscode to gitignore --- .gitignore | 1 + src/util.jl | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index fdd6eec3..f3c8ffb6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ Manifest.toml *.bson models .mempool +.vscode diff --git a/src/util.jl b/src/util.jl index 4cb8d049..9d3c2564 100644 --- a/src/util.jl +++ b/src/util.jl @@ -346,3 +346,5 @@ NaiveNASflux.layer(gp::GlobalPool) = gp Return the number of model inputs. """ ninputs(cg::CompGraph) = length(cg.inputs) +# I guess this is not good practice, but I'll fix it the first time someone posts an issue about it :) +ninputs(m) = 1 From 9e25c199d9d8cd4b9e1ab4b80da3110fe2ed4909 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sat, 4 Jun 2022 13:55:16 +0200 Subject: [PATCH 09/36] Remove hardcoded wrapping of GpuIterator and GpuGcIterator Add handling of models without parameters in _fitnessiterator --- src/fitness.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/fitness.jl b/src/fitness.jl index 81391b28..3f23e7aa 100644 --- a/src/fitness.jl +++ b/src/fitness.jl @@ -136,9 +136,12 @@ end function _fitnessiterator(f, c::AbstractCandidate, iter) geniter = itergeneration(iter, generation(c; default=0)) canditer = f(c; default=geniter) - matchdatatype(params(c) |> first, canditer) + matchdatatype(params(c), canditer) end +matchdatatype(ps::Flux.Params, iter) = isempty(ps) ? 
iter : matchdatatype(first(ps), iter) +# TODO: GpuGcIterator is a temporary workaround for what seems like a CUDA issue where memory allocation becomes very slow +# after the number of reserved (but still available) bytes is close to the totol available GPU memory matchdatatype(::CUDA.CuArray, iter) = GpuGcIterator(GpuIterator(iter)) matchdatatype(::AbstractArray, iter) = iter @@ -196,7 +199,7 @@ function _fitness(s::TrainThenFitness, c::AbstractCandidate) end iter = _fitnessiterator(trainiterator, c, s.dataiter) - Flux.train!(nanguard, params(m), GpuGcIterator(GpuIterator(iter)), o) + Flux.train!(nanguard, params(m), iter, o) cleanopt!(o) valid end From f2608645cc9f64f60bf7dfc7bcfbfa9e7ad39bdb Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 5 Jun 2022 02:40:03 +0200 Subject: [PATCH 10/36] Refactor evolvemodel into MapCandidate to support more than two types of mutation Add BatchSizeIteratorMap Replace CandidateBatchSize with the more general CandidateDataIterMap --- docs/src/reference/candidate.md | 6 +- src/NaiveGAflux.jl | 6 +- src/batchsize.jl | 10 +- src/candidate.jl | 173 ++++++++++++++++++++++++-------- src/iteratormaps.jl | 98 ++++++++++++++++++ test/candidate.jl | 128 ++++++++++++++++++++++- test/iteratormaps.jl | 44 ++++++++ test/runtests.jl | 4 + 8 files changed, 415 insertions(+), 54 deletions(-) create mode 100644 src/iteratormaps.jl create mode 100644 test/iteratormaps.jl diff --git a/docs/src/reference/candidate.md b/docs/src/reference/candidate.md index 9b4bad96..18b60d95 100644 --- a/docs/src/reference/candidate.md +++ b/docs/src/reference/candidate.md @@ -3,11 +3,13 @@ ```@docs CandidateModel CandidateOptModel -CandidateBatchSize +CandidateDataIterMap FittedCandidate -evolvemodel +MapCandidate Population model opt lossfun +BatchSizeIteratorMap +AbstractIteratorMap ``` \ No newline at end of file diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index a0ba882d..829d682f 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -33,7 +33,7 @@ const modeldir = "models" export fitness, AbstractFitness, LogFitness, GpuFitness, AccuracyFitness, TrainThenFitness, TrainAccuracyFitness, MapFitness, EwmaFitness, TimeFitness, SizeFitness, AggFitness # Candidate -export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, CandidateBatchSize, FittedCandidate, model, opt, lossfun +export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, CandidateDataIterMap, FittedCandidate, model, opt, lossfun # Evolution export evolve, AbstractEvolution, NoOpEvolution, AfterEvolution, EliteSelection, SusSelection, TournamentSelection, CombinedEvolution, EvolutionChain, PairCandidates, ShuffleCandidates, EvolveCandidates @@ -50,6 +50,9 @@ export BatchSizeSelectionWithDefaultInShape, BatchSizeSelectionScaled, BatchSize # Iterators. These should preferably come from somewhere else, but I haven't found anything which fits the bill w.r.t repeatability over subsets export RepeatPartitionIterator, SeedIterator, MapIterator, GpuIterator, BatchIterator, ShuffleIterator, TimedIterator, TimedIteratorStop, StatefulGenerationIter +# Iterator mapping types for evolving hyperparameters related to datasets, e.g. 
augmentation and batch size +export BatchSizeIteratorMap, IteratorMaps + # Persistence export persist @@ -83,6 +86,7 @@ export PlotFitness, ScatterPop, ScatterOpt, MultiPlot, CbAll include("util.jl") include("shape.jl") include("batchsize.jl") +include("iteratormaps.jl") include("archspace.jl") include("mutation.jl") include("crossover.jl") diff --git a/src/batchsize.jl b/src/batchsize.jl index 42534938..1c715db3 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -10,11 +10,12 @@ The type of `batchsize` may be used to e.g. determine if one shall account for b """ generic_batchsizefun_testgraph() = """ -julia> v0 = conv2dinputvertex("v0", 3); -julia> v1 = fluxvertex("v1", Conv((3,3), nout(v0) => 8), v0); - -julia> graph = CompGraph(v0, v1); +julia> graph = let + v0 = conv2dinputvertex("v0", 3); + v1 = fluxvertex("v1", Conv((3,3), nout(v0) => 8), v0); + CompGraph(v0, v1); + end; """ generic_batchsizeselection_example(sbs, kwres...) = """ @@ -48,7 +49,6 @@ struct ValidationBatchSize end batchsize(bs::ValidationBatchSize) = bs.size - """ BatchSizeSelectionWithDefaultInShape{T, F} BatchSizeSelectionWithDefaultInShape(default_inshape) diff --git a/src/candidate.jl b/src/candidate.jl index 373c306a..db7951e0 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -92,7 +92,7 @@ end model(c::CandidateModel; kwargs...) = c.model -newcand(c::CandidateModel, mapfield) = CandidateModel(map(mapfield, getproperty.(c, fieldnames(CandidateModel)))...) +newcand(c::CandidateModel, mapfield) = CandidateModel(mapfield(c.model)) """ CandidateOptModel <: AbstractCandidate @@ -123,55 +123,33 @@ opt(c::CandidateOptModel; kwargs...) = c.opt newcand(c::CandidateOptModel, mapfield) = CandidateOptModel(mapfield(c.opt), newcand(wrappedcand(c), mapfield)) """ - CandidateBatchSize <: AbstractWrappingCandidate - CandidateBatchSize(limitfun, trainbatchsize, validationbatchsize, candidate) + CandidateDataIterMap{T<:AbstractIteratorMap, C<:AbstractCandidate} + CandidateDataIterMap(itermap::AbstractIteratorMap, c::AbstractCandidate) -A candidate adding batch sizes to another candiate. `limitfun` is used to try to ensure that batch sizes are small enough so that training and validating the model does not risk an out of memory error. Use [`batchsizeselection`](@ref) to create an appropriate `limitfun`. +Maps training and validation data iterators using `iteratormap` for the wrapped candidate `c`. + +Useful for searching for hyperparameters related to training and validation data, such as augmentation and batch size. -The batch sizes are accessed by [`batchsize(c; withgradient)`] for `CandidateBatchSize c` where `withgradient=true` gives the training batch size and `withgradient=false` gives the validation batch size. +While one generally don't want to augment the validation data, it is useful to select the largest possible batch size +for validation for speed reasons. 
""" -struct CandidateBatchSize{F, C <: AbstractCandidate} <: AbstractWrappingCandidate - tbs::TrainBatchSize - vbs::ValidationBatchSize - limitfun::F +struct CandidateDataIterMap{T<:AbstractIteratorMap, C<:AbstractCandidate} <: AbstractWrappingCandidate + map::T c::C - - function CandidateBatchSize{F, C}(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, c::C) where {F, C} - new{F, C}(TrainBatchSize(limitfun(c, tbs)), ValidationBatchSize(limitfun(c, vbs)), limitfun, c) - end -end - -@functor CandidateBatchSize - -function CandidateBatchSize(tbs::Integer, vbs::Integer, limitfun, c) - CandidateBatchSize(TrainBatchSize(tbs), ValidationBatchSize(vbs), limitfun, c) -end -function CandidateBatchSize(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, c::C) where {C<:AbstractCandidate, F} - CandidateBatchSize{F, C}(tbs, vbs, limitfun, c) end +@functor CandidateDataIterMap -function trainiterator(c::CandidateBatchSize; kwargs...) - iter = trainiterator(wrappedcand(c); kwargs...) - setbatchsize(iter, batchsize(c.tbs)) -end +trainiterator(c::CandidateDataIterMap; kwargs...) = maptrain(c.map, trainiterator(wrappedcand(c); kwargs...)) +validationiterator(c::CandidateDataIterMap; kwargs...) = mapvalidation(c.map, validationiterator(wrappedcand(c); kwargs...)) -function validationiterator(c::CandidateBatchSize; kwargs...) - iter = validationiterator(wrappedcand(c); kwargs...) - setbatchsize(iter, batchsize(c.vbs)) +function newcand(c::CandidateDataIterMap, mapfield) + nc = newcand(wrappedcand(c), mapfield) + CandidateDataIterMap(apply_mapfield(mapfield, c.map, nc), nc) end -function newcand(c::CandidateBatchSize, mapfield) - CandidateBatchSize(mapfield(c.tbs), - mapfield(c.vbs), - mapfield(c.limitfun), - newcand(c.c, mapfield)) -end - -limit_maxbatchsize(c::AbstractCandidate, bs; inshape_nobatch, availablebytes = _availablebytes()) = model(c) do model - isnothing(model) && return bs - limit_maxbatchsize(model, bs; inshape_nobatch, availablebytes) -end +# Just because BatchSizeIteratorMap needs the model to limit the batch sizes :( +apply_mapfield(f, x, ::AbstractCandidate) = f(x) """ FileCandidate <: AbstractWrappingCandidate @@ -363,11 +341,124 @@ evolvemodel(m::AbstractCrossover{CompGraph}, om::AbstractCrossover{FluxOptimizer return evolvemodel(m, optmap(o -> o1n, mapothers1), optmap(o -> o2n, mapothers2))((c1,c2)) end +_evolvemodel(ms::AbstractMutation...; mapothers=deepcopy) = MapCandidate(ms, mapothers) + + +struct MapType{T, F1, F2} + match::F1 + nomatch::F2 + MapType{T}(match::F1, nomatch::F2) where {T,F1, F2} = new{T,F1,F2}(match, nomatch) +end + +(a::MapType{T1})(x::T2) where {T1, T2<:T1} = a.match(x) +(a::MapType)(x) = a.nomatch(x) + +MapType(match::AbstractMutation{T}, nomatch) where T = MapType{T}(match, nomatch) +MapType(match::AbstractMutation{CompGraph}, nomatch) = MapType{CompGraph}(match ∘ deepcopy, nomatch) + +function MapType(c::AbstractCrossover{CompGraph}, (c1, c2), (nomatch1, nomatch2)) + g1 = model(c1) + g2 = model(c2) + + release!(c1) + release!(c2) + + g1, g2 = c((deepcopy(g1), deepcopy(g2))) + return MapType{CompGraph}(Returns(g1), nomatch1), MapType{CompGraph}(Returns(g2), nomatch2) +end + +function MapType(c::AbstractCrossover{FluxOptimizer}, (c1, c2), (nomatch1, nomatch2)) + o1 = opt(c1) + o2 = opt(c2) + + o1n, o2n = c((o1, o2)) + return MapType{FluxOptimizer}(Returns(o1n), nomatch1), MapType{FluxOptimizer}(Returns(o2n), nomatch2) +end + +# TODO: Needs a new name. MapCandidate? 
+""" + MapCandidate{T, F} + MapCandidate(mutations, mapothers::F) + +Return a callable struct which maps `AbstractCandidate`s to new `AbstractCandidate`s through `mutations` which is a tuple of +`AbstractMutation`s or `AbstractCrossover`s. + +Basic purpose is to combine multiple mutations operating on different types into a single mapping function which creates new +candidates from existing candidates. + +When called as a function with an `AbstractCandidate c` as input, it will map fields `f` in `c` (recursively through any +wrapped candidates of `c`) satisfying `typeof(f) <: MT` through `m(f)` where `m <: AbstractMutation{MT}` in `mutations`. + +All other fields are mapped through the function `mapothers` (default `deepcopy`). + +For instance, if `e = MapCandidate(m1, m2)` where `m1 isa AbstractMutation{CompGraph}` and `m2 isa +AbstractMutation{FluxOptimizer}` then `e(c)` where `c` is a `CandidateOptModel` will create a new `CandidateOptModel`where +the new model is `m1(model(c))` and the new optimizer is `m2(opt(c))` + +When called as a function with a tuple of two `AbstractCandidate`s as input it will similarly apply crossover between the +two candidates, returning two new candidates. + +Note that all `mutations` must be either `AbstractMutation`s or `AbstractCrossover`s as the resulting function either works +on a single candidate or a pair of candidates. + +Furthermore, all `mutations` must operate on different types, i.e there must not be two `AbstractMutation{T}` (or ` +AbstractCrossover{T}`) for any type `T`. + +Intended use is together with [`EvolveCandidates`](@ref). +""" +struct MapCandidate{T, F} + mutations::T + mapothers::F +end + +MapCandidate(mutations::AbstractMutation...; mapothers=deepcopy) = MapCandidate(mutations, mapothers) +MapCandidate(mutation::AbstractMutation, mapothers) = MapCandidate(tuple(mutation), mapothers) + +function MapCandidate(crossovers::NTuple{N, AbstractCrossover}, mapothers::F) where {N, F} + _validate_mutations(crossovers) + MapCandidate{typeof(crossovers), F}(crossovers, mapothers) +end +function MapCandidate(mutations::NTuple{N, AbstractMutation}, mapothers) where N + _validate_mutations(mutations) + mapc = foldr(mutations; init=mapothers) do match, nomatch + MapType(match, nomatch) + end + MapCandidate(mapc, mapothers) +end + +function _validate_mutations(mutations) + seentypes = Set() + iscrossover = first(mutations) isa AbstractCrossover + for m in mutations + _validate_unique_type!(seentypes, m) + (iscrossover == (m isa AbstractCrossover)) || throw(ArgumentError("Can't mix crossover and mutation in same function! Use different functions instead")) + end +end + +function _validate_unique_type!(seen, ::AbstractMutation{T}) where T + T in seen && throw(ArgumentError("Got mutation of duplicate type $(T)!")) + push!(seen, T) +end + +(e::MapCandidate{<:MapType})(c) = newcand(c, e.mutations) + +function (e::MapCandidate{<:NTuple{N, AbstractCrossover}, F})((c1,c2)) where {N,F} + # Bleh, CGA to avoid a closure here + mapc1, mapc2 = let c1 = c1, c2 = c2 + foldr(e.mutations; init=(e.mapothers, e.mapothers)) do match, nomatch + MapType(match, (c1,c2), nomatch) + end + end + + return newcand(c1, mapc1), newcand(c2, mapc2) +end + function mapcandidate(mapgraph, mapothers=deepcopy) mapfield(g::CompGraph) = mapgraph(g) mapfield(f) = mapothers(f) - # TODO: Replace with fmap now that we fully support Functors? + # Replace with fmap? + # Maybe not, because we don't want to descend into models? 
return c -> newcand(c, mapfield) end diff --git a/src/iteratormaps.jl b/src/iteratormaps.jl new file mode 100644 index 00000000..2ee0e2f9 --- /dev/null +++ b/src/iteratormaps.jl @@ -0,0 +1,98 @@ + +""" + AbstractIteratorMap + +Abstract type for mapping training and validation dataset iterators using `maptrain(im, iter)` and `mapvalidation(im, iter)` respectively where `im` +is the struct extending `AbstractIteratorMap`. + +Main reason for existence is to enable dispatch to `AbstractMutation{AbstractIteratorMap}` and `AbstractCrossover{AbstractIteratorMap}` so that +strategies for data augmentation and batch size selection can be evolved. +""" +abstract type AbstractIteratorMap end + +maptrain(::AbstractIteratorMap, iter) = iter +mapvalidation(::AbstractIteratorMap, iter) = iter + +""" + BatchSizeIteratorMap{F} <: AbstractIteratorMap + BatchSizeIteratorMap(limitfun, trainbatchsize, validationbatchsize, model) + +[AbstractIteratorMap](@ref) which sets the batch size of training and validation iterators to `trainbatchsize` and `validationbatchsize` respectively. +`limitfun` is used to try to ensure that batch sizes are small enough so that training and validating `model` does not risk an out of memory error. +Use [`batchsizeselection`](@ref) to create an appropriate `limitfun`. + +# Examples +```jldoctest +julia> using NaiveGAflux, Flux + +julia> import NaiveGAflux: maptrain, mapvalidation # needed for examples only + +$(generic_batchsizefun_testgraph()) +julia> bsim = BatchSizeIteratorMap(4, 8, batchsizeselection((32,32,3)), graph); + +julia> collect(maptrain(bsim, (1:20,))) +5-element Vector{Vector{Int64}}: + [1, 2, 3, 4] + [5, 6, 7, 8] + [9, 10, 11, 12] + [13, 14, 15, 16] + [17, 18, 19, 20] + +julia> collect(mapvalidation(bsim, (1:20,))) +3-element Vector{Vector{Int64}}: + [1, 2, 3, 4, 5, 6, 7, 8] + [9, 10, 11, 12, 13, 14, 15, 16] + [17, 18, 19, 20] + +julia> map(x -> Pair(x...), maptrain(bsim, ((1:20, 21:40),))) # Pair to make results a bit easier on the eyes +5-element Vector{Pair{SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}, SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}}}: + [1, 2, 3, 4] => [21, 22, 23, 24] + [5, 6, 7, 8] => [25, 26, 27, 28] + [9, 10, 11, 12] => [29, 30, 31, 32] + [13, 14, 15, 16] => [33, 34, 35, 36] + [17, 18, 19, 20] => [37, 38, 39, 40] + +julia> map(x -> Pair(x...), maptrain(bsim, BatchIterator((1:20, 21:40),12))) # Pair to make results a bit easier on the eyes +5-element Vector{Pair{SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}, SubArray{Int64, 1, Vector{Int64}, Tuple{UnitRange{Int64}}, true}}}: + [1, 2, 3, 4] => [21, 22, 23, 24] + [5, 6, 7, 8] => [25, 26, 27, 28] + [9, 10, 11, 12] => [29, 30, 31, 32] + [13, 14, 15, 16] => [33, 34, 35, 36] + [17, 18, 19, 20] => [37, 38, 39, 40] +``` +""" +struct BatchSizeIteratorMap{F} <: AbstractIteratorMap + tbs::TrainBatchSize + vbs::ValidationBatchSize + limitfun::F + function BatchSizeIteratorMap{F}(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, model) where F + new{F}(TrainBatchSize(limitfun(model, tbs)), ValidationBatchSize(limitfun(model, vbs)), limitfun) + end +end + +function BatchSizeIteratorMap(tbs::Integer, vbs::Integer, limitfun, model) + BatchSizeIteratorMap(TrainBatchSize(tbs), ValidationBatchSize(vbs), limitfun, model) +end + + +function BatchSizeIteratorMap(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, model) where F + BatchSizeIteratorMap{F}(tbs, vbs, limitfun, model) +end + +apply_mapfield(::typeof(deepcopy), 
bsim::BatchSizeIteratorMap, c) = model(c) do m + BatchSizeIteratorMap(bsim.tbs, bsim.vbs, deepcopy(bsim.limitfun), m) +end + +maptrain(bs::BatchSizeIteratorMap, iter) = setbatchsize(iter, batchsize(bs.tbs)) +mapvalidation(bs::BatchSizeIteratorMap, iter) = setbatchsize(iter, batchsize(bs.vbs)) + +""" + IteratorMaps{T} <: AbstractIteratorMap +""" +struct IteratorMaps{T<:Tuple} <: AbstractIteratorMap + maps::T +end +IteratorMaps(x...) = IteratorMaps(x) + +maptrain(iws::IteratorMaps, iter) = foldr(maptrain, iws.maps; init=iter) +mapvalidation(iws::IteratorMaps, iter) = foldr(mapvalidation, iws.maps; init=iter) diff --git a/test/candidate.jl b/test/candidate.jl index 428811fa..568e63e9 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -1,14 +1,132 @@ +@testset "MapType" begin + import NaiveGAflux: MapType + + @testset "Basic" begin + mt = MapType{Integer}(x -> 2x, identity) + + @test mt(2) == 4 + @test mt(UInt16(3)) == 6 + @test mt('c') == 'c' + @test mt(2.0) == 2.0 + end + + @testset "Crossover" begin + import NaiveGAflux: FluxOptimizer + + struct MapTypeTestCrossover{T} <: AbstractCrossover{T} end + (::MapTypeTestCrossover)((c1, c2)) = c2,c1 + + + c1 = CandidateOptModel(Descent(), CompGraph(inputvertex("c1", 1), AbstractVertex[])) + c2 = CandidateOptModel(Momentum(), CompGraph(inputvertex("c2", 1), AbstractVertex[])) + + mt1, mt2 = MapType(MapTypeTestCrossover{CompGraph}(), (c1,c2), (identity, identity)) + @test name.(inputs(mt1(model(c1)))) == ["c2"] + @test name.(inputs(mt2(model(c2)))) == ["c1"] + @test mt1(3) == 3 + @test mt2('c') == 'c' + + mt1, mt2 = MapType(MapTypeTestCrossover{FluxOptimizer}(), (c1,c2), (identity, identity)) + @test typeof(mt1(opt(c1))) == Momentum + @test typeof(mt2(opt(c2))) == Descent + @test mt1(3) == 3 + @test mt2('c') == 'c' + end +end + +@testset "MapCandidate" begin + import NaiveGAflux: MapCandidate + + struct CollectMutation{T} <: AbstractMutation{T} + seen::Vector{T} + end + CollectMutation{T}() where T = CollectMutation{T}(T[]) + # This way CollectMutation is type stable. 
Mutation often isn't though, but now we can test that + # MapCandidate does not add any extra type instability + (m::CollectMutation{T1})(x::T2) where {T1, T2<:T1} = push!(m.seen, x)[end]::T2 + + function testcand(name, opt=Descent()) + v0 = inputvertex(name * "_v0", 2) + v1 = fluxvertex(name * "_v1", Dense(nout(v0) => 3), v0) + CandidateOptModel(opt, CompGraph(v0, v1)) + end + @testset "Mutation" begin + + @testset "CompGraph" begin + graphmutation = CollectMutation{CompGraph}() + cnew = @inferred MapCandidate(graphmutation, deepcopy)(testcand("c")) + @test length(graphmutation.seen) == 1 + @test graphmutation.seen[] === model(cnew) + end + + @testset "Optimiser" begin + optmutation = CollectMutation{FluxOptimizer}() + cnew = MapCandidate(optmutation, deepcopy)(testcand("c")) + @test length(optmutation.seen) == 1 + @test optmutation.seen[] === opt(cnew) + end + + @testset "CompGraph + Optimiser" begin + graphmutation = CollectMutation{CompGraph}() + optmutation = CollectMutation{FluxOptimizer}() + cnew = @inferred MapCandidate((graphmutation, optmutation), deepcopy)(testcand("c")) + + @test length(graphmutation.seen) == 1 + @test graphmutation.seen[] === model(cnew) + + @test length(optmutation.seen) == 1 + @test optmutation.seen[] === opt(cnew) + end + end + + @testset "Crossover" begin + + @testset "CompGraph" begin + graphcrossover = CollectMutation{Tuple{CompGraph, CompGraph}}() + c1,c2 = testcand("c1"), testcand("c2") + cnew1, cnew2 = @inferred MapCandidate(graphcrossover, deepcopy)((c1, c2)) + + @test length(graphcrossover.seen) == 1 + @test graphcrossover.seen[] === (model(cnew1), model(cnew2)) + end + + @testset "Optimiser" begin + optcrossover = CollectMutation{Tuple{FluxOptimizer, FluxOptimizer}}() + c1,c2 = testcand("c1", Descent()), testcand("c2", Momentum()) + cnew1, cnew2 = @inferred MapCandidate(optcrossover , deepcopy)((c1, c2)) + + @test length(optcrossover.seen) == 1 + @test optcrossover.seen[] === (opt(cnew1), opt(cnew2)) + end + + @testset "CompGraph + Optimiser" begin + graphcrossover = CollectMutation{Tuple{CompGraph, CompGraph}}() + optcrossover = CollectMutation{Tuple{FluxOptimizer, FluxOptimizer}}() + c1,c2 = testcand("c1", Descent()), testcand("c2", Momentum()) + cnew1, cnew2 = @inferred MapCandidate((graphcrossover, optcrossover), deepcopy)((c1, c2)) + + @test length(graphcrossover.seen) == 1 + @test graphcrossover.seen[] === (model(cnew1), model(cnew2)) + @test length(optcrossover.seen) == 1 + @test optcrossover.seen[] === (opt(cnew1), opt(cnew2)) + end + end +end + @testset "Candidate" begin struct DummyFitness <: AbstractFitness end NaiveGAflux._fitness(::DummyFitness, f::AbstractCandidate) = 17 - using NaiveGAflux: FileCandidate, AbstractWrappingCandidate, FittedCandidate, trainiterator, validationiterator + using NaiveGAflux: FileCandidate, AbstractWrappingCandidate, FittedCandidate, trainiterator, validationiterator, _evolvemodel using Functors: fmap import MemPool + + CandidateBatchIterMap(g) = CandidateDataIterMap(BatchSizeIteratorMap(16,32, batchsizeselection((3,)), g), CandidateModel(g)) + @testset "$ctype" for (ctype, candfun) in ( (CandidateModel, CandidateModel), (CandidateOptModel, g -> CandidateOptModel(Descent(0.01), g)), - (CandidateBatchSize, g -> CandidateBatchSize(16, 32, batchsizeselection((3,)), CandidateModel(g))) + (CandidateBatchIterMap, CandidateBatchIterMap) ) @testset " $lbl" for (lbl, wrp) in ( @@ -33,7 +151,7 @@ graphmutation = VertexMutation(MutationFilter(v -> name(v)=="hlayer", AddVertexMutation(ArchSpace(DenseSpace([1], 
[relu]))))) optmutation = OptimizerMutation((Momentum, Nesterov, ADAM)) - evofun = evolvemodel(graphmutation, optmutation) + evofun = _evolvemodel(graphmutation, optmutation) newcand = evofun(cand) @test NaiveGAflux.model(nvertices, newcand) == 4 @@ -50,7 +168,7 @@ @test opttype(newcand) === opttype(cand) === Nothing end - if ctype == CandidateBatchSize + if ctype == CandidateBatchIterMap @test length(first(trainiterator(cand; default=(1:100,)))) == 16 @test length(first(validationiterator(cand; default=(1:100,)))) == 32 # TODO Add mutation @@ -64,7 +182,7 @@ teststrat() = NaiveGAflux.default_crossoverswap_strategy(v -> 1) graphcrossover = VertexCrossover(CrossoverSwap(;pairgen = (v1,v2) -> (1,1), strategy=teststrat); pairgen = (v1,v2;ind1) -> ind1==1 ? (2,3) : nothing) optcrossover = OptimizerCrossover() - crossfun = evolvemodel(graphcrossover, optcrossover) + crossfun = _evolvemodel(graphcrossover, optcrossover) newcand1, newcand2 = crossfun((cand, newcand)) diff --git a/test/iteratormaps.jl b/test/iteratormaps.jl new file mode 100644 index 00000000..091cb838 --- /dev/null +++ b/test/iteratormaps.jl @@ -0,0 +1,44 @@ +@testset "Iterator maps" begin + import NaiveGAflux: maptrain, mapvalidation + + @testset "BatchSizeIteratorMap" begin + function testgraph(insize) + v0 = denseinputvertex("v0", insize) + v1 = fluxvertex("v1", Dense(nout(v0) => 5), v0) + v2 = fluxvertex("v2", Dense(nout(v1) => 2), v1) + v3 = concat("v3", v1, v2) + CompGraph(v0, "v4" >> v3 + v3) + end + + bsim = BatchSizeIteratorMap(2, 4, batchsizeselection((5,)), testgraph(5)) + + @testset "Single array" begin + @test collect(maptrain(bsim, (1:20,))) == [a:a+1 for a in 1:2:20] + @test collect(mapvalidation(bsim, (1:20,))) == [a:a+3 for a in 1:4:20] + end + + @testset "BatchIterator" begin + itr = maptrain(bsim, BatchIterator((1:6, 7:12), 4)) + @testset "Iteration $i" for (i, (res, exp)) in enumerate(zip(itr, (([a, a+1], [a+6,a+7]) for a in 1:2:6))) + @test res == exp + end + end + end + + @testset "IteratorMaps" begin + NaiveGAflux.maptrain(::Val{:TestDummy1}, itr) = Iterators.map(x -> 2x, itr) + NaiveGAflux.maptrain(::Val{:TestDummy2}, itr) = Iterators.map(x -> 3x, itr) + + NaiveGAflux.mapvalidation(::Val{:TestDummy1}, itr) = Iterators.map(x -> 5x, itr) + NaiveGAflux.mapvalidation(::Val{:TestDummy2}, itr) = Iterators.map(x -> 7x, itr) + + td1 = Val(:TestDummy1) + td2 = Val(:TestDummy2) + + @test collect(maptrain(IteratorMaps(td1), 1:3)) == 2:2:6 + @test collect(mapvalidation(IteratorMaps(td1), 1:3)) == 5:5:15 + + @test collect(maptrain(IteratorMaps(td1, td2), 1:3)) == 6:6:18 + @test collect(mapvalidation(IteratorMaps(td1, td2), 1:3)) == 35:35:105 + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 714b363d..c0b198d1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -33,6 +33,9 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend @info "Testing batch size utils" include("batchsize.jl") + @info "Testing iterator mapping" + include("iteratormaps.jl") + @info "Testing archspace" include("archspace.jl") @@ -70,6 +73,7 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend @info "Testing AutoFlux" include("app/autoflux.jl") + @info "Testing documentation" import Documenter Documenter.doctest(NaiveGAflux) end From d0d16025e9319907876efc1be59c0a46c61cdd2a Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 5 Jun 2022 11:30:08 +0200 Subject: [PATCH 11/36] Replace evolvemodel with MapCandidate --- src/NaiveGAflux.jl | 2 +- src/app/imageclassification/strategy.jl | 4 +- src/candidate.jl | 66 
+++++-------------------- src/evolve.jl | 2 +- test/candidate.jl | 6 +-- test/examples/quicktutorial.jl | 6 ++- 6 files changed, 24 insertions(+), 62 deletions(-) diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index 829d682f..1c4ed4cb 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -33,7 +33,7 @@ const modeldir = "models" export fitness, AbstractFitness, LogFitness, GpuFitness, AccuracyFitness, TrainThenFitness, TrainAccuracyFitness, MapFitness, EwmaFitness, TimeFitness, SizeFitness, AggFitness # Candidate -export evolvemodel, AbstractCandidate, CandidateModel, CandidateOptModel, CandidateDataIterMap, FittedCandidate, model, opt, lossfun +export AbstractCandidate, CandidateModel, CandidateOptModel, CandidateDataIterMap, FittedCandidate, MapCandidate, model, opt, lossfun # Evolution export evolve, AbstractEvolution, NoOpEvolution, AfterEvolution, EliteSelection, SusSelection, TournamentSelection, CombinedEvolution, EvolutionChain, PairCandidates, ShuffleCandidates, EvolveCandidates diff --git a/src/app/imageclassification/strategy.jl b/src/app/imageclassification/strategy.jl index e7c18a62..97468b7f 100644 --- a/src/app/imageclassification/strategy.jl +++ b/src/app/imageclassification/strategy.jl @@ -327,8 +327,8 @@ crossovermutate(;pcrossover=0.3, pmutate=0.9) = function(inshape) return EvolutionChain(crossoverevo, mutationevo) end -candidatemutation(p, inshape) = evolvemodel(MutationProbability(graphmutation(inshape), p), optmutation()) -candidatecrossover(p) = evolvemodel(MutationProbability(graphcrossover(), p), optcrossover()) +candidatemutation(p, inshape) = MapCandidate(MutationProbability(graphmutation(inshape), p), optmutation()) +candidatecrossover(p) = MapCandidate(MutationProbability(graphcrossover(), p), optcrossover()) function clear_redundant_vertices(pop) foreach(cand -> NaiveGAflux.model(check_apply, cand), pop) diff --git a/src/candidate.jl b/src/candidate.jl index db7951e0..f758a4fe 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -289,61 +289,14 @@ nparams(c::AbstractCandidate) = model(nparams, c) nparams(x) = mapreduce(prod ∘ size, +, params(x).order; init=0) """ - evolvemodel(m::AbstractMutation{CompGraph}, mapothers=deepcopy) - evolvemodel(m::AbstractMutation{CompGraph}, om::AbstractMutation{FluxOptimizer}, mapothers=deepcopy) + MapType{T, F1, F2} + MapType{T}(match::F1, nomatch::F2) -Return a function which maps a `AbstractCandidate c1` to a new `AbstractCandidate c2` where any `CompGraph`s `g` in `c1` will be m(deepcopy(g))` in `c2`. Same principle is applied to any optimisers if `om` is present. - -All other fields are mapped through the function `mapothers` (default `deepcopy`). - -Intended use is together with [`EvolveCandidates`](@ref). -""" -function evolvemodel(m::AbstractMutation{CompGraph}, mapothers=deepcopy) - function copymutate(g::CompGraph) - ng = deepcopy(g) - m(ng) - return ng - end - mapcandidate(copymutate, mapothers) -end -evolvemodel(m::AbstractMutation{CompGraph}, om::AbstractMutation{FluxOptimizer}, mapothers=deepcopy) = evolvemodel(m, optmap(om, mapothers)) +Callable struct which returns `match(x)` if `x isa T`, otherwise returns `nomatch(x)`. +Main purpose is to ensure that an `AbstractMutation{T}` or `AbstractCrossover{T}` is +applied to fields which are subtypes of `T` when creating new candidates. 
""" - evolvemodel(m::AbstractCrossover{CompGraph}, mapothers1=deepcopy, mapothers2=deepcopy) - evolvemodel(m::AbstractCrossover{CompGraph}, om::AbstractCrossover{FluxOptimizer}, mapothers1=deepcopy, mapothers2=deepcopy) - -Return a function which maps a tuple of `AbstractCandidate`s `(c1,c2)` to two new candidates `c1', c2'` where any `CompGraph`s `g1` and `g2` in `c1` and `c2` respectively will be `g1', g2' = m((deepcopy(g1), deepcopy(g2)))` in `c1'` and `c2'` respectively. Same principle applies to any optimisers if `om` is present. - -All other fields in `c1` will be mapped through the function `mapothers1` and likewise for `c2` and `mapothers2`. - -Intended use is together with [`PairCandidates`](@ref) and [`EvolveCandidates`](@ref). -""" -evolvemodel(m::AbstractCrossover{CompGraph}, mapothers1=deepcopy, mapothers2=deepcopy) = (c1, c2)::Tuple -> begin - # This allows FileCandidate to write the graph back to disk as we don't want to mutate the orignal candidate. - # Perhaps align single individual mutation to this pattern for consistency? - g1 = model(c1) - g2 = model(c2) - - release!(c1) - release!(c2) - - g1, g2 = m((deepcopy(g1), deepcopy(g2))) - - return mapcandidate(g -> g1, mapothers1)(c1), mapcandidate(g -> g2, mapothers2)(c2) -end - -evolvemodel(m::AbstractCrossover{CompGraph}, om::AbstractCrossover{FluxOptimizer}, mapothers1=deepcopy, mapothers2=deepcopy) = (c1,c2)::Tuple -> begin - o1 = opt(c1) - o2 = opt(c2) - - o1n, o2n = om((o1, o2)) - - return evolvemodel(m, optmap(o -> o1n, mapothers1), optmap(o -> o2n, mapothers2))((c1,c2)) -end - -_evolvemodel(ms::AbstractMutation...; mapothers=deepcopy) = MapCandidate(ms, mapothers) - - struct MapType{T, F1, F2} match::F1 nomatch::F2 @@ -375,7 +328,6 @@ function MapType(c::AbstractCrossover{FluxOptimizer}, (c1, c2), (nomatch1, nomat return MapType{FluxOptimizer}(Returns(o1n), nomatch1), MapType{FluxOptimizer}(Returns(o2n), nomatch2) end -# TODO: Needs a new name. MapCandidate? """ MapCandidate{T, F} MapCandidate(mutations, mapothers::F) @@ -445,7 +397,13 @@ end function (e::MapCandidate{<:NTuple{N, AbstractCrossover}, F})((c1,c2)) where {N,F} # Bleh, CGA to avoid a closure here mapc1, mapc2 = let c1 = c1, c2 = c2 - foldr(e.mutations; init=(e.mapothers, e.mapothers)) do match, nomatch + # Whats happening here is probably far from obvious, so here goes: + # MapType returns a Tuple of MapTypes when called with an AbstractCrossover + # This is because we create new models, optimisers etc from from both c1 + # and c2 simultaneously, but when creating candidates we create one new + # candidate from another new candidate. + foldr(e.mutations; init=(e.mapothers, e.mapothers)) do match, nomatch + # nomatch is always a Tuple while match is always an AbstractCrossover MapType(match, (c1,c2), nomatch) end end diff --git a/src/evolve.jl b/src/evolve.jl index 99b3c3de..918d9104 100644 --- a/src/evolve.jl +++ b/src/evolve.jl @@ -185,7 +185,7 @@ _evolve(e::ShuffleCandidates, pop) = shuffle(e.rng, pop) Applies `fun(c)` for each candidate `c` in a given population. -Useful with [`evolvemodel`](@ref). +Useful with [`MapCandidate`](@ref). 
""" struct EvolveCandidates{F} <: AbstractEvolution fun::F diff --git a/test/candidate.jl b/test/candidate.jl index 568e63e9..7d5c2985 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -117,7 +117,7 @@ end struct DummyFitness <: AbstractFitness end NaiveGAflux._fitness(::DummyFitness, f::AbstractCandidate) = 17 - using NaiveGAflux: FileCandidate, AbstractWrappingCandidate, FittedCandidate, trainiterator, validationiterator, _evolvemodel + using NaiveGAflux: FileCandidate, AbstractWrappingCandidate, FittedCandidate, trainiterator, validationiterator using Functors: fmap import MemPool @@ -151,7 +151,7 @@ end graphmutation = VertexMutation(MutationFilter(v -> name(v)=="hlayer", AddVertexMutation(ArchSpace(DenseSpace([1], [relu]))))) optmutation = OptimizerMutation((Momentum, Nesterov, ADAM)) - evofun = _evolvemodel(graphmutation, optmutation) + evofun = MapCandidate(graphmutation, optmutation) newcand = evofun(cand) @test NaiveGAflux.model(nvertices, newcand) == 4 @@ -182,7 +182,7 @@ end teststrat() = NaiveGAflux.default_crossoverswap_strategy(v -> 1) graphcrossover = VertexCrossover(CrossoverSwap(;pairgen = (v1,v2) -> (1,1), strategy=teststrat); pairgen = (v1,v2;ind1) -> ind1==1 ? (2,3) : nothing) optcrossover = OptimizerCrossover() - crossfun = _evolvemodel(graphcrossover, optcrossover) + crossfun = MapCandidate(graphcrossover, optcrossover) newcand1, newcand2 = crossfun((cand, newcand)) diff --git a/test/examples/quicktutorial.jl b/test/examples/quicktutorial.jl index 16a66cc0..ca12f8e5 100644 --- a/test/examples/quicktutorial.jl +++ b/test/examples/quicktutorial.jl @@ -57,11 +57,15 @@ addlayer = mp(AddVertexMutation(layerspace), 0.4) remlayer = mp(RemoveVertexMutation(), 0.4) mutation = MutationChain(changesize, remlayer, addlayer) + # Selection: The two best models are not changed, then create three new models by # applying the mutations above to three of the five models with higher fitness # giving higher probability of being selected. +# +# [`MapCandidate^](@ref) helps with the plumbing of creating new `CandidateModel`s +# where `mutation` is applied to create a new model. 
elites = EliteSelection(2) -mutate = SusSelection(3, EvolveCandidates(evolvemodel(mutation))) +mutate = SusSelection(3, EvolveCandidates(MapCandidate(mutation))) selection = CombinedEvolution(elites, mutate) # #### Step 4: Run evolution From add57809a4f13fbd1907e0a2a22f046c26abbf7e Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 5 Jun 2022 12:11:33 +0200 Subject: [PATCH 12/36] Split mutation and crossover to different files --- src/NaiveGAflux.jl | 7 +- src/{crossover.jl => crossover/graph.jl} | 53 -- src/crossover/optimizer.jl | 52 ++ src/mutation.jl | 719 ---------------------- src/mutation/generic.jl | 239 +++++++ src/mutation/graph.jl | 426 +++++++++++++ src/mutation/optimizer.jl | 52 ++ test/{crossover.jl => crossover/graph.jl} | 97 +-- test/crossover/optimizer.jl | 94 +++ test/mutation/generic.jl | 170 +++++ test/{mutation.jl => mutation/graph.jl} | 210 +------ test/mutation/optimizer.jl | 37 ++ test/runtests.jl | 7 +- 13 files changed, 1085 insertions(+), 1078 deletions(-) rename src/{crossover.jl => crossover/graph.jl} (91%) create mode 100644 src/crossover/optimizer.jl delete mode 100644 src/mutation.jl create mode 100644 src/mutation/generic.jl create mode 100644 src/mutation/graph.jl create mode 100644 src/mutation/optimizer.jl rename test/{crossover.jl => crossover/graph.jl} (89%) create mode 100644 test/crossover/optimizer.jl create mode 100644 test/mutation/generic.jl rename test/{mutation.jl => mutation/graph.jl} (71%) create mode 100644 test/mutation/optimizer.jl diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index 1c4ed4cb..f68f69a0 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -88,8 +88,11 @@ include("shape.jl") include("batchsize.jl") include("iteratormaps.jl") include("archspace.jl") -include("mutation.jl") -include("crossover.jl") +include("mutation/generic.jl") +include("mutation/graph.jl") +include("mutation/optimizer.jl") +include("crossover/graph.jl") +include("crossover/optimizer.jl") include("candidate.jl") include("fitness.jl") include("evolve.jl") diff --git a/src/crossover.jl b/src/crossover/graph.jl similarity index 91% rename from src/crossover.jl rename to src/crossover/graph.jl index 734241ee..0ab5eb09 100644 --- a/src/crossover.jl +++ b/src/crossover/graph.jl @@ -395,56 +395,3 @@ function separablefrom(v, forbidden, seen) return ok ? vcat(v, swappable) : swappable end - -""" - OptimizerCrossover{C} <: AbstractCrossover{FluxOptimizer} - OptimizerCrossover() - OptimizerCrossover(crossover) - -Apply crossover between optimizers. - -Type of crossover is determined by `crossover` (default `optimizerswap`) which when given a a tuple of two optimizers will return the result of the crossover operation as a tuple of optimizers. - -Designed to be composable with most utility `AbstractMutation`s as well as with itself. For instance, the following seemingly odd construct will swap components of a `Flux.Optimiser` with a probability of `0.2` per component: - -`OptimizerCrossover(MutationProbability(OptimizerCrossover(), 0.2))` - -Compare with the following which either swaps all components or none: - -`MutationProbability(OptimizerCrossover(), 0.2)` -""" -struct OptimizerCrossover{C} <: AbstractCrossover{FluxOptimizer} - crossover::C -end -OptimizerCrossover() = OptimizerCrossover(optimizerswap) - -""" - LearningRateCrossover() - -Return an `OptimizerCrossover` which will swap learning rates between optimizers but not change anything else. - -Does not do anything if any of the optimizers don't have a learning rate (e.g. WeightDecay). 
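# A small sketch of the learning rate swap described above (editorial illustration, not part of the patch):
oc = LearningRateCrossover()
o1, o2 = oc((Descent(0.1), Momentum(0.01)))
# o1 is expected to be a Descent with eta == 0.01 and o2 a Momentum with eta == 0.1;
# the optimiser types themselves are not exchanged.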
-""" -LearningRateCrossover() = OptimizerCrossover(learningrateswap) - -EitherIs{T} = Union{Tuple{T, Any}, Tuple{Any,T}} - -(oc::OptimizerCrossover)(os) = oc.crossover(os) -(oc::OptimizerCrossover)(os::EitherIs{ShieldedOpt}) = os -function (oc::OptimizerCrossover)((o1,o2)::EitherIs{Flux.Optimiser}) - os1,o1re = optiter(o1) - os2,o2re = optiter(o2) - res = oc.crossover.(zip(os1,os2)) - os1n = (t[1] for t in res) - os2n = (t[2] for t in res) - return o1re(os1n..., os1[length(os2)+1:end]...), o2re(os2n..., os2[length(os1)+1:end]...) -end - -optiter(o) = (o,), (os...) -> os[1] -optiter(o::Flux.Optimiser) = Tuple(o.os), (os...) -> Flux.Optimiser(os...) - -optimizerswap((o1, o2)::Tuple) = o2,o1 - -learningrateswap((o1,o2)::Tuple) = (@set o1.eta = learningrate(o2)) , (@set o2.eta = learningrate(o1)) -learningrateswap(os::EitherIs{ShieldedOpt}) = os -learningrateswap(os::EitherIs{WeightDecay}) = os diff --git a/src/crossover/optimizer.jl b/src/crossover/optimizer.jl new file mode 100644 index 00000000..8d082d90 --- /dev/null +++ b/src/crossover/optimizer.jl @@ -0,0 +1,52 @@ +""" + OptimizerCrossover{C} <: AbstractCrossover{FluxOptimizer} + OptimizerCrossover() + OptimizerCrossover(crossover) + +Apply crossover between optimizers. + +Type of crossover is determined by `crossover` (default `optimizerswap`) which when given a a tuple of two optimizers will return the result of the crossover operation as a tuple of optimizers. + +Designed to be composable with most utility `AbstractMutation`s as well as with itself. For instance, the following seemingly odd construct will swap components of a `Flux.Optimiser` with a probability of `0.2` per component: + +`OptimizerCrossover(MutationProbability(OptimizerCrossover(), 0.2))` + +Compare with the following which either swaps all components or none: + +`MutationProbability(OptimizerCrossover(), 0.2)` +""" +struct OptimizerCrossover{C} <: AbstractCrossover{FluxOptimizer} + crossover::C +end +OptimizerCrossover() = OptimizerCrossover(optimizerswap) + +""" + LearningRateCrossover() + +Return an `OptimizerCrossover` which will swap learning rates between optimizers but not change anything else. + +Does not do anything if any of the optimizers don't have a learning rate (e.g. WeightDecay). +""" +LearningRateCrossover() = OptimizerCrossover(learningrateswap) + +EitherIs{T} = Union{Tuple{T, Any}, Tuple{Any,T}} + +(oc::OptimizerCrossover)(os) = oc.crossover(os) +(oc::OptimizerCrossover)(os::EitherIs{ShieldedOpt}) = os +function (oc::OptimizerCrossover)((o1,o2)::EitherIs{Flux.Optimiser}) + os1,o1re = optiter(o1) + os2,o2re = optiter(o2) + res = oc.crossover.(zip(os1,os2)) + os1n = (t[1] for t in res) + os2n = (t[2] for t in res) + return o1re(os1n..., os1[length(os2)+1:end]...), o2re(os2n..., os2[length(os1)+1:end]...) +end + +optiter(o) = (o,), (os...) -> os[1] +optiter(o::Flux.Optimiser) = Tuple(o.os), (os...) -> Flux.Optimiser(os...) + +optimizerswap((o1, o2)::Tuple) = o2,o1 + +learningrateswap((o1,o2)::Tuple) = (@set o1.eta = learningrate(o2)) , (@set o2.eta = learningrate(o1)) +learningrateswap(os::EitherIs{ShieldedOpt}) = os +learningrateswap(os::EitherIs{WeightDecay}) = os diff --git a/src/mutation.jl b/src/mutation.jl deleted file mode 100644 index ed47e007..00000000 --- a/src/mutation.jl +++ /dev/null @@ -1,719 +0,0 @@ -""" - AbstractMutation{T} - -Abstract type defining a mutation operation on entities of type `T`. - -Implementations are expected to be callable using an entity of type `T` as only input. 
- -May also implement a callable accepting a `AbstractVector{<:T}` if it is useful to work on -all items to mutate at once. -""" -abstract type AbstractMutation{T} end - -(m::AbstractMutation{T})(es::AbstractVector{<:T}) where T = m.(es) - -""" - AbstractCrossover{T} - -Type alias for `AbstractMutation{Tuple{T,T}}` defining a crossover of two entities of type `T`. - -Implementations are expected to be callable using a tuple of two type `T` as only input. -""" -const AbstractCrossover{T} = AbstractMutation{Tuple{T,T}} - -""" - DecoratingMutation{T} - -Abstract type indicating that the type itself does not perform any mutation but wraps a type which might do. - -Must either implement callable method for `AbstractVector{<:T}` or accept keyword arguments `next=wrapped(m)` and -`noop=identity` along with a single `T`. -""" -abstract type DecoratingMutation{T} <: AbstractMutation{T} end -wrapped(m::DecoratingMutation) = m.m - -mutationleaves(m::DecoratingMutation) = (mutationleaves(wrapped(m))...,) -mutationleaves(tm::Tuple) = mapreduce(mutationleaves, (t1,t2) -> (t1...,t2...), tm) -mutationleaves(m) = tuple(m) - -# Apart from being overengineered this helps protecting against forgetting to handle arrays in a DecoratingMutation -# The next/noop happens to work with most existing DecoratingMutations, but it is a bit arbitrary and in some cases -# one must implement both the per-element and the vector of elements versions. -function (m::DecoratingMutation{T})(es::AbstractVector{<:T}) where T - cnt = Ref(1) - fornext = Int[] - next = function(e) - push!(fornext, cnt[]) - cnt[] += 1 - e - end - noop = function(e) - cnt[] += 1 - e - end - - allres = m.(es; next, noop) - mres = wrapped(m)(es[fornext]) - - # Mutation might accidentally widen the type compared to allres and then we can't insert mres into allres. - # Lets fix that if it happens - RT = typejoin(eltype(allres), eltype(mres)) - res = RT === eltype(allres) ? allres : convert(Vector{RT}, allres) - - res[fornext] = mres - return res -end - -""" - MutationProbability{T} <: DecoratingMutation{T} - MutationProbability(m::AbstractMutation{T}, p::Probability) - MutationProbability(m::AbstractMutation{T}, p::Number) - -Applies `m` with probability `p`. -""" -struct MutationProbability{T, P<:Probability} <: DecoratingMutation{T} - m::AbstractMutation{T} - p::P -end -MutationProbability(m::AbstractMutation{T}, p::Number) where T = MutationProbability(m, Probability(p)) -(m::MutationProbability{T})(e::T; next=m.m, noop=identity) where T = apply(() -> next(e), m.p, () -> noop(e)) - -""" - WeightedMutationProbability{T,F} <: DecoratingMutation{T} - WeightedMutationProbability(m::AbstractMutation::T, pfun::F) - -Applies `m` to an entity `e` with a probability `pfun(e)`. -""" -struct WeightedMutationProbability{T,F} <: DecoratingMutation{T} - m::AbstractMutation{T} - pfun::F -end -(m::WeightedMutationProbability{T})(e::T; next=m.m, noop=identity) where T = apply(() -> next(e), m.pfun(e), () -> noop(e)) - -""" - HighUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default; spread=0.5) - -Return a `WeightedMutationProbability` which applies `m` to vertices with an (approximately) average probability of `pbase` and where high `neuronutility` compared to other vertices in same graph means higher probability. - -Parameter `spread` can be used to control how much the difference in probability is between high and low utlity. High spread means high difference while low spread means low difference. 
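# A short sketch (editorial illustration, not part of the patch): favour output-size mutation of
# vertices whose neuron utility is high relative to the rest of the graph, with a base probability
# of roughly 0.1.
m = HighUtilityMutationProbability(NoutMutation(-0.1, 0.1), 0.1)
# m(v) then either mutates the vertex v or returns it unchanged, with a probability derived from its utility.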
-""" -HighUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default;spread=0.5) where T <: AbstractVertex = WeightedMutationProbability(m, weighted_neuronutility_high(pbase, rng,spread=spread)) - -""" - LowUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default; spread=2) - -Return a `WeightedMutationProbability` which applies `m` to vertices with an (approximately) average probability of `pbase` and where low `neuronutility` compared to other vertices in same graph means higher probability. - -Parameter `spread` can be used to control how much the difference in probability is between high and low utlity. High spread means high difference while low spread means low difference. -""" -LowUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default;spread=2) where T <: AbstractVertex = WeightedMutationProbability(m, weighted_neuronutility_low(pbase, rng, spread=spread)) - - -weighted_neuronutility_high(pbase, rng=rng_default; spread=0.5) = function(v::AbstractVertex) - ismissing(NaiveNASflux.neuronutility(v)) && return pbase - return Probability(fixnan(pbase ^ normexp(v, spread), pbase), rng) -end - -weighted_neuronutility_low(pbase, rng=rng_default;spread=2) = function(v::AbstractVertex) - ismissing(NaiveNASflux.neuronutility(v)) && return pbase - return Probability(fixnan(pbase ^ (1/normexp(v, 1/spread)), pbase), rng) -end - -fixnan(x, rep) = isnan(x) ? rep : clamp(x, 0.0, 1.0) - -# This is pretty hacky and arbitrary. Change to something better -function normexp(v::AbstractVertex, s) - allvertices = filter(allow_mutation, all_in_graph(v)) - allvalues = map(vi -> NaiveNASflux.neuronutility(vi), allvertices) - meanvalues = map(mean, skipmissing(allvalues)) - meanvalue = mean(meanvalues) - maxvalue = maximum(meanvalues) - utlity = mean(NaiveNASflux.neuronutility(v)) - # Basic idea: maxvalue - utlity means the (to be) exponent is <= 0 while the division seems to normalize so that - # average of pbase ^ normexp across allvertices is near pbase (no proof!). The factor 2 is just to prevent - # probability of vertex with maxvalue to be 1. - return (2maxvalue^s - utlity^s) / (2maxvalue^s - meanvalue^s) -end - - -""" - MutationChain{T} <: DecoratingMutation{T} - MutationChain(m::AbstractMutation{T}...) - -Chains multiple `AbstractMutation{T}`s after each other. - -Input entities will be mutated by the first `AbstractMutation{T}` in the chain and the output will be fed into the next `AbstractMutation{T}` in the chain and so on. The output from the last `AbstractMutation{T}` is returned. -""" -struct MutationChain{T} <: DecoratingMutation{T} - m::Tuple{Vararg{AbstractMutation{T}}} -end -MutationChain(m::AbstractMutation{T}...) where T = MutationChain(m) -# Identical, but can't use Union due to ambiguity -(m::MutationChain{T})(es::AbstractVector{<:T}) where T = foldl((ei, mi) -> mi(ei), m.m; init=es) -(m::MutationChain{T})(e::T) where T = foldl((ei, mi) -> mi(ei), m.m; init=e) - -""" - RecordMutation{T} <: DecoratingMutation{T} - RecordMutation(m::AbstractMutation{T}) - -Records all mutated entities. - -Intended use case is to be able to do parameter selection on mutated vertices. 
-""" -struct RecordMutation{T} <: DecoratingMutation{T} - m::AbstractMutation{T} - mutated::Vector{T} -end -RecordMutation(m::AbstractMutation{T}) where T = RecordMutation(m, T[]) -function (m::RecordMutation{T})(e::T; next=m.m, noop=identity) where T - em = next(e) - push!(m.mutated, em) - return em -end -function fetchmutated!(m::RecordMutation) - mutated = copy(m.mutated) - deleteat!(m.mutated, eachindex(m.mutated)) - return mutated -end - -""" - LogMutation{T} < :DecoratingMutation{T} - LogMutation(strfun, m::AbstractMutation{T}; level = Logging.Info, nextlogfun=e -> PrefixLogger(" ")) - LogMutation(strfun, level::LogLevel, nextlogfun, m::AbstractMutation{T}) - -Logs all mutation operations. - -Argument `strfun` maps the mutated entity to the logged string. - -Calling `nextlogfun(e)` where `e` is the entity to mutate produces an `AbstractLogger` which will be used when applying `m(e)`. - -By default, this is used to add a level of indentation to subsequent logging calls which makes logs of hierarchical mutations (e.g. mutate a CompGraph by applying mutations to some of its vertices) easier to read. Set `nextlogfun = e -> current_logger()` to remove this behaviour. -""" -struct LogMutation{T,F,L<:LogLevel,LF} <: DecoratingMutation{T} - strfun::F - level::L - nextlogfun::LF - m::AbstractMutation{T} -end -LogMutation(strfun, m::AbstractMutation{T}; level = Logging.Info, nextlogfun=e -> PrefixLogger(" ")) where T = LogMutation(strfun, level, nextlogfun, m) -function (m::LogMutation{T})(e::T; next=m.m, noop=identity) where T - @logmsg m.level m.strfun(e) - return with_logger(() -> next(e), m.nextlogfun(e)) -end - -""" - MutationFilter{T} <: DecoratingMutation{T} - MutationFilter(predicate, m) - -Applies mutation `m` only for entities `e` for which `predicate(e)` returns true. -""" -struct MutationFilter{T,P} <: DecoratingMutation{T} - predicate::P - m::AbstractMutation{T} -end -function (m::MutationFilter{T})(e::T; next=m.m, noop=identity) where T - m.predicate(e) && return next(e) - return noop(e) -end - - -""" - VertexMutation <: DecoratingMutation{CompGraph} - VertexMutation(m::AbstractMutation{AbstractVertex}, s::AbstractVertexSelection) - VertexMutation(m::AbstractMutation{AbstractVertex}) - -Applies a wrapped `AbstractMutation{AbstractVertex}` to each selected vertex in a `CompGraph`. - -Vertices to select is determined by the configured `AbstractVertexSelection`. -""" -struct VertexMutation{S<:AbstractVertexSelection} <: DecoratingMutation{CompGraph} - m::AbstractMutation{AbstractVertex} - s::S -end -VertexMutation(m::AbstractMutation{AbstractVertex}) = VertexMutation(m, FilterMutationAllowed()) -function (m::VertexMutation)(g::CompGraph) - m.m(select(m.s, g, m)) - return g -end - -""" - NoutMutation <:AbstractMutation{AbstractVertex} - NoutMutation(l1::Real,l2::Real, rng::AbstractRNG) - NoutMutation(limit, rng::AbstractRNG=rng_default) - NoutMutation(l1,l2) - -Mutate the out size of a vertex or vector of vertices. - -Size is changed by `x * nout(v)` rounded away from from zero where `x` is drawn from `U(minrel, maxrel)` where -`minrel` and `maxrel` are `l1` and `l2` if `l1 < l2` and `l2` and `l1` otherwise. -""" -struct NoutMutation{R<:Real, RNG<:AbstractRNG} <:AbstractMutation{AbstractVertex} - minrel::R - maxrel::R - rng::RNG - function NoutMutation(l1::R1, l2::R2, rng::RNG) where {R1, R2, RNG} - R = promote_type(R1, R2) - return l1 < l2 ? 
new{R, RNG}(promote(l1, l2)..., rng) : new{R, RNG}(promote(l2, l1)..., rng) - end -end -NoutMutation(limit, rng::AbstractRNG=rng_default) = NoutMutation(0, limit, rng) -NoutMutation(l1,l2) = NoutMutation(l1,l2, rng_default) -(m::NoutMutation)(v::AbstractVertex) = first(m([v])) -function (m::NoutMutation)(vs::AbstractVector{<:AbstractVertex}) - - Δs = Dict{AbstractVertex, Int}() - shift = m.minrel - scale = m.maxrel - m.minrel - - for v in vs - terminputs = findterminating(v, inputs) - - # We are basically just searching for Immutable vertices here, allow_mutation(trait(v)) happens to do just that - any(tv -> allow_mutation(trait(tv)), terminputs) || continue - - Δfloat = rand(m.rng) * scale + shift - - Δ = ceil(Int, abs(Δfloat) * nout(v)) * sign(Δfloat) - minsize = minimum(nout.(terminputs)) - # Or else we might increase the size despite Δ being negative which would be surprising to a user who has specified - # strictly negative size changes - minsize + Δ <= 0 && continue - - Δs[v] = Δ - end - - if !isempty(Δs) - failmsg = (args...) -> "Could not change nout of $(join(NaiveNASlib.nameorrepr.(keys(Δs)), ", ", " and ")) by $(join(values(Δs), ", ", " and ")). No change!" - - strategy = TimeOutAction(;base=ΔNoutRelaxed(Δs), fallback=LogΔSizeExec(failmsg, Logging.Warn, ΔSizeFailNoOp())) - - Δsize!(strategy) - end - return vs -end - -""" - AddVertexMutation <:AbstractMutation{AbstractVertex} - AddVertexMutation(s::AbstractArchSpace, outselect::Function, WeightInit::AbstractWeightInit, rng::AbstractRNG) - AddVertexMutation(s, outselect::Function=identity) - AddVertexMutation(s, rng::AbstractRNG) - AddVertexMutation(s, wi::AbstractWeightInit) - -Insert a vertex from the wrapped `AbstractArchSpace` `s` after a given vertex `v`. - -The function `outselect` takes an `AbstractVector{AbstractVertex}` representing the output of `v` and returns an `AbstractVector{AbstractVertex}` which shall be reconnected to the vertex `v'` returned by `s`. Defaults to `identity` meaning all outputs of `v` are reconnected to `v'`. -""" -struct AddVertexMutation{S<:AbstractArchSpace, F, WI<:AbstractWeightInit, RNG<:AbstractRNG} <:AbstractMutation{AbstractVertex} - s::S - outselect::F - weightinit::WI - rng::RNG -end -AddVertexMutation(s, outselect::Function=identity) = AddVertexMutation(s, outselect, IdentityWeightInit(), rng_default) -AddVertexMutation(s, rng::AbstractRNG) = AddVertexMutation(s, identity, IdentityWeightInit(), rng) -AddVertexMutation(s, wi::AbstractWeightInit) = AddVertexMutation(s, identity, wi, rng_default) - -function (m::AddVertexMutation)(v::AbstractVertex) - insert!(v, vi -> m.s(name(vi), vi, m.rng, outsize=nout(vi), wi=m.weightinit), m.outselect) - return v -end - -""" - RemoveVertexMutation <:AbstractMutation{AbstractVertex} - RemoveVertexMutation(s::RemoveStrategy) - RemoveVertexMutation() - -Remove the given vertex `v` using the configured `RemoveStrategy`. - -Default size align strategy is `IncreaseSmaller -> DecreaseBigger -> AlignSizeBoth -> FailAlignSizeWarn -> FailAlignSizeRevert`. - -Default reconnect strategy is `ConnectAll`. - -Note: High likelyhood of large accuracy degradation after applying this mutation. -""" -struct RemoveVertexMutation{S<:RemoveStrategy} <:AbstractMutation{AbstractVertex} - s::S -end -function RemoveVertexMutation() - alignstrat = IncreaseSmaller(fallback=DecreaseBigger(fallback=AlignSizeBoth(fallback=FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! 
Could not align sizes of neighbours!")))) - return RemoveVertexMutation(RemoveStrategy(CheckAligned(CheckNoSizeCycle(alignstrat, FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! Size cycle detected!"))))) -end - -function (m::RemoveVertexMutation)(v::AbstractVertex) - remove!(v, m.s) - return v -end - -default_neuronselect(args...) = NaiveNASlib.defaultutility(args...) - -""" - AddEdgeMutation <: AbstractMutation{AbstractVertex} - AddEdgeMutation(p; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) - AddEdgeMutation(p::Probability; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) - -Add an edge from a vertex `vi` to another vertex `vo` randomly selected from `vs = filtfun(vi)`. - -Higher values of `p` will give more preference to earlier vertices of `vs`. - -If `vo` is not capable of having multiple inputs (determined by `singleinput(v) == true`), `vm = mergefun(voi)` where `voi` is a randomly selected input to `vo` will be used instead of `vo` and `vo` will be added as the output of `vm`. - -When selecting neurons/outputs after any eventual size change the output of `utilityfun(v)` will be used to determine the utlity of each output in vertex `v`. Note that `length(utilityfun(v)) == nout(v)` must hold. - -Note: High likelyhood of large accuracy degradation after applying this mutation. -""" -struct AddEdgeMutation{F1, F2, F3, P<:Probability, RNG} <: AbstractMutation{AbstractVertex} - mergefun::F1 - filtfun::F2 - utilityfun::F3 - p::P - rng::RNG -end -AddEdgeMutation(p; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) = AddEdgeMutation(Probability(p, rng), rng=rng, mergefun=mergefun, filtfun=filtfun, utilityfun=utilityfun) -AddEdgeMutation(p::Probability; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) = AddEdgeMutation(mergefun, filtfun, utilityfun, p, rng) - -default_mergefun(pconc = 0.5; rng=rng_default, traitfun = MutationShield ∘ RemoveIfSingleInput ∘ validated() ∘ default_logging(), layerfun = ActivationContribution) = function(vin) - if rand(rng) > pconc - return invariantvertex(layerfun(+), vin, traitdecoration=traitfun ∘ named(name(vin) * ".add")) - end - return concat(vin, traitfun = traitfun ∘ named(name(vin) * ".cat"), layerfun=layerfun) -end - -function no_shapechange(vi) - # all_in_graph is not sorted, and we want some kind of topoligical order here so that earlier indices are closer to vi - allsorted = mapreduce(ancestors, vcat, filter(v -> isempty(outputs(v)), all_in_graph(vi))) |> unique - - # Vertices which have the same input as vi and are singleinput - # Reason is that this will cause a new vertex to be added between the target output vertex vo - # and the input vertex to vi (vii) and this is detected as a size cycle which causes - # try_add_edge to fail. 
- inouts = filter(singleinput, mapreduce(outputs, vcat, inputs(vi); init=[])) - # All vertices which are after vi in the topology - vsafter = setdiff(allsorted, ancestors(vi), outputs(vi), inouts) - - vitrace = shapetrace(vi) - viorder = allΔshapetypes(vitrace) - viΔshape = squashshapes(vitrace; order=viorder) - - return filter(vsafter) do vafter - all(inputs(vafter)) do v - t = shapetrace(v) - vΔshape = squashshapes(t; order=union(viorder, allΔshapetypes(t))) - return viΔshape == vΔshape - end - end -end - -function (m::AddEdgeMutation)(vi::AbstractVertex) - # All vertices for which it is allowed to add vi as an input - allverts = filter(allow_mutation, m.filtfun(vi)) - isempty(allverts) && return vi - - # Higher probability to select a vertex close to v is desired behaviour - # One line less than a for loop => FP wins!! - selfun(::Nothing, vc) = apply(m.p) ? vc : nothing - selfun(vs, vd) = vs - vo = foldl(selfun, allverts, init=nothing) - vo = isnothing(vo) ? rand(m.rng, allverts) : vo - - try_add_edge(vi, vo, m.mergefun, m.utilityfun) - return vi -end - -function try_add_edge(vi, vo, mergefun, utilityfun=default_neuronselect) - - # Need to add a vertex which can handle multiple inputs if vo is single input only - # For cleaning up added vertex if the whole operation fails - cleanup_failed = () -> nothing - if singleinput(vo) - voi = inputs(vo)[1] - # If the input to vo is capable of multi input we don't need to create a new vertex - # We must also check that this input does not happen to be an input to vi as this would create a cycle in the graph - if singleinput(voi) || voi in ancestors(vi) - vm = mergefun(voi) - # Insert vm between voi and vo, i.e voi -> vo turns into voi -> vm -> vo - # vs -> [vo] means only add the new vertex between voi and vo as voi could have other outputs - insert!(voi, vv -> vm, vs -> [vo]) - cleanup_failed = function() - length(inputs(vm)) > 1 && return - remove!(vm, RemoveStrategy(NoSizeChange())) - end - vo = vm # vm is the one we shall add an edge to - @debug "Create new vertex for merging $(name(vo))" - else - vo = voi - end - end - # This is mainly because FailAlignSizeRevert does not work when the same vertex is input more than once, but it also seems kinda redundant. - vi in inputs(vo) && return - @debug "Create edge between $(name(vi)) and $(name(vo))" - create_edge!(vi, vo, strategy = create_edge_strat(vo, utilityfun)) - cleanup_failed() -end -# Need to override this one for strange types e.g. layers which support exactly 2 inputs or something. -singleinput(v) = isempty(inputs(v)) || length(inputs(v)) == 1 - -create_edge_strat(v::AbstractVertex, utilityfun) = create_edge_strat(trait(v), utilityfun) -create_edge_strat(d::DecoratingTrait, utilityfun) = create_edge_strat(base(d), utilityfun) -function create_edge_strat(::SizeInvariant, utilityfun) - warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") - alignstrat = AlignSizeBoth(;mapstrat=WithUtilityFun(utilityfun), fallback = warnfailalign) - # Tricky failure case: It is possible that CheckCreateEdgeNoSizeCycle does not detect any size cycle until after the edge has been created? - sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! 
Size cycle detected!") - - return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn) -end -function create_edge_strat(::SizeStack, utilityfun) - warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") - alignstrat = PostAlign(TruncateInIndsToValid(WithUtilityFun(utilityfun, AlignNinToNout(;fallback=ΔSizeFailNoOp()))), fallback=warnfailalign) - - sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! Size cycle detected!") - return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn) -end - -""" - RemoveEdgeMutation <: AbstractMutation{AbstractVertex} - RemoveEdgeMutation(;utilityfun=default_neuronselect, rng=rng_default) - -Remove an edge from a vertex `vi` to another vertex `vo` randomly selected from `outputs(vi)`. - -Vertex `vi` must have more than one output and vertex `vo` must have more than one output for the edge to be removed. Otherwise no change is made. - -If there are multiple edges between `vi` and `vo` no change will be made due to NaiveNASlib not being able to revert a failed operation in this case.. - -When selecting neurons/outputs after any eventual size change the output of `utilityfun(v)` will be used to determine the utlity of each output in vertex `v`. Note that `length(utilityfun(v)) == nout(v)` must hold. - -Note: High likelyhood of large accuracy degradation after applying this mutation. -""" -struct RemoveEdgeMutation{F, RNG<:AbstractRNG} <: AbstractMutation{AbstractVertex} - utilityfun::F - rng::RNG -end -RemoveEdgeMutation(;utilityfun=default_neuronselect, rng=rng_default) = RemoveEdgeMutation(utilityfun, rng) - -function (m::RemoveEdgeMutation)(vi::AbstractVertex) - length(outputs(vi)) < 2 && return vi - - allverts = filter(vo -> length(inputs(vo)) > 1, outputs(vi)) - - isempty(allverts) && return vi - - vo = rand(m.rng, allverts) - sum(inputs(vo) .== vi) > 1 && return vi# Not implemented in NaiveNASlib - - @debug "Remove edge between $(name(vi)) and $(name(vo))" - remove_edge!(vi, vo, strategy=remove_edge_strat(vo, m.utilityfun)) - return vi -end - -remove_edge_strat(v::AbstractVertex, utilityfun) = remove_edge_strat(trait(v), utilityfun) -remove_edge_strat(d::DecoratingTrait, utilityfun) = remove_edge_strat(base(d), utilityfun) -remove_edge_strat(::SizeInvariant, utilityfun) = NoSizeChange() -remove_edge_strat(t::SizeStack, utilityfun) = create_edge_strat(t, utilityfun) - -""" - KernelSizeMutation{N} <: AbstractMutation{AbstractVertex} - KernelSizeMutation(Δsizespace::AbstractParSpace{N, Int}; maxsize, pad, rng) - KernelSizeMutation2D(absΔ::Integer;maxsize, pad, rng) - KernelSizeMutation(absΔ::Integer...;maxsize, pad, rng) - -Mutate the size of filter kernels of convolutional layers. - -Note: High likelyhood of large accuracy degradation after applying this mutation. - -`KernelSizeMutation2D` is a convenience constructor for `KernelSizeMutation(absΔ, absΔ;...)`. 
-""" -struct KernelSizeMutation{N,F,P} <: AbstractMutation{AbstractVertex} - Δsizespace::AbstractParSpace{N, Int} - maxsize::F - pad::P - rng::AbstractRNG -end -KernelSizeMutation(Δsizespace::AbstractParSpace{N, Int}; maxsize = v -> ntuple(i->Inf,N), pad=SamePad(), rng=rng_default) where N = KernelSizeMutation(Δsizespace, maxsize, pad, rng) -KernelSizeMutation2D(absΔ::Integer;maxsize = v -> (Inf,Inf), pad=SamePad(), rng=rng_default) = KernelSizeMutation(absΔ, absΔ, maxsize = maxsize, pad=pad, rng=rng) -KernelSizeMutation(absΔ::Integer...;maxsize = v -> ntuple(i->Inf, length(absΔ)), pad=SamePad(), rng=rng_default) = KernelSizeMutation(ParSpace(UnitRange.(.-absΔ, absΔ));maxsize = maxsize, pad=pad, rng=rng) - -function (m::KernelSizeMutation{N})(v::AbstractVertex) where N - layertype(v) isa FluxConvolutional{N} || return - l = layer(v) - - currsize = size(NaiveNASflux.weights(l))[1:N] - Δsize = Int.(clamp.(m.Δsizespace(m.rng), 1 .- currsize, m.maxsize(v) .- currsize)) # ensure new size is > 0 and < maxsize - # This will eventually boil down to Setfield doing its thing, and that won't be using any convenience constructors - pad = Flux.calc_padding(typeof(l), m.pad, currsize .+ Δsize, dilation(l), stride(l)) - KernelSizeAligned(Δsize, pad)(v) - return v -end -dilation(l) = l.dilation -stride(l) = l.stride - -""" - ActivationFunctionMutation{T,R} <: AbstractMutation{AbstractVertex} where {T <: AbstractParSpace{1}, R <: AbstractRNG} - ActivationFunctionMutation(actspace::AbstractParSpace{1}, rng::AbstractRNG) - ActivationFunctionMutation(acts...;rng=rng_default) - ActivationFunctionMutation(acts::AbstractVector;rng=rng_default) - -Mutate the activation function of layers which have an activation function. - -Note: High likelyhood of large accuracy degradation after applying this mutation. -""" -struct ActivationFunctionMutation{T,RNG} <: AbstractMutation{AbstractVertex} where {T <: AbstractParSpace{1}, R <: AbstractRNG} - actspace::T - rng::RNG -end -ActivationFunctionMutation(acts...;rng=rng_default) = ActivationFunctionMutation(collect(acts), rng=rng) -ActivationFunctionMutation(acts::AbstractVector;rng=rng_default) = ActivationFunctionMutation(ParSpace(acts), rng) - -function (m::ActivationFunctionMutation)(v::AbstractVertex) - m(layertype(v), v) - return v -end -function (m::ActivationFunctionMutation)(t, v) end -(m::ActivationFunctionMutation)(::Union{FluxDense, FluxConvolutional}, v) = NaiveNASflux.setlayer!(v, (σ = m.actspace(m.rng),)) -(m::ActivationFunctionMutation)(::FluxParNorm, v) = NaiveNASflux.setlayer!(v, (λ = m.actspace(m.rng),)) -function (m::ActivationFunctionMutation)(::FluxRnn, v) - newcell = setproperties(layer(v).cell, (σ = m.actspace(m.rng),)) - NaiveNASflux.setlayer!(v, (cell = newcell,)) -end - - -""" - PostMutation{T} <: DecoratingMutation{T} - PostMutation(actions, m::AbstractMutation{T}) - PostMutation(m::AbstractMutation{T}, actions...) - -Performs a set of actions after a wrapped `AbstractMutation` is applied. - -Actions will be invoked with arguments (m::PostMutation{T}, e::T) where m is the enclosing `PostMutation` and `e` is the mutated entity of type `T`. -""" -struct PostMutation{T,A} <: DecoratingMutation{T} - actions::A - m::AbstractMutation{T} -end -PostMutation(m::AbstractMutation{T}, actions...) 
where T = PostMutation(actions, m) -PostMutation(action::Function, m::AbstractMutation{T}) where T = PostMutation(m, action) -function (m::PostMutation{T})(e::T; next=m.m, noop=identity) where T - eout = next(e) - foreach(a -> a(m, eout), m.actions) - return eout -end - -""" - RemoveZeroNout() - RemoveZeroNout(fallback) - -Search for vertices with zero output size and remove them and all of their input vertices if possible to do so witout removing an input or output vertex. - -Removal is only possible if a vertex is inside a parallel path which will later be concatenated. -""" -struct RemoveZeroNout - fallback -end -RemoveZeroNout() = RemoveZeroNout(IncreaseZeroNout()) -struct IncreaseZeroNout end - -(r::RemoveZeroNout)(m, e) = r(e) -function (r::RemoveZeroNout)(g::CompGraph) - topoligical_order = vertices(g) - for v in topoligical_order - nout(v) == 0 || continue - - # Beware! This turned out to be a lot harder than I first thought. - # I'm not sure the algorithm works (or realizes that it won't work) for all possible cases. - - # This obviously only works if v is inside a parallel path which will later be concatenated - - # To make sure it is, we look ahead in forward and backward direction. - # If we don't see 1) an input vertex 2) a vertex without outputs (i.e an output vertex) we are good to go - fseen, fpoints = findforkpoint(v, topoligical_order, inputs, outputs) - isempty(fpoints) && continue - - bseen, bpoints = findforkpoint(v, topoligical_order, outputs, inputs) - isempty(bpoints) && continue - - # Ok, fpoints are all vertices where forks with v in them join and bpoints are all vertices where forks with v in them begin - - # If we start removing all seen vertices which are input to an fpoint until we hit a bpoint we should have removed the fork with v in it, right? - seen = union(fseen, bseen) - to_rm = intersect(vcat(inputs.(fpoints)...), seen) - foreach(fpoint -> remove_all_inputs(fpoint, seen, bpoints), to_rm) - end -end - -function findforkpoint(v::AbstractVertex, topoligical_order, f1=inputs, f2=outputs, seen = vcat(v, f1(v)), points = AbstractVertex[]) - - if all(x -> x in seen, f1(v)) - - # Check if we came across a vertex which previously was thought to be a forkpoint and remove it if so - if v in points - deleteat!(points, indexin([v], points)) - end - # Always visit in reverse topoligical order to mitigate chasing down paths just because we haven't explored them yet - nextverts = reverse(topoligical_order[unique(indexin(f2(v), topoligical_order))]) - push!(seen, filter(v2 -> !in(v2, seen), nextverts)...) - foreach(v2 -> findforkpoint(v2, topoligical_order, f1, f2, seen, points), nextverts) - elseif !(v in points) - push!(points, v) - end - - return seen, points -end - -function remove_all_inputs(v, seen, stop) - v in stop && return - foreach(vrm -> remove_all_inputs(vrm, seen, stop), intersect(inputs(v), seen)) - remove!(v, RemoveStrategy(ConnectNone(), NoSizeChange())) -end - - -""" - struct OptimizerMutation{F} <: AbstractMutation{FluxOptimizer} - OptimizerMutation(optfun) - OptimizerMutation(os::Union{Tuple, <:AbstractArray}) - -Mutatates optimizers not wrapped in `ShieldedOpt` through `optfun`. - -Invoked recursively for `Flux.Optimiser`s. 
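# A small sketch (editorial illustration, not part of the patch): replace the optimiser type with one
# drawn at random from the given set while keeping the current learning rate.
m = OptimizerMutation((Descent, Momentum, ADAM))
# m(Descent(0.05)) is expected to return e.g. Momentum(0.05); optimisers wrapped in ShieldedOpt are left untouched.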
-""" -struct OptimizerMutation{F} <: AbstractMutation{FluxOptimizer} - optfun::F -end -OptimizerMutation(os::Union{Tuple, <:AbstractArray}, rng=rng_default) = OptimizerMutation(o -> rand(rng, os)(learningrate(o))) - -""" - LearningRateMutation(rng=rng_default) - -Return an `OptimizerMutation` which mutates the learning rate of optimizers. -""" -LearningRateMutation(rng=rng_default) = OptimizerMutation(o -> nudgelr(o, rng)) - -(m::OptimizerMutation)(opt::Flux.Optimiser) = Flux.Optimiser(m.(opt.os)) -(m::OptimizerMutation)(o::ShieldedOpt) = o; -(m::OptimizerMutation)(o::FluxOptimizer) = m.optfun(o) - - -nudgelr(o, rng=rng_default) = sameopt(o, nudgelr(learningrate(o), rng)) -nudgelr(lr::Number, rng=rng_default) = clamp(lr + (rand(rng) - 0.5) * lr * 0.3, 1e-6, 1.0) - -learningrate(o::Flux.Optimiser) = prod(learningrate.(o.os)) -learningrate(o::ShieldedOpt) = learningrate(o.opt) -learningrate(o) = o.eta - -newlr(o, lrf = nudgelr) = sameopt(o, lrf(learningrate(o))) -sameopt(o, lr) = @set o.eta = lr - -""" - AddOptimizerMutation{F} <: AbstractMutation{FluxOptimizer} - -Adds optimizer generated by `optgen(os)` to the set of optimizers where `os` is the existing set. - -An attempt to merge optimizers of the same type is made using `mergeopts`. -""" -struct AddOptimizerMutation{F} <: AbstractMutation{FluxOptimizer} - optgen::F -end -(m::AddOptimizerMutation)(o::ShieldedOpt) = o; -(m::AddOptimizerMutation)(o::FluxOptimizer) = m(Flux.Optimiser([o])) -function (m::AddOptimizerMutation)(opt::Flux.Optimiser) - newopt = m.optgen(opt) - return Flux.Optimiser(mergeopts(typeof(newopt), newopt, opt.os...)) -end diff --git a/src/mutation/generic.jl b/src/mutation/generic.jl new file mode 100644 index 00000000..40353b82 --- /dev/null +++ b/src/mutation/generic.jl @@ -0,0 +1,239 @@ +""" + AbstractMutation{T} + +Abstract type defining a mutation operation on entities of type `T`. + +Implementations are expected to be callable using an entity of type `T` as only input. + +May also implement a callable accepting a `AbstractVector{<:T}` if it is useful to work on +all items to mutate at once. +""" +abstract type AbstractMutation{T} end + +(m::AbstractMutation{T})(es::AbstractVector{<:T}) where T = m.(es) + +""" + AbstractCrossover{T} + +Type alias for `AbstractMutation{Tuple{T,T}}` defining a crossover of two entities of type `T`. + +Implementations are expected to be callable using a tuple of two type `T` as only input. +""" +const AbstractCrossover{T} = AbstractMutation{Tuple{T,T}} + +""" + DecoratingMutation{T} + +Abstract type indicating that the type itself does not perform any mutation but wraps a type which might do. + +Must either implement callable method for `AbstractVector{<:T}` or accept keyword arguments `next=wrapped(m)` and +`noop=identity` along with a single `T`. +""" +abstract type DecoratingMutation{T} <: AbstractMutation{T} end +wrapped(m::DecoratingMutation) = m.m + +mutationleaves(m::DecoratingMutation) = (mutationleaves(wrapped(m))...,) +mutationleaves(tm::Tuple) = mapreduce(mutationleaves, (t1,t2) -> (t1...,t2...), tm) +mutationleaves(m) = tuple(m) + +# Apart from being overengineered this helps protecting against forgetting to handle arrays in a DecoratingMutation +# The next/noop happens to work with most existing DecoratingMutations, but it is a bit arbitrary and in some cases +# one must implement both the per-element and the vector of elements versions. 
+function (m::DecoratingMutation{T})(es::AbstractVector{<:T}) where T + cnt = Ref(1) + fornext = Int[] + next = function(e) + push!(fornext, cnt[]) + cnt[] += 1 + e + end + noop = function(e) + cnt[] += 1 + e + end + + allres = m.(es; next, noop) + mres = wrapped(m)(es[fornext]) + + # Mutation might accidentally widen the type compared to allres and then we can't insert mres into allres. + # Lets fix that if it happens + RT = typejoin(eltype(allres), eltype(mres)) + res = RT === eltype(allres) ? allres : convert(Vector{RT}, allres) + + res[fornext] = mres + return res +end + +""" + MutationProbability{T} <: DecoratingMutation{T} + MutationProbability(m::AbstractMutation{T}, p::Probability) + MutationProbability(m::AbstractMutation{T}, p::Number) + +Applies `m` with probability `p`. +""" +struct MutationProbability{T, P<:Probability} <: DecoratingMutation{T} + m::AbstractMutation{T} + p::P +end +MutationProbability(m::AbstractMutation{T}, p::Number) where T = MutationProbability(m, Probability(p)) +(m::MutationProbability{T})(e::T; next=m.m, noop=identity) where T = apply(() -> next(e), m.p, () -> noop(e)) + +""" + WeightedMutationProbability{T,F} <: DecoratingMutation{T} + WeightedMutationProbability(m::AbstractMutation::T, pfun::F) + +Applies `m` to an entity `e` with a probability `pfun(e)`. +""" +struct WeightedMutationProbability{T,F} <: DecoratingMutation{T} + m::AbstractMutation{T} + pfun::F +end +(m::WeightedMutationProbability{T})(e::T; next=m.m, noop=identity) where T = apply(() -> next(e), m.pfun(e), () -> noop(e)) + +""" + HighUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default; spread=0.5) + +Return a `WeightedMutationProbability` which applies `m` to vertices with an (approximately) average probability of `pbase` and where high `neuronutility` compared to other vertices in same graph means higher probability. + +Parameter `spread` can be used to control how much the difference in probability is between high and low utlity. High spread means high difference while low spread means low difference. +""" +HighUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default;spread=0.5) where T <: AbstractVertex = WeightedMutationProbability(m, weighted_neuronutility_high(pbase, rng,spread=spread)) + +""" + LowUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default; spread=2) + +Return a `WeightedMutationProbability` which applies `m` to vertices with an (approximately) average probability of `pbase` and where low `neuronutility` compared to other vertices in same graph means higher probability. + +Parameter `spread` can be used to control how much the difference in probability is between high and low utlity. High spread means high difference while low spread means low difference. +""" +LowUtilityMutationProbability(m::AbstractMutation{T}, pbase::Real, rng=rng_default;spread=2) where T <: AbstractVertex = WeightedMutationProbability(m, weighted_neuronutility_low(pbase, rng, spread=spread)) + + +weighted_neuronutility_high(pbase, rng=rng_default; spread=0.5) = function(v::AbstractVertex) + ismissing(NaiveNASflux.neuronutility(v)) && return pbase + return Probability(fixnan(pbase ^ normexp(v, spread), pbase), rng) +end + +weighted_neuronutility_low(pbase, rng=rng_default;spread=2) = function(v::AbstractVertex) + ismissing(NaiveNASflux.neuronutility(v)) && return pbase + return Probability(fixnan(pbase ^ (1/normexp(v, 1/spread)), pbase), rng) +end + +fixnan(x, rep) = isnan(x) ? 
rep : clamp(x, 0.0, 1.0) + +# This is pretty hacky and arbitrary. Change to something better +function normexp(v::AbstractVertex, s) + allvertices = filter(allow_mutation, all_in_graph(v)) + allvalues = map(vi -> NaiveNASflux.neuronutility(vi), allvertices) + meanvalues = map(mean, skipmissing(allvalues)) + meanvalue = mean(meanvalues) + maxvalue = maximum(meanvalues) + utlity = mean(NaiveNASflux.neuronutility(v)) + # Basic idea: maxvalue - utlity means the (to be) exponent is <= 0 while the division seems to normalize so that + # average of pbase ^ normexp across allvertices is near pbase (no proof!). The factor 2 is just to prevent + # probability of vertex with maxvalue to be 1. + return (2maxvalue^s - utlity^s) / (2maxvalue^s - meanvalue^s) +end + + +""" + MutationChain{T} <: DecoratingMutation{T} + MutationChain(m::AbstractMutation{T}...) + +Chains multiple `AbstractMutation{T}`s after each other. + +Input entities will be mutated by the first `AbstractMutation{T}` in the chain and the output will be fed into the next `AbstractMutation{T}` in the chain and so on. The output from the last `AbstractMutation{T}` is returned. +""" +struct MutationChain{T} <: DecoratingMutation{T} + m::Tuple{Vararg{AbstractMutation{T}}} +end +MutationChain(m::AbstractMutation{T}...) where T = MutationChain(m) +# Identical, but can't use Union due to ambiguity +(m::MutationChain{T})(es::AbstractVector{<:T}) where T = foldl((ei, mi) -> mi(ei), m.m; init=es) +(m::MutationChain{T})(e::T) where T = foldl((ei, mi) -> mi(ei), m.m; init=e) + +""" + RecordMutation{T} <: DecoratingMutation{T} + RecordMutation(m::AbstractMutation{T}) + +Records all mutated entities. + +Intended use case is to be able to do parameter selection on mutated vertices. +""" +struct RecordMutation{T} <: DecoratingMutation{T} + m::AbstractMutation{T} + mutated::Vector{T} +end +RecordMutation(m::AbstractMutation{T}) where T = RecordMutation(m, T[]) +function (m::RecordMutation{T})(e::T; next=m.m, noop=identity) where T + em = next(e) + push!(m.mutated, em) + return em +end +function fetchmutated!(m::RecordMutation) + mutated = copy(m.mutated) + deleteat!(m.mutated, eachindex(m.mutated)) + return mutated +end + +""" + LogMutation{T} < :DecoratingMutation{T} + LogMutation(strfun, m::AbstractMutation{T}; level = Logging.Info, nextlogfun=e -> PrefixLogger(" ")) + LogMutation(strfun, level::LogLevel, nextlogfun, m::AbstractMutation{T}) + +Logs all mutation operations. + +Argument `strfun` maps the mutated entity to the logged string. + +Calling `nextlogfun(e)` where `e` is the entity to mutate produces an `AbstractLogger` which will be used when applying `m(e)`. + +By default, this is used to add a level of indentation to subsequent logging calls which makes logs of hierarchical mutations (e.g. mutate a CompGraph by applying mutations to some of its vertices) easier to read. Set `nextlogfun = e -> current_logger()` to remove this behaviour. +""" +struct LogMutation{T,F,L<:LogLevel,LF} <: DecoratingMutation{T} + strfun::F + level::L + nextlogfun::LF + m::AbstractMutation{T} +end +LogMutation(strfun, m::AbstractMutation{T}; level = Logging.Info, nextlogfun=e -> PrefixLogger(" ")) where T = LogMutation(strfun, level, nextlogfun, m) +function (m::LogMutation{T})(e::T; next=m.m, noop=identity) where T + @logmsg m.level m.strfun(e) + return with_logger(() -> next(e), m.nextlogfun(e)) +end + +""" + MutationFilter{T} <: DecoratingMutation{T} + MutationFilter(predicate, m) + +Applies mutation `m` only for entities `e` for which `predicate(e)` returns true. 
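# A brief sketch (editorial illustration, not part of the patch): only change the output size of
# vertices whose name starts with "dense".
m = MutationFilter(v -> startswith(name(v), "dense"), NoutMutation(-0.1, 0.1))
# m(v) applies the wrapped NoutMutation when the predicate holds and returns v unchanged otherwise.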
+""" +struct MutationFilter{T,P} <: DecoratingMutation{T} + predicate::P + m::AbstractMutation{T} +end +function (m::MutationFilter{T})(e::T; next=m.m, noop=identity) where T + m.predicate(e) && return next(e) + return noop(e) +end + +""" + PostMutation{T} <: DecoratingMutation{T} + PostMutation(actions, m::AbstractMutation{T}) + PostMutation(m::AbstractMutation{T}, actions...) + +Performs a set of actions after a wrapped `AbstractMutation` is applied. + +Actions will be invoked with arguments (m::PostMutation{T}, e::T) where m is the enclosing `PostMutation` and `e` is the mutated entity of type `T`. +""" +struct PostMutation{T,A} <: DecoratingMutation{T} + actions::A + m::AbstractMutation{T} +end +PostMutation(m::AbstractMutation{T}, actions...) where T = PostMutation(actions, m) +PostMutation(action::Function, m::AbstractMutation{T}) where T = PostMutation(m, action) +function (m::PostMutation{T})(e::T; next=m.m, noop=identity) where T + eout = next(e) + foreach(a -> a(m, eout), m.actions) + return eout +end + diff --git a/src/mutation/graph.jl b/src/mutation/graph.jl new file mode 100644 index 00000000..e4ef145e --- /dev/null +++ b/src/mutation/graph.jl @@ -0,0 +1,426 @@ + +""" +VertexMutation <: DecoratingMutation{CompGraph} +VertexMutation(m::AbstractMutation{AbstractVertex}, s::AbstractVertexSelection) +VertexMutation(m::AbstractMutation{AbstractVertex}) + +Applies a wrapped `AbstractMutation{AbstractVertex}` to each selected vertex in a `CompGraph`. + +Vertices to select is determined by the configured `AbstractVertexSelection`. +""" +struct VertexMutation{S<:AbstractVertexSelection} <: DecoratingMutation{CompGraph} +m::AbstractMutation{AbstractVertex} +s::S +end +VertexMutation(m::AbstractMutation{AbstractVertex}) = VertexMutation(m, FilterMutationAllowed()) +function (m::VertexMutation)(g::CompGraph) +m.m(select(m.s, g, m)) +return g +end + +""" +NoutMutation <:AbstractMutation{AbstractVertex} +NoutMutation(l1::Real,l2::Real, rng::AbstractRNG) +NoutMutation(limit, rng::AbstractRNG=rng_default) +NoutMutation(l1,l2) + +Mutate the out size of a vertex or vector of vertices. + +Size is changed by `x * nout(v)` rounded away from from zero where `x` is drawn from `U(minrel, maxrel)` where +`minrel` and `maxrel` are `l1` and `l2` if `l1 < l2` and `l2` and `l1` otherwise. +""" +struct NoutMutation{R<:Real, RNG<:AbstractRNG} <:AbstractMutation{AbstractVertex} +minrel::R +maxrel::R +rng::RNG +function NoutMutation(l1::R1, l2::R2, rng::RNG) where {R1, R2, RNG} + R = promote_type(R1, R2) + return l1 < l2 ? 
new{R, RNG}(promote(l1, l2)..., rng) : new{R, RNG}(promote(l2, l1)..., rng) +end +end +NoutMutation(limit, rng::AbstractRNG=rng_default) = NoutMutation(0, limit, rng) +NoutMutation(l1,l2) = NoutMutation(l1,l2, rng_default) +(m::NoutMutation)(v::AbstractVertex) = first(m([v])) +function (m::NoutMutation)(vs::AbstractVector{<:AbstractVertex}) + +Δs = Dict{AbstractVertex, Int}() +shift = m.minrel +scale = m.maxrel - m.minrel + +for v in vs + terminputs = findterminating(v, inputs) + + # We are basically just searching for Immutable vertices here, allow_mutation(trait(v)) happens to do just that + any(tv -> allow_mutation(trait(tv)), terminputs) || continue + + Δfloat = rand(m.rng) * scale + shift + + Δ = ceil(Int, abs(Δfloat) * nout(v)) * sign(Δfloat) + minsize = minimum(nout.(terminputs)) + # Or else we might increase the size despite Δ being negative which would be surprising to a user who has specified + # strictly negative size changes + minsize + Δ <= 0 && continue + + Δs[v] = Δ +end + +if !isempty(Δs) + failmsg = (args...) -> "Could not change nout of $(join(NaiveNASlib.nameorrepr.(keys(Δs)), ", ", " and ")) by $(join(values(Δs), ", ", " and ")). No change!" + + strategy = TimeOutAction(;base=ΔNoutRelaxed(Δs), fallback=LogΔSizeExec(failmsg, Logging.Warn, ΔSizeFailNoOp())) + + Δsize!(strategy) +end +return vs +end + +""" +AddVertexMutation <:AbstractMutation{AbstractVertex} +AddVertexMutation(s::AbstractArchSpace, outselect::Function, WeightInit::AbstractWeightInit, rng::AbstractRNG) +AddVertexMutation(s, outselect::Function=identity) +AddVertexMutation(s, rng::AbstractRNG) +AddVertexMutation(s, wi::AbstractWeightInit) + +Insert a vertex from the wrapped `AbstractArchSpace` `s` after a given vertex `v`. + +The function `outselect` takes an `AbstractVector{AbstractVertex}` representing the output of `v` and returns an `AbstractVector{AbstractVertex}` which shall be reconnected to the vertex `v'` returned by `s`. Defaults to `identity` meaning all outputs of `v` are reconnected to `v'`. +""" +struct AddVertexMutation{S<:AbstractArchSpace, F, WI<:AbstractWeightInit, RNG<:AbstractRNG} <:AbstractMutation{AbstractVertex} +s::S +outselect::F +weightinit::WI +rng::RNG +end +AddVertexMutation(s, outselect::Function=identity) = AddVertexMutation(s, outselect, IdentityWeightInit(), rng_default) +AddVertexMutation(s, rng::AbstractRNG) = AddVertexMutation(s, identity, IdentityWeightInit(), rng) +AddVertexMutation(s, wi::AbstractWeightInit) = AddVertexMutation(s, identity, wi, rng_default) + +function (m::AddVertexMutation)(v::AbstractVertex) +insert!(v, vi -> m.s(name(vi), vi, m.rng, outsize=nout(vi), wi=m.weightinit), m.outselect) +return v +end + +""" +RemoveVertexMutation <:AbstractMutation{AbstractVertex} +RemoveVertexMutation(s::RemoveStrategy) +RemoveVertexMutation() + +Remove the given vertex `v` using the configured `RemoveStrategy`. + +Default size align strategy is `IncreaseSmaller -> DecreaseBigger -> AlignSizeBoth -> FailAlignSizeWarn -> FailAlignSizeRevert`. + +Default reconnect strategy is `ConnectAll`. + +Note: High likelyhood of large accuracy degradation after applying this mutation. +""" +struct RemoveVertexMutation{S<:RemoveStrategy} <:AbstractMutation{AbstractVertex} +s::S +end +function RemoveVertexMutation() +alignstrat = IncreaseSmaller(fallback=DecreaseBigger(fallback=AlignSizeBoth(fallback=FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! 
Could not align sizes of neighbours!"))))
+return RemoveVertexMutation(RemoveStrategy(CheckAligned(CheckNoSizeCycle(alignstrat, FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! Size cycle detected!")))))
+end
+
+function (m::RemoveVertexMutation)(v::AbstractVertex)
+remove!(v, m.s)
+return v
+end
+
+default_neuronselect(args...) = NaiveNASlib.defaultutility(args...)
+
+"""
+AddEdgeMutation <: AbstractMutation{AbstractVertex}
+AddEdgeMutation(p; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect)
+AddEdgeMutation(p::Probability; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect)
+
+Add an edge from a vertex `vi` to another vertex `vo` randomly selected from `vs = filtfun(vi)`.
+
+Higher values of `p` will give more preference to earlier vertices of `vs`.
+
+If `vo` is not capable of having multiple inputs (determined by `singleinput(v) == true`), `vm = mergefun(voi)` where `voi` is a randomly selected input to `vo` will be used instead of `vo` and `vo` will be added as the output of `vm`.
+
+When selecting neurons/outputs after any eventual size change the output of `utilityfun(v)` will be used to determine the utility of each output in vertex `v`. Note that `length(utilityfun(v)) == nout(v)` must hold.
+
+Note: High likelihood of large accuracy degradation after applying this mutation.
+"""
+struct AddEdgeMutation{F1, F2, F3, P<:Probability, RNG} <: AbstractMutation{AbstractVertex}
+mergefun::F1
+filtfun::F2
+utilityfun::F3
+p::P
+rng::RNG
+end
+AddEdgeMutation(p; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) = AddEdgeMutation(Probability(p, rng), rng=rng, mergefun=mergefun, filtfun=filtfun, utilityfun=utilityfun)
+AddEdgeMutation(p::Probability; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) = AddEdgeMutation(mergefun, filtfun, utilityfun, p, rng)
+
+default_mergefun(pconc = 0.5; rng=rng_default, traitfun = MutationShield ∘ RemoveIfSingleInput ∘ validated() ∘ default_logging(), layerfun = ActivationContribution) = function(vin)
+if rand(rng) > pconc
+    return invariantvertex(layerfun(+), vin, traitdecoration=traitfun ∘ named(name(vin) * ".add"))
+end
+return concat(vin, traitfun = traitfun ∘ named(name(vin) * ".cat"), layerfun=layerfun)
+end
+
+function no_shapechange(vi)
+# all_in_graph is not sorted, and we want some kind of topological order here so that earlier indices are closer to vi
+allsorted = mapreduce(ancestors, vcat, filter(v -> isempty(outputs(v)), all_in_graph(vi))) |> unique
+
+# Vertices which have the same input as vi and are singleinput
+# Reason is that this will cause a new vertex to be added between the target output vertex vo
+# and the input vertex to vi (vii) and this is detected as a size cycle which causes
+# try_add_edge to fail.
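# A brief usage sketch of AddEdgeMutation as defined above, relying on its default
# filtfun=no_shapechange (whose definition continues just below). The tiny graph is made
# up for illustration; the outcome depends on the rng and on which vertices pass the
# shape filter, so the result is indicative only.
using NaiveGAflux
v0 = conv2dinputvertex("v0", 3)
v1 = fluxvertex("v1", Conv((1,1), 3 => 8), v0)
v2 = fluxvertex("v2", Conv((1,1), 8 => 8), v1)
v3 = fluxvertex("v3", Conv((1,1), 8 => 8), v2)
m = AddEdgeMutation(0.5)
m(v1)   # may add an edge from v1 towards v3, first inserting a merge vertex via mergefun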
+inouts = filter(singleinput, mapreduce(outputs, vcat, inputs(vi); init=[])) +# All vertices which are after vi in the topology +vsafter = setdiff(allsorted, ancestors(vi), outputs(vi), inouts) + +vitrace = shapetrace(vi) +viorder = allΔshapetypes(vitrace) +viΔshape = squashshapes(vitrace; order=viorder) + +return filter(vsafter) do vafter + all(inputs(vafter)) do v + t = shapetrace(v) + vΔshape = squashshapes(t; order=union(viorder, allΔshapetypes(t))) + return viΔshape == vΔshape + end +end +end + +function (m::AddEdgeMutation)(vi::AbstractVertex) +# All vertices for which it is allowed to add vi as an input +allverts = filter(allow_mutation, m.filtfun(vi)) +isempty(allverts) && return vi + +# Higher probability to select a vertex close to v is desired behaviour +# One line less than a for loop => FP wins!! +selfun(::Nothing, vc) = apply(m.p) ? vc : nothing +selfun(vs, vd) = vs +vo = foldl(selfun, allverts, init=nothing) +vo = isnothing(vo) ? rand(m.rng, allverts) : vo + +try_add_edge(vi, vo, m.mergefun, m.utilityfun) +return vi +end + +function try_add_edge(vi, vo, mergefun, utilityfun=default_neuronselect) + +# Need to add a vertex which can handle multiple inputs if vo is single input only +# For cleaning up added vertex if the whole operation fails +cleanup_failed = () -> nothing +if singleinput(vo) + voi = inputs(vo)[1] + # If the input to vo is capable of multi input we don't need to create a new vertex + # We must also check that this input does not happen to be an input to vi as this would create a cycle in the graph + if singleinput(voi) || voi in ancestors(vi) + vm = mergefun(voi) + # Insert vm between voi and vo, i.e voi -> vo turns into voi -> vm -> vo + # vs -> [vo] means only add the new vertex between voi and vo as voi could have other outputs + insert!(voi, vv -> vm, vs -> [vo]) + cleanup_failed = function() + length(inputs(vm)) > 1 && return + remove!(vm, RemoveStrategy(NoSizeChange())) + end + vo = vm # vm is the one we shall add an edge to + @debug "Create new vertex for merging $(name(vo))" + else + vo = voi + end +end +# This is mainly because FailAlignSizeRevert does not work when the same vertex is input more than once, but it also seems kinda redundant. +vi in inputs(vo) && return +@debug "Create edge between $(name(vi)) and $(name(vo))" +create_edge!(vi, vo, strategy = create_edge_strat(vo, utilityfun)) +cleanup_failed() +end +# Need to override this one for strange types e.g. layers which support exactly 2 inputs or something. +singleinput(v) = isempty(inputs(v)) || length(inputs(v)) == 1 + +create_edge_strat(v::AbstractVertex, utilityfun) = create_edge_strat(trait(v), utilityfun) +create_edge_strat(d::DecoratingTrait, utilityfun) = create_edge_strat(base(d), utilityfun) +function create_edge_strat(::SizeInvariant, utilityfun) +warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") +alignstrat = AlignSizeBoth(;mapstrat=WithUtilityFun(utilityfun), fallback = warnfailalign) +# Tricky failure case: It is possible that CheckCreateEdgeNoSizeCycle does not detect any size cycle until after the edge has been created? +sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! 
Size cycle detected!")
+
+return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn)
+end
+function create_edge_strat(::SizeStack, utilityfun)
+warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!")
+alignstrat = PostAlign(TruncateInIndsToValid(WithUtilityFun(utilityfun, AlignNinToNout(;fallback=ΔSizeFailNoOp()))), fallback=warnfailalign)
+
+sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! Size cycle detected!")
+return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn)
+end
+
+"""
+RemoveEdgeMutation <: AbstractMutation{AbstractVertex}
+RemoveEdgeMutation(;utilityfun=default_neuronselect, rng=rng_default)
+
+Remove an edge from a vertex `vi` to another vertex `vo` randomly selected from `outputs(vi)`.
+
+Vertex `vi` must have more than one output and vertex `vo` must have more than one input for the edge to be removed. Otherwise no change is made.
+
+If there are multiple edges between `vi` and `vo` no change will be made due to NaiveNASlib not being able to revert a failed operation in this case.
+
+When selecting neurons/outputs after any eventual size change the output of `utilityfun(v)` will be used to determine the utility of each output in vertex `v`. Note that `length(utilityfun(v)) == nout(v)` must hold.
+
+Note: High likelihood of large accuracy degradation after applying this mutation.
+"""
+struct RemoveEdgeMutation{F, RNG<:AbstractRNG} <: AbstractMutation{AbstractVertex}
+utilityfun::F
+rng::RNG
+end
+RemoveEdgeMutation(;utilityfun=default_neuronselect, rng=rng_default) = RemoveEdgeMutation(utilityfun, rng)
+
+function (m::RemoveEdgeMutation)(vi::AbstractVertex)
+length(outputs(vi)) < 2 && return vi
+
+allverts = filter(vo -> length(inputs(vo)) > 1, outputs(vi))
+
+isempty(allverts) && return vi
+
+vo = rand(m.rng, allverts)
+sum(inputs(vo) .== vi) > 1 && return vi # Not implemented in NaiveNASlib
+
+@debug "Remove edge between $(name(vi)) and $(name(vo))"
+remove_edge!(vi, vo, strategy=remove_edge_strat(vo, m.utilityfun))
+return vi
+end
+
+remove_edge_strat(v::AbstractVertex, utilityfun) = remove_edge_strat(trait(v), utilityfun)
+remove_edge_strat(d::DecoratingTrait, utilityfun) = remove_edge_strat(base(d), utilityfun)
+remove_edge_strat(::SizeInvariant, utilityfun) = NoSizeChange()
+remove_edge_strat(t::SizeStack, utilityfun) = create_edge_strat(t, utilityfun)
+
+"""
+KernelSizeMutation{N} <: AbstractMutation{AbstractVertex}
+KernelSizeMutation(Δsizespace::AbstractParSpace{N, Int}; maxsize, pad, rng)
+KernelSizeMutation2D(absΔ::Integer;maxsize, pad, rng)
+KernelSizeMutation(absΔ::Integer...;maxsize, pad, rng)
+
+Mutate the size of filter kernels of convolutional layers.
+
+Note: High likelihood of large accuracy degradation after applying this mutation.
+
+`KernelSizeMutation2D` is a convenience constructor for `KernelSizeMutation(absΔ, absΔ;...)`.
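# A brief sketch of KernelSizeMutation2D on a made-up convolutional vertex: each kernel
# dimension is shifted by a value drawn from -1:1 and clamped to stay positive, so the
# 5x5 kernel below could end up e.g. 4x6 or stay 5x5 depending on the rng.
using NaiveGAflux
v0 = conv2dinputvertex("v0", 3)
v1 = fluxvertex("v1", Conv((5,5), 3 => 8), v0)
m = KernelSizeMutation2D(1)
m(v1)   # kernel weights are resized via KernelSizeAligned with recomputed padding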
+""" +struct KernelSizeMutation{N,F,P} <: AbstractMutation{AbstractVertex} +Δsizespace::AbstractParSpace{N, Int} +maxsize::F +pad::P +rng::AbstractRNG +end +KernelSizeMutation(Δsizespace::AbstractParSpace{N, Int}; maxsize = v -> ntuple(i->Inf,N), pad=SamePad(), rng=rng_default) where N = KernelSizeMutation(Δsizespace, maxsize, pad, rng) +KernelSizeMutation2D(absΔ::Integer;maxsize = v -> (Inf,Inf), pad=SamePad(), rng=rng_default) = KernelSizeMutation(absΔ, absΔ, maxsize = maxsize, pad=pad, rng=rng) +KernelSizeMutation(absΔ::Integer...;maxsize = v -> ntuple(i->Inf, length(absΔ)), pad=SamePad(), rng=rng_default) = KernelSizeMutation(ParSpace(UnitRange.(.-absΔ, absΔ));maxsize = maxsize, pad=pad, rng=rng) + +function (m::KernelSizeMutation{N})(v::AbstractVertex) where N +layertype(v) isa FluxConvolutional{N} || return +l = layer(v) + +currsize = size(NaiveNASflux.weights(l))[1:N] +Δsize = Int.(clamp.(m.Δsizespace(m.rng), 1 .- currsize, m.maxsize(v) .- currsize)) # ensure new size is > 0 and < maxsize +# This will eventually boil down to Setfield doing its thing, and that won't be using any convenience constructors +pad = Flux.calc_padding(typeof(l), m.pad, currsize .+ Δsize, dilation(l), stride(l)) +KernelSizeAligned(Δsize, pad)(v) +return v +end +dilation(l) = l.dilation +stride(l) = l.stride + +""" +ActivationFunctionMutation{T,R} <: AbstractMutation{AbstractVertex} where {T <: AbstractParSpace{1}, R <: AbstractRNG} +ActivationFunctionMutation(actspace::AbstractParSpace{1}, rng::AbstractRNG) +ActivationFunctionMutation(acts...;rng=rng_default) +ActivationFunctionMutation(acts::AbstractVector;rng=rng_default) + +Mutate the activation function of layers which have an activation function. + +Note: High likelyhood of large accuracy degradation after applying this mutation. +""" +struct ActivationFunctionMutation{T,RNG} <: AbstractMutation{AbstractVertex} where {T <: AbstractParSpace{1}, R <: AbstractRNG} +actspace::T +rng::RNG +end +ActivationFunctionMutation(acts...;rng=rng_default) = ActivationFunctionMutation(collect(acts), rng=rng) +ActivationFunctionMutation(acts::AbstractVector;rng=rng_default) = ActivationFunctionMutation(ParSpace(acts), rng) + +function (m::ActivationFunctionMutation)(v::AbstractVertex) +m(layertype(v), v) +return v +end +function (m::ActivationFunctionMutation)(t, v) end +(m::ActivationFunctionMutation)(::Union{FluxDense, FluxConvolutional}, v) = NaiveNASflux.setlayer!(v, (σ = m.actspace(m.rng),)) +(m::ActivationFunctionMutation)(::FluxParNorm, v) = NaiveNASflux.setlayer!(v, (λ = m.actspace(m.rng),)) +function (m::ActivationFunctionMutation)(::FluxRnn, v) +newcell = setproperties(layer(v).cell, (σ = m.actspace(m.rng),)) +NaiveNASflux.setlayer!(v, (cell = newcell,)) +end + + +""" + RemoveZeroNout() + RemoveZeroNout(fallback) + +Search for vertices with zero output size and remove them and all of their input vertices if possible to do so witout removing an input or output vertex. + +Removal is only possible if a vertex is inside a parallel path which will later be concatenated. +""" +struct RemoveZeroNout + fallback +end +RemoveZeroNout() = RemoveZeroNout(IncreaseZeroNout()) +struct IncreaseZeroNout end + +(r::RemoveZeroNout)(m, e) = r(e) +function (r::RemoveZeroNout)(g::CompGraph) + topoligical_order = vertices(g) + for v in topoligical_order + nout(v) == 0 || continue + + # Beware! This turned out to be a lot harder than I first thought. + # I'm not sure the algorithm works (or realizes that it won't work) for all possible cases. 
+
+        # This obviously only works if v is inside a parallel path which will later be concatenated
+
+        # To make sure it is, we look ahead in forward and backward direction.
+        # If we don't see 1) an input vertex 2) a vertex without outputs (i.e. an output vertex) we are good to go
+        fseen, fpoints = findforkpoint(v, topoligical_order, inputs, outputs)
+        isempty(fpoints) && continue
+
+        bseen, bpoints = findforkpoint(v, topoligical_order, outputs, inputs)
+        isempty(bpoints) && continue
+
+        # Ok, fpoints are all vertices where forks with v in them join and bpoints are all vertices where forks with v in them begin
+
+        # If we start removing all seen vertices which are input to an fpoint until we hit a bpoint we should have removed the fork with v in it, right?
+        seen = union(fseen, bseen)
+        to_rm = intersect(vcat(inputs.(fpoints)...), seen)
+        foreach(fpoint -> remove_all_inputs(fpoint, seen, bpoints), to_rm)
+    end
+end
+
+function findforkpoint(v::AbstractVertex, topoligical_order, f1=inputs, f2=outputs, seen = vcat(v, f1(v)), points = AbstractVertex[])
+
+    if all(x -> x in seen, f1(v))
+
+        # Check if we came across a vertex which previously was thought to be a forkpoint and remove it if so
+        if v in points
+            deleteat!(points, indexin([v], points))
+        end
+        # Always visit in reverse topological order to mitigate chasing down paths just because we haven't explored them yet
+        nextverts = reverse(topoligical_order[unique(indexin(f2(v), topoligical_order))])
+        push!(seen, filter(v2 -> !in(v2, seen), nextverts)...)
+        foreach(v2 -> findforkpoint(v2, topoligical_order, f1, f2, seen, points), nextverts)
+    elseif !(v in points)
+        push!(points, v)
+    end
+
+    return seen, points
+end
+
+function remove_all_inputs(v, seen, stop)
+    v in stop && return
+    foreach(vrm -> remove_all_inputs(vrm, seen, stop), intersect(inputs(v), seen))
+    remove!(v, RemoveStrategy(ConnectNone(), NoSizeChange()))
+end
\ No newline at end of file
diff --git a/src/mutation/optimizer.jl b/src/mutation/optimizer.jl
new file mode 100644
index 00000000..811a23d9
--- /dev/null
+++ b/src/mutation/optimizer.jl
@@ -0,0 +1,52 @@
+"""
+    struct OptimizerMutation{F} <: AbstractMutation{FluxOptimizer}
+    OptimizerMutation(optfun)
+    OptimizerMutation(os::Union{Tuple, <:AbstractArray})
+
+Mutates optimizers not wrapped in `ShieldedOpt` through `optfun`.
+
+Invoked recursively for `Flux.Optimiser`s.
+"""
+struct OptimizerMutation{F} <: AbstractMutation{FluxOptimizer}
+    optfun::F
+end
+OptimizerMutation(os::Union{Tuple, <:AbstractArray}, rng=rng_default) = OptimizerMutation(o -> rand(rng, os)(learningrate(o)))
+
+"""
+    LearningRateMutation(rng=rng_default)
+
+Return an `OptimizerMutation` which mutates the learning rate of optimizers.
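# A brief sketch of the two mutations defined in this file. Exact results depend on the
# default rng; learningrate is internal and imported here only to show the effect.
using NaiveGAflux, Flux
import NaiveGAflux: learningrate
m = OptimizerMutation((Momentum, Nesterov))          # re-draw the optimizer type, keep the learning rate
typeof(m(Descent(0.1)))                              # Momentum or Nesterov
m(ShieldedOpt(Descent(0.1)))                         # shielded optimizers are returned unchanged
learningrate(LearningRateMutation()(Descent(0.1)))   # nudged to somewhere in roughly 0.085..0.115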
+""" +LearningRateMutation(rng=rng_default) = OptimizerMutation(o -> nudgelr(o, rng)) + +(m::OptimizerMutation)(opt::Flux.Optimiser) = Flux.Optimiser(m.(opt.os)) +(m::OptimizerMutation)(o::ShieldedOpt) = o; +(m::OptimizerMutation)(o::FluxOptimizer) = m.optfun(o) + + +nudgelr(o, rng=rng_default) = sameopt(o, nudgelr(learningrate(o), rng)) +nudgelr(lr::Number, rng=rng_default) = clamp(lr + (rand(rng) - 0.5) * lr * 0.3, 1e-6, 1.0) + +learningrate(o::Flux.Optimiser) = prod(learningrate.(o.os)) +learningrate(o::ShieldedOpt) = learningrate(o.opt) +learningrate(o) = o.eta + +newlr(o, lrf = nudgelr) = sameopt(o, lrf(learningrate(o))) +sameopt(o, lr) = @set o.eta = lr + +""" + AddOptimizerMutation{F} <: AbstractMutation{FluxOptimizer} + +Adds optimizer generated by `optgen(os)` to the set of optimizers where `os` is the existing set. + +An attempt to merge optimizers of the same type is made using `mergeopts`. +""" +struct AddOptimizerMutation{F} <: AbstractMutation{FluxOptimizer} + optgen::F +end +(m::AddOptimizerMutation)(o::ShieldedOpt) = o; +(m::AddOptimizerMutation)(o::FluxOptimizer) = m(Flux.Optimiser([o])) +function (m::AddOptimizerMutation)(opt::Flux.Optimiser) + newopt = m.optgen(opt) + return Flux.Optimiser(mergeopts(typeof(newopt), newopt, opt.os...)) +end diff --git a/test/crossover.jl b/test/crossover/graph.jl similarity index 89% rename from test/crossover.jl rename to test/crossover/graph.jl index 087d0241..791891a2 100644 --- a/test/crossover.jl +++ b/test/crossover/graph.jl @@ -1,4 +1,4 @@ -@testset "Crossover" begin +@testset " Graph crossover" begin v4n(graph::CompGraph, want) = v4n(vertices(graph), want) v4n(vs, want) = vs[findfirst(v -> want == name(v), vs)] @@ -760,99 +760,4 @@ end end end - - @testset "OptimizerCrossover" begin - using NaiveGAflux.Flux.Optimise - - prts(o) = typeof(o) - prts(o::Optimiser) = "$(typeof(o))$(prts.(Tuple(o.os)))" - - @testset "Swap optimizers $(prts(o1)) and $(prts(o2))" for (o1, o2) in ( - (ADAM(), Momentum()), - (Optimiser(Descent(), WeightDecay()), Optimiser(Momentum(), Nesterov())), - ) - oc = OptimizerCrossover() - ooc = OptimizerCrossover(oc) - @test prts.(oc((o1,o2))) == prts.(ooc((o1,o2))) == prts.((o2, o1)) - @test prts.(oc((o2,o1))) == prts.(ooc((o2,o1))) == prts.((o1, o2)) - end - - @testset "Don't swap shielded" begin - o1 = ShieldedOpt(Descent()) - o2 = ShieldedOpt(Momentum()) - @test OptimizerCrossover()((o1,o2)) == (o1,o2) - end - - @testset "Cardinality difference" begin - - @testset "Single opt vs Optimiser" begin - oc = OptimizerCrossover() - @test prts.(oc((Descent(), Optimiser(Momentum(), WeightDecay())))) == prts.((Momentum(), Optimiser(Descent(), WeightDecay()))) - end - - @testset "Different size Optimisers" begin - oc = OptimizerCrossover() - o1 = Optimiser(Descent(), WeightDecay(), Momentum()) - o2 = Optimiser(ADAM(), ADAMW(), NADAM(), RADAM()) - - o1n,o2n = oc((o1,o2)) - - @test prts(o1n) == prts(Optimiser(ADAM(), ADAMW(), NADAM())) - @test prts(o2n) == prts(Optimiser(Descent(), WeightDecay(), Momentum(), RADAM())) - end - end - - @testset "LogMutation and MutationProbability" begin - mplm(c) = MutationProbability(LogMutation(((o1,o2)::Tuple) -> "Crossover between $(prts(o1)) and $(prts(o2))", c), Probability(0.2, MockRng([0.3, 0.1, 0.3]))) - oc = OptimizerCrossover() |> mplm |> OptimizerCrossover - - o1 = Optimiser(Descent(), WeightDecay(), Momentum()) - o2 = Optimiser(ADAM(), ADAGrad(), AdaMax()) - - o1n,o2n = @test_logs (:info, "Crossover between WeightDecay and ADAGrad") oc((o1,o2)) - - @test typeof.(o1n.os) == 
[Descent, ADAGrad, Momentum] - @test typeof.(o2n.os) == [ADAM, WeightDecay, AdaMax] - end - - @testset "Learningrate crossover" begin - import NaiveGAflux: learningrate - @testset "Single opt" begin - oc = LearningRateCrossover() - o1,o2 = oc((Descent(0.1), Momentum(0.2))) - - @test typeof(o1) == Descent - @test o1.eta == 0.2 - - @test typeof(o2) == Momentum - @test o2.eta == 0.1 - end - - @testset "Shielded opt" begin - oc = LearningRateCrossover() - o1,o2 = oc((ShieldedOpt(Descent(0.1)), Momentum(0.2))) - - @test typeof(o1) == ShieldedOpt{Descent} - @test o1.opt.eta == 0.1 - - @test typeof(o2) == Momentum - @test o2.eta == 0.2 - end - - @testset "Optimiser" begin - oc = LearningRateCrossover() - o1 = Optimiser(Descent(0.1), Momentum(0.2), WeightDecay(0.1)) - o2 = Optimiser(ADAM(0.3), RADAM(0.4), NADAM(0.5), Nesterov(0.6)) - - o1n,o2n = oc((o1,o2)) - - @test prts(o1n) == prts(o1) - @test prts(o2n) == prts(o2) - - @test learningrate.(o1n.os[1:end-1]) == [0.3, 0.4] - @test learningrate.(o2n.os) == [0.1, 0.2, 0.5, 0.6] - - end - end - end end diff --git a/test/crossover/optimizer.jl b/test/crossover/optimizer.jl new file mode 100644 index 00000000..d1a3f69a --- /dev/null +++ b/test/crossover/optimizer.jl @@ -0,0 +1,94 @@ +@testset "Optimizer crossover" begin + using NaiveGAflux.Flux.Optimise + + prts(o) = typeof(o) + prts(o::Optimiser) = "$(typeof(o))$(prts.(Tuple(o.os)))" + + @testset "Swap optimizers $(prts(o1)) and $(prts(o2))" for (o1, o2) in ( + (ADAM(), Momentum()), + (Optimiser(Descent(), WeightDecay()), Optimiser(Momentum(), Nesterov())), + ) + oc = OptimizerCrossover() + ooc = OptimizerCrossover(oc) + @test prts.(oc((o1,o2))) == prts.(ooc((o1,o2))) == prts.((o2, o1)) + @test prts.(oc((o2,o1))) == prts.(ooc((o2,o1))) == prts.((o1, o2)) + end + + @testset "Don't swap shielded" begin + o1 = ShieldedOpt(Descent()) + o2 = ShieldedOpt(Momentum()) + @test OptimizerCrossover()((o1,o2)) == (o1,o2) + end + + @testset "Cardinality difference" begin + + @testset "Single opt vs Optimiser" begin + oc = OptimizerCrossover() + @test prts.(oc((Descent(), Optimiser(Momentum(), WeightDecay())))) == prts.((Momentum(), Optimiser(Descent(), WeightDecay()))) + end + + @testset "Different size Optimisers" begin + oc = OptimizerCrossover() + o1 = Optimiser(Descent(), WeightDecay(), Momentum()) + o2 = Optimiser(ADAM(), ADAMW(), NADAM(), RADAM()) + + o1n,o2n = oc((o1,o2)) + + @test prts(o1n) == prts(Optimiser(ADAM(), ADAMW(), NADAM())) + @test prts(o2n) == prts(Optimiser(Descent(), WeightDecay(), Momentum(), RADAM())) + end + end + + @testset "LogMutation and MutationProbability" begin + mplm(c) = MutationProbability(LogMutation(((o1,o2)::Tuple) -> "Crossover between $(prts(o1)) and $(prts(o2))", c), Probability(0.2, MockRng([0.3, 0.1, 0.3]))) + oc = OptimizerCrossover() |> mplm |> OptimizerCrossover + + o1 = Optimiser(Descent(), WeightDecay(), Momentum()) + o2 = Optimiser(ADAM(), ADAGrad(), AdaMax()) + + o1n,o2n = @test_logs (:info, "Crossover between WeightDecay and ADAGrad") oc((o1,o2)) + + @test typeof.(o1n.os) == [Descent, ADAGrad, Momentum] + @test typeof.(o2n.os) == [ADAM, WeightDecay, AdaMax] + end + + @testset "Learningrate crossover" begin + import NaiveGAflux: learningrate + @testset "Single opt" begin + oc = LearningRateCrossover() + o1,o2 = oc((Descent(0.1), Momentum(0.2))) + + @test typeof(o1) == Descent + @test o1.eta == 0.2 + + @test typeof(o2) == Momentum + @test o2.eta == 0.1 + end + + @testset "Shielded opt" begin + oc = LearningRateCrossover() + o1,o2 = oc((ShieldedOpt(Descent(0.1)), 
Momentum(0.2))) + + @test typeof(o1) == ShieldedOpt{Descent} + @test o1.opt.eta == 0.1 + + @test typeof(o2) == Momentum + @test o2.eta == 0.2 + end + + @testset "Optimiser" begin + oc = LearningRateCrossover() + o1 = Optimiser(Descent(0.1), Momentum(0.2), WeightDecay(0.1)) + o2 = Optimiser(ADAM(0.3), RADAM(0.4), NADAM(0.5), Nesterov(0.6)) + + o1n,o2n = oc((o1,o2)) + + @test prts(o1n) == prts(o1) + @test prts(o2n) == prts(o2) + + @test learningrate.(o1n.os[1:end-1]) == [0.3, 0.4] + @test learningrate.(o2n.os) == [0.1, 0.2, 0.5, 0.6] + + end + end +end \ No newline at end of file diff --git a/test/mutation/generic.jl b/test/mutation/generic.jl new file mode 100644 index 00000000..406ddeba --- /dev/null +++ b/test/mutation/generic.jl @@ -0,0 +1,170 @@ + + +@testset "Generic mutation" begin + + struct NoOpMutation{T} <:AbstractMutation{T} end + (m::NoOpMutation{T})(t::T) where T = t + ProbeMutation(T) = RecordMutation(NoOpMutation{T}()) + + @testset "MutationProbability" begin + probe = ProbeMutation(Int) + m = MutationProbability(probe, Probability(0.3, MockRng([0.2,0.5,0.1]))) + + @test m(1) == 1 + @test m(2) == 2 + @test m(3) == 3 + @test m(4) == 4 + @test probe.mutated == [1,3,4] + end + + @testset "MutationProbability vector" begin + probe = ProbeMutation(Int) + m = MutationProbability(probe, Probability(0.3, MockRng([0.2,0.5,0.1]))) + + @test m(1:4) == 1:4 + @test probe.mutated == [1,3,4] + end + + @testset "WeightedMutationProbability" begin + probe = ProbeMutation(Real) + rng = MockRng([0.5]) + m = WeightedMutationProbability(probe, p -> Probability(p, rng)) + + @test m(0.1) == 0.1 + @test m(0.6) == 0.6 + @test m(0.4) == 0.4 + @test m(0.9) == 0.9 + @test probe.mutated == [0.6,0.9] + end + + @testset "WeightedMutationProbability vector" begin + probe = ProbeMutation(Real) + rng = MockRng([0.5]) + m = WeightedMutationProbability(probe, p -> Probability(p, rng)) + + @test m([0.1, 0.6, 0.4, 0.9]) == [0.1, 0.6, 0.4, 0.9] + @test probe.mutated == [0.6,0.9] + end + + @testset "Neuron utlity weighted mutation" begin + using Statistics + import NaiveNASflux: AbstractMutableComp, neuronutility, wrapped + struct DummyValue{T, W<:AbstractMutableComp} <: AbstractMutableComp + utlity::T + w::W + end + NaiveNASflux.neuronutility(d::DummyValue) = d.utlity + NaiveNASflux.wrapped(d::DummyValue) = d.w + + l(in, outsize, utlity) = fluxvertex(Dense(nout(in), outsize), in, layerfun = l -> DummyValue(utlity, l)) + + v0 = inputvertex("in", 3) + v1 = l(v0, 4, 1:4) + v2 = l(v1, 3, 100:300) + v3 = l(v2, 5, 0.1:0.1:0.5) + + @testset "weighted_neuronutility_high pbase $pbase" for pbase in (0.05, 0.1, 0.3, 0.7, 0.9, 0.95) + import NaiveGAflux: weighted_neuronutility_high + wnv = weighted_neuronutility_high(pbase, spread=0.5) + wp = map(p -> p.p, wnv.([v1,v2,v3])) + @test wp[2] > wp[1] > wp[3] + @test mean(wp) ≈ pbase rtol = 0.1 + end + + @testset "HighUtilityMutationProbability" begin + + probe = ProbeMutation(MutationVertex) + m = HighUtilityMutationProbability(probe, 0.1, MockRng([0.15])) + + m(v1) + m(v2) + m(v3) + @test probe.mutated == [v2] + end + + @testset "weighted_neuronutility_low pbase $pbase" for pbase in (0.05, 0.1, 0.3, 0.7, 0.9, 0.95) + import NaiveGAflux: weighted_neuronutility_low + wnv = weighted_neuronutility_low(pbase,spread=0.8) + wp = map(p -> p.p, wnv.([v1,v2,v3])) + @test wp[2] < wp[1] < wp[3] + @test mean(wp) ≈ pbase rtol = 0.1 + end + + @testset "LowUtilityMutationProbability" begin + probe = ProbeMutation(MutationVertex) + m = LowUtilityMutationProbability(probe, 0.1, MockRng([0.15])) + 
+ m(v1) + m(v2) + m(v3) + @test probe.mutated == [v1, v3] + end + end + + @testset "MutationChain" begin + probes = ProbeMutation.(repeat([Int], 3)) + m = MutationChain(probes...) + @test m(1) == 1 + @test getfield.(probes, :mutated) == [[1],[1],[1]] + end + + @testset "MutationChain vector" begin + probes = ProbeMutation.(repeat([Int], 3)) + m = MutationChain(probes...) + @test m(1:2) == 1:2 + @test getfield.(probes, :mutated) == [[1,2],[1,2],[1,2]] + end + + @testset "LogMutation" begin + probe = ProbeMutation(Int) + m = LogMutation(i -> "Mutate $i", probe) + + @test @test_logs (:info, "Mutate 17") m(17) == 17 + @test probe.mutated == [17] + end + + @testset "LogMutation vector" begin + probe = ProbeMutation(Int) + m = LogMutation(i -> "Mutate $i", probe) + + @test @test_logs (:info, "Mutate 17") (:info, "Mutate 21") m([17, 21]) == [17, 21] + @test probe.mutated == [17, 21] + end + + @testset "MutationFilter" begin + probe = ProbeMutation(Int) + m = MutationFilter(i -> i > 3, probe) + + @test m(1) == 1 + @test probe.mutated == [] + + @test m(4) == 4 + @test probe.mutated == [4] + end + + @testset "MutationFilter vector" begin + probe = ProbeMutation(Int) + m = MutationFilter(i -> i > 3, probe) + + @test m(1:5) == 1:5 + @test probe.mutated == [4,5] + end + + @testset "PostMutation" begin + probe = ProbeMutation(Int) + + expect_m = nothing + expect_e = nothing + function action(m,e) + expect_m = m + expect_e = e + end + + m = PostMutation(action, probe) + @test m(11) == 11 + + @test probe.mutated == [11] + @test expect_m == m + @test expect_e == 11 + end +end diff --git a/test/mutation.jl b/test/mutation/graph.jl similarity index 71% rename from test/mutation.jl rename to test/mutation/graph.jl index 5ef2dcc6..f7920a38 100644 --- a/test/mutation.jl +++ b/test/mutation/graph.jl @@ -1,173 +1,9 @@ - - -@testset "Mutation" begin +@testset "Graph mutation" begin struct NoOpMutation{T} <:AbstractMutation{T} end (m::NoOpMutation{T})(t::T) where T = t ProbeMutation(T) = RecordMutation(NoOpMutation{T}()) - @testset "MutationProbability" begin - probe = ProbeMutation(Int) - m = MutationProbability(probe, Probability(0.3, MockRng([0.2,0.5,0.1]))) - - @test m(1) == 1 - @test m(2) == 2 - @test m(3) == 3 - @test m(4) == 4 - @test probe.mutated == [1,3,4] - end - - @testset "MutationProbability vector" begin - probe = ProbeMutation(Int) - m = MutationProbability(probe, Probability(0.3, MockRng([0.2,0.5,0.1]))) - - @test m(1:4) == 1:4 - @test probe.mutated == [1,3,4] - end - - @testset "WeightedMutationProbability" begin - probe = ProbeMutation(Real) - rng = MockRng([0.5]) - m = WeightedMutationProbability(probe, p -> Probability(p, rng)) - - @test m(0.1) == 0.1 - @test m(0.6) == 0.6 - @test m(0.4) == 0.4 - @test m(0.9) == 0.9 - @test probe.mutated == [0.6,0.9] - end - - @testset "WeightedMutationProbability vector" begin - probe = ProbeMutation(Real) - rng = MockRng([0.5]) - m = WeightedMutationProbability(probe, p -> Probability(p, rng)) - - @test m([0.1, 0.6, 0.4, 0.9]) == [0.1, 0.6, 0.4, 0.9] - @test probe.mutated == [0.6,0.9] - end - - @testset "Neuron utlity weighted mutation" begin - using Statistics - import NaiveNASflux: AbstractMutableComp, neuronutility, wrapped - struct DummyValue{T, W<:AbstractMutableComp} <: AbstractMutableComp - utlity::T - w::W - end - NaiveNASflux.neuronutility(d::DummyValue) = d.utlity - NaiveNASflux.wrapped(d::DummyValue) = d.w - - l(in, outsize, utlity) = fluxvertex(Dense(nout(in), outsize), in, layerfun = l -> DummyValue(utlity, l)) - - v0 = inputvertex("in", 3) - v1 
= l(v0, 4, 1:4) - v2 = l(v1, 3, 100:300) - v3 = l(v2, 5, 0.1:0.1:0.5) - - @testset "weighted_neuronutility_high pbase $pbase" for pbase in (0.05, 0.1, 0.3, 0.7, 0.9, 0.95) - import NaiveGAflux: weighted_neuronutility_high - wnv = weighted_neuronutility_high(pbase, spread=0.5) - wp = map(p -> p.p, wnv.([v1,v2,v3])) - @test wp[2] > wp[1] > wp[3] - @test mean(wp) ≈ pbase rtol = 0.1 - end - - @testset "HighUtilityMutationProbability" begin - - probe = ProbeMutation(MutationVertex) - m = HighUtilityMutationProbability(probe, 0.1, MockRng([0.15])) - - m(v1) - m(v2) - m(v3) - @test probe.mutated == [v2] - end - - @testset "weighted_neuronutility_low pbase $pbase" for pbase in (0.05, 0.1, 0.3, 0.7, 0.9, 0.95) - import NaiveGAflux: weighted_neuronutility_low - wnv = weighted_neuronutility_low(pbase,spread=0.8) - wp = map(p -> p.p, wnv.([v1,v2,v3])) - @test wp[2] < wp[1] < wp[3] - @test mean(wp) ≈ pbase rtol = 0.1 - end - - @testset "LowUtilityMutationProbability" begin - probe = ProbeMutation(MutationVertex) - m = LowUtilityMutationProbability(probe, 0.1, MockRng([0.15])) - - m(v1) - m(v2) - m(v3) - @test probe.mutated == [v1, v3] - end - end - - @testset "MutationChain" begin - probes = ProbeMutation.(repeat([Int], 3)) - m = MutationChain(probes...) - @test m(1) == 1 - @test getfield.(probes, :mutated) == [[1],[1],[1]] - end - - @testset "MutationChain vector" begin - probes = ProbeMutation.(repeat([Int], 3)) - m = MutationChain(probes...) - @test m(1:2) == 1:2 - @test getfield.(probes, :mutated) == [[1,2],[1,2],[1,2]] - end - - @testset "LogMutation" begin - probe = ProbeMutation(Int) - m = LogMutation(i -> "Mutate $i", probe) - - @test @test_logs (:info, "Mutate 17") m(17) == 17 - @test probe.mutated == [17] - end - - @testset "LogMutation vector" begin - probe = ProbeMutation(Int) - m = LogMutation(i -> "Mutate $i", probe) - - @test @test_logs (:info, "Mutate 17") (:info, "Mutate 21") m([17, 21]) == [17, 21] - @test probe.mutated == [17, 21] - end - - @testset "MutationFilter" begin - probe = ProbeMutation(Int) - m = MutationFilter(i -> i > 3, probe) - - @test m(1) == 1 - @test probe.mutated == [] - - @test m(4) == 4 - @test probe.mutated == [4] - end - - @testset "MutationFilter vector" begin - probe = ProbeMutation(Int) - m = MutationFilter(i -> i > 3, probe) - - @test m(1:5) == 1:5 - @test probe.mutated == [4,5] - end - - @testset "PostMutation" begin - probe = ProbeMutation(Int) - - expect_m = nothing - expect_e = nothing - function action(m,e) - expect_m = m - expect_e = e - end - - m = PostMutation(action, probe) - @test m(11) == 11 - - @test probe.mutated == [11] - @test expect_m == m - @test expect_e == 11 - end - dense(in, outsize;layerfun = LazyMutable, name="dense") = fluxvertex(name, Dense(nout(in), outsize), in, layerfun=layerfun) dense(in, outsizes...;layerfun = LazyMutable, name="dense") = foldl((next,i) -> dense(next, outsizes[i], name=join([name, i]), layerfun=layerfun), 1:length(outsizes), init=in) @@ -412,7 +248,7 @@ import NaiveGAflux: default_mergefun cl(name, in, outsize; kwargs...) 
= fluxvertex(name, Conv((1,1), nout(in)=>outsize; kwargs...), in) dl(name, in, outsize) = fluxvertex(name, Dense(nout(in), outsize), in) - + @testset "No shapechange" begin import NaiveGAflux: no_shapechange @@ -548,7 +384,7 @@ v2a1 = cl("v2a1", v2, 3) v2b1 = cl("v2b1", v2, 3) v3 = "v3" >> v2a1 + v2b1 - + m = AddEdgeMutation(1.0, mergefun = default_mergefun(0.0), utilityfun = v -> 1:nout(v)) m(v2a1) @@ -640,42 +476,4 @@ end end - @testset "OptimizerMutation" begin - import NaiveGAflux: sameopt, learningrate - import NaiveGAflux.Flux.Optimise: Optimiser - - @testset "Mutate learning rate" begin - m = OptimizerMutation(o -> sameopt(o, 10 * learningrate(o))) - - @test learningrate(m(Descent(0.1))) == 1.0 - @test learningrate(m(ShieldedOpt(Momentum(0.1)))) == 0.1 - @test learningrate(m(Optimiser(Nesterov(0.1), ShieldedOpt(ADAM(0.1))))) == 0.1 - - @test learningrate(LearningRateMutation(MockRng([0.0]))(Descent(0.1))) == 0.085 - end - - @testset "Mutate optimizer type" begin - m = OptimizerMutation((Momentum, )) - - @test typeof(m(Descent())) == Momentum - @test typeof(m(ShieldedOpt(Descent()))) == ShieldedOpt{Descent} - @test typeof.(m(Optimiser(Nesterov(), ShieldedOpt(ADAM()))).os) == [Momentum, ShieldedOpt{ADAM}] - end - - @testset "Add optimizer" begin - m = AddOptimizerMutation(o -> Descent(0.1)) - - @test typeof.(m(Descent(0.2)).os) == [Descent] - @test typeof.(m(Momentum(0.2)).os) == [Momentum, Descent] - @test typeof.(m(Flux.Optimiser(Nesterov(), Descent(), ShieldedOpt(Descent()))).os) == [Nesterov, ShieldedOpt{Descent}, Descent] - end - - @testset "MutationChain and LogMutation" begin - m = MutationChain(LogMutation(o -> "First", OptimizerMutation((Momentum, ))), LogMutation(o -> "Second", AddOptimizerMutation(o -> Descent()))) - - @test_logs (:info, "First") (:info, "Second") typeof.(m(Nesterov()).os) == [Momentum, Descent] - @test_logs (:info, "First") (:info, "First") (:info, "Second") (:info, "Second") m([Nesterov(), ADAM()]) - end - end - -end +end \ No newline at end of file diff --git a/test/mutation/optimizer.jl b/test/mutation/optimizer.jl new file mode 100644 index 00000000..951ca8af --- /dev/null +++ b/test/mutation/optimizer.jl @@ -0,0 +1,37 @@ +@testset "Optimizer mutation" begin + import NaiveGAflux: sameopt, learningrate + import NaiveGAflux.Flux.Optimise: Optimiser + + @testset "Mutate learning rate" begin + m = OptimizerMutation(o -> sameopt(o, 10 * learningrate(o))) + + @test learningrate(m(Descent(0.1))) == 1.0 + @test learningrate(m(ShieldedOpt(Momentum(0.1)))) == 0.1 + @test learningrate(m(Optimiser(Nesterov(0.1), ShieldedOpt(ADAM(0.1))))) == 0.1 + + @test learningrate(LearningRateMutation(MockRng([0.0]))(Descent(0.1))) == 0.085 + end + + @testset "Mutate optimizer type" begin + m = OptimizerMutation((Momentum, )) + + @test typeof(m(Descent())) == Momentum + @test typeof(m(ShieldedOpt(Descent()))) == ShieldedOpt{Descent} + @test typeof.(m(Optimiser(Nesterov(), ShieldedOpt(ADAM()))).os) == [Momentum, ShieldedOpt{ADAM}] + end + + @testset "Add optimizer" begin + m = AddOptimizerMutation(o -> Descent(0.1)) + + @test typeof.(m(Descent(0.2)).os) == [Descent] + @test typeof.(m(Momentum(0.2)).os) == [Momentum, Descent] + @test typeof.(m(Flux.Optimiser(Nesterov(), Descent(), ShieldedOpt(Descent()))).os) == [Nesterov, ShieldedOpt{Descent}, Descent] + end + + @testset "MutationChain and LogMutation" begin + m = MutationChain(LogMutation(o -> "First", OptimizerMutation((Momentum, ))), LogMutation(o -> "Second", AddOptimizerMutation(o -> Descent()))) + + @test_logs (:info, 
"First") (:info, "Second") typeof.(m(Nesterov()).os) == [Momentum, Descent] + @test_logs (:info, "First") (:info, "First") (:info, "Second") (:info, "Second") m([Nesterov(), ADAM()]) + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index c0b198d1..ac7b303d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -40,10 +40,13 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend include("archspace.jl") @info "Testing mutation" - include("mutation.jl") + include("mutation/generic.jl") + include("mutation/graph.jl") + include("mutation/optimizer.jl") @info "Testing crossover" - include("crossover.jl") + include("crossover/graph.jl") + include("crossover/optimizer.jl") @info "Testing fitness" include("fitness.jl") From 083da7b676fd8fb40c3ff892d5dbf8bf2c4206ab Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 5 Jun 2022 12:17:15 +0200 Subject: [PATCH 13/36] Remove PostMutation --- src/NaiveGAflux.jl | 20 ++++++++++++++------ src/mutation/generic.jl | 22 ---------------------- test/mutation/generic.jl | 18 ------------------ 3 files changed, 14 insertions(+), 46 deletions(-) diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index f68f69a0..fabda88d 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -30,13 +30,15 @@ const rng_default = MersenneTwister(abs(rand(Int))) const modeldir = "models" # Fitness -export fitness, AbstractFitness, LogFitness, GpuFitness, AccuracyFitness, TrainThenFitness, TrainAccuracyFitness, MapFitness, EwmaFitness, TimeFitness, SizeFitness, AggFitness +export fitness, AbstractFitness, LogFitness, GpuFitness, AccuracyFitness, TrainThenFitness, TrainAccuracyFitness, MapFitness +export EwmaFitness, TimeFitness, SizeFitness, AggFitness # Candidate export AbstractCandidate, CandidateModel, CandidateOptModel, CandidateDataIterMap, FittedCandidate, MapCandidate, model, opt, lossfun # Evolution -export evolve, AbstractEvolution, NoOpEvolution, AfterEvolution, EliteSelection, SusSelection, TournamentSelection, CombinedEvolution, EvolutionChain, PairCandidates, ShuffleCandidates, EvolveCandidates +export evolve, AbstractEvolution, NoOpEvolution, AfterEvolution, EliteSelection, SusSelection, TournamentSelection, CombinedEvolution +export EvolutionChain, PairCandidates, ShuffleCandidates, EvolveCandidates # Population export Population, generation @@ -48,7 +50,8 @@ export Probability, MutationShield, ApplyIf, RemoveIfSingleInput, PersistentArra export BatchSizeSelectionWithDefaultInShape, BatchSizeSelectionScaled, BatchSizeSelectionFromAlternatives, BatchSizeSelectionMaxSize, batchsizeselection # Iterators. These should preferably come from somewhere else, but I haven't found anything which fits the bill w.r.t repeatability over subsets -export RepeatPartitionIterator, SeedIterator, MapIterator, GpuIterator, BatchIterator, ShuffleIterator, TimedIterator, TimedIteratorStop, StatefulGenerationIter +export RepeatPartitionIterator, SeedIterator, MapIterator, GpuIterator, BatchIterator, ShuffleIterator, TimedIterator, TimedIteratorStop +export StatefulGenerationIter # Iterator mapping types for evolving hyperparameters related to datasets, e.g. 
augmentation and batch size export BatchSizeIteratorMap, IteratorMaps @@ -60,16 +63,21 @@ export persist export AbstractVertexSelection, AllVertices, FilterMutationAllowed # mutation types -export AbstractMutation, MutationProbability, WeightedMutationProbability, HighUtilityMutationProbability, LowUtilityMutationProbability, MutationChain, RecordMutation, LogMutation, MutationFilter, PostMutation, VertexMutation, NoutMutation, AddVertexMutation, RemoveVertexMutation, AddEdgeMutation, RemoveEdgeMutation, KernelSizeMutation, KernelSizeMutation2D, ActivationFunctionMutation, PostMutation, OptimizerMutation, LearningRateMutation, AddOptimizerMutation +export AbstractMutation, MutationProbability, WeightedMutationProbability, HighUtilityMutationProbability, LowUtilityMutationProbability +export MutationChain, RecordMutation, LogMutation, MutationFilter, VertexMutation, NoutMutation, AddVertexMutation, RemoveVertexMutation +export AddEdgeMutation, RemoveEdgeMutation, KernelSizeMutation, KernelSizeMutation2D, ActivationFunctionMutation, OptimizerMutation +export LearningRateMutation, AddOptimizerMutation # Crossover types export AbstractCrossover, VertexCrossover, CrossoverSwap, OptimizerCrossover, LearningRateCrossover # architecture spaces -export AbstractArchSpace, LoggingArchSpace, VertexSpace, NoOpArchSpace, ArchSpace, ConditionalArchSpace, RepeatArchSpace, ArchSpaceChain, ForkArchSpace, ResidualArchSpace, FunctionSpace, GlobalPoolSpace +export AbstractArchSpace, LoggingArchSpace, VertexSpace, NoOpArchSpace, ArchSpace, ConditionalArchSpace, RepeatArchSpace, ArchSpaceChain +export ForkArchSpace, ResidualArchSpace, FunctionSpace, GlobalPoolSpace # Other search space types -export BaseLayerSpace, AbstractParSpace, SingletonParSpace, Singleton2DParSpace, ParSpace, ParSpace1D, ParSpace2D, CoupledParSpace, NamedLayerSpace, LoggingLayerSpace, DenseSpace, ConvSpace, BatchNormSpace, PoolSpace, LayerVertexConf, Shielded, ConcConf +export BaseLayerSpace, AbstractParSpace, SingletonParSpace, Singleton2DParSpace, ParSpace, ParSpace1D, ParSpace2D, CoupledParSpace +export NamedLayerSpace, LoggingLayerSpace, DenseSpace, ConvSpace, BatchNormSpace, PoolSpace, LayerVertexConf, Shielded, ConcConf #weight inits export AbstractWeightInit, DefaultWeightInit, IdentityWeightInit, PartialIdentityWeightInit, ZeroWeightInit diff --git a/src/mutation/generic.jl b/src/mutation/generic.jl index 40353b82..4090d347 100644 --- a/src/mutation/generic.jl +++ b/src/mutation/generic.jl @@ -215,25 +215,3 @@ function (m::MutationFilter{T})(e::T; next=m.m, noop=identity) where T m.predicate(e) && return next(e) return noop(e) end - -""" - PostMutation{T} <: DecoratingMutation{T} - PostMutation(actions, m::AbstractMutation{T}) - PostMutation(m::AbstractMutation{T}, actions...) - -Performs a set of actions after a wrapped `AbstractMutation` is applied. - -Actions will be invoked with arguments (m::PostMutation{T}, e::T) where m is the enclosing `PostMutation` and `e` is the mutated entity of type `T`. -""" -struct PostMutation{T,A} <: DecoratingMutation{T} - actions::A - m::AbstractMutation{T} -end -PostMutation(m::AbstractMutation{T}, actions...) 
where T = PostMutation(actions, m) -PostMutation(action::Function, m::AbstractMutation{T}) where T = PostMutation(m, action) -function (m::PostMutation{T})(e::T; next=m.m, noop=identity) where T - eout = next(e) - foreach(a -> a(m, eout), m.actions) - return eout -end - diff --git a/test/mutation/generic.jl b/test/mutation/generic.jl index 406ddeba..1df32bfb 100644 --- a/test/mutation/generic.jl +++ b/test/mutation/generic.jl @@ -149,22 +149,4 @@ @test m(1:5) == 1:5 @test probe.mutated == [4,5] end - - @testset "PostMutation" begin - probe = ProbeMutation(Int) - - expect_m = nothing - expect_e = nothing - function action(m,e) - expect_m = m - expect_e = e - end - - m = PostMutation(action, probe) - @test m(11) == 11 - - @test probe.mutated == [11] - @test expect_m == m - @test expect_e == 11 - end end From b5ba5786804706f5c3ae24610e4d248aff0807d7 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 5 Jun 2022 12:29:22 +0200 Subject: [PATCH 14/36] Remove mapcandidate and change global_optimizer_mutation to use MapCandidate and MapType --- src/candidate.jl | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/candidate.jl b/src/candidate.jl index f758a4fe..a157c3b4 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -286,7 +286,7 @@ generation(c::FittedCandidate; default=nothing) = c.gen newcand(c::FittedCandidate, mapfield) = FittedCandidate(c.gen, c.fitness, newcand(wrappedcand(c), mapfield)) nparams(c::AbstractCandidate) = model(nparams, c) -nparams(x) = mapreduce(prod ∘ size, +, params(x).order; init=0) +nparams(x) = mapreduce(length, +, params(x).order; init=0) """ MapType{T, F1, F2} @@ -411,15 +411,6 @@ function (e::MapCandidate{<:NTuple{N, AbstractCrossover}, F})((c1,c2)) where {N, return newcand(c1, mapc1), newcand(c2, mapc2) end - -function mapcandidate(mapgraph, mapothers=deepcopy) - mapfield(g::CompGraph) = mapgraph(g) - mapfield(f) = mapothers(f) - # Replace with fmap? - # Maybe not, because we don't want to descend into models? - return c -> newcand(c, mapfield) -end - """ randomlrscale(rfun = BoundedRandomWalk(-1.0, 1.0)) @@ -444,6 +435,7 @@ Intended to be used with `AfterEvolution` to create things like global learning See `https://github.com/DrChainsaw/NaiveGAExperiments/blob/master/lamarckism/experiments.ipynb` for some hints as to why this might be needed. """ function global_optimizer_mutation(pop, optfun) - om = optfun(pop) - map(c -> newcand(c, optmap(om)), pop) + mt = MapType{FluxOptimizer}(optfun(pop), identity) + mc = MapCandidate(mt, identity) + map(mc, pop) end From d3c8e3019ceecf1cf47eb066493d449f03697980 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 00:22:51 +0200 Subject: [PATCH 15/36] Make batch size limitation explicit when creating a new candidate swap first arguments in limit_batchsize for smoother args propagation --- src/batchsize.jl | 36 ++++++++++++++++----------------- src/candidate.jl | 22 +++++++++++++------- src/iteratormaps.jl | 29 +++++++++++--------------- test/batchsize.jl | 32 ++++++++++++++--------------- test/candidate.jl | 48 +++++++++++++++++++++++++++++++++++++++++++- test/iteratormaps.jl | 10 ++++++--- 6 files changed, 115 insertions(+), 62 deletions(-) diff --git a/src/batchsize.jl b/src/batchsize.jl index 1c715db3..9a66c811 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -19,21 +19,21 @@ julia> graph = let """ generic_batchsizeselection_example(sbs, kwres...) 
= """ -julia> bs(graph, TrainBatchSize(512); $(first(kwres[1]))availablebytes = 10_000_000) # availablebytes supplied for doctest reasons +julia> bs(TrainBatchSize(512), graph; $(first(kwres[1]))availablebytes = 10_000_000) # availablebytes supplied for doctest reasons $(last(kwres[1])) -julia> bs(graph, TrainBatchSize(512); $(first(kwres[2]))availablebytes = 1000_000_000) +julia> bs(TrainBatchSize(512), graph; $(first(kwres[2]))availablebytes = 1000_000_000) $(last(kwres[2])) julia> $sbs -julia> sbs(graph, TrainBatchSize(512); $(first(kwres[3]))availablebytes = 10_000_000) +julia> sbs(TrainBatchSize(512), graph; $(first(kwres[3]))availablebytes = 10_000_000) $(last(kwres[3])) -julia> sbs(graph, TrainBatchSize(512); $(first(kwres[4]))availablebytes = 1000_000_000) +julia> sbs(TrainBatchSize(512), graph; $(first(kwres[4]))availablebytes = 1000_000_000) $(last(kwres[4])) -julia> bs(graph, ValidationBatchSize(512); $(first(kwres[5]))availablebytes=10_000_000) +julia> bs(ValidationBatchSize(512), graph; $(first(kwres[5]))availablebytes=10_000_000) $(last(kwres[5])) """ @@ -215,8 +215,8 @@ struct BatchSizeSelectionMaxSize{F} batchsizefun::F end BatchSizeSelectionMaxSize(uppersize) = BatchSizeSelectionMaxSize(uppersize, limit_maxbatchsize) -function (bs::BatchSizeSelectionMaxSize)(c, orgbs, args...; kwargs...) - bs.batchsizefun(c, newbatchsize(orgbs, bs.uppersize), args...; kwargs...) +function (bs::BatchSizeSelectionMaxSize)(orgbs, args...; kwargs...) + bs.batchsizefun(newbatchsize(orgbs, bs.uppersize), args...; kwargs...) end # For strange batch size types which can't be created from just a number newbatchsize(::T, newsize) where T = T(newsize) @@ -246,34 +246,34 @@ julia> import NaiveGAflux: TrainBatchSize, ValidationBatchSize # Needed only for $(generic_batchsizefun_testgraph()) julia> bs = batchsizeselection((32,32,3)); -julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) # availablebytes supplied for doctest reasons +julia> bs(TrainBatchSize(128), graph; availablebytes = 10_000_000) # availablebytes supplied for doctest reasons 84 -julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(ValidationBatchSize(128), graph; availablebytes = 10_000_000) 128 julia> bs = batchsizeselection((32,32,3); maxmemutil=0.1); -julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(TrainBatchSize(128), graph; availablebytes = 10_000_000) 12 -julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(ValidationBatchSize(128), graph; availablebytes = 10_000_000) 24 julia> bs = batchsizeselection((32,32,3); uppersize=1024); -julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(TrainBatchSize(128), graph; availablebytes = 10_000_000) 84 -julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(ValidationBatchSize(128), graph; availablebytes = 10_000_000) 170 julia> bs = batchsizeselection((32,32,3); uppersize=1024, alternatives = 2 .^ (0:10)); -julia> bs(graph, TrainBatchSize(128); availablebytes = 10_000_000) +julia> bs(TrainBatchSize(128), graph; availablebytes = 10_000_000) 64 -julia> bs(graph, ValidationBatchSize(128); availablebytes = 10_000_000) +julia> bs(ValidationBatchSize(128), graph; availablebytes = 10_000_000) 128 ``` """ @@ -290,14 +290,14 @@ end # specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( # Consider refactoring -function limit_maxbatchsize(model::CompGraph, bs::TrainBatchSize; inshape_nobatch, 
availablebytes = _availablebytes()) +function limit_maxbatchsize(bs::TrainBatchSize, model::CompGraph; inshape_nobatch, availablebytes = _availablebytes()) min(batchsize(bs), maxtrainbatchsize(model, inshape_nobatch, availablebytes)) end # specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( # Consider refactoring -function limit_maxbatchsize(model::CompGraph, - bs::ValidationBatchSize; +function limit_maxbatchsize(bs::ValidationBatchSize, + model::CompGraph; inshape_nobatch, availablebytes = _availablebytes() ) diff --git a/src/candidate.jl b/src/candidate.jl index a157c3b4..e63f5f47 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -135,7 +135,12 @@ for validation for speed reasons. """ struct CandidateDataIterMap{T<:AbstractIteratorMap, C<:AbstractCandidate} <: AbstractWrappingCandidate map::T - c::C + c::C + + CandidateDataIterMap(map, c::C) where C<:AbstractCandidate = model(c) do m + lmap = limit_maxbatchsize(map, m) + new{typeof(lmap), C}(lmap, c) + end end @functor CandidateDataIterMap @@ -143,13 +148,8 @@ end trainiterator(c::CandidateDataIterMap; kwargs...) = maptrain(c.map, trainiterator(wrappedcand(c); kwargs...)) validationiterator(c::CandidateDataIterMap; kwargs...) = mapvalidation(c.map, validationiterator(wrappedcand(c); kwargs...)) -function newcand(c::CandidateDataIterMap, mapfield) - nc = newcand(wrappedcand(c), mapfield) - CandidateDataIterMap(apply_mapfield(mapfield, c.map, nc), nc) -end +newcand(c::CandidateDataIterMap, mapfield) = CandidateDataIterMap(mapfield(c.map), newcand(wrappedcand(c), mapfield)) -# Just because BatchSizeIteratorMap needs the model to limit the batch sizes :( -apply_mapfield(f, x, ::AbstractCandidate) = f(x) """ FileCandidate <: AbstractWrappingCandidate @@ -328,6 +328,14 @@ function MapType(c::AbstractCrossover{FluxOptimizer}, (c1, c2), (nomatch1, nomat return MapType{FluxOptimizer}(Returns(o1n), nomatch1), MapType{FluxOptimizer}(Returns(o2n), nomatch2) end +# Just because BatchSizeIteratorMap needs the model to limit the batch sizes :( +# Try to come up with a cleaner design... +apply_mapfield(f::MapType, x, args...) = (@info "apply $f to $x"; apply_mapfield(f.nomatch, x, args...)) +apply_mapfield(f::MapType{T1}, x::T2, args...) where {T1, T2<:T1} = apply_mapfield(f.match, x, args...) +apply_mapfield(f::typeof(deepcopy), x, args...) = f(x) +apply_mapfield(f, x, args...) = f(x, args...) + + """ MapCandidate{T, F} MapCandidate(mutations, mapothers::F) diff --git a/src/iteratormaps.jl b/src/iteratormaps.jl index 2ee0e2f9..b94e1ad8 100644 --- a/src/iteratormaps.jl +++ b/src/iteratormaps.jl @@ -13,6 +13,8 @@ abstract type AbstractIteratorMap end maptrain(::AbstractIteratorMap, iter) = iter mapvalidation(::AbstractIteratorMap, iter) = iter +limit_maxbatchsize(im::AbstractIteratorMap, args...; kwargs...) = im + """ BatchSizeIteratorMap{F} <: AbstractIteratorMap BatchSizeIteratorMap(limitfun, trainbatchsize, validationbatchsize, model) @@ -23,12 +25,11 @@ Use [`batchsizeselection`](@ref) to create an appropriate `limitfun`. 
# Examples ```jldoctest -julia> using NaiveGAflux, Flux +julia> using NaiveGAflux julia> import NaiveGAflux: maptrain, mapvalidation # needed for examples only -$(generic_batchsizefun_testgraph()) -julia> bsim = BatchSizeIteratorMap(4, 8, batchsizeselection((32,32,3)), graph); +julia> bsim = BatchSizeIteratorMap(4, 8, batchsizeselection((32,32,3))); julia> collect(maptrain(bsim, (1:20,))) 5-element Vector{Vector{Int64}}: @@ -65,27 +66,19 @@ struct BatchSizeIteratorMap{F} <: AbstractIteratorMap tbs::TrainBatchSize vbs::ValidationBatchSize limitfun::F - function BatchSizeIteratorMap{F}(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, model) where F - new{F}(TrainBatchSize(limitfun(model, tbs)), ValidationBatchSize(limitfun(model, vbs)), limitfun) - end end -function BatchSizeIteratorMap(tbs::Integer, vbs::Integer, limitfun, model) - BatchSizeIteratorMap(TrainBatchSize(tbs), ValidationBatchSize(vbs), limitfun, model) -end - - -function BatchSizeIteratorMap(tbs::TrainBatchSize, vbs::ValidationBatchSize, limitfun::F, model) where F - BatchSizeIteratorMap{F}(tbs, vbs, limitfun, model) -end - -apply_mapfield(::typeof(deepcopy), bsim::BatchSizeIteratorMap, c) = model(c) do m - BatchSizeIteratorMap(bsim.tbs, bsim.vbs, deepcopy(bsim.limitfun), m) +function BatchSizeIteratorMap(tbs::Integer, vbs::Integer, limitfun) + BatchSizeIteratorMap(TrainBatchSize(tbs), ValidationBatchSize(vbs), limitfun) end maptrain(bs::BatchSizeIteratorMap, iter) = setbatchsize(iter, batchsize(bs.tbs)) mapvalidation(bs::BatchSizeIteratorMap, iter) = setbatchsize(iter, batchsize(bs.vbs)) +function limit_maxbatchsize(bsim::BatchSizeIteratorMap, args...; kwargs...) + BatchSizeIteratorMap(bsim.limitfun(bsim.tbs, args...; kwargs...), bsim.limitfun(bsim.vbs, args...; kwargs...), bsim.limitfun) +end + """ IteratorMaps{T} <: AbstractIteratorMap """ @@ -96,3 +89,5 @@ IteratorMaps(x...) = IteratorMaps(x) maptrain(iws::IteratorMaps, iter) = foldr(maptrain, iws.maps; init=iter) mapvalidation(iws::IteratorMaps, iter) = foldr(mapvalidation, iws.maps; init=iter) + +limit_maxbatchsize(ims::IteratorMaps, args...; kwargs...) 
= IteratorMaps(map(im -> limit_maxbatchsize(im, args...; kwargs...), ims.maps)) \ No newline at end of file diff --git a/test/batchsize.jl b/test/batchsize.jl index aa3f9f1c..81e43bec 100644 --- a/test/batchsize.jl +++ b/test/batchsize.jl @@ -68,13 +68,13 @@ import NaiveGAflux: limit_maxbatchsize, TrainBatchSize, ValidationBatchSize graph = testgraph(5) - @test limit_maxbatchsize(graph, TrainBatchSize(1); inshape_nobatch=(5,), availablebytes=1000) == 1 - @test limit_maxbatchsize(graph, TrainBatchSize(2); inshape_nobatch=(5,), availablebytes=1000) == 2 - @test limit_maxbatchsize(graph, TrainBatchSize(3); inshape_nobatch=(5,), availablebytes=1000) == 2 + @test limit_maxbatchsize(TrainBatchSize(1), graph; inshape_nobatch=(5,), availablebytes=1000) == 1 + @test limit_maxbatchsize(TrainBatchSize(2), graph; inshape_nobatch=(5,), availablebytes=1000) == 2 + @test limit_maxbatchsize(TrainBatchSize(3), graph; inshape_nobatch=(5,), availablebytes=1000) == 2 - @test limit_maxbatchsize(graph, ValidationBatchSize(6); inshape_nobatch=(5,), availablebytes=1000) == 6 - @test limit_maxbatchsize(graph, ValidationBatchSize(8); inshape_nobatch=(5,), availablebytes=1000) == 8 - @test limit_maxbatchsize(graph, ValidationBatchSize(10); inshape_nobatch=(5,), availablebytes=1000) == 8 + @test limit_maxbatchsize(ValidationBatchSize(6), graph; inshape_nobatch=(5,), availablebytes=1000) == 6 + @test limit_maxbatchsize(ValidationBatchSize(8), graph; inshape_nobatch=(5,), availablebytes=1000) == 8 + @test limit_maxbatchsize(ValidationBatchSize(10), graph; inshape_nobatch=(5,), availablebytes=1000) == 8 end @testset "batchsizeselection" begin @@ -83,24 +83,24 @@ graph = testgraph(4) bs = batchsizeselection((4,)) - @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 19 - @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 31 + @test bs(TrainBatchSize(31), graph; availablebytes=10000) == 19 + @test bs(ValidationBatchSize(31), graph; availablebytes=10000) == 31 bs = batchsizeselection((4,); maxmemutil=0.1) - @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 2 - @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 8 + @test bs(TrainBatchSize(31), graph; availablebytes=10000) == 2 + @test bs(ValidationBatchSize(31), graph; availablebytes=10000) == 8 bs = batchsizeselection((4,); uppersize=64) - @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 19 - @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 64 + @test bs(TrainBatchSize(31), graph; availablebytes=10000) == 19 + @test bs(ValidationBatchSize(31), graph; availablebytes=10000) == 64 bs = batchsizeselection((4,); alternatives=2 .^ (0:10)) - @test bs(graph, TrainBatchSize(33), availablebytes=10000) == 16 - @test bs(graph, ValidationBatchSize(33), availablebytes=10000) == 32 + @test bs(TrainBatchSize(33), graph; availablebytes=10000) == 16 + @test bs(ValidationBatchSize(33), graph; availablebytes=10000) == 32 bs = batchsizeselection((4,); uppersize=65, alternatives=2 .^ (0:10)) - @test bs(graph, TrainBatchSize(31), availablebytes=10000) == 16 - @test bs(graph, ValidationBatchSize(31), availablebytes=10000) == 64 + @test bs(TrainBatchSize(31), graph; availablebytes=10000) == 16 + @test bs(ValidationBatchSize(31), graph; availablebytes=10000) == 64 end end \ No newline at end of file diff --git a/test/candidate.jl b/test/candidate.jl index 7d5c2985..45a7d84f 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -121,7 +121,7 @@ end using Functors: fmap import MemPool - CandidateBatchIterMap(g) = 
CandidateDataIterMap(BatchSizeIteratorMap(16,32, batchsizeselection((3,)), g), CandidateModel(g)) + CandidateBatchIterMap(g) = CandidateDataIterMap(BatchSizeIteratorMap(16,32, batchsizeselection((3,))), CandidateModel(g)) @testset "$ctype" for (ctype, candfun) in ( (CandidateModel, CandidateModel), @@ -197,6 +197,52 @@ end end end + @testset "CandidateDataIterMap" begin + import NaiveGAflux: MapType, batchsize + + function grabmodellimitfun() + seenmodel = Symbol[] + function(bs, m) + push!(seenmodel, m) + return 17 + end + end + + @testset "With BatchSizeIteratorMap" begin + + bsim = BatchSizeIteratorMap(16, 32, grabmodellimitfun()) + c = CandidateDataIterMap(bsim, CandidateModel(:m1)) + mc = MapCandidate(MapType{Symbol}(Returns(:m2), deepcopy), identity) + + cnew = mc(c) + + @test model(cnew) == :m2 + @test cnew.map.limitfun.seenmodel == [:m1, :m1, :m2, :m2] + @test batchsize(cnew.map.tbs) == 17 + @test batchsize(cnew.map.vbs) == 17 + end + + @testset "With IteratorMaps" begin + + dummyim = Ref(Val(:DummyIm1)) + NaiveGAflux.limit_maxbatchsize(d::Base.RefValue{Val{:DummyIm1}}, args...; kwargs...) = d + + bsim = BatchSizeIteratorMap(16, 32, grabmodellimitfun()) + im = IteratorMaps(bsim, dummyim) + c = CandidateDataIterMap(im, CandidateModel(:m1)) + mc = MapCandidate(MapType{Symbol}(Returns(:m2), deepcopy), identity) + + cnew = mc(c) + + @test model(cnew) == :m2 + @test cnew.map.maps[1].limitfun.seenmodel == [:m1, :m1, :m2, :m2] + @test batchsize(cnew.map.maps[1].tbs) == 17 + @test batchsize(cnew.map.maps[1].vbs) == 17 + + @test dummyim !== cnew.map.maps[2] + end + end + @testset "FileCandidate" begin try @testset "FileCandidate cleanup" begin diff --git a/test/iteratormaps.jl b/test/iteratormaps.jl index 091cb838..56acb4b6 100644 --- a/test/iteratormaps.jl +++ b/test/iteratormaps.jl @@ -1,5 +1,5 @@ @testset "Iterator maps" begin - import NaiveGAflux: maptrain, mapvalidation + import NaiveGAflux: maptrain, mapvalidation, limit_maxbatchsize @testset "BatchSizeIteratorMap" begin function testgraph(insize) @@ -10,11 +10,15 @@ CompGraph(v0, "v4" >> v3 + v3) end - bsim = BatchSizeIteratorMap(2, 4, batchsizeselection((5,)), testgraph(5)) - + bsim = BatchSizeIteratorMap(2, 4, batchsizeselection((5,); batchsizefun=(bs, newbs; scale, kws...) -> newbs * scale)) + @testset "Single array" begin @test collect(maptrain(bsim, (1:20,))) == [a:a+1 for a in 1:2:20] @test collect(mapvalidation(bsim, (1:20,))) == [a:a+3 for a in 1:4:20] + + bsimnew = limit_maxbatchsize(bsim, 2; scale=5) + @test collect(maptrain(bsimnew, (1:20,))) == [a:a+9 for a in 1:10:20] + @test collect(mapvalidation(bsimnew, (1:20,))) == [a:a+9 for a in 1:10:20] end @testset "BatchIterator" begin From d16974ccc21ed1795f2dda037069fe7fdf67fe52 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 01:04:01 +0200 Subject: [PATCH 16/36] Fix formatting error Fix some instances of abstract fields --- src/mutation/generic.jl | 20 +-- src/mutation/graph.jl | 306 ++++++++++++++++++++-------------------- 2 files changed, 163 insertions(+), 163 deletions(-) diff --git a/src/mutation/generic.jl b/src/mutation/generic.jl index 4090d347..368e9096 100644 --- a/src/mutation/generic.jl +++ b/src/mutation/generic.jl @@ -71,8 +71,8 @@ end Applies `m` with probability `p`. 
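A minimal usage sketch (the wrapped `NoutMutation` and the probability value are arbitrary example choices):

```julia
using NaiveGAflux

# Apply the wrapped size mutation to roughly 30% of the vertices it is called on.
m = MutationProbability(NoutMutation(-0.1, 0.1), 0.3)

# Given an AbstractVertex v in a mutable graph, m(v) either applies the
# wrapped mutation to v or returns v untouched.
```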
""" -struct MutationProbability{T, P<:Probability} <: DecoratingMutation{T} - m::AbstractMutation{T} +struct MutationProbability{T, M<:AbstractMutation{T}, P<:Probability} <: DecoratingMutation{T} + m::M p::P end MutationProbability(m::AbstractMutation{T}, p::Number) where T = MutationProbability(m, Probability(p)) @@ -144,8 +144,8 @@ Chains multiple `AbstractMutation{T}`s after each other. Input entities will be mutated by the first `AbstractMutation{T}` in the chain and the output will be fed into the next `AbstractMutation{T}` in the chain and so on. The output from the last `AbstractMutation{T}` is returned. """ -struct MutationChain{T} <: DecoratingMutation{T} - m::Tuple{Vararg{AbstractMutation{T}}} +struct MutationChain{T, M<:Tuple{Vararg{AbstractMutation{T}}}} <: DecoratingMutation{T} + m::M end MutationChain(m::AbstractMutation{T}...) where T = MutationChain(m) # Identical, but can't use Union due to ambiguity @@ -160,8 +160,8 @@ Records all mutated entities. Intended use case is to be able to do parameter selection on mutated vertices. """ -struct RecordMutation{T} <: DecoratingMutation{T} - m::AbstractMutation{T} +struct RecordMutation{T, M<:AbstractMutation{T}} <: DecoratingMutation{T} + m::M mutated::Vector{T} end RecordMutation(m::AbstractMutation{T}) where T = RecordMutation(m, T[]) @@ -189,11 +189,11 @@ Calling `nextlogfun(e)` where `e` is the entity to mutate produces an `AbstractL By default, this is used to add a level of indentation to subsequent logging calls which makes logs of hierarchical mutations (e.g. mutate a CompGraph by applying mutations to some of its vertices) easier to read. Set `nextlogfun = e -> current_logger()` to remove this behaviour. """ -struct LogMutation{T,F,L<:LogLevel,LF} <: DecoratingMutation{T} +struct LogMutation{T,F,L<:LogLevel,LF,M<:AbstractMutation{T}} <: DecoratingMutation{T} strfun::F level::L nextlogfun::LF - m::AbstractMutation{T} + m::M end LogMutation(strfun, m::AbstractMutation{T}; level = Logging.Info, nextlogfun=e -> PrefixLogger(" ")) where T = LogMutation(strfun, level, nextlogfun, m) function (m::LogMutation{T})(e::T; next=m.m, noop=identity) where T @@ -207,9 +207,9 @@ end Applies mutation `m` only for entities `e` for which `predicate(e)` returns true. """ -struct MutationFilter{T,P} <: DecoratingMutation{T} +struct MutationFilter{T,P,M<:AbstractMutation{T}} <: DecoratingMutation{T} predicate::P - m::AbstractMutation{T} + m::M end function (m::MutationFilter{T})(e::T; next=m.m, noop=identity) where T m.predicate(e) && return next(e) diff --git a/src/mutation/graph.jl b/src/mutation/graph.jl index e4ef145e..d5ee42c3 100644 --- a/src/mutation/graph.jl +++ b/src/mutation/graph.jl @@ -8,14 +8,14 @@ Applies a wrapped `AbstractMutation{AbstractVertex}` to each selected vertex in Vertices to select is determined by the configured `AbstractVertexSelection`. """ -struct VertexMutation{S<:AbstractVertexSelection} <: DecoratingMutation{CompGraph} -m::AbstractMutation{AbstractVertex} -s::S +struct VertexMutation{M<:AbstractMutation{AbstractVertex}, S<:AbstractVertexSelection} <: DecoratingMutation{CompGraph} + m::M + s::S end VertexMutation(m::AbstractMutation{AbstractVertex}) = VertexMutation(m, FilterMutationAllowed()) function (m::VertexMutation)(g::CompGraph) -m.m(select(m.s, g, m)) -return g + m.m(select(m.s, g, m)) + return g end """ @@ -30,48 +30,48 @@ Size is changed by `x * nout(v)` rounded away from from zero where `x` is drawn `minrel` and `maxrel` are `l1` and `l2` if `l1 < l2` and `l2` and `l1` otherwise. 
""" struct NoutMutation{R<:Real, RNG<:AbstractRNG} <:AbstractMutation{AbstractVertex} -minrel::R -maxrel::R -rng::RNG -function NoutMutation(l1::R1, l2::R2, rng::RNG) where {R1, R2, RNG} - R = promote_type(R1, R2) - return l1 < l2 ? new{R, RNG}(promote(l1, l2)..., rng) : new{R, RNG}(promote(l2, l1)..., rng) -end + minrel::R + maxrel::R + rng::RNG + function NoutMutation(l1::R1, l2::R2, rng::RNG) where {R1, R2, RNG} + R = promote_type(R1, R2) + return l1 < l2 ? new{R, RNG}(promote(l1, l2)..., rng) : new{R, RNG}(promote(l2, l1)..., rng) + end end NoutMutation(limit, rng::AbstractRNG=rng_default) = NoutMutation(0, limit, rng) NoutMutation(l1,l2) = NoutMutation(l1,l2, rng_default) (m::NoutMutation)(v::AbstractVertex) = first(m([v])) function (m::NoutMutation)(vs::AbstractVector{<:AbstractVertex}) -Δs = Dict{AbstractVertex, Int}() -shift = m.minrel -scale = m.maxrel - m.minrel + Δs = Dict{AbstractVertex, Int}() + shift = m.minrel + scale = m.maxrel - m.minrel -for v in vs - terminputs = findterminating(v, inputs) + for v in vs + terminputs = findterminating(v, inputs) - # We are basically just searching for Immutable vertices here, allow_mutation(trait(v)) happens to do just that - any(tv -> allow_mutation(trait(tv)), terminputs) || continue - - Δfloat = rand(m.rng) * scale + shift + # We are basically just searching for Immutable vertices here, allow_mutation(trait(v)) happens to do just that + any(tv -> allow_mutation(trait(tv)), terminputs) || continue + + Δfloat = rand(m.rng) * scale + shift - Δ = ceil(Int, abs(Δfloat) * nout(v)) * sign(Δfloat) - minsize = minimum(nout.(terminputs)) - # Or else we might increase the size despite Δ being negative which would be surprising to a user who has specified - # strictly negative size changes - minsize + Δ <= 0 && continue + Δ = ceil(Int, abs(Δfloat) * nout(v)) * sign(Δfloat) + minsize = minimum(nout.(terminputs)) + # Or else we might increase the size despite Δ being negative which would be surprising to a user who has specified + # strictly negative size changes + minsize + Δ <= 0 && continue - Δs[v] = Δ -end + Δs[v] = Δ + end -if !isempty(Δs) - failmsg = (args...) -> "Could not change nout of $(join(NaiveNASlib.nameorrepr.(keys(Δs)), ", ", " and ")) by $(join(values(Δs), ", ", " and ")). No change!" + if !isempty(Δs) + failmsg = (args...) -> "Could not change nout of $(join(NaiveNASlib.nameorrepr.(keys(Δs)), ", ", " and ")) by $(join(values(Δs), ", ", " and ")). No change!" - strategy = TimeOutAction(;base=ΔNoutRelaxed(Δs), fallback=LogΔSizeExec(failmsg, Logging.Warn, ΔSizeFailNoOp())) + strategy = TimeOutAction(;base=ΔNoutRelaxed(Δs), fallback=LogΔSizeExec(failmsg, Logging.Warn, ΔSizeFailNoOp())) - Δsize!(strategy) -end -return vs + Δsize!(strategy) + end + return vs end """ @@ -86,18 +86,18 @@ Insert a vertex from the wrapped `AbstractArchSpace` `s` after a given vertex `v The function `outselect` takes an `AbstractVector{AbstractVertex}` representing the output of `v` and returns an `AbstractVector{AbstractVertex}` which shall be reconnected to the vertex `v'` returned by `s`. Defaults to `identity` meaning all outputs of `v` are reconnected to `v'`. 
""" struct AddVertexMutation{S<:AbstractArchSpace, F, WI<:AbstractWeightInit, RNG<:AbstractRNG} <:AbstractMutation{AbstractVertex} -s::S -outselect::F -weightinit::WI -rng::RNG + s::S + outselect::F + weightinit::WI + rng::RNG end AddVertexMutation(s, outselect::Function=identity) = AddVertexMutation(s, outselect, IdentityWeightInit(), rng_default) AddVertexMutation(s, rng::AbstractRNG) = AddVertexMutation(s, identity, IdentityWeightInit(), rng) AddVertexMutation(s, wi::AbstractWeightInit) = AddVertexMutation(s, identity, wi, rng_default) function (m::AddVertexMutation)(v::AbstractVertex) -insert!(v, vi -> m.s(name(vi), vi, m.rng, outsize=nout(vi), wi=m.weightinit), m.outselect) -return v + insert!(v, vi -> m.s(name(vi), vi, m.rng, outsize=nout(vi), wi=m.weightinit), m.outselect) + return v end """ @@ -114,16 +114,16 @@ Default reconnect strategy is `ConnectAll`. Note: High likelyhood of large accuracy degradation after applying this mutation. """ struct RemoveVertexMutation{S<:RemoveStrategy} <:AbstractMutation{AbstractVertex} -s::S + s::S end function RemoveVertexMutation() -alignstrat = IncreaseSmaller(fallback=DecreaseBigger(fallback=AlignSizeBoth(fallback=FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! Could not align sizes of neighbours!")))) -return RemoveVertexMutation(RemoveStrategy(CheckAligned(CheckNoSizeCycle(alignstrat, FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! Size cycle detected!"))))) + alignstrat = IncreaseSmaller(fallback=DecreaseBigger(fallback=AlignSizeBoth(fallback=FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! Could not align sizes of neighbours!")))) + return RemoveVertexMutation(RemoveStrategy(CheckAligned(CheckNoSizeCycle(alignstrat, FailAlignSizeWarn(msgfun = (vin,vout) -> "Can not remove vertex $(name(vin))! Size cycle detected!"))))) end function (m::RemoveVertexMutation)(v::AbstractVertex) -remove!(v, m.s) -return v + remove!(v, m.s) + return v end default_neuronselect(args...) = NaiveNASlib.defaultutility(args...) @@ -144,92 +144,92 @@ When selecting neurons/outputs after any eventual size change the output of `uti Note: High likelyhood of large accuracy degradation after applying this mutation. 
""" struct AddEdgeMutation{F1, F2, F3, P<:Probability, RNG} <: AbstractMutation{AbstractVertex} -mergefun::F1 -filtfun::F2 -utilityfun::F3 -p::P -rng::RNG + mergefun::F1 + filtfun::F2 + utilityfun::F3 + p::P + rng::RNG end AddEdgeMutation(p; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) = AddEdgeMutation(Probability(p, rng), rng=rng, mergefun=mergefun, filtfun=filtfun, utilityfun=utilityfun) AddEdgeMutation(p::Probability; rng=rng_default, mergefun=default_mergefun(rng=rng), filtfun=no_shapechange, utilityfun=default_neuronselect) = AddEdgeMutation(mergefun, filtfun, utilityfun, p, rng) default_mergefun(pconc = 0.5; rng=rng_default, traitfun = MutationShield ∘ RemoveIfSingleInput ∘ validated() ∘ default_logging(), layerfun = ActivationContribution) = function(vin) -if rand(rng) > pconc - return invariantvertex(layerfun(+), vin, traitdecoration=traitfun ∘ named(name(vin) * ".add")) -end -return concat(vin, traitfun = traitfun ∘ named(name(vin) * ".cat"), layerfun=layerfun) + if rand(rng) > pconc + return invariantvertex(layerfun(+), vin, traitdecoration=traitfun ∘ named(name(vin) * ".add")) + end + return concat(vin, traitfun = traitfun ∘ named(name(vin) * ".cat"), layerfun=layerfun) end function no_shapechange(vi) -# all_in_graph is not sorted, and we want some kind of topoligical order here so that earlier indices are closer to vi -allsorted = mapreduce(ancestors, vcat, filter(v -> isempty(outputs(v)), all_in_graph(vi))) |> unique - -# Vertices which have the same input as vi and are singleinput -# Reason is that this will cause a new vertex to be added between the target output vertex vo -# and the input vertex to vi (vii) and this is detected as a size cycle which causes -# try_add_edge to fail. -inouts = filter(singleinput, mapreduce(outputs, vcat, inputs(vi); init=[])) -# All vertices which are after vi in the topology -vsafter = setdiff(allsorted, ancestors(vi), outputs(vi), inouts) - -vitrace = shapetrace(vi) -viorder = allΔshapetypes(vitrace) -viΔshape = squashshapes(vitrace; order=viorder) - -return filter(vsafter) do vafter - all(inputs(vafter)) do v - t = shapetrace(v) - vΔshape = squashshapes(t; order=union(viorder, allΔshapetypes(t))) - return viΔshape == vΔshape + # all_in_graph is not sorted, and we want some kind of topoligical order here so that earlier indices are closer to vi + allsorted = mapreduce(ancestors, vcat, filter(v -> isempty(outputs(v)), all_in_graph(vi))) |> unique + + # Vertices which have the same input as vi and are singleinput + # Reason is that this will cause a new vertex to be added between the target output vertex vo + # and the input vertex to vi (vii) and this is detected as a size cycle which causes + # try_add_edge to fail. 
+ inouts = filter(singleinput, mapreduce(outputs, vcat, inputs(vi); init=[])) + # All vertices which are after vi in the topology + vsafter = setdiff(allsorted, ancestors(vi), outputs(vi), inouts) + + vitrace = shapetrace(vi) + viorder = allΔshapetypes(vitrace) + viΔshape = squashshapes(vitrace; order=viorder) + + return filter(vsafter) do vafter + all(inputs(vafter)) do v + t = shapetrace(v) + vΔshape = squashshapes(t; order=union(viorder, allΔshapetypes(t))) + return viΔshape == vΔshape + end end end -end function (m::AddEdgeMutation)(vi::AbstractVertex) -# All vertices for which it is allowed to add vi as an input -allverts = filter(allow_mutation, m.filtfun(vi)) -isempty(allverts) && return vi - -# Higher probability to select a vertex close to v is desired behaviour -# One line less than a for loop => FP wins!! -selfun(::Nothing, vc) = apply(m.p) ? vc : nothing -selfun(vs, vd) = vs -vo = foldl(selfun, allverts, init=nothing) -vo = isnothing(vo) ? rand(m.rng, allverts) : vo - -try_add_edge(vi, vo, m.mergefun, m.utilityfun) -return vi + # All vertices for which it is allowed to add vi as an input + allverts = filter(allow_mutation, m.filtfun(vi)) + isempty(allverts) && return vi + + # Higher probability to select a vertex close to v is desired behaviour + # One line less than a for loop => FP wins!! + selfun(::Nothing, vc) = apply(m.p) ? vc : nothing + selfun(vs, vd) = vs + vo = foldl(selfun, allverts, init=nothing) + vo = isnothing(vo) ? rand(m.rng, allverts) : vo + + try_add_edge(vi, vo, m.mergefun, m.utilityfun) + return vi end function try_add_edge(vi, vo, mergefun, utilityfun=default_neuronselect) -# Need to add a vertex which can handle multiple inputs if vo is single input only -# For cleaning up added vertex if the whole operation fails -cleanup_failed = () -> nothing -if singleinput(vo) - voi = inputs(vo)[1] - # If the input to vo is capable of multi input we don't need to create a new vertex - # We must also check that this input does not happen to be an input to vi as this would create a cycle in the graph - if singleinput(voi) || voi in ancestors(vi) - vm = mergefun(voi) - # Insert vm between voi and vo, i.e voi -> vo turns into voi -> vm -> vo - # vs -> [vo] means only add the new vertex between voi and vo as voi could have other outputs - insert!(voi, vv -> vm, vs -> [vo]) - cleanup_failed = function() - length(inputs(vm)) > 1 && return - remove!(vm, RemoveStrategy(NoSizeChange())) + # Need to add a vertex which can handle multiple inputs if vo is single input only + # For cleaning up added vertex if the whole operation fails + cleanup_failed = () -> nothing + if singleinput(vo) + voi = inputs(vo)[1] + # If the input to vo is capable of multi input we don't need to create a new vertex + # We must also check that this input does not happen to be an input to vi as this would create a cycle in the graph + if singleinput(voi) || voi in ancestors(vi) + vm = mergefun(voi) + # Insert vm between voi and vo, i.e voi -> vo turns into voi -> vm -> vo + # vs -> [vo] means only add the new vertex between voi and vo as voi could have other outputs + insert!(voi, vv -> vm, vs -> [vo]) + cleanup_failed = function() + length(inputs(vm)) > 1 && return + remove!(vm, RemoveStrategy(NoSizeChange())) + end + vo = vm # vm is the one we shall add an edge to + @debug "Create new vertex for merging $(name(vo))" + else + vo = voi end - vo = vm # vm is the one we shall add an edge to - @debug "Create new vertex for merging $(name(vo))" - else - vo = voi end -end -# This is mainly because 
FailAlignSizeRevert does not work when the same vertex is input more than once, but it also seems kinda redundant. -vi in inputs(vo) && return -@debug "Create edge between $(name(vi)) and $(name(vo))" -create_edge!(vi, vo, strategy = create_edge_strat(vo, utilityfun)) -cleanup_failed() + # This is mainly because FailAlignSizeRevert does not work when the same vertex is input more than once, but it also seems kinda redundant. + vi in inputs(vo) && return + @debug "Create edge between $(name(vi)) and $(name(vo))" + create_edge!(vi, vo, strategy = create_edge_strat(vo, utilityfun)) + cleanup_failed() end # Need to override this one for strange types e.g. layers which support exactly 2 inputs or something. singleinput(v) = isempty(inputs(v)) || length(inputs(v)) == 1 @@ -237,19 +237,19 @@ singleinput(v) = isempty(inputs(v)) || length(inputs(v)) == 1 create_edge_strat(v::AbstractVertex, utilityfun) = create_edge_strat(trait(v), utilityfun) create_edge_strat(d::DecoratingTrait, utilityfun) = create_edge_strat(base(d), utilityfun) function create_edge_strat(::SizeInvariant, utilityfun) -warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") -alignstrat = AlignSizeBoth(;mapstrat=WithUtilityFun(utilityfun), fallback = warnfailalign) -# Tricky failure case: It is possible that CheckCreateEdgeNoSizeCycle does not detect any size cycle until after the edge has been created? -sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! Size cycle detected!") + warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") + alignstrat = AlignSizeBoth(;mapstrat=WithUtilityFun(utilityfun), fallback = warnfailalign) + # Tricky failure case: It is possible that CheckCreateEdgeNoSizeCycle does not detect any size cycle until after the edge has been created? + sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! Size cycle detected!") -return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn) + return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn) end function create_edge_strat(::SizeStack, utilityfun) -warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") -alignstrat = PostAlign(TruncateInIndsToValid(WithUtilityFun(utilityfun, AlignNinToNout(;fallback=ΔSizeFailNoOp()))), fallback=warnfailalign) + warnfailalign = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))!") + alignstrat = PostAlign(TruncateInIndsToValid(WithUtilityFun(utilityfun, AlignNinToNout(;fallback=ΔSizeFailNoOp()))), fallback=warnfailalign) -sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! Size cycle detected!") -return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn) + sizecyclewarn = FailAlignSizeWarn(msgfun = (vin,vout) -> "Could not align sizes of $(name(vin)) and $(name(vout))! Size cycle detected!") + return CheckCreateEdgeNoSizeCycle(ifok=alignstrat, ifnok=sizecyclewarn) end """ @@ -267,24 +267,24 @@ When selecting neurons/outputs after any eventual size change the output of `uti Note: High likelyhood of large accuracy degradation after applying this mutation. 
""" struct RemoveEdgeMutation{F, RNG<:AbstractRNG} <: AbstractMutation{AbstractVertex} -utilityfun::F -rng::RNG + utilityfun::F + rng::RNG end RemoveEdgeMutation(;utilityfun=default_neuronselect, rng=rng_default) = RemoveEdgeMutation(utilityfun, rng) function (m::RemoveEdgeMutation)(vi::AbstractVertex) -length(outputs(vi)) < 2 && return vi + length(outputs(vi)) < 2 && return vi -allverts = filter(vo -> length(inputs(vo)) > 1, outputs(vi)) + allverts = filter(vo -> length(inputs(vo)) > 1, outputs(vi)) -isempty(allverts) && return vi + isempty(allverts) && return vi -vo = rand(m.rng, allverts) -sum(inputs(vo) .== vi) > 1 && return vi# Not implemented in NaiveNASlib + vo = rand(m.rng, allverts) + sum(inputs(vo) .== vi) > 1 && return vi# Not implemented in NaiveNASlib -@debug "Remove edge between $(name(vi)) and $(name(vo))" -remove_edge!(vi, vo, strategy=remove_edge_strat(vo, m.utilityfun)) -return vi + @debug "Remove edge between $(name(vi)) and $(name(vo))" + remove_edge!(vi, vo, strategy=remove_edge_strat(vo, m.utilityfun)) + return vi end remove_edge_strat(v::AbstractVertex, utilityfun) = remove_edge_strat(trait(v), utilityfun) @@ -305,25 +305,25 @@ Note: High likelyhood of large accuracy degradation after applying this mutation `KernelSizeMutation2D` is a convenience constructor for `KernelSizeMutation(absΔ, absΔ;...)`. """ struct KernelSizeMutation{N,F,P} <: AbstractMutation{AbstractVertex} -Δsizespace::AbstractParSpace{N, Int} -maxsize::F -pad::P -rng::AbstractRNG + Δsizespace::AbstractParSpace{N, Int} + maxsize::F + pad::P + rng::AbstractRNG end KernelSizeMutation(Δsizespace::AbstractParSpace{N, Int}; maxsize = v -> ntuple(i->Inf,N), pad=SamePad(), rng=rng_default) where N = KernelSizeMutation(Δsizespace, maxsize, pad, rng) KernelSizeMutation2D(absΔ::Integer;maxsize = v -> (Inf,Inf), pad=SamePad(), rng=rng_default) = KernelSizeMutation(absΔ, absΔ, maxsize = maxsize, pad=pad, rng=rng) KernelSizeMutation(absΔ::Integer...;maxsize = v -> ntuple(i->Inf, length(absΔ)), pad=SamePad(), rng=rng_default) = KernelSizeMutation(ParSpace(UnitRange.(.-absΔ, absΔ));maxsize = maxsize, pad=pad, rng=rng) function (m::KernelSizeMutation{N})(v::AbstractVertex) where N -layertype(v) isa FluxConvolutional{N} || return -l = layer(v) - -currsize = size(NaiveNASflux.weights(l))[1:N] -Δsize = Int.(clamp.(m.Δsizespace(m.rng), 1 .- currsize, m.maxsize(v) .- currsize)) # ensure new size is > 0 and < maxsize -# This will eventually boil down to Setfield doing its thing, and that won't be using any convenience constructors -pad = Flux.calc_padding(typeof(l), m.pad, currsize .+ Δsize, dilation(l), stride(l)) -KernelSizeAligned(Δsize, pad)(v) -return v + layertype(v) isa FluxConvolutional{N} || return + l = layer(v) + + currsize = size(NaiveNASflux.weights(l))[1:N] + Δsize = Int.(clamp.(m.Δsizespace(m.rng), 1 .- currsize, m.maxsize(v) .- currsize)) # ensure new size is > 0 and < maxsize + # This will eventually boil down to Setfield doing its thing, and that won't be using any convenience constructors + pad = Flux.calc_padding(typeof(l), m.pad, currsize .+ Δsize, dilation(l), stride(l)) + KernelSizeAligned(Δsize, pad)(v) + return v end dilation(l) = l.dilation stride(l) = l.stride @@ -339,22 +339,22 @@ Mutate the activation function of layers which have an activation function. Note: High likelyhood of large accuracy degradation after applying this mutation. 
""" struct ActivationFunctionMutation{T,RNG} <: AbstractMutation{AbstractVertex} where {T <: AbstractParSpace{1}, R <: AbstractRNG} -actspace::T -rng::RNG + actspace::T + rng::RNG end ActivationFunctionMutation(acts...;rng=rng_default) = ActivationFunctionMutation(collect(acts), rng=rng) ActivationFunctionMutation(acts::AbstractVector;rng=rng_default) = ActivationFunctionMutation(ParSpace(acts), rng) function (m::ActivationFunctionMutation)(v::AbstractVertex) -m(layertype(v), v) -return v + m(layertype(v), v) + return v end function (m::ActivationFunctionMutation)(t, v) end (m::ActivationFunctionMutation)(::Union{FluxDense, FluxConvolutional}, v) = NaiveNASflux.setlayer!(v, (σ = m.actspace(m.rng),)) (m::ActivationFunctionMutation)(::FluxParNorm, v) = NaiveNASflux.setlayer!(v, (λ = m.actspace(m.rng),)) function (m::ActivationFunctionMutation)(::FluxRnn, v) -newcell = setproperties(layer(v).cell, (σ = m.actspace(m.rng),)) -NaiveNASflux.setlayer!(v, (cell = newcell,)) + newcell = setproperties(layer(v).cell, (σ = m.actspace(m.rng),)) + NaiveNASflux.setlayer!(v, (cell = newcell,)) end @@ -366,8 +366,8 @@ Search for vertices with zero output size and remove them and all of their input Removal is only possible if a vertex is inside a parallel path which will later be concatenated. """ -struct RemoveZeroNout - fallback +struct RemoveZeroNout{F} + fallback::F end RemoveZeroNout() = RemoveZeroNout(IncreaseZeroNout()) struct IncreaseZeroNout end From 2469412e6e436f644b7192c7faac326348ededb8 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 02:24:26 +0200 Subject: [PATCH 17/36] Add TrainBatchSizeMutation --- src/NaiveGAflux.jl | 14 ++++++--- src/mutation/iteratormaps.jl | 38 +++++++++++++++++++++++ test/candidate.jl | 9 +++--- test/mutation/iteratormaps.jl | 58 +++++++++++++++++++++++++++++++++++ 4 files changed, 111 insertions(+), 8 deletions(-) create mode 100644 src/mutation/iteratormaps.jl create mode 100644 test/mutation/iteratormaps.jl diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index fabda88d..1932e052 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -62,11 +62,16 @@ export persist # Vertex selection types export AbstractVertexSelection, AllVertices, FilterMutationAllowed -# mutation types +# generic mutation types export AbstractMutation, MutationProbability, WeightedMutationProbability, HighUtilityMutationProbability, LowUtilityMutationProbability -export MutationChain, RecordMutation, LogMutation, MutationFilter, VertexMutation, NoutMutation, AddVertexMutation, RemoveVertexMutation -export AddEdgeMutation, RemoveEdgeMutation, KernelSizeMutation, KernelSizeMutation2D, ActivationFunctionMutation, OptimizerMutation -export LearningRateMutation, AddOptimizerMutation +export MutationChain, RecordMutation, LogMutation, MutationFilter +# graph mutation types +export VertexMutation, NoutMutation, AddVertexMutation, RemoveVertexMutation, AddEdgeMutation, RemoveEdgeMutation, KernelSizeMutation +export KernelSizeMutation2D, ActivationFunctionMutation +# optimizer mutation types +export OptimizerMutation, LearningRateMutation, AddOptimizerMutation +# Iterator wrapping mutation types +export TrainBatchSizeMutation # Crossover types export AbstractCrossover, VertexCrossover, CrossoverSwap, OptimizerCrossover, LearningRateCrossover @@ -99,6 +104,7 @@ include("archspace.jl") include("mutation/generic.jl") include("mutation/graph.jl") include("mutation/optimizer.jl") +include("mutation/iteratormaps.jl") include("crossover/graph.jl") include("crossover/optimizer.jl") 
include("candidate.jl") diff --git a/src/mutation/iteratormaps.jl b/src/mutation/iteratormaps.jl new file mode 100644 index 00000000..86085bea --- /dev/null +++ b/src/mutation/iteratormaps.jl @@ -0,0 +1,38 @@ + +(m::AbstractMutation{<:AbstractIteratorMap})(im::IteratorMaps) = IteratorMaps(m.(im.maps)) + +newfrom(im::AbstractIteratorMap) = deepcopy(im) + +struct TrainBatchSizeMutation{R<:Real, Q, RNG<:AbstractRNG} <: AbstractMutation{AbstractIteratorMap} + minrel::R + maxrel::R + quantizeto::Q + rng::RNG + function TrainBatchSizeMutation(l1::R1, l2::R2, quantizeto::Q, rng::RNG) where {R1, R2, Q, RNG} + R = promote_type(R1, R2) + return l1 < l2 ? new{R, Q, RNG}(promote(l1, l2)..., quantizeto, rng) : new{R, Q, RNG}(promote(l2, l1)..., quantizeto, rng) + end +end +TrainBatchSizeMutation(l1, l2, rng::AbstractRNG=rng_default) = TrainBatchSizeMutation(l1, l2, Int, rng) +TrainBatchSizeMutation(l1,l2,q) = TrainBatchSizeMutation(l1,l2, q, rng_default) + +(m::TrainBatchSizeMutation)(im::AbstractIteratorMap) = newfrom(im) +function (m::TrainBatchSizeMutation)(im::BatchSizeIteratorMap) + newbs = max(1, mutate_batchsize(m.quantizeto, batchsize(im.tbs), m.minrel, m.maxrel, m.rng)) + @set im.tbs = TrainBatchSize(newbs) +end + + +function mutate_batchsize(quantizeto::DataType, bs, minrel, maxrel, rng) + shift = minrel + scale = maxrel - minrel + newbs = bs * (1 + rand(rng) * scale + shift) + round(quantizeto, newbs) +end + +function mutate_batchsize(quantizeto::Union{AbstractArray, Tuple}, bs, args...) + bs, ind = findmin(x -> abs(bs - x), quantizeto) + indstep = mutate_batchsize(Int, length(quantizeto), args...) - length(quantizeto) + newind = clamp(indstep + ind, firstindex(quantizeto), lastindex(quantizeto)) + return quantizeto[newind] +end diff --git a/test/candidate.jl b/test/candidate.jl index 45a7d84f..fd789f62 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -151,7 +151,8 @@ end graphmutation = VertexMutation(MutationFilter(v -> name(v)=="hlayer", AddVertexMutation(ArchSpace(DenseSpace([1], [relu]))))) optmutation = OptimizerMutation((Momentum, Nesterov, ADAM)) - evofun = MapCandidate(graphmutation, optmutation) + bsmutation = TrainBatchSizeMutation(0, -1, MockRng([0.5])) + evofun = MapCandidate(graphmutation, optmutation, bsmutation) newcand = evofun(cand) @test NaiveGAflux.model(nvertices, newcand) == 4 @@ -169,10 +170,10 @@ end end if ctype == CandidateBatchIterMap - @test length(first(trainiterator(cand; default=(1:100,)))) == 16 + @test length(first(trainiterator(cand; default=(1:100,)))) == 16 @test length(first(validationiterator(cand; default=(1:100,)))) == 32 - # TODO Add mutation - @test length(first(trainiterator(newcand; default=(1:100,)))) == 16 + + @test length(first(trainiterator(newcand; default=(1:100,)))) == 8 @test length(first(validationiterator(newcand; default=(1:100,)))) == 32 else @test length(first(trainiterator(cand; default=(1:100,)))) == 100 diff --git a/test/mutation/iteratormaps.jl b/test/mutation/iteratormaps.jl new file mode 100644 index 00000000..fe903154 --- /dev/null +++ b/test/mutation/iteratormaps.jl @@ -0,0 +1,58 @@ +@testset "TrainBatchSizeMutation" begin + import NaiveGAflux: batchsize + @testset "Quantize to Int" begin + + @testset "Forced to 10" begin + bsim = BatchSizeIteratorMap(100, 200, batchsizeselection((3,))) + m = TrainBatchSizeMutation(0.1, 0.1) + @test batchsize(m(bsim).tbs) == 110 + end + + @testset "Forced to -10" begin + bsim = BatchSizeIteratorMap(100, 200, batchsizeselection((3,))) + m = TrainBatchSizeMutation(-0.1, -0.1) + @test 
batchsize(m(bsim).tbs) == 90 + end + + @testset "Larger than 0" begin + bsim = BatchSizeIteratorMap(1, 200, batchsizeselection((3,))) + m = TrainBatchSizeMutation(-0.9, -0.9) + @test batchsize(m(bsim).tbs) == 1 + end + + @testset "Random" begin + bsim = BatchSizeIteratorMap(100, 200, batchsizeselection((3,))) + rng = MockRng([0.5]) + m = TrainBatchSizeMutation(0.0, 1.0, rng) + @test batchsize(m(bsim).tbs) == 150 + end + end + + @testset "Quantize to set of numbers" begin + @testset "Force one step up" begin + bsim = BatchSizeIteratorMap(3, 200, batchsizeselection((3,))) + m = TrainBatchSizeMutation(0.1, 0.1, 1:10) + @test batchsize(m(bsim).tbs) == 4 + end + + @testset "Force one step down" begin + bsim = BatchSizeIteratorMap(3, 200, batchsizeselection((3,))) + m = TrainBatchSizeMutation(-0.1, -0.1, 1:10) + @test batchsize(m(bsim).tbs) == 2 + end + + @testset "Force two steps up" begin + bsim = BatchSizeIteratorMap(3, 200, batchsizeselection((3,))) + m = TrainBatchSizeMutation(0.2, 0.2, 1:10) + @test batchsize(m(bsim).tbs) == 5 + end + + @testset "Random" begin + bsim = BatchSizeIteratorMap(128, 200, batchsizeselection((3,))) + rng = MockRng([0.5]) + m = TrainBatchSizeMutation(0.0, -1.0, ntuple(i -> 2^i, 10), rng) + @test batchsize(m(bsim).tbs) == 4 + end + end + +end \ No newline at end of file From d3b886378a2e18a2387497634f4412cf666449f5 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 16:33:55 +0200 Subject: [PATCH 18/36] Add docstring for TrainBatchSizeMutation --- src/mutation/iteratormaps.jl | 59 ++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/src/mutation/iteratormaps.jl b/src/mutation/iteratormaps.jl index 86085bea..ee4d57ac 100644 --- a/src/mutation/iteratormaps.jl +++ b/src/mutation/iteratormaps.jl @@ -3,6 +3,60 @@ newfrom(im::AbstractIteratorMap) = deepcopy(im) +""" + TrainBatchSizeMutation{R<:Real, Q, RNG<:AbstractRNG} + TrainBatchSizeMutation(l1, l2, quantizeto, rng) + TrainBatchSizeMutation(l1, l2, rng::AbstractRNG) + TrainBatchSizeMutation(l1, l2, quantizeto) + TrainBatchSizeMutation(l1, l2) + +Mutate the batch size used for training. + +Maximum possible relative change is determined by the numbers `l1` and `l2`. + +Behaviour depends on `quantizeto` (default `Int`) in the following way. + +If `quantizeto` is a `DataType` (e.g `Int`) then the largest possible increase is `maxrel * batchsize` and the largest possible + decrease is `minrel * batchsize` where `minrel` and `maxrel` are `l1` and `l2` if `l1 < l2` and `l2` and `l1` otherwise. Note that + if `(minrel, maxrel)` is not symetric around `0` the mutation will be biased. + +More precisely, the new size is `round(quantizeto, (x+1) * batchsize)` where `x` is drawn from `U(minrel, maxrel)`. + +If `quantizeto` is a an array or tuple of values then the new size is drawn from `quantizeto` with a maximum + +More precisely, the new size is `quantizeto[i]` where `i = j + round(Int, x * length(quantizeto))` where `x` is drawn from +`U(minrel, maxrel)` and `j` is the index for which `quantizeto[j]` is the closest to the current batch size. + +Use the function `mutate_batchsize` to get a feeling for how different values of `l1`, `l2` and `quantizeto` affect the new batch sizes. +Note that setting `l1 == l2` means that `x` in the descriptions above will always equal to `l1` (and `l2`) can also be useful in this context. 
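For instance, with `l1 == l2 == 0.3` the relative change is pinned to `+30%`; against a grid of length 10 this should always translate into a step of `round(Int, 0.3 * 10) == 3` grid positions:

```julia
julia> NaiveGAflux.mutate_batchsize(ntuple(i -> 2^i, 10), 32, 0.3, 0.3)
256
```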
+ +# Examples +```jldoctest aa; filter=r"\\d*" +julia> using NaiveGAflux + +julia> m = TrainBatchSizeMutation(-0.1, 0.1); + +julia> m(BatchSizeIteratorMap(16, 32, identity)) +BatchSizeIteratorMap{typeof(identity)}(NaiveGAflux.TrainBatchSize(15), NaiveGAflux.ValidationBatchSize(32), identity) + +julia> m = TrainBatchSizeMutation(-0.1, 0.1, ntuple(i -> 2^i, 10)); # Quantize to powers of 2 + +julia> m(BatchSizeIteratorMap(16, 32, identity)) +BatchSizeIteratorMap{typeof(identity)}(NaiveGAflux.TrainBatchSize(32), NaiveGAflux.ValidationBatchSize(32), identity) + +julia> NaiveGAflux.mutate_batchsize(Int, 16, -0.3, 0.3) +14 + +julia> NaiveGAflux.mutate_batchsize(Int, 16, -0.3, 0.3) +19 + +julia> NaiveGAflux.mutate_batchsize(ntuple(i -> 2^i, 10), 16, -0.3, 0.3) +64 + +julia> NaiveGAflux.mutate_batchsize(ntuple(i -> 2^i, 10), 16, -0.3, 0.3) +8 +``` +""" struct TrainBatchSizeMutation{R<:Real, Q, RNG<:AbstractRNG} <: AbstractMutation{AbstractIteratorMap} minrel::R maxrel::R @@ -14,7 +68,7 @@ struct TrainBatchSizeMutation{R<:Real, Q, RNG<:AbstractRNG} <: AbstractMutation{ end end TrainBatchSizeMutation(l1, l2, rng::AbstractRNG=rng_default) = TrainBatchSizeMutation(l1, l2, Int, rng) -TrainBatchSizeMutation(l1,l2,q) = TrainBatchSizeMutation(l1,l2, q, rng_default) +TrainBatchSizeMutation(l1, l2, q) = TrainBatchSizeMutation(l1, l2, q, rng_default) (m::TrainBatchSizeMutation)(im::AbstractIteratorMap) = newfrom(im) function (m::TrainBatchSizeMutation)(im::BatchSizeIteratorMap) @@ -22,8 +76,7 @@ function (m::TrainBatchSizeMutation)(im::BatchSizeIteratorMap) @set im.tbs = TrainBatchSize(newbs) end - -function mutate_batchsize(quantizeto::DataType, bs, minrel, maxrel, rng) +function mutate_batchsize(quantizeto::DataType, bs, minrel, maxrel, rng=rng_default) shift = minrel scale = maxrel - minrel newbs = bs * (1 + rand(rng) * scale + shift) From d2390b4a1ac23326c942d8b2e1eee13d03a84613 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 19:47:14 +0200 Subject: [PATCH 19/36] Add IteratorMapCrossover and ShieldedIteratorMap --- src/NaiveGAflux.jl | 6 ++++-- src/candidate.jl | 2 +- src/crossover/generic.jl | 15 +++++++++++++++ src/crossover/iteratormaps.jl | 34 ++++++++++++++++++++++++++++++++++ src/crossover/optimizer.jl | 17 +++++------------ src/iteratormaps.jl | 23 ++++++++++++++++++++++- src/mutation/iteratormaps.jl | 5 ++--- test/crossover/iteratormaps.jl | 33 +++++++++++++++++++++++++++++++++ test/iteratormaps.jl | 12 ++++++++++++ test/mutation/iteratormaps.jl | 13 +++++++++++++ test/runtests.jl | 2 ++ 11 files changed, 143 insertions(+), 19 deletions(-) create mode 100644 src/crossover/generic.jl create mode 100644 src/crossover/iteratormaps.jl create mode 100644 test/crossover/iteratormaps.jl diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index 1932e052..fcb33619 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -54,7 +54,7 @@ export RepeatPartitionIterator, SeedIterator, MapIterator, GpuIterator, BatchIte export StatefulGenerationIter # Iterator mapping types for evolving hyperparameters related to datasets, e.g. 
augmentation and batch size -export BatchSizeIteratorMap, IteratorMaps +export BatchSizeIteratorMap, IteratorMaps, ShieldedIteratorMap # Persistence export persist @@ -74,7 +74,7 @@ export OptimizerMutation, LearningRateMutation, AddOptimizerMutation export TrainBatchSizeMutation # Crossover types -export AbstractCrossover, VertexCrossover, CrossoverSwap, OptimizerCrossover, LearningRateCrossover +export AbstractCrossover, VertexCrossover, CrossoverSwap, OptimizerCrossover, LearningRateCrossover, IteratorMapCrossover # architecture spaces export AbstractArchSpace, LoggingArchSpace, VertexSpace, NoOpArchSpace, ArchSpace, ConditionalArchSpace, RepeatArchSpace, ArchSpaceChain @@ -105,8 +105,10 @@ include("mutation/generic.jl") include("mutation/graph.jl") include("mutation/optimizer.jl") include("mutation/iteratormaps.jl") +include("crossover/generic.jl") include("crossover/graph.jl") include("crossover/optimizer.jl") +include("crossover/iteratormaps.jl") include("candidate.jl") include("fitness.jl") include("evolve.jl") diff --git a/src/candidate.jl b/src/candidate.jl index e63f5f47..c93fd1e7 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -303,7 +303,7 @@ struct MapType{T, F1, F2} MapType{T}(match::F1, nomatch::F2) where {T,F1, F2} = new{T,F1,F2}(match, nomatch) end -(a::MapType{T1})(x::T2) where {T1, T2<:T1} = a.match(x) +(a::MapType{T})(x::T) where T = a.match(x) (a::MapType)(x) = a.nomatch(x) MapType(match::AbstractMutation{T}, nomatch) where T = MapType{T}(match, nomatch) diff --git a/src/crossover/generic.jl b/src/crossover/generic.jl new file mode 100644 index 00000000..1e9d1fd8 --- /dev/null +++ b/src/crossover/generic.jl @@ -0,0 +1,15 @@ + +# Useful for dispatch of type crossover(pair::EitherIs{Shielded}) = pair +const MixTuple{T1, T2} = Union{Tuple{T1, T2}, Tuple{T2, T1}} +const EitherIs{T} = MixTuple{T, Any} + +# Useful for doing crossover between candiates which wraps a collection of candidates, +# e.g. Flux.Optimiser and IteratorMaps +function zipcrossover(reiterfun, (c1,c2), crossoverfun) + cs1,c1re = reiterfun(c1) + cs2,c2re = reiterfun(c2) + res = crossoverfun.(zip(cs1,cs2)) + cs1n = (t[1] for t in res) + cs2n = (t[2] for t in res) + return c1re(cs1n..., cs1[length(cs2)+1:end]...), c2re(cs2n..., cs2[length(cs1)+1:end]...) +end diff --git a/src/crossover/iteratormaps.jl b/src/crossover/iteratormaps.jl new file mode 100644 index 00000000..40af29ac --- /dev/null +++ b/src/crossover/iteratormaps.jl @@ -0,0 +1,34 @@ + + +""" +IteratorMapCrossover{C} <: AbstractCrossover{AbstractIteratorMap} +IteratorMapCrossover() +IteratorMapCrossover(crossover) + +Apply crossover between `AbstractIteratorMap`s. + +Type of crossover is determined by `crossover` (default `iteratormapswap`) which when given a a tuple of two `AbstractIteratorMap`s will return the result of the crossover operation as a tuple of `AbstractIteratorMap`s. + +Designed to be composable with most utility `AbstractMutation`s as well as with itself. 
For instance, the following seemingly odd construct will swap components of two `IteratorMaps` with a probability of `0.2` per component: + +`IteratorMapCrossover(MutationProbability(IteratorMapCrossover(), 0.2))` + +Compare with the following which either swaps all components or none: + +`MutationProbability(IteratorMapCrossover(), 0.2)` +""" +struct IteratorMapCrossover{C} <: AbstractCrossover{AbstractIteratorMap} + crossover::C +end +IteratorMapCrossover() = IteratorMapCrossover(iteratormapswap) + +(ic::IteratorMapCrossover)(ims) = ic.crossover(ims) +(ic::IteratorMapCrossover)(ims::EitherIs{ShieldedIteratorMap}) = ims +(ic::IteratorMapCrossover)(ims::EitherIs{IteratorMaps}) = zipcrossover(reimiter, ims, ic.crossover) +(ic::IteratorMapCrossover)(ims::MixTuple{ShieldedIteratorMap, IteratorMaps}) = zipcrossover(reimiter, ims, ic.crossover) + +reimiter(im) = (im,), identity +reimiter(im::IteratorMaps) = im.maps, IteratorMaps + +iteratormapswap((im1, im2)::Tuple, args...) = im2, im1 +iteratormapswap(ims::EitherIs{ShieldedIteratorMap}, args...) = ims \ No newline at end of file diff --git a/src/crossover/optimizer.jl b/src/crossover/optimizer.jl index 8d082d90..d60391ce 100644 --- a/src/crossover/optimizer.jl +++ b/src/crossover/optimizer.jl @@ -29,21 +29,14 @@ Does not do anything if any of the optimizers don't have a learning rate (e.g. W """ LearningRateCrossover() = OptimizerCrossover(learningrateswap) -EitherIs{T} = Union{Tuple{T, Any}, Tuple{Any,T}} - (oc::OptimizerCrossover)(os) = oc.crossover(os) (oc::OptimizerCrossover)(os::EitherIs{ShieldedOpt}) = os -function (oc::OptimizerCrossover)((o1,o2)::EitherIs{Flux.Optimiser}) - os1,o1re = optiter(o1) - os2,o2re = optiter(o2) - res = oc.crossover.(zip(os1,os2)) - os1n = (t[1] for t in res) - os2n = (t[2] for t in res) - return o1re(os1n..., os1[length(os2)+1:end]...), o2re(os2n..., os2[length(os1)+1:end]...) -end +(oc::OptimizerCrossover)(os::EitherIs{Flux.Optimiser}) = zipcrossover(reoptiter, os, oc.crossover) +(oc::OptimizerCrossover)(os::MixTuple{ShieldedOpt, Flux.Optimiser}) = os + -optiter(o) = (o,), (os...) -> os[1] -optiter(o::Flux.Optimiser) = Tuple(o.os), (os...) -> Flux.Optimiser(os...) +reoptiter(o) = (o,), identity +reoptiter(o::Flux.Optimiser) = Tuple(o.os), Flux.Optimiser optimizerswap((o1, o2)::Tuple) = o2,o1 diff --git a/src/iteratormaps.jl b/src/iteratormaps.jl index b94e1ad8..c26969b8 100644 --- a/src/iteratormaps.jl +++ b/src/iteratormaps.jl @@ -81,6 +81,10 @@ end """ IteratorMaps{T} <: AbstractIteratorMap + IteratorMaps(maps...) + IteratorMaps(maps::Tuple) + +Aggregates multiple `AbstractIteratorMap`s. `maptrain` and `mapvalidation` are applied sequentially starting with the first element of `maps`. """ struct IteratorMaps{T<:Tuple} <: AbstractIteratorMap maps::T @@ -90,4 +94,21 @@ IteratorMaps(x...) = IteratorMaps(x) maptrain(iws::IteratorMaps, iter) = foldr(maptrain, iws.maps; init=iter) mapvalidation(iws::IteratorMaps, iter) = foldr(mapvalidation, iws.maps; init=iter) -limit_maxbatchsize(ims::IteratorMaps, args...; kwargs...) = IteratorMaps(map(im -> limit_maxbatchsize(im, args...; kwargs...), ims.maps)) \ No newline at end of file +limit_maxbatchsize(ims::IteratorMaps, args...; kwargs...) = IteratorMaps(map(im -> limit_maxbatchsize(im, args...; kwargs...), ims.maps)) + +""" + ShieldedIteratorMap{T} + ShieldedIteratorMap(map) + +Shields `map` from mutation and crossover. +""" +struct ShieldedIteratorMap{T} <: AbstractIteratorMap + map::T +end + +maptrain(sim::ShieldedIteratorMap, args...) = maptrain(sim.map, args...) 
+mapvalidation(sim::ShieldedIteratorMap, args...) = mapvalidation(sim.map, args...) + +function limit_maxbatchsize(sim::ShieldedIteratorMap, args...; kwargs...) + ShieldedIteratorMap(limit_maxbatchsize(sim.map), args...; kwargs...) +end \ No newline at end of file diff --git a/src/mutation/iteratormaps.jl b/src/mutation/iteratormaps.jl index ee4d57ac..370a4fd0 100644 --- a/src/mutation/iteratormaps.jl +++ b/src/mutation/iteratormaps.jl @@ -1,6 +1,3 @@ - -(m::AbstractMutation{<:AbstractIteratorMap})(im::IteratorMaps) = IteratorMaps(m.(im.maps)) - newfrom(im::AbstractIteratorMap) = deepcopy(im) """ @@ -71,6 +68,8 @@ TrainBatchSizeMutation(l1, l2, rng::AbstractRNG=rng_default) = TrainBatchSizeMut TrainBatchSizeMutation(l1, l2, q) = TrainBatchSizeMutation(l1, l2, q, rng_default) (m::TrainBatchSizeMutation)(im::AbstractIteratorMap) = newfrom(im) +(m::TrainBatchSizeMutation)(im::IteratorMaps) = IteratorMaps(m.(im.maps)) + function (m::TrainBatchSizeMutation)(im::BatchSizeIteratorMap) newbs = max(1, mutate_batchsize(m.quantizeto, batchsize(im.tbs), m.minrel, m.maxrel, m.rng)) @set im.tbs = TrainBatchSize(newbs) diff --git a/test/crossover/iteratormaps.jl b/test/crossover/iteratormaps.jl new file mode 100644 index 00000000..478809bb --- /dev/null +++ b/test/crossover/iteratormaps.jl @@ -0,0 +1,33 @@ +@testset "IteratorMapCrossover" begin + import NaiveGAflux: AbstractIteratorMap + struct ImcTestDummy1 <: AbstractIteratorMap end + struct ImcTestDummy2 <: AbstractIteratorMap end + struct ImcTestDummy3 <: AbstractIteratorMap end + + @testset "Simple" begin + @test IteratorMapCrossover()((ImcTestDummy1(), ImcTestDummy2())) == (ImcTestDummy2(), ImcTestDummy1()) + end + + @testset "IteratorMaps" begin + ims1 = IteratorMaps(ImcTestDummy1(), ImcTestDummy2()) + ims2 = IteratorMaps(ImcTestDummy2(), ImcTestDummy1()) + + imc = IteratorMapCrossover() + + @test imc((ims1, ims2)) == (ims2, ims1) + @test imc((ims1, ImcTestDummy2())) == (IteratorMaps(ImcTestDummy2(), ImcTestDummy2()), ImcTestDummy1()) + end + + @testset "LogMutation and MutationProbability" begin + mplm(c) = MutationProbability(LogMutation(((im1,im2)::Tuple) -> "Crossover between $(im1) and $(im2)", c), Probability(0.2, MockRng([0.3, 0.1, 0.3]))) + imc = IteratorMapCrossover() |> mplm |> IteratorMapCrossover + + ims1 = IteratorMaps(ImcTestDummy1(), ImcTestDummy2(), ImcTestDummy3()) + ims2 = IteratorMaps(ImcTestDummy3(), ImcTestDummy1(), ImcTestDummy2()) + + ims1n,ims2n = @test_logs (:info, "Crossover between ImcTestDummy2() and ImcTestDummy1()") imc((ims1,ims2)) + + @test ims1n == IteratorMaps(ImcTestDummy1(), ImcTestDummy1(), ImcTestDummy3()) + @test ims2n == IteratorMaps(ImcTestDummy3(), ImcTestDummy2(), ImcTestDummy2()) + end +end \ No newline at end of file diff --git a/test/iteratormaps.jl b/test/iteratormaps.jl index 56acb4b6..1f043721 100644 --- a/test/iteratormaps.jl +++ b/test/iteratormaps.jl @@ -45,4 +45,16 @@ @test collect(maptrain(IteratorMaps(td1, td2), 1:3)) == 6:6:18 @test collect(mapvalidation(IteratorMaps(td1, td2), 1:3)) == 35:35:105 end + + @testset "ShieldedIteratorMap" begin + NaiveGAflux.maptrain(::Val{:TestDummy1}, itr) = Iterators.map(x -> 2x, itr) + NaiveGAflux.mapvalidation(::Val{:TestDummy1}, itr) = Iterators.map(x -> 5x, itr) + NaiveGAflux.limit_maxbatchsize(::Val{:TestDummy1}) = Val(:TestDummy2) + + sim = ShieldedIteratorMap(Val(:TestDummy1)) + + @test collect(maptrain(sim, 1:3)) == 2:2:6 + @test collect(mapvalidation(sim, 1:3)) == 5:5:15 + @test limit_maxbatchsize(sim) == ShieldedIteratorMap(Val(:TestDummy2)) + end 
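A small sketch of how shielding composes with batch size mutation (sizes and shapes are arbitrary example values):

```julia
using NaiveGAflux

bsim = BatchSizeIteratorMap(64, 128, batchsizeselection((32, 32, 3)))
m = TrainBatchSizeMutation(0.1, 0.1)

# Returns a new map whose train batch size is 70 (64 * 1.1, rounded).
m(bsim)

# The shielded map comes back as a copy with the batch size untouched (still 64).
m(ShieldedIteratorMap(bsim))
```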
end \ No newline at end of file diff --git a/test/mutation/iteratormaps.jl b/test/mutation/iteratormaps.jl index fe903154..56c300c5 100644 --- a/test/mutation/iteratormaps.jl +++ b/test/mutation/iteratormaps.jl @@ -1,5 +1,6 @@ @testset "TrainBatchSizeMutation" begin import NaiveGAflux: batchsize + @testset "Quantize to Int" begin @testset "Forced to 10" begin @@ -55,4 +56,16 @@ end end + @testset "Shielded" begin + sim = ShieldedIteratorMap(BatchSizeIteratorMap(100, 200, batchsizeselection((3,)))) + m = TrainBatchSizeMutation(0.1, 0.1) + @test batchsize(m(sim).map.tbs) == 100 + end + + @testset "IteratorMaps" begin + im = IteratorMaps(BatchSizeIteratorMap(100, 200, batchsizeselection((3,))), BatchSizeIteratorMap(100, 200, batchsizeselection((3,)))) + m = TrainBatchSizeMutation(0.1, 0.1) + @test batchsize(m(im).maps[1].tbs) == 110 + @test batchsize(m(im).maps[2].tbs) == 110 + end end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index ac7b303d..7b9362ac 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -43,10 +43,12 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend include("mutation/generic.jl") include("mutation/graph.jl") include("mutation/optimizer.jl") + include("mutation/iteratormaps.jl") @info "Testing crossover" include("crossover/graph.jl") include("crossover/optimizer.jl") + include("crossover/iteratormaps.jl") @info "Testing fitness" include("fitness.jl") From 204f5f12c6ccde79ca92c234aaccbcb05637c7c0 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 20:09:30 +0200 Subject: [PATCH 20/36] Make TrainAccuracyCandidate a functor --- src/fitness.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fitness.jl b/src/fitness.jl index 3f23e7aa..143e0357 100644 --- a/src/fitness.jl +++ b/src/fitness.jl @@ -239,6 +239,8 @@ function lossfun(c::TrainAccuracyCandidate; default=nothing) end end +@functor TrainAccuracyCandidate (c,) + """ TrainAccuracyFitnessInner <: AbstractFitness From 9cf4c01d5e3f292303ce5d133f8ba8cca6fa9a78 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Mon, 6 Jun 2022 22:46:20 +0200 Subject: [PATCH 21/36] Add MapType for crossover of IteratorMaps Fix issues with batchsize selection for models without parameters Add batch size as hyperparameter in ImageClassification --- .../ImageClassification.jl | 9 +++++--- src/app/imageclassification/strategy.jl | 17 ++++++++++---- src/batchsize.jl | 23 +++++++++++++------ src/candidate.jl | 13 +++++++++++ test/batchsize.jl | 12 ++++++++++ test/candidate.jl | 18 ++++++++++++--- 6 files changed, 74 insertions(+), 18 deletions(-) diff --git a/src/app/imageclassification/ImageClassification.jl b/src/app/imageclassification/ImageClassification.jl index 27392fb9..e3f9702c 100644 --- a/src/app/imageclassification/ImageClassification.jl +++ b/src/app/imageclassification/ImageClassification.jl @@ -131,12 +131,15 @@ function generate_persistent(nr, newpop, mdir, insize, outsize, cwrap=identity, end iv(i) = conv2dinputvertex(join(["model", i, ".input"]), insize[3]) - return Population(PersistentArray(mdir, nr, i -> create_model(join(["model", i]), archspace, iv(i), cwrap))) + return Population(PersistentArray(mdir, nr, i -> create_model(join(["model", i]), archspace, iv(i), cwrap, insize))) end -function create_model(name, as, in, cwrap) +function create_model(name, as, in, cwrap, insize) optselect = optmutation(1.0) opt = optselect(Descent(rand() * 0.099 + 0.01)) - cwrap(CandidateOptModel(opt, CompGraph(in, as(name, in)))) + bslimit = batchsizeselection(insize[1:end-1]; 
alternatives=ntuple(i->2^i, 10)) + imstart = BatchSizeIteratorMap(64, 64, bslimit) + im = itermapmutation(1.0)(imstart) + cwrap(CandidateDataIterMap(im, CandidateOptModel(opt, CompGraph(in, as(name, in))))) end end diff --git a/src/app/imageclassification/strategy.jl b/src/app/imageclassification/strategy.jl index 97468b7f..c6801bf6 100644 --- a/src/app/imageclassification/strategy.jl +++ b/src/app/imageclassification/strategy.jl @@ -146,7 +146,7 @@ struct BatchedIterConfig{T, V} dataaug::T iterwrap::V end -BatchedIterConfig(;batchsize=32, dataaug=identity, iterwrap=identity) = BatchedIterConfig(batchsize, dataaug, iterwrap) +BatchedIterConfig(;batchsize=1024, dataaug=identity, iterwrap=identity) = BatchedIterConfig(batchsize, dataaug, iterwrap) dataiter(s::BatchedIterConfig, x, y) = dataiter(x, y, s.batchsize, s.dataaug) |> s.iterwrap """ @@ -168,7 +168,7 @@ struct ShuffleIterConfig{T, V} dataaug::T iterwrap::V end -ShuffleIterConfig(;batchsize=32, seed=123, dataaug=identity, iterwrap=identity) = ShuffleIterConfig(batchsize, seed, dataaug, iterwrap) +ShuffleIterConfig(;batchsize=1024, seed=123, dataaug=identity, iterwrap=identity) = ShuffleIterConfig(batchsize, seed, dataaug, iterwrap) dataiter(s::ShuffleIterConfig, x, y) = dataiter(x, y, s.batchsize, s.seed, s.dataaug) |> s.iterwrap @@ -317,7 +317,7 @@ Crossover is done using [`CrossoverSwap`](@ref) for models and [`LearningRateCro Mutation is applied both to the model itself (change sizes, add/remove vertices/edges) as well as to the optimizer (change learning rate and optimizer algorithm). """ -crossovermutate(;pcrossover=0.3, pmutate=0.9) = function(inshape) +crossovermutate(;pcrossover=0.3, pmutate=0.8) = function(inshape) cross = candidatecrossover(pcrossover) crossoverevo = AfterEvolution(PairCandidates(EvolveCandidates(cross)), align_vertex_names) @@ -327,8 +327,8 @@ crossovermutate(;pcrossover=0.3, pmutate=0.9) = function(inshape) return EvolutionChain(crossoverevo, mutationevo) end -candidatemutation(p, inshape) = MapCandidate(MutationProbability(graphmutation(inshape), p), optmutation()) -candidatecrossover(p) = MapCandidate(MutationProbability(graphcrossover(), p), optcrossover()) +candidatemutation(p, inshape) = MapCandidate(MutationProbability(graphmutation(inshape), p), optmutation(), itermapmutation()) +candidatecrossover(p) = MapCandidate(MutationProbability(graphcrossover(), p), optcrossover(), itermapcrossover()) function clear_redundant_vertices(pop) foreach(cand -> NaiveGAflux.model(check_apply, cand), pop) @@ -359,6 +359,13 @@ function rename_model(i, cand) end end +itermapcrossover(p= 0.2) = MutationProbability(IteratorMapCrossover(), p) |> IteratorMapCrossover + +function itermapmutation(p=0.1) + m = TrainBatchSizeMutation(-0.2, 0.2, ntuple(i -> 2^(i+2), 8)) + return MutationProbability(m, p) +end + function optcrossover(poptswap=0.3, plrswap=0.4) lrc = MutationProbability(LearningRateCrossover(), plrswap) |> OptimizerCrossover oc = MutationProbability(OptimizerCrossover(), poptswap) |> OptimizerCrossover diff --git a/src/batchsize.jl b/src/batchsize.jl index 9a66c811..bb5a3c6b 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -291,7 +291,8 @@ end # specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( # Consider refactoring function limit_maxbatchsize(bs::TrainBatchSize, model::CompGraph; inshape_nobatch, availablebytes = _availablebytes()) - min(batchsize(bs), maxtrainbatchsize(model, inshape_nobatch, availablebytes)) + maxsize = 
maxtrainbatchsize(model, inshape_nobatch, availablebytes) + maxsize > -1 ? min(batchsize(bs), maxsize) : batchsize(bs) end # specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( @@ -301,22 +302,24 @@ function limit_maxbatchsize(bs::ValidationBatchSize, inshape_nobatch, availablebytes = _availablebytes() ) - min(batchsize(bs), maxvalidationbatchsize(model, inshape_nobatch, availablebytes)) + maxsize = maxvalidationbatchsize(model, inshape_nobatch, availablebytes) + maxsize > -1 ? min(batchsize(bs), maxsize) : batchsize(bs) end function maxtrainbatchsize(model, inshape_nobatch, availablebytes=_availablebytes()) - paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model)) + paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model); init=0) actsize = activationsizes(model, inshape_nobatch) - return fld(availablebytes - paramsize, paramsize + 2 * actsize) + den = paramsize + 2 * actsize + return den > 0 ? fld(availablebytes - paramsize, den) : -1 end function maxvalidationbatchsize(model, inshape_nobatch, availablebytes=_availablebytes()) - paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model)) + paramsize = mapreduce(ps -> length(ps) * sizeof(eltype(ps)), +, params(model); init=0) actsize = activationsizes(model, inshape_nobatch) - return fld(availablebytes - paramsize, actsize) + return actsize > 0 ? fld(availablebytes - paramsize, actsize) : -1 end -function activationsizes(model::CompGraph, inshape_nobatch, elemsize = model |> params |> first |> eltype |> sizeof) +function activationsizes(model::CompGraph, inshape_nobatch, elemsize = _model_parsize(model)) model = cpu(model) # Flux.outputsize does not work for CuArrays activations = if length(inputs(model)) == 1 Dict{AbstractVertex, Any}(v => Flux.nil_input(true, inshape_nobatch) for v in inputs(model)) @@ -330,6 +333,12 @@ function activationsizes(model::CompGraph, inshape_nobatch, elemsize = model |> mapreduce(act -> length(act) * elemsize, +, values(activations)) end +function _model_parsize(model) + ps = params(model) + isempty(ps) && return 0 + return ps |> first |> eltype |> sizeof +end + # TODO: Take model as input and look at params to determine of cpu or gpu function _availablebytes() if CUDA.functional() diff --git a/src/candidate.jl b/src/candidate.jl index c93fd1e7..2e56bb2a 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -48,6 +48,8 @@ lossfun(::AbstractCandidate; default=nothing) = default fitness(::AbstractCandidate; default=nothing) = default generation(::AbstractCandidate; default=nothing) = default + +iteratormap(::AbstractCandidate; default=nothing) = default trainiterator(::AbstractCandidate; default=nothing) = default validationiterator(::AbstractCandidate; default=nothing) = default @@ -77,6 +79,7 @@ fitness(c::AbstractWrappingCandidate; kwargs...) = fitness(wrappedcand(c); kwarg generation(c::AbstractWrappingCandidate; kwargs...) = generation(wrappedcand(c); kwargs...) trainiterator(c::AbstractWrappingCandidate; kwargs...) = trainiterator(wrappedcand(c); kwargs...) validationiterator(c::AbstractWrappingCandidate; kwargs...) = validationiterator(wrappedcand(c); kwargs...) +iteratormap(c::AbstractWrappingCandidate; kwargs...) = iteratormap(wrappedcand(c); kwargs...) """ CandidateModel <: Candidate @@ -145,6 +148,7 @@ end @functor CandidateDataIterMap +iteratormap(c::CandidateDataIterMap; kwargs...) = c.map trainiterator(c::CandidateDataIterMap; kwargs...) 
= maptrain(c.map, trainiterator(wrappedcand(c); kwargs...)) validationiterator(c::CandidateDataIterMap; kwargs...) = mapvalidation(c.map, validationiterator(wrappedcand(c); kwargs...)) @@ -328,6 +332,15 @@ function MapType(c::AbstractCrossover{FluxOptimizer}, (c1, c2), (nomatch1, nomat return MapType{FluxOptimizer}(Returns(o1n), nomatch1), MapType{FluxOptimizer}(Returns(o2n), nomatch2) end +function MapType(c::AbstractCrossover{AbstractIteratorMap}, (c1, c2), (nomatch1, nomatch2)) + im1 = iteratormap(c1) + im2 = iteratormap(c2) + + im1n, im2n = c((im1, im2)) + + return MapType{AbstractIteratorMap}(Returns(im1n), nomatch1), MapType{AbstractIteratorMap}(Returns(im2n), nomatch2) +end + # Just because BatchSizeIteratorMap needs the model to limit the batch sizes :( # Try to come up with a cleaner design... apply_mapfield(f::MapType, x, args...) = (@info "apply $f to $x"; apply_mapfield(f.nomatch, x, args...)) diff --git a/test/batchsize.jl b/test/batchsize.jl index 81e43bec..faaf41c0 100644 --- a/test/batchsize.jl +++ b/test/batchsize.jl @@ -75,6 +75,18 @@ @test limit_maxbatchsize(ValidationBatchSize(6), graph; inshape_nobatch=(5,), availablebytes=1000) == 6 @test limit_maxbatchsize(ValidationBatchSize(8), graph; inshape_nobatch=(5,), availablebytes=1000) == 8 @test limit_maxbatchsize(ValidationBatchSize(10), graph; inshape_nobatch=(5,), availablebytes=1000) == 8 + + @testset "Model without parameters" begin + graph = let iv = denseinputvertex("in", 3) + CompGraph(iv, iv) + end + + @test limit_maxbatchsize(TrainBatchSize(1), graph; inshape_nobatch=(3,), availablebytes=10) == 1 + @test limit_maxbatchsize(TrainBatchSize(9), graph; inshape_nobatch=(3,), availablebytes=1000) == 9 + + @test limit_maxbatchsize(ValidationBatchSize(1), graph; inshape_nobatch=(3,), availablebytes=10) == 1 + @test limit_maxbatchsize(ValidationBatchSize(9), graph; inshape_nobatch=(3,), availablebytes=10) == 9 + end end @testset "batchsizeselection" begin diff --git a/test/candidate.jl b/test/candidate.jl index fd789f62..74944cea 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -11,14 +11,20 @@ end @testset "Crossover" begin - import NaiveGAflux: FluxOptimizer + import NaiveGAflux: FluxOptimizer, iteratormap, batchsize struct MapTypeTestCrossover{T} <: AbstractCrossover{T} end (::MapTypeTestCrossover)((c1, c2)) = c2,c1 - c1 = CandidateOptModel(Descent(), CompGraph(inputvertex("c1", 1), AbstractVertex[])) - c2 = CandidateOptModel(Momentum(), CompGraph(inputvertex("c2", 1), AbstractVertex[])) + function testgraph(name) + iv = denseinputvertex(name, 1) + CompGraph(iv, iv) + end + bsgen(bs) = BatchSizeIteratorMap(bs, 2*bs, (bs, args...; kwargs...) 
-> batchsize(bs)) + + c1 = CandidateDataIterMap(bsgen(4), CandidateOptModel(Descent(), CandidateModel(testgraph("c1")))) + c2 = CandidateOptModel(Momentum(), CandidateDataIterMap(bsgen(8), CandidateModel(testgraph("c2")))) mt1, mt2 = MapType(MapTypeTestCrossover{CompGraph}(), (c1,c2), (identity, identity)) @test name.(inputs(mt1(model(c1)))) == ["c2"] @@ -31,6 +37,12 @@ @test typeof(mt2(opt(c2))) == Descent @test mt1(3) == 3 @test mt2('c') == 'c' + + mt1, mt2 = MapType(MapTypeTestCrossover{AbstractIteratorMap}(), (c1,c2), (identity, identity)) + @test mt1(iteratormap(c1)).tbs == iteratormap(c2).tbs + @test mt2(iteratormap(c2)).tbs == iteratormap(c1).tbs + @test mt1(3) == 3 + @test mt2('c') == 'c' end end From 955da3ee3deb6d4bf2bef4b3112e91e9a872b0a9 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 8 Jun 2022 00:10:58 +0200 Subject: [PATCH 22/36] Add testing of shielding permutations --- src/crossover/iteratormaps.jl | 2 +- src/crossover/optimizer.jl | 5 +++-- test/crossover/iteratormaps.jl | 30 ++++++++++++++++++++++++++++++ test/crossover/optimizer.jl | 32 ++++++++++++++++++++++++++++---- 4 files changed, 62 insertions(+), 7 deletions(-) diff --git a/src/crossover/iteratormaps.jl b/src/crossover/iteratormaps.jl index 40af29ac..783a0733 100644 --- a/src/crossover/iteratormaps.jl +++ b/src/crossover/iteratormaps.jl @@ -24,8 +24,8 @@ IteratorMapCrossover() = IteratorMapCrossover(iteratormapswap) (ic::IteratorMapCrossover)(ims) = ic.crossover(ims) (ic::IteratorMapCrossover)(ims::EitherIs{ShieldedIteratorMap}) = ims +(ic::IteratorMapCrossover)(ims::MixTuple{ShieldedIteratorMap, IteratorMaps}) = ims (ic::IteratorMapCrossover)(ims::EitherIs{IteratorMaps}) = zipcrossover(reimiter, ims, ic.crossover) -(ic::IteratorMapCrossover)(ims::MixTuple{ShieldedIteratorMap, IteratorMaps}) = zipcrossover(reimiter, ims, ic.crossover) reimiter(im) = (im,), identity reimiter(im::IteratorMaps) = im.maps, IteratorMaps diff --git a/src/crossover/optimizer.jl b/src/crossover/optimizer.jl index d60391ce..34f77c36 100644 --- a/src/crossover/optimizer.jl +++ b/src/crossover/optimizer.jl @@ -31,14 +31,15 @@ LearningRateCrossover() = OptimizerCrossover(learningrateswap) (oc::OptimizerCrossover)(os) = oc.crossover(os) (oc::OptimizerCrossover)(os::EitherIs{ShieldedOpt}) = os -(oc::OptimizerCrossover)(os::EitherIs{Flux.Optimiser}) = zipcrossover(reoptiter, os, oc.crossover) (oc::OptimizerCrossover)(os::MixTuple{ShieldedOpt, Flux.Optimiser}) = os +(oc::OptimizerCrossover)(os::EitherIs{Flux.Optimiser}) = zipcrossover(reoptiter, os, oc.crossover) reoptiter(o) = (o,), identity reoptiter(o::Flux.Optimiser) = Tuple(o.os), Flux.Optimiser -optimizerswap((o1, o2)::Tuple) = o2,o1 +optimizerswap((o1, o2)) = o2,o1 +optimizerswap(os::EitherIs{ShieldedOpt}) = os learningrateswap((o1,o2)::Tuple) = (@set o1.eta = learningrate(o2)) , (@set o2.eta = learningrate(o1)) learningrateswap(os::EitherIs{ShieldedOpt}) = os diff --git a/test/crossover/iteratormaps.jl b/test/crossover/iteratormaps.jl index 478809bb..2a046a33 100644 --- a/test/crossover/iteratormaps.jl +++ b/test/crossover/iteratormaps.jl @@ -30,4 +30,34 @@ @test ims1n == IteratorMaps(ImcTestDummy1(), ImcTestDummy1(), ImcTestDummy3()) @test ims2n == IteratorMaps(ImcTestDummy3(), ImcTestDummy2(), ImcTestDummy2()) end + + @testset "ShieldedIteratorMap" begin + + imc = IteratorMapCrossover() + + @testset "$baseim1 and $baseim2" for (baseim1, baseim2) in ( + (ImcTestDummy1(), ImcTestDummy2()), + (ImcTestDummy1(), IteratorMaps(ImcTestDummy2())), + 
(IteratorMaps(ImcTestDummy1()),IteratorMaps(ImcTestDummy2())) + ) + @testset "With Shielding $w1 and $w2" for (w1, w2) in ( + (identity, ShieldedIteratorMap), + (ShieldedIteratorMap, identity) + ) + im1 = w1(baseim1) + im2 = w2(baseim2) + + @test imc((im1, im2)) == (im1, im2) + @test imc((im2, im1)) == (im2, im1) + end + end + + @testset "Inner shielding$(wrap == identity ? "" : wrap)" for wrap in (identity, IteratorMaps) + im1 = IteratorMaps(ShieldedIteratorMap(ImcTestDummy1())) + im2 = wrap(ImcTestDummy2()) + + @test imc((im1, im2)) == (im1, im2) + @test imc((im2, im1)) == (im2, im1) + end + end end \ No newline at end of file diff --git a/test/crossover/optimizer.jl b/test/crossover/optimizer.jl index d1a3f69a..c0054e5f 100644 --- a/test/crossover/optimizer.jl +++ b/test/crossover/optimizer.jl @@ -14,10 +14,34 @@ @test prts.(oc((o2,o1))) == prts.(ooc((o2,o1))) == prts.((o1, o2)) end - @testset "Don't swap shielded" begin - o1 = ShieldedOpt(Descent()) - o2 = ShieldedOpt(Momentum()) - @test OptimizerCrossover()((o1,o2)) == (o1,o2) + @testset "ShieldedOpt" begin + + oc = OptimizerCrossover() + + @testset "$baseo1 and $baseo2" for (baseo1, baseo2) in ( + (Descent(), Momentum()), + (Descent(), Flux.Optimiser(Momentum())), + (Flux.Optimiser(Descent()),Flux.Optimiser(Momentum())) + ) + @testset "With Shielding $w1 and $w2" for (w1, w2) in ( + (identity, ShieldedOpt), + (ShieldedOpt, identity) + ) + o1 = w1(baseo1) + o2 = w2(baseo2) + + @test oc((o1, o2)) == (o1, o2) + @test oc((o2, o1)) == (o2, o1) + end + end + + @testset "Inner shielding$(wrap == identity ? "" : wrap)" for wrap in (identity, Flux.Optimiser) + o1 = Flux.Optimiser(ShieldedOpt(Descent())) + o2 = wrap(Momentum()) + + @test prts.(oc((o1, o2))) == prts.((o1, o2)) + @test prts.(oc((o2, o1))) == prts.((o2, o1)) + end end @testset "Cardinality difference" begin From 24bc394792135b0f8952193f7dfb22829bebe198 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 8 Jun 2022 00:46:15 +0200 Subject: [PATCH 23/36] Remove some dead code Test AbstractIteratorMap fallbacks --- src/iterators.jl | 7 +------ test/iterators.jl | 12 ++++++++++-- test/mutation/iteratormaps.jl | 9 +++++++++ test/mutation/optimizer.jl | 1 + 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/iterators.jl b/src/iterators.jl index f8270d65..a6c590e8 100644 --- a/src/iterators.jl +++ b/src/iterators.jl @@ -179,10 +179,6 @@ end shufflerng(b::Bool) = b ? 
rng_default : NoShuffle() shufflerng(rng) = rng -ndata(itr::BatchIterator) = ndata(itr.data) -ndata(data::Tuple) = length(data) -ndata(data::AbstractArray) = 1 - function Base.iterate(itr::BatchIterator, inds = shuffle(itr.rng, 1:itr.nobs)) isempty(inds) && return nothing return batch(itr.data, @view(inds[1:min(end, itr.batchsize)])), @view(inds[itr.batchsize+1:end]) @@ -333,8 +329,7 @@ Base.IteratorSize(::Type{ReBatchingIterator{I}}) where I = Base.SizeUnknown() Base.IteratorEltype(::Type{ReBatchingIterator{I}}) where I = Base.IteratorEltype(I) _rangetoarr(a) = a -_rangetoarr(::Tuple{T1, T2}) where {T1, T2} = Tuple{_rangetoarr(T1), _rangetoarr(T2)} -_rangetoarr(t::Type{<:Tuple}) = Tuple{_rangetoarr.(t.parameters)...} +_rangetoarr(t::Type{<:Tuple}) = Tuple{map(_rangetoarr, t.parameters)...} _rangetoarr(a::Type{<:Array}) = a _rangetoarr(a::Type{<:CUDA.CuArray}) = a _rangetoarr(::Type{<:AbstractArray{T,N}}) where {T,N} = Array{T,N} diff --git a/test/iterators.jl b/test/iterators.jl index 0f7f0e65..a59449ef 100644 --- a/test/iterators.jl +++ b/test/iterators.jl @@ -119,8 +119,14 @@ end @testset "BatchIterator shuffle basic" begin @test reduce(vcat, BatchIterator(1:20, 3; shuffle=true)) |> sort == 1:20 - itr = BatchIterator(ones(2,3,4), 4; shuffle=MersenneTwister(2)) - @test "siter: $itr" == "siter: BatchIterator(size=(2, 3, 4), batchsize=4, shuffle=true)" + @testset "Show array" begin + itr = BatchIterator(ones(2,3,4), 4; shuffle=MersenneTwister(2)) + @test "siter: $itr" == "siter: BatchIterator(size=(2, 3, 4), batchsize=4, shuffle=true)" + end + @testset "Show tuple" begin + itr = BatchIterator((ones(2,3,4), ones(2,4)), 4; shuffle=MersenneTwister(2)) + @test "siter: $itr" == "siter: BatchIterator(size=((2, 3, 4), (2, 4)), batchsize=4, shuffle=true)" + end end @testset "BatchIterator shuffle ndims $(length(dims))" for dims in ((5), (3,4), (2,3,4), (2,3,4,5), (2,3,4,5,6), (2,3,4,5,6,7)) @@ -187,6 +193,8 @@ end titer = TimedIterator(;timelimit=0.1, patience=2, timeoutaction = () -> timeoutcnt += 1, accumulate_timeouts=acc, base=1:10) @test collect(titer) == 1:10 + @test length(titer) == 10 + @test eltype(titer) == Int @test timeoutcnt === 0 # Or else we'll have flakey tests... 
for i in titer diff --git a/test/mutation/iteratormaps.jl b/test/mutation/iteratormaps.jl index 56c300c5..715e8947 100644 --- a/test/mutation/iteratormaps.jl +++ b/test/mutation/iteratormaps.jl @@ -1,3 +1,12 @@ +@testset "Fallbacks" begin + import NaiveGAflux: maptrain, mapvalidation, limit_maxbatchsize + struct NoOpIterMapDummy <: AbstractIteratorMap end + + @test maptrain(NoOpIterMapDummy(), 13) == 13 + @test mapvalidation(NoOpIterMapDummy(), 17) == 17 + @test limit_maxbatchsize(NoOpIterMapDummy(), :a, 13; bleh="aa") == NoOpIterMapDummy() +end + @testset "TrainBatchSizeMutation" begin import NaiveGAflux: batchsize diff --git a/test/mutation/optimizer.jl b/test/mutation/optimizer.jl index 951ca8af..c8dff22e 100644 --- a/test/mutation/optimizer.jl +++ b/test/mutation/optimizer.jl @@ -25,6 +25,7 @@ @test typeof.(m(Descent(0.2)).os) == [Descent] @test typeof.(m(Momentum(0.2)).os) == [Momentum, Descent] + @test typeof(m(ShieldedOpt(Descent()))) == ShieldedOpt{Descent} @test typeof.(m(Flux.Optimiser(Nesterov(), Descent(), ShieldedOpt(Descent()))).os) == [Nesterov, ShieldedOpt{Descent}, Descent] end From cf0f678ad5541984fd06d3a8a4f6674606414cb0 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 8 Jun 2022 01:48:20 +0200 Subject: [PATCH 24/36] Add missing import --- test/mutation/iteratormaps.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/mutation/iteratormaps.jl b/test/mutation/iteratormaps.jl index 715e8947..617f3b14 100644 --- a/test/mutation/iteratormaps.jl +++ b/test/mutation/iteratormaps.jl @@ -1,5 +1,5 @@ @testset "Fallbacks" begin - import NaiveGAflux: maptrain, mapvalidation, limit_maxbatchsize + import NaiveGAflux: AbstractIteratorMap, maptrain, mapvalidation, limit_maxbatchsize struct NoOpIterMapDummy <: AbstractIteratorMap end @test maptrain(NoOpIterMapDummy(), 13) == 13 From c8c913a0f2e55f835e2d92d198f81f9c4d5183e0 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 8 Jun 2022 21:21:52 +0200 Subject: [PATCH 25/36] Remove dead code Remove GpuGcIterator --- src/candidate.jl | 8 -------- src/fitness.jl | 5 ++--- src/iterators.jl | 31 ------------------------------- 3 files changed, 2 insertions(+), 42 deletions(-) diff --git a/src/candidate.jl b/src/candidate.jl index 2e56bb2a..91ab8a13 100644 --- a/src/candidate.jl +++ b/src/candidate.jl @@ -340,15 +340,7 @@ function MapType(c::AbstractCrossover{AbstractIteratorMap}, (c1, c2), (nomatch1, return MapType{AbstractIteratorMap}(Returns(im1n), nomatch1), MapType{AbstractIteratorMap}(Returns(im2n), nomatch2) end - -# Just because BatchSizeIteratorMap needs the model to limit the batch sizes :( -# Try to come up with a cleaner design... -apply_mapfield(f::MapType, x, args...) = (@info "apply $f to $x"; apply_mapfield(f.nomatch, x, args...)) -apply_mapfield(f::MapType{T1}, x::T2, args...) where {T1, T2<:T1} = apply_mapfield(f.match, x, args...) -apply_mapfield(f::typeof(deepcopy), x, args...) = f(x) -apply_mapfield(f, x, args...) = f(x, args...) - """ MapCandidate{T, F} MapCandidate(mutations, mapothers::F) diff --git a/src/fitness.jl b/src/fitness.jl index 143e0357..0b1c20cd 100644 --- a/src/fitness.jl +++ b/src/fitness.jl @@ -140,9 +140,8 @@ function _fitnessiterator(f, c::AbstractCandidate, iter) end matchdatatype(ps::Flux.Params, iter) = isempty(ps) ? 
iter : matchdatatype(first(ps), iter) -# TODO: GpuGcIterator is a temporary workaround for what seems like a CUDA issue where memory allocation becomes very slow -# after the number of reserved (but still available) bytes is close to the totol available GPU memory -matchdatatype(::CUDA.CuArray, iter) = GpuGcIterator(GpuIterator(iter)) + +matchdatatype(::CUDA.CuArray, iter) = GpuIterator(iter) matchdatatype(::AbstractArray, iter) = iter """ diff --git a/src/iterators.jl b/src/iterators.jl index a6c590e8..f649ee58 100644 --- a/src/iterators.jl +++ b/src/iterators.jl @@ -375,34 +375,3 @@ _collectbatch(b) = b _innerbatchsize(t::Tuple) = _innerbatchsize(first(t)) _innerbatchsize(a::AbstractArray) = size(a, ndims(a)) - - -## Temp workaround for CUDA memory issue where it for some reason takes very long time to make use of available memory -struct GpuGcIterator{I} - base::I -end - -function Base.iterate(itr::GpuGcIterator) - valstate = iterate(itr.base) - valstate === nothing && return nothing - val, state = valstate - return val, (2, state) -end - -function Base.iterate(itr::GpuGcIterator, (cnt, state)) - meminfo = CUDA.MemoryInfo() - if meminfo.total_bytes - meminfo.pool_reserved_bytes < 2e9 - NaiveGAflux.gpu_gc() - end - valstate = iterate(itr.base, state) - valstate === nothing && return nothing - val, state = valstate - return val, (cnt+1, state) -end - -Base.IteratorSize(::Type{GpuGcIterator{I}}) where I = Base.IteratorSize(I) -Base.IteratorEltype(::Type{GpuGcIterator{I}}) where I = Base.IteratorEltype(I) - -Base.length(itr::GpuGcIterator) = length(itr.base) -Base.size(itr::GpuGcIterator) = size(itr.base) -Base.eltype(::Type{GpuGcIterator{I}}) where I = eltype(I) From 470c425fbe49280713a7b5071ad1c26a3940f001 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Thu, 9 Jun 2022 00:30:01 +0200 Subject: [PATCH 26/36] Flesh out docs for iterator maps --- docs/make.jl | 5 +++ docs/src/index.md | 3 +- docs/src/reference/candidate.md | 2 - docs/src/reference/crossover.md | 6 +++ docs/src/reference/iteratormaps.md | 16 +++++++ docs/src/reference/mutation.md | 5 +++ src/batchsize.jl | 8 +--- src/crossover/iteratormaps.jl | 6 +-- src/iteratormaps.jl | 18 +++++++- src/mutation/iteratormaps.jl | 3 +- test/examples.jl | 1 + test/examples/iteratormaps.jl | 67 ++++++++++++++++++++++++++++++ test/examples/mutation.jl | 19 +++++---- test/examples/quicktutorial.jl | 2 +- 14 files changed, 137 insertions(+), 24 deletions(-) create mode 100644 docs/src/reference/iteratormaps.md create mode 100644 test/examples/iteratormaps.jl diff --git a/docs/make.jl b/docs/make.jl index c8bdb2bb..e43b0b02 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,5 +1,7 @@ using Documenter, Literate, NaiveGAflux, NaiveGAflux.AutoFlux, NaiveGAflux.AutoFlux.ImageClassification +import NaiveGAflux: AbstractIteratorMap, maptrain, mapvalidation, limit_maxbatchsize + const nndir = joinpath(dirname(pathof(NaiveGAflux)), "..") function literate_example(sourcefile; rootdir=nndir, sourcedir = "test/examples", destdir="docs/src/examples") @@ -16,6 +18,7 @@ crossover_ex = literate_example("crossover.jl") fitness_ex = literate_example("fitness.jl") candidate_ex = literate_example("candidate.jl") evolution_ex = literate_example("evolution.jl") +iteratormaps_ex = literate_example("iteratormaps.jl") iterators_ex = literate_example("iterators.jl") makedocs( sitename="NaiveGAflux", @@ -37,6 +40,7 @@ makedocs( sitename="NaiveGAflux", fitness_ex, candidate_ex, evolution_ex, + iteratormaps_ex, iterators_ex ], "API Reference" => [ @@ -47,6 +51,7 @@ makedocs( 
sitename="NaiveGAflux", "reference/candidate.md", "reference/evolution.md", "reference/batchsize.md", + "reference/iteratormaps.md", "reference/iterators.md", "reference/utils.md", ] diff --git a/docs/src/index.md b/docs/src/index.md index db46ca67..62a7dd14 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -24,7 +24,8 @@ self-contained, allowing you to pick and choose the ones you like when building 4. [Fitness Functions](@ref) 5. [Candidate Utilities](@ref) 6. [Evolution Strategies](@ref) -7. [Iterators](@ref) +7. [Iterator Maps](@ref) +8. [Iterators](@ref) diff --git a/docs/src/reference/candidate.md b/docs/src/reference/candidate.md index 18b60d95..d6244fa1 100644 --- a/docs/src/reference/candidate.md +++ b/docs/src/reference/candidate.md @@ -10,6 +10,4 @@ Population model opt lossfun -BatchSizeIteratorMap -AbstractIteratorMap ``` \ No newline at end of file diff --git a/docs/src/reference/crossover.md b/docs/src/reference/crossover.md index d5398a45..916605c9 100644 --- a/docs/src/reference/crossover.md +++ b/docs/src/reference/crossover.md @@ -14,6 +14,12 @@ CrossoverSwap OptimizerCrossover LearningRateCrossover ``` + +## Core IteratorMap Crossover Operations +```@docs +IteratorMapCrossover +``` + ## Functions ```@docs NaiveGAflux.crossover diff --git a/docs/src/reference/iteratormaps.md b/docs/src/reference/iteratormaps.md new file mode 100644 index 00000000..1ab8ff64 --- /dev/null +++ b/docs/src/reference/iteratormaps.md @@ -0,0 +1,16 @@ +# [Iterator Maps](@id IteratorMapsAPI) + +## Iterator map types +```@docs +AbstractIteratorMap +ShieldedIteratorMap +BatchSizeIteratorMap +IteratorMaps +``` + +## Interface functions (not exported) +```@docs +maptrain +mapvalidation +limit_maxbatchsize +``` \ No newline at end of file diff --git a/docs/src/reference/mutation.md b/docs/src/reference/mutation.md index 52634687..2521b23c 100644 --- a/docs/src/reference/mutation.md +++ b/docs/src/reference/mutation.md @@ -18,6 +18,11 @@ LearningRateMutation AddOptimizerMutation ``` +## Core IteratorMap Mutation Operations +```@docs +TrainBatchSizeMutation +``` + ## Wrapping Mutation Operations ```@docs diff --git a/src/batchsize.jl b/src/batchsize.jl index bb5a3c6b..5fcef008 100644 --- a/src/batchsize.jl +++ b/src/batchsize.jl @@ -288,17 +288,13 @@ function batchsizeselection(inshape_nobatch::Tuple; bs = isnothing(alternatives) ? bs : BatchSizeSelectionFromAlternatives(alternatives, bs) end -# specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( -# Consider refactoring -function limit_maxbatchsize(bs::TrainBatchSize, model::CompGraph; inshape_nobatch, availablebytes = _availablebytes()) +function limit_maxbatchsize(bs::TrainBatchSize, model; inshape_nobatch, availablebytes = _availablebytes()) maxsize = maxtrainbatchsize(model, inshape_nobatch, availablebytes) maxsize > -1 ? 
min(batchsize(bs), maxsize) : batchsize(bs) end -# specialization for CompGraph needed to avoid ambiguity with method that just unwraps an AbstractCandidate :( -# Consider refactoring function limit_maxbatchsize(bs::ValidationBatchSize, - model::CompGraph; + model; inshape_nobatch, availablebytes = _availablebytes() ) diff --git a/src/crossover/iteratormaps.jl b/src/crossover/iteratormaps.jl index 783a0733..8a072a41 100644 --- a/src/crossover/iteratormaps.jl +++ b/src/crossover/iteratormaps.jl @@ -1,9 +1,9 @@ """ -IteratorMapCrossover{C} <: AbstractCrossover{AbstractIteratorMap} -IteratorMapCrossover() -IteratorMapCrossover(crossover) + IteratorMapCrossover{C} <: AbstractCrossover{AbstractIteratorMap} + IteratorMapCrossover() + IteratorMapCrossover(crossover) Apply crossover between `AbstractIteratorMap`s. diff --git a/src/iteratormaps.jl b/src/iteratormaps.jl index c26969b8..3e94ea48 100644 --- a/src/iteratormaps.jl +++ b/src/iteratormaps.jl @@ -10,9 +10,25 @@ strategies for data augmentation and batch size selection can be evolved. """ abstract type AbstractIteratorMap end +""" + maptrain(im::AbstractIteratorMap, iter) + +Return an iterator (default `iter`) suitable for training. +""" maptrain(::AbstractIteratorMap, iter) = iter + +""" + mapvalidation(im::AbstractIteratorMap, iter) + +Return an iterator (default `iter`) suitable for validation. +""" mapvalidation(::AbstractIteratorMap, iter) = iter +""" + limit_maxbatchsize(im::AbstractIteratorMap, args...; kwargs...) + +Return an `AbstractIteratorMap` which is capable of limiting the batch size if applicable to the type of `im` (e.g. if `im` is a `BatchSizeIteratorMap`), otherwise return `im`. +""" limit_maxbatchsize(im::AbstractIteratorMap, args...; kwargs...) = im """ @@ -84,7 +100,7 @@ end IteratorMaps(maps...) IteratorMaps(maps::Tuple) -Aggregates multiple `AbstractIteratorMap`s. `maptrain` and `mapvalidation` are applied sequentially starting with the first element of `maps`. +Aggregates multiple `AbstractIteratorMap`s. `maptrain` and `mapvalidation` are applied sequentially starting with the first element of `maps`. """ struct IteratorMaps{T<:Tuple} <: AbstractIteratorMap maps::T diff --git a/src/mutation/iteratormaps.jl b/src/mutation/iteratormaps.jl index 370a4fd0..04d4e966 100644 --- a/src/mutation/iteratormaps.jl +++ b/src/mutation/iteratormaps.jl @@ -19,7 +19,8 @@ If `quantizeto` is a `DataType` (e.g `Int`) then the largest possible increase i More precisely, the new size is `round(quantizeto, (x+1) * batchsize)` where `x` is drawn from `U(minrel, maxrel)`. -If `quantizeto` is a an array or tuple of values then the new size is drawn from `quantizeto` with a maximum +If `quantizeto` is a an array or tuple of values then the new size is drawn from `quantizeto` with elements closer +to the current batch size being more likely. More precisely, the new size is `quantizeto[i]` where `i = j + round(Int, x * length(quantizeto))` where `x` is drawn from `U(minrel, maxrel)` and `j` is the index for which `quantizeto[j]` is the closest to the current batch size. 
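[Editorial note, not part of the patch] The `TrainBatchSizeMutation` docstring above states the quantization rule only as formulas. The sketch below illustrates that rule in runnable form; the helper name `mutated_batchsize_sketch` and the clamping of the computed index to valid bounds are assumptions made for illustration and are not the package's actual implementation.

```julia
using Random

# Hypothetical helper illustrating the documented rule; not NaiveGAflux API.
# `x` is drawn from U(minrel, maxrel); `quantizeto` is either a DataType or a
# collection of allowed batch sizes.
function mutated_batchsize_sketch(batchsize, minrel, maxrel, quantizeto; rng=Random.default_rng())
    x = minrel + (maxrel - minrel) * rand(rng)
    if quantizeto isa DataType
        # e.g. quantizeto = Int: scale the current size and round to that type
        return round(quantizeto, (x + 1) * batchsize)
    else
        # j is the index of the alternative closest to the current batch size
        j = argmin(abs.(collect(quantizeto) .- batchsize))
        i = j + round(Int, x * length(quantizeto))
        return quantizeto[clamp(i, 1, length(quantizeto))]  # clamp: illustration-only guard
    end
end

# Example: with quantizeto = ntuple(i -> 2^i, 10) and a current batch size of 64 (= 2^6),
# a small positive x moves the result to a nearby power of two such as 128 or 256.
mutated_batchsize_sketch(64, -0.1, 0.1, ntuple(i -> 2^i, 10))
```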
diff --git a/test/examples.jl b/test/examples.jl index 958be631..0b300b41 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -7,5 +7,6 @@ include("examples/fitness.jl") include("examples/candidate.jl") include("examples/evolution.jl") + include("examples/iteratormaps.jl") include("examples/iterators.jl") end diff --git a/test/examples/iteratormaps.jl b/test/examples/iteratormaps.jl new file mode 100644 index 00000000..4c3915c7 --- /dev/null +++ b/test/examples/iteratormaps.jl @@ -0,0 +1,67 @@ +md""" +# Iterator Maps + +Iterator maps is the name chosen (in lack of a better name) for mapping an iterator to a new iterator. The main use +cases for this are: + +1. Limiting the batch size of a candidate to prevent out of memory errors (see [Batch Size Utilities](@ref BatchSizeUtilsAPI)). +2. Enabling search for the best training batch size (using e.g. [`TrainBatchSizeMutation`](@ref) and/or [`IteratorMapCrossover`](@ref)). +3. Enabling search for the best data augmentation setting (not part of this package as of yet). + +Iterator maps are inteded to be used with [`CandidateDataIterMap`](@ref) and must extend [`AbstractIteratorMap`](@ref). +See [`AbstractIteratorMap`](@ref) documentation for functions related to iterator maps. + +In an attempt to hit two birds with one stone, here is an example of a custom iterator map which logs the sizes +of what a wrapped iterator returns. This allows us to see the effects of [`BatchSizeIteratorMap`](@ref) without +digging too much into the internals. +""" + +@testset "Spy on the size" begin #src + +using NaiveGAflux, Flux +import NaiveGAflux: AbstractIteratorMap + +struct SizeSpyingIteratorMap <: AbstractIteratorMap end + +NaiveGAflux.maptrain(::SizeSpyingIteratorMap, iter) = Iterators.map(iter) do val + @info "The sizes are $(size.(val))" + return val +end + +# Create the iterator map we want to use. Last argument to [`BatchSizeIteratorMap`](@ref) is +# normally created through [`batchsizeselection`](@ref), but here we will use a dummy model +# for which the maximum batch size computation is not defined. +iteratormap = IteratorMaps(SizeSpyingIteratorMap(), BatchSizeIteratorMap(8, 16, (bs, _) -> bs)) + +# Create a candidate with the above mentioned dummy model. +cand = CandidateDataIterMap(iteratormap, CandidateModel(sum)) + +# Data set has `20` examples, and here we provide it "raw" without any batching for brevity. +# Other arguments are not important for this example. +fitstrat = TrainThenFitness( + dataiter = ((randn(32, 32, 3, 20), randn(1, 20)),), + defaultloss = (x, y) -> sum(x .+ y), + defaultopt = Flux.Optimise.Descent(), + fitstrat = SizeFitness() + ) + +# When the model is trained it will wrap the iterator accoring to our `iteratormap`. +@test_logs((:info, "The sizes are ((32, 32, 3, 8), (1, 8))"), + (:info, "The sizes are ((32, 32, 3, 8), (1, 8))"), + (:info, "The sizes are ((32, 32, 3, 4), (1, 4))"), + fitness(fitstrat, cand)) + +# Lets mutate the candidate with a new batch size (`SizeSpyingIteratorMap` does not have any properties to mutate). +# Here we set `l1 == l2` to prevent that randomness breaks the testcase, but you might want to use something like +# `TrainBatchSizeMutation(-0.1, 0.1, ntuple(i -> 2^i))`. The last argument is to make sure we select a power of two +# as the new batch size. +batchsizemutation = TrainBatchSizeMutation(0.1, 0.1, ntuple(i -> 2^i, 10)) + +# MapCandidate creates new candidates from a set of mutations or crossovers. 
+newcand = cand |> MapCandidate(batchsizemutation) + +@test_logs((:info, "The sizes are ((32, 32, 3, 16), (1, 16))"), + (:info, "The sizes are ((32, 32, 3, 4), (1, 4))"), + fitness(fitstrat, newcand)) + +end #src \ No newline at end of file diff --git a/test/examples/mutation.jl b/test/examples/mutation.jl index b10637cd..51cde1e4 100644 --- a/test/examples/mutation.jl +++ b/test/examples/mutation.jl @@ -5,15 +5,16 @@ Mutation is the way one candidate is transformed to a slightly different candida preserving parameters and alignment between layers, thus reducing the impact of mutating an already trained candidate. The following basic mutation operations are currently supported: -1. Change the output size of vertices using [`NoutMutation`](@ref). -2. Remove vertices using [`RemoveVertexMutation`](@ref). -3. Add vertices using [`AddVertexMutation`](@ref). -4. Remove edges between vertices using [`RemoveEdgeMutation`](@ref). -5. Add edges between vertices using [`AddEdgeMutation`](@ref). -6. Mutation of kernel size for conv layers using [`KernelSizeMutation`](@ref). -7. Change of activation function using [`ActivationFunctionMutation`](@ref). -8. Change the type of optimizer using [`OptimizerMutation`](@ref). -9. Add an optimizer using [`AddOptimizerMutation`](@ref). +1. Change the output size of vertices using [`NoutMutation`](@ref). +2. Remove vertices using [`RemoveVertexMutation`](@ref). +3. Add vertices using [`AddVertexMutation`](@ref). +4. Remove edges between vertices using [`RemoveEdgeMutation`](@ref). +5. Add edges between vertices using [`AddEdgeMutation`](@ref). +6. Mutation of kernel size for conv layers using [`KernelSizeMutation`](@ref). +7. Change of activation function using [`ActivationFunctionMutation`](@ref). +8. Change the type of optimizer using [`OptimizerMutation`](@ref). +9. Add an optimizer using [`AddOptimizerMutation`](@ref). +10. Change the batch size for training using [`TrainBatchSizeMutation`](@ref) Mutation operations are exported as structs rather than functions since they are designed to be composed with more generic utilities. Here are a few examples: """ diff --git a/test/examples/quicktutorial.jl b/test/examples/quicktutorial.jl index ca12f8e5..db9876b3 100644 --- a/test/examples/quicktutorial.jl +++ b/test/examples/quicktutorial.jl @@ -62,7 +62,7 @@ mutation = MutationChain(changesize, remlayer, addlayer) # applying the mutations above to three of the five models with higher fitness # giving higher probability of being selected. # -# [`MapCandidate^](@ref) helps with the plumbing of creating new `CandidateModel`s +# [`MapCandidate`](@ref) helps with the plumbing of creating new `CandidateModel`s # where `mutation` is applied to create a new model. elites = EliteSelection(2) mutate = SusSelection(3, EvolveCandidates(MapCandidate(mutation))) From 4713a277ac7f11aab991d031815716ca104e38e9 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Thu, 9 Jun 2022 00:54:35 +0200 Subject: [PATCH 27/36] Loosen wording which was no longer correct --- test/examples/candidate.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/examples/candidate.jl b/test/examples/candidate.jl index 1247211a..0bdac129 100644 --- a/test/examples/candidate.jl +++ b/test/examples/candidate.jl @@ -7,7 +7,7 @@ an `AbstractCandidate` needs to 2. Be able to create a new version of itself given a function which maps its fields to new fields. Capability 1. 
is generally performed through functions of the format `someproperty(candidate; default)` where in general -`someproperty(::AbstractCandidate; default=nothing) = default`. The following such functions are currently implemented by NaiveGAflux: +`someproperty(::AbstractCandidate; default=nothing) = default`. Examples of such functions are: * [`model(c; default)`](@ref model) : Return a model * [`opt(c; default)`](@ref opt) : Return an optimizer From e28a9fa6b460c11afdb8d26719acdf404fe0d030 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Thu, 9 Jun 2022 00:58:13 +0200 Subject: [PATCH 28/36] Replace reference link --- docs/src/reference/iteratormaps.md | 2 +- test/examples/iteratormaps.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/reference/iteratormaps.md b/docs/src/reference/iteratormaps.md index 1ab8ff64..af7286e1 100644 --- a/docs/src/reference/iteratormaps.md +++ b/docs/src/reference/iteratormaps.md @@ -8,7 +8,7 @@ BatchSizeIteratorMap IteratorMaps ``` -## Interface functions (not exported) +## [Interface functions (not exported)](@id IteratorMapInterfaceFunctionsAPI) ```@docs maptrain mapvalidation diff --git a/test/examples/iteratormaps.jl b/test/examples/iteratormaps.jl index 4c3915c7..ef4ddffe 100644 --- a/test/examples/iteratormaps.jl +++ b/test/examples/iteratormaps.jl @@ -9,7 +9,7 @@ cases for this are: 3. Enabling search for the best data augmentation setting (not part of this package as of yet). Iterator maps are inteded to be used with [`CandidateDataIterMap`](@ref) and must extend [`AbstractIteratorMap`](@ref). -See [`AbstractIteratorMap`](@ref) documentation for functions related to iterator maps. +See [`API`](@ref IteratorMapInterfaceFunctionsAPI) documentation for functions related to iterator maps. In an attempt to hit two birds with one stone, here is an example of a custom iterator map which logs the sizes of what a wrapped iterator returns. This allows us to see the effects of [`BatchSizeIteratorMap`](@ref) without From b4f48e509d1be45d9eeda7248826251ca8b240f4 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 15 Jun 2022 01:27:49 +0200 Subject: [PATCH 29/36] Replace tuples with arrays --- src/shape.jl | 108 +++++++++++++++++++++++++++++--------------------- test/shape.jl | 102 +++++++++++++++++++++++++---------------------- 2 files changed, 117 insertions(+), 93 deletions(-) diff --git a/src/shape.jl b/src/shape.jl index 82fec570..28bd251a 100644 --- a/src/shape.jl +++ b/src/shape.jl @@ -1,7 +1,6 @@ # General disclaimer: This whole little mini-lib feels like a poor mans implementation of basic symbolic arithmetics and therefore it is hidden in here so I can just delete it silently when I finally realize how this should really be handled. # It probably looks like it is capable of much more than it actually can do so if you stumble upon this, use with care. Any bug report are of course still extremely welcome, especially the kind which would motivate scrapping the whole thing in favour of a real solution. - """ ΔShape{N,M} @@ -94,10 +93,11 @@ end fshape(::Tuple{}, shape) = shape fshape(s::Tuple{ΔShape{N}, Vararg{ΔShape}}, shape::NTuple{N, Integer}) where N = foldr(fshape, reverse(s); init=shape) +fshape(s::AbstractArray{<:ΔShape{N}}, shape::NTuple{N, Integer}) where N = foldr(fshape, reverse(s); init=shape) """ revert(s::ΔShape) - revert(s::Tuple{Vararg{ΔShape}}) + revert(s::AbstractArray{<:ΔShape}) Return a `ΔShape` or tuple of `ΔShape`s which reverts the shape change of `s`, i.e `fshape((s..., revert(s)...), x) == x`. 
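[Editorial note, not part of the patch] This commit moves several `ΔShape` containers from tuples to arrays. The sketch below shows the identity quoted in the `revert` docstring using the vector form added here. It assumes the internal (unexported) names `ShapeAdd`, `ShapeMul`, `fshape` and `revert` from src/shape.jl and that `ShapeAdd`/`ShapeMul` add to and multiply the shape elementwise, as their names suggest; treat it as an illustration rather than a doctest.

```julia
import NaiveGAflux: ShapeAdd, ShapeMul, fshape, revert

s = [ShapeAdd(1, 2), ShapeMul(2, 2)]       # pad by (1, 2), then scale by (2, 2)
x = (30, 31)

y = fshape(s, x)                           # forward shape mapping, expected (62, 66)
roundtrip = fshape(vcat(s, revert(s)), x)  # appending revert(s) should undo the mapping
# `roundtrip` should equal `x` here; reverting a `ShapeDiv` can lose information
# to rounding, so the identity is exact only when the divisions are exact.
```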
@@ -109,6 +109,7 @@ revert(s::ShapeAdd) = ShapeAdd(.-shapeΔ(s)) revert(s::ShapeMul) = ShapeDiv(shapeΔ(s)) revert(s::ShapeDiv) = ShapeMul(shapeΔ(s)) revert(s::Tuple{Vararg{ΔShape}}) = reverse(revert.(s)) +revert(s::AbstractArray{<:ΔShape}) = reverse(revert.(s)) """ combine(s1::ΔShape,s2::ΔShape) @@ -116,8 +117,10 @@ revert(s::Tuple{Vararg{ΔShape}}) = reverse(revert.(s)) Return a `ΔShape` or tuple of `ΔShape`s which combines `s1` and `s2`, i.e `fshape((s1,s2), x) == fshape(combine(s1,s2), x)` """ combine(s1::ΔShape,s2::ΔShape) = s1,s2 -combine(s1::Tuple{Vararg{ΔShape}}, s2::ΔShape) = (s1[1:end-1]..., combine(last(s1), s2)...) -combine(s1::ΔShape, s2::Tuple{Vararg{ΔShape}}) = (combine(s1, first(s2))..., s2[2:end]...) +combine(s1::Tuple{Vararg{ΔShape}}, s2::ΔShape) = vcat(collect(s1[1:end-1]), combine(last(s1), s2)...) +combine(s1::ΔShape, s2::Tuple{Vararg{ΔShape}}) = vcat(combine(s1, first(s2))..., collect(s2[2:end])) +combine(s1::AbstractArray{<:ΔShape}, s2::ΔShape) = vcat(s1[1:end-1], combine(last(s1), s2)...) +combine(s1::ΔShape, s2::AbstractArray{<:ΔShape}) = vcat(combine(s1, first(s2))..., s2[2:end]) combine(s1::ShapeAdd{N}, s2::ShapeAdd{N}) where N = tuple(ShapeAdd(shapeΔ(s1) .+ shapeΔ(s2))) combine(s1::T, s2::T) where T <: Union{ShapeDiv{N}, ShapeMul{N}} where N = tuple(T(shapeΔ(s1) .* shapeΔ(s2))) # Note: Combining ShapeDiv and ShapeMul not generally safe due to rounding when dividing @@ -145,11 +148,12 @@ swapΔshape(s1::ShapeDiv{N}, s2::ShapeAdd{N}) where N = ShapeAdd(shapeΔ(s1) .* filter_noops(s::ΔShape) filter_noops(s::ΔShape...) filter_noops(s::Tuple{Vararg{ΔShape}}) + filter_noops(s::AbstractArray{<:ΔShape}) filter_noops(s::ΔShape) Return a tuple of `ΔShape`s where all identity mappings (e.g things like `ShapeAdd(0)`) are removed. -If called with a single identity mapping and empty tuple is returned. +If called with a single identity mapping an empty tuple is returned. """ filter_noops(s::ΔShape) = tuple(s) filter_noops(s::ΔShape...) = filter_noops(s) @@ -158,7 +162,16 @@ filter_noops(s::Union{ShapeMul, ShapeDiv}) = all(x -> x == 1, shapeΔ(s)) ? tupl filter_noops(s::ShapeAdd) = all(x -> x == 0, shapeΔ(s)) ? tuple() : tuple(s) """ - orderΔshapes(s::Tuple{Vararg{ΔShape}}; order=allΔshapetypes(s)) + filter_noops(s::AbstractArray{<:ΔShape}) + +Return an array of `ΔShape`s where all identity mappings (e.g things like `ShapeAdd(0)`) are removed. + +If called with a single identity mapping an empty array is returned. +""" +filter_noops(s::AbstractArray{<:ΔShape}) = mapreduce(filter_noops, (s1,s2) -> vcat(s1...,s2...), s; init=similar(s, 0)) + +""" + orderΔshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) Return a tuple of `ΔShape`s which has the same shape mapping as `s` (i.e `fshape(s, x) == fshape(orderΔshapes(s), x)`) but where `ΔShape`s to the extent possible are ordered according to `order`. @@ -166,67 +179,72 @@ Useful to determine whether two arbitrary sequences of `ΔShape`s result in the Warning: Sort is not stable due to lazy implementation, i.e `orderΔshapes(orderΔshapes(s; order=someorder);order=someorder)` is not guaranteed to return the same thing as `orderΔshapes(s; order=someorder)`. 
""" -function orderΔshapes(s::Tuple{Vararg{ΔShape}}; order=allΔshapetypes(s)) +orderΔshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) = orderΔshapes!(copy(s); order) +function orderΔshapes!(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) # Yeah, this is bubble sort :/ Given the constraint that ΔShapes can't always swap places along with the fact that swapping generally changes the swapped elements I couldn't think up and other sorting algo that works - sprev = tuple() - snew = s nlook = length(s) - # I'm a little worried that lack of stability guarantee will cause this to loop forever, but I have not been able to trigger it despite trying. - while sprev != snew - sprev = snew + + atleastoneswap = true + + while atleastoneswap nlook -= 1 + atleastoneswap = false for i in 1:nlook - s1,s2 = snew[i], snew[i+1] + s1,s2 = s[i], s[i+1] # Check if s1 shall be before s2 in the ordering # Note: We need to "bubble" on equality because s1 might be prevented from bubbling up while s2 isn't. if findfirst(st -> s1 isa st, order) >= findfirst(st -> s2 isa st, order) - snew = (snew[1:i-1]..., swapΔshape(s1,s2)..., snew[i+2:end]...) + s1n, s2n = swapΔshape(s1, s2) + atleastoneswap |= s1n != s1 + s[i] = s1n + s[i+1] = s2n end end end - return snew + return s end -allΔshapetypes(s::T) where T <: ΔShape = T +allΔshapetypes(::T) where T <: ΔShape = T allΔshapetypes(s::Tuple{Vararg{ΔShape}}) = unique(allΔshapetypes.(s)) +allΔshapetypes(s::AbstractArray{<:ΔShape}) = unique(allΔshapetypes.(s)) """ squashshapes(s::ΔShape...; order=allΔshapetypes(s)) - squashshapes(s::Tuple{Vararg{ΔShape}}; order=allΔshapetypes(s)) + squashshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) -Return a tuple of `ΔShape`s with the same shape mapping as `s` (i.e `fshape(s, x) == fshape(squashshapes(s), x)`) with as few `ΔShape`s as possible for the given `order`. +Return an array of `ΔShape`s with the same shape mapping as `s` (i.e `fshape(s, x) == fshape(squashshapes(s), x)`) with as few `ΔShape`s as possible for the given `order`. Useful to determine whether two arbitrary sequences of `ΔShape`s result in the same shape mapping for all shapes. """ -squashshapes(s::Tuple{}; order=nothing) = s -squashshapes(s::ΔShape; order=nothing) = tuple(s) -squashshapes(s::ΔShape...; order=allΔshapetypes(s)) = squashshapes(s; order = order) -squashshapes(s::Tuple{Vararg{ΔShape}}; order=allΔshapetypes(s)) = _squashshapes(orderΔshapes(s; order=order)) +squashshapes(s::ΔShape; kwargs...) = [s] +squashshapes(s::ΔShape...; order=allΔshapetypes(s)) = squashshapes(s; order) +squashshapes(s::Tuple{ΔShape, Vararg{ΔShape}}; order=allΔshapetypes(s)) = squashshapes(collect(s); order) +squashshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) = _squashshapes(orderΔshapes(s; order=order)) -_squashshapes(s::ΔShape) = tuple(s) -_squashshapes(s::Tuple{ΔShape}) = s -function _squashshapes(s::Tuple{Vararg{ΔShape}}) - squashed = filter_noops(foldr(combine, s)...) +_squashshapes(s::ΔShape) = [s] +function _squashshapes(s::AbstractArray{<:ΔShape}) + isempty(s) && return s + squashed = collect(filter_noops(foldr(combine, s))) squashed == s && return s - isempty(squashed) && return squashed + isempty(squashed) && return similar(s, 0) return _squashshapes(squashed) end Δshapediff(s1,s2) = filter_noops(squashshapes(_Δshapediff(s1,s2))) -_Δshapediff(s1::ΔShape{N}, s2::ΔShape{M}) where {N,M} = N == M ? (revert(s2), s1) : (s1,s2) -_Δshapediff(s1::ΔShape{N}, s2::ΔShape{N}) where N = s1 == s2 ? 
tuple() : (revert(s2), s1) -function _Δshapediff(s1::Tuple{Vararg{ΔShape}}, s2::Tuple{Vararg{ΔShape}}) +_Δshapediff(s1::ΔShape{N}, s2::ΔShape{M}) where {N,M} = N == M ? [revert(s2), s1] : [s1,s2] +_Δshapediff(s1::ΔShape{N}, s2::ΔShape{N}) where N = s1 == s2 ? typeof(s1)[] : [revert(s2), s1] +function _Δshapediff(s1::AbstractArray{<:ΔShape}, s2::AbstractArray{<:ΔShape}) # Pretty crappy heurisic tbh, but I couldn't think of anything better: # Step 1: Remove all identical ΔShapes # Step 2: Squash shapes and try again # Step 3: revert s2 and concat s1 - firstdiff = findfirst(((ss1,ss2)::Tuple) -> ss1 != ss2, collect(zip(s1,s2))) + firstdiff = findfirst(((ss1,ss2),) -> ss1 != ss2, collect(zip(s1,s2))) firstdiff = isnothing(firstdiff) ? min(length(s1), length(s2))+1 : firstdiff sd1 = s1[firstdiff:end] sd2 = s2[firstdiff:end] - isempty(sd1) && isempty(sd2) && return tuple() + isempty(sd1) && isempty(sd2) && return similar(s1, 0) ts1 = allΔshapetypes(sd1) ts2 = allΔshapetypes(sd2) @@ -237,7 +255,7 @@ function _Δshapediff(s1::Tuple{Vararg{ΔShape}}, s2::Tuple{Vararg{ΔShape}}) so2 = squashshapes(sd2; order=vcat(front, back)) (so1 != s1 || so2 != s2) && return _Δshapediff(so1, so2) - return (revert(so2)..., so1...) + return vcat(revert(so2), so1) end """ @@ -259,29 +277,29 @@ struct ShapeTrace{T,V1,V2} <: AbstractShapeTrace dest::V2 trace::T end -ShapeTrace(v) = ShapeTrace(v, v, Δshapes(v)) +ShapeTrace(v) = ShapeTrace(v, v, collect(Δshapes(v))) allΔshapetypes(t::ShapeTrace) = allΔshapetypes(t.trace) -allΔshapetypes(t::Tuple) = unique(mapreduce(allΔshapetypes, vcat, t)) +allΔshapetypes(t::Union{Tuple, <:AbstractArray}) = unique(mapreduce(allΔshapetypes, vcat, t)) squashshapes(t::ShapeTrace; order=allΔshapetypes(t)) = squashshapes(t.trace;order=order) # TODO: One can probably do better here when parallel paths can't be squashed # Now the first instance of such a path will basically prevent squashing of any subsequent paths, even if they are not parallel -squashshapes(t::Tuple; order=allΔshapetypes(t)) = mapfoldr(tt -> squashshapes(tt;order=order), (t1,t2) -> squashshapes(t1,t2; order=order), t) -function squashshapes(t::Tuple{Vararg{ShapeTrace}}; order=allΔshapetypes(t)) +squashshapes(t::AbstractArray; order=allΔshapetypes(t)) = mapfoldr(tt -> squashshapes(tt;order=order), (t1,t2) -> squashshapes(t1,t2; order=order), t) +function squashshapes(t::AbstractArray{<:ShapeTrace}; order=allΔshapetypes(t)) squashed = unique(map(tt -> squashshapes(tt;order=order), t)) - length(squashed) == 1 && return first(squashed) - return Tuple(squashed) # Danger danger! Graph probably only works for one single input shape + length(squashed) == 1 && return only(squashed) + return squashed # Danger danger! Graph probably only works for one single input shape end # This is the reason for "TODO: One can probably do better here when parallel paths can't be squashed" above -squashshapes(s1, s2; order=missing) = s1, s2 -squashshapes(s1::Tuple{Vararg{ΔShape}}, s2::Tuple{Vararg{ΔShape}}; order=allΔshapetypes((s1,s2))) = squashshapes((s1...,s2...); order=order) +squashshapes(s1, s2; order=missing) = [s1, s2] +squashshapes(s1::AbstractArray{<:ΔShape}, s2::AbstractArray{<:ΔShape}; order=allΔshapetypes((s1,s2))) = squashshapes(vcat(s1,s2); order=order) -visitvertex(tr::ShapeTrace, v) = ShapeTrace(tr.origin, v, (tr.trace..., Δshapes(v)...)) +visitvertex(tr::ShapeTrace, v) = ShapeTrace(tr.origin, v, vcat(tr.trace, Δshapes(v)...)) Base.merge(::AbstractVertex, tr::ShapeTrace) = tr -Base.merge(v::AbstractVertex, trs::ShapeTrace...) 
= ShapeTrace(v, v, tuple(tuple((ShapeTrace(t.origin, v, (t.trace..., Δshapes(v)...)) for t in trs)...))) +Base.merge(v::AbstractVertex, trs::ShapeTrace...) = ShapeTrace(v, v, [[(ShapeTrace(t.origin, v, vcat(t.trace, Δshapes(v)...)) for t in trs)...]]) """ shapetrace(v::AbstractVertex, vs::AbstractVertex...; trfun = v -> ShapeTrace(v)) @@ -310,12 +328,12 @@ Return a tuple of `ΔShape`s describing the shape mapping of `v`. More concretely, if `xs = size(x)[sdims]` then `size(v(x))[sdims] == fshape(Δshapes(v), xs)` where `sdims` are the shape dimensions of `x`, e.g. the height and width in case of 2D convolutions. """ Δshapes(v::AbstractVertex) = Δshapes(base(v)) -Δshapes(::InputVertex) = tuple() +Δshapes(::InputVertex) = ΔShape[] Δshapes(v::MutationVertex) = Δshapes(trait(v), v) Δshapes(t::DecoratingTrait, v) = Δshapes(base(t), v) Δshapes(::MutationSizeTrait, v) = _Δshapes(layertype(v), v) -_Δshapes(t::Any, v) = tuple() +_Δshapes(t::Any, v) = ΔShape[] function _Δshapes(::FluxConv{N}, v) where N c = layer(v) diff --git a/test/shape.jl b/test/shape.jl index 25a0aa9d..697b39e9 100644 --- a/test/shape.jl +++ b/test/shape.jl @@ -96,14 +96,14 @@ end @testset "Order shapes" begin - @test orderΔshapes(tuple(ShapeAdd(1,2))) == tuple(ShapeAdd(1,2)) - @test orderΔshapes((ShapeAdd(1,2), ShapeMul(2,3), ShapeAdd(4,6))) == (ShapeAdd(2,2), ShapeAdd(1,2), ShapeMul(2,3)) - @test orderΔshapes((ShapeAdd(1,2), ShapeMul(2,3), ShapeAdd(4,6), ShapeFlatten())) == (ShapeAdd(1,2), ShapeAdd(2,2), ShapeMul(2,3), ShapeFlatten()) + @test orderΔshapes([ShapeAdd(1,2)]) == [ShapeAdd(1,2)] + @test orderΔshapes([ShapeAdd(1,2), ShapeMul(2,3), ShapeAdd(4,6)]) == [ShapeAdd(2,2), ShapeAdd(1,2), ShapeMul(2,3)] + @test orderΔshapes([ShapeAdd(1,2), ShapeMul(2,3), ShapeAdd(4,6), ShapeFlatten()]) == [ShapeAdd(1,2), ShapeAdd(2,2), ShapeMul(2,3), ShapeFlatten()] @testset "Mixed ShapeAdds" begin - s = (ShapeAdd(1,2), ShapeDiv(2,3), ShapeMul(2,2), ShapeAdd(4,4), ShapeAdd(3,5), ShapeAdd(6,8), ShapeMul(2,3), ShapeMul(1,1), ShapeAdd(3,7)) + s = [ShapeAdd(1,2), ShapeDiv(2,3), ShapeMul(2,2), ShapeAdd(4,4), ShapeAdd(3,5), ShapeAdd(6,8), ShapeMul(2,3), ShapeMul(1,1), ShapeAdd(3,7)] os = orderΔshapes(s) - @test os == (ShapeAdd{2}((6, 12)), ShapeAdd{2}((1, 2)), ShapeAdd{2}((4, 6)), ShapeDiv{2}((2, 3)), ShapeMul{2}((2, 2)), ShapeAdd{2}((3, 5)), ShapeMul{2}((2, 3)), ShapeMul{2}((1, 1)), ShapeAdd{2}((3, 7))) + @test os == [ShapeAdd{2}((6, 12)), ShapeAdd{2}((1, 2)), ShapeAdd{2}((4, 6)), ShapeDiv{2}((2, 3)), ShapeMul{2}((2, 2)), ShapeAdd{2}((3, 5)), ShapeMul{2}((2, 3)), ShapeMul{2}((1, 1)), ShapeAdd{2}((3, 7))] @testset "same shape with input size $insize" for insize in 1:10 @test fshape(s, (insize,insize)) == fshape(os, (insize,insize)) @@ -111,9 +111,9 @@ end @testset "Single ShapeAdd with ShapeMul and ShapeDiv" begin - s = (ShapeMul(2,2), ShapeDiv(12,12), ShapeAdd(2,3), ShapeMul(2,2), ShapeMul(3,3)) + s = [ShapeMul(2,2), ShapeDiv(12,12), ShapeAdd(2,3), ShapeMul(2,2), ShapeMul(3,3)] os = orderΔshapes(s) - @test os == (ShapeMul(2,2), ShapeDiv(12,12), ShapeMul(3,3), ShapeMul(2,2), ShapeAdd(12,18)) + @test os == [ShapeMul(2,2), ShapeDiv(12,12), ShapeMul(3,3), ShapeMul(2,2), ShapeAdd(12,18)] @testset "same shape with input size $insize" for insize in 1:5:100 @test fshape(s, (insize,insize)) == fshape(os, (insize,insize)) end @@ -121,49 +121,53 @@ end @testset "squash shapes" begin - @test squashshapes(ShapeAdd(1,2)) == tuple(ShapeAdd(1,2)) - @test squashshapes(ShapeAdd(1,2), ShapeAdd(3,4)) == tuple(ShapeAdd(4,6)) - @test squashshapes(ShapeAdd(1,2), 
ShapeAdd(3,4), ShapeAdd(5,6)) == tuple(ShapeAdd(9,12)) - @test squashshapes(ShapeAdd(1,2), ShapeAdd(3,4), ShapeAdd(-4,-6)) == tuple() - @test squashshapes(ShapeAdd(1,2), ShapeMul(3,4)) == (ShapeAdd(1,2), ShapeMul(3,4)) - @test squashshapes(ShapeAdd(1,2), ShapeAdd(-1,1), ShapeFlatten()) == (ShapeAdd(0, 3), ShapeFlatten()) - - as = (ShapeAdd(1,2), ShapeMul(3,4)) + @test squashshapes(ShapeAdd(1,2)) == [ShapeAdd(1,2)] + @test squashshapes(ShapeAdd(1,2), ShapeAdd(3,4)) == [ShapeAdd(4,6)] + @test squashshapes(ShapeAdd(1,2), ShapeAdd(3,4), ShapeAdd(5,6)) == [ShapeAdd(9,12)] + @testset "Empty result" begin + ss = squashshapes(ShapeAdd(1,2), ShapeAdd(3,4), ShapeAdd(-4,-6)) + @test isempty(ss) + @test eltype(ss) == ShapeAdd{2} + end + @test squashshapes(ShapeAdd(1,2), ShapeMul(3,4)) == [ShapeAdd(1,2), ShapeMul(3,4)] + @test squashshapes(ShapeAdd(1,2), ShapeAdd(-1,1), ShapeFlatten()) == [ShapeAdd(0, 3), ShapeFlatten()] + + as = [ShapeAdd(1,2), ShapeMul(3,4)] sa = revert(as) - @test squashshapes((as..., sa...)) == tuple() - @test squashshapes(ShapeAdd(4,2), ShapeDiv(2,2), ShapeMul(2,2), ShapeAdd(-4,-2)) == (ShapeDiv(2,2), ShapeMul(2,2)) + @test squashshapes(vcat(as, sa)) == [] + @test squashshapes(ShapeAdd(4,2), ShapeDiv(2,2), ShapeMul(2,2), ShapeAdd(-4,-2)) == [ShapeDiv(2,2), ShapeMul(2,2)] @testset "Squash mix add first" begin - s = (ShapeAdd(2,3), ShapeMul(2,2), ShapeMul(2,2), ShapeMul(3,3), ShapeDiv(12,12)) + s = [ShapeAdd(2,3), ShapeMul(2,2), ShapeMul(2,2), ShapeMul(3,3), ShapeDiv(12,12)] sq = squashshapes(s) - @test sq == tuple(ShapeAdd(2,3)) + @test sq == [ShapeAdd(2,3)] @testset "squashed shape with input size $insize" for insize in 1:5:100 @test fshape(s, (insize, insize)) == fshape(sq, (insize,insize)) end end @testset "Squash mix add last" begin - s = (ShapeMul(2,2), ShapeMul(2,2), ShapeMul(3,3), ShapeDiv(12,12), ShapeAdd(2,3)) + s = [ShapeMul(2,2), ShapeMul(2,2), ShapeMul(3,3), ShapeDiv(12,12), ShapeAdd(2,3)] sq = squashshapes(s) - @test sq == tuple(ShapeAdd(2,3)) + @test sq == [ShapeAdd(2,3)] @testset "squashed shape with input size $insize" for insize in 1:5:100 @test fshape(s, (insize, insize)) == fshape(sq, (insize,insize)) end end @testset "Squash mix add mid" begin - s = (ShapeMul(2,2), ShapeMul(2,2), ShapeAdd(2,3), ShapeMul(3,3), ShapeDiv(12,12)) + s = [ShapeMul(2,2), ShapeMul(2,2), ShapeAdd(2,3), ShapeMul(3,3), ShapeDiv(12,12)] sq = squashshapes(s) - @test sq == (ShapeMul(12, 12), ShapeAdd(6, 9), ShapeDiv(12, 12)) + @test sq == [ShapeMul(12, 12), ShapeAdd(6, 9), ShapeDiv(12, 12)] @testset "squashed shape with input size $insize" for insize in 1:5:100 @test fshape(s, (insize, insize)) == fshape(sq, (insize,insize)) end end @testset "Squash mix add and div mid" begin - s = (ShapeMul(2,2), ShapeMul(2,2), ShapeDiv(12,12), ShapeAdd(2,3), ShapeMul(3,3)) + s = [ShapeMul(2,2), ShapeMul(2,2), ShapeDiv(12,12), ShapeAdd(2,3), ShapeMul(3,3)] sq = squashshapes(s) - @test sq == (ShapeMul(4, 4), ShapeDiv(12,12), ShapeMul(3,3), ShapeAdd(6, 9)) + @test sq == [ShapeMul(4, 4), ShapeDiv(12,12), ShapeMul(3,3), ShapeAdd(6, 9)] @testset "squashed shape with input size $insize" for insize in 1:5:100 @test fshape(s, (insize, insize)) == fshape(sq, (insize,insize)) end @@ -174,35 +178,35 @@ import NaiveGAflux: Δshapediff @testset "Primitives" begin - @test Δshapediff(ShapeAdd(2,2), ShapeAdd(2,2)) == tuple() - @test Δshapediff(ShapeAdd(2,3), ShapeAdd(3,4)) == tuple(ShapeAdd(-1,-1)) + @test Δshapediff(ShapeAdd(2,2), ShapeAdd(2,2)) == [] + @test Δshapediff(ShapeAdd(2,3), ShapeAdd(3,4)) == [ShapeAdd(-1,-1)] - 
@test Δshapediff(ShapeMul(2,2), ShapeMul(2,2)) == tuple() - @test Δshapediff(ShapeMul(2,3), ShapeMul(3,4)) == (ShapeDiv(3,4), ShapeMul(2,3)) + @test Δshapediff(ShapeMul(2,2), ShapeMul(2,2)) == [] + @test Δshapediff(ShapeMul(2,3), ShapeMul(3,4)) == [ShapeDiv(3,4), ShapeMul(2,3)] - @test Δshapediff(ShapeDiv(2,2), ShapeDiv(2,2)) == tuple() - @test Δshapediff(ShapeDiv(2,3), ShapeDiv(3,4)) == (ShapeMul(3,4), ShapeDiv(2,3)) - @test Δshapediff(ShapeDiv(2,3), ShapeDiv(4,6)) == tuple(ShapeMul(2,2)) + @test Δshapediff(ShapeDiv(2,2), ShapeDiv(2,2)) == [] + @test Δshapediff(ShapeDiv(2,3), ShapeDiv(3,4)) == [ShapeMul(3,4), ShapeDiv(2,3)] + @test Δshapediff(ShapeDiv(2,3), ShapeDiv(4,6)) == [ShapeMul(2,2)] - @test Δshapediff(ShapeFlatten(), ShapeFlatten()) == tuple() - @test Δshapediff(ShapeFlatten{2}(), ShapeFlatten{3}()) == tuple(ShapeFlatten{2}(), ShapeFlatten{3}()) + @test Δshapediff(ShapeFlatten(), ShapeFlatten()) == [] + @test Δshapediff(ShapeFlatten{2}(), ShapeFlatten{3}()) == [ShapeFlatten{2}(), ShapeFlatten{3}()] end - @testset "Tuples" begin + @testset "Arrays" begin @testset "Basic" begin - s1 = (ShapeAdd(2,2), ShapeDiv(2,2), ShapeMul(4,2)) - @test Δshapediff(s1, s1) == tuple() + s1 = [ShapeAdd(2,2), ShapeDiv(2,2), ShapeMul(4,2)] + @test Δshapediff(s1, s1) == [] - @test Δshapediff(s1, s1[1:2]) == tuple(ShapeMul(4,2)) - @test Δshapediff(s1, s1[2:3]) == tuple(ShapeAdd(4,2)) - @test Δshapediff(s1, s1[[1,3]]) == (ShapeDiv(8, 4), ShapeMul(4, 2)) + @test Δshapediff(s1, s1[1:2]) == [ShapeMul(4,2)] + @test Δshapediff(s1, s1[2:3]) == [ShapeAdd(4,2)] + @test Δshapediff(s1, s1[[1,3]]) == [ShapeDiv(8, 4), ShapeMul(4, 2)] - @test Δshapediff(s1[1:2], s1) == tuple(ShapeDiv(4,2)) - @test Δshapediff(s1[2:3], s1) == tuple(ShapeAdd(-4,-2)) - @test Δshapediff(s1[[1,3]], s1) == (ShapeDiv(4, 2), ShapeMul(8, 4)) + @test Δshapediff(s1[1:2], s1) == [ShapeDiv(4,2)] + @test Δshapediff(s1[2:3], s1) == [ShapeAdd(-4,-2)] + @test Δshapediff(s1[[1,3]], s1) == [ShapeDiv(4, 2), ShapeMul(8, 4)] - s2 = (ShapeDiv(4,4), ShapeAdd(3,5), ShapeMul(8,4), ShapeDiv(2,2)) - @test Δshapediff(s1,s2) == (ShapeDiv(4, 2), ShapeMul(8, 4), ShapeAdd(-20, -18)) + s2 = [ShapeDiv(4,4), ShapeAdd(3,5), ShapeMul(8,4), ShapeDiv(2,2)] + @test Δshapediff(s1,s2) == [ShapeDiv(4, 2), ShapeMul(8, 4), ShapeAdd(-20, -18)] end end end @@ -271,7 +275,9 @@ g = CompGraph(vi, v4) sg = g(ShapeTrace(vi)).trace sv = shapetrace(v4; trfun = v -> ShapeTrace(v)).trace + @test squashshapes(sg) == squashshapes(sv) == [ShapeAdd{2}((1, -3)), ShapeDiv{2}((6, 7))] @test fshape(sg, (30,31)) == fshape(sv, (30,31))== size(g(ones(Float32, 30,31, nout(vi), 1)))[1:2] + @test fshape(squashshapes(sg), (30,31)) == fshape(sg, (30,31)) end end @@ -338,7 +344,7 @@ @test sv2[1][1] == sva2 @test sv2[1][2] == svb3 - @test sv2[2] == filter_noops(Δshapes(v2)) + @test tuple(sv2[2][]) == filter_noops(Δshapes(v2)) @test fshape((sva2...,sv2[2]...), (13,14)) == (3,3) @test fshape((svb3...,sv2[2]...), (13,14)) == (3,3) @@ -371,7 +377,7 @@ mv1 = "mv1" >> va3 + vb4 v2 = pv(mv1, "v2"; ks=(3,3)) - @test squashshapes(shapetrace(v2)) == squashshapes(shapetrace(v2, v1)) == (ShapeAdd(-59, -59), ShapeDiv(6, 6)) + @test squashshapes(shapetrace(v2)) == squashshapes(shapetrace(v2, v1)) == [ShapeAdd(-59, -59), ShapeDiv(6, 6)] end @testset "Squash with different start types" begin @@ -391,7 +397,7 @@ @test allΔshapetypes(st) == [ShapeDiv{2}, ShapeAdd{2}] - @test squashshapes(st) == (ShapeDiv(2, 2), ShapeAdd(-2,-2)) + @test squashshapes(st) == [ShapeDiv(2, 2), ShapeAdd(-2,-2)] end end end From 
f3d8f4cfe4a3cce5ad6eaed0de4197544216c0a3 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 15 Jun 2022 01:47:23 +0200 Subject: [PATCH 30/36] Tmp add Cbc version 1.0.1 --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 0cb4ab63..06ffff6a 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.10.0" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +Cbc = "9961bab8-2fa3-5c5a-9d89-47fab24efd76" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" @@ -22,6 +23,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] +Cbc = "1.0.1" CUDA = "3" Flux = "0.13" Functors = "0.2" From fc35af023db0aae0d97574d9da3bc2c016473e54 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Wed, 15 Jun 2022 09:11:04 +0200 Subject: [PATCH 31/36] Loosen type constraint in fshape --- src/shape.jl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/shape.jl b/src/shape.jl index 28bd251a..14815d35 100644 --- a/src/shape.jl +++ b/src/shape.jl @@ -93,11 +93,11 @@ end fshape(::Tuple{}, shape) = shape fshape(s::Tuple{ΔShape{N}, Vararg{ΔShape}}, shape::NTuple{N, Integer}) where N = foldr(fshape, reverse(s); init=shape) -fshape(s::AbstractArray{<:ΔShape{N}}, shape::NTuple{N, Integer}) where N = foldr(fshape, reverse(s); init=shape) +fshape(s::AbstractVector{<:ΔShape}, shape::NTuple{N, Integer}) where N = foldr(fshape, reverse(s); init=shape) """ revert(s::ΔShape) - revert(s::AbstractArray{<:ΔShape}) + revert(s::AbstractVector{<:ΔShape}) Return a `ΔShape` or tuple of `ΔShape`s which reverts the shape change of `s`, i.e `fshape((s..., revert(s)...), x) == x`. @@ -109,7 +109,7 @@ revert(s::ShapeAdd) = ShapeAdd(.-shapeΔ(s)) revert(s::ShapeMul) = ShapeDiv(shapeΔ(s)) revert(s::ShapeDiv) = ShapeMul(shapeΔ(s)) revert(s::Tuple{Vararg{ΔShape}}) = reverse(revert.(s)) -revert(s::AbstractArray{<:ΔShape}) = reverse(revert.(s)) +revert(s::AbstractVector{<:ΔShape}) = reverse(revert.(s)) """ combine(s1::ΔShape,s2::ΔShape) @@ -119,8 +119,8 @@ Return a `ΔShape` or tuple of `ΔShape`s which combines `s1` and `s2`, i.e `fsh combine(s1::ΔShape,s2::ΔShape) = s1,s2 combine(s1::Tuple{Vararg{ΔShape}}, s2::ΔShape) = vcat(collect(s1[1:end-1]), combine(last(s1), s2)...) combine(s1::ΔShape, s2::Tuple{Vararg{ΔShape}}) = vcat(combine(s1, first(s2))..., collect(s2[2:end])) -combine(s1::AbstractArray{<:ΔShape}, s2::ΔShape) = vcat(s1[1:end-1], combine(last(s1), s2)...) -combine(s1::ΔShape, s2::AbstractArray{<:ΔShape}) = vcat(combine(s1, first(s2))..., s2[2:end]) +combine(s1::AbstractVector{<:ΔShape}, s2::ΔShape) = vcat(s1[1:end-1], combine(last(s1), s2)...) +combine(s1::ΔShape, s2::AbstractVector{<:ΔShape}) = vcat(combine(s1, first(s2))..., s2[2:end]) combine(s1::ShapeAdd{N}, s2::ShapeAdd{N}) where N = tuple(ShapeAdd(shapeΔ(s1) .+ shapeΔ(s2))) combine(s1::T, s2::T) where T <: Union{ShapeDiv{N}, ShapeMul{N}} where N = tuple(T(shapeΔ(s1) .* shapeΔ(s2))) # Note: Combining ShapeDiv and ShapeMul not generally safe due to rounding when dividing @@ -148,7 +148,7 @@ swapΔshape(s1::ShapeDiv{N}, s2::ShapeAdd{N}) where N = ShapeAdd(shapeΔ(s1) .* filter_noops(s::ΔShape) filter_noops(s::ΔShape...) 
filter_noops(s::Tuple{Vararg{ΔShape}}) - filter_noops(s::AbstractArray{<:ΔShape}) + filter_noops(s::AbstractVector{<:ΔShape}) filter_noops(s::ΔShape) Return a tuple of `ΔShape`s where all identity mappings (e.g things like `ShapeAdd(0)`) are removed. @@ -162,16 +162,16 @@ filter_noops(s::Union{ShapeMul, ShapeDiv}) = all(x -> x == 1, shapeΔ(s)) ? tupl filter_noops(s::ShapeAdd) = all(x -> x == 0, shapeΔ(s)) ? tuple() : tuple(s) """ - filter_noops(s::AbstractArray{<:ΔShape}) + filter_noops(s::AbstractVector{<:ΔShape}) Return an array of `ΔShape`s where all identity mappings (e.g things like `ShapeAdd(0)`) are removed. If called with a single identity mapping an empty array is returned. """ -filter_noops(s::AbstractArray{<:ΔShape}) = mapreduce(filter_noops, (s1,s2) -> vcat(s1...,s2...), s; init=similar(s, 0)) +filter_noops(s::AbstractVector{<:ΔShape}) = mapreduce(filter_noops, (s1,s2) -> vcat(s1...,s2...), s; init=similar(s, 0)) """ - orderΔshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) + orderΔshapes(s::AbstractVector{<:ΔShape}; order=allΔshapetypes(s)) Return a tuple of `ΔShape`s which has the same shape mapping as `s` (i.e `fshape(s, x) == fshape(orderΔshapes(s), x)`) but where `ΔShape`s to the extent possible are ordered according to `order`. @@ -179,8 +179,8 @@ Useful to determine whether two arbitrary sequences of `ΔShape`s result in the Warning: Sort is not stable due to lazy implementation, i.e `orderΔshapes(orderΔshapes(s; order=someorder);order=someorder)` is not guaranteed to return the same thing as `orderΔshapes(s; order=someorder)`. """ -orderΔshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) = orderΔshapes!(copy(s); order) -function orderΔshapes!(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) +orderΔshapes(s::AbstractVector{<:ΔShape}; order=allΔshapetypes(s)) = orderΔshapes!(copy(s); order) +function orderΔshapes!(s::AbstractVector{<:ΔShape}; order=allΔshapetypes(s)) # Yeah, this is bubble sort :/ Given the constraint that ΔShapes can't always swap places along with the fact that swapping generally changes the swapped elements I couldn't think up and other sorting algo that works nlook = length(s) @@ -206,11 +206,11 @@ end allΔshapetypes(::T) where T <: ΔShape = T allΔshapetypes(s::Tuple{Vararg{ΔShape}}) = unique(allΔshapetypes.(s)) -allΔshapetypes(s::AbstractArray{<:ΔShape}) = unique(allΔshapetypes.(s)) +allΔshapetypes(s::AbstractVector{<:ΔShape}) = unique(allΔshapetypes.(s)) """ squashshapes(s::ΔShape...; order=allΔshapetypes(s)) - squashshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) + squashshapes(s::AbstractVector{<:ΔShape}; order=allΔshapetypes(s)) Return an array of `ΔShape`s with the same shape mapping as `s` (i.e `fshape(s, x) == fshape(squashshapes(s), x)`) with as few `ΔShape`s as possible for the given `order`. @@ -219,10 +219,10 @@ Useful to determine whether two arbitrary sequences of `ΔShape`s result in the squashshapes(s::ΔShape; kwargs...) 
= [s] squashshapes(s::ΔShape...; order=allΔshapetypes(s)) = squashshapes(s; order) squashshapes(s::Tuple{ΔShape, Vararg{ΔShape}}; order=allΔshapetypes(s)) = squashshapes(collect(s); order) -squashshapes(s::AbstractArray{<:ΔShape}; order=allΔshapetypes(s)) = _squashshapes(orderΔshapes(s; order=order)) +squashshapes(s::AbstractVector{<:ΔShape}; order=allΔshapetypes(s)) = _squashshapes(orderΔshapes(s; order=order)) _squashshapes(s::ΔShape) = [s] -function _squashshapes(s::AbstractArray{<:ΔShape}) +function _squashshapes(s::AbstractVector{<:ΔShape}) isempty(s) && return s squashed = collect(filter_noops(foldr(combine, s))) squashed == s && return s @@ -233,7 +233,7 @@ end Δshapediff(s1,s2) = filter_noops(squashshapes(_Δshapediff(s1,s2))) _Δshapediff(s1::ΔShape{N}, s2::ΔShape{M}) where {N,M} = N == M ? [revert(s2), s1] : [s1,s2] _Δshapediff(s1::ΔShape{N}, s2::ΔShape{N}) where N = s1 == s2 ? typeof(s1)[] : [revert(s2), s1] -function _Δshapediff(s1::AbstractArray{<:ΔShape}, s2::AbstractArray{<:ΔShape}) +function _Δshapediff(s1::AbstractVector{<:ΔShape}, s2::AbstractVector{<:ΔShape}) # Pretty crappy heurisic tbh, but I couldn't think of anything better: # Step 1: Remove all identical ΔShapes # Step 2: Squash shapes and try again @@ -293,7 +293,7 @@ function squashshapes(t::AbstractArray{<:ShapeTrace}; order=allΔshapetypes(t)) end # This is the reason for "TODO: One can probably do better here when parallel paths can't be squashed" above squashshapes(s1, s2; order=missing) = [s1, s2] -squashshapes(s1::AbstractArray{<:ΔShape}, s2::AbstractArray{<:ΔShape}; order=allΔshapetypes((s1,s2))) = squashshapes(vcat(s1,s2); order=order) +squashshapes(s1::AbstractVector{<:ΔShape}, s2::AbstractVector{<:ΔShape}; order=allΔshapetypes((s1,s2))) = squashshapes(vcat(s1,s2); order=order) visitvertex(tr::ShapeTrace, v) = ShapeTrace(tr.origin, v, vcat(tr.trace, Δshapes(v)...)) From e8d0a45eb59f932a728501f617207754a4d492c7 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Thu, 16 Jun 2022 21:59:07 +0200 Subject: [PATCH 32/36] Remove Cbc dep --- Project.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Project.toml b/Project.toml index 06ffff6a..0cb4ab63 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.10.0" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -Cbc = "9961bab8-2fa3-5c5a-9d89-47fab24efd76" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" @@ -23,7 +22,6 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] -Cbc = "1.0.1" CUDA = "3" Flux = "0.13" Functors = "0.2" From 13698c84b10c79ac0b70bc276fe40fc9732034b3 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 19 Jun 2022 01:36:26 +0200 Subject: [PATCH 33/36] Bump NaiveNASlib compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 0cb4ab63..8e5d455d 100644 --- a/Project.toml +++ b/Project.toml @@ -28,7 +28,7 @@ Functors = "0.2" IterTools = "1" MemPool = "0.3" NaiveNASflux = "2" -NaiveNASlib = "2" +NaiveNASlib = "2.0.6" Reexport = "0.2.0, 1" Setfield = "0.3.4, 0.5, 0.6, 0.7, 0.8" julia = "1.7" From 8bdb65d97a23276ef7a79daf72fe15db4a831562 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sat, 16 Jul 2022 00:12:35 +0200 Subject: [PATCH 34/36] Fix Flux deprecated optimizers Set compat bound for Flux to 0.13.4 --- Project.toml | 2 +- .../imageclassification/ImageClassification.jl | 2 +- 
src/app/imageclassification/strategy.jl | 6 +++--- test/candidate.jl | 4 ++-- test/crossover/optimizer.jl | 18 +++++++++--------- test/examples/quicktutorial.jl | 2 +- test/mutation/optimizer.jl | 6 +++--- test/visualization/callbacks.jl | 8 ++++---- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Project.toml b/Project.toml index 8e5d455d..b2234eac 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] CUDA = "3" -Flux = "0.13" +Flux = "0.13.4" Functors = "0.2" IterTools = "1" MemPool = "0.3" diff --git a/src/app/imageclassification/ImageClassification.jl b/src/app/imageclassification/ImageClassification.jl index e3f9702c..807ddc5b 100644 --- a/src/app/imageclassification/ImageClassification.jl +++ b/src/app/imageclassification/ImageClassification.jl @@ -9,7 +9,7 @@ using NaiveNASlib.Advanced, NaiveNASlib.Extend import Flux using Flux: Dense, Conv, ConvTranspose, DepthwiseConv, CrossCor, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, MaxPool, MeanPool, Dropout, AlphaDropout, GlobalMaxPool, GlobalMeanPool, cpu, gpu -using Flux: Descent, Momentum, Nesterov, ADAM, NADAM, ADAGrad, WeightDecay +using Flux: Descent, Momentum, Nesterov, Adam, NAdam, AdaGrad, WeightDecay import Functors using Functors: fmap using Random diff --git a/src/app/imageclassification/strategy.jl b/src/app/imageclassification/strategy.jl index c6801bf6..ef7fd8d3 100644 --- a/src/app/imageclassification/strategy.jl +++ b/src/app/imageclassification/strategy.jl @@ -61,7 +61,7 @@ function TrainSplitAccuracy(;split=0.1, accuracyconfig=BatchedIterConfig(), accuracyfitness=AccuracyVsSize, trainconfig=TrainIterConfig(), - trainfitness=(iter, accf) -> GpuFitness(TrainThenFitness(StatefulGenerationIter(iter), Flux.Losses.logitcrossentropy, ADAM(), accf, 0.0))) + trainfitness=(iter, accf) -> GpuFitness(TrainThenFitness(StatefulGenerationIter(iter), Flux.Losses.logitcrossentropy, Adam(), accf, 0.0))) return TrainSplitAccuracy(split, accuracyconfig, accuracyfitness, trainconfig, trainfitness) end @@ -123,7 +123,7 @@ function TrainAccuracyVsSize(; trainconfig=TrainIterConfig(), trainfitness = dataiter -> sizevs(GpuFitness(TrainAccuracyFitness( dataiter=StatefulGenerationIter(dataiter), - defaultloss=Flux.Losses.logitcrossentropy, defaultopt = ADAM())))) + defaultloss=Flux.Losses.logitcrossentropy, defaultopt = Adam())))) return TrainAccuracyVsSize(trainconfig, trainfitness) end function fitnessfun(s::TrainAccuracyVsSize, x, y) @@ -374,7 +374,7 @@ end function optmutation(p=0.1) lrm = LearningRateMutation() - om = MutationProbability(OptimizerMutation([Descent, Momentum, Nesterov, ADAM, NADAM, ADAGrad]), p) + om = MutationProbability(OptimizerMutation([Descent, Momentum, Nesterov, Adam, NAdam, AdaGrad]), p) return MutationChain(lrm, om) end diff --git a/test/candidate.jl b/test/candidate.jl index 74944cea..dc32d5ea 100644 --- a/test/candidate.jl +++ b/test/candidate.jl @@ -162,7 +162,7 @@ end @test fitness(SizeFitness(), cand) == nparams(graph) graphmutation = VertexMutation(MutationFilter(v -> name(v)=="hlayer", AddVertexMutation(ArchSpace(DenseSpace([1], [relu]))))) - optmutation = OptimizerMutation((Momentum, Nesterov, ADAM)) + optmutation = OptimizerMutation((Momentum, Nesterov, Adam)) bsmutation = TrainBatchSizeMutation(0, -1, MockRng([0.5])) evofun = MapCandidate(graphmutation, optmutation, bsmutation) newcand = evofun(cand) @@ -407,7 +407,7 @@ end om1 = omf() @test learningrate(om1(Descent(0.1))) ≈ learningrate(om1(Momentum(0.1))) - opt = 
Optimiser(so(Descent(0.1)), Momentum(0.1), so(Descent(1.0)), ADAM(1.0), Descent(1.0)) + opt = Optimiser(so(Descent(0.1)), Momentum(0.1), so(Descent(1.0)), Adam(1.0), Descent(1.0)) @test length(om1(opt).os) == 4 @test learningrate(om1(opt)) ≈ learningrate(om1(Descent(0.01))) diff --git a/test/crossover/optimizer.jl b/test/crossover/optimizer.jl index c0054e5f..24a1792e 100644 --- a/test/crossover/optimizer.jl +++ b/test/crossover/optimizer.jl @@ -5,7 +5,7 @@ prts(o::Optimiser) = "$(typeof(o))$(prts.(Tuple(o.os)))" @testset "Swap optimizers $(prts(o1)) and $(prts(o2))" for (o1, o2) in ( - (ADAM(), Momentum()), + (Adam(), Momentum()), (Optimiser(Descent(), WeightDecay()), Optimiser(Momentum(), Nesterov())), ) oc = OptimizerCrossover() @@ -54,12 +54,12 @@ @testset "Different size Optimisers" begin oc = OptimizerCrossover() o1 = Optimiser(Descent(), WeightDecay(), Momentum()) - o2 = Optimiser(ADAM(), ADAMW(), NADAM(), RADAM()) + o2 = Optimiser(Adam(), AdamW(), NAdam(), RAdam()) o1n,o2n = oc((o1,o2)) - @test prts(o1n) == prts(Optimiser(ADAM(), ADAMW(), NADAM())) - @test prts(o2n) == prts(Optimiser(Descent(), WeightDecay(), Momentum(), RADAM())) + @test prts(o1n) == prts(Optimiser(Adam(), AdamW(), NAdam())) + @test prts(o2n) == prts(Optimiser(Descent(), WeightDecay(), Momentum(), RAdam())) end end @@ -68,12 +68,12 @@ oc = OptimizerCrossover() |> mplm |> OptimizerCrossover o1 = Optimiser(Descent(), WeightDecay(), Momentum()) - o2 = Optimiser(ADAM(), ADAGrad(), AdaMax()) + o2 = Optimiser(Adam(), AdaGrad(), AdaMax()) - o1n,o2n = @test_logs (:info, "Crossover between WeightDecay and ADAGrad") oc((o1,o2)) + o1n,o2n = @test_logs (:info, "Crossover between WeightDecay and AdaGrad") oc((o1,o2)) - @test typeof.(o1n.os) == [Descent, ADAGrad, Momentum] - @test typeof.(o2n.os) == [ADAM, WeightDecay, AdaMax] + @test typeof.(o1n.os) == [Descent, AdaGrad, Momentum] + @test typeof.(o2n.os) == [Adam, WeightDecay, AdaMax] end @testset "Learningrate crossover" begin @@ -103,7 +103,7 @@ @testset "Optimiser" begin oc = LearningRateCrossover() o1 = Optimiser(Descent(0.1), Momentum(0.2), WeightDecay(0.1)) - o2 = Optimiser(ADAM(0.3), RADAM(0.4), NADAM(0.5), Nesterov(0.6)) + o2 = Optimiser(Adam(0.3), RAdam(0.4), NAdam(0.5), Nesterov(0.6)) o1n,o2n = oc((o1,o2)) diff --git a/test/examples/quicktutorial.jl b/test/examples/quicktutorial.jl index db9876b3..03010355 100644 --- a/test/examples/quicktutorial.jl +++ b/test/examples/quicktutorial.jl @@ -39,7 +39,7 @@ datasetvalidate = [(randn(ninputs, batchsize), onehot(rand(1:nlabels, batchsize) fitnessfunction = TrainThenFitness(; dataiter = datasettrain, defaultloss = Flux.logitcrossentropy, # Will be used if not provided by the candidate - defaultopt = ADAM(), # Same as above. State is wiped after training to prevent memory leaks + defaultopt = Adam(), # Same as above. 
State is wiped after training to prevent memory leaks fitstrat = AccuracyFitness(datasetvalidate) # This is what creates our fitness value after training ) diff --git a/test/mutation/optimizer.jl b/test/mutation/optimizer.jl index c8dff22e..f3883710 100644 --- a/test/mutation/optimizer.jl +++ b/test/mutation/optimizer.jl @@ -7,7 +7,7 @@ @test learningrate(m(Descent(0.1))) == 1.0 @test learningrate(m(ShieldedOpt(Momentum(0.1)))) == 0.1 - @test learningrate(m(Optimiser(Nesterov(0.1), ShieldedOpt(ADAM(0.1))))) == 0.1 + @test learningrate(m(Optimiser(Nesterov(0.1), ShieldedOpt(Adam(0.1))))) == 0.1 @test learningrate(LearningRateMutation(MockRng([0.0]))(Descent(0.1))) == 0.085 end @@ -17,7 +17,7 @@ @test typeof(m(Descent())) == Momentum @test typeof(m(ShieldedOpt(Descent()))) == ShieldedOpt{Descent} - @test typeof.(m(Optimiser(Nesterov(), ShieldedOpt(ADAM()))).os) == [Momentum, ShieldedOpt{ADAM}] + @test typeof.(m(Optimiser(Nesterov(), ShieldedOpt(Adam()))).os) == [Momentum, ShieldedOpt{Adam}] end @testset "Add optimizer" begin @@ -33,6 +33,6 @@ m = MutationChain(LogMutation(o -> "First", OptimizerMutation((Momentum, ))), LogMutation(o -> "Second", AddOptimizerMutation(o -> Descent()))) @test_logs (:info, "First") (:info, "Second") typeof.(m(Nesterov()).os) == [Momentum, Descent] - @test_logs (:info, "First") (:info, "First") (:info, "Second") (:info, "Second") m([Nesterov(), ADAM()]) + @test_logs (:info, "First") (:info, "First") (:info, "Second") (:info, "Second") m([Nesterov(), Adam()]) end end \ No newline at end of file diff --git a/test/visualization/callbacks.jl b/test/visualization/callbacks.jl index 646a798f..d1b0b38c 100644 --- a/test/visualization/callbacks.jl +++ b/test/visualization/callbacks.jl @@ -61,22 +61,22 @@ end @testset "ScatterOpt" begin - NaiveGAflux.opt(c::PlotTestCand) = fitness(c) > 2 ? ADAM(nparams(c) - fitness(c)) : Flux.Optimiser([ShieldedOpt(Descent(nparams(c) - fitness(c)))]) + NaiveGAflux.opt(c::PlotTestCand) = fitness(c) > 2 ? Adam(nparams(c) - fitness(c)) : Flux.Optimiser([ShieldedOpt(Descent(nparams(c) - fitness(c)))]) p = ScatterOpt((args...;kwargs...) -> true, testdir) @test !isdir(p.basedir) @test p(PlotTestCand.(1:3, [10, 20, 30], [100, 200, 300])) - @test p.data == [[1 99.0 Descent; 2 198.0 Descent; 3 297.0 ADAM]] + @test p.data == [[1 99.0 Descent; 2 198.0 Descent; 3 297.0 Adam]] @test p(PlotTestCand.(2:4, [20, 30, 40], [200, 300, 400])) - @test p.data == [[1 99 Descent; 2 198 Descent; 3 297 ADAM], [2 198 Descent; 3 297 ADAM; 4 396 ADAM]] + @test p.data == [[1 99 Descent; 2 198 Descent; 3 297 Adam], [2 198 Descent; 3 297 Adam; 4 396 Adam]] p2 = ScatterOpt((args...;kwargs...) -> true, testdir) @test p2.data == p.data @test p2(PlotTestCand.(3:5, [30, 40, 50], [300, 400, 500])) - @test p2.data == [[1 99 Descent; 2 198 Descent; 3 297 ADAM], [2 198 Descent; 3 297 ADAM; 4 396 ADAM],[3 297 ADAM; 4 396 ADAM; 5 495 ADAM]] + @test p2.data == [[1 99 Descent; 2 198 Descent; 3 297 Adam], [2 198 Descent; 3 297 Adam; 4 396 Adam],[3 297 Adam; 4 396 Adam; 5 495 Adam]] p3 = ScatterOpt((args...;kwargs...) 
-> true, testdir) @test p3(PlotTestCand.(3:5, [30, 40, 50], [300, 400, 500])) From e153ad24c0d0b8bc7d7908982d841c408d6b630a Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 7 Aug 2022 12:05:53 +0200 Subject: [PATCH 35/36] Add ScatterBatchSize --- src/NaiveGAflux.jl | 2 +- src/iteratormaps.jl | 2 +- src/visualize/callbacks.jl | 86 +++++++++++++++++++++++++++++---- test/iteratormaps.jl | 4 +- test/visualization/callbacks.jl | 44 ++++++++++++++++- 5 files changed, 123 insertions(+), 15 deletions(-) diff --git a/src/NaiveGAflux.jl b/src/NaiveGAflux.jl index fcb33619..745f5ffd 100644 --- a/src/NaiveGAflux.jl +++ b/src/NaiveGAflux.jl @@ -94,7 +94,7 @@ export nparams export AutoFlux # Visulization -export PlotFitness, ScatterPop, ScatterOpt, MultiPlot, CbAll +export PlotFitness, ScatterPop, ScatterOpt, ScatterBatchSize, MultiPlot, CbAll include("util.jl") include("shape.jl") diff --git a/src/iteratormaps.jl b/src/iteratormaps.jl index 3e94ea48..900a08f7 100644 --- a/src/iteratormaps.jl +++ b/src/iteratormaps.jl @@ -126,5 +126,5 @@ maptrain(sim::ShieldedIteratorMap, args...) = maptrain(sim.map, args...) mapvalidation(sim::ShieldedIteratorMap, args...) = mapvalidation(sim.map, args...) function limit_maxbatchsize(sim::ShieldedIteratorMap, args...; kwargs...) - ShieldedIteratorMap(limit_maxbatchsize(sim.map), args...; kwargs...) + ShieldedIteratorMap(limit_maxbatchsize(sim.map, args...; kwargs...)) end \ No newline at end of file diff --git a/src/visualize/callbacks.jl b/src/visualize/callbacks.jl index 307183a4..b53b97e7 100644 --- a/src/visualize/callbacks.jl +++ b/src/visualize/callbacks.jl @@ -1,4 +1,5 @@ - +# This design is pretty bleh and should probably be reworked sometime +# Eventually there will be too many different metrics to plot that it won't scale to have a single type for a metric trio loadifpresent(filename, default=Float32[]) = isfile(filename) ? deserialize(filename) : default @@ -19,11 +20,11 @@ julia> gr(); julia> cb=PlotFitness(plot, "models/test"); ``` """ -struct PlotFitness +struct PlotFitness{P, S} best::Vector{Float32} avg::Vector{Float32} - plt - basedir + plt::P + basedir::S end function PlotFitness(plotfun, rootdir, subdir="PlotFitness") @@ -67,10 +68,10 @@ julia> using NaiveGAflux, Plots julia> cb=ScatterPop(scatter, "models/test"); ``` """ -struct ScatterPop - plotfun +struct ScatterPop{F, S} + plotfun::F data::Vector{Array{Float32, 2}} - basedir + basedir::S end function ScatterPop(plotfun, rootdir::AbstractString, subdir="ScatterPop") @@ -118,10 +119,10 @@ julia> using NaiveGAflux, Plots julia> cb=ScatterOpt(scatter, "models/test"); ``` """ -struct ScatterOpt - plotfun +struct ScatterOpt{F, S} + plotfun::F data::Vector{Array{Any, 2}} - basedir + basedir::S end function ScatterOpt(plotfun, rootdir::AbstractString, subdir="ScatterOpt") @@ -169,6 +170,71 @@ function(p::ScatterOpt)(population) return plt end +""" + ScatterBatchSize(plotfun, rootdir, subdir="ScatterBatchSize") + +Scatter plot of batch size vs fitness vs number of parameters for each candidate. + +Also serializes data so that plotting can be resumed if evolution is aborted. 
+ +# Examples +```julia-repl +julia> using NaiveGAflux, Plots + +julia> cb=ScatterBatchSize(scatter, "models/test"); +``` +""" +struct ScatterBatchSize{F, S} + plotfun::F + data::Vector{Array{Float32, 2}} + basedir::S +end + +function ScatterBatchSize(plotfun, rootdir::AbstractString, subdir="ScatterBatchSize") + basedir = joinpath(rootdir, subdir) + data = loadifpresent(joinpath(basedir, "bsfitnp.jls"), Vector{Array{Float32, 2}}(undef, 0)) + return ScatterBatchSize(plotfun, data, basedir) +end + +function plotfitness(p::ScatterBatchSize, population) + fits = fitness.(population) + batchsizes = findtrainbatchsize.(population; default=0) + npars = nparams.(population) + + push!(p.data, hcat(batchsizes, fits, npars)) + plotgen(p) +end + +findtrainbatchsize(c::AbstractCandidate; kwargs...) = findtrainbatchsize(NaiveGAflux.iteratormap(c); kwargs...) +function findtrainbatchsize(ims::IteratorMaps; default) + for im in ims.maps + bs = findtrainbatchsize(im; default) + bs !== default && return bs + end + return default +end +findtrainbatchsize(::Any; default) = default +findtrainbatchsize(sim::ShieldedIteratorMap; kwargs...) = findtrainbatchsize(sim.map; kwargs...) +findtrainbatchsize(bs::BatchSizeIteratorMap; kwargs...) = NaiveGAflux.batchsize(bs.tbs) + +function plotgen(p::ScatterBatchSize, gen=length(p.data)) + gen == 0 && return p.plotfun() + data = p.data[gen] + batchsizes = data[:,1] + fits = data[:,2] + npars = data[:,3] + return p.plotfun(log2.(batchsizes), fits, zcolor=log10.(npars), m=(:heat, 0.8), xlabel="Batch Size (log2)", ylabel="Fitness", colorbar_title="Number of parameters (log10)", label=nothing) +end + +function(p::ScatterBatchSize)(population) + plt = plotfitness(p, population) + mkpath(p.basedir) + serialize(joinpath(p.basedir, "bsfitnp.jls"), p.data) + return plt +end + + + """ MultiPlot(plotfun, plts...) diff --git a/test/iteratormaps.jl b/test/iteratormaps.jl index 1f043721..5bd8331a 100644 --- a/test/iteratormaps.jl +++ b/test/iteratormaps.jl @@ -49,12 +49,12 @@ @testset "ShieldedIteratorMap" begin NaiveGAflux.maptrain(::Val{:TestDummy1}, itr) = Iterators.map(x -> 2x, itr) NaiveGAflux.mapvalidation(::Val{:TestDummy1}, itr) = Iterators.map(x -> 5x, itr) - NaiveGAflux.limit_maxbatchsize(::Val{:TestDummy1}) = Val(:TestDummy2) + NaiveGAflux.limit_maxbatchsize(::Val{:TestDummy1}, args...; kwargs...) 
= Val(:TestDummy2) sim = ShieldedIteratorMap(Val(:TestDummy1)) @test collect(maptrain(sim, 1:3)) == 2:2:6 @test collect(mapvalidation(sim, 1:3)) == 5:5:15 - @test limit_maxbatchsize(sim) == ShieldedIteratorMap(Val(:TestDummy2)) + @test limit_maxbatchsize(sim, 13; blah=14) == ShieldedIteratorMap(Val(:TestDummy2)) end end \ No newline at end of file diff --git a/test/visualization/callbacks.jl b/test/visualization/callbacks.jl index d1b0b38c..2c21df91 100644 --- a/test/visualization/callbacks.jl +++ b/test/visualization/callbacks.jl @@ -67,7 +67,7 @@ @test !isdir(p.basedir) @test p(PlotTestCand.(1:3, [10, 20, 30], [100, 200, 300])) - @test p.data == [[1 99.0 Descent; 2 198.0 Descent; 3 297.0 Adam]] + @test p.data == [[1 99 Descent; 2 198 Descent; 3 297 Adam]] @test p(PlotTestCand.(2:4, [20, 30, 40], [200, 300, 400])) @test p.data == [[1 99 Descent; 2 198 Descent; 3 297 Adam], [2 198 Descent; 3 297 Adam; 4 396 Adam]] @@ -84,7 +84,49 @@ @test length(p3.data) == 1 + length(p2.data) # What was added is just a copy paste of the last thing inserted to p2 so this lazy check works @test p3.data[end] == p2.data[end] + end + + @testset "findtrainbatchsize" begin + import NaiveGAflux: findtrainbatchsize + + @test findtrainbatchsize(nothing; default=13) == 13 + @test findtrainbatchsize("Aaa"; default=17) == 17 + + @testset "Candidate with $ic" for (ic, exp) in ( + (BatchSizeIteratorMap(13, 74, (m, args...;kwargs...) -> m), 13), + (IteratorMaps(), nothing), + (IteratorMaps(BatchSizeIteratorMap(13, 74, (m, args...;kwargs...) -> m)), 13), + (IteratorMaps(ShieldedIteratorMap(BatchSizeIteratorMap(13, 74, (m, args...;kwargs...) -> m))), 13), + ) + c = CandidateDataIterMap(ic, PlotTestCand(11,1,1)) + @test findtrainbatchsize(ic; default=nothing) == exp + end + end + + @testset "ScatterBatchSize" begin + NaiveGAflux.findtrainbatchsize(c::PlotTestCand; kwargs...) = 2 * fitness(c) + + p = ScatterBatchSize((args...;kwargs...) -> true, testdir) + @test !isdir(p.basedir) + + @test p(PlotTestCand.(1:3, [10, 20, 30], [100, 200, 300])) + @test p.data == [[2 1 100; 4 2 200; 6 3 300]] + + @test p(PlotTestCand.(2:4, [20, 30, 40], [200, 300, 400])) + @test p.data == [[2 1 100; 4 2 200; 6 3 300], [4 2 200; 6 3 300;8 4 400]] + p2 = ScatterBatchSize((args...;kwargs...) -> true, testdir) + @test p2.data == p.data + + @test p2(PlotTestCand.(3:5, [30, 40, 50], [300, 400, 500])) + @test p2.data == [[2 1 100; 4 2 200; 6 3 300], [4 2 200; 6 3 300;8 4 400] ,[6 3 300;8 4 400;10 5 500]] + + p3 = ScatterBatchSize((args...;kwargs...) -> true, testdir) + @test p3(PlotTestCand.(3:5, [30, 40, 50], [300, 400, 500])) + # Test that something was added + @test length(p3.data) == 1 + length(p2.data) + # What was added is just a copy paste of the last thing inserted to p2 so this lazy check works + @test p3.data[end] == p2.data[end] end finally From 451207d2a8cfbd88c9b819b4872b44c908b80ff2 Mon Sep 17 00:00:00 2001 From: DrChainsaw Date: Sun, 7 Aug 2022 12:16:48 +0200 Subject: [PATCH 36/36] Mention support for non-architecture hyperparameters --- docs/src/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/src/index.md b/docs/src/index.md index 62a7dd14..2707e16f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -7,6 +7,10 @@ done in a train-validate-select-evolve loop where the validation metric serves a There is however absolutely no enforcement of this structure and the parts are designed to work standalone and in a composable manner to support a wide variety of search strategies. 
+It is also not limited to model-architecture-related hyperparameters. Support for inclusion of optimizers, +learning rates and batch sizes into the search space is built in, and the framework supports adding any +hyperparameter (e.g. data augmentation strategies and loss functions) through simple interfaces. + +## Readers Guideline + +The [Quick Tutorial](@ref) serves as a starting point to get an idea of the syntax and type of capabilities of NaiveGAflux.
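
As a rough sketch of what putting such non-architecture hyperparameters under evolutionary control can look like (based on the `OptimizerMutation`, `TrainBatchSizeMutation` and `MapCandidate` utilities exercised in `test/candidate.jl` above; the argument values and the `cand` variable are illustrative assumptions, not part of any patch in this series):

```julia
# Illustrative sketch only: evolve the optimizer type and the training batch size
# alongside the model architecture. Constructor arguments are assumed values.
using NaiveGAflux
using Flux: Momentum, Nesterov, Adam

# Maybe swap the candidate's optimizer for one of the listed types
optmutation = OptimizerMutation((Momentum, Nesterov, Adam))

# Maybe nudge the training batch size within a relative range (values are illustrative)
bsmutation = TrainBatchSizeMutation(0, -1)

# MapCandidate lifts the individual mutations to whole candidates, so optimizer and
# batch size evolve together with the graph when the resulting function is applied.
evofun = MapCandidate(optmutation, bsmutation)

# newcand = evofun(cand)  # `cand` is assumed to be an existing AbstractCandidate
```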