diff --git a/text/char-rnn/char_rnn_gpu_minibatch.jl b/text/char-rnn/char_rnn_gpu_minibatch.jl
new file mode 100644
index 000000000..d0a91a4a3
--- /dev/null
+++ b/text/char-rnn/char_rnn_gpu_minibatch.jl
@@ -0,0 +1,82 @@
+using Flux
+using Flux: onehot, chunk, batchseq, throttle, crossentropy
+using StatsBase: wsample
+using Base.Iterators: partition
+using CuArrays
+using CUDAnative: device!
+using Random
+
+ϵ = 1.0f-32  # tiny offset added to the model output inside `loss` before `crossentropy`
+# Run configuration.
+epochs = 2          # number of passes over the full list of batches
+batch_size = 50     # forwarded to `chunk` below through `nbatch`
+sequence = 50       # forwarded to `partition` below through `seqlen`
+gpu_device = 0      # device index handed to `device!`
+
+device!(gpu_device)
+CuArrays.allowscalar(false)
+
+input_file = joinpath(dirname(@__FILE__),"input.txt")
+# Download the corpus only when it is not already present next to this script.
+isfile(input_file) ||
+    download("https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt",
+             input_file)
+# Read the file as characters and encode each one as a Float32 one-hot vector.
+text = collect(String(read(input_file)))
+alphabet = [unique(text)...,'_']  # every distinct character plus '_', which is used for `stop`
+text = map(ch -> Float32.(onehot(ch,alphabet)),text)
+stop = Float32.(onehot('_',alphabet))
+
+N = length(alphabet)  # input and output width of the model
+seqlen = sequence
+nbatch = batch_size
+# Inputs: `chunk` -> `batchseq` (with `stop` as filler) -> `partition` into groups of `seqlen`.
+Xs = collect(partition(batchseq(chunk(text, nbatch), stop), seqlen))
+txt = circshift(text,-1)  # targets are the inputs shifted by one position
+txt[end] = stop           # the wrapped-around last element is replaced by `stop`
+Ys = collect(partition(batchseq(chunk(txt, nbatch), stop), seqlen))
+# Three stacked LSTM layers, a Dense projection back to alphabet size, then softmax.
+model = Chain(
+  LSTM(N, 128),
+  LSTM(128, 256),
+  LSTM(256, 128),
+  Dense(128, N),
+  softmax)
+  m = model |>gpu
+
+opt = ADAM(0.01)
+tx, ty = (Xs[5]|>gpu, Ys[5]|>gpu)  # the 5th batch, kept on the GPU and used for the logged loss
+
+function loss(xx, yy)  # summed cross-entropy over the paired elements of `xx` and `yy`
+  out = 0.0f0
+  for (idx, x) in enumerate(xx)
+    out += crossentropy(m(x) .+ ϵ, yy[idx])  # ϵ is added to the model output before the loss
+  end
+  Flux.reset!(m)  # called once per `loss` evaluation, after the whole sequence was fed
+  out
+end
+
+idxs = length(Xs)  # number of batches per epoch; only used in the progress message
+for epoch_idx in 1:epochs
+  for (idx,(xs,ys)) in enumerate(zip(Xs, Ys))
+    Flux.train!(loss, params(m), [(xs|>gpu,ys|>gpu)], opt)
+    if idx % 10 == 0
+      lss = loss(tx,ty)  # monitoring loss: evaluated only on the steps where it is logged
+      @info "epoch# $(epoch_idx)/$(epochs)-$(idx)/$(idxs) loss = $(lss)"
+    end
+  end
+end
+
+# Sampling: generate `len` characters from a CPU copy of `m`, starting at a random character
+function sample(m, alphabet, len)
+  net = cpu(m)
+  Flux.reset!(net)
+  out = IOBuffer()
+  ch = rand(alphabet)
+  for _ in 1:len
+    print(out, ch)
+    ch = wsample(alphabet, net(onehot(ch, alphabet)))
+  end
+  return String(take!(out))
+end
+@info sample(m, alphabet, 1000)
diff --git a/vision/cifar10/cifar10_gpu_minibatch.jl b/vision/cifar10/cifar10_gpu_minibatch.jl
new file mode 100644
index 000000000..a6b9a1b57
--- /dev/null
+++ b/vision/cifar10/cifar10_gpu_minibatch.jl
@@ -0,0 +1,189 @@
+# Julia version : 1.3.1
+# Flux version : v0.10.1
+
+using Random
+using Dates
+using CuArrays
+using CUDAdrv
+using CUDAnative: device!
+using Flux, Metalhead, Statistics
+using Flux: onehotbatch, onecold, crossentropy, throttle
+using Metalhead: trainimgs
+using Images: channelview
+using Statistics: mean
+using Base.Iterators: partition
+
+model_file = joinpath(dirname(@__FILE__),"cifar10_vgg16_model.bson")
+
+# Get arguments
+
+epochs = 100
+batch_size = 128
+gpu_device = 0
+
+# Very important: this prevents the loss from becoming NaN
+ϵ = 1.0f-32
+
+# use the 1st GPU
+#CUDAnative.device!(0)
+device!(gpu_device)
+CuArrays.allowscalar(false)
+
+# VGG16 and VGG19 models
+vgg16() = Chain(  # builds a fresh Chain: 13 Conv+BatchNorm pairs in 5 pooled stages, then 3 Dense layers
+  Conv((3, 3), 3 => 64, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(64),
+  Conv((3, 3), 64 => 64, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(64),
+  MaxPool((2,2)),  # end of stage 1 (64 channels)
+  Conv((3, 3), 64 => 128, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(128),
+  Conv((3, 3), 128 => 128, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(128),
+  MaxPool((2,2)),  # end of stage 2 (128 channels)
+  Conv((3, 3), 128 => 256, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(256),
+  Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(256),
+  Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(256),
+  MaxPool((2,2)),  # end of stage 3 (256 channels)
+  Conv((3, 3), 256 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  MaxPool((2,2)),  # end of stage 4 (512 channels)
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  MaxPool((2,2)),  # end of stage 5 (512 channels)
+  x -> reshape(x, :, size(x, 4)),  # flatten everything but dim 4 (taken as the batch dimension)
+  Dense(512, 4096, relu),  # expects 512 features per sample; presumably 1x1 spatial size here (TODO confirm)
+  Dropout(0.5),
+  Dense(4096, 4096, relu),
+  Dropout(0.5),
+  Dense(4096, 10),  # 10 outputs, one per class
+  softmax)
+
+vgg19() = Chain(  # builds a fresh Chain: 16 Conv layers in 5 pooled stages, then 3 Dense layers
+  Conv((3, 3), 3 => 64, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(64),
+  Conv((3, 3), 64 => 64, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(64),
+  MaxPool((2,2)),  # end of stage 1 (64 channels)
+  Conv((3, 3), 64 => 128, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(128),
+  Conv((3, 3), 128 => 128, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(128),
+  MaxPool((2,2)),  # end of stage 2 (128 channels)
+  Conv((3, 3), 128 => 256, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(256),
+  Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(256),
+  Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(256),
+  Conv((3, 3), 256 => 256, relu, pad=(1, 1), stride=(1, 1)),  # NOTE(review): no BatchNorm follows this conv, unlike the others; confirm intended
+  MaxPool((2,2)),  # end of stage 3 (256 channels)
+  Conv((3, 3), 256 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),  # NOTE(review): no BatchNorm follows this conv; confirm intended
+  MaxPool((2,2)),  # end of stage 4 (512 channels)
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),
+  BatchNorm(512),
+  Conv((3, 3), 512 => 512, relu, pad=(1, 1), stride=(1, 1)),  # NOTE(review): no BatchNorm follows this conv; confirm intended
+  MaxPool((2,2)),  # end of stage 5 (512 channels)
+  x -> reshape(x, :, size(x, 4)),  # flatten everything but dim 4 (taken as the batch dimension)
+  Dense(512, 4096, relu),  # expects 512 features per sample; presumably 1x1 spatial size here (TODO confirm)
+  Dropout(0.5),
+  Dense(4096, 4096, relu),
+  Dropout(0.5),
+  Dense(4096, 10),  # 10 outputs, one per class
+  softmax)
+
+m = vgg16() |> gpu
+
+# Function to convert the RGB image to Float32 Arrays
+getarray(X) = Float32.(permutedims(channelview(X), (2, 3, 1)))
+
+function make_minibatch(imgs,labels,batch_size)  # -> vector of (image batch, label columns) tuples
+  batch_ranges = partition(1:length(imgs), batch_size)
+  return map(batch_ranges) do idxs
+    (cat(imgs[idxs]..., dims = 4), labels[:,idxs])
+  end
+end
+
+X = trainimgs(CIFAR10)
+train_idxs = 1:49000  # images 1:49000 of the training split are used for training
+train_imgs = [getarray(X[i].img) for i in train_idxs]
+train_labels = float.(onehotbatch([X[i].ground_truth.class for i in train_idxs],1:10))
+train_set = make_minibatch(train_imgs,train_labels,batch_size)
+# Images 49001:50000 of the training split are held out and kept as one single batch.
+verify_idxs = 49001:50000
+verify_imgs = cat([getarray(X[i].img) for i in verify_idxs]..., dims = 4)
+verify_labels = float.(onehotbatch([X[i].ground_truth.class for i in verify_idxs],1:10))
+verify_set = [(verify_imgs,verify_labels)]
+
+# Fetch the test data from Metalhead and get it into the proper shape.
+# CIFAR-10 does not specify a validation set, so `valimgs` (not `testimgs`) fetches the test data.
+tX = valimgs(CIFAR10)
+test_idxs = 1:10000
+test_imgs = [getarray(tX[i].img) for i in test_idxs]
+test_labels = float.(onehotbatch([tX[i].ground_truth.class for i in test_idxs], 1:10))
+test_set = make_minibatch(test_imgs,test_labels,batch_size)  # minibatched like the training set
+
+# Loss: cross-entropy between the model output (offset by ϵ) and the one-hot targets `y`
+loss(x, y) = crossentropy(m(x) .+ ϵ, y)
+
+function accuracy(data_set)  # fraction of samples in `data_set` that `m` classifies correctly
+  # Sum the real batch sizes: the final minibatch may be shorter than the others.
+  l = sum(size(x)[end] for (x, _) in data_set)
+  s = 0f0
+  for (x,y) in data_set
+    s += sum((onecold(m(x|>gpu) |> cpu) .== onecold(y|>cpu)))
+  end
+  return s/l
+end
+
+# Run the model once on the first training batch so it is compiled before the training loop
+m(train_set[1][1] |> gpu)
+
+# Defining the optimizer
+opt = ADAM(0.001)
+
+@info "Training model..."
+
+for epoch_idx in 1:epochs
+  accs = Array{Float32}(undef,0)  # verification accuracies collected during this epoch
+
+  train_set_len = length(train_set)
+  shuffle_idxs = collect(1:train_set_len)
+  shuffle!(shuffle_idxs)  # visit the minibatches in a new random order every epoch
+
+  for (idx,data_idx) in enumerate(shuffle_idxs)
+    (x,y) = train_set[data_idx]
+    # NOTE(review): the noise is scaled by ϵ (1.0f-32), so it is numerically negligible; confirm the intended scale
+    x = (x .+ ϵ*randn(eltype(x),size(x))) |> gpu
+    y = y|> gpu
+    Flux.train!(loss,params(m),[(x,y)],opt)
+    v_acc = accuracy(verify_set)  # recomputed after every minibatch update
+    @info "Epoch# $(epoch_idx)/$(epochs) - #$(idx)/$(train_set_len) loss: $(loss(x,y)), accuracy: $(v_acc)"
+    push!(accs,v_acc)
+  end
+  m_acc = mean(accs)
+  @info " -> Verify accuracy(mean) : $(m_acc)"
+end
+test_acc = accuracy(test_set)
+@info "Test accuracy : $(test_acc)"
diff --git a/vision/mnist/conv_gpu_minibatch.jl b/vision/mnist/conv_gpu_minibatch.jl
new file mode 100644
index 000000000..b6bddabf4
--- /dev/null
+++ b/vision/mnist/conv_gpu_minibatch.jl
@@ -0,0 +1,161 @@
+#=
+Test Environment
+ - Julia : v1.3.1
+ - Flux  : v0.10.1
+=#
+
+# Classifies MNIST digits with a convolution network.
+# Writes out saved model to the file "conv_gpu_minibatch.bson".
+# Demonstrates basic model construction, training, saving,
+# conditional early-exits, and learning rate scheduling.
+#
+# This model, while simple, should hit around 99% test
+# accuracy after training for approximately 20 epochs.
+
+using Flux, Flux.Data.MNIST, Statistics
+using Flux: onehotbatch, onecold, crossentropy, throttle
+using Base.Iterators: repeated, partition
+using Printf, BSON
+using Dates
+using CUDAnative: device!
+using CuArrays
+using Random
+using Dates
+
+model_file = joinpath(dirname(@__FILE__),"conv_gpu_minibatch.bson")
+
+epochs = 100
+batch_size = 128
+gpu_device = 0
+
+# set using GPU device
+device!(gpu_device)
+CuArrays.allowscalar(false)
+
+
+# Bundle images together with labels and groups into minibatch
+function make_minibatch(imgs,labels,batch_size)
+    dims = size(imgs[1])
+    # each image becomes a (dims..., 1, 1) Float32 array; cat along dim 4 forms the batch
+    to_batch(ids) = cat((reshape(Float32.(imgs[i]),dims...,1,1) for i in ids)...,dims=4)
+    batches = [(to_batch(ids), float.(onehotbatch(labels[ids],0:9)))
+               for ids in partition(1:length(imgs),batch_size)]
+    return batches
+end
+
+# Train data load
+train_labels = MNIST.labels()
+train_imgs = MNIST.images()
+# Make train data to minibatch
+train_set = make_minibatch(train_imgs,train_labels,batch_size)
+
+# Test data load
+test_labels = MNIST.labels(:test)
+test_imgs = MNIST.images(:test)
+test_set = make_minibatch(test_imgs,test_labels,batch_size)
+
+#=
+ Define our model. We will use a simple convolutional architecture with
+ three iterations of Conv -> ReLu -> MaxPool, followed by a final Dense
+ layer that feeds into a softmax probability output.
+=#
+@info "Construncting model..."
+model = Chain(
+  # First convolution, operating upon a 28x28 image
+  Conv((3,3), 1=>16, pad=(1,1), relu),
+  MaxPool((2,2)),
+
+  # Second convolution, operating upon a 14x14 image
+  Conv((3,3), 16=>32, pad=(1,1), relu),
+  MaxPool((2,2)),
+
+  # Third convolution, operating upon a 7x7 image
+  Conv((3,3), 32=>32, pad=(1,1), relu),
+  MaxPool((2,2)),
+
+  # Reshape the 4d tensor into a 2d one; at this point it should be (3,3,32,N),
+  # which is where we get the 288 (= 3*3*32) in the `Dense` layer below:
+  x -> reshape(x, :, size(x,4)),
+  Dense(288,10),
+
+  # Finally, softmax to get nice probabilities
+  softmax,
+)
+
+m = model |> gpu  # `m` is the GPU copy used by `loss`, `accuracy` and the training loop
+
+#=
+`loss()` calculates the crossentropy loss between our prediction `ŷ`
+ (calculated from `m(x)`) and the ground truth `y`. The gaussian noise
+ augmentation is applied to `x` by the training loop, not in here.
+ =#
+function loss(x,y)
+  ŷ = m(x)
+  return crossentropy(ŷ .+ 1.0f-32,y)  # tiny offset keeps log(0) from turning the loss into NaN
+end
+
+function accuracy(data_set)  # fraction of samples in `data_set` that `m` classifies correctly
+  # Sum the real batch sizes: the final minibatch may be shorter than the others.
+  l = sum(size(x)[end] for (x, _) in data_set)
+  s = 0f0
+  for (x,y) in data_set
+    s += sum((onecold(m(x|>gpu) |> cpu) .== onecold(y|>cpu)))
+  end
+  return s/l
+end
+
+# Make sure our model is nicely precompiled before starting our training loop
+m(train_set[1][1] |> gpu)
+
+# Train our model with the given training set using the ADAM optimizer and
+# printing out performance against the test set as we go.
+opt = ADAM(0.001)
+
+@info "Beginning training loop..."
+best_acc = 0.0
+last_improvement = 0
+
+for epoch_idx in 1:epochs
+  global best_acc, last_improvement
+  shuffle_idxs = collect(1:length(train_set))
+  shuffle!(shuffle_idxs)
+  for idx in shuffle_idxs
+    (x,y) = train_set[idx]
+    # We augment `x` a little bit here, adding in random noise
+    x = (x .+ 0.1f0*randn(eltype(x),size(x))) |> gpu
+    y = y|> gpu
+    Flux.train!(loss, params(m), [(x, y)],opt)
+  end
+  acc = accuracy(test_set)
+  @info(@sprintf("[%d]: Test accuracy: %.4f",epoch_idx,acc))
+
+  # If this is the best accuracy we've seen so far, save the model out.
+  # This runs before the early-exit check so the model that hits the target is saved too.
+  if acc >= best_acc
+    @info " -> New best accuracy! saving model out to $(model_file)"
+    model = m |> cpu
+    acc = acc |> cpu
+    BSON.@save model_file model epoch_idx acc
+    best_acc = acc
+    last_improvement = epoch_idx
+  end
+
+  # If our accuracy is good enough, quit out.
+  if acc >= 0.999
+    @info " -> Early-exiting: We reached our target accuracy of 99.9%"
+    break
+  end
+  # If we haven't seen improvement in 5 epochs, drop the learning rate:
+  if epoch_idx - last_improvement >= 5 && opt.eta > 1e-6
+    opt.eta /= 10.0
+    @warn " -> Haven't improved in a while, dropping learning rate to $(opt.eta)!"
+
+    # After dropping the learning rate, give it a few epochs to improve
+    last_improvement = epoch_idx
+  end
+
+  if epoch_idx - last_improvement >= 10
+    @warn " -> We're calling this converged."
+    break
+  end
+end
diff --git a/vision/mnist/mlp_gpu_minibatch.jl b/vision/mnist/mlp_gpu_minibatch.jl
new file mode 100644
index 000000000..84fe82734
--- /dev/null
+++ b/vision/mnist/mlp_gpu_minibatch.jl
@@ -0,0 +1,81 @@
+module MNIST_BATCH
+using Flux
+using Flux.Data.MNIST, Statistics
+using Flux: onehotbatch, onecold, crossentropy,throttle
+using Base.Iterators: repeated,partition
+
+using CUDAnative
+using CuArrays
+CuArrays.allowscalar(false)
+
+#=
+Very important !!
+ϵ is used to prevent loss NaN
+=#
+const ϵ = 1.0f-32
+
+# Load training labels and images from Flux.Data.MNIST
+@info("Loading data...")
+
+train_imgs = MNIST.images()
+train_labels = MNIST.labels()
+
+# use the 1st GPU (the default)
+CUDAnative.device!(0)
+# use 2nd GPU
+#CUDAnative.device!(1)
+
+# Bundle images together with labels and group into minibatch
+function make_minibatch(imgs,labels,batch_size)
+  features = gpu(hcat(float.(reshape.(imgs,:))...))  # one flattened image per column
+  targets = gpu(float.(onehotbatch(labels,0:9)))     # one-hot columns for the digits 0:9
+  return map(partition(1:length(labels),batch_size)) do cols
+    (features[:,cols], targets[:,cols])
+  end
+end
+
+@info("Making model...")
+# Model: 28^2 inputs -> 32 hidden units (relu) -> 10 outputs -> softmax, moved to the GPU
+m = Chain(
+  Dense(28^2,32,relu),
+  Dense(32,10),
+  softmax
+) |> gpu
+loss(x,y) = crossentropy(m(x) .+ ϵ, y)  # ϵ is added to the model output before the cross-entropy
+accuracy(x,y) = mean(onecold(m(x)|>cpu) .== onecold(y|>cpu))  # class indices are compared on the CPU
+
+batch_size = 500
+train_dataset = make_minibatch(train_imgs,train_labels,batch_size)
+
+opt = ADAM()
+
+
+@info("Training model...")
+
+epochs = 200
+# used for plots
+accs = Array{Float32}(undef,0)
+
+dataset_len = length(train_dataset)
+for i in 1:epochs
+  for (idx,dataset) in enumerate(train_dataset)
+    Flux.train!(loss,params(m),[dataset],opt)
+    if idx == dataset_len
+      acc = accuracy(dataset...)  # only the epoch's last batch is reported, so evaluate it here
+      @info("Epoch# $(i)/$(epochs) - loss: $(loss(dataset...)), accuracy: $(acc)")
+      push!(accs,acc)
+    end
+  end
+end
+
+# Test Accuracy
+tX = hcat(float.(reshape.(MNIST.images(:test),:))...) |> gpu
+tY = float.(onehotbatch(MNIST.labels(:test),0:9)) |> gpu
+
+println("Test loss:", loss(tX,tY))
+println("Test accuracy:", accuracy(tX,tY))
+
+end
+
+using Plots;gr()
+plot(MNIST_BATCH.accs)