diff --git a/Normalize.lua b/Normalize.lua
index 24c1d07b4..abdb33ed5 100644
--- a/Normalize.lua
+++ b/Normalize.lua
@@ -1,26 +1,28 @@
 local Normalize, parent = torch.class('nn.Normalize', 'nn.Module')
+Normalize.__version = 2
 
-function Normalize:__init(p,eps)
+function Normalize:__init(p, dim, eps)
   parent.__init(self)
   assert(p,'p-norm not provided')
   assert(p > 0, p..'-norm not supported')
   self.p = p
+  self.dim = dim or -1
   self.eps = eps or 1e-10
+  assert(self.dim % 1 == 0, 'dimension should be an integer')
+  assert(self.dim ~= 0, "dimension can't be 0")
 end
 
 function Normalize:updateOutput(input)
-  assert(input:dim() <= 2, 'only 1d layer supported')
-  local input_size = input:size()
-  if input:dim() == 1 then
-    input = input:view(1,-1)
+  assert(math.abs(self.dim) <= input:dim(),
+    'input has less dimensions than the normalization dimension')
+  local dim = self.dim
+  if dim < 0 then
+    dim = input:dim() + dim + 1
   end
 
-  self._output = self._output or input.new()
   self.norm = self.norm or input.new()
   self.buffer = self.buffer or input.new()
 
-  self._output:resizeAs(input)
-
   if self.p == math.huge then
     -- specialization for the infinity norm
     self._indices = self._indices or
@@ -28,7 +30,7 @@ function Normalize:updateOutput(input)
        torch.CudaTensor() or torch.LongTensor())
 
     self.buffer:abs(input)
-    torch.max(self.norm, self._indices, self.buffer, 2)
+    torch.max(self.norm, self._indices, self.buffer, dim)
     self.norm:add(self.eps)
   else
     self.normp = self.normp or input.new()
@@ -37,41 +39,35 @@ function Normalize:updateOutput(input)
     else
       self.buffer:pow(input,self.p)
     end
-    self.normp:sum(self.buffer,2):add(self.eps)
+    self.normp:sum(self.buffer, dim):add(self.eps)
     self.norm:pow(self.normp,1/self.p)
   end
-  self._output:cdiv(input, self.norm:view(-1,1):expandAs(input))
+  self.output:cdiv(input, self.norm:expandAs(input))
 
-  self.output:view(self._output, input_size)
   return self.output
 end
 
 function Normalize:updateGradInput(input, gradOutput)
-  assert(input:dim() <= 2, 'only 1d layer supported')
-  assert(gradOutput:dim() <= 2, 'only 1d layer supported')
-
-  local input_size = input:size()
-  if input:dim() == 1 then
-    input = input:view(1,-1)
+  assert(math.abs(self.dim) <= input:dim(),
+    'input has less dimensions than the normalization dimension')
+  local dim = self.dim
+  if dim < 0 then
+    dim = input:dim() + dim + 1
   end
 
-  local n = input:size(1) -- batch size
-  local d = input:size(2) -- dimensionality of vectors
-
-  self._gradInput = self._gradInput or input.new()
   self.cross = self.cross or input.new()
   -- compute diagonal term with gradOutput
-  self._gradInput:resize(n,d)
+  self.gradInput:resizeAs(input)
   if self.p == math.huge then
     -- specialization for the inf case
-    self._gradInput:cmul(self.norm:view(n,1,1):expand(n,d,1),gradOutput)
+    self.gradInput:cmul(self.norm:expandAs(gradOutput),gradOutput)
     self.buffer:resizeAs(input):zero()
-    self.cross:resize(n,1)
-    self.cross:gather(input,2,self._indices)
+    self.cross:resizeAs(self.norm)
+    self.cross:gather(input,dim,self._indices)
     self.cross:cdiv(self.norm)
-    self.buffer:scatter(2,self._indices,self.cross)
+    self.buffer:scatter(dim,self._indices,self.cross)
   else
-    self._gradInput:cmul(self.normp:view(n,1):expand(n,d), gradOutput)
+    self.gradInput:cmul(self.normp:expandAs(gradOutput), gradOutput)
     -- small optimizations for different p
     -- buffer = input*|input|^(p-2)
     if self.p % 2 ~= 0 then
@@ -91,17 +87,17 @@ function Normalize:updateGradInput(input, gradOutput)
     end
   end
   -- compute cross term in two steps
-  self.cross:resize(n,1)
+  self.cross:resizeAs(self.norm)
 
   -- instead of having a huge temporary matrix (b1*b2),
   -- do the computations as b1*(b2*gradOutput). This avoids redundant
   -- computation and also a huge buffer of size n*d^2
   self.buffer2 = self.buffer2 or input.new() -- nxd
   self.buffer2:cmul(input, gradOutput)
-  self.cross:sum(self.buffer2, 2)
+  self.cross:sum(self.buffer2, dim)
 
   self.buffer:cmul(self.cross:expandAs(self.buffer))
-  self._gradInput:add(-1, self.buffer)
+  self.gradInput:add(-1, self.buffer)
 
   -- reuse cross buffer for normalization
   if self.p == math.huge then
@@ -109,9 +105,8 @@ function Normalize:updateGradInput(input, gradOutput)
   else
     self.cross:cmul(self.normp,self.norm)
   end
-  self._gradInput:cdiv(self.cross:expand(n,d))
+  self.gradInput:cdiv(self.cross:expandAs(gradOutput))
 
-  self.gradInput:view(self._gradInput, input_size)
   return self.gradInput
 end
 
@@ -119,11 +114,11 @@ function Normalize:__tostring__()
   local s
   -- different prints if the norm is integer
   if self.p % 1 == 0 then
-    s = '%s(%d)'
+    s = '%s(%d,%d)'
   else
-    s = '%s(%f)'
+    s = '%s(%f,%d)'
   end
-  return string.format(s,torch.type(self),self.p)
+  return string.format(s,torch.type(self),self.p, self.dim)
 end
 
 function Normalize:type(type, tensorCache)
@@ -153,3 +148,11 @@ function Normalize:clearState()
   })
   return parent.clearState(self)
 end
+
+function Normalize:read(file, version)
+  parent.read(self, file)
+  if version < 2 then
+    -- version 1 only supported 1D tensors
+    self.dim = -1
+  end
+end
diff --git a/doc/simple.md b/doc/simple.md
index e3b13dbc0..3d05ab1aa 100644
--- a/doc/simple.md
+++ b/doc/simple.md
@@ -1149,11 +1149,11 @@ print(B) -- output
 ## Normalize ##
 
 ```lua
-module = nn.Normalize(p, [eps])
+module = nn.Normalize(p, [dim], [eps])
 ```
-Normalizes the input Tensor to have unit `L_p` norm. The smoothing parameter `eps` prevents division by zero when the input contains all zero elements (default = `1e-10`).
+Normalizes the input Tensor to have unit `L_p` norm over dimension `dim` (by default -1, i.e., the last dimension). The smoothing parameter `eps` prevents division by zero when the input contains all zero elements (default = `1e-10`).
 
-Input can be 1D or 2D (in which case it's considered as in batch mode)
+The `dim` parameter can take both positive and negative values; negative values are counted from the last dimension. Negative dimensions are especially useful if one wants the module to work the same with and without a batch dimension.
 
 ```lua
 A = torch.randn(3, 5)
@@ -1163,6 +1163,14 @@
 B = m:forward(A) -- B is also 3 x 5
 print(torch.norm(B, 2, 2)) -- norms is [1, 1, 1]
 ```
+Here is an example of normalizing the feature maps of a batch of images:
+```lua
+I = torch.randn(2, 3, 2, 2)
+m = nn.Normalize(1, -3) -- the third dimension from the end
+B = m:forward(I)
+print(torch.norm(B, 1, 2)) -- norms over the feature map dimension are all 1
+```
+
 `Normalize` has a specialized implementation for the `inf` norm, which corresponds to the maximum norm.
 ```lua
 A = torch.randn(3,5)
diff --git a/test.lua b/test.lua
index e67a39a84..9d32cef02 100644
--- a/test.lua
+++ b/test.lua
@@ -624,6 +624,21 @@ function nntest.Normalize()
      mytester:assertlt(err, precision, 'error norm '..p..' on state ')
   end
 
+   -- test on different dimensions
+   for _,p in pairs({1,2,3,4,torch.uniform()*math.random(1,10),math.huge}) do
+      local ini = math.random(3,5)
+      local inj = math.random(3,5)
+      local ink = math.random(3,5)
+      local inl = math.random(3,5)
+      local dim = math.random(1,4)
+      local input = torch.Tensor(inl, ink, inj, ini):zero()
+
+      local module = nn.Normalize(p, dim)
+
+      local err = jac.testJacobian(module, input, -2, 2)
+      mytester:assertlt(err, precision, 'error norm '..p..' on state ')
+   end
+
    -- test IO correctness
    local ini = math.random(3,5)
    local inj = math.random(3,5)
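
Below is a minimal usage sketch (not part of the patch; it assumes the updated `nn.Normalize` with its default `dim = -1`) illustrating the batch-invariance point made in the documentation change above: the same module normalizes over the last dimension whether the input is a single vector or a batch of vectors.

```lua
require 'nn'

-- with the default dim = -1, normalization always happens over the last dimension
local m = nn.Normalize(2)

local v = torch.randn(5)        -- a single sample
local batch = torch.randn(3, 5) -- a batch of 3 samples

print(torch.norm(m:forward(v)))           -- ~1
print(torch.norm(m:forward(batch), 2, 2)) -- 3x1 tensor of values ~1
```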