diff --git a/src/prepare_a_matrix.cu b/src/prepare_a_matrix.cu
index e876ba4..fa808af 100644
--- a/src/prepare_a_matrix.cu
+++ b/src/prepare_a_matrix.cu
@@ -105,9 +105,11 @@ int main(int argc, const char *argv[]) {
   // Device memory for output packed data
   cu::DeviceMemory d_a_matrix_packed(bytes_a_matrix_packed);
   d_a_matrix_packed.zero(bytes_a_matrix_packed);
+  // Device memory for transposed data (allocated before querying free memory)
+  cu::DeviceMemory d_a_transposed(bytes_a_matrix_packed);
 
   // chunk of input data on device in case it doesn't fit in GPU memory
-  // get available GPU memory (after allocating packed output)
+  // get available GPU memory (after allocating other device memory)
   // use at most 80% of available memory
   size_t chunk_size = .8 * context.getFreeMemory();
   size_t pixels_per_chunk = chunk_size / (samples_padded);
@@ -176,7 +178,6 @@ int main(int argc, const char *argv[]) {
 
   // transpose
   std::cout << "Transpose" << std::endl;
-  cu::DeviceMemory d_a_transposed(bytes_a_matrix_packed);
   ccglib::transpose::Transpose transpose(1, pixels_padded, samples_padded,
                                          tile_sizes.x, tile_sizes.z, 1, device,
                                          stream);