diff --git a/src/Matrix.cpp b/src/Matrix.cpp
index df9a470a1..947a20940 100644
--- a/src/Matrix.cpp
+++ b/src/Matrix.cpp
@@ -28,10 +28,15 @@ Matrix::Matrix(omp_lock_t* cell_locks, int num_x, int num_y, int num_groups) {
     _LIL.push_back(std::map<int, FP_PRECISION>());
 
   _A = NULL;
+  _AD = NULL;
   _IA = NULL;
+  _IAD = NULL;
   _JA = NULL;
+  _JAD = NULL;
   _DIAG = NULL;
   _modified = true;
+  _NNZ = 0;
+  _NNZD = 0;
 
   /* Set OpenMP locks for each Matrix cell */
   if (cell_locks == NULL)
@@ -51,12 +56,21 @@ Matrix::~Matrix() {
   if (_A != NULL)
     delete [] _A;
 
+  if (_AD != NULL)
+    delete [] _AD;
+
   if (_IA != NULL)
     delete [] _IA;
 
+  if (_IAD != NULL)
+    delete [] _IAD;
+
   if (_JA != NULL)
     delete [] _JA;
 
+  if (_JAD != NULL)
+    delete [] _JAD;
+
   if (_DIAG != NULL)
     delete [] _DIAG;
 
@@ -175,32 +189,54 @@ void Matrix::convertToCSR() {
 
   /* Get number of nonzero values */
   int NNZ = getNNZ();
+  int NNZD = getNNZD();
 
-  /* Deallocate memory for arrays if previously allocated */
-  if (_A != NULL)
-    delete [] _A;
+  if (NNZ != _NNZ || NNZD != _NNZD) {
 
-  if (_IA != NULL)
-    delete [] _IA;
+    /* Deallocate memory for arrays if previously allocated */
+    if (_A != NULL)
+      delete [] _A;
 
-  if (_JA != NULL)
-    delete [] _JA;
+    if (_AD != NULL)
+      delete [] _AD;
 
-  if (_DIAG != NULL)
-    delete [] _DIAG;
+    if (_IA != NULL)
+      delete [] _IA;
+
+    if (_IAD != NULL)
+      delete [] _IAD;
+
+    if (_JA != NULL)
+      delete [] _JA;
+
+    if (_JAD != NULL)
+      delete [] _JAD;
+
+    if (_DIAG != NULL)
+      delete [] _DIAG;
+
+    /* Allocate memory for arrays */
+    _A = new FP_PRECISION[NNZ];
+    _AD = new FP_PRECISION[NNZD];
+    _IA = new int[_num_rows+1];
+    _IAD = new int[_num_rows+1];
+    _JA = new int[NNZ];
+    _JAD = new int[NNZD];
+    _DIAG = new FP_PRECISION[_num_rows];
+
+    _NNZ = NNZ;
+    _NNZD = NNZD;
+  }
 
-  /* Allocate memory for arrays */
-  _A = new FP_PRECISION[NNZ];
-  _IA = new int[_num_rows+1];
-  _JA = new int[NNZ];
-  _DIAG = new FP_PRECISION[_num_rows];
   std::fill_n(_DIAG, _num_rows, 0.0);
 
   /* Form arrays */
   int j = 0;
+  int jd = 0;
   std::map<int, FP_PRECISION>::iterator iter;
   for (int row=0; row < _num_rows; row++) {
     _IA[row] = j;
+    _IAD[row] = jd;
     for (iter = _LIL[row].begin(); iter != _LIL[row].end(); ++iter) {
       if (iter->second != 0.0) {
         _JA[j] = iter->first;
@@ -208,6 +244,11 @@ void Matrix::convertToCSR() {
 
         if (row == iter->first)
           _DIAG[row] = iter->second;
+        else {
+          _JAD[jd] = iter->first;
+          _AD[jd] = iter->second;
+          jd++;
+        }
 
         j++;
       }
@@ -215,6 +256,7 @@ void Matrix::convertToCSR() {
   }
 
   _IA[_num_rows] = NNZ;
+  _IAD[_num_rows] = NNZD;
 
   /* Reset flat indicating the CSR objects have the same values as the
    * LIL object */
@@ -282,6 +324,21 @@ FP_PRECISION* Matrix::getA() {
 }
 
 
+/**
+ * @brief Get the off-diagonal values array (AD) of the CSR form of the matrix.
+ * @details The CSR arrays are rebuilt by convertToCSR() first whenever the
+ *          matrix is flagged as modified.
+ * @return A pointer to the array holding the non-zero off-diagonal values.
+ */
+FP_PRECISION* Matrix::getAD() {
+  /* Bring the CSR arrays in line with the LIL data before exposing them */
+  if (_modified) {
+    convertToCSR();
+  }
+  return _AD;
+}
+
+
 /**
  * @brief Get the IA component of the CSR form of the matrix object.
  * @return A pointer to the IA component of the CSR form matrix object.
@@ -295,6 +352,21 @@ int* Matrix::getIA() {
 }
 
 
+/**
+ * @brief Get the row offsets array (IAD) of the diagonal-free CSR form.
+ * @details Entry i is the index in the AD/JAD arrays where row i starts; the
+ *          CSR arrays are rebuilt first if the matrix is flagged as modified.
+ * @return A pointer to the row offsets array, which has one entry per row + 1.
+ */
+int* Matrix::getIAD() {
+  /* Bring the CSR arrays in line with the LIL data before exposing them */
+  if (_modified) {
+    convertToCSR();
+  }
+  return _IAD;
+}
+
+
 /**
  * @brief Get the JA component of the CSR form of the matrix object.
  * @return A pointer to the JA component of the CSR form matrix object.
@@ -308,6 +380,21 @@ int* Matrix::getJA() {
 }
 
 
+/**
+ * @brief Get the column indices array (JAD) of the diagonal-free CSR form.
+ * @details Entry i is the column of the i-th value in the AD array; the CSR
+ *          arrays are rebuilt first if the matrix is flagged as modified.
+ * @return A pointer to the column indices of the off-diagonal values.
+ */
+int* Matrix::getJAD() {
+  /* Bring the CSR arrays in line with the LIL data before exposing them */
+  if (_modified) {
+    convertToCSR();
+  }
+  return _JAD;
+}
+
+
 /**
  * @brief Get the diagonal component of the matrix object.
  * @return A pointer to the diagonal component of the matrix object.
@@ -376,6 +463,26 @@ int Matrix::getNNZ() {
 }
 
 
+/**
+ * @brief Count the non-zero values in the matrix that lie off the diagonal.
+ * @details An entry is counted only if it is both non-zero and off-diagonal.
+ * @return The number of non-zero values in the matrix, excluding the diagonal.
+ */
+int Matrix::getNNZD() {
+  int NNZD = 0;
+  std::map<int, FP_PRECISION>::iterator iter;
+  for (int row=0; row < _num_rows; row++) {
+    for (iter = _LIL[row].begin(); iter != _LIL[row].end(); ++iter) {
+      /* Same test convertToCSR() uses when filling the AD and JAD arrays */
+      if (iter->second != 0.0 && iter->first != row)
+        NNZD++;
+    }
+  }
+
+  return NNZD;
+}
+
+
 /**
  * @brief Set the number of cells in the x dimension.
  * @param num_x The number of cells in the x dimension.
diff --git a/src/Matrix.h b/src/Matrix.h
index c79b5d521..578ca46ce 100644
--- a/src/Matrix.h
+++ b/src/Matrix.h
@@ -34,8 +34,11 @@ class Matrix {
 
   /** The CSR matrix variables */
   FP_PRECISION* _A;
+  FP_PRECISION* _AD;
   int* _IA;
   int* _JA;
+  int* _IAD;
+  int* _JAD;
   FP_PRECISION* _DIAG;
 
   bool _modified;
@@ -43,6 +46,8 @@ class Matrix {
   int _num_y;
   int _num_groups;
   int _num_rows;
+  int _NNZ;
+  int _NNZD;
 
   /** OpenMP mutual exclusion locks for atomic cell updates */
   omp_lock_t* _cell_locks;
@@ -67,14 +72,18 @@ class Matrix {
   FP_PRECISION getValue(int cell_from, int group_from, int cell_to,
                         int group_to);
   FP_PRECISION* getA();
+  FP_PRECISION* getAD();
   int* getIA();
+  int* getIAD();
   int* getJA();
+  int* getJAD();
   FP_PRECISION* getDiag();
   int getNumX();
   int getNumY();
   int getNumGroups();
   int getNumRows();
   int getNNZ();
+  int getNNZD();
   omp_lock_t* getCellLocks();
 
   /* Setter functions */
diff --git a/src/linalg.cpp b/src/linalg.cpp
index 45d11db6d..783c6105a 100644
--- a/src/linalg.cpp
+++ b/src/linalg.cpp
@@ -138,15 +138,16 @@ void linearSolve(Matrix* A, Matrix* M, Vector* X, Vector* B, FP_PRECISION tol,
   int num_rows = X->getNumRows();
   Vector X_old(cell_locks, num_x, num_y, num_groups);
   FP_PRECISION* x_old = X_old.getArray();
-  int* IA = A->getIA();
-  int* JA = A->getJA();
+  int* IAD = A->getIAD();
+  int* JAD = A->getJAD();
   FP_PRECISION* DIAG = A->getDiag();
-  FP_PRECISION* a = A->getA();
+  FP_PRECISION* ad = A->getAD();
   FP_PRECISION* x = X->getArray();
   FP_PRECISION* b = B->getArray();
-  int row, col;
+  int row;
   Vector old_source(cell_locks, num_x, num_y, num_groups);
   Vector new_source(cell_locks, num_x, num_y, num_groups);
+  FP_PRECISION val;
 
   /* Compute initial source */
   matrixMultiplication(M, X, &old_source);
@@ -156,37 +157,23 @@ void linearSolve(Matrix* A, Matrix* M, Vector* X, Vector* B, FP_PRECISION tol,
     /* Pass new flux to old flux */
     X->copyTo(&X_old);
 
-    /* Iteration over red/black cells */
-    for (int color = 0; color < 2; color++) {
-      for (int quad = 0; quad < 4; quad++) {
-#pragma omp parallel for private(row, col)
-        for (int cy = (quad % 2) * num_y/2; cy < (quad % 2 + 1) * num_y/2;
-             cy++) {
-          for (int cx = (quad / 2) * num_x/2; cx < (quad / 2 + 1) * num_x/2;
-               cx++) {
-
-            /* check for correct color */
-            if (((cx % 2)+(cy % 2)) % 2 == color) {
-
-              for (int g = 0; g < num_groups; g++) {
-
-                row = (cy*num_x + cx)*num_groups + g;
-
-                /* Over-relax the x array */
-                x[row] = (1.0 - SOR_factor) * x[row];
+    /* Perform parallel red/black SOR iteration */
+    for (int color=0; color < 2; color++) {
+#pragma omp parallel for private(row)
+      for (int yc=0; yc < num_y; yc++) {
+        for (int xc=(yc + color) % 2; xc < num_x; xc+=2) {
+          for (int g=0; g < num_groups; g++) {
 
-                for (int i = IA[row]; i < IA[row+1]; i++) {
+            /* Get the current matrix row */
+            row = (yc * num_x + xc) * num_groups + g;
 
-                  /* Get the column index */
-                  col = JA[i];
+            /* Accumulate off diagonals multiplied by corresponding fluxes */
+            val = 0.0;
+            for (int i = IAD[row]; i < IAD[row+1]; i++)
+              val += ad[i] * x[JAD[i]];
 
-                  if (row == col)
-                    x[row] += SOR_factor * b[row] / DIAG[row];
-                  else
-                    x[row] -= SOR_factor * a[i] * x[col] / DIAG[row];
-                }
-              }
-            }
+            /* Update the flux for this row */
+            x[row] += SOR_factor * ((b[row] - val) / DIAG[row] - x[row]);
           }
         }
       }