From ace8c5c8a6f06aaca68ed6928533fc5bbf0567f1 Mon Sep 17 00:00:00 2001
From: Ali Shariat <shariat@gmail.com>
Date: Sat, 7 Mar 2020 22:33:50 -0800
Subject: [PATCH] reduce memory access in linear solve loop

By introducing a temporary variable for `x[i]` we use the fact that `i`
is never equal to `Li[j]`. The compiler does not have this information.

The generated assembly for the loop changes from
```
        movsx   rdi, DWORD PTR [rdx+rax*4]
        vmovss  xmm0, DWORD PTR [rcx+rax*4]
        add     rax, 1
        lea     rdi, [r8+rdi*4]
        vmovss  xmm1, DWORD PTR [rdi]
        vfnmadd132ss    xmm0, xmm1, DWORD PTR [r9]
        vmovss  DWORD PTR [rdi], xmm0
```

to

```
        movsx   rdi, DWORD PTR [rdx+rax*4]
        vmovss  xmm0, DWORD PTR [rcx+rax*4]
        add     rax, 1
        lea     rdi, [r8+rdi*4]
        vfnmadd213ss    xmm0, xmm1, DWORD PTR [rdi]
        vmovss  DWORD PTR [rdi], xmm0
```

Notice that the compiler drops the second `vmovss` (the load into `xmm1`).
---
 src/qdldl.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/qdldl.c b/src/qdldl.c
index 31fd93a..6fb17d4 100644
--- a/src/qdldl.c
+++ b/src/qdldl.c
@@ -239,11 +239,12 @@ void QDLDL_Lsolve(const QDLDL_int    n,
                   const QDLDL_float* Lx,
                   QDLDL_float* x){
 
-QDLDL_int i,j;
+  QDLDL_int i,j;
   for(i = 0; i < n; i++){
-      for(j = Lp[i]; j < Lp[i+1]; j++){
-          x[Li[j]] -= Lx[j]*x[i];
-      }
+    QDLDL_float val = x[i];
+    for(j = Lp[i]; j < Lp[i+1]; j++){
+      x[Li[j]] -= Lx[j]*val;
+    }
   }
 }
 
@@ -254,11 +255,13 @@ void QDLDL_Ltsolve(const QDLDL_int    n,
                    const QDLDL_float* Lx,
                    QDLDL_float* x){
 
-QDLDL_int i,j;
+  QDLDL_int i,j;
   for(i = n-1; i>=0; i--){
-      for(j = Lp[i]; j < Lp[i+1]; j++){
-          x[i] -= Lx[j]*x[Li[j]];
-      }
+    QDLDL_float val = x[i];
+    for(j = Lp[i]; j < Lp[i+1]; j++){
+      val -= Lx[j]*x[Li[j]];
+    }
+    x[i] = val;
   }
 }
 
@@ -270,10 +273,9 @@ void QDLDL_solve(const QDLDL_int       n,
                     const QDLDL_float* Dinv,
                     QDLDL_float* x){
 
-QDLDL_int i;
-
-QDLDL_Lsolve(n,Lp,Li,Lx,x);
-for(i = 0; i < n; i++) x[i] *= Dinv[i];
-QDLDL_Ltsolve(n,Lp,Li,Lx,x);
+  QDLDL_int i;
 
+  QDLDL_Lsolve(n,Lp,Li,Lx,x);
+  for(i = 0; i < n; i++) x[i] *= Dinv[i];
+  QDLDL_Ltsolve(n,Lp,Li,Lx,x);
 }