Merge pull request #89 from lanl/mlmd-bugfix
Mlmd bugfix
gshipman authored Mar 18, 2024
2 parents b3a3703 + cac2dbc commit dfc95a5
Showing 2 changed files with 26 additions and 26 deletions.
48 changes: 26 additions & 22 deletions doc/sphinx/05_mlmd/mlmd.rst
@@ -34,7 +34,7 @@ Building
========


- Building the Lammps Python interface environment is somewhat challenging. Below is an outline of the process used to get the environment working on Chicoma. Also, in the benchmarks/kokkos_lammps_hippynn/benchmark-env.yml file is a list of the packages installed in the test environment. Most of these will not affect performance, but the pytorch (1.11.0) and cuda (11.2) versions should be kept the same.
+ Building the Lammps Python interface environment is somewhat challenging. Below is an outline of the process used to get the environment working on Chicoma. Also, the benchmarks/kokkos_lammps_hippynn/benchmark-env.yml file lists the packages installed in the test environment. Most of these will not affect performance, but the pytorch (2.2.0) and cuda (11.2) versions should be kept the same.
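
A quick way to check that an environment actually picked up the intended pytorch and cuda versions (an editor's sketch, not part of the original instructions; assumes the conda environment built below is active)::

    python -c "import torch; print(torch.__version__, torch.version.cuda)"
    python -c "import torch; print('cuda available:', torch.cuda.is_available())"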

Building on Chicoma
-------------------
@@ -55,31 +55,33 @@ Building on Chicoma
virtenvpath=<Set Path>
conda create --prefix=${virtenvpath} python=3.10
source activate ${virtenvpath}
- conda install pytorch-gpu cudatoolkit=11.6 cupy -c pytorch -c nvidia
+ conda install pytorch-gpu=1.11 cudatoolkit=11.6 cupy -c pytorch -c nvidia
conda install matplotlib h5py tqdm python-graphviz cython numba scipy ase -c conda-forge
#Install HIPPYNN
git clone [email protected]:lanl/hippynn.git
- cd hippynn
- git fetch
- git checkout f8ed7390beb8261c8eec75580c683f5121226b30
+ pushd hippynn
+ git fetch --all --tags
+ git checkout tags/hippynn-0.0.3 -b benchmark
pip install --no-deps -e .
+ popd
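# Optional sanity check (editor's addition, not part of the original recipe):
# confirm that the tagged hippynn checkout imports from this environment
# before moving on to the Lammps build.
python -c "import hippynn; print(hippynn.__file__)"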
#Install Lammps:
git clone [email protected]:bnebgen-LANL/lammps-kokkos-mliap.git
pushd lammps-kokkos-mliap
git checkout lammps-kokkos-mliap
mkdir build
- cd build
+ pushd build
export CMAKE_PREFIX_PATH="${FFTW_ROOT}"
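# Note (assumption): ${FFTW_ROOT} is normally exported by the site's FFTW
# module on Cray systems such as Chicoma; if it is empty, load that module
# first (the exact module name, e.g. cray-fftw, varies by system).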
- cmake ../cmake
- -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+ cmake ../cmake \
+ -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DLAMMPS_EXCEPTIONS=ON \
-DBUILD_SHARED_LIBS=ON \
-DBUILD_MPI=ON \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ARCH_ZEN2=ON \
-DKokkos_ARCH_AMPERE80=ON \
-DPKG_KOKKOS=ON \
-DCMAKE_CXX_STANDARD=17 \
-DPKG_MANYBODY=ON \
@@ -96,6 +98,8 @@ Building on Chicoma
-DMLIAP_ENABLE_PYTHON=on
make -j 12
make install-python
+ popd
+ popd
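
After ``make install-python`` completes, a minimal smoke test of the build (an editor's sketch, assuming the layout above; the grep patterns are the standard LAMMPS package names) is::

    ./lmp -h | grep -i -E "KOKKOS|ML-IAP"    # run from inside the build directory
    python -c "import lammps; l = lammps.lammps(); print(l.version()); l.close()"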
.. Building on nv-devkit
.. -------------------------
@@ -219,24 +223,24 @@ the model are captured in ``model_results.txt``. An example is shown here::

train valid test
-----------------------------------------------------
- EpA-RMSE : 0.46335 0.49286 0.45089
- EpA-MAE : 0.36372 0.4036 0.36639
- EpA-RSQ : 0.99893 0.99888 0.99884
- ForceRMSE: 21.255 21.74 20.967
- ForceMAE : 16.759 17.145 16.591
- ForceRsq : 0.9992 0.99916 0.99922
- T-Hier : 0.00086736 0.00089796 0.00087634
- L2Reg : 193.15 193.15 193.15
- Loss-Err : 0.046285 0.04785 0.045731
- Loss-Reg : 0.0010605 0.0010911 0.0010695
- Loss : 0.047346 0.048941 0.0468
+ EpA-RMSE : 0.63311 0.67692 0.65307
+ EpA-MAE : 0.49966 0.56358 0.51061
+ EpA-RSQ : 0.998 0.99789 0.99756
+ ForceRMSE: 31.36 32.088 30.849
+ ForceMAE : 24.665 25.111 24.314
+ ForceRsq : 0.99825 0.99817 0.99831
+ T-Hier : 0.00084411 0.0008716 0.00085288
+ L2Reg : 98.231 98.231 98.231
+ Loss-Err : 0.067352 0.069605 0.0668
+ Loss-Reg : 0.00094234 0.00096983 0.00095111
+ Loss : 0.068294 0.070575 0.067751
-----------------------------------------------------

- The numbers will vary from run to run due random seeds and the non-deterministic nature of multi-threaded / data parallel execution. However you should find that the Energy Per Atom mean absolute error "EpA-MAE" for test is below 0.40 (meV/atom). The test Force MAE "Force MAE" should be below 18 (meV/Angstrom).
+ The numbers will vary from run to run due to random seeds and the non-deterministic nature of multi-threaded / data parallel execution. However, you should find that the energy-per-atom mean absolute error "EpA-MAE" for test is below 0.7 (meV/atom). The test force MAE "ForceMAE" should be below 25 (meV/Angstrom).
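
A hypothetical one-liner to pull those two test-column values out of the results file (the file path is the one given below; the last field on each row is the test column)::

    awk '/EpA-MAE|ForceMAE/ {print $1, $NF}' model_files/model_results.txt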

The training script will also output the initial box file ``ag_box.data`` as well as a file used to run the resulting potential with LAMMPS, ``hippynn_lammps_model.pt``. Several other files for the training run are put in a directory, ``model_files``.

The "Figure of Merit" for the training task is printed near the end of the ``model_files/model_results.txt`` and is lead with the line "FOM Average Epoch time:" This is the average time to compute an epoch over the training proceedure
The "Figure of Merit" for the training task is printed near the end of the ``model_files/model_results.txt`` and is lead with the line "FOM Average Epoch time:" This is the average time to compute an epoch over the training proceedure.

Following this process, benchmarks can be run.

4 changes: 0 additions & 4 deletions kokkos_lammps_hippynn/Run_Throughput_GPU.bash
@@ -8,10 +8,6 @@
#On 1 gpu, run systems of different sizes
export lmpexec="pathto/lammps-kokkos-mliap/build/lmp"

- #source exports.bash #configures environment and sets ${lmpexec}
- export HIPPYNN_USE_CUSTOM_KERNELS="pytorch"
- export HIPPYNN_WARN_LOW_DISTANCES="False"

source exports.bash #configures environment and sets ${lmpexec}
export HIPPYNN_USE_CUSTOM_KERNELS="pytorch"
export HIPPYNN_WARN_LOW_DISTANCES="False"
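
To run the throughput benchmark after the build, point ``lmpexec`` at the binary built above (editing the placeholder path in the script or in ``exports.bash``), then launch it from the benchmark directory; a sketch, assuming the repository layout shown::

    cd kokkos_lammps_hippynn
    bash Run_Throughput_GPU.bash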
