From 45dfa0fb00cee49166676d6d6d8a932ce7b1dd52 Mon Sep 17 00:00:00 2001 From: Subhajit Sahu Date: Sat, 22 Jun 2024 08:52:29 +0530 Subject: [PATCH] Initial commit --- .gitignore | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 +++++++ README.md | 15 +++++ main.py | 43 ++++++++++++++ main.sh | 93 ++++++++++++++++++++++++++++++ process.js | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 493 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 main.py create mode 100755 main.sh create mode 100644 process.js diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..82f9275 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..15f6791 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Subhajit Sahu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..51b2a34 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +Test cuGraph's implementation of Leiden algorithm for community detection. + 
+ + +## References + +- [QST: Benchmarking cugraph.leiden()](https://github.com/rapidsai/cugraph/issues/4488) +- [cuGraph - RAPIDS Graph Analytics Library](https://github.com/rapidsai/cugraph) +- [cuGraph - Louvain and Leiden Community Detection](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Louvain.ipynb) +- [User Guide — rmm 24.06.00 documentation](https://docs.rapids.ai/api/rmm/stable/guide/) +- [cudf.DataFrame.to_csv — cudf 24.06.00 documentation](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.dataframe.to_csv/) +- [Conda: Creating a virtual environment](https://stackoverflow.com/a/48178776/1413259) +- [Find and replace text within a file using commands](https://askubuntu.com/a/20416/432070) +- [How to Flush the Output of the Python Print Function | Real Python](https://realpython.com/python-flush-print-output/) diff --git a/main.py b/main.py new file mode 100644 index 0000000..425f41c --- /dev/null +++ b/main.py @@ -0,0 +1,43 @@ +import os +import sys +import time +import rmm +import cudf +import cugraph + + +# Initialize RMM pool +mode = sys.argv[3] +print("Initializing RMM pool...", flush=True) +if mode == "managed": + pool = rmm.mr.PoolMemoryResource(rmm.mr.ManagedMemoryResource(), initial_pool_size=2**36) +else: + pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**36) +rmm.mr.set_current_device_resource(pool) + +# Read graph from file +file = os.path.expanduser(sys.argv[1]) +print("Reading graph from file: {}".format(file), flush=True) +gdf = cudf.read_csv(file, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32']) +print("Symmetrizing graph...", flush=True) +gdf = cugraph.symmetrize_df(gdf, 'src', 'dst', None, False, False) +gdf["data"] = 1.0 # Add edge weights +G = cugraph.Graph() +print("Creating cuGraph graph...", flush=True) +G.from_cudf_edgelist(gdf, source='src', destination='dst', edge_attr='data', renumber=True) + +# Run Leiden +print("Running Leiden 
(first)...", flush=True) +parts, mod = cugraph.leiden(G) +for i in range(4): + print("Running Leiden...", flush=True) + t0 = time.time() + parts, mod = cugraph.leiden(G) + t1 = time.time() + print("Leiden modularity: {:.6f}".format(mod), flush=True) + print("Leiden took: {:.6f} s".format(t1-t0), flush=True) + +# Save communities to file +comm = os.path.expanduser(sys.argv[2]) +print("Saving communities to file: {}".format(comm), flush=True) +parts.to_csv(comm, sep=' ', header=False, index=False, chunksize=1e6) diff --git a/main.sh b/main.sh new file mode 100755 index 0000000..d62848e --- /dev/null +++ b/main.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +src="test-cugraph-louvain" +out="$HOME/Logs/$src$1.log" +ulimit -s unlimited +printf "" > "$out" + +# Configuration +: "${CUDA_VERSION:=11.4}" + +# Download tool to count disconnected communities +app="graph-count-disconnected-communities" +rm -rf $app +git clone https://github.com/ionicf/$app && echo "" +cd $app + +# Fixed config +: "${KEY_TYPE:=uint32_t}" +: "${EDGE_VALUE_TYPE:=float}" +: "${MAX_THREADS:=64}" +# Define macros (dont forget to add here) +DEFINES=("" +"-DKEY_TYPE=$KEY_TYPE" +"-DEDGE_VALUE_TYPE=$EDGE_VALUE_TYPE" +"-DMAX_THREADS=$MAX_THREADS" +) + +# Build tool +g++ ${DEFINES[*]} -std=c++17 -O3 -fopenmp main.cxx +mv a.out ../count.out +cd .. 
+ +# Download program +if [[ "$DOWNLOAD" != "0" ]]; then + rm -rf $src + git clone https://github.com/puzzlef/$src && echo "" + cd $src +fi + +# Install cuGraph +if [[ "$INSTALL" == "1" ]]; then + conda create --name cugraph-env -y + conda activate cugraph-env + conda install -c rapidsai -c conda-forge -c nvidia cugraph cuda-version=$CUDA_VERSION -y +fi + +# Run program +runCugraph() { + # $1: input file name + # $2: is graph weighted (0/1) + # $3: is graph symmetric (0/1) + # $4: memory manager (default/managed) + opt2="" + opt3="" + if [[ "$2" == "1" ]]; then opt2="-w"; fi + if [[ "$3" == "1" ]]; then opt3="-s"; fi + # Convert the graph in MTX format to CSV (space-separated) + stdbuf --output=L printf "Converting $1 to $1.csv ...\n" | tee -a "$out" + lines="$(node process.js header-lines "$1")" + echo "src dst" > "$1.csv" + tail -n +$((lines+1)) "$1" >> "$1.csv" + # Run cuGraph leiden, and save the obtained communities + stdbuf --output=L printf "Running cuGraph Leiden on $1.csv ...\n" | tee -a "$out" + stdbuf --output=L python3 main.py "$1.csv" "$1.clstr" "$3" 2>&1 | tee -a "$out" + # Count disconnected communities + stdbuf --output=L printf "Counting disconnected communities ...\n" | tee -a "$out" + stdbuf --output=L ../count.out -i "$1" -m "$1.clstr" -k -r 0 "$opt2" "$opt3" 2>&1 | tee -a "$out" + stdbuf --output=L printf "\n\n" | tee -a "$out" + # Clean up + rm -rf "$1.csv" + rm -rf "$1.clstr" +} + +runAll() { + # runCugraph "$HOME/Data/web-Stanford.mtx" 0 0 default + runCugraph "$HOME/Data/indochina-2004.mtx" 0 0 default + runCugraph "$HOME/Data/uk-2002.mtx" 0 0 default + # runCugraph "$HOME/Data/arabic-2005.mtx" 0 0 managed + # runCugraph "$HOME/Data/uk-2005.mtx" 0 0 managed + # runCugraph "$HOME/Data/webbase-2001.mtx" 0 0 managed + # runCugraph "$HOME/Data/it-2004.mtx" 0 0 managed + # runCugraph "$HOME/Data/sk-2005.mtx" 0 0 managed + runCugraph "$HOME/Data/com-LiveJournal.mtx" 0 1 default + runCugraph "$HOME/Data/com-Orkut.mtx" 0 1 default + runCugraph 
"$HOME/Data/asia_osm.mtx" 0 1 default + runCugraph "$HOME/Data/europe_osm.mtx" 0 1 default + runCugraph "$HOME/Data/kmer_A2a.mtx" 0 1 default + runCugraph "$HOME/Data/kmer_V1r.mtx" 0 1 default +} + +runAll + +# Signal completion +curl -X POST "https://maker.ifttt.com/trigger/puzzlef/with/key/${IFTTT_KEY}?value1=$src$1" diff --git a/process.js b/process.js new file mode 100644 index 0000000..c71e2e8 --- /dev/null +++ b/process.js @@ -0,0 +1,159 @@ +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const readline = require('readline'); + +const RGRAPH = /^Running cuGraph Leiden on \s*.*\/(.*?)\.mtx\.csv/m; +const RORDER = /^order: (.+?) size: (.+) \[directed\] \{\}/m; +const RMODUL = /^Leiden modularity: (.+)/m; +const RTTIME = /^Leiden took: (.+?) s/m; +const RNCOMS = /^Number of communities: (.+)/m; +const RDCOMS = /^Number of disconnected communities: (.+)/m; + + + + +// *-FILE +// ------ + +function readFile(pth) { + var d = fs.readFileSync(pth, 'utf8'); + return d.replace(/\r?\n/g, '\n'); +} + +function writeFile(pth, d) { + d = d.replace(/\r?\n/g, os.EOL); + fs.writeFileSync(pth, d); +} + + + + +// *-CSV +// ----- + +function writeCsv(pth, rows) { + var cols = Object.keys(rows[0]); + var a = cols.join()+'\n'; + for (var r of rows) + a += [...Object.values(r)].map(v => `"${v}"`).join()+'\n'; + writeFile(pth, a); +} + + + + +// *-LOG +// ----- + +function readLogLine(ln, data, state) { + state = state || {}; + ln = ln.replace(/^\d+-\d+-\d+ \d+:\d+:\d+\s+/, ''); + if (RGRAPH.test(ln)) { + var [, graph] = RGRAPH.exec(ln); + if (!data.has(graph)) data.set(graph, []); + state.graph = graph; + state.order = 0; + state.size = 0; + state.time = 0; + state.modularity = 0; + state.communities = 0; + state.disconnected_communities = 0; + state.rows = 0; + } + else if (RORDER.test(ln)) { + var [, order, size] = RORDER.exec(ln); + state.order = parseFloat(order); + state.size = parseFloat(size); + } + else if (RMODUL.test(ln)) { + var [, 
modularity] = RMODUL.exec(ln); + state.modularity += parseFloat(modularity); + } + else if (RTTIME.test(ln)) { + var [, time] = RTTIME.exec(ln); + state.time += 1000 * parseFloat(time); + ++state.rows; + } + else if (RNCOMS.test(ln)) { + var [, communities] = RNCOMS.exec(ln); + state.communities = parseFloat(communities); + } + else if (RDCOMS.test(ln)) { + var [, disconnected_communities] = RDCOMS.exec(ln); + state.disconnected_communities = parseFloat(disconnected_communities); + state.modularity /= state.rows; + state.time /= state.rows; + data.get(state.graph).push(Object.assign({}, state)); + } + return state; +} + +function readLog(pth) { + var text = readFile(pth); + var lines = text.split('\n'); + var data = new Map(); + var state = null; + for (var ln of lines) + state = readLogLine(ln, data, state); + return data; +} + + + + +// PROCESS-* +// --------- + +function processCsv(data) { + var a = []; + for (var rows of data.values()) + a.push(...rows); + return a; +} + + + + +// HEADER LINES +// ------------ + +// Count the number of header lines in a MatrixMarket file. +async function headerLines(pth) { + var a = 0; + var rl = readline.createInterface({input: fs.createReadStream(pth)}); + for await (var line of rl) { + if (line[0]==='%') ++a; + else break; + } + return a+1; // +1 for the row/column count line +} + + + + +// MAIN +// ---- + +async function main(cmd, inp, out) { + var data = cmd==='csv'? readLog(inp) : ''; + if (out && path.extname(out)==='') cmd += '-dir'; + switch (cmd) { + case 'csv': + var rows = processCsv(data); + writeCsv(out, rows); + break; + case 'csv-dir': + for (var [graph, rows] of data) + writeCsv(path.join(out, graph+'.csv'), rows); + break; + case 'header-lines': + var lines = await headerLines(inp); + console.log(lines); + break; + default: + console.error(`error: "${cmd}"?`); + break; + } +} +main(...process.argv.slice(2));