From 45dfa0fb00cee49166676d6d6d8a932ce7b1dd52 Mon Sep 17 00:00:00 2001 From: Subhajit Sahu Date: Sat, 22 Jun 2024 08:52:29 +0530 Subject: [PATCH] Initial commit --- .gitignore | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++ LICENSE | 21 +++++++ README.md | 15 +++++ main.py | 43 ++++++++++++++ main.sh | 93 ++++++++++++++++++++++++++++++ process.js | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 493 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 main.py create mode 100755 main.sh create mode 100644 process.js diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..82f9275 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..15f6791 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Subhajit Sahu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..51b2a34 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +Test cuGraph's implementation of Leiden algorithm for community detection. + 
+ + +## References + +- [QST: Benchmarking cugraph.leiden()](https://github.com/rapidsai/cugraph/issues/4488) +- [cuGraph - RAPIDS Graph Analytics Library](https://github.com/rapidsai/cugraph) +- [cuGraph - Louvain and Leiden Community Detection](https://github.com/rapidsai/cugraph/blob/main/notebooks/algorithms/community/Louvain.ipynb) +- [User Guide — rmm 24.06.00 documentation](https://docs.rapids.ai/api/rmm/stable/guide/) +- [cudf.DataFrame.to_csv — cudf 24.06.00 documentation](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.dataframe.to_csv/) +- [Conda: Creating a virtual environment](https://stackoverflow.com/a/48178776/1413259) +- [Find and replace text within a file using commands](https://askubuntu.com/a/20416/432070) +- [How to Flush the Output of the Python Print Function | Real Python](https://realpython.com/python-flush-print-output/) diff --git a/main.py b/main.py new file mode 100644 index 0000000..425f41c --- /dev/null +++ b/main.py @@ -0,0 +1,43 @@ +import os +import sys +import time +import rmm +import cudf +import cugraph + + +# Initialize RMM pool +mode = sys.argv[3] +print("Initializing RMM pool...", flush=True) +if mode == "managed": + pool = rmm.mr.PoolMemoryResource(rmm.mr.ManagedMemoryResource(), initial_pool_size=2**36) +else: + pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**36) +rmm.mr.set_current_device_resource(pool) + +# Read graph from file +file = os.path.expanduser(sys.argv[1]) +print("Reading graph from file: {}".format(file), flush=True) +gdf = cudf.read_csv(file, delimiter=' ', names=['src', 'dst'], dtype=['int32', 'int32']) +print("Symmetrizing graph...", flush=True) +gdf = cugraph.symmetrize_df(gdf, 'src', 'dst', None, False, False) +gdf["data"] = 1.0 # Add edge weights +G = cugraph.Graph() +print("Creating cuGraph graph...", flush=True) +G.from_cudf_edgelist(gdf, source='src', destination='dst', edge_attr='data', renumber=True) + +# Run Leiden +print("Running Leiden 
(first)...", flush=True) +parts, mod = cugraph.leiden(G) +for i in range(4): + print("Running Leiden...", flush=True) + t0 = time.time() + parts, mod = cugraph.leiden(G) + t1 = time.time() + print("Leiden modularity: {:.6f}".format(mod), flush=True) + print("Leiden took: {:.6f} s".format(t1-t0), flush=True) + +# Save communities to file +comm = os.path.expanduser(sys.argv[2]) +print("Saving communities to file: {}".format(comm), flush=True) +parts.to_csv(comm, sep=' ', header=False, index=False, chunksize=1e6) diff --git a/main.sh b/main.sh new file mode 100755 index 0000000..d62848e --- /dev/null +++ b/main.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +src="test-cugraph-louvain" +out="$HOME/Logs/$src$1.log" +ulimit -s unlimited +printf "" > "$out" + +# Configuration +: "${CUDA_VERSION:=11.4}" + +# Download tool to count disconnected communities +app="graph-count-disconnected-communities" +rm -rf $app +git clone https://github.com/ionicf/$app && echo "" +cd $app + +# Fixed config +: "${KEY_TYPE:=uint32_t}" +: "${EDGE_VALUE_TYPE:=float}" +: "${MAX_THREADS:=64}" +# Define macros (dont forget to add here) +DEFINES=("" +"-DKEY_TYPE=$KEY_TYPE" +"-DEDGE_VALUE_TYPE=$EDGE_VALUE_TYPE" +"-DMAX_THREADS=$MAX_THREADS" +) + +# Build tool +g++ ${DEFINES[*]} -std=c++17 -O3 -fopenmp main.cxx +mv a.out ../count.out +cd .. 
+ +# Download program +if [[ "$DOWNLOAD" != "0" ]]; then + rm -rf $src + git clone https://github.com/puzzlef/$src && echo "" + cd $src +fi + +# Install cuGraph +if [[ "$INSTALL" == "1" ]]; then + conda create --name cugraph-env -y + conda activate cugraph-env + conda install -c rapidsai -c conda-forge -c nvidia cugraph cuda-version=$CUDA_VERSION -y +fi + +# Run program +runCugraph() { + # $1: input file name + # $2: is graph weighted (0/1) + # $3: is graph symmetric (0/1) + # $4: memory manager (default/managed) + opt2="" + opt3="" + if [[ "$2" == "1" ]]; then opt2="-w"; fi + if [[ "$3" == "1" ]]; then opt3="-s"; fi + # Convert the graph in MTX format to CSV (space-separated) + stdbuf --output=L printf "Converting $1 to $1.csv ...\n" | tee -a "$out" + lines="$(node process.js header-lines "$1")" + echo "src dst" > "$1.csv" + tail -n +$((lines+1)) "$1" >> "$1.csv" + # Run cuGraph leiden, and save the obtained communities + stdbuf --output=L printf "Running cuGraph Leiden on $1.csv ...\n" | tee -a "$out" + stdbuf --output=L python3 main.py "$1.csv" "$1.clstr" "$3" 2>&1 | tee -a "$out" + # Count disconnected communities + stdbuf --output=L printf "Counting disconnected communities ...\n" | tee -a "$out" + stdbuf --output=L ../count.out -i "$1" -m "$1.clstr" -k -r 0 "$opt2" "$opt3" 2>&1 | tee -a "$out" + stdbuf --output=L printf "\n\n" | tee -a "$out" + # Clean up + rm -rf "$1.csv" + rm -rf "$1.clstr" +} + +runAll() { + # runCugraph "$HOME/Data/web-Stanford.mtx" 0 0 default + runCugraph "$HOME/Data/indochina-2004.mtx" 0 0 default + runCugraph "$HOME/Data/uk-2002.mtx" 0 0 default + # runCugraph "$HOME/Data/arabic-2005.mtx" 0 0 managed + # runCugraph "$HOME/Data/uk-2005.mtx" 0 0 managed + # runCugraph "$HOME/Data/webbase-2001.mtx" 0 0 managed + # runCugraph "$HOME/Data/it-2004.mtx" 0 0 managed + # runCugraph "$HOME/Data/sk-2005.mtx" 0 0 managed + runCugraph "$HOME/Data/com-LiveJournal.mtx" 0 1 default + runCugraph "$HOME/Data/com-Orkut.mtx" 0 1 default + runCugraph 
"$HOME/Data/asia_osm.mtx" 0 1 default + runCugraph "$HOME/Data/europe_osm.mtx" 0 1 default + runCugraph "$HOME/Data/kmer_A2a.mtx" 0 1 default + runCugraph "$HOME/Data/kmer_V1r.mtx" 0 1 default +} + +runAll + +# Signal completion +curl -X POST "https://maker.ifttt.com/trigger/puzzlef/with/key/${IFTTT_KEY}?value1=$src$1" diff --git a/process.js b/process.js new file mode 100644 index 0000000..c71e2e8 --- /dev/null +++ b/process.js @@ -0,0 +1,159 @@ +const fs = require('fs'); +const os = require('os'); +const path = require('path'); +const readline = require('readline'); + +const RGRAPH = /^Running cuGraph Leiden on \s*.*\/(.*?)\.mtx\.csv/m; +const RORDER = /^order: (.+?) size: (.+) \[directed\] \{\}/m; +const RMODUL = /^Leiden modularity: (.+)/m; +const RTTIME = /^Leiden took: (.+?) s/m; +const RNCOMS = /^Number of communities: (.+)/m; +const RDCOMS = /^Number of disconnected communities: (.+)/m; + + + + +// *-FILE +// ------ + +function readFile(pth) { + var d = fs.readFileSync(pth, 'utf8'); + return d.replace(/\r?\n/g, '\n'); +} + +function writeFile(pth, d) { + d = d.replace(/\r?\n/g, os.EOL); + fs.writeFileSync(pth, d); +} + + + + +// *-CSV +// ----- + +function writeCsv(pth, rows) { + var cols = Object.keys(rows[0]); + var a = cols.join()+'\n'; + for (var r of rows) + a += [...Object.values(r)].map(v => `"${v}"`).join()+'\n'; + writeFile(pth, a); +} + + + + +// *-LOG +// ----- + +function readLogLine(ln, data, state) { + state = state || {}; + ln = ln.replace(/^\d+-\d+-\d+ \d+:\d+:\d+\s+/, ''); + if (RGRAPH.test(ln)) { + var [, graph] = RGRAPH.exec(ln); + if (!data.has(graph)) data.set(graph, []); + state.graph = graph; + state.order = 0; + state.size = 0; + state.time = 0; + state.modularity = 0; + state.communities = 0; + state.disconnected_communities = 0; + state.rows = 0; + } + else if (RORDER.test(ln)) { + var [, order, size] = RORDER.exec(ln); + state.order = parseFloat(order); + state.size = parseFloat(size); + } + else if (RMODUL.test(ln)) { + var [, 
modularity] = RMODUL.exec(ln); + state.modularity += parseFloat(modularity); + } + else if (RTTIME.test(ln)) { + var [, time] = RTTIME.exec(ln); + state.time += 1000 * parseFloat(time); + ++state.rows; + } + else if (RNCOMS.test(ln)) { + var [, communities] = RNCOMS.exec(ln); + state.communities = parseFloat(communities); + } + else if (RDCOMS.test(ln)) { + var [, disconnected_communities] = RDCOMS.exec(ln); + state.disconnected_communities = parseFloat(disconnected_communities); + state.modularity /= state.rows; + state.time /= state.rows; + data.get(state.graph).push(Object.assign({}, state)); + } + return state; +} + +function readLog(pth) { + var text = readFile(pth); + var lines = text.split('\n'); + var data = new Map(); + var state = null; + for (var ln of lines) + state = readLogLine(ln, data, state); + return data; +} + + + + +// PROCESS-* +// --------- + +function processCsv(data) { + var a = []; + for (var rows of data.values()) + a.push(...rows); + return a; +} + + + + +// HEADER LINES +// ------------ + +// Count the number of header lines in a MatrixMarket file. +async function headerLines(pth) { + var a = 0; + var rl = readline.createInterface({input: fs.createReadStream(pth)}); + for await (var line of rl) { + if (line[0]==='%') ++a; + else break; + } + return a+1; // +1 for the row/column count line +} + + + + +// MAIN +// ---- + +async function main(cmd, inp, out) { + var data = cmd==='csv'? readLog(inp) : ''; + if (out && path.extname(out)==='') cmd += '-dir'; + switch (cmd) { + case 'csv': + var rows = processCsv(data); + writeCsv(out, rows); + break; + case 'csv-dir': + for (var [graph, rows] of data) + writeCsv(path.join(out, graph+'.csv'), rows); + break; + case 'header-lines': + var lines = await headerLines(inp); + console.log(lines); + break; + default: + console.error(`error: "${cmd}"?`); + break; + } +} +main(...process.argv.slice(2));