From 93d9c1019dec37fc4990405409b000728b857d72 Mon Sep 17 00:00:00 2001 From: Robert Marklund Date: Mon, 13 Jan 2025 16:36:04 +0100 Subject: [PATCH 1/3] add buffersize argument add the argument to change buffer size of checksum calculation, may increase speed on some checksum algorithms and disk types. Signed-off-by: Robert Marklund --- Fileinfo.cc | 8 ++++---- Fileinfo.hh | 6 +++++- Rdutil.cc | 7 +++++-- Rdutil.hh | 6 +++--- rdfind.cc | 24 +++++++++++++++++++++--- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/Fileinfo.cc b/Fileinfo.cc index 5c44435..8946713 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -24,7 +24,8 @@ int Fileinfo::fillwithbytes(enum readtobuffermode filltype, - enum readtobuffermode lasttype) + enum readtobuffermode lasttype, + std::vector& buffer) { // Decide if we are going to read from file or not. @@ -80,11 +81,10 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, if (checksumtype != Checksum::checksumtypes::NOTSET) { Checksum chk(checksumtype); - char buffer[4096]; while (f1) { - f1.read(buffer, sizeof(buffer)); + f1.read(buffer.data(), static_cast(buffer.size())); // gcount is never negative, the cast is safe. - chk.update(static_cast(f1.gcount()), buffer); + chk.update(static_cast(f1.gcount()), buffer.data()); } // store the result of the checksum calculation in somebytes diff --git a/Fileinfo.hh b/Fileinfo.hh index 0e77c23..273db25 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -10,6 +10,7 @@ #include #include #include +#include // os specific headers #include //for off_t and others. @@ -135,10 +136,13 @@ public: * is shorter than the length of the bytes field. * @param filltype * @param lasttype + * @param buffer will be used as a scratch buffer - provided from the outside + * to avoid having to reallocate it for each file * @return zero on success */ int fillwithbytes(enum readtobuffermode filltype, - enum readtobuffermode lasttype); + enum readtobuffermode lasttype, + std::vector& buffer); /// get a pointer to the bytes read from the file const char* getbyteptr() const { return m_somebytes.data(); } diff --git a/Rdutil.cc b/Rdutil.cc index 60d5e81..7b9ddef 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -542,15 +542,18 @@ Rdutil::saveablespace(std::ostream& out) const int Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type, enum Fileinfo::readtobuffermode lasttype, - const long nsecsleep) + const long nsecsleep, + const std::size_t buffersize) { // first sort on inode (to read efficiently from the hard drive) sortOnDeviceAndInode(); const auto duration = std::chrono::nanoseconds{ nsecsleep }; + std::vector buffer(buffersize, '\0'); + for (auto& elem : m_list) { - elem.fillwithbytes(type, lasttype); + elem.fillwithbytes(type, lasttype, buffer); if (nsecsleep > 0) { std::this_thread::sleep_for(duration); } diff --git a/Rdutil.hh b/Rdutil.hh index 98e892f..6f5899a 100644 --- a/Rdutil.hh +++ b/Rdutil.hh @@ -88,9 +88,9 @@ public: // if there is trouble with too much disk reading, sleeping for nsecsleep // nanoseconds can be made between each file. int fillwithbytes(enum Fileinfo::readtobuffermode type, - enum Fileinfo::readtobuffermode lasttype = - Fileinfo::readtobuffermode::NOT_DEFINED, - long nsecsleep = 0); + enum Fileinfo::readtobuffermode lasttype, + long nsecsleep, + std::size_t buffersize); /// make symlinks of duplicates. std::size_t makesymlinks(bool dryrun) const; diff --git a/rdfind.cc b/rdfind.cc index bd91770..ef6c231 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -39,8 +39,9 @@ int current_cmdline_index = 0; static void usage() { + const auto indent = " "; std::cout - << "Usage: " << "rdfind [options] FILE ...\n" + << "Usage: rdfind [options] FILE ...\n" << '\n' << "Finds duplicate files recursively in the given FILEs (directories),\n" << "and takes appropriate action (by default, nothing).\n" @@ -64,6 +65,9 @@ usage() "device and inode\n" << " -checksum md5 |(sha1)| sha256 | sha512\n" << " checksum type\n" + << " -buffersize N\n" + << indent << "chunksize in bytes when calculating the checksum.\n" + << indent << "The default is 1 MiB, can be up to 128 MiB.\n" << " -deterministic (true)| false makes results independent of order\n" << " from listing the filesystem\n" << " -makesymlinks true |(false) replace duplicate files with " @@ -74,7 +78,7 @@ usage() << " -outputname name sets the results file name to \"name\" " "(default results.txt)\n" << " -deleteduplicates true |(false) delete duplicate files\n" - << " -sleep Xms sleep for X milliseconds between " + << " -sleep Xms sleep for X milliseconds between " "file reads.\n" << " Default is 0. Only a few values\n" << " are supported; 0,1-5,10,25,50,100\n" @@ -109,6 +113,7 @@ struct Options bool usesha256 = false; // use sha256 checksum to check for similarity bool usesha512 = false; // use sha512 checksum to check for similarity bool deterministic = true; // be independent of filesystem order + std::size_t buffersize = 1 << 20; // chunksize to use when reading files long nsecsleep = 0; // number of nanoseconds to sleep between each file read. std::string resultsfile = "results.txt"; // results file name. }; @@ -183,6 +188,19 @@ parseOptions(Parser& parser) << parser.get_parsed_string() << "\"\n"; std::exit(EXIT_FAILURE); } + } else if (parser.try_parse_string("-buffersize")) { + const long buffersize = std::stoll(parser.get_parsed_string()); + constexpr long max_buffersize = 128 << 20; + if (buffersize <= 0) { + std::cerr << "a negative or zero buffersize is not allowed\n"; + std::exit(EXIT_FAILURE); + } else if (buffersize > max_buffersize) { + std::cerr << "a maximum of " << (max_buffersize >> 20) + << " MiB buffersize is allowed, got " << (buffersize >> 20) + << " MiB\n"; + std::exit(EXIT_FAILURE); + } + o.buffersize = static_cast(buffersize); } else if (parser.try_parse_string("-sleep")) { const auto nextarg = std::string(parser.get_parsed_string()); if (nextarg == "1ms") { @@ -382,7 +400,7 @@ main(int narg, const char* argv[]) << it->second << ": " << std::flush; // read bytes (destroys the sorting, for disk reading efficiency) - gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep); + gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize); // remove non-duplicates std::cout << "removed " << gswd.removeUniqSizeAndBuffer() From 6a5fea5fae1e4754d8073af668a08450d2e52031 Mon Sep 17 00:00:00 2001 From: Robert Marklund Date: Wed, 22 Jan 2025 16:59:22 +0100 Subject: [PATCH 2/3] add buffersize to man file Signed-off-by: Robert Marklund --- rdfind.1 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rdfind.1 b/rdfind.1 index 7f60c3b..cbe54f8 100644 --- a/rdfind.1 +++ b/rdfind.1 @@ -80,6 +80,12 @@ is true. What type of checksum to be used: md5, sha1, sha256 or sha512. The default is sha1 since version 1.4.0. .TP +.BR \-buffersize " " \fIN\fR +Chunksize in bytes when calculating the checksum +for files, smaller or bigger can improve performance +dependent on filesystem and checksum algorithm. +The default is 1 MiB, the maximum allowed is 128MiB (inclusive). +.TP .BR \-deterministic " " \fItrue\fR|\fIfalse\fR If set (the default), sort files of equal rank in an unspecified but deterministic order. This makes the behaviour independent of in which From 06b2fef78a6852ba32d08cbc16d29fbcf856e6aa Mon Sep 17 00:00:00 2001 From: Robert Marklund Date: Thu, 23 Jan 2025 01:03:15 +0100 Subject: [PATCH 3/3] add buffersize test cases Signed-off-by: Robert Marklund --- Makefile.am | 3 +- testcases/checksum_buffersize.sh | 45 ++++++++++++++++++++ testcases/checksum_buffersize_speedtest.sh | 48 ++++++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100755 testcases/checksum_buffersize.sh create mode 100755 testcases/checksum_buffersize_speedtest.sh diff --git a/Makefile.am b/Makefile.am index 2b4ed82..6ff7390 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,7 +18,8 @@ TESTS=testcases/largefilesupport.sh \ testcases/verify_deterministic_operation.sh \ testcases/checksum_options.sh \ testcases/md5collisions.sh \ - testcases/sha1collisions.sh + testcases/sha1collisions.sh \ + testcases/checksum_buffersize.sh AUXFILES=testcases/common_funcs.sh \ testcases/md5collisions/letter_of_rec.ps \ diff --git a/testcases/checksum_buffersize.sh b/testcases/checksum_buffersize.sh new file mode 100755 index 0000000..695c5db --- /dev/null +++ b/testcases/checksum_buffersize.sh @@ -0,0 +1,45 @@ +#!/bin/sh +# Test that selection of buffersizes works as expected. + +set -e +. "$(dirname "$0")/common_funcs.sh" + +reset_teststate + +TEST_DIR=buffersizes_test +mkdir -p "$TEST_DIR" + +make_test_files() { + dbgecho "creating test files in $TEST_DIR" + head -c 1000000 /dev/zero >"$TEST_DIR/a" + cp "$TEST_DIR/a" "$TEST_DIR/b" + cp "$TEST_DIR/a" "$TEST_DIR/c" + cp "$TEST_DIR/a" "$TEST_DIR/d" + cp "$TEST_DIR/a" "$TEST_DIR/e" +} + +dbgecho "check so all buffersizes behave the same" + +# disables only run once shellscheck +# shellcheck disable=SC2043 +for checksumtype in sha256; do + i=1 + while :; do + if [ $i -gt 128 ]; then + break + fi + i="$((i*2))" + make_test_files + dbgecho "testing buffersize $((i*1024))" + dbgecho "testing $checksumtype" + # Fix this properly by making rdfind to array and use "${rdfind[@]}" + # this requires bash not sh + # shellcheck disable=SC2086 + $rdfind -buffersize $((i*1024)) -checksum "$checksumtype" -deleteduplicates true "$TEST_DIR" >/dev/null + [ -e "$TEST_DIR/a" ] + [ ! -e "$TEST_DIR/b" ] + [ ! -e "$TEST_DIR/c" ] + [ ! -e "$TEST_DIR/d" ] + [ ! -e "$TEST_DIR/e" ] + done +done diff --git a/testcases/checksum_buffersize_speedtest.sh b/testcases/checksum_buffersize_speedtest.sh new file mode 100755 index 0000000..7c40571 --- /dev/null +++ b/testcases/checksum_buffersize_speedtest.sh @@ -0,0 +1,48 @@ +#!/bin/sh +# Performance test for checksumming with different buffersizes. Not meant +# to be run for regular testing. + +set -e +. "$(dirname "$0")/common_funcs.sh" + +reset_teststate + +TEST_DIR=buffersizes_speedtest +mkdir -p "$TEST_DIR" + +make_test_files() { + dbgecho "creating test files in $TEST_DIR/bigfiles" + mkdir -p "$TEST_DIR/bigfiles" + head -c $((1024*1024*500)) /dev/zero >"$TEST_DIR/bigfiles/a" + for f in b c d e; do + cp "$TEST_DIR/bigfiles/a" "$TEST_DIR/bigfiles/$f" + done + dbgecho "creating test files in $TEST_DIR/smallfiles" + mkdir -p "$TEST_DIR/smallfiles" + (cd "$TEST_DIR/smallfiles"; head -c100000000 /dev/zero |split --bytes 1000) +} + +dbgecho "run speed test for all shecksums and buffersizes" + +make_test_files + +cat /dev/null >"$TEST_DIR/results.tsv" +for filesize in big small; do +for checksumtype in md5 sha1; do + i=1 + while :; do + if [ $i -gt 4096 ]; then + break + fi + # Fix this properly by making rdfind to array and use "${rdfind[@]}" + # this requires bash not sh + # shellcheck disable=SC2086 + dbgecho "testing $checksumtype $i kB buffersize" + # shellcheck disable=SC2086 + /usr/bin/time --append --output=$TEST_DIR/results.tsv -f "$filesize\t$i\t$checksumtype\t%e\t%M\t%C" $rdfind -buffersize $((i*1024)) -checksum "$checksumtype" -dryrun true -deleteduplicates true "$TEST_DIR/${filesize}files" >/dev/null 2>&1 + i="$((i*2))" +done +done +done +cat "$TEST_DIR/results.tsv" +