Skip to content

Commit

Permalink
add buffersize argument
Browse files Browse the repository at this point in the history
Add an argument to change the buffer size used for checksum calculation;
this may increase speed for some checksum algorithms and disk types.

Signed-off-by: Robert Marklund <[email protected]>
  • Loading branch information
trollkarlen committed Jan 23, 2025
1 parent 1acbf12 commit bc47744
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 16 deletions.
11 changes: 7 additions & 4 deletions Fileinfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cstring> //for strerror
#include <fstream> //for file reading
#include <iostream> //for cout etc
#include <vector>

// os
#include <sys/stat.h> //for file info
Expand All @@ -24,7 +25,8 @@

int
Fileinfo::fillwithbytes(enum readtobuffermode filltype,
enum readtobuffermode lasttype)
enum readtobuffermode lasttype,
const long buffersize)
{

// Decide if we are going to read from file or not.
Expand Down Expand Up @@ -80,11 +82,12 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
if (checksumtype != Checksum::checksumtypes::NOTSET) {
Checksum chk(checksumtype);

char buffer[4096];
std::vector<char> buffer(buffersize);

while (f1) {
f1.read(buffer, sizeof(buffer));
f1.read(buffer.data(), buffer.size());
// gcount is never negative, the cast is safe.
chk.update(static_cast<std::size_t>(f1.gcount()), buffer);
chk.update(static_cast<std::size_t>(f1.gcount()), buffer.data());
}

// store the result of the checksum calculation in somebytes
Expand Down
4 changes: 3 additions & 1 deletion Fileinfo.hh
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,12 @@ public:
* is shorter than the length of the bytes field.
* @param filltype
* @param lasttype
* @param buffersize
* @return zero on success
*/
int fillwithbytes(enum readtobuffermode filltype,
enum readtobuffermode lasttype);
enum readtobuffermode lasttype,
const long buffersize);

/// get a pointer to the bytes read from the file
const char* getbyteptr() const { return m_somebytes.data(); }
Expand Down
5 changes: 3 additions & 2 deletions Rdutil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -542,15 +542,16 @@ Rdutil::saveablespace(std::ostream& out) const
int
Rdutil::fillwithbytes(enum Fileinfo::readtobuffermode type,
enum Fileinfo::readtobuffermode lasttype,
const long nsecsleep)
const long nsecsleep,
const long buffersize)
{
// first sort on inode (to read efficiently from the hard drive)
sortOnDeviceAndInode();

const auto duration = std::chrono::nanoseconds{ nsecsleep };

for (auto& elem : m_list) {
elem.fillwithbytes(type, lasttype);
elem.fillwithbytes(type, lasttype, buffersize);
if (nsecsleep > 0) {
std::this_thread::sleep_for(duration);
}
Expand Down
3 changes: 2 additions & 1 deletion Rdutil.hh
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ public:
int fillwithbytes(enum Fileinfo::readtobuffermode type,
enum Fileinfo::readtobuffermode lasttype =
Fileinfo::readtobuffermode::NOT_DEFINED,
long nsecsleep = 0);
long nsecsleep = 0,
const long buffersize = 4096);

/// make symlinks of duplicates.
std::size_t makesymlinks(bool dryrun) const;
Expand Down
34 changes: 26 additions & 8 deletions rdfind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static void
usage()
{
std::cout
<< "Usage: " << "rdfind [options] FILE ...\n"
<< "Usage: rdfind [options] FILE ...\n"
<< '\n'
<< "Finds duplicate files recursively in the given FILEs (directories),\n"
<< "and takes appropriate action (by default, nothing).\n"
Expand All @@ -65,6 +65,13 @@ usage()
"device and inode\n"
<< " -checksum md5 |(sha1)| sha256 | sha512\n"
<< " checksum type\n"
<< " -buffersize N (N=4096,0<N<1000KB)\n"
" Chunksize when calculating the "
"checksum\n"
" for files, smaller or bigger can "
"improve performance\n"
" dependent on filesystem and checksum "
"algorithm.\n"
<< " -deterministic (true)| false makes results independent of order\n"
<< " from listing the filesystem\n"
<< " -makesymlinks true |(false) replace duplicate files with "
Expand All @@ -75,7 +82,7 @@ usage()
<< " -outputname name sets the results file name to \"name\" "
"(default results.txt)\n"
<< " -deleteduplicates true |(false) delete duplicate files\n"
<< " -sleep Xms sleep for X milliseconds between "
<< " -sleep Xms sleep for X milliseconds between "
"file reads.\n"
<< " Default is 0. Only a few values\n"
<< " are supported; 0,1-5,10,25,50,100\n"
Expand Down Expand Up @@ -105,11 +112,12 @@ struct Options
bool followsymlinks = false; // follow symlinks
bool dryrun = false; // only dryrun, don't destroy anything
bool remove_identical_inode = true; // remove files with identical inodes
bool usemd5 = false; // use md5 checksum to check for similarity
bool usesha1 = false; // use sha1 checksum to check for similarity
bool usesha256 = false; // use sha256 checksum to check for similarity
bool usesha512 = false; // use sha512 checksum to check for similarity
bool deterministic = true; // be independent of filesystem order
bool usemd5 = false; // use md5 checksum to check for similarity
bool usesha1 = false; // use sha1 checksum to check for similarity
bool usesha256 = false; // use sha256 checksum to check for similarity
bool usesha512 = false; // use sha512 checksum to check for similarity
std::size_t buffersize = 4096; // chunksize to use when reading files
bool deterministic = true; // be independent of filesystem order
long nsecsleep = 0; // number of nanoseconds to sleep between each file read.
std::string resultsfile = "results.txt"; // results file name.
};
Expand Down Expand Up @@ -184,6 +192,16 @@ parseOptions(Parser& parser)
<< parser.get_parsed_string() << "\"\n";
std::exit(EXIT_FAILURE);
}
} else if (parser.try_parse_string("-buffersize")) {
  // -buffersize N: size in bytes of the chunk used when reading files for
  // checksum calculation. Accepted range is 0 < N <= 1000 * 1024.
  //
  // Keep the full width returned by std::stoll so that the range check
  // below is done before any narrowing takes place (long may be 32 bits).
  // NOTE(review): std::stoll throws std::invalid_argument or
  // std::out_of_range on unparsable input; nothing catches that here -
  // confirm whether a friendlier error message is wanted.
  const long long buffersize = std::stoll(parser.get_parsed_string());
  constexpr long long max_buffersize = 1000LL * 1024;
  if (buffersize <= 0) {
    std::cerr << "negative or 0 value of buffersize not allowed\n";
    std::exit(EXIT_FAILURE);
  }
  if (buffersize > max_buffersize) {
    std::cerr << "buffersize larger than 1000KB is not allowed\n";
    std::exit(EXIT_FAILURE);
  }
  // The value is known to be positive and small here, the cast is safe.
  o.buffersize = static_cast<std::size_t>(buffersize);
} else if (parser.try_parse_string("-sleep")) {
const auto nextarg = std::string(parser.get_parsed_string());
if (nextarg == "1ms") {
Expand Down Expand Up @@ -383,7 +401,7 @@ main(int narg, const char* argv[])
<< it->second << ": " << std::flush;

// read bytes (destroys the sorting, for disk reading efficiency)
gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep);
gswd.fillwithbytes(it[0].first, it[-1].first, o.nsecsleep, o.buffersize);

// remove non-duplicates
std::cout << "removed " << gswd.removeUniqSizeAndBuffer()
Expand Down

0 comments on commit bc47744

Please sign in to comment.