From d994b4fbc53ddc725a196de0236df44bd7bbcb2b Mon Sep 17 00:00:00 2001 From: "S. D. Adams" Date: Tue, 20 Feb 2018 00:00:47 +0000 Subject: [PATCH 1/2] Implemented Feature #225 - Don't follow symlinks. The flag '-R' causes hashdeep to hash the output of readlink instead of following the file. Tested on Ubuntu 16.04 and Mac OS X High Sierra. Not supported on Windows. --- man/hashdeep.1 | 5 +++++ src/dig.cpp | 29 ++++++++++++++++++++++++----- src/hash.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++------- src/main.cpp | 9 ++++++++- src/main.h | 12 ++++++++---- 5 files changed, 88 insertions(+), 17 deletions(-) diff --git a/man/hashdeep.1 b/man/hashdeep.1 index 169516679..36b15834f 100644 --- a/man/hashdeep.1 +++ b/man/hashdeep.1 @@ -226,6 +226,11 @@ open(). Specifying \fB-Fm\fR will use memory-mapped I/O which will be faster on some platforms, but which (currently) will not work with files that produce I/O errors. +.TP +\fB-R\fR +Don't follow symlinks, instead hash the output of readlink. (Not available on +Windows) + .TP diff --git a/src/dig.cpp b/src/dig.cpp index 5ab58e5c5..c448e6c42 100644 --- a/src/dig.cpp +++ b/src/dig.cpp @@ -75,10 +75,19 @@ file_types file_metadata_t::decode_file_type(const struct __stat64 &sb) */ int file_metadata_t::stat(const tstring &fn, file_metadata_t *m, - class display &ocb) + class display &ocb, + bool const is_symlink) { struct __stat64 sb; - if (::TSTAT(fn.c_str(),&sb)) + if (ocb.opt_readlink && is_symlink) + { + if (::TLSTAT(fn.c_str(),&sb)) + { + ocb.error_filename(fn,"%s",strerror(errno)); + return -1; + } + } + else if (::TSTAT(fn.c_str(),&sb)) { ocb.error_filename(fn,"%s",strerror(errno)); return -1; @@ -564,6 +573,13 @@ void state::process_dir(const tstring &fn) */ bool state::should_hash_symlink(const tstring &fn, file_types *link_type) { + /** + * When readlink option is set, all symlinks are to be hashed. + */ + if (ocb.opt_readlink) { + return true; + } + /** * We must look at what this symlink points to before we process it. * The file_type() function uses lstat to examine the file. @@ -694,9 +710,11 @@ bool state::should_hash_expert(const tstring &fn, file_types type) * but if it is called with a directory it recursively hashes it. */ -bool state::should_hash(const tstring &fn) +bool state::should_hash(const tstring &fn, file_types &_type) { file_types type = state::file_type(fn,&ocb,0,0,0,0); + + _type = type; if (mode_expert) return should_hash_expert(fn,type); @@ -735,8 +753,9 @@ void state::dig_normal(const tstring &fn_) { #endif if (opt_debug) ocb.status("*** cleaned:%s",global::make_utf8(fn).c_str()); - if (should_hash(fn)) - ocb.hash_file(fn); + file_types type; + if (should_hash(fn, type)) + ocb.hash_file(fn, type); } diff --git a/src/hash.cpp b/src/hash.cpp index 52f419b0c..7b8c32e31 100644 --- a/src/hash.cpp +++ b/src/hash.cpp @@ -91,6 +91,16 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le hc1->read_offset = request_start; hc1->read_len = 0; // so far + unsigned char *readlink_buffer = 0; + if (ocb->opt_readlink && file_is_symlink) { +#ifndef _WIN32 + if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) { + readlink_buffer = (unsigned char*)malloc(request_len); + readlink(file_name_to_hash.c_str(), (char*)readlink_buffer, request_len); + } +#endif + } + while (request_len>0){ // Clear the buffer in case we hit an error and need to pad the hash // The use of MD5DEEP_IDEAL_BLOCK_SIZE means that we loop even for memory-mapped @@ -107,10 +117,21 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le ssize_t current_read_bytes = 0; // read the data into buffer - if(this->handle){ + if (ocb->opt_readlink && file_is_symlink) { +#ifndef _WIN32 + if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) { + memcpy(buffer_, readlink_buffer + hc1->read_len, toread); + } + else { + readlink(file_name_to_hash.c_str(), (char*)buffer_, toread); + } + current_read_bytes = toread; +#endif + } + else if(this->handle){ current_read_bytes = fread(buffer_, 1, toread, this->handle); } else { - assert(this->fd!=0); + assert(this->fd!=-1); if(this->base){ buffer = this->base + request_start; current_read_bytes = min(toread,this->bounds - request_start); // can't read more than this @@ -169,6 +190,12 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le request_start += toread; request_len -= toread; } + + if (readlink_buffer) { + free(readlink_buffer); + readlink_buffer = 0; + } + if (ocb->opt_estimate) ocb->clear_realtime_stats(); if (this->file_bytes == this->stat_bytes) this->eof = true; // end of the file return true; // done hashing! @@ -201,6 +228,8 @@ void file_data_hasher_t::hash() { file_data_hasher_t *fdht = this; + bool const readlink_this_file = ocb->opt_readlink && fdht->file_is_symlink; + /* * If the handle is set, we are probably hashing stdin. * If not, figure out file size and full file name for the handle @@ -211,7 +240,7 @@ void file_data_hasher_t::hash() //state::file_type(fdht->file_name_to_hash,ocb,&fdht->stat_bytes, //&fdht->ctime,&fdht->mtime,&fdht->atime); file_metadata_t m; - file_metadata_t::stat(fdht->file_name_to_hash,&m,*ocb); + file_metadata_t::stat(fdht->file_name_to_hash,&m,*ocb,fdht->file_is_symlink); fdht->stat_bytes = m.size; fdht->ctime = m.ctime; fdht->mtime = m.mtime; @@ -238,7 +267,11 @@ void file_data_hasher_t::hash() } } - switch(ocb->opt_iomode){ + if (readlink_this_file) { + assert(fdht->fd == -1); + assert(fdht->handle == 0); + } + else switch(ocb->opt_iomode){ case iomode::buffered: assert(fdht->handle==0); @@ -360,7 +393,7 @@ void file_data_hasher_t::hash() */ fdht->file_bytes = 0; if(fdht->handle) fseeko(fdht->handle, 0, SEEK_SET); - if(fdht->fd){ + if(fdht->fd != -1){ lseek(this->fd,0,SEEK_SET); } fdht->eof = false; // @@ -381,7 +414,7 @@ void file_data_hasher_t::hash() while (fdht->eof==false) { uint64_t request_len = fdht->stat_bytes; // by default, hash the file - if ( fdht->ocb->piecewise_size>0 ) { + if ( !readlink_this_file && fdht->ocb->piecewise_size>0 ) { request_len = fdht->ocb->piecewise_size; } @@ -467,10 +500,13 @@ void worker::do_work(file_data_hasher_t *fdht) * 2 - hash the fdht * 3 - record it in stdout using display. */ -void display::hash_file(const tstring &fn) +void display::hash_file(const tstring &fn, file_types const type) { file_data_hasher_t *fdht = new file_data_hasher_t(this); fdht->file_name_to_hash = fn; + if (type == stat_symlink) { + fdht->file_is_symlink = true; + } /** * If we are using a thread pool, hash in another thread diff --git a/src/main.cpp b/src/main.cpp index b605d2b82..308fa766f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -614,7 +614,11 @@ int state::hashdeep_process_command_line(int argc_, char **argv_) bool did_usage = false; int i; - while ((i=getopt(argc_,argv_,"abc:CdeEF:f:o:I:i:MmXxtlk:rsp:wvVhW:0D:uj:")) != -1) { + while ((i=getopt(argc_,argv_,"abc:CdeEF:f:o:I:i:MmXxtlk:rsp:wvVhW:0D:uj:" +#ifndef _WIN32 + "R" +#endif + )) != -1) { switch (i) { case 'a': @@ -683,6 +687,9 @@ int state::hashdeep_process_command_line(int argc_, char **argv_) case 'b': ocb.mode_barename=true; break; case 'l': ocb.opt_relative=true; break; case 'e': ocb.opt_estimate = true; break; +#ifndef _WIN32 + case 'R': ocb.opt_readlink = true; break; +#endif case 'r': mode_recursive=true; break; case 's': ocb.opt_silent = true; break; diff --git a/src/main.h b/src/main.h index 99df49a6e..a8178ecd2 100644 --- a/src/main.h +++ b/src/main.h @@ -220,7 +220,7 @@ class file_metadata_t { static file_types decode_file_type(const struct __stat64 &sb); // stat a file, print an error and return -1 if it fails, otherwise return 0 - static int stat(const filename_t &path,file_metadata_t *m,class display &ocb); + static int stat(const filename_t &path,file_metadata_t *m,class display &ocb, bool is_symlink = false); class fileid_t { // uniquely defines a file on this system public: fileid_t():dev(0),ino(0){}; @@ -296,6 +296,7 @@ class file_data_hasher_t : public file_data_t { } static const size_t MD5DEEP_IDEAL_BLOCK_SIZE = 8192; file_data_hasher_t(class display *ocb_): + file_is_symlink(false), ocb(ocb_), // where we put results handle(0), fd(-1), @@ -322,6 +323,7 @@ class file_data_hasher_t : public file_data_t { /* The actual file to hash */ filename_t file_name_to_hash; + bool file_is_symlink; /* Where the results go */ class display *ocb; @@ -585,6 +587,7 @@ class display { opt_display_hash(false), opt_show_matched(false), opt_case_sensitive(true), + opt_readlink(false), opt_iomode(iomode::buffered), // by default, use buffered #ifdef HAVE_PTHREAD opt_threadcount(threadpool::numCPU()), @@ -620,6 +623,7 @@ class display { bool opt_display_hash; bool opt_show_matched; bool opt_case_sensitive; + bool opt_readlink; int opt_iomode; int opt_threadcount; @@ -755,7 +759,7 @@ class display { void finalize_matching(); /* hash.cpp: Actually trigger the hashing. */ - void hash_file(const tstring &file_name); + void hash_file(const tstring &file_name, file_types type); void hash_stdin(); void dump_hashlist(){ lock(); known.dump_hashlist(); unlock(); } }; @@ -794,7 +798,7 @@ public:; state():mode_recursive(false), // do we recurse? mode_warn_only(false), // for loading hash files - + // these determine which files get hashed mode_expert(false), mode_regular(false), @@ -904,7 +908,7 @@ public:; bool should_hash_symlink(const tstring &fn,file_types *link_type); bool should_hash_winpe(const tstring &fn); bool should_hash_expert(const tstring &fn, file_types type); - bool should_hash(const tstring &fn); + bool should_hash(const tstring &fn, file_types &type); /* file_type returns the file type of a string. * If an error is found and ocb is provided, send the error to ocb. From 38d58e16bdb6fc498998859a41ffd58b2da46866 Mon Sep 17 00:00:00 2001 From: "S. D. Adams" Date: Tue, 20 Feb 2018 12:39:52 +0000 Subject: [PATCH 2/2] Bugfix for reading long symlinks. --- src/hash.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hash.cpp b/src/hash.cpp index 7b8c32e31..e716bae70 100644 --- a/src/hash.cpp +++ b/src/hash.cpp @@ -92,9 +92,10 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le hc1->read_len = 0; // so far unsigned char *readlink_buffer = 0; + bool const request_larger_than_buffer = request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE; if (ocb->opt_readlink && file_is_symlink) { #ifndef _WIN32 - if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) { + if (request_larger_than_buffer) { readlink_buffer = (unsigned char*)malloc(request_len); readlink(file_name_to_hash.c_str(), (char*)readlink_buffer, request_len); } @@ -119,7 +120,7 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le if (ocb->opt_readlink && file_is_symlink) { #ifndef _WIN32 - if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) { + if (request_larger_than_buffer) { memcpy(buffer_, readlink_buffer + hc1->read_len, toread); } else {