From d994b4fbc53ddc725a196de0236df44bd7bbcb2b Mon Sep 17 00:00:00 2001
From: "S. D. Adams" <s.d.adams.software@gmail.com>
Date: Tue, 20 Feb 2018 00:00:47 +0000
Subject: [PATCH 1/2] Implemented Feature #225 - Don't follow symlinks.

The flag '-R' causes hashdeep to hash the output of readlink for each
symlink instead of following the link and hashing its target.

Tested on Ubuntu 16.04 and Mac OS X High Sierra.
Not supported on Windows.
---
 man/hashdeep.1 |  5 +++++
 src/dig.cpp    | 29 ++++++++++++++++++++++++-----
 src/hash.cpp   | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 src/main.cpp   |  9 ++++++++-
 src/main.h     | 12 ++++++++----
 5 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/man/hashdeep.1 b/man/hashdeep.1
index 169516679..36b15834f 100644
--- a/man/hashdeep.1
+++ b/man/hashdeep.1
@@ -226,6 +226,11 @@ open(). Specifying \fB-Fm\fR will use memory-mapped I/O which will be
 faster on some platforms, but which (currently) will not work with
 files that produce I/O errors.
 
+.TP
+\fB-R\fR
+Don't follow symlinks; instead, hash the output of readlink (the target path
+stored in the link itself). Not available on Windows.
+
 
 
 .TP
diff --git a/src/dig.cpp b/src/dig.cpp
index 5ab58e5c5..c448e6c42 100644
--- a/src/dig.cpp
+++ b/src/dig.cpp
@@ -75,10 +75,19 @@ file_types file_metadata_t::decode_file_type(const struct __stat64 &sb)
  */
 int file_metadata_t::stat(const tstring &fn,
 			  file_metadata_t *m,
-			  class display &ocb)
+              class display &ocb,
+              bool const is_symlink)
 {
   struct __stat64 sb;
-  if (::TSTAT(fn.c_str(),&sb)) 
+  if (ocb.opt_readlink && is_symlink)
+  {
+    if (::TLSTAT(fn.c_str(),&sb))
+    {
+      ocb.error_filename(fn,"%s",strerror(errno));
+      return -1;
+    }
+  }
+  else if (::TSTAT(fn.c_str(),&sb))
   {
     ocb.error_filename(fn,"%s",strerror(errno));
     return -1;
@@ -564,6 +573,13 @@ void state::process_dir(const tstring &fn)
  */
 bool state::should_hash_symlink(const tstring &fn, file_types *link_type)
 {
+    /**
+     * When readlink option is set, all symlinks are to be hashed.
+     */
+    if (ocb.opt_readlink) {
+        return true;
+    }
+
     /**
      * We must look at what this symlink points to before we process it.
      * The file_type() function uses lstat to examine the file.
@@ -694,9 +710,11 @@ bool state::should_hash_expert(const tstring &fn, file_types type)
  * but if it is called with a directory it recursively hashes it.
  */
 
-bool state::should_hash(const tstring &fn)
+bool state::should_hash(const tstring &fn, file_types &_type)
 {
     file_types type = state::file_type(fn,&ocb,0,0,0,0);
+
+    _type = type;
   
     if (mode_expert) 
       return should_hash_expert(fn,type);
@@ -735,8 +753,9 @@ void state::dig_normal(const tstring &fn_) {
 #endif
   if (opt_debug) 
     ocb.status("*** cleaned:%s",global::make_utf8(fn).c_str());
-  if (should_hash(fn))
-    ocb.hash_file(fn);
+  file_types type;
+  if (should_hash(fn, type))
+    ocb.hash_file(fn, type);
 }
 
 
diff --git a/src/hash.cpp b/src/hash.cpp
index 52f419b0c..7b8c32e31 100644
--- a/src/hash.cpp
+++ b/src/hash.cpp
@@ -91,6 +91,16 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le
     hc1->read_offset = request_start;
     hc1->read_len    = 0;		// so far
 
+    unsigned char *readlink_buffer = 0;
+    if (ocb->opt_readlink && file_is_symlink) {
+#ifndef _WIN32
+        if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) {
+            readlink_buffer = (unsigned char*)malloc(request_len);
+            readlink(file_name_to_hash.c_str(), (char*)readlink_buffer, request_len);
+        }
+#endif
+    }
+
     while (request_len>0){
 	// Clear the buffer in case we hit an error and need to pad the hash 
 	// The use of MD5DEEP_IDEAL_BLOCK_SIZE means that we loop even for memory-mapped
@@ -107,10 +117,21 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le
 
 	ssize_t current_read_bytes = 0;	// read the data into buffer
 
-	if(this->handle){
+    if (ocb->opt_readlink && file_is_symlink) {
+#ifndef _WIN32
+        if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) {
+            memcpy(buffer_, readlink_buffer + hc1->read_len, toread);
+        }
+        else {
+            readlink(file_name_to_hash.c_str(), (char*)buffer_, toread);
+        }
+        current_read_bytes = toread;
+#endif
+    }
+    else if(this->handle){
 	    current_read_bytes = fread(buffer_, 1, toread, this->handle);
 	} else {
-	    assert(this->fd!=0);
+        assert(this->fd!=-1);
 	    if(this->base){
 		buffer = this->base + request_start;
 		current_read_bytes = min(toread,this->bounds - request_start); // can't read more than this
@@ -169,6 +190,12 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le
 	request_start += toread;
 	request_len   -= toread;
     }
+
+    if (readlink_buffer) {
+        free(readlink_buffer);
+        readlink_buffer = 0;
+    }
+
     if (ocb->opt_estimate) ocb->clear_realtime_stats();
     if (this->file_bytes == this->stat_bytes) this->eof = true; // end of the file
     return true;			// done hashing!
@@ -201,6 +228,8 @@ void file_data_hasher_t::hash()
 {
     file_data_hasher_t *fdht = this;
 
+    bool const readlink_this_file = ocb->opt_readlink && fdht->file_is_symlink;
+
     /*
      * If the handle is set, we are probably hashing stdin.
      * If not, figure out file size and full file name for the handle
@@ -211,7 +240,7 @@ void file_data_hasher_t::hash()
 	//state::file_type(fdht->file_name_to_hash,ocb,&fdht->stat_bytes,
 	//&fdht->ctime,&fdht->mtime,&fdht->atime);
 	file_metadata_t m;
-	file_metadata_t::stat(fdht->file_name_to_hash,&m,*ocb);
+    file_metadata_t::stat(fdht->file_name_to_hash,&m,*ocb,fdht->file_is_symlink);
 	fdht->stat_bytes = m.size;
 	fdht->ctime      = m.ctime;
 	fdht->mtime      = m.mtime;
@@ -238,7 +267,11 @@ void file_data_hasher_t::hash()
 	    }
 	}
 
-	switch(ocb->opt_iomode){
+    if (readlink_this_file) {
+        assert(fdht->fd == -1);
+        assert(fdht->handle == 0);
+    }
+    else switch(ocb->opt_iomode){
 	case iomode::buffered:
 	    assert(fdht->handle==0);
 
@@ -360,7 +393,7 @@ void file_data_hasher_t::hash()
 	 */
 	fdht->file_bytes = 0;
 	if(fdht->handle) fseeko(fdht->handle, 0, SEEK_SET);
-	if(fdht->fd){
+    if(fdht->fd != -1){
 	    lseek(this->fd,0,SEEK_SET);
 	}
 	fdht->eof = false;		// 
@@ -381,7 +414,7 @@ void file_data_hasher_t::hash()
     while (fdht->eof==false)  {
 	
 	uint64_t request_len = fdht->stat_bytes; // by default, hash the file
-	if ( fdht->ocb->piecewise_size>0 )  {
+    if ( !readlink_this_file && fdht->ocb->piecewise_size>0 )  {
 	    request_len = fdht->ocb->piecewise_size;
 	}
 
@@ -467,10 +500,13 @@ void worker::do_work(file_data_hasher_t *fdht)
  * 2 - hash the fdht
  * 3 - record it in stdout using display.
  */
-void display::hash_file(const tstring &fn)
+void display::hash_file(const tstring &fn, file_types const type)
 {
     file_data_hasher_t *fdht = new file_data_hasher_t(this);
     fdht->file_name_to_hash = fn;
+    if (type == stat_symlink) {
+        fdht->file_is_symlink = true;
+    }
 
     /**
      * If we are using a thread pool, hash in another thread
diff --git a/src/main.cpp b/src/main.cpp
index b605d2b82..308fa766f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -614,7 +614,11 @@ int state::hashdeep_process_command_line(int argc_, char **argv_)
     bool did_usage = false;
   int i;
 
-  while ((i=getopt(argc_,argv_,"abc:CdeEF:f:o:I:i:MmXxtlk:rsp:wvVhW:0D:uj:")) != -1)  {
+  while ((i=getopt(argc_,argv_,"abc:CdeEF:f:o:I:i:MmXxtlk:rsp:wvVhW:0D:uj:"
+#ifndef _WIN32
+        "R"
+#endif
+  )) != -1)  {
     switch (i)
     {
     case 'a':
@@ -683,6 +687,9 @@ int state::hashdeep_process_command_line(int argc_, char **argv_)
     case 'b': ocb.mode_barename=true;   break;
     case 'l': ocb.opt_relative=true;    break;
     case 'e': ocb.opt_estimate = true;	break;
+#ifndef _WIN32
+    case 'R': ocb.opt_readlink = true;  break;
+#endif
     case 'r': mode_recursive=true;	break;
     case 's': ocb.opt_silent = true;	break;
 
diff --git a/src/main.h b/src/main.h
index 99df49a6e..a8178ecd2 100644
--- a/src/main.h
+++ b/src/main.h
@@ -220,7 +220,7 @@ class file_metadata_t {
     static file_types decode_file_type(const struct __stat64 &sb);
 
     // stat a file, print an error and return -1 if it fails, otherwise return 0
-    static int stat(const filename_t &path,file_metadata_t *m,class display &ocb); 
+    static int stat(const filename_t &path,file_metadata_t *m,class display &ocb, bool is_symlink = false);
     class fileid_t {				      // uniquely defines a file on this system
     public:
 	fileid_t():dev(0),ino(0){};
@@ -296,6 +296,7 @@ class file_data_hasher_t : public file_data_t {
     }
     static const size_t MD5DEEP_IDEAL_BLOCK_SIZE = 8192;
     file_data_hasher_t(class display *ocb_):
+    file_is_symlink(false),
 	ocb(ocb_),			// where we put results
 	handle(0),
 	fd(-1),
@@ -322,6 +323,7 @@ class file_data_hasher_t : public file_data_t {
 
     /* The actual file to hash */
     filename_t file_name_to_hash;
+    bool file_is_symlink;
 
     /* Where the results go */
     class display *ocb;
@@ -585,6 +587,7 @@ class display {
       opt_display_hash(false),
       opt_show_matched(false),
       opt_case_sensitive(true),
+      opt_readlink(false),
       opt_iomode(iomode::buffered),	// by default, use buffered
 #ifdef HAVE_PTHREAD
       opt_threadcount(threadpool::numCPU()),
@@ -620,6 +623,7 @@ class display {
     bool	opt_display_hash;
     bool	opt_show_matched;
     bool        opt_case_sensitive;
+    bool    opt_readlink;
     int		opt_iomode;
     int		opt_threadcount;
 
@@ -755,7 +759,7 @@ class display {
     void	finalize_matching();
 
     /* hash.cpp: Actually trigger the hashing. */
-    void	hash_file(const tstring &file_name);
+    void	hash_file(const tstring &file_name, file_types type);
     void	hash_stdin();
     void	dump_hashlist(){ lock(); known.dump_hashlist(); unlock(); }
 };
@@ -794,7 +798,7 @@ public:;
 
  state():mode_recursive(false),	// do we recurse?
       mode_warn_only(false),	// for loading hash files
-      
+
       // these determine which files get hashed
       mode_expert(false),
       mode_regular(false),
@@ -904,7 +908,7 @@ public:;
     bool	should_hash_symlink(const tstring &fn,file_types *link_type);
     bool        should_hash_winpe(const tstring &fn);
     bool	should_hash_expert(const tstring &fn, file_types type);
-    bool	should_hash(const tstring &fn);
+    bool	should_hash(const tstring &fn, file_types &type);
 
     /* file_type returns the file type of a string.
      * If an error is found and ocb is provided, send the error to ocb.

From 38d58e16bdb6fc498998859a41ffd58b2da46866 Mon Sep 17 00:00:00 2001
From: "S. D. Adams" <s.d.adams.software@gmail.com>
Date: Tue, 20 Feb 2018 12:39:52 +0000
Subject: [PATCH 2/2] Bugfix for reading long symlinks.

---
 src/hash.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/hash.cpp b/src/hash.cpp
index 7b8c32e31..e716bae70 100644
--- a/src/hash.cpp
+++ b/src/hash.cpp
@@ -92,9 +92,10 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le
     hc1->read_len    = 0;		// so far
 
     unsigned char *readlink_buffer = 0;
+    bool const request_larger_than_buffer = request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE;
     if (ocb->opt_readlink && file_is_symlink) {
 #ifndef _WIN32
-        if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) {
+        if (request_larger_than_buffer) {
             readlink_buffer = (unsigned char*)malloc(request_len);
             readlink(file_name_to_hash.c_str(), (char*)readlink_buffer, request_len);
         }
@@ -119,7 +120,7 @@ bool file_data_hasher_t::compute_hash(uint64_t request_start,uint64_t request_le
 
     if (ocb->opt_readlink && file_is_symlink) {
 #ifndef _WIN32
-        if (request_len > file_data_hasher_t::MD5DEEP_IDEAL_BLOCK_SIZE) {
+        if (request_larger_than_buffer) {
             memcpy(buffer_, readlink_buffer + hc1->read_len, toread);
         }
         else {