#!/usr/bin/perl
#
# Find duplicates by Alan Mizrahi
# Send any comments or suggestions to alan at mizrahi dot com dot ve
# http://www.mizrahi.com.ve/projects/
# Version 1.0 2008/07/15
#
use strict;
use warnings;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Fcntl;

# Number of bytes requested from each file per sysread() call; this may be tuned.
my $BUFFSIZE = 32768;

# Every regular file registered by addFile(): name => size
# (addFile stores 0; the real size is filled in once all arguments are expanded).
my %fileByName;
# The same files bucketed by size: size => [fname1, fname2, ...].
my %fileBySize;
# Files that survived the duplicate check, keyed by the base64 MD5 digest of
# the data read from them: hash => [fname1, fname2, ...].
my %fileByHash;

# With no path arguments there is nothing to scan: print the usage line and stop.
unless (@ARGV) {
	print "Syntax: findDups.pl <file|dir> [...]\n";
	exit 0;
}

# Directories are scanned recursively by addDir(); every other argument is
# handed to addFile(), which keeps it only if it is a regular file.
for my $target (@ARGV) {
	if (-d $target) {
		addDir($target);
		next;
	}
	addFile($target);
}

# Record each file's size and bucket the file names by size.
# Sizes are looked up only now, after all arguments have been expanded, so a
# name that was reached more than once (symlinks, duplicate dirs, etc) is
# stat()ed a single time.
# stat() is used instead of -s because -s yields an empty string (not 0) for
# an empty file and undef on failure; both would end up under the hash key ''
# and later be treated as a non-numeric size.
foreach my $fname (keys %fileByName) {
	my $size = (stat $fname)[7]; # stat returns an empty list on failure => undef
	if (!defined $size) {
		# the file vanished or became inaccessible since it was listed
		warn "Error getting size of $fname: $!";
		delete $fileByName{$fname};
		next;
	}
	$fileByName{$fname} = $size;
	push @{ $fileBySize{$size} }, $fname;
}

# Look for duplicates inside each group of equally sized files.
#
# All files of a group are opened together and read in lock-step, $BUFFSIZE
# bytes at a time. After every chunk the running MD5 of each file is compared
# with the others, and a file whose digest is shared by nobody is dropped
# immediately, so clearly different files are not read to the end.
# Whatever is left when the data runs out is grouped by final digest and
# stored in %fileByHash (groups of two or more only).
#
# Hash keys are strings; a non-numeric (empty) size key is treated as 0.
foreach my $size (sort { ($a || 0) <=> ($b || 0) } keys %fileBySize) {
	my @candidates = @{ $fileBySize{$size} };
	next if @candidates < 2; # a size seen only once cannot have a duplicate
	my $bytes = $size || 0;

	# Pair every file that can be opened with its own MD5 state.
	# Note that all files of this size are held open at the same time.
	my %checking;
	foreach my $fname (@candidates) {
		if (sysopen my $FH, $fname, O_RDONLY) {
			$checking{$fname}{fh} = $FH; # file handle
			$checking{$fname}{md5} = Digest::MD5->new; # md5 object
		} else {
			warn "Error opening $fname: $!";
		}
	}

	my $read = 0; # bytes consumed so far from the surviving files
	while (($read < $bytes) && (keys %checking > 1)) {
		my $progress = 0; # largest chunk any file delivered in this round
		foreach my $fname (keys %checking) { # read buffer and update md5
			my $buffer;
			my $r = sysread($checking{$fname}{fh}, $buffer, $BUFFSIZE);
			if (!defined $r) {
				warn "Error reading from $fname: $!";
				close $checking{$fname}{fh};
				delete $checking{$fname};
				next;
			}
			$checking{$fname}{md5}->add($buffer);
			$progress = $r if $r > $progress;
		}

		# Every file is already at EOF although $bytes was not reached (the
		# files shrank after their size was taken): stop instead of spinning.
		last if $progress == 0;
		$read += $progress;

		# Digest each file once (on a clone, so the running state survives)
		# and count how many files share every digest.
		my (%digestOf, %sharedBy);
		foreach my $fname (keys %checking) {
			my $digest = $checking{$fname}{md5}->clone->digest;
			$digestOf{$fname} = $digest;
			$sharedBy{$digest}++;
		}

		# remove files without dups
		foreach my $fname (keys %checking) {
			next if $sharedBy{ $digestOf{$fname} } > 1;
			close $checking{$fname}{fh};
			delete $checking{$fname};
		}
	}

	# The survivors may still form more than one group of identical files.
	my %group;
	foreach my $fname (keys %checking) {
		close $checking{$fname}{fh};
		push @{ $group{ $checking{$fname}{md5}->b64digest } }, $fname;
	}

	# A group of one is not a duplicate; this can happen when the read loop
	# never ran (size 0) or ended early, so filter here as well.
	foreach my $hash (keys %group) {
		next if @{ $group{$hash} } < 2;
		push @{ $fileByHash{$hash} }, @{ $group{$hash} };
	}
}

# Report: a header line (only when something was found), then one line per
# group of identical files with size, digest and the space-separated names.
print "Size\tHash\t\t\tFilenames\n" if %fileByHash;

for my $digest (keys %fileByHash) {
	my @names = @{ $fileByHash{$digest} };
	my $size = $fileByName{ $names[0] }; # all members of a group share one size
	print $size, "\t$digest\t", join(' ', @names), "\n";
}


# Add a directory's content to the scan (recursively).
#   $_[0]  path of the directory to read
# Sub-directories are descended into; every other entry is passed to addFile().
# The entries are read in one go and the handle is closed before descending,
# so only one directory handle is open at a time regardless of depth.
# A directory that cannot be opened is reported with warn() and skipped.
# NOTE(review): -d follows symbolic links, so symlinked directories are
# descended into and link cycles are not detected here.
sub addDir {
	my ($dir) = @_;

	# Lexical handle: a bareword DIR would be a package global shared by
	# every recursive call.
	my $dh;
	if (!opendir $dh, $dir) {
		warn "Error opening $dir: $!";
		return;
	}
	my @dents = readdir $dh;
	closedir $dh;

	# Join with exactly one slash, so "dir" and "dir/" produce the same file
	# names and a file is not registered twice under two spellings.
	(my $prefix = $dir) =~ s{/*\z}{/};

	foreach my $dent (@dents) {
		next if $dent eq '.' || $dent eq '..';
		my $path = $prefix . $dent;
		if (-d $path) {
			addDir($path);
		} else {
			addFile($path);
		}
	}
	return;
}

# Register a path for scanning.
#   $_[0]  path to register
# Only regular files are accepted; anything else is silently ignored.
# The size is stored as 0 here and filled in later by the main program.
sub addFile {
	my ($path) = @_;
	if (-f $path) {
		$fileByName{$path} = 0;
	}
}