Kill Duplicates (2003)

A sophisticated duplicate file finder that recursively searches directories, identifies files with identical sizes, and then performs a binary comparison to find exact duplicates. It generates a batch file (killdupe.cmd) with deletion commands for the duplicate files, keeping the first occurrence. It uses an efficient two-pass algorithm: files are first grouped by size, and only files of equal size are compared by binary content.

Usage:

perl killdupe.pl <directory> [output_file]

Scans directory recursively, finds duplicates by size and binary comparison, outputs deletion commands to killdupe.cmd (or specified file).

Source Code:

#!/usr/bin/perl
#
# ft($root) -- recursively collect every regular file below $root.
#
# Parameters:
#   $root - directory to scan.
# Returns:
#   A flat list of file paths. Each path is built as "$root\<entry>", i.e.
#   with a backslash separator, which is what the rest of this script
#   (ns, the generated batch file) expects.
# Side effects:
#   Prints "exploring $root" to STDOUT for every directory visited and
#   warns on STDERR about directories that cannot be opened.
sub ft {
	my ($root) = @_;
	my @files;
	print "exploring $root\n";

	# A directory that cannot be read contributes no files; report it
	# instead of silently pretending it was empty.
	my $dh;
	unless (opendir $dh, $root) {
		warn "cannot open directory '$root': $!\n";
		return @files;
	}
	# Read everything and close the handle before recursing, so only one
	# directory handle is open at any time.
	my @entries = readdir $dh;
	closedir $dh;

	foreach my $entry (@entries) {
		next if $entry eq '.' or $entry eq '..';
		my $path = $root . "\\" . $entry;
		if (-f $path) {
			push @files, $path;
		}
		elsif (-d $path) {
			push @files, ft($path);
		}
	}
	return @files;
}

# ss(@files) -- group files that share the same (non-zero) size.
#
# Parameters:
#   @files - list of file paths.
# Returns:
#   A hash (as a flat list) mapping a size in bytes to a '|'-joined string of
#   the paths having that size, in the order they were passed in. Only sizes
#   shared by at least two files are included. Returns an empty list when no
#   files are given.
# Notes:
#   Files whose size is 0, or for which stat() fails, are skipped.
sub ss {
	my (@files) = @_;
	return if !@files;

	# size => [paths], collected in input order. Storing arrays avoids
	# testing a file name for truth, which misfires for a file named "0".
	my %files_of;
	foreach my $file (@files) {
		my $size = (stat($file))[7];
		next if !$size;
		push @{ $files_of{$size} }, $file;
	}

	my %samesizes;
	foreach my $size (keys %files_of) {
		my @group = @{ $files_of{$size} };
		$samesizes{$size} = join('|', @group) if @group > 1;
	}
	return %samesizes;
}

# fc($first, $second) -- byte-for-byte comparison of two files.
#
# Parameters:
#   $first, $second - paths of the files to compare.
# Returns:
#   1 when both files could be read and have identical content,
#   0 otherwise (different content, different length, open or read failure).
# Notes:
#   The result is used to decide which files get deleted, so every failure
#   is reported as "not identical".
sub fc {
	my ($first, $second) = @_;

	# Three-argument open: the path is never interpreted as a mode or pipe.
	open(my $fh_first, '<', $first) or return 0;
	my $fh_second;
	unless (open($fh_second, '<', $second)) {
		close $fh_first;
		return 0;
	}
	binmode $fh_first;
	binmode $fh_second;

	my $same = 1;
	while (1) {
		my $got_first  = read($fh_first,  my $chunk_first,  65536);
		my $got_second = read($fh_second, my $chunk_second, 65536);

		# undef means a read error: never claim equality in that case.
		if (!defined $got_first or !defined $got_second) {
			$same = 0;
			last;
		}
		# Different chunk lengths catch one file ending before the other.
		if ($got_first != $got_second or $chunk_first ne $chunk_second) {
			$same = 0;
			last;
		}
		# Both files reached end-of-file together.
		last if $got_first == 0;
	}

	close $fh_first;
	close $fh_second;
	return $same;
}

# ns -- sort comparator (uses the package variables $a and $b set by sort).
#
# Orders paths by the number of backslash-separated components first, so
# that shallower paths come before deeper ones, and by plain string
# comparison within the same depth.
# Returns -1, 0 or +1; 0 is returned for equal paths so that the comparator
# is consistent, as sort expects.
sub ns {
	# Assign to arrays to count the fields; the counts are then compared
	# numerically.
	my @parts_a = split /\\/, $a;
	my @parts_b = split /\\/, $b;
	return scalar(@parts_a) <=> scalar(@parts_b) || $a cmp $b;
}

# sf(@files) -- find groups of identical files among the given candidates.
#
# Parameters:
#   @files - paths to compare with each other (the caller passes files that
#            all have the same size).
# Returns:
#   A hash (as a flat list) mapping the "original" of each group to a
#   '|'-joined string of its duplicates. The original is the first file of
#   the group in ns order. Files without a duplicate do not appear. Returns
#   an empty list when fewer than two files are given.
sub sf {
	my @pending = sort ns @_;

	# original => [duplicates]. Each round compares the first pending file
	# with the remaining ones and keeps only the unmatched ones for the
	# next round, so matched files are never opened again.
	my %dupes_of;
	while (@pending > 1) {
		my $first = shift @pending;
		my @unmatched;
		foreach my $candidate (@pending) {
			if (fc($first, $candidate)) {
				push @{ $dupes_of{$first} }, $candidate;
			}
			else {
				push @unmatched, $candidate;
			}
		}
		@pending = @unmatched;
	}

	my %samefiles;
	foreach my $original (keys %dupes_of) {
		$samefiles{$original} = join('|', @{ $dupes_of{$original} });
	}
	return %samefiles;
}

# Main program.
#
# Arguments: $ARGV[0] is the directory to scan, $ARGV[1] the optional name
# of the batch file to write (default: killdupe.cmd).
# For every group of identical files, one "rem" line naming the kept file
# and one "del" line per duplicate is written to the batch file; progress
# and the same information are printed to STDOUT.
die "usage: killdupe.pl <directory> [output_file]\n"
	unless (defined $ARGV[0] and length $ARGV[0]);
die "'$ARGV[0]' is not dir!" unless (-d $ARGV[0]);

print "searching files...\n";

my @files = ft($ARGV[0]);
my $n = @files;

print "found $n files.\n";
print "----------------------------------------------\n";
print "searching same size files...\n";

my %samesizes = ss(@files);
$n = keys %samesizes;

# $n counts distinct sizes shared by two or more files, not files.
print "found $n groups of same size files.\n";
print "----------------------------------------------\n";
print "examining these files...\n";

my $out = $ARGV[1] || 'killdupe.cmd';
open(my $out_fh, '>', $out) or die "cannot write '$out': $!\n";

# Inside a batch file cmd.exe treats '%' as the start of a variable
# reference; doubling it yields a literal percent sign.
my $cmd_quote = sub {
	my ($path) = @_;
	$path =~ s/%/%%/g;
	return "\"$path\"";
};

# Sorted iteration keeps the generated batch file stable between runs.
foreach my $size (sort { $a <=> $b } keys %samesizes) {
	my @samesize  = split(/\|/, $samesizes{$size});
	my %samefiles = sf(@samesize);
	foreach my $original (sort keys %samefiles) {
		my @dupes = split(/\|/, $samefiles{$original});
		my $count = @dupes;
		print "'$original' has $count dupes:\n";
		print {$out_fh} "rem " . $cmd_quote->($original) . "\n";
		foreach my $dupe (@dupes) {
			print "'$dupe'\n";
			print {$out_fh} "del " . $cmd_quote->($dupe) . "\n";
		}
		print "----------------------------------------------\n";
	}
}

# Buffered write errors only surface at close time.
close($out_fh) or die "cannot finish writing '$out': $!\n";
print "done.\n";