DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world.

Snippets has posted 5883 posts at DZone. View Full User Profile

Thin-dup - Remove Duplicate Files

09.08.2009
| 1692 views |
  • submit to reddit
        This script compares the MD5 checksums of the files in the current directory and, for each set of identical files, prompts before removing the duplicate copies.

#!/usr/bin/perl -w
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#
#   Program: thin-dup
#   Version: 4.2
#   Purpose: To find and confirm removal of duplicate copies of files
#            in the current directory.
#
#   Author:   John Harrison
#   Revision: 20 June 2003 4.0
#             19 July 2003 4.1
#             22 Nov  2004 4.2 Add die messages, verbose flag & tick.
#
#   This is the fourth major re-write.
#   The previous versions were all shell scripts.
#   This one goes like lightning compared to them!
#   It uses md5sum to compare any files it finds which are
#   the same size and which have different inode numbers.
#
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

use strict;
use Getopt::Long;
use Digest::MD5 qw(md5_hex);
use Time::localtime;
use POSIX qw(:termios_h);

# Capture the terminal settings of STDIN once at startup.  cbreak() and
# cooked() below switch between the saved flags and a no-echo,
# non-canonical variant so that readkey() can fetch a single keypress.
my $fd_stdin = fileno(STDIN);
my $term     = POSIX::Termios->new();
$term->getattr($fd_stdin);
# Local-mode flags as they were when the program started.
my $oterm     = $term->getlflag();

# Flags that are cleared while waiting for a key (echo and line buffering),
# and the startup flags with those bits removed.
my $echo     = ECHO | ECHOK | ICANON;
my $noecho   = $oterm & ~$echo;

# Switch the terminal to single-key input: install the local flags held in
# $noecho, set the VTIME control character to 1 and apply the change
# immediately.  Returns whatever setattr() returns.
sub cbreak {
    $term->setlflag($noecho);
    $term->setcc(VTIME, 1);
    return $term->setattr($fd_stdin, TCSANOW);
}

# Undo cbreak(): reinstate the local flags saved in $oterm, reset the VTIME
# control character to 0 and apply the change immediately.  Returns
# whatever setattr() returns.
sub cooked {
    $term->setlflag($oterm);
    $term->setcc(VTIME, 0);
    return $term->setattr($fd_stdin, TCSANOW);
}

# Read at most one byte from STDIN with the terminal temporarily in
# single-key mode.  Returns the byte read, or the empty string when
# sysread() delivered nothing.
sub readkey {
    cbreak();
    my $char = '';
    sysread(STDIN, $char, 1);
    cooked();
    return $char;
}

# Restore the saved terminal flags whenever the program exits, so that an
# exit in the middle of a prompt does not leave the terminal without echo.
END { cooked() }

# @files   : names of the candidate files, in processing order
# @digests : distinct MD5 digests, in the order they were first seen
my (@files, @digests);

# %sizes      : size            -> number of files having that size
# %inodes     : size  -> n      -> inode of the n-th file of that size
# %files      : size  -> n      -> name  of the n-th file of that size
# %info       : name            -> formatted size and modification time
# %digests    : inode           -> MD5 hex digest of that inode's content
# %copies     : digest          -> number of files sharing that digest
# %duplicates : digest -> n     -> name of the n-th file with that digest
my (%sizes, %inodes, %files, %info, %digests, %copies, %duplicates);

#
# option variables with default value (false)
#
my $rm_f = 0;   # -f or -y : remove duplicates without prompting
my $rm_i = 0;   # -i       : accepted, but not consulted anywhere below
my $verb = 0;   # -v       : name each file on STDERR as it is summed
my $tick = 0;   # -t       : show a spinner while summing

# Every flag may be repeated ('+' makes it a counter).  The usage text
# lists all five flags, including -t.
GetOptions (
    'f+' => \$rm_f,
    'y+' => \$rm_f,
    'i+' => \$rm_i,
    't+' => \$tick,
    'v+' => \$verb
    ) || die "Usage: $0 -[f|y|i|t|v]\n";

# Collect the regular files of the current directory, sorted by name.
# Nothing fills @files before this point, so the scan always runs; the
# guard is kept so that a pre-populated list would be respected.
unless (@files) {
    # Lexical directory handle, closed as soon as the listing is read.
    opendir(my $dh, ".") || die "Can't open directory: $!\n";
    # The former extra grep on /./ accepted every name readdir() can
    # realistically return and has been dropped.
    @files = sort grep { -f $_ } readdir($dh);
    closedir($dh);
}

# Month abbreviations, indexed by the month number used in the listing.
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# Frames of the progress spinner, in display order.
my @tick = ('|', '/', '-', "\\");

# Flush STDOUT after every write so the spinner and the prompts show up
# immediately.
$| = 1;

# Pass 1: group the candidate files by size and remember, for each file,
# its inode and a printable "size date time year" summary.
for my $file (@files) {

    next if (-l $file ); # skip symlinks

    # Only the inode, the size and the modification time are needed.
    my ($inode, $size, $mtime) = (stat($file))[1, 7, 9];

    # stat() returns an empty list on failure (for instance when the file
    # was removed after the directory was read).  Skip such files instead
    # of recording undefined sizes and inodes.
    unless (defined $size) {
        warn "Can't stat $file: $!\n";
        next;
    }

    # Time::localtime's localtime() returns an object with named fields.
    my $time = localtime($mtime);

    my $nth = ++$sizes{$size}; # counting the files of this size
    $inodes{$size}{$nth} = $inode;
    $files{$size}{$nth}  = $file;

    $info{$file} = sprintf "%13i %3s %-2i %02i:%02i:%02i %4i",
        $size, $months[$time->mon], $time->mday,
        $time->hour, $time->min, $time->sec,
        1900 + $time->year;

}

# Pass 2: for every size shared by at least two files, compute the MD5
# digest of each distinct inode and group the file names by digest.
# Sizes are visited from largest to smallest.
my $count = 0; # index of the next spinner frame
for my $size (sort {$b <=> $a} keys %sizes) {

    # A size that occurs only once cannot have a duplicate.
    next unless ($sizes{$size} > 1);

    for my $i (1..$sizes{$size}) {

        my ($inode, $file) = ($inodes{$size}{$i}, $files{$size}{$i});

        # Don't bother to sum the same inode more than once
        next if ($digests{$inode});

        print STDERR "Summing: $file", $/ if ($verb);
        if ($tick) {
            print "\r", $tick[$count];
            $count = ($count + 1) % @tick;
        }

        # Three-argument open: the name comes straight from the directory
        # and must never be interpreted as a mode or a pipe command, nor
        # have surrounding whitespace stripped.
        open(my $fh, '<', $file) || die "Can't read $file: $!\n";
        binmode($fh);
        # Stream the content through the digest instead of loading every
        # line into memory; the resulting hex digest is the same.
        my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);

        $digests{$inode} = $digest;
        # Record each digest once, in first-seen order, and number the
        # files that share it starting from 1.
        push(@digests, $digest) unless($copies{$digest}++);
        $duplicates{$digest}{$copies{$digest}} = $file;

    }

}

# Wipe the last spinner frame so that it does not prefix later output.
print "\r \r" if ($tick);

# Pass 3: list every group of identical files and offer to remove its
# members, always leaving at least one copy in place.

# Prompts carry no newline, so STDOUT must be flushed on every write.
$| = 1;

for my $digest (@digests) {

    next unless ($copies{$digest} > 1);

    # Members of the group, in the order in which they were summed.
    my @duplicates = map { $duplicates{$digest}{$_} } 1..$copies{$digest};

    print "$info{$_} $_\n" for @duplicates;

    # Starts at (group size - 1) and drops by one per successful removal;
    # the loop ends right after that many files are gone, so one copy
    # always survives.
    my $files = $#duplicates;

    for my $file (@duplicates) {

        my $key;

        if ($rm_f) {

            # -f / -y: answer yes without asking.
            $key = "y";

        } else {

            print "Remove '$file'? [y|N] ";

            do { $key = readkey() } until defined $key;
            chomp($key);

        }

        if ($key =~ /^y$/i) {

            if (unlink($file)) {

                print "removed `$file`\n";
                last if ($files-- < 2);

            } else {

                # Report the failure and do not count the file as removed,
                # since the copy still exists.
                my $why = $!;
                print "\n" unless ($rm_f); # finish the prompt line
                warn "Can't remove $file: $why\n";

            }

        } elsif ($key =~ /^\003$/i) {

            # A literal Ctrl-C byte from the keyboard: stop quietly.
            die "\n";

        } else {

            # Any other key means "no"; finish the prompt line.
            print "\n";

        }

    }

}