#!/usr/bin/env python3
# $Id: de_dupe.py,v 1.52 2025/06/27 18:50:22 jdeifik Exp $
# de_dupe.py - deduplicates a filesystem, based on duplicate files
# Copyright Jeff turbo Deifik 2010.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 2 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Oct-09-2024 JTD Added cmp_files which calls cmp(1) for big files
#                 This runs at native performance, not python performance

import os
import sys
import stat
import argparse
import filecmp
import subprocess  # for cmp_files
from collections.abc import Callable
# Explicit imports instead of "from typing import *": these are the only
# typing names this file uses, and the wildcard hid that.
from typing import Dict, List, Tuple, TypeVar

import jefflib
from jefflib import Progress_Indicator

T = TypeVar("T")

# Usage text, printed by jefflib.Usage()
desc: List[str] = [
    'Name: de_dupe - takes in one or more directory trees',
    'directories. Looks for files in the directory(s) that are duplicates',
    'Usage: de_dupe.py [--detail=#] [--minsize=#] [--maxsize=#]',
    ' [--big or --small] ] directories...',
    '',
    'example: de_dupe.py --detail=# --minsize=1000000 foo-dir ...',
    '',
    '--detail=# Specify the level of detail of output',
    '0=no output, 2=output every 100, 4=bytes saved per file, 6=verbose',
    '--minsize=# Specify the minimum file size to consider',
    '--maxsize=# Specify the maximum file size to consider',
    '-b, --big process files from big to small',
    '-s --small process files from small to big',
    '-u, --usage Print usage and exit'
]

# global constants
filecmp.BUFSIZE = 64 * 1024     # Bigger buffer size for cmp
log_interval = 100              # log output every 100 files
log_size = 100_000_000_000      # log output every 100 gig
CMP_SIZE = 10_000_000           # Call cmp(1), not python filecmp, above this size


# used by the jefflib.Uniq function
def tuple_cmp(a: tuple, b: tuple) -> int:
    """Three-way compare of two (name, inode) tuples by their second element.

    Returns 1 if a's inode is greater, -1 if smaller, 0 if equal.
    """
    if a[1] > b[1]:
        return 1
    elif a[1] < b[1]:
        return -1
    else:
        return 0
# Looks through source, trying to find a match inside of it.
# If found, temporarily rename file, make a hard link, then delete renamed file.
def look_through_one_table(source: dict, pro_ind: "Progress_Indicator") -> None:
    """Scan the size->files table and hard-link duplicate files together.

    source maps a file size (int) to a list of (file_name, inode) tuples;
    file names are bytes because the program runs stdout and all paths in
    binary mode.  Only files of equal size can be duplicates, so each size
    bucket is examined independently.  Fixed here: the original annotation
    dict[Tuple[str, int]] described neither the key nor the value type.
    """
    bytes_saved = 0     # Total bytes recovered so far, used for logging
    print_size = 0      # Size at which progress was last printed
    l_size = 0          # Bytes saved since the last log line
    l_interval = 0      # Matches made since the last log line
    pro_ind.Reset()
    sys.stdout.write(b'\nLooking for duplicates\n')
    sys.stdout.flush()
    if Opts.big:
        sorted_list = list(sorted(source.keys(), reverse=True))
    else:
        sorted_list = list(sorted(source.keys()))
    # loop through file sizes that are the same
    for size in sorted_list:
        source_files = source[size]     # List of all files with same size
        # Sort by inode
        s_source_files = sorted(source_files, key=lambda foo: foo[1])
        # Uniqify based on inode; same-inode entries are already hard links
        u_source_files = jefflib.Uniq(s_source_files, lcmp=tuple_cmp)
        if len(u_source_files) < 2:     # Can't match if less than 2 files
            continue
        if the_logging(source, size, bytes_saved, print_size,
                       l_interval > log_interval, l_size >= log_size):
            print_size = size
            l_interval = 0
            l_size = 0
        # Compute md5sum on first block of each file, return is sorted by md5sum
        m_source_files = md5sum_s(u_source_files)
        leng = len(m_source_files)
        for i in range(leng - 1):   # Check all files of the same size
            f = m_source_files[i]
            # filecmp.cmp uses a cache, which can run out of memory,
            # therefore I am clearing it periodically
            filecmp.clear_cache()
            # Against all other files of the same size, after index i
            for ff in m_source_files[(i + 1):]:
                # If the md5sum's don't match, we are done (as they are sorted)
                if f[1] != ff[1]:
                    break
                res = core(size, bytes_saved, f[0], ff[0], pro_ind)
                if res == 2:
                    bytes_saved += size
                    l_size += size
                    l_interval += 1
        del source[size]    # Save some memory


# possibly log some output from look_through_one_table
def the_logging(source, size, bytes_saved, print_size, do_interval,
                do_log_size) -> bool:
    """Print a progress line when enough work has happened since the last one.

    Returns True when a line was printed (caller then resets its counters).
    A line is printed when detail >= 2 and any trigger fires: first call,
    interval/byte counters exceeded, or the current size has moved a factor
    of two past the last printed size (halved when walking big-to-small,
    doubled when walking small-to-big).
    """
    if Opts.detail >= 2 and (print_size == 0 or do_interval or do_log_size or
                             (Opts.big and size < print_size / 2) or
                             ((not Opts.big) and size > print_size * 2)):
        # Count how many files are still ahead of us in traversal order
        if Opts.big:
            tot = count_lambda(source, lambda x: x <= size)
        else:
            tot = count_lambda(source, lambda x: x >= size)
        s = 'Size ' + jefflib.Int_With_Comma_Sep(size) + ', ' + \
            'Left ' + jefflib.Int_With_Comma_Sep(tot) + ', ' + \
            'Bytes ' + jefflib.Int_With_Comma_Sep(bytes_saved) + '\n'
        sys.stdout.write(jefflib.String_to_Bytes(s))
        sys.stdout.flush()
        return True
    return False


# the core file comparison and linking code
# Return 0 if files don't match.
# Return 1 if files match, but both already had multiple hard links
# Return 2 if files match, and file_a or file_b had link count 1 (disk space saved)
def core(siz: int, byt_sav: int, file_a: bytes, file_b: bytes,
         pro_ind: "Progress_Indicator") -> int:
    """Compare two same-sized files and hard-link them together if identical.

    Paths are bytes (the whole program works in binary mode).  The file with
    the higher link count is kept; the other is replaced by a link to it.
    """
    stat_a = os.stat(file_a)
    stat_b = os.stat(file_b)
    links_a = stat_a[stat.ST_NLINK]
    links_b = stat_b[stat.ST_NLINK]
    ret = 0
    # Verify inodes differ - the same inode is already the same file
    if stat_a[stat.ST_INO] != stat_b[stat.ST_INO]:
        pro_ind.P_I()
        # Call cmp(1), as it is faster for big files; shallow=False forces
        # a full content compare in filecmp
        res = cmp_files(file_a, file_b) if siz > CMP_SIZE \
            else filecmp.cmp(file_a, file_b, False)
        if res:
            # Make link to file having the highest link count
            if links_b >= links_a:
                res = file_to_link(file_a, file_b)
            else:
                res = file_to_link(file_b, file_a)
            if res:
                ret = 1
                if Opts.detail >= 6:
                    # Fixed: separate the two names so the log is readable
                    sys.stdout.write(b'Files match ' + file_a + b' ' +
                                     file_b + b'\n')
                if links_a == 1 or links_b == 1:
                    # Woo-hoo disk space is saved!
                    ret = 2
                    if Opts.detail >= 4:
                        pro_ind.Reset()
                        sys.stdout.write(jefflib.String_to_Bytes(
                            jefflib.Int_With_Comma_Sep(siz + byt_sav) + '\n'))
                        sys.stdout.flush()
    return ret


# rename link_file to something clever,
# make a hard link named link_file to keep_file
# delete renamed file
def file_to_link(link_file: bytes, keep_file: bytes) -> bool:
    """Replace link_file with a hard link to keep_file; True on success.

    link_file is first renamed out of the way so nothing is lost if the
    link step fails.  All names are bytes.
    """
    (head, tail) = os.path.split(link_file)
    # If the file doesn't have a path, then just use the file name, with no path
    if head:
        tmp_name = head + b'/de_dupe-' + tail
    else:
        tmp_name = b'de_dupe-' + tail
    ret = False
    # Rename link_file to tmp_name
    if jefflib.Try_To_Rename(link_file, tmp_name, verbose=False):
        # Make the hard link
        if jefflib.Try_To_Link(keep_file, link_file, verbose=False):
            # Delete the renamed link_file
            if jefflib.Try_To_Remove(tmp_name, verbose=False):
                ret = True
    return ret


# Generated by chatgpt
def cmp_files(file1, file2):
    """Return True when file1 and file2 have identical contents, via cmp(1)."""
    try:
        # Execute the cmp command and suppress output (-s)
        result = subprocess.run(['cmp', '-s', file1, file2], check=False)
        return result.returncode == 0
    except Exception as e:
        # Fixed: main rebinds sys.stdout to its binary buffer, so print()ing
        # a str there would raise TypeError; report the error on stderr.
        print(f"Error comparing files: {e}", file=sys.stderr)
        return False


# Compute md5sum on first block of each file
# Return list of (file_name, md5sum) tuples, sorted by md5sum
def md5sum_s(tuples: list) -> list:
    """Map (name, inode) tuples to (name, md5-of-first-block), sorted by md5.

    Fixed: the original annotation List[str] did not match the tuples
    actually returned.  The md5 value type is whatever
    jefflib.Md5sum_On_First_Block_Of_File returns; it only needs to sort.
    """
    new_lis = []
    for tup in tuples:
        md = jefflib.Md5sum_On_First_Block_Of_File(tup[0])
        new_lis.append((tup[0], md))
    # Sort by md5sum so equal checksums end up adjacent
    return sorted(new_lis, key=lambda foo: foo[1])


# Return count of elements in the dictionary whose key satisfies lam.
# Each dict value is a list; only its length is used.
def count_lambda(dic: dict, lam: Callable) -> int:
    # Add number of elements for each value, when lam is true for the key
    return sum(len(v) for k, v in dic.items() if lam(k))


# Top level code
if __name__ == '__main__':
    # Command line parsing code
    Parser = argparse.ArgumentParser()
    Parser.add_argument("--minsize", dest="minsize", action="store", type=int,
                        default=256, help="specify min size")
    Parser.add_argument("--maxsize", dest="maxsize", action="store", type=int,
                        default=0, help="specify max size")
    Parser.add_argument("--detail", dest="detail", action="store", type=int,
                        default=2, help="specify detail level")
    Parser.add_argument("-b", "--big", dest="big", action="store_true",
                        help="big to small", default=True)
    Parser.add_argument("-s", "--small", dest="big", action="store_false",
                        help="small to big")
    Parser.add_argument("-u", "--usage", dest="use", action="store_true",
                        default=False, help="Print usage and exit")
    Parser.add_argument("args", nargs='*', help="the real arguments")
    Opts = Parser.parse_args()
    if Opts.use:
        jefflib.Usage(desc)
    if len(Opts.args) == 0:
        print("Need to specify one or more directories", file=sys.stderr)
        jefflib.Usage(desc)
    # Make args into binary strings
    dirs = jefflib.List_String_to_Bytes(Opts.args)
    sys.stdout = sys.stdout.buffer  # Put stdout in binary mode
    # Must create this object after setting sys.stdout to binary
    progress_ind = Progress_Indicator(1000, 1)
    try:
        the_table = jefflib.files_and_sizes(dirs, Opts.minsize, Opts.maxsize,
                                            progress_ind.P_I,
                                            ignore_links=False, bin=True)
        look_through_one_table(the_table, progress_ind)
    except KeyboardInterrupt:
        sys.stdout.write(b'\nKeyboard interrupt occurred\n')
        sys.exit(1)