#!/usr/bin/env python

# Copyright Jeff turbo Deifik 2003, 2004, 2005 All rights reserved
# $Id: md5gen.py,v 1.22 2012-05-25 21:09:11 jdeifik Exp $
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details. 
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Jul-25-2003	JTD	Wrote
# Jun-27-2005	JTD	Fixed bug that processes each dir twice
# Jun-28-2005	JTD	Made multithreaded, large potential speedup
# Jun-29-2005	JTD	Wait for all threads to finish before exiting

from __future__ import print_function

# Usage/help text; passed to jefflib.Usage() by the command line handling.
desc = [
    'Usage: md5gen.py [-h] [-v] [-t# or --threads=#] root-dir',
    '-h     prints help message',
    '-v     verbose mode',
    '-t#	use # threads to speed up the md5 calculations',
    'Recurse through all directories starting at root directory root-dir',
    'Generate a md5 file, called jtd_MD5SUM of all plain text files',
    'If there is an existing md5 file, diff the current sums with the old ones',
    'Rename the existing md5 file (if found) to old_jtd_MD5SUM',
]


import collections
import difflib
import hashlib
import os
import sys
import threading
# import _thread
import time
from optparse import OptionParser

import jefflib


# Constants
BLOCKSIZE = 1024*1024
CKNAME	= "jtd_MD5SUM"
OLD_CKNAME = "old_" + CKNAME


# Globals
thread_lock = []


def debug_print(str):
    """Tracing hook that is currently switched off.

    The message is ignored and None is returned.  To trace execution,
    print the message from here (for example next to a timestamp).
    """
    return None


def verbose_print(*bs):
    """Print the arguments on one line, but only when Options.verbose is set.

    Every argument is followed by a single space, and the line is ended
    with a newline.  Nothing at all is printed when verbose mode is off.
    """
    if not Options.verbose:
        return

    for item in bs:
        print(item, end=' ')
    print()


def find_a_thread_to_run():
    """Reserve a free worker slot and return its number.

    Each slot in [0 .. Options.threads - 1] is guarded by the lock at the
    same index of thread_lock.  The first lock that can be taken without
    blocking is left held (marking the slot busy) and its index is returned.

    When every slot is busy the call sleeps for one second and scans again,
    so it only returns once a slot has been reserved.
    """
    while True:
        for slot in range(Options.threads):
            # Non-blocking attempt: True means the slot was free and is now ours.
            if thread_lock[slot].acquire(False):
                return slot

        # Didn't find a thread to run, so sleep for a second, and try again.
        # The sleep must only happen here, after a full unsuccessful scan.
        time.sleep(1)


def wait_for_all_threads_to_finish():
    """Block until every worker slot lock has been acquired.

    Needed before program exit.  A slot's lock is held while a directory is
    being processed in that slot and is released by process_the_directory
    when it is done, so getting all Options.threads locks with blocking
    acquires means no worker is still running.

    The locks stay held when this returns; call it only once, at shutdown.
    """
    for slot in range(Options.threads):
        thread_lock[slot].acquire(True)


def _md5_lines_for_directory(dir_name, plain_file_list):
    """Return the sorted checksum lines for the plain files of dir_name."""
    # This loop is where almost all the cpu time is spent
    debug_print('top of compute loop')
    md5list = []
    for plain in plain_file_list:
        # Skip processing md5sum and old md5sum files
        if plain in (CKNAME, OLD_CKNAME):
            continue
        md5str = jefflib.Md5sum_On_File(dir_name + '/' + plain, plain)
        md5list.append(md5str + '\n')   # Add a newline, append to the list
    debug_print('bottom of compute loop')

    md5list.sort()                      # Put it in sorted form
    debug_print('sorted')
    return md5list


def _print_differences(old_text, md5list):
    """Print the added, removed and hint lines of a difflib comparison."""
    debug_print('calling differ')
    differ = difflib.Differ()
    debug_print('calling compare')
    result = list(differ.compare(old_text, md5list))
    print("Differences are:")
    for r in result:
        # Lines common to both lists carry a blank marker and are not shown.
        if r.startswith(('+', '-', '?')):
            print(r)


def _write_md5_file(full_ckname, md5list):
    """Write the checksum lines to the md5sum file full_ckname."""
    debug_print('calling list to file')
    jefflib.List_To_File(full_ckname, md5list)
    debug_print('called list to file')


def _update_directory_checksums(dir_name):
    """Compute, compare and (re)write the md5sum file of one directory."""
    (unused, plain_file_list) = jefflib.Plain_Files_In_Directory(dir_name)
    if not plain_file_list:
        return                          # No plain files, nothing to record

    full_ckname = dir_name + '/' + CKNAME
    full_old_ckname = dir_name + '/' + OLD_CKNAME
    md5list = _md5_lines_for_directory(dir_name, plain_file_list)

    if CKNAME not in plain_file_list:   # No md5sum file yet: create one
        print("directory", dir_name, " missing md5sum file !!!!")
        _write_md5_file(full_ckname, md5list)
        return

    # Now, read in existing md5sum file
    # NOTE(review): the comparison below presumes File_To_List returns the
    # lines with their trailing newline, like md5list - confirm in jefflib.
    verbose_print("found md5sum file")
    old_text = jefflib.File_To_List(full_ckname)
    old_text.sort()                     # Sort it

    if md5list == old_text:
        verbose_print("md5sum's match")
        return

    # There are differences
    print("Differences Found !!!! directory is: ", dir_name)
    _print_differences(old_text, md5list)

    if OLD_CKNAME in plain_file_list:
        os.remove(full_old_ckname)      # Remove old file
    os.rename(full_ckname, full_old_ckname)     # Rename old md5sum file
    _write_md5_file(full_ckname, md5list)       # Write the new md5sum file


def process_the_directory(dir_name, thread_num):
    """Do the md5sum computation and comparison for one directory.

    dir_name   -- full path of the directory to process
    thread_num -- worker slot this call runs in; when Options.threads > 1
                  the lock thread_lock[thread_num] is released at the end

    A checksum line is computed for every plain file in the directory except
    the md5sum files themselves.  If the directory has no md5sum file, one is
    written.  If it has one whose sorted content differs from the new sums,
    the differences are printed, the existing file is renamed to OLD_CKNAME
    (replacing any earlier one) and a new md5sum file is written.
    Directories without plain files are left alone.

    Exceptions raised while processing are propagated, but the slot lock is
    released first so that waiting for the workers cannot hang on a slot
    whose worker has died.
    """
    try:
        verbose_print("TOP OF PROCESS_THE_DIRECTORY", dir_name,
                      " thread_num ", thread_num)
        debug_print("TOP OF PROCESS_THE_DIRECTORY" + dir_name)
        _update_directory_checksums(dir_name)
    finally:
        if Options.threads > 1:
            thread_lock[thread_num].release()


# Top level code

# Command line parsing code
Parser = OptionParser()
Parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                  default=False, help="enable verbose messages")
Parser.add_option("-t", "--threads", type="int", nargs=1, dest="threads",
                  default=1, help="specify number of threads to use")
Parser.add_option("-u", "--usage", dest="use", action="store_true",
                  help="Print usage and exit")
(Options, Args) = Parser.parse_args()

# NOTE(review): jefflib.Usage is presumed to terminate the program; if it
# returns, the_dirs is left undefined below - confirm in jefflib.
if Options.use:
    jefflib.Usage(desc)

if len(Args) == 1:
    # Queue of directories whose entries still have to be scanned.
    the_dirs = collections.deque(Args)
else:
    jefflib.Usage(desc)

# Reject a bad thread count with a normal command line error instead of an
# assert, which is skipped entirely when Python runs with -O.
if not 1 <= Options.threads < 10:
    Parser.error("--threads must be between 1 and 9")

# Make a lock for each thread.  Plain (non re-entrant) locks are required:
# a slot is locked by the main thread and unlocked by the worker thread.
for _ in range(Options.threads):
    thread_lock.append(threading.Lock())

# Directory names that are never descended into.
# JTD - skip recycler and system volume information... For windows only
SKIPPED_NAMES = frozenset([
    "RECYCLER", "$RECYCLE.BIN", "$RECYCLE.bin",
    "System Volume Information", "System_Volume_Information",
    "CVS", "cygdrive", "Diskeeper", "proc",
])


def _dispatch_directory(dir_name):
    """Process dir_name, in a worker thread when more than one is allowed."""
    if Options.threads > 1:
        # Blocks until a worker slot is free; the slot's lock is then held
        # and is released by process_the_directory when it finishes.
        slot = find_a_thread_to_run()
        worker = threading.Thread(target=process_the_directory,
                                  args=(dir_name, slot))
        worker.start()
    else:
        process_the_directory(dir_name, 0)


#
# Scan a directory tree, looking for md5sum files in each directory that
# contains at least one normal file. If there isn't one, generate one.
#

# This processes the root directory
if os.path.isdir(the_dirs[0]):
    _dispatch_directory(the_dirs[0])

# This processes all except the root directory
while the_dirs:
    current_dir = the_dirs.popleft()

    try:
        names = os.listdir(current_dir)
    except OSError:
        # Unreadable or vanished directory (or not a directory): skip it.
        continue

    for name in names:
        if name in SKIPPED_NAMES:
            continue

        # Avoid a doubled slash when scanning the filesystem root.
        if current_dir == '/':
            path = '/' + name
        else:
            path = current_dir + '/' + name

        if os.path.isdir(path):
            the_dirs.append(path)
            _dispatch_directory(path)

if Options.threads > 1:
    wait_for_all_threads_to_finish()