#!/usr/bin/env python # Copyright Jeff turbo Deifik 2003, 2004, 2005 All rights reserved # $Id: md5gen.py,v 1.22 2012-05-25 21:09:11 jdeifik Exp $ # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # # Jul-25-2003 JTD Wrote # Jun-27-2005 JTD Fixed bug that processes each dir twice # Jun-28-2005 JTD Made multithreaded, large potential speedup # Jun-29-2005 JTD Wait for all threads to finish before exiting from __future__ import print_function desc = [ 'Usage: md5gen.py [-h] [-v] [-t# or --threads=#] root-dir', '-h prints help message', '-v verbose mode', '-t# use # threads to speed up the md5 calculations', 'Recurse through all directories starting at root directory root-dir', 'Generate a md5 file, called jtd_MD5SUM of all plain text files', 'If there is an existing md5 file, diff the current sums with the old ones', 'Rename the existing md5 file (if found) to old_jtd_MD5SUM' ] import os import sys import hashlib import difflib # import _thread import time from optparse import OptionParser import jefflib # Constants BLOCKSIZE = 1024*1024 CKNAME = "jtd_MD5SUM" OLD_CKNAME = "old_" + CKNAME # Globals thread_lock = [] def debug_print(str): return # print (time.clock(), 'DEBUG', str) # Print output if verbose is set def verbose_print(*bs): if Options.verbose: for b in bs: print(b, end=' ') print() # Find a thread that can be executed. # return the number of the thread [0..Options.threads-1] def find_a_thread_to_run(): thread_found = False i = 0 while not thread_found: for i in range(Options.threads): b = thread_lock[i].acquire(0) if b: thread_found = True break # Didn't find a thread to run, so sleep for a second, and try again. time.sleep(1) assert(thread_found) return i # Needed before program exit. Locks (with blocking) eack thread. # Since there is a blocking lock, this will wait to acquire all locks # before returning, which means all threads are done. def wait_for_all_threads_to_finish(): for i in range(Options.threads): b = thread_lock[i].acquire(1) # Process a directory do the md5sum computation and comparison # Take in a full directory path... def process_the_directory(dir_name, thread_num): verbose_print("TOP OF PROCESS_THE_DIRECTORY", dir_name, " thread_num ", thread_num) debug_print("TOP OF PROCESS_THE_DIRECTORY" + dir_name) md5list = [] (unused, plain_file_list) = jefflib.Plain_Files_In_Directory(dir_name) full_ckname = dir_name + '/' + CKNAME full_old_ckname = dir_name + '/' + OLD_CKNAME if len(plain_file_list) > 0 : # This loop is where almost all the cpu time is spent debug_print('top of compute loop') for plain in plain_file_list : # Skip processing md5sum and old md5sum files if (plain == CKNAME) or (plain == OLD_CKNAME) : continue md5str = jefflib.Md5sum_On_File(dir_name + '/' + plain, plain) # Add a newline md5list.append(md5str + '\n') # Append it to the md5list debug_print('bottom of compute loop') md5list.sort() # Put it in sorted form debug_print('sorted') # This is a big lock for all normal output generation # io_lock.acquire(1) if CKNAME in plain_file_list : # Is there a md5sum file? # Now, read in existing md5sum file verbose_print("found md5sum file") old_text = jefflib.File_To_List(full_ckname) old_text.sort() # Sort it if cmp(md5list, old_text) != 0: # There are differences print("Differences Found !!!! directory is: ", dir_name) debug_print('calling differ') d = difflib.Differ() debug_print('calling compare') result = list(d.compare(old_text, md5list)) print("Differences are:") for r in result: if r[0] == '+' or r[0] == '-' or r[0] == '?': print(r) # print("Old md5sum is: ", old_text) # print("New md5sum is: ", md5list) if OLD_CKNAME in plain_file_list: os.remove(full_old_ckname) # Remove old file # Rename old md5sum file os.rename(full_ckname, full_old_ckname) # Print out the new md5sum file debug_print('calling list to file') jefflib.List_To_File(full_ckname, md5list) debug_print('called list to file') else : verbose_print("md5sum's match") else : print("directory", dir_name, " missing md5sum file !!!!") # Print out the new md5sum file debug_print('calling list to file') jefflib.List_To_File(full_ckname, md5list) debug_print('called list to file') # io_lock.release() if Options.threads > 1: thread_lock[thread_num].release() # Top level code # Command line parsing code Parser = OptionParser() Parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="enable verbose messages") Parser.add_option("-t", "--threads", type="int", nargs=1, dest="threads", default=1, help="specify number of threads to use") Parser.add_option("-u", "--usage", dest="use", action="store_true", help="Print usage and exit") (Options, Args) = Parser.parse_args() if Options.use: jefflib.Usage(desc) if len(Args) == 1: the_dirs = Args else: jefflib.Usage(desc) # io_lock = _thread.allocate_lock() # Make a lock for each thread assert (Options.threads >= 1) assert(Options.threads < 10) # for I in range(Options.threads): # thread_lock.append(_thread.allocate_lock()) # # Scan a directory tree, looking for md5sum files in each directory that # contains at least one normal file. If there isn't one, generate one. # # This processes the root directory if os.path.isdir(the_dirs[0]): if Options.threads > 1: assert(False) i = find_a_thread_to_run() # _thread.start_new_thread(process_the_directory, (the_dirs[0], i)) else: process_the_directory(the_dirs[0], 0) while len(the_dirs) > 0: # This processes all except the root directory try: names = os.listdir(the_dirs[0]) except: the_dirs.pop(0) continue for name in names: # JTD - skip recycler and system volume information... For windows only if name in ["RECYCLER", "$RECYCLE.BIN", "$RECYCLE.bin", "System Volume Information", "System_Volume_Information", "CVS", "cygdrive", "Diskeeper", "proc"]: continue if the_dirs[0] == '/': File = '/' + name else: File = the_dirs[0] + '/' + name if os.path.isdir(File): the_dirs.append(File) if Options.threads > 1: I = find_a_thread_to_run() _thread.start_new_thread(process_the_directory, (File, I)) else: process_the_directory(File, 0) the_dirs.pop(0) if Options.threads > 1: wait_for_all_threads_to_finish()