From 734a807df22cc085a4f711964cffae623fe648f1 Mon Sep 17 00:00:00 2001 From: Ludovic Pouzenc Date: Tue, 14 Jul 2015 21:45:37 +0200 Subject: find_files : load all ref file in memory (in a dict) to find every matching sector with one read and one dict access per sector --- mytasks.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/mytasks.py b/mytasks.py index 80b04ed..dcb53a8 100644 --- a/mytasks.py +++ b/mytasks.py @@ -27,19 +27,25 @@ def do_find_files(d,state): state['state'] = 'initializing' ref_paths = state['filepaths'] ref_count = len(ref_paths) - ref_fds = [None]*ref_count - ref_sizes = [None]*ref_count - ref_offset = [None]*ref_count - ref_cur_sect = [None]*ref_count + ref_big_hash = {} for ref_no in range(ref_count): path = state['filepaths'][ref_no] - logging.debug("Try to open reffile '%s'"%path) - ref_offset[ref_no] = 0 - ref_sizes[ref_no] = os.lstat(path).st_size - ref_fds[ref_no] = open(path, "r") - ref_fds[ref_no].seek(0) - ref_cur_sect[ref_no] = ref_fds[ref_no].read(512) - logging.debug("Opened reffile '%s'"%path) + logging.debug("Try to open ref. file '%s'"%path) + with open(path, 'rb') as fd: + logging.info("Loading ref. file '%s'"%path) + while True: + ref_offset = fd.tell() + data = fd.read(512) + if not data: + break + if data == '\0'*512: + logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset)) + elif data in ref_big_hash: + (prev_ref_no, prev_ref_offset) = ref_big_hash[data] + logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"% + (prev_ref_no, prev_ref_offset, ref_no, ref_offset)) + else: + ref_big_hash[data] = (ref_no, ref_offset) start = 0 end = min(d.disks_size) @@ -54,14 +60,12 @@ def do_find_files(d,state): for disk_no in range(d.disk_count): d.disks[disk_no].seek(offset) data = d.disks[disk_no].read(512) - for ref_no in range(ref_count): - #TODO : gerer les buffers plus court que 512 - if data == ref_cur_sect[ref_no]: - #TODO : chercher la suite du fichier + if data in ref_big_hash: f = state['found'] if len(f) < 200: # TODO agreger les matches - f.append((ref_paths[ref_no],ref_offset[ref_no],disk_no,offset)) + (ref_no, ref_offset) = ref_big_hash[data] + f.append((ref_no,ref_offset,disk_no,offset)) state['found'] = f else: state['state'] = 'aborted' @@ -69,6 +73,7 @@ def do_find_files(d,state): if offset % one_percent == 0: state['progress'] = state['progress'] + 1 + ref_big_hash.clear() state['state'] = 'finished' state['progress'] = 100 except Exception as e: -- cgit v1.2.3