diff options
author | Ludovic Pouzenc <lpouzenc@gmail.com> | 2015-07-14 21:45:37 +0200 |
---|---|---|
committer | Ludovic Pouzenc <lpouzenc@gmail.com> | 2015-07-14 21:45:37 +0200 |
commit | 734a807df22cc085a4f711964cffae623fe648f1 (patch) | |
tree | 6c219f39bf5d780504e2ca18df5775376dbaefbc | |
parent | 259c938a7796f1aa4a8d2d9477aaed8bc72293e8 (diff) | |
download | raidguessfs-734a807df22cc085a4f711964cffae623fe648f1.tar.gz raidguessfs-734a807df22cc085a4f711964cffae623fe648f1.tar.bz2 raidguessfs-734a807df22cc085a4f711964cffae623fe648f1.zip |
find_files: load all ref files into memory (in a dict) to find every
matching sector with one read and one dict access per sector
-rw-r--r-- | mytasks.py | 37 |
1 files changed, 21 insertions, 16 deletions
@@ -27,19 +27,25 @@ def do_find_files(d,state): state['state'] = 'initializing' ref_paths = state['filepaths'] ref_count = len(ref_paths) - ref_fds = [None]*ref_count - ref_sizes = [None]*ref_count - ref_offset = [None]*ref_count - ref_cur_sect = [None]*ref_count + ref_big_hash = {} for ref_no in range(ref_count): path = state['filepaths'][ref_no] - logging.debug("Try to open reffile '%s'"%path) - ref_offset[ref_no] = 0 - ref_sizes[ref_no] = os.lstat(path).st_size - ref_fds[ref_no] = open(path, "r") - ref_fds[ref_no].seek(0) - ref_cur_sect[ref_no] = ref_fds[ref_no].read(512) - logging.debug("Opened reffile '%s'"%path) + logging.debug("Try to open ref. file '%s'"%path) + with open(path, 'rb') as fd: + logging.info("Loading ref. file '%s'"%path) + while True: + ref_offset = fd.tell() + data = fd.read(512) + if not data: + break + if data == '\0'*512: + logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset)) + elif data in ref_big_hash: + (prev_ref_no, prev_ref_offset) = ref_big_hash[data] + logging.info("Non-unique sector found in ref. 
files ('%s'@0x%011x and '%s'@0x%011x)"% + (prev_ref_no, prev_ref_offset, ref_no, ref_offset)) + else: + ref_big_hash[data] = (ref_no, ref_offset) start = 0 end = min(d.disks_size) @@ -54,14 +60,12 @@ def do_find_files(d,state): for disk_no in range(d.disk_count): d.disks[disk_no].seek(offset) data = d.disks[disk_no].read(512) - for ref_no in range(ref_count): - #TODO : gerer les buffers plus court que 512 - if data == ref_cur_sect[ref_no]: - #TODO : chercher la suite du fichier + if data in ref_big_hash: f = state['found'] if len(f) < 200: # TODO agreger les matches - f.append((ref_paths[ref_no],ref_offset[ref_no],disk_no,offset)) + (ref_no, ref_offset) = ref_big_hash[data] + f.append((ref_no,ref_offset,disk_no,offset)) state['found'] = f else: state['state'] = 'aborted' @@ -69,6 +73,7 @@ def do_find_files(d,state): if offset % one_percent == 0: state['progress'] = state['progress'] + 1 + ref_big_hash.clear() state['state'] = 'finished' state['progress'] = 100 except Exception as e: |