From 734a807df22cc085a4f711964cffae623fe648f1 Mon Sep 17 00:00:00 2001
From: Ludovic Pouzenc <lpouzenc@gmail.com>
Date: Tue, 14 Jul 2015 21:45:37 +0200
Subject: find_files: load all ref files in memory (in a dict) to find every
 matching sector with one read and one dict access per sector

---
 mytasks.py | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/mytasks.py b/mytasks.py
index 80b04ed..dcb53a8 100644
--- a/mytasks.py
+++ b/mytasks.py
@@ -27,19 +27,25 @@ def do_find_files(d,state):
         state['state'] = 'initializing'
         ref_paths = state['filepaths']
         ref_count = len(ref_paths)
-        ref_fds = [None]*ref_count
-        ref_sizes = [None]*ref_count
-        ref_offset = [None]*ref_count
-        ref_cur_sect = [None]*ref_count
+        ref_big_hash = {}
         for ref_no in range(ref_count):
             path = state['filepaths'][ref_no]
-            logging.debug("Try to open reffile '%s'"%path)
-            ref_offset[ref_no] = 0
-            ref_sizes[ref_no] = os.lstat(path).st_size
-            ref_fds[ref_no] = open(path, "r")
-            ref_fds[ref_no].seek(0)
-            ref_cur_sect[ref_no] = ref_fds[ref_no].read(512)
-            logging.debug("Opened reffile '%s'"%path)
+            logging.debug("Try to open ref. file '%s'"%path)
+            with open(path, 'rb') as fd:
+                logging.info("Loading ref. file '%s'"%path)
+                while True:
+                    ref_offset = fd.tell()
+                    data = fd.read(512)
+                    if not data:
+                        break
+                    if data == '\0'*512:
+                        logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+                    elif data in ref_big_hash:
+                        (prev_ref_no, prev_ref_offset) = ref_big_hash[data]
+                        logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+                                (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+                    else:
+                        ref_big_hash[data] = (ref_no, ref_offset)
 
         start = 0
         end = min(d.disks_size)
@@ -54,14 +60,12 @@ def do_find_files(d,state):
             for disk_no in range(d.disk_count):
                 d.disks[disk_no].seek(offset)
                 data = d.disks[disk_no].read(512)
-                for ref_no in range(ref_count):
-                    #TODO : gerer les buffers plus court que 512
-                    if data == ref_cur_sect[ref_no]:
-                        #TODO : chercher la suite du fichier
+                if data in ref_big_hash:
                         f = state['found']
                         if len(f) < 200:
                             # TODO agreger les matches
-                            f.append((ref_paths[ref_no],ref_offset[ref_no],disk_no,offset))
+                            (ref_no, ref_offset) = ref_big_hash[data]
+                            f.append((ref_no,ref_offset,disk_no,offset))
                             state['found'] = f
                         else:
                             state['state'] = 'aborted'
@@ -69,6 +73,7 @@ def do_find_files(d,state):
             if offset % one_percent == 0:
                 state['progress'] = state['progress'] + 1
 
+        ref_big_hash.clear()
         state['state'] = 'finished'
         state['progress'] = 100
     except Exception as e:
-- 
cgit v1.2.3