Diffstat (limited to 'mytasks.py')
-rw-r--r--  mytasks.py  118
1 file changed, 104 insertions, 14 deletions
diff --git a/mytasks.py b/mytasks.py
index dcb53a8..ab5d63f 100644
--- a/mytasks.py
+++ b/mytasks.py
@@ -24,7 +24,7 @@ import mydisks
def do_find_files(d,state):
logging.info("Enter do_find_files()")
try:
- state['state'] = 'initializing'
+ state['state'] = 'loading'
ref_paths = state['filepaths']
ref_count = len(ref_paths)
ref_big_hash = {}
@@ -33,18 +33,30 @@ def do_find_files(d,state):
logging.debug("Try to open ref. file '%s'"%path)
with open(path, 'rb') as fd:
logging.info("Loading ref. file '%s'"%path)
+ warn_empty = True
+ warn_dup = True
while True:
ref_offset = fd.tell()
data = fd.read(512)
if not data:
break
if data == '\0'*512:
- logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+ if warn_empty:
+ logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+ warn_empty = False
+ else:
+ logging.debug("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
elif data in ref_big_hash:
(prev_ref_no, prev_ref_offset) = ref_big_hash[data]
- logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
- (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+ if warn_dup:
+ logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+ (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+ warn_dup = False
+ else:
+ logging.debug("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+ (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
else:
+ # Hash the whole sector in memory and store its provenance (ref. file index and offset)
ref_big_hash[data] = (ref_no, ref_offset)
start = 0
@@ -55,29 +67,107 @@ def do_find_files(d,state):
state['found'] = []
state['progress'] = 0
+ state['raw_matches'] = 0
state['state'] = 'searching'
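+ # raw_matches collects one entry per matching sector; adjacent entries are merged during aggregation below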
+ raw_matches = []
for offset in range(start, end, 512):
for disk_no in range(d.disk_count):
d.disks[disk_no].seek(offset)
data = d.disks[disk_no].read(512)
if data in ref_big_hash:
- f = state['found']
- if len(f) < 200:
- # TODO aggregate the matches
- (ref_no, ref_offset) = ref_big_hash[data]
- f.append((ref_no,ref_offset,disk_no,offset))
- state['found'] = f
- else:
- state['state'] = 'aborted'
- raise Exception('Aborting after too many matches')
+ (ref_no, ref_offset) = ref_big_hash[data]
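+ # Record the raw match: which reference sector it is, and on which disk and offset it was seen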
+ a = {
+ 'ref_no': ref_no,
+ 'ref_offset': ref_offset,
+ 'disk_no': disk_no,
+ 'disk_offset': offset
+ }
+ raw_matches.append(a)
+ logging.info('raw_matches.append(%s)'%a)
+ state['raw_matches'] = len(raw_matches)
if offset % one_percent == 0:
state['progress'] = state['progress'] + 1
+ agg = state['found']
+
+ state['state'] = 'aggregating'
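+ # Merge adjacent raw matches into contiguous runs; loop until a full pass performs no merge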
+ found = True
+ while found:
+ found = False
+ i = 0
+ rm_len = len(raw_matches)
+ while not found and i < rm_len:
+ x = raw_matches[i]
+
+ # Try to find an aggregated item that ends just before our ref_offset
+ if not found:
+ for a in agg:
+ if a['ref_no'] == x['ref_no'] and x['ref_offset'] == a['ref_offset_end']:
+ a['ref_offset_end'] = x['ref_offset'] + 512
+ a['block_list'].append((x['disk_no'], x['disk_offset']))
+ x['consumed'] = True
+ found = True
+ break
+
+ # Try to find an aggregated item that starts just after our ref_offset
+ if not found:
+ for a in agg:
+ if a['ref_no'] == x['ref_no'] and x['ref_offset'] + 512 == a['ref_offset_start']:
+ a['ref_offset_start'] = x['ref_offset']
+ a['block_list'].insert(0,(x['disk_no'], x['disk_offset']))
+ x['consumed'] = True
+ found = True
+ break
+
+ # Try to find another match right before or after the current one to create an aggregate
+ if not found:
+ for x2 in raw_matches:
+ if x2 != x and x2['ref_no'] == x['ref_no'] and abs(x2['ref_offset'] - x['ref_offset']) == 512:
+ if x2['ref_offset'] > x['ref_offset']:
+ first = x
+ last = x2
+ else:
+ first = x2
+ last = x
+
+ a = {
+ 'ref_no': x['ref_no'],
+ 'ref_offset_start': first['ref_offset'],
+ 'ref_offset_end': last['ref_offset'] + 512,
+ 'block_list': [
+ (first['disk_no'], first['disk_offset']),
+ (last['disk_no'], last['disk_offset']),
+ ]
+ }
+ logging.info('agg.append(%s)'%a)
+ agg.append(a)
+ x['consumed'] = True
+ x2['consumed'] = True
+ found = True
+ break
+ # Orphan matches (no adjacent sector) stay in raw_matches and are never added to agg
+ i += 1
+ # end while ( x = raw_matches[i] )
+ # Remove all consumed items from raw_matches list before next While iteration
+ if found:
+ raw_matches = [m for m in raw_matches if 'consumed' not in m]
+ state['raw_matches'] = len(raw_matches)
+
+ # end while not found
+
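+ # Publish the aggregated matches only while they stay under the 200-entry cap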
+ if len(agg) < 200:
+ state['found'] = agg
+ state['state'] = 'searching'
+ else:
+ state['state'] = 'aborted'
+ raise Exception('Aborting after too many matches')
- ref_big_hash.clear()
state['state'] = 'finished'
state['progress'] = 100
except Exception as e:
logging.exception(e)
+ finally:
+ ref_big_hash.clear()
+
logging.info("Exit. do_find_files()")