Added similarity-checking to the repository search (new base/api/run.py).
This commit is contained in:
109
base/api/run.py
Normal file
109
base/api/run.py
Normal file
@ -0,0 +1,109 @@
|
||||
# from haystackmapper import haystackmapper
|
||||
from .haystackreducer import haystackreducer
|
||||
from .haystack import haystack
|
||||
from .needlestorage import needlestorage
|
||||
from .wavsound import wavsound
|
||||
from multiprocessing import Pool, Process, Manager
|
||||
from .calltomapper import calltomapper
|
||||
from .calltoreducer import calltoreducer
|
||||
from operator import itemgetter
|
||||
import time
|
||||
import os
|
||||
|
||||
def _build_haystacks(rootdir, max_split, db_size_per_split=100):
    """Walk *rootdir* and bucket every wav file into haystack splits.

    Each file under rootdir is loaded as a wavsound, downsampled 16x
    (every 16th sample) and wrapped in a haystack object.  Files are
    assigned to splits of roughly *db_size_per_split* entries each,
    with everything beyond the last boundary landing in the final
    split.

    Returns a pair ``(haystackss, key_names)`` where *haystackss* is a
    list of *max_split* lists of haystack objects and *key_names* is
    the list of file paths in walk order.
    """
    haystackss = [[] for _ in range(max_split)]
    key_names = []
    counter = 0
    for subdir, __, files in os.walk(rootdir):
        for file in files:
            path = subdir + "/" + file
            key_names.append(path)
            # Clamp to max_split - 1: the original min(max_split, ...)
            # could yield an index one past the end of haystackss and
            # raise IndexError once counter >= max_split * db_size_per_split.
            split_db_key = min(max_split - 1, counter // db_size_per_split)
            # Load the wav and keep only the downsampled payload; the
            # wavsound object itself is not needed after this point.
            sound = wavsound(path)
            haystackss[split_db_key].append(haystack(path, sound.get_data()[::16]))
            counter += 1
    return haystackss, key_names


def _run_map(haystackss, needles, return_emissions):
    """Spawn one calltomapper process per (needle, non-empty split) pair.

    Mapper processes emit key-value pairs into the shared
    *return_emissions* list.  Blocks until every mapper has finished.
    """
    jobs = []
    pnum = 0  # sequential process number handed to each mapper
    total = len(needles) * len(haystackss)
    for needle in needles:
        for haystacks in haystackss:
            if haystacks:  # skip splits that received no files
                p = Process(
                    target=calltomapper,
                    args=(haystacks, needle, pnum, total, return_emissions),
                )
                jobs.append(p)
                p.start()
                pnum += 1
    for proc in jobs:
        proc.join()


def _run_reduce(key_names, return_emissions, result_dict):
    """Reduce the mapper emissions per repository key.

    For every key, collect one token per matching emission and hand the
    list to a calltoreducer process, which writes its result into the
    shared *result_dict*.  Blocks until every reducer has finished.
    """
    jobs = []
    for key in key_names:
        # One `1` per emission whose key matches -- the reducer only
        # needs the count of matches, not the emission payloads.
        key_list = [1 for x in return_emissions if x[0] == key]
        q = Process(target=calltoreducer, args=(key_list, key, result_dict))
        jobs.append(q)
        q.start()
    for proc in jobs:
        proc.join()


def run(query, sample_length, samples, rootdir, max_split):
    """Run the repository search for *query* over the wav files in *rootdir*.

    Parameters:
        query: path to the query wav file.
        sample_length: length of each query segment (needle).
        samples: number of needles to cut from the query (coerced to int).
        rootdir: root directory of the wav repository to search.
        max_split: number of splits to partition the repository into.

    Returns:
        A list of ``[key, percentage]`` string pairs, sorted by match
        score in descending order, keeping only keys with a positive
        score.  The percentage is the reduced score relative to the
        number of needles.
    """
    query_wavsound = wavsound(query)

    # Load the repository into split haystack lists.
    haystackss, key_names = _build_haystacks(rootdir, max_split)

    # Cut the query into needle segments.
    query_needle_factory = needlestorage(query_wavsound, sample_length, int(samples))
    needles = query_needle_factory.get_needles()
    query_needle_factory.clear_needles()

    # MAP --------------------------------------------------
    # Manager-backed list collects emissions from all mapper processes.
    manager = Manager()
    return_emissions = manager.list()
    _run_map(haystackss, needles, return_emissions)

    # SHUFFLE/REDUCE ------------------------------------------
    # Separate manager dict collects per-key reduced scores.
    manager_2 = Manager()
    result_dict = manager_2.dict()
    _run_reduce(key_names, return_emissions, result_dict)

    # Rank matches best-first and express each score as a percentage
    # of the number of needles.
    result_lst = []
    if len(result_dict) != 0:
        num_needles = len(needles)
        for key, value in sorted(result_dict.items(), key=lambda pair: pair[1], reverse=True):
            if value > 0:
                result_lst.append([str(key), str(int(value) / num_needles * 100)])
    return result_lst
Reference in New Issue
Block a user