added changes on checking similarity

This commit is contained in:
kicap1992
2022-03-21 00:07:28 +08:00
parent 638f0a9aee
commit 9da68fbc38
172 changed files with 99465 additions and 49 deletions

0
base/api/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

10
base/api/calltomapper.py Normal file
View File

@ -0,0 +1,10 @@
from .haystackmapper import mapper
def calltomapper(haystacks, needle, processnumber, totalprocesses, return_emissions):
    """Run the map step for one needle; intended as a worker-process target.

    Args:
        haystacks: entries to search; passed unchanged to ``mapper``.
        needle: the query sample sequence to look for.
        processnumber: index of this worker, used only in the progress message.
        totalprocesses: accepted for interface compatibility; not used here.
        return_emissions: shared list-like container (for example a
            ``Manager().list()``) that receives the pairs emitted by ``mapper``.
    """
    matches = mapper(haystacks, needle)
    # Mutate the shared container explicitly; the caller reads the results
    # from the same object after joining the worker.
    return_emissions.extend(matches)
    # Report completion only once the work has actually been done.
    print("Finished with process ", processnumber, " word(sample) size of ",len(needle))

View File

@ -0,0 +1,5 @@
from .haystack import haystack
from .haystackreducer import haystackreducer
def calltoreducer(emissions, key, join):
    """Reduce the emissions collected for one key and publish the result.

    Args:
        emissions: the emissions belonging to ``key``.
        key: the entry under which the reduced value is stored.
        join: mapping (for example a shared manager dict) that receives
            ``join[key] = haystackreducer(emissions)``.
    """
    tally = haystackreducer(emissions)
    join[key] = tally

View File

@ -0,0 +1,30 @@
import librosa
import librosa.display as display
import math
def cosine_similarity1(v1,v2):
    """Return the cosine similarity of v1 and v2: (v1 . v2) / (||v1|| * ||v2||).

    Elements are paired positionally. If the sequences differ in length, the
    trailing elements of the longer one are ignored.

    Returns:
        The similarity as a float, or 0.0 when either vector has zero
        magnitude (including empty input), where the ratio is undefined.
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for x, y in zip(v1, v2):
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    denominator = math.sqrt(sumxx*sumyy)
    if denominator == 0:
        # zero-magnitude vector: avoid dividing by zero
        return 0.0
    return sumxy/denominator
def check_similarity(audio_path, audio_path2):
    """Return the cosine similarity of the dB spectrograms of two audio files.

    Both files are loaded with ``librosa.load`` using its default settings,
    transformed with ``librosa.stft``, and converted to decibels with
    ``librosa.amplitude_to_db``.

    Args:
        audio_path: path to the first audio file.
        audio_path2: path to the second audio file.

    Returns:
        The value of ``cosine_similarity1`` over the flattened spectrograms,
        compared over the frames both recordings have in common.
    """
    # load audio files (the sample rates are not needed below)
    y1, _ = librosa.load(audio_path)
    y2, _ = librosa.load(audio_path2)
    # magnitude spectrograms in dB
    S_db = librosa.amplitude_to_db(abs(librosa.stft(y1)))
    S2_db = librosa.amplitude_to_db(abs(librosa.stft(y2)))
    # Recordings of different duration give a different number of STFT frames
    # (columns). Flattening the full matrices would pair up unrelated cells,
    # so compare only the frames present in both.
    frames = min(S_db.shape[1], S2_db.shape[1])
    return cosine_similarity1(S_db[:, :frames].flatten(), S2_db[:, :frames].flatten())

118
base/api/dbtest.py Normal file
View File

@ -0,0 +1,118 @@
from .haystackmapper import mapper
from .haystackreducer import haystackreducer
from .haystack import haystack
from .needlestorage import needlestorage
from .wavsound import wavsound
from multiprocessing import Pool, Process, Manager
from .calltomapper import calltomapper
import time
import profile
import re
""" dbtest is a simulation module to measure time complexity of
database search applied to a virtual database"""
def test():
    """Time the parallel map step against a synthetic in-memory database.

    The database is ``db_size`` copies of ``button.wav`` split over
    ``num_split_db`` partitions. One process is started per
    (needle, partition) pair, and elapsed times are printed after start-up,
    after all workers have joined, and after the reduce step.

    A ``Pool``-based variant and a serial variant used to be kept here as
    inert string literals; per the original author's note the Pool variant
    was slower than one ``Process`` per task.
    """
    button_wavsound = wavsound('button.wav')
    db_size = 300  # Set Database Size
    num_split_db = 2  # Set number of split databases
    size_split_db = int(db_size/num_split_db)

    # split database into a list of smaller databases
    haystackss = [[] for _ in range(num_split_db)]
    keynames = []
    for i in range(db_size):
        # Clamp so a db_size that is not a multiple of num_split_db puts the
        # remainder into the last partition instead of indexing past it.
        split_db_key = min(num_split_db - 1, i // size_split_db)
        keynames.append(i)
        haystackss[split_db_key].append(haystack(i,button_wavsound.get_data()))

    button_needle_factory = needlestorage(button_wavsound,1000,50)
    print("USING MAP PROCESS and Manager")
    needles = button_needle_factory.get_needles()
    print(needles[0])

    manager = Manager()
    # calltomapper appends list items, so the shared container must be a list
    return_emissions = manager.list()
    jobs = []
    pnum = 0
    # number of needles not size of each needle
    len_needles = len(needles)
    print ("Number of Needles: ",len_needles)

    start_time = time.time()
    for needle in needles:
        for haystacks in haystackss:
            p = Process(target=calltomapper, args=(haystacks,needle,pnum,len_needles*num_split_db,return_emissions))
            jobs.append(p)
            p.start()
            pnum += 1
    print(time.time() - start_time)

    for proc in jobs:
        proc.join()  # wait for each process to end completely
    print(time.time() - start_time)

    # copy the shared list into a plain list in one round trip
    emissions_list = return_emissions[:]
    print("Reduce Result:")
    # haystackreducer takes the emissions only
    print(haystackreducer(emissions_list))
    print("Done")
    print(time.time() - start_time)
if __name__ == '__main__':
    # Run the timing simulation when this module is executed as a script.
    test()
    # NOTE(review): this profiles compiling the regular expression "mapper",
    # not the mapper function itself -- confirm that is the intent.
    profile.run('re.compile("mapper")')

19
base/api/haystack.py Normal file
View File

@ -0,0 +1,19 @@
class haystack:
    """One database entry: a name plus the sequence of integers stored for it.

    The length of the sequence is captured once, at construction time.
    """

    def __init__(self,name,data):
        self.name, self.data = name, data
        self.length = len(self.data)

    def get_data(self):
        """Return the stored sequence."""
        return self.data

    def get_name(self):
        """Return the entry name."""
        return self.name

    def get_length(self):
        """Return the sequence length recorded at construction."""
        return self.length

View File

@ -0,0 +1,65 @@
from .haystack import haystack
# class haystackmapper:
# """
# haystackmapper is a class responsible for map process,
# it emits key-value pair if a needle can be found in the haystack
# (key: haystack name, value: 1 represents a match)
# """
# emission = []
# def __init__ (self, haystacks):
# self.haystacks = haystacks
def mapper (stack, needle):
    """Emit ``[name, 1]`` for every haystack in which the needle is found.

    Each haystack is scanned with a sliding window. A window matches when, for
    every compared position whose needle value has magnitude above 100, the
    haystack value deviates from the needle value by at most 5%. Needle values
    with magnitude of 100 or less are not compared. Near the end of a haystack
    the window is shorter than the needle and only that prefix is compared.
    Scanning a haystack stops at its first match, or as soon as the window
    shrinks to one element or less.

    Args:
        stack: iterable of objects offering ``get_length()``, ``get_data()``
            and ``get_name()``.
        needle: indexable sequence of numbers to search for.

    Returns:
        A list of ``[haystack_name, 1]`` pairs, at most one per haystack.
    """
    needle_size = len(needle)
    emission = []
    for entry in stack:
        total = entry.get_length()
        samples = entry.get_data()
        for start in range(total):
            # the window shrinks once fewer than needle_size items remain
            window_size = min(needle_size, total - start)
            window = samples[start:start + needle_size]
            if window_size <= 1:
                # windows only get shorter from here on
                break
            # allow slight variation: 5% of the needle value
            mismatch = any(
                abs(needle[k]) > 100
                and abs(window[k] - needle[k]) > abs(0.05*needle[k])
                for k in range(window_size)
            )
            if not mismatch:
                emission.append([entry.get_name(), 1])
                break
    return emission

View File

@ -0,0 +1,8 @@
def haystackreducer(emissions):
    """Reduce the emissions gathered from the map processes to a tally.

    Every emission counts as one match, so the result is simply how many
    emissions were collected.
    """
    match_count = len(emissions)
    return match_count

95
base/api/interface.py Normal file
View File

@ -0,0 +1,95 @@
from haystackmapper import haystackmapper
from haystackreducer import haystackreducer
from haystack import haystack
from needlestorage import needlestorage
from wavsound import wavsound
from multiprocessing import Pool, Process, Manager
from calltomapper import calltomapper
import time
import os
def run():
    """ run runs the database search taking three user inputs, the query wav file,
    number of partitions, and number of partition samples

    NOTE(review): this is Python 2 code (``raw_input``, ``print`` statements).
    It also expects a ``haystackmapper`` class and a ``haystackreducer`` that
    returns a dict; the versions shown alongside it in this commit provide a
    ``mapper`` function and an integer count instead -- confirm whether this
    module is still meant to be runnable."""
    good_file = 0
    # keep prompting until the given path names an existing file
    while (good_file == 0):
        query = raw_input("Submit .wav file to search against database (Example: button.wav): ")
        if (os.path.isfile(query)):
            good_file = 1
    #Instantiate Wavsound objects from the wav files
    t_wavsounds = {}
    query_wavsound = wavsound(query)
    print("\n**Higher number of partitions increases false positive rates, \nwhile lower number of partitions increases false negative rates\n")
    partition = raw_input("Set number of partitions of the query from 1 to " + str(int(len(query_wavsound.get_data())/3))+": ")
    samples = raw_input("Set number of samples of partitions from 1 to " + partition + " (Recommend < 50): ")
    # Database Structure
    haystacks = []
    # Database look up directory
    rootdir = 'db'
    # load every file found under rootdir as one haystack keyed by its path
    for subdir, __, files in os.walk(rootdir):
        for file in files:
            # for debug print (subdir+"/"+file)
            t_wavsounds[subdir+"/"+file] = wavsound(subdir+"/"+file)
            # for debug print(t_wavsounds[subdir+"/"+file])
            haystacks.append(haystack(subdir+"/"+file,t_wavsounds[subdir+"/"+file].get_data()))
    query_needle_factory = needlestorage(query_wavsound,int(partition),int(samples))
    haystackmap = haystackmapper(haystacks)
    needles = query_needle_factory.get_needles()
    len_needles = len(needles)
    len_needle = len(needles[0]) # size is the same for all needles
    manager = Manager()
    # Map processes emit key-value pairs to emissions
    return_emissions = manager.dict()
    # Job is a list of processes
    jobs = []
    # Process number
    pnum = 0
    print "Number of Needles: ", len(needles)
    # Database query time
    start_time = time.time()
    #Distribute processes using multiprocessor: one process per needle
    for needle in needles:
        p = Process(target=calltomapper, args=(haystackmap,needle,pnum,len_needles,return_emissions))
        jobs.append(p)
        p.start()
        pnum += 1
    # wait for every map process to finish
    for proc in jobs:
        proc.join()
    # flatten return_emissions into a list
    emissions_list = sum(return_emissions.values(),[])
    print "Search Result:"
    result_dict = haystackreducer(emissions_list)
    # Tabulate % match (wav files with 0% match are excluded from the result)
    for key in result_dict:
        print str(key),": ",(25-len(str(key)))*" ",str("{0:.2f}".format(int(result_dict[key])/len(needles)*100)),"% match"
    # Show search time
    timelapse_parallel = time.time() - start_time
    print timelapse_parallel, "seconds"
if __name__ == '__main__':
    # Script entry point: print the banner, then start the interactive search.
    print ".WAV Search Engine Version 1 (Only Python Ver 2.X.X.)"
    run()

50
base/api/interfaceCLI.py Normal file
View File

@ -0,0 +1,50 @@
from .run import *
from .calltoreducer import calltoreducer
def cek_simlilarity2(recorded_sound,url_bacaan):
    """Search a repository directory for a recording and return a match percentage.

    The search runs with fixed settings: word size 5, 3 samples, and at most
    2 repository splits.

    Args:
        recorded_sound: path to the query .wav file.
        url_bacaan: repository directory handed to ``run`` as its root.

    Returns:
        The percentage string of the last entry in the list returned by
        ``run``, followed by "%" and with all spaces removed, or "" when
        ``run`` reports no matches.
    """
    samplelength = 5  # word size (sample length)
    samples = 3       # number of samples
    max_split = 2     # maximum allowable number of split repositories

    # run() loads the query file itself, so it is not decoded here as well.
    result_lst = run(recorded_sound, int(samplelength), samples, url_bacaan, max_split)
    if not result_lst:
        return ""
    # Same value the former loop ended with: the last pair's percentage.
    return (result_lst[-1][1] + "%").replace(" ","")

196
base/api/interfaceGUI.py Normal file
View File

@ -0,0 +1,196 @@
import tkinter as tk
from run import *
from tkinter import ttk
# application is a GUI interface of run.py
class application:
    """Tkinter front end for ``run``.

    Lets the user pick a query file and a repository directory, adjust the
    number of samples and the word size, run the search, and see the textual
    result together with the waveforms of the query and of the best match.
    Constructing the object builds the window and enters the Tk main loop.
    """

    def __init__(self):
        self.samples = 12
        self.samplelength = 80
        self.max_split = 2
        # set before mainloop() so the attribute exists while the GUI runs
        self.filename = ""
        self.root = tk.Tk()
        self.root.wm_title("Audio Search Engine")
        self.show_enclosure()
        self.show_file_entry()
        self.show_buttons()
        self.show_result()
        self.show_canvas()
        self.show_menu()
        try:
            self.root.iconbitmap("img/logo.ico")
        except tk.TclError:
            # the icon is cosmetic; a missing or unsupported file must not
            # prevent the application from starting
            pass
        self.root.mainloop()

    # show right enclosure frame
    def show_enclosure(self):
        self.group_enclosure =tk.LabelFrame(self.root, text="", padx=5, pady=5, background="white")
        self.group_enclosure.pack(side="right")

    # show audio wavform canvas
    def show_canvas (self):
        self.group_canvas = tk.LabelFrame(self.group_enclosure, text="Top(Query) Bottom(Best Match)", padx=5, pady=5)
        self.group_canvas.pack(side='top')
        self.canvas_query = tk.Canvas(self.group_canvas, width = 200, height = 100, bg = 'black')
        self.canvas_result = tk.Canvas(self.group_canvas, width = 200, height = 100, bg = 'black')
        # pack the canvas into a frame/form
        self.canvas_query.pack(expand = 'yes', fill = 'both')
        self.canvas_query.create_line(0, 50, 199.9, 50, fill="red", dash=(4, 4))
        self.canvas_result.pack(expand = 'yes', fill = 'both', side='bottom')
        self.canvas_result.create_line(0, 50, 199.9, 50, fill="red", dash=(4, 4))

    # paint both canvases black again and redraw the dashed centre lines
    def clear_canvas(self):
        self.canvas_query.create_rectangle(0, 0, 200, 100, fill="black")
        self.canvas_result.create_rectangle(0, 0, 200, 100, fill="black")
        self.canvas_query.create_line(0, 50, 199.9, 50, fill="red", dash=(4, 4))
        self.canvas_result.create_line(0, 50, 199.9, 50, fill="red", dash=(4, 4))

    # draw_wavform draws waveform of wav data to a target canvas
    # ("query" or "result"); any other target draws nothing
    def draw_wavform (self, wavdata, color, target):
        if target == "query":
            canvas = self.canvas_query
        elif target == "result":
            canvas = self.canvas_result
        else:
            return
        if not wavdata:
            # nothing to draw, and the scale factors would divide by zero
            return
        peak = max(wavdata)
        unit_x=200/len(wavdata)
        # an all-zero signal is drawn as a flat line on the centre axis
        unit_y=20/peak if peak else 0
        for i in range(len(wavdata)-1):
            y_1 = wavdata[i]*unit_y + 50
            x_1 = i*unit_x
            y_2 = wavdata[i+1]*unit_y + 50
            x_2 = (i+1)*unit_x
            canvas.create_line(x_1, y_1, x_2, y_2, fill=color, width=0.5)

    # refresh parameter labels
    def refresh_parameters (self):
        newtext="Number of Audio Samples to Compare: " + str(self.samples) + " Repository Maximum Split Paramter: " + str(self.max_split) + " Word Length: " + str(self.samplelength )
        self.label_result.config(text=newtext)

    # increase number of samples (words or needles)
    def sample_up (self):
        self.samples += 1
        self.refresh_parameters()

    # decrease number of samples (words or needles); never below 1, because
    # a search with zero samples has nothing to compare
    def sample_down (self):
        if self.samples > 1:
            self.samples -= 1
            self.refresh_parameters()

    # increase samplelength (word size)
    def word_up (self):
        self.samplelength += 5
        self.refresh_parameters()

    # decrease samplelength (word size)
    def word_down (self):
        if self.samplelength > 5:
            self.samplelength -= 5
            self.refresh_parameters()

    # show_menu shows the menu at the top
    def show_menu (self):
        self.menu = tk.Menu(self.root)
        self.menu.add_command(label="Search", command=self.on_button_click)
        self.menu.add_command(label="Precision +", command=self.sample_up)
        self.menu.add_command(label="Precision -", command=self.sample_down)
        self.menu.add_command(label="Word Size +", command=self.word_up)
        self.menu.add_command(label="Word Size -", command=self.word_down)
        self.menu.add_command(label="Quit", command=self.quit)
        self.root.config (menu=self.menu)

    # add style (optional)
    def add_config (self):
        self.style = ttk.Style()
        #self.style.configure(... enter here ...)

    # show textbox that holds the result of the search
    def show_result (self):
        self.group_result = tk.LabelFrame(self.root, text="", padx=5, pady=5)
        self.group_result.pack(padx=10, pady=15, side = 'left')
        self.label_result = tk.Label(self.group_result, text="Number of Audio Samples to Compare: " + str(self.samples) + " Repository Maximum Split Paramter: " + str(self.max_split) + " Word Length: " + str(self.samplelength ))
        self.label_result.pack()
        self.text_result = tk.Text(self.group_result, height="20")
        self.text_result.configure(background='black', foreground='cyan')
        self.text_result.pack()

    # show file entry input box
    def show_file_entry(self):
        # GROUP ENTRY
        self.group_entry = tk.LabelFrame(self.group_enclosure, text="", padx=5, pady=5)
        self.group_entry.pack(padx=10, pady=10, side = 'bottom')
        self.label_file = tk.Label(self.group_entry, text="Query Filename")
        self.label_file.pack(padx=58)
        self.entry_file = tk.Entry(self.group_entry, bd =5, relief="flat")
        self.entry_file.pack()
        self.entry_file.insert(0, "voicequery.wav")
        self.label_db = tk.Label(self.group_entry, text="Path to repository")
        self.label_db.pack()
        self.entry_db = tk.Entry(self.group_entry, bd =5, relief="flat")
        self.entry_db.insert(0, "db_voice")
        self.entry_db.pack()

    # show search button below the file entry
    def show_buttons (self):
        tk.Button(self.group_entry, padx=15, text="Search", command=self.on_button_click).pack()

    # on click event for the button: run the search and show the results
    def on_button_click(self):
        self.filename = self.entry_file.get()
        self.clear_canvas()
        if not os.path.isfile(self.filename):
            return
        query_wavsound = wavsound(self.filename)
        self.dbroot = self.entry_db.get()
        # repository query time
        start_time = time.time()
        result_lst = run(self.filename, self.samplelength, self.samples, self.dbroot, self.max_split)
        # output
        output = "Search Result: \n"
        # Tabulate % match (wav files with 0% match are excluded from the result)
        for pair in result_lst:
            output += pair[0] + " : " + (40-len(pair[0]))*" " + pair[1] + "% match" + "\n"
        # Show search time
        timelapse_parallel = time.time() - start_time
        output = output + str(timelapse_parallel) + "seconds"
        self.text_result.insert('1.0', output + "\n" )
        self.draw_wavform(query_wavsound.get_data(),"cyan","query")
        if not result_lst:
            # no match: there is no best-match waveform to draw
            return
        # Take the best match from the result list rather than parsing the
        # formatted text, which breaks on paths with spaces and on empty results.
        top_match_wavsound = wavsound(result_lst[0][0])
        self.draw_wavform(top_match_wavsound.get_data(),"white","result")

    # quit application
    def quit(self):
        self.root.destroy()
if __name__ == '__main__':
    # Script entry point: constructing the application builds the window and
    # enters the Tk main loop.
    my_app = application()

72
base/api/librosa_run.py Normal file
View File

@ -0,0 +1,72 @@
import librosa
import librosa.display as display
import matplotlib.pyplot as plt
import os
# from numpy import array
from datetime import datetime
from numpy.linalg import norm
from dtw import dtw
# import math
import warnings
warnings.filterwarnings("ignore")
def dot(A,B):
    """Return the sum of the pairwise products of A and B.

    Pairing stops at the end of the shorter input.
    """
    total = 0
    for a, b in zip(A, B):
        total += a*b
    return total
def cosine_similarity(a,b):
    """Return the cosine similarity of a and b.

    The dot product pairs elements up to the end of the shorter input; each
    norm is taken over the full length of its own vector.

    Returns:
        dot(a, b) / (||a|| * ||b||), or 0.0 when either norm is zero
        (including empty input), where the ratio is undefined.
    """
    dot_ab = sum(x*y for x, y in zip(a, b))
    norm_a = sum(x*x for x in a) ** .5
    norm_b = sum(y*y for y in b) ** .5
    denominator = norm_a * norm_b
    if denominator == 0:
        # zero-magnitude vector: avoid dividing by zero or producing NaN
        return 0.0
    return dot_ab / denominator
def fungsi_librosa(sound_url1,sound_url2):
    """Compare two audio files and save a waveform image of each.

    Both files are loaded with ``librosa.load`` and their MFCCs are computed.
    The MFCC sequences are compared with ``dtw`` (L1 norm between frames) and
    with ``cosine_similarity`` over the flattened MFCC matrices. A waveform
    plot of each file is written to ``static/created_image/<YYYY-MM-DD>/`` as
    ``spec<HH-MM-SS>.png`` and ``spec1<HH-MM-SS>.png``.

    Args:
        sound_url1: path to the first audio file.
        sound_url2: path to the second audio file.

    Returns:
        dict with keys ``dist`` (DTW distance), ``cosine_similaritynya``,
        ``path`` (the image directory) and ``hour_min_sec`` (the time stamp
        used in the image file names).
    """
    print(sound_url1)
    print(sound_url2)
    y1, sr1 = librosa.load(sound_url1)
    y2, sr2 = librosa.load(sound_url2)
    # Pass y and sr by keyword: this form works across librosa releases,
    # whereas recent releases reject them as positional arguments.
    mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
    mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)

    # one time stamp for both the directory and the file names
    now = datetime.now()
    path_nya = 'static/created_image/' + now.strftime("%Y-%m-%d") + '/'
    os.makedirs(path_nya, exist_ok=True)
    hour_min_sec = now.strftime("%H-%M-%S")

    for samples, rate, prefix in ((y1, sr1, 'spec'), (y2, sr2, 'spec1')):
        fig = plt.figure(figsize=(14, 5))
        display.waveshow(samples, sr=rate)
        plt.savefig(path_nya+prefix+hour_min_sec+'.png')
        # close the figure so repeated calls do not accumulate open figures
        plt.close(fig)

    # only the distance is used; the remaining DTW outputs are discarded
    dist, _cost, _acc_cost, _warp_path = dtw(mfcc1.T, mfcc2.T, dist=lambda x, y: norm(x - y, ord=1))

    # row-major flattening visits the values in the same order as nested loops
    cosine_similaritynya = cosine_similarity(mfcc1.flatten(), mfcc2.flatten())
    return {'dist': dist, 'cosine_similaritynya': cosine_similaritynya, 'path': path_nya, 'hour_min_sec': hour_min_sec}

64
base/api/mapreducetest.py Normal file
View File

@ -0,0 +1,64 @@
from haystackmapper import haystackmapper
from haystackreducer import haystackreducer
from haystack import haystack
from multiprocessing import Pool, Process, Manager
from simplfunction import simplefunction
from calltomapper import calltomapper
# NOTE(review): this script expects a ``haystackmapper`` class, a
# ``simplfunction`` module and a four-argument ``calltomapper``; the versions
# shown alongside it in this commit provide a ``mapper`` function and a
# five-argument ``calltomapper`` -- confirm whether this script is still meant
# to run.
if __name__ == '__main__':
    """Map reduce test is a simple testing module to check functionality
    of map-reduce implementation"""
    # four small hand-written haystacks to search
    haystacks = []
    haystacks.append(haystack("0",[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
    haystacks.append(haystack("1",[3, 2, 2, 3, 4, 3, 6, 7, 8, 9]))
    haystacks.append(haystack("2",[1, 6, -1, 0, 4, 0, 6, 7, 8, 9]))
    haystacks.append(haystack("3",[3, 3, 3]))
    haystackmap = haystackmapper(haystacks)
    emissions= []
    # variant 1: map the needles over a process pool
    print("USING MAP POOL")
    pool = Pool(2) # if it is a quad-core machine it can be set to 4
    emissions = pool.map(haystackmap.mapper, [[2],[3]])
    print(emissions)
    print(haystackreducer(sum(emissions,[])))
    emissions= []
    # variant 2: plain processes; only the last one started is joined
    print("USING MAP PROCESS")
    p = Process(target=simplefunction, args=(1,2))
    p.start()
    p = Process(target=simplefunction, args=(1,3))
    p.start()
    p = Process(target=simplefunction, args=(1,4))
    p.start()
    p.join()
    # variant 3: one process per needle, results collected through a manager
    print("USING MAP PROCESS WITH MANAGER")
    needles = [2, 3]
    manager = Manager()
    return_emissions = manager.dict()
    jobs = []
    pnum = 0
    for needle in needles:#distributive not iteration
        p = Process(target=calltomapper, args=(haystackmap,[needle],pnum,return_emissions))
        jobs.append(p)
        p.start()
        pnum += 1
    # wait for every worker before reading the shared results
    for proc in jobs:
        proc.join()
    emissions_list = sum(return_emissions.values(),[])
    print(haystackreducer(emissions_list ))
    # variant 4: the same map step run serially in this process
    print("Long Way (Serial Method)")
    needles = [2, 3]
    haystackmap = haystackmapper(haystacks)
    for needle in needles:#distributive not iteration
        #print (needle)
        emissions = emissions + haystackmap.mapper([ needle ])
    print(haystackreducer(emissions))

35
base/api/needlestorage.py Normal file
View File

@ -0,0 +1,35 @@
from .wavsound import wavsound
class needlestorage:
    """Generate needles (subsequence samples) from the data of a wavsound.

    The data is first thinned to every 16th value. Needles of
    ``sample_length`` values are then taken at evenly spaced, deterministic
    start positions so that the samples are spread across the file. A needle
    that starts near the end of the data may be shorter than ``sample_length``.
    """

    def __init__(self, wavsound, sample_length, limit):
        """Build up to ``limit`` needles.

        Args:
            wavsound: object whose ``get_data()`` returns the sample sequence.
            sample_length: number of values per needle.
            limit: maximum number of needles; zero or less yields no needles.
        """
        self.needles = []
        if limit <= 0:
            # nothing to sample; also avoids dividing by zero below
            return
        data = wavsound.get_data()[::16] # skip every 16
        raw_gap = int(len(data)/limit)-sample_length
        print(raw_gap, "is skip value")
        # gap between the starting positions of two consecutive needles
        # (i.e. opposite of degree of overlap); at least 1 so positions advance
        skips_per_needle = max(1, raw_gap)
        starts = range(0, len(data), skips_per_needle)[:limit]
        self.needles = [data[start:start+sample_length] for start in starts]

    def pop_unused_needle(self):
        """Remove and return the first remaining needle, or [] when none are left."""
        if len(self.needles) == 0:
            return []
        return self.needles.pop(0)

    def get_needles(self):
        """Return the list of remaining needles."""
        return self.needles

    def clear_needles(self):
        """Drop all needles by rebinding to a new empty list."""
        self.needles = []

109
base/api/run.py Normal file
View File

@ -0,0 +1,109 @@
# from haystackmapper import haystackmapper
from .haystackreducer import haystackreducer
from .haystack import haystack
from .needlestorage import needlestorage
from .wavsound import wavsound
from multiprocessing import Pool, Process, Manager
from .calltomapper import calltomapper
from .calltoreducer import calltoreducer
from operator import itemgetter
import time
import os
def run(query, sample_length, samples, rootdir, max_split):
    """Search the repository under ``rootdir`` for segments of the query file.

    Every file found under ``rootdir`` is loaded as a haystack, thinned to
    every 16th sample, and assigned to one of ``max_split`` partitions (100
    files per partition, with any overflow going to the last one). One map
    process is started per (needle, non-empty partition) pair, then one
    reduce process per repository file.

    Args:
        query: path to the query wav file.
        sample_length: number of values per needle.
        samples: number of needles to take from the query.
        rootdir: repository directory, walked recursively.
        max_split: number of partitions; values below 1 are treated as 1.

    Returns:
        A list of ``[path, percentage]`` string pairs for files with at least
        one matching needle, ordered by descending match count.
    """
    query_wavsound = wavsound(query)

    # repository Structure: split repository into list of smaller repository
    num_splits = max(1, max_split)
    haystackss = [[] for _ in range(num_splits)]
    key_names = []
    # repository Spliting Parameters (1 to number of repository entries)
    db_size_per_split = 100

    # Read Files in the DB
    counter = 0
    for subdir, __, files in os.walk(rootdir):
        for file in files:
            key = subdir+"/"+file
            key_names.append(key)
            # Valid indexes are 0 .. num_splits - 1; clamp to the last
            # partition once the earlier ones are full.
            split_db_key = min(num_splits - 1, counter // db_size_per_split)
            haystackss[split_db_key].append(haystack(key, wavsound(key).get_data()[::16]))
            counter += 1

    query_needle_factory = needlestorage(query_wavsound,sample_length,int(samples))
    # Get segments of the query data as needles
    needles = query_needle_factory.get_needles()
    query_needle_factory.clear_needles()
    len_needles = len(needles)

    # The manager server process is shut down when the with-block exits.
    with Manager() as manager:
        # MAP --------------------------------------------------
        # Map processes emit key-value pairs to emissions
        return_emissions = manager.list()
        jobs = []
        pnum = 0
        for needle in needles:
            for haystacks in haystackss:
                if haystacks != []:
                    p = Process(target=calltomapper, args=(haystacks,needle,pnum,len_needles*len(haystackss),return_emissions))
                    jobs.append(p)
                    p.start()
                    pnum += 1
        for proc in jobs:
            proc.join()

        # SHUFFLE/REDUCE ------------------------------------------
        # Copy the shared list once instead of re-reading the proxy per key.
        all_emissions = return_emissions[:]
        result_dict = manager.dict()
        jobs = []
        for key in key_names:
            key_list = [1 for x in all_emissions if x[0] == key]
            print (key, key_list)
            q = Process(target=calltoreducer, args=(key_list, key, result_dict))
            jobs.append(q)
            q.start()
        for proc in jobs:
            proc.join()
        # take a plain copy before the manager goes away
        tallies = result_dict.items()

    result_lst = []
    print(len(needles), "is length of needles")
    for key, value in sorted(tallies, key=itemgetter(1), reverse=True):
        if value > 0:
            result_lst.append([str(key), str((int(value)/len(needles)*100))])
    return result_lst

BIN
base/api/run.pyc Normal file

Binary file not shown.

8
base/api/serializers.py Normal file
View File

@ -0,0 +1,8 @@
from rest_framework.serializers import ModelSerializer
from base.models import tb_bacaan
class tb_bacaanSerializer(ModelSerializer):
    """Model serializer for ``tb_bacaan``."""
    class Meta:
        model = tb_bacaan
        # '__all__' selects every field of the model
        fields = '__all__'

8
base/api/urls.py Normal file
View File

@ -0,0 +1,8 @@
from django.urls import path
from . import views
# Routes exposed by this module.
urlpatterns = [
    # route listing (GET) and audio upload/comparison (POST)
    path('', views.getRoutes),
    # list of all tb_bacaan records
    path('bacaans/', views.getBacaans),
    # a single tb_bacaan record looked up by id
    path('bacaans/<str:pk>/', views.getBacaan),
]

64
base/api/views.py Normal file
View File

@ -0,0 +1,64 @@
import os
from rest_framework.decorators import api_view
from rest_framework.response import Response
from base.models import tb_bacaan
from .serializers import tb_bacaanSerializer
from .check_similarity import check_similarity
from .interfaceCLI import cek_simlilarity2
from .librosa_run import fungsi_librosa
@api_view(['GET', 'POST'])
def getRoutes(request):
    """List the API routes (GET) or compare an uploaded recording (POST).

    POST expects a form field ``url_bacaan`` (the reference audio file,
    relative to ``static/audio/``) and an uploaded file ``sound``. The upload
    is stored under ``static/uploaded/`` and compared with the reference by
    ``fungsi_librosa``.

    Returns:
        GET: the list of route descriptions.
        POST: ``{'data': 'sini data'}`` merged with the dict returned by
        ``fungsi_librosa``, or a 400 response when a field is missing or
        ``url_bacaan`` points outside ``static/audio/``.
    """
    if request.method == 'GET':
        routes = [
            'GET /api',
            'POST /api/bacaan',
        ]
        return Response(routes)

    if request.method == 'POST':
        url_bacaan = request.POST.get('url_bacaan')
        sound = request.FILES.get('sound')
        if not url_bacaan or sound is None:
            # report a client error instead of failing with a KeyError
            return Response({'error': "'url_bacaan' and 'sound' are required"}, status=400)

        # url_bacaan is client-supplied: refuse anything that resolves outside
        # the audio directory (for example via '..' components).
        reference_path = 'static/audio/' + url_bacaan
        audio_root = os.path.realpath('static/audio')
        if not os.path.realpath(reference_path).startswith(audio_root + os.sep):
            return Response({'error': "invalid 'url_bacaan'"}, status=400)

        # build the stored file name from the uploaded name (time-stamp style
        # separators are replaced and the extension is dropped)
        nama = sound.name
        nama = nama.replace('Z', '').replace('.wav', '').replace('.', '').replace('-', ' ').replace(':', ' ').replace('T', ' ')
        handle_uploaded_file(sound,url_bacaan,nama)

        librosa_run = fungsi_librosa('static/uploaded/'+nama+'.wav', reference_path)

        dataall = {'data' : 'sini data'}
        # add the comparison results to the response payload
        dataall.update(librosa_run)
        return Response(dataall)
def handle_uploaded_file(f,url_bacaan,nama):
    """Write an uploaded file to ``static/uploaded/<nama>.wav``.

    Args:
        f: uploaded file object; its ``chunks()`` iterator supplies the bytes.
        url_bacaan: accepted for interface compatibility; not used here.
        nama: file name without extension.
    """
    path = 'static/uploaded/'  # folder where uploads are stored
    # Creates missing parent folders too, and is safe when two requests
    # arrive at once (no separate exists-then-create step).
    os.makedirs(path, exist_ok=True)
    with open(path+nama+'.wav', 'wb') as destination:
        for chunk in f.chunks():
            destination.write(chunk)
@api_view(['GET'])
def getBacaans(request):
    """Return every ``tb_bacaan`` record in serialized form."""
    all_records = tb_bacaan.objects.all()
    payload = tb_bacaanSerializer(all_records, many=True).data
    return Response(payload)
@api_view(['GET'])
def getBacaan(request,pk):
    """Return the serialized ``tb_bacaan`` record whose id is ``pk``.

    Responds with 404 when no such record exists or ``pk`` is not a valid id,
    instead of letting the lookup error surface as a server error.
    """
    try:
        bacaan = tb_bacaan.objects.get(id=pk)
    except (tb_bacaan.DoesNotExist, ValueError):
        return Response({'detail': 'Not found.'}, status=404)
    serializer = tb_bacaanSerializer(bacaan)
    return Response(serializer.data)

12
base/api/wavread.py Normal file
View File

@ -0,0 +1,12 @@
from .wavsound import wavsound
"""wavread is a testing module to test the functionality of wavsound"""
# Load two wav files and print their samples (wavsound's repr lists every value).
button_wavsound = wavsound('db/button.wav')
print(button_wavsound)
beep_wavsound = wavsound('db/buttonresampled.wav')
print(beep_wavsound)
# Print the sample counts of the two files.
print (len(button_wavsound.get_data()))
print (len(beep_wavsound.get_data()),len(button_wavsound.get_data()))
# Print the first of 100 equal-sized chunks of the first file.
print(button_wavsound.get_chunk(0,100))

53
base/api/wavsound.py Normal file
View File

@ -0,0 +1,53 @@
import wave, struct
class wavsound:
    """Collect the data of a wav file and store it as a list of integers.

    Each frame becomes one integer. A 2-byte frame is read as a signed
    little-endian 16-bit value. A 4-byte frame is read as one unsigned
    little-endian 32-bit value, so a 16-bit stereo frame yields a single
    number that combines both channels. Other frame sizes are rejected.
    """

    def __init__(self, wav_file):
        """Load ``wav_file``; an empty string creates an empty wavsound.

        Raises:
            struct.error: if the frame size is neither 2 nor 4 bytes, or the
                frame data is not a whole number of frames.
        """
        self.data = []
        if wav_file == '':
            return
        # the context manager closes the file even if reading fails
        with wave.open(wav_file, 'r') as wave_file:
            frame_size = wave_file.getnchannels() * wave_file.getsampwidth()
            # read all frames in one call instead of one call per frame
            raw = wave_file.readframes(wave_file.getnframes())
        if not raw:
            return
        if frame_size == 2:
            frame_format = "<h"
        elif frame_size == 4:
            frame_format = "<L"
        else:
            raise struct.error("unsupported frame size: %d bytes" % frame_size)
        self.data = [frame[0] for frame in struct.iter_unpack(frame_format, raw)]

    def get_data(self):
        """Return the list of sample values."""
        return self.data

    # set data from a list of int
    def set_data(self, data):
        self.data = data

    # copy data from other wavsound (the list object itself is shared)
    def copy_from(self, other_wavsound):
        self.data = other_wavsound.get_data()

    # Get a small chunk of wavsound
    def get_chunk (self, section_no, num_chunk):
        """Return section ``section_no`` of ``num_chunk`` equal parts as a new wavsound.

        The part length is rounded down, so trailing samples that do not fill
        a whole part belong to no chunk.
        """
        new_wavsound = wavsound('')
        data = self.get_data()
        length = int(len(data)/num_chunk)
        new_wavsound.set_data(data[(length*section_no):(length*(section_no+1))])
        return new_wavsound

    def __repr__(self):
        """Return every sample, each preceded by a single space."""
        return "".join(" " + str(value) for value in self.data)