first commit
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
/env
yolo*
d3.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import tkinter as tk
from threading import Thread

# Load YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for default webcam, or specify a different index if needed

# Global variable for summary text
summary_text = ''

# Function to play sound or text-to-speech
def play_sound_or_tts(word, text_dir='suaraku', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
        sound = sound.speedup(playback_speed=speed)
        return sound
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        sound = sound.speedup(playback_speed=speed)
        os.remove('temp.mp3')
        return sound

def play_full_text(text, text_dir='suaraku', speed=1.0):
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)

def on_button_press():
    global summary_text
    play_button.config(text="Loading...")
    # Run playback in a background thread so the Tk mainloop stays responsive
    play_thread = Thread(target=play_audio_and_update_button)
    play_thread.start()

def play_audio_and_update_button():
    global summary_text
    play_full_text(summary_text, 'suaraku', 1.1)
    play_button.config(text="Play Summary")

def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Filter detections with confidence above 40%
    data_frame = data_frame[data_frame['confidence'] > 0.4]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        summary_text = 'There are: '
        for label, count in label_counts.items():
            summary_text += f'{count} {label}, '
        summary_text = summary_text[:-2]  # Remove the last comma and space
        summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)

# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
detect1.py (new file, 45 lines)
@@ -0,0 +1,45 @@
import torch
import cv2
import pandas as pd

# Download the model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

img = cv2.imread('car.jpg')
img = cv2.resize(img, (1000, 650))

# Perform detection on the image
result = model(img)
print('result: ', result)

# Convert the detection result to a pandas data frame
data_frame = result.pandas().xyxy[0]
print('data_frame:')
print(data_frame)

# Get the counts of each label
label_counts = data_frame['name'].value_counts()
print('Label counts:')
print(label_counts)

# Get the indexes of all of the rows
indexes = data_frame.index
for index in indexes:
    # Find the coordinates of the top-left corner of the bounding box
    x1 = int(data_frame['xmin'][index])
    y1 = int(data_frame['ymin'][index])
    # Find the coordinates of the bottom-right corner of the bounding box
    x2 = int(data_frame['xmax'][index])
    y2 = int(data_frame['ymax'][index])

    # Find the label name
    label = data_frame['name'][index]
    # Find the confidence score of the model
    conf = data_frame['confidence'][index]
    text = label + ' ' + str(conf.round(decimals=2))

    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 255, 0), 2)
    cv2.putText(img, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

cv2.imshow('IMAGE', img)
cv2.waitKey(0)
detect2.py (new file, 107 lines)
@@ -0,0 +1,107 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import pygame
import tkinter as tk
from threading import Thread

# Load YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for default webcam, or specify a different index if needed

# Global variable for summary text
summary_text = ''

# Function to play sound or text-to-speech
def play_sound_or_tts(word, text_dir='', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
        sound = sound.speedup(playback_speed=speed)
        return sound
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        sound = sound.speedup(playback_speed=speed)
        os.remove('temp.mp3')
        return sound

def play_full_text(text, text_dir='', speed=1.5):
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)

def on_button_press():
    global summary_text
    # Play the summary in a background thread so the Tk mainloop is not blocked
    play_thread = Thread(target=play_full_text, args=(summary_text, '', 1.1))
    play_thread.start()

def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Filter detections with confidence above 50%
    data_frame = data_frame[data_frame['confidence'] > 0.5]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    summary_text = 'There are: '
    for label, count in label_counts.items():
        summary_text += f'{count} {label}, '
    summary_text = summary_text[:-2]
    summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)

# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
play.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import os
from gtts import gTTS
import pygame
from pydub import AudioSegment
from pydub.playback import play

def play_sound_or_tts(filename, text):
    # Check if the sound file exists
    if os.path.exists(filename):
        # Play the sound file using pygame
        pygame.mixer.init()
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():  # Wait for the sound to finish playing
            continue
    else:
        # Convert text to speech using gTTS
        tts = gTTS(text)
        tts.save('temp.mp3')  # Save the generated speech to a temporary file

        # Load the temporary file into an AudioSegment
        sound = AudioSegment.from_file('temp.mp3')

        # Play the sound using pydub
        play(sound)

        # Remove the temporary file after playing
        os.remove('temp.mp3')

# Example usage
play_sound_or_tts('dog.wav', 'dog')
New binary files (content not shown):
suaraku/.DS_Store (vendored)
suaraku/1.wav
suaraku/10.wav
suaraku/2.wav
suaraku/3.wav
suaraku/4.wav
suaraku/5.wav
suaraku/6.wav
suaraku/7.wav
suaraku/are.wav
suaraku/book.wav
suaraku/car.wav
suaraku/cat.wav
suaraku/detected.wav
suaraku/dog.wav
suaraku/no.wav
suaraku/object.wav
suaraku/smartphone.wav
suaraku/there.wav
suaraku/tv.wav
suaraku/two.wav