first commit
commit 4d56d484f8
@@ -0,0 +1,3 @@
/env
yolo*
@@ -0,0 +1,115 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import tkinter as tk
from threading import Thread

# Load the YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for the default webcam, or specify a different index if needed

# Global variable for the summary text
summary_text = ''

# Return the audio for one word: a pre-recorded clip if one exists, otherwise text-to-speech
def play_sound_or_tts(word, text_dir='suaraku', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        os.remove('temp.mp3')

    # pydub's speedup() only supports playback_speed > 1.0, so skip it otherwise
    if speed > 1.0:
        sound = sound.speedup(playback_speed=speed)
    return sound


def play_full_text(text, text_dir='suaraku', speed=1.0):
    # Strip punctuation, then split the summary into individual words
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)


def on_button_press():
    global summary_text
    play_button.config(text="Loading...")
    # Play the audio on a worker thread so the Tk main loop stays responsive
    play_thread = Thread(target=play_audio_and_update_button)
    play_thread.start()


def play_audio_and_update_button():
    global summary_text
    play_full_text(summary_text, 'suaraku', 1.1)
    play_button.config(text="Play Summary")


def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Keep detections with confidence above 40%
    data_frame = data_frame[data_frame['confidence'] > 0.4]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        # Bounding-box corners
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        summary_text = 'There are: '
        for label, count in label_counts.items():
            summary_text += f'{count} {label}, '
        summary_text = summary_text[:-2]  # Remove the trailing comma and space
        summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing loop
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
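For clarity, here is a standalone sketch of how the summary string above is assembled, using made-up detections (only pandas is needed; the values are illustrative, not from the commit):

import pandas as pd

# Hypothetical detection labels standing in for data_frame['name']
names = pd.Series(['person', 'person', 'car'], name='name')
summary = 'There are: '
for label, count in names.value_counts().items():
    summary += f'{count} {label}, '
summary = summary[:-2] + ' detected.'
print(summary)  # -> There are: 2 person, 1 car detected.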
@@ -0,0 +1,45 @@
import torch
import cv2
import pandas as pd

# Download the model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

img = cv2.imread('car.jpg')
img = cv2.resize(img, (1000, 650))

# Perform detection on the image
result = model(img)
print('result: ', result)

# Convert the detection result to a pandas DataFrame
data_frame = result.pandas().xyxy[0]
print('data_frame:')
print(data_frame)

# Get the count of each label
label_counts = data_frame['name'].value_counts()
print('Label counts:')
print(label_counts)

# Get the indexes of all of the rows
indexes = data_frame.index
for index in indexes:
    # Coordinates of the top-left corner of the bounding box
    x1 = int(data_frame['xmin'][index])
    y1 = int(data_frame['ymin'][index])
    # Coordinates of the bottom-right corner of the bounding box
    x2 = int(data_frame['xmax'][index])
    y2 = int(data_frame['ymax'][index])

    # Label name
    label = data_frame['name'][index]
    # Confidence score of the model
    conf = data_frame['confidence'][index]
    text = label + ' ' + str(conf.round(decimals=2))

    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 255, 0), 2)
    cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

cv2.imshow('IMAGE', img)
cv2.waitKey(0)
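As an aside, the hub results object can also draw the boxes itself instead of the manual loop above. A minimal sketch, assuming this YOLOv5 version exposes the render() helper (it is not used in the commit):

import cv2
import torch

model = torch.hub.load('ultralytics/yolov5', 'yolov5n')
img = cv2.imread('car.jpg')
result = model(img)

# render() annotates copies of the input image(s) and returns them
rendered = result.render()[0]
cv2.imshow('IMAGE (rendered)', rendered)
cv2.waitKey(0)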
@@ -0,0 +1,107 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import tkinter as tk
from threading import Thread

# Load the YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for the default webcam, or specify a different index if needed

# Global variable for the summary text
summary_text = ''

# Return the audio for one word: a pre-recorded clip if one exists, otherwise text-to-speech
def play_sound_or_tts(word, text_dir='', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        os.remove('temp.mp3')

    # pydub's speedup() only supports playback_speed > 1.0, so skip it otherwise
    if speed > 1.0:
        sound = sound.speedup(playback_speed=speed)
    return sound


def play_full_text(text, text_dir='', speed=1.5):
    # Strip punctuation, then split the summary into individual words
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)


def on_button_press():
    global summary_text
    # Play the audio on a worker thread so the Tk main loop stays responsive
    play_thread = Thread(target=play_full_text, args=(summary_text, '', 1.1))
    play_thread.start()


def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Keep detections with confidence above 50%
    data_frame = data_frame[data_frame['confidence'] > 0.5]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        summary_text = 'There are: '
        for label, count in label_counts.items():
            summary_text += f'{count} {label}, '
        summary_text = summary_text[:-2]  # Remove the trailing comma and space
        summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing loop
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
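One possible refinement, not part of this commit: cache the gTTS output next to the pre-recorded clips so each word is synthesized at most once. A sketch, where play_sound_or_tts_cached is a hypothetical name and the 'suaraku' directory is borrowed from the first script:

import os
from gtts import gTTS
from pydub import AudioSegment

def play_sound_or_tts_cached(word, text_dir='suaraku', speed=1.1):
    # Synthesize each word at most once; reuse the cached file afterwards
    os.makedirs(text_dir, exist_ok=True)
    filename = os.path.join(text_dir, f"{word}.mp3")
    if not os.path.exists(filename):
        gTTS(word).save(filename)
    sound = AudioSegment.from_file(filename)
    if speed > 1.0:  # speedup() only supports ratios above 1.0
        sound = sound.speedup(playback_speed=speed)
    return sound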
@@ -0,0 +1,31 @@
import os
from gtts import gTTS
import pygame
from pydub import AudioSegment
from pydub.playback import play


def play_sound_or_tts(filename, text):
    # Check whether the sound file exists
    if os.path.exists(filename):
        # Play the sound file using pygame
        pygame.mixer.init()
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()
        # Wait for the sound to finish playing; sleep briefly instead of busy-waiting
        while pygame.mixer.music.get_busy():
            pygame.time.wait(100)
    else:
        # Convert the text to speech using gTTS
        tts = gTTS(text)
        tts.save('temp.mp3')  # Save the generated speech to a temporary file

        # Load the temporary file into an AudioSegment
        sound = AudioSegment.from_file('temp.mp3')

        # Play the sound using pydub
        play(sound)

        # Remove the temporary file after playing
        os.remove('temp.mp3')


# Example usage
play_sound_or_tts('dog.wav', 'dog')
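To seed the pygame branch above with .wav files, one could pre-render clips via gTTS and pydub. A hypothetical helper sketch (make_clip is not part of the commit; it assumes ffmpeg is available to pydub for the mp3-to-wav conversion):

import os
from gtts import gTTS
from pydub import AudioSegment

def make_clip(word, out_dir='.'):
    # Generate speech with gTTS, then convert the mp3 to wav via pydub
    tts_path = 'temp.mp3'
    gTTS(word).save(tts_path)
    AudioSegment.from_file(tts_path).export(
        os.path.join(out_dir, f'{word}.wav'), format='wav')
    os.remove(tts_path)

make_clip('dog')  # afterwards, play_sound_or_tts('dog.wav', 'dog') takes the pygame branch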
21 binary files not shown.