object-detection/detect2.py

108 lines
3.1 KiB
Python

import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import pygame
import tkinter as tk
from threading import Thread
# Load the 'yolov5n' model from the 'ultralytics/yolov5' repository via torch.hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')
# Initialize webcam capture
cap = cv2.VideoCapture(0) # Use 0 for default webcam, or specify a different index if needed
# Global summary text: rewritten by process_frame() for every frame it
# processes and read by on_button_press() when the summary is spoken
summary_text = ''
# Build the audio for one word: a pre-recorded .wav if present, else text-to-speech
def play_sound_or_tts(word, text_dir='', speed=1):
    """Return the audio for a single word as a pydub ``AudioSegment``.

    Despite the name, nothing is played here; the caller decides what to do
    with the returned segment.

    Args:
        word: Word to voice. ``<word>.wav`` is looked up in ``text_dir``;
            when no such file exists the word is synthesised with gTTS.
        text_dir: Directory searched for the pre-recorded ``.wav`` file.
        speed: Playback-speed factor. Values above 1 are applied with
            ``AudioSegment.speedup``; 1 or less returns the audio unchanged.

    Returns:
        The ``AudioSegment`` for ``word``.
    """
    import tempfile  # local import: only the TTS fallback path needs it

    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)
    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
    else:
        # Use a unique scratch file instead of a fixed 'temp.mp3': playback
        # runs in worker threads, and two of them sharing one file name would
        # overwrite or delete each other's audio.
        fd, tmp_path = tempfile.mkstemp(suffix='.mp3')
        os.close(fd)
        try:
            gTTS(word).save(tmp_path)
            sound = AudioSegment.from_file(tmp_path)
        finally:
            # Remove the scratch file even when synthesis or decoding fails.
            os.remove(tmp_path)

    # speedup() works by discarding slices of audio, which is only meaningful
    # for factors above 1; at the default factor of 1 there is nothing to drop.
    if speed > 1:
        sound = sound.speedup(playback_speed=speed)
    return sound
def play_full_text(text, text_dir='', speed=1.5):
    """Speak ``text`` by concatenating the audio of each of its words.

    Punctuation is stripped, the text is split on whitespace, and every word
    is turned into audio with ``play_sound_or_tts``; the joined result is
    played once at the end. This call blocks until playback has finished.

    Args:
        text: Sentence to speak.
        text_dir: Directory with pre-recorded ``<word>.wav`` files.
        speed: Playback-speed factor forwarded for every word.

    Returns:
        None. Text without any words (empty or punctuation only) is a no-op.
    """
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()
    if not words:
        # Nothing to say: avoid building and playing a zero-length segment.
        return

    combined_sound = AudioSegment.silent(duration=0)
    for word in words:
        print(word)
        combined_sound += play_sound_or_tts(word, text_dir, speed)
    play(combined_sound)
def on_button_press():
    """Button callback: speak the current summary on a background thread.

    The value of ``summary_text`` at the moment of the click is handed to
    ``play_full_text`` (empty ``text_dir``, speed 1.1) in a new thread, so
    the Tk event loop is not blocked while audio is produced and played.
    """
    Thread(target=play_full_text, args=(summary_text, '', 1.1)).start()
def process_frame():
    """Process one webcam frame and schedule the next call.

    Reads a frame from ``cap``, runs ``model`` on it, draws a labelled box
    for every detection above the confidence threshold, rebuilds the
    module-level ``summary_text`` from the per-label counts, shows the
    annotated frame in the 'Webcam Feed' window, and re-arms itself on the
    Tk event loop through ``root.after``.
    """
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        # One failed read must not end the loop for good. Retry, but at a
        # slower rate so an unavailable camera does not flood the console.
        root.after(500, process_frame)
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]
    # Keep only detections with confidence above 50%
    data_frame = data_frame[data_frame['confidence'] > 0.5]
    label_counts = data_frame['name'].value_counts()

    # Draw a box and a "<label> <confidence>" caption for each detection.
    detections = zip(
        data_frame['xmin'], data_frame['ymin'],
        data_frame['xmax'], data_frame['ymax'],
        data_frame['name'], data_frame['confidence'],
    )
    for xmin, ymin, xmax, ymax, label, conf in detections:
        x1, y1, x2, y2 = int(xmin), int(ymin), int(xmax), int(ymax)
        text = f'{label} {round(float(conf), 2)}'
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        # Without this branch the sentence degenerated to "There are detected."
        summary_text = 'No objects detected.'
    else:
        counts = ', '.join(f'{count} {label}' for label, count in label_counts.items())
        summary_text = f'There are: {counts} detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    # waitKey gives OpenCV's window a chance to process its events and repaint.
    cv2.waitKey(1)
    root.after(10, process_frame)
# GUI setup: a single window with a button that speaks the current summary
root = tk.Tk()
root.title("Object Detection with Sound Playback")
play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()
# Start the webcam processing; process_frame re-schedules itself afterwards
root.after(10, process_frame)
try:
    root.mainloop()
finally:
    # Release the camera and close the OpenCV window however the event loop
    # ends, including an exception such as KeyboardInterrupt.
    cap.release()
    cv2.destroyAllWindows()