import os
import string
from threading import Thread
import tkinter as tk

import cv2
import torch
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play

# Load the YOLOv5 nano model from the Ultralytics hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture (0 = default webcam; use a different index for another camera)
cap = cv2.VideoCapture(0)

# Global variable holding the latest detection summary
summary_text = ''


def play_sound_or_tts(word, text_dir='', speed=1):
    """Return an AudioSegment for a word: a pre-recorded .wav if one exists, otherwise gTTS."""
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)
    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        os.remove('temp.mp3')
    if speed > 1:  # pydub's speedup only supports playback speeds greater than 1
        sound = sound.speedup(playback_speed=speed)
    return sound


def play_full_text(text, text_dir='', speed=1.5):
    """Strip punctuation, synthesize audio for each word, and play the concatenated result."""
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()
    combined_sound = AudioSegment.silent(duration=0)
    for word in words:
        print(word)
        combined_sound += play_sound_or_tts(word, text_dir, speed)
    play(combined_sound)


def on_button_press():
    """Play the current summary in a background thread so the GUI stays responsive."""
    play_thread = Thread(target=play_full_text, args=(summary_text, '', 1.1))
    play_thread.start()


def process_frame():
    """Grab a frame, run detection, draw boxes and a summary, then reschedule itself."""
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    # OpenCV delivers BGR frames; convert to RGB before passing them to the model
    result = model(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    data_frame = result.pandas().xyxy[0]

    # Keep only detections with confidence above 50%
    data_frame = data_frame[data_frame['confidence'] > 0.5]
    label_counts = data_frame['name'].value_counts()

    # Draw a bounding box and label for each detection
    for _, row in data_frame.iterrows():
        x1, y1 = int(row['xmin']), int(row['ymin'])
        x2, y2 = int(row['xmax']), int(row['ymax'])
        text = f"{row['name']} {row['confidence']:.2f}"
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    # Build the spoken/displayed summary from the per-class counts
    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        summary_text = 'There are: '
        summary_text += ', '.join(f'{count} {label}' for label, count in label_counts.items())
        summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1,
                (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)

    # Schedule the next frame through the Tk event loop
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")
play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing loop
root.after(10, process_frame)
root.mainloop()

cap.release()
cv2.destroyAllWindows()