# --- Reconstructed from a whitespace-mangled `git log -p` paste ---
# Commit 4d56d48 ("first commit"). The patch also added:
#   .gitignore containing:  /env  and  yolo*
#   a binary .DS_Store (not reproducible here)
# Below are the first two Python files from that patch, cleaned up.

# ============================= file: d3.py =============================
"""Live YOLOv5 webcam detection with a Tk button that speaks a summary.

Each frame is run through YOLOv5n; boxes are drawn and a natural-language
summary is kept in `summary_text`. Pressing the button plays the summary
word-by-word, preferring pre-recorded WAV clips from `suaraku/` and falling
back to gTTS for words without a clip.
"""
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import tkinter as tk
from threading import Thread

# Load the small YOLOv5 model from the ultralytics hub (downloads on first run).
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Webcam capture; index 0 is the default camera.
cap = cv2.VideoCapture(0)

# Latest detection summary; written by process_frame(), read by the playback
# thread. (NOTE(review): not lock-protected — the playback thread may see a
# summary from a slightly newer frame than the one on screen, which is benign.)
summary_text = ''


def play_sound_or_tts(word, text_dir='suaraku', speed=1):
    """Return an AudioSegment that speaks `word`.

    Uses `<text_dir>/<word>.wav` when that file exists; otherwise synthesizes
    the word with gTTS via a temporary mp3.
    """
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')  # fully loaded into memory
        os.remove('temp.mp3')
    # FIX: pydub's speedup() only supports playback_speed > 1; the original
    # called it unconditionally, which breaks at the default speed of 1.
    if speed > 1:
        sound = sound.speedup(playback_speed=speed)
    return sound


def play_full_text(text, text_dir='suaraku', speed=1.0):
    """Speak `text` by concatenating per-word audio and playing it once."""
    # Strip punctuation so e.g. "detected." maps to the "detected" clip.
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)
    for word in words:
        print(word)
        combined_sound += play_sound_or_tts(word, text_dir, speed)

    play(combined_sound)


def on_button_press():
    """Button callback: run playback on a worker thread so the UI stays live."""
    play_button.config(text="Loading...")
    # daemon=True so a pending playback cannot keep the process alive after
    # the Tk window is closed.
    Thread(target=play_audio_and_update_button, daemon=True).start()


def play_audio_and_update_button():
    """Worker: speak the current summary, then restore the button label."""
    play_full_text(summary_text, 'suaraku', 1.1)
    # FIX: Tk widgets are not thread-safe; the original called
    # play_button.config() directly from this worker thread. Marshal the
    # update back onto the Tk main loop with after() instead.
    root.after(0, lambda: play_button.config(text="Play Summary"))


def process_frame():
    """Grab one frame, run detection, draw boxes + summary, and reschedule."""
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Keep only detections with confidence above 40%.
    # (FIX: the original comment claimed 70%, contradicting the 0.4 threshold.)
    data_frame = data_frame[data_frame['confidence'] > 0.4]

    label_counts = data_frame['name'].value_counts()
    for index in data_frame.index:
        # Bounding-box corners (top-left, bottom-right).
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        # join() instead of the original trailing-comma-then-slice dance.
        parts = [f'{count} {label}' for label, count in label_counts.items()]
        summary_text = 'There are: ' + ', '.join(parts) + ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing loop on the Tk event loop.
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()


# =========================== file: detect1.py ===========================
"""Single-image YOLOv5 demo: detect objects in car.jpg and display them."""
import torch
import cv2
import pandas as pd

# Download model from GitHub hub cache.
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

img = cv2.imread('car.jpg')
# FIX: imread returns None (no exception) on a missing/unreadable file; the
# original crashed later in resize() with an opaque OpenCV error.
if img is None:
    raise FileNotFoundError("car.jpg not found or unreadable")
img = cv2.resize(img, (1000, 650))

# Perform detection on the image.
result = model(img)
print('result: ', result)

# Convert detections to a pandas DataFrame (xmin/ymin/xmax/ymax/conf/name).
data_frame = result.pandas().xyxy[0]
print('data_frame:')
print(data_frame)

# Count of each detected label.
label_counts = data_frame['name'].value_counts()
print('Label counts:')
print(label_counts)

for index in data_frame.index:
    # Top-left corner of the bounding box.
    x1 = int(data_frame['xmin'][index])
    y1 = int(data_frame['ymin'][index])
    # Bottom-right corner of the bounding box.
    x2 = int(data_frame['xmax'][index])
    y2 = int(data_frame['ymax'][index])

    label = data_frame['name'][index]
    conf = data_frame['confidence'][index]
    text = label + ' ' + str(conf.round(decimals=2))

    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 255, 0), 2)
    cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

cv2.imshow('IMAGE', img)
cv2.waitKey(0)

# (The patch continues with detect2.py in the next section.)
# =========================== file: detect2.py ===========================
"""Webcam YOLOv5 detection with on-demand spoken summary (variant of d3.py).

Looks for per-word WAV clips in the current directory (empty text_dir) and
falls back to gTTS for missing words.
"""
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
# FIX: the original imported pygame here but never used it; removed.
import tkinter as tk
from threading import Thread

# Load YOLOv5n from the ultralytics hub (downloads on first run).
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Webcam capture; index 0 is the default camera.
cap = cv2.VideoCapture(0)

# Latest detection summary, written by process_frame(), read by the playback thread.
summary_text = ''


def play_sound_or_tts(word, text_dir='', speed=1):
    """Return an AudioSegment for `word`: `<text_dir>/<word>.wav` or gTTS fallback."""
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        os.remove('temp.mp3')
    # FIX: pydub's speedup() only supports playback_speed > 1; the original
    # called it unconditionally, which breaks for the default speed of 1.
    if speed > 1:
        sound = sound.speedup(playback_speed=speed)
    return sound


def play_full_text(text, text_dir='', speed=1.5):
    """Speak `text` by concatenating per-word audio and playing it once."""
    # Strip punctuation so "detected." maps to a "detected" clip.
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)
    for word in words:
        print(word)
        combined_sound += play_sound_or_tts(word, text_dir, speed)

    play(combined_sound)


def on_button_press():
    """Button callback: speak the current summary on a worker thread."""
    # daemon=True so pending playback cannot keep the process alive on exit.
    Thread(target=play_full_text, args=(summary_text, '', 1.1), daemon=True).start()


def process_frame():
    """Grab one frame, run detection, draw boxes + summary, and reschedule."""
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Keep only detections with confidence above 50%.
    # (FIX: the original comment claimed 70%, contradicting the 0.5 threshold.)
    data_frame = data_frame[data_frame['confidence'] > 0.5]

    label_counts = data_frame['name'].value_counts()
    for index in data_frame.index:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    # FIX: with no detections the original produced the garbled
    # "There are detected." (slicing off 2 chars of "There are: ").
    # Handle the empty case explicitly, matching d3.py's behavior.
    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        parts = [f'{count} {label}' for label, count in label_counts.items()]
        summary_text = 'There are: ' + ', '.join(parts) + ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing loop on the Tk event loop.
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()


# ============================ file: play.py ============================
"""Play a WAV file if present, otherwise speak the text with gTTS."""
import os
import time
from gtts import gTTS
import pygame
from pydub import AudioSegment
from pydub.playback import play


def play_sound_or_tts(filename, text):
    """Play `filename` via pygame when it exists, else synthesize `text` with gTTS."""
    if os.path.exists(filename):
        # Play the existing sound file with pygame's music mixer.
        pygame.mixer.init()
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():
            # FIX: the original spun with a bare `continue`, burning 100% of a
            # core; a short sleep keeps the wait effectively free.
            time.sleep(0.05)
    else:
        # Convert text to speech and play it via pydub.
        tts = gTTS(text)
        tts.save('temp.mp3')  # temporary file for the synthesized speech

        sound = AudioSegment.from_file('temp.mp3')
        play(sound)

        # Remove the temporary file after playing.
        os.remove('temp.mp3')


# Example usage
play_sound_or_tts('dog.wav', 'dog')

# (The remainder of the patch adds binary WAV assets under suaraku/ and is
# listed after this section.)
a/suaraku/.DS_Store b/suaraku/.DS_Store new file mode 100644 index 0000000..61d5984 Binary files /dev/null and b/suaraku/.DS_Store differ diff --git a/suaraku/1.wav b/suaraku/1.wav new file mode 100644 index 0000000..bb518e9 Binary files /dev/null and b/suaraku/1.wav differ diff --git a/suaraku/10.wav b/suaraku/10.wav new file mode 100644 index 0000000..6348524 Binary files /dev/null and b/suaraku/10.wav differ diff --git a/suaraku/2.wav b/suaraku/2.wav new file mode 100644 index 0000000..d5b3cdb Binary files /dev/null and b/suaraku/2.wav differ diff --git a/suaraku/3.wav b/suaraku/3.wav new file mode 100644 index 0000000..7cb6d3a Binary files /dev/null and b/suaraku/3.wav differ diff --git a/suaraku/4.wav b/suaraku/4.wav new file mode 100644 index 0000000..98291fd Binary files /dev/null and b/suaraku/4.wav differ diff --git a/suaraku/5.wav b/suaraku/5.wav new file mode 100644 index 0000000..619088f Binary files /dev/null and b/suaraku/5.wav differ diff --git a/suaraku/6.wav b/suaraku/6.wav new file mode 100644 index 0000000..051b43a Binary files /dev/null and b/suaraku/6.wav differ diff --git a/suaraku/7.wav b/suaraku/7.wav new file mode 100644 index 0000000..235309b Binary files /dev/null and b/suaraku/7.wav differ diff --git a/suaraku/are.wav b/suaraku/are.wav new file mode 100644 index 0000000..0727ad8 Binary files /dev/null and b/suaraku/are.wav differ diff --git a/suaraku/book.wav b/suaraku/book.wav new file mode 100644 index 0000000..bbbaab1 Binary files /dev/null and b/suaraku/book.wav differ diff --git a/suaraku/car.wav b/suaraku/car.wav new file mode 100644 index 0000000..98e0233 Binary files /dev/null and b/suaraku/car.wav differ diff --git a/suaraku/cat.wav b/suaraku/cat.wav new file mode 100644 index 0000000..77706e7 Binary files /dev/null and b/suaraku/cat.wav differ diff --git a/suaraku/detected.wav b/suaraku/detected.wav new file mode 100644 index 0000000..eb81fab Binary files /dev/null and b/suaraku/detected.wav differ diff --git a/suaraku/dog.wav 
b/suaraku/dog.wav new file mode 100644 index 0000000..5e4fb03 Binary files /dev/null and b/suaraku/dog.wav differ diff --git a/suaraku/no.wav b/suaraku/no.wav new file mode 100644 index 0000000..8188f2a Binary files /dev/null and b/suaraku/no.wav differ diff --git a/suaraku/object.wav b/suaraku/object.wav new file mode 100644 index 0000000..b80b135 Binary files /dev/null and b/suaraku/object.wav differ diff --git a/suaraku/smartphone.wav b/suaraku/smartphone.wav new file mode 100644 index 0000000..428cf45 Binary files /dev/null and b/suaraku/smartphone.wav differ diff --git a/suaraku/there.wav b/suaraku/there.wav new file mode 100644 index 0000000..b9c6286 Binary files /dev/null and b/suaraku/there.wav differ diff --git a/suaraku/tv.wav b/suaraku/tv.wav new file mode 100644 index 0000000..0749886 Binary files /dev/null and b/suaraku/tv.wav differ diff --git a/suaraku/two.wav b/suaraku/two.wav new file mode 100644 index 0000000..d5b3cdb Binary files /dev/null and b/suaraku/two.wav differ