first commit
3 .gitignore vendored Normal file
@@ -0,0 +1,3 @@
/env
yolo*
115 d3.py Normal file
@@ -0,0 +1,115 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import tkinter as tk
from threading import Thread

# Load YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for the default webcam, or specify a different index if needed

# Global variable for summary text
summary_text = ''

# Play a pre-recorded word if available, otherwise fall back to text-to-speech
def play_sound_or_tts(word, text_dir='suaraku', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
        sound = sound.speedup(playback_speed=speed)
        return sound
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        sound = sound.speedup(playback_speed=speed)
        os.remove('temp.mp3')
        return sound


def play_full_text(text, text_dir='suaraku', speed=1.0):
    # Strip punctuation, then speak the text word by word
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)


def on_button_press():
    global summary_text
    play_button.config(text="Loading...")
    play_thread = Thread(target=play_audio_and_update_button)
    play_thread.start()


def play_audio_and_update_button():
    global summary_text
    play_full_text(summary_text, 'suaraku', 1.1)
    play_button.config(text="Play Summary")


def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Keep detections with confidence above 0.4
    data_frame = data_frame[data_frame['confidence'] > 0.4]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        summary_text = 'There are: '
        for label, count in label_counts.items():
            summary_text += f'{count} {label}, '
        summary_text = summary_text[:-2]  # Remove the trailing comma and space
        summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
45 detect1.py Normal file
@@ -0,0 +1,45 @@
import torch
import cv2
import pandas as pd

# Download the model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

img = cv2.imread('car.jpg')
img = cv2.resize(img, (1000, 650))

# Perform detection on the image
result = model(img)
print('result: ', result)

# Convert the detection result to a pandas data frame
data_frame = result.pandas().xyxy[0]
print('data_frame:')
print(data_frame)

# Get the counts of each label
label_counts = data_frame['name'].value_counts()
print('Label counts:')
print(label_counts)

# Get the indexes of all of the rows
indexes = data_frame.index
for index in indexes:
    # Find the coordinates of the top-left corner of the bounding box
    x1 = int(data_frame['xmin'][index])
    y1 = int(data_frame['ymin'][index])
    # Find the coordinates of the bottom-right corner of the bounding box
    x2 = int(data_frame['xmax'][index])
    y2 = int(data_frame['ymax'][index])

    # Find the label name
    label = data_frame['name'][index]
    # Find the confidence score of the model
    conf = data_frame['confidence'][index]
    text = label + ' ' + str(conf.round(decimals=2))

    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 255, 0), 2)
    cv2.putText(img, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

cv2.imshow('IMAGE', img)
cv2.waitKey(0)
107 detect2.py Normal file
@@ -0,0 +1,107 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import pygame
import tkinter as tk
from threading import Thread

# Load YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for the default webcam, or specify a different index if needed

# Global variable for summary text
summary_text = ''

# Play a pre-recorded word if available, otherwise fall back to text-to-speech
def play_sound_or_tts(word, text_dir='', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
        sound = sound.speedup(playback_speed=speed)
        return sound
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        sound = sound.speedup(playback_speed=speed)
        os.remove('temp.mp3')
        return sound


def play_full_text(text, text_dir='', speed=1.5):
    # Strip punctuation, then speak the text word by word
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)


def on_button_press():
    global summary_text
    play_thread = Thread(target=play_full_text, args=(summary_text, '', 1.1))
    play_thread.start()


def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Keep detections with confidence above 0.5
    data_frame = data_frame[data_frame['confidence'] > 0.5]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    summary_text = 'There are: '
    for label, count in label_counts.items():
        summary_text += f'{count} {label}, '
    summary_text = summary_text[:-2]  # Remove the trailing comma and space
    summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)


# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
31 play.py Normal file
@@ -0,0 +1,31 @@
import os
from gtts import gTTS
import pygame
from pydub import AudioSegment
from pydub.playback import play


def play_sound_or_tts(filename, text):
    # Check if the sound file exists
    if os.path.exists(filename):
        # Play the sound file using pygame
        pygame.mixer.init()
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():  # Wait for the sound to finish playing
            continue
    else:
        # Convert text to speech using gTTS
        tts = gTTS(text)
        tts.save('temp.mp3')  # Save the generated speech to a temporary file

        # Load the temporary file into an AudioSegment
        sound = AudioSegment.from_file('temp.mp3')

        # Play the sound using pydub
        play(sound)

        # Remove the temporary file after playing
        os.remove('temp.mp3')


# Example usage
play_sound_or_tts('dog.wav', 'dog')
BIN suaraku/.DS_Store vendored Normal file (binary file not shown)
BIN suaraku/1.wav Normal file (binary file not shown)
BIN suaraku/10.wav Normal file (binary file not shown)
BIN suaraku/2.wav Normal file (binary file not shown)
BIN suaraku/3.wav Normal file (binary file not shown)
BIN suaraku/4.wav Normal file (binary file not shown)
BIN suaraku/5.wav Normal file (binary file not shown)
BIN suaraku/6.wav Normal file (binary file not shown)
BIN suaraku/7.wav Normal file (binary file not shown)
BIN suaraku/are.wav Normal file (binary file not shown)
BIN suaraku/book.wav Normal file (binary file not shown)
BIN suaraku/car.wav Normal file (binary file not shown)
BIN suaraku/cat.wav Normal file (binary file not shown)
BIN suaraku/detected.wav Normal file (binary file not shown)
BIN suaraku/dog.wav Normal file (binary file not shown)
BIN suaraku/no.wav Normal file (binary file not shown)
BIN suaraku/object.wav Normal file (binary file not shown)
BIN suaraku/smartphone.wav Normal file (binary file not shown)
BIN suaraku/there.wav Normal file (binary file not shown)
BIN suaraku/tv.wav Normal file (binary file not shown)
BIN suaraku/two.wav Normal file (binary file not shown)