first commit
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
/env
yolo*
d3.py (new file, 115 lines)
@@ -0,0 +1,115 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import tkinter as tk
from threading import Thread

# Load YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for default webcam, or specify a different index if needed

# Global variable for summary text
summary_text = ''

# Function to play sound or text-to-speech
def play_sound_or_tts(word, text_dir='suaraku', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
        sound = sound.speedup(playback_speed=speed)
        return sound
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        sound = sound.speedup(playback_speed=speed)
        os.remove('temp.mp3')
        return sound

def play_full_text(text, text_dir='suaraku', speed=1.0):
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)

def on_button_press():
    global summary_text
    play_button.config(text="Loading...")
    # Run playback in a background thread so the Tk mainloop stays responsive
    play_thread = Thread(target=play_audio_and_update_button)
    play_thread.start()

def play_audio_and_update_button():
    global summary_text
    play_full_text(summary_text, 'suaraku', 1.1)
    play_button.config(text="Play Summary")

def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Filter detections with confidence above 40%
    data_frame = data_frame[data_frame['confidence'] > 0.4]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    if label_counts.empty:
        summary_text = 'No objects detected.'
    else:
        summary_text = 'There are: '
        for label, count in label_counts.items():
            summary_text += f'{count} {label}, '
        summary_text = summary_text[:-2]  # Remove the last comma and space
        summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)

# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
detect1.py (new file, 45 lines)
@@ -0,0 +1,45 @@
import torch
import cv2
import pandas as pd

# Download the model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

img = cv2.imread('car.jpg')
img = cv2.resize(img, (1000, 650))

# Perform detection on the image
result = model(img)
print('result: ', result)

# Convert the detection result to a pandas data frame
data_frame = result.pandas().xyxy[0]
print('data_frame:')
print(data_frame)

# Get the counts of each label
label_counts = data_frame['name'].value_counts()
print('Label counts:')
print(label_counts)

# Get the indexes of all of the rows
indexes = data_frame.index
for index in indexes:
    # Find the coordinates of the top-left corner of the bounding box
    x1 = int(data_frame['xmin'][index])
    y1 = int(data_frame['ymin'][index])
    # Find the coordinates of the bottom-right corner of the bounding box
    x2 = int(data_frame['xmax'][index])
    y2 = int(data_frame['ymax'][index])

    # Find the label name
    label = data_frame['name'][index]
    # Find the confidence score of the model
    conf = data_frame['confidence'][index]
    text = label + ' ' + str(conf.round(decimals=2))

    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 255, 0), 2)
    cv2.putText(img, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

cv2.imshow('IMAGE', img)
cv2.waitKey(0)
detect2.py (new file, 107 lines)
@@ -0,0 +1,107 @@
import torch
import cv2
import pandas as pd
import os
import string
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play
import pygame
import tkinter as tk
from threading import Thread

# Load YOLOv5 model from GitHub
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')

# Initialize webcam capture
cap = cv2.VideoCapture(0)  # Use 0 for default webcam, or specify a different index if needed

# Global variable for summary text
summary_text = ''

# Function to play sound or text-to-speech
def play_sound_or_tts(word, text_dir='', speed=1):
    filename = os.path.join(text_dir, f"{word}.wav")
    print(filename)

    if os.path.exists(filename):
        sound = AudioSegment.from_file(filename)
        sound = sound.speedup(playback_speed=speed)
        return sound
    else:
        tts = gTTS(word)
        tts.save('temp.mp3')
        sound = AudioSegment.from_file('temp.mp3')
        sound = sound.speedup(playback_speed=speed)
        os.remove('temp.mp3')
        return sound

def play_full_text(text, text_dir='', speed=1.5):
    translator = str.maketrans('', '', string.punctuation)
    words = text.translate(translator).split()

    combined_sound = AudioSegment.silent(duration=0)

    for word in words:
        print(word)
        sound = play_sound_or_tts(word, text_dir, speed)
        combined_sound += sound

    play(combined_sound)

def on_button_press():
    global summary_text
    # Play the summary in a background thread so the Tk mainloop is not blocked
    play_thread = Thread(target=play_full_text, args=(summary_text, '', 1.1))
    play_thread.start()

def process_frame():
    global summary_text
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        return

    result = model(frame)
    data_frame = result.pandas().xyxy[0]

    # Filter detections with confidence above 50%
    data_frame = data_frame[data_frame['confidence'] > 0.5]

    label_counts = data_frame['name'].value_counts()
    indexes = data_frame.index
    for index in indexes:
        x1 = int(data_frame['xmin'][index])
        y1 = int(data_frame['ymin'][index])
        x2 = int(data_frame['xmax'][index])
        y2 = int(data_frame['ymax'][index])
        label = data_frame['name'][index]
        conf = data_frame['confidence'][index]
        text = label + ' ' + str(conf.round(decimals=2))
        cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 255, 0), 2)
        cv2.putText(frame, text, (x1, y1-5), cv2.FONT_HERSHEY_PLAIN, 2, (255, 255, 0), 2)

    summary_text = 'There are: '
    for label, count in label_counts.items():
        summary_text += f'{count} {label}, '
    summary_text = summary_text[:-2]
    summary_text += ' detected.'

    cv2.putText(frame, summary_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    cv2.imshow('Webcam Feed', frame)
    cv2.waitKey(1)
    root.after(10, process_frame)

# GUI setup
root = tk.Tk()
root.title("Object Detection with Sound Playback")

play_button = tk.Button(root, text="Play Summary", command=on_button_press)
play_button.pack()

# Start the webcam processing
root.after(10, process_frame)

root.mainloop()

cap.release()
cv2.destroyAllWindows()
play.py (new file, 31 lines)
@@ -0,0 +1,31 @@
import os
from gtts import gTTS
import pygame
from pydub import AudioSegment
from pydub.playback import play

def play_sound_or_tts(filename, text):
    # Check if the sound file exists
    if os.path.exists(filename):
        # Play the sound file using pygame
        pygame.mixer.init()
        pygame.mixer.music.load(filename)
        pygame.mixer.music.play()
        while pygame.mixer.music.get_busy():  # Wait for the sound to finish playing
            continue
    else:
        # Convert text to speech using gTTS
        tts = gTTS(text)
        tts.save('temp.mp3')  # Save the generated speech to a temporary file

        # Load the temporary file into an AudioSegment
        sound = AudioSegment.from_file('temp.mp3')

        # Play the sound using pydub
        play(sound)

        # Remove the temporary file after playing
        os.remove('temp.mp3')

# Example usage
play_sound_or_tts('dog.wav', 'dog')
New binary files (content not shown):
suaraku/.DS_Store (vendored)
suaraku/1.wav
suaraku/10.wav
suaraku/2.wav
suaraku/3.wav
suaraku/4.wav
suaraku/5.wav
suaraku/6.wav
suaraku/7.wav
suaraku/are.wav
suaraku/book.wav
suaraku/car.wav
suaraku/cat.wav
suaraku/detected.wav
suaraku/dog.wav
suaraku/no.wav
suaraku/object.wav
suaraku/smartphone.wav
suaraku/there.wav
suaraku/tv.wav
suaraku/two.wav