import time
import cv2
import os
import imutils
import subprocess
from gtts import gTTS
from pydub import AudioSegment
AudioSegment.converter = "C:/Users/jasonyip184/Desktop/yolo-object-detection/ffmpeg-20181202-72b047a-win64-static/bin/ffmpeg.exe"
# load the COCO class labels our YOLO model was trained on
LABELS = open("yolo-coco/coco.names").read().strip().split("\n")
# load our YOLO object detector trained on COCO dataset (80 classes)
print("[INFO] loading YOLO from disk...")
net = cv2.dnn.readNetFromDarknet("yolo-coco/yolov3.cfg", "yolo-coco/yolov3.weights")
# determine only the *output* layer names that we need from YOLO
ln = net.getLayerNames()
# (flatten() keeps the indexing working across OpenCV versions, which differ
# in whether getUnconnectedOutLayers() returns a 1-D or an Nx1 array)
ln = [ln[i - 1] for i in net.getUnconnectedOutLayers().flatten()]
# initialize
cap = cv2.VideoCapture(0)
frame_count = 0
start = time.time()
first = True
frames = []
while True:
    frame_count += 1
    # capture frame-by-frame; stop if the camera returns no frame
    # (checking ret before flipping avoids crashing on a failed read)
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.flip(frame, 1)
    frames.append(frame)
    if frame_count == 300:
        break
    key = cv2.waitKey(1)
    # run detection on every 60th frame
    if frame_count % 60 == 0:
        end = time.time()
        # grab the frame dimensions and convert it to a blob
        (H, W) = frame.shape[:2]
        # construct a blob from the input image and then perform a forward
        # pass of the YOLO object detector, giving us our bounding boxes
        # and associated probabilities
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416),
                                     swapRB=True, crop=False)
        net.setInput(blob)
        layerOutputs = net.forward(ln)
        texts = []
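        # --- minimal sketch of the missing post-processing step: as written,
        # nothing ever fills `texts`, so no description would be spoken. The
        # pass below walks the YOLO layer outputs and appends one phrase per
        # detected class; the 0.5 confidence threshold and the coarse
        # left/center/right wording are illustrative assumptions, not part of
        # the original code. ---
        seen = set()
        for output in layerOutputs:
            for detection in output:
                scores = detection[5:]              # class scores follow the 4 box coords + objectness
                classID = int(scores.argmax())
                confidence = float(scores[classID])
                # keep confident, not-yet-reported objects (threshold is assumed)
                if confidence > 0.5 and classID not in seen:
                    seen.add(classID)
                    centerX = detection[0] * W      # YOLO coords are relative to frame size
                    if centerX <= W / 3:
                        position = "left"
                    elif centerX <= 2 * W / 3:
                        position = "center"
                    else:
                        position = "right"
                    texts.append(LABELS[classID] + " at your " + position)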
        print(texts)
        if texts:
            description = ', '.join(texts)
            tts = gTTS(description, lang='en')
            tts.save('tts.mp3')
            tts = AudioSegment.from_mp3("tts.mp3")
            subprocess.call(["ffplay", "-nodisp", "-autoexit", "tts.mp3"])
cap.release()
cv2.destroyAllWindows()
# tts.mp3 only exists if at least one description was spoken
if os.path.exists("tts.mp3"):
    os.remove("tts.mp3")