# beyoureyes/app.py — uploaded by ake178178 (commit d2b41a1, "Update app.py")
import streamlit as st
import cv2
import torch
from transformers import AutoProcessor, AutoModelForImageClassification
from gtts import gTTS
import os
import time
# Remind the user to manually verify OS-level camera permissions before use.
st.warning("请确保已允许应用访问您的摄像头。对于 Windows 用户,请检查 [设置 -> 隐私 -> 摄像头]。对于 macOS/iOS 用户,请检查 [系统偏好设置 -> 安全性与隐私 -> 摄像头]。")
# Load the Hugging Face model (cached by Streamlit across reruns).
@st.cache_resource
def load_model():
    """Load the ResNet-50 image-classification processor and model.

    Wrapped in ``st.cache_resource`` so the weights are downloaded and
    initialized only once per server process, not on every page rerun.

    Returns:
        tuple: ``(AutoProcessor, AutoModelForImageClassification)`` for
        the ``microsoft/resnet-50`` checkpoint.
    """
    checkpoint = "microsoft/resnet-50"
    return (
        AutoProcessor.from_pretrained(checkpoint),
        AutoModelForImageClassification.from_pretrained(checkpoint),
    )
# Instantiate the (cached) processor/model pair used by the capture loop below.
processor, model = load_model()
# Page title
st.title("帮助盲人识别物体的应用")
# Header plus the button that starts the camera + 10-second recognition loop.
st.header("点击下方按钮打开摄像头拍照并识别物体")
run = st.button('打开摄像头并开始识别')
if run:
    st.text("正在打开摄像头,请稍等...")
    camera = cv2.VideoCapture(0)
    # Verify the capture device actually opened (permissions, device busy, ...).
    if not camera.isOpened():
        st.error("无法打开摄像头,请检查摄像头权限设置")
    else:
        # Reuse fixed placeholders so each iteration updates the page in place
        # instead of appending a new image/text/audio element forever.
        frame_slot = st.empty()
        result_slot = st.empty()
        audio_slot = st.empty()
        try:
            while True:
                ret, frame = camera.read()
                if not ret:
                    st.error("无法读取摄像头画面")
                    break
                # Show the live frame (OpenCV frames are BGR).
                frame_slot.image(frame, channels="BGR")
                # Persist the capture to disk (kept from the original behavior).
                cv2.imwrite("captured_image.jpg", frame)
                # The HF processor expects RGB, so convert from OpenCV's BGR.
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                inputs = processor(images=image, return_tensors="pt")
                # Inference only — no gradients needed.
                with torch.no_grad():
                    outputs = model(**inputs)
                predicted_label = torch.argmax(outputs.logits, dim=1).item()
                label = model.config.id2label[predicted_label]
                result_slot.write(f"识别到的物体: {label}")
                # Speak the result. st.audio streams the clip to the browser,
                # rather than shelling out to a server-side mpg321 binary that
                # may not exist and would play on the wrong machine anyway.
                tts = gTTS(text=f"The object detected is {label}", lang='en')
                tts.save("output.mp3")
                audio_slot.audio("output.mp3")
                # Capture and classify once every 10 seconds; sleeping at the
                # end means the first result appears immediately.
                time.sleep(10)
        finally:
            # Release the device even if an exception escapes the loop,
            # otherwise the camera stays locked until the process exits.
            camera.release()