"""Streamlit app that helps blind users identify objects with the webcam.

Captures a frame from the default camera every few seconds, classifies it
with a ResNet-50 image-classification model, shows the predicted label,
and speaks it aloud via gTTS.
"""

import time

import cv2
import streamlit as st
import torch
from gtts import gTTS
from transformers import AutoModelForImageClassification, AutoProcessor

# Seconds to wait between successive captures/classifications.
CAPTURE_INTERVAL_SECONDS = 10


@st.cache_resource
def load_model():
    """Load and cache the ResNet-50 processor/model pair.

    Returns:
        tuple: (processor, model) — cached across Streamlit reruns so the
        weights are only downloaded/loaded once.
    """
    processor = AutoProcessor.from_pretrained("microsoft/resnet-50")
    model = AutoModelForImageClassification.from_pretrained("microsoft/resnet-50")
    return processor, model


def _classify_frame(frame, processor, model):
    """Return the predicted class label for one BGR OpenCV frame.

    Args:
        frame: H x W x 3 uint8 image in BGR channel order (as read by OpenCV).
        processor: Hugging Face image processor.
        model: Hugging Face image-classification model.

    Returns:
        str: human-readable label from ``model.config.id2label``.
    """
    # OpenCV delivers BGR; the HF processor expects RGB.
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    inputs = processor(images=rgb, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_index = torch.argmax(logits, dim=1).item()
    return model.config.id2label[predicted_index]


def _speak_label(label, audio_slot):
    """Synthesize the label to speech and play it inside the app.

    Uses ``st.audio`` rather than shelling out to ``mpg321`` so playback
    works on every platform the browser runs on (the original command was
    Linux-only, yet the app targets Windows/macOS users too).
    """
    tts = gTTS(text=f"The object detected is {label}", lang="en")
    tts.save("output.mp3")
    with open("output.mp3", "rb") as audio_file:
        audio_slot.audio(audio_file.read(), format="audio/mp3")


def main():
    """Build the UI and run the capture → classify → speak loop."""
    # Remind users to grant camera permission manually.
    st.warning("请确保已允许应用访问您的摄像头。对于 Windows 用户,请检查 [设置 -> 隐私 -> 摄像头]。对于 macOS/iOS 用户,请检查 [系统偏好设置 -> 安全性与隐私 -> 摄像头]。")

    processor, model = load_model()

    st.title("帮助盲人识别物体的应用")
    st.header("点击下方按钮打开摄像头拍照并识别物体")

    if not st.button('打开摄像头并开始识别'):
        return

    st.text("正在打开摄像头,请稍等...")
    camera = cv2.VideoCapture(0)
    if not camera.isOpened():
        st.error("无法打开摄像头,请检查摄像头权限设置")
        return

    # Reusable placeholders: updating these in place avoids appending a new
    # widget to the page on every loop iteration (the original version grew
    # the page unboundedly).
    frame_slot = st.empty()
    label_slot = st.empty()
    audio_slot = st.empty()

    try:
        while True:
            ret, frame = camera.read()
            if not ret:
                st.error("无法读取摄像头画面")
                break

            frame_slot.image(frame, channels="BGR")
            # Keep the last capture on disk for debugging/inspection.
            cv2.imwrite("captured_image.jpg", frame)

            # Classify immediately; the original slept 10 s BEFORE
            # classification, delaying every result by a full interval.
            label = _classify_frame(frame, processor, model)
            label_slot.write(f"识别到的物体: {label}")
            _speak_label(label, audio_slot)

            time.sleep(CAPTURE_INTERVAL_SECONDS)
    finally:
        # Always free the device, even if an exception is raised or
        # Streamlit interrupts the script for a rerun.
        camera.release()


main()