import streamlit as st
from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline, AutoModelForCausalLM, AutoTokenizer
from PIL import Image as PILImage
import scipy.io.wavfile as wavfile
import os
import uuid
# Set page config at the very beginning of the script
st.set_page_config(page_title="Image to Music", layout="wide")
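
# Note: Streamlit requires set_page_config to be the first Streamlit command in
# the script, which is why it runs here at module level rather than inside main().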

# Load the models once and cache them so Streamlit reruns do not reload them
@st.cache_resource
def load_models():
    model_id = "Salesforce/blip-image-captioning-large"
    processor = AutoProcessor.from_pretrained(model_id)
    blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
    synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
    phi_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-mini-instruct",
        device_map="auto",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
    return processor, blip_model, synthesiser, phi_model, phi_tokenizer

processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()
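
# The models are loaded once at import time; the first page load is slow, but
# st.cache_resource lets later Streamlit reruns reuse the same model objects.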

def image_to_text(_image: PILImage.Image):
    try:
        # Prepare the image for the model
        inputs = processor(images=_image, return_tensors="pt")
        # Generate a caption
        output = blip_model.generate(**inputs, max_new_tokens=100)
        # Decode the generated token IDs into a caption string
        caption = processor.decode(output[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        return f"Error in image_to_text: {str(e)}"

def refine_prompt(caption: str):
    try:
        messages = [
            {"role": "system", "content": "You are a helpful AI assistant for generating music prompts."},
            {"role": "user", "content": f"Generate a detailed music prompt based on this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions."},
        ]
        pipe = pipeline(
            "text-generation",
            model=phi_model,
            tokenizer=phi_tokenizer,
        )
        generation_args = {
            "max_new_tokens": 500,
            "return_full_text": False,
            "temperature": 0.7,
            "do_sample": True,
        }
        output = pipe(messages, **generation_args)
        refined_prompt = output[0]["generated_text"]
        return refined_prompt
    except Exception as e:
        return f"Error in refine_prompt: {str(e)}"
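
# Passing a list of chat messages relies on the text-generation pipeline applying
# the model's chat template (supported for Phi-3.5 on recent transformers releases);
# "return_full_text": False returns only the newly generated text, not the prompt.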

def text_to_music(response: str):
    try:
        # The text-to-audio pipeline returns a dict with "audio" and "sampling_rate"
        music = synthesiser(response, forward_params={"do_sample": True})
        # Write to a unique filename so parallel sessions do not clobber each other
        output_path = f"musicgen_out_{uuid.uuid4()}.wav"
        wavfile.write(output_path, rate=music["sampling_rate"], data=music["audio"])
        return output_path
    except Exception as e:
        return f"Error in text_to_music: {str(e)}"
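
# Hypothetical standalone run for local debugging, outside Streamlit
# ("example.jpg" is an assumed local file, not part of the app):
#   caption = image_to_text(PILImage.open("example.jpg"))
#   refined = refine_prompt(caption)
#   wav_path = text_to_music(refined)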

def cleanup_old_files():
    # Remove generated .wav files left over from previous runs
    for file in os.listdir():
        if file.startswith("musicgen_out_") and file.endswith(".wav"):
            os.remove(file)

def main():
    st.title("Image to Music")
    st.write("""
Generate music inspired by an image.
This app turns the inspiration drawn from an image into music by chaining several AI models.

## How It Works
1. **Image to Text Description**
   - Salesforce BLIP converts the image into a caption.
2. **Text to Refined Music Prompt**
   - Microsoft Phi-3.5-mini-instruct generates a detailed music prompt from the caption.
3. **Music Prompt to Music**
   - Facebook MusicGen generates music from the refined prompt.

## Steps
1. **Image -> [ Salesforce BLIP ] -> Caption**
2. **Caption -> [ Microsoft Phi-3.5-mini-instruct ] -> Refined Music Prompt**
3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**

Let's turn your visual inspirations into beautiful melodies!

**Please note:**
Music generation may take several minutes, since large AI models work behind the
scenes to create unique music from your image. Thank you for your patience!
""")
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image = PILImage.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        if st.button("Generate Music"):
            with st.spinner("Processing image..."):
                caption = image_to_text(image)
                st.text_area("Generated Caption", caption, height=100)
            with st.spinner("Refining music prompt..."):
                refined_prompt = refine_prompt(caption)
                st.text_area("Refined Music Prompt", refined_prompt, height=150)
            with st.spinner("Generating music..."):
                music_file = text_to_music(refined_prompt)
            if music_file.startswith("Error"):
                st.error(music_file)
            else:
                # Read the audio into memory before cleanup deletes the file on disk
                with open(music_file, "rb") as f:
                    st.audio(f.read())
                cleanup_old_files()

if __name__ == "__main__":
    main()