# Image_to_Music / app.py
import streamlit as st
from transformers import AutoProcessor, BlipForConditionalGeneration, pipeline, AutoModelForCausalLM, AutoTokenizer
from PIL import Image as PILImage
import scipy.io.wavfile as wavfile
import os
import io
import uuid
# Set page config at the very beginning
st.set_page_config(page_title="Image to Music", layout="wide")
# Load models outside of functions
@st.cache_resource
def load_models():
model_id = "Salesforce/blip-image-captioning-large"
processor = AutoProcessor.from_pretrained(model_id)
blip_model = BlipForConditionalGeneration.from_pretrained(model_id)
synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small")
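    # Phi-3.5-mini-instruct turns the BLIP caption into a richer, music-oriented prompt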
phi_model = AutoModelForCausalLM.from_pretrained(
"microsoft/Phi-3.5-mini-instruct",
device_map="auto",
torch_dtype="auto",
trust_remote_code=True
)
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
return processor, blip_model, synthesiser, phi_model, phi_tokenizer
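# Load (and cache) all models once so Streamlit reruns reuse them instead of reloading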
processor, blip_model, synthesiser, phi_model, phi_tokenizer = load_models()
@st.cache_data
def image_to_text(image_bytes: bytes):
    try:
        # Decode the raw upload; caching on the image bytes means each distinct image gets its own caption
        image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
        # Prepare the image for the model
        inputs = processor(images=image, return_tensors="pt")
# Generate caption
output = blip_model.generate(**inputs, max_new_tokens=100)
# Decode the output
caption = processor.decode(output[0], skip_special_tokens=True)
return caption
# # Create a music generation prompt based on the caption
# music_prompt = f"Generate music inspired by this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions evoked by the scene."
# return music_prompt
except Exception as e:
return f"Error in image_to_text: {str(e)}"
@st.cache_data
def refine_prompt(caption: str):
try:
messages = [
{"role": "system", "content": "You are a helpful AI assistant for generating music prompts."},
{"role": "user", "content": f"Generate a detailed music prompt based on this scene: {caption}. Consider elements like tempo, instrumentation, genre, and emotions."}
]
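        # Wrap the preloaded Phi model in a text-generation pipeline;
        # with st.cache_data this only runs on a cache miss for a new caption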
pipe = pipeline(
"text-generation",
model=phi_model,
tokenizer=phi_tokenizer,
)
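        # Sample up to 500 new tokens and return only the newly generated text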
generation_args = {
"max_new_tokens": 500,
"return_full_text": False,
"temperature": 0.7,
"do_sample": True,
}
output = pipe(messages, **generation_args)
refined_prompt = output[0]['generated_text']
return refined_prompt
except Exception as e:
return f"Error in refine_prompt: {str(e)}"
def text_to_music(response: str):
try:
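        # The text-to-audio pipeline returns a dict with the waveform ("audio") and its "sampling_rate"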
music = synthesiser(response, forward_params={"do_sample": True})
output_path = f"musicgen_out_{uuid.uuid4()}.wav"
wavfile.write(output_path, rate=music["sampling_rate"], data=music["audio"])
return output_path
except Exception as e:
return f"Error in text_to_music: {str(e)}"
def cleanup_old_files():
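    # Delete any leftover generated WAV files in the working directory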
for file in os.listdir():
if file.startswith("musicgen_out_") and file.endswith(".wav"):
os.remove(file)
def main():
# st.set_page_config(page_title="Image to Music", layout="wide")
st.title("Image to Music")
st.write("""
Generate music inspired by an image.
This project enables the creation of music based on the inspiration drawn from an image, leveraging multiple AI technologies.
## How It Works
1. **Image to Text Description**
- Use Salesforce BLIP to convert the image into a caption.
2. **Text to Refined Music Prompt**
- Use Microsoft Phi-3.5-mini-instruct to generate a detailed music prompt based on the caption.
3. **Music Prompt to Music**
- Use Facebook MusicGen to generate music from the refined prompt.
## Steps
1. **Image -> [ Salesforce BLIP ] -> Caption**
2. **Caption -> [ Microsoft Phi-3.5-mini ] -> Refined Music Prompt**
3. **Refined Music Prompt -> [ Facebook MusicGen ] -> Music**
Let's turn your visual inspirations into beautiful melodies!
**Please Note:**
The music generation process may take several minutes to complete.
This is due to the complex AI models working behind the scenes to create unique music based on your image.
Thank you for your patience! """)
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
image = PILImage.open(uploaded_file)
st.image(image, caption="Uploaded Image", use_column_width=True)
if st.button("Generate Music"):
with st.spinner("Processing image..."):
                caption = image_to_text(uploaded_file.getvalue())
st.text_area("Generated Caption", caption, height=100)
with st.spinner("Refining music prompt..."):
refined_prompt = refine_prompt(caption)
st.text_area("Refined Music Prompt", refined_prompt, height=150)
with st.spinner("Generating music..."):
music_file = text_to_music(refined_prompt)
st.audio(music_file)
cleanup_old_files()
if __name__ == "__main__":
main()