sagar007 committed
Commit: fcba473
Parent: 57d039c

Update app.py

Files changed (1):
  app.py  +1 -6
app.py CHANGED
@@ -2,10 +2,6 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import subprocess
-
-# Install flash-attn
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 # Load the model and tokenizer
 model_name = "akjindal53244/Llama-3.1-Storm-8B"
@@ -13,7 +9,6 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
-    use_flash_attention_2=True,
     device_map="auto"
 )
 
@@ -25,7 +20,7 @@ def generate_text(prompt, max_length, temperature):
     ]
     formatted_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
 
-    inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
+    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
 
     outputs = model.generate(
         **inputs,
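
Note on the input-placement change: with device_map="auto", Accelerate decides where the model's weights live, so hard-coding .to("cuda") can leave the prompt tensors on the wrong device and fails outright on CPU-only hardware. A minimal sketch of the safer pattern, assuming a tokenizer and model loaded as in the file above (the max_new_tokens value and the decode call are illustrative, not part of this commit):

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))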
 
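The removed lines installed flash-attn at runtime and passed use_flash_attention_2=True to from_pretrained, a flag superseded by attn_implementation in newer transformers releases. If flash attention is wanted again later, a hedged sketch of that route, assuming the flash-attn package is already present in the Space's environment:

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "akjindal53244/Llama-3.1-Storm-8B",
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",  # assumes flash-attn is pre-installed
        device_map="auto",
    )

Without that argument, transformers falls back to its default attention implementation, which is what the updated app.py now relies on.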