Lam-Hung committed on
Commit
6e510a5
1 Parent(s): 8d748b9

Update app.py

Files changed (1)
  1. app.py +5 -2
app.py CHANGED
@@ -5,9 +5,9 @@ from typing import Iterator
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
+from transformers import BitsAndBytesConfig, AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
 
-huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+huggingface_token = os.getenv('read_access')
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -17,10 +17,13 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 model_id = "google/gemma-2-9b-it"
 tokenizer = GemmaTokenizerFast.from_pretrained(model_id, token = huggingface_token)
+
+quantization = BitsAndBytesConfig(load_in_8bit= True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
+    quantization_config=quantization,
     token = huggingface_token
 )
 model.config.sliding_window = 4096
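
For context, the commit pairs an 8-bit BitsAndBytesConfig with the TextIteratorStreamer that app.py already imports. The sketch below is a minimal, hypothetical reconstruction of that loading-and-generation path, not the full app: it assumes the bitsandbytes and accelerate packages are installed, that the read_access environment variable holds a Hugging Face token with access to the gated google/gemma-2-9b-it checkpoint, and the prompt text is purely illustrative.

import os
from threading import Thread

import torch
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GemmaTokenizerFast,
    TextIteratorStreamer,
)

# Assumption: "read_access" holds a token with access to the gated
# google/gemma-2-9b-it checkpoint, as in the commit above.
huggingface_token = os.getenv("read_access")

model_id = "google/gemma-2-9b-it"
tokenizer = GemmaTokenizerFast.from_pretrained(model_id, token=huggingface_token)

# load_in_8bit=True quantizes the linear layers to int8 via bitsandbytes,
# roughly halving weight memory versus bfloat16; torch_dtype still applies
# to the modules that stay unquantized.
quantization = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization,
    token=huggingface_token,
)

# TextIteratorStreamer yields decoded text as generate() produces tokens;
# generate() runs in a background thread so the loop can consume the stream.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Why is the sky blue?", return_tensors="pt").to(model.device)
Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256),
).start()
for chunk in streamer:
    print(chunk, end="", flush=True)

With 8-bit weights the 9B checkpoint needs roughly 9 GB for weights instead of about 18 GB in bfloat16, which is presumably the motivation for adding quantization here.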