Lam-Hung committed on
Commit
6e510a5
1 Parent(s): 8d748b9

Update app.py

Files changed (1)
  1. app.py +5 -2
app.py CHANGED
@@ -5,9 +5,9 @@ from typing import Iterator
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
+from transformers import BitsAndBytesConfig, AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
 
-huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+huggingface_token = os.getenv('read_access')
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
@@ -17,10 +17,13 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 model_id = "google/gemma-2-9b-it"
 tokenizer = GemmaTokenizerFast.from_pretrained(model_id, token = huggingface_token)
+
+quantization = BitsAndBytesConfig(load_in_8bit= True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
+    quantization_config=quantization,
     token = huggingface_token
 )
 model.config.sliding_window = 4096
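
For context, the commit pairs an 8-bit BitsAndBytesConfig with the TextIteratorStreamer that app.py already imports. The sketch below is a minimal, hypothetical reconstruction of that loading-and-generation path, not the full app: it assumes the bitsandbytes and accelerate packages are installed, that the read_access environment variable holds a Hugging Face token with access to the gated google/gemma-2-9b-it checkpoint, and the prompt text is purely illustrative.

import os
from threading import Thread

import torch
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GemmaTokenizerFast,
    TextIteratorStreamer,
)

# Assumption: "read_access" holds a token with access to the gated
# google/gemma-2-9b-it checkpoint, as in the commit above.
huggingface_token = os.getenv("read_access")

model_id = "google/gemma-2-9b-it"
tokenizer = GemmaTokenizerFast.from_pretrained(model_id, token=huggingface_token)

# load_in_8bit=True quantizes the linear layers to int8 via bitsandbytes,
# roughly halving weight memory versus bfloat16; torch_dtype still applies
# to the modules that stay unquantized.
quantization = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization,
    token=huggingface_token,
)

# TextIteratorStreamer yields decoded text as generate() produces tokens;
# generate() runs in a background thread so the loop can consume the stream.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Why is the sky blue?", return_tensors="pt").to(model.device)
Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256),
).start()
for chunk in streamer:
    print(chunk, end="", flush=True)

With 8-bit weights the 9B checkpoint needs roughly 9 GB for weights instead of about 18 GB in bfloat16, which is presumably the motivation for adding quantization here.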