Spaces:

AlyxTeam
/

DeepSeek-Coder-V2-Lite-Instruct

Running on Zero

App Files Community

AlyxTeam committed on Sep 19

Commit

16c80da

•

1 Parent(s): 6ddacd8

feat: ZeroGPU不支持量化 (ZeroGPU does not support quantization)

Browse files

Files changed (3) hide show

README.md +8 -1
app.py +29 -6
requirements.txt +3 -1

README.md CHANGED Viewed

@@ -10,4 +10,11 @@ pinned: false
 license: mit
 ---
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

 license: mit
 ---
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+If the Space fails with `OSError: [Errno 28] No space left on device`, clear the ZeroGPU offload directory:
+```bash
+rm -rf /data-nvme/zerogpu-offload/*
+```

app.py CHANGED Viewed

@@ -1,14 +1,35 @@
 import spaces
 import gradio as gr
 from huggingface_hub import InferenceClient
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 import subprocess
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 @spaces.GPU(duration=120)
 def respond(
@@ -19,16 +40,18 @@ def respond(
     temperature,
     top_p,
 ):
     if len(message) < 1:
         message = "write a quick sort algorithm in python."
     messages = [
-        { 'role': 'user', 'content': message }
     ]
-    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
-    outputs = model.generate(inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=top_p, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
     return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

 import spaces
 import gradio as gr
 from huggingface_hub import InferenceClient
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 import subprocess
+subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
+subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
+kwargs = {}
+"""
+https://huggingface.co/docs/transformers/quantization/bitsandbytes
+"""
+# quantization_config = BitsAndBytesConfig(
+#     load_in_4bit=True,
+#     bnb_4bit_quant_type="nf4",
+#     bnb_4bit_use_double_quant=True,
+#     bnb_4bit_compute_dtype=torch.bfloat16,
+# )
+# quantization_config = BitsAndBytesConfig(
+#     load_in_8bit=True,
+#     # llm_int8_enable_fp32_cpu_offload=True,
+# )
+# kwargs = { "quantization_config": quantization_config, "low_cpu_mem_usage": True }
 tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16, **kwargs).cuda()
 @spaces.GPU(duration=120)
 def respond(
     temperature,
     top_p,
 ):
+    modelx = model
     if len(message) < 1:
         message = "write a quick sort algorithm in python."
     messages = [
+        { "role": "user", "content": message }
     ]
+    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(modelx.device)
+    outputs = modelx.generate(inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=top_p, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
     return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)

requirements.txt CHANGED Viewed

@@ -1,2 +1,4 @@
 huggingface_hub==0.22.2
-transformers

 huggingface_hub==0.22.2
+transformers
+# accelerate
+# bitsandbytes