Spaces:
Running
on
Zero
Running
on
Zero
feat: ZeroGPU不支持量化
Browse files- README.md +8 -1
- app.py +29 -6
- requirements.txt +3 -1
README.md
CHANGED
@@ -10,4 +10,11 @@ pinned: false
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
-
An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
license: mit
|
11 |
---
|
12 |
|
13 |
+
An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
If the Space fails with `OSError: [Errno 28] No space left on device`, clear the ZeroGPU offload directory:
|
18 |
+
```bash
|
19 |
+
rm -rf /data-nvme/zerogpu-offload/*
|
20 |
+
```
|
app.py
CHANGED
@@ -1,14 +1,35 @@
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
from huggingface_hub import InferenceClient
|
4 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
5 |
import torch
|
6 |
import subprocess
|
7 |
|
8 |
-
subprocess.run(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True)
|
11 |
-
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
|
12 |
|
13 |
@spaces.GPU(duration=120)
|
14 |
def respond(
|
@@ -19,16 +40,18 @@ def respond(
|
|
19 |
temperature,
|
20 |
top_p,
|
21 |
):
|
|
|
|
|
22 |
if len(message) < 1:
|
23 |
message = "write a quick sort algorithm in python."
|
24 |
|
25 |
messages = [
|
26 |
-
{
|
27 |
]
|
28 |
|
29 |
-
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(
|
30 |
|
31 |
-
outputs =
|
32 |
|
33 |
return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
|
34 |
|
|
|
1 |
import spaces
|
2 |
import gradio as gr
|
3 |
from huggingface_hub import InferenceClient
|
4 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
5 |
import torch
|
6 |
import subprocess
|
7 |
|
8 |
+
# Empty the ZeroGPU offload directory at startup; the README lists this same
# command next to "OSError: [Errno 28] No space left on device".
# shell=True is needed so the shell expands the `*` glob; env={} runs the
# command with an empty environment. The exit status is not checked.
subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
|
9 |
+
# Install flash-attn at import time with FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
# (presumably to skip compiling the CUDA extension locally -- TODO confirm).
# NOTE(review): `env=` replaces the child's whole environment instead of
# extending it, so PATH and other variables are not inherited; confirm this is
# intended, otherwise pass {**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}.
# The exit status is not checked, so a failed install is ignored here.
subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
|
10 |
+
|
11 |
+
# Extra keyword arguments forwarded to AutoModelForCausalLM.from_pretrained.
# Left empty because the quantization configs in this file are commented out.
kwargs = {}
|
12 |
+
|
13 |
+
"""
|
14 |
+
https://huggingface.co/docs/transformers/quantization/bitsandbytes
|
15 |
+
"""
|
16 |
+
|
17 |
+
# quantization_config = BitsAndBytesConfig(
|
18 |
+
# load_in_4bit=True,
|
19 |
+
# bnb_4bit_quant_type="nf4",
|
20 |
+
# bnb_4bit_use_double_quant=True,
|
21 |
+
# bnb_4bit_compute_dtype=torch.bfloat16,
|
22 |
+
# )
|
23 |
+
|
24 |
+
# quantization_config = BitsAndBytesConfig(
|
25 |
+
# load_in_8bit=True,
|
26 |
+
# # llm_int8_enable_fp32_cpu_offload=True,
|
27 |
+
# )
|
28 |
+
|
29 |
+
# kwargs = { "quantization_config": quantization_config, "low_cpu_mem_usage": True }
|
30 |
|
31 |
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True)
|
32 |
+
# Load DeepSeek-Coder-V2-Lite-Instruct in bfloat16 and move it to CUDA at
# module import. `kwargs` is currently an empty dict, so no quantization
# config is applied. trust_remote_code=True lets the model repository's own
# code run during loading.
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", trust_remote_code=True, torch_dtype=torch.bfloat16, **kwargs).cuda()
|
33 |
|
34 |
@spaces.GPU(duration=120)
|
35 |
def respond(
|
|
|
40 |
temperature,
|
41 |
top_p,
|
42 |
):
|
43 |
+
modelx = model
|
44 |
+
|
45 |
if len(message) < 1:
|
46 |
message = "write a quick sort algorithm in python."
|
47 |
|
48 |
messages = [
|
49 |
+
{ "role": "user", "content": message }
|
50 |
]
|
51 |
|
52 |
+
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(modelx.device)
|
53 |
|
54 |
+
outputs = modelx.generate(inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature, top_k=50, top_p=top_p, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
|
55 |
|
56 |
return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
|
57 |
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
huggingface_hub==0.22.2
|
2 |
-
transformers
|
|
|
|
|
|
1 |
huggingface_hub==0.22.2
|
2 |
+
transformers
|
3 |
+
# accelerate
|
4 |
+
# bitsandbytes
|