alamin655 committed on
Commit
16365ee
1 Parent(s): 9a2a4ca

Upload 2 files

Files changed (2)
  1. download_model.py +21 -0
  2. inference.py +74 -0
download_model.py ADDED
import os
from huggingface_hub import hf_hub_download


def download_replit_quant(destination_folder: str, repo_id: str, model_filename: str):
    local_path = os.path.abspath(destination_folder)
    return hf_hub_download(
        repo_id=repo_id,
        filename=model_filename,
        local_dir=local_path,
        local_dir_use_symlinks=True,
    )


if __name__ == "__main__":
    """full url: https://huggingface.co/abacaj/Replit-v2-CodeInstruct-3B-ggml/blob/main/replit-v2-codeinstruct-3b.q4_1.bin"""

    repo_id = "abacaj/Replit-v2-CodeInstruct-3B-ggml"
    model_filename = "replit-v2-codeinstruct-3b.q4_1.bin"
    destination_folder = "models"
    download_replit_quant(destination_folder, repo_id, model_filename)
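
As a quick usage sketch (not part of this commit), the helper can also be imported and pointed at a different destination folder; this assumes huggingface_hub is installed and reuses the repo id and filename hard-coded in the __main__ block, while the "weights" folder name is only an illustrative choice:

    # Hypothetical usage sketch, not part of the uploaded files:
    # reuse download_replit_quant() to fetch the quantized weights elsewhere.
    from download_model import download_replit_quant

    model_path = download_replit_quant(
        destination_folder="weights",  # assumed alternative folder
        repo_id="abacaj/Replit-v2-CodeInstruct-3B-ggml",
        model_filename="replit-v2-codeinstruct-3b.q4_1.bin",
    )
    print(model_path)  # path to the downloaded .bin file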
inference.py ADDED
import os
from dataclasses import dataclass, asdict
from ctransformers import AutoModelForCausalLM, AutoConfig


@dataclass
class GenerationConfig:
    temperature: float
    top_k: int
    top_p: float
    repetition_penalty: float
    max_new_tokens: int
    seed: int
    reset: bool
    stream: bool
    threads: int
    stop: list[str]


def format_prompt(user_prompt: str):
    return f"""### Instruction:
{user_prompt}

### Response:"""


def generate(
    llm: AutoModelForCausalLM,
    generation_config: GenerationConfig,
    user_prompt: str,
):
    """Run model inference; returns a generator when streaming is enabled."""

    return llm(
        format_prompt(
            user_prompt,
        ),
        **asdict(generation_config),
    )


if __name__ == "__main__":
    config = AutoConfig.from_pretrained(
        "teknium/Replit-v2-CodeInstruct-3B", context_length=2048
    )
    llm = AutoModelForCausalLM.from_pretrained(
        os.path.abspath("models/replit-v2-codeinstruct-3b.q4_1.bin"),
        model_type="replit",
        config=config,
    )

    generation_config = GenerationConfig(
        temperature=0.2,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=512,  # adjust as needed
        seed=42,
        reset=True,  # reset history (cache)
        stream=True,  # streaming per word/token
        threads=int(os.cpu_count() / 6),  # adjust for your CPU
        stop=["<|endoftext|>"],
    )

    user_prefix = "[user]: "
    assistant_prefix = "[assistant]:"

    while True:
        user_prompt = input(user_prefix)
        generator = generate(llm, generation_config, user_prompt.strip())
        print(assistant_prefix, end=" ", flush=True)
        for word in generator:
            print(word, end="", flush=True)
        print("")
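
A minimal sketch (again, not part of the commit) of consuming the streamed output programmatically instead of printing it token by token; it assumes llm and generation_config are built exactly as in the __main__ block above, with stream=True so generate() yields tokens one at a time:

    # Hypothetical helper, not part of the uploaded files: collect the
    # streamed tokens from generate() into a single response string.
    def run_once(llm, generation_config, prompt: str) -> str:
        pieces = []
        for token in generate(llm, generation_config, prompt.strip()):
            pieces.append(token)
        return "".join(pieces)

    # e.g. answer = run_once(llm, generation_config, "Write a function that reverses a string.")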