AWQ Quantized
!pip install git+https://github.com/huggingface/transformers.git -q
!pip install huggingface_hub -q
!pip install autoawq -q
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch
# Load the AWQ-quantized model and tokenizer
model_name_or_path = "arlineka/manbasya_2x7b_MOE"
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, trust_remote_code=False, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
# Set device to CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device
model.to(device)
# Prepare your input text and move input tensors to the same device
input_text = "Hello. Input Here"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
# Generate text with the model and input tensors on the same device
output = model.generate(input_ids, max_new_tokens=2048)  # adjust max_new_tokens as needed
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
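If you prefer to see tokens printed as they are produced instead of waiting for the full sequence, the same setup can be paired with transformers' TextStreamer. This is a minimal sketch that assumes the model, tokenizer, and device from the snippet above are already set up; the prompt string is only a placeholder.

from transformers import TextStreamer

# Stream decoded text to stdout as tokens are generated
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

prompt = "Hello. Input Here"  # placeholder prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# generate() forwards keyword arguments to the underlying transformers model,
# so the streamer prints the output incrementally during generation
model.generate(input_ids, max_new_tokens=512, streamer=streamer)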