from transformers import AutoTokenizer, AutoModel
from transformers.generation.utils import logger
import warnings
import time, os, torch
# Quiet down transformers' generation logger and generic warnings for a
# cleaner console during the load/quantize run.
logger.setLevel("ERROR")
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# LoRA-merged GLM checkpoint to quantize.  trust_remote_code is required
# because GLM ships its own modeling code with the checkpoint.
model_path = "/home/weights/nga_lora_glm/model_best"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
print('model load done...')

tttmp = time.time()
quantization_bit = 4
# BUG FIX: the original f-string was broken across a raw newline inside the
# replacement-field braces, which is a SyntaxError on Python < 3.12 (only
# PEP 701 in 3.12 allows that).  Keep it on one line so the script runs on
# any supported interpreter.
print(f"Quantized to {quantization_bit} bit")
# GLM checkpoints expose a custom in-place .quantize(bits) method
# (provided by the trust_remote_code modeling file).
model = model.quantize(quantization_bit)
print("model quantized done, use time: {}s".format(time.time() - tttmp))
model = model.to(device)

# Persist the quantized weights plus tokenizer so the quantized model can be
# reloaded directly without re-quantizing.
cur_save_dir = "/home/weights/nga_tmp_bit/"
model.save_pretrained(cur_save_dir)
tokenizer.save_pretrained(cur_save_dir)
# transformers: save a quantized model and load it.
# Adapted from: blog.csdn.net/qq_42363032/article/details/130847170