# chen459664's picture
# Add files using upload-large-folder tool
# 2051791 verified
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from alphalora.expert_number import calculate_expert
import json
import torch
from lsaq_quant import quantize_llama_like, quantize_qwen_like
import json
import numpy as np
import argparse
import json
import heapq
def get_top_k_indices(json_file_path, k, reverse=True):
    """Return the indices of the k largest (or smallest) values in a JSON list.

    The file at ``json_file_path`` must contain a single JSON array. Its
    elements are ranked by value and the positions of the first ``k``
    elements in that ranking are returned. Elements that compare equal keep
    their original relative order because the sort is stable.

    Args:
        json_file_path: Path to a UTF-8 encoded JSON file holding a list.
        k: Number of indices to return; an integer in ``[1, len(list)]``.
        reverse: When True (default) rank from largest to smallest, so the
            result is the top-k indices. When False rank from smallest to
            largest, so the result is the bottom-k indices.

    Returns:
        list: ``k`` indices into the JSON list, ordered by rank.

    Raises:
        FileNotFoundError: The file does not exist.
        ValueError: The file is not valid JSON, its content is not a list,
            or ``k`` is not an integer within the valid range.
    """
    # Load the JSON payload, translating low-level failures into the
    # documented exception types while keeping the original cause attached.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"文件 {json_file_path} 不存在") from err
    except json.JSONDecodeError as err:
        raise ValueError("JSON文件格式错误") from err

    # Only a top-level JSON array is supported.
    if not isinstance(data, list):
        raise ValueError("JSON文件内容不是列表")

    # k is used as a slice bound below, so it must be integer-like (anything
    # exposing __index__) as well as inside [1, len(data)]. Checking this here
    # reports e.g. k=1.5 as the documented ValueError instead of a TypeError
    # raised by the slice.
    if not hasattr(k, "__index__") or k <= 0 or k > len(data):
        raise ValueError(f"k值无效,应在1到{len(data)}之间")

    # Rank the indices directly by the value they point at; this avoids
    # building an intermediate list of (value, index) pairs.
    ranked_indices = sorted(range(len(data)), key=data.__getitem__, reverse=reverse)
    return ranked_indices[:k]
# ---------------------------------------------------------------------------
# Driver script: load a causal LM, print a sample, quantize either the
# attention or the MLP sub-modules of every decoder layer, print another
# sample, then save the quantized model and tokenizer.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="parser")
parser.add_argument(
    "--bit_layer_idx",
    type=int,
    required=True,
    help="index of the single layer to quantize at 2 bits; -1 keeps every layer at 4 bits",
)
parser.add_argument(
    "--save_dir",
    type=str,
    required=True,
    help="directory the quantized model and tokenizer are saved to",
)
# Restricting the accepted values makes an unsupported mode fail at argument
# parsing. Without it the script would load the model, sample from it, and
# only then hit a NameError because neither module list had been defined.
parser.add_argument(
    "--mode",
    type=str,
    required=True,
    choices=["self_attn", "mlp"],
    help="which sub-module of each layer to quantize",
)
parser.add_argument(
    "--model_id",
    type=str,
    required=True,
    help="model identifier or path passed to from_pretrained",
)
parser.add_argument(
    "--cuda_id",
    type=int,
    required=True,
    help="index of the CUDA device to load the model on",
)
args = parser.parse_args()
print(args)

model_id = args.model_id
device = f"cuda:{args.cuda_id}"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16,
)
print(model)

tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(device)


def _sample_continuation(lm):
    """Sample a continuation of the module-level ``inputs`` from ``lm`` and decode it."""
    with torch.no_grad():
        generated = lm.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,   # random sampling rather than greedy decoding
            top_k=50,         # restrict the sampling candidates
            top_p=0.95,       # nucleus sampling
            temperature=0.7,  # controls generation diversity
        )
    # Decode the first (only) sequence back into a string.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# Sample once before quantization so the output can be compared by eye.
decoded = _sample_continuation(model)
print(decoded)

# Per-layer bit widths: every layer starts at ``low_bit``.
low_bit = 4
num_layers = model.config.num_hidden_layers
bit_layers = [low_bit] * num_layers
# with open(args.bit_layers, 'r', encoding='utf-8') as f:
#     bit_layers = json.load(f)
# assert len(bit_layers) == num_layers

# Drop one layer to 2 bits; -1 is the sentinel for "leave every layer alone".
# NOTE(review): other negative values are not rejected and index from the end
# of the list, as for any Python list.
if args.bit_layer_idx != -1:
    bit_layers[args.bit_layer_idx] = 2
# [28, 26, 20, 16, 24]
# for index in list(range(24,29)):
#     bit_layers[index] = 2
print('bit_layers: ', bit_layers)

# Build the module-name lists handed to the quantizer; exactly one of the two
# lists is populated, depending on --mode.
layer_to_quant = list(range(num_layers))
if args.mode == "self_attn":
    mlp_quant = []
    self_attn_quant = [f'layers.{item}.self_attn' for item in layer_to_quant]
else:  # args.mode == "mlp", guaranteed by the argparse ``choices`` above
    mlp_quant = [f'layers.{item}.mlp' for item in layer_to_quant]
    self_attn_quant = []

print('quanting ... ')
# Model ids containing "qwen" use the Qwen-specific quantizer; everything
# else goes through the LLaMA-style one.
if "qwen" in model_id.lower():
    quantize_fn = quantize_qwen_like
else:
    quantize_fn = quantize_llama_like
model_lsaq = quantize_fn(model, mlp_quant, self_attn_quant, low_bit=low_bit, bit_layers=bit_layers)
print('quanted')
print(model_lsaq)

# Sample again after quantization.
# NOTE(review): this samples from ``model``, not ``model_lsaq``. That only
# reflects the quantized weights if the quantizer modifies ``model`` in place
# -- confirm against lsaq_quant.
decoded = _sample_continuation(model)
print(decoded)

# Compute alpha_values
# all_layer_alpha = calculate_expert(model)
# with open(f"alpha_values_llama-2-7b-qint{bit}.json", "w") as f:
#     json.dump(all_layer_alpha, f, indent=4)
# save_dir = f"../models/Llama-2-7b-hf-Kurtosis_only"
model_lsaq.save_pretrained(args.save_dir)
tokenizer.save_pretrained(args.save_dir)