File size: 4,901 Bytes
55c92b3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from alphalora.expert_number import calculate_expert
import json
import torch
from lsaq_quant import quantize_llama_like, quantize_qwen_like
import json
import numpy as np
import argparse
import json
import heapq
def get_top_k_indices(json_file_path, k, reverse=True):
    """Return the indices of the k highest- or lowest-valued entries of a JSON list.

    Args:
        json_file_path: Path to a JSON file whose top-level value is a list.
        k: Number of indices to return; must satisfy 1 <= k <= len(list).
        reverse: When True (the default) entries are ranked from largest to
            smallest, so the result holds the indices of the k largest
            values. When False they are ranked from smallest to largest,
            so the result holds the indices of the k smallest values.

    Returns:
        list: Indices into the JSON list, in rank order. Entries with equal
        values keep their original relative order (the sort is stable).

    Raises:
        FileNotFoundError: The file does not exist.
        ValueError: The file is not valid JSON, its content is not a list,
            or k is outside the range 1..len(list).
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as err:
        # Chain the original error so the OS-level details stay in the traceback.
        raise FileNotFoundError(f"文件 {json_file_path} 不存在") from err
    except json.JSONDecodeError as err:
        # Chain so the parser's line/column information is not lost.
        raise ValueError("JSON文件格式错误") from err

    if not isinstance(data, list):
        raise ValueError("JSON文件内容不是列表")

    # An empty list can never satisfy this check, so it is always rejected here.
    if k <= 0 or k > len(data):
        raise ValueError(f"k值无效,应在1到{len(data)}之间")

    # Sort the positions by the value stored at each position; only the
    # values are compared, so ties are resolved by original position.
    ranked_indices = sorted(range(len(data)), key=data.__getitem__, reverse=reverse)
    return ranked_indices[:k]
def _generate_sample(lm, tokenizer, inputs, max_new_tokens=100):
    """Sample one continuation of `inputs` from `lm` and return it decoded."""
    with torch.no_grad():
        outputs = lm.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,   # stochastic sampling rather than greedy decoding
            top_k=50,         # restrict sampling to the 50 most likely tokens
            top_p=0.95,       # nucleus sampling
            temperature=0.7,  # controls how diverse the output is
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Command-line interface: every option is required.
parser = argparse.ArgumentParser(description="parser")
# Index of the single layer to drop to 2 bits; -1 leaves every layer at `low_bit`.
parser.add_argument("--bit_layer_idx", type=int, required=True)
parser.add_argument("--save_dir", type=str, required=True)
# Only these two modes define the module lists passed to the quantizer below;
# rejecting anything else here avoids a NameError after the model has loaded.
parser.add_argument("--mode", type=str, required=True, choices=("self_attn", "mlp"))
parser.add_argument("--model_id", type=str, required=True)
parser.add_argument("--cuda_id", type=int, required=True)
args = parser.parse_args()
print(args)

model_id = args.model_id
device = f"cuda:{args.cuda_id}"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16
)
print(model)

num_layers = model.config.num_hidden_layers
# Fail before spending time on generation if the index cannot address a layer.
# Negative values other than -1 still index from the end, as with any list.
if not -num_layers <= args.bit_layer_idx < num_layers:
    parser.error(
        f"--bit_layer_idx must be in [-{num_layers}, {num_layers - 1}], "
        f"got {args.bit_layer_idx}"
    )

tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sample text from the model before quantization, as a baseline for comparison.
decoded = _generate_sample(model, tokenizer, inputs)
print(decoded)

# Per-layer bit widths: every layer starts at `low_bit`, and the layer picked
# on the command line (if any) is lowered to 2 bits.
low_bit = 4
bit_layers = [low_bit] * num_layers
if args.bit_layer_idx != -1:
    bit_layers[args.bit_layer_idx] = 2
print('bit_layers: ', bit_layers)

# Build the module-name lists for the selected mode; the other list stays empty.
layer_to_quant = list(range(num_layers))
if "self_attn" == args.mode:
    mlp_quant = []
    self_attn_quant = [f'layers.{item}.self_attn' for item in layer_to_quant]
elif "mlp" == args.mode:
    mlp_quant = [f'layers.{item}.mlp' for item in layer_to_quant]
    self_attn_quant = []

print(f'quanting ... ')
# The quantizer is chosen by a substring match on the model id.
if "qwen" in model_id.lower():
    model_lsaq = quantize_qwen_like(model, mlp_quant, self_attn_quant, low_bit=low_bit, bit_layers=bit_layers)
else:
    model_lsaq = quantize_llama_like(model, mlp_quant, self_attn_quant, low_bit=low_bit, bit_layers=bit_layers)
print(f'quanted')
print(model_lsaq)

# Sample again from the quantized model.
# NOTE(review): the original sampled from `model` here, although `model_lsaq`
# is what gets printed and saved. If the quantizer modifies the model in place
# and returns it, both names are the same object and nothing changes — confirm.
decoded = _generate_sample(model_lsaq, tokenizer, inputs)
print(decoded)

# Persist the quantized model together with its tokenizer.
model_lsaq.save_pretrained(args.save_dir)
tokenizer.save_pretrained(args.save_dir)
|