"""Quantize a causal LM with the lsaq_quant helpers and save the result.

Pipeline (see ``main``): load the model in float16 on one CUDA device, print
a sample generation, build a per-layer bit-width list (4 bits everywhere,
optionally 2 bits for a single layer), quantize either the attention or the
MLP sub-modules, print a second sample generation, and save the model and
tokenizer to ``--save_dir``.

``get_top_k_indices`` is a standalone utility; the pipeline does not call it.
"""
import argparse
import heapq
import json

import numpy as np
import torch

# Default bit-width handed to the quantizer for every layer.
LOW_BIT = 4
# Bit-width assigned to the single layer picked with --bit_layer_idx.
REDUCED_BIT = 2
# Sentinel for --bit_layer_idx meaning "do not lower any layer".
NO_LAYER = -1
# Sub-module families that --mode may select.
MODES = ("self_attn", "mlp")
# Fixed prompt used for the before/after sample generations.
PROMPT = "Once upon a time"


def get_top_k_indices(json_file_path, k, reverse=True):
    """Return the indices of the k largest (or smallest) values of a JSON list.

    Args:
        json_file_path: Path to a JSON file whose top-level value is a list.
        k: Number of indices to return; must satisfy 1 <= k <= len(list).
        reverse: When True (the default) rank from largest to smallest value;
            when False rank from smallest to largest.

    Returns:
        list: Indices into the list, ordered by rank. Elements with equal
        values keep their original relative order.

    Raises:
        FileNotFoundError: The file does not exist.
        ValueError: The file is not valid JSON, its content is not a list,
            or k is out of range.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"文件 {json_file_path} 不存在") from err
    except json.JSONDecodeError as err:
        raise ValueError("JSON文件格式错误") from err

    if not isinstance(data, list):
        raise ValueError("JSON文件内容不是列表")
    if k <= 0 or k > len(data):
        raise ValueError(f"k值无效,应在1到{len(data)}之间")

    # Sort the indices by the value they point at; sorted() is stable in both
    # directions, so ties stay in file order.
    return sorted(range(len(data)), key=data.__getitem__, reverse=reverse)[:k]


def build_bit_layers(num_layers, bit_layer_idx, low_bit=LOW_BIT,
                     reduced_bit=REDUCED_BIT):
    """Build the per-layer bit-width list passed to the quantizer.

    Args:
        num_layers: Number of entries in the returned list.
        bit_layer_idx: Index of the one layer to set to ``reduced_bit``, or
            -1 to leave every layer at ``low_bit``.
        low_bit: Bit-width used for all other layers.
        reduced_bit: Bit-width used for the selected layer.

    Returns:
        list: ``num_layers`` bit-widths.

    Raises:
        ValueError: ``bit_layer_idx`` is neither -1 nor a valid layer index.
            Other negative values are rejected on purpose: plain list indexing
            would silently wrap them to layers counted from the end, which
            clashes with -1 meaning "no layer".
    """
    bit_layers = [low_bit] * num_layers
    if bit_layer_idx == NO_LAYER:
        return bit_layers
    if not 0 <= bit_layer_idx < num_layers:
        raise ValueError(
            f"--bit_layer_idx must be -1 or in [0, {num_layers - 1}], "
            f"got {bit_layer_idx}"
        )
    bit_layers[bit_layer_idx] = reduced_bit
    return bit_layers


def select_quant_targets(mode, num_layers):
    """Return ``(mlp_quant, self_attn_quant)`` module-name lists for ``mode``.

    Exactly one of the two lists is populated (one name per layer); the other
    is empty.

    Raises:
        ValueError: ``mode`` is not one of ``MODES``.
    """
    if mode == "self_attn":
        return [], [f'layers.{item}.self_attn' for item in range(num_layers)]
    if mode == "mlp":
        return [f'layers.{item}.mlp' for item in range(num_layers)], []
    raise ValueError(f"--mode must be one of {MODES}, got {mode!r}")


def generate_sample(model, tokenizer, inputs):
    """Sample up to 100 new tokens from ``model`` and return the decoded text."""
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,    # random sampling rather than greedy decoding
            top_k=50,          # restrict sampling to the 50 best candidates
            top_p=0.95,        # nucleus sampling
            temperature=0.7,   # controls diversity of the generation
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def build_parser():
    """Create the command-line parser for the quantization script."""
    parser = argparse.ArgumentParser(description="parser")
    parser.add_argument("--bit_layer_idx", type=int, required=True)
    parser.add_argument("--save_dir", type=str, required=True)
    # Restricting the choices turns a late NameError into an immediate,
    # readable usage error.
    parser.add_argument("--mode", type=str, required=True, choices=MODES)
    parser.add_argument("--model_id", type=str, required=True)
    parser.add_argument("--cuda_id", type=int, required=True)
    return parser


def main():
    """Run the load -> sample -> quantize -> sample -> save pipeline."""
    args = build_parser().parse_args()
    print(args)

    # Imported after argument parsing so that --help and usage errors do not
    # depend on (or wait for) the model stack, and so the helpers above can be
    # imported without it. QuantoConfig and calculate_expert are not used
    # below; they are kept because the original script imported them.
    from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig  # noqa: F401
    from alphalora.expert_number import calculate_expert  # noqa: F401
    from lsaq_quant import quantize_llama_like, quantize_qwen_like

    device = f"cuda:{args.cuda_id}"
    model_id = args.model_id
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device,
        torch_dtype=torch.float16
    )
    print(model)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    inputs = tokenizer(PROMPT, return_tensors="pt").to(device)

    # Baseline sample before any quantization.
    print(generate_sample(model, tokenizer, inputs))

    num_layers = model.config.num_hidden_layers
    bit_layers = build_bit_layers(num_layers, args.bit_layer_idx)
    print('bit_layers: ', bit_layers)

    mlp_quant, self_attn_quant = select_quant_targets(args.mode, num_layers)

    print('quanting ... ')
    # Model ids containing "qwen" (case-insensitive) use the Qwen quantizer;
    # everything else goes through the LLaMA-style one.
    quantize = quantize_qwen_like if "qwen" in model_id.lower() else quantize_llama_like
    model_lsaq = quantize(model, mlp_quant, self_attn_quant,
                          low_bit=LOW_BIT, bit_layers=bit_layers)
    print('quanted')
    print(model_lsaq)

    # Sample from the object that is about to be saved. NOTE(review): if the
    # quantizer modifies `model` in place and returns it, this is the same
    # object as before; if it returns a new model, sampling from `model` here
    # would have exercised the wrong one.
    print(generate_sample(model_lsaq, tokenizer, inputs))

    model_lsaq.save_pretrained(args.save_dir)
    tokenizer.save_pretrained(args.save_dir)


if __name__ == "__main__":
    main()