| from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig |
| from alphalora.expert_number import calculate_expert |
| import json |
| import torch |
| from lsaq_quant import quantize_llama_like, quantize_qwen_like |
| import json |
| import numpy as np |
| import argparse |
|
|
|
|
| import json |
| import heapq |
def get_top_k_indices(json_file_path, k, reverse=True):
    """
    Return the indices of the k largest (or smallest) values in a JSON list.

    The file at ``json_file_path`` is parsed as JSON and must contain a list
    at the top level. The values are ranked and the positions of the selected
    values are returned.

    Args:
        json_file_path: Path to a UTF-8 encoded JSON file holding a list.
        k: Number of indices to return. Must be an integer (anything that
            supports ``__index__``) with ``1 <= k <= len(list)``.
        reverse: If true (the default), select the k largest values and order
            the result from largest to smallest. If false, select the k
            smallest values and order the result from smallest to largest.

    Returns:
        list: Indices into the JSON list, ordered by value as described
        above. Equal values keep their original relative order.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not valid JSON, its content is not a list,
            or k is not an integer in the valid range.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import operator

    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"文件 {json_file_path} 不存在") from err
    except json.JSONDecodeError as err:
        raise ValueError("JSON文件格式错误") from err

    if not isinstance(data, list):
        raise ValueError("JSON文件内容不是列表")

    # Reject non-integer k (floats, strings, ...) with the documented
    # ValueError instead of letting a TypeError escape from the comparison
    # or the selection below. operator.index still accepts int-like types.
    try:
        k = operator.index(k)
    except TypeError as err:
        raise ValueError(f"k值无效,应在1到{len(data)}之间") from err

    if k <= 0 or k > len(data):
        raise ValueError(f"k值无效,应在1到{len(data)}之间")

    # Partial selection over the indices, keyed by the value at each index.
    # nlargest/nsmallest are documented as equivalent to
    # sorted(..., key=..., reverse=...)[:k], so tie order is unchanged.
    select = heapq.nlargest if reverse else heapq.nsmallest
    return select(k, range(len(data)), key=data.__getitem__)
|
|
# ---------------------------------------------------------------------------
# Script: load a causal LM, print a sample generation, quantize either the
# self-attention or the MLP sub-modules of every decoder layer, print a second
# sample, and save the quantized model together with its tokenizer.
# ---------------------------------------------------------------------------

parser = argparse.ArgumentParser(description="parser")
# Index of the one layer that is quantized to 2 bits; -1 leaves every layer
# at the default bit width.
parser.add_argument("--bit_layer_idx", type=int, required=True)
parser.add_argument("--save_dir", type=str, required=True)
# Only these two modes are handled below. Validating here fails fast instead
# of hitting a NameError after the model has been loaded.
parser.add_argument("--mode", type=str, required=True, choices=("self_attn", "mlp"))
parser.add_argument("--model_id", type=str, required=True)
parser.add_argument("--cuda_id", type=int, required=True)


args = parser.parse_args()
print(args)
model_id = args.model_id
device = f"cuda:{args.cuda_id}"


def _generate_sample(lm, tok, encoded):
    """Sample up to 100 new tokens from ``lm`` for ``encoded`` and return the decoded text."""
    with torch.no_grad():
        generated = lm.generate(
            **encoded,
            max_new_tokens=100,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )
    return tok.decode(generated[0], skip_special_tokens=True)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16,
)
print(model)
tokenizer = AutoTokenizer.from_pretrained(model_id)


prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sample from the unquantized model as a reference output.
decoded = _generate_sample(model, tokenizer, inputs)
print(decoded)


# Per-layer bit widths: every layer starts at `low_bit`; at most one layer is
# lowered to 2 bits, selected by --bit_layer_idx.
low_bit = 4
num_layers = model.config.num_hidden_layers
bit_layers = [low_bit] * num_layers

if args.bit_layer_idx != -1:
    # Every index that list indexing accepted before is still accepted;
    # anything else now exits with a usage error instead of an IndexError.
    if not -num_layers <= args.bit_layer_idx < num_layers:
        parser.error(
            f"--bit_layer_idx must be -1 or a valid layer index for a model "
            f"with {num_layers} layers, got {args.bit_layer_idx}"
        )
    bit_layers[args.bit_layer_idx] = 2

print('bit_layers: ', bit_layers)


layer_to_quant = list(range(num_layers))

# Exactly one of the two module-name lists is populated, depending on --mode.
if args.mode == "self_attn":
    mlp_quant = []
    self_attn_quant = [f'layers.{item}.self_attn' for item in layer_to_quant]
else:  # args.mode == "mlp" (guaranteed by the argparse choices above)
    mlp_quant = [f'layers.{item}.mlp' for item in layer_to_quant]
    self_attn_quant = []


print('quanting ... ')
# Qwen checkpoints go through the Qwen-specific quantizer; everything else
# is treated as LLaMA-like.
if "qwen" in model_id.lower():
    model_lsaq = quantize_qwen_like(model, mlp_quant, self_attn_quant, low_bit=low_bit, bit_layers=bit_layers)
else:
    model_lsaq = quantize_llama_like(model, mlp_quant, self_attn_quant, low_bit=low_bit, bit_layers=bit_layers)
print('quanted')
print(model_lsaq)


# Sample from the object returned by the quantizer, i.e. the one saved below,
# so the printed text reflects the saved model.
# NOTE(review): whether the quantizers also modify `model` in place is not
# visible from this file - confirm against lsaq_quant.
decoded = _generate_sample(model_lsaq, tokenizer, inputs)
print(decoded)


model_lsaq.save_pretrained(args.save_dir)
tokenizer.save_pretrained(args.save_dir)
|
|