File size: 4,901 Bytes
55c92b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
from alphalora.expert_number import calculate_expert
import json
import torch
from lsaq_quant import quantize_llama_like, quantize_qwen_like
import json
import numpy as np
import argparse


import json
import heapq
def get_top_k_indices(json_file_path, k, reverse=True):
    """Return the indices of the ``k`` top-ranked elements of a JSON list.

    The file must contain a single JSON list. Its elements are ranked by
    value and the positions of the first ``k`` elements of that ranking are
    returned. Elements that compare equal keep their original relative
    order, because the underlying sort is stable.

    Args:
        json_file_path: Path of the JSON file to read.
        k: Number of indices to return; an integer with
            ``1 <= k <= len(list)``.
        reverse: When true (the default) rank in descending order, so the
            indices of the ``k`` largest elements are returned. When false
            rank in ascending order, giving the ``k`` smallest elements.

    Returns:
        list: Indices into the JSON list, ordered by rank.

    Raises:
        FileNotFoundError: If ``json_file_path`` does not exist.
        ValueError: If the file is not valid JSON, its content is not a
            list, or ``k`` is not an integer within the valid range.
    """
    # Local import: only needed to validate ``k``.
    import operator

    # Read the JSON file; chain the original error so the root cause stays
    # visible in the traceback.
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"文件 {json_file_path} 不存在") from err
    except json.JSONDecodeError as err:
        raise ValueError("JSON文件格式错误") from err

    # The ranking below only makes sense for a list.
    if not isinstance(data, list):
        raise ValueError("JSON文件内容不是列表")

    # ``k`` is later used as a slice bound, so it must be integer-like.
    # Reject e.g. floats here with the documented ValueError instead of
    # letting the slice raise TypeError.
    try:
        k = operator.index(k)
    except TypeError as err:
        raise ValueError(f"k值无效,应在1到{len(data)}之间") from err
    if k <= 0 or k > len(data):
        raise ValueError(f"k值无效,应在1到{len(data)}之间")

    # Sort the indices by the value they point at. ``sorted`` is stable even
    # with ``reverse=True``, so ties stay in ascending index order.
    ranked_indices = sorted(range(len(data)), key=data.__getitem__, reverse=reverse)

    return ranked_indices[:k]

# ---------------------------------------------------------------------------
# Script body: load a causal LM, print a sample generation, hand the model to
# the LSAQ quantizer with a per-layer bit list, print another sample, and save
# the returned model together with the tokenizer.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description="parser")
parser.add_argument(
    "--bit_layer_idx",
    type=int,
    required=True,
    help="index of the single layer whose entry in the bit list is set to 2; "
         "-1 leaves every entry at the default of 4",
)
parser.add_argument("--save_dir", type=str, required=True)
# Only these two modes are handled below. Rejecting anything else at parse
# time avoids a NameError that would otherwise surface only after the model
# has been loaded and a full generation has run.
parser.add_argument("--mode", type=str, required=True, choices=("self_attn", "mlp"))
parser.add_argument("--model_id", type=str, required=True)
parser.add_argument("--cuda_id", type=int, required=True)

args = parser.parse_args()
print(args)
model_id = args.model_id
device = f"cuda:{args.cuda_id}"

# Sampling settings shared by the pre- and post-quantization generations.
SAMPLING_KWARGS = dict(
    max_new_tokens=100,
    do_sample=True,        # random sampling (not greedy)
    top_k=50,              # limit the sampling candidates
    top_p=0.95,            # nucleus sampling
    temperature=0.7,       # controls the diversity of the generated text
)


def _print_sample(lm, tok, encoded_prompt):
    """Generate a continuation of the encoded prompt with ``lm`` and print it."""
    with torch.no_grad():
        generated = lm.generate(**encoded_prompt, **SAMPLING_KWARGS)
    # Decode the first (only) returned sequence to a string.
    print(tok.decode(generated[0], skip_special_tokens=True))


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    torch_dtype=torch.float16,
)
print(model)
tokenizer = AutoTokenizer.from_pretrained(model_id)

num_layers = model.config.num_hidden_layers

# Validate the layer index as soon as the layer count is known, so a typo
# fails with a clear message before any generation or quantization work.
# -1 is the "no override" sentinel; other negative values keep their
# previous meaning of indexing from the end of the list.
if args.bit_layer_idx != -1 and not -num_layers <= args.bit_layer_idx < num_layers:
    parser.error(
        f"--bit_layer_idx {args.bit_layer_idx} is out of range for a model "
        f"with {num_layers} layers"
    )

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sample text from the model before quantization.
_print_sample(model, tokenizer, inputs)

# One bit-width entry per hidden layer, all starting at ``low_bit``.
low_bit = 4
bit_layers = [low_bit] * num_layers

# Alternative from earlier experiments: load the per-layer list from a file.
# with open(args.bit_layers, 'r', encoding='utf-8') as f:
#     bit_layers = json.load(f)
# assert len(bit_layers) == num_layers

# Set the selected layer's entry to 2 bits.
if args.bit_layer_idx != -1:
    bit_layers[args.bit_layer_idx] = 2

# Earlier experiment: set several layers to 2 bits at once.
# [28, 26, 20, 16, 24]
# for index in list(range(24,29)):
#     bit_layers[index] = 2

print('bit_layers: ', bit_layers)

layer_to_quant = list(range(num_layers))

# Build the module-name lists handed to the quantizer; exactly one of the two
# lists is populated, depending on --mode.
if args.mode == "self_attn":
    mlp_quant = []
    self_attn_quant = [f'layers.{item}.self_attn' for item in layer_to_quant]
else:  # "mlp" -- the argparse ``choices`` above allow nothing else
    mlp_quant = [f'layers.{item}.mlp' for item in layer_to_quant]
    self_attn_quant = []

print(f'quanting ... ')
# Model ids containing "qwen" (case-insensitive) use the Qwen-specific
# quantizer; everything else goes through the Llama-style one.
if "qwen" in model_id.lower():
    quantize = quantize_qwen_like
else:
    quantize = quantize_llama_like
model_lsaq = quantize(model, mlp_quant, self_attn_quant, low_bit=low_bit, bit_layers=bit_layers)
print(f'quanted')
print(model_lsaq)

# Sample from the object that is about to be saved, so the printed text
# reflects the exported model even if the quantizer returns a new object
# rather than modifying ``model`` in place.
_print_sample(model_lsaq, tokenizer, inputs)

# Earlier experiment: compute and dump alpha_values.
# all_layer_alpha = calculate_expert(model)
# with open(f"alpha_values_llama-2-7b-qint{bit}.json", "w") as f:
#     json.dump(all_layer_alpha, f, indent=4)

# save_dir = f"../models/Llama-2-7b-hf-Kurtosis_only"
model_lsaq.save_pretrained(args.save_dir)
tokenizer.save_pretrained(args.save_dir)