{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Layer-importance-guided quantization\n",
    "\n",
    "1. Pick the GPU with the most free memory and load a causal LM.\n",
    "2. Score every decoder layer by the Jaccard similarity between the top-k vocabulary projections of its input and output hidden state (last token of each window), accumulated over WikiText-2 test samples.\n",
    "3. Quantize the layers with the highest accumulated similarity using `lsaq_quant.quantize_llama_like`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import os\n",
    "from functools import partial\n",
    "\n",
    "import GPUtil\n",
    "import numpy as np\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import tqdm\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Resource Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the GPU that currently reports the most free memory.\n",
    "gpus = GPUtil.getGPUs()\n",
    "if not gpus:\n",
    "    raise RuntimeError('GPUtil reported no GPUs; this notebook needs at least one CUDA device.')\n",
    "\n",
    "free_memory = [gpu.memoryFree for gpu in gpus]\n",
    "\n",
    "# Indices of `gpus`, ordered from least to most free memory.\n",
    "memory_sort = sorted(range(len(free_memory)), key=lambda i: free_memory[i])\n",
    "\n",
    "gpu_id = memory_sort[-1]\n",
    "gpu_memory = free_memory[gpu_id]\n",
    "\n",
    "print(f'gpu_id:{gpu_id}; gpu_memory:{gpu_memory}')\n",
    "\n",
    "# Restrict this process to the selected GPU. These variables are set before any\n",
    "# model is moved to a device in this notebook.\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = str(gpu_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"/data/LLMs/Llama-2-7b-hf\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16, device_map=\"auto\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Layer Importance Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _token_ids(tok, text):\n",
    "    \"\"\"Return the ids of `text` with `tok.bos_id` prepended.\"\"\"\n",
    "    # NOTE(review): if `tok.encode` adds special tokens on its own, the result\n",
    "    # starts with two BOS ids -- confirm that this is intended.\n",
    "    return [tok.bos_id] + tok.encode(text)\n",
    "\n",
    "\n",
    "def _pad_or_truncate(tok, input_ids, padding=True, truncation=True, max_length=None):\n",
    "    \"\"\"Fit one id list to `max_length` and build its attention mask.\n",
    "\n",
    "    With `max_length=None` the ids are returned unchanged. Otherwise the list\n",
    "    is cut to its first `max_length` ids when `truncation` is set, and\n",
    "    left-padded with `tok.eos_id` (mask value 0) when `padding` is set.\n",
    "    \"\"\"\n",
    "    if max_length is not None and truncation and 0 <= max_length < len(input_ids):\n",
    "        input_ids = input_ids[:max_length]\n",
    "\n",
    "    # Every real token is attended to.\n",
    "    attention_mask = [1] * len(input_ids)\n",
    "\n",
    "    if padding and max_length is not None:\n",
    "        padding_length = max_length - len(input_ids)\n",
    "        if padding_length > 0:\n",
    "            # Left padding: pad ids and zero mask entries go in front.\n",
    "            attention_mask = [0] * padding_length + attention_mask\n",
    "            input_ids = [tok.eos_id] * padding_length + input_ids\n",
    "\n",
    "    return {\n",
    "        'input_ids': input_ids,\n",
    "        'attention_mask': attention_mask\n",
    "    }\n",
    "\n",
    "\n",
    "def encode(tok, text, padding=True, truncation=True, max_length=None):\n",
    "    \"\"\"Tokenize one text into `input_ids` and `attention_mask` lists.\n",
    "\n",
    "    Args:\n",
    "        tok: tokenizer exposing `encode`, `bos_id` and `eos_id`.\n",
    "        text: the string to tokenize.\n",
    "        padding: left-pad with `tok.eos_id` up to `max_length`.\n",
    "        truncation: keep only the first `max_length` ids of longer inputs.\n",
    "        max_length: target length; `None` disables padding and truncation.\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys 'input_ids' and 'attention_mask' (lists of int).\n",
    "    \"\"\"\n",
    "    return _pad_or_truncate(tok, _token_ids(tok, text), padding, truncation, max_length)\n",
    "\n",
    "\n",
    "def batch_encode_plus(tok, texts, max_length=None, return_tensors=None):\n",
    "    \"\"\"Tokenize `texts` into equally long, left-padded tensors.\n",
    "\n",
    "    Args:\n",
    "        tok: tokenizer exposing `encode`, `bos_id` and `eos_id`.\n",
    "        texts: iterable of strings.\n",
    "        max_length: common length; defaults to the longest tokenized text.\n",
    "        return_tensors: accepted for call compatibility and ignored; the\n",
    "            result always holds torch tensors.\n",
    "\n",
    "    Returns:\n",
    "        dict with the integer tensors 'input_ids' and 'attention_mask'.\n",
    "    \"\"\"\n",
    "    # Tokenize every text exactly once.\n",
    "    token_lists = [_token_ids(tok, text) for text in texts]\n",
    "\n",
    "    if max_length is None:\n",
    "        max_length = max((len(ids) for ids in token_lists), default=0)\n",
    "\n",
    "    encoded_inputs = [_pad_or_truncate(tok, ids, max_length=max_length) for ids in token_lists]\n",
    "\n",
    "    # Merge the per-text results into one tensor per key.\n",
    "    return {\n",
    "        'input_ids': torch.tensor([item['input_ids'] for item in encoded_inputs], dtype=torch.long),\n",
    "        'attention_mask': torch.tensor([item['attention_mask'] for item in encoded_inputs], dtype=torch.long)\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The encoding helpers read `bos_id` / `eos_id`, so expose them on the tokenizer.\n",
    "# BOS is aliased to EOS first, so both attributes are derived from the EOS token.\n",
    "tokenizer.bos_token = tokenizer.eos_token\n",
    "tokenizer.bos_id = tokenizer.bos_token_id\n",
    "tokenizer.eos_id = tokenizer.eos_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_SEQ_LEN = 1024   # tokens per sliding window\n",
    "STRIDE = 256         # shift between consecutive windows\n",
    "TOP_K = 20           # vocabulary entries compared per hidden state\n",
    "batch_size = 1\n",
    "dataset_size = 200   # number of dataset rows to score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def jaccard_set(list1, list2):\n",
    "    \"\"\"Return the Jaccard similarity |A & B| / |A | B| of two collections.\n",
    "\n",
    "    Both arguments are reduced to sets first, so duplicates cannot inflate the\n",
    "    union. Two empty collections are treated as identical and score 1.0.\n",
    "    \"\"\"\n",
    "    set1, set2 = set(list1), set(list2)\n",
    "    union = set1 | set2\n",
    "    if not union:\n",
    "        return 1.0\n",
    "    return len(set1 & set2) / len(union)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def top_k_tokens(tok, hidden, embedding, k):\n",
    "    \"\"\"Decode the k best-matching vocabulary entries for each row of `hidden`.\n",
    "\n",
    "    Each row of `hidden` is scored against every row of `embedding` with a dot\n",
    "    product; the ids of the k highest scores are decoded with `tok.decode`.\n",
    "\n",
    "    Returns:\n",
    "        One list of k decoded strings per row of `hidden`.\n",
    "    \"\"\"\n",
    "    scores = hidden.to(embedding.device) @ embedding.T\n",
    "    top_ids = torch.topk(scores, k, dim=-1).indices.tolist()\n",
    "    return [[tok.decode(token_id) for token_id in row] for row in top_ids]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_layers = len(model.model.layers)\n",
    "\n",
    "# Reset here so that re-running this cell does not add to an earlier run.\n",
    "# Entry i is the summed Jaccard similarity between the top-k tokens of layer\n",
    "# i's input and output; a high value means the layer barely changes them.\n",
    "importances = [0.0] * num_layers  # layer-wise importance scores\n",
    "\n",
    "embedding = model.get_input_embeddings().weight.detach()\n",
    "texts = dataset['text']\n",
    "\n",
    "for batch_start in tqdm.tqdm(range(0, dataset_size, batch_size)):\n",
    "    prompts = texts[batch_start:batch_start + batch_size]\n",
    "\n",
    "    prompt_tokens = batch_encode_plus(\n",
    "        tokenizer,\n",
    "        prompts,\n",
    "        return_tensors='pt'\n",
    "    )\n",
    "    input_ids = prompt_tokens['input_ids']\n",
    "    attn_mask = prompt_tokens['attention_mask']\n",
    "    max_prompt_len = input_ids.shape[-1]\n",
    "    all_jac_sim = [0.0] * num_layers\n",
    "\n",
    "    # authors use a sliding window of size 1024 with a shift of 256\n",
    "    for start in range(0, max_prompt_len, STRIDE):\n",
    "        # Rows that have more than `start` real tokens.\n",
    "        seq_ids = (attn_mask.sum(dim=-1) > start).nonzero(as_tuple=True)[0]\n",
    "        inputs = input_ids[seq_ids, start:start + MAX_SEQ_LEN]\n",
    "        attn = attn_mask[seq_ids, start:start + MAX_SEQ_LEN]\n",
    "\n",
    "        # Inference only: skip autograd bookkeeping for the forward pass.\n",
    "        with torch.no_grad():\n",
    "            outputs = model(\n",
    "                input_ids=inputs.to(embedding.device),\n",
    "                attention_mask=attn.to(embedding.device),\n",
    "                output_hidden_states=True,\n",
    "            )\n",
    "            hiddens = outputs.hidden_states\n",
    "\n",
    "            # Project the last position of every hidden state once; entry j is\n",
    "            # both the output of layer j-1 and the input of layer j.\n",
    "            layer_tokens = [top_k_tokens(tokenizer, hidden[:, -1, :], embedding, TOP_K) for hidden in hiddens]\n",
    "\n",
    "        for layer in range(len(hiddens) - 1):\n",
    "            # Sum over every row of the window batch (one row when batch_size is 1).\n",
    "            for in_topks, ot_topks in zip(layer_tokens[layer], layer_tokens[layer + 1]):\n",
    "                all_jac_sim[layer] += jaccard_set(in_topks, ot_topks)\n",
    "\n",
    "    importances = [x + y for x, y in zip(importances, all_jac_sim)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize(lst, range_min=0, range_max=1):\n",
    "    \"\"\"Min-max rescale `lst` into the interval [range_min, range_max].\n",
    "\n",
    "    An empty list yields an empty list. When all values are equal there is no\n",
    "    spread to rescale, so every entry maps to `range_min`.\n",
    "    \"\"\"\n",
    "    if not lst:\n",
    "        return []\n",
    "    min_val = min(lst)\n",
    "    max_val = max(lst)\n",
    "    if max_val == min_val:\n",
    "        return [range_min for _ in lst]\n",
    "    return [(range_max - range_min) * (x - min_val) / (max_val - min_val) + range_min for x in lst]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace infinite scores by 0 before rescaling.\n",
    "filtered_values = [0 if math.isinf(value) else value for value in importances]\n",
    "normalized_lst = normalize(filtered_values)\n",
    "\n",
    "# Layer indices ordered from the highest to the lowest accumulated similarity;\n",
    "# the quantization cell takes its leading entries.\n",
    "sorted_indices = sorted(range(len(normalized_lst)), key=lambda i: normalized_lst[i])\n",
    "reversed_list = list(reversed(sorted_indices))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quantize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project-local module, imported next to its only use.\n",
    "from lsaq_quant import quantize_llama_like\n",
    "\n",
    "num_of_layer2quant = 8\n",
    "bit = 8\n",
    "\n",
    "# The layers whose input and output top-k tokens overlap the most.\n",
    "layer_to_quant = reversed_list[0:num_of_layer2quant]\n",
    "\n",
    "mlp_quant = [f'layers.{item}.mlp' for item in layer_to_quant]\n",
    "self_attn_quant = [f'layers.{item}.self_attn' for item in layer_to_quant]\n",
    "\n",
    "print('quanting ... ')\n",
    "model_lsaq = quantize_llama_like(model, mlp_quant, self_attn_quant, bit)\n",
    "print('quanted')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smoothquant",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}