{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import GPUtil\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "import tqdm\n",
    "from functools import partial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Resource Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gpus = GPUtil.getGPUs()\n",
    "free_memory = []\n",
    "\n",
    "for gpu in gpus:\n",
    "    free_memory.append(gpu.memoryFree)\n",
    "\n",
    "memory_sort = sorted(range(len(free_memory)), key=lambda i: free_memory[i])\n",
    "\n",
    "gpu_id = memory_sort[-1]\n",
    "gpu_memory = free_memory[memory_sort[-1]]\n",
    "\n",
    "print(f'gpu_id:{gpu_id}; gpu_memory:{gpu_memory}')\n",
    "\n",
    "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = str(gpu_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = \"/data/LLMs/Llama-2-7b-hf\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16, device_map=\"auto\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Layer Importance Detection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def encode(tok, text, padding=True, truncation=True, max_length=None):\n",
    "    # 将文本转换为输入 IDs\n",
    "    input_ids = [tok.bos_id] + tok.encode(text)\n",
    "\n",
    "    # 生成注意力掩码\n",
    "    attention_mask = [1] * len(input_ids)\n",
    "\n",
    "    # 如果进行了填充，则调整注意力掩码\n",
    "    if padding:\n",
    "        padding_length = max_length - len(input_ids)\n",
    "        attention_mask = [0] * padding_length + attention_mask\n",
    "        input_ids = [tok.eos_id] * padding_length + input_ids\n",
    "\n",
    "    encoded_input = {\n",
    "        'input_ids': input_ids,\n",
    "        'attention_mask': attention_mask\n",
    "    }\n",
    "    return encoded_input\n",
    "\n",
    "def batch_encode_plus(tok, texts, max_length=None, return_tensors=None):\n",
    "    encoded_inputs = []\n",
    "\n",
    "    # 循环处理每个文本\n",
    "    if max_length is None:\n",
    "        max_length = -1\n",
    "        for text in texts:\n",
    "            # if isinstance(text, list):\n",
    "            #     text = text[0]\n",
    "            # print(text)\n",
    "            len_ = len([tok.bos_id] + tok.encode(text))\n",
    "            if len_ > max_length:\n",
    "                max_length = len_\n",
    "    for text in texts:\n",
    "        # if isinstance(text, list):\n",
    "        #     text = text[0]\n",
    "        encoded_input = encode(tok, text, max_length = max_length)\n",
    "        encoded_inputs.append(encoded_input)\n",
    "\n",
    "    # 合并结果\n",
    "    batch_encoded = {\n",
    "        'input_ids': [encoded_input['input_ids'] for encoded_input in encoded_inputs],\n",
    "        'attention_mask': [encoded_input['attention_mask'] for encoded_input in encoded_inputs]\n",
    "    }\n",
    "\n",
    "    batch_encoded = {key: torch.tensor(val) for key, val in batch_encoded.items()}\n",
    "\n",
    "    return batch_encoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.bos_token = tokenizer.eos_token\n",
    "tokenizer.bos_id = tokenizer.bos_token_id\n",
    "tokenizer.eos_id = tokenizer.eos_token_id\n",
    "importances = [0 for i in range(len(model.model.layers))]  # layer-wise importance scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_SEQ_LEN = 1024\n",
    "batch_size = 1\n",
    "dataset_size = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def jaccard_set(list1, list2):\n",
    "    \"\"\"Define Jaccard Similarity function for two sets\"\"\"\n",
    "    intersection = len(list(set(list1).intersection(list2)))\n",
    "    union = (len(list1) + len(list2)) - intersection\n",
    "    return float(intersection) / union"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "k = 20\n",
    "\n",
    "for i in tqdm.tqdm(range(0, dataset_size, batch_size), total = dataset_size / batch_size):\n",
    "    \n",
    "    prompts = dataset['text'][i:i + batch_size]\n",
    "    max_seq_len = MAX_SEQ_LEN\n",
    "    stride = 256\n",
    "    max_gen_len = 0\n",
    "\n",
    "\n",
    "    prompt_tokens = batch_encode_plus(\n",
    "        tokenizer,\n",
    "        prompts,\n",
    "        return_tensors='pt'\n",
    "    )\n",
    "    input_ids = prompt_tokens['input_ids']\n",
    "    attn_mask = prompt_tokens['attention_mask']\n",
    "    max_prompt_len = max(len(t) for t in input_ids)\n",
    "    all_jac_sim = [0 for i in range(len(model.model.layers))] \n",
    "    E = model.get_input_embeddings().weight.detach()\n",
    "    \n",
    "    # authors use a sliding window of size 1024 with a shift of 256\n",
    "    for start in range(0, max_prompt_len, stride):\n",
    "        seq_ids = (attn_mask.sum(dim=-1) > start).nonzero().squeeze()\n",
    "        seq_ids = seq_ids.unsqueeze(0) if seq_ids.dim() == 0 else seq_ids  # ensure 2d\n",
    "        inputs = input_ids[seq_ids, start:start+max_seq_len]\n",
    "        attn = attn_mask[seq_ids, start:start+max_seq_len]\n",
    "\n",
    "        if max_gen_len == 0:\n",
    "            outputs = model(\n",
    "                input_ids=inputs.to(\"cuda\"),\n",
    "                attention_mask=attn.to(\"cuda\"),\n",
    "                output_hidden_states=True,\n",
    "            )\n",
    "        else:\n",
    "            outputs = model.generate(\n",
    "                input_ids=inputs.to(\"cuda\"),\n",
    "                attention_mask=attn.to(\"cuda\"),\n",
    "                max_new_tokens=max_gen_len, \n",
    "                output_hidden_states=True,\n",
    "                return_dict_in_generate=True,\n",
    "            )\n",
    "\n",
    "        hiddens = outputs.hidden_states\n",
    "\n",
    "        for i in range(len(hiddens) - 1):\n",
    "            in_hidden = hiddens[i][:,-1,:]\n",
    "            out_hidden = hiddens[i+1][:,-1,:]\n",
    "\n",
    "            in_projs = in_hidden @ E.T\n",
    "            out_projs = out_hidden @ E.T\n",
    "\n",
    "            in_projs = in_projs.detach().cpu().numpy()\n",
    "            ot_projs = out_projs.detach().cpu().numpy()\n",
    "\n",
    "            in_ind = np.argsort(-in_projs)\n",
    "            ot_ind = np.argsort(-ot_projs)\n",
    "\n",
    "            in_topks = [tokenizer.decode(i) for i in in_ind[0][:k]]\n",
    "            ot_topks = [tokenizer.decode(i) for i in ot_ind[0][:k]]\n",
    "\n",
    "            all_jac_sim[i] += jaccard_set(in_topks, ot_topks)\n",
    "\n",
    "      \n",
    "    importances = [x + y for x, y in zip(importances, all_jac_sim)]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "def normalize(lst, range_min=0, range_max=1):\n",
    "    min_val = min(lst)\n",
    "    max_val = max(lst)\n",
    "    normalized = [(range_max - range_min) * (x - min_val) / (max_val - min_val) + range_min for x in lst]\n",
    "    return normalized\n",
    "\n",
    "filtered_values = [0 if math.isinf(value) else value for value in importances] \n",
    "normalized_lst = normalize(filtered_values)\n",
    "\n",
    "sorted_indices = sorted(range(len(normalized_lst)), key=lambda i: normalized_lst[i])\n",
    "reversed_list = list(reversed(sorted_indices))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quantize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from lsaq_quant import quantize_llama_like\n",
    "\n",
    "num_of_layer2quant = 8\n",
    "bit = 8\n",
    "\n",
    "layer_to_quant = reversed_list[0:num_of_layer2quant]\n",
    "\n",
    "mlp_quant = [f'layers.{item}.mlp' for item in layer_to_quant]\n",
    "self_attn_quant = [f'layers.{item}.self_attn' for item in layer_to_quant]\n",
    "\n",
    "print(f'quanting ... ')\n",
    "model_lsaq = quantize_llama_like(model, mlp_quant, self_attn_quant, bit)\n",
    "print(f'quanted')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "smoothquant",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}