chen459664 committed on Dec 11, 2025

Commit

075eaa3

verified ·

1 Parent(s): 3803ea9

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

llm-awq/awq.egg-info/SOURCES.txt +81 -0
llm-awq/awq.egg-info/dependency_links.txt +1 -0
llm-awq/awq.egg-info/requires.txt +16 -0
llm-awq/awq.egg-info/top_level.txt +3 -0
llm-awq/awq/quantize/qmodule.py +235 -0
llm-awq/awq/quantize/w8a8_linear.py +276 -0
llm-awq/awq/utils/lm_eval_adaptor.py +116 -0
llm-awq/awq/utils/utils.py +51 -0
llm-awq/examples/convert_to_hf.py +69 -0
llm-awq/examples/llava_demo.ipynb +0 -0
llm-awq/figures/vila-logo.jpg +0 -0
llm-awq/scripts/codellama_example.sh +25 -0
llm-awq/scripts/llama2_example.sh +25 -0
llm-awq/scripts/llama3_example.sh +25 -0
llm-awq/scripts/llama_example.sh +25 -0
llm-awq/scripts/opt_example.sh +25 -0
llm-awq/scripts/qwen_example.sh +25 -0
llm-awq/scripts/starcoder_example.sh +25 -0
llm-awq/scripts/vicuna_example.sh +25 -0
llm-awq/tinychat/benchmark.py +379 -0
llm-awq/tinychat/demo.py +283 -0
llm-awq/tinychat/internvl_benchmark.py +167 -0
llm-awq/tinychat/split_ckpt.py +51 -0
llm-awq/tinychat/vila15_demo.py +264 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sot.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yor.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ibo.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_kin.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sna.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sot.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_xho.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yaml +34 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yor.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_zul.yaml +4 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ibo.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lin.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lug.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_orm.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sna.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sot.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_swa.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_twi.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_vai.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_wol.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_xho.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yor.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_amh.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_eng.yaml +7 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ewe.yaml +6 -0
lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_fra.yaml +6 -0

llm-awq/awq.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,81 @@

+LICENSE
+README.md
+pyproject.toml
+awq/entry.py
+awq.egg-info/PKG-INFO
+awq.egg-info/SOURCES.txt
+awq.egg-info/dependency_links.txt
+awq.egg-info/requires.txt
+awq.egg-info/top_level.txt
+awq/kernels/setup.py
+awq/kernels/csrc/attention/setup.py
+awq/quantize/__init__.py
+awq/quantize/auto_clip.py
+awq/quantize/auto_scale.py
+awq/quantize/pre_quant.py
+awq/quantize/qmodule.py
+awq/quantize/quantizer.py
+awq/quantize/smooth.py
+awq/quantize/w8a8_linear.py
+awq/utils/__init__.py
+awq/utils/calib_data.py
+awq/utils/lm_eval_adaptor.py
+awq/utils/module.py
+awq/utils/parallel.py
+awq/utils/utils.py
+tinychat/benchmark.py
+tinychat/demo.py
+tinychat/internvl_benchmark.py
+tinychat/internvl_demo.py
+tinychat/nvila_benchmark.py
+tinychat/nvila_demo.py
+tinychat/offline-weight-repacker.py
+tinychat/split_ckpt.py
+tinychat/vila10_demo.py
+tinychat/vila15_demo.py
+tinychat/models/__init__.py
+tinychat/models/falcon.py
+tinychat/models/internvl3.py
+tinychat/models/llama.py
+tinychat/models/llava_llama.py
+tinychat/models/mpt.py
+tinychat/models/nvila_qwen2.py
+tinychat/models/qwen2.py
+tinychat/models/vila_llama.py
+tinychat/models/internvl/configuration_internvl.py
+tinychat/models/internvl/conversation.py
+tinychat/models/internvl/internvit.py
+tinychat/models/internvl/media.py
+tinychat/models/llava_base/llava_arch.py
+tinychat/models/llava_base/multimodal_encoder/builder.py
+tinychat/models/llava_base/multimodal_encoder/clip_encoder.py
+tinychat/models/llava_base/multimodal_projector/builder.py
+tinychat/models/nvila/builder.py
+tinychat/models/nvila/configuration_llava.py
+tinychat/models/nvila/llava_arch.py
+tinychat/modules/__init__.py
+tinychat/modules/fused_attn.py
+tinychat/modules/fused_internencoder.py
+tinychat/modules/fused_mlp.py
+tinychat/modules/fused_norm.py
+tinychat/modules/fused_siglipdecoder.py
+tinychat/modules/fused_vision_attn.py
+tinychat/serve/controller.py
+tinychat/serve/gradio_web_server.py
+tinychat/serve/llava_conv.py
+tinychat/serve/model_worker.py
+tinychat/serve/model_worker_new.py
+tinychat/stream_generators/NVILA_stream_gen.py
+tinychat/stream_generators/__init__.py
+tinychat/stream_generators/internvl_stream_gen.py
+tinychat/stream_generators/llava_stream_gen.py
+tinychat/stream_generators/stream_gen.py
+tinychat/utils/__init__.py
+tinychat/utils/constants.py
+tinychat/utils/conversation_utils.py
+tinychat/utils/input_metadata.py
+tinychat/utils/llava_image_processing.py
+tinychat/utils/load_quant.py
+tinychat/utils/log_utils.py
+tinychat/utils/prompt_templates.py
+tinychat/utils/tune.py

llm-awq/awq.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

llm-awq/awq.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+accelerate==0.34.2
+sentencepiece
+tokenizers>=0.12.1
+torch==2.3.0
+torchvision==0.18.0
+transformers==4.46.0
+lm_eval==0.3.0
+texttable
+toml
+attributedict
+protobuf
+gradio==3.35.2
+gradio_client==0.2.9
+fastapi
+uvicorn
+pydantic==1.10.19

llm-awq/awq.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+awq
+figures
+tinychat

llm-awq/awq/quantize/qmodule.py ADDED Viewed

	@@ -0,0 +1,235 @@

+import math
+import torch
+import torch.nn as nn
+import awq_inference_engine  # with CUDA kernels
+def make_divisible(c, divisor):
+    """Return how many chunks of size ``divisor`` are needed to hold ``c``.
+    For positive integers this is the ceiling of ``c / divisor``.
+    """
+    padded = c + divisor - 1
+    return padded // divisor
+def calculate_zeros_width(in_features, group_size=128, pack_num=8):
+    """Return the padded width used for the per-group scale / zero-point tables.
+    The number of groups (``in_features // group_size``) is first rounded up to
+    whole packs of ``pack_num`` and then rounded up to a multiple of a
+    group-size dependent factor (1 for ``group_size >= 128``, 2 for 64, 4 for 32).
+    Raises:
+        NotImplementedError: for any other ``group_size`` below 128.
+    """
+    if group_size >= 128:
+        size_multiplier = 1
+    else:
+        small_group_multipliers = {64: 2, 32: 4}
+        if group_size not in small_group_multipliers:
+            raise NotImplementedError
+        size_multiplier = small_group_multipliers[group_size]
+    num_groups = in_features // group_size
+    packed_width = make_divisible(num_groups, pack_num)
+    return make_divisible(packed_width, size_multiplier) * size_multiplier
+def pack_intweight(unpacked_qweight, interleave, kstride):
+    """Reorder an integer weight matrix and pack it into an int16 tensor.
+    The matrix is moved to the CPU, permuted with NumPy, packed, and the
+    result is moved back to the device of ``unpacked_qweight``.
+    Args:
+        unpacked_qweight: 2-D tensor of shape [N, K]. The reshapes below
+            require K to be a multiple of 32 and of ``kstride``, and N to be a
+            multiple of ``interleave``.
+            NOTE(review): values are presumably 4-bit (0..15) since they are
+            combined with 4-bit shifts -- confirm against the caller.
+        interleave: number of rows folded together. The packing step indexes
+            lanes 0-3 of the last axis, so this is expected to be 4.
+        kstride: column stride used when interleaving rows.
+    Returns:
+        A contiguous int16 tensor of shape (N // interleave, K).
+    """
+    # unpacked_qweight: [N, K]
+    N = unpacked_qweight.shape[0]
+    K = unpacked_qweight.shape[1]
+    # Work on a CPU NumPy copy, viewed as blocks of 32 consecutive columns.
+    Packed_Kernel = unpacked_qweight.cpu().numpy().reshape(N, K // 32, 32)
+    # np.arange(32).reshape(4, 4, 2).transpose(1, 0, 2) => [0, 1, 8, 9, 16, 17, 24, 25, ...]
+    Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 4, 2).transpose(0, 1, 3, 2, 4)
+    Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 32)
+    # reorder each 8 weights for fast dequantization
+    # [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7]
+    # (the first reshape below is immediately superseded by the second one)
+    Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 8)
+    Packed_Kernel = Packed_Kernel.reshape(N, K // 32, 4, 4, 2).transpose(0, 1, 2, 4, 3)
+    Packed_Kernel = Packed_Kernel.reshape(N, K)
+    # interleaving every four rows
+    Packed_Kernel = Packed_Kernel.reshape(
+        N // interleave, interleave, K // kstride, kstride
+    )
+    # N // 4, K // 64, 4, 64
+    Packed_Kernel = Packed_Kernel.transpose(0, 2, 1, 3)
+    Packed_Kernel = Packed_Kernel.reshape(
+        N // interleave, K // kstride, kstride, interleave
+    )
+    # Packing -> (N // 4, K // 64, 64)
+    # Lanes 0..3 of the last axis are OR-ed together, each shifted by a
+    # further 4 bits.
+    Packed_Kernel = (
+        Packed_Kernel[..., 0]
+        | (Packed_Kernel[..., 1] << 4)
+        | (Packed_Kernel[..., 2] << 8)
+        | (Packed_Kernel[..., 3] << 12)
+    )
+    # reshape to (N // interleave, K); the packed values are stored as int16
+    Packed_Kernel = Packed_Kernel.reshape(N // interleave, K)
+    qweight = (
+        torch.tensor(Packed_Kernel.astype("int16"))
+        .to(unpacked_qweight.device)
+        .contiguous()
+    )
+    return qweight
+class ScaledActivation(nn.Module):
+    """Wrap an activation module and divide its output by stored scales."""
+    def __init__(self, module, scales):
+        """Keep ``module`` as ``self.act`` and ``scales`` as a parameter."""
+        super().__init__()
+        self.act = module
+        self.scales = nn.Parameter(scales.data)
+    def forward(self, x):
+        """Apply the wrapped activation, then divide by the scales.
+        The scales are viewed as ``(1, 1, -1)`` and moved to ``x``'s device so
+        they broadcast over the last dimension of the activation output.
+        """
+        activated = self.act(x)
+        divisor = self.scales.view(1, 1, -1).to(x.device)
+        return activated / divisor
+class WQLinear(nn.Module):
+    """4-bit weight-quantized linear layer run through ``awq_inference_engine``.
+    Registered buffers (shapes as allocated in ``__init__``):
+      * ``qweight``: int16,
+        ``(out_features // interleave, in_features // (16 // w_bit) * interleave)``
+      * ``scales`` / ``scaled_zeros``: ``dtype``,
+        ``(calculate_zeros_width(in_features, group_size) * (32 // w_bit), out_features)``
+      * ``bias``: ``dtype``, ``(out_features,)``; ``self.bias`` is ``None``
+        when the layer has no bias
+    """
+    def __init__(self, w_bit, group_size, in_features, out_features, bias, dev, dtype=torch.float16):
+        """Allocate zero-filled buffers for a quantized layer.
+        Args:
+            w_bit: weight bit-width; only 4 is accepted.
+            group_size: input channels per quantization group; ``-1`` means a
+                single group spanning all ``in_features``.
+            in_features: input dimension of the layer.
+            out_features: output dimension of the layer.
+            bias: whether to allocate a bias buffer.
+            dev: device the buffers are created on.
+            dtype: dtype of ``scales``, ``scaled_zeros`` and ``bias``.
+        Raises:
+            NotImplementedError: if ``w_bit`` is not 4.
+        """
+        super().__init__()
+        if w_bit not in [4]:
+            raise NotImplementedError("Only 4-bit are supported for now.")
+        self.in_features = in_features
+        self.out_features = out_features
+        self.w_bit = w_bit
+        self.group_size = group_size if group_size != -1 else in_features
+        self.split_k_iters = 8
+        self.interleave = 4
+        # quick sanity check (make sure alignment)
+        assert self.in_features % self.group_size == 0
+        assert out_features % (32 // self.w_bit) == 0
+        pack_num = 32 // self.w_bit
+        int16_pack_num = 16 // self.w_bit
+        assert out_features % (self.interleave) == 0
+        self.register_buffer(
+            "qweight",
+            torch.zeros(
+                (
+                    out_features // self.interleave,
+                    in_features // int16_pack_num * self.interleave,
+                ),
+                dtype=torch.int16,
+                device=dev,
+            ),
+        )
+        # ``scales`` and ``scaled_zeros`` share the same padded shape.
+        zeros_rows = calculate_zeros_width(in_features, self.group_size) * pack_num
+        self.register_buffer(
+            "scales",
+            torch.zeros((zeros_rows, out_features), dtype=dtype, device=dev),
+        )
+        self.register_buffer(
+            "scaled_zeros",
+            torch.zeros((zeros_rows, out_features), dtype=dtype, device=dev),
+        )
+        if bias:
+            self.register_buffer(
+                "bias", torch.zeros((out_features), dtype=dtype, device=dev)
+            )
+        else:
+            self.bias = None
+    @classmethod
+    def from_linear(
+        cls, linear, w_bit, group_size, init_only=False, scales=None, zeros=None
+    ):
+        """Build a ``WQLinear`` from a floating-point linear layer.
+        Args:
+            linear: source layer; its ``in_features``, ``out_features``,
+                ``weight`` and ``bias`` are read.
+            w_bit: weight bit-width (see ``__init__``).
+            group_size: quantization group size; ``-1`` means one group
+                spanning all input features.
+            init_only: when true, return the freshly allocated (all-zero)
+                module without quantizing, e.g. before loading a state dict.
+            scales: per-group scales, indexed as ``[:, group]``.
+                NOTE(review): presumably shaped (out_features, n_groups) --
+                confirm against the caller.
+            zeros: per-group zero points, same layout as ``scales``.
+        Returns:
+            The populated ``WQLinear``.
+        """
+        awq_linear = cls(
+            w_bit,
+            group_size,
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+            linear.weight.device,
+            dtype=linear.weight.data.dtype
+        )
+        if init_only:  # just prepare for loading sd
+            return awq_linear
+        # need scales and zeros info for real quantization
+        assert scales is not None and zeros is not None
+        # Resolve the ``-1`` sentinel the same way ``__init__`` does. Using the
+        # raw value below would make ``calculate_zeros_width`` raise and would
+        # index groups with ``idx // -1``.
+        group_size = awq_linear.group_size
+        scale_zeros = zeros * scales
+        dtype = scales.dtype
+        pack_num = 32 // awq_linear.w_bit
+        # Scales padded out to the width expected by the kernels.
+        qscales = torch.zeros(
+            (
+                scales.shape[0],
+                calculate_zeros_width(linear.in_features, group_size) * pack_num,
+            ),
+            dtype=dtype,
+            device=scales.device,
+        )
+        qscales[:, : scales.shape[1]] = scales
+        awq_linear.scales = qscales.transpose(1, 0).contiguous()
+        if linear.bias is not None:
+            awq_linear.bias = linear.bias.clone().to(dtype)
+        # Quantize one input column at a time: round((w + zero * scale) / scale).
+        intweight = []
+        for idx in range(awq_linear.in_features):
+            group = idx // group_size
+            intweight.append(
+                torch.round(
+                    (linear.weight.data[:, idx] + scale_zeros[:, group])
+                    / qscales[:, group]
+                ).to(torch.int)[:, None]
+            )
+        intweight = torch.cat(intweight, dim=1)
+        intweight = intweight.to(dtype=torch.int32)
+        awq_linear.qweight = pack_intweight(
+            intweight.contiguous(), interleave=4, kstride=64
+        )
+        zeros = zeros.to(dtype=torch.int32)
+        # The kernels consume ``-(scale * zero)``, padded like ``qscales``.
+        scaled_zeros = torch.zeros_like(qscales)
+        scaled_zeros[:, : scales.shape[1]] = -(
+            qscales[:, : scales.shape[1]] * (zeros.to(torch.float32))
+        ).to(dtype)
+        awq_linear.scaled_zeros = scaled_zeros.transpose(1, 0).contiguous()
+        return awq_linear
+    @torch.no_grad()
+    def forward(self, x):
+        """Run the quantized matmul and add the bias when present.
+        Inputs with fewer than 8 rows (``numel / last_dim``) go through the
+        GEMV kernel; everything else goes through the GEMM kernel.
+        """
+        inputs = x
+        if inputs.numel() / inputs.shape[-1] < 8:
+            out = awq_inference_engine.gemv_forward_cuda_new(
+                inputs,
+                self.qweight,
+                self.scales,
+                self.scaled_zeros,
+                inputs.numel() // inputs.shape[-1],
+                self.out_features,
+                self.in_features,
+                self.group_size,
+            )
+        else:
+            out = awq_inference_engine.gemm_forward_cuda_new(
+                inputs, self.qweight, self.scales, self.scaled_zeros
+            )
+        if self.bias is not None:
+            out = out + self.bias
+        return out
+    def extra_repr(self) -> str:
+        """Return the layer hyper-parameters shown in ``repr(module)``."""
+        return (
+            "in_features={}, out_features={}, bias={}, w_bit={}, group_size={}".format(
+                self.in_features,
+                self.out_features,
+                self.bias is not None,
+                self.w_bit,
+                self.group_size,
+            )
+        )

llm-awq/awq/quantize/w8a8_linear.py ADDED Viewed

	@@ -0,0 +1,276 @@

+# Adapted from qserve (https://github.com/mit-han-lab/qserve/tree/main) and modified by Yuming Lou
+from typing import Optional, Union
+from torch.nn import Parameter
+import awq_inference_engine
+import torch
+import gc
+from awq.utils.module import set_op_by_name
+from tqdm import tqdm
+class W8A8OF16LinearStaticScale(torch.nn.Module):
+    """Base module holding an int8 weight buffer and a half-precision dequant scale.
+    ``apply_weights`` is not implemented here (it raises
+    ``NotImplementedError``), so ``forward`` only works once a subclass or
+    instance supplies it.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        scale: Union[torch.tensor, float] = 1.0,
+        params_dtype: Optional[torch.dtype] = None,
+    ):
+        # NOTE(review): ``scale`` and ``params_dtype`` are accepted but not read
+        # anywhere in this constructor.
+        super().__init__()
+        # Keep input parameters
+        self.in_features = in_features
+        self.out_features = out_features
+        # size [1] or size [oc]
+        # Allocated here with one entry per output channel, initialised to 1.
+        self.register_buffer(
+            "dequant_scale", torch.ones(out_features, dtype=torch.half)
+        )
+        # Parameters.
+        # NOTE: torch.nn.functional.linear performs XA^T + b and as a result
+        # we allocate the transpose.
+        self.create_weights()
+        if bias:
+            # Uninitialised fp16 tensor on the current CUDA device, assigned as
+            # a plain attribute (no register_buffer / Parameter wrapper).
+            # NOTE(review): as a plain attribute it is presumably not part of
+            # ``state_dict`` -- confirm this is intended.
+            self.bias = torch.empty(
+                self.out_features,
+                device=torch.cuda.current_device(),
+                dtype=torch.float16,
+            )
+        else:
+            self.register_parameter("bias", None)
+    def create_weights(self) -> None:
+        """Register the uninitialised int8 ``weight`` buffer, shape (out_features, in_features)."""
+        self.register_buffer(
+            "weight",
+            torch.empty(
+                self.out_features,
+                self.in_features,
+                dtype=torch.int8,
+                requires_grad=False,
+            ),
+        )
+    def apply_weights(
+        self,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """Placeholder for the actual matmul; always raises ``NotImplementedError``."""
+        raise NotImplementedError
+    def forward(self, input_):
+        """Return ``(apply_weights(input_, bias), bias)``."""
+        # Matrix multiply.
+        output = self.apply_weights(input_, self.bias)
+        output_bias = self.bias
+        return output, output_bias
+class W8A8OF16LinearDynamicInputScale(W8A8OF16LinearStaticScale):
+    """W8A8 linear layer whose input scale is supplied on every call.
+    The GEMM is delegated to ``awq_inference_engine``; the kernel (with or
+    without fused bias) is selected once in ``__init__``.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        scale: Union[torch.tensor, float] = 1.0,
+        params_dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__(
+            in_features=in_features,
+            out_features=out_features,
+            bias=bias,
+            scale=scale,
+            params_dtype=params_dtype,
+        )
+        # Bind the kernel wrapper as an instance attribute; it replaces the
+        # base-class ``apply_weights`` for this object.
+        if bias:
+            self.apply_weights = self.apply_weights_bias
+        else:
+            self.apply_weights = self.apply_weights_no_bias
+    # With bias: fused bias + W8A8 GEMM
+    def apply_weights_bias(
+        self,
+        # [batch, tokens, channels]
+        x: torch.Tensor,
+        # [batch * tokens]
+        input_scale: torch.Tensor,
+        output_buffer: torch.Tensor,
+        bias: torch.Tensor = None,
+    ):
+        """Call the fused-bias W8A8 GEMM kernel; has no ``return`` statement.
+        NOTE(review): the result is presumably written into ``output_buffer``
+        by the kernel -- confirm against the extension. Inputs with more than
+        two dimensions hit ``assert 0``, despite the shape hint on ``x``.
+        """
+        x_shape = x.shape
+        if len(x.shape) > 2:
+            assert 0, "Not implemented"
+            # Unreachable while assertions are enabled.
+            x = x.view(-1, x_shape[-1])
+        # If use awq_inference_engine.w8a8_gemm_fuse_bias_forward_cuda
+        awq_inference_engine.w8a8_gemm_fuse_bias_forward_cuda(
+        x, self.weight, self.dequant_scale, input_scale, output_buffer, bias
+        )
+        if len(x.shape) > 2:
+            assert 0, "Not implemented 2"
+            # Unreachable while assertions are enabled; only rebinds a local.
+            output_buffer = output_buffer.view(*x_shape[:-1], -1)
+    # Without bias: plain W8A8 GEMM. Same contract as ``apply_weights_bias``,
+    # except that ``bias`` is accepted but not passed to the kernel.
+    def apply_weights_no_bias(
+            self,
+            # [batch, tokens, channels]
+            x: torch.Tensor,
+            # [batch * tokens]
+            input_scale: torch.Tensor,
+            output_buffer: torch.Tensor,
+            bias: torch.Tensor = None,
+        ):
+            x_shape = x.shape
+            if len(x.shape) > 2:
+                assert 0, "Not implemented"
+                # Unreachable while assertions are enabled.
+                x = x.view(-1, x_shape[-1])
+            # If use awq_inference_engine.w8a8_gemm_forward_cuda
+            awq_inference_engine.w8a8_gemm_forward_cuda(
+                x, self.weight, self.dequant_scale, input_scale, output_buffer
+            )
+            if len(x.shape) > 2:
+                assert 0, "Not implemented 2"
+                # Unreachable while assertions are enabled; only rebinds a local.
+                output_buffer = output_buffer.view(*x_shape[:-1], -1)
+    def forward(self, input_, input_scale, output_buffer):
+        """Dispatch to the kernel chosen in ``__init__``; returns ``None``."""
+        # Matrix multiply.
+        self.apply_weights(input_, input_scale, output_buffer, self.bias)
+    @classmethod
+    def from_linear(
+        cls,
+        linear,
+        init_only=False,
+        s1_scale=None,
+        fc1=False,
+    ):
+        """Quantize ``linear`` to int8 and return the new module moved to CUDA.
+        With ``init_only`` the freshly constructed module is returned before
+        any quantization. When ``s1_scale`` is ``None`` it is computed per
+        output row as ``max(|w|)`` clamped to at least 1e-5, divided by 127.
+        ``fc1`` is not read in this method.
+        NOTE: ``linear.weight.data`` is divided and rounded in place, so the
+        source layer's weights are overwritten.
+        """
+        q_linear = cls(
+            linear.in_features,
+            linear.out_features,
+            linear.bias is not None,
+        )
+        if init_only:  # just prepare for loading sd
+            return q_linear
+        if s1_scale is None:
+            s1_scale, _ = torch.max(abs(linear.weight.data), dim=-1, keepdim=True)
+            s1_scale = s1_scale.clamp_(min=1e-5).div_(127)
+        if linear.bias is not None:
+            q_linear.bias = linear.bias.clone().half().contiguous().cuda()
+        ## Quantize the weights
+        # ---- Quantize the weights to int8 ---- #
+        linear_weight = linear.weight.data  # OC, IC
+        linear_weight = linear_weight.div_(s1_scale.to(linear_weight.device))
+        linear_weight = linear_weight.round_().to(torch.int8)
+        # The int8 values take a detour through fp16 before being copied into
+        # the int8 ``weight`` buffer (``from_qkv`` copies them directly).
+        q_linear.weight.data[:, :] = linear_weight.half().contiguous().cuda()
+        # ---- Pack the scales ---- #
+        q_linear.dequant_scale.data[:] = (
+            s1_scale.reshape(-1).half().contiguous().cuda()
+        )
+        return q_linear.cuda()
+    @classmethod
+    def from_qkv(
+        cls,
+        q,
+        k,
+        v,
+        init_only=False,
+        s1_scale=None,
+    ):
+        """Fuse three projections into one quantized layer, returned on CUDA.
+        The weights of ``q``, ``k`` and ``v`` are concatenated along the output
+        dimension and quantized as in ``from_linear``. The in-place ops act on
+        the concatenated copy, not on the source layers. Whether a fused bias
+        is built is decided by ``q.bias`` alone.
+        """
+        q_linear = cls(
+            q.in_features,
+            q.out_features + k.out_features + v.out_features,
+            q.bias is not None,
+        )
+        if init_only:  # just prepare for loading sd
+            return q_linear
+        weight = torch.cat([q.weight.data, k.weight.data, v.weight.data], dim=0)
+        if s1_scale is None:
+            s1_scale, _ = torch.max(abs(weight), dim=-1, keepdim=True)
+            s1_scale = s1_scale.clamp_(min=1e-5).div_(127)
+        if q.bias is not None:
+            bias = torch.cat([q.bias, k.bias, v.bias], dim=0)
+            q_linear.bias = bias.clone().half().contiguous().cuda()
+        # ---- Quantize the weights to int8 ---- #
+        weight = weight.div_(s1_scale.to(weight.device))
+        weight = weight.round_().to(torch.int8)
+        q_linear.weight.data[:, :] = weight.contiguous().cuda()
+        # ---- Pack the scales ---- #
+        q_linear.dequant_scale.data[:] = (
+            s1_scale.reshape(q.out_features + k.out_features + v.out_features)
+            .half()
+            .contiguous().cuda()
+        )
+        return q_linear.cuda()
+class FakeW8A8Linear(torch.nn.Module):
+    """Simulated ("fake") quantized linear layer kept in half precision.
+    Weights are rounded to a ``wbit`` grid once in ``from_linear``;
+    activations are rounded to the same grid on every ``forward`` call, using
+    one scale per slice along the last dimension.
+    """
+    def __init__(
+        self, in_features: int, out_features: int, bias: bool = True, wbit: int = 8
+    ):
+        """Allocate uninitialised half-precision parameters.
+        Args:
+            in_features: input dimension.
+            out_features: output dimension.
+            bias: whether to allocate a ``(1, out_features)`` bias parameter.
+            wbit: bit-width of the simulated integer grid.
+        """
+        super().__init__()
+        self.weight = torch.nn.Parameter(
+            torch.empty(out_features, in_features, dtype=torch.half)
+        )
+        if bias:
+            self.bias = torch.nn.Parameter(
+                torch.empty(1, out_features, dtype=torch.half)
+            )
+        else:
+            self.bias = None
+        self.wbit = wbit
+        # Largest magnitude on the signed integer grid, e.g. 127 for 8 bits.
+        self.maxv = 2 ** (wbit - 1) - 1
+    def forward(self, input):
+        """Fake-quantize ``input`` and apply the linear transform.
+        NOTE: the quantization is done in place, so the caller's tensor is
+        overwritten with its quantize-dequantize round trip.
+        """
+        scales = input.abs().max(dim=-1, keepdim=True)[0]
+        scales.clamp_(min=1e-5).div_(self.maxv)
+        input.div_(scales).round_().mul_(scales)
+        return torch.nn.functional.linear(input, self.weight, self.bias)
+    @classmethod
+    def from_linear(cls, linear: torch.nn.Linear, wbit=8):
+        """Build a fake-quantized copy of ``linear``.
+        Each weight row is scaled by ``max(|row|) / maxv`` (clamped to at
+        least 1e-5), rounded, and scaled back. The source layer is not modified.
+        """
+        fake_linear = cls(
+            linear.in_features, linear.out_features, linear.bias is not None, wbit
+        )
+        maxv = 2 ** (wbit - 1) - 1
+        # The parameters require grad, so in-place copies into them are only
+        # legal with autograd disabled. Doing that here means callers no longer
+        # need to wrap the call in ``torch.no_grad()`` themselves.
+        with torch.no_grad():
+            scale = (
+                torch.max(abs(linear.weight.data.detach()), -1, keepdim=True)[0]
+                .clamp_(min=1e-5)
+                .div_(maxv)
+            )
+            weight = linear.weight.data / scale
+            weight = weight.round_()
+            weight = weight * scale
+            fake_linear.weight.copy_(weight.contiguous())
+            if linear.bias is not None:
+                fake_linear.bias.copy_(
+                    linear.bias.detach().half().reshape(1, linear.out_features).contiguous()
+                )
+        # Drop the temporaries before asking CUDA to release cached blocks.
+        del linear, scale, weight
+        torch.cuda.empty_cache()
+        return fake_linear
+def fake_quant(model, wbit=8):
+    """Swap every ``torch.nn.Linear`` in ``model`` for a ``FakeW8A8Linear``.
+    The model is modified in place through ``set_op_by_name``; nothing is
+    returned.
+    """
+    module_count = len(list(model.named_modules()))
+    progress = tqdm(
+        model.named_modules(), desc="Fake quantizing", total=module_count
+    )
+    for name, m in progress:
+        if not isinstance(m, torch.nn.Linear):
+            continue
+        FQlinear = FakeW8A8Linear.from_linear(m, wbit)
+        # Release the local reference to the original layer before installing
+        # its replacement.
+        del m
+        torch.cuda.empty_cache()
+        set_op_by_name(model, name, FQlinear)

llm-awq/awq/utils/lm_eval_adaptor.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import transformers
+import torch
+from lm_eval.base import BaseLM
+import fnmatch
+class LMEvalAdaptor(BaseLM):
+    """Expose a Hugging Face style model/tokenizer pair through ``BaseLM``."""
+    def __init__(self, model_name, model, tokenizer, batch_size=1, max_length=-1):
+        """Store the model (switched to eval mode), tokenizer and limits.
+        ``max_length=-1`` means "derive the limit from the model config".
+        """
+        super().__init__()
+        assert isinstance(batch_size, int)
+        self.model_name = model_name
+        self.model = model
+        self.model.eval()
+        self.tokenizer = tokenizer
+        self.vocab_size = self.tokenizer.vocab_size
+        self._batch_size = batch_size
+        self._max_length = max_length
+    @property
+    def eot_token_id(self):
+        """End-of-text token id; the tokenizer's EOS id is used for it."""
+        return self.tokenizer.eos_token_id
+    @property
+    def max_length(self):
+        """Maximum sequence length.
+        An explicit ``max_length`` wins. Otherwise the first of ``n_ctx``,
+        ``max_position_embeddings`` and ``n_positions`` present on the model
+        config is used, then a 2048 fallback for known model families.
+        """
+        if self._max_length != -1:
+            return self._max_length
+        config = self.model.config
+        for attr in ("n_ctx", "max_position_embeddings", "n_positions"):
+            if hasattr(config, attr):
+                return getattr(config, attr)
+        # TODO: the 2048 value was not checked for llama
+        known_families = ("bloom", "llama", "mpt", "falcon")
+        if any(family in self.model_name for family in known_families):
+            return 2048
+        print(config)
+        raise NotImplementedError
+    @property
+    def max_gen_toks(self):
+        """Fixed generation budget of 256 tokens."""
+        return 256
+    @property
+    def batch_size(self):
+        """Batch size given at construction time."""
+        return self._batch_size
+    @property
+    def device(self):
+        """Always reports ``"cuda"``."""
+        return "cuda"
+    def tok_encode(self, string: str):
+        """Tokenize ``string`` without adding special tokens."""
+        return self.tokenizer.encode(string, add_special_tokens=False)
+    def tok_decode(self, tokens):
+        """Turn token ids back into text."""
+        return self.tokenizer.decode(tokens)
+    def _model_call(self, inps):
+        """
+        inps: a torch tensor of shape [batch, sequence]
+        the size of sequence may vary from call to call
+        returns: a torch tensor of shape [batch, sequence, vocab] with the
+        logits returned from the model
+        """
+        with torch.no_grad():
+            extra_kwargs = {}
+            is_t5 = isinstance(
+                self.model,
+                transformers.models.t5.modeling_t5.T5ForConditionalGeneration,
+            )
+            if is_t5:
+                # Prepend the decoder start token to every row of the batch.
+                start_tokens = torch.tensor(
+                    self.model.generation_config.decoder_start_token_id,
+                )
+                start_tokens = start_tokens.tile(len(inps), 1).to(inps)
+                extra_kwargs["decoder_input_ids"] = torch.cat(
+                    [start_tokens, inps], dim=1
+                )
+            logits = self.model(inps, **extra_kwargs)[0]
+            if "opt" in self.model_name:
+                # there are a few extra tokens in opt, which we should omit
+                return logits[:, :, :50257]
+            return logits
+    def _model_generate(self, context, max_length, eos_token_id):
+        """Greedy generation (``do_sample=False``) from ``context``."""
+        return self.model.generate(
+            context,
+            max_length=max_length,
+            eos_token_id=eos_token_id,
+            do_sample=False,
+        )

llm-awq/awq/utils/utils.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import torch
+import accelerate
+def get_module_by_name_suffix(model, module_name: str):
+    """Return the first sub-module whose qualified name ends with ``module_name``.
+    Modules are visited in ``model.named_modules()`` order; ``None`` is
+    returned when no name matches.
+    """
+    matches = (
+        module
+        for name, module in model.named_modules()
+        if name.endswith(module_name)
+    )
+    return next(matches, None)
+def simple_dispatch_model(model, device_map):
+    """Place ``model`` according to ``device_map`` and record the map on it.
+    A map containing the ``""`` key moves the whole model to that device.
+    Otherwise modules mapped to ``"cpu"`` get chained CPU-offload hooks and
+    every other entry gets an ``AlignDevicesHook`` for its device. Tied
+    parameters are re-tied at the end.
+    Returns:
+        The model, with ``hf_device_map`` set to ``device_map``.
+    """
+    from accelerate.hooks import add_hook_to_module, AlignDevicesHook
+    if "" in device_map:
+        whole_model_device = device_map[""]
+        model = model.to(torch.device(whole_model_device))
+        model.hf_device_map = device_map
+        return model
+    tied_params = accelerate.utils.modeling.find_tied_parameters(model)
+    placements = set(device_map.values())
+    if placements in ({"cpu"}, {"cpu", "disk"}):
+        main_device = "cpu"
+    else:
+        main_device = [d for d in device_map.values() if d not in ["cpu", "disk"]][0]
+    cpu_offload_names = [n for n, d in device_map.items() if d == "cpu"]
+    # Chain the offload hooks so each module knows its predecessor.
+    last_hook = None
+    for offload_name in cpu_offload_names:
+        offloaded = get_module_by_name_suffix(model, offload_name)
+        _, last_hook = accelerate.cpu_offload_with_hook(
+            offloaded, execution_device=main_device, prev_module_hook=last_hook
+        )
+    # set first cpu offload module's prev_module_hook to the last cpu offload module's hook
+    if len(cpu_offload_names) > 1:
+        first_offloaded = get_module_by_name_suffix(model, cpu_offload_names[0])
+        first_offloaded._hf_hook.prev_module_hook = last_hook
+    for n, d in device_map.items():
+        m = get_module_by_name_suffix(model, n)
+        if d == "cpu":
+            continue
+        align_hook = AlignDevicesHook(
+            torch.device(d), io_same_device=True, place_submodules=True
+        )
+        add_hook_to_module(m, align_hook)
+    accelerate.utils.modeling.retie_parameters(model, tied_params)
+    model.hf_device_map = device_map
+    return model

llm-awq/examples/convert_to_hf.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# This script demonstrates how you can convert your model into HF format
+# easily and push the quantized weights on the Hub using simple tools.
+# Make sure to have transformers > 4.34 and that you have run
+# `huggingface-cli login` on your terminal before running this
+# script
+import os
+import argparse
+# This demo only supports a single GPU for now
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+from transformers import AutoConfig, AwqConfig, AutoTokenizer
+from huggingface_hub import HfApi
+api = HfApi()
+# Command-line interface: three required paths plus the quantization settings.
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--model_path",
+    required=True,
+    type=str,
+    help="path of the original hf model",
+)
+parser.add_argument(
+    "--quantized_model_path",
+    required=True,
+    type=str,
+    help="path of the quantized AWQ model",
+)
+parser.add_argument(
+    "--quantized_model_hub_path",
+    required=True,
+    type=str,
+    help="path of the quantized AWQ model to push on the Hub",
+)
+parser.add_argument("--w_bit", default=4, type=int, help="")
+parser.add_argument("--q_group_size", default=128, type=int)
+parser.add_argument("--no_zero_point", action="store_true")
+args = parser.parse_args()
+hub_repo_id = args.quantized_model_hub_path
+# Build the AwqConfig that describes how the checkpoint was quantized
+quantization_config = AwqConfig(
+    bits=args.w_bit,
+    group_size=args.q_group_size,
+    zero_point=not args.no_zero_point,
+    backend="llm-awq",
+    version="gemv",
+)
+# Attach it to the original model's config
+config = AutoConfig.from_pretrained(args.model_path)
+config.quantization_config = quantization_config
+# The tokenizer is taken unchanged from the original model
+tok = AutoTokenizer.from_pretrained(args.model_path)
+# Publish config and tokenizer first, then the weights file
+config.push_to_hub(hub_repo_id)
+tok.push_to_hub(hub_repo_id)
+api.upload_file(
+    path_or_fileobj=args.quantized_model_path,
+    path_in_repo="pytorch_model.bin",
+    repo_id=hub_repo_id,
+    repo_type="model",
+)

llm-awq/examples/llava_demo.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

llm-awq/figures/vila-logo.jpg ADDED Viewed

llm-awq/scripts/codellama_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+# End-to-end AWQ example for CodeLlama: search, simulated evaluation,
+# real 4-bit weight export, and evaluation of the exported weights.
+MODEL=CodeLlama-13b-Instruct
+# Step 1: run the AWQ search and dump the results
+# (optional; pre-computed results are provided)
+python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
+    --w_bit 4 --q_group_size 128 \
+    --run_awq --dump_awq awq_cache/$MODEL-w4-g128.pt
+# Step 2: evaluate the AWQ-quantized model on wikitext
+# (simulated pseudo quantization via --q_backend fake)
+python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
+    --tasks wikitext \
+    --w_bit 4 --q_group_size 128 \
+    --load_awq awq_cache/$MODEL-w4-g128.pt \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them
+python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
+    --w_bit 4 --q_group_size 128 \
+    --load_awq awq_cache/$MODEL-w4-g128.pt \
+    --q_backend real --dump_quant quant_cache/$MODEL-w4-g128-awq.pt
+# Step 4: load and evaluate the real quantized model (smaller GPU memory usage)
+python -m awq.entry --model_path /dataset/codellama-hf/$MODEL \
+    --tasks wikitext \
+    --w_bit 4 --q_group_size 128 \
+    --load_quant quant_cache/$MODEL-w4-g128-awq.pt

llm-awq/scripts/llama2_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=llama-2-7b
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/llama2-hf/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/scripts/llama3_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=llama3-8b
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/models/llama3/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/scripts/llama_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=llama-7b
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/llama-hf/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/scripts/opt_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=opt-6.7b
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/opt/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/scripts/qwen_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=qwen2.5-7b
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/models/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/scripts/starcoder_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=starcoder
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/starcoder-hf/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/scripts/vicuna_example.sh ADDED Viewed

	@@ -0,0 +1,25 @@

+MODEL=vicuna-7b
+# Locations shared by the steps below.
+MODEL_PATH=/dataset/vicuna-hf/$MODEL
+AWQ_RESULT=awq_cache/$MODEL-w4-g128.pt
+QUANT_WEIGHTS=quant_cache/$MODEL-w4-g128-awq.pt
+# Step 1 (optional; pre-computed results are provided): run the AWQ search
+# and dump its results.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --run_awq \
+    --dump_awq "$AWQ_RESULT"
+# Step 2: evaluate the AWQ-quantized model on wikitext using simulated
+# (pseudo) quantization.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend fake
+# Step 3: generate the real quantized weights (w4) and dump them.
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_awq "$AWQ_RESULT" \
+    --q_backend real \
+    --dump_quant "$QUANT_WEIGHTS"
+# Step 4: load the real quantized model and evaluate it on wikitext
+# (smaller GPU memory usage).
+python -m awq.entry \
+    --model_path "$MODEL_PATH" \
+    --tasks wikitext \
+    --w_bit 4 \
+    --q_group_size 128 \
+    --load_quant "$QUANT_WEIGHTS"

llm-awq/tinychat/benchmark.py ADDED Viewed

	@@ -0,0 +1,379 @@

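+# Usage:
+# Please install awq/kernels first, then run the script, e.g.:
+#   CUDA_VISIBLE_DEVICES=0 python benchmark.py --context_length 1024 2048
+# (--context_length has no default, so at least one length must be given.)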
+import argparse
+import torch
+import time
+import numpy as np
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, modeling_utils
+import tinychat.utils.constants
+from tinychat.utils.load_quant import load_awq_model
+from awq.quantize.quantizer import real_quantize_model_weight
+from tinychat.utils.tune import (
+    tune_all_wqlinears,
+    device_warmup,
+    tune_llava_patch_embedding,
+)
+from tinychat.modules import make_quant_norm, make_quant_attn, make_fused_mlp
+def skip(*args, **kwargs):
+    pass
+def main():
+    """Benchmark a TinyChat model on synthetic inputs and print the timings.
+
+    Parses the command-line options, builds the model class selected by
+    ``--model_type`` from the config found at ``--model_path`` (with 4-bit
+    quantized layers when ``--precision`` is ``W4A16``), and prints:
+
+    * the mean "Time To First Token" over ten runs for every requested
+      context length (or history/question length pair when
+      ``--chunk_prefilling`` is set), and
+    * for non-VILA models without ``--chunk_prefilling``, the decoding
+      throughput measured over ``token_num`` single-token steps.
+
+    This function does not call any checkpoint-loading helper: the model is
+    constructed from its config and fed constant dummy token ids (and, for
+    VILA, random image tensors).
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_type", type=str, default="LLaMa", help="type of the model"
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default="/data/llm/checkpoints/vicuna-hf/vicuna-7b",
+        help="path to the model",
+    )
+    parser.add_argument("--q_group_size", type=int, default=128)
+    parser.add_argument(
+        "--verbose",
+        default=False,
+        action="store_true",
+        help="Wheter to print more information.",
+    )
+    parser.add_argument(
+        "--max_seq_len",
+        type=int,
+        default=8192,
+        help="maximum sequence length for kv cache",
+    )
+    parser.add_argument(
+        "--max_batch_size", type=int, default=1, help="maximum batch size for kv cache"
+    )
+    parser.add_argument(
+        "--flash_attn",
+        action="store_true",
+        help="whether to use flash attention",
+    )
+    parser.add_argument(
+        "--chunk_prefilling",
+        action="store_true",
+        help="If used, in context stage, the history tokens will not be recalculated, greatly speeding up the calculation",
+    )
+    # NOTE: `type=list` makes argparse turn every value into a list of its
+    # characters (e.g. "1024" -> ["1", "0", "2", "4"]); the loops below rebuild
+    # the integer with int("".join(...)).
+    # NOTE(review): there is no default and the flag is not required, so when
+    # it is omitted args.context_length is None and the len()/iteration below
+    # raises TypeError.
+    parser.add_argument(
+        "--context_length",
+        type=list,
+        nargs="+",
+        help="The length of input. And if chunk_prefilling used, this serves as the length of tokens from history rounds.",
+    )
+    # Same character-list encoding as --context_length.
+    parser.add_argument(
+        "--question_length",
+        type=list,
+        nargs="+",
+        help="The length of new input. Only useful and necessary when benchmarking chunk_prefilling method",
+    )
+    parser.add_argument(
+        "--precision", type=str, default="W4A16", help="compute precision"
+    )
+    args = parser.parse_args()
+    # some checks
+    # --question_length is mandatory whenever --chunk_prefilling is set.
+    assert (args.question_length is not None and args.chunk_prefilling) or (
+        not args.chunk_prefilling
+    ), "If you want to benchmark chunk prefilling, you need specify the question length and context length"
+    assert args.precision in ["W4A16", "W16A16"], "We only support W4A16/W16A16 now"
+    # Number of single-token decoding steps used for the throughput figure.
+    token_num = 256
+    # We support fixing a certain kind of length
+    # i.e. if only one of the two lists has a single entry, that entry is
+    # repeated so both lists have the same length; otherwise they must already
+    # be equally long.
+    if args.chunk_prefilling:
+        if len(args.context_length) == 1 and len(args.question_length) > 1:
+            args.context_length = [
+                args.context_length[0] for _ in range(len(args.question_length))
+            ]
+        elif len(args.question_length) == 1 and len(args.context_length) > 1:
+            args.question_length = [
+                args.question_length[0] for _ in range(len(args.context_length))
+            ]
+        elif len(args.question_length) != len(args.context_length):
+            raise ValueError(
+                "The number of items in the question_length and context_length is expected to be either one or equal!"
+            )
+    # The tinychat constants are set before the model classes are imported.
+    # NOTE(review): presumably the model modules read these globals when they
+    # are imported/instantiated — confirm before reordering.
+    tinychat.utils.constants.max_batch_size = args.max_batch_size
+    tinychat.utils.constants.max_seq_len = args.max_seq_len
+    from tinychat.models import FalconForCausalLM, LlamaForCausalLM, MPTForCausalLM
+    from tinychat.models.vila_llama import VilaLlamaForCausalLM
+    # Replace the torch initializers with the no-op `skip`.
+    # NOTE(review): presumably done so model construction does not spend time
+    # on random weight initialization — confirm.
+    modeling_utils._init_weights = False
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.kaiming_normal_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    # The benchmark always runs on the first visible CUDA device.
+    device = "cuda:0"
+    # Model classes for the non-VILA branch; VILA is handled separately below.
+    model_type_dict = {
+        "llama": LlamaForCausalLM,
+        "falcon": FalconForCausalLM,
+        "mpt": MPTForCausalLM,
+    }
+    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
+    assert args.model_type.lower() in [
+        "llama",
+        "falcon",
+        "mpt",
+        "vila",
+    ], "We only support llama & falcon & mpt & vila now"
+    if "vila" in args.model_type.lower():
+        # ---------------- VILA (vision-language) branch ----------------
+        model = VilaLlamaForCausalLM(config).half()
+        print(model)
+        if args.precision in ["W4A16"]:
+            # Only the language model (`model.llm`) is quantized and fused.
+            real_quantize_model_weight(
+                model.llm,
+                w_bit=4,
+                q_config=dict(q_group_size=args.q_group_size, zero_point=True),
+                init_only=True,
+            )
+            make_quant_attn(model.llm, device, args.flash_attn)
+            make_quant_norm(model.llm)
+            make_fused_mlp(model.llm)
+        model = model.to(device)
+        device_warmup(device)
+        tune_llava_patch_embedding(model.get_vision_tower(), device=device)
+        if not args.chunk_prefilling:
+            # Number of images per context length: int(n * 1 / 196); the loop
+            # below budgets 196 tokens per image.
+            # NOTE(review): the trailing "three thirds" comment presumably
+            # means the whole context is treated as image tokens — confirm.
+            image_num = [
+                int(int("".join(i)) * 1 / 196) for i in args.context_length
+            ]  # consider about three thirds of the history tokens are images
+            if sum(image_num) > 0:
+                # Random fp16 images in [-1, 1), sized for the largest image
+                # count; each run takes a leading slice of this tensor.
+                image_tensor = 2 * torch.rand((max(image_num), 3, 384, 384)) - 1
+                image_tensor = image_tensor.half().to(device)
+            else:
+                image_tensor = None
+        print("huggingface ckpt loaded")
+        # warming up
+        # NOTE(review): the warm-up always uses 2048 tokens, regardless of
+        # --max_seq_len, and runs outside torch.inference_mode().
+        input_ids = [1 for _ in range(2048)]
+        inputs = torch.as_tensor([input_ids], device=device)
+        out = model(
+            inputs, start_pos=0, chunk_prefilling=args.chunk_prefilling
+        )  # warmup
+        if not args.chunk_prefilling:
+            for i, context_length in enumerate(args.context_length):
+                # Rebuild the integer from its list of digit characters.
+                context_length = int("".join(context_length))
+                time_lis = []
+                if image_num[i]:
+                    images = image_tensor[0 : image_num[i], :, :, :]
+                    # One -200 entry per image followed by dummy text tokens.
+                    # NOTE(review): -200 is presumably the image placeholder
+                    # token id — confirm against the model code.
+                    input_ids = [-200 for _ in range(image_num[i])] + [
+                        1 for _ in range(context_length - 196 * image_num[i])
+                    ]
+                else:
+                    images = None
+                    input_ids = [1 for _ in range(context_length)]
+                print("-" * 80)
+                print(
+                    "Context length: {} with {} pictures".format(
+                        context_length, image_num[i]
+                    )
+                )
+                with torch.inference_mode():
+                    # NOTE: this loop rebinds `i` from the enumerate() above;
+                    # image_num[i] has already been read for this context
+                    # length and enumerate() rebinds `i` on the next pass.
+                    for i in range(10):  # Run ten times and get the average value
+                        start_pos = 0
+                        # Synchronize before and after so the wall-clock
+                        # interval brackets the whole forward pass.
+                        torch.cuda.synchronize()
+                        t_st = time.time()
+                        inputs = torch.as_tensor([input_ids], device=device)
+                        out = model(
+                            inputs,
+                            start_pos=start_pos,
+                            chunk_prefilling=args.chunk_prefilling,
+                            images=images,
+                        )
+                        start_pos += inputs.shape[1]
+                        torch.cuda.synchronize()
+                        t_ed = time.time()
+                        # `token` is computed but not used afterwards in this
+                        # branch.
+                        token = out[:, -1].max(1)[1].unsqueeze(1)
+                        time_lis.append(t_ed - t_st)
+                        if args.verbose:
+                            print(i, t_ed - t_st)
+                    print(f"Time To First Token: {np.mean(time_lis):.5f} s.")
+                    print("-" * 80)
+        else:
+            # Chunk prefilling: an untimed "history" pass followed by the
+            # timed "question" pass. No images are used here.
+            for i, (context_length, question_length) in enumerate(
+                zip(args.context_length, args.question_length)
+            ):
+                context_length = int("".join(context_length))
+                question_length = int("".join(question_length))
+                input_ids_old = [1 for _ in range(context_length)]
+                images = None
+                input_ids_new = [1 for _ in range(question_length)]
+                time_lis = []
+                print("-" * 80)
+                print(
+                    "History length: {} ; Question length: {}".format(
+                        context_length, question_length
+                    )
+                )
+                with torch.inference_mode():
+                    for i in range(10):  # Run ten times and get the average value
+                        # history rounds
+                        # The history pass is skipped unless the history is
+                        # longer than the question.
+                        start_pos = 0
+                        if context_length > question_length:
+                            inputs = torch.as_tensor([input_ids_old], device=device)
+                            out = model(
+                                inputs,
+                                start_pos=start_pos,
+                                chunk_prefilling=args.chunk_prefilling,
+                                images=None,
+                            )
+                            start_pos += context_length
+                        # the present round
+                        # Only this pass is timed.
+                        torch.cuda.synchronize()
+                        t_st = time.time()
+                        inputs = torch.as_tensor([input_ids_new], device=device)
+                        out = model(
+                            inputs,
+                            start_pos=start_pos,
+                            chunk_prefilling=args.chunk_prefilling,
+                        )
+                        start_pos += inputs.shape[1]
+                        torch.cuda.synchronize()
+                        t_ed = time.time()
+                        token = out[:, -1].max(1)[1].unsqueeze(1)
+                        time_lis.append(t_ed - t_st)
+                        if args.verbose:
+                            print(i, t_ed - t_st)
+                    print(
+                        f"Time To First Token of this round: {np.mean(time_lis):.5f} s."
+                    )
+                    print("-" * 80)
+    else:
+        # ---------------- text-only branch (llama / falcon / mpt) ----------------
+        model = model_type_dict[args.model_type.lower()](config).half()
+        if args.precision in ["W4A16"]:
+            real_quantize_model_weight(
+                model,
+                w_bit=4,
+                q_config=dict(q_group_size=args.q_group_size, zero_point=True),
+                init_only=True,
+            )
+        model = model.to(device)
+        if args.precision in ["W4A16"]:
+            # Unlike the VILA branch, the fused modules are created after the
+            # model has been moved to the device.
+            # tune_all_wqlinears(model)
+            make_quant_attn(model, device, args.flash_attn)
+            make_quant_norm(model)
+            make_fused_mlp(model)
+        device_warmup(device)
+        print("huggingface ckpt loaded")
+        # warming up
+        # NOTE(review): as above, the warm-up uses a fixed 2048 tokens and
+        # runs outside torch.inference_mode().
+        input_ids = [1 for _ in range(2048)]
+        inputs = torch.as_tensor([input_ids], device=device)
+        out = model(
+            inputs,
+            start_pos=0,
+            chunk_prefilling=args.chunk_prefilling,
+            quant=args.precision in ["W4A16"],
+        )  # warmup
+        if not args.chunk_prefilling:
+            for context_length in args.context_length:
+                # Rebuild the integer from its list of digit characters.
+                context_length = int("".join(context_length))
+                input_ids = [1 for _ in range(context_length)]
+                time_lis = []
+                print("-" * 80)
+                print("Context length: {}".format(context_length))
+                with torch.inference_mode():
+                    for i in range(10):  # Run ten times and get the average value
+                        start_pos = 0
+                        torch.cuda.synchronize()
+                        t_st = time.time()
+                        inputs = torch.as_tensor([input_ids], device=device)
+                        out = model(
+                            inputs,
+                            start_pos=start_pos,
+                            chunk_prefilling=args.chunk_prefilling,
+                            quant=args.precision in ["W4A16"],
+                        )
+                        start_pos += inputs.shape[1]
+                        torch.cuda.synchronize()
+                        t_ed = time.time()
+                        # The token from the last prefill run seeds the
+                        # decoding loop below.
+                        # NOTE(review): argmax is called without `dim`, i.e.
+                        # over the flattened output — confirm this yields the
+                        # intended next-token id.
+                        token = torch.argmax(out, keepdim=True)[0]
+                        time_lis.append(t_ed - t_st)
+                        if args.verbose:
+                            print(i, t_ed - t_st)
+                    print(f"Time To First Token: {np.mean(time_lis):.5f} s.")
+                    # decoding throughput
+                    # A single wall-clock interval covers all `token_num`
+                    # steps, each feeding the previously selected token back in.
+                    time_lis = []
+                    start_pos = context_length
+                    torch.cuda.synchronize()
+                    t_st = time.time()
+                    for i in range(token_num):
+                        token = model(
+                            token,
+                            start_pos=start_pos,
+                            chunk_prefilling=args.chunk_prefilling,
+                            quant=args.precision in ["W4A16"],
+                        )
+                        start_pos += 1
+                        token = torch.argmax(token, keepdim=True)[0]
+                        torch.cuda.synchronize()
+                    t_ed = time.time()
+                    time_lis.append(t_ed - t_st)
+                    print(
+                        f"Decoding throughput: {token_num/sum(time_lis):.5f} token/s."
+                    )
+                    print("-" * 80)
+        else:
+            # Chunk prefilling: an untimed "history" pass followed by the
+            # timed "question" pass.
+            for context_length, question_length in zip(
+                args.context_length, args.question_length
+            ):
+                context_length = int("".join(context_length))
+                question_length = int("".join(question_length))
+                input_ids_old = [1 for _ in range(context_length)]
+                input_ids_new = [1 for _ in range(question_length)]
+                time_lis = []
+                print("-" * 80)
+                print(
+                    "History length: {} ; Question length: {}".format(
+                        context_length, question_length
+                    )
+                )
+                with torch.inference_mode():
+                    for i in range(10):  # Run ten times and get the average value
+                        # history rounds
+                        # The history pass is skipped unless the history is
+                        # longer than the question.
+                        start_pos = 0
+                        if context_length > question_length:
+                            inputs = torch.as_tensor([input_ids_old], device=device)
+                            out = model(
+                                inputs,
+                                start_pos=start_pos,
+                                chunk_prefilling=args.chunk_prefilling,
+                                quant=args.precision in ["W4A16"],
+                            )
+                            start_pos += inputs.shape[1]
+                        # the present round
+                        # Only this pass is timed.
+                        torch.cuda.synchronize()
+                        t_st = time.time()
+                        inputs = torch.as_tensor([input_ids_new], device=device)
+                        out = model(
+                            inputs,
+                            start_pos=start_pos,
+                            chunk_prefilling=args.chunk_prefilling,
+                            quant=args.precision in ["W4A16"],
+                        )
+                        start_pos += inputs.shape[1]
+                        torch.cuda.synchronize()
+                        t_ed = time.time()
+                        token = out[:, -1].max(1)[1].unsqueeze(1)
+                        time_lis.append(t_ed - t_st)
+                        if args.verbose:
+                            print(i, t_ed - t_st)
+                    print(
+                        f"Time To First Token of this round: {np.mean(time_lis):.5f} s."
+                    )
+                    print("-" * 80)
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()

llm-awq/tinychat/demo.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import argparse
+import time
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, modeling_utils
+from attributedict.collections import AttributeDict
+from tinychat.stream_generators import StreamGenerator
+import tinychat.utils.constants
+from tinychat.utils.load_quant import load_awq_model, load_awq_llama_fast
+from tinychat.utils.prompt_templates import get_prompter, get_stop_token_ids
+from tinychat.utils.tune import device_warmup, tune_all_wqlinears
+import os
+# Expose only GPU 0 to this process.
+# NOTE(review): this runs after `import torch`; presumably it still takes
+# effect because CUDA has not been initialized at this point — confirm.
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# opt_params in TinyLLMEngine
+# Module-level generation/sampling defaults. The __main__ section further
+# down overwrites n_predict (1024) and n_vocab (32000) after parsing the
+# command line.
+gen_params = AttributeDict(
+    [
+        ("seed", -1),  # RNG seed
+        ("n_threads", 1),  # TODO: fix this
+        ("n_predict", 512),  # new tokens to predict
+        ("n_parts", -1),  # amount of model parts (-1: determine from model dimensions)
+        ("n_ctx", 512),  # context size
+        ("n_batch", 512),  # batch size for prompt processing (must be >=32 to use BLAS)
+        ("n_keep", 0),  # number of tokens to keep from initial prompt
+        ("n_vocab", 50272),  # vocabulary size
+        # sampling parameters
+        ("logit_bias", dict()),  # logit bias for specific tokens: <int, float>
+        ("top_k", 40),  # <= 0 to use vocab size
+        ("top_p", 0.95),  # 1.0 = disabled
+        ("tfs_z", 1.00),  # 1.0 = disabled
+        ("typical_p", 1.00),  # 1.0 = disabled
+        ("temp", 0.70),  # 1.0 = disabled
+        ("repeat_penalty", 1.10),  # 1.0 = disabled
+        (
+            "repeat_last_n",
+            64,
+        ),  # last n tokens to penalize (0 = disable penalty, -1 = context size)
+        ("frequency_penalty", 0.00),  # 0.0 = disabled
+        ("presence_penalty", 0.00),  # 0.0 = disabled
+        ("mirostat", 0),  # 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+        ("mirostat_tau", 5.00),  # target entropy
+        ("mirostat_eta", 0.10),  # learning rate
+    ]
+)
+def stream_output(output_stream):
+    print(f"ASSISTANT: ", end="", flush=True)
+    pre = 0
+    for outputs in output_stream:
+        output_text = outputs["text"]
+        output_text = output_text.strip().split(" ")
+        now = len(output_text) - 1
+        if now > pre:
+            print(" ".join(output_text[pre:now]), end=" ", flush=True)
+            pre = now
+    print(" ".join(output_text[pre:]), flush=True)
+    if "timing" in outputs and outputs["timing"] is not None:
+        timing = outputs["timing"]
+        context_tokens = timing["context_tokens"]
+        context_time = timing["context_time"]
+        total_tokens = timing["total_tokens"]
+        generation_time_list = timing["generation_time_list"]
+        generation_tokens = len(generation_time_list)
+        average_speed = (context_time + np.sum(generation_time_list)) / (
+            context_tokens + generation_tokens
+        )
+        print("=" * 50)
+        print("Speed of Inference")
+        print("-" * 50)
+        print(f"TTFT : { context_time:.3f} s for {context_tokens} tokens")
+        print(
+            f"Speed of Generation : {np.average(generation_time_list)*1000:.2f} ms/token"
+        )
+        print("=" * 50)
+    return " ".join(output_text), total_tokens
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_type", type=str, default="LLaMa", help="type of the model"
+    )
+    parser.add_argument(
+        "--dtype", type=str, default="float16", choices=["float16", "bfloat16"]
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        help="path to the model",
+    )
+    parser.add_argument(
+        "--precision", type=str, default="W4A16", help="compute precision"
+    )
+    parser.add_argument("--device", type=str, default="cuda:0")
+    parser.add_argument("--q_group_size", type=int, default=128)
+    parser.add_argument(
+        "--load_quant",
+        type=str,
+        help="path to the pre-quanted 4-bit weights",
+    )
+    parser.add_argument(
+        "--max_seq_len",
+        type=int,
+        default=2048,
+        help="maximum sequence length for kv cache",
+    )
+    parser.add_argument(
+        "--max_batch_size", type=int, default=1, help="maximum batch size for kv cache"
+    )
+    parser.add_argument(
+        "--mem_efficient_load",
+        action="store_true",
+        help="enable mem_efficient_load mod",
+    )
+    parser.add_argument(
+        "--single_round",
+        action="store_true",
+        help="whether to memorize previous conversations",
+    )
+    parser.add_argument(
+        "--flash_attn",
+        action="store_true",
+        help="whether to use flash attention",
+    )
+    parser.add_argument(
+        "--chunk_prefilling",
+        action="store_true",
+        help="If used, in context stage, the history tokens will not be recalculated, greatly speeding up the calculation",
+    )
+    args = parser.parse_args()
+    assert args.model_type.lower() in [
+        "llama",
+        "falcon",
+        "mpt",
+        "qwen",
+    ], "We only support llama & falcon & mpt now"
+    assert args.precision in ["W4A16", "W16A16"], "We only support W4A16/W16A16 now"
+    gen_params.n_predict = 1024
+    gen_params.n_vocab = 32000
+    tinychat.utils.constants.max_batch_size = args.max_batch_size
+    tinychat.utils.constants.max_seq_len = args.max_seq_len
+    tinychat.utils.constants.mem_efficient_load = args.mem_efficient_load
+    if tinychat.utils.constants.mem_efficient_load:
+        print("=" * 80)
+        print(
+            "[Info] You have activated mem_efficient_load mode.\n       Less on-chip memory will be consumed when loading the model.\n       However, the loading process will take more time."
+        )
+        print("=" * 80)
+    # TODO (Haotian): a more elegant implementation here.
+    # We need to update these global variables before models use them.
+    from tinychat.models import (
+        FalconForCausalLM,
+        LlamaForCausalLM,
+        MPTForCausalLM,
+        Qwen2ForCausalLM,
+    )
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.kaiming_normal_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
+    if "mpt" in config.__class__.__name__.lower():
+        # config.init_device="meta"
+        tokenizer = AutoTokenizer.from_pretrained(
+            config.tokenizer_name, trust_remote_code=True
+        )
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            args.model_path, use_fast=False, trust_remote_code=True
+        )
+    torch_dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
+    modeling_utils._init_weights = False
+    torch.set_default_dtype(torch_dtype)
+    model_type_dict = {
+        "llama": LlamaForCausalLM,
+        "falcon": FalconForCausalLM,
+        "mpt": MPTForCausalLM,
+        "qwen": Qwen2ForCausalLM,
+    }
+    if args.precision == "W4A16":
+        if args.model_type.lower() == "llama":
+            model = model_type_dict["llama"](config).to(torch_dtype)
+            model = load_awq_llama_fast(
+                model, args.load_quant, 4, args.q_group_size, args.device
+            )
+        elif args.model_type.lower() == "qwen":
+            model = model_type_dict["qwen"](config).to(torch_dtype)
+            model = load_awq_llama_fast(
+                model, args.load_quant, 4, args.q_group_size, args.device
+            )
+        else:
+            model = model_type_dict[args.model_type.lower()](config).to(torch_dtype)
+            model = load_awq_model(
+                model, args.load_quant, 4, args.q_group_size, args.device
+            )
+    else:
+        loaded_model = AutoModelForCausalLM.from_pretrained(
+            args.model_path,
+            config=config,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+        )
+        model = (
+            model_type_dict[args.model_type.lower()](config)
+            .to(torch_dtype)
+            .to(args.device)
+        )
+        model.load_state_dict(loaded_model.state_dict())
+    # device warm up
+    device_warmup(args.device)
+    # autotune split_k_iters
+    # tune_all_wqlinears(model)
+    # TODO (Haotian): Verify if the StreamGenerator still works for the unmodified falcon impl.
+    stream_generator = StreamGenerator
+    # Optimize AWQ quantized model
+    if args.precision == "W4A16" and (
+        args.model_type.lower() == "llama" or args.model_type.lower() == "qwen"
+    ):
+        from tinychat.modules import make_quant_norm, make_quant_attn
+        if args.flash_attn:
+            make_quant_attn(model, args.device, args.flash_attn)
+        else:
+            make_quant_attn(model, args.device)
+        make_quant_norm(model)
+    model(
+        torch.randint(0, 1000, (1, 512), dtype=torch.int, device="cuda:0"),
+        0,
+        quant=args.precision == "W4A16",
+    )
+    if args.max_seq_len <= 1024:
+        short_prompt = True
+    else:
+        short_prompt = False
+    model_prompter = get_prompter(args.model_type, args.model_path, short_prompt)
+    stop_token_ids = get_stop_token_ids(args.model_type, args.model_path)
+    count = 0
+    start_pos = 0
+    print("=" * 50)
+    while True:
+        # Get input from the user
+        input_prompt = input("USER: ")
+        if input_prompt == "":
+            print("EXIT...")
+            break
+        model_prompter.insert_prompt(input_prompt)
+        output_stream = stream_generator(
+            model,
+            tokenizer,
+            model_prompter.model_input,
+            start_pos,
+            gen_params,
+            device=args.device,
+            stop_token_ids=stop_token_ids,
+            chunk_prefilling=args.chunk_prefilling,
+            quant_llm=args.precision == "W4A16",
+        )
+        outputs, total_tokens = stream_output(output_stream)
+        if args.chunk_prefilling:
+            start_pos += total_tokens
+        else:
+            start_pos = 0
+        if (
+            args.single_round is not True and args.max_seq_len > 512
+        ):  # Only memorize previous conversations when kv_cache_size > 512
+            model_prompter.update_template(outputs, args.chunk_prefilling)
+        count += 1

llm-awq/tinychat/internvl_benchmark.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import argparse
+from termcolor import colored
+import llava
+from llava import conversation as clib
+from llava.media import Image, Video
+import torch
+from awq.quantize import fake_quant
+from awq.quantize.quantizer import real_quantize_model_weight
+from transformers import AutoConfig
+import tinychat
+from torchao.quantization import quantize_, Int4WeightOnlyConfig
+import os
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+def skip(*args, **kwargs):
+    pass
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-path",
+        "-m",
+        type=str,
+        default="/home/yuming/workspace/qwen/models/nvila-internal-8b-v1",
+    )
+    parser.add_argument(
+        "--quant_path",
+        type=str,
+        default="/PATH/TO/QUANT",
+    )
+    # parser.add_argument("--model-path", "-m", type=str, default="Efficient-Large-Model/J65")
+    # parser.add_argument("--quant_path", type=str, default="/home/yuming/workspace/qwen/models/J65/llm/vila2-J65-w4-g128-awq-v2.pt")
+    parser.add_argument("--conv-mode", "-c", type=str, default="auto")
+    # parser.add_argument("--media", type=str, default="/home/yuming/workspace/space_woaudio.mp4")
+    parser.add_argument("--device", type=str, default="cuda:0")
+    parser.add_argument(
+        "--act_scale_path",
+        type=str,
+        default="/PATH/TO/SCALE",
+    )
+    # quantization options
+    parser.add_argument("--quant_llm", action="store_true")
+    parser.add_argument("--quant_VT", action="store_true")
+    # Four basic tasks
+    parser.add_argument("--video_caption", action="store_true")
+    parser.add_argument("--video_QA", action="store_true")
+    parser.add_argument("--image_caption", action="store_true")
+    parser.add_argument("--image_QA", action="store_true")
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Whether to quantize visiontower and llm, and test all 4 tasks",
+    )
+    parser.add_argument(
+        "--fakequant_VT",
+        action="store_true",
+        help="Use fake quant or real quant for VisionTower",
+    )
+    parser.add_argument(
+        "--all_task", action="store_true", help="Whether to test all 4 tasks"
+    )
+    parser.add_argument(
+        "--video_path", type=str, default="../figures/nvila_demo_video.mp4"
+    )
+    parser.add_argument("--image_path", type=str, default="../figures/vila-logo.jpg")
+    parser.add_argument("--max_seq_len", type=int, default=8192)
+    args = parser.parse_args()
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.kaiming_normal_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    import tinychat.utils.constants
+    tinychat.utils.constants.max_seq_len = args.max_seq_len
+    from transformers import modeling_utils
+    modeling_utils._init_weights = False
+    # Load model
+    from tinychat.models import InternVL3
+    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
+    config.resume_path = args.model_path
+    model = InternVL3(config).half()
+    model.language_model = model.language_model.eval()
+    if args.quant_llm or args.all:
+        from tinychat.modules import (
+            make_quant_norm,
+            make_quant_attn,
+            make_fused_mlp,
+            make_fused_vision_attn,
+        )
+        real_quantize_model_weight(
+            model.language_model,
+            w_bit=4,
+            q_config=dict(q_group_size=128, zero_point=True),
+            init_only=True,
+        )
+        make_quant_attn(model.language_model, "cuda", True)
+        make_quant_norm(model.language_model)
+        make_fused_mlp(model.language_model)
+        model = model.to("cuda")
+    model = model.to(args.device)
+    if args.quant_VT or args.all:
+        from tinychat.modules import QuantInternVisionEncoder
+        model.vision_model.encoder = QuantInternVisionEncoder(model.vision_model.encoder)
+        model.vision_model.encoder = torch.compile(model.vision_model.encoder)
+    model = model.cuda().eval()
+    if args.video_caption or args.all or args.all_task:
+        print("-" * 80)
+        print("Video_Caption")
+        # Set conversation mode
+        clib.default_conversation = clib.conv_templates[args.conv_mode].copy()
+        media = Video(args.video_path)
+        text = "Elaborate on the visual and narrative elements of the video in detail."  # + "1"+" 1"*3069
+        prompt = [media, text]
+        # Generate response
+        with torch.no_grad():
+            response = model.benchmark(prompt, args.quant_llm)
+    if args.video_QA or args.all or args.all_task:
+        print("-" * 80)
+        print("Video_QA")
+        # Set conversation mode
+        clib.default_conversation = clib.conv_templates[args.conv_mode].copy()
+        media = Video(args.video_path)
+        text = "What is the person in the video doing? Select the option that best describes their action: A. Folding paper B. Playing computer games C. Sleeping."  # + "1"+" 1"*3069
+        prompt = [media, text]
+        # Generate response
+        with torch.no_grad():
+            response = model.benchmark(prompt, args.quant_llm)
+    if args.image_caption or args.all or args.all_task:
+        print("-" * 80)
+        print("Image_Caption")
+        # Set conversation mode
+        clib.default_conversation = clib.conv_templates[args.conv_mode].copy()
+        media = Image(args.image_path)
+        text = "Describe the image in detail."
+        prompt = [media, text]
+        # Generate response
+        with torch.no_grad():
+            response = model.benchmark(prompt, args.quant_llm)
+    if args.image_QA or args.all or args.all_task:
+        print("-" * 80)
+        print("Image_QA")
+        # Set conversation mode
+        clib.default_conversation = clib.conv_templates[args.conv_mode].copy()
+        media = Image(args.image_path)
+        text = "What does the text in the image say? Choose the option that best matches: A. VILA B. AIIV C. ALIV."
+        prompt = [media, text]
+        # Generate response
+        with torch.no_grad():
+            response = model.benchmark(prompt, args.quant_llm)
+if __name__ == "__main__":
+    main()

llm-awq/tinychat/split_ckpt.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+import re
+import torch
+import argparse
+def split(
+    ckpt_path: str,
+    out_folder_path: str,
+):
+    os.system(f"mkdir -p {out_folder_path}")
+    ckpt = torch.load(ckpt_path)
+    count = 0
+    for key, value in ckpt.items():
+        output_dict = {key: value}
+        output_name = out_folder_path + "/" + key + ".pt"
+        torch.save(output_dict, output_name)
+        count += 1
+    print(f"Finished splitting the original checkpoint into {count} shards.")
+def ckpt_folder_reader(ckpt_folder_path: str):
+    file_list = [f for f in os.listdir(ckpt_folder_path) if f.endswith(".pt")]
+    for ckpt in file_list:
+        print(ckpt)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_path",
+        type=str,
+        default=None,
+        help="Path to the original checkpoint (ends with *.pt)",
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        default=None,
+        help="Folder to store the splitted checkpoint shards",
+    )
+    args = parser.parse_args()
+    assert (
+        args.input_path is not None
+    ), "Please specify the path to the original checkpoint."
+    if args.output_path is None:
+        suffix = r"\.pt$"
+        args.output_path = re.sub(suffix, "", args.input_path)
+    split(args.input_path, args.output_path)

llm-awq/tinychat/vila15_demo.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import argparse
+import torch
+from PIL import Image
+from tqdm import tqdm
+from transformers import AutoConfig, AutoTokenizer
+from accelerate import load_checkpoint_and_dispatch
+from tinychat.utils.tune import (
+    device_warmup,
+    tune_all_wqlinears,
+    tune_llava_patch_embedding,
+)
+from tinychat.utils.prompt_templates import (
+    get_prompter,
+    get_stop_token_ids,
+    get_image_token,
+)
+from tinychat.utils.llava_image_processing import (
+    process_images,
+    load_images,
+    vis_images,
+)
+import tinychat.utils.constants
+# from tinychat.models.llava_llama import LlavaLlamaForCausalLM
+from tinychat.models.vila_llama import VilaLlamaForCausalLM
+from tinychat.stream_generators.llava_stream_gen import LlavaStreamGenerator
+from tinychat.utils.conversation_utils import gen_params, stream_output, TimeStats
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+def image_parser(args):
+    out = args.image_file.split(args.im_sep)
+    return out
+def skip(*args, **kwargs):
+    pass
+def main(args):
+    """Run an interactive chat loop with a VILA model over the given image(s).
+    ``args`` is the namespace produced by the argument parser in the
+    ``__main__`` block of this script. The function builds the tokenizer and
+    ``VilaLlamaForCausalLM``, loads the LLM weights for the selected
+    ``args.precision`` ("W16A16" or "W4A16"), preprocesses the images and then
+    reads prompts from stdin until an empty line is entered.
+    Raises:
+        NotImplementedError: if ``args.precision`` is neither value above.
+    """
+    # Accelerate model initialization
+    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
+    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.kaiming_normal_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    # The tokenizer is read from the "llm" sub-folder of the model path.
+    tokenizer = AutoTokenizer.from_pretrained(
+        os.path.join(args.model_path, "llm"), use_fast=False
+    )
+    # Store the token id of the image-patch token in the shared constants module.
+    tinychat.utils.constants.LLAVA_DEFAULT_IMAGE_PATCH_TOKEN_IDX = (
+        tokenizer.convert_tokens_to_ids(
+            [tinychat.utils.constants.LLAVA_DEFAULT_IMAGE_PATCH_TOKEN]
+        )[0]
+    )
+    config = AutoConfig.from_pretrained(args.model_path, trust_remote_code=True)
+    model = VilaLlamaForCausalLM(config).half()
+    # NOTE(review): identical to the assignment made before the model was
+    # built; confirm whether this second lookup is actually required.
+    tinychat.utils.constants.LLAVA_DEFAULT_IMAGE_PATCH_TOKEN_IDX = (
+        tokenizer.convert_tokens_to_ids(
+            [tinychat.utils.constants.LLAVA_DEFAULT_IMAGE_PATCH_TOKEN]
+        )[0]
+    )
+    vision_tower = model.get_vision_tower()
+    # if not vision_tower.is_loaded:
+    #     vision_tower.load_model()
+    image_processor = vision_tower.image_processor
+    # vision_tower = vision_tower.half()
+    if args.precision == "W16A16":
+        # The loop runs exactly once; tqdm is only used to show a progress bar.
+        pbar = tqdm(range(1))
+        pbar.set_description("Loading checkpoint shards")
+        for i in pbar:
+            model.llm = load_checkpoint_and_dispatch(
+                model.llm,
+                os.path.join(args.model_path, "llm"),
+                no_split_module_classes=[
+                    "OPTDecoderLayer",
+                    "LlamaDecoderLayer",
+                    "BloomBlock",
+                    "MPTBlock",
+                    "DecoderLayer",
+                    "CLIPEncoderLayer",
+                ],
+            ).to(args.device)
+        model = model.to(args.device)
+    elif args.precision == "W4A16":
+        from tinychat.utils.load_quant import load_awq_model
+        # 4 and 128 are hard-coded here; presumably the weight bit-width and
+        # the quantization group size -- TODO confirm against load_awq_model.
+        model.llm = load_awq_model(model.llm, args.quant_path, 4, 128, args.device)
+        from tinychat.modules import (
+            make_quant_norm,
+            make_quant_attn,
+            make_fused_mlp,
+            make_fused_vision_attn,
+        )
+        if args.flash_attn:
+            print("Enabling flash-attention!")
+            make_quant_attn(model.llm, args.device, 1)
+        else:
+            print("Disabling flash-attention!")
+            make_quant_attn(model.llm, args.device)
+        make_quant_norm(model.llm)
+        # make_fused_mlp(model)
+        # make_fused_vision_attn(model,args.device)
+        model = model.to(args.device)
+    else:
+        raise NotImplementedError(f"Precision {args.precision} is not supported.")
+    # args.image_file may name several images separated by args.im_sep.
+    image_files = image_parser(args)
+    image_num = len(image_files)
+    images = load_images(image_files)
+    if args.vis_image:
+        print("=" * 50)
+        print("Input Image:")
+        vis_images(image_files)
+    # Similar operation in model_worker.py
+    image_tensor = process_images(images, image_processor, model.config)
+    # process_images may return a list of tensors or a single tensor; either
+    # way everything is moved to the target device as float16.
+    if type(image_tensor) is list:
+        image_tensor = [
+            image.to(args.device, dtype=torch.float16) for image in image_tensor
+        ]
+    else:
+        image_tensor = image_tensor.to(args.device, dtype=torch.float16)
+    device_warmup(args.device)
+    tune_llava_patch_embedding(vision_tower, device=args.device)
+    stream_generator = LlavaStreamGenerator
+    if args.max_seq_len <= 1024:
+        short_prompt = True
+    else:
+        short_prompt = False
+    model_prompter = get_prompter(
+        args.model_type, args.model_path, short_prompt, args.empty_prompt
+    )
+    stop_token_ids = get_stop_token_ids(args.model_type, args.model_path)
+    # Number of completed turns; the images are only inserted on turn 0.
+    count = 0
+    if args.empty_prompt:
+        input_indicator = "Input: "
+        output_indicator = "Generated: "
+    else:
+        input_indicator = "USER: "
+        output_indicator = "ASSISTANT: "
+    model.eval()
+    time_stats = TimeStats()
+    # Passed to the stream generator; only advanced when chunk_prefilling is on.
+    start_pos = 0
+    while True:
+        # Get input from the user
+        print("=" * 50)
+        input_prompt = input(input_indicator)
+        print("-" * 50)
+        # An empty line ends the session after printing the timing stats.
+        if input_prompt == "":
+            print("EXIT...")
+            time_stats.show()
+            break
+        if count == 0:  # Insert image here
+            image_token = get_image_token(model, args.model_path)
+            image_token_holder = (
+                tinychat.utils.constants.LLAVA_DEFAULT_IM_TOKEN_PLACE_HOLDER
+            )
+            im_token_count = input_prompt.count(image_token_holder)
+            if im_token_count == 0:
+                # No placeholder typed: prepend one image token per image.
+                model_prompter.insert_prompt(image_token * image_num + input_prompt)
+            else:
+                # Placeholders typed by the user must match the image count.
+                assert im_token_count == image_num
+                input_prompt = input_prompt.replace(image_token_holder, image_token)
+                model_prompter.insert_prompt(input_prompt)
+        else:
+            model_prompter.insert_prompt(input_prompt)
+            if args.chunk_prefilling:
+                image_tensor = None  # Can insert more images in future
+        output_stream = stream_generator(
+            model,
+            tokenizer,
+            model_prompter.model_input,
+            start_pos,
+            gen_params,
+            device=args.device,
+            stop_token_ids=stop_token_ids,
+            image_tensor=image_tensor,
+            chunk_prefilling=args.chunk_prefilling,
+        )
+        print(output_indicator, end="", flush=True)
+        # time_stats is only handed to stream_output on the first turn.
+        if count == 0:
+            outputs, total_tokens = stream_output(output_stream, time_stats)
+        else:
+            outputs, total_tokens = stream_output(output_stream)
+        if args.chunk_prefilling:
+            start_pos += total_tokens
+        if (
+            args.single_round is not True and args.max_seq_len > 512
+        ):  # Only memorize previous conversations when kv_cache_size > 512
+            model_prompter.update_template(outputs, args.chunk_prefilling)
+        count += 1
+if __name__ == "__main__":
+    # Command-line entry point: parse the options below and hand them to main().
+    # Option names mix "-" and "_" spellings; argparse turns "-" into "_" for
+    # the attribute name, so e.g. --model-path is read as args.model_path.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_type", type=str, default="LLaMa", help="type of the model"
+    )
+    parser.add_argument(
+        "--model-path", type=str, default="/data/llm/checkpoints/llava/llava-v1.5-7b"
+    )
+    # Only read by main() when --precision is "W4A16".
+    parser.add_argument(
+        "--quant-path",
+        type=str,
+        default="/data/llm/checkpoints/llava/llava-v1.5-7b-w4-g128-awq.pt",
+    )
+    # main() accepts "W16A16" or "W4A16" and raises NotImplementedError otherwise.
+    parser.add_argument(
+        "--precision", type=str, default="W4A16", help="compute precision"
+    )
+    # One or more image locations, joined with the --im-sep separator.
+    parser.add_argument(
+        "--image-file",
+        type=str,
+        default="https://llava.hliu.cc/file=/nobackup/haotian/code/LLaVA/llava/serve/examples/extreme_ironing.jpg",
+    )
+    parser.add_argument(
+        "--im-sep",
+        type=str,
+        default=",",
+    )
+    parser.add_argument("--device", type=str, default="cuda")
+    parser.add_argument("--max_seq_len", type=int, default=2048)
+    # NOTE(review): when this flag is set main() does NOT keep the previous
+    # turns, which reads opposite to the help text -- confirm the wording.
+    parser.add_argument(
+        "--single_round",
+        action="store_true",
+        help="whether to memorize previous conversations",
+    )
+    parser.add_argument(
+        "--vis-image",
+        action="store_true",
+        help="whether to visualize the image while chatting",
+    )
+    parser.add_argument(
+        "--empty-prompt",
+        action="store_true",
+        help="whether to use empty prompt template",
+    )
+    parser.add_argument(
+        "--flash_attn",
+        action="store_true",
+        help="whether to use flash attention",
+    )
+    parser.add_argument(
+        "--chunk_prefilling",
+        action="store_true",
+        help="If used, in context stage, the history tokens will not be recalculated, greatly speeding up the calculation",
+    )
+    args = parser.parse_args()
+    main(args)

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_sot.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: sot
+include: afrimgsm_yaml
+task: afrimgsm_sot_prompt_2

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_2/afrimgsm_yor.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: yor
+include: afrimgsm_yaml
+task: afrimgsm_yor_prompt_2

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_ibo.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: ibo
+include: afrimgsm_yaml
+task: afrimgsm_ibo_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_kin.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: kin
+include: afrimgsm_yaml
+task: afrimgsm_kin_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sna.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: sna
+include: afrimgsm_yaml
+task: afrimgsm_sna_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_sot.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: sot
+include: afrimgsm_yaml
+task: afrimgsm_sot_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_xho.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: xho
+include: afrimgsm_yaml
+task: afrimgsm_xho_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yaml ADDED Viewed

	@@ -0,0 +1,34 @@

+# Shared settings for the afrimgsm "direct" prompt_3 tasks. The per-language
+# files in this folder pull it in with `include: afrimgsm_yaml` and add their
+# own `dataset_name` and `task`.
+tag:
+    - afrimgsm_tasks
+    - afrimgsm_tasks_prompt_3
+dataset_path: masakhane/afrimgsm
+output_type: generate_until
+test_split: test
+# Target: `answer` with its first 21 characters dropped when `answer` is set,
+# otherwise `answer_number` rendered as a string.
+doc_to_target: '{% if answer is not none %}{{answer[21:]}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: "Solve the following math question \n\nQuestion: {{question}} \nAnswer: "
+target_delimiter: ""
+generation_kwargs:
+  do_sample: false
+  # Strings listed under `until`; presumably generation stops at the first
+  # one produced -- confirm against the harness documentation.
+  until:
+  - 'Question:'
+  - </s>
+  - <|im_end|>
+# Two filter pipelines are declared: `remove_whitespace` and `flexible-extract`.
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
+  - filter:
+    - function: regex
+      group_select: -1
+      # Matches either a run of two or more characters from "$", digits, "."
+      # and "," (optionally preceded by "-"), or a plain optionally-negative
+      # integer.
+      regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+    - function: take_first
+    name: flexible-extract
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 2.0

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_yor.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: yor
+include: afrimgsm_yaml
+task: afrimgsm_yor_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_3/afrimgsm_zul.yaml ADDED Viewed

	@@ -0,0 +1,4 @@

+# Generated by utils.py
+dataset_name: zul
+include: afrimgsm_yaml
+task: afrimgsm_zul_prompt_3

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_ibo.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: ibo
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_ibo_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lin.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: lin
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_lin_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_lug.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: lug
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_lug_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_orm.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: orm
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_orm_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sna.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: sna
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_sna_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_sot.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: sot
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_sot_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_swa.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: swa
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_swa_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_twi.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: twi
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_twi_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_vai.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: vai
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_vai_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_wol.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: wol
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_wol_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_xho.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: xho
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_xho_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_4/afrimgsm_yor.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: yor
+doc_to_text: "Answer the given question with the appropriate numerical value, ensuring\
+  \ that the response is clear and without any supplementary information. \n\nQuestion:\
+  \ {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_yor_prompt_4

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_amh.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: amh
+doc_to_text: "For mathematical questions provided in Amharic language. Supply the\
+  \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\
+  Answer: "
+include: afrimgsm_yaml
+task: afrimgsm_amh_prompt_5

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_eng.yaml ADDED Viewed

	@@ -0,0 +1,7 @@

+# Generated by utils.py
+dataset_name: eng
+doc_to_text: "For mathematical questions provided in English language. Supply the\
+  \ accurate numeric answer to the provided question. \n\nQuestion: {{question}} \n\
+  Answer: "
+include: afrimgsm_yaml
+task: afrimgsm_eng_prompt_5

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_ewe.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+# Generated by utils.py
+dataset_name: ewe
+doc_to_text: "For mathematical questions provided in Ewe language. Supply the accurate\
+  \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_ewe_prompt_5

lm-evaluation-harness/lm_eval/tasks/afrimgsm/direct/prompt_5/afrimgsm_fra.yaml ADDED Viewed

	@@ -0,0 +1,6 @@

+# Generated by utils.py
+dataset_name: fra
+doc_to_text: "For mathematical questions provided in French language. Supply the accurate\
+  \ numeric answer to the provided question. \n\nQuestion: {{question}} \nAnswer: "
+include: afrimgsm_yaml
+task: afrimgsm_fra_prompt_5