import os
import traceback

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
class EndpointHandler:
    """Inference Endpoints handler serving Phi-4-mini with a LoRA adapter.

    On startup it loads the base model ``microsoft/Phi-4-mini-instruct``,
    attaches the LoRA adapter found at ``path`` via PEFT, and builds a
    ``text-generation`` pipeline. Requests are served through ``__call__``.
    """

    def __init__(self, path=""):
        """Load base model, tokenizer and LoRA adapter; build the pipeline.

        Args:
            path: Directory with the LoRA adapter weights (supplied by the
                Inference Endpoints runtime).

        Raises:
            Exception: any loading failure is logged with its traceback and
                re-raised so the endpoint reports an unhealthy startup.
        """
        base_model_id = "microsoft/Phi-4-mini-instruct"
        adapter_path = path

        try:
            print(f"Iniciando Handler: Carregando modelo base {base_model_id}")

            # Prefer bfloat16 when a CUDA device supports it. The
            # is_available() guard keeps CPU-only hosts from tripping on
            # the CUDA capability query.
            use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            dtype = torch.bfloat16 if use_bf16 else torch.float16
            self.base_model = AutoModelForCausalLM.from_pretrained(
                base_model_id,
                torch_dtype=dtype,
                trust_remote_code=True,
            )

            print(f"Carregando tokenizer de {base_model_id}")
            self.tokenizer = AutoTokenizer.from_pretrained(
                base_model_id,
                trust_remote_code=True,
            )
            # Padding must not share the EOS id or generation can stop
            # early on padded batches. Fall back to EOS when the vocab has
            # no UNK token — assigning None would break padding later.
            if (
                self.tokenizer.pad_token is None
                or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
            ):
                self.tokenizer.pad_token = (
                    self.tokenizer.unk_token or self.tokenizer.eos_token
                )
                print("Definido tokenizer.pad_token = tokenizer.unk_token")

            print(f"Carregando adaptador LoRA de {adapter_path}")
            self.model = PeftModel.from_pretrained(self.base_model, adapter_path)
            self.model.eval()
            print("Adaptador LoRA carregado.")

            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
            )
            print("Pipeline de text-generation criado. Handler pronto.")

        except Exception as e:
            print(f"ERRO FATAL durante __init__ do Handler: {e}")
            print(traceback.format_exc())
            # Bare raise preserves the original traceback for the platform logs.
            raise

    def __call__(self, data):
        """Run one generation request.

        Args:
            data: Request payload. Normally
                ``{"inputs": ..., "parameters": {...}}``; ``inputs`` may be a
                plain prompt string or a chat-style list of
                ``{"role": ..., "content": ...}`` messages. A payload without
                an ``inputs`` key is used as the prompt itself.

        Returns:
            The pipeline's output list, or a one-element list with
            ``error``/``traceback`` keys if generation failed.
        """
        try:
            # Read the payload without mutating the caller's dict. The
            # previous dict.pop() calls emptied the request object and,
            # when "inputs" was absent, popping "parameters" silently
            # mutated the very object being used as the prompt.
            if isinstance(data, dict):
                if "inputs" in data:
                    inputs = data["inputs"]
                else:
                    # No explicit "inputs": treat the rest of the payload
                    # (minus generation parameters) as the prompt.
                    inputs = {k: v for k, v in data.items() if k != "parameters"}
                parameters = data.get("parameters") or {}
            else:
                inputs = data
                parameters = {}

            print(f"Handler __call__ recebeu inputs: {inputs}")
            print(f"Handler __call__ recebeu parâmetros: {parameters}")

            prompt_text = inputs
            # Chat-style input: render the message list through the model's
            # chat template before handing it to the pipeline.
            if (
                isinstance(inputs, list)
                and inputs
                and isinstance(inputs[0], dict)
                and "role" in inputs[0]
            ):
                print("Aplicando chat template...")
                prompt_text = self.tokenizer.apply_chat_template(
                    inputs, tokenize=False, add_generation_prompt=True
                )

            print(f"Texto do prompt para o pipeline: {prompt_text}")

            outputs = self.pipeline(prompt_text, **parameters)

            print(f"Handler __call__ gerou outputs: {outputs}")
            return outputs

        except Exception as e:
            print(f"ERRO durante __call__ do Handler: {e}")
            print(traceback.format_exc())
            # Return the failure in the response body so the client sees
            # the cause instead of an opaque 500.
            return [{"error": str(e), "traceback": traceback.format_exc()}]