ConicAI_LLM_model / handler.py
girish00's picture
update endpoint helper files
f2d1187 verified
Raw
History Blame Contribute Delete
4.79 kB
import time
from typing import Any, Dict
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from infer_local import (
build_instruction_prompt,
build_structured_result,
has_adapter_weights,
has_full_model_weights,
)
# Base checkpoint used when adapter weights are present but the adapter's
# PEFT config does not name a base model (see EndpointHandler.__init__).
DEFAULT_BASE_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
def as_bool(value: Any) -> bool:
    """Interpret a loosely typed flag as a boolean.

    Strings are true only when, after trimming whitespace and lower-casing,
    they are one of ``1``, ``true``, ``yes``, ``y`` or ``on``.  Every other
    value (including real booleans) goes through normal truthiness.
    """
    if not isinstance(value, str):
        # bool() is the identity on actual booleans, so they pass straight
        # through; other objects use their ordinary truth value.
        return bool(value)

    normalized = value.strip().lower()
    return normalized in ("1", "true", "yes", "y", "on")
def clamp_int(value: Any, default: int, minimum: int, maximum: int) -> int:
    """Coerce ``value`` to an int and clamp it into ``[minimum, maximum]``.

    Args:
        value: Anything ``int()`` might accept (number, numeric string, ...).
        default: Used when ``value`` cannot be converted to an integer.
        minimum: Lower bound of the returned value (inclusive).
        maximum: Upper bound of the returned value (inclusive).

    Returns:
        The parsed value, or ``default`` when parsing fails, limited to the
        given bounds.  The default is clamped as well.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinities: int(float("inf")) raises it, and
        # JSON payloads such as 1e999 or Infinity decode to float("inf").
        parsed = default
    return max(minimum, min(maximum, parsed))
class EndpointHandler:
    """Callable handler that loads a causal LM once and answers prompt requests.

    ``__init__`` loads the tokenizer and model from ``path``; calling the
    instance with a request dict runs generation and returns the structured
    result built by ``build_structured_result``.
    NOTE(review): the ``EndpointHandler(path)`` / ``handler(data)`` shape looks
    like a Hugging Face Inference Endpoints custom handler - confirm.
    """

    def __init__(self, path: str = ""):
        """Load tokenizer and model weights found under ``path``.

        Adapter weights take precedence: when present, the base model named in
        the PEFT config (or ``DEFAULT_BASE_MODEL`` if it names none) is loaded
        and wrapped with the adapter.  Otherwise full-model weights are loaded
        directly from ``path``.

        Args:
            path: Directory containing the weights; empty means ``"."``.

        Raises:
            RuntimeError: If neither adapter nor full-model weights are found.
        """
        self.path = path or "."
        adapter_weights_present = has_adapter_weights(self.path)
        full_model_weights_present = has_full_model_weights(self.path)

        cuda_available = torch.cuda.is_available()
        # Half precision only on GPU; fall back to float32 on CPU.
        dtype = torch.float16 if cuda_available else torch.float32

        if adapter_weights_present:
            peft_config = PeftConfig.from_pretrained(self.path)
            base_model_name = peft_config.base_model_name_or_path or DEFAULT_BASE_MODEL
            self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=dtype,
            )
            self.model = PeftModel.from_pretrained(base_model, self.path)
        elif full_model_weights_present:
            self.tokenizer = AutoTokenizer.from_pretrained(self.path)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.path,
                torch_dtype=dtype,
            )
        else:
            raise RuntimeError(
                f"No adapter or full-model weights found at endpoint model path: {self.path}"
            )

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.device = "cuda" if cuda_available else "cpu"
        self.model.to(self.device)
        self.model.eval()

        # Reset the loaded generation config to fixed values; per-request
        # sampling settings are passed explicitly in __call__.
        self.model.generation_config.do_sample = False
        self.model.generation_config.temperature = 1.0
        self.model.generation_config.top_p = 1.0
        self.model.generation_config.top_k = 50

    @staticmethod
    def _extract_prompt(data: Dict[str, Any]) -> str:
        """Return the stripped prompt from ``inputs`` (or ``prompt``), or ``""``.

        A list value contributes its first element.  ``None`` - as the value
        itself or as the first list element - counts as missing instead of
        being stringified to the literal text ``"None"``.
        """
        raw = data.get("inputs")
        if raw is None:
            raw = data.get("prompt")
        if isinstance(raw, list):
            raw = raw[0] if raw else None
        return "" if raw is None else str(raw).strip()

    @staticmethod
    def _sampling_float(value: Any, default: float, maximum: float = float("inf")) -> float:
        """Parse a sampling parameter, falling back to ``default`` when unusable.

        Unparseable, non-positive, NaN and infinite values yield ``default``;
        values above ``maximum`` are capped at ``maximum``.
        """
        try:
            parsed = float(value)
        except (TypeError, ValueError, OverflowError):
            return default
        # NaN fails every comparison, so "not > 0" rejects it together with
        # zero and negative values.
        if not parsed > 0.0 or parsed == float("inf"):
            return default
        return min(parsed, maximum)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Generate a completion for one request.

        Args:
            data: Request payload.  The prompt is read from ``inputs`` (or
                ``prompt``); optional ``parameters`` may carry
                ``max_new_tokens`` (clamped to 1..1024, default 320),
                ``do_sample``, ``temperature`` (default 0.25) and ``top_p``
                (default 0.9).  The last two are only used when sampling.

        Returns:
            ``{"error": ...}`` when no prompt was supplied; otherwise the
            value returned by ``build_structured_result``.
        """
        user_prompt = self._extract_prompt(data)
        if not user_prompt:
            return {
                "error": "Missing prompt. Send {'inputs': 'your coding prompt'}."
            }

        parameters = data.get("parameters")
        if not isinstance(parameters, dict):
            # Missing, null or malformed parameters all mean "use defaults".
            parameters = {}

        max_new_tokens = clamp_int(parameters.get("max_new_tokens"), 320, 1, 1024)
        do_sample = as_bool(parameters.get("do_sample", False))

        prompt_text = build_instruction_prompt(user_prompt)
        inputs = self.tokenizer(prompt_text, return_tensors="pt").to(self.device)

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            "output_scores": True,
            "return_dict_in_generate": True,
            "do_sample": do_sample,
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            generation_kwargs["temperature"] = self._sampling_float(
                parameters.get("temperature"), 0.25
            )
            generation_kwargs["top_p"] = self._sampling_float(
                parameters.get("top_p"), 0.9, maximum=1.0
            )

        started_at = time.perf_counter()
        with torch.no_grad():
            generated = self.model.generate(**inputs, **generation_kwargs)
        latency_ms = int((time.perf_counter() - started_at) * 1000)

        # The returned sequence starts with the prompt tokens; keep only the
        # newly generated tail.
        prompt_len = inputs["input_ids"].shape[1]
        generated_ids = generated.sequences[0][prompt_len:].tolist()
        generated_text = self.tokenizer.decode(
            generated_ids,
            skip_special_tokens=True,
        ).strip()

        # One score tensor per generated step; the probability assigned to the
        # token that was actually emitted is used as its confidence.
        # NOTE(review): per the transformers docs these are processed scores
        # (after logits processors/warpers), not raw logits - confirm that is
        # the intended confidence signal when sampling.
        token_confidences = [
            float(torch.softmax(step_scores[0], dim=-1)[token_id].item())
            for token_id, step_scores in zip(generated_ids, generated.scores or ())
        ]

        return build_structured_result(
            user_prompt,
            generated_text,
            latency_ms,
            tokenizer=self.tokenizer,
            generated_ids=generated_ids,
            token_confidences=token_confidences,
        )