import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the model and tokenizer.
        :param path: Path to the model repository (unused here; the base model
            and adapter are loaded from the Hugging Face Hub instead).
        """
        self.base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.adapter_model_name = "Danna8/MistralF"

        # The adapter repository ships its own tokenizer files, so load the
        # tokenizer from there rather than from the base model.
        self.tokenizer = AutoTokenizer.from_pretrained(self.adapter_model_name)

        # Load the base model in half precision and let accelerate place it
        # on the available device(s).
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )

        # Attach the fine-tuned PEFT adapter. transformers registers it under
        # the name "default", and set_adapter() activates it.
        self.model.load_adapter(self.adapter_model_name)
        self.model.set_adapter("default")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.
        :param data: Input data containing the text to process.
        :return: List of generated outputs.
        """
        inputs = data.get("inputs", "")
        if not inputs:
            return [{"error": "No input provided"}]

        # Move the tokenized prompt to the model's device instead of
        # hard-coding "cuda", so the handler also runs on CPU-only hosts.
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)

        # Sample up to 50 new tokens with nucleus sampling. Mistral defines no
        # pad token, so reuse the EOS token to avoid the generate() warning.
        outputs = self.model.generate(
            **tokenized_inputs,
            max_new_tokens=50,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Decode the full sequence (prompt plus completion) back into text.
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return [{"generated_text": generated_text}]