Arif committed
Commit 697bc47 · 1 Parent(s): 5ca944e
Updating docker model runner
backend/app/services/llm_service.py
CHANGED
@@ -135,7 +135,7 @@ class LLMServiceMLX(BaseLLMService):
 
 
 class LLMServiceDockerModelRunner(BaseLLMService):
-    """Docker Model Runner implementation"""
+    """Docker Model Runner implementation - OpenAI-compatible API"""
 
     def __init__(self, model_name: str, max_tokens: int, temperature: float, docker_url: str, timeout: int = 300):
         super().__init__(model_name, max_tokens, temperature)
@@ -152,7 +152,7 @@ class LLMServiceDockerModelRunner(BaseLLMService):
         self.logger.info(f"Connecting to Docker Model Runner: {self.docker_url}")
         self.client = httpx.AsyncClient(timeout=self.timeout)
 
-        #
+        # OpenAI-compatible endpoint: GET /v1/models
         response = await self.client.get(f"{self.docker_url}/models")
 
         if response.status_code == 200:
@@ -173,13 +173,13 @@ class LLMServiceDockerModelRunner(BaseLLMService):
 
         try:
             payload = {
-                "model": self.model_name,
+                "model": self.model_name,  # "ai/llama3.2:1B-Q4_0"
                 "messages": [{"role": "user", "content": prompt}],
                 "temperature": self.temperature,
                 "max_tokens": self.max_tokens,
             }
 
-            #
+            # OpenAI-compatible endpoint: POST /v1/chat/completions
             response = await self.client.post(
                 f"{self.docker_url}/chat/completions",
                 json=payload
@@ -187,7 +187,7 @@ class LLMServiceDockerModelRunner(BaseLLMService):
 
             if response.status_code == 200:
                 result = response.json()
-                return result["choices"]["message"]["content"]
+                return result["choices"][0]["message"]["content"]
             else:
                 self.logger.error(f"Docker Model Runner error: {response.status_code} - {response.text}")
                 raise RuntimeError(f"Model Runner error: {response.status_code}")
@@ -204,6 +204,7 @@ class LLMServiceDockerModelRunner(BaseLLMService):
 
 
 
+
 class LLMServiceMock(BaseLLMService):
     """Mock implementation as fallback"""
 
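The indexing fix above follows the OpenAI-style chat-completions response schema, in which "choices" is a list, so the first choice must be selected before reading its message. A minimal sketch of the parsing (field names are the standard OpenAI schema; the values are illustrative, not taken from this Space):

# Shape of an OpenAI-compatible /chat/completions response (values illustrative).
result = {
    "object": "chat.completion",
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "Hello!"},
            "finish_reason": "stop",
        }
    ],
}

# Old code: result["choices"]["message"] raises
# "TypeError: list indices must be integers or slices, not str".
content = result["choices"][0]["message"]["content"]  # "Hello!"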
docker-compose.yml
CHANGED
@@ -14,6 +14,8 @@ services:
     networks:
       - llm-network
     hostname: backend
+    extra_hosts:
+      - "model-runner.docker.internal:host-gateway"
 
   frontend:
     build:
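The extra_hosts entry lets the backend container resolve model-runner.docker.internal to the Docker host, where Docker Model Runner exposes its OpenAI-compatible API. A rough connectivity sketch, assuming the docker_url passed to LLMServiceDockerModelRunner points at that alias (the port and /engines/v1 base path below are assumptions; the commit only adds the host mapping):

# Hypothetical smoke test run from inside the backend container.
# ASSUMED_DOCKER_URL is illustrative only; adjust it to whatever docker_url
# the service is actually configured with.
import asyncio
import httpx

ASSUMED_DOCKER_URL = "http://model-runner.docker.internal:12434/engines/v1"

async def check_model_runner(base_url: str = ASSUMED_DOCKER_URL) -> None:
    async with httpx.AsyncClient(timeout=30) as client:
        # Same probe the service performs on connect: GET {base_url}/models
        response = await client.get(f"{base_url}/models")
        response.raise_for_status()
        print(response.json())

if __name__ == "__main__":
    asyncio.run(check_model_runner())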