File size: 1,276 Bytes
0402e7b
ac3ae82
fff1115
57cb2d0
75503bc
57cb2d0
75503bc
57cb2d0
 
2ef27f1
57cb2d0
764e46b
57cb2d0
da7701c
fff1115
 
 
75503bc
113b771
94cc835
 
da7701c
 
57cb2d0
da7701c
 
fd8889f
57cb2d0
 
 
 
 
 
 
fd8889f
da7701c
fff1115
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import gradio as gr
import psutil
from llama_cpp import Llama

# Redirect the Hugging Face cache to /tmp (writable on Hugging Face Spaces).
# NOTE(review): this relies on huggingface_hub reading HF_HOME lazily when
# Llama.from_pretrained triggers the download — confirm downloads actually
# land under /tmp/hf_cache and not the default cache location.
os.environ["HF_HOME"] = "/tmp/hf_cache"

# Download (if not cached) and load the quantized GGUF model for CPU inference.
# This runs at import time and can take minutes on a cold start.
model = Llama.from_pretrained(
    repo_id="unsloth/Qwen3.5-35B-A3B-GGUF",
    filename="Qwen3.5-35B-A3B-UD-IQ4_XS.gguf",  # IQ4_XS quantization variant
    n_ctx=2048,     # context window in tokens — also bounds prompt + reply
    n_threads=8,    # CPU threads used for inference
)

def get_stats() -> str:
    """Return a one-line summary of process RAM, disk usage, and CPU load.

    Polled periodically by the Gradio UI (the ``every=5`` textbox), so it
    must never raise: a missing mount point (``/data`` only exists on
    Spaces with persistent storage) is reported as ``n/a`` instead of
    crashing the poller with ``FileNotFoundError``.

    Returns:
        A formatted string, e.g.
        ``"RAM: 1.23 GB | /tmp: 4.56 GB | /data: n/a | CPU: [12.0, 3.4]%"``.
    """
    gib = 1024 ** 3

    def _disk_used(path: str) -> str:
        # Best-effort: the mount may not exist on every host.
        try:
            return f"{psutil.disk_usage(path).used / gib:.2f} GB"
        except OSError:
            return "n/a"

    ram = psutil.Process(os.getpid()).memory_info().rss / gib
    # NOTE: interval=1 blocks this call for one second to sample per-core load.
    cpu = psutil.cpu_percent(interval=1, percpu=True)
    return (
        f"RAM: {ram:.2f} GB | /tmp: {_disk_used('/tmp')} | "
        f"/data: {_disk_used('/data')} | CPU: {cpu}%"
    )

def chat(message, history):
    """Stream a chat completion for *message*, keeping multi-turn context.

    Args:
        message: The latest user message.
        history: Gradio chat history — either a list of ``{"role", "content"}``
            dicts (``type="messages"``) or a list of ``[user, assistant]``
            pairs (legacy tuple format). Previously this argument was ignored,
            so the model saw each turn in isolation; both formats are now
            replayed into the prompt.

    Yields:
        The accumulated assistant reply after each streamed chunk.
    """
    messages = [{"role": "system", "content": "Reply directly without any reasoning or thinking process."}]

    # Replay the conversation so far so the model sees previous turns.
    for turn in history or []:
        if isinstance(turn, dict):
            # messages-style entry: keep only well-formed user/assistant turns
            if turn.get("role") in ("user", "assistant") and turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # legacy [user_msg, assistant_msg] pair; either slot may be empty
            user_msg, assistant_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    output = ""
    for chunk in model.create_chat_completion(
        messages=messages,
        max_tokens=2048,
        stream=True
    ):
        # First streamed delta carries only the role; content may be absent
        # or explicitly None — never concatenate None into the output.
        delta = chunk["choices"][0]["delta"].get("content", "")
        output += delta or ""
        yield output

# Assemble the UI: a live stats readout above the chat widget.
# The top-level name `demo` is what Hugging Face Spaces auto-discovers.
with gr.Blocks() as demo:
    # Passing the callable (not its result) plus every=5 makes Gradio
    # re-invoke get_stats every 5 seconds to refresh the textbox.
    stats = gr.Textbox(label="System Stats", value=get_stats, every=5)
    gr.ChatInterface(chat)

# Bind to all interfaces on 7860 — the port Hugging Face Spaces expects.
demo.launch(server_name="0.0.0.0", server_port=7860)