File size: 1,276 Bytes
0402e7b
ac3ae82
fff1115
57cb2d0
75503bc
57cb2d0
75503bc
57cb2d0
 
2ef27f1
57cb2d0
764e46b
57cb2d0
da7701c
fff1115
 
 
75503bc
113b771
94cc835
 
da7701c
 
57cb2d0
da7701c
 
fd8889f
57cb2d0
 
 
 
 
 
 
fd8889f
da7701c
fff1115
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os
import gradio as gr
import psutil
from llama_cpp import Llama

# Redirect the Hugging Face cache to /tmp (writable on Hugging Face Spaces).
# NOTE(review): this relies on huggingface_hub reading HF_HOME lazily when
# Llama.from_pretrained triggers the download — confirm downloads actually
# land under /tmp/hf_cache and not the default cache location.
os.environ["HF_HOME"] = "/tmp/hf_cache"

# Download (if not cached) and load the quantized GGUF model for CPU inference.
# This runs at import time and can take minutes on a cold start.
model = Llama.from_pretrained(
    repo_id="unsloth/Qwen3.5-35B-A3B-GGUF",
    filename="Qwen3.5-35B-A3B-UD-IQ4_XS.gguf",  # IQ4_XS quantization variant
    n_ctx=2048,     # context window in tokens — also bounds prompt + reply
    n_threads=8,    # CPU threads used for inference
)

def get_stats() -> str:
    """Return a one-line summary of process RAM, disk usage, and CPU load.

    Polled periodically by the Gradio UI (the ``every=5`` textbox), so it
    must never raise: a missing mount point (``/data`` only exists on
    Spaces with persistent storage) is reported as ``n/a`` instead of
    crashing the poller with ``FileNotFoundError``.

    Returns:
        A formatted string, e.g.
        ``"RAM: 1.23 GB | /tmp: 4.56 GB | /data: n/a | CPU: [12.0, 3.4]%"``.
    """
    gib = 1024 ** 3

    def _disk_used(path: str) -> str:
        # Best-effort: the mount may not exist on every host.
        try:
            return f"{psutil.disk_usage(path).used / gib:.2f} GB"
        except OSError:
            return "n/a"

    ram = psutil.Process(os.getpid()).memory_info().rss / gib
    # NOTE: interval=1 blocks this call for one second to sample per-core load.
    cpu = psutil.cpu_percent(interval=1, percpu=True)
    return (
        f"RAM: {ram:.2f} GB | /tmp: {_disk_used('/tmp')} | "
        f"/data: {_disk_used('/data')} | CPU: {cpu}%"
    )

def chat(message, history):
    """Stream a chat completion for *message*, keeping multi-turn context.

    Args:
        message: The latest user message.
        history: Gradio chat history — either a list of ``{"role", "content"}``
            dicts (``type="messages"``) or a list of ``[user, assistant]``
            pairs (legacy tuple format). Previously this argument was ignored,
            so the model saw each turn in isolation; both formats are now
            replayed into the prompt.

    Yields:
        The accumulated assistant reply after each streamed chunk.
    """
    messages = [{"role": "system", "content": "Reply directly without any reasoning or thinking process."}]

    # Replay the conversation so far so the model sees previous turns.
    for turn in history or []:
        if isinstance(turn, dict):
            # messages-style entry: keep only well-formed user/assistant turns
            if turn.get("role") in ("user", "assistant") and turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # legacy [user_msg, assistant_msg] pair; either slot may be empty
            user_msg, assistant_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    output = ""
    for chunk in model.create_chat_completion(
        messages=messages,
        max_tokens=2048,
        stream=True
    ):
        # First streamed delta carries only the role; content may be absent
        # or explicitly None — never concatenate None into the output.
        delta = chunk["choices"][0]["delta"].get("content", "")
        output += delta or ""
        yield output

# Assemble the UI: a live stats readout above the chat widget.
# The top-level name `demo` is what Hugging Face Spaces auto-discovers.
with gr.Blocks() as demo:
    # Passing the callable (not its result) plus every=5 makes Gradio
    # re-invoke get_stats every 5 seconds to refresh the textbox.
    stats = gr.Textbox(label="System Stats", value=get_stats, every=5)
    gr.ChatInterface(chat)

# Bind to all interfaces on 7860 — the port Hugging Face Spaces expects.
demo.launch(server_name="0.0.0.0", server_port=7860)