# Test / app.py
# Neon-tech's picture
# Update app.py
# 764e46b verified
import os

# HF_HOME must point at a writable location *before* anything imports
# huggingface_hub (gradio does, transitively): huggingface_hub resolves its
# cache directories at import time, so setting the env var after the imports
# — as this file originally did — has no effect on where downloads land.
os.environ["HF_HOME"] = "/tmp/hf_cache"

import gradio as gr
import psutil
from llama_cpp import Llama

# Download (cached under HF_HOME) and load the quantized GGUF model.
# NOTE(review): this blocks at import time for the full download/load.
model = Llama.from_pretrained(
    repo_id="unsloth/Qwen3.5-35B-A3B-GGUF",
    filename="Qwen3.5-35B-A3B-UD-IQ4_XS.gguf",
    n_ctx=2048,    # context window, in tokens
    n_threads=8,   # CPU threads for inference
)
def get_stats():
    """Return a one-line summary of process RAM, disk usage, and per-core CPU load.

    Polled every few seconds by the UI, so it must be cheap and non-blocking.
    """
    process = psutil.Process(os.getpid())
    ram = process.memory_info().rss / 1024 ** 3          # resident set size, GiB
    disk_tmp = psutil.disk_usage('/tmp').used / 1024 ** 3
    # /data only exists on Spaces with persistent storage; don't let the
    # stats box crash on ephemeral hardware.
    try:
        disk_data = psutil.disk_usage('/data').used / 1024 ** 3
    except OSError:
        disk_data = 0.0
    # interval=None returns immediately (usage since the previous call)
    # instead of blocking the server for a full second per refresh.
    cpu = psutil.cpu_percent(interval=None, percpu=True)
    return f"RAM: {ram:.2f} GB | /tmp: {disk_tmp:.2f} GB | /data: {disk_data:.2f} GB | CPU: {cpu}%"
def chat(message, history):
    """Stream the model's reply to *message*, yielding the growing text.

    Fix: the original ignored *history* entirely, so every turn started a
    brand-new single-message conversation. Prior turns are now replayed so
    the model sees the whole dialogue.
    """
    messages = [{"role": "system", "content": "Reply directly without any reasoning or thinking process."}]
    # gradio passes history either as openai-style message dicts or as
    # [user, assistant] pairs depending on version — handle both.
    for turn in history or []:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            user_msg, assistant_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    output = ""
    for chunk in model.create_chat_completion(
        messages=messages,
        max_tokens=2048,
        stream=True,
    ):
        # First streamed chunk carries only the role; "content" may be absent.
        delta = chunk["choices"][0]["delta"].get("content", "")
        output += delta
        yield output
# Assemble the UI: a periodically refreshed stats readout above the chat panel.
with gr.Blocks() as demo:
    stats = gr.Textbox(
        label="System Stats",
        value=get_stats,  # callable value + `every` => re-evaluated on a timer
        every=5,          # refresh period, seconds
    )
    gr.ChatInterface(chat)

# Bind to all interfaces on the port Hugging Face Spaces routes traffic to.
demo.launch(server_name="0.0.0.0", server_port=7860)