| import os |
| import gradio as gr |
| import psutil |
| from llama_cpp import Llama |
|
|
# Redirect the Hugging Face cache into /tmp, which is writable in a
# containerized Space.  NOTE(review): this runs after `from llama_cpp import
# Llama`; it presumably still takes effect because llama_cpp resolves
# huggingface_hub lazily inside from_pretrained — confirm, or move this
# assignment above the imports to be safe.
os.environ["HF_HOME"] = "/tmp/hf_cache"
|
|
# Download (if not already cached under HF_HOME) and load the quantized GGUF
# checkpoint from the Hugging Face Hub for CPU inference.
# NOTE(review): an ~35B-parameter model even at IQ4_XS quantization needs on
# the order of 20 GB of RAM — confirm the host is provisioned for it.
model = Llama.from_pretrained(
    repo_id="unsloth/Qwen3.5-35B-A3B-GGUF",
    filename="Qwen3.5-35B-A3B-UD-IQ4_XS.gguf",
    n_ctx=2048,    # context window (prompt + generated tokens)
    n_threads=8,   # CPU threads used for inference
)
|
|
def get_stats():
    """Return a one-line summary of process RAM, disk usage, and per-core CPU load.

    Designed to be polled by the UI: the CPU sample is non-blocking
    (``interval=None`` diffs against the previous call instead of sleeping
    for a second), and missing mount points are reported as 0 rather than
    crashing the poll.  The first call may report 0.0 CPU until psutil has
    a prior sample to compare against.
    """
    process = psutil.Process(os.getpid())
    # Resident set size of this process, in GiB.
    ram = process.memory_info().rss / 1024 ** 3

    def _disk_used_gb(path):
        # Best-effort: /data only exists on Spaces with persistent storage
        # attached; report 0 instead of raising when the mount is absent.
        try:
            return psutil.disk_usage(path).used / 1024 ** 3
        except OSError:
            return 0.0

    disk_tmp = _disk_used_gb('/tmp')
    disk_data = _disk_used_gb('/data')
    # interval=None returns immediately; the original interval=1 blocked the
    # caller for a full second on every refresh.
    cpu = psutil.cpu_percent(interval=None, percpu=True)
    return f"RAM: {ram:.2f} GB | /tmp: {disk_tmp:.2f} GB | /data: {disk_data:.2f} GB | CPU: {cpu}%"
|
|
def chat(message, history):
    """Stream a chat completion for *message*, yielding the growing reply text.

    Previous turns from *history* are replayed into the prompt so the model
    keeps multi-turn context (the original version silently dropped them).
    Gradio's ChatInterface passes history either as ``[(user, assistant), ...]``
    pairs or as openai-style ``[{"role": ..., "content": ...}]`` dicts
    depending on its ``type`` setting — both shapes are accepted here.
    """
    messages = [{"role": "system", "content": "Reply directly without any reasoning or thinking process."}]

    for turn in history or []:
        if isinstance(turn, dict):
            # messages-style history entry: already {"role": ..., "content": ...}
            if turn.get("content"):
                messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # pair-style history entry: (user_message, assistant_reply)
            user_msg, bot_msg = turn
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if bot_msg:
                messages.append({"role": "assistant", "content": bot_msg})

    messages.append({"role": "user", "content": message})

    output = ""
    for chunk in model.create_chat_completion(
        messages=messages,
        max_tokens=2048,
        stream=True,
    ):
        # The first streamed chunk carries only {"role": ...}; later chunks
        # carry the incremental "content" text (which may also be None).
        delta = chunk["choices"][0]["delta"].get("content", "")
        output += delta or ""
        yield output
|
|
# Assemble the UI: a live system-stats readout above a streaming chat panel.
with gr.Blocks() as demo:
    # Passing the callable `get_stats` as `value` together with `every=5`
    # makes Gradio re-invoke it every 5 seconds to refresh the textbox.
    # NOTE(review): confirm get_stats returns quickly enough not to stall
    # the refresh loop.
    stats = gr.Textbox(label="System Stats", value=get_stats, every=5)
    # ChatInterface drives chat(message, history) and renders its streamed
    # (yielded) partial outputs.
    gr.ChatInterface(chat)


# Bind on all interfaces at port 7860 — the conventional HF Spaces port.
demo.launch(server_name="0.0.0.0", server_port=7860)