import os
import threading
import traceback

import flask
from flask import request, jsonify
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
|
|
| |
app = flask.Flask(__name__)

# Hugging Face Hub repository and file name of the GGUF model to serve.
REPO_ID = "dexcommunity/indexQ4"
GGUF_FILENAME = "indexq4.gguf"

# Inference settings. They are named once here so the startup log below
# always reports the values that were actually passed to Llama().
N_CTX = 2048
N_THREADS = 4
N_BATCH = 512
N_GPU_LAYERS = 0  # no layers offloaded to a GPU (health() reports "CPU")

print(f"🔄 Downloading GGUF model from {REPO_ID}...")

# Fetch the model file at import time; the server cannot start without it,
# so any failure is reported and then re-raised.
try:
    model_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=GGUF_FILENAME,
        repo_type="model",
    )
except Exception as e:
    print(f"❌ Download failed: {e}")
    print("💡 Make sure your GGUF file is uploaded to HuggingFace!")
    raise
else:
    print(f"✅ Model downloaded to: {model_path}")

print("🔄 Loading GGUF model with llama.cpp...")

# Single model instance shared by every request handler in this module.
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    n_batch=N_BATCH,
    verbose=False,
    n_gpu_layers=N_GPU_LAYERS,
)

print("✅ GGUF Model loaded successfully!")
print(f"📊 Model: {GGUF_FILENAME}")
print(f"🔧 Context: {N_CTX} tokens, Threads: {N_THREADS}")
|
|
# The server is started with threaded=True, so several requests can reach
# the one shared `llm` object at the same time. Generation is serialized
# here because a single llama.cpp model instance is not safe to call
# concurrently (NOTE(review): confirm against the llama-cpp-python version
# in use).
_llm_lock = threading.Lock()


@app.route('/chat', methods=['POST'])
def chat():
    """Generate a single-turn model reply for the posted message.

    Expects a JSON object body with a non-empty string under "message".

    Returns:
        200 with {"reply": <text>, "tokens_used": <completion tokens>} on
        success; 400 with {"error": ...} when the body is not a JSON
        object or the message is missing, empty or not a string; 500 with
        {"error": ...} when generation fails.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so bad client input is answered with 400 rather than
    # falling through to the generic 500 handler.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    msg = data.get("message", "")
    if not msg:
        return jsonify({"error": "No message sent"}), 400
    if not isinstance(msg, str):
        return jsonify({"error": "'message' must be a string"}), 400

    # NOTE(review): msg is untrusted and is embedded verbatim, so a client
    # can include the turn markers below in its message. Consider
    # stripping them if that matters for this deployment.
    prompt = (
        "<start_of_turn>user\n"
        f"{msg}<end_of_turn>\n"
        "<start_of_turn>model\n"
    )

    try:
        with _llm_lock:
            response = llm(
                prompt,
                max_tokens=256,
                temperature=0.7,
                top_p=0.9,
                top_k=40,
                repeat_penalty=1.1,
                # Stop as soon as the model closes its turn or opens a new one.
                stop=["<end_of_turn>", "<start_of_turn>"],
                echo=False,
            )

        reply = response['choices'][0]['text'].strip()

        return jsonify({
            "reply": reply,
            "tokens_used": response['usage']['completion_tokens'],
        })
    except Exception as e:
        # Top-level boundary for this endpoint: log the full traceback
        # server-side and report the failure to the client as a 500.
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return jsonify({"error": str(e)}), 500
|
|
@app.route('/health', methods=['GET'])
def health():
    """Report service status along with the served model and backend."""
    status_report = dict(
        status="healthy",
        model=GGUF_FILENAME,
        backend="llama.cpp (GGUF)",
        device="CPU",
    )
    return jsonify(status_report)
|
|
if __name__ == "__main__":
    # Start Flask's built-in server on all interfaces, port 7860, with
    # debug mode off and threaded request handling on.
    server_options = {
        "host": '0.0.0.0',
        "port": 7860,
        "debug": False,
        "threaded": True,
    }
    app.run(**server_options)