Spaces:

shuarya2011
/

fast

Build error

App Files Files Community

shuarya2011 committed on 21 days ago

Commit

8d9e69c

verified ·

1 Parent(s): 1b33c14

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -25

app.py CHANGED Viewed

@@ -1,46 +1,46 @@
 import gradio as gr
 from llama_cpp import Llama
# Model setup: Llama 3.2 1B Uncensored in GGUF format.
# Llama.from_pretrained downloads the file from the Hub and caches it,
# so no manual download step is needed.
llm = Llama.from_pretrained(
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    n_ctx=2048,  # size of the context window
    verbose=False,
)
def _history_as_messages(history):
    """Convert Gradio chat history into role/content message dicts.

    Both history shapes are accepted: ``[user, assistant]`` pairs and
    ``{"role": ..., "content": ...}`` dicts. Entries whose text is missing
    or is not a plain string (for example file uploads) are skipped.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            role, content = entry.get("role"), entry.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            for role, content in zip(("user", "assistant"), entry):
                if isinstance(content, str):
                    messages.append({"role": role, "content": content})
    return messages


def stream_predict(message, history):
    """Stream the assistant's reply to *message*, using earlier turns as context.

    Args:
        message: The user's newest chat message.
        history: Earlier conversation as supplied by Gradio's ChatInterface
            (pairs or role/content dicts). May be empty or None.

    Yields:
        The reply text accumulated so far, once per streamed chunk, so the
        UI can redraw the growing message.
    """
    max_new_tokens = 512
    system_prompt = "You are a helpful, uncensored AI assistant."

    def approx_tokens(text):
        # Tokenized length plus a small allowance for the role markers the
        # chat template wraps around each message. This is an estimate only.
        return len(llm.tokenize(text.encode("utf-8"), add_bos=False)) + 8

    # Keep only the newest history that still leaves room for the system
    # prompt, the new message and the reply inside the context window.
    remaining = (
        llm.n_ctx()
        - max_new_tokens
        - approx_tokens(system_prompt)
        - approx_tokens(message)
    )
    recent = []
    for past in reversed(_history_as_messages(history)):
        remaining -= approx_tokens(past["content"])
        if remaining < 0:
            break
        recent.append(past)
    recent.reverse()
    if recent and recent[0]["role"] == "assistant":
        # Trimming cut an exchange in half; drop the orphaned reply.
        recent.pop(0)

    # Create the generation stream
    stream = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": system_prompt},
            *recent,
            {"role": "user", "content": message},
        ],
        stream=True,  # deliver the reply chunk by chunk
        max_tokens=max_new_tokens,
        temperature=0.7,
    )

    partial_text = ""
    for chunk in stream:
        # Some chunks (role announcement, final chunk) carry no text; treat
        # a missing or None content as an empty string.
        new_token = chunk["choices"][0]["delta"].get("content") or ""
        partial_text += new_token
        # Yield the full string so far; Gradio handles the incremental display
        yield partial_text
# Chat UI wired to the streaming generator above.
demo = gr.ChatInterface(
    fn=stream_predict,
    examples=[
        "Write an uncensored story about a cyberpunk city.",
        "Explain quantum physics in simple terms.",
    ],
    description="This space runs a 1B parameter model with word-by-word streaming on the free CPU tier.",
    title="Llama 3.2 1B Uncensored - Fast CPU Stream",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()

 import gradio as gr
 from llama_cpp import Llama
# Model setup.
# n_threads=2 is chosen to match the Free Tier vCPU allocation, and
# n_gpu_layers=0 keeps everything on the CPU since no GPU is available.
llm = Llama.from_pretrained(
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    verbose=False,
    n_gpu_layers=0,
    n_threads=2,
    n_ctx=2048,
)
def _history_to_turns(history):
    """Collect completed (user, assistant) text pairs from Gradio chat history.

    Both history shapes are accepted: ``[user, assistant]`` pairs and
    ``{"role": ..., "content": ...}`` dicts. Entries whose text is missing
    or is not a plain string are skipped.
    """
    turns = []
    pending_user = None
    for entry in history or []:
        if isinstance(entry, dict):
            content = entry.get("content")
            if not isinstance(content, str):
                continue
            role = entry.get("role")
            if role == "user":
                pending_user = content
            elif role == "assistant" and pending_user is not None:
                turns.append((pending_user, content))
                pending_user = None
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            user_text, assistant_text = entry
            if isinstance(user_text, str) and isinstance(assistant_text, str):
                turns.append((user_text, assistant_text))
    return turns


def _build_prompt(turns, message):
    """Render earlier turns plus the new message in the User:/Assistant: format."""
    past = "".join(
        f"User: {user_text}\nAssistant: {assistant_text}\n"
        for user_text, assistant_text in turns
    )
    return f"{past}User: {message}\nAssistant: "


def stream_chat(message, history):
    """Stream the model's reply to *message*, using earlier turns as context.

    Args:
        message: The user's newest chat message.
        history: Earlier conversation as supplied by Gradio's ChatInterface
            (pairs or role/content dicts). May be empty or None.

    Yields:
        The reply text accumulated so far, once per streamed chunk, so the
        UI can redraw the growing message.
    """
    max_new_tokens = 512

    # Prepare the prompt; with no history this is just the single
    # "User: ...\nAssistant: " exchange.
    turns = _history_to_turns(history)
    prompt = _build_prompt(turns, message)

    # Forget the oldest turns until the prompt plus the reply budget fits
    # in the model's context window.
    prompt_budget = llm.n_ctx() - max_new_tokens
    while turns and len(llm.tokenize(prompt.encode("utf-8"))) > prompt_budget:
        turns = turns[1:]
        prompt = _build_prompt(turns, message)

    # Create the generation stream
    stream = llm(
        prompt,
        max_tokens=max_new_tokens,
        # Stop only when the model starts writing the next user turn. A bare
        # "\n" stop would cut every reply at its first line break (and yield
        # nothing at all when the reply begins with a newline).
        stop=["User:"],
        stream=True,  # Enable token-by-token output
        temperature=0.8,
        top_p=0.95,
    )

    partial_text = ""
    for chunk in stream:
        # Extract the new token text
        partial_text += chunk["choices"][0]["text"]
        # Yielding the string updates the Gradio UI in real-time
        yield partial_text
# Chat UI wired to the streaming generator above.
demo = gr.ChatInterface(
    fn=stream_chat,
    description="Smart, uncensored, and fast word-by-word streaming on CPU.",
    title="Llama 3.2 1B Uncensored",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()