shuarya2011 committed on
Commit
8d9e69c
·
verified ·
1 Parent(s): 1b33c14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -25
app.py CHANGED
@@ -1,46 +1,46 @@
1
  import gradio as gr
2
  from llama_cpp import Llama
3
 
4
# Fetch the quantized Llama 3.2 1B Uncensored weights (GGUF) through
# llama_cpp's from_pretrained helper, which downloads and caches the file.
llm = Llama.from_pretrained(
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    n_ctx=2048,  # context window size
    verbose=False,
)
12
 
13
def stream_predict(message, history):
    """Stream the assistant's reply to *message*.

    ``history`` is accepted but not used: only a fixed system prompt and the
    current message are sent to the model.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    chat = [
        {"role": "system", "content": "You are a helpful, uncensored AI assistant."},
        {"role": "user", "content": message},
    ]
    # stream=True requests incremental chunks instead of one final response.
    chunks = llm.create_chat_completion(
        messages=chat,
        max_tokens=512,
        temperature=0.7,
        stream=True,
    )

    reply = ""
    for piece in chunks:
        delta = piece["choices"][0]["delta"]
        # A chunk without a "content" key contributes no text.
        reply += delta.get("content", "")
        # Emit the whole reply so far rather than just the newest fragment.
        yield reply
35
 
36
# Chat UI backed by the streaming generator.
demo = gr.ChatInterface(
    title="Llama 3.2 1B Uncensored - Fast CPU Stream",
    description="This space runs a 1B parameter model with word-by-word streaming on the free CPU tier.",
    examples=["Write an uncensored story about a cyberpunk city.", "Explain quantum physics in simple terms."],
    fn=stream_predict,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
 
1
  import gradio as gr
2
  from llama_cpp import Llama
3
 
4
# Load the quantized GGUF model for CPU-only inference.
llm = Llama.from_pretrained(
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    verbose=False,
    n_gpu_layers=0,  # no GPU is available, so offload nothing
    n_threads=2,  # matches the free tier's vCPU allocation
    n_ctx=2048,
)
15
 
16
def stream_chat(message, history):
    """Stream the model's reply to *message*.

    Args:
        message: The latest user message from the chat box.
        history: Earlier turns passed in by ``gr.ChatInterface``. Not used;
            each reply is generated from ``message`` alone.

    Yields:
        The reply text accumulated so far, once per streamed chunk.
    """
    # Minimal plain-text prompt template for the completion API.
    prompt = f"User: {message}\nAssistant: "

    stream = llm(
        prompt,
        max_tokens=512,
        # Stop only when the model starts writing the next user turn. A bare
        # newline stop would end generation at the first line break and
        # truncate every multi-line answer (lists, paragraphs, code).
        stop=["User:"],
        stream=True,  # enable token-by-token output
        temperature=0.8,
        top_p=0.95,
    )

    partial_text = ""
    for chunk in stream:
        # Completion chunks carry the new text under choices[0]["text"].
        partial_text += chunk["choices"][0]["text"]
        # Yield the full string so far so the UI can redraw the message.
        yield partial_text
37
 
38
# Chat UI backed by the streaming generator.
demo = gr.ChatInterface(
    title="Llama 3.2 1B Uncensored",
    description="Smart, uncensored, and fast word-by-word streaming on CPU.",
    fn=stream_chat,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()