Spaces:

IniNLP247
/

Kenko

Running on Zero

App Files Files Community

IniNLP247 committed on Oct 21

Commit

66954e0

verified ·

1 Parent(s): c253a21

Update app.py

Browse files

Files changed (1) hide show

app.py +200 -41

app.py CHANGED Viewed

@@ -1,15 +1,19 @@
-#INFERENCE NLP+EMOTION DETECTION CV+TTS
 import spaces
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 from deepface import DeepFace
 import threading
 import time
-from parler_tts import ParlerTTSForConditionalGeneration
-import soundfile as sf
 import numpy as np
 # Model setup
 model_name = "IniNLP247/Kenko-mental-health-llama-3-model"
@@ -20,6 +24,7 @@ print("🔄 Loading Kenko Mental Health Model...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
@@ -43,12 +48,46 @@ pipe = pipeline(
 print("✅ Model loaded successfully!")
 #Loading of TTS
-print("Loading Parler TTS Model...")
 tts_device = "cuda:0" if torch.cuda.is_available() else "cpu"
-tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1", torch_dtype=torch.float16).to(tts_device)
-tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
-print("✅ Parler TTS Model loaded successfully!")
 # Global variable to store current emotion state
 current_emotion_state = {
@@ -125,10 +164,13 @@ def chat_with_kenko(message, history):
     # Get emotion context
     emotion_context = get_emotion_context()
     # Create prompt in instruction format with emotion awareness
     prompt = f"""### Instruction:
 You are Kenko, a compassionate mental health therapist. Provide empathetic, helpful, and professional responses to support the user's mental wellbeing.
-{emotion_context}
 {conversation}User: {message}
@@ -144,35 +186,49 @@ You are Kenko, a compassionate mental health therapist. Provide empathetic, help
 def generate_tts(text):
     try:
-        # Limit text severely for testing
-        text = text[:200]  # Even shorter for testing
-        print(f"[TTS] Starting generation for {len(text)} chars: '{text[:50]}...'")
-        description = "A calm, empathetic voice speaking at a moderate pace."
-        input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_device)
-        prompt_input_ids = tts_tokenizer(text, return_tensors="pt").input_ids.to(tts_device)
-        print(f"[TTS] Tokenization complete. Generating audio...")
-        # Use proper generation parameters for Parler TTS
-        generation = tts_model.generate(
-            input_ids=input_ids,
-            prompt_input_ids=prompt_input_ids,
-            do_sample=True,
-            temperature=1.0,
-            min_new_tokens=10,
-            max_new_tokens=500  # Use max_new_tokens instead of max_length
-        )
-        print(f"[TTS] Generation complete. Processing audio...")
-        audio_arr = generation.cpu().numpy().squeeze()
-        print(f"[TTS] Audio array shape: {audio_arr.shape}")
-        return (tts_model.config.sampling_rate, audio_arr)
     except Exception as e:
         print(f"❌ TTS generation error: {str(e)}")
@@ -180,8 +236,66 @@ def generate_tts(text):
         traceback.print_exc()
         return None
-print(f"TTS Model Device: {tts_model.device}")
-print(f"TTS Device Variable: {tts_device}")
 # Custom CSS for a calming interface
 css = """
@@ -262,6 +376,16 @@ with gr.Blocks(
             emotion_status = gr.Markdown("*Waiting for emotion data...*")
     # Example prompts
     with gr.Row(visible=False) as examples_row:
         gr.Examples(
@@ -303,7 +427,6 @@ with gr.Blocks(
         **Privacy:** Your conversations and emotion data are not stored or shared.
         """)
     @spaces.GPU
     def respond(message, chat_history):
         if not message.strip():
@@ -325,7 +448,6 @@ with gr.Blocks(
         print(f"TOTAL TIME: {time.time() - start:.2f}s")
         return "", chat_history, audio
-        return "", chat_history, audio
     def toggle_examples():
         return gr.Row(visible=True)
@@ -343,6 +465,21 @@ with gr.Blocks(
         confidence = current_emotion_state["confidence"]
         return f"**Current Emotion:** {dominant.capitalize()} ({confidence:.1f}% confidence)\n*Last updated: {int(elapsed)}s ago*"
     # Event handlers
     submit = msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot, audio_output])
     send = send_btn.click(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot, audio_output])
@@ -351,18 +488,40 @@ with gr.Blocks(
     # Emotion detection with streaming (analyzes continuously)
     webcam_input.stream(
-        analyze_emotion,
-        inputs=webcam_input,
-        outputs=emotion_output,
-        time_limit=30,  # Analyze every 30 seconds
-        stream_every=30  # Update interval
-    )
     timer = gr.Timer(value=5)  # Update every 5 seconds
     timer.tick(
         fn=update_emotion_status,
         outputs=emotion_status
     )
 if __name__ == "__main__":
-    demo.launch()

+#INFERENCE NLP+EMOTION DETECTION CV+TTS+THREAT DETECTION CV
 import spaces
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
 import torch
 from deepface import DeepFace
 import threading
 import time
+from transformers import AutoProcessor, DiaForConditionalGeneration
 import numpy as np
+import supervision as sv
+import requests
+from PIL import Image
+import os
+from rfdetr import RFDETRNano
 # Model setup
 model_name = "IniNLP247/Kenko-mental-health-llama-3-model"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 if tokenizer.pad_token is None:
     tokenizer.pad_token = tokenizer.eos_token
+)
 model = AutoModelForCausalLM.from_pretrained(
     model_name,
 print("✅ Model loaded successfully!")
 #Loading of TTS
+print("Loading Dia TTS Model...")
 tts_device = "cuda:0" if torch.cuda.is_available() else "cpu"
+tts_model = "nari-labs/Dia-1.6B-0626"
+tts_processor = AutoProcessor.from_pretrained(tts_model)
+tts_model = DiaForConditionalGeneration.from_pretrained(tts_model, torch_dtype=torch.float16).to(tts_device)
+print("✅ Dia TTS Model loaded successfully!")
+THREAT_CLASSES = {
+    1: "Gun",
+    2: "Explosive",
+    3: "Grenade",
+    4: "Knife"
+}
# Load the threat-detection model, downloading its checkpoint on first run.
print("Loading Threat Detection Model...")
threat_weights_url = "https://huggingface.co/Subh775/Threat-Detection-RF-DETR/resolve/main/checkpoint_best_total.pth"
threat_weights_filename = "checkpoint_best_total.pth"
# Download weights if not already present
if not os.path.exists(threat_weights_filename):
    print(f"Downloading weights from {threat_weights_url}")
    # Stream into a side file and rename it only once the transfer has
    # finished: an interrupted download must not leave a truncated checkpoint
    # behind, because the existence check above would accept it on the next
    # start.
    threat_weights_partial = threat_weights_filename + ".part"
    try:
        # (connect timeout, read timeout) in seconds, so a stalled connection
        # fails instead of hanging start-up forever.
        with requests.get(threat_weights_url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            with open(threat_weights_partial, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(threat_weights_partial, threat_weights_filename)
    finally:
        # After a successful rename the side file no longer exists; after a
        # failure this removes the incomplete data.
        if os.path.exists(threat_weights_partial):
            os.remove(threat_weights_partial)
    print("Download complete.")
threat_model = RFDETRNano(resolution=640, pretrain_weights=threat_weights_filename)
#threat_model.optimize_for_inference()
print("✅ Threat Detection Model loaded successfully!")
# Global threat-detection state.
# threat_detection() rebinds this name after each analysed frame;
# get_threat_context() and update_threat_status() read it.
#   "threat_detected": list of {"type": str, "confidence": float} entries
#   "threat_count":    number of entries in "threat_detected"
#   "last_update":     time.time() of the last analysed frame, or None
current_threat_state = {
    "threat_detected": [],
    "threat_count": 0,
    "last_update": None,
}
 # Global variable to store current emotion state
 current_emotion_state = {
     # Get emotion context
     emotion_context = get_emotion_context()
+    # Get threat context
+    threat_context = get_threat_context()
     # Create prompt in instruction format with emotion awareness
     prompt = f"""### Instruction:
 You are Kenko, a compassionate mental health therapist. Provide empathetic, helpful, and professional responses to support the user's mental wellbeing.
+{emotion_context}{threat_context}
 {conversation}User: {message}
 def generate_tts(text):
     try:
+        text = text[:600]
+        print(f"[TTS] Generating speech for {len(text)} chars: '{text[:50]}...'")
+        # Prepare inputs for Dia TTS
+        inputs = tts_processor(text=text, return_tensors="pt", padding=True)
+        inputs = {k: v.to(tts_device) for k, v in inputs.items()}
+        print(f"[TTS] Inputs prepared, generating audio codes...")
+        # Generate audio codes
+        with torch.no_grad():
+            generated_ids = tts_model.generate(**inputs, max_length=5000)
+        print(f"[TTS] Audio codes generated, shape: {generated_ids.shape}")
+        print(f"[TTS] Decoding codes to waveform...")
+        # Decode the audio codes to waveform using the processor's batch_decode
+        audio_values = tts_processor.batch_decode(generated_ids, return_tensors="pt")
+        # Extract the audio waveform
+        if isinstance(audio_values, dict) and 'audio_values' in audio_values:
+            audio_arr = audio_values['audio_values'][0].cpu().numpy()
+        elif isinstance(audio_values, torch.Tensor):
+            audio_arr = audio_values[0].cpu().numpy()
+        elif isinstance(audio_values, list):
+            audio_arr = np.array(audio_values[0])
+        else:
+            audio_arr = np.array(audio_values).squeeze()
+        # Ensure float32
+        audio_arr = audio_arr.astype(np.float32)
+        # Dia uses 44.1kHz sample rate
+        sample_rate = 44100
+        print(f"✅ [TTS] Audio decoded: {len(audio_arr)} samples at {sample_rate}Hz = {len(audio_arr)/sample_rate:.2f} seconds")
+        if len(audio_arr) == 0:
+            print("❌ Decoded audio is empty!")
+            return None
+        return (sample_rate, audio_arr)
     except Exception as e:
         print(f"❌ TTS generation error: {str(e)}")
         traceback.print_exc()
         return None
def _collect_threats(detections):
    """Return a list of {"type", "confidence"} dicts for known threat classes.

    Reads the parallel ``class_id`` / ``confidence`` sequences of the
    detection result instead of iterating the result object itself.
    NOTE(review): iterating a supervision ``Detections`` is understood to
    yield plain tuples, which is why attribute access per item found nothing;
    confirm against the installed supervision version.
    """
    if detections is None:
        return []
    class_ids = getattr(detections, "class_id", None)
    if class_ids is None:
        return []
    confidences = getattr(detections, "confidence", None)

    threats = []
    for index, class_id in enumerate(class_ids):
        threat_name = THREAT_CLASSES.get(int(class_id))
        if threat_name is None:
            continue  # class is not one of the monitored threat types
        confidence = float(confidences[index]) if confidences is not None else 0.0
        threats.append({"type": threat_name, "confidence": confidence})
    return threats


def threat_detection(image=None, threshold=0.5):
    """Run threat detection on a webcam frame and publish the result.

    Args:
        image: Frame handed over by the webcam stream. ``None`` (the default)
            means no frame is available and nothing is analysed.
        threshold: Minimum confidence forwarded to ``threat_model.predict``.
            The previous hard-coded value of 0 kept every candidate box.

    Returns:
        dict: Mapping of label to confidence in percent for the threat label
        widget. ``{}`` when there is no frame or detection failed; a single
        "all clear" entry when the frame contains no known threat.
        NOTE(review): values are kept on the original 0-100 scale; confirm
        that this is the scale the ``gr.Label`` output expects.

    Side effects:
        Rebinds the module-level ``current_threat_state`` after every analysed
        frame, including frames without threats, so stale alerts are cleared.
    """
    global current_threat_state

    if image is None:
        return {}

    try:
        detections = threat_model.predict(image, threshold=threshold)
        threats_found = _collect_threats(detections)
    except Exception as e:
        # Best effort: a failing frame must not break the webcam stream.
        print(f"Threat detection error: {str(e)}")
        return {}

    # Update global threat state
    current_threat_state = {
        "threat_detected": threats_found,
        "threat_count": len(threats_found),
        "last_update": time.time(),
    }

    if not threats_found:
        return {"No threats detected, all clear": 100.0}

    # Format for display: one entry per threat type, keeping its best score.
    output = {}
    for threat in threats_found:
        score = threat["confidence"] * 100
        output[threat["type"]] = max(score, output.get(threat["type"], 0.0))
    return output
def get_threat_context():
    """Get the current threat detections as a context string for the model.

    Returns:
        str: ``"\\n[User currently holds a potential threat: ...]"`` listing
        each detected threat with its confidence, or ``""`` when no threat
        state exists yet, the last update is older than 60 seconds, or no
        threat was detected.
    """
    # Take a single snapshot of the global: it may not be defined before the
    # first frame has been analysed, and threat_detection() rebinds it.
    try:
        state = current_threat_state
    except NameError:
        return ""

    last_update = state["last_update"]
    if last_update is None:
        return ""
    # Check if threat data is recent (within last 60 seconds)
    if time.time() - last_update > 60:
        return ""

    threats = state["threat_detected"]
    if not threats:
        return ""
    threat_list = ", ".join(
        f"{t['type']} ({t['confidence']*100:.1f}% confidence)" for t in threats
    )
    return f"\n[User currently holds a potential threat: {threat_list}]"
 # Custom CSS for a calming interface
 css = """
             emotion_status = gr.Markdown("*Waiting for emotion data...*")
+            #Threat detection output
+            gr.Markdown("### Threat Detection")
+            threat_output = gr.Label(
+                num_top_classes=4,
+                label="Detected Threats"
+            )
+            threat_status = gr.Markdown("*Monitoring for threats...")
     # Example prompts
     with gr.Row(visible=False) as examples_row:
         gr.Examples(
         **Privacy:** Your conversations and emotion data are not stored or shared.
         """)
     @spaces.GPU
     def respond(message, chat_history):
         if not message.strip():
         print(f"TOTAL TIME: {time.time() - start:.2f}s")
         return "", chat_history, audio
     def toggle_examples():
         return gr.Row(visible=True)
         confidence = current_emotion_state["confidence"]
         return f"**Current Emotion:** {dominant.capitalize()} ({confidence:.1f}% confidence)\n*Last updated: {int(elapsed)}s ago*"
+    def update_threat_status():
+        """Update threat status text"""
+        if current_threat_state["last_update"] is None:
+          return "*Monitoring for threats...*"
+        elapsed = time.time() - current_threat_state["last_update"]
+        threats = current_threat_state["threat_detected"] # Corrected variable name
+        if threats:
+          threat_list = ", ".join([t["type"] for t in threats])
+          return f"**⚠️ ALERT:** {threat_list} detected\n*Last updated: {int(elapsed)}s ago*"
+        else:
+          return f"**✅ Safe:** No threats detected\n*Last updated: {int(elapsed)}s ago*"
     # Event handlers
     submit = msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot, audio_output])
     send = send_btn.click(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot, audio_output])
     # Emotion detection with streaming (analyzes continuously)
     webcam_input.stream(
+    analyze_emotion,
+    inputs=webcam_input,
+    outputs=emotion_output,
+    stream_every=1,  # Update every 1 second instead of 30
+    time_limit=60   # Keep processing for 60 seconds
+)
     timer = gr.Timer(value=5)  # Update every 5 seconds
     timer.tick(
         fn=update_emotion_status,
         outputs=emotion_status
     )
+    # Threat detection with streaming
+    webcam_input.stream(
+        threat_detection, # Corrected function name
+        inputs=webcam_input, # Corrected inputs
+        outputs=threat_output,
+        stream_every=2,
+        time_limit=60
+)
+    # Add to timer tick
+timer.tick(
+    fn=lambda: (update_emotion_status(), update_threat_status()),
+    outputs=[emotion_status, threat_status]
+)
 if __name__ == "__main__":
+    print("🚀 Starting Kenko Mental Health Assistant with Emotion Detection...")
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7890,
+        share=True,
+        show_error=True
+    )