Macbook committed on
Commit
4ac15b2
·
1 Parent(s): f9b1f70

Deploy with Groq Whisper ASR

Browse files
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ *.pyc
4
+ models/
5
+ .DS_Store
6
+ model_checkpoints/
Dockerfile CHANGED
@@ -1,4 +1,9 @@
1
- FROM python:3.9
 
 
 
 
 
2
 
3
  RUN useradd -m -u 1000 user
4
  USER user
@@ -10,4 +15,4 @@ COPY --chown=user ./requirements.txt requirements.txt
10
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
 
12
  COPY --chown=user . /app
13
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM python:3.11-slim
2
+
3
+ # Install system dependencies
4
+ RUN apt-get update && apt-get install -y --no-install-recommends \
5
+ ffmpeg \
6
+ && rm -rf /var/lib/apt/lists/*
7
 
8
  RUN useradd -m -u 1000 user
9
  USER user
 
15
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
16
 
17
  COPY --chown=user . /app
18
+ CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "7860"]
api/config.py CHANGED
@@ -19,6 +19,8 @@ class Settings(BaseSettings):
19
  CLERK_PEM_PUBLIC_KEY: str = os.getenv("CLERK_PEM_PUBLIC_KEY", "")
20
  OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
21
  GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN", "") # For GitHub Models GPT-4o
 
 
22
 
23
  model_config = SettingsConfigDict(env_file=".env", extra="ignore")
24
  openapi_url: str = "/openapi.json"
 
19
  CLERK_PEM_PUBLIC_KEY: str = os.getenv("CLERK_PEM_PUBLIC_KEY", "")
20
  OPENAI_API_KEY: str = os.getenv("OPENAI_API_KEY", "")
21
  GITHUB_TOKEN: str = os.getenv("GITHUB_TOKEN", "") # For GitHub Models GPT-4o
22
+ HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", os.getenv("HF_TOKEN", "")) # For HuggingFace API
23
+ GROQ_API_KEY: str = os.getenv("GROQ_API_KEY", "") # Free fast Whisper large-v3
24
 
25
  model_config = SettingsConfigDict(env_file=".env", extra="ignore")
26
  openapi_url: str = "/openapi.json"
api/data/exercises.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- Comprehensive Speech Therapy Exercises Database
3
- Categories: Fundamentals, Speech Practice, Visual Learning, Sound Imitation
4
  Languages: English (en), French (fr)
5
  """
6
 
@@ -11,8 +11,6 @@ from enum import Enum
11
 
12
  class ExerciseType(str, Enum):
13
  # Fundamentals
14
- BREATHING = "breathing"
15
- ARTICULATION = "articulation"
16
  PHONEME = "phoneme"
17
 
18
  # Speech Practice
@@ -20,16 +18,6 @@ class ExerciseType(str, Enum):
20
  SENTENCE_READING = "sentence_reading"
21
  TONGUE_TWISTER = "tongue_twister"
22
 
23
- # Visual Learning
24
- COLOR = "color"
25
- OBJECT = "object"
26
- ANIMAL = "animal"
27
- ACTION = "action"
28
-
29
- # Sound Imitation
30
- ANIMAL_SOUND = "animal_sound"
31
- ENVIRONMENTAL_SOUND = "environmental_sound"
32
-
33
 
34
  class Difficulty(str, Enum):
35
  EASY = "easy"
@@ -39,7 +27,7 @@ class Difficulty(str, Enum):
39
 
40
  class Category(BaseModel):
41
  id: str
42
- name: Dict[str, str] # {"en": "...", "fr": "..."}
43
  description: Dict[str, str]
44
  icon: str
45
  subcategories: List[str]
@@ -54,8 +42,6 @@ class Exercise(BaseModel):
54
  title: Dict[str, str]
55
  target_text: Dict[str, str]
56
  instructions: Dict[str, str]
57
- image_url: Optional[str] = None
58
- audio_url: Optional[str] = None
59
  phoneme_focus: Optional[List[str]] = None
60
 
61
 
@@ -68,11 +54,11 @@ CATEGORIES: List[Dict] = [
68
  "id": "fundamentals",
69
  "name": {"en": "Fundamentals", "fr": "Fondamentaux"},
70
  "description": {
71
- "en": "Build your foundation with breathing and articulation exercises",
72
- "fr": "Construisez votre base avec des exercices de respiration et d'articulation"
73
  },
74
  "icon": "🎯",
75
- "subcategories": ["breathing", "articulation", "phoneme"]
76
  },
77
  {
78
  "id": "speech_practice",
@@ -83,38 +69,10 @@ CATEGORIES: List[Dict] = [
83
  },
84
  "icon": "📖",
85
  "subcategories": ["word_repetition", "sentence_reading", "tongue_twister"]
86
- },
87
- {
88
- "id": "visual_learning",
89
- "name": {"en": "Visual Learning", "fr": "Apprentissage Visuel"},
90
- "description": {
91
- "en": "Learn by identifying colors, objects, animals, and actions",
92
- "fr": "Apprenez en identifiant les couleurs, objets, animaux et actions"
93
- },
94
- "icon": "🖼️",
95
- "subcategories": ["color", "object", "animal", "action"]
96
- },
97
- {
98
- "id": "sound_imitation",
99
- "name": {"en": "Sound Imitation", "fr": "Imitation de Sons"},
100
- "description": {
101
- "en": "Imitate animal and environmental sounds",
102
- "fr": "Imitez les sons d'animaux et de l'environnement"
103
- },
104
- "icon": "🔊",
105
- "subcategories": ["animal_sound", "environmental_sound"]
106
  }
107
  ]
108
 
109
  SUBCATEGORIES: Dict[str, Dict] = {
110
- "breathing": {
111
- "name": {"en": "Breathing Exercises", "fr": "Exercices de Respiration"},
112
- "description": {"en": "Control your breath for better speech", "fr": "Contrôlez votre respiration pour mieux parler"}
113
- },
114
- "articulation": {
115
- "name": {"en": "Articulation Drills", "fr": "Exercices d'Articulation"},
116
- "description": {"en": "Improve mouth and tongue movements", "fr": "Améliorez les mouvements de la bouche et de la langue"}
117
- },
118
  "phoneme": {
119
  "name": {"en": "Phoneme Practice", "fr": "Pratique des Phonèmes"},
120
  "description": {"en": "Master specific sounds like R, S, TH", "fr": "Maîtrisez des sons spécifiques comme R, S, CH"}
@@ -130,30 +88,6 @@ SUBCATEGORIES: Dict[str, Dict] = {
130
  "tongue_twister": {
131
  "name": {"en": "Tongue Twisters", "fr": "Virelangues"},
132
  "description": {"en": "Challenge yourself with tricky phrases", "fr": "Défiez-vous avec des phrases difficiles"}
133
- },
134
- "color": {
135
- "name": {"en": "Colors", "fr": "Couleurs"},
136
- "description": {"en": "Identify and say color names", "fr": "Identifiez et dites les noms des couleurs"}
137
- },
138
- "object": {
139
- "name": {"en": "Objects", "fr": "Objets"},
140
- "description": {"en": "Name everyday objects", "fr": "Nommez des objets du quotidien"}
141
- },
142
- "animal": {
143
- "name": {"en": "Animals", "fr": "Animaux"},
144
- "description": {"en": "Identify animals by sight", "fr": "Identifiez les animaux à vue"}
145
- },
146
- "action": {
147
- "name": {"en": "Actions", "fr": "Actions"},
148
- "description": {"en": "Describe what people are doing", "fr": "Décrivez ce que font les gens"}
149
- },
150
- "animal_sound": {
151
- "name": {"en": "Animal Sounds", "fr": "Sons d'Animaux"},
152
- "description": {"en": "Imitate animal sounds", "fr": "Imitez les sons des animaux"}
153
- },
154
- "environmental_sound": {
155
- "name": {"en": "Environmental Sounds", "fr": "Sons de l'Environnement"},
156
- "description": {"en": "Imitate sounds around us", "fr": "Imitez les sons autour de nous"}
157
  }
158
  }
159
 
@@ -163,74 +97,6 @@ SUBCATEGORIES: Dict[str, Dict] = {
163
  # =============================================================================
164
 
165
  EXERCISES: List[Dict] = [
166
- # =========================================================================
167
- # FUNDAMENTALS - Breathing
168
- # =========================================================================
169
- {
170
- "id": "breath-001",
171
- "type": "breathing",
172
- "category": "fundamentals",
173
- "subcategory": "breathing",
174
- "difficulty": "easy",
175
- "title": {"en": "Deep Belly Breathing", "fr": "Respiration Abdominale"},
176
- "target_text": {"en": "Breathe in slowly through your nose, hold, breathe out through your mouth", "fr": "Inspirez lentement par le nez, retenez, expirez par la bouche"},
177
- "instructions": {"en": "Place your hand on your belly. Breathe in for 4 seconds, hold for 2, breathe out for 4. Repeat 3 times.", "fr": "Placez votre main sur le ventre. Inspirez 4 secondes, retenez 2, expirez 4. Répétez 3 fois."},
178
- },
179
- {
180
- "id": "breath-002",
181
- "type": "breathing",
182
- "category": "fundamentals",
183
- "subcategory": "breathing",
184
- "difficulty": "easy",
185
- "title": {"en": "Candle Blow", "fr": "Souffler la Bougie"},
186
- "target_text": {"en": "Take a deep breath and blow out slowly like blowing a candle", "fr": "Prenez une grande inspiration et soufflez doucement comme une bougie"},
187
- "instructions": {"en": "Imagine a candle in front of you. Take a deep breath and blow slowly to make the flame flicker but not go out.", "fr": "Imaginez une bougie devant vous. Inspirez et soufflez doucement pour faire vaciller la flamme sans l'éteindre."},
188
- },
189
- {
190
- "id": "breath-003",
191
- "type": "breathing",
192
- "category": "fundamentals",
193
- "subcategory": "breathing",
194
- "difficulty": "medium",
195
- "title": {"en": "Sustained Breath", "fr": "Souffle Prolongé"},
196
- "target_text": {"en": "Aaaaaaaaahhhhhh", "fr": "Aaaaaaaaahhhhhh"},
197
- "instructions": {"en": "Take a deep breath and say 'Ahhh' for as long as you can. Try to reach 10 seconds!", "fr": "Inspirez profondément et dites 'Ahhh' aussi longtemps que possible. Essayez d'atteindre 10 secondes!"},
198
- },
199
-
200
- # =========================================================================
201
- # FUNDAMENTALS - Articulation
202
- # =========================================================================
203
- {
204
- "id": "artic-001",
205
- "type": "articulation",
206
- "category": "fundamentals",
207
- "subcategory": "articulation",
208
- "difficulty": "easy",
209
- "title": {"en": "Lip Warm-up", "fr": "Échauffement des Lèvres"},
210
- "target_text": {"en": "Ma ma ma, Pa pa pa, Ba ba ba", "fr": "Ma ma ma, Pa pa pa, Ba ba ba"},
211
- "instructions": {"en": "Say each syllable clearly, focusing on your lip movements. Repeat 3 times.", "fr": "Prononcez chaque syllabe clairement en vous concentrant sur vos lèvres. Répétez 3 fois."},
212
- },
213
- {
214
- "id": "artic-002",
215
- "type": "articulation",
216
- "category": "fundamentals",
217
- "subcategory": "articulation",
218
- "difficulty": "easy",
219
- "title": {"en": "Tongue Stretch", "fr": "Étirement de la Langue"},
220
- "target_text": {"en": "La la la, Ta ta ta, Da da da", "fr": "La la la, Ta ta ta, Da da da"},
221
- "instructions": {"en": "Touch the roof of your mouth with your tongue for each syllable. Feel the movement!", "fr": "Touchez le palais avec votre langue pour chaque syllabe. Sentez le mouvement!"},
222
- },
223
- {
224
- "id": "artic-003",
225
- "type": "articulation",
226
- "category": "fundamentals",
227
- "subcategory": "articulation",
228
- "difficulty": "medium",
229
- "title": {"en": "Jaw Exercise", "fr": "Exercice de Mâchoire"},
230
- "target_text": {"en": "Wa wa wa, Ya ya ya, Oo ee oo ee", "fr": "Oua oua oua, Ya ya ya, Ou i ou i"},
231
- "instructions": {"en": "Open your mouth wide for each sound. Feel your jaw moving up and down.", "fr": "Ouvrez grand la bouche pour chaque son. Sentez votre mâchoire bouger."},
232
- },
233
-
234
  # =========================================================================
235
  # FUNDAMENTALS - Phoneme Practice
236
  # =========================================================================
@@ -415,437 +281,6 @@ EXERCISES: List[Dict] = [
415
  "instructions": {"en": "Focus on the 'N' and 'Y' sounds.", "fr": "Concentrez-vous sur les sons 'P' et 'N'."},
416
  "phoneme_focus": ["N", "Y"]
417
  },
418
-
419
- # =========================================================================
420
- # VISUAL LEARNING - Colors
421
- # =========================================================================
422
- {
423
- "id": "color-001",
424
- "type": "color",
425
- "category": "visual_learning",
426
- "subcategory": "color",
427
- "difficulty": "easy",
428
- "title": {"en": "Red", "fr": "Rouge"},
429
- "target_text": {"en": "Red", "fr": "Rouge"},
430
- "instructions": {"en": "Look at the color and say its name.", "fr": "Regardez la couleur et dites son nom."},
431
- "image_url": "https://images.unsplash.com/photo-1562157873-818bc0726f68?w=400&h=400&fit=crop"
432
- },
433
- {
434
- "id": "color-002",
435
- "type": "color",
436
- "category": "visual_learning",
437
- "subcategory": "color",
438
- "difficulty": "easy",
439
- "title": {"en": "Blue", "fr": "Bleu"},
440
- "target_text": {"en": "Blue", "fr": "Bleu"},
441
- "instructions": {"en": "Look at the color and say its name.", "fr": "Regardez la couleur et dites son nom."},
442
- "image_url": "https://images.unsplash.com/photo-1579546929518-9e396f3cc809?w=400&h=400&fit=crop"
443
- },
444
- {
445
- "id": "color-003",
446
- "type": "color",
447
- "category": "visual_learning",
448
- "subcategory": "color",
449
- "difficulty": "easy",
450
- "title": {"en": "Yellow", "fr": "Jaune"},
451
- "target_text": {"en": "Yellow", "fr": "Jaune"},
452
- "instructions": {"en": "Look at the color and say its name.", "fr": "Regardez la couleur et dites son nom."},
453
- "image_url": "https://images.unsplash.com/photo-1495001258031-d1b407bc1776?w=400&h=400&fit=crop"
454
- },
455
- {
456
- "id": "color-004",
457
- "type": "color",
458
- "category": "visual_learning",
459
- "subcategory": "color",
460
- "difficulty": "easy",
461
- "title": {"en": "Green", "fr": "Vert"},
462
- "target_text": {"en": "Green", "fr": "Vert"},
463
- "instructions": {"en": "Look at the color and say its name.", "fr": "Regardez la couleur et dites son nom."},
464
- "image_url": "https://images.unsplash.com/photo-1464820453369-31d2c0b651af?w=400&h=400&fit=crop"
465
- },
466
- {
467
- "id": "color-005",
468
- "type": "color",
469
- "category": "visual_learning",
470
- "subcategory": "color",
471
- "difficulty": "easy",
472
- "title": {"en": "Orange", "fr": "Orange"},
473
- "target_text": {"en": "Orange", "fr": "Orange"},
474
- "instructions": {"en": "Look at the color and say its name.", "fr": "Regardez la couleur et dites son nom."},
475
- "image_url": "https://images.unsplash.com/photo-1557800636-894a64c1696f?w=400&h=400&fit=crop"
476
- },
477
- {
478
- "id": "color-006",
479
- "type": "color",
480
- "category": "visual_learning",
481
- "subcategory": "color",
482
- "difficulty": "easy",
483
- "title": {"en": "Purple", "fr": "Violet"},
484
- "target_text": {"en": "Purple", "fr": "Violet"},
485
- "instructions": {"en": "Look at the color and say its name.", "fr": "Regardez la couleur et dites son nom."},
486
- "image_url": "https://images.unsplash.com/photo-1528459801416-a9e53bbf4e17?w=400&h=400&fit=crop"
487
- },
488
-
489
- # =========================================================================
490
- # VISUAL LEARNING - Objects
491
- # =========================================================================
492
- {
493
- "id": "obj-001",
494
- "type": "object",
495
- "category": "visual_learning",
496
- "subcategory": "object",
497
- "difficulty": "easy",
498
- "title": {"en": "Apple", "fr": "Pomme"},
499
- "target_text": {"en": "Apple", "fr": "Pomme"},
500
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
501
- "image_url": "https://images.unsplash.com/photo-1584306670957-acf935f5033c?w=400&h=400&fit=crop"
502
- },
503
- {
504
- "id": "obj-002",
505
- "type": "object",
506
- "category": "visual_learning",
507
- "subcategory": "object",
508
- "difficulty": "easy",
509
- "title": {"en": "Book", "fr": "Livre"},
510
- "target_text": {"en": "Book", "fr": "Livre"},
511
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
512
- "image_url": "https://images.unsplash.com/photo-1544947950-fa07a98d237f?w=400&h=400&fit=crop"
513
- },
514
- {
515
- "id": "obj-003",
516
- "type": "object",
517
- "category": "visual_learning",
518
- "subcategory": "object",
519
- "difficulty": "easy",
520
- "title": {"en": "Car", "fr": "Voiture"},
521
- "target_text": {"en": "Car", "fr": "Voiture"},
522
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
523
- "image_url": "https://images.unsplash.com/photo-1502877338535-766e1452684a?w=400&h=400&fit=crop"
524
- },
525
- {
526
- "id": "obj-004",
527
- "type": "object",
528
- "category": "visual_learning",
529
- "subcategory": "object",
530
- "difficulty": "easy",
531
- "title": {"en": "Chair", "fr": "Chaise"},
532
- "target_text": {"en": "Chair", "fr": "Chaise"},
533
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
534
- "image_url": "https://images.unsplash.com/photo-1503602642458-232111445657?w=400&h=400&fit=crop"
535
- },
536
- {
537
- "id": "obj-005",
538
- "type": "object",
539
- "category": "visual_learning",
540
- "subcategory": "object",
541
- "difficulty": "easy",
542
- "title": {"en": "House", "fr": "Maison"},
543
- "target_text": {"en": "House", "fr": "Maison"},
544
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
545
- "image_url": "https://images.unsplash.com/photo-1518780664697-55e3ad937233?w=400&h=400&fit=crop"
546
- },
547
- {
548
- "id": "obj-006",
549
- "type": "object",
550
- "category": "visual_learning",
551
- "subcategory": "object",
552
- "difficulty": "medium",
553
- "title": {"en": "Telephone", "fr": "Téléphone"},
554
- "target_text": {"en": "Telephone", "fr": "Téléphone"},
555
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
556
- "image_url": "https://images.unsplash.com/photo-1511707171634-5f897ff02aa9?w=400&h=400&fit=crop"
557
- },
558
- {
559
- "id": "obj-007",
560
- "type": "object",
561
- "category": "visual_learning",
562
- "subcategory": "object",
563
- "difficulty": "medium",
564
- "title": {"en": "Umbrella", "fr": "Parapluie"},
565
- "target_text": {"en": "Umbrella", "fr": "Parapluie"},
566
- "instructions": {"en": "Look at the picture and say what you see.", "fr": "Regardez l'image et dites ce que vous voyez."},
567
- "image_url": "https://images.unsplash.com/photo-1534309466160-70b22cc6252c?w=400&h=400&fit=crop"
568
- },
569
-
570
- # =========================================================================
571
- # VISUAL LEARNING - Animals
572
- # =========================================================================
573
- {
574
- "id": "animal-001",
575
- "type": "animal",
576
- "category": "visual_learning",
577
- "subcategory": "animal",
578
- "difficulty": "easy",
579
- "title": {"en": "Dog", "fr": "Chien"},
580
- "target_text": {"en": "Dog", "fr": "Chien"},
581
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
582
- "image_url": "https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=400&h=400&fit=crop"
583
- },
584
- {
585
- "id": "animal-002",
586
- "type": "animal",
587
- "category": "visual_learning",
588
- "subcategory": "animal",
589
- "difficulty": "easy",
590
- "title": {"en": "Cat", "fr": "Chat"},
591
- "target_text": {"en": "Cat", "fr": "Chat"},
592
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
593
- "image_url": "https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=400&h=400&fit=crop"
594
- },
595
- {
596
- "id": "animal-003",
597
- "type": "animal",
598
- "category": "visual_learning",
599
- "subcategory": "animal",
600
- "difficulty": "easy",
601
- "title": {"en": "Bird", "fr": "Oiseau"},
602
- "target_text": {"en": "Bird", "fr": "Oiseau"},
603
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
604
- "image_url": "https://images.unsplash.com/photo-1522926193341-e9ffd686c60f?w=400&h=400&fit=crop"
605
- },
606
- {
607
- "id": "animal-004",
608
- "type": "animal",
609
- "category": "visual_learning",
610
- "subcategory": "animal",
611
- "difficulty": "easy",
612
- "title": {"en": "Fish", "fr": "Poisson"},
613
- "target_text": {"en": "Fish", "fr": "Poisson"},
614
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
615
- "image_url": "https://images.unsplash.com/photo-1524704654690-b56c05c78a00?w=400&h=400&fit=crop"
616
- },
617
- {
618
- "id": "animal-005",
619
- "type": "animal",
620
- "category": "visual_learning",
621
- "subcategory": "animal",
622
- "difficulty": "medium",
623
- "title": {"en": "Elephant", "fr": "Éléphant"},
624
- "target_text": {"en": "Elephant", "fr": "Éléphant"},
625
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
626
- "image_url": "https://images.unsplash.com/photo-1557050543-4d5f4e07ef46?w=400&h=400&fit=crop"
627
- },
628
- {
629
- "id": "animal-006",
630
- "type": "animal",
631
- "category": "visual_learning",
632
- "subcategory": "animal",
633
- "difficulty": "medium",
634
- "title": {"en": "Lion", "fr": "Lion"},
635
- "target_text": {"en": "Lion", "fr": "Lion"},
636
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
637
- "image_url": "https://images.unsplash.com/photo-1546182990-dffeafbe841d?w=400&h=400&fit=crop"
638
- },
639
- {
640
- "id": "animal-007",
641
- "type": "animal",
642
- "category": "visual_learning",
643
- "subcategory": "animal",
644
- "difficulty": "medium",
645
- "title": {"en": "Butterfly", "fr": "Papillon"},
646
- "target_text": {"en": "Butterfly", "fr": "Papillon"},
647
- "instructions": {"en": "Look at the animal and say its name.", "fr": "Regardez l'animal et dites son nom."},
648
- "image_url": "https://images.unsplash.com/photo-1452570053594-1b985d6ea890?w=400&h=400&fit=crop"
649
- },
650
- {
651
- "id": "animal-008",
652
- "type": "animal",
653
- "category": "visual_learning",
654
- "subcategory": "animal",
655
- "difficulty": "hard",
656
- "title": {"en": "Hippopotamus", "fr": "Hippopotame"},
657
- "target_text": {"en": "Hippopotamus", "fr": "Hippopotame"},
658
- "instructions": {"en": "Look at the animal and say its name. This is a long word!", "fr": "Regardez l'animal et dites son nom. C'est un mot long!"},
659
- "image_url": "https://images.unsplash.com/photo-1517840933437-c41356892b35?w=400&h=400&fit=crop"
660
- },
661
-
662
- # =========================================================================
663
- # VISUAL LEARNING - Actions
664
- # =========================================================================
665
- {
666
- "id": "action-001",
667
- "type": "action",
668
- "category": "visual_learning",
669
- "subcategory": "action",
670
- "difficulty": "easy",
671
- "title": {"en": "Running", "fr": "Courir"},
672
- "target_text": {"en": "Running", "fr": "Courir"},
673
- "instructions": {"en": "Look at the action and say what the person is doing.", "fr": "Regardez l'action et dites ce que fait la personne."},
674
- "image_url": "https://images.unsplash.com/photo-1552674605-db6ffd4facb5?w=400&h=400&fit=crop"
675
- },
676
- {
677
- "id": "action-002",
678
- "type": "action",
679
- "category": "visual_learning",
680
- "subcategory": "action",
681
- "difficulty": "easy",
682
- "title": {"en": "Eating", "fr": "Manger"},
683
- "target_text": {"en": "Eating", "fr": "Manger"},
684
- "instructions": {"en": "Look at the action and say what the person is doing.", "fr": "Regardez l'action et dites ce que fait la personne."},
685
- "image_url": "https://images.unsplash.com/photo-1504674900247-0877df9cc836?w=400&h=400&fit=crop"
686
- },
687
- {
688
- "id": "action-003",
689
- "type": "action",
690
- "category": "visual_learning",
691
- "subcategory": "action",
692
- "difficulty": "easy",
693
- "title": {"en": "Sleeping", "fr": "Dormir"},
694
- "target_text": {"en": "Sleeping", "fr": "Dormir"},
695
- "instructions": {"en": "Look at the action and say what the person is doing.", "fr": "Regardez l'action et dites ce que fait la personne."},
696
- "image_url": "https://images.unsplash.com/photo-1541781774459-bb2af2f05b55?w=400&h=400&fit=crop"
697
- },
698
- {
699
- "id": "action-004",
700
- "type": "action",
701
- "category": "visual_learning",
702
- "subcategory": "action",
703
- "difficulty": "easy",
704
- "title": {"en": "Reading", "fr": "Lire"},
705
- "target_text": {"en": "Reading", "fr": "Lire"},
706
- "instructions": {"en": "Look at the action and say what the person is doing.", "fr": "Regardez l'action et dites ce que fait la personne."},
707
- "image_url": "https://images.unsplash.com/photo-1506880018603-83d5b814b5a6?w=400&h=400&fit=crop"
708
- },
709
- {
710
- "id": "action-005",
711
- "type": "action",
712
- "category": "visual_learning",
713
- "subcategory": "action",
714
- "difficulty": "medium",
715
- "title": {"en": "Swimming", "fr": "Nager"},
716
- "target_text": {"en": "Swimming", "fr": "Nager"},
717
- "instructions": {"en": "Look at the action and say what the person is doing.", "fr": "Regardez l'action et dites ce que fait la personne."},
718
- "image_url": "https://images.unsplash.com/photo-1530549387789-4c1017266635?w=400&h=400&fit=crop"
719
- },
720
-
721
- # =========================================================================
722
- # SOUND IMITATION - Animal Sounds
723
- # =========================================================================
724
- {
725
- "id": "asound-001",
726
- "type": "animal_sound",
727
- "category": "sound_imitation",
728
- "subcategory": "animal_sound",
729
- "difficulty": "easy",
730
- "title": {"en": "Dog Sound", "fr": "Son du Chien"},
731
- "target_text": {"en": "Woof woof", "fr": "Ouaf ouaf"},
732
- "instructions": {"en": "Imitate the sound a dog makes!", "fr": "Imitez le son que fait un chien!"},
733
- "image_url": "https://images.unsplash.com/photo-1587300003388-59208cc962cb?w=400&h=400&fit=crop"
734
- },
735
- {
736
- "id": "asound-002",
737
- "type": "animal_sound",
738
- "category": "sound_imitation",
739
- "subcategory": "animal_sound",
740
- "difficulty": "easy",
741
- "title": {"en": "Cat Sound", "fr": "Son du Chat"},
742
- "target_text": {"en": "Meow meow", "fr": "Miaou miaou"},
743
- "instructions": {"en": "Imitate the sound a cat makes!", "fr": "Imitez le son que fait un chat!"},
744
- "image_url": "https://images.unsplash.com/photo-1514888286974-6c03e2ca1dba?w=400&h=400&fit=crop"
745
- },
746
- {
747
- "id": "asound-003",
748
- "type": "animal_sound",
749
- "category": "sound_imitation",
750
- "subcategory": "animal_sound",
751
- "difficulty": "easy",
752
- "title": {"en": "Cow Sound", "fr": "Son de la Vache"},
753
- "target_text": {"en": "Moo moo", "fr": "Meuh meuh"},
754
- "instructions": {"en": "Imitate the sound a cow makes!", "fr": "Imitez le son que fait une vache!"},
755
- "image_url": "https://images.unsplash.com/photo-1570042225831-d98fa7577f1e?w=400&h=400&fit=crop"
756
- },
757
- {
758
- "id": "asound-004",
759
- "type": "animal_sound",
760
- "category": "sound_imitation",
761
- "subcategory": "animal_sound",
762
- "difficulty": "easy",
763
- "title": {"en": "Duck Sound", "fr": "Son du Canard"},
764
- "target_text": {"en": "Quack quack", "fr": "Coin coin"},
765
- "instructions": {"en": "Imitate the sound a duck makes!", "fr": "Imitez le son que fait un canard!"},
766
- "image_url": "https://images.unsplash.com/photo-1459682687441-7761439a709d?w=400&h=400&fit=crop"
767
- },
768
- {
769
- "id": "asound-005",
770
- "type": "animal_sound",
771
- "category": "sound_imitation",
772
- "subcategory": "animal_sound",
773
- "difficulty": "medium",
774
- "title": {"en": "Lion Sound", "fr": "Son du Lion"},
775
- "target_text": {"en": "Roar!", "fr": "Grrrr!"},
776
- "instructions": {"en": "Imitate the sound a lion makes! Be loud!", "fr": "Imitez le son que fait un lion! Soyez fort!"},
777
- "image_url": "https://images.unsplash.com/photo-1546182990-dffeafbe841d?w=400&h=400&fit=crop"
778
- },
779
- {
780
- "id": "asound-006",
781
- "type": "animal_sound",
782
- "category": "sound_imitation",
783
- "subcategory": "animal_sound",
784
- "difficulty": "medium",
785
- "title": {"en": "Snake Sound", "fr": "Son du Serpent"},
786
- "target_text": {"en": "Sssssss", "fr": "Sssssss"},
787
- "instructions": {"en": "Imitate the hissing sound a snake makes!", "fr": "Imitez le sifflement que fait un serpent!"},
788
- "image_url": "https://images.unsplash.com/photo-1531386151447-fd76ad50012f?w=400&h=400&fit=crop"
789
- },
790
-
791
- # =========================================================================
792
- # SOUND IMITATION - Environmental Sounds
793
- # =========================================================================
794
- {
795
- "id": "esound-001",
796
- "type": "environmental_sound",
797
- "category": "sound_imitation",
798
- "subcategory": "environmental_sound",
799
- "difficulty": "easy",
800
- "title": {"en": "Car Horn", "fr": "Klaxon"},
801
- "target_text": {"en": "Beep beep", "fr": "Pouet pouet"},
802
- "instructions": {"en": "Imitate the sound of a car horn!", "fr": "Imitez le son d'un klaxon!"},
803
- "image_url": "https://images.unsplash.com/photo-1502877338535-766e1452684a?w=400&h=400&fit=crop"
804
- },
805
- {
806
- "id": "esound-002",
807
- "type": "environmental_sound",
808
- "category": "sound_imitation",
809
- "subcategory": "environmental_sound",
810
- "difficulty": "easy",
811
- "title": {"en": "Train Sound", "fr": "Son du Train"},
812
- "target_text": {"en": "Choo choo", "fr": "Tchou tchou"},
813
- "instructions": {"en": "Imitate the sound of a train!", "fr": "Imitez le son d'un train!"},
814
- "image_url": "https://images.unsplash.com/photo-1474487548417-781cb71495f3?w=400&h=400&fit=crop"
815
- },
816
- {
817
- "id": "esound-003",
818
- "type": "environmental_sound",
819
- "category": "sound_imitation",
820
- "subcategory": "environmental_sound",
821
- "difficulty": "easy",
822
- "title": {"en": "Clock Sound", "fr": "Son de l'Horloge"},
823
- "target_text": {"en": "Tick tock tick tock", "fr": "Tic tac tic tac"},
824
- "instructions": {"en": "Imitate the sound of a clock!", "fr": "Imitez le son d'une horloge!"},
825
- "image_url": "https://images.unsplash.com/photo-1563861826100-9cb868fdbe1c?w=400&h=400&fit=crop"
826
- },
827
- {
828
- "id": "esound-004",
829
- "type": "environmental_sound",
830
- "category": "sound_imitation",
831
- "subcategory": "environmental_sound",
832
- "difficulty": "medium",
833
- "title": {"en": "Rain Sound", "fr": "Son de la Pluie"},
834
- "target_text": {"en": "Pitter patter pitter patter", "fr": "Plic ploc plic ploc"},
835
- "instructions": {"en": "Imitate the sound of rain falling!", "fr": "Imitez le son de la pluie qui tombe!"},
836
- "image_url": "https://images.unsplash.com/photo-1519692933481-e162a57d6721?w=400&h=400&fit=crop"
837
- },
838
- {
839
- "id": "esound-005",
840
- "type": "environmental_sound",
841
- "category": "sound_imitation",
842
- "subcategory": "environmental_sound",
843
- "difficulty": "medium",
844
- "title": {"en": "Wind Sound", "fr": "Son du Vent"},
845
- "target_text": {"en": "Whoooosh", "fr": "Woooosh"},
846
- "instructions": {"en": "Imitate the sound of strong wind!", "fr": "Imitez le son du vent fort!"},
847
- "image_url": "https://images.unsplash.com/photo-1534088568595-a066f410bcda?w=400&h=400&fit=crop"
848
- },
849
  ]
850
 
851
 
@@ -872,8 +307,6 @@ def get_all_exercises(language: str = "en") -> List[Dict]:
872
  "title": ex["title"].get(language, ex["title"]["en"]),
873
  "target_text": ex["target_text"].get(language, ex["target_text"]["en"]),
874
  "instructions": ex["instructions"].get(language, ex["instructions"]["en"]),
875
- "image_url": ex.get("image_url"),
876
- "audio_url": ex.get("audio_url"),
877
  "phoneme_focus": ex.get("phoneme_focus", [])
878
  })
879
  return exercises
 
1
  """
2
+ Speech Therapy Exercises Database
3
+ Categories: Fundamentals, Speech Practice
4
  Languages: English (en), French (fr)
5
  """
6
 
 
11
 
12
  class ExerciseType(str, Enum):
13
  # Fundamentals
 
 
14
  PHONEME = "phoneme"
15
 
16
  # Speech Practice
 
18
  SENTENCE_READING = "sentence_reading"
19
  TONGUE_TWISTER = "tongue_twister"
20
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  class Difficulty(str, Enum):
23
  EASY = "easy"
 
27
 
28
  class Category(BaseModel):
29
  id: str
30
+ name: Dict[str, str]
31
  description: Dict[str, str]
32
  icon: str
33
  subcategories: List[str]
 
42
  title: Dict[str, str]
43
  target_text: Dict[str, str]
44
  instructions: Dict[str, str]
 
 
45
  phoneme_focus: Optional[List[str]] = None
46
 
47
 
 
54
  "id": "fundamentals",
55
  "name": {"en": "Fundamentals", "fr": "Fondamentaux"},
56
  "description": {
57
+ "en": "Master specific sounds and phonemes",
58
+ "fr": "Maîtrisez des sons et phonèmes spécifiques"
59
  },
60
  "icon": "🎯",
61
+ "subcategories": ["phoneme"]
62
  },
63
  {
64
  "id": "speech_practice",
 
69
  },
70
  "icon": "📖",
71
  "subcategories": ["word_repetition", "sentence_reading", "tongue_twister"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  }
73
  ]
74
 
75
  SUBCATEGORIES: Dict[str, Dict] = {
 
 
 
 
 
 
 
 
76
  "phoneme": {
77
  "name": {"en": "Phoneme Practice", "fr": "Pratique des Phonèmes"},
78
  "description": {"en": "Master specific sounds like R, S, TH", "fr": "Maîtrisez des sons spécifiques comme R, S, CH"}
 
88
  "tongue_twister": {
89
  "name": {"en": "Tongue Twisters", "fr": "Virelangues"},
90
  "description": {"en": "Challenge yourself with tricky phrases", "fr": "Défiez-vous avec des phrases difficiles"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  }
92
  }
93
 
 
97
  # =============================================================================
98
 
99
  EXERCISES: List[Dict] = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # =========================================================================
101
  # FUNDAMENTALS - Phoneme Practice
102
  # =========================================================================
 
281
  "instructions": {"en": "Focus on the 'N' and 'Y' sounds.", "fr": "Concentrez-vous sur les sons 'P' et 'N'."},
282
  "phoneme_focus": ["N", "Y"]
283
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  ]
285
 
286
 
 
307
  "title": ex["title"].get(language, ex["title"]["en"]),
308
  "target_text": ex["target_text"].get(language, ex["target_text"]["en"]),
309
  "instructions": ex["instructions"].get(language, ex["instructions"]["en"]),
 
 
310
  "phoneme_focus": ex.get("phoneme_focus", [])
311
  })
312
  return exercises
api/endpoints/v1/processing/therapy_asr.py CHANGED
@@ -15,6 +15,41 @@ from dataclasses import dataclass
15
 
16
  from api.config import settings
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  if settings.ENVIRONMENT == "development":
19
  logging.basicConfig(level=logging.DEBUG)
20
  else:
@@ -23,9 +58,10 @@ else:
23
 
24
  class ASREngine(str, Enum):
25
  """Available ASR engines."""
26
- WHISPER_LOCAL = "whisper_local"
 
27
  SPEECHBRAIN = "speechbrain"
28
- WHISPER_API = "whisper_api"
29
  AUTO = "auto" # Automatically select based on user profile
30
 
31
 
@@ -67,7 +103,8 @@ class TherapyASR:
67
  import torch
68
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
69
 
70
- model_name = "openai/whisper-base" # Start with base, upgrade as needed
 
71
  logging.info(f"Loading local Whisper model: {model_name}")
72
 
73
  self._whisper_processor = WhisperProcessor.from_pretrained(model_name)
@@ -89,6 +126,9 @@ class TherapyASR:
89
  """Lazy load SpeechBrain model for atypical speech."""
90
  if self._speechbrain_model is None:
91
  try:
 
 
 
92
  import speechbrain as sb
93
 
94
  # Use pre-trained model, can be swapped for fine-tuned version
@@ -120,8 +160,10 @@ class TherapyASR:
120
  if user_profile.get("privacy_mode") == "local":
121
  return ASREngine.WHISPER_LOCAL
122
 
123
- # Default to API for best accuracy
124
- return ASREngine.WHISPER_API
 
 
125
 
126
  def transcribe(
127
  self,
@@ -147,15 +189,15 @@ class TherapyASR:
147
  selected_engine = engine or self._select_engine(user_profile)
148
  logging.info(f"Transcribing with engine: {selected_engine.value}")
149
 
150
- # Try selected engine with fallback chain
151
  fallback_order = [selected_engine]
152
- if selected_engine != ASREngine.WHISPER_API:
153
- fallback_order.append(ASREngine.WHISPER_API)
154
 
155
  last_error = None
156
  for eng in fallback_order:
157
  try:
158
- if eng == ASREngine.WHISPER_API:
 
 
159
  return self._transcribe_whisper_api(audio_data, filename, content_type)
160
  elif eng == ASREngine.WHISPER_LOCAL:
161
  return self._transcribe_whisper_local(audio_data)
@@ -202,6 +244,51 @@ class TherapyASR:
202
  word_timestamps=word_timestamps
203
  )
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def _transcribe_whisper_local(self, audio_data: bytes) -> TranscriptionResult:
206
  """Transcribe using local Whisper model."""
207
  logging.info("Transcribing with local Whisper")
@@ -212,8 +299,62 @@ class TherapyASR:
212
 
213
  model = self._get_whisper_local()
214
 
215
- # Load audio from bytes
216
- audio_array, sr = librosa.load(io.BytesIO(audio_data), sr=16000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  # Process audio
219
  input_features = self._whisper_processor(
 
15
 
16
  from api.config import settings
17
 
18
+
19
def _patch_torchaudio_for_speechbrain():
    """
    Install a stand-in ``torchaudio.list_audio_backends`` when it is missing.

    SpeechBrain (1.0.3, per the original note) calls
    ``torchaudio.list_audio_backends()``. When the installed torchaudio does
    not expose that attribute, a replacement is attached to the module so the
    call succeeds. Nothing happens if torchaudio cannot be imported or already
    provides the function.
    """
    try:
        import torchaudio
    except ImportError:
        # torchaudio is absent; the later SpeechBrain import reports that properly.
        return

    if hasattr(torchaudio, 'list_audio_backends'):
        return

    def list_audio_backends():
        """Compatibility shim: list the backend names that look usable."""
        available = []

        try:
            import soundfile  # noqa: F401 - imported only to probe availability
        except ImportError:
            pass
        else:
            available.append('soundfile')

        try:
            # Probe for the sox backend module
            import torchaudio.backend.sox_io_backend  # noqa: F401
        except (ImportError, OSError):
            pass
        else:
            available.append('sox')

        # ffmpeg is reported unconditionally (assumed to be available)
        available.append('ffmpeg')
        return available

    torchaudio.list_audio_backends = list_audio_backends
    logging.debug("Patched torchaudio.list_audio_backends for SpeechBrain compatibility")
52
+
53
  if settings.ENVIRONMENT == "development":
54
  logging.basicConfig(level=logging.DEBUG)
55
  else:
 
58
 
59
class ASREngine(str, Enum):
    """Identifiers for the speech-recognition backends this module can use."""

    # Whisper large-v3 served by the Groq API (fast, free; recommended).
    # Despite the name, the matching transcriber talks to Groq.
    WHISPER_JAX = "whisper_jax"

    # whisper-large-v3 loaded in-process (slow on CPU)
    WHISPER_LOCAL = "whisper_local"

    SPEECHBRAIN = "speechbrain"

    # OpenAI's paid hosted API
    WHISPER_API = "whisper_api"

    # Choose an engine automatically from the user profile
    AUTO = "auto"
66
 
67
 
 
103
  import torch
104
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
105
 
106
+ # Use large-v3 for best accuracy (cached after first load)
107
+ model_name = "openai/whisper-large-v3"
108
  logging.info(f"Loading local Whisper model: {model_name}")
109
 
110
  self._whisper_processor = WhisperProcessor.from_pretrained(model_name)
 
126
  """Lazy load SpeechBrain model for atypical speech."""
127
  if self._speechbrain_model is None:
128
  try:
129
+ # Apply torchaudio compatibility patch before importing speechbrain
130
+ _patch_torchaudio_for_speechbrain()
131
+
132
  import speechbrain as sb
133
 
134
  # Use pre-trained model, can be swapped for fine-tuned version
 
160
  if user_profile.get("privacy_mode") == "local":
161
  return ASREngine.WHISPER_LOCAL
162
 
163
+ # Default to Groq (fast, free) if API key available, else local
164
+ if settings.GROQ_API_KEY:
165
+ return ASREngine.WHISPER_JAX
166
+ return ASREngine.WHISPER_LOCAL
167
 
168
  def transcribe(
169
  self,
 
189
  selected_engine = engine or self._select_engine(user_profile)
190
  logging.info(f"Transcribing with engine: {selected_engine.value}")
191
 
192
+ # Use selected engine only (no fallback to paid API)
193
  fallback_order = [selected_engine]
 
 
194
 
195
  last_error = None
196
  for eng in fallback_order:
197
  try:
198
+ if eng == ASREngine.WHISPER_JAX:
199
+ return self._transcribe_whisper_jax(audio_data)
200
+ elif eng == ASREngine.WHISPER_API:
201
  return self._transcribe_whisper_api(audio_data, filename, content_type)
202
  elif eng == ASREngine.WHISPER_LOCAL:
203
  return self._transcribe_whisper_local(audio_data)
 
244
  word_timestamps=word_timestamps
245
  )
246
 
247
def _transcribe_whisper_jax(self, audio_data: bytes) -> TranscriptionResult:
    """
    Transcribe audio with Whisper large-v3 through the Groq API.

    Despite the ``jax`` in the name, this method sends the audio to Groq.

    Args:
        audio_data: Raw bytes of the recording. They are uploaded under a
            ``.webm`` filename, as the original implementation did.

    Returns:
        TranscriptionResult holding the stripped transcript,
        ``ASREngine.WHISPER_JAX`` as the engine, the language when the
        response reports one, and word timestamps when the response
        includes a non-empty ``words`` attribute (otherwise ``None``).

    Raises:
        RuntimeError: If ``settings.GROQ_API_KEY`` is empty.
    """
    logging.info("Transcribing with Groq Whisper (free, fast)")

    # Validate configuration before importing the SDK so a missing key always
    # produces the actionable message below.
    if not settings.GROQ_API_KEY:
        raise RuntimeError("GROQ_API_KEY not configured. Get free key at https://console.groq.com")

    from groq import Groq

    client = Groq(api_key=settings.GROQ_API_KEY)

    # Upload straight from memory as a (filename, bytes) tuple. The previous
    # version wrote a temp file and read the same bytes back, adding disk I/O
    # and a cleanup path for no benefit.
    transcription = client.audio.transcriptions.create(
        file=("audio.webm", audio_data),
        model="whisper-large-v3",
        temperature=0,
        response_format="verbose_json",
    )

    # Word-level timing is optional in the response
    words = getattr(transcription, 'words', None)
    word_timestamps = (
        [{"word": w.word, "start": w.start, "end": w.end} for w in words]
        if words
        else None
    )

    return TranscriptionResult(
        text=transcription.text.strip(),
        engine_used=ASREngine.WHISPER_JAX,
        language=getattr(transcription, 'language', None),
        word_timestamps=word_timestamps
    )
291
+
292
  def _transcribe_whisper_local(self, audio_data: bytes) -> TranscriptionResult:
293
  """Transcribe using local Whisper model."""
294
  logging.info("Transcribing with local Whisper")
 
299
 
300
  model = self._get_whisper_local()
301
 
302
+ # Try to load audio - if it fails (e.g., webm format), convert with PyAV
303
+ try:
304
+ audio_array, sr = librosa.load(io.BytesIO(audio_data), sr=16000)
305
+ except Exception as e:
306
+ logging.warning(f"librosa.load failed: {e}, trying PyAV conversion")
307
+ # Convert using PyAV (bundled ffmpeg libraries)
308
+ import av
309
+ import tempfile
310
+ import os
311
+
312
+ # PyAV needs a file for some formats like webm
313
+ with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as f:
314
+ f.write(audio_data)
315
+ temp_path = f.name
316
+
317
+ try:
318
+ container = av.open(temp_path)
319
+ audio_frames = []
320
+ original_sr = 48000 # Default sample rate
321
+
322
+ # Get sample rate before decoding
323
+ if container.streams.audio:
324
+ original_sr = container.streams.audio[0].rate
325
+ logging.info(f"Audio sample rate: {original_sr}")
326
+
327
+ for frame in container.decode(audio=0):
328
+ # Convert to numpy array
329
+ array = frame.to_ndarray()
330
+ # If stereo, take mean to mono
331
+ if array.ndim > 1:
332
+ array = array.mean(axis=0)
333
+ audio_frames.append(array)
334
+
335
+ container.close()
336
+
337
+ if not audio_frames:
338
+ raise RuntimeError("No audio frames decoded from input")
339
+
340
+ # Concatenate all frames
341
+ audio_array = np.concatenate(audio_frames).astype(np.float32)
342
+
343
+ # Normalize to [-1, 1] range if needed
344
+ if audio_array.max() > 1.0 or audio_array.min() < -1.0:
345
+ audio_array = audio_array / 32768.0
346
+
347
+ # Resample to 16kHz if needed (Whisper expects 16kHz)
348
+ if original_sr != 16000:
349
+ # Use scipy for faster resampling (48000->16000 = 3:1 ratio)
350
+ from scipy import signal
351
+ gcd = np.gcd(16000, original_sr)
352
+ up = 16000 // gcd
353
+ down = original_sr // gcd
354
+ audio_array = signal.resample_poly(audio_array, up, down)
355
+ finally:
356
+ if os.path.exists(temp_path):
357
+ os.unlink(temp_path)
358
 
359
  # Process audio
360
  input_features = self._whisper_processor(
api/endpoints/v1/routers/therapy.py CHANGED
@@ -381,6 +381,54 @@ async def demo_get_exercise(
381
  return exercise
382
 
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  @router.post("/demo/feedback", tags=["therapy-demo"])
385
  async def demo_ai_feedback(
386
  target_text: str = Query(..., description="Text to practice"),
 
381
  return exercise
382
 
383
 
384
@router.post("/demo/transcribe", tags=["therapy-demo"])
async def demo_transcribe_audio(
    file: UploadFile = File(...),
    engine: Optional[ASREngine] = Query(None, description="ASR engine"),
):
    """
    [DEMO] Transcribe audio without auth - for testing Whisper/SpeechBrain.

    Uses the same therapy-optimized ASR as authenticated endpoint.
    """
    # NOTE: the docstring above is kept verbatim because FastAPI publishes it
    # as the endpoint description.
    #
    # Responses: 400 for a disallowed content type, an oversized upload, or an
    # empty upload; 500 when transcription raises. On success, a dict with
    # "text", "transcription" (alias), "engine_used", "confidence" and
    # "word_timestamps".
    logging.info(f"Demo transcription request - file: {file.filename}, type: {file.content_type}")

    # Reject unsupported uploads before reading the payload
    if file.content_type not in ALLOWED_AUDIO_TYPES:
        raise HTTPException(status_code=400, detail=f"Invalid audio file type: {file.content_type}. Allowed: {ALLOWED_AUDIO_TYPES}")

    contents = await file.read()
    if len(contents) > FILE_SIZE_LIMIT:
        raise HTTPException(status_code=400, detail="File size exceeds 25 MB limit")

    if len(contents) == 0:
        raise HTTPException(status_code=400, detail="Empty audio file")

    logging.info(f"Processing audio: {len(contents)} bytes")

    try:
        result = transcribe_for_therapy(
            audio_data=contents,
            filename=file.filename or "audio.webm",
            content_type=file.content_type or "audio/webm",
            engine=engine
        )

        logging.info(f"Transcription result: {result.text}")

        return {
            "text": result.text,
            "transcription": result.text,  # Alias for compatibility
            "engine_used": result.engine_used.value,
            "confidence": result.confidence,
            "word_timestamps": result.word_timestamps
        }

    except Exception as e:
        # logging.exception records the traceback, which logging.error dropped;
        # chaining keeps the root cause attached to the HTTP error.
        logging.exception(f"Demo transcription failed: {e}")
        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}") from e
430
+
431
+
432
  @router.post("/demo/feedback", tags=["therapy-demo"])
433
  async def demo_ai_feedback(
434
  target_text: str = Query(..., description="Text to practice"),
requirements.txt CHANGED
@@ -8,3 +8,12 @@ python-multipart
8
  openai
9
  httpx
10
  requests
 
 
 
 
 
 
 
 
 
 
8
  openai
9
  httpx
10
  requests
11
+ # Speech recognition - Whisper large-v3 (free, open-source from HuggingFace)
12
+ transformers
13
+ librosa
14
+ torch
15
+ soundfile
16
+ av # PyAV for webm/opus audio decoding
17
+ scipy # Fast audio resampling
18
+ huggingface_hub # For free Whisper API
19
+ groq # Groq API - FREE fast Whisper large-v3