"""
Pronunciation Analysis Module - Speech clarity and pronunciation feedback.

Provides:
- Pronunciation scoring (word-level edit-distance similarity)
- Clarity assessment
- Pace analysis
- Per-word feedback
- Improvement suggestions

Scoring in this module is text based: the ASR transcript is aligned with the
target text word by word. No phoneme-level alignment is performed here.
"""

import difflib
import io
import logging
import unicodedata
from typing import Optional, List
from dataclasses import dataclass, field
from enum import Enum

from api.config import settings

if settings.ENVIRONMENT == "development":
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.WARNING)

logger = logging.getLogger(__name__)


def _strip_edge_punctuation(token: str) -> str:
    """Return *token* without leading/trailing punctuation characters.

    Only characters in a Unicode punctuation category (``P*``) are removed,
    and only at the edges, so "don't" and "well-known" stay intact while
    "hello," becomes "hello".
    """
    start, end = 0, len(token)
    while start < end and unicodedata.category(token[start]).startswith("P"):
        start += 1
    while end > start and unicodedata.category(token[end - 1]).startswith("P"):
        end -= 1
    return token[start:end]


def _tokenize(text: str) -> List[str]:
    """Split *text* on whitespace into words, dropping edge punctuation.

    Tokens that consist solely of punctuation are discarded.
    """
    stripped = (_strip_edge_punctuation(raw) for raw in text.split())
    return [token for token in stripped if token]


class ErrorType(str, Enum):
    """Types of pronunciation errors."""

    SUBSTITUTION = "substitution"  # Wrong sound
    OMISSION = "omission"  # Missing sound
    ADDITION = "addition"  # Extra sound
    DISTORTION = "distortion"  # Unclear sound


@dataclass
class PhonemeError:
    """Individual pronunciation error.

    As produced by ``PronunciationAnalyzer``, each error describes a whole
    word (``expected``/``actual`` hold words, not single phonemes).
    """

    word: str
    # Index of the word: the target-word index, or the transcript-word index
    # for ADDITION errors raised for extra spoken words.
    position: int
    expected: str
    actual: Optional[str]
    error_type: ErrorType
    suggestion: str


@dataclass
class WordScore:
    """Per-word pronunciation score."""

    word: str
    score: float  # 0-100
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    errors: List[PhonemeError] = field(default_factory=list)


@dataclass
class AIFeedback:
    """AI-generated personalized feedback."""

    feedback: str
    encouragement: str
    specific_tips: List[str]
    recommended_exercises: List[str]
    difficulty_adjustment: Optional[str] = None  # "easier", "same", "harder"


@dataclass
class PronunciationFeedback:
    """Complete pronunciation analysis result."""

    overall_score: float  # 0-100
    clarity_score: float  # 0-100
    pace_score: float  # 0-100
    fluency_score: float  # 0-100
    word_scores: List[WordScore]
    suggestions: List[str]
    phoneme_errors: List[PhonemeError]
    transcription: str
    target_text: str
    # Not set by PronunciationAnalyzer.analyze(); stays None unless a caller
    # fills it in.
    duration_seconds: Optional[float] = None
    ai_feedback: Optional[AIFeedback] = None  # GPT-4o powered feedback


class PronunciationAnalyzer:
    """
    Analyze pronunciation against target text.

    Transcribes the user's audio with the therapy ASR, aligns the transcript
    with the expected text word by word, and derives scores and suggestions
    from the differences. Optionally asks the AI feedback generator for
    personalized feedback.
    """

    def __init__(self):
        # Both collaborators are created on first use by the getters below.
        self._asr = None
        self._ai_feedback = None

    def _get_ai_feedback_generator(self):
        """Get AI feedback generator instance (created on first call)."""
        if self._ai_feedback is None:
            # Imported here, so the module is only loaded on first use.
            from api.endpoints.v1.processing.ai_feedback import (
                get_ai_feedback_generator,
            )

            self._ai_feedback = get_ai_feedback_generator()
        return self._ai_feedback

    def _get_asr(self):
        """Get ASR instance for transcription (created on first call)."""
        if self._asr is None:
            # Imported here, so the module is only loaded on first use.
            from api.endpoints.v1.processing.therapy_asr import get_therapy_asr

            self._asr = get_therapy_asr()
        return self._asr

    async def analyze(
        self,
        audio_bytes: bytes,
        target_text: str,
        user_baseline: Optional[dict] = None,
        user_context: Optional[dict] = None,
        include_ai_feedback: bool = True
    ) -> PronunciationFeedback:
        """
        Analyze pronunciation of audio against target text.

        Args:
            audio_bytes: User's recorded audio
            target_text: Expected text/phrase
            user_baseline: Optional baseline metrics for comparison
                (accepted for API compatibility; not used by this method)
            user_context: Optional user profile, forwarded to the AI
                feedback generator
            include_ai_feedback: Whether to generate AI feedback

        Returns:
            PronunciationFeedback with scores, suggestions, and AI feedback.
            ``ai_feedback`` is None when disabled or when generation fails.

        Raises:
            Any exception raised by the ASR ``transcribe`` call propagates;
            only AI feedback generation is treated as best-effort.
        """
        logger.info("Analyzing pronunciation for target: %s", target_text)

        # 1. Transcribe the audio
        # NOTE(review): transcribe() is called synchronously inside this
        # coroutine; if it is slow it will block the event loop - confirm
        # whether it should be offloaded to a worker thread.
        asr = self._get_asr()
        result = asr.transcribe(audio_bytes)

        # Treat a missing transcript (None) as empty speech rather than
        # crashing on ``None.strip()``.
        transcription = (result.text or "").strip().lower()
        target_clean = target_text.strip().lower()

        logger.debug("Transcription: %s", transcription)
        logger.debug("Target: %s", target_clean)

        # 2. Compare transcription to target
        word_scores, phoneme_errors = self._compare_texts(
            transcription, target_clean
        )

        # 3. Calculate scores
        overall_score = self._calculate_overall_score(word_scores)
        clarity_score = self._calculate_clarity_score(word_scores, phoneme_errors)
        pace_score = self._calculate_pace_score(result.word_timestamps)
        fluency_score = self._calculate_fluency_score(transcription, target_clean)

        # 4. Generate rule-based suggestions
        suggestions = self._generate_suggestions(phoneme_errors, word_scores)

        # 5. Generate AI-powered feedback (best-effort)
        ai_feedback = None
        if include_ai_feedback:
            try:
                ai_generator = self._get_ai_feedback_generator()

                # Convert phoneme errors to dict format for AI
                errors_dict = [
                    {
                        "word": e.word,
                        "expected": e.expected,
                        "actual": e.actual,
                        "error_type": e.error_type.value,
                    }
                    for e in phoneme_errors
                ]

                ai_result = await ai_generator.generate_feedback(
                    target_text=target_text,
                    transcription=transcription,
                    overall_score=overall_score,
                    clarity_score=clarity_score,
                    pace_score=pace_score,
                    fluency_score=fluency_score,
                    errors=errors_dict,
                    user_context=user_context
                )

                ai_feedback = AIFeedback(
                    feedback=ai_result.feedback,
                    encouragement=ai_result.encouragement,
                    specific_tips=ai_result.specific_tips,
                    recommended_exercises=ai_result.recommended_exercises,
                    difficulty_adjustment=ai_result.difficulty_adjustment
                )
                logger.info("AI feedback generated successfully")
            except Exception as e:
                # Deliberate broad catch: the rule-based result is still
                # useful without AI feedback. The traceback is logged so the
                # failure is not silent.
                logger.warning(
                    "AI feedback generation failed: %s", e, exc_info=True
                )
                ai_feedback = None

        return PronunciationFeedback(
            overall_score=overall_score,
            clarity_score=clarity_score,
            pace_score=pace_score,
            fluency_score=fluency_score,
            word_scores=word_scores,
            suggestions=suggestions,
            phoneme_errors=phoneme_errors,
            transcription=transcription,
            target_text=target_text,
            ai_feedback=ai_feedback
        )

    def _compare_texts(
        self,
        transcription: str,
        target: str
    ) -> tuple[List[WordScore], List[PhonemeError]]:
        """Compare transcribed text to target text.

        Both texts are tokenized into words (edge punctuation removed) and
        aligned with ``difflib.SequenceMatcher`` so that a single missing or
        extra word does not shift every following word onto the wrong
        target.

        Args:
            transcription: Lower-cased ASR transcript
            target: Lower-cased expected text

        Returns:
            ``(word_scores, phoneme_errors)``: one WordScore per target word
            in target order, and every error found (including ADDITION
            errors for extra spoken words, which have no WordScore).
        """
        trans_words = _tokenize(transcription)
        target_words = _tokenize(target)

        word_scores: List[WordScore] = []
        phoneme_errors: List[PhonemeError] = []

        # autojunk=False: never let difflib discard frequent words as "junk".
        matcher = difflib.SequenceMatcher(
            a=target_words, b=trans_words, autojunk=False
        )

        for tag, t_start, t_end, s_start, s_end in matcher.get_opcodes():
            if tag == "equal":
                word_scores.extend(
                    WordScore(word=word, score=100.0)
                    for word in target_words[t_start:t_end]
                )
                continue

            # "replace" / "delete" / "insert": pair words positionally inside
            # the differing span; whatever is left over on either side is an
            # omission (target) or an addition (transcript).
            paired = min(t_end - t_start, s_end - s_start)

            for offset in range(paired):
                t_idx = t_start + offset
                target_word = target_words[t_idx]
                score, errors = self._compare_words(
                    target_word, trans_words[s_start + offset], t_idx
                )
                word_scores.append(WordScore(
                    word=target_word,
                    score=score,
                    errors=errors
                ))
                phoneme_errors.extend(errors)

            for t_idx in range(t_start + paired, t_end):
                # Missing word
                target_word = target_words[t_idx]
                omission = PhonemeError(
                    word=target_word,
                    position=t_idx,
                    expected=target_word,
                    actual=None,
                    error_type=ErrorType.OMISSION,
                    suggestion=f"Try to include the word '{target_word}'"
                )
                word_scores.append(WordScore(
                    word=target_word,
                    score=0.0,
                    errors=[omission]
                ))
                phoneme_errors.append(omission)

            for s_idx in range(s_start + paired, s_end):
                # Extra word in transcription
                trans_word = trans_words[s_idx]
                phoneme_errors.append(PhonemeError(
                    word=trans_word,
                    position=s_idx,
                    expected="",
                    actual=trans_word,
                    error_type=ErrorType.ADDITION,
                    suggestion=f"Extra word '{trans_word}' detected"
                ))

        return word_scores, phoneme_errors

    def _compare_words(
        self,
        target_word: str,
        trans_word: str,
        position: int
    ) -> tuple[float, List[PhonemeError]]:
        """Compare two words and return score and errors.

        Args:
            target_word: Expected word
            trans_word: Word heard by the ASR
            position: Index of the target word, stored on the error

        Returns:
            ``(score, errors)`` where score is 0-100. An exact match yields
            ``(100.0, [])``; any other pair yields exactly one error.
        """
        # Exact match
        if target_word == trans_word:
            return 100.0, []

        # Calculate similarity (normalized Levenshtein)
        score = self._word_similarity(target_word, trans_word) * 100

        # Classify the error by comparing word lengths (a heuristic)
        if len(trans_word) > len(target_word):
            error_type = ErrorType.ADDITION
            suggestion = f"'{trans_word}' has extra sounds, expected '{target_word}'"
        elif len(trans_word) < len(target_word):
            error_type = ErrorType.OMISSION
            suggestion = f"Some sounds missing in '{trans_word}', expected '{target_word}'"
        else:
            error_type = ErrorType.SUBSTITUTION
            suggestion = f"'{trans_word}' should be '{target_word}'"

        # Differing words always have edit distance >= 1, so the score is
        # always below 100 here and an error is always reported.
        error = PhonemeError(
            word=target_word,
            position=position,
            expected=target_word,
            actual=trans_word,
            error_type=error_type,
            suggestion=suggestion
        )
        return score, [error]

    def _word_similarity(self, word1: str, word2: str) -> float:
        """Calculate similarity between two strings (0-1).

        The result is ``1 - levenshtein(word1, word2) / max(len)``. Equal
        strings (including two empty strings) give 1.0; if exactly one is
        empty the result is 0.0.
        """
        if word1 == word2:
            return 1.0

        len1, len2 = len(word1), len(word2)
        if len1 == 0 or len2 == 0:
            return 0.0

        # Levenshtein distance keeping only the previous row of the DP table:
        # same result as the full matrix with O(len2) memory.
        previous = list(range(len2 + 1))
        for i, ch1 in enumerate(word1, start=1):
            current = [i]
            for j, ch2 in enumerate(word2, start=1):
                cost = 0 if ch1 == ch2 else 1
                current.append(min(
                    previous[j] + 1,         # deletion
                    current[j - 1] + 1,      # insertion
                    previous[j - 1] + cost   # substitution
                ))
            previous = current

        distance = previous[len2]
        return 1.0 - (distance / max(len1, len2))

    def _calculate_overall_score(self, word_scores: List[WordScore]) -> float:
        """Calculate overall pronunciation score.

        Returns the mean of the per-word scores, or 0.0 when there are none.
        """
        if not word_scores:
            return 0.0
        return sum(ws.score for ws in word_scores) / len(word_scores)

    def _calculate_clarity_score(
        self,
        word_scores: List[WordScore],
        errors: List[PhonemeError]
    ) -> float:
        """Calculate speech clarity score.

        Starts at 100 and subtracts a fixed penalty per error depending on
        its type, never going below 0. Returns 0.0 when there are no word
        scores at all.
        """
        if not word_scores:
            return 0.0

        # Penalize based on error types
        error_penalties = {
            ErrorType.DISTORTION: 15,
            ErrorType.SUBSTITUTION: 10,
            ErrorType.OMISSION: 20,
            ErrorType.ADDITION: 5,
        }

        base_score = 100.0
        for error in errors:
            base_score -= error_penalties.get(error.error_type, 10)

        return max(0.0, base_score)

    def _calculate_pace_score(
        self,
        word_timestamps: Optional[List[dict]]
    ) -> float:
        """Calculate pace/timing score.

        Words per minute are derived from the first entry's ``"start"`` and
        the last entry's ``"end"``.

        Returns:
            100.0 for 100-150 WPM, decreasing in steps (85.0, 70.0, 50.0)
            the further the pace is from that range; 75.0 when fewer than
            two timestamps are available or the duration is not positive.
        """
        if not word_timestamps or len(word_timestamps) < 2:
            return 75.0  # Default score if no timestamps

        # Calculate words per minute
        total_duration = (
            word_timestamps[-1].get("end", 0) - word_timestamps[0].get("start", 0)
        )
        if total_duration <= 0:
            return 75.0

        wpm = (len(word_timestamps) / total_duration) * 60

        # Ideal range: 100-150 WPM for clear speech
        if 100 <= wpm <= 150:
            return 100.0
        elif 80 <= wpm < 100 or 150 < wpm <= 180:
            return 85.0
        elif 60 <= wpm < 80 or 180 < wpm <= 200:
            return 70.0
        else:
            return 50.0

    def _calculate_fluency_score(self, transcription: str, target: str) -> float:
        """Calculate fluency based on text similarity.

        Both texts are reduced to their words (edge punctuation removed,
        whitespace collapsed) before the character-level similarity is
        computed, so punctuation in the target is not counted against the
        speaker.
        """
        normalized_transcription = " ".join(_tokenize(transcription))
        normalized_target = " ".join(_tokenize(target))
        return self._word_similarity(
            normalized_transcription, normalized_target
        ) * 100

    def _generate_suggestions(
        self,
        errors: List[PhonemeError],
        word_scores: List[WordScore]
    ) -> List[str]:
        """Generate actionable improvement suggestions.

        Args:
            errors: All errors found for the utterance
            word_scores: Per-word scores for the target words

        Returns:
            At most five suggestion strings.
        """
        suggestions = []

        # Group errors by type
        error_types = {}
        for error in errors:
            error_types.setdefault(error.error_type, []).append(error)

        # Generate suggestions based on error patterns
        if ErrorType.OMISSION in error_types:
            words = [e.word for e in error_types[ErrorType.OMISSION][:3]]
            suggestions.append(
                f"Try to pronounce all sounds in: {', '.join(words)}"
            )

        if ErrorType.SUBSTITUTION in error_types:
            # A key only exists once an error was appended, so [0] is safe.
            first_substitution = error_types[ErrorType.SUBSTITUTION][0]
            suggestions.append(
                f"Focus on the correct sound in '{first_substitution.word}'"
            )

        if ErrorType.ADDITION in error_types:
            suggestions.append("Speak more clearly without adding extra sounds")

        # Low scoring words
        low_scores = [ws for ws in word_scores if ws.score < 70]
        if low_scores:
            words = [ws.word for ws in low_scores[:3]]
            suggestions.append(
                f"Practice these words: {', '.join(words)}"
            )

        # General encouragement if few errors
        if len(errors) <= 2:
            suggestions.append("Good job! Keep practicing for even better clarity.")

        return suggestions[:5]  # Limit to 5 suggestions


# Singleton instance
_analyzer_instance: Optional[PronunciationAnalyzer] = None


def get_pronunciation_analyzer() -> PronunciationAnalyzer:
    """Get or create PronunciationAnalyzer singleton."""
    global _analyzer_instance
    if _analyzer_instance is None:
        _analyzer_instance = PronunciationAnalyzer()
    return _analyzer_instance


async def analyze_pronunciation(
    audio_bytes: bytes,
    target_text: str,
    user_baseline: Optional[dict] = None,
    user_context: Optional[dict] = None,
    include_ai_feedback: bool = True
) -> PronunciationFeedback:
    """Convenience function for pronunciation analysis with AI feedback.

    Forwards all arguments to ``PronunciationAnalyzer.analyze`` on the shared
    singleton instance and returns its result.
    """
    analyzer = get_pronunciation_analyzer()
    return await analyzer.analyze(
        audio_bytes, target_text, user_baseline, user_context, include_ai_feedback
    )