| """ | |
| Pronunciation Analysis Module - Speech clarity and pronunciation feedback. | |
| Provides: | |
| - Pronunciation scoring (PCC - Percent Consonants Correct) | |
| - Clarity assessment | |
| - Pace analysis | |
| - Per-word feedback | |
| - Improvement suggestions | |
| """ | |
| import io | |
| import logging | |
| from typing import Optional, List | |
| from dataclasses import dataclass, field | |
| from enum import Enum | |
| from api.config import settings | |
| if settings.ENVIRONMENT == "development": | |
| logging.basicConfig(level=logging.DEBUG) | |
| else: | |
| logging.basicConfig(level=logging.WARNING) | |


class ErrorType(str, Enum):
    """Types of pronunciation errors."""
    SUBSTITUTION = "substitution"  # Wrong sound
    OMISSION = "omission"          # Missing sound
    ADDITION = "addition"          # Extra sound
    DISTORTION = "distortion"      # Unclear sound


@dataclass
class PhonemeError:
    """Individual phoneme-level error."""
    word: str
    position: int  # Word position in the target phrase
    expected: str
    actual: Optional[str]
    error_type: ErrorType
    suggestion: str


@dataclass
class WordScore:
    """Per-word pronunciation score."""
    word: str
    score: float  # 0-100
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    errors: List[PhonemeError] = field(default_factory=list)


@dataclass
class AIFeedback:
    """AI-generated personalized feedback."""
    feedback: str
    encouragement: str
    specific_tips: List[str]
    recommended_exercises: List[str]
    difficulty_adjustment: Optional[str] = None  # "easier", "same", "harder"


@dataclass
class PronunciationFeedback:
    """Complete pronunciation analysis result."""
    overall_score: float  # 0-100
    clarity_score: float  # 0-100
    pace_score: float  # 0-100
    fluency_score: float  # 0-100
    word_scores: List[WordScore]
    suggestions: List[str]
    phoneme_errors: List[PhonemeError]
    transcription: str
    target_text: str
    duration_seconds: Optional[float] = None
    ai_feedback: Optional[AIFeedback] = None  # GPT-4o powered feedback


class PronunciationAnalyzer:
    """
    Analyze pronunciation against target text.

    Uses ASR with forced alignment to compare user speech
    against expected pronunciation. Integrates GPT-4o for
    personalized feedback via GitHub Models API.
    """

    def __init__(self):
        self._asr = None
        self._ai_feedback = None

    def _get_ai_feedback_generator(self):
        """Get AI feedback generator instance."""
        if self._ai_feedback is None:
            from api.endpoints.v1.processing.ai_feedback import get_ai_feedback_generator
            self._ai_feedback = get_ai_feedback_generator()
        return self._ai_feedback

    def _get_asr(self):
        """Get ASR instance for transcription."""
        if self._asr is None:
            from api.endpoints.v1.processing.therapy_asr import get_therapy_asr
            self._asr = get_therapy_asr()
        return self._asr

    async def analyze(
        self,
        audio_bytes: bytes,
        target_text: str,
        user_baseline: Optional[dict] = None,
        user_context: Optional[dict] = None,
        include_ai_feedback: bool = True
    ) -> PronunciationFeedback:
        """
        Analyze pronunciation of audio against target text.

        Args:
            audio_bytes: User's recorded audio
            target_text: Expected text/phrase
            user_baseline: Optional baseline metrics for comparison
            user_context: Optional user profile (speech condition, severity)
            include_ai_feedback: Whether to generate GPT-4o feedback

        Returns:
            PronunciationFeedback with scores, suggestions, and AI feedback
        """
        logging.info(f"Analyzing pronunciation for target: {target_text}")

        # 1. Transcribe the audio
        asr = self._get_asr()
        result = asr.transcribe(audio_bytes)
        transcription = result.text.strip().lower()
        target_clean = target_text.strip().lower()
        logging.debug(f"Transcription: {transcription}")
        logging.debug(f"Target: {target_clean}")

        # 2. Compare transcription to target
        word_scores, phoneme_errors = self._compare_texts(
            transcription, target_clean
        )

        # 3. Calculate scores
        overall_score = self._calculate_overall_score(word_scores)
        clarity_score = self._calculate_clarity_score(word_scores, phoneme_errors)
        pace_score = self._calculate_pace_score(result.word_timestamps)
        fluency_score = self._calculate_fluency_score(transcription, target_clean)

        # 4. Generate rule-based suggestions
        suggestions = self._generate_suggestions(phoneme_errors, word_scores)

        # 5. Generate AI-powered feedback (GPT-4o via GitHub Models)
        ai_feedback = None
        if include_ai_feedback:
            try:
                ai_generator = self._get_ai_feedback_generator()
                # Convert phoneme errors to dict format for AI
                errors_dict = [
                    {
                        "word": e.word,
                        "expected": e.expected,
                        "actual": e.actual,
                        "error_type": e.error_type.value
                    }
                    for e in phoneme_errors
                ]
                ai_result = await ai_generator.generate_feedback(
                    target_text=target_text,
                    transcription=transcription,
                    overall_score=overall_score,
                    clarity_score=clarity_score,
                    pace_score=pace_score,
                    fluency_score=fluency_score,
                    errors=errors_dict,
                    user_context=user_context
                )
                ai_feedback = AIFeedback(
                    feedback=ai_result.feedback,
                    encouragement=ai_result.encouragement,
                    specific_tips=ai_result.specific_tips,
                    recommended_exercises=ai_result.recommended_exercises,
                    difficulty_adjustment=ai_result.difficulty_adjustment
                )
                logging.info("AI feedback generated successfully")
            except Exception as e:
                logging.warning(f"AI feedback generation failed: {e}")
                ai_feedback = None

        return PronunciationFeedback(
            overall_score=overall_score,
            clarity_score=clarity_score,
            pace_score=pace_score,
            fluency_score=fluency_score,
            word_scores=word_scores,
            suggestions=suggestions,
            phoneme_errors=phoneme_errors,
            transcription=transcription,
            target_text=target_text,
            ai_feedback=ai_feedback
        )

    def _compare_texts(
        self,
        transcription: str,
        target: str
    ) -> tuple[List[WordScore], List[PhonemeError]]:
        """Compare transcribed text to target text."""
        trans_words = transcription.split()
        target_words = target.split()
        word_scores = []
        phoneme_errors = []

        # Simple word-level comparison (can be enhanced with phoneme alignment)
        max_len = max(len(trans_words), len(target_words))
        for i in range(max_len):
            target_word = target_words[i] if i < len(target_words) else ""
            trans_word = trans_words[i] if i < len(trans_words) else ""

            if not target_word:
                # Extra word in transcription
                phoneme_errors.append(PhonemeError(
                    word=trans_word,
                    position=i,
                    expected="",
                    actual=trans_word,
                    error_type=ErrorType.ADDITION,
                    suggestion=f"Extra word '{trans_word}' detected"
                ))
                continue

            if not trans_word:
                # Missing word
                word_scores.append(WordScore(
                    word=target_word,
                    score=0.0,
                    errors=[PhonemeError(
                        word=target_word,
                        position=i,
                        expected=target_word,
                        actual=None,
                        error_type=ErrorType.OMISSION,
                        suggestion=f"Try to include the word '{target_word}'"
                    )]
                ))
                phoneme_errors.append(word_scores[-1].errors[0])
                continue

            # Compare words
            score, errors = self._compare_words(target_word, trans_word, i)
            word_scores.append(WordScore(
                word=target_word,
                score=score,
                errors=errors
            ))
            phoneme_errors.extend(errors)

        return word_scores, phoneme_errors

    def _compare_words(
        self,
        target_word: str,
        trans_word: str,
        position: int
    ) -> tuple[float, List[PhonemeError]]:
        """Compare two words and return score and errors."""
        errors = []

        # Exact match
        if target_word == trans_word:
            return 100.0, []

        # Calculate similarity (simple Levenshtein-based)
        similarity = self._word_similarity(target_word, trans_word)
        score = similarity * 100

        # Detect error type
        if len(trans_word) > len(target_word):
            error_type = ErrorType.ADDITION
            suggestion = f"'{trans_word}' has extra sounds, expected '{target_word}'"
        elif len(trans_word) < len(target_word):
            error_type = ErrorType.OMISSION
            suggestion = f"Some sounds missing in '{trans_word}', expected '{target_word}'"
        else:
            error_type = ErrorType.SUBSTITUTION
            suggestion = f"'{trans_word}' should be '{target_word}'"

        if score < 100:
            errors.append(PhonemeError(
                word=target_word,
                position=position,
                expected=target_word,
                actual=trans_word,
                error_type=error_type,
                suggestion=suggestion
            ))

        return score, errors

    def _word_similarity(self, word1: str, word2: str) -> float:
        """Calculate similarity between two words (0-1)."""
        if word1 == word2:
            return 1.0

        # Levenshtein distance normalized
        len1, len2 = len(word1), len(word2)
        if len1 == 0 or len2 == 0:
            return 0.0

        # Create distance matrix
        dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]
        for i in range(len1 + 1):
            dp[i][0] = i
        for j in range(len2 + 1):
            dp[0][j] = j
        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                cost = 0 if word1[i-1] == word2[j-1] else 1
                dp[i][j] = min(
                    dp[i-1][j] + 1,      # deletion
                    dp[i][j-1] + 1,      # insertion
                    dp[i-1][j-1] + cost  # substitution
                )

        distance = dp[len1][len2]
        max_len = max(len1, len2)
        return 1.0 - (distance / max_len)
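
    # Illustrative check (example pair chosen here, not from the original source):
    # for ("bat", "cat") the Levenshtein distance is 1 over a max length of 3, so
    # _word_similarity returns 1 - 1/3 ≈ 0.67 and _compare_words scores the word ~67/100.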

    def _calculate_overall_score(self, word_scores: List[WordScore]) -> float:
        """Calculate overall pronunciation score."""
        if not word_scores:
            return 0.0
        return sum(ws.score for ws in word_scores) / len(word_scores)

    def _calculate_clarity_score(
        self,
        word_scores: List[WordScore],
        errors: List[PhonemeError]
    ) -> float:
        """Calculate speech clarity score."""
        if not word_scores:
            return 0.0

        # Penalize based on error types
        error_penalties = {
            ErrorType.DISTORTION: 15,
            ErrorType.SUBSTITUTION: 10,
            ErrorType.OMISSION: 20,
            ErrorType.ADDITION: 5,
        }
        base_score = 100.0
        for error in errors:
            base_score -= error_penalties.get(error.error_type, 10)
        return max(0.0, base_score)

    def _calculate_pace_score(
        self,
        word_timestamps: Optional[List[dict]]
    ) -> float:
        """Calculate pace/timing score."""
        if not word_timestamps or len(word_timestamps) < 2:
            return 75.0  # Default score if no timestamps

        # Calculate words per minute
        total_duration = word_timestamps[-1].get("end", 0) - word_timestamps[0].get("start", 0)
        if total_duration <= 0:
            return 75.0
        wpm = (len(word_timestamps) / total_duration) * 60

        # Ideal range: 100-150 WPM for clear speech
        if 100 <= wpm <= 150:
            return 100.0
        elif 80 <= wpm < 100 or 150 < wpm <= 180:
            return 85.0
        elif 60 <= wpm < 80 or 180 < wpm <= 200:
            return 70.0
        else:
            return 50.0

    def _calculate_fluency_score(self, transcription: str, target: str) -> float:
        """Calculate fluency based on text similarity."""
        return self._word_similarity(transcription, target) * 100

    def _generate_suggestions(
        self,
        errors: List[PhonemeError],
        word_scores: List[WordScore]
    ) -> List[str]:
        """Generate actionable improvement suggestions."""
        suggestions = []

        # Group errors by type
        error_types = {}
        for error in errors:
            error_types.setdefault(error.error_type, []).append(error)

        # Generate suggestions based on error patterns
        if ErrorType.OMISSION in error_types:
            omissions = error_types[ErrorType.OMISSION]
            words = [e.word for e in omissions[:3]]
            suggestions.append(
                f"Try to pronounce all sounds in: {', '.join(words)}"
            )

        if ErrorType.SUBSTITUTION in error_types:
            subs = error_types[ErrorType.SUBSTITUTION]
            if subs:
                suggestions.append(
                    f"Focus on the correct sound in '{subs[0].word}'"
                )

        if ErrorType.ADDITION in error_types:
            suggestions.append("Speak more clearly without adding extra sounds")

        # Low scoring words
        low_scores = [ws for ws in word_scores if ws.score < 70]
        if low_scores:
            words = [ws.word for ws in low_scores[:3]]
            suggestions.append(
                f"Practice these words: {', '.join(words)}"
            )

        # General encouragement if few errors
        if len(errors) <= 2:
            suggestions.append("Good job! Keep practicing for even better clarity.")

        return suggestions[:5]  # Limit to 5 suggestions


# Singleton instance
_analyzer_instance: Optional[PronunciationAnalyzer] = None


def get_pronunciation_analyzer() -> PronunciationAnalyzer:
    """Get or create PronunciationAnalyzer singleton."""
    global _analyzer_instance
    if _analyzer_instance is None:
        _analyzer_instance = PronunciationAnalyzer()
    return _analyzer_instance


async def analyze_pronunciation(
    audio_bytes: bytes,
    target_text: str,
    user_baseline: Optional[dict] = None,
    user_context: Optional[dict] = None,
    include_ai_feedback: bool = True
) -> PronunciationFeedback:
    """Convenience function for pronunciation analysis with AI feedback."""
    analyzer = get_pronunciation_analyzer()
    return await analyzer.analyze(
        audio_bytes,
        target_text,
        user_baseline,
        user_context,
        include_ai_feedback
    )
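

# Minimal usage sketch (illustrative only, not part of the original module). It assumes
# a local WAV recording exists; the file name, target phrase, and user_context keys are
# hypothetical placeholders.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Hypothetical sample recording; replace with a real file path.
        with open("sample_recording.wav", "rb") as f:
            audio = f.read()

        feedback = await analyze_pronunciation(
            audio_bytes=audio,
            target_text="The quick brown fox",
            user_context={"speech_condition": "articulation", "severity": "mild"},
        )
        print(f"Overall: {feedback.overall_score:.1f}")
        print(f"Clarity: {feedback.clarity_score:.1f}, Pace: {feedback.pace_score:.1f}")
        for suggestion in feedback.suggestions:
            print(f"- {suggestion}")

    asyncio.run(_demo())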