# ubumuntu-api/api/endpoints/v1/processing/pronunciation_analysis.py
# Macbook
# Add FastAPI application
# cc4ea58
# raw
# history blame
# 15.5 kB
"""
Pronunciation Analysis Module - Speech clarity and pronunciation feedback.
Provides:
- Pronunciation scoring (PCC - Percent Consonants Correct)
- Clarity assessment
- Pace analysis
- Per-word feedback
- Improvement suggestions
"""
import io
import logging
from typing import Optional, List
from dataclasses import dataclass, field
from enum import Enum
from api.config import settings
# Root logger verbosity: DEBUG while developing, WARNING and above otherwise.
logging.basicConfig(
    level=(
        logging.DEBUG
        if settings.ENVIRONMENT == "development"
        else logging.WARNING
    )
)
class ErrorType(str, Enum):
    """Closed set of pronunciation error categories.

    Members are also plain strings, so they compare equal to their values.
    """

    # A different sound was produced than the expected one.
    SUBSTITUTION = "substitution"
    # An expected sound is absent.
    OMISSION = "omission"
    # A sound was produced that is not expected.
    ADDITION = "addition"
    # The sound is present but unclear.
    DISTORTION = "distortion"
@dataclass
class PhonemeError:
    """One detected pronunciation error, attached to a single word."""

    # Word the error is reported against.
    word: str
    # Index supplied by the producer; the analyzer in this file passes the
    # word's index within the utterance.
    position: int
    # What should have been said.
    expected: str
    # What was actually heard; None when nothing was heard for this slot.
    actual: Optional[str]
    # Category of the error.
    error_type: ErrorType
    # Human-readable hint for the user.
    suggestion: str
@dataclass
class WordScore:
    """Pronunciation result for a single target word."""

    # The target word being scored.
    word: str
    # Score on a 0-100 scale.
    score: float
    # Optional timing of the word (presumably seconds - TODO confirm with the
    # producer; the analyzer in this file leaves both unset).
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    # Errors found for this word; every instance gets its own list.
    errors: List[PhonemeError] = field(default_factory=list)
@dataclass
class AIFeedback:
    """Personalized feedback produced by the AI feedback generator."""

    # Main feedback text.
    feedback: str
    # Encouraging message for the user.
    encouragement: str
    # Concrete tips for improvement.
    specific_tips: List[str]
    # Exercises recommended as follow-up.
    recommended_exercises: List[str]
    # Suggested difficulty change: "easier", "same" or "harder" (None if absent).
    difficulty_adjustment: Optional[str] = None
@dataclass
class PronunciationFeedback:
    """Full result of one pronunciation analysis run."""

    # Aggregate scores, each on a 0-100 scale.
    overall_score: float
    clarity_score: float
    pace_score: float
    fluency_score: float

    # Per-word results and rule-based suggestions.
    word_scores: List[WordScore]
    suggestions: List[str]
    # Flat list of every error found across the utterance.
    phoneme_errors: List[PhonemeError]

    # What was heard versus what was expected.
    transcription: str
    target_text: str

    # Optional extras; both default to None.
    duration_seconds: Optional[float] = None
    # Feedback from the AI generator (described in this file as GPT-4o powered).
    ai_feedback: Optional[AIFeedback] = None
class PronunciationAnalyzer:
    """
    Analyze pronunciation against target text.

    The audio is transcribed by the therapy ASR, the transcription is compared
    word by word (by position) against the target text, and the per-word
    results are folded into overall, clarity, pace and fluency scores.
    Optionally an AI feedback generator (described in this file as GPT-4o via
    the GitHub Models API) is asked for personalized feedback.
    """

    # Sentence punctuation stripped from the *edges* of each word before
    # comparison, so "hello," and "hello" count as the same word. Characters
    # inside a word (e.g. the apostrophe in "don't") are left untouched.
    _EDGE_PUNCTUATION = (
        ".,!?;:\"'()[]{}-"
        "\u2026\u201c\u201d\u2018\u2019\u00ab\u00bb\u00bf\u00a1\u2013\u2014"
    )

    def __init__(self):
        """Create an analyzer; the ASR and AI backends are loaded lazily."""
        self._asr = None
        self._ai_feedback = None

    def _get_ai_feedback_generator(self):
        """Return the AI feedback generator, creating it on first use."""
        if self._ai_feedback is None:
            # Imported here so the dependency is only loaded when needed.
            from api.endpoints.v1.processing.ai_feedback import get_ai_feedback_generator
            self._ai_feedback = get_ai_feedback_generator()
        return self._ai_feedback

    def _get_asr(self):
        """Return the ASR instance used for transcription, creating it on first use."""
        if self._asr is None:
            # Imported here so the dependency is only loaded when needed.
            from api.endpoints.v1.processing.therapy_asr import get_therapy_asr
            self._asr = get_therapy_asr()
        return self._asr

    @classmethod
    def _tokenize(cls, text: str) -> List[str]:
        """Split text on whitespace and strip edge punctuation from each word.

        Tokens that consist only of punctuation (e.g. a lone dash) are dropped.
        """
        stripped = (raw.strip(cls._EDGE_PUNCTUATION) for raw in text.split())
        return [word for word in stripped if word]

    async def analyze(
        self,
        audio_bytes: bytes,
        target_text: str,
        user_baseline: Optional[dict] = None,
        user_context: Optional[dict] = None,
        include_ai_feedback: bool = True
    ) -> PronunciationFeedback:
        """
        Analyze pronunciation of audio against target text.

        Args:
            audio_bytes: User's recorded audio, passed to the ASR as is.
            target_text: Expected text/phrase.
            user_baseline: Accepted for interface compatibility; currently
                not used by the analysis.
            user_context: Optional user profile, forwarded unchanged to the
                AI feedback generator.
            include_ai_feedback: Whether to request AI-generated feedback.

        Returns:
            PronunciationFeedback with scores, rule-based suggestions and,
            when requested and available, AI feedback. ``duration_seconds``
            is not populated here.
        """
        logging.info("Analyzing pronunciation for target: %s", target_text)

        # 1. Transcribe the audio.
        # NOTE(review): transcribe() is called synchronously inside this
        # coroutine; if it is slow it blocks the event loop - confirm whether
        # it should be offloaded to a worker thread.
        asr = self._get_asr()
        result = asr.transcribe(audio_bytes)
        transcription = result.text.strip().lower()
        target_clean = target_text.strip().lower()
        logging.debug("Transcription: %s", transcription)
        logging.debug("Target: %s", target_clean)

        # 2. Compare transcription to target.
        word_scores, phoneme_errors = self._compare_texts(
            transcription, target_clean
        )

        # 3. Calculate scores.
        overall_score = self._calculate_overall_score(word_scores)
        clarity_score = self._calculate_clarity_score(word_scores, phoneme_errors)
        pace_score = self._calculate_pace_score(result.word_timestamps)
        fluency_score = self._calculate_fluency_score(transcription, target_clean)

        # 4. Generate rule-based suggestions.
        suggestions = self._generate_suggestions(phoneme_errors, word_scores)

        # 5. Generate AI-powered feedback (best effort).
        ai_feedback = None
        if include_ai_feedback:
            ai_feedback = await self._generate_ai_feedback(
                target_text=target_text,
                transcription=transcription,
                overall_score=overall_score,
                clarity_score=clarity_score,
                pace_score=pace_score,
                fluency_score=fluency_score,
                phoneme_errors=phoneme_errors,
                user_context=user_context,
            )

        return PronunciationFeedback(
            overall_score=overall_score,
            clarity_score=clarity_score,
            pace_score=pace_score,
            fluency_score=fluency_score,
            word_scores=word_scores,
            suggestions=suggestions,
            phoneme_errors=phoneme_errors,
            transcription=transcription,
            target_text=target_text,
            ai_feedback=ai_feedback
        )

    async def _generate_ai_feedback(
        self,
        *,
        target_text: str,
        transcription: str,
        overall_score: float,
        clarity_score: float,
        pace_score: float,
        fluency_score: float,
        phoneme_errors: List[PhonemeError],
        user_context: Optional[dict],
    ) -> Optional[AIFeedback]:
        """Ask the AI generator for feedback; return None if anything fails."""
        try:
            ai_generator = self._get_ai_feedback_generator()
            # The generator receives errors as plain dicts.
            errors_dict = [
                {
                    "word": e.word,
                    "expected": e.expected,
                    "actual": e.actual,
                    "error_type": e.error_type.value
                }
                for e in phoneme_errors
            ]
            ai_result = await ai_generator.generate_feedback(
                target_text=target_text,
                transcription=transcription,
                overall_score=overall_score,
                clarity_score=clarity_score,
                pace_score=pace_score,
                fluency_score=fluency_score,
                errors=errors_dict,
                user_context=user_context
            )
            ai_feedback = AIFeedback(
                feedback=ai_result.feedback,
                encouragement=ai_result.encouragement,
                specific_tips=ai_result.specific_tips,
                recommended_exercises=ai_result.recommended_exercises,
                difficulty_adjustment=ai_result.difficulty_adjustment
            )
        except Exception as e:
            # Deliberately broad: AI feedback is optional, so any failure is
            # logged and the rule-based analysis is still returned.
            logging.warning("AI feedback generation failed: %s", e)
            return None
        logging.info("AI feedback generated successfully")
        return ai_feedback

    def _compare_texts(
        self,
        transcription: str,
        target: str
    ) -> tuple[List[WordScore], List[PhonemeError]]:
        """Compare transcribed text to target text, word by word.

        Words are matched purely by position after edge punctuation has been
        stripped, so punctuation in the target text is not counted against the
        speaker. No alignment is performed: one inserted or dropped word
        shifts every later comparison.

        Returns:
            (word_scores, phoneme_errors). ``word_scores`` has one entry per
            target word; extra transcribed words only produce ADDITION errors.
        """
        trans_words = self._tokenize(transcription)
        target_words = self._tokenize(target)
        word_scores = []
        phoneme_errors = []

        # Simple word-level comparison (can be enhanced with phoneme alignment)
        max_len = max(len(trans_words), len(target_words))
        for i in range(max_len):
            target_word = target_words[i] if i < len(target_words) else ""
            trans_word = trans_words[i] if i < len(trans_words) else ""

            if not target_word:
                # Extra word in transcription
                phoneme_errors.append(PhonemeError(
                    word=trans_word,
                    position=i,
                    expected="",
                    actual=trans_word,
                    error_type=ErrorType.ADDITION,
                    suggestion=f"Extra word '{trans_word}' detected"
                ))
                continue

            if not trans_word:
                # Missing word: scored 0 and reported as an omission. The same
                # error object is shared by the word score and the flat list.
                omission = PhonemeError(
                    word=target_word,
                    position=i,
                    expected=target_word,
                    actual=None,
                    error_type=ErrorType.OMISSION,
                    suggestion=f"Try to include the word '{target_word}'"
                )
                word_scores.append(WordScore(
                    word=target_word,
                    score=0.0,
                    errors=[omission]
                ))
                phoneme_errors.append(omission)
                continue

            # Both words present: score their similarity.
            score, errors = self._compare_words(target_word, trans_word, i)
            word_scores.append(WordScore(
                word=target_word,
                score=score,
                errors=errors
            ))
            phoneme_errors.extend(errors)

        return word_scores, phoneme_errors

    def _compare_words(
        self,
        target_word: str,
        trans_word: str,
        position: int
    ) -> tuple[float, List[PhonemeError]]:
        """Compare two words and return (score 0-100, errors).

        An exact match scores 100 with no errors. Otherwise the score is the
        normalized edit-distance similarity times 100 and exactly one error is
        reported, classified by comparing the lengths of the two words.
        """
        # Exact match
        if target_word == trans_word:
            return 100.0, []

        # Different words always have similarity < 1, hence score < 100.
        score = self._word_similarity(target_word, trans_word) * 100

        # Classify the error by relative length.
        if len(trans_word) > len(target_word):
            error_type = ErrorType.ADDITION
            suggestion = f"'{trans_word}' has extra sounds, expected '{target_word}'"
        elif len(trans_word) < len(target_word):
            error_type = ErrorType.OMISSION
            suggestion = f"Some sounds missing in '{trans_word}', expected '{target_word}'"
        else:
            error_type = ErrorType.SUBSTITUTION
            suggestion = f"'{trans_word}' should be '{target_word}'"

        error = PhonemeError(
            word=target_word,
            position=position,
            expected=target_word,
            actual=trans_word,
            error_type=error_type,
            suggestion=suggestion
        )
        return score, [error]

    def _word_similarity(self, word1: str, word2: str) -> float:
        """Calculate similarity between two strings (0-1).

        Defined as ``1 - levenshtein(word1, word2) / max(len(word1), len(word2))``.
        Equal strings (including two empty strings) give 1.0; if exactly one
        string is empty the result is 0.0.
        """
        if word1 == word2:
            return 1.0
        len1, len2 = len(word1), len(word2)
        if len1 == 0 or len2 == 0:
            return 0.0

        # Levenshtein distance keeping only the previous and current DP rows;
        # this method is also used on whole sentences, so a full matrix would
        # be wasteful.
        previous = list(range(len2 + 1))
        for i, char1 in enumerate(word1, start=1):
            current = [i]
            for j, char2 in enumerate(word2, start=1):
                cost = 0 if char1 == char2 else 1
                current.append(min(
                    previous[j] + 1,        # deletion
                    current[j - 1] + 1,     # insertion
                    previous[j - 1] + cost  # substitution
                ))
            previous = current

        distance = previous[len2]
        return 1.0 - (distance / max(len1, len2))

    def _calculate_overall_score(self, word_scores: List[WordScore]) -> float:
        """Return the mean of the per-word scores, or 0.0 when there are none."""
        if not word_scores:
            return 0.0
        return sum(ws.score for ws in word_scores) / len(word_scores)

    def _calculate_clarity_score(
        self,
        word_scores: List[WordScore],
        errors: List[PhonemeError]
    ) -> float:
        """Calculate speech clarity score.

        Starts at 100 and subtracts a fixed penalty per error depending on its
        type, never going below 0. Returns 0.0 when no target word was scored.
        """
        if not word_scores:
            return 0.0

        # Penalty points per error type; unknown types cost 10.
        error_penalties = {
            ErrorType.DISTORTION: 15,
            ErrorType.SUBSTITUTION: 10,
            ErrorType.OMISSION: 20,
            ErrorType.ADDITION: 5,
        }
        total_penalty = sum(
            error_penalties.get(error.error_type, 10) for error in errors
        )
        return max(0.0, 100.0 - total_penalty)

    def _calculate_pace_score(
        self,
        word_timestamps: Optional[List[dict]]
    ) -> float:
        """Calculate pace/timing score from word timestamps.

        Words per minute is computed from the first entry's "start" and the
        last entry's "end" (missing keys count as 0). With fewer than two
        timestamps, or a non-positive duration, a neutral 75.0 is returned.
        """
        if not word_timestamps or len(word_timestamps) < 2:
            return 75.0  # Default score if no timestamps

        total_duration = word_timestamps[-1].get("end", 0) - word_timestamps[0].get("start", 0)
        if total_duration <= 0:
            return 75.0

        wpm = (len(word_timestamps) / total_duration) * 60

        # Ideal range: 100-150 WPM for clear speech; the score drops in bands
        # the further the pace falls outside that range.
        if 100 <= wpm <= 150:
            return 100.0
        elif 80 <= wpm < 100 or 150 < wpm <= 180:
            return 85.0
        elif 60 <= wpm < 80 or 180 < wpm <= 200:
            return 70.0
        else:
            return 50.0

    def _calculate_fluency_score(self, transcription: str, target: str) -> float:
        """Calculate fluency as character-level similarity of the two texts (0-100).

        Both texts are normalized first (edge punctuation stripped, runs of
        whitespace collapsed) so punctuation in the target is not penalized.
        """
        spoken = " ".join(self._tokenize(transcription))
        expected = " ".join(self._tokenize(target))
        return self._word_similarity(spoken, expected) * 100

    def _generate_suggestions(
        self,
        errors: List[PhonemeError],
        word_scores: List[WordScore]
    ) -> List[str]:
        """Generate up to five actionable improvement suggestions."""
        suggestions = []

        # Group errors by type
        error_types = {}
        for error in errors:
            error_types.setdefault(error.error_type, []).append(error)

        # Generate suggestions based on error patterns
        if ErrorType.OMISSION in error_types:
            words = [e.word for e in error_types[ErrorType.OMISSION][:3]]
            suggestions.append(
                f"Try to pronounce all sounds in: {', '.join(words)}"
            )
        if ErrorType.SUBSTITUTION in error_types:
            # A group only exists once it holds at least one error.
            first_sub = error_types[ErrorType.SUBSTITUTION][0]
            suggestions.append(
                f"Focus on the correct sound in '{first_sub.word}'"
            )
        if ErrorType.ADDITION in error_types:
            suggestions.append("Speak more clearly without adding extra sounds")

        # Low scoring words
        low_scores = [ws for ws in word_scores if ws.score < 70]
        if low_scores:
            words = [ws.word for ws in low_scores[:3]]
            suggestions.append(
                f"Practice these words: {', '.join(words)}"
            )

        # General encouragement if few errors
        if len(errors) <= 2:
            suggestions.append("Good job! Keep practicing for even better clarity.")

        return suggestions[:5]  # Limit to 5 suggestions
# Module-wide analyzer shared by all callers; built lazily on first request.
_analyzer_instance: Optional[PronunciationAnalyzer] = None


def get_pronunciation_analyzer() -> PronunciationAnalyzer:
    """Return the shared PronunciationAnalyzer, constructing it on first call."""
    global _analyzer_instance
    if _analyzer_instance is not None:
        return _analyzer_instance
    _analyzer_instance = PronunciationAnalyzer()
    return _analyzer_instance
async def analyze_pronunciation(
    audio_bytes: bytes,
    target_text: str,
    user_baseline: Optional[dict] = None,
    user_context: Optional[dict] = None,
    include_ai_feedback: bool = True
) -> PronunciationFeedback:
    """Run a pronunciation analysis on the shared analyzer.

    Thin wrapper: every argument is forwarded unchanged to
    ``PronunciationAnalyzer.analyze`` and its result is returned.
    """
    shared_analyzer = get_pronunciation_analyzer()
    feedback = await shared_analyzer.analyze(
        audio_bytes=audio_bytes,
        target_text=target_text,
        user_baseline=user_baseline,
        user_context=user_context,
        include_ai_feedback=include_ai_feedback,
    )
    return feedback