"""Script-based detection of English / Bengali text, confirmed by langdetect."""

import re

try:
    from langdetect import detect, DetectorFactory, LangDetectException
except ImportError:
    # langdetect only confirms decisions that the script counts below have
    # not already settled, so the module stays usable without it.
    detect = None
    DetectorFactory = None
    LangDetectException = Exception
else:
    # Set seed for consistent results
    DetectorFactory.seed = 0

# Code points of the Bengali Unicode block (letters, signs and digits).
_BENGALI_RE = re.compile(r'[\u0980-\u09FF]')
# Basic Latin letters only.
_ENGLISH_RE = re.compile(r'[a-zA-Z]')
# Decimal digits that are NOT in the Bengali block.  Python's ``\d`` matches
# every Unicode decimal digit, Bengali digits included, so they have to be
# excluded here to avoid counting them twice.
_NON_BENGALI_DIGIT_RE = re.compile(r'(?![\u0980-\u09FF])\d')

# Share of (Bengali + English) characters one script must exceed for text
# containing both scripts to be classified as that single language.
_DOMINANCE_THRESHOLD = 0.8


def _langdetect_code(text):
    """Return langdetect's language code for *text*, or None if it has none.

    None is returned when langdetect is not installed or when it raises
    ``LangDetectException`` for the given text.
    """
    if detect is None:
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


def detect_language(text: str) -> str:
    """Classify *text* as "english", "bengali", "mixed" or "unknown".

    Strategy:
      1. Falsy text, or text shorter than 3 characters once stripped,
         is "unknown".
      2. Count Bengali-block characters and basic Latin letters.
      3. If both are present, the text is the language whose share of those
         characters exceeds 80%; otherwise it is "mixed".
      4. If only Bengali characters are present, the text is "bengali".
      5. If only Latin letters are present, the text is "english" unless
         langdetect reports 'bn'.
      6. With neither script present, langdetect decides: 'en' -> "english",
         'bn' -> "bengali", anything else (or no result) -> "unknown".

    Args:
        text: The text to classify.

    Returns:
        One of "english", "bengali", "mixed", "unknown".
    """
    if not text or len(text.strip()) < 3:
        return "unknown"

    bengali_chars = len(_BENGALI_RE.findall(text))
    english_chars = len(_ENGLISH_RE.findall(text))

    if bengali_chars and english_chars:
        # Both counts are non-zero here, so the total cannot be zero.
        total_chars = bengali_chars + english_chars
        if bengali_chars / total_chars > _DOMINANCE_THRESHOLD:
            return "bengali"
        if english_chars / total_chars > _DOMINANCE_THRESHOLD:
            return "english"
        return "mixed"

    if bengali_chars:
        return "bengali"

    detected = _langdetect_code(text)

    if english_chars:
        # Latin letters are present: any langdetect answer other than 'bn'
        # (including a failure to detect) is treated as English.
        return "bengali" if detected == "bn" else "english"

    # Neither script found; rely on langdetect alone.
    return {"en": "english", "bn": "bengali"}.get(detected, "unknown")


def get_language_script_info(text: str) -> dict:
    """Return per-script character counts for *text*.

    Useful for debugging and fine-tuning the detection thresholds.

    Every character is counted in exactly one bucket, so the four counts
    always add up to ``total_length``.  Bengali digits belong to the Bengali
    block and are reported under "bengali_characters", not "digits".

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys "bengali_characters", "english_characters",
        "digits", "other_characters" and "total_length".
    """
    bengali_chars = len(_BENGALI_RE.findall(text))
    english_chars = len(_ENGLISH_RE.findall(text))
    digits = len(_NON_BENGALI_DIGIT_RE.findall(text))
    other_chars = len(text) - bengali_chars - english_chars - digits

    return {
        "bengali_characters": bengali_chars,
        "english_characters": english_chars,
        "digits": digits,
        "other_characters": other_chars,
        "total_length": len(text),
    }