danulr05 committed on
Commit
3576afe
·
verified ·
1 Parent(s): 2c90c98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -6
app.py CHANGED
@@ -70,29 +70,91 @@ def contains_sinhala_roman(text):
70
  return True
71
  return False
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def transliterate_sinhala_roman_to_sinhala(text):
74
- """Use Gemini to convert Roman Sinhala to Sinhala script"""
75
  if not gemini_api_key or not contains_sinhala_roman(text):
76
  return text
77
 
78
  try:
79
- prompt = f"""Convert this Roman Sinhala text to Sinhala script. Only convert if it's actually Sinhala words in Roman script. If it's English or other language, return as is.
 
 
 
 
 
 
 
 
80
 
81
- Text: "{text}"
82
 
83
- Sinhala script:"""
84
 
85
  response = gemini_model.generate_content(prompt)
86
  result = response.text.strip()
87
 
88
- # Clean up the response
89
  if result and len(result) > 0:
 
 
 
90
  return result
91
  else:
92
  return text
93
 
94
  except Exception as e:
95
- logger.warning(f"Transliteration failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  return text
97
 
98
  def preprocess_query(query, language):
@@ -102,6 +164,11 @@ def preprocess_query(query, language):
102
  transliterated = transliterate_sinhala_roman_to_sinhala(query)
103
  logger.info(f"Transliterated to: {transliterated}")
104
  return transliterated
 
 
 
 
 
105
  return query
106
 
107
  # Load dynamic metadata
 
70
  return True
71
  return False
72
 
73
def contains_tamil_roman(text):
    """Report whether ``text`` looks like Tamil written in Latin letters.

    The input is lower-cased once and searched against each regular
    expression below; the result is True as soon as one of them is found.

    NOTE(review): the first expression accepts any run of ``a-z`` letters
    that has a vowel after its first letter, so ordinary English words
    (e.g. "budget") satisfy it as well -- confirm this breadth is intended.

    Args:
        text: The string to inspect.

    Returns:
        True if any pattern occurs in the lower-cased text, otherwise False.
    """
    lowered = text.lower()
    candidate_expressions = (
        # Generic word shape: one or more letters, a vowel, then more letters.
        r'\b[a-z]+[aeiou][a-z]*\b',
        # Frequently used kinship words.
        r'\b(amma|appa|akka|anna|thambi|thangai|paapa|amma|appa|akka|anna|thambi|thangai|paapa)\b',
        # Pronouns, question words and time words.
        r'\b(naan|neenga|avan|aval|adhu|idhu|edhu|yaaru|eppadi|enna|yaen|kaalam|vaaram|maasam|varusham)\b',
    )
    return any(
        re.search(expression, lowered) is not None
        for expression in candidate_expressions
    )
86
+
def transliterate_sinhala_roman_to_sinhala(text):
    """Use Gemini to convert Roman Sinhala to Sinhala script with enhanced context.

    The text is returned untouched when ``gemini_api_key`` is falsy or when
    ``contains_sinhala_roman(text)`` is false. Otherwise a prompt is sent
    through ``gemini_model.generate_content`` and the reply is cleaned of the
    answer labels the model may echo back.

    Args:
        text: The query text, possibly Sinhala written in Latin letters.

    Returns:
        The cleaned model reply, or the original ``text`` when the reply is
        empty after cleaning or when any exception is raised.
    """
    if not gemini_api_key or not contains_sinhala_roman(text):
        return text

    try:
        prompt = f"""You are a language expert specializing in Sri Lankan languages. Convert this Roman Sinhala text (Sinhala words written in English letters) to proper Sinhala script.

IMPORTANT CONTEXT:
- This is for a Sri Lankan budget proposals search system
- The user is likely searching for government policies, economic proposals, or budget information
- Use formal Sinhala appropriate for policy discussions
- Only convert if it's actually Sinhala words in Roman script
- If it's English or other language, return as is
- Be accurate with Sri Lankan Sinhala terminology

Text to convert: "{text}"

Converted Sinhala script:"""

        response = gemini_model.generate_content(prompt)
        result = response.text.strip()

        # Remove common prefixes that Gemini might add.
        for label in ("Converted Sinhala script:", "Sinhala script:"):
            result = result.replace(label, "").strip()

        # Check emptiness only AFTER cleanup: a reply made up solely of a
        # label would otherwise turn the user's query into an empty string.
        return result or text

    except Exception as e:
        # Deliberate best-effort: on any failure keep the original query.
        logger.warning(f"Sinhala transliteration failed: {e}")
        return text
122
+
123
def transliterate_tamil_roman_to_tamil(text):
    """Use Gemini to convert Roman Tamil to Tamil script with enhanced context.

    The text is returned untouched when ``gemini_api_key`` is falsy or when
    ``contains_tamil_roman(text)`` is false. Otherwise a prompt is sent
    through ``gemini_model.generate_content`` and the reply is cleaned of the
    answer labels the model may echo back.

    Args:
        text: The query text, possibly Tamil written in Latin letters.

    Returns:
        The cleaned model reply, or the original ``text`` when the reply is
        empty after cleaning or when any exception is raised.
    """
    if not gemini_api_key or not contains_tamil_roman(text):
        return text

    try:
        prompt = f"""You are a language expert specializing in Sri Lankan languages. Convert this Roman Tamil text (Tamil words written in English letters) to proper Tamil script.

IMPORTANT CONTEXT:
- This is for a Sri Lankan budget proposals search system
- The user is likely searching for government policies, economic proposals, or budget information
- Use formal Tamil appropriate for policy discussions
- Use Sri Lankan Tamil dialect and terminology
- Only convert if it's actually Tamil words in Roman script
- If it's English or other language, return as is
- Be accurate with Sri Lankan Tamil terminology and context

Text to convert: "{text}"

Converted Tamil script:"""

        response = gemini_model.generate_content(prompt)
        result = response.text.strip()

        # Remove common prefixes that Gemini might add.
        for label in ("Converted Tamil script:", "Tamil script:"):
            result = result.replace(label, "").strip()

        # Check emptiness only AFTER cleanup: a reply made up solely of a
        # label would otherwise turn the user's query into an empty string.
        return result or text

    except Exception as e:
        # Deliberate best-effort: on any failure keep the original query.
        logger.warning(f"Tamil transliteration failed: {e}")
        return text
159
 
160
  def preprocess_query(query, language):
 
164
  transliterated = transliterate_sinhala_roman_to_sinhala(query)
165
  logger.info(f"Transliterated to: {transliterated}")
166
  return transliterated
167
+ elif language == 'ta' and contains_tamil_roman(query):
168
+ logger.info(f"Transliterating Roman Tamil: {query}")
169
+ transliterated = transliterate_tamil_roman_to_tamil(query)
170
+ logger.info(f"Transliterated to: {transliterated}")
171
+ return transliterated
172
  return query
173
 
174
  # Load dynamic metadata