|
|
|
|
|
|
|
|
import json
import os
import re
from typing import List, Optional

import gradio as gr
import torch
import faster_whisper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Give HF_HOME an app-local default, leaving any value the environment already
# provides untouched (presumably this is where Hugging Face tooling caches the
# downloaded model - confirm against the deployment).
if "HF_HOME" not in os.environ:
    os.environ["HF_HOME"] = "/home/user/app/.cache"

# Model identifier handed to faster_whisper.WhisperModel when the model is loaded.
MODEL_ID_CT2 = "kingabzpro/whisper-large-v3-urdu-ct2"

# Model name sent with every Groq chat-completion request.
GROQ_MODEL = "openai/gpt-oss-120b"

# Default system prompt (written in Urdu). In English, roughly: "You are an
# expert Urdu language editor. Present the given text with better Urdu
# spelling, punctuation, pauses and natural flow. Preserve the speaker's style
# and meaning, do not exaggerate, and leave common English terms as they are."
DEFAULT_SYSTEM_PROMPT_UR = "".join(
    [
        "آپ ایک ماہر اردو زبان ایڈیٹر ہیں۔ دیے گئے متن کو بہتر اردو املا، ",
        "رموزِ اوقاف، وقفوں اور قدرتی روانی کے ساتھ پیش کریں۔ ",
        "بولنے والے کے انداز اور معنی کو برقرار رکھیں، مبالغہ نہ کریں، ",
        "اور عام انگریزی اصطلاحات جوں کی توں رہنے دیں۔",
    ]
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def format_timestamp(seconds: float, format_type: str = "srt") -> str:
    """Render a time offset as an ``HH:MM:SS`` timestamp with milliseconds.

    Args:
        seconds: Offset in seconds. ``None`` (or any falsy value) is treated
            as zero. Negative offsets are clamped to zero, because a negative
            millisecond count would otherwise be split by ``divmod`` into a
            negative hour field plus wrapped minutes/seconds.
        format_type: ``"srt"`` puts a comma before the milliseconds; any other
            value (for example ``"vtt"``) puts a dot there.

    Returns:
        The timestamp with hours, minutes and seconds zero-padded to at least
        two digits and milliseconds zero-padded to three.
    """
    total_ms = max(0, int(round((seconds or 0.0) * 1000)))
    hours, rem_ms = divmod(total_ms, 3_600_000)
    minutes, rem_ms = divmod(rem_ms, 60_000)
    sec, ms = divmod(rem_ms, 1000)
    sep = "," if format_type == "srt" else "."
    return f"{hours:02d}:{minutes:02d}:{sec:02d}{sep}{ms:03d}"
|
|
|
|
|
def basic_urdu_cleanup(text: str) -> str:
    """Apply rule-based whitespace and punctuation normalisation to Urdu text.

    Steps, in order:
      1. Collapse every run of whitespace to a single space.
      2. Remove the space in front of ``, . ? !`` and the Urdu comma / full stop.
      3. Convert Latin ``,`` and ``;`` to their Urdu forms. A comma that sits
         directly between two digits (``1,000``) is kept, so digit grouping is
         not broken apart.
      4. Turn ``...`` into a single ellipsis character.
      5. Remove the space in front of Urdu punctuation and put exactly one
         space after the Urdu comma and semicolon.

    Args:
        text: Text to clean. Falsy input (``""`` or ``None``) is returned as is.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    if not text:
        return text

    cleaned = " ".join(text.split())

    # Pull punctuation back onto the word it follows.
    for mark in (",", ".", "?", "!", "،", "۔"):
        cleaned = cleaned.replace(" " + mark, mark)

    # Latin comma -> Urdu comma, except when it has a digit on both sides.
    cleaned = re.sub(r"(?<!\d),|,(?!\d)", "،", cleaned)
    cleaned = cleaned.replace(";", "؛")

    # Spaced-out dots were already joined by the " ." rule above, so only the
    # compact form can still occur here.
    cleaned = cleaned.replace("...", "…")

    for mark in ("،", "۔", "؛", "؟"):
        cleaned = cleaned.replace(" " + mark, mark)

    # One space after Urdu comma/semicolon; surplus spaces are collapsed below.
    for mark in ("،", "؛"):
        cleaned = cleaned.replace(mark, mark + " ")

    return " ".join(cleaned.split())
|
|
|
|
|
|
|
|
|
|
|
def get_groq_client(api_key: Optional[str] = None) -> tuple:
    """Build a Groq client from an explicit key or the ``GROQ_API_KEY`` variable.

    The explicit key is stripped *before* deciding whether to fall back, so a
    whitespace-only value (for instance a stray space typed into the UI field)
    no longer hides a key that is set in the environment.

    Args:
        api_key: Key supplied by the caller; may be ``None`` or blank.

    Returns:
        A ``(client, error)`` pair. On success ``error`` is ``None``; on
        failure ``client`` is ``None`` and ``error`` is a readable message.
    """
    key = (api_key or "").strip() or os.getenv("GROQ_API_KEY", "").strip()
    if not key:
        return None, "No GROQ_API_KEY provided."
    try:
        # Imported inside the try so that a missing `groq` package is reported
        # through the error message instead of raising at call time.
        from groq import Groq
        return Groq(api_key=key), None
    except Exception as e:
        return None, f"Groq client init failed: {e}"
|
|
|
|
|
def enhance_text_with_llm(text: str, api_key: Optional[str], temperature: float = 0.2,
                          system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> str:
    """Polish a whole transcript with the Groq LLM, falling back to rule-based cleanup.

    Args:
        text: Transcript to improve.
        api_key: Groq API key; resolved by ``get_groq_client``.
        temperature: Sampling temperature forwarded to the chat completion.
        system_prompt: System message for the model. An empty value falls back
            to ``DEFAULT_SYSTEM_PROMPT_UR``, matching what ``test_groq`` does.

    Returns:
        The model's reply with surrounding whitespace removed. The result of
        ``basic_urdu_cleanup(text)`` is returned instead when the text is
        blank, no client is available, the reply is empty, or the request
        raises.
    """
    # Nothing to polish: skip the network round trip entirely.
    if not text or not text.strip():
        return basic_urdu_cleanup(text)

    client, err = get_groq_client(api_key)
    if not client:
        if err:
            print(f"[LLM] {err} (falling back to basic cleanup)")
        return basic_urdu_cleanup(text)

    try:
        resp = client.chat.completions.create(
            model=GROQ_MODEL,
            temperature=float(temperature),
            messages=[
                {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT_UR},
                {"role": "user", "content": "براہِ کرم درج ذیل متن بہتر اردو میں لوٹائیں (بغیر اضافی تبصرہ):\n\n" + text},
            ],
        )
        return (resp.choices[0].message.content or "").strip() or basic_urdu_cleanup(text)
    except Exception as e:
        # Best effort by design: any failure degrades to the offline cleanup.
        print(f"[LLM] Full-text enhance failed: {e}")
        return basic_urdu_cleanup(text)
|
|
|
|
|
def enhance_lines_with_llm(lines: List[str], api_key: Optional[str], temperature: float = 0.2,
                           system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> List[str]:
    """Polish subtitle lines with the Groq LLM while keeping their count and order.

    The lines are sent as one numbered list and the reply is parsed back by
    number. Any line the model did not return (or returned empty) is filled in
    with ``basic_urdu_cleanup`` of the original line.

    Args:
        lines: Segment texts, one per subtitle cue.
        api_key: Groq API key; resolved by ``get_groq_client``.
        temperature: Sampling temperature forwarded to the chat completion.
        system_prompt: System message for the model. An empty value falls back
            to ``DEFAULT_SYSTEM_PROMPT_UR``.

    Returns:
        A list with one entry per input line. ``lines`` itself is returned
        when it is empty.
    """
    if not lines:
        return lines

    client, err = get_groq_client(api_key)
    if not client:
        if err:
            print(f"[LLM] {err} (falling back to basic cleanup)")
        return [basic_urdu_cleanup(x) for x in lines]

    numbered = "\n".join(f"{i}. {ln}" for i, ln in enumerate(lines, 1))
    user_msg = "ان جملوں کی اردو بہتر کریں۔ اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:\n\n" + numbered

    try:
        resp = client.chat.completions.create(
            model=GROQ_MODEL,
            temperature=float(temperature),
            messages=[
                {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT_UR},
                {"role": "user", "content": user_msg},
            ],
        )
        raw = (resp.choices[0].message.content or "").strip()
    except Exception as e:
        # Best effort by design: any failure degrades to the offline cleanup.
        print(f"[LLM] Line enhance failed: {e}")
        return [basic_urdu_cleanup(x) for x in lines]

    # "<number><sep> <text>", where <sep> is ".", ")" or the Urdu full stop.
    # \d only matches decimal digits, all of which int() can convert.
    item_re = re.compile(r"^\s*(\d+)\s*[.)۔]\s*(.*)$")
    improved_map = {}
    for reply_line in raw.splitlines():
        match = item_re.match(reply_line)
        if match is None:
            continue
        body = match.group(2).strip()
        # An empty reply for a number must not blank out the segment.
        if body:
            improved_map[int(match.group(1)) - 1] = body

    return [
        improved_map[i] if i in improved_map else basic_urdu_cleanup(ln)
        for i, ln in enumerate(lines)
    ]
|
|
|
|
|
def test_groq(api_key: Optional[str], temperature: float, system_prompt: str) -> str:
    """Send one tiny request to the Groq model and describe the outcome.

    Args:
        api_key: Groq API key; resolved by ``get_groq_client``.
        temperature: Sampling temperature forwarded to the chat completion.
        system_prompt: System message; ``DEFAULT_SYSTEM_PROMPT_UR`` is used
            when this is empty.

    Returns:
        A status line for display: success with the sample reply, a warning
        when the reply was empty, or an error description.
    """
    groq_client, problem = get_groq_client(api_key)
    if not groq_client:
        return f"❌ LLM not ready: {problem}"

    chat_messages = [
        {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT_UR},
        {"role": "user", "content": "مختصر جملہ بہتر کر کے واپس کریں: 'یہ ایک ٹیسٹ ہے'"},
    ]

    try:
        completion = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            temperature=float(temperature),
            messages=chat_messages,
        )
        sample = (completion.choices[0].message.content or "").strip()
    except Exception as exc:
        return f"❌ LLM call failed: {exc}"

    if sample:
        return f"✅ LLM OK · Sample: {sample}"
    return "⚠️ LLM responded but empty content."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the speech model once at import time so every request reuses it.
# The device is CUDA when torch reports it as available, otherwise CPU.
_cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {_cuda_ok}")
print("Loading model... this may take a minute the first time.")
model = faster_whisper.WhisperModel(
    MODEL_ID_CT2,
    device="cuda" if _cuda_ok else "cpu",
    compute_type="auto",
)
print("✅ Model loaded successfully!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def transcribe_audio(
    audio_path: Optional[str],
    output_format: str,
    beam_size: int,
    llm_enhance: bool,
    llm_api_key: Optional[str],
    llm_temperature: float,
    llm_system_prompt: str,
):
    """Transcribe an Urdu audio file and render it in the requested format.

    Args:
        audio_path: Path of the uploaded or recorded audio file.
        output_format: One of ``"text"``, ``"srt"``, ``"vtt"`` or ``"json"``.
        beam_size: Beam size passed to the model (converted with ``int``).
        llm_enhance: When true, polish the text with the Groq LLM; otherwise
            only ``basic_urdu_cleanup`` is applied.
        llm_api_key: Groq API key forwarded to the LLM helpers.
        llm_temperature: Sampling temperature forwarded to the LLM helpers.
        llm_system_prompt: System prompt forwarded to the LLM helpers.

    Returns:
        The transcript as a string: plain text, SRT, WebVTT, or a JSON
        document with ``"text"`` and ``"segments"`` keys.

    Raises:
        gr.Error: If no audio was supplied or ``output_format`` is unknown.
    """
    if not audio_path:
        raise gr.Error("Please upload or record an audio clip.")
    # Reject a bad format before paying for transcription and any LLM call.
    if output_format not in ("text", "srt", "vtt", "json"):
        raise gr.Error(f"Unsupported format: {output_format}")

    seg_iter, _ = model.transcribe(
        audio_path, language="ur", beam_size=int(beam_size),
        word_timestamps=False, vad_filter=False
    )

    segments = [
        {"start": seg.start, "end": seg.end, "text": (seg.text or "").strip()}
        for seg in seg_iter
    ]
    raw_lines = [s["text"] for s in segments]

    if output_format == "text":
        joined = " ".join(raw_lines)
        # An empty transcript gives the LLM nothing to improve.
        if llm_enhance and joined.strip():
            return enhance_text_with_llm(joined, llm_api_key, llm_temperature, llm_system_prompt)
        return basic_urdu_cleanup(joined)

    if llm_enhance:
        cleaned_lines = enhance_lines_with_llm(raw_lines, llm_api_key, llm_temperature, llm_system_prompt)
    else:
        cleaned_lines = [basic_urdu_cleanup(x) for x in raw_lines]

    # Defensive: only pair cleaned text with segments when the counts match;
    # otherwise keep each segment's own raw text.
    texts = cleaned_lines if len(cleaned_lines) == len(segments) else raw_lines

    if output_format == "json":
        segs_out = [
            {"start": s["start"], "end": s["end"], "text": txt}
            for s, txt in zip(segments, texts)
        ]
        return json.dumps({"text": " ".join(cleaned_lines), "segments": segs_out}, ensure_ascii=False, indent=2)

    # "srt" and "vtt" share the cue layout; VTT adds a header and SRT adds a
    # running cue number.
    is_srt = output_format == "srt"
    out_lines = [] if is_srt else ["WEBVTT", ""]
    for idx, (s, txt) in enumerate(zip(segments, texts), 1):
        if is_srt:
            out_lines.append(str(idx))
        start = format_timestamp(s["start"], output_format)
        end = format_timestamp(s["end"], output_format)
        out_lines += [f"{start} --> {end}", txt, ""]
    return "\n".join(out_lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Visual theme for the app: Gradio's Soft theme with rose / violet / slate hues.
theme = gr.themes.Soft(primary_hue="rose", secondary_hue="violet", neutral_hue="slate")

# Top-level UI definition. Components are created in the order they should
# appear; the two buttons are wired to their handlers at the end of the block.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from context - confirm it matches the intended layout.
with gr.Blocks(title="Urdu ASR Studio — Faster-Whisper + LLM Polishing", theme=theme) as iface:

    # Inline CSS: bottom padding for the page, and a 260-360px scrollable
    # textarea for the component whose elem_id is "result_box".
    gr.HTML("""
    <style>
      .gradio-container { padding-bottom: 16px !important; }
      #result_box textarea {
        min-height: 260px !important;
        max-height: 360px !important;
        overflow-y: auto !important;
      }
    </style>
    """)

    # Page heading and one-line description.
    gr.Markdown(
        "## **Urdu STT with GPT-OSS 120B** \n"
        "High-quality Urdu transcription with Faster-Whisper (CT2) and optional Groq LLM polishing."
    )

    with gr.Row():
        # Input column: audio source, settings and the action button.
        with gr.Column(scale=5):
            # type="filepath" means the handler receives a path string.
            audio = gr.Audio(
                sources=["upload","microphone"], type="filepath",
                label="Upload or Record Audio",
                waveform_options={"show_controls": False},
                autoplay=False, streaming=False,
            )
            with gr.Accordion("Transcription Settings", open=False):
                with gr.Row():
                    # Choices match the formats transcribe_audio handles.
                    fmt = gr.Radio(choices=["text","srt","vtt","json"], value="text", label="Output Format")
                    beam = gr.Slider(1,10,5,step=1,label="Beam Size")
            with gr.Accordion("LLM Polishing (Optional)", open=False):
                llm_toggle = gr.Checkbox(value=False,label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)")
                with gr.Row():
                    llm_temp = gr.Slider(0.0,1.0,0.2,step=0.05,label="LLM Temperature")
                    # Password-type field; may stay empty when the key comes
                    # from the environment (see the label).
                    llm_key = gr.Textbox(label="GROQ_API_KEY (optional if set in environment)", type="password", value="")
                llm_sys = gr.Textbox(label="LLM System Prompt (Urdu)", value=DEFAULT_SYSTEM_PROMPT_UR, lines=3)
                with gr.Row():
                    test_btn = gr.Button("Test LLM", variant="secondary")
                    # Receives the status string returned by test_groq.
                    test_status = gr.Markdown("")
            with gr.Row():
                btn = gr.Button("Transcribe", variant="primary")

        # Output column: the transcript in the selected format.
        with gr.Column(scale=7):
            out = gr.Textbox(label="Result", lines=14, max_lines=30, show_copy_button=True, elem_id="result_box")

    # The order of `inputs` must match the parameter order of each handler.
    btn.click(fn=transcribe_audio, inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys], outputs=out)
    test_btn.click(fn=test_groq, inputs=[llm_key,llm_temp,llm_sys], outputs=[test_status])
|
|
|
|
|
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|
|
|