Spaces:

kingabzpro
/

Urdu-STT-with-GPT-OSS

Running

App Files Files Community

Urdu-STT-with-GPT-OSS / app.py

kingabzpro

Update app.py

2d0bb37 verified 3 months ago

raw

history blame contribute delete

13 kB

	# app.py – Urdu ASR Studio with Faster-Whisper + optional LLM Polishing

	import os
	import json
	from typing import List, Optional

	import gradio as gr
	import torch
	import faster_whisper

	# ────────────────────────────────────────────────────────────────────────────────
	# Config
	# ────────────────────────────────────────────────────────────────────────────────

	os.environ.setdefault("HF_HOME", "/home/user/app/.cache")

	MODEL_ID_CT2 = "kingabzpro/whisper-large-v3-urdu-ct2"
	GROQ_MODEL = "openai/gpt-oss-120b"

	DEFAULT_SYSTEM_PROMPT_UR = (
	"آپ ایک ماہر اردو زبان ایڈیٹر ہیں۔ دیے گئے متن کو بہتر اردو املا، "
	"رموزِ اوقاف، وقفوں اور قدرتی روانی کے ساتھ پیش کریں۔ "
	"بولنے والے کے انداز اور معنی کو برقرار رکھیں، مبالغہ نہ کریں، "
	"اور عام انگریزی اصطلاحات جوں کی توں رہنے دیں۔"
	)

	# ────────────────────────────────────────────────────────────────────────────────
	# Utilities
	# ────────────────────────────────────────────────────────────────────────────────

	def format_timestamp(seconds: float, format_type: str = "srt") -> str:
	total_ms = int(round((seconds or 0.0) * 1000))
	hours, rem_ms = divmod(total_ms, 3_600_000)
	minutes, rem_ms = divmod(rem_ms, 60_000)
	sec, ms = divmod(rem_ms, 1000)
	sep = "," if format_type == "srt" else "."
	return f"{hours:02d}:{minutes:02d}:{sec:02d}{sep}{ms:03d}"

	def basic_urdu_cleanup(text: str) -> str:
	if not text:
	return text
	t = " ".join(text.split())
	replacements = {
	" ,": ",", " .": ".", " ?": "?", " !": "!",
	" ،": "،", " ۔": "۔",
	",": "،", ";": "؛",
	". . .": "…", "...": "…",
	}
	for a, b in replacements.items():
	t = t.replace(a, b)
	t = t.replace(" ،", "،").replace(" ۔", "۔").replace(" ؛", "؛").replace(" ؟", "؟")
	for p in ["،", "؛", ",", ";"]:
	t = t.replace(p, p + " ")
	return " ".join(t.split()).strip()

	# ───── Groq (OpenAI-compatible) client helpers ─────

	def get_groq_client(api_key: Optional[str] = None):
	key = (api_key or os.getenv("GROQ_API_KEY", "")).strip()
	if not key:
	return None, "No GROQ_API_KEY provided."
	try:
	from groq import Groq # type: ignore
	return Groq(api_key=key), None
	except Exception as e:
	return None, f"Groq client init failed: {e}"

	def enhance_text_with_llm(text: str, api_key: Optional[str], temperature: float = 0.2,
	system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> str:
	client, err = get_groq_client(api_key)
	if not client:
	if err:
	print(f"[LLM] {err} (falling back to basic cleanup)")
	return basic_urdu_cleanup(text)
	try:
	resp = client.chat.completions.create(
	model=GROQ_MODEL,
	temperature=float(temperature),
	messages=[
	{"role": "system", "content": system_prompt},
	{"role": "user", "content": "براہِ کرم درج ذیل متن بہتر اردو میں لوٹائیں (بغیر اضافی تبصرہ):\n\n" + text},
	],
	)
	return (resp.choices[0].message.content or "").strip() or basic_urdu_cleanup(text)
	except Exception as e:
	print(f"[LLM] Full-text enhance failed: {e}")
	return basic_urdu_cleanup(text)

	def enhance_lines_with_llm(lines: List[str], api_key: Optional[str], temperature: float = 0.2,
	system_prompt: str = DEFAULT_SYSTEM_PROMPT_UR) -> List[str]:
	if not lines:
	return lines
	client, err = get_groq_client(api_key)
	if not client:
	return [basic_urdu_cleanup(x) for x in lines]

	numbered = "\n".join(f"{i+1}. {ln}" for i, ln in enumerate(lines))
	user_msg = "ان جملوں کی اردو بہتر کریں۔ اسی ترتیب اور گنتی کے ساتھ اتنی ہی سطور واپس کریں:\n\n" + numbered
	try:
	resp = client.chat.completions.create(
	model=GROQ_MODEL,
	temperature=float(temperature),
	messages=[
	{"role": "system", "content": system_prompt},
	{"role": "user", "content": user_msg},
	],
	)
	raw = (resp.choices[0].message.content or "").strip()
	improved_map = {}
	for line in raw.splitlines():
	s = line.strip()
	if not s or "." not in s:
	continue
	num, rest = s.split(".", 1)
	if num.strip().isdigit():
	improved_map[int(num) - 1] = rest.strip()
	return [improved_map.get(i, basic_urdu_cleanup(lines[i])) for i in range(len(lines))]
	except Exception as e:
	print(f"[LLM] Line enhance failed: {e}")
	return [basic_urdu_cleanup(x) for x in lines]

	def test_groq(api_key: Optional[str], temperature: float, system_prompt: str) -> str:
	client, err = get_groq_client(api_key)
	if not client:
	return f"❌ LLM not ready: {err}"
	try:
	resp = client.chat.completions.create(
	model=GROQ_MODEL,
	temperature=float(temperature),
	messages=[
	{"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT_UR},
	{"role": "user", "content": "مختصر جملہ بہتر کر کے واپس کریں: 'یہ ایک ٹیسٹ ہے'"},
	],
	)
	txt = (resp.choices[0].message.content or "").strip()
	return f"✅ LLM OK · Sample: {txt}" if txt else "⚠️ LLM responded but empty content."
	except Exception as e:
	return f"❌ LLM call failed: {e}"

	# ────────────────────────────────────────────────────────────────────────────────
	# Whisper (CT2) Model
	# ────────────────────────────────────────────────────────────────────────────────

	print(f"CUDA available: {torch.cuda.is_available()}")
	print("Loading model... this may take a minute the first time.")
	model = faster_whisper.WhisperModel(
	MODEL_ID_CT2,
	device="cuda" if torch.cuda.is_available() else "cpu",
	compute_type="auto",
	)
	print("✅ Model loaded successfully!")

	# ────────────────────────────────────────────────────────────────────────────────
	# Core Transcription
	# ────────────────────────────────────────────────────────────────────────────────

	def transcribe_audio(
	audio_path: Optional[str],
	output_format: str,
	beam_size: int,
	llm_enhance: bool,
	llm_api_key: Optional[str],
	llm_temperature: float,
	llm_system_prompt: str,
	):
	if not audio_path:
	raise gr.Error("Please upload or record an audio clip.")

	seg_iter, info = model.transcribe(
	audio_path, language="ur", beam_size=int(beam_size),
	word_timestamps=False, vad_filter=False
	)

	segments, raw_lines = [], []
	for seg in seg_iter:
	text = (seg.text or "").strip()
	segments.append({"start": seg.start, "end": seg.end, "text": text})
	raw_lines.append(text)

	if llm_enhance:
	if output_format == "text":
	cleaned_lines = [enhance_text_with_llm(" ".join(raw_lines), llm_api_key, llm_temperature, llm_system_prompt)]
	else:
	cleaned_lines = enhance_lines_with_llm(raw_lines, llm_api_key, llm_temperature, llm_system_prompt)
	else:
	cleaned_lines = (
	[basic_urdu_cleanup(" ".join(raw_lines))] if output_format == "text"
	else [basic_urdu_cleanup(x) for x in raw_lines]
	)

	if output_format == "text":
	return cleaned_lines[0]
	if output_format == "srt":
	lines = []
	for i, s in enumerate(segments, 1):
	txt = cleaned_lines[i-1] if len(cleaned_lines) == len(segments) else s["text"]
	lines += [str(i), f"{format_timestamp(s['start'],'srt')} --> {format_timestamp(s['end'],'srt')}", txt, ""]
	return "\n".join(lines)
	if output_format == "vtt":
	lines = ["WEBVTT", ""]
	for i, s in enumerate(segments, 1):
	txt = cleaned_lines[i-1] if len(cleaned_lines) == len(segments) else s["text"]
	lines += [f"{format_timestamp(s['start'],'vtt')} --> {format_timestamp(s['end'],'vtt')}", txt, ""]
	return "\n".join(lines)
	if output_format == "json":
	segs_out = []
	for i, s in enumerate(segments):
	txt = cleaned_lines[i] if len(cleaned_lines) == len(segments) else s["text"]
	segs_out.append({"start": s["start"], "end": s["end"], "text": txt})
	return json.dumps({"text": " ".join(cleaned_lines), "segments": segs_out}, ensure_ascii=False, indent=2)

	raise gr.Error(f"Unsupported format: {output_format}")

	# ────────────────────────────────────────────────────────────────────────────────
	# UI
	# ────────────────────────────────────────────────────────────────────────────────

	theme = gr.themes.Soft(primary_hue="rose", secondary_hue="violet", neutral_hue="slate")

	with gr.Blocks(title="Urdu ASR Studio — Faster-Whisper + LLM Polishing", theme=theme) as iface:
	# Custom CSS to fix spacing + output height
	gr.HTML("""
	<style>
	.gradio-container { padding-bottom: 16px !important; }
	#result_box textarea {
	min-height: 260px !important;
	max-height: 360px !important;
	overflow-y: auto !important;
	}
	</style>
	""")

	gr.Markdown(
	"## Urdu STT with GPT-OSS 120B \n"
	"High-quality Urdu transcription with Faster-Whisper (CT2) and optional Groq LLM polishing."
	)

	with gr.Row():
	with gr.Column(scale=5):
	audio = gr.Audio(
	sources=["upload","microphone"], type="filepath",
	label="Upload or Record Audio",
	waveform_options={"show_controls": False},
	autoplay=False, streaming=False,
	)
	with gr.Accordion("Transcription Settings", open=False):
	with gr.Row():
	fmt = gr.Radio(choices=["text","srt","vtt","json"], value="text", label="Output Format")
	beam = gr.Slider(1,10,5,step=1,label="Beam Size")
	with gr.Accordion("LLM Polishing (Optional)", open=False):
	llm_toggle = gr.Checkbox(value=False,label="Polish Urdu text with LLM (Groq · openai/gpt-oss-120b)")
	with gr.Row():
	llm_temp = gr.Slider(0.0,1.0,0.2,step=0.05,label="LLM Temperature")
	llm_key = gr.Textbox(label="GROQ_API_KEY (optional if set in environment)", type="password", value="")
	llm_sys = gr.Textbox(label="LLM System Prompt (Urdu)", value=DEFAULT_SYSTEM_PROMPT_UR, lines=3)
	with gr.Row():
	test_btn = gr.Button("Test LLM", variant="secondary")
	test_status = gr.Markdown("")
	with gr.Row():
	btn = gr.Button("Transcribe", variant="primary")

	with gr.Column(scale=7):
	out = gr.Textbox(label="Result", lines=14, max_lines=30, show_copy_button=True, elem_id="result_box")

	btn.click(fn=transcribe_audio, inputs=[audio, fmt, beam, llm_toggle, llm_key, llm_temp, llm_sys], outputs=out)
	test_btn.click(fn=test_groq, inputs=[llm_key,llm_temp,llm_sys], outputs=[test_status])

	if __name__ == "__main__":
	iface.launch()