Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import librosa | |
| import torch | |
| from transformers import pipeline | |
| import spaces | |
| import numpy as np | |
# Load the Shuka v1 speech model exactly once at app startup.
# Runs on GPU 0 when CUDA is available, otherwise falls back to CPU (-1).
pipe = pipeline(
    model="sarvamai/shuka_v1",
    trust_remote_code=True,
    device=0 if torch.cuda.is_available() else -1,
)
def preprocess_audio(audio, sr):
    """Clean up a raw waveform before it is sent to the model.

    Steps: downmix to mono, peak-normalize, trim leading/trailing
    silence, and apply a pre-emphasis (simple high-pass) filter to
    reduce low-frequency noise.

    Args:
        audio: np.ndarray waveform — 1-D mono or (channels, n) multi-channel.
        sr: sample rate in Hz; returned unchanged.

    Returns:
        (audio, sr): the processed mono waveform and the unchanged rate.
    """
    # Downmix FIRST so normalize/trim/preemphasis operate on the final
    # mono signal (the original converted to mono last, after filtering).
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)
    # Peak-normalize so trim's top_db threshold behaves consistently
    # regardless of recording level.
    audio = librosa.util.normalize(audio)
    # Drop leading/trailing spans quieter than 20 dB below peak.
    audio, _ = librosa.effects.trim(audio, top_db=20)
    # Pre-emphasis acts as a lightweight high-pass noise reduction.
    audio = librosa.effects.preemphasis(audio)
    return audio, sr
def transcribe_and_respond(audio_file):
    """Run pronunciation feedback on a recorded or uploaded audio clip.

    Args:
        audio_file: filesystem path supplied by the Gradio Audio
            component, or None when the component is cleared.

    Returns:
        The model's response, or an "Error: ..." string on failure,
        or "" when no audio is present.
    """
    # Gradio fires the change event with None when the input is cleared;
    # without this guard librosa.load(None) raises and the user sees a
    # spurious error message.
    if audio_file is None:
        return ""
    try:
        # Load at 16 kHz mono (standard for speech models) with
        # high-quality resampling.
        audio, sr = librosa.load(
            audio_file,
            sr=16000,
            mono=True,
            res_type='kaiser_best',
        )
        # Normalize / trim silence / pre-emphasis.
        audio, sr = preprocess_audio(audio, sr)
        # Reject clips outside the usable duration range.
        if len(audio) < sr * 0.5:  # shorter than 0.5 s
            return "Error: Audio is too short. Please speak for at least 0.5 seconds."
        if len(audio) > sr * 30:  # longer than 30 s
            return "Error: Audio is too long. Please keep it under 30 seconds."
        # Shuka expects a dict with the waveform plus chat-style turns;
        # "<|audio|>" marks where the audio is spliced into the prompt.
        output = pipe({
            "audio": audio,
            "sampling_rate": sr,
            "turns": [
                {"role": "system", "content": """You are an expert English pronunciation teacher specializing in teaching Indian English learners. Your role is to:
1. Listen carefully to the student's pronunciation
2. Provide specific feedback on pronunciation accuracy
3. Break down difficult words into syllables
4. Explain the correct mouth positions and sounds
5. Use simple, clear language
6. Be encouraging and supportive
7. Focus on common Indian English pronunciation challenges
8. Provide examples of correct pronunciation
Format your response in this structure:
- What you heard
- Specific pronunciation feedback
- Tips for improvement
- Example words to practice"""},
                {"role": "user", "content": "<|audio|>"}
            ]
        }, max_new_tokens=256)
        return output
    except Exception as e:
        # Surface any failure as text in the UI rather than crashing.
        return f"Error: {str(e)}"
# ---- Gradio interface ----
with gr.Blocks(title="Shuka v1 Transcription") as iface:
    gr.Markdown("## Shuka v1 - Voice Transcription")
    gr.Markdown(
        """Upload or speak, and the model will respond naturally using SarvamAI's voice foundation model.
Tips for best results:
- Speak clearly and at a moderate pace
- Keep background noise to a minimum
- Maintain a distance of 6-12 inches from the microphone
- Speak for at least 0.5 seconds but no more than 30 seconds"""
    )
    with gr.Row():
        # Input and output sit side by side in one row.
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio Input",
            format="wav",  # WAV avoids lossy re-encoding
        )
        text_output = gr.Textbox(
            label="Model Response",
            placeholder="Response will appear here...",
        )
    # Re-run the model every time the audio input changes.
    audio_input.change(fn=transcribe_and_respond, inputs=audio_input, outputs=text_output)

if __name__ == "__main__":
    iface.launch()