Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import librosa | |
| import torch | |
| from transformers import pipeline | |
| import spaces | |
| import numpy as np | |
# Load the Shuka v1 speech model exactly once at app startup.
# Runs on GPU 0 when CUDA is available, otherwise falls back to CPU (-1).
pipe = pipeline(
    model="sarvamai/shuka_v1",
    trust_remote_code=True,
    device=0 if torch.cuda.is_available() else -1,
)
def preprocess_audio(audio, sr):
    """Clean up a raw waveform before it is sent to the model.

    Steps: downmix to mono, peak-normalize, trim leading/trailing
    silence, and apply a pre-emphasis (simple high-pass) filter to
    reduce low-frequency noise.

    Args:
        audio: np.ndarray waveform — 1-D mono or (channels, n) multi-channel.
        sr: sample rate in Hz; returned unchanged.

    Returns:
        (audio, sr): the processed mono waveform and the unchanged rate.
    """
    # Downmix FIRST so normalize/trim/preemphasis operate on the final
    # mono signal (the original converted to mono last, after filtering).
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)
    # Peak-normalize so trim's top_db threshold behaves consistently
    # regardless of recording level.
    audio = librosa.util.normalize(audio)
    # Drop leading/trailing spans quieter than 20 dB below peak.
    audio, _ = librosa.effects.trim(audio, top_db=20)
    # Pre-emphasis acts as a lightweight high-pass noise reduction.
    audio = librosa.effects.preemphasis(audio)
    return audio, sr
def transcribe_and_respond(audio_file):
    """Run pronunciation feedback on a recorded or uploaded audio clip.

    Args:
        audio_file: filesystem path supplied by the Gradio Audio
            component, or None when the component is cleared.

    Returns:
        The model's response, or an "Error: ..." string on failure,
        or "" when no audio is present.
    """
    # Gradio fires the change event with None when the input is cleared;
    # without this guard librosa.load(None) raises and the user sees a
    # spurious error message.
    if audio_file is None:
        return ""
    try:
        # Load at 16 kHz mono (standard for speech models) with
        # high-quality resampling.
        audio, sr = librosa.load(
            audio_file,
            sr=16000,
            mono=True,
            res_type='kaiser_best',
        )
        # Normalize / trim silence / pre-emphasis.
        audio, sr = preprocess_audio(audio, sr)
        # Reject clips outside the usable duration range.
        if len(audio) < sr * 0.5:  # shorter than 0.5 s
            return "Error: Audio is too short. Please speak for at least 0.5 seconds."
        if len(audio) > sr * 30:  # longer than 30 s
            return "Error: Audio is too long. Please keep it under 30 seconds."
        # Shuka expects a dict with the waveform plus chat-style turns;
        # "<|audio|>" marks where the audio is spliced into the prompt.
        output = pipe({
            "audio": audio,
            "sampling_rate": sr,
            "turns": [
                {"role": "system", "content": """You are an expert English pronunciation teacher specializing in teaching Indian English learners. Your role is to:
1. Listen carefully to the student's pronunciation
2. Provide specific feedback on pronunciation accuracy
3. Break down difficult words into syllables
4. Explain the correct mouth positions and sounds
5. Use simple, clear language
6. Be encouraging and supportive
7. Focus on common Indian English pronunciation challenges
8. Provide examples of correct pronunciation
Format your response in this structure:
- What you heard
- Specific pronunciation feedback
- Tips for improvement
- Example words to practice"""},
                {"role": "user", "content": "<|audio|>"}
            ]
        }, max_new_tokens=256)
        return output
    except Exception as e:
        # Surface any failure as text in the UI rather than crashing.
        return f"Error: {str(e)}"
# ---- Gradio interface ----
with gr.Blocks(title="Shuka v1 Transcription") as iface:
    gr.Markdown("## Shuka v1 - Voice Transcription")
    gr.Markdown(
        """Upload or speak, and the model will respond naturally using SarvamAI's voice foundation model.
Tips for best results:
- Speak clearly and at a moderate pace
- Keep background noise to a minimum
- Maintain a distance of 6-12 inches from the microphone
- Speak for at least 0.5 seconds but no more than 30 seconds"""
    )
    with gr.Row():
        # Input and output sit side by side in one row.
        audio_input = gr.Audio(
            sources=["microphone", "upload"],
            type="filepath",
            label="Audio Input",
            format="wav",  # WAV avoids lossy re-encoding
        )
        text_output = gr.Textbox(
            label="Model Response",
            placeholder="Response will appear here...",
        )
    # Re-run the model every time the audio input changes.
    audio_input.change(fn=transcribe_and_respond, inputs=audio_input, outputs=text_output)

if __name__ == "__main__":
    iface.launch()