| import gradio as gr |
| import pandas as pd |
| import plotly.graph_objects as go |
| import plotly.express as px |
| from pathlib import Path |
|
|
| |
# Ground-truth transcript displayed on the Overview tab.
# Read it EAFP-style: attempt the read and fall back to a placeholder when the
# file (or its parent directory) is missing, instead of checking first.
reference_text_path = Path("text/reference.txt")
try:
    # Decode explicitly as UTF-8 so the result does not depend on the host
    # locale; undecodable bytes are replaced rather than aborting start-up,
    # because the text is only shown in a read-only textbox.
    reference_text = reference_text_path.read_text(
        encoding="utf-8", errors="replace"
    )
except (FileNotFoundError, NotADirectoryError):
    reference_text = "Reference text not available"
|
|
| |
# Location of the single benchmark recording. Its presence is probed once at
# import time so the UI can decide between an audio player and a notice.
audio_path = Path("audio") / "001.wav"
audio_exists = audio_path.exists()
|
|
| |
# Per-model-size benchmark results, kept as one row per model so each model's
# numbers can be read together:
# (model, WER in percent, inference time in seconds, parameter count label).
_size_columns = ("Model", "WER (%)", "Speed (s)", "Model Size")
_size_rows = [
    ("tiny", 15.05, 2.73, "39M"),
    ("base", 9.95, 5.01, "74M"),
    ("small", 11.17, 5.14, "244M"),
    ("medium", 6.07, 19.42, "769M"),
    ("large-v3-turbo", 7.04, 33.08, "809M"),
]

# Transpose the rows into the column-oriented mapping the DataFrame is built from.
wer_data = {
    column: list(values)
    for column, values in zip(_size_columns, zip(*_size_rows))
}
df_wer = pd.DataFrame(wer_data)
|
|
| |
# Engine comparison results, one row per engine:
# (engine, WER in percent, inference time in seconds).
# NOTE(review): the Q&A tab text quotes 5.01s / 6.17s for faster-whisper /
# openai-whisper, which does not match the 4.87 / 6.51 recorded here --
# confirm which run each figure comes from.
_engine_columns = ("Engine", "WER (%)", "Speed (s)")
_engine_rows = [
    ("faster-whisper", 9.95, 4.87),
    ("openai-whisper", 9.95, 6.51),
    ("distil-whisper", 21.6, 38.49),
]

# Transpose the rows into the column-oriented mapping the DataFrame is built from.
engine_data = {
    column: list(values)
    for column, values in zip(_engine_columns, zip(*_engine_rows))
}
df_engine = pd.DataFrame(engine_data)
|
|
| |
# Bar chart of WER per Whisper model size, one colour per bar.
_wer_bars = go.Bar(
    x=df_wer["Model"],
    y=df_wer["WER (%)"],
    # The size label is carried as customdata so the hover tooltip can show it.
    customdata=df_wer["Model Size"],
    text=df_wer["WER (%)"].round(2),
    textposition="auto",
    marker_color=["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"],
    hovertemplate=(
        "<b>%{x}</b><br>"
        "WER: %{y:.2f}%<br>"
        "Size: %{customdata}<extra></extra>"
    ),
)
_wer_layout = {
    "title": "Word Error Rate by Model Size",
    "xaxis_title": "Model",
    "yaxis_title": "WER (%)",
    "template": "plotly_white",
    "height": 400,
}

fig_wer = go.Figure(data=[_wer_bars])
fig_wer.update_layout(**_wer_layout)
|
|
| |
# Scatter plot of inference time (x) against WER (y); every point is labelled
# with its model name and uses the same per-model colour order as the WER bars.
_tradeoff_points = go.Scatter(
    x=df_wer["Speed (s)"],
    y=df_wer["WER (%)"],
    text=df_wer["Model"],
    mode="markers+text",
    textposition="top center",
    marker={
        "size": 15,
        "color": ["#FF6B6B", "#4ECDC4", "#45B7D1", "#96CEB4", "#FFEAA7"],
    },
    hovertemplate=(
        "<b>%{text}</b><br>"
        "Speed: %{x:.2f}s<br>"
        "WER: %{y:.2f}%<extra></extra>"
    ),
)
_tradeoff_layout = {
    "title": "Speed vs Accuracy Tradeoff",
    "xaxis_title": "Inference Time (seconds)",
    "yaxis_title": "WER (%)",
    "template": "plotly_white",
    "height": 400,
}

fig_scatter = go.Figure(data=[_tradeoff_points])
fig_scatter.update_layout(**_tradeoff_layout)
|
|
| |
# Bar chart of WER per inference engine (all bars share one colour).
_engine_bars = go.Bar(
    name="WER (%)",
    x=df_engine["Engine"],
    y=df_engine["WER (%)"],
    text=df_engine["WER (%)"].round(2),
    textposition="auto",
    marker_color="#4ECDC4",
)
_engine_layout = {
    "title": "WER by Engine (Base Model)",
    "xaxis_title": "Engine",
    "yaxis_title": "WER (%)",
    "template": "plotly_white",
    "height": 400,
}

fig_engine = go.Figure(data=[_engine_bars])
fig_engine.update_layout(**_engine_layout)
|
|
| |
# Extra stylesheet handed to gr.Blocks(css=...).
# - .gradio-container: sets the font stack for the whole app.
# - .limitation-box:   yellow call-out with an amber left border.
# - .question-box:     light-blue call-out with a blue left border.
# NOTE(review): no component in this file assigns the limitation-box or
# question-box classes -- confirm whether they are still needed.
custom_css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.limitation-box {
background-color: #FFF3CD;
border-left: 4px solid #FFC107;
padding: 15px;
margin: 10px 0;
}
.question-box {
background-color: #E3F2FD;
border-left: 4px solid #2196F3;
padding: 15px;
margin: 15px 0;
}
"""
|
|
| |
def _image_or_note(image_path, label):
    """Render a static chart, or a placeholder note when the file is missing.

    Mirrors the handling of the audio sample on the Overview tab so that a
    missing asset degrades to a short notice instead of breaking UI start-up.
    Must be called inside an active ``gr.Blocks`` context.

    Args:
        image_path: Path of the image file, relative to the working directory.
        label: Caption shown on the image component.

    Returns:
        The ``gr.Image`` component when the file exists, otherwise a
        ``gr.Markdown`` component carrying the notice.
    """
    if Path(image_path).exists():
        return gr.Image(image_path, label=label)
    return gr.Markdown(f"**Note:** Chart `{image_path}` is not available.")


# Application layout: a header, five tabs (Overview, Results, Q&A, Hardware &
# Setup, About) and a footer with the repository link and author badge.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Local ASR/STT Benchmark Evaluation
        ### A Single Sample Evaluation on Local Hardware

        Testing different Whisper model sizes to find the optimal balance between accuracy and speed for daily transcription workflow.
        """
    )

    with gr.Tabs():
        # --- Overview: the test sample, its ground truth and the caveats ---
        with gr.Tab("📊 Overview"):
            gr.Markdown(
                """
                ## About This Evaluation

                This was a "back of the envelope" style experiment to determine which Whisper model size works best
                for daily transcription on local hardware, focusing on the tradeoff between accuracy (WER) and inference speed.
                """
            )

            gr.Markdown("### 🎯 Test Sample")

            # Only build the player when the recording is actually on disk.
            if audio_exists:
                gr.Audio(
                    value=str(audio_path),
                    label="Test Audio (001.wav)",
                    type="filepath"
                )
            else:
                gr.Markdown("**Note:** Audio file will be added soon.")

            gr.Markdown("### 📝 Reference Text (Ground Truth)")
            gr.Textbox(
                value=reference_text,
                label="Reference Transcription",
                lines=10,
                max_lines=15,
                interactive=False
            )

            gr.Markdown(
                """
                ### ⚠️ Important Limitations

                - **Quick experiment**: Not a definitive scientific evaluation
                - **Hardware specific**: AMD GPU with ROCm (not ideal for STT), using CPU inference
                - **Single sample**: Results based on one audio clip
                - **Variable conditions**: ASR accuracy depends on mic quality, background noise, speaking style
                - **Personal use case**: Optimized for one user's voice and workflow
                """
            )

        # --- Results: headline numbers, interactive plots, static charts ---
        with gr.Tab("📈 Results"):
            gr.Markdown("## Key Findings")

            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        """
                        ### Best Accuracy
                        **medium** model
                        - 6.07% WER
                        - 19.42s inference

                        ### Fastest
                        **tiny** model
                        - 15.05% WER
                        - 2.73s inference

                        ### Recommended for Daily Use
                        **base** model (faster-whisper)
                        - 9.95% WER
                        - ~5s inference
                        - Good balance
                        """
                    )

                with gr.Column():
                    gr.Markdown(
                        """
                        ### Key Takeaways

                        1. **Biggest jump**: tiny → base (15% → 10% WER)
                        2. **Diminishing returns**: After base, accuracy gains are smaller
                        3. **faster-whisper**: Same accuracy as OpenAI, 1.2x faster
                        4. **distil-whisper**: Unexpectedly slower AND less accurate on this sample
                        """
                    )

            gr.Markdown("## Interactive Visualizations")

            with gr.Row():
                gr.Plot(fig_wer, label="WER by Model Size")

            with gr.Row():
                gr.Plot(fig_scatter, label="Speed vs Accuracy")

            with gr.Row():
                gr.Plot(fig_engine, label="Engine Comparison")

            gr.Markdown("## Original Charts from Benchmark")

            # Static PNGs exported by the benchmark run; each one is optional.
            with gr.Row():
                with gr.Column():
                    _image_or_note("results/wer_by_size.png", "WER by Size")
                with gr.Column():
                    _image_or_note("results/speed_by_size.png", "Speed by Size")

            with gr.Row():
                with gr.Column():
                    _image_or_note("results/accuracy_speed_tradeoff.png", "Accuracy vs Speed")
                with gr.Column():
                    _image_or_note("results/engine_comparison.png", "Engine Comparison")

            with gr.Row():
                _image_or_note("results/variants_comparison.png", "All Variants Tested")

        # --- Q&A: narrative answers to the research questions ---
        # NOTE(review): Q2 quotes 5.01s / 6.17s for the base model, while the
        # engine_data table above records 4.87 / 6.51 -- confirm which figures
        # are intended before publishing.
        with gr.Tab("❓ Questions & Answers"):
            gr.Markdown(
                """
                # Research Questions & Findings

                ## Q1: How much does model size actually matter for accuracy?

                **Answer:** On my hardware, diminishing returns set in around **medium**.

                The biggest accuracy jump was from tiny (15.05% WER) → base (9.95% WER). After that, improvements are smaller:
                - tiny → base: 5.1% improvement
                - base → medium: 3.88% improvement
                - medium → large-v3-turbo: Actually worse (1% regression)

                The "sweet spot" depends on your use case:
                - **Live transcription**: Even small lags matter → base or small
                - **Batch processing**: Can afford slower → medium or large

                ---

                ## Q2: Is faster-whisper really as good as OpenAI Whisper?

                **Answer:** Yes! On this test, identical accuracy with better speed.

                Testing the base model:
                - **faster-whisper**: 9.95% WER in 5.01s
                - **openai-whisper**: 9.95% WER in 6.17s

                faster-whisper was ~1.2x faster with no accuracy loss. Clear winner for my use case.

                ---

                ## Q3: What's the speed vs. accuracy tradeoff?

                **Answer:** For daily transcription of my own voice, base or small hits the sweet spot.

                - **tiny**: 2.73s but 15% WER is too rough
                - **base**: 5s with 10% WER - acceptable for daily use
                - **small**: Similar to base, slightly slower
                - **medium**: 6% WER but 7x slower than tiny
                - **large-v3-turbo**: 33s for 7% WER - overkill for casual use

                ---

                ## Q4: Which model should I use for my daily STT workflow?

                **My personal answer:** base model with faster-whisper

                **Why it works for me:**
                - ~10% WER is acceptable for dictation (I can quickly fix errors)
                - 5 seconds per clip is fast enough
                - 140MB model size is manageable
                - Good balance for daily workflow

                **When I'd use something else:**
                - **tiny**: Quick tests or very long recordings where speed matters most
                - **medium/large**: Publishing or professional work needing better accuracy

                ---

                ## Bonus Finding: distil-whisper

                I tested distil-whisper expecting it to be faster, but on my sample:
                - **distil-whisper**: 21.6% WER in 38.49s ✗

                Both slower AND less accurate than the standard models. Unexpected, but that's the data.
                """
            )

        # --- Hardware & Setup: test environment and reproduction steps ---
        with gr.Tab("💻 Hardware & Setup"):
            gr.Markdown(
                """
                ## Test Environment

                ### Hardware
                - **GPU**: AMD Radeon RX 7700 XT (ROCm available but using CPU inference)
                - **CPU**: Intel Core i7-12700F (12 cores, 20 threads)
                - **RAM**: 64 GB
                - **OS**: Ubuntu 25.04

                ### Why CPU Inference?
                - AMD GPU with ROCm isn't ideal for STT workloads
                - CPU inference provided more consistent results
                - Your performance will differ based on your hardware

                ### Models Tested

                **Whisper model sizes:**
                - tiny (39M params)
                - base (74M params)
                - small (244M params)
                - medium (769M params)
                - large-v3-turbo (809M params)

                **Engines compared:**
                - OpenAI Whisper (original implementation)
                - faster-whisper (optimized CTranslate2)
                - distil-whisper (distilled variant)

                ### Metrics
                - **WER (Word Error Rate)**: Lower is better - percentage of words transcribed incorrectly
                - **Inference Time**: How long it takes to transcribe the audio sample

                ## Running Your Own Tests

                Want to benchmark on your own voice and hardware?

                1. Clone the repository: [github.com/danielrosehill/Local-ASR-STT-Benchmark](https://github.com/danielrosehill/Local-ASR-STT-Benchmark)
                2. Set up the conda environment (see `setup.md`)
                3. Record your own audio and create reference transcriptions
                4. Run the benchmark scripts
                5. Generate visualizations

                Your results will likely differ based on:
                - Your hardware (GPU/CPU)
                - Your voice characteristics
                - Your microphone quality
                - Background noise conditions
                - Speaking style and pace
                """
            )

        # --- About: motivation, next steps, repository and licence ---
        with gr.Tab("ℹ️ About"):
            gr.Markdown(
                """
                ## About This Project

                ### Motivation

                I was tired of guessing which Whisper model size to use for speech-to-text. There are plenty of
                benchmarks out there, but they're often:
                - Run on different hardware than mine
                - Tested on different voice characteristics
                - Using different microphones and conditions

                So I decided to run my own evaluation on my actual setup with my actual voice.

                ### Why This Matters

                If you're doing hours of transcription per day (like I am), optimizing your STT setup is worth it:
                - Faster models = less waiting
                - More accurate models = less editing
                - Finding the sweet spot = better workflow

                ### Next Steps

                For a more robust evaluation, I'd want to:
                - Test on multiple audio samples
                - Include different speaking styles (casual, technical, professional)
                - Test on different microphones
                - Evaluate punctuation and capitalization accuracy
                - Compare ASR (Automatic Speech Recognition) vs traditional STT
                - Test GPU inference on NVIDIA hardware

                ### Repository

                Full benchmark code and results:
                [github.com/danielrosehill/Local-ASR-STT-Benchmark](https://github.com/danielrosehill/Local-ASR-STT-Benchmark)

                ### License

                MIT License - Feel free to use and adapt for your own benchmarks!

                ---

                *Built with Gradio • Whisper models by OpenAI • Hosted on Hugging Face Spaces*
                """
            )

    # Footer shown below the tabs on every page.
    gr.Markdown(
        """
        ---
        ### 📧 Questions or feedback?
        Visit the [GitHub repository](https://github.com/danielrosehill/Local-ASR-STT-Benchmark) to open an issue or contribute.
        """
    )

    # NOTE(review): the badge is fetched from "/file/badge.png"; whether that
    # route is served depends on the installed Gradio version and its
    # file-access settings -- confirm the badge renders in the deployed app.
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 20px;">
            <a href="https://danielrosehill.com" target="_blank">
                <img src="/file/badge.png" alt="Daniel Rosehill" style="width: 480px;">
            </a>
        </div>
        """
    )
|
|
# Start the Gradio server only when this file is run as a script; importing
# the module builds `demo` without launching it.
# NOTE(review): the footer requests "/file/badge.png" from the app itself;
# depending on the installed Gradio version, launch() may need an explicit
# allowed_paths entry for that file to be served -- confirm.
if __name__ == "__main__":
    demo.launch()
|
|