tugrulkaya committed
Commit c316c05 · verified · 1 Parent(s): 81d5dc1

initiate commit

Files changed (1)
  1. app.py +40 -275
app.py CHANGED
@@ -1,285 +1,50 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import time
 
-# Load models
-@gr.cache_examples
-def load_models():
-    """Load original and quantized models"""
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-
-    # Original model
-    original_model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    # Quantized model
-    quantized_model = torch.quantization.quantize_dynamic(
-        original_model,
-        {torch.nn.Linear},
-        dtype=torch.qint8
-    )
-
-    return original_model, quantized_model, tokenizer
-
-original_model, quantized_model, tokenizer = load_models()
 
-def predict_sentiment(text, use_quantized=True):
-    """
-    Predict sentiment using original or quantized model
-
-    Args:
-        text: Input text to analyze
-        use_quantized: Use quantized model if True, original if False
-
-    Returns:
-        tuple: (label, confidence, inference_time, model_info)
-    """
-    model = quantized_model if use_quantized else original_model
-
-    # Tokenize
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
 
-    # Measure inference time
-    start_time = time.time()
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    inference_time = (time.time() - start_time) * 1000  # ms
-
-    # Get prediction
-    probs = torch.softmax(outputs.logits, dim=-1)
-    confidence, predicted = torch.max(probs, dim=-1)
-
-    label = "😊 POSITIVE" if predicted.item() == 1 else "😞 NEGATIVE"
-    confidence_pct = confidence.item() * 100
-
-    # Model info
-    model_type = "Quantized INT8" if use_quantized else "Original FP32"
-    model_size = "~68 MB" if use_quantized else "~255 MB"
-
-    model_info = f"**Model:** {model_type}\n**Size:** {model_size}\n**Inference Time:** {inference_time:.2f} ms"
-
-    return label, f"{confidence_pct:.1f}%", model_info
-
-def compare_models(text):
-    """Compare original and quantized model predictions"""
-
-    # Original model
-    orig_label, orig_conf, orig_info = predict_sentiment(text, use_quantized=False)
-
-    # Quantized model
-    quant_label, quant_conf, quant_info = predict_sentiment(text, use_quantized=True)
-
-    # Create comparison
-    comparison = f"""
-    ## 🔍 Comparison Results
-
-    ### Original Model (FP32)
-    - **Prediction:** {orig_label}
-    - **Confidence:** {orig_conf}
-    - {orig_info}
-
-    ### Quantized Model (INT8)
-    - **Prediction:** {quant_label}
-    - **Confidence:** {quant_conf}
-    - {quant_info}
-
-    ### Summary
-    - **Size Reduction:** 3.75x smaller (255 MB → 68 MB)
-    - **Predictions Match:** {'✅ Yes' if orig_label == quant_label else '⚠️ Different'}
-    - **Speed:** ~2x faster on CPU
-    """
-
-    return comparison
-
-# Example texts
-examples = [
-    ["This movie is absolutely fantastic! Best film I've seen this year!"],
-    ["Terrible waste of time and money. Very disappointed."],
-    ["It was okay, nothing special but not bad either."],
-    ["Amazing product! Exceeded all my expectations!"],
-    ["Poor quality, not worth the price at all."],
-]
-
-# Create Gradio interface
-with gr.Blocks(theme=gr.themes.Soft(), title="Transformer Edge Optimization Demo") as demo:
-
-    gr.Markdown("""
-    # 🚀 Transformer Edge Optimization Demo
-
-    Compare **Original FP32** vs **Quantized INT8** models for sentiment analysis.
-
-    **Key Benefits:**
-    - ✅ **4x smaller** model size (255 MB → 68 MB)
-    - ✅ **2x faster** inference on CPU
-    - ✅ **Minimal accuracy loss** (~1-2%)
-
-    ---
-    """)
-
-    with gr.Tab("🎯 Quick Prediction"):
-        with gr.Row():
-            with gr.Column():
-                text_input = gr.Textbox(
-                    label="Enter text to analyze",
-                    placeholder="Type your text here...",
-                    lines=3
-                )
-                use_quant = gr.Checkbox(
-                    label="Use Quantized Model (INT8)",
-                    value=True,
-                    info="Uncheck to use Original FP32 model"
-                )
-                predict_btn = gr.Button("🔮 Predict Sentiment", variant="primary")
-
-            with gr.Column():
-                label_output = gr.Textbox(label="Prediction", interactive=False)
-                confidence_output = gr.Textbox(label="Confidence", interactive=False)
-                info_output = gr.Markdown(label="Model Info")
-
-        predict_btn.click(
-            fn=predict_sentiment,
-            inputs=[text_input, use_quant],
-            outputs=[label_output, confidence_output, info_output]
-        )
-
-        gr.Examples(
-            examples=examples,
-            inputs=text_input,
-            label="Try these examples:"
         )
 
-    with gr.Tab("⚖️ Model Comparison"):
-        gr.Markdown("""
-        Compare predictions from **Original** and **Quantized** models side by side.
-        """)
-
-        compare_text = gr.Textbox(
-            label="Enter text to compare",
-            placeholder="Type your text here...",
-            lines=3
-        )
-        compare_btn = gr.Button("🔍 Compare Models", variant="primary")
-        comparison_output = gr.Markdown(label="Comparison Results")
-
-        compare_btn.click(
-            fn=compare_models,
-            inputs=compare_text,
-            outputs=comparison_output
-        )
-
-        gr.Examples(
-            examples=examples,
-            inputs=compare_text,
-            label="Try these examples:"
-        )
-
-    with gr.Tab("📚 Documentation"):
-        gr.Markdown("""
-        ## 🎯 What is Quantization?
-
-        **Quantization** reduces model size by converting weights from 32-bit floating point (FP32) to 8-bit integers (INT8).
-
-        ### Benefits:
-        - **4x smaller** model size
-        - **2-3x faster** inference
-        - **Minimal accuracy loss** (~1-2%)
-        - **Better for mobile/edge** devices
-
-        ### Techniques Used:
-        1. **Dynamic Quantization** - Weights quantized, activations computed at runtime
-        2. **Post-Training Quantization** - No retraining needed
-        3. **PyTorch Native** - Built-in PyTorch support
-
-        ---
-
-        ## 📊 Benchmark Results
-
-        | Metric | Original (FP32) | Quantized (INT8) | Improvement |
-        |--------|----------------|------------------|-------------|
-        | **Model Size** | 255 MB | 68 MB | **3.75x smaller** |
-        | **Inference Time** | 12.3 ms | 5.8 ms | **2.1x faster** |
-        | **Accuracy (SST-2)** | 91.8% | 90.2% | -1.6% |
-        | **Memory Usage** | 280 MB | 95 MB | **2.9x less** |
-
-        ---
-
-        ## 🚀 Try it Yourself!
-
-        ### Google Colab Notebooks:
-
-        1. **Quantization Basics** (15 min)
-        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mtkaya/transformer-edge-optimization/blob/main/notebooks/01_quantization_basics.ipynb)
-
-        2. **ONNX Runtime** (20 min)
-        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mtkaya/transformer-edge-optimization/blob/main/notebooks/02_huggingface_optimum.ipynb)
-
-        3. **Knowledge Distillation** (30 min)
-        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mtkaya/transformer-edge-optimization/blob/main/notebooks/05_distilbert_training.ipynb)
-
-        ---
-
-        ## 💻 Quick Start Code
-
-        ```python
-        import torch
-        from transformers import AutoModelForSequenceClassification
-
-        # Load model
-        model = AutoModelForSequenceClassification.from_pretrained(
-            "distilbert-base-uncased-finetuned-sst-2-english"
-        )
-
-        # Quantize (FP32 → INT8)
-        quantized_model = torch.quantization.quantize_dynamic(
-            model, {torch.nn.Linear}, dtype=torch.qint8
-        )
-
-        # Model is now 4x smaller! 🎉
-        ```
-
-        ---
-
-        ## 🔗 Resources
-
-        - **GitHub Repository:** [mtkaya/transformer-edge-optimization](https://github.com/mtkaya/transformer-edge-optimization)
-        - **Documentation:** [Full Guide](https://github.com/mtkaya/transformer-edge-optimization#readme)
-        - **Hugging Face:** [Model Card](https://huggingface.co/spaces/mtkaya/transformer-edge-optimization)
-
-        ---
-
-        ## 📧 Contact
-
-        - **Issues:** [Report a bug](https://github.com/mtkaya/transformer-edge-optimization/issues)
-        - **Discussions:** [Ask questions](https://github.com/mtkaya/transformer-edge-optimization/discussions)
-
-        ---
-
-        <div align="center">
-
-        **Made with ❤️ for the AI community**
-
-        ⭐ Star on [GitHub](https://github.com/mtkaya/transformer-edge-optimization) if you find this useful!
-
-        </div>
-        """)
-
-    gr.Markdown("""
-    ---
-
-    <div align="center">
-
-    **🚀 Transformer Edge Optimization Toolkit**
-
-    [GitHub](https://github.com/mtkaya/transformer-edge-optimization) •
-    [Documentation](https://github.com/mtkaya/transformer-edge-optimization#readme) •
-    [Notebooks](https://github.com/mtkaya/transformer-edge-optimization/tree/main/notebooks)
-
-    </div>
-    """)
 
-# Launch
 if __name__ == "__main__":
-    demo.launch()
 
 import gradio as gr
 import time
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import bitsandbytes as bnb  # for quantization
 
+# Load the model
+model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
 
+def classify_with_quantization(text, use_quantization=False):
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
 
+    if use_quantization:
+        # Apply 8-bit quantization
+        model_quantized = AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            load_in_8bit=True,
+            device_map="auto"
         )
+        model_to_use = model_quantized
+    else:
+        model_to_use = model
 
+    start_time = time.time()
+    with torch.no_grad():
+        outputs = model_to_use(**inputs)
+    inference_time = time.time() - start_time
+
+    logits = outputs.logits
+    predicted_class = logits.argmax().item()
+    label = "POSITIVE" if predicted_class == 1 else "NEGATIVE"
+
+    return f"Label: {label}\nInference Time: {inference_time:.4f}s"
+
+# Gradio interface
+demo = gr.Interface(
+    fn=classify_with_quantization,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter text for sentiment analysis..."),
+        gr.Checkbox(label="Use 8-bit Quantization", value=False)
+    ],
+    outputs=gr.Textbox(),
+    title="Transformer Model Optimization Demo",
+    description="Test quantization on DistilBERT for faster edge inference. Toggle quantization to see speed gains."
+)
 
 if __name__ == "__main__":
+    demo.launch()
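
Note on the 8-bit path added by this commit: `load_in_8bit=True` goes through bitsandbytes, which generally requires a CUDA GPU, and the `from_pretrained()` call sits inside `classify_with_quantization()`, so the quantized model is re-instantiated on every request. Below is a minimal sketch, not part of the commit, of one way to cache the model at startup and fall back to the dynamic INT8 quantization used by the previous app.py on CPU-only hardware. It assumes a transformers version that provides `BitsAndBytesConfig` and, for the `device_map="auto"` path, an installed accelerate.

```python
# Sketch only (not part of this commit): build the models once at startup and reuse them.
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

if torch.cuda.is_available():
    # bitsandbytes 8-bit weights need a CUDA device; load the quantized model one time here.
    quantized_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )
else:
    # CPU-only fallback: dynamic INT8 quantization of the Linear layers,
    # the approach used by the previous version of app.py.
    quantized_model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
quantized_model.eval()


def classify_with_quantization(text, use_quantization=False):
    # Pick a cached model instead of calling from_pretrained() per request.
    model_to_use = quantized_model if use_quantization else model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model_to_use.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model_to_use(**inputs)
    predicted_class = outputs.logits.argmax().item()
    return "POSITIVE" if predicted_class == 1 else "NEGATIVE"
```

With the models built once at import time, the Gradio handler only chooses between the two cached objects, so the per-request cost is just tokenization and the forward pass.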