tugrulkaya committed
Commit c316c05 · verified · 1 Parent(s): 81d5dc1

initiate commit

Files changed (1)
  1. app.py +40 -275
app.py CHANGED
@@ -1,285 +1,50 @@
 import gradio as gr
-import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
 import time
 
-# Load models
-@gr.cache_examples
-def load_models():
-    """Load original and quantized models"""
-    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
-
-    # Original model
-    original_model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    # Quantized model
-    quantized_model = torch.quantization.quantize_dynamic(
-        original_model,
-        {torch.nn.Linear},
-        dtype=torch.qint8
-    )
-
-    return original_model, quantized_model, tokenizer
-
-original_model, quantized_model, tokenizer = load_models()
 
-def predict_sentiment(text, use_quantized=True):
-    """
-    Predict sentiment using original or quantized model
-
-    Args:
-        text: Input text to analyze
-        use_quantized: Use quantized model if True, original if False
-
-    Returns:
-        tuple: (label, confidence, inference_time, model_info)
-    """
-    model = quantized_model if use_quantized else original_model
-
-    # Tokenize
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
 
-    # Measure inference time
-    start_time = time.time()
-
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    inference_time = (time.time() - start_time) * 1000  # ms
-
-    # Get prediction
-    probs = torch.softmax(outputs.logits, dim=-1)
-    confidence, predicted = torch.max(probs, dim=-1)
-
-    label = "😊 POSITIVE" if predicted.item() == 1 else "😞 NEGATIVE"
-    confidence_pct = confidence.item() * 100
-
-    # Model info
-    model_type = "Quantized INT8" if use_quantized else "Original FP32"
-    model_size = "~68 MB" if use_quantized else "~255 MB"
-
-    model_info = f"**Model:** {model_type}\n**Size:** {model_size}\n**Inference Time:** {inference_time:.2f} ms"
-
-    return label, f"{confidence_pct:.1f}%", model_info
-
-def compare_models(text):
-    """Compare original and quantized model predictions"""
-
-    # Original model
-    orig_label, orig_conf, orig_info = predict_sentiment(text, use_quantized=False)
-
-    # Quantized model
-    quant_label, quant_conf, quant_info = predict_sentiment(text, use_quantized=True)
-
-    # Create comparison
-    comparison = f"""
-    ## 🔍 Comparison Results
-
-    ### Original Model (FP32)
-    - **Prediction:** {orig_label}
-    - **Confidence:** {orig_conf}
-    - {orig_info}
-
-    ### Quantized Model (INT8)
-    - **Prediction:** {quant_label}
-    - **Confidence:** {quant_conf}
-    - {quant_info}
-
-    ### Summary
-    - **Size Reduction:** 3.75x smaller (255 MB → 68 MB)
-    - **Predictions Match:** {'✅ Yes' if orig_label == quant_label else '⚠️ Different'}
-    - **Speed:** ~2x faster on CPU
-    """
-
-    return comparison
-
-# Example texts
-examples = [
-    ["This movie is absolutely fantastic! Best film I've seen this year!"],
-    ["Terrible waste of time and money. Very disappointed."],
-    ["It was okay, nothing special but not bad either."],
-    ["Amazing product! Exceeded all my expectations!"],
-    ["Poor quality, not worth the price at all."],
-]
-
-# Create Gradio interface
-with gr.Blocks(theme=gr.themes.Soft(), title="Transformer Edge Optimization Demo") as demo:
-
-    gr.Markdown("""
-    # 🚀 Transformer Edge Optimization Demo
-
-    Compare **Original FP32** vs **Quantized INT8** models for sentiment analysis.
-
-    **Key Benefits:**
-    - ✅ **4x smaller** model size (255 MB → 68 MB)
-    - ✅ **2x faster** inference on CPU
-    - ✅ **Minimal accuracy loss** (~1-2%)
-
-    ---
-    """)
-
-    with gr.Tab("🎯 Quick Prediction"):
-        with gr.Row():
-            with gr.Column():
-                text_input = gr.Textbox(
-                    label="Enter text to analyze",
-                    placeholder="Type your text here...",
-                    lines=3
-                )
-                use_quant = gr.Checkbox(
-                    label="Use Quantized Model (INT8)",
-                    value=True,
-                    info="Uncheck to use Original FP32 model"
-                )
-                predict_btn = gr.Button("🔮 Predict Sentiment", variant="primary")
-
-            with gr.Column():
-                label_output = gr.Textbox(label="Prediction", interactive=False)
-                confidence_output = gr.Textbox(label="Confidence", interactive=False)
-                info_output = gr.Markdown(label="Model Info")
-
-        predict_btn.click(
-            fn=predict_sentiment,
-            inputs=[text_input, use_quant],
-            outputs=[label_output, confidence_output, info_output]
-        )
-
-        gr.Examples(
-            examples=examples,
-            inputs=text_input,
-            label="Try these examples:"
         )
 
-    with gr.Tab("⚖️ Model Comparison"):
-        gr.Markdown("""
-        Compare predictions from **Original** and **Quantized** models side by side.
-        """)
-
-        compare_text = gr.Textbox(
-            label="Enter text to compare",
-            placeholder="Type your text here...",
-            lines=3
-        )
-        compare_btn = gr.Button("🔍 Compare Models", variant="primary")
-        comparison_output = gr.Markdown(label="Comparison Results")
-
-        compare_btn.click(
-            fn=compare_models,
-            inputs=compare_text,
-            outputs=comparison_output
-        )
-
-        gr.Examples(
-            examples=examples,
-            inputs=compare_text,
-            label="Try these examples:"
-        )
-
-    with gr.Tab("📚 Documentation"):
-        gr.Markdown("""
-        ## 🎯 What is Quantization?
-
-        **Quantization** reduces model size by converting weights from 32-bit floating point (FP32) to 8-bit integers (INT8).
-
-        ### Benefits:
-        - **4x smaller** model size
-        - **2-3x faster** inference
-        - **Minimal accuracy loss** (~1-2%)
-        - **Better for mobile/edge** devices
-
-        ### Techniques Used:
-        1. **Dynamic Quantization** - Weights quantized, activations computed at runtime
-        2. **Post-Training Quantization** - No retraining needed
-        3. **PyTorch Native** - Built-in PyTorch support
-
-        ---
-
-        ## 📊 Benchmark Results
-
-        | Metric | Original (FP32) | Quantized (INT8) | Improvement |
-        |--------|----------------|------------------|-------------|
-        | **Model Size** | 255 MB | 68 MB | **3.75x smaller** |
-        | **Inference Time** | 12.3 ms | 5.8 ms | **2.1x faster** |
-        | **Accuracy (SST-2)** | 91.8% | 90.2% | -1.6% |
-        | **Memory Usage** | 280 MB | 95 MB | **2.9x less** |
-
-        ---
-
-        ## 🚀 Try it Yourself!
-
-        ### Google Colab Notebooks:
-
-        1. **Quantization Basics** (15 min)
-        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mtkaya/transformer-edge-optimization/blob/main/notebooks/01_quantization_basics.ipynb)
-
-        2. **ONNX Runtime** (20 min)
-        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mtkaya/transformer-edge-optimization/blob/main/notebooks/02_huggingface_optimum.ipynb)
-
-        3. **Knowledge Distillation** (30 min)
-        [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mtkaya/transformer-edge-optimization/blob/main/notebooks/05_distilbert_training.ipynb)
-
-        ---
-
-        ## 💻 Quick Start Code
-
-        ```python
-        import torch
-        from transformers import AutoModelForSequenceClassification
-
-        # Load model
-        model = AutoModelForSequenceClassification.from_pretrained(
-            "distilbert-base-uncased-finetuned-sst-2-english"
-        )
-
-        # Quantize (FP32 → INT8)
-        quantized_model = torch.quantization.quantize_dynamic(
-            model, {torch.nn.Linear}, dtype=torch.qint8
-        )
-
-        # Model is now 4x smaller! 🎉
-        ```
-
-        ---
-
-        ## 🔗 Resources
-
-        - **GitHub Repository:** [mtkaya/transformer-edge-optimization](https://github.com/mtkaya/transformer-edge-optimization)
-        - **Documentation:** [Full Guide](https://github.com/mtkaya/transformer-edge-optimization#readme)
-        - **Hugging Face:** [Model Card](https://huggingface.co/spaces/mtkaya/transformer-edge-optimization)
-
-        ---
-
-        ## 📧 Contact
-
-        - **Issues:** [Report a bug](https://github.com/mtkaya/transformer-edge-optimization/issues)
-        - **Discussions:** [Ask questions](https://github.com/mtkaya/transformer-edge-optimization/discussions)
-
-        ---
-
-        <div align="center">
-
-        **Made with ❤️ for the AI community**
-
-        ⭐ Star on [GitHub](https://github.com/mtkaya/transformer-edge-optimization) if you find this useful!
-
-        </div>
-        """)
-
-    gr.Markdown("""
-    ---
-
-    <div align="center">
-
-    **🚀 Transformer Edge Optimization Toolkit**
-
-    [GitHub](https://github.com/mtkaya/transformer-edge-optimization) •
-    [Documentation](https://github.com/mtkaya/transformer-edge-optimization#readme) •
-    [Notebooks](https://github.com/mtkaya/transformer-edge-optimization/tree/main/notebooks)
-
-    </div>
-    """)
 
-# Launch
 if __name__ == "__main__":
-    demo.launch()
 
 import gradio as gr
 import time
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import torch
+import bitsandbytes as bnb  # for quantization
 
+# Load the model
+model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
 
+def classify_with_quantization(text, use_quantization=False):
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
 
+    if use_quantization:
+        # Apply 8-bit quantization
+        model_quantized = AutoModelForSequenceClassification.from_pretrained(
+            model_name,
+            load_in_8bit=True,
+            device_map="auto"
         )
+        model_to_use = model_quantized
+    else:
+        model_to_use = model
 
+    start_time = time.time()
+    with torch.no_grad():
+        outputs = model_to_use(**inputs)
+    inference_time = time.time() - start_time
+
+    logits = outputs.logits
+    predicted_class = logits.argmax().item()
+    label = "POSITIVE" if predicted_class == 1 else "NEGATIVE"
+
+    return f"Label: {label}\nInference Time: {inference_time:.4f}s"
+
+# Gradio interface
+demo = gr.Interface(
+    fn=classify_with_quantization,
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter text for sentiment analysis..."),
+        gr.Checkbox(label="Use 8-bit Quantization", value=False)
+    ],
+    outputs=gr.Textbox(),
+    title="Transformer Model Optimization Demo",
+    description="Test quantization on DistilBERT for faster edge inference. Toggle quantization to see speed gains."
+)
 
 if __name__ == "__main__":
+    demo.launch()
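
Note on the 8-bit path added by this commit: `load_in_8bit=True` goes through bitsandbytes, which generally requires a CUDA GPU, and the `from_pretrained()` call sits inside `classify_with_quantization()`, so the quantized model is re-instantiated on every request. Below is a minimal sketch, not part of the commit, of one way to cache the model at startup and fall back to the dynamic INT8 quantization used by the previous app.py on CPU-only hardware. It assumes a transformers version that provides `BitsAndBytesConfig` and, for the `device_map="auto"` path, an installed accelerate.

```python
# Sketch only (not part of this commit): build the models once at startup and reuse them.
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()

if torch.cuda.is_available():
    # bitsandbytes 8-bit weights need a CUDA device; load the quantized model one time here.
    quantized_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
    )
else:
    # CPU-only fallback: dynamic INT8 quantization of the Linear layers,
    # the approach used by the previous version of app.py.
    quantized_model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )
quantized_model.eval()


def classify_with_quantization(text, use_quantization=False):
    # Pick a cached model instead of calling from_pretrained() per request.
    model_to_use = quantized_model if use_quantization else model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model_to_use.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model_to_use(**inputs)
    predicted_class = outputs.logits.argmax().item()
    return "POSITIVE" if predicted_class == 1 else "NEGATIVE"
```

With the models built once at import time, the Gradio handler only chooses between the two cached objects, so the per-request cost is just tokenization and the forward pass.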