Reality8081 committed on
Commit
b367bb7
·
1 Parent(s): 374433d
Files changed (2) hide show
  1. app.py +244 -0
  2. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import json
5
+ import io
6
+ import os
7
+ import zipfile
8
+ import tempfile
9
+ # =========================
10
+ # GLOBAL STATE (in-memory)
11
+ # =========================
12
# Parameters learned by the pipeline steps during a "Training" run (for example
# the "profile", "impute", "outliers" and "encoding" entries) so that a later
# "Inference" run can reuse them. Held only in process memory.
STATE: dict = {}
13
+
14
+ # =========================
15
+ # UTILS
16
+ # =========================
17
def read_file(file):
    """Load an uploaded tabular file into a pandas DataFrame.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object that exposes the path through a ``name`` attribute.

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        ValueError: If the file extension is neither ``.csv`` nor ``.parquet``.
    """
    # Plain paths are used directly; ``pathlib.Path.name`` is only the
    # basename, so path-like inputs must not go through the ``.name`` branch.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Compare case-insensitively so "DATA.CSV" is treated like "data.csv".
    suffix = os.path.splitext(path)[1].lower()

    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix == ".parquet":
        return pd.read_parquet(path)
    raise ValueError("Unsupported format")
24
+
25
+
26
+ # =========================
27
+ # COMPONENT 1: PROFILING
28
+ # =========================
29
def profile_data(df, training=True):
    """Build a small summary of a DataFrame.

    Args:
        df: The DataFrame to describe.
        training: When true, the summary is also stored in ``STATE["profile"]``.

    Returns:
        dict: ``shape`` (rows, columns), ``missing_ratio`` (column -> fraction
        of missing values), ``numerical`` (names of numeric columns) and
        ``categorical`` (names of all remaining columns).
    """
    numeric_names = df.select_dtypes(include=np.number).columns.tolist()
    other_names = df.select_dtypes(exclude=np.number).columns.tolist()

    summary = {
        "shape": df.shape,
        "missing_ratio": df.isna().mean().to_dict(),
        "numerical": numeric_names,
        "categorical": other_names,
    }

    # Only a training run records the profile in the shared state.
    if training:
        STATE["profile"] = summary

    return summary
45
+
46
+
47
+ # =========================
48
+ # COMPONENT 2: OUTLIER + IMPUTATION
49
+ # =========================
50
def handle_outliers_impute(df, training=True):
    """Drop mostly-empty columns, clip numeric outliers and fill missing values.

    In training mode the parameters are learned from ``df`` and stored in
    ``STATE`` under ``"dropped"``, ``"outliers"`` and ``"impute"``. In
    inference mode the stored parameters are applied unchanged, so the output
    schema matches the one produced during training.

    Args:
        df: Input DataFrame. It is copied; the caller's object is not modified.
        training: ``True`` to learn parameters, ``False`` to reuse the ones in
            ``STATE``.

    Returns:
        tuple: ``(cleaned_df, dropped_cols, impute_values)`` where
        ``dropped_cols`` is the list of removed column names and
        ``impute_values`` maps each remaining column to its fill value.

    Raises:
        KeyError: In inference mode, if ``STATE`` holds no learned parameters
            for a column present in ``df``.
    """
    df = df.copy()

    if training:
        # Columns with more than 90% missing values carry too little signal.
        missing = df.isna().mean()
        dropped_cols = missing[missing > 0.9].index.tolist()
    else:
        # Reuse the training decision; deciding again from the inference batch
        # would make the set of output columns depend on that batch.
        dropped_cols = [c for c in STATE.get("dropped", []) if c in df.columns]

    df = df.drop(columns=dropped_cols)

    impute_values = {}
    outlier_bounds = {}

    for col in df.select_dtypes(include=np.number).columns:
        if training:
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            # Plain floats keep the stored state JSON-serialisable.
            lower = float(q1 - 1.5 * iqr)
            upper = float(q3 + 1.5 * iqr)
            outlier_bounds[col] = (lower, upper)
        else:
            lower, upper = STATE["outliers"][col]

        df[col] = np.clip(df[col], lower, upper)

        if training:
            # Median is taken after clipping, as the fill value.
            impute_values[col] = float(df[col].median())
        else:
            impute_values[col] = STATE["impute"][col]

        # Assign the result back: ``df[col].fillna(..., inplace=True)`` acts on
        # a temporary object and does not update ``df`` under Copy-on-Write.
        df[col] = df[col].fillna(impute_values[col])

    for col in df.select_dtypes(exclude=np.number).columns:
        if training:
            modes = df[col].mode()
            if modes.empty:
                # No observed value at all (e.g. a frame with zero rows):
                # there is nothing to learn and nothing to fill.
                continue
            fill = modes.iloc[0]
            # Unwrap NumPy scalars (e.g. ``np.bool_``) to built-in types.
            impute_values[col] = fill.item() if isinstance(fill, np.generic) else fill
        else:
            impute_values[col] = STATE["impute"][col]

        df[col] = df[col].fillna(impute_values[col])

    if training:
        STATE["impute"] = impute_values
        STATE["outliers"] = outlier_bounds
        STATE["dropped"] = list(dropped_cols)

    return df, dropped_cols, impute_values
98
+
99
+
100
+ # =========================
101
+ # COMPONENT 3: ENCODING
102
+ # =========================
103
def encode_data(df, training=True):
    """One-hot encode every non-numeric column of a DataFrame.

    In training mode the distinct values of each non-numeric column are
    recorded in ``STATE["encoding"]``; in inference mode the recorded values
    are used, so the same indicator columns are produced.

    Args:
        df: Input DataFrame. It is copied; the caller's object is not modified.
        training: ``True`` to learn the categories, ``False`` to reuse them.

    Returns:
        tuple: ``(encoded_df, new_cols)`` where ``new_cols`` lists the names of
        the indicator columns (``"<column>_<value>"``) in creation order.
    """
    encoded = df.copy()
    created = []

    if training:
        STATE["encoding"] = {}

    # The set of columns to encode is fixed before any indicator is added.
    categorical = encoded.select_dtypes(exclude=np.number).columns

    for name in categorical:
        if not training:
            categories = STATE["encoding"][name]
        else:
            categories = encoded[name].unique().tolist()
            STATE["encoding"][name] = categories

        source = encoded[name]
        for category in categories:
            indicator = f"{name}_{category}"
            encoded[indicator] = (source == category).astype(int)
            created.append(indicator)

        # The original column is replaced by its indicators.
        encoded.drop(columns=[name], inplace=True)

    return encoded, created
125
+
126
+
127
+ # =========================
128
+ # COMPONENT 4: MEMORY OPT
129
+ # =========================
130
def optimize_memory(df):
    """Downcast 64-bit numeric columns and report the memory saved.

    ``int64`` columns are downcast to the smallest integer type and
    ``float64`` columns to the smallest float type that ``pandas.to_numeric``
    selects. The columns are replaced on ``df`` itself.

    Args:
        df: DataFrame to shrink (modified in place).

    Returns:
        tuple: ``(df, before, after, saved)`` where ``before`` and ``after``
        are deep memory usage in bytes and ``saved`` is the reduction as a
        percentage of ``before``.
    """
    bytes_before = df.memory_usage(deep=True).sum()

    # Integers first, then floats, each with its matching downcast target.
    for dtype_name, target in (("int64", "integer"), ("float64", "float")):
        for column in df.select_dtypes(include=[dtype_name]).columns:
            df[column] = pd.to_numeric(df[column], downcast=target)

    bytes_after = df.memory_usage(deep=True).sum()
    percent_saved = 100 * (bytes_before - bytes_after) / bytes_before

    return df, bytes_before, bytes_after, percent_saved
143
+
144
+
145
+ # =========================
146
+ # MAIN PIPELINE
147
+ # =========================
148
def _json_default(value):
    """Convert NumPy/pandas values that ``json`` cannot encode natively."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def run_pipeline(file, mode):
    """Run profiling, cleaning, encoding and memory optimisation on a file.

    Args:
        file: Uploaded file as accepted by ``read_file``, or ``None``.
        mode: ``"Training"`` learns and stores parameters in ``STATE``; any
            other value applies the previously stored parameters.

    Returns:
        tuple: ``(profile_json, preview, csv_text, zip_bytes, ram_message)``.
        On failure the first element is an error message and the remaining
        four are ``None``.
    """
    if file is None:
        return "Upload file first", None, None, None, None

    training = mode == "Training"

    # Cheap check first: there is no point parsing the upload when inference
    # cannot run anyway.
    if not training and "profile" not in STATE:
        return "ERROR: Run Training first!", None, None, None, None

    try:
        df = read_file(file)
    except ValueError as err:
        # Covers unsupported extensions as well as pandas parse errors, which
        # derive from ValueError. Reported the same way as the other failures.
        return f"ERROR: {err}", None, None, None, None

    # STEP 1
    profile = profile_data(df, training)

    # STEP 2
    df, dropped, impute = handle_outliers_impute(df, training)

    # STEP 3
    df, new_cols = encode_data(df, training)

    # STEP 4
    df, before, after, saved = optimize_memory(df)

    # SAVE OUTPUT
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zf:
        for k, v in STATE.items():
            # Learned statistics may be NumPy scalars; convert them on the fly.
            zf.writestr(
                f"{k}.json", json.dumps(v, indent=2, default=_json_default)
            )

    return (
        json.dumps(profile, indent=2, default=_json_default),
        df.head(),
        csv_buffer.getvalue(),
        zip_buffer.getvalue(),
        f"RAM saved: {saved:.2f}%"
    )
187
+
188
+
189
+ # =========================
190
+ # GRADIO UI
191
+ # =========================
192
with gr.Blocks() as app:

    gr.Markdown("# Auto Data Processor (MLOps Version)")

    # Inputs: the uploaded file and the run mode.
    with gr.Row():
        file = gr.File(label="Upload CSV/Parquet")
        mode = gr.Radio(["Training", "Inference"], value="Training")

    run_btn = gr.Button("Run Pipeline")

    # Outputs shown directly in the page.
    profile_out = gr.Textbox(label="Data Profiling", lines=15)
    df_out = gr.Dataframe()
    ram_out = gr.Textbox(label="Memory Optimization")

    # Outputs offered as downloads.
    csv_out = gr.File(label="Download Cleaned CSV")
    zip_out = gr.File(label="Download State ZIP")

    def wrapper(file, mode):
        """Run the pipeline and turn its in-memory results into file paths.

        Returns a 5-tuple in the order of the click handler's outputs:
        profile text, preview frame, memory message, CSV path, ZIP path.
        When the pipeline reports a failure, only the message is returned.
        """
        summary, preview, csv_text, zip_bytes, ram_note = run_pipeline(file, mode)

        # A missing preview is how run_pipeline signals that it did not run.
        if preview is None:
            return summary, None, None, None, None

        # Each run gets its own fresh directory for the two download files.
        out_dir = tempfile.mkdtemp()
        csv_path = os.path.join(out_dir, "cleaned.csv")
        zip_path = os.path.join(out_dir, "state.zip")

        with open(csv_path, "w", encoding="utf-8") as csv_file:
            csv_file.write(csv_text)

        with open(zip_path, "wb") as zip_file:
            zip_file.write(zip_bytes)

        # The file components receive paths on disk, not the raw content.
        return summary, preview, ram_note, csv_path, zip_path

    run_btn.click(
        wrapper,
        inputs=[file, mode],
        outputs=[profile_out, df_out, ram_out, csv_out, zip_out],
    )

if __name__ == "__main__":
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # torch --index-url https://download.pytorch.org/whl/cpu
2
+ gradio
3
+ huggingface_hub
4
+ numpy
5
+ pandas
6
+ plotly
7
+ scikit-learn
8
+ matplotlib
9
+ pyarrow