DrMostafa commited on
Commit
219c63b
·
verified ·
1 Parent(s): f9417e2

Upload 5 files

Browse files
Files changed (5) hide show
  1. README.md +13 -16
  2. app.py +376 -0
  3. packages.txt +1 -0
  4. requirements.txt +5 -3
  5. runtime.txt +1 -0
README.md CHANGED
@@ -1,19 +1,16 @@
1
- ---
2
- title: Process Mining
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Process_Inteligence
12
- ---
13
 
14
- # Welcome to Streamlit!
 
 
 
 
 
 
15
 
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
 
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
1
+ # Mini Process Miner (Streamlit + PM4Py)
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ A lightweight **process mining** app built with **Streamlit**, **PM4Py**, and **pandas**. 100% vibe coded with ChatGPT.
4
+ Upload an event log (CSV) and explore:
5
+ - Process map (clean, frequency, performance)
6
+ - DFG with counts & durations
7
+ - Filters for activities, optional columns (`column1/2/3`)
8
+ - Case-level & event-level exclusions
9
+ - Sliders for activity/connection frequency
10
 
11
+ ## Quick start
12
 
13
+ ### 1) Local (pip)
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ streamlit run app.py
app.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import shutil
# Explicit submodule import: `import importlib` alone does not guarantee
# that the `importlib.util` attribute exists (it may only appear when some
# other module happens to import it first).
import importlib.util
import pandas as pd

# ----------------------------
# Config
# ----------------------------
st.set_page_config(page_title="Mini Process Miner", layout="wide")
DEBUG = True  # set to False to hide the env checks from users

# Optional: quick environment/dependency check
if DEBUG:
    st.write("Python OK. Checking deps…")
    # find_spec reports whether a module is importable without importing it
    st.write("pm4py import:", bool(importlib.util.find_spec("pm4py")))
    st.write("graphviz (pip) import:", bool(importlib.util.find_spec("graphviz")))
    # The graphviz pip package still needs the `dot` binary on PATH to render
    st.write("dot in PATH:", shutil.which("dot"))

# ----------------------------
# Page setup
# ----------------------------
st.title("Mini Process Miner (vibe-coded)")

# Uploader with clear instructions
uploaded = st.file_uploader(
    "Upload your event log (CSV)",
    type=["csv"],
    help="Use EXACT headers (lowercase): required → case_id, activity, timestamp; optional → column1, column2, column3."
)

st.caption(
    "**Required columns:** case_id, activity, timestamp • "
    "**Optional:** column1, column2, column3 (e.g., resource, team, location) • "
    "Need a sample dataset? [Download a test CSV here](https://drive.google.com/drive/folders/1q0iqn5_FFz4EttLDl0zR09RQ3z4JsdDR) • "
    "**Disclaimer:** This demo tool offers no guarantees regarding data security or accuracy; use at your own risk. • "
    "Created by Dennis Arrindell, powered by [PM4Py](https://pm4py.fit.fraunhofer.de/), and 100% vibe-coded with ChatGPT."
)
38
+
39
+
40
+ # ----------------------------
41
+ # Helpers
42
+ # ----------------------------
43
def ensure_parsed(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy with normalized column names and a parsed timestamp.

    Column names are stripped and lowercased; the ``timestamp`` column is
    coerced to datetimes, and rows whose timestamp cannot be parsed are
    dropped.
    """
    out = df.copy()
    out.columns = pd.Index(name.strip().lower() for name in out.columns)
    out["timestamp"] = pd.to_datetime(out["timestamp"], errors="coerce")
    return out.dropna(subset=["timestamp"])
50
+
51
def compute_ordered(df: pd.DataFrame) -> pd.DataFrame:
    """Order events chronologically within each case (case_id, then timestamp)."""
    sort_keys = ["case_id", "timestamp"]
    return df.sort_values(by=sort_keys)
53
+
54
def apply_case_level_exclusion(df: pd.DataFrame, activities_to_drop: list) -> pd.DataFrame:
    """Drop every case that contains at least one of the given activities.

    With an empty selection the frame is returned untouched.
    """
    if not activities_to_drop:
        return df
    forbidden = set(activities_to_drop)
    tainted_cases = set(df.loc[df["activity"].isin(forbidden), "case_id"])
    return df[~df["case_id"].isin(tainted_cases)].copy()
60
+
61
def apply_event_level_exclusion(df: pd.DataFrame, activities_to_remove: list) -> pd.DataFrame:
    """Remove only the matching activity events; keep the rest of each case.

    Cases whose every event is removed disappear naturally because no rows
    remain for them. (The previous ``value_counts``-based re-filter was a
    no-op: ``value_counts`` never reports zero-count cases, so every case
    left in ``out`` was always kept.)
    """
    if not activities_to_remove:
        return df
    return df.loc[~df["activity"].isin(activities_to_remove)].copy()
69
+
70
def apply_activity_threshold(df: pd.DataFrame, min_freq: int) -> pd.DataFrame:
    """Keep only events whose activity occurs at least ``min_freq`` times overall.

    A threshold of 1 (or an empty frame) is a no-op.
    """
    if df.empty or min_freq <= 1:
        return df
    # Per-row total frequency of that row's activity
    freq = df.groupby("activity")["activity"].transform("size")
    return df.loc[freq >= min_freq].copy()
77
+
78
def build_edges(ordered_df: pd.DataFrame) -> pd.DataFrame:
    """Build directly-follows edges ("A → B") with transition counts.

    Expects a frame already ordered by (case_id, timestamp); the successor
    of each event is the next event of the same case. Returns a frame with
    columns ``edge`` and ``count`` (empty when there are no transitions).
    """
    empty = pd.DataFrame(columns=["edge", "count"])
    if ordered_df.empty:
        return empty
    tmp = ordered_df.copy()
    tmp["next_activity"] = tmp.groupby("case_id")["activity"].shift(-1)
    pairs = tmp.dropna(subset=["next_activity"])[["activity", "next_activity"]]
    if pairs.empty:
        return empty
    # astype(str) so numeric activity codes concatenate instead of raising
    labels = pairs["activity"].astype(str) + " → " + pairs["next_activity"].astype(str)
    return labels.value_counts().rename_axis("edge").reset_index(name="count")
90
+
91
def apply_optional_column_includes(df: pd.DataFrame, colname: str, selected: list) -> pd.DataFrame:
    """Restrict rows to those whose ``colname`` value is among the selections.

    No-op when the column is absent or nothing was selected. Values are
    compared as strings on both sides.
    """
    if not selected or colname not in df.columns:
        return df
    wanted = {str(value) for value in selected}
    return df[df[colname].astype(str).isin(wanted)]
96
+
97
+ # ----------------------------
98
+ # Main
99
+ # ----------------------------
100
# Main flow: runs only once a CSV is uploaded. Order matters — filters are
# collected from the sidebar first, then applied, then sliders are built
# from the already-filtered data so their max values stay meaningful.
if uploaded:
    raw_df = pd.read_csv(uploaded)

    # Validate columns early (we normalize to lowercase)
    required = {"case_id", "activity", "timestamp"}
    if not required.issubset(set([c.strip().lower() for c in raw_df.columns])):
        st.error("CSV must include required columns: case_id, activity, timestamp. Optional: column1, column2, column3.")
        st.stop()

    df = ensure_parsed(raw_df)

    # ----------------------------
    # Sidebar filters (case/event + optional column1/2/3) FIRST
    # ----------------------------
    st.sidebar.header("Filters")

    # Optional extra columns (exact names after normalization): column1, column2, column3
    extra_cols_present = [c for c in ["column1", "column2", "column3"] if c in df.columns]

    # Case-level exclusion
    # astype(str) so mixed/numeric activity values sort without raising
    all_activities = sorted(df["activity"].astype(str).unique().tolist())
    case_exclude = st.sidebar.multiselect(
        "Remove all CASES containing these activities",
        options=all_activities,
        help="If a case contains one of these activities, the entire case is removed."
    )

    # Event-level exclusion
    event_exclude = st.sidebar.multiselect(
        "Remove only EVENTS with these activities (keep cases)",
        options=all_activities,
        help="Events with these activities are dropped, but the case remains if other events exist."
    )

    # Optional include filters for extra columns (empty selection = include all)
    if extra_cols_present:
        st.sidebar.markdown("---")
        st.sidebar.subheader("Optional column filters")
        selections = {}
        for col in extra_cols_present:
            options = sorted(df[col].dropna().astype(str).unique().tolist())
            selections[col] = st.sidebar.multiselect(
                f"Include only {col} values",
                options=options,
                help=f"Leave empty to include all {col} values."
            )
    else:
        selections = {}

    # Apply case/event filters (case-level first, then event-level)
    df_filt = apply_case_level_exclusion(df, case_exclude)
    df_filt = apply_event_level_exclusion(df_filt, event_exclude)

    # Apply optional column includes
    for col, sel in selections.items():
        df_filt = apply_optional_column_includes(df_filt, col, sel)

    if df_filt.empty:
        st.warning("All data filtered out. Adjust filters to see results.")
        st.stop()

    ordered = compute_ordered(df_filt)

    # ----------------------------
    # Sidebar sliders (activity & connection thresholds)
    # ----------------------------
    # Slider max is derived from the filtered data so it never exceeds reality
    act_counts_for_slider = ordered["activity"].value_counts()
    max_act_allowed = int(act_counts_for_slider.max()) if not act_counts_for_slider.empty else 1
    if max_act_allowed < 1:
        max_act_allowed = 1

    apply_act_thresh_to_model = st.sidebar.checkbox(
        "Apply activity frequency threshold to the model",
        value=True,
        help="If enabled, activities below the threshold are removed before discovery/visualization."
    )
    min_act = st.sidebar.slider(
        "Min activity frequency to KEEP",
        min_value=1, max_value=max_act_allowed, value=1,
        help="Drops activities whose total frequency is below this value (if enabled above)."
    )

    # Create df_model after activity slider decision
    if apply_act_thresh_to_model:
        df_model = apply_activity_threshold(ordered, min_act)
    else:
        df_model = ordered

    # Re-sort: removing events can change which events are adjacent per case
    df_model = compute_ordered(df_model)
    if df_model.empty:
        st.warning("All events dropped by the activity frequency threshold. Lower the threshold.")
        st.stop()

    # Connection frequency slider (visual-only — the model itself is untouched)
    edge_counts_for_slider = build_edges(df_model)
    max_edge_allowed = int(edge_counts_for_slider["count"].max()) if not edge_counts_for_slider.empty else 1
    if max_edge_allowed < 1:
        max_edge_allowed = 1
    min_edge = st.sidebar.slider(
        "Min connection frequency to SHOW",
        min_value=1, max_value=max_edge_allowed, value=1,
        help="Hides low-frequency connections in the Connections/DFG views (visual-only)."
    )

    st.sidebar.markdown("---")
    st.sidebar.caption("Activity threshold may modify the model; connection threshold only affects visuals.")

    # ----------------------------
    # Metrics
    # ----------------------------
    total_cases = df_model["case_id"].nunique()
    total_events = len(df_model)
    unique_acts = df_model["activity"].nunique()
    c1, c2, c3 = st.columns(3)
    c1.metric("Total cases", total_cases)
    c2.metric("Total events", total_events)
    c3.metric("Unique activities", unique_acts)

    # ----------------------------
    # Activity frequency (reflects min_act)
    # ----------------------------
    st.subheader("Activity frequency")
    act_counts = df_model["activity"].value_counts().rename_axis("activity").reset_index(name="count")
    # Table reflects the threshold even when it was not applied to the model
    st.dataframe(act_counts[act_counts["count"] >= min_act], use_container_width=True)
    st.bar_chart(act_counts.set_index("activity")["count"])

    # ----------------------------
    # Variants (quick & dirty)
    # ----------------------------
    try:
        # One trace string per case; identical traces collapse into a variant count
        variants = (
            df_model.groupby("case_id")["activity"]
            .apply(lambda s: " → ".join(s))
            .value_counts()
        )
        st.subheader("Top variants (quick & dirty)")
        # NOTE(review): on pandas 2.x reset_index() names the variant column
        # after the source series ("activity"), so the {"index": "variant"}
        # rename appears to be a no-op — confirm against the pinned pandas.
        st.dataframe(
            variants.rename("count").reset_index().rename(columns={"index": "variant"}).head(20),
            use_container_width=True
        )
    except Exception:
        st.info("Could not compute variants; check your timestamp and activity values.")

    # ----------------------------
    # Connections (transitions) — respects min_edge (visual-only)
    # ----------------------------
    st.subheader("Connections (transitions)")
    edge_counts = build_edges(df_model)
    if edge_counts.empty:
        st.info("No transitions found after current filters.")
    else:
        st.dataframe(edge_counts[edge_counts["count"] >= min_edge], use_container_width=True)

    # ----------------------------
    # PM4Py visualizations (clean, frequency, performance, DFG)
    # ----------------------------
    st.subheader("Discovered Process Map")
    try:
        # Lazy imports so app still loads without pm4py
        from pm4py.objects.log.util import dataframe_utils
        from pm4py.objects.conversion.log import converter as log_converter
        from pm4py.algo.discovery.inductive import algorithm as inductive_miner
        from pm4py.visualization.petri_net import visualizer as pn_visualizer
        from pm4py.visualization.process_tree import visualizer as pt_visualizer
        from pm4py.objects.conversion.process_tree import converter as pt_converter
        from pm4py.objects.process_tree import obj as pt_obj
        from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
        from pm4py.visualization.dfg import visualizer as dfg_visualization

        # Prepare dataframe for PM4Py (expects the XES-style column names)
        pm_df = df_model.rename(columns={
            "case_id": "case:concept:name",
            "activity": "concept:name",
            "timestamp": "time:timestamp"
        }).copy()
        pm_df["time:timestamp"] = pd.to_datetime(pm_df["time:timestamp"], errors="coerce")
        pm_df = pm_df.dropna(subset=["time:timestamp"])
        pm_df = dataframe_utils.convert_timestamp_columns_in_df(pm_df)

        # Convert to event log
        event_log = log_converter.apply(pm_df)

        # Discover model — depending on the pm4py version, the inductive miner
        # returns either a ProcessTree or a (net, im, fm) tuple; handle both
        model = inductive_miner.apply(event_log)
        if isinstance(model, pt_obj.ProcessTree):
            tree = model
            net, im, fm = pt_converter.apply(tree)
            tree_gviz = pt_visualizer.apply(tree)
        else:
            net, im, fm = model
            tree_gviz = None

        tabs = st.tabs(["Clean Petri Net", "Frequency", "Performance", "DFG (with numbers)"])

        # --- Clean Petri net ---
        with tabs[0]:
            gviz_pn = pn_visualizer.apply(net, im, fm)
            st.graphviz_chart(gviz_pn.source, use_container_width=True)
            if tree_gviz is not None:
                st.caption("Process Tree (discovered)")
                st.graphviz_chart(tree_gviz.source, use_container_width=True)

        # --- Frequency-decorated Petri net ---
        with tabs[1]:
            try:
                gviz_freq = pn_visualizer.apply(
                    net, im, fm,
                    variant=pn_visualizer.Variants.FREQUENCY,
                    log=event_log
                )
                st.graphviz_chart(gviz_freq.source, use_container_width=True)
                st.caption("Numbers reflect frequencies from the filtered log.")
            except Exception as e:
                st.info(f"Frequency decoration not available: {e}")

        # --- Performance-decorated Petri net ---
        with tabs[2]:
            try:
                gviz_perf = pn_visualizer.apply(
                    net, im, fm,
                    variant=pn_visualizer.Variants.PERFORMANCE,
                    log=event_log
                )
                st.graphviz_chart(gviz_perf.source, use_container_width=True)
                st.caption("Numbers reflect performance (e.g., average durations) computed from timestamps.")
            except Exception as e:
                st.info(f"Performance decoration not available: {e}")

        # --- DFG with numbers (respects min_edge visually) ---
        with tabs[3]:
            try:
                dfg_freq = dfg_discovery.apply(event_log)  # {(a,b): count}
                dfg_freq_filtered = {k: v for k, v in dfg_freq.items() if v >= min_edge}
                # Fall back to the unfiltered DFG if the slider filtered everything out
                dfg_freq_gviz = dfg_visualization.apply(
                    dfg_freq_filtered if dfg_freq_filtered else dfg_freq,
                    log=event_log,
                    variant=dfg_visualization.Variants.FREQUENCY
                )
                st.graphviz_chart(dfg_freq_gviz.source, use_container_width=True)
                st.caption("DFG (Frequency): edge labels show counts. Low-frequency edges hidden per slider.")

                dfg_perf_gviz = dfg_visualization.apply(
                    dfg_freq_filtered if dfg_freq_filtered else dfg_freq,
                    log=event_log,
                    variant=dfg_visualization.Variants.PERFORMANCE
                )
                st.graphviz_chart(dfg_perf_gviz.source, use_container_width=True)
                st.caption("DFG (Performance): edge labels show avg durations. Low-frequency edges hidden per slider.")
            except Exception as e:
                st.info(f"DFG visualization not available: {e}")

    except ModuleNotFoundError:
        st.error("PM4Py not found. Please ensure pm4py and graphviz are installed.")
    except Exception as e:
        st.warning(f"Could not render process map: {e}")

    # ----------------------------
    # Credits
    # ----------------------------
    st.markdown("---")
    with st.expander("Credits", expanded=False):
        st.markdown(
            """
            **Credits**
            Created by **Dennis Arrindell** — creator of the best selling online course about Process Mining on Udemy.

            100% Vibe coded using ChatGPT

            Inspired by the pioneering work of **Wil van der Aalst**, the “godfather of process mining.”

            Powered by the **PM4Py** process mining library, created by **Sebastiaan J. van Zelst** and contributors: https://pm4py.fit.fraunhofer.de/

            Built with Python and other open-source libraries (pandas, Streamlit, Graphviz, etc.).

            Full technical information, installation steps, and source code available in the **GitHub repository**.
            """
        )
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ graphviz
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
1
+ streamlit==1.35.0
2
+ pm4py==2.7.5
3
+ pandas==2.1.4
4
+ numpy==1.26.4
5
+ graphviz==0.20.3
runtime.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ python-3.11