ninarg committed on
Commit
037f332
·
verified ·
1 Parent(s): 2c10ab1

Fix: correct column names (activity_name, case_id) for OCEL dataset

Browse files
Files changed (1) hide show
  1. app.py +38 -97
app.py CHANGED
@@ -1,47 +1,25 @@
1
- """
2
- VynFi × pm4py: Interactive Process Mining Demo
3
-
4
- Deployed as a HuggingFace Space (Streamlit SDK). Embeddable on vynfi.com
5
- via iframe for the /process-mining-data pillar page.
6
-
7
- Loads the VynFi Supply Chain OCEL dataset from HF, runs pm4py process
8
- discovery + variant analysis, and renders interactive visualizations.
9
- """
10
 
11
  import streamlit as st
12
  import pandas as pd
13
- import pm4py
14
- from pm4py.objects.conversion.log import converter as log_converter
15
  from collections import Counter
16
- import io
17
 
18
  st.set_page_config(page_title="VynFi Process Mining", page_icon="📊", layout="wide")
19
-
20
  st.title("📊 VynFi × pm4py: Process Mining Demo")
21
- st.caption("Synthetic supply-chain event log from [VynFi](https://vynfi.com) — explore interactively")
22
 
23
 
24
  @st.cache_data
25
  def load_data():
26
- """Load the VynFi OCEL dataset from HuggingFace."""
27
  from datasets import load_dataset
28
-
29
- ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train", download_mode="force_redownload")
30
  df = ds.to_pandas()
31
- if "timestamp" in df.columns:
32
- df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
33
-
34
- # Rename for pm4py
35
- rename = {}
36
- if "case_id" in df.columns:
37
- rename["case_id"] = "case:concept:name"
38
- if "activity_name" in df.columns:
39
- rename["activity_name"] = "concept:name"
40
- elif "activity" in df.columns:
41
- rename["activity"] = "concept:name"
42
- if "timestamp" in df.columns:
43
- rename["timestamp"] = "time:timestamp"
44
- df = df.rename(columns=rename)
45
  return df
46
 
47
 
@@ -49,92 +27,55 @@ df = load_data()
49
 
50
  st.sidebar.header("Dataset")
51
  st.sidebar.metric("Events", f"{len(df):,}")
52
- st.sidebar.metric("Activities", df["concept:name"].nunique() if "concept:name" in df.columns else "?")
53
- st.sidebar.metric("Cases", df["case:concept:name"].nunique() if "case:concept:name" in df.columns else "?")
54
 
55
  tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"])
56
 
57
  with tab1:
58
  st.subheader("Directly-Follows Graph")
59
  try:
 
60
  event_log = pm4py.convert_to_event_log(df)
61
- dfg, start_activities, end_activities = pm4py.discover_dfg(event_log)
62
-
63
- # Render DFG as dot → SVG
64
- from pm4py.visualization.dfg import visualizer as dfg_visualizer
65
-
66
- gviz = dfg_visualizer.apply(
67
- dfg,
68
- log=event_log,
69
- variant=dfg_visualizer.Variants.FREQUENCY,
70
  parameters={
71
- dfg_visualizer.Variants.FREQUENCY.value.Parameters.START_ACTIVITIES: start_activities,
72
- dfg_visualizer.Variants.FREQUENCY.value.Parameters.END_ACTIVITIES: end_activities,
73
- dfg_visualizer.Variants.FREQUENCY.value.Parameters.FORMAT: "svg",
74
- },
75
- )
76
- svg = dfg_visualizer.serialize(gviz).decode("utf-8")
77
- st.image(svg, use_container_width=True)
78
  except Exception as e:
79
  st.warning(f"Could not render DFG: {e}")
80
- st.info("Try the Variants or Statistics tabs instead.")
81
 
82
  with tab2:
83
  st.subheader("Process Variants")
84
- if "case:concept:name" in df.columns and "concept:name" in df.columns:
85
- variants = {}
86
- for case_id, group in df.sort_values("time:timestamp").groupby("case:concept:name"):
87
- trace = tuple(group["concept:name"].tolist())
88
- variants[case_id] = trace
89
-
90
- variant_counts = Counter(variants.values())
91
- total = len(variants)
92
-
93
- st.metric("Unique Variants", len(variant_counts))
94
-
95
- rows = []
96
- for trace, count in variant_counts.most_common(20):
97
- rows.append({
98
- "Trace": " → ".join(trace),
99
- "Count": count,
100
- "Frequency": f"{count / total * 100:.1f}%",
101
- })
102
- st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
103
-
104
- # Happy path
105
- if variant_counts:
106
- happy_path = variant_counts.most_common(1)[0]
107
- st.info(
108
- f"**Happy path**: {' → '.join(happy_path[0])} "
109
- f"({happy_path[1]} cases, {happy_path[1] / total * 100:.1f}%)"
110
- )
111
 
112
  with tab3:
113
  st.subheader("Activity Statistics")
114
- if "concept:name" in df.columns:
115
- act_counts = df["concept:name"].value_counts()
116
- st.bar_chart(act_counts)
117
-
118
- col1, col2 = st.columns(2)
119
- with col1:
120
- st.metric("Most frequent", act_counts.index[0])
121
- st.metric("Count", f"{act_counts.iloc[0]:,}")
122
- with col2:
123
- st.metric("Least frequent", act_counts.index[-1])
124
- st.metric("Count", f"{act_counts.iloc[-1]:,}")
125
-
126
  if "time:timestamp" in df.columns:
127
  st.subheader("Events Over Time")
128
- daily = df.set_index("time:timestamp").resample("W").size()
129
- st.line_chart(daily)
130
 
131
  with tab4:
132
  st.subheader("Raw Event Data")
133
- st.dataframe(df.head(100), use_container_width=True)
134
 
135
  st.divider()
136
- st.caption(
137
- "Data: [VynFi/vynfi-supply-chain-ocel](https://huggingface.co/datasets/VynFi/vynfi-supply-chain-ocel) · "
138
- "Engine: [pm4py](https://pm4py.fit.fraunhofer.de/) · "
139
- "Platform: [vynfi.com](https://vynfi.com)"
140
- )
 
1
+ """VynFi × pm4py: Interactive Process Mining Demo"""
 
 
 
 
 
 
 
 
2
 
3
  import streamlit as st
4
  import pandas as pd
 
 
5
  from collections import Counter
 
6
 
7
  st.set_page_config(page_title="VynFi Process Mining", page_icon="📊", layout="wide")
 
8
  st.title("📊 VynFi × pm4py: Process Mining Demo")
9
+ st.caption("Synthetic supply-chain event log from [VynFi](https://vynfi.com)")
10
 
11
 
12
  @st.cache_data
13
  def load_data():
 
14
  from datasets import load_dataset
15
+ ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train")
 
16
  df = ds.to_pandas()
17
+ df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
18
+ df = df.rename(columns={
19
+ "case_id": "case:concept:name",
20
+ "activity_name": "concept:name",
21
+ "timestamp": "time:timestamp",
22
+ })
 
 
 
 
 
 
 
 
23
  return df
24
 
25
 
 
27
 
28
  st.sidebar.header("Dataset")
29
  st.sidebar.metric("Events", f"{len(df):,}")
30
+ st.sidebar.metric("Activities", df["concept:name"].nunique())
31
+ st.sidebar.metric("Cases", df["case:concept:name"].nunique())
32
 
33
  tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"])
34
 
35
  with tab1:
36
  st.subheader("Directly-Follows Graph")
37
  try:
38
+ import pm4py
39
  event_log = pm4py.convert_to_event_log(df)
40
+ dfg, sa, ea = pm4py.discover_dfg(event_log)
41
+ from pm4py.visualization.dfg import visualizer as dfg_vis
42
+ gviz = dfg_vis.apply(dfg, log=event_log, variant=dfg_vis.Variants.FREQUENCY,
 
 
 
 
 
 
43
  parameters={
44
+ dfg_vis.Variants.FREQUENCY.value.Parameters.START_ACTIVITIES: sa,
45
+ dfg_vis.Variants.FREQUENCY.value.Parameters.END_ACTIVITIES: ea,
46
+ dfg_vis.Variants.FREQUENCY.value.Parameters.FORMAT: "svg",
47
+ })
48
+ st.image(dfg_vis.serialize(gviz).decode("utf-8"), use_container_width=True)
 
 
49
  except Exception as e:
50
  st.warning(f"Could not render DFG: {e}")
51
+ st.info("pm4py or graphviz may not be available. Try the Variants tab.")
52
 
53
  with tab2:
54
  st.subheader("Process Variants")
55
+ variants = {}
56
+ for cid, grp in df.sort_values("time:timestamp").groupby("case:concept:name"):
57
+ variants[cid] = tuple(grp["concept:name"].tolist())
58
+ vc = Counter(variants.values())
59
+ total = len(variants)
60
+ st.metric("Unique Variants", len(vc))
61
+ rows = [{"Trace": " → ".join(t), "Count": c, "Frequency": f"{c/total*100:.1f}%"}
62
+ for t, c in vc.most_common(20)]
63
+ st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)
64
+ if vc:
65
+ hp = vc.most_common(1)[0]
66
+ st.info(f"**Happy path**: {' → '.join(hp[0])} ({hp[1]} cases, {hp[1]/total*100:.1f}%)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  with tab3:
69
  st.subheader("Activity Statistics")
70
+ ac = df["concept:name"].value_counts()
71
+ st.bar_chart(ac)
 
 
 
 
 
 
 
 
 
 
72
  if "time:timestamp" in df.columns:
73
  st.subheader("Events Over Time")
74
+ st.line_chart(df.set_index("time:timestamp").resample("W").size())
 
75
 
76
  with tab4:
77
  st.subheader("Raw Event Data")
78
+ st.dataframe(df.head(200), use_container_width=True)
79
 
80
  st.divider()
81
+ st.caption("[VynFi](https://vynfi.com) · [pm4py](https://pm4py.fit.fraunhofer.de/) · [Dataset](https://huggingface.co/datasets/VynFi/vynfi-supply-chain-ocel)")