Starberry15 committed on
Commit
d4cf179
·
verified ·
1 Parent(s): 026497d

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +50 -45
src/streamlit_app.py CHANGED
@@ -1,17 +1,15 @@
1
  # streamlit_data_analysis_app.py
2
- # Streamlit Data Analysis App for Hugging Face Spaces
3
  # Features:
4
  # - Upload CSV / Excel
5
  # - Automatic cleaning & standardization
6
  # - Preprocessing (imputation, encoding, scaling)
7
- # - Quick visualizations (histogram, boxplot, scatter, correlation heatmap)
8
- # - Preview cleaned dataset
9
- # - LLM-powered insights using Hugging Face Inference API
10
- # - Auto fallback if model access (403) fails
11
- # - Uses HF_TOKEN from Streamlit secrets or environment
12
 
13
  import os
14
- import io
15
  import streamlit as st
16
  import pandas as pd
17
  import numpy as np
@@ -22,32 +20,41 @@ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
22
  from sklearn.compose import ColumnTransformer
23
  from sklearn.pipeline import Pipeline
24
  from huggingface_hub import InferenceClient
 
25
 
26
  # ---------- CONFIGURATION ----------
27
  st.set_page_config(page_title="Data Analysis App", layout="wide")
28
 
29
- # ✅ Safe HF_TOKEN loader (works locally + on Spaces)
30
  try:
31
  HF_TOKEN = st.secrets["HF_TOKEN"]
32
  except Exception:
33
  HF_TOKEN = os.getenv("HF_TOKEN")
34
 
35
- if not HF_TOKEN:
36
- st.warning("⚠️ HF_TOKEN not found. Please add it to your Hugging Face Space secrets or environment.")
37
- else:
 
 
 
 
 
 
 
38
  st.success("✅ Hugging Face token loaded successfully.")
 
 
39
 
40
- # Default open-access models
41
  MODEL_OPTIONS = {
42
- "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open, strong)",
43
- "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open, fluent)",
44
- "bigscience/bloom-3b": "Bloom 3B (lightweight, open)"
 
45
  }
46
 
47
- # ---------- UTILITY FUNCTIONS ----------
48
-
49
- def read_file(uploaded_file) -> pd.DataFrame:
50
- """Reads uploaded CSV or Excel file."""
51
  name = uploaded_file.name.lower()
52
  if name.endswith(('.csv', '.txt')):
53
  return pd.read_csv(uploaded_file)
@@ -56,7 +63,6 @@ def read_file(uploaded_file) -> pd.DataFrame:
56
  else:
57
  raise ValueError("Unsupported file type. Please upload CSV or Excel.")
58
 
59
-
60
  def clean_column_name(col: str) -> str:
61
  col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
62
  col = "_".join(col.split())
@@ -65,9 +71,7 @@ def clean_column_name(col: str) -> str:
65
  col = col.replace('__', '_')
66
  return col
67
 
68
-
69
  def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
70
- """Standardizes column names and cleans whitespace."""
71
  df = df.copy()
72
  for c in df.select_dtypes(include=['object']).columns:
73
  df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
@@ -83,9 +87,7 @@ def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> p
83
  df[c] = pd.to_datetime(df[c], errors='coerce')
84
  return df
85
 
86
-
87
  def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
88
- """Creates a structured summary of the dataframe."""
89
  summary = {'shape': df.shape, 'columns': [], 'preview': df.head(max_rows).to_dict(orient='records')}
90
  for c in df.columns:
91
  info = {'name': c, 'dtype': str(df[c].dtype), 'n_missing': int(df[c].isna().sum()), 'n_unique': int(df[c].nunique(dropna=True))}
@@ -98,9 +100,7 @@ def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
98
  summary['columns'].append(info)
99
  return summary
100
 
101
-
102
  def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot'):
103
- """Build preprocessing pipeline for numeric and categorical features."""
104
  numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
105
  cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
106
  transformers = []
@@ -123,9 +123,7 @@ def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median
123
  transformers.append(('cat', cat_pipe, cat_cols))
124
  return ColumnTransformer(transformers), numeric_cols + cat_cols
125
 
126
-
127
  def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
128
- """Applies preprocessing pipeline and returns processed DataFrame."""
129
  X = preprocessor.fit_transform(df)
130
  feature_names = []
131
  for name, trans, cols in preprocessor.transformers_:
@@ -140,11 +138,8 @@ def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd
140
  feature_names += cols
141
  return pd.DataFrame(X, columns=feature_names)
142
 
143
-
144
- # ---------- LLM INTEGRATION ----------
145
-
146
  def build_dataset_prompt(summary, user_question=None):
147
- """Builds a prompt for dataset insights."""
148
  s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
149
  for c in summary['columns']:
150
  s.append(f"- {c['name']} ({c['dtype']}) missing={c['n_missing']} unique={c['n_unique']}")
@@ -154,12 +149,10 @@ def build_dataset_prompt(summary, user_question=None):
154
  if user_question:
155
  s.append(f"User question: {user_question}")
156
  else:
157
- s.append("Please give a dataset summary, patterns, and visualization suggestions.")
158
  return "\n".join(s)
159
 
160
-
161
- def call_llm(prompt: str, model: str, max_tokens: int = 512) -> str:
162
- """Calls the Hugging Face Inference API with error handling and fallback."""
163
  if not HF_TOKEN:
164
  return "⚠️ No Hugging Face token found."
165
  client = InferenceClient(token=HF_TOKEN)
@@ -180,24 +173,33 @@ def call_llm(prompt: str, model: str, max_tokens: int = 512) -> str:
180
  return str(response)
181
  except Exception as e2:
182
  return f"❌ Fallback model also failed: {e2}"
183
- return "🚫 Access denied (403). Try using an open-access model like Mistral or Zephyr."
184
  return f"❌ LLM call failed: {e}"
185
 
186
- # ---------- STREAMLIT UI ----------
 
 
 
 
 
 
 
 
187
 
188
- st.title("📊 Data Analysis & Cleaning App (Hugging Face + Streamlit)")
189
- st.markdown("Upload CSV or Excel files, clean, preprocess, visualize, and generate insights using an LLM.")
 
190
 
191
  with st.sidebar:
192
  st.header("⚙️ Options")
193
- model_choice = st.selectbox("Select LLM model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
194
  max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
195
  impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
196
  encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
197
  scale_numeric = st.checkbox("Scale numeric features", True)
198
  show_raw_preview = st.checkbox("Show raw preview", True)
199
 
200
- uploaded_file = st.file_uploader("📂 Upload your CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
201
 
202
  if uploaded_file:
203
  with st.spinner("Reading file..."):
@@ -233,7 +235,7 @@ if uploaded_file:
233
  second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
234
 
235
  if st.button("Show Visualization"):
236
- fig, ax = plt.subplots(figsize=(8, 5))
237
  try:
238
  if viz_type == 'Histogram':
239
  sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
@@ -251,12 +253,15 @@ if uploaded_file:
251
  except Exception as e:
252
  st.error(f"Visualization failed: {e}")
253
 
254
- st.subheader("🧠 Ask the LLM for Insights")
255
  user_q = st.text_area("Enter your question (optional):")
256
  if st.button("Get Insights"):
257
  with st.spinner("Generating insights..."):
258
  prompt = build_dataset_prompt(summary, user_q if user_q else None)
259
- llm_resp = call_llm(prompt, model_choice, max_tokens)
 
 
 
260
  st.write(llm_resp)
261
 
262
  else:
 
1
  # streamlit_data_analysis_app.py
2
+ # Streamlit Data Analysis App for Hugging Face Spaces + Gemini 2.0 Flash
3
  # Features:
4
  # - Upload CSV / Excel
5
  # - Automatic cleaning & standardization
6
  # - Preprocessing (imputation, encoding, scaling)
7
+ # - Quick visualizations
8
+ # - Dataset summary + preview
9
+ # - Insights from LLMs (Gemini or Hugging Face)
10
+ # - Auto fallback and detailed error messages
 
11
 
12
  import os
 
13
  import streamlit as st
14
  import pandas as pd
15
  import numpy as np
 
20
  from sklearn.compose import ColumnTransformer
21
  from sklearn.pipeline import Pipeline
22
  from huggingface_hub import InferenceClient
23
+ import google.generativeai as genai
24
 
25
  # ---------- CONFIGURATION ----------
26
  st.set_page_config(page_title="Data Analysis App", layout="wide")
27
 
28
+ # Load API keys safely
29
  try:
30
  HF_TOKEN = st.secrets["HF_TOKEN"]
31
  except Exception:
32
  HF_TOKEN = os.getenv("HF_TOKEN")
33
 
34
+ try:
35
+ GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
36
+ except Exception:
37
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
38
+
39
+ # Setup Gemini if available
40
+ if GEMINI_API_KEY:
41
+ genai.configure(api_key=GEMINI_API_KEY)
42
+ st.success("✅ Gemini API key loaded successfully.")
43
+ elif HF_TOKEN:
44
  st.success("✅ Hugging Face token loaded successfully.")
45
+ else:
46
+ st.warning("⚠️ No Gemini or Hugging Face token found. LLM features will be disabled.")
47
 
48
+ # Default models
49
  MODEL_OPTIONS = {
50
+ "gemini-2.0-flash": "Gemini 2.0 Flash (Google AI, fast, free-tier)",
51
+ "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open)",
52
+ "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open)",
53
+ "bigscience/bloom-3b": "Bloom 3B (lightweight)",
54
  }
55
 
56
+ # ---------- UTILITIES ----------
57
+ def read_file(uploaded_file):
 
 
58
  name = uploaded_file.name.lower()
59
  if name.endswith(('.csv', '.txt')):
60
  return pd.read_csv(uploaded_file)
 
63
  else:
64
  raise ValueError("Unsupported file type. Please upload CSV or Excel.")
65
 
 
66
  def clean_column_name(col: str) -> str:
67
  col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
68
  col = "_".join(col.split())
 
71
  col = col.replace('__', '_')
72
  return col
73
 
 
74
  def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
 
75
  df = df.copy()
76
  for c in df.select_dtypes(include=['object']).columns:
77
  df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
 
87
  df[c] = pd.to_datetime(df[c], errors='coerce')
88
  return df
89
 
 
90
  def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
 
91
  summary = {'shape': df.shape, 'columns': [], 'preview': df.head(max_rows).to_dict(orient='records')}
92
  for c in df.columns:
93
  info = {'name': c, 'dtype': str(df[c].dtype), 'n_missing': int(df[c].isna().sum()), 'n_unique': int(df[c].nunique(dropna=True))}
 
100
  summary['columns'].append(info)
101
  return summary
102
 
 
103
  def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot'):
 
104
  numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
105
  cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
106
  transformers = []
 
123
  transformers.append(('cat', cat_pipe, cat_cols))
124
  return ColumnTransformer(transformers), numeric_cols + cat_cols
125
 
 
126
  def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
 
127
  X = preprocessor.fit_transform(df)
128
  feature_names = []
129
  for name, trans, cols in preprocessor.transformers_:
 
138
  feature_names += cols
139
  return pd.DataFrame(X, columns=feature_names)
140
 
141
+ # ---------- LLM HELPERS ----------
 
 
142
  def build_dataset_prompt(summary, user_question=None):
 
143
  s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
144
  for c in summary['columns']:
145
  s.append(f"- {c['name']} ({c['dtype']}) missing={c['n_missing']} unique={c['n_unique']}")
 
149
  if user_question:
150
  s.append(f"User question: {user_question}")
151
  else:
152
+ s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
153
  return "\n".join(s)
154
 
155
+ def call_llm_huggingface(prompt: str, model: str, max_tokens: int = 512) -> str:
 
 
156
  if not HF_TOKEN:
157
  return "⚠️ No Hugging Face token found."
158
  client = InferenceClient(token=HF_TOKEN)
 
173
  return str(response)
174
  except Exception as e2:
175
  return f"❌ Fallback model also failed: {e2}"
176
+ return "🚫 Access denied (403). Try using an open-access model."
177
  return f"❌ LLM call failed: {e}"
178
 
179
+ def call_llm_gemini(prompt: str, model="gemini-2.0-flash", max_tokens=512):
180
+ if not GEMINI_API_KEY:
181
+ return "⚠️ Gemini API key not found."
182
+ try:
183
+ model_obj = genai.GenerativeModel(model)
184
+ response = model_obj.generate_content(prompt)
185
+ return response.text
186
+ except Exception as e:
187
+ return f"❌ Gemini call failed: {e}"
188
 
189
+ # ---------- STREAMLIT UI ----------
190
+ st.title("📊 Data Analysis & Cleaning App")
191
+ st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights from an AI model.")
192
 
193
  with st.sidebar:
194
  st.header("⚙️ Options")
195
+ model_choice = st.selectbox("Select Model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
196
  max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
197
  impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
198
  encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
199
  scale_numeric = st.checkbox("Scale numeric features", True)
200
  show_raw_preview = st.checkbox("Show raw preview", True)
201
 
202
+ uploaded_file = st.file_uploader("📂 Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
203
 
204
  if uploaded_file:
205
  with st.spinner("Reading file..."):
 
235
  second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
236
 
237
  if st.button("Show Visualization"):
238
+ fig, ax = plt.subplots(figsize=(8,5))
239
  try:
240
  if viz_type == 'Histogram':
241
  sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
 
253
  except Exception as e:
254
  st.error(f"Visualization failed: {e}")
255
 
256
+ st.subheader("🧠 Ask the AI for Insights")
257
  user_q = st.text_area("Enter your question (optional):")
258
  if st.button("Get Insights"):
259
  with st.spinner("Generating insights..."):
260
  prompt = build_dataset_prompt(summary, user_q if user_q else None)
261
+ if model_choice.startswith("gemini"):
262
+ llm_resp = call_llm_gemini(prompt, model_choice, max_tokens)
263
+ else:
264
+ llm_resp = call_llm_huggingface(prompt, model_choice, max_tokens)
265
  st.write(llm_resp)
266
 
267
  else: