Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

d4cf179

verified ·

1 Parent(s): 026497d

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +50 -45

src/streamlit_app.py CHANGED Viewed

@@ -1,17 +1,15 @@
 # streamlit_data_analysis_app.py
-# Streamlit Data Analysis App for Hugging Face Spaces
 # Features:
 # - Upload CSV / Excel
 # - Automatic cleaning & standardization
 # - Preprocessing (imputation, encoding, scaling)
-# - Quick visualizations (histogram, boxplot, scatter, correlation heatmap)
-# - Preview cleaned dataset
-# - LLM-powered insights using Hugging Face Inference API
-# - Auto fallback if model access (403) fails
-# - Uses HF_TOKEN from Streamlit secrets or environment
 import os
-import io
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -22,32 +20,41 @@ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from huggingface_hub import InferenceClient
 # ---------- CONFIGURATION ----------
 st.set_page_config(page_title="Data Analysis App", layout="wide")
-# ✅ Safe HF_TOKEN loader (works locally + on Spaces)
 try:
     HF_TOKEN = st.secrets["HF_TOKEN"]
 except Exception:
     HF_TOKEN = os.getenv("HF_TOKEN")
-if not HF_TOKEN:
-    st.warning("⚠️ HF_TOKEN not found. Please add it to your Hugging Face Space secrets or environment.")
-else:
     st.success("✅ Hugging Face token loaded successfully.")
-# Default open-access models
 MODEL_OPTIONS = {
-    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open, strong)",
-    "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open, fluent)",
-    "bigscience/bloom-3b": "Bloom 3B (lightweight, open)"
 }
-# ---------- UTILITY FUNCTIONS ----------
-def read_file(uploaded_file) -> pd.DataFrame:
-    """Reads uploaded CSV or Excel file."""
     name = uploaded_file.name.lower()
     if name.endswith(('.csv', '.txt')):
         return pd.read_csv(uploaded_file)
@@ -56,7 +63,6 @@ def read_file(uploaded_file) -> pd.DataFrame:
     else:
         raise ValueError("Unsupported file type. Please upload CSV or Excel.")
 def clean_column_name(col: str) -> str:
     col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
     col = "_".join(col.split())
@@ -65,9 +71,7 @@ def clean_column_name(col: str) -> str:
         col = col.replace('__', '_')
     return col
 def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
-    """Standardizes column names and cleans whitespace."""
     df = df.copy()
     for c in df.select_dtypes(include=['object']).columns:
         df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
@@ -83,9 +87,7 @@ def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> p
                     df[c] = pd.to_datetime(df[c], errors='coerce')
     return df
 def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
-    """Creates a structured summary of the dataframe."""
     summary = {'shape': df.shape, 'columns': [], 'preview': df.head(max_rows).to_dict(orient='records')}
     for c in df.columns:
         info = {'name': c, 'dtype': str(df[c].dtype), 'n_missing': int(df[c].isna().sum()), 'n_unique': int(df[c].nunique(dropna=True))}
@@ -98,9 +100,7 @@ def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
         summary['columns'].append(info)
     return summary
 def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot'):
-    """Build preprocessing pipeline for numeric and categorical features."""
     numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
     cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
     transformers = []
@@ -123,9 +123,7 @@ def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median
         transformers.append(('cat', cat_pipe, cat_cols))
     return ColumnTransformer(transformers), numeric_cols + cat_cols
 def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
-    """Applies preprocessing pipeline and returns processed DataFrame."""
     X = preprocessor.fit_transform(df)
     feature_names = []
     for name, trans, cols in preprocessor.transformers_:
@@ -140,11 +138,8 @@ def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd
                 feature_names += cols
     return pd.DataFrame(X, columns=feature_names)
-# ---------- LLM INTEGRATION ----------
 def build_dataset_prompt(summary, user_question=None):
-    """Builds a prompt for dataset insights."""
     s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
     for c in summary['columns']:
         s.append(f"- {c['name']} ({c['dtype']}) missing={c['n_missing']} unique={c['n_unique']}")
@@ -154,12 +149,10 @@ def build_dataset_prompt(summary, user_question=None):
     if user_question:
         s.append(f"User question: {user_question}")
     else:
-        s.append("Please give a dataset summary, patterns, and visualization suggestions.")
     return "\n".join(s)
-def call_llm(prompt: str, model: str, max_tokens: int = 512) -> str:
-    """Calls the Hugging Face Inference API with error handling and fallback."""
     if not HF_TOKEN:
         return "⚠️ No Hugging Face token found."
     client = InferenceClient(token=HF_TOKEN)
@@ -180,24 +173,33 @@ def call_llm(prompt: str, model: str, max_tokens: int = 512) -> str:
                     return str(response)
                 except Exception as e2:
                     return f"❌ Fallback model also failed: {e2}"
-            return "🚫 Access denied (403). Try using an open-access model like Mistral or Zephyr."
         return f"❌ LLM call failed: {e}"
-# ---------- STREAMLIT UI ----------
-st.title("📊 Data Analysis & Cleaning App (Hugging Face + Streamlit)")
-st.markdown("Upload CSV or Excel files, clean, preprocess, visualize, and generate insights using an LLM.")
 with st.sidebar:
     st.header("⚙️ Options")
-    model_choice = st.selectbox("Select LLM model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
     max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
     impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
     encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
     scale_numeric = st.checkbox("Scale numeric features", True)
     show_raw_preview = st.checkbox("Show raw preview", True)
-uploaded_file = st.file_uploader("📂 Upload your CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
 if uploaded_file:
     with st.spinner("Reading file..."):
@@ -233,7 +235,7 @@ if uploaded_file:
         second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
     if st.button("Show Visualization"):
-        fig, ax = plt.subplots(figsize=(8, 5))
         try:
             if viz_type == 'Histogram':
                 sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
@@ -251,12 +253,15 @@ if uploaded_file:
         except Exception as e:
             st.error(f"Visualization failed: {e}")
-    st.subheader("🧠 Ask the LLM for Insights")
     user_q = st.text_area("Enter your question (optional):")
     if st.button("Get Insights"):
         with st.spinner("Generating insights..."):
             prompt = build_dataset_prompt(summary, user_q if user_q else None)
-            llm_resp = call_llm(prompt, model_choice, max_tokens)
             st.write(llm_resp)
 else:

 # streamlit_data_analysis_app.py
+# Streamlit Data Analysis App for Hugging Face Spaces + Gemini 2.0 Flash
 # Features:
 # - Upload CSV / Excel
 # - Automatic cleaning & standardization
 # - Preprocessing (imputation, encoding, scaling)
+# - Quick visualizations
+# - Dataset summary + preview
+# - Insights from LLMs (Gemini or Hugging Face)
+# - Auto fallback and detailed error messages
 import os
 import streamlit as st
 import pandas as pd
 import numpy as np
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from huggingface_hub import InferenceClient
+import google.generativeai as genai
 # ---------- CONFIGURATION ----------
 st.set_page_config(page_title="Data Analysis App", layout="wide")
+# Load API keys safely
 try:
     HF_TOKEN = st.secrets["HF_TOKEN"]
 except Exception:
     HF_TOKEN = os.getenv("HF_TOKEN")
+try:
+    GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
+except Exception:
+    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+# Setup Gemini if available
+if GEMINI_API_KEY:
+    genai.configure(api_key=GEMINI_API_KEY)
+    st.success("✅ Gemini API key loaded successfully.")
+elif HF_TOKEN:
     st.success("✅ Hugging Face token loaded successfully.")
+else:
+    st.warning("⚠️ No Gemini or Hugging Face token found. LLM features will be disabled.")
+# Default models
 MODEL_OPTIONS = {
+    "gemini-2.0-flash": "Gemini 2.0 Flash (Google AI, fast, free-tier)",
+    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 7B Instruct (open)",
+    "HuggingFaceH4/zephyr-7b-beta": "Zephyr 7B Beta (open)",
+    "bigscience/bloom-3b": "Bloom 3B (lightweight)",
 }
+# ---------- UTILITIES ----------
+def read_file(uploaded_file):
     name = uploaded_file.name.lower()
     if name.endswith(('.csv', '.txt')):
         return pd.read_csv(uploaded_file)
     else:
         raise ValueError("Unsupported file type. Please upload CSV or Excel.")
 def clean_column_name(col: str) -> str:
     col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
     col = "_".join(col.split())
         col = col.replace('__', '_')
     return col
 def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
     df = df.copy()
     for c in df.select_dtypes(include=['object']).columns:
         df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
                     df[c] = pd.to_datetime(df[c], errors='coerce')
     return df
 def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
     summary = {'shape': df.shape, 'columns': [], 'preview': df.head(max_rows).to_dict(orient='records')}
     for c in df.columns:
         info = {'name': c, 'dtype': str(df[c].dtype), 'n_missing': int(df[c].isna().sum()), 'n_unique': int(df[c].nunique(dropna=True))}
         summary['columns'].append(info)
     return summary
 def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot'):
     numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
     cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
     transformers = []
         transformers.append(('cat', cat_pipe, cat_cols))
     return ColumnTransformer(transformers), numeric_cols + cat_cols
 def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
     X = preprocessor.fit_transform(df)
     feature_names = []
     for name, trans, cols in preprocessor.transformers_:
                 feature_names += cols
     return pd.DataFrame(X, columns=feature_names)
+# ---------- LLM HELPERS ----------
 def build_dataset_prompt(summary, user_question=None):
     s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
     for c in summary['columns']:
         s.append(f"- {c['name']} ({c['dtype']}) missing={c['n_missing']} unique={c['n_unique']}")
     if user_question:
         s.append(f"User question: {user_question}")
     else:
+        s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
     return "\n".join(s)
+def call_llm_huggingface(prompt: str, model: str, max_tokens: int = 512) -> str:
     if not HF_TOKEN:
         return "⚠️ No Hugging Face token found."
     client = InferenceClient(token=HF_TOKEN)
                     return str(response)
                 except Exception as e2:
                     return f"❌ Fallback model also failed: {e2}"
+            return "🚫 Access denied (403). Try using an open-access model."
         return f"❌ LLM call failed: {e}"
+def call_llm_gemini(prompt: str, model="gemini-2.0-flash", max_tokens=512):
+    """Generate a text response from a Gemini model.
+
+    Args:
+        prompt: Text sent to the model.
+        model: Gemini model name handed to ``genai.GenerativeModel``.
+        max_tokens: Upper bound on generated tokens, forwarded to the API as
+            ``max_output_tokens`` in the generation config.
+
+    Returns:
+        The response text on success; otherwise a human-readable warning or
+        error string (missing API key, or any exception raised by the call).
+    """
+    if not GEMINI_API_KEY:
+        return "⚠️ Gemini API key not found."
+    try:
+        model_obj = genai.GenerativeModel(model)
+        # Forward the token budget so the "LLM max tokens" slider actually
+        # limits Gemini output instead of being silently ignored.
+        response = model_obj.generate_content(
+            prompt,
+            generation_config={"max_output_tokens": max_tokens},
+        )
+        return response.text
+    except Exception as e:
+        # Broad catch is deliberate: the caller renders the returned string,
+        # so failures are reported as a message rather than raised.
+        return f"❌ Gemini call failed: {e}"
+# ---------- STREAMLIT UI ----------
+st.title("📊 Data Analysis & Cleaning App")
+st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights from an AI model.")
 with st.sidebar:
     st.header("⚙️ Options")
+    model_choice = st.selectbox("Select Model", options=list(MODEL_OPTIONS.keys()), format_func=lambda k: MODEL_OPTIONS[k])
     max_tokens = st.slider("LLM max tokens", 128, 1024, 512, 64)
     impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
     encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
     scale_numeric = st.checkbox("Scale numeric features", True)
     show_raw_preview = st.checkbox("Show raw preview", True)
+uploaded_file = st.file_uploader("📂 Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
 if uploaded_file:
     with st.spinner("Reading file..."):
         second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
     if st.button("Show Visualization"):
+        fig, ax = plt.subplots(figsize=(8,5))
         try:
             if viz_type == 'Histogram':
                 sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
         except Exception as e:
             st.error(f"Visualization failed: {e}")
+    st.subheader("🧠 Ask the AI for Insights")
     user_q = st.text_area("Enter your question (optional):")
     if st.button("Get Insights"):
         with st.spinner("Generating insights..."):
             prompt = build_dataset_prompt(summary, user_q if user_q else None)
+            if model_choice.startswith("gemini"):
+                llm_resp = call_llm_gemini(prompt, model_choice, max_tokens)
+            else:
+                llm_resp = call_llm_huggingface(prompt, model_choice, max_tokens)
             st.write(llm_resp)
 else: