import gradio as gr
import pandas as pd
import numpy as np
import json
import io
import os
import zipfile
import tempfile
# =========================
# GLOBAL STATE (in-memory)
# =========================
STATE = {}

# =========================
# UTILS
# =========================
def read_file(file):
    """Load an uploaded CSV or Parquet file into a DataFrame.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            upload object exposing the path through a ``name`` attribute.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the path does not end in ``.csv`` or ``.parquet``.
    """
    # Accept plain paths as well as wrapper objects carrying ``.name``.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Match the extension case-insensitively so "DATA.CSV" is accepted too.
    lowered = path.lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(path)
    raise ValueError("Unsupported format")


# =========================
# COMPONENT 1: PROFILING
# =========================
def profile_data(df, training=True):
    """Summarise the shape, missingness and column kinds of ``df``.

    Args:
        df: DataFrame to describe.
        training: When true, the summary is also stored in
            ``STATE["profile"]``.

    Returns:
        A dict with the keys ``"shape"``, ``"missing_ratio"`` (per-column
        fraction of missing values), ``"numerical"`` and ``"categorical"``
        (column-name lists split on whether the dtype is numeric).
    """
    numeric_names = df.select_dtypes(include=np.number).columns.tolist()
    other_names = df.select_dtypes(exclude=np.number).columns.tolist()

    summary = {
        "shape": df.shape,
        "missing_ratio": df.isna().mean().to_dict(),
        "numerical": numeric_names,
        "categorical": other_names,
    }

    if training:
        STATE["profile"] = summary
    return summary


# =========================
# COMPONENT 2: OUTLIER + IMPUTATION
# =========================
def handle_outliers_impute(df, training=True):
    """Drop sparse columns, clip numeric outliers and fill missing values.

    In training mode every parameter is fitted on ``df`` and stored in
    ``STATE`` under ``"dropped"``, ``"outliers"`` and ``"impute"``. In
    inference mode those stored parameters are reused, so the output has the
    same columns and the same fill values as the training run.

    Args:
        df: Input DataFrame; it is copied, never modified.
        training: Fit parameters on ``df`` (True) or reuse ``STATE`` (False).

    Returns:
        ``(df, dropped_cols, impute_values)``: the cleaned copy, the list of
        removed columns, and a column -> fill value mapping.

    Raises:
        KeyError: In inference mode, when ``df`` has a column for which no
            parameters were stored during training.
    """
    df = df.copy()

    impute_values = {}
    outlier_bounds = {}

    if training:
        # Columns with more than 90% missing values carry too little signal.
        missing_ratio = df.isna().mean()
        dropped_cols = missing_ratio[missing_ratio > 0.9].index.tolist()
    else:
        # Reuse the training decision rather than re-deriving it from this
        # batch: a column that happens to be sparse here must still be
        # imputed, and a column dropped in training has no stored bounds.
        dropped_cols = [
            col for col in df.columns if col in STATE.get("dropped", ())
        ]

    df = df.drop(columns=dropped_cols)

    for col in df.select_dtypes(include=np.number).columns:
        if training:
            # Tukey fences: 1.5 * IQR beyond the quartiles.
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            outlier_bounds[col] = (lower, upper)
        else:
            lower, upper = STATE["outliers"][col]

        clipped = df[col].clip(lower=lower, upper=upper)

        if training:
            impute_values[col] = clipped.median()
        else:
            impute_values[col] = STATE["impute"][col]

        # Assign the result back: a chained ``df[col].fillna(inplace=True)``
        # does not update ``df`` under pandas Copy-on-Write.
        df[col] = clipped.fillna(impute_values[col])

    for col in df.select_dtypes(exclude=np.number).columns:
        if training:
            modes = df[col].mode()
            # mode() is empty when the column has no non-null value at all
            # (e.g. a zero-row frame); there is nothing to impute with then.
            impute_values[col] = modes.iloc[0] if not modes.empty else None
        else:
            impute_values[col] = STATE["impute"][col]

        if impute_values[col] is not None:
            df[col] = df[col].fillna(impute_values[col])

    if training:
        STATE["impute"] = impute_values
        STATE["outliers"] = outlier_bounds
        STATE["dropped"] = dropped_cols

    return df, dropped_cols, impute_values


# =========================
# COMPONENT 3: ENCODING
# =========================
def encode_data(df, training=True):
    """One-hot encode every non-numeric column of ``df``.

    In training mode the distinct values of each non-numeric column are
    recorded in ``STATE["encoding"]``; in inference mode the recorded values
    are reused so the same indicator columns are produced. Each value yields
    a 0/1 column named ``"<column>_<value>"`` and the source column is
    removed.

    Args:
        df: Input DataFrame; it is copied, never modified.
        training: Learn the categories from ``df`` (True) or reuse the ones
            stored in ``STATE`` (False).

    Returns:
        ``(df, new_cols)``: the encoded copy and the names of the indicator
        columns in creation order.

    Raises:
        KeyError: In inference mode, when ``df`` has a non-numeric column
            that was not encoded during training.
    """
    df = df.copy()
    new_cols = []
    indicators = {}

    if training:
        STATE["encoding"] = {}

    cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()

    for col in cat_cols:
        if training:
            uniques = df[col].unique().tolist()
            STATE["encoding"][col] = uniques
        else:
            uniques = STATE["encoding"][col]

        for val in uniques:
            new_col = f"{col}_{val}"
            indicators[new_col] = (df[col] == val).astype(int)
            new_cols.append(new_col)

    df = df.drop(columns=cat_cols)

    if indicators:
        # Attach all indicator columns in one concat; inserting them one at a
        # time fragments the frame and gets slow for high-cardinality columns.
        encoded = pd.DataFrame(indicators, index=df.index)
        # An indicator replaces any remaining column with the same name.
        clashes = [name for name in encoded.columns if name in df.columns]
        df = pd.concat([df.drop(columns=clashes), encoded], axis=1)

    return df, new_cols


# =========================
# COMPONENT 4: MEMORY OPT
# =========================
def optimize_memory(df):
    """Downcast 64-bit numeric columns to shrink the frame's memory use.

    ``int64`` columns are passed through ``pd.to_numeric(downcast="integer")``
    and ``float64`` columns through ``pd.to_numeric(downcast="float")``.
    The caller's DataFrame is left untouched, matching the other pipeline
    steps, which also work on a copy.

    Args:
        df: DataFrame to optimise.

    Returns:
        ``(df, before, after, saved)``: the downcast frame, its deep memory
        usage in bytes before and after, and the percentage saved.
    """
    before = df.memory_usage(deep=True).sum()

    # A shallow copy is enough: columns are replaced wholesale below, so the
    # caller's frame keeps its original columns and dtypes.
    df = df.copy(deep=False)

    for col in df.select_dtypes(include=["int64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="integer")

    for col in df.select_dtypes(include=["float64"]).columns:
        df[col] = pd.to_numeric(df[col], downcast="float")

    after = df.memory_usage(deep=True).sum()
    # Guard the ratio against a frame that reports zero bytes.
    saved = 100 * (before - after) / before if before else 0.0

    return df, before, after, saved


# =========================
# MAIN PIPELINE
# =========================
def _json_default(value):
    """Convert values ``json`` cannot encode natively (NumPy scalars, dates)."""
    if isinstance(value, np.generic):
        return value.item()
    if hasattr(value, "isoformat"):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def run_pipeline(file, mode):
    """Run profiling, cleaning, encoding and memory optimisation on a file.

    Args:
        file: Uploaded file as accepted by ``read_file``, or ``None``.
        mode: ``"Training"`` fits and stores the pipeline state; any other
            value reuses the state stored by an earlier training run.

    Returns:
        A 5-tuple ``(profile_json, preview, csv_text, zip_bytes, ram_msg)``:
        the profile as a JSON string, ``df.head()`` of the processed frame,
        the processed frame as CSV text, a ZIP archive (bytes) holding one
        JSON file per ``STATE`` entry, and a memory-savings message. When
        the run cannot proceed, the first element is a message and the
        remaining four are ``None``.
    """
    if file is None:
        return "Upload file first", None, None, None, None

    training = mode == "Training"

    # Cheap precondition first: no point parsing the file if inference has
    # nothing to reuse.
    if not training and "profile" not in STATE:
        return "ERROR: Run Training first!", None, None, None, None

    try:
        df = read_file(file)
    except ValueError as exc:
        # Unsupported or unparsable input is reported like the other
        # failures instead of escaping as an exception.
        return f"ERROR: {exc}", None, None, None, None

    # STEP 1
    profile = profile_data(df, training)

    # STEP 2
    df, dropped, impute = handle_outliers_impute(df, training)

    # STEP 3
    df, new_cols = encode_data(df, training)

    # STEP 4
    df, before, after, saved = optimize_memory(df)

    # SAVE OUTPUT
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as zf:
        for k, v in STATE.items():
            # STATE can hold NumPy scalars (e.g. the mode of a boolean
            # column), which the json module rejects without a converter.
            zf.writestr(
                f"{k}.json", json.dumps(v, indent=2, default=_json_default)
            )

    return (
        json.dumps(profile, indent=2, default=_json_default),
        df.head(),
        csv_buffer.getvalue(),
        zip_buffer.getvalue(),
        f"RAM saved: {saved:.2f}%"
    )


# =========================
# GRADIO UI
# =========================
with gr.Blocks() as app:

    gr.Markdown("# Auto Data Processor (MLOps Version)")

    with gr.Row():
        file = gr.File(label="Upload CSV/Parquet")
        mode = gr.Radio(["Training", "Inference"], value="Training")

    run_btn = gr.Button("Run Pipeline")

    profile_out = gr.Textbox(label="Data Profiling", lines=15)
    df_out = gr.Dataframe()
    ram_out = gr.Textbox(label="Memory Optimization")

    csv_out = gr.File(label="Download Cleaned CSV")
    zip_out = gr.File(label="Download State ZIP")

    def wrapper(file, mode):
        """Run the pipeline and map its results onto the output components.

        Returns values in the order of the ``outputs`` list passed to
        ``run_btn.click``: profile text, preview frame, memory message,
        CSV file path, ZIP file path. When the pipeline returns no preview
        frame, its message goes to the profile box and the rest are cleared.
        """
        profile, df_head, csv_data, zip_data, ram = run_pipeline(file, mode)

        if df_head is None:
            return profile, None, None, None, None

        # The file outputs are given paths, so persist both payloads.
        # NOTE: the directory is not removed afterwards; each run leaves one
        # temporary directory behind.
        temp_dir = tempfile.mkdtemp()

        csv_path = os.path.join(temp_dir, "cleaned.csv")
        zip_path = os.path.join(temp_dir, "state.zip")

        # newline="" stops Python from translating the line endings pandas
        # already wrote, which would produce "\r\r\n" on Windows.
        with open(csv_path, "w", encoding="utf-8", newline="") as f:
            f.write(csv_data)

        with open(zip_path, "wb") as f:
            f.write(zip_data)

        return (
            profile,
            df_head,
            ram,
            csv_path,
            zip_path
        )

    run_btn.click(
        wrapper,
        inputs=[file, mode],
        outputs=[profile_out, df_out, ram_out, csv_out, zip_out]
    )

if __name__ == "__main__":
    app.launch()