doncamilom committed on
Commit 142faac · 1 Parent(s): 84d3480

update with our app

Files changed (3)
  1. Dockerfile +1 -1
  2. src/app.py +406 -0
  3. src/streamlit_app.py +0 -40
Dockerfile CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+ENTRYPOINT ["streamlit", "run", "src/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
src/app.py ADDED
@@ -0,0 +1,406 @@
+"""
+Streamlit app for human evaluation of model outputs.
+
+Allows users to select two models, compare their responses to the same inputs,
+and record preferences for subsequent analysis.
+"""
+import os
+import json
+import csv
+from datetime import datetime
+
+import streamlit as st
+import pandas as pd
+
+st.set_page_config(page_title="Model Comparison Evaluation", layout="wide")
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+DATA_DIR = os.path.join(SCRIPT_DIR, "for_experiments_prediction")
+
+
+@st.cache_data
+def load_models(data_dir):
+    """
+    Discover prediction JSON files named 'predicted_vs_gt.json' and load flattened records for each model.
+    Returns a dict mapping model name to dict of {id: record}.
+    """
+    model_paths = {}
+    for root, _, files in os.walk(data_dir):
+        for fname in files:
+            if fname == 'predicted_vs_gt.json':
+                path = os.path.join(root, fname)
+                rel = os.path.relpath(root, data_dir)
+                model_paths[rel] = path
+    models = {}
+    for model_name, path in sorted(model_paths.items()):
+        with open(path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        records = {}
+        for section in data.values():
+            if isinstance(section, dict):
+                for sub in section.values():
+                    for rec in sub:
+                        records[rec['id']] = rec
+            elif isinstance(section, list):
+                for rec in section:
+                    records[rec['id']] = rec
+        models[model_name] = records
+    return models
+
+
+def append_feedback(feedback_file, header, row):
+    """
+    Append a single feedback row to TSV, writing header if file does not exist.
+    """
+    write_header = not os.path.exists(feedback_file)
+    with open(feedback_file, 'a', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_ALL)
+        if write_header:
+            writer.writerow(header)
+        writer.writerow(row)
+
+@st.cache_data
+def load_eval_tables(data_dir):
+    """
+    Discover evaluation_table.parquet files under each model directory and load each into a pandas DataFrame.
+    Returns a dict mapping model name to its evaluation DataFrame.
+    """
+    tables = {}
+    for root, _, files in os.walk(data_dir):
+        if 'evaluation_table.parquet' in files:
+            path = os.path.join(root, 'evaluation_table.parquet')
+            rel = os.path.relpath(root, data_dir)
+            tables[rel] = pd.read_parquet(path)
+    return tables
+
+
+def main():
+    st.title("Model Comparison Evaluation")
+
+    print(DATA_DIR)
+    models = load_models(DATA_DIR)
+    eval_tables = load_eval_tables(DATA_DIR)
+    all_cols = set()
+    for df in eval_tables.values():
+        all_cols.update(df.columns)
+    key_columns = {'gt_sac_id', 'gt_title'}
+    metric_columns = sorted(all_cols - key_columns)
+    fixed_metrics = [
+        'chemicals_accuracy',
+        'chemicals_f1_score',
+        'chemicals_precision',
+        'chemicals_recall',
+        'metal_accuracy',
+        'metal_f1_score',
+        'metal_precision',
+        'metal_recall',
+        'procedure_procedure_completeness_score',
+        'procedure_procedure_order_score',
+        'procedure_procedure_accuracy_score',
+        'support_accuracy',
+        'support_f1_score',
+        'support_precision',
+        'support_recall',
+    ]
+    other_metrics = sorted([c for c in metric_columns if c not in fixed_metrics and not c.startswith('gt_')])
+    model_names = list(models.keys())
+
+    st.sidebar.header("Configuration")
+    def reset_index():
+        st.session_state.idx = 0
+        # Reset feedback file when models change
+        feedback_path = os.path.join(SCRIPT_DIR, 'feedback.tsv')
+        if os.path.exists(feedback_path):
+            os.remove(feedback_path)
+
+    selected = st.sidebar.multiselect(
+        "Select exactly two models to compare",
+        options=model_names,
+        key='models',
+        help="Choose two model variants for side-by-side comparison",
+        on_change=reset_index
+    )
+    if len(selected) != 2:
+        st.sidebar.info("Please select exactly two models.")
+        st.stop()
+
+    # Download button for feedback TSV
+    feedback_path = os.path.join(SCRIPT_DIR, 'feedback.tsv')
+    if os.path.exists(feedback_path):
+        with open(feedback_path, 'r', encoding='utf-8') as f:
+            tsv_data = f.read()
+        st.sidebar.download_button(
+            label="Download Feedback TSV",
+            data=tsv_data,
+            file_name="feedback.tsv",
+            mime="text/tab-separated-values"
+        )
+
+    m1, m2 = selected
+    recs1 = models[m1]
+    recs2 = models[m2]
+    common_ids = sorted(set(recs1.keys()) & set(recs2.keys()))
+    if not common_ids:
+        st.error("No common records between the selected models.")
+        st.stop()
+
+    if 'idx' not in st.session_state:
+        st.session_state.idx = 0
+    if 'feedback_saved' not in st.session_state:
+        st.session_state.feedback_saved = False
+
+    # Initialize fresh feedback file for new session
+    if 'session_initialized' not in st.session_state:
+        feedback_path = os.path.join(SCRIPT_DIR, 'feedback.tsv')
+        if os.path.exists(feedback_path):
+            os.remove(feedback_path)
+        st.session_state.session_initialized = True
+
+    total = len(common_ids)
+    idx = st.session_state.idx
+    if idx < 0:
+        idx = 0
+    if idx >= total:
+        st.write("### Evaluation complete! Thank you for your feedback.")
+        st.stop()
+
+    current_id = common_ids[idx]
+    rec1 = recs1[current_id]
+    rec2 = recs2[current_id]
+
+    st.markdown(f"**Record {idx+1}/{total} — ID: {current_id}**")
+    st.markdown("---")
+    st.subheader("Input Prompt")
+    st.code(rec1.get('input', ''), language='')
+
+    st.subheader("Model Responses and Ground Truth")
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        st.markdown(f"**{m1}**")
+        st.text_area("", rec1.get('predicted', ''), height=600, key=f"resp1_{idx}")
+    with col2:
+        st.markdown(f"**{m2}**")
+        st.text_area("", rec2.get('predicted', ''), height=600, key=f"resp2_{idx}")
+    with col3:
+        st.markdown("**Ground Truth**")
+        st.text_area("", rec1.get('ground_truth', ''), height=600, key=f"gt_{idx}")
+
+    fcol1, fcol2, fcol3 = st.columns(3)
+    with fcol1:
+        df1 = eval_tables.get(m1)
+        if df1 is not None:
+            if 'gt_sac_id' in df1.columns:
+                key_val = rec1.get('gt_sac_id', rec1.get('sac_id'))
+                key_col = 'gt_sac_id'
+            elif 'gt_title' in df1.columns:
+                key_val = rec1.get('gt_title', rec1.get('title'))
+                key_col = 'gt_title'
+            else:
+                key_col = key_val = None
+            if key_col and key_val is not None:
+                row = df1[df1[key_col] == key_val]
+                if not row.empty:
+                    fm_df = row[fixed_metrics].T
+                    fm_df.columns = ['value']
+                    st.table(fm_df)
+                else:
+                    st.info("No fixed metrics for this record.")
+        else:
+            st.info("No evaluation table available for this model.")
+
+    with fcol2:
+        df2 = eval_tables.get(m2)
+        if df2 is not None:
+            if 'gt_sac_id' in df2.columns:
+                key_val = rec1.get('gt_sac_id', rec1.get('sac_id'))
+                key_col = 'gt_sac_id'
+            elif 'gt_title' in df2.columns:
+                key_val = rec1.get('gt_title', rec1.get('title'))
+                key_col = 'gt_title'
+            else:
+                key_col = key_val = None
+            if key_col and key_val is not None:
+                row = df2[df2[key_col] == key_val]
+                if not row.empty:
+                    fm_df = row[fixed_metrics].T
+                    fm_df.columns = ['value']
+                    st.table(fm_df)
+                else:
+                    st.info("No fixed metrics for this record.")
+        else:
+            st.info("No evaluation table available for this model.")
+
+    if other_metrics:
+        selected_metric = st.selectbox(
+            "Select additional metric to display",
+            options=other_metrics,
+            key=f"metric_sel_{idx}"
+        )
+    else:
+        selected_metric = None
+
+    if selected_metric:
+        mcol1, mcol2, mcol3 = st.columns(3)
+        with mcol1:
+            df1 = eval_tables.get(m1)
+            if df1 is not None and selected_metric in df1.columns:
+                if 'gt_sac_id' in df1.columns:
+                    key_val = rec1.get('gt_sac_id', rec1.get('sac_id'))
+                    key_col = 'gt_sac_id'
+                elif 'gt_title' in df1.columns:
+                    key_val = rec1.get('gt_title', rec1.get('title'))
+                    key_col = 'gt_title'
+                else:
+                    key_col = key_val = None
+                if key_col and key_val is not None:
+                    row = df1[df1[key_col] == key_val]
+                    if not row.empty:
+                        value = row[selected_metric].iloc[0]
+                        try:
+                            # Try to parse as JSON first
+                            parsed_json = json.loads(str(value))
+                            formatted_json = json.dumps(parsed_json, indent=2)
+                            st.markdown(f"**{selected_metric}:**")
+                            st.code(formatted_json, language='json')
+                        except json.JSONDecodeError:
+                            try:
+                                # If JSON fails, try to evaluate as Python literal (handles single quotes)
+                                import ast
+                                parsed_json = ast.literal_eval(str(value))
+                                formatted_json = json.dumps(parsed_json, indent=2)
+                                st.markdown(f"**{selected_metric}:**")
+                                st.code(formatted_json, language='json')
+                            except (ValueError, SyntaxError):
+                                # If all parsing fails, show as raw text
+                                st.markdown(f"**{selected_metric}:** {value}")
+                        except (TypeError, ValueError):
+                            st.markdown(f"**{selected_metric}:** {value}")
+                    else:
+                        st.markdown(f"**{selected_metric}:** N/A")
+            else:
+                st.markdown(f"**{selected_metric}:** N/A")
+
+        with mcol2:
+            df2 = eval_tables.get(m2)
+            if df2 is not None and selected_metric in df2.columns:
+                if 'gt_sac_id' in df2.columns:
+                    key_val = rec1.get('gt_sac_id', rec1.get('sac_id'))
+                    key_col = 'gt_sac_id'
+                elif 'gt_title' in df2.columns:
+                    key_val = rec1.get('gt_title', rec1.get('title'))
+                    key_col = 'gt_title'
+                else:
+                    key_col = key_val = None
+                if key_col and key_val is not None:
+                    row = df2[df2[key_col] == key_val]
+                    if not row.empty:
+                        value = row[selected_metric].iloc[0]
+                        try:
+                            # Try to parse as JSON first
+                            parsed_json = json.loads(str(value))
+                            formatted_json = json.dumps(parsed_json, indent=2)
+                            st.markdown(f"**{selected_metric}:**")
+                            st.code(formatted_json, language='json')
+                        except json.JSONDecodeError:
+                            try:
+                                # If JSON fails, try to evaluate as Python literal (handles single quotes)
+                                import ast
+                                parsed_json = ast.literal_eval(str(value))
+                                formatted_json = json.dumps(parsed_json, indent=2)
+                                st.markdown(f"**{selected_metric}:**")
+                                st.code(formatted_json, language='json')
+                            except (ValueError, SyntaxError):
+                                # If all parsing fails, show as raw text
+                                st.markdown(f"**{selected_metric}:** {value}")
+                        except (TypeError, ValueError):
+                            st.markdown(f"**{selected_metric}:** {value}")
+                    else:
+                        st.markdown(f"**{selected_metric}:** N/A")
+            else:
+                st.markdown(f"**{selected_metric}:** N/A")
+
+        with mcol3:
+            st.markdown("**Ground Truth Metrics**")
+            df_for_gt = eval_tables.get(m1)
+            if df_for_gt is None:
+                df_for_gt = eval_tables.get(m2)
+            if df_for_gt is not None:
+                if 'gt_sac_id' in df_for_gt.columns:
+                    key_val = rec1.get('gt_sac_id', rec1.get('sac_id'))
+                    key_col = 'gt_sac_id'
+                elif 'gt_title' in df_for_gt.columns:
+                    key_val = rec1.get('gt_title', rec1.get('title'))
+                    key_col = 'gt_title'
+                else:
+                    key_col = key_val = None
+                if key_col and key_val is not None:
+                    row = df_for_gt[df_for_gt[key_col] == key_val]
+                    if not row.empty:
+                        excluded_gt_fields = {'gt_procedure', 'gt_dspy_uuid', 'gt_dspy_split'}
+                        gt_columns = [col for col in df_for_gt.columns if col.startswith('gt_') and col not in key_columns and col not in excluded_gt_fields]
+                        if gt_columns:
+                            for gt_col in gt_columns:
+                                value = row[gt_col].iloc[0]
+                                try:
+                                    # Try to parse as JSON first
+                                    parsed_json = json.loads(str(value))
+                                    formatted_json = json.dumps(parsed_json, indent=2)
+                                    st.markdown(f"**{gt_col}:**")
+                                    st.code(formatted_json, language='json')
+                                except json.JSONDecodeError:
+                                    try:
+                                        # If JSON fails, try to evaluate as Python literal (handles single quotes)
+                                        import ast
+                                        parsed_json = ast.literal_eval(str(value))
+                                        formatted_json = json.dumps(parsed_json, indent=2)
+                                        st.markdown(f"**{gt_col}:**")
+                                        st.code(formatted_json, language='json')
+                                    except (ValueError, SyntaxError):
+                                        # If all parsing fails, show as raw text
+                                        st.markdown(f"**{gt_col}:** {value}")
+                                except (TypeError, ValueError):
+                                    st.markdown(f"**{gt_col}:** {value}")
+                        else:
+                            st.info("No additional ground truth metrics available.")
+                    else:
+                        st.info("No ground truth metrics for this record.")
+            else:
+                st.info("No evaluation table available for ground truth metrics.")
+
+    st.subheader("Your Preference")
+    pref = st.radio(
+        "Which response do you prefer?", options=[m1, m2], key=f"pref_{idx}"
+    )
+
+    st.subheader("Comments (Optional)")
+    comments = st.text_area(
+        "Add any comments or notes about your preference:",
+        height=100,
+        key=f"comments_{idx}",
+        placeholder="Optional: Explain your reasoning or add any observations..."
+    )
+
+    if st.session_state.feedback_saved:
+        st.success("Feedback saved.")
+        st.session_state.feedback_saved = False
+
+    feedback_path = os.path.join(SCRIPT_DIR, 'feedback.tsv')
+    header = [
+        'timestamp', 'record_id', 'model_1', 'model_2', 'preference',
+        'input', 'response_1', 'response_2', 'ground_truth', 'comments'
+    ]
+    row = [
+        datetime.now().isoformat(), current_id, m1, m2, pref,
+        rec1.get('input', ''), rec1.get('predicted', ''), rec2.get('predicted', ''),
+        rec1.get('ground_truth', ''), comments
+    ]
+    def submit_feedback():
+        append_feedback(feedback_path, header, row)
+        st.session_state.idx += 1
+        st.session_state.feedback_saved = True
+
+    st.button("Submit and Next", on_click=submit_feedback)
+
+
+if __name__ == '__main__':
+    main()
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-
-"""
-# Welcome to Streamlit!
-
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))
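
For context, a minimal sketch of the data layout that load_models and load_eval_tables in the new src/app.py expect: one subdirectory per model under src/for_experiments_prediction, each holding a predicted_vs_gt.json whose sections flatten into records keyed by 'id', plus an optional evaluation_table.parquet. The file names and record keys ('id', 'input', 'predicted', 'ground_truth') come from the code above; the model directory name and values below are purely illustrative.

import json
import os

# Hypothetical model folder; only 'for_experiments_prediction' and the
# file name 'predicted_vs_gt.json' are fixed by app.py.
model_dir = os.path.join("src", "for_experiments_prediction", "example-model")
os.makedirs(model_dir, exist_ok=True)

example = {
    "some_section": [  # list-valued sections are flattened directly by load_models
        {
            "id": "rec-001",  # becomes the record key
            "input": "Prompt text shown to the model",
            "predicted": "Model output text",
            "ground_truth": "Reference text",
        }
    ]
}

with open(os.path.join(model_dir, "predicted_vs_gt.json"), "w", encoding="utf-8") as f:
    json.dump(example, f, indent=2)

Records that share an 'id' across two such model directories become the pairs the app shows side by side; if an evaluation_table.parquet sits next to the JSON, its gt_sac_id or gt_title column is used to look up per-record metrics.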