Check that all antibodies are present and remove redundant validation
Browse files
- constants.py +0 -1
- test/test_validation.py +3 -6
- validation.py +9 -43
constants.py
CHANGED
|
@@ -30,7 +30,6 @@ ASSAY_EMOJIS = {
|
|
| 30 |
}
|
| 31 |
|
| 32 |
# Input CSV file requirements
|
| 33 |
-
MINIMAL_NUMBER_OF_ROWS: int = 50
|
| 34 |
REQUIRED_COLUMNS: list[str] = [
|
| 35 |
"antibody_name",
|
| 36 |
"vh_protein_sequence",
|
|
|
|
| 30 |
}
|
| 31 |
|
| 32 |
# Input CSV file requirements
|
|
|
|
| 33 |
REQUIRED_COLUMNS: list[str] = [
|
| 34 |
"antibody_name",
|
| 35 |
"vh_protein_sequence",
|
test/test_validation.py
CHANGED
|
@@ -71,14 +71,11 @@ class TestValidateDataframe:
|
|
| 71 |
|
| 72 |
assert "CSV file is empty" in str(exc_info.value)
|
| 73 |
|
| 74 |
-
def
|
| 75 |
-
df = valid_input_dataframe.head(
|
| 76 |
with pytest.raises(gr.Error) as exc_info:
|
| 77 |
validate_dataframe(df)
|
| 78 |
-
|
| 79 |
-
assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
|
| 80 |
-
exc_info.value
|
| 81 |
-
)
|
| 82 |
|
| 83 |
def test_missing_values_raises_error(self, valid_input_dataframe):
|
| 84 |
bad_column = REQUIRED_COLUMNS[0]
|
|
|
|
| 71 |
|
| 72 |
assert "CSV file is empty" in str(exc_info.value)
|
| 73 |
|
| 74 |
+
def test_missing_antibodies_raises_error(self, valid_input_dataframe):
|
| 75 |
+
df = valid_input_dataframe.head(50)
|
| 76 |
with pytest.raises(gr.Error) as exc_info:
|
| 77 |
validate_dataframe(df)
|
| 78 |
+
assert "Missing predictions for" in str(exc_info.value)
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
def test_missing_values_raises_error(self, valid_input_dataframe):
|
| 81 |
bad_column = REQUIRED_COLUMNS[0]
|
validation.py
CHANGED
|
@@ -3,7 +3,6 @@ import io
|
|
| 3 |
import gradio as gr
|
| 4 |
from constants import (
|
| 5 |
REQUIRED_COLUMNS,
|
| 6 |
-
MINIMAL_NUMBER_OF_ROWS,
|
| 7 |
ASSAY_LIST,
|
| 8 |
CV_COLUMN,
|
| 9 |
EXAMPLE_FILE_DICT,
|
|
@@ -64,16 +63,6 @@ def validate_cv_submission(df: pd.DataFrame, submission_type: str = "GDPa1_CV")
|
|
| 64 |
how="left",
|
| 65 |
suffixes=("_expected", "_submitted"),
|
| 66 |
)
|
| 67 |
-
# All antibodies should be present if using CV
|
| 68 |
-
missing_antibodies_mask = antibody_check[f"{CV_COLUMN}_submitted"].isna()
|
| 69 |
-
n_missing_antibodies = missing_antibodies_mask.sum()
|
| 70 |
-
if n_missing_antibodies > 0:
|
| 71 |
-
missing_antibodies = (
|
| 72 |
-
antibody_check[missing_antibodies_mask]["antibody_name"].head(5).tolist()
|
| 73 |
-
)
|
| 74 |
-
raise gr.Error(
|
| 75 |
-
f"β Missing predictions for {n_missing_antibodies} antibodies. Examples: {', '.join(missing_antibodies)}"
|
| 76 |
-
)
|
| 77 |
# CV fold assignments should match
|
| 78 |
fold_mismatches = antibody_check[
|
| 79 |
antibody_check[f"{CV_COLUMN}_expected"]
|
|
@@ -89,26 +78,6 @@ def validate_cv_submission(df: pd.DataFrame, submission_type: str = "GDPa1_CV")
|
|
| 89 |
f"β Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
|
| 90 |
)
|
| 91 |
|
| 92 |
-
# Merge on both columns for assay validation
|
| 93 |
-
merged_cv_df = expected_cv_df.merge(df, on=["antibody_name", CV_COLUMN], how="left")
|
| 94 |
-
|
| 95 |
-
# Check for missing assay predictions
|
| 96 |
-
assay_columns = get_assay_columns(merged_cv_df)
|
| 97 |
-
for assay_column in assay_columns:
|
| 98 |
-
missing_antibodies = merged_cv_df[merged_cv_df[assay_column].isna()][
|
| 99 |
-
"antibody_name"
|
| 100 |
-
].unique()
|
| 101 |
-
if len(missing_antibodies) > 0:
|
| 102 |
-
raise gr.Error(
|
| 103 |
-
f"β Missing {assay_column} predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies[:5])}"
|
| 104 |
-
)
|
| 105 |
-
|
| 106 |
-
# Step 5: Check that submission length matches expected
|
| 107 |
-
if len(merged_cv_df) != len(expected_cv_df):
|
| 108 |
-
raise gr.Error(
|
| 109 |
-
f"β Expected {len(expected_cv_df)} rows, got {len(merged_cv_df)}"
|
| 110 |
-
)
|
| 111 |
-
|
| 112 |
|
| 113 |
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
| 114 |
"""Validate full dataset submission"""
|
|
@@ -118,13 +87,6 @@ def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
|
| 118 |
"Please select 'Cross-Validation Predictions' if you want to submit CV results."
|
| 119 |
)
|
| 120 |
|
| 121 |
-
# All names should be unique (duplicates check from original validation)
|
| 122 |
-
n_duplicates = df["antibody_name"].duplicated().sum()
|
| 123 |
-
if n_duplicates > 0:
|
| 124 |
-
raise gr.Error(
|
| 125 |
-
f"β Standard submissions should have only one prediction per antibody. Found {n_duplicates} duplicates."
|
| 126 |
-
)
|
| 127 |
-
|
| 128 |
|
| 129 |
def get_assay_columns(df: pd.DataFrame) -> list[str]:
|
| 130 |
"""Get all assay columns from the DataFrame"""
|
|
@@ -174,17 +136,12 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
|
|
| 174 |
if missing_count > 0:
|
| 175 |
raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")
|
| 176 |
|
| 177 |
-
# Above minimal number of rows
|
| 178 |
-
if len(df) < MINIMAL_NUMBER_OF_ROWS:
|
| 179 |
-
raise gr.Error(f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
|
| 180 |
-
|
| 181 |
# All names should be unique
|
| 182 |
n_duplicates = df["antibody_name"].duplicated().sum()
|
| 183 |
if n_duplicates > 0:
|
| 184 |
raise gr.Error(
|
| 185 |
f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
| 186 |
)
|
| 187 |
-
|
| 188 |
# All antibody names should be recognizable
|
| 189 |
unrecognized_antibodies = set(df["antibody_name"]) - set(
|
| 190 |
ANTIBODY_NAMES_DICT[submission_type]
|
|
@@ -193,6 +150,15 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
|
|
| 193 |
raise gr.Error(
|
| 194 |
f"β Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
|
| 195 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
# Submission-type specific validation
|
| 197 |
if submission_type.endswith("_CV"):
|
| 198 |
validate_cv_submission(df, submission_type)
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
from constants import (
|
| 5 |
REQUIRED_COLUMNS,
|
|
|
|
| 6 |
ASSAY_LIST,
|
| 7 |
CV_COLUMN,
|
| 8 |
EXAMPLE_FILE_DICT,
|
|
|
|
| 63 |
how="left",
|
| 64 |
suffixes=("_expected", "_submitted"),
|
| 65 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# CV fold assignments should match
|
| 67 |
fold_mismatches = antibody_check[
|
| 68 |
antibody_check[f"{CV_COLUMN}_expected"]
|
|
|
|
| 78 |
f"β Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
|
| 79 |
)
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
| 83 |
"""Validate full dataset submission"""
|
|
|
|
| 87 |
"Please select 'Cross-Validation Predictions' if you want to submit CV results."
|
| 88 |
)
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
def get_assay_columns(df: pd.DataFrame) -> list[str]:
|
| 92 |
"""Get all assay columns from the DataFrame"""
|
|
|
|
| 136 |
if missing_count > 0:
|
| 137 |
raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
# All names should be unique
|
| 140 |
n_duplicates = df["antibody_name"].duplicated().sum()
|
| 141 |
if n_duplicates > 0:
|
| 142 |
raise gr.Error(
|
| 143 |
f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
| 144 |
)
|
|
|
|
| 145 |
# All antibody names should be recognizable
|
| 146 |
unrecognized_antibodies = set(df["antibody_name"]) - set(
|
| 147 |
ANTIBODY_NAMES_DICT[submission_type]
|
|
|
|
| 150 |
raise gr.Error(
|
| 151 |
f"β Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
|
| 152 |
)
|
| 153 |
+
|
| 154 |
+
# All antibody names should be present
|
| 155 |
+
missing_antibodies = set(ANTIBODY_NAMES_DICT[submission_type]) - set(
|
| 156 |
+
df["antibody_name"]
|
| 157 |
+
)
|
| 158 |
+
if missing_antibodies:
|
| 159 |
+
raise gr.Error(
|
| 160 |
+
f"β Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}"
|
| 161 |
+
)
|
| 162 |
# Submission-type specific validation
|
| 163 |
if submission_type.endswith("_CV"):
|
| 164 |
validate_cv_submission(df, submission_type)
|