Spaces:

dev-jas
/

polymer-aging-ml

Sleeping

devjas1 commited on Sep 4

Commit

0a4f1a6

1 Parent(s): 6ea9614

(FEAT)[Data Parsing]: Support multi-format spectrum parsing and robust validation

- Added 'detect_file_format' to auto-detect file type based on extension and content.
- Implemented 'parse_json_spectrum', 'parse_csv_spectrum', and 'parse_txt_spectrum' for flexible parsing of spectroscopy data.
- Unified entry point 'parse_spectrum_data' uses format detection and delegates to appropriate parser.
- Added 'validate_spectrum_data' to check for NaNs, monotonic x-axis, and reasonable value ranges, with sorting and warnings as needed.
- Updated error handling and logging for parsing failures or unusual data.
- Docstrings and comments improved for clarity.

Files changed (1) hide show

utils/multifile.py +297 -56

utils/multifile.py CHANGED Viewed

@@ -1,11 +1,16 @@
-"""Multi-file processing utiltities for batch inference.
-Handles multiple file uploads and iterative processing."""
-from typing import List, Dict, Any, Tuple, Optional
 import time
 import streamlit as st
 import numpy as np
 import pandas as pd
 from .preprocessing import resample_spectrum
 from .errors import ErrorHandler, safe_execute
@@ -13,83 +18,230 @@ from .results_manager import ResultsManager
 from .confidence import calculate_softmax_confidence
-def parse_spectrum_data(
-    text_content: str, filename: str = "unknown"
-) -> Tuple[np.ndarray, np.ndarray]:
-    """
-    Parse spectrum data from text content
     Args:
-        text_content: Raw text content of the spectrum file
-        filename: Name of the file for error reporting
     Returns:
-        Tuple of (x_values, y_values) as numpy arrays
-    Raises:
-        ValueError: If the data cannot be parsed
     """
-    try:
-        lines = text_content.strip().split("\n")
-        # ==Remove empty lines and comments==
-        data_lines = []
-        for line in lines:
-            line = line.strip()
-            if line and not line.startswith("#") and not line.startswith("%"):
-                data_lines.append(line)
-        if not data_lines:
-            raise ValueError("No data lines found in file")
-        # ==Try to parse==
-        x_vals, y_vals = [], []
-        for i, line in enumerate(data_lines):
-            try:
-                # Handle different separators
-                parts = line.replace(",", " ").split()
-                numbers = [
-                    p
-                    for p in parts
-                    if p.replace(".", "", 1)
-                    .replace("-", "", 1)
-                    .replace("+", "", 1)
-                    .isdigit()
-                ]
-                if len(numbers) >= 2:
-                    x_val = float(numbers[0])
-                    y_val = float(numbers[1])
                     x_vals.append(x_val)
                     y_vals.append(y_val)
             except ValueError:
                 ErrorHandler.log_warning(
-                    f"Could not parse line {i+1}: {line}", f"Parsing {filename}"
                 )
                 continue
-        if len(x_vals) < 10:  # ==Need minimum points for interpolation==
             raise ValueError(
                 f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
             )
-        x = np.array(x_vals)
-        y = np.array(y_vals)
-        # Check for NaNs
-        if np.any(np.isnan(x)) or np.any(np.isnan(y)):
-            raise ValueError("Input data contains NaN values")
-        # Check monotonic increasing x
-        if not np.all(np.diff(x) > 0):
-            raise ValueError("Wavenumbers must be strictly increasing")
-        # Check reasonable range for Raman spectroscopy
-        if min(x) < 0 or max(x) > 10000 or (max(x) - min(x)) < 100:
-            raise ValueError(
-                f"Invalid wavenumber range: {min(x)} - {max(x)}. Expected ~400-4000 cm⁻¹ with span >100"
-            )
         return x, y
@@ -97,6 +249,95 @@ def parse_spectrum_data(
         raise ValueError(f"Failed to parse spectrum data: {str(e)}")
 def process_single_file(
     filename: str,
     text_content: str,

+"""Multi-file processing utilities for batch inference.
+Handles multiple file uploads and iterative processing.
+Supports TXT, CSV, and JSON file formats with automatic detection."""
+from typing import List, Dict, Any, Tuple, Optional, Union
 import time
 import streamlit as st
 import numpy as np
 import pandas as pd
+import json
+import csv
+import io
+from pathlib import Path
 from .preprocessing import resample_spectrum
 from .errors import ErrorHandler, safe_execute
 from .confidence import calculate_softmax_confidence
+def detect_file_format(filename: str, content: str) -> str:
+    """Automatically detect file format based on exstention and content
     Args:
+        filename: Name of the file
+        content: Content of the file
     Returns:
+        File format: .'txt', .'csv', .'json'
     """
+    # First try by extension
+    suffix = Path(filename).suffix.lower()
+    if suffix == ".json":
+        try:
+            json.loads(content)
+            return "json"
+        except:
+            pass
+    elif suffix == ".csv":
+        return "csv"
+    elif suffix == ".txt":
+        return "txt"
+    # If extension doesn't match or is unclear, try content detection
+    content_stripped = content.strip()
+    # Try JSON
+    if content_stripped.startswith(("{", "[")):
+        try:
+            json.loads(content)
+            return "json"
+        except:
+            pass
+    # Try CSV (look for commas in first few lines)
+    lines = content_stripped.split("\n")[:5]
+    comma_count = sum(line.count(",") for line in lines)
+    if comma_count > len(lines):  # More commas than lines suggests CSV
+        return "csv"
+    # Default to TXT
+    return "txt"
+# /////////////////////////////////////////////////////
+def parse_json_spectrum(
+    content: str, filename: str = "unknown"
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Parse spectrum data from JSON format.
+    Expected formats:
+    - {"wavenumbers": [...], "intensities": [...]}
+    - {"x": [...], "y": [...]}
+    - [{"wavenumber": val, "intensity": val}, ...]
+    """
+    try:
+        data = json.load(content)
+        # Format 1: Object with arrays
+        if isinstance(data, dict):
+            x_key = None
+            y_key = None
+            # Try common key names for x-axis
+            for key in ["wavenumbers", "wavenumber", "x", "freq", "frequency"]:
+                if key in data:
+                    x_key = key
+                    break
+            # Try common key names for y-axis
+            for key in ["intensities", "intensity", "y", "counts", "absorbance"]:
+                if key in data:
+                    y_key = key
+                    break
+            if x_key and y_key:
+                x_vals = np.array(data[x_key], dtype=float)
+                y_vals = np.array(data[y_key], dtype=float)
+                return x_vals, y_vals
+        # Format 2: Array of objects
+        elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
+            x_vals = []
+            y_vals = []
+            for item in data:
+                # Try to find x and y values
+                x_val = None
+                y_val = None
+                for x_key in ["wavenumber", "wavenumbers", "x", "freq"]:
+                    if x_key in item:
+                        x_val = float(item[x_key])
+                        break
+                for y_key in ["intensity", "intensities", "y", "counts"]:
+                    if y_key in item:
+                        y_val = float(item[y_key])
+                        break
+                if x_val is not None and y_val is not None:
                     x_vals.append(x_val)
                     y_vals.append(y_val)
+            if x_vals and y_vals:
+                return np.array(x_vals), np.array(y_vals)
+        raise ValueError(
+            "JSON format not recognized. Expected wavenumber/intensity pairs."
+        )
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON format: {str(e)}")
+    except Exception as e:
+        raise ValueError(f"Failed to parse JSON spectrum: {str(e)}")
+# /////////////////////////////////////////////////////
+def parse_csv_spectrum(
+    content: str, filename: str = "unknown"
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Parse spectrum data from CSV format.
+    Handles various CSV formats with headers or without.
+    """
+    try:
+        # Use StringIO to treat string as file-like object
+        csv_file = io.StringIO(content)
+        # Try to detect delimiter
+        sample = content[:1024]
+        delimiter = ","
+        if sample.count(";") > sample.count(","):
+            delimiter = ";"
+        elif sample.count("\t") > sample.count(","):
+            delimiter = "\t"
+        # Read CSV
+        csv_reader = csv.reader(csv_file, delimiter=delimiter)
+        rows = list(csv_reader)
+        if not rows:
+            raise ValueError("Empty CSV file")
+        # Check if first row is header
+        has_header = False
+        try:
+            # If first row contains non-numeric data, it's likely a header
+            float(rows[0][0])
+            float(rows[0][1])
+        except (ValueError, IndexError):
+            has_header = True
+        data_rows = rows[1:] if has_header else rows
+        # Extract x and y values
+        x_vals = []
+        y_vals = []
+        for i, row in enumerate(data_rows):
+            if len(row) < 2:
+                continue
+            try:
+                x_val = float(row[0])
+                y_val = float(row[1])
+                x_vals.append(x_val)
+                y_vals.append(y_val)
             except ValueError:
                 ErrorHandler.log_warning(
+                    f"Could not parse CSV row {i+1}: {row}", f"Parsing {filename}"
                 )
                 continue
+        if len(x_vals) < 10:
             raise ValueError(
                 f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
             )
+        return np.array(x_vals), np.array(y_vals)
+    except Exception as e:
+        raise ValueError(f"Failed to parse CSV spectrum: {str(e)}")
+# /////////////////////////////////////////////////////
+def parse_spectrum_data(
+    text_content: str, filename: str = "unknown", file_format: Optional[str] = None
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Parse spectrum data from text content with automatic format detection.
+    Args:
+        text_content: Raw text content of the spectrum file
+        filename: Name of the file for error reporting
+        file_format: Force specific format ('txt', 'csv', 'json') or None for auto-detection
+    Returns:
+        Tuple of (x_values, y_values) as numpy arrays
+    Raises:
+        ValueError: If the data cannot be parsed
+    """
+    try:
+        # Detect format if not specified
+        if file_format is None:
+            file_format = detect_file_format(filename, text_content)
+        # Parse based on detected/specified format
+        if file_format == "json":
+            x, y = parse_json_spectrum(text_content, filename)
+        elif file_format == "csv":
+            x, y = parse_csv_spectrum(text_content, filename)
+        else:  # Default to TXT format
+            x, y = parse_txt_spectrum(text_content, filename)
+        # Common validation for all formats
+        validate_spectrum_data(x, y, filename)
         return x, y
         raise ValueError(f"Failed to parse spectrum data: {str(e)}")
+# /////////////////////////////////////////////////////
+def parse_txt_spectrum(
+    content: str, filename: str = "unknown"
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Parse spectrum data from TXT format (original implementation).
+    """
+    lines = content.strip().split("\n")
+    # ==Remove empty lines and comments==
+    data_lines = []
+    for line in lines:
+        line = line.strip()
+        if line and not line.startswith("#") and not line.startswith("%"):
+            data_lines.append(line)
+    if not data_lines:
+        raise ValueError("No data lines found in file")
+    # ==Try to parse==
+    x_vals, y_vals = [], []
+    for i, line in enumerate(data_lines):
+        try:
+            # Handle different separators
+            parts = line.replace(",", " ").split()
+            numbers = [
+                p
+                for p in parts
+                if p.replace(".", "", 1)
+                .replace("-", "", 1)
+                .replace("+", "", 1)
+                .isdigit()
+            ]
+            if len(numbers) >= 2:
+                x_val = float(numbers[0])
+                y_val = float(numbers[1])
+                x_vals.append(x_val)
+                y_vals.append(y_val)
+        except ValueError:
+            ErrorHandler.log_warning(
+                f"Could not parse line {i+1}: {line}", f"Parsing {filename}"
+            )
+            continue
+    if len(x_vals) < 10:  # ==Need minimum points for interpolation==
+        raise ValueError(
+            f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
+        )
+    return np.array(x_vals), np.array(y_vals)
+# /////////////////////////////////////////////////////
+def validate_spectrum_data(x: np.ndarray, y: np.ndarray, filename: str) -> None:
+    """
+    Validate parsed spectrum data for common issues.
+    """
+    # Check for NaNs
+    if np.any(np.isnan(x)) or np.any(np.isnan(y)):
+        raise ValueError("Input data contains NaN values")
+    # Check monotonic increasing x (sort if needed)
+    if not np.all(np.diff(x) >= 0):
+        # Sort by x values if not monotonic
+        sort_idx = np.argsort(x)
+        x = x[sort_idx]
+        y = y[sort_idx]
+        ErrorHandler.log_warning(
+            "Wavenumbers were not monotonic - data has been sorted",
+            f"Parsing {filename}",
+        )
+    # Check reasonable range for spectroscopy
+    if min(x) < 0 or max(x) > 10000 or (max(x) - min(x)) < 100:
+        ErrorHandler.log_warning(
+            f"Unusual wavenumber range: {min(x):.1f} - {max(x):.1f} cm⁻¹",
+            f"Parsing {filename}",
+        )
+# /////////////////////////////////////////////////////
 def process_single_file(
     filename: str,
     text_content: str,