thearnabsarkar committed on
Commit
a329232
·
verified ·
1 Parent(s): 0c09dd2

Upload json_semval/ml_model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. json_semval/ml_model.py +141 -0
json_semval/ml_model.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import warnings
6
+ from typing import List, Optional
7
+
8
+ try:
9
+ import onnxruntime as ort
10
+ except Exception: # pragma: no cover - optional
11
+ ort = None
12
+
13
+ from .schema_utils import collect_enums, collect_formats
14
+ from .types import Prediction
15
+
16
+
17
class SemanticReasoner:
    """Suggest repair actions for JSON-schema validation errors.

    Predictions come from a fixed heuristic keyed on each error's
    ``validator`` and ``message``.  An ONNX Runtime session may be opened at
    construction time, but ``predict`` never consults it, so every backend
    currently produces the same heuristic output.
    """

    def __init__(self, backend: str = "local", onnx_path: Optional[str] = None) -> None:
        """Configure the reasoner and, for the ONNX backend, try to open a session.

        Args:
            backend: ``"onnx"`` triggers session setup; any other value skips it.
            onnx_path: Model file path.  When falsy, the ``SEMVAL_ONNX_PATH``
                environment variable is used instead.

        Setup problems (onnxruntime missing, no path, session creation
        failing) are reported with ``warnings.warn`` and leave ``_session`` as
        ``None``; they never raise.
        """
        self.backend = backend
        self.onnx_path = onnx_path or os.getenv("SEMVAL_ONNX_PATH")
        self._session = None
        if backend != "onnx":
            return

        if ort is None:
            warnings.warn(
                "ONNX backend requested but onnxruntime is not installed; falling back to local heuristics.",
                stacklevel=2,
            )
        elif not self.onnx_path:
            warnings.warn(
                "ONNX backend requested but SEMVAL_ONNX_PATH is not set; falling back to local heuristics.",
                stacklevel=2,
            )
        else:
            try:
                self._session = ort.InferenceSession(self.onnx_path)
            except Exception as e:
                warnings.warn(
                    f"Failed to initialize ONNXRuntime session ({e}); falling back to local heuristics.",
                    stacklevel=2,
                )
                self._session = None

    def predict(
        self, schema_str: str, json_str: str, rule_errors: List[dict]
    ) -> List[Prediction]:
        """Return one suggested fix per rule error, where the heuristic has one.

        Args:
            schema_str: JSON text of the schema.
            json_str: JSON text of the validated document.  It is parsed only
                as a validity check; the parsed value is discarded.
            rule_errors: Error dicts.  The keys read are ``jsonpath``
                (default ``"$"``), ``validator``, ``message`` and, for
                ``format`` errors, ``validator_value``.

        Returns:
            Prediction dicts in the order of ``rule_errors``.  Errors with no
            suggestion (``minimum``/``maximum``, or a ``type`` error the
            heuristic does not recognise) contribute nothing.
        """
        if not rule_errors:
            return []
        try:
            schema = json.loads(schema_str)
            json.loads(json_str)  # validate but don't store
        except Exception:
            # If either text fails to parse, all schema-derived hints are
            # dropped and the purely message-based heuristics still run.
            schema = {}

        enum_map = collect_enums(schema)
        fmt_map = collect_formats(schema)

        predictions: List[Prediction] = []
        for err in rule_errors:
            suggestion = self._suggest_fix(err, enum_map, fmt_map)
            if suggestion is not None:
                predictions.append(suggestion)
        return predictions

    @staticmethod
    def _suggest_fix(err: dict, enum_map, fmt_map) -> Optional[Prediction]:
        """Map a single rule error to a prediction, or ``None`` for no suggestion."""
        jsonpath = err.get("jsonpath", "$")
        validator = err.get("validator")
        # ``or ""`` also covers a key that is present but null, which would
        # otherwise make the substring checks below raise TypeError.
        message = err.get("message") or ""

        def build(error_type: str, fix_action: str) -> dict:
            return {
                "error_type": error_type,
                "jsonpath": jsonpath,
                "fix_action": fix_action,
            }

        if validator == "type":
            # guess date vs bool vs number; a schema "format" at this path wins
            if jsonpath.replace("$", "") in fmt_map:
                return build("invalid_date", "parse_date_iso")
            if "boolean" in message:
                return build("boolean_text", "cast_bool")
            if "integer" in message or "number" in message:
                return build("number_text", "cast_number")
            return None

        if validator == "format":
            # only the "date" format has a dedicated fix
            if str(err.get("validator_value")) == "date":
                return build("invalid_date", "parse_date_iso")
            return build("wrong_type", "fill_default")

        if validator == "enum":
            allowed = enum_map.get(jsonpath.replace("$", ""), [])
            suggestion = build("enum_near_miss", "map_enum")
            # Schema enums may hold numbers, booleans or null; str.join only
            # accepts strings, so non-string members are rendered as JSON
            # literals.  String members are passed through unchanged.
            suggestion["fix_value"] = (
                ",".join(v if isinstance(v, str) else json.dumps(v) for v in allowed)
                if allowed
                else None
            )
            return suggestion

        if validator == "required":
            # When required property missing, try alias_key (heuristic)
            return build("alias_key", "rename_key")

        if validator in ("minimum", "maximum"):
            # no direct fix; leave to rules
            return None

        # fallback - no-op suggestion
        return build("wrong_type", "fill_default")