thearnabsarkar committed on
Commit
0c09dd2
·
verified ·
1 Parent(s): 313f40c

Upload json_semval/fixes.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. json_semval/fixes.py +177 -0
json_semval/fixes.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from datetime import datetime
5
+ from typing import Any, List, Optional
6
+
7
+ from dateutil import parser as date_parser
8
+ from rapidfuzz import fuzz, process
9
+
10
+ from .jsonpath_utils import delete_key_at_path, get_value_at_path, set_value_at_path
11
+
12
# Lower-case spellings recognised as boolean true / false.
BOOLEAN_TRUE = set("true yes y 1 on".split())
BOOLEAN_FALSE = set("false no n 0 off".split())

# English number words, listed so that position determines the value.
_UNIT_AND_TEEN_WORDS = (
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
)
_TENS_WORDS = (
    "twenty",
    "thirty",
    "forty",
    "fifty",
    "sixty",
    "seventy",
    "eighty",
    "ninety",
)

# Word -> integer lookup covering 0-19 and the multiples of ten from 20 to 90.
TEXT_NUMBERS = {word: number for number, word in enumerate(_UNIT_AND_TEEN_WORDS)}
TEXT_NUMBERS.update(
    {word: 10 * multiple for multiple, word in enumerate(_TENS_WORDS, start=2)}
)
45
+
46
+
47
def rename_key(payload: Any, src_path_tokens: List[str], dst_key: str) -> Any:
    """Move the value at ``src_path_tokens`` to a sibling key named ``dst_key``.

    The value is read from the source path, the source key is deleted, and the
    value is written back under the same parent as ``.<dst_key>``.

    Args:
        payload: The document to modify; it is passed to the path helpers and
            returned.
        src_path_tokens: Path tokens addressing the key to rename. A final
            token starting with ``"["`` is an array index and is not renamed.
        dst_key: New key name (without the leading ``"."``).

    Returns:
        ``payload`` in every case, including when nothing was renamed.
    """
    # An empty path addresses nothing that could be renamed; indexing it
    # would raise IndexError.
    if not src_path_tokens:
        return payload

    *parent_tokens, src_last = src_path_tokens
    if src_last.startswith("["):
        return payload  # renaming array indices not supported

    value = get_value_at_path(payload, src_path_tokens)
    delete_key_at_path(payload, src_path_tokens)
    set_value_at_path(payload, parent_tokens + [f".{dst_key}"], value)
    return payload
57
+
58
+
59
def cast_number(payload: Any, path_tokens: List[str]) -> Optional[Any]:
    """Coerce the value at ``path_tokens`` to a number, in place.

    Strings are stripped and lower-cased, then tried as an integer literal,
    as a float literal, and finally as English number words found in
    ``TEXT_NUMBERS``.

    Args:
        payload: The document to modify.
        path_tokens: Path tokens addressing the value to cast.

    Returns:
        ``payload`` if the value already was an ``int``/``float`` (``bool`` is
        an ``int`` subclass, so booleans pass through unchanged) or was
        converted; ``None`` if the value could not be interpreted as a number.
    """
    import math  # local import: only needed for the finiteness check below

    value = get_value_at_path(payload, path_tokens)
    if isinstance(value, (int, float)):
        return payload
    if not isinstance(value, str):
        return None

    text = value.strip().lower()

    number: Optional[Any] = None
    try:
        number = int(text)
    except ValueError:
        try:
            as_float = float(text)
        except ValueError:
            as_float = None
        # inf/nan parse as floats but cannot be represented in JSON, so they
        # are treated as "not a number" rather than written into the payload.
        if as_float is not None and math.isfinite(as_float):
            number = as_float

    if number is None:
        number = _number_words_to_int(text)
    if number is None:
        return None

    set_value_at_path(payload, path_tokens, number)
    return payload


def _number_words_to_int(text: str) -> Optional[int]:
    """Return the value of a 0-99 English number phrase, or ``None``."""
    words = re.split(r"[\s-]+", text)
    if len(words) == 1:
        return TEXT_NUMBERS.get(words[0])
    if len(words) == 2:
        tens = TEXT_NUMBERS.get(words[0])
        units = TEXT_NUMBERS.get(words[1])
        # Only "<tens> <units>" compounds such as "twenty-one" are valid;
        # summing arbitrary words would turn "one two" into 3.
        if tens is not None and units is not None and tens >= 20 and 1 <= units <= 9:
            return tens + units
    return None
83
+
84
+
85
def cast_bool(payload: Any, path_tokens: List[str]) -> Optional[Any]:
    """Coerce the value at ``path_tokens`` to a ``bool``, in place.

    Numbers are converted with ``bool()``. Strings are stripped, lower-cased
    and looked up in ``BOOLEAN_TRUE`` / ``BOOLEAN_FALSE``.

    Returns:
        ``payload`` if the value already was a ``bool`` or was converted;
        ``None`` for unrecognised strings and all other types.
    """
    current = get_value_at_path(payload, path_tokens)
    if isinstance(current, bool):
        return payload

    if isinstance(current, (int, float)):
        coerced = bool(current)
    elif isinstance(current, str):
        token = current.strip().lower()
        if token in BOOLEAN_TRUE:
            coerced = True
        elif token in BOOLEAN_FALSE:
            coerced = False
        else:
            return None
    else:
        return None

    set_value_at_path(payload, path_tokens, coerced)
    return payload
102
+
103
+
104
def parse_date_iso(payload: Any, path_tokens: List[str]) -> Optional[Any]:
    """Rewrite the value at ``path_tokens`` as an ISO ``YYYY-MM-DD`` date.

    Numbers are interpreted as POSIX timestamps in seconds and converted in
    UTC, so the resulting date does not depend on the host's timezone.
    Strings are parsed with ``dateutil``, first with its defaults and then
    with ``dayfirst=True``.

    Args:
        payload: The document to modify.
        path_tokens: Path tokens addressing the value to normalise.

    Returns:
        ``payload`` if the value was rewritten; ``None`` if it is not a
        number or string, is a ``bool``, or cannot be parsed as a date.
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    value = get_value_at_path(payload, path_tokens)

    # bool is a subclass of int; True/False are not timestamps.
    if isinstance(value, bool):
        return None

    if isinstance(value, (int, float)):
        try:
            dt = datetime.fromtimestamp(float(value), tz=timezone.utc)
        except (OverflowError, OSError, ValueError):
            # Out-of-range or non-finite timestamps.
            return None
        set_value_at_path(payload, path_tokens, dt.date().isoformat())
        return payload

    if isinstance(value, str):
        # Default parse first, then the common day-first variant.
        for parse_options in ({}, {"dayfirst": True}):
            try:
                dt = date_parser.parse(value, **parse_options)
            except (ValueError, OverflowError):
                continue
            set_value_at_path(payload, path_tokens, dt.date().isoformat())
            return payload

    return None
127
+
128
+
129
def map_enum(
    payload: Any,
    path_tokens: List[str],
    allowed_values: List[str],
    *,
    threshold: Optional[int] = None,
) -> Optional[Any]:
    """Snap the value at ``path_tokens`` to the closest allowed enum value.

    The value and the allowed values are compared as strings. An exact
    (case-sensitive) match leaves the payload untouched. Otherwise the
    lower-cased value is fuzzy-matched against the lower-cased allowed values
    with ``fuzz.WRatio``, and the original-cased allowed value is written back
    when the score reaches ``threshold``.

    Args:
        payload: The document to modify.
        path_tokens: Path tokens addressing the value to map.
        allowed_values: The permitted enum values.
        threshold: Minimum score to accept a fuzzy match. When ``None`` it is
            90 for enums with at most five values and 80 otherwise.

    Returns:
        ``payload`` on an exact or accepted fuzzy match; ``None`` if the value
        is ``None``, there are no allowed values, or no match is good enough.
    """
    current = get_value_at_path(payload, path_tokens)
    if current is None:
        return None

    candidate = str(current)
    canonical = [str(option) for option in allowed_values]
    if candidate in canonical:
        return payload
    if not canonical:
        return None

    # Adaptive threshold by enum size
    if threshold is None:
        threshold = 90 if len(canonical) <= 5 else 80

    lowered = [option.lower() for option in canonical]
    best = process.extractOne(candidate.lower(), lowered, scorer=fuzz.WRatio)
    if best is None:
        return None

    matched, score, _ = best
    if score < threshold or matched is None:
        return None

    # Write back the allowed value in its original casing.
    set_value_at_path(payload, path_tokens, canonical[lowered.index(matched)])
    return payload
160
+
161
+
162
def swap_dates(
    payload: Any, start_tokens: List[str], end_tokens: List[str]
) -> Optional[Any]:
    """Swap start/end date fields if inverted (heuristic).

    Both values are stringified and parsed with ``dateutil``. When the end
    parses earlier than the start, the two fields are exchanged and written
    back as ISO ``YYYY-MM-DD`` dates.

    Returns:
        ``payload`` if the fields were swapped; ``None`` if they were already
        in order or if anything raised while reading, parsing or writing.
    """
    try:
        raw_start = get_value_at_path(payload, start_tokens)
        raw_end = get_value_at_path(payload, end_tokens)
        start_dt = date_parser.parse(str(raw_start))
        end_dt = date_parser.parse(str(raw_end))
        if not end_dt < start_dt:
            return None
        set_value_at_path(payload, start_tokens, end_dt.date().isoformat())
        set_value_at_path(payload, end_tokens, start_dt.date().isoformat())
    except Exception:
        # Best-effort heuristic: any failure means "no fix applied".
        return None
    return payload