Spaces:
Runtime error
Runtime error
import sys
from typing import Optional, Tuple

import regex as re
from tqdm import tqdm

from .indic_num_map import INDIC_NUM_MAP
# Matches web addresses; the http(s)/ftp scheme prefix is optional.
URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'

# Matches e-mail addresses. The top-level-domain class is [A-Za-z]: inside a
# character class "|" is a literal pipe, not alternation, so it must not be listed.
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

# handles dates, time, percentages, proportion, ratio, etc
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"

# handles upi, social media handles and hashtags
# Only "#" and "@" introduce a handle; a literal "|" is deliberately not accepted.
OTHER_PATTERN = r'[A-Za-z0-9]*[#@]\w+'
def normalize_indic_numerals(line: str):
    """
    Normalize the numerals in Indic languages from native script to Roman script (if present).

    Each character of the input is looked up in INDIC_NUM_MAP; characters that
    have an entry are replaced by the mapped value, all others are kept as-is.

    Args:
        line (str): an input string with Indic numerals to be normalized.

    Returns:
        str: the input string with every mapped character replaced.
    """
    converted_chars = (INDIC_NUM_MAP.get(char, char) for char in line)
    return "".join(converted_chars)
def wrap_with_placeholders(text: str, patterns: list) -> Tuple[str, dict]:
    """
    Wraps substrings with matched patterns in the given text with placeholders and returns
    the modified text along with a mapping of the placeholders to their original value.

    Every distinct match is replaced by a numbered "<IDn>" tag. The returned map also
    contains spaced and translated spellings of each tag, all pointing to the same
    original substring, so the tag can still be resolved if it gets altered later.

    Args:
        text (str): an input string which needs to be wrapped with the placeholders.
        patterns (list): list of patterns to search for in the input string.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
            placeholders to their original values.
    """
    # Set of translations of "ID" in all the supported languages have been collated.
    # This has been added to deal with edge cases where placeholders might get translated.
    # Built once here because it does not depend on the match being processed.
    indic_failure_cases = ['آی ڈی ', 'ꯑꯥꯏꯗꯤ', 'आईडी', 'आई . डी . ', 'ऐटि', 'آئی ڈی ', 'ᱟᱭᱰᱤ ᱾', 'आयडी', 'ऐडि', 'आइडि']

    serial_no = 1
    placeholder_entity_map = dict()

    for pattern in patterns:
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
        # placeholder numbering is reproducible across runs (set iteration order is not).
        matches = dict.fromkeys(re.findall(pattern, text))

        # wrap common match with placeholder tags
        for match in matches:
            # An empty match would make str.replace insert the placeholder
            # between every character of the text, so it is never wrapped.
            if not match:
                continue

            if pattern == URL_PATTERN:
                # Avoids false positive URL matches for names with initials.
                temp = match.replace(".", '')
                if len(temp) < 4:
                    continue

            if pattern == NUMERAL_PATTERN:
                # Short numeral patterns do not need placeholder based handling.
                temp = match.replace(" ", '').replace(".", '').replace(":", '')
                if len(temp) < 4:
                    continue

            placeholder = "<ID{}>".format(serial_no)
            alternate_placeholder = "< ID{} >".format(serial_no)
            placeholder_entity_map[placeholder] = match
            placeholder_entity_map[alternate_placeholder] = match

            # Register every translated spelling of the tag, with and without spaces.
            for i in indic_failure_cases:
                placeholder_temp = "<{}{}>".format(i, serial_no)
                placeholder_entity_map[placeholder_temp] = match
                placeholder_temp = "< {}{} >".format(i, serial_no)
                placeholder_entity_map[placeholder_temp] = match
                placeholder_temp = "< {} {} >".format(i, serial_no)
                placeholder_entity_map[placeholder_temp] = match

            text = text.replace(match, placeholder)
            serial_no += 1

    # Collapse every run of whitespace into a single space (raw string keeps the escape valid).
    text = re.sub(r"\s+", " ", text)

    # Regex has failure cases in trailing "/" in URLs, so this is a workaround.
    text = text.replace(">/", ">")

    return text, placeholder_entity_map
def normalize(text: str, patterns: Optional[list] = None) -> Tuple[str, dict]:
    """
    Normalizes and wraps the spans of input string with placeholder tags. It first normalizes
    the Indic numerals in the input string to Roman script. Later, it uses the input string with normalized
    Indic numerals to wrap the spans of text matching the pattern with placeholder tags.

    Args:
        text (str): input string.
        patterns (list, optional): list of patterns to search for in the input string.
            When omitted (or None), the e-mail, URL, numeral and handle/hashtag
            patterns are used, in that order.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
            placeholders to their original values.
    """
    # Build the default list per call instead of sharing one mutable list
    # object across all calls through the function signature.
    if patterns is None:
        patterns = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]

    # Only leading/trailing newlines are stripped; other whitespace is left for the wrapper.
    text = normalize_indic_numerals(text.strip("\n"))
    text, placeholder_entity_map = wrap_with_placeholders(text, patterns)
    return text, placeholder_entity_map