File size: 8,279 Bytes
233102d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511a4f9
233102d
 
 
 
 
 
 
 
 
 
 
 
 
511a4f9
233102d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
"""
Text normalization utilities for extracted PDF content.

These functions are PURE FUNCTIONS — they take a string,
return a string, have no side effects, and are independently
testable. This is the correct way to write data transformation
logic.
"""

import re
import unicodedata
import ftfy

from src.utils.logger import get_logger

logger = get_logger(__name__)


def fix_hyphenated_linebreaks(text: str) -> str:
    r"""
    Rejoin words that were split across lines with a hyphen.

    Research PDFs use justified text with hyphenation:
        "This is a demon-
         stration of the problem"

    Should become:
        "This is a demonstration of the problem"

    REGEX EXPLANATION:
        ([^\W\d_])   -> capture a letter, ASCII or accented (end of fragment)
        -            -> literal hyphen
        [ \t]*       -> optional trailing spaces/tabs left by the extractor
        \r?\n        -> line break (LF or CRLF)
        \s*          -> optional whitespace on next line
        ([^\W\d_])   -> capture a letter (start of continuation)

    Digits are deliberately not treated as letters, so a numeric range that
    breaks at its hyphen ("10-" / "20") is left untouched.

    LIMITATION: a genuine compound word that happens to break at its hyphen
    ("well-" / "known") is joined as well ("wellknown"); the text alone does
    not say which hyphens are real.

    Args:
        text: Raw extracted text.

    Returns:
        The text with hyphen + line break removed wherever it sits between
        two letters.
    """
    # This docstring is a raw string on purpose: it spells out regex escapes
    # such as \s that are not valid escapes in an ordinary string literal.
    return re.sub(r'([^\W\d_])-[ \t]*\r?\n\s*([^\W\d_])', r'\1\2', text)



# A line matching ANY of these, anywhere in the line and case-insensitively,
# is treated as header/footer/footnote boilerplate. Compiled once at import
# time rather than being rebuilt for every line of every document.
_ARTIFACT_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'arXiv:\d{4}\.\d+',           # arXiv:2301.07041v2
        r'^doi:\s*10\.',               # DOI lines
        r'Preprint\.\s*Under review',  # "Preprint. Under review"
        r'Under review',               # Review notice
        r'Proceedings of (ICML|NeurIPS|ICLR|CVPR|ACL|EMNLP)',
        r'(ICML|NeurIPS|ICLR|CVPR|ACL|EMNLP)\s+20\d{2}',  # "ICML 2023"
        r'Workshop on',                # Workshop lines
        r'^\*+Equal contribution',     # Footnotes
        r'^\dDepartment of',           # Affiliation footnotes
        r'^\d+University of',          # University affiliations
        r'Correspondence to:',         # Contact info
    )
)

# A line that is nothing but 1-4 digits is taken to be a page number.
_PAGE_NUMBER = re.compile(r'\d{1,4}')


def remove_page_artifacts(text: str) -> str:
    """
    Remove common PDF page artifacts that pollute extracted text.

    Handles:
    - Form feed characters (U+000C): each one becomes a line break
    - Standalone page numbers: lines consisting solely of 1-4 digits
    - Boilerplate lines matching one of the known patterns (arXiv ids,
      DOI lines, review notices, venue names, affiliation footnotes, ...)

    NOTE: most boilerplate patterns are matched anywhere in the line and
    without regard to case or line length, so a body line that merely
    contains e.g. "under review" or "workshop on" is dropped too.

    Blank (or whitespace-only) lines are kept as empty lines so paragraph
    breaks survive; all other kept lines are returned unmodified, including
    their original leading/trailing whitespace.

    Args:
        text: Extracted text, possibly containing form feeds.

    Returns:
        The text with artifact lines removed, joined with newlines.
    """
    # A form feed marks a page boundary; treat it like any other line break.
    lines = text.replace('\x0c', '\n').split('\n')
    cleaned_lines = []

    for line in lines:
        stripped = line.strip()

        # Keep blank lines (spacing is normalized by a later step).
        if not stripped:
            cleaned_lines.append('')
            continue

        # Drop standalone page numbers, e.g. "12", "247".
        if _PAGE_NUMBER.fullmatch(stripped):
            continue

        # Drop header/footer/footnote boilerplate.
        if any(pattern.search(stripped) for pattern in _ARTIFACT_PATTERNS):
            continue

        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)


def normalize_whitespace(text: str) -> str:
    """
    Normalize all forms of whitespace to standard single spaces.

    PDFs produce various whitespace characters:
    - Multiple consecutive spaces (from column alignment)
    - Tabs
    - Non-breaking spaces (U+00A0) and other Unicode space characters
    - Zero-width spaces (U+200B) and stray byte-order marks (U+FEFF)

    STRATEGY:
    1. Delete zero-width characters (they have no visible width, so
       replacing them with a space would split words)
    2. Replace every run of non-newline whitespace with a single space
    3. Strip each line
    4. Collapse 3+ consecutive newlines into exactly 2
       (preserving paragraph breaks)
    5. Strip leading/trailing whitespace of the whole text

    Step 3 must come before step 4: a line holding only spaces is not an
    empty line until it has been stripped, so collapsing first would leave
    runs of more than two newlines behind.

    Args:
        text: Text to normalize.

    Returns:
        The normalized text; never more than one blank line in a row.
    """
    # Zero-width space and BOM / zero-width no-break space: remove outright.
    text = text.replace('\u200b', '').replace('\ufeff', '')

    # [^\S\n] = "whitespace, but not a newline": covers spaces, tabs, NBSP,
    # carriage returns and the other Unicode space characters in one pass.
    text = re.sub(r'[^\S\n]+', ' ', text)

    # Strip each line first so whitespace-only lines become truly empty ...
    text = '\n'.join(line.strip() for line in text.split('\n'))

    # ... and only then cap blank-line runs at one blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()



def fix_unicode(text: str) -> str:
    """
    Repair broken Unicode in extracted PDF text by delegating to ftfy.

    Typical damage this step is meant to undo:
    - Mojibake such as "â€™" where an apostrophe was intended
      (UTF-8 bytes decoded with a single-byte encoding)
    - "Ã©" where "é" was intended
    - Ligature characters such as "ﬁ" where plain "fi" is wanted

    The whole job is handed to ``ftfy.fix_text`` with its default settings;
    no additional processing happens here.

    Args:
        text: Text as produced by the PDF extractor.

    Returns:
        Whatever ``ftfy.fix_text`` returns for the input.
    """
    repaired = ftfy.fix_text(text)
    return repaired



def remove_reference_section(text: str) -> str:
    """
    Cut the bibliography/references section off the end of a paper.

    WHY: a reference list is mostly author names, venues and years. Indexing
    it would let a query such as "attention mechanisms" retrieve a chunk that
    is nothing but citations.

    APPROACH: for every heading pattern, locate its LAST match; the latest of
    those positions is the cut point. Everything from the cut point onward is
    dropped, but only when the cut point lies in the final 40% of the text.
    A heading found earlier than that is assumed not to be the bibliography
    and the text is returned untouched.

    Matching is case-sensitive; the capitalized and the all-caps spellings
    are listed explicitly.

    Args:
        text: Full document text.

    Returns:
        The text truncated just before the references heading, or the
        original text when no suitable heading is found.
    """
    heading_patterns = (
        r'\n\s*References\s*\n',
        r'\n\s*REFERENCES\s*\n',
        r'\n\s*Bibliography\s*\n',
        r'\n\s*BIBLIOGRAPHY\s*\n',
        r'\n\s*\d+\.\s*References\s*\n',   # numbered heading: "7. References"
        r'\n\s*\d+\s+References\s*\n',     # numbered heading: "7 References"
        r'\nReferences$',                  # heading with nothing after it on the line
    )

    # Offset of the final hit for each pattern that matches at all.
    final_hits = []
    for heading in heading_patterns:
        offsets = [hit.start() for hit in re.finditer(heading, text, re.MULTILINE)]
        if offsets:
            final_hits.append(offsets[-1])

    cut_at = max(final_hits, default=-1)

    # No heading found (or one at offset 0): nothing to do.
    if cut_at <= 0:
        return text

    # Only trust a heading that sits in the last 40% of the document.
    if cut_at > len(text) * 0.60:
        logger.debug('References section removed')
        return text[:cut_at]

    logger.debug(
        f"Reference found at {cut_at/len(text):.0%} "
        f"- too early to be bibliography, keeping"
    )
    return text




def remove_short_lines(text: str, min_length: int = 3) -> str:
    """
    Drop lines whose visible content is shorter than ``min_length``.

    Very short lines in extracted PDFs are usually noise: stray characters
    from column separators, single-letter markers and the like. Length is
    measured after stripping surrounding whitespace.

    Blank and whitespace-only lines are always kept, so paragraph breaks
    are not affected by this step. Kept lines are returned unmodified.

    Args:
        text: Text to filter, split on newlines.
        min_length: Minimum stripped length a non-blank line needs in order
            to be kept.

    Returns:
        The surviving lines joined with newlines.
    """
    survivors = []
    for raw_line in text.split('\n'):
        visible_chars = len(raw_line.strip())
        # Non-blank but below the threshold: this is the only case dropped.
        if 0 < visible_chars < min_length:
            continue
        survivors.append(raw_line)

    return '\n'.join(survivors)


def clean_text(text: str) -> str:
    """
    Run the full cleaning pipeline over extracted text.

    Input that is empty or whitespace-only short-circuits to "".

    Stages run in this fixed ORDER, each receiving the previous stage's
    output:
    1. fix_unicode               - so later regexes see clean characters
    2. fix_hyphenated_linebreaks - before whitespace is normalized
    3. remove_page_artifacts     - before whitespace is normalized
    4. remove_reference_section  - on mostly clean text
    5. remove_short_lines
    6. normalize_whitespace      - LAST, tidies up after everything else

    Args:
        text: Raw text extracted from a PDF.

    Returns:
        The cleaned text, or "" when there was nothing to clean.
    """
    if not (text and text.strip()):
        return ""

    stages = (
        fix_unicode,
        fix_hyphenated_linebreaks,
        remove_page_artifacts,
        remove_reference_section,
        remove_short_lines,
        normalize_whitespace,
    )
    for stage in stages:
        text = stage(text)

    return text