| | import re |
| | import requests |
| | from bs4 import BeautifulSoup , Comment |
| | from abc import ABC, abstractmethod |
| | from typing import Any, Dict, Optional |
| |
|
| |
|
class Preprocessor(ABC):
    """
    Abstract base class for preprocessors.

    Subclasses implement ``preprocess`` to turn raw input (inline content or
    a URL to download) into a cleaned string. The base class stores the
    configuration and provides a shared helper for downloading a page.
    """

    # Browser-like headers sent with every download request.
    # NOTE(review): presumably chosen so that sites which reject obvious
    # non-browser clients still respond -- confirm before trimming them.
    _REQUEST_HEADERS: Dict[str, str] = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.6",
        "Cache-Control": "max-age=0",
        "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
    }

    # Timeout, in seconds, handed to ``requests.get``.
    _REQUEST_TIMEOUT: int = 15

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        Initialize the preprocessor with optional configuration.

        Args:
            config: A dictionary of configuration settings. When omitted
                (``None``), ``{'keep_tags': False}`` is used. A supplied
                dictionary is stored as-is, without copying.
                - keep_tags (bool): If True, keeps HTML tags in the output;
                  otherwise, cleans them.
        """
        self.config = config if config is not None else {'keep_tags': False}

    def _fetch_content(self, url: str) -> str:
        """
        Download a URL and return the raw response body as text.

        No HTML cleaning happens here; the body is returned exactly as
        decoded by ``requests``.

        Args:
            url: The URL to fetch content from.

        Returns:
            The decoded response body.

        Raises:
            ValueError: If the request fails (connection error, timeout, ...)
                or the server answers with a 4xx/5xx status. The original
                ``requests`` exception is attached as ``__cause__``.
        """
        try:
            response = requests.get(
                url,
                headers=self._REQUEST_HEADERS,
                timeout=self._REQUEST_TIMEOUT,
            )
            # Without this check an error page (404, 500, ...) would be
            # returned to the caller as if it were the requested content.
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}") from e

        return response.text

    @abstractmethod
    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (HTML, text, etc.) and apply preprocessing steps.

        Args:
            content: The raw data to preprocess, or a URL pointing at it.
            is_url: True when ``content`` is a URL whose page should be
                downloaded first; False when ``content`` is the data itself.

        Returns:
            The preprocessed content as a string.
        """
        pass
| |
|
class BasicPreprocessor(Preprocessor):
    """
    Preprocessor providing the common HTML clean-up steps.

    Meant to be usable directly and to serve as a starting point for more
    specialised preprocessors.
    """

    def _clean_html(self, html_content: str) -> str:
        """
        Strip the non-visible parts out of an HTML document.

        Steps, in order:
        - drop every <script> and <style> element together with its content;
        - drop every HTML comment;
        - if the ``keep_tags`` setting is truthy, return the remaining markup;
          otherwise return only the text, with each run of whitespace
          collapsed to a single space.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The remaining markup (``keep_tags`` truthy) or the visible
            text (``keep_tags`` falsy or missing).
        """

        def is_comment(node):
            return isinstance(node, Comment)

        document = BeautifulSoup(html_content, "html.parser")

        # Executable / styling payloads never count as visible content.
        for element in document.find_all(["script", "style"]):
            element.decompose()

        for remark in document.find_all(string=is_comment):
            remark.extract()

        if self.config.get('keep_tags', False):
            return str(document)

        visible = document.get_text(separator=" ", strip=True)
        return re.sub(r'\s+', ' ', visible)

    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Clean raw HTML, downloading it first when a URL is given.

        Args:
            content: The HTML to clean, or the URL of the page to clean.
            is_url: True when ``content`` is a URL to download; False when
                ``content`` already holds the HTML.

        Returns:
            The cleaned content with leading and trailing whitespace removed.
        """
        markup = self._fetch_content(content) if is_url else content
        return self._clean_html(markup).strip()
| |
|
| | |
| |
|
| | |