| import os |
| from scrapingbee import ScrapingBeeClient |
| from logger import setup_logger |
| import json |
|
|
# Module-level logger used by scrape_url below.
logger = setup_logger("scraper")

# Single ScrapingBee client reused across calls. The API key is read from the
# SCRAPINGBEE_API_KEY environment variable and falls back to an empty string
# when the variable is unset.
client = ScrapingBeeClient(api_key=os.environ.get('SCRAPINGBEE_API_KEY', ''))
|
|
def scrape_url(url: str) -> str:
    """
    Scrape content from a URL using ScrapingBee with AI extraction.

    The request is sent through the module-level ``client`` with the
    ``stealth_proxy``, ``country_code`` and ``ai_query`` parameters set.
    This function does not raise: every failure is reported through the
    returned string.

    Args:
        url: The URL to scrape. Empty, whitespace-only or non-string values
            are rejected before any request is made.

    Returns:
        str: The response text on HTTP 200 (or a placeholder message when the
        response text is empty), otherwise a human-readable error message.
    """
    # Reject unusable input up front so no API request is spent on it.
    if not isinstance(url, str) or not url.strip():
        return "Error scraping the URL: no URL provided"

    try:
        logger.info(f"Scraping URL: {url}")
        response = client.get(
            url,
            params={
                'stealth_proxy': True,
                'country_code': 'us',
                'ai_query': 'Extract the main text content from this page',
            },
        )

        status = response.status_code
        if status != 200:
            logger.error(f"Failed to scrape URL: {url}, Status: {status}")
            return f"Failed to download the URL. Status code: {status}"

        logger.info(f"Successfully scraped URL: {url}")
        # A 200 with an empty body is reported with a placeholder message
        # rather than returned as an empty string.
        return response.text or "No content could be extracted from the URL"

    except Exception as e:
        # Boundary handler: the function's contract is to return an error
        # string, so the exception is logged with its traceback and converted.
        logger.error(f"Error scraping URL: {url}", exc_info=True)
        return f"Error scraping the URL: {str(e)}"