import re
import nltk
from typing import List, Dict, Any
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
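
# nltk.sent_tokenize() requires the Punkt tokenizer data; fetch it once if
# missing. (Newer NLTK releases may need the "punkt_tab" resource instead.)
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)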

def clean_text(text: str) -> str:
    """
    Clean and preprocess extracted text from PDF.
    """
    # Collapse runs of whitespace but preserve newlines: extract_chapters()
    # below depends on line breaks to detect chapter headers
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()
    
    # Remove common PDF artifacts, keeping ordinary punctuation plus
    # apostrophes and quotes so words like "don't" survive
    text = re.sub(r'[^\w\s.,!?;:\'"\-()\[\]{}]', '', text)
    
    return text

def chunk_text(text: str, max_chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks for processing.
    
    Args:
        text: Input text to chunk
        max_chunk_size: Maximum size of each chunk
        overlap: Number of characters to overlap between chunks
    
    Returns:
        List of text chunks
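
    Example (illustrative; note the second chunk begins inside the
    5-character overlap window):
        >>> chunk_text("First. Second. Third.", max_chunk_size=15, overlap=5)
        ['First. Second.', 'cond. Third.']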
    """
    if len(text) <= max_chunk_size:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + max_chunk_size
        
        # Prefer to break at the last sentence boundary inside the window
        if end < len(text):
            last_ending = max(text.rfind(e, start, end) for e in '.!?')
            # Only break early if the boundary is at least 80% of the way
            # through the chunk, so chunks don't get too small
            if last_ending > start + max_chunk_size * 0.8:
                end = last_ending + 1
        
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        
        # Stop once the window reaches the end of the text; otherwise the
        # final iteration would re-emit a tail already covered by this chunk
        if end >= len(text):
            break
        
        # Move the start back by the overlap, but always advance past the
        # previous start so the loop terminates even when overlap is large
        start = max(end - overlap, start + 1)
    
    return chunks

def extract_chapters(text: str) -> Dict[str, str]:
    """
    Attempt to extract chapters from the text.
    """
    chapters = {}
    
    # Common chapter header patterns (matched case-insensitively below,
    # so "CHAPTER 3" is covered by the first pattern as well)
    chapter_patterns = [
        r'Chapter\s+(\d+|[IVXLC]+)',
        r'(\d+)\.\s+[A-Z]',
        r'[IVXLC]+\.\s+[A-Z]'
    ]
    
    lines = text.split('\n')
    current_chapter = "Introduction"
    current_content = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Check if this line is a chapter header
        is_chapter_header = False
        for pattern in chapter_patterns:
            if re.match(pattern, line, re.IGNORECASE):
                # Save previous chapter
                if current_content:
                    chapters[current_chapter] = '\n'.join(current_content)
                
                current_chapter = line
                current_content = []
                is_chapter_header = True
                break
        
        if not is_chapter_header:
            current_content.append(line)
    
    # Save the last chapter
    if current_content:
        chapters[current_chapter] = '\n'.join(current_content)
    
    return chapters

def get_text_statistics(text: str) -> Dict[str, Any]:
    """
    Get basic statistics about the text.
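
    Example (illustrative): a 400-word text gives
    estimated_reading_time_minutes == 400 / 200 == 2.0, using the
    assumed average reading speed of ~200 words per minute.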
    """
    words = text.split()
    sentences = nltk.sent_tokenize(text)
    
    return {
        'total_characters': len(text),
        'total_words': len(words),
        'total_sentences': len(sentences),
        'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
        'estimated_reading_time_minutes': len(words) / 200  # Average reading speed
    }
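

if __name__ == "__main__":
    # Minimal usage sketch over an illustrative sample string; real input
    # would be text extracted from a PDF
    sample = (
        "Chapter 1\n"
        "This is the first chapter. It has two sentences.\n"
        "Chapter 2\n"
        "The second chapter is here!\n"
    )
    cleaned = clean_text(sample)
    logger.info("Chunks: %s", chunk_text(cleaned, max_chunk_size=40, overlap=10))
    logger.info("Chapters found: %s", list(extract_chapters(cleaned)))
    logger.info("Statistics: %s", get_text_statistics(cleaned))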