import re
import nltk
from typing import List, Dict, Any
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
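
# nltk.sent_tokenize() requires the Punkt tokenizer data; fetch it once if
# missing. (Newer NLTK releases may need the "punkt_tab" resource instead.)
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)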

def clean_text(text: str) -> str:
    """
    Clean and preprocess extracted text from PDF.
    """
    # Collapse runs of whitespace but preserve newlines: extract_chapters()
    # below depends on line breaks to detect chapter headers
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = text.strip()
    
    # Remove common PDF artifacts, keeping ordinary punctuation plus
    # apostrophes and quotes so words like "don't" survive
    text = re.sub(r'[^\w\s.,!?;:\'"\-()\[\]{}]', '', text)
    
    return text

def chunk_text(text: str, max_chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split text into overlapping chunks for processing.
    
    Args:
        text: Input text to chunk
        max_chunk_size: Maximum size of each chunk
        overlap: Number of characters to overlap between chunks
    
    Returns:
        List of text chunks
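
    Example (illustrative; note the second chunk begins inside the
    5-character overlap window):
        >>> chunk_text("First. Second. Third.", max_chunk_size=15, overlap=5)
        ['First. Second.', 'cond. Third.']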
    """
    if len(text) <= max_chunk_size:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + max_chunk_size
        
        # Prefer to break at the last sentence boundary inside the window
        if end < len(text):
            last_ending = max(text.rfind(e, start, end) for e in '.!?')
            # Only break early if the boundary is at least 80% of the way
            # through the chunk, so chunks don't get too small
            if last_ending > start + max_chunk_size * 0.8:
                end = last_ending + 1
        
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        
        # Stop once the window reaches the end of the text; otherwise the
        # final iteration would re-emit a tail already covered by this chunk
        if end >= len(text):
            break
        
        # Move the start back by the overlap, but always advance past the
        # previous start so the loop terminates even when overlap is large
        start = max(end - overlap, start + 1)
    
    return chunks

def extract_chapters(text: str) -> Dict[str, str]:
    """
    Attempt to extract chapters from the text.
    """
    chapters = {}
    
    # Common chapter header patterns (matched case-insensitively below,
    # so "CHAPTER 3" is covered by the first pattern as well)
    chapter_patterns = [
        r'Chapter\s+(\d+|[IVXLC]+)',
        r'(\d+)\.\s+[A-Z]',
        r'[IVXLC]+\.\s+[A-Z]'
    ]
    
    lines = text.split('\n')
    current_chapter = "Introduction"
    current_content = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # Check if this line is a chapter header
        is_chapter_header = False
        for pattern in chapter_patterns:
            if re.match(pattern, line, re.IGNORECASE):
                # Save previous chapter
                if current_content:
                    chapters[current_chapter] = '\n'.join(current_content)
                
                current_chapter = line
                current_content = []
                is_chapter_header = True
                break
        
        if not is_chapter_header:
            current_content.append(line)
    
    # Save the last chapter
    if current_content:
        chapters[current_chapter] = '\n'.join(current_content)
    
    return chapters

def get_text_statistics(text: str) -> Dict[str, Any]:
    """
    Get basic statistics about the text.
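
    Example (illustrative): a 400-word text gives
    estimated_reading_time_minutes == 400 / 200 == 2.0, using the
    assumed average reading speed of ~200 words per minute.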
    """
    words = text.split()
    sentences = nltk.sent_tokenize(text)
    
    return {
        'total_characters': len(text),
        'total_words': len(words),
        'total_sentences': len(sentences),
        'average_words_per_sentence': len(words) / len(sentences) if sentences else 0,
        'estimated_reading_time_minutes': len(words) / 200  # Average reading speed
    }
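

if __name__ == "__main__":
    # Minimal usage sketch over an illustrative sample string; real input
    # would be text extracted from a PDF
    sample = (
        "Chapter 1\n"
        "This is the first chapter. It has two sentences.\n"
        "Chapter 2\n"
        "The second chapter is here!\n"
    )
    cleaned = clean_text(sample)
    logger.info("Chunks: %s", chunk_text(cleaned, max_chunk_size=40, overlap=10))
    logger.info("Chapters found: %s", list(extract_chapters(cleaned)))
    logger.info("Statistics: %s", get_text_statistics(cleaned))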