venni16 committed
Commit 5335722 · verified · 1 Parent(s): 83ae875

Upload 7 files

Files changed (7)
  1. .dockerignore +26 -0
  2. Dockerfile +30 -0
  3. app.py +204 -0
  4. document_parser.py +142 -0
  5. requirements.txt +16 -0
  6. summarizer.py +122 -0
  7. utils.py +93 -0
.dockerignore ADDED
@@ -0,0 +1,26 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ env
+ venv
+ .venv
+ pip-log.txt
+ pip-delete-this-directory.txt
+ .tox
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.log
+ .git
+ .mypy_cache
+ .pytest_cache
+ .hypothesis
+ .DS_Store
+ *.swp
+ *.swo
+ *~
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.9-slim
+
+ WORKDIR /code
+
+ # Install system dependencies for document processing
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libmagic1 \
+     libmagic-dev \
+     poppler-utils \
+     antiword \
+     unrtf \
+     tesseract-ocr \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create temp directory for file processing
+ RUN mkdir -p /tmp/materials
+
+ # Expose port
+ EXPOSE 7861
+
+ # Start the application
+ CMD uvicorn app:app --host 0.0.0.0 --port 7861
app.py ADDED
@@ -0,0 +1,204 @@
+ from fastapi import FastAPI, UploadFile, File, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ import uvicorn
+ import os
+ import tempfile
+ import aiofiles
+ from datetime import datetime
+ import traceback
+ import logging
+ from typing import List, Optional
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(title="Material Summarizer API")
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ # Get URLs from environment
+ FRONTEND_URL = os.getenv('FRONTEND_URL')
+ BACKEND_URL = os.getenv('BACKEND_URL', 'http://localhost:5000')
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[url for url in (FRONTEND_URL, BACKEND_URL) if url],  # use env-configured origins; adjust in production
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Import processing functions
+ try:
+     from document_parser import parse_document
+     from summarizer import summarize_text
+     from utils import chunked_summarize
+     DEPENDENCIES_LOADED = True
+     logger.info("All AI dependencies loaded successfully")
+ except ImportError as e:
+     logger.error(f"Import error: {e}")
+     DEPENDENCIES_LOADED = False
+
+ @app.get("/")
+ async def root():
+     return {"message": "Material Summarizer API", "status": "running"}
+
+ @app.get("/health")
+ async def health_check():
+     status = "healthy" if DEPENDENCIES_LOADED else "missing_dependencies"
+     return {
+         "status": status,
+         "service": "material-summarizer",
+         "dependencies_loaded": DEPENDENCIES_LOADED
+     }
+
+ @app.post("/summarize-document")
+ async def summarize_document(
+     file: UploadFile = File(...),
+     max_summary_length: Optional[int] = 1000,
+     chunk_size: Optional[int] = 1500
+ ):
+     """
+     Summarize an uploaded document (PDF, DOCX, TXT, etc.)
+     """
+     if not DEPENDENCIES_LOADED:
+         raise HTTPException(
+             status_code=500,
+             detail="Required AI dependencies not loaded. Check server logs."
+         )
+
+     temp_file_path = None
+
+     try:
+         # Validate file type
+         allowed_extensions = {'.pdf', '.docx', '.doc', '.txt', '.pptx', '.ppt'}
+         file_extension = os.path.splitext(file.filename)[1].lower()
+
+         if file_extension not in allowed_extensions:
+             raise HTTPException(
+                 status_code=400,
+                 detail=f"Unsupported document format. Allowed: {', '.join(allowed_extensions)}"
+             )
+
+         # Create temporary file
+         temp_file_path = f"temp_{file.filename}"
+
+         # Save uploaded file
+         logger.info(f"Saving uploaded file: {file.filename}")
+         async with aiofiles.open(temp_file_path, 'wb') as out_file:
+             content = await file.read()
+             await out_file.write(content)
+
+         start_time = datetime.now()
+
+         # 1. Parse document
+         logger.info("Step 1: Parsing document...")
+         if not os.path.exists(temp_file_path):
+             raise HTTPException(status_code=500, detail="Document file not found after upload")
+
+         document_text = parse_document(temp_file_path, file_extension)
+         logger.info(f"Extracted text length: {len(document_text)} characters")
+
+         if not document_text or len(document_text.strip()) < 10:
+             raise HTTPException(status_code=500, detail="Document parsing failed or content too short")
+
+         # 2. Summarize text with chunking
+         logger.info("Step 2: Generating summary...")
+
+         def custom_summarize_func(text):
+             return summarize_text(
+                 text,
+                 model_name="facebook/bart-large-cnn",
+                 max_length=max_summary_length,
+                 min_length=min(100, max_summary_length // 3)
+             )
+
+         final_summary = chunked_summarize(
+             text=document_text,
+             summarize_func=custom_summarize_func,
+             max_chunk_size=chunk_size
+         )
+
+         if not final_summary or len(final_summary.strip()) < 10:
+             raise HTTPException(status_code=500, detail="Summary generation failed")
+
+         processing_time = (datetime.now() - start_time).total_seconds()
+
+         logger.info(f"Summarization completed in {processing_time:.2f} seconds")
+
+         return {
+             "success": True,
+             "summary": final_summary,
+             "original_length": len(document_text),
+             "summary_length": len(final_summary),
+             "processing_time": processing_time,
+             "file_type": file_extension
+         }
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         logger.error(f"Error processing document: {str(e)}")
+         logger.error(traceback.format_exc())
+         raise HTTPException(
+             status_code=500,
+             detail=f"Document processing failed: {str(e)}"
+         )
+     finally:
+         # Cleanup temporary files
+         try:
+             if temp_file_path and os.path.exists(temp_file_path):
+                 os.remove(temp_file_path)
+                 logger.info(f"Cleaned up: {temp_file_path}")
+         except Exception as cleanup_error:
+             logger.error(f"Cleanup error: {cleanup_error}")
+
+ @app.post("/batch-summarize")
+ async def batch_summarize_documents(files: List[UploadFile] = File(...)):
+     """
+     Summarize multiple documents in batch
+     """
+     if not DEPENDENCIES_LOADED:
+         raise HTTPException(
+             status_code=500,
+             detail="Required AI dependencies not loaded. Check server logs."
+         )
+
+     results = []
+
+     for file in files:
+         try:
+             # Use the single-document summarization function
+             result = await summarize_document(file)
+             result["filename"] = file.filename
+             results.append(result)
+         except Exception as e:
+             results.append({
+                 "success": False,
+                 "filename": file.filename,
+                 "error": str(e)
+             })
+
+     return {
+         "success": True,
+         "processed_files": len(results),
+         "results": results
+     }
+
+ if __name__ == "__main__":
+     logger.info("Starting Material Summarizer Server...")
+     logger.info("Dependencies loaded: %s", DEPENDENCIES_LOADED)
+
+     if not DEPENDENCIES_LOADED:
+         logger.error("CRITICAL: AI dependencies not loaded. Document processing will not work!")
+
+     port = int(os.environ.get("MATERIAL_PORT", 7861))
+     uvicorn.run(
+         "app:app",
+         host="0.0.0.0",
+         port=port,
+         reload=False
+     )
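
Once the container is running, the endpoints above can be exercised with any HTTP client. The sketch below is a hypothetical client, not part of this upload: it assumes the service is reachable at http://localhost:7861, that a local lecture_notes.pdf exists, and that the requests package is installed on the client side (it is not listed in requirements.txt). Note that max_summary_length and chunk_size are plain query parameters.

# client_example.py -- hypothetical client sketch, not included in this commit
import requests

BASE_URL = "http://localhost:7861"  # assumption: container published on the exposed port

# Check that the service and its model dependencies are up
print(requests.get(f"{BASE_URL}/health").json())

# Summarize a single document
with open("lecture_notes.pdf", "rb") as f:  # placeholder input file
    resp = requests.post(
        f"{BASE_URL}/summarize-document",
        files={"file": ("lecture_notes.pdf", f, "application/pdf")},
        params={"max_summary_length": 800, "chunk_size": 1500},
    )
resp.raise_for_status()
print(resp.json()["summary"])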
document_parser.py ADDED
@@ -0,0 +1,142 @@
+ import os
+ import logging
+ from typing import Optional
+ import pdfplumber
+ from docx import Document
+ import PyPDF2
+ from pptx import Presentation
+
+ logger = logging.getLogger(__name__)
+
+ def parse_document(file_path: str, file_extension: str) -> str:
+     """
+     Parse different document formats and extract text
+     """
+     try:
+         if file_extension == '.pdf':
+             return parse_pdf(file_path)
+         elif file_extension in ['.docx', '.doc']:
+             return parse_docx(file_path)
+         elif file_extension in ['.pptx', '.ppt']:
+             return parse_pptx(file_path)
+         elif file_extension == '.txt':
+             return parse_txt(file_path)
+         else:
+             raise ValueError(f"Unsupported file format: {file_extension}")
+     except Exception as e:
+         logger.error(f"Error parsing document {file_path}: {e}")
+         raise
+
+ def parse_pdf(file_path: str) -> str:
+     """
+     Extract text from PDF using multiple methods for better coverage
+     """
+     text = ""
+
+     # Method 1: Use pdfplumber (better for text-based PDFs)
+     try:
+         with pdfplumber.open(file_path) as pdf:
+             for page in pdf.pages:
+                 page_text = page.extract_text()
+                 if page_text:
+                     text += page_text + "\n"
+     except Exception as e:
+         logger.warning(f"pdfplumber failed: {e}")
+
+     # Method 2: Use PyPDF2 as fallback
+     if not text.strip():
+         try:
+             with open(file_path, 'rb') as file:
+                 pdf_reader = PyPDF2.PdfReader(file)
+                 for page in pdf_reader.pages:
+                     page_text = page.extract_text()
+                     if page_text:
+                         text += page_text + "\n"
+         except Exception as e:
+             logger.warning(f"PyPDF2 failed: {e}")
+
+     if not text.strip():
+         raise ValueError("Could not extract text from PDF")
+
+     return clean_text(text)
+
+ def parse_docx(file_path: str) -> str:
+     """
+     Extract text from DOCX/DOC files
+     """
+     try:
+         doc = Document(file_path)
+         text = ""
+
+         # Extract paragraphs
+         for paragraph in doc.paragraphs:
+             if paragraph.text.strip():
+                 text += paragraph.text + "\n"
+
+         # Extract tables
+         for table in doc.tables:
+             for row in table.rows:
+                 for cell in row.cells:
+                     if cell.text.strip():
+                         text += cell.text + "\n"
+
+         return clean_text(text)
+     except Exception as e:
+         logger.error(f"Error parsing DOCX file: {e}")
+         raise
+
+ def parse_pptx(file_path: str) -> str:
+     """
+     Extract text from PowerPoint files
+     """
+     try:
+         prs = Presentation(file_path)
+         text = ""
+
+         for slide in prs.slides:
+             for shape in slide.shapes:
+                 if hasattr(shape, "text") and shape.text.strip():
+                     text += shape.text + "\n"
+
+         return clean_text(text)
+     except Exception as e:
+         logger.error(f"Error parsing PPTX file: {e}")
+         raise
+
+ def parse_txt(file_path: str) -> str:
+     """
+     Extract text from plain text files
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             text = file.read()
+         return clean_text(text)
+     except UnicodeDecodeError:
+         # Try different encodings
+         for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
+             try:
+                 with open(file_path, 'r', encoding=encoding) as file:
+                     text = file.read()
+                 return clean_text(text)
+             except UnicodeDecodeError:
+                 continue
+         raise ValueError("Could not decode text file with any encoding")
+
+ def clean_text(text: str) -> str:
+     """
+     Clean and normalize extracted text
+     """
+     # Remove excessive whitespace
+     lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+     # Remove very short lines that are likely formatting artifacts
+     meaningful_lines = [line for line in lines if len(line) > 2]
+
+     # Join with proper spacing
+     cleaned_text = '\n'.join(meaningful_lines)
+
+     # Remove multiple consecutive newlines
+     while '\n\n\n' in cleaned_text:
+         cleaned_text = cleaned_text.replace('\n\n\n', '\n\n')
+
+     return cleaned_text.strip()
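
The parser can be checked on its own, outside the API. A minimal sketch, assuming a local sample file (the path is a placeholder):

# parser_check.py -- hypothetical standalone check of document_parser
import os
from document_parser import parse_document

path = "sample.docx"  # placeholder; any of .pdf, .docx, .pptx, .txt
text = parse_document(path, os.path.splitext(path)[1].lower())
print(f"Extracted {len(text)} characters")
print(text[:300])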
requirements.txt ADDED
@@ -0,0 +1,16 @@
+ fastapi==0.104.1
+ uvicorn==0.24.0
+ python-multipart==0.0.6
+ aiofiles==23.2.1
+ python-dotenv==1.0.0
+ transformers==4.35.2
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch==2.6.0
+ accelerate==0.24.1
+ sentence-transformers==2.2.2
+ numpy==1.24.3
+ pypdf2==3.0.1
+ python-magic==0.4.27
+ pdfplumber==0.10.3
+ python-docx==1.1.0
+ python-pptx==0.6.21
summarizer.py ADDED
@@ -0,0 +1,122 @@
+ from transformers import pipeline, AutoTokenizer
+ import logging
+ from typing import Optional
+
+ logger = logging.getLogger(__name__)
+
+ # Global summarizer instance for better performance
+ _summarizer = None
+ _tokenizer = None
+
+ def get_summarizer(model_name: str = "facebook/bart-large-cnn"):
+     """Get or create summarizer instance with caching"""
+     global _summarizer, _tokenizer
+
+     if _summarizer is None:
+         try:
+             _summarizer = pipeline(
+                 "summarization",
+                 model=model_name,
+                 tokenizer=model_name
+             )
+             _tokenizer = AutoTokenizer.from_pretrained(model_name)
+             logger.info(f"Summarizer model {model_name} loaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to load summarizer: {e}")
+             raise
+
+     return _summarizer, _tokenizer
+
+ def summarize_text(
+     text: str,
+     model_name: str = "facebook/bart-large-cnn",
+     max_length: int = 500,
+     min_length: int = 200,
+     compression_ratio: Optional[float] = None
+ ) -> str:
+     """
+     Summarize text using transformer models with enhanced error handling
+     """
+     try:
+         summarizer, tokenizer = get_summarizer(model_name)
+
+         # If text is too short, return as is
+         if len(text.split()) < 30:
+             return text
+
+         # Calculate appropriate lengths
+         word_count = len(text.split())
+
+         if compression_ratio:
+             max_length = min(max_length, int(word_count * compression_ratio))
+             min_length = min(min_length, max_length // 2)
+         else:
+             # Adaptive length calculation
+             if word_count < 100:
+                 max_length = min(100, word_count - 10)
+                 min_length = max(30, max_length // 2)
+             elif word_count < 500:
+                 max_length = min(150, word_count // 3)
+                 min_length = max(50, max_length // 2)
+             else:
+                 max_length = min(max_length, word_count // 4)
+                 min_length = min(min_length, max_length // 3)
+
+         # Ensure min_length < max_length
+         min_length = min(min_length, max_length - 1)
+
+         # Tokenize to check length
+         tokens = tokenizer.encode(text)
+         if len(tokens) > tokenizer.model_max_length:
+             # Truncate if too long
+             tokens = tokens[:tokenizer.model_max_length - 100]
+             text = tokenizer.decode(tokens, skip_special_tokens=True)
+
+         logger.info(f"Summarizing text: {word_count} words -> {max_length} max tokens")
+
+         summary = summarizer(
+             text,
+             max_length=max_length,
+             min_length=min_length,
+             do_sample=False,
+             truncation=True,
+             clean_up_tokenization_spaces=True
+         )
+
+         result = summary[0]['summary_text'].strip()
+
+         if not result or len(result.split()) < 3:
+             raise ValueError("Generated summary is too short or empty")
+
+         return result
+
+     except Exception as e:
+         logger.error(f"Summarization error: {e}")
+         # Enhanced fallback: extract key sentences
+         return extract_key_sentences(text, min(3, max_length // 50))
+
+ def extract_key_sentences(text: str, num_sentences: int = 3) -> str:
+     """
+     Fallback method to extract key sentences when summarization fails
+     """
+     sentences = text.split('.')
+     meaningful_sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
+
+     if not meaningful_sentences:
+         return text[:500] + "..." if len(text) > 500 else text
+
+     # Simple heuristic: take first, middle, and last sentences
+     if len(meaningful_sentences) <= num_sentences:
+         return '. '.join(meaningful_sentences) + '.'
+
+     key_indices = [0]  # First sentence
+
+     # Add a middle sentence
+     if len(meaningful_sentences) > 2:
+         key_indices.append(len(meaningful_sentences) // 2)
+
+     # Add last sentence
+     key_indices.append(len(meaningful_sentences) - 1)
+
+     key_sentences = [meaningful_sentences[i] for i in key_indices[:num_sentences]]
+     return '. '.join(key_sentences) + '.'
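
summarize_text can be called directly to confirm that facebook/bart-large-cnn downloads and loads before it is wired into the API. A minimal sketch with placeholder text (the first call fetches the model, so it needs network access and disk space):

# summarizer_check.py -- hypothetical standalone check of summarizer
from summarizer import summarize_text

sample = (
    "Transformers are neural network architectures built around self-attention. "
    "They process all tokens of a sequence in parallel rather than step by step, "
    "which makes them well suited to long documents and large-scale pretraining. "
) * 10  # repeat so the 30-word short-circuit in summarize_text does not apply

print(summarize_text(sample))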
utils.py ADDED
@@ -0,0 +1,93 @@
+ import logging
+ from typing import List, Callable
+
+ logger = logging.getLogger(__name__)
+
+ def chunk_text(text: str, chunk_size: int = 1500, overlap: int = 200) -> List[str]:
+     """
+     Split text into overlapping chunks for processing long documents
+     """
+     chunks = []
+     start = 0
+     text_length = len(text)
+
+     # If text is shorter than chunk_size, return as single chunk
+     if text_length <= chunk_size:
+         return [text]
+
+     while start < text_length:
+         end = min(start + chunk_size, text_length)
+
+         # Try to break at a sentence boundary
+         if end < text_length:
+             # Look for the last sentence end within the current chunk
+             sentence_end = max(
+                 text.rfind('. ', start, end),
+                 text.rfind('? ', start, end),
+                 text.rfind('! ', start, end)
+             )
+             if sentence_end > start + chunk_size * 0.7:  # Only if reasonably far into the chunk
+                 end = sentence_end + 1
+
+         chunk = text[start:end].strip()
+         if chunk:
+             chunks.append(chunk)
+
+         start = end - overlap if end - overlap > start else end
+
+         # Prevent infinite loop
+         if start >= text_length:
+             break
+
+     return chunks
+
+ def chunked_summarize(
+     text: str,
+     summarize_func: Callable,
+     max_chunk_size: int = 1500,
+     overlap: int = 200
+ ) -> str:
+     """
+     Summarize long text by processing in chunks and combining results
+     """
+     if len(text) <= max_chunk_size:
+         return summarize_func(text)
+
+     text_chunks = chunk_text(text, chunk_size=max_chunk_size, overlap=overlap)
+     logger.info(f"Processing {len(text_chunks)} chunks...")
+
+     partial_summaries = []
+     for i, chunk in enumerate(text_chunks):
+         logger.info(f"Summarizing chunk {i+1}/{len(text_chunks)}...")
+         try:
+             summary = summarize_func(chunk)
+             if summary and len(summary.strip()) > 10:
+                 partial_summaries.append(summary)
+         except Exception as e:
+             logger.warning(f"Failed to summarize chunk {i+1}: {e}")
+             # Include original chunk as fallback
+             partial_summaries.append(chunk[:200] + "...")
+
+     if not partial_summaries:
+         return "Unable to generate summary from the document."
+
+     combined_summary_input = " ".join(partial_summaries)
+
+     # Final summarization if combined text is still long
+     if len(combined_summary_input) > max_chunk_size:
+         logger.info("Final summarization of combined chunks...")
+         try:
+             return summarize_func(combined_summary_input)
+         except Exception as e:
+             logger.error(f"Final summarization failed: {e}")
+             # Return the combined partial summaries
+             return combined_summary_input
+
+     return combined_summary_input
+
+ def estimate_reading_time(text: str, words_per_minute: int = 200) -> int:
+     """
+     Estimate reading time in minutes
+     """
+     word_count = len(text.split())
+     return max(1, round(word_count / words_per_minute))
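
The chunking helpers are model-agnostic, so they can be exercised with any callable. A minimal sketch using a trivial stand-in for the summarizer (placeholder text and function, not part of this upload):

# utils_check.py -- hypothetical check of the chunking helpers
from utils import chunk_text, chunked_summarize, estimate_reading_time

text = ("This is sentence number %d. " * 500) % tuple(range(500))

def first_sentence(chunk: str) -> str:
    # Stand-in "summarizer": keep only the first sentence of each chunk
    return chunk.split('. ')[0] + '.'

chunks = chunk_text(text, chunk_size=1500, overlap=200)
print(f"{len(chunks)} chunks, about {estimate_reading_time(text)} min of reading")
print(chunked_summarize(text, summarize_func=first_sentence, max_chunk_size=1500))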